openmedialibrary/oml/meta/google.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

from time import time, sleep

from ox.cache import get_json, store
import ox.web.google
import stdnum.isbn

from .utils import find_isbns, get_language, decode_html_data, to_isbn13

import logging
logger = logging.getLogger(__name__)


def find(query):
    """Search Google for ISBNs related to ``query``.

    ``' isbn'`` is appended to the query before it is handed to
    ``ox.web.google.find``. The fields of every result are joined with
    spaces and scanned with ``find_isbns``.

    Returns a list of dicts of the form
    ``{'isbn': isbn, 'primaryid': ['isbn', isbn]}`` in order of first
    appearance. An ISBN-10 and its ISBN-13 counterpart are treated as the
    same entry, so only whichever form was seen first is returned.
    """
    logger.debug('find %s', query)
    query += ' isbn'
    isbns = []
    # assumes each search result is a sequence of strings -- TODO confirm
    for r in ox.web.google.find(query):
        isbns += find_isbns(' '.join(r))
    # The message needs a placeholder: without one the logging module fails
    # to format the record and reports an error instead of the ISBN list.
    logger.debug('isbns %s', isbns)
    results = []
    done = set()
    for isbn in isbns:
        if isbn in done:
            continue
        results.append({
            'isbn': isbn,
            'primaryid': ['isbn', isbn]
        })
        done.add(isbn)
        # Also mark the alternate representation as seen so the same book
        # is not returned twice under its ISBN-10 and ISBN-13 forms.
        if len(isbn) == 10:
            done.add(stdnum.isbn.to_isbn13(isbn))
        elif len(isbn) == 13 and isbn.startswith('978'):
            # only 978-prefixed ISBN-13s have an ISBN-10 equivalent
            done.add(stdnum.isbn.to_isbn10(isbn))
    return results

def info(key, value):
    """Look up metadata for one book via the Google Books volumes API.

    ``key`` must be ``'isbn'``, ``'lccn'`` or ``'oclc'``; the request is sent
    as ``q=<key>:<value>`` and only the first returned item is used. Calls
    are throttled through the module-level ``api_limit``.

    Returns a dict that may contain ``author``, ``categories``,
    ``description``, ``pages``, ``date``, ``publisher`` (list), ``title``,
    ``cover``, ``isbn`` (a single value produced by ``to_isbn13``) and
    ``language`` (list), passed through ``decode_html_data``. Returns ``{}``
    when the response contains no items.

    Raises ``IOError`` for an unsupported ``key`` or when the response
    contains an ``'error'`` entry.
    """
    if key not in ('isbn', 'lccn', 'oclc'):
        raise IOError('unknwon key %s' % key)
    url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)
    # Block until the rate limiter hands out a token.
    while not api_limit.consume(1):
        logger.debug('hitting google api to fast, waiting 1 second')
        sleep(1)
    r = get_json(url, timeout=-1)
    if 'error' in r:
        # presumably evicts the cached error response so a retry hits the
        # API again -- verify against ox.cache
        store.delete(url)
        raise IOError(url, r)
    items = r.get('items')
    if not items:
        # covers both a missing 'items' key and an empty list
        logger.debug('unknown %s: %s [%s]', key, value, r)
        return {}
    volume = items[0]
    _data = volume['volumeInfo']
    _id = volume['id']

    # Google field name -> name used in the returned dict (others unchanged).
    renamed = {
        'authors': 'author',
        'pageCount': 'pages',
        'publishedDate': 'date',
    }
    data = {}
    # A distinct loop variable keeps the ``key`` argument intact.
    for field in (
            'authors',
            'categories',
            'description',
            'pageCount',
            'publishedDate',
            'publisher',
            'title',
        ):
        if field in _data:
            data[renamed.get(field, field)] = _data[field]

    # Merge a non-blank subtitle into the title; skip when there is no title
    # to attach it to or the subtitle is not a string.
    subtitle = _data.get('subtitle')
    if isinstance(subtitle, str) and subtitle.strip() and 'title' in _data:
        data['title'] = '{title}: {subtitle}'.format(
            title=_data['title'], subtitle=subtitle)

    # Missing access information is treated like 'NO_PAGES' so the
    # imageLinks fallback below is still tried instead of raising KeyError.
    access = volume.get('accessInfo') or {}
    if access.get('viewability', 'NO_PAGES') != 'NO_PAGES':
        data['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id
    elif 'imageLinks' in _data:
        # take the largest size that is present
        for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):
            if size in _data['imageLinks']:
                data['cover'] = _data['imageLinks'][size]
                break

    isbns = []
    for k in _data.get('industryIdentifiers') or []:
        if k['type'].startswith('ISBN'):
            isbns.append(k['identifier'])
        else:
            logger.debug('unknown identifier %s', k)
    if isbns:
        # only the first listed ISBN is kept, converted with to_isbn13
        data['isbn'] = to_isbn13(isbns[0])

    # publisher is always returned as a list
    if isinstance(data.get('publisher'), str):
        data['publisher'] = [data['publisher']]
    if 'language' in _data:
        data['language'] = [get_language(_data['language'])]
    data = decode_html_data(data)
    return data

class Limit(object):
    """Token-based rate limiter.

    The bucket starts with ``capacity`` tokens. Whenever the token count is
    read, ``fill_rate`` tokens per elapsed second are added back, never
    exceeding ``capacity``.
    """

    def __init__(self, fill_rate, capacity):
        self.capacity = capacity
        self.fill_rate = fill_rate
        self._tokens = capacity
        self.timestamp = time()

    def get_tokens(self):
        """Refill according to the time since the last read and return the count."""
        current = time()
        if self._tokens < self.capacity:
            refilled = self._tokens + self.fill_rate * (current - self.timestamp)
            self._tokens = min(self.capacity, refilled)
        # the reference time advances on every read, even without a refill
        self.timestamp = current
        return self._tokens
    tokens = property(get_tokens)

    def consume(self, tokens):
        """Take ``tokens`` from the bucket.

        Returns True when enough tokens were available (and removes them),
        False otherwise, leaving the bucket untouched.
        """
        if not tokens <= self.tokens:
            return False
        self._tokens -= tokens
        return True

# Shared limiter consumed by info(): starts with 10 tokens and regains
# 1 token per elapsed second, up to a maximum of 10.
api_limit = Limit(fill_rate=1, capacity=10)
find 2014-05-16 10:06:11 +02:00			`# -- coding: utf-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`
port to python3 2014-09-03 00:32:44 +02:00
rate limit google requests 2016-01-23 18:06:40 +05:30			`from time import time, sleep`
find 2014-05-16 10:06:11 +02:00
inital google books and amazon parser 2016-01-05 12:58:30 +05:30			`from ox.cache import get_json, store`
find 2014-05-16 10:06:11 +02:00			`import ox.web.google`
			`import stdnum.isbn`

store metadata per user. remove primaryid. only store isbn13 2016-01-11 19:13:54 +05:30			`from .utils import find_isbns, get_language, decode_html_data, to_isbn13`
find 2014-05-16 10:06:11 +02:00
use python logging 2014-05-17 16:26:59 +02:00			`import logging`
use logging.getLogger(__name__) 2015-11-29 15:56:38 +01:00			`logger = logging.getLogger(__name__)`
use python logging 2014-05-17 16:26:59 +02:00
find 2014-05-16 10:06:11 +02:00
lots of stuff 2014-05-21 02:02:21 +02:00			`def find(query):`
			`logger.debug('find %s', query)`
find 2014-05-16 10:06:11 +02:00			`query += ' isbn'`
			`isbns = []`
			`for r in ox.web.google.find(query):`
			`isbns += find_isbns(' '.join(r))`
use python logging 2014-05-17 16:26:59 +02:00			`logger.debug('isbns', isbns)`
find 2014-05-16 10:06:11 +02:00			`results = []`
			`done = set()`
			`for isbn in isbns:`
			`if isbn not in done:`
			`r = {`
lots of stuff 2014-05-21 02:02:21 +02:00			`'isbn': isbn,`
			`'primaryid': ['isbn', isbn]`
find 2014-05-16 10:06:11 +02:00			`}`
			`results.append(r)`
			`done.add(isbn)`
			`if len(isbn) == 10:`
			`done.add(stdnum.isbn.to_isbn13(isbn))`
lots of stuff 2014-05-21 02:02:21 +02:00			`if len(isbn) == 13 and isbn.startswith('978'):`
ddg 2014-05-17 11:19:32 +02:00			`done.add(stdnum.isbn.to_isbn10(isbn))`
find 2014-05-16 10:06:11 +02:00			`return results`
inital google books and amazon parser 2016-01-05 12:58:30 +05:30
			`def info(key, value):`
			`if key not in ('isbn', 'lccn', 'oclc'):`
			`raise IOError('unknwon key %s' % key)`
			`url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)`
rate limit google requests 2016-01-23 18:06:40 +05:30			`while not api_limit.consume(1):`
			`logger.debug('hitting google api to fast, waiting 1 second')`
			`sleep(1)`
inital google books and amazon parser 2016-01-05 12:58:30 +05:30			`r = get_json(url, timeout=-1)`
			`if 'error' in r:`
			`store.delete(url)`
			`raise IOError(url, r)`
			`if not 'items' in r:`
fail 2016-01-07 15:42:48 +05:30			`print('unknown %s: %s [%s]' % (key, value, r))`
inital google books and amazon parser 2016-01-05 12:58:30 +05:30			`return {}`
			`_data = r['items'][0]['volumeInfo']`
better cover link 2016-01-09 19:39:01 +05:30			`_id = r['items'][0]['id']`
inital google books and amazon parser 2016-01-05 12:58:30 +05:30			`data = {}`
			`for key in [`
			`'authors',`
store metadata per user. remove primaryid. only store isbn13 2016-01-11 19:13:54 +05:30			`'categories',`
inital google books and amazon parser 2016-01-05 12:58:30 +05:30			`'description',`
			`'pageCount',`
			`'publishedDate',`
			`'publisher',`
			`'title',`
			`]:`
			`if key in _data:`
			`data[{`
			`'authors': 'author',`
			`'pageCount': 'pages',`
			`'publishedDate': 'date',`
			`}.get(key,key)] = _data[key]`

dont add empty subtitle 2016-01-09 16:11:04 +05:30			`if 'subtitle' in _data and _data['subtitle'].strip():`
inital google books and amazon parser 2016-01-05 12:58:30 +05:30			`data['title'] = '{title}: {subtitle}'.format(**_data)`
			`if r['items'][0]['accessInfo']['viewability'] != 'NO_PAGES':`
better cover link 2016-01-09 19:39:01 +05:30			`#data['cover'] = 'https://books.google.com/books?id=%s&pg=PP1&img=1&zoom=0&hl=en' % _id`
			`data['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id`

inital google books and amazon parser 2016-01-05 12:58:30 +05:30			`elif 'imageLinks' in _data:`
			`for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):`
			`if size in _data['imageLinks']:`
			`data['cover'] = _data['imageLinks'][size]`
			`break`
			`if 'industryIdentifiers' in _data:`
			`for k in _data['industryIdentifiers']:`
			`if k['type'].startswith('ISBN'):`
			`if not 'isbn' in data:`
			`data['isbn'] = []`
			`data['isbn'].append(k['identifier'])`
			`else:`
			`print('unknown identifier', k)`
store metadata per user. remove primaryid. only store isbn13 2016-01-11 19:13:54 +05:30			`if 'isbn' in data:`
			`data['isbn'] = [to_isbn13(i) for i in data['isbn']][0]`

fail 2016-01-07 15:42:48 +05:30			`if 'publisher' in data and isinstance(data['publisher'], str):`
			`data['publisher'] = [data['publisher']]`
inital google books and amazon parser 2016-01-05 12:58:30 +05:30			`if 'language' in _data:`
fail 2016-01-07 15:42:48 +05:30			`data['language'] = [get_language(_data['language'])]`
import text not html 2016-01-08 15:52:07 +05:30			`data = decode_html_data(data)`
inital google books and amazon parser 2016-01-05 12:58:30 +05:30			`return data`

rate limit google requests 2016-01-23 18:06:40 +05:30			`class Limit(object):`

			`def __init__(self, fill_rate, capacity):`
			`self.timestamp = time()`
			`self.fill_rate = fill_rate`
			`self.capacity = capacity`
			`self._tokens = capacity`

			`def consume(self, tokens):`
			`if tokens <= self.tokens:`
			`self._tokens -= tokens`
			`else:`
			`return False`
			`return True`

			`def get_tokens(self):`
			`now = time()`
			`if self._tokens < self.capacity:`
			`delta = self.fill_rate * (now - self.timestamp)`
			`self._tokens = min(self.capacity, self._tokens + delta)`
			`self.timestamp = now`
			`return self._tokens`
			`tokens = property(get_tokens)`

			`api_limit = Limit(fill_rate=1, capacity=10)`