openmedialibrary/oml/meta/google.py

126 lines
3.9 KiB
Python
Raw Normal View History

2014-05-16 08:06:11 +00:00
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
2014-09-02 22:32:44 +00:00
2016-01-23 12:36:40 +00:00
from time import time, sleep
2014-05-16 08:06:11 +00:00
2016-01-05 07:28:30 +00:00
from ox.cache import get_json, store
2014-05-16 08:06:11 +00:00
import ox.web.google
import stdnum.isbn
from .utils import find_isbns, get_language, decode_html_data, to_isbn13
2014-05-16 08:06:11 +00:00
2014-05-17 14:26:59 +00:00
import logging
2015-11-29 14:56:38 +00:00
logger = logging.getLogger(__name__)
2014-05-17 14:26:59 +00:00
2014-05-16 08:06:11 +00:00
2014-05-21 00:02:21 +00:00
def find(query):
    """Search Google for ISBNs matching *query*.

    The string ' isbn' is appended to the query, the search is run through
    ``ox.web.google.find`` and every result (joined with spaces) is scanned
    with ``find_isbns``.

    Returns a list of ``{'isbn': isbn, 'primaryid': ['isbn', isbn]}`` dicts
    in the order the ISBNs were found. An ISBN-10 and its ISBN-13 equivalent
    count as the same book, so only the form seen first is reported.
    """
    logger.debug('find %s', query)
    query += ' isbn'
    isbns = []
    for r in ox.web.google.find(query):
        isbns += find_isbns(' '.join(r))
    # The message needs a placeholder for its argument; without one the
    # logging module reports a formatting error when the record is emitted.
    logger.debug('isbns %s', isbns)
    results = []
    done = set()
    for isbn in isbns:
        if isbn in done:
            continue
        results.append({
            'isbn': isbn,
            'primaryid': ['isbn', isbn],
        })
        done.add(isbn)
        # Also mark the alternate form as seen so it is not listed twice.
        if len(isbn) == 10:
            done.add(stdnum.isbn.to_isbn13(isbn))
        elif len(isbn) == 13 and isbn.startswith('978'):
            # Only 978-prefixed ISBN-13s have an ISBN-10 equivalent.
            done.add(stdnum.isbn.to_isbn10(isbn))
    return results
2016-01-05 07:28:30 +00:00
def info(key, value):
    """Look up book metadata for an identifier via the Google Books API.

    key:   identifier type, one of 'isbn', 'lccn' or 'oclc'.
    value: the identifier itself; it is interpolated into the query URL
           as-is.

    Returns a dict built from the first matching volume. Depending on what
    the API returns it may contain 'author', 'categories', 'description',
    'pages', 'date', 'publisher' (always a list), 'title' (with the subtitle
    appended), 'cover', 'isbn' (first ISBN, as ISBN-13) and 'language'.
    The dict is passed through ``decode_html_data`` before it is returned.
    Returns an empty dict if the API reports no matching items.

    Raises IOError for an unsupported key, or if the response contains an
    'error' entry (the response is then removed from the cache store so it
    is not served again).
    """
    if key not in ('isbn', 'lccn', 'oclc'):
        raise IOError('unknown key %s' % key)
    url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)
    # Block until the shared token bucket allows another request.
    while not api_limit.consume(1):
        logger.debug('hitting google api too fast, waiting 1 second')
        sleep(1)
    # NOTE(review): timeout=-1 presumably means "use the cached copy
    # regardless of age" in ox.cache - confirm.
    r = get_json(url, timeout=-1)
    if 'error' in r:
        store.delete(url)
        raise IOError(url, r)
    if 'items' not in r:
        logger.debug('unknown %s: %s [%s]', key, value, r)
        return {}

    item = r['items'][0]
    _data = item['volumeInfo']
    _id = item['id']

    # Google field name -> name used in the returned dict; fields not listed
    # here keep their name.
    renamed = {
        'authors': 'author',
        'pageCount': 'pages',
        'publishedDate': 'date',
    }
    data = {}
    # A dedicated loop variable is used so the `key` parameter is not
    # overwritten.
    for field in (
        'authors',
        'categories',
        'description',
        'pageCount',
        'publishedDate',
        'publisher',
        'title',
    ):
        if field in _data:
            data[renamed.get(field, field)] = _data[field]

    if 'title' in _data and _data.get('subtitle', '').strip():
        data['title'] = '{title}: {subtitle}'.format(**_data)

    # When any pages are viewable, use the front cover image URL built from
    # the volume id; otherwise fall back to the largest image link that the
    # API provides. Missing access info is treated like 'NO_PAGES'.
    viewability = item.get('accessInfo', {}).get('viewability', 'NO_PAGES')
    if viewability != 'NO_PAGES':
        data['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id
    elif 'imageLinks' in _data:
        for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):
            if size in _data['imageLinks']:
                data['cover'] = _data['imageLinks'][size]
                break

    isbns = []
    for k in _data.get('industryIdentifiers', []):
        if k['type'].startswith('ISBN'):
            isbns.append(k['identifier'])
        else:
            logger.debug('unknown identifier %s', k)
    if isbns:
        # Only the first ISBN is kept, normalised to ISBN-13.
        data['isbn'] = to_isbn13(isbns[0])

    if 'publisher' in data and isinstance(data['publisher'], str):
        data['publisher'] = [data['publisher']]
    if 'language' in _data:
        data['language'] = [get_language(_data['language'])]
    return decode_html_data(data)
2016-01-23 12:36:40 +00:00
class Limit(object):
    """Token-bucket rate limiter.

    The bucket starts full with `capacity` tokens and is refilled at
    `fill_rate` tokens per second, never exceeding `capacity`. Refilling
    happens lazily, each time the `tokens` property is read.
    """

    def __init__(self, fill_rate, capacity):
        """Create a full bucket.

        fill_rate: tokens added per second.
        capacity:  maximum (and initial) number of tokens.
        """
        self.timestamp = time()
        self.fill_rate = fill_rate
        self.capacity = capacity
        self._tokens = capacity

    def consume(self, tokens):
        """Take `tokens` from the bucket.

        Returns True if enough tokens were available (and removes them),
        False otherwise (the bucket is left unchanged apart from the refill).
        """
        if tokens > self.tokens:
            return False
        self._tokens -= tokens
        return True

    def get_tokens(self):
        """Refill the bucket for the time elapsed and return its level."""
        now = time()
        if self._tokens < self.capacity:
            # time() is wall-clock time and may step backwards; clamp the
            # elapsed time at zero so such a jump cannot drain the bucket.
            elapsed = max(0, now - self.timestamp)
            self._tokens = min(self.capacity,
                               self._tokens + self.fill_rate * elapsed)
        # Always advance the timestamp, even while the bucket is full, so
        # idle time is not credited again after the next consume().
        self.timestamp = now
        return self._tokens

    tokens = property(get_tokens)
# Module-wide rate limiter consumed by info() before each Google Books
# request: bursts of up to 10 calls, refilled at one token per second.
api_limit = Limit(fill_rate=1, capacity=10)