# File-viewer header (not part of the module): 243 lines, 8 KiB, Python
# -*- coding: utf-8 -*-
|
|
|
|
from time import time, sleep
|
|
from urllib.parse import urlencode
|
|
import re
|
|
from functools import partial
|
|
|
|
from ox.cache import get_json, store, read_url
|
|
import ox.web.google
|
|
import stdnum.isbn
|
|
from lxml import etree
|
|
|
|
from .utils import find_isbns, get_language, decode_html_data, to_isbn13
|
|
import settings
|
|
|
|
import logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# XML namespace prefixes that XPath expressions in this module may use.
NAMESPACES = dict(
    openSearch='http://a9.com/-/spec/opensearchrss/1.0/',
    atom='http://www.w3.org/2005/Atom',
    dc='http://purl.org/dc/terms',
    gbs='http://schemas.google.com/books/2008',
    gd='http://schemas.google.com/g/2005',
)

# etree.XPath constructor with the prefixes above pre-bound, so callers only
# have to pass the expression itself.
XPath = partial(etree.XPath, namespaces=NAMESPACES)
|
|
|
|
def find_(query):
    """Search Google for ISBNs matching a free-text query.

    ' isbn' is appended to *query*, every result returned by
    ox.web.google.find() is joined into one string and scanned with
    find_isbns().

    Returns a list of dicts ``{'isbn': isbn, 'primaryid': ['isbn', isbn]}``,
    one per distinct book, in the order the ISBNs were found. An ISBN-10 and
    its ISBN-13 counterpart (978 prefix) count as the same book; only the
    form seen first is returned.
    """
    logger.debug('find %s', query)
    query += ' isbn'
    isbns = []
    for r in ox.web.google.find(query):
        isbns += find_isbns(' '.join(r))
    # The message needs a %s placeholder; without it logging cannot format
    # the record and reports an error instead of the list.
    logger.debug('isbns %s', isbns)

    results = []
    done = set()
    for isbn in isbns:
        if isbn in done:
            continue
        results.append({
            'isbn': isbn,
            'primaryid': ['isbn', isbn],
        })
        done.add(isbn)
        # Also mark the other notation of the same number as seen.
        if len(isbn) == 10:
            done.add(stdnum.isbn.to_isbn13(isbn))
        elif len(isbn) == 13 and isbn.startswith('978'):
            done.add(stdnum.isbn.to_isbn10(isbn))
    return results
|
|
|
|
def parse_entry(entry_):
    """Convert one ``atom:entry`` element of a Google Books feed into a dict.

    entry_ -- element that the module's namespaced XPath queries can be run
              against (find() and info_old() pass the atom:entry nodes of a
              parsed feed).

    Returns the dict after passing it through decode_html_data(). It always
    holds 'title', 'description' and 'date' (strings) and 'categories',
    'publisher' and 'language' (lists). 'author' (list), 'cover' (URL),
    'pages' (int) and 'isbn' (result of to_isbn13()) are only set when the
    entry provides them.

    Raises IndexError if the entry has no atom:id element.
    """
    def texts(tag):
        # Text of every descendant <tag>. Elements without text (.text is
        # None) are skipped so the joins below cannot fail on them.
        return [x.text for x in XPath('descendant::' + tag)(entry_)
                if x.text is not None]

    # The volume id is the last path component of the atom:id URL.
    id_url = XPath('descendant::atom:id')(entry_)[0].text
    _id = id_url.split('/')[-1]

    info = {}
    info['title'] = ': '.join(texts('dc:title')).strip()
    authors = [a.strip() for a in texts('dc:creator') if a.strip()]
    if authors:
        info['author'] = authors
    info['description'] = '\n\n'.join(texts('dc:description')).strip()
    info['date'] = ''.join(texts('dc:date')).strip()
    info['categories'] = texts('dc:subject')
    info['publisher'] = texts('dc:publisher')
    info['language'] = [get_language(x) for x in texts('dc:language')]

    # Only point at a front cover when the volume exposes at least some pages.
    v = XPath('descendant::gbs:viewability')(entry_)
    if v and v[0].attrib.get('value') != 'http://schemas.google.com/books/2008#view_no_pages':
        info['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id

    # The first number in the concatenated dc:format text is the page count.
    pages = re.search(r'\d+', ''.join(texts('dc:format')))
    if pages:
        info['pages'] = int(pages.group())

    # Use the first identifier of the form "ISBN:..." that to_isbn13() accepts.
    for t in texts('dc:identifier'):
        t = t.strip()
        if t[:5].upper() == 'ISBN:':
            isbn = to_isbn13(t[5:])
            if isbn:
                info['isbn'] = isbn
                break

    info = decode_html_data(info)
    return info
|
|
|
|
def find(title=None, author=None):
    """Search the Google Books volumes feed by title and/or author.

    title, author -- optional strings; whichever are given are combined into
                     one free-text query (no intitle:/inauthor: operators).

    Returns a list of parse_entry() dicts for up to 20 feed entries, keeping
    only entries that have an ISBN and only the first entry per ISBN.
    """
    q = ' '.join(part for part in (title, author) if part)
    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
        'q': q.strip(),
        'max-results': 20,
        'start-index': 1,
        'min-viewability': 'none',
    })
    data = read_url(url)
    feed = etree.fromstring(data,
        parser=etree.XMLParser(recover=True, no_network=True))
    results = []
    isbns = set()
    for entry_ in XPath('//atom:entry')(feed):
        info = parse_entry(entry_)
        # Compare the entry's ISBN value (not the literal string 'isbn')
        # against the ISBNs already collected, so duplicates are dropped.
        if 'isbn' in info and info['isbn'] not in isbns:
            results.append(info)
            isbns.add(info['isbn'])
    return results
|
|
|
|
def info_old(isbn):
    """Look up one ISBN through the Google Books volumes feed.

    isbn -- ISBN string; it is sent as an ``isbn:`` query and also stored
            unchanged as 'isbn' in the result.

    Returns the parse_entry() dict of the first feed entry, or {} when the
    feed contains no entry.
    """
    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
        # The keyword is 'isbn:' (it was misspelled 'isnb:'), matching the
        # query that info() sends.
        'q': 'isbn:' + isbn,
        'max-results': 1,
        'start-index': 1,
        'min-viewability': 'none',
    })
    data = read_url(url)
    feed = etree.fromstring(data,
        parser=etree.XMLParser(recover=True, no_network=True))
    for entry_ in XPath('//atom:entry')(feed):
        info = parse_entry(entry_)
        info['isbn'] = isbn
        return info
    return {}
|
|
|
|
def info(value):
    """Look up one ISBN through the Google Books v1 JSON API.

    value -- ISBN string, sent as an ``isbn:`` query.

    Returns a dict passed through decode_html_data(). Depending on what the
    first matching volume provides, it may hold 'author', 'categories',
    'description', 'pages', 'date', 'publisher' (always a list), 'title'
    (with the subtitle appended), 'cover', 'isbn' and 'language' (a list).
    Returns {} when the response has no 'items'.

    Raises IOError while the module-level api_limit is in its error state,
    and when the API response contains an 'error' key (which also puts
    api_limit into the error state).
    """
    key = 'isbn'
    url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)
    api_key = settings.server.get('google_api_key')
    if api_key:
        url += '&key=' + api_key
    if api_limit.error:
        raise IOError(url)
    # Block until the rate limiter hands out a token for this request.
    while not api_limit.consume(1):
        logger.debug('hitting google api to fast, waiting 1 second')
        sleep(1)
    r = get_json(url, timeout=-1)
    if 'error' in r:
        logger.debug('got google api error, dont call for 10 minutes')
        # NOTE(review): presumably drops the cached error response so a later
        # call fetches again - confirm against ox.cache.
        store.delete(url)
        api_limit.error = True
        raise IOError(url, r)
    if 'items' not in r:
        logger.debug('unknown %s: %s [%s]', key, value, r)
        return {}

    item = r['items'][0]
    _data = item['volumeInfo']
    _id = item['id']

    # API field name -> key used in the returned dict (others keep their name).
    renamed = {
        'authors': 'author',
        'pageCount': 'pages',
        'publishedDate': 'date',
    }
    data = {}
    for field in (
        'authors',
        'categories',
        'description',
        'pageCount',
        'publishedDate',
        'publisher',
        'title',
    ):
        if field in _data:
            data[renamed.get(field, field)] = _data[field]

    if 'subtitle' in _data and _data['subtitle'].strip():
        data['title'] = '{title}: {subtitle}'.format(**_data)

    # Prefer the generated front cover when pages are viewable; otherwise fall
    # back to the largest image link the API offers.
    if item['accessInfo']['viewability'] != 'NO_PAGES':
        data['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id
    elif 'imageLinks' in _data:
        for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):
            if size in _data['imageLinks']:
                data['cover'] = _data['imageLinks'][size]
                break

    # Use the first ISBN identifier that to_isbn13() accepts, mirroring how
    # parse_entry() picks its 'isbn'. No 'isbn' key is set if none converts.
    for k in _data.get('industryIdentifiers', []):
        if not k['type'].startswith('ISBN'):
            logger.debug('unknown identifier %s', k)
            continue
        if 'isbn' not in data:
            isbn = to_isbn13(k['identifier'])
            if isbn:
                data['isbn'] = isbn

    if 'publisher' in data and isinstance(data['publisher'], str):
        data['publisher'] = [data['publisher']]
    if 'language' in _data:
        data['language'] = [get_language(_data['language'])]
    data = decode_html_data(data)
    return data
|
|
|
|
class Limit(object):
    """Token-bucket rate limiter with a temporary error lock-out.

    The bucket starts full with *capacity* tokens and is refilled at
    *fill_rate* tokens per second, never exceeding *capacity*.

    The ``error`` property is a flag that clears itself once it has been set
    for longer than ``error_timeout`` seconds.
    """

    # Seconds after which a raised error flag expires.
    error_timeout = 10 * 60

    # False, or the time() at which the error flag was raised.
    _error = False

    def __init__(self, fill_rate, capacity):
        self.timestamp = time()
        self.fill_rate = fill_rate
        self.capacity = capacity
        self._tokens = capacity

    def consume(self, tokens):
        """Take *tokens* from the bucket; return False if too few are left."""
        if tokens > self.tokens:
            return False
        self._tokens -= tokens
        return True

    def get_tokens(self):
        """Refill the bucket for the time elapsed and return its level."""
        now = time()
        if self._tokens < self.capacity:
            delta = self.fill_rate * (now - self.timestamp)
            self._tokens = min(self.capacity, self._tokens + delta)
        self.timestamp = now
        return self._tokens

    tokens = property(get_tokens)

    def get_error(self):
        """Return True while an error flag younger than error_timeout is set."""
        if self._error and self._error < (time() - self.error_timeout):
            self._error = False
        return self._error is not False

    def set_error(self, value):
        """Raise the error flag (truthy *value*) or clear it (falsy *value*)."""
        # Honour the assigned value: previously any assignment, including
        # ``limit.error = False``, raised the flag.
        self._error = time() if value else False

    error = property(get_error, set_error)
|
|
|
|
# Shared rate limiter consulted by info() before each Google Books API
# request: a bucket of 25 tokens refilled at 0.5 tokens per second.
api_limit = Limit(fill_rate=0.5, capacity=25)
|