# -*- coding: utf-8 -*-
import logging
import re
from functools import partial
from time import time, sleep
from urllib.parse import urlencode

from lxml import etree
import stdnum.isbn

from ox.cache import get_json, store, read_url
import ox.web.google

from .utils import find_isbns, get_language, decode_html_data, to_isbn13
import settings

logger = logging.getLogger(__name__)

NAMESPACES = {
    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
    'atom': 'http://www.w3.org/2005/Atom',
    'dc': 'http://purl.org/dc/terms',
    'gbs': 'http://schemas.google.com/books/2008',
    'gd': 'http://schemas.google.com/g/2005'
}
XPath = partial(etree.XPath, namespaces=NAMESPACES)


def find_(query):
    logger.debug('find %s', query)
    query += ' isbn'
    isbns = []
    for r in ox.web.google.find(query):
        isbns += find_isbns(' '.join(r))
    logger.debug('isbns %s', isbns)
    results = []
    done = set()
    for isbn in isbns:
        if isbn not in done:
            r = {
                'isbn': isbn,
                'primaryid': ['isbn', isbn]
            }
            results.append(r)
            done.add(isbn)
            # also mark the equivalent ISBN-10/ISBN-13 form as seen,
            # so the same book is not returned twice
            if len(isbn) == 10:
                done.add(stdnum.isbn.to_isbn13(isbn))
            if len(isbn) == 13 and isbn.startswith('978'):
                done.add(stdnum.isbn.to_isbn10(isbn))
    return results


def parse_entry(entry_):
    entry_id = XPath('descendant::atom:id')
    creator = XPath('descendant::dc:creator')
    date = XPath('descendant::dc:date')
    description = XPath('descendant::dc:description')
    _format = XPath('descendant::dc:format')
    identifier = XPath('descendant::dc:identifier')
    language = XPath('descendant::dc:language')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    title = XPath('descendant::dc:title')
    viewability = XPath('descendant::gbs:viewability')

    id_url = entry_id(entry_)[0].text
    _id = id_url.split('/')[-1]
    info = {}
    info['title'] = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text and x.text.strip()]
    if authors:
        info['author'] = authors
    info['description'] = '\n\n'.join([x.text for x in description(entry_)]).strip()
    info['date'] = ''.join([x.text for x in date(entry_)]).strip()
    info['categories'] = [x.text for x in subject(entry_)]
    info['publisher'] = [x.text for x in publisher(entry_)]
    info['language'] = [get_language(x.text) for x in language(entry_)]
    v = viewability(entry_)
    if v and v[0].attrib.get('value') != 'http://schemas.google.com/books/2008#view_no_pages':
        info['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id
    format_ = ''.join([x.text for x in _format(entry_)])
    if format_:
        # dc:format looks like "123 pages"; take the first number
        pages = re.compile(r'\d+').findall(format_)
        if pages:
            info['pages'] = int(pages[0])
    for x in identifier(entry_):
        t = str(x.text).strip()
        if t[:5].upper() == 'ISBN:':
            t = to_isbn13(t[5:])
            if t:
                info['isbn'] = t
                break
    info = decode_html_data(info)
    return info


def find(title=None, author=None):
    '''
    parts = []
    if title:
        parts.append(' '.join(['intitle:%s' % p for p in title.split(' ')]))
    if author:
        parts.append(' '.join(['inauthor:%s' % p for p in author.split(' ')]))
    q = '+'.join(parts)
    '''
    q = ''
    if title:
        q += title + ' '
    if author:
        q += author
    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
        'q': q.strip(),
        'max-results': 20,
        'start-index': 1,
        'min-viewability': 'none',
    })
    data = read_url(url)
    feed = etree.fromstring(data, parser=etree.XMLParser(recover=True, no_network=True))
    results = []
    isbns = set()
    for entry_ in XPath('//atom:entry')(feed):
        info = parse_entry(entry_)
        if 'isbn' in info and info['isbn'] not in isbns:
            results.append(info)
            isbns.add(info['isbn'])
    return results


def info_old(isbn):
    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
        'q': 'isbn:' + isbn,
        'max-results': 1,
        'start-index': 1,
        'min-viewability': 'none',
    })
    data = read_url(url)
    feed = etree.fromstring(data, parser=etree.XMLParser(recover=True, no_network=True))
    for entry_ in XPath('//atom:entry')(feed):
        info = parse_entry(entry_)
        info['isbn'] = isbn
        return info
    return {}


def info(value):
    key = 'isbn'
    url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)
    api_key = settings.server.get('google_api_key')
    if api_key:
        url += '&key=' + api_key
    if api_limit.error:
        raise IOError(url)
    while not api_limit.consume(1):
        logger.debug('hitting google api too fast, waiting 1 second')
        sleep(1)
    r = get_json(url, timeout=-1)
    if 'error' in r:
        logger.debug('got google api error, not calling again for 10 minutes')
        store.delete(url)
        api_limit.error = True
        raise IOError(url, r)
    if 'items' not in r:
        logger.debug('unknown %s: %s [%s]', key, value, r)
        return {}
    _data = r['items'][0]['volumeInfo']
    _id = r['items'][0]['id']
    data = {}
    # map Google Books field names to local keys
    for key in [
        'authors',
        'categories',
        'description',
        'pageCount',
        'publishedDate',
        'publisher',
        'title',
    ]:
        if key in _data:
            data[{
                'authors': 'author',
                'pageCount': 'pages',
                'publishedDate': 'date',
            }.get(key, key)] = _data[key]
    if 'subtitle' in _data and _data['subtitle'].strip():
        data['title'] = '{title}: {subtitle}'.format(**_data)
    if r['items'][0]['accessInfo']['viewability'] != 'NO_PAGES':
        #data['cover'] = 'https://books.google.com/books?id=%s&pg=PP1&img=1&zoom=0&hl=en' % _id
        data['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id
    elif 'imageLinks' in _data:
        for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):
            if size in _data['imageLinks']:
                data['cover'] = _data['imageLinks'][size]
                break
    if 'industryIdentifiers' in _data:
        for k in _data['industryIdentifiers']:
            if k['type'].startswith('ISBN'):
                if 'isbn' not in data:
                    data['isbn'] = []
                data['isbn'].append(k['identifier'])
            else:
                logger.debug('unknown identifier %s', k)
        if 'isbn' in data:
            # normalize to a single ISBN-13
            data['isbn'] = [to_isbn13(i) for i in data['isbn']][0]
    if 'publisher' in data and isinstance(data['publisher'], str):
        data['publisher'] = [data['publisher']]
    if 'language' in _data:
        data['language'] = [get_language(_data['language'])]
    data = decode_html_data(data)
    return data


class Limit(object):
    '''Token bucket: accumulates fill_rate tokens per second, up to capacity.'''
    _error = False

    def __init__(self, fill_rate, capacity):
        self.timestamp = time()
        self.fill_rate = fill_rate
        self.capacity = capacity
        self._tokens = capacity

    def consume(self, tokens):
        if tokens <= self.tokens:
            self._tokens -= tokens
        else:
            return False
        return True

    def get_tokens(self):
        now = time()
        if self._tokens < self.capacity:
            delta = self.fill_rate * (now - self.timestamp)
            self._tokens = min(self.capacity, self._tokens + delta)
        self.timestamp = now
        return self._tokens
    tokens = property(get_tokens)

    def get_error(self):
        # the error state expires after 10 minutes
        if self._error and self._error < (time() - 10 * 60):
            self._error = False
        return self._error is not False

    def set_error(self, value):
        # the assigned value is ignored; setting the flag records the current time
        self._error = time()
    error = property(get_error, set_error)


api_limit = Limit(fill_rate=0.5, capacity=25)
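
# A minimal usage sketch, illustrative only: it assumes this file lives in a
# package as e.g. meta/google.py, that settings.server is configured, and that
# network access is available; the title/author/ISBN values below are made up.
#
#     from meta import google
#     for r in google.find(title='Ulysses', author='Joyce'):
#         print(r.get('isbn'), r.get('title'))
#     data = google.info('9780141182803')  # Books API lookup by ISBN-13
#
# Repeated info() calls are throttled by api_limit, which refills 0.5 tokens
# per second and allows bursts of up to 25 requests.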