openmedialibrary/oml/meta/google.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

from time import time, sleep
from urllib.parse import urlencode
import re
from functools import partial

from ox.cache import get_json, store, read_url
import ox.web.google
import stdnum.isbn
from lxml import etree

from .utils import find_isbns, get_language, decode_html_data, to_isbn13
import settings

import logging
logger = logging.getLogger(__name__)

# XML namespace prefixes used by the XPath expressions in this module.
NAMESPACES = {
    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
    'atom': 'http://www.w3.org/2005/Atom',
    'dc': 'http://purl.org/dc/terms',
    'gbs': 'http://schemas.google.com/books/2008',
    'gd': 'http://schemas.google.com/g/2005',
}

# etree.XPath factory with the prefixes above already bound.
XPath = partial(etree.XPath, namespaces=NAMESPACES)

def find_(query):
    '''
    Collect ISBNs from a web search for query.

    ' isbn' is appended to query, the search is run through
    ox.web.google.find and every ISBN that find_isbns extracts from the
    joined fields of each result is gathered.

    Returns a list of {'isbn': ..., 'primaryid': ['isbn', ...]} dicts in
    order of first appearance. Once an ISBN has been returned, its
    ISBN-10/ISBN-13 counterpart is skipped as a duplicate.
    '''
    logger.debug('find %s', query)
    query += ' isbn'
    isbns = []
    for r in ox.web.google.find(query):
        isbns += find_isbns(' '.join(r))
    logger.debug('isbns %s', isbns)
    results = []
    done = set()
    for isbn in isbns:
        if isbn in done:
            continue
        results.append({
            'isbn': isbn,
            'primaryid': ['isbn', isbn]
        })
        done.add(isbn)
        # also mark the other notation of the same ISBN as seen
        if len(isbn) == 10:
            done.add(stdnum.isbn.to_isbn13(isbn))
        if len(isbn) == 13 and isbn.startswith('978'):
            done.add(stdnum.isbn.to_isbn10(isbn))
    return results

def parse_entry(entry_):
    '''
    Extract book metadata from one atom:entry element.

    Returns a dict, passed through decode_html_data, that always has the
    keys title, description, date, categories, publisher and language,
    and has author, cover, pages and isbn only when the entry provides
    them:

    - cover is set when the entry has an atom:id and a gbs:viewability
      value other than '...#view_no_pages'
    - pages is the first number found in the dc:format text
    - isbn is the first 'ISBN:' identifier that to_isbn13 accepts

    Elements without text are skipped when building title, description,
    date and the format string.
    '''
    def elements(name):
        # all descendants of the entry with the given prefixed tag name
        return XPath('descendant::' + name)(entry_)

    def texts(name):
        # text of those descendants, ignoring elements without text
        return [x.text for x in elements(name) if x.text]

    # the volume id is the last path segment of the atom:id URL
    ids = elements('atom:id')
    id_url = ids[0].text if ids else None
    _id = id_url.split('/')[-1] if id_url else None

    info = {}
    info['title'] = ': '.join(texts('dc:title')).strip()
    authors = [t.strip() for t in texts('dc:creator')]
    if authors:
        info['author'] = authors
    info['description'] = '\n\n'.join(texts('dc:description')).strip()
    info['date'] = ''.join(texts('dc:date')).strip()
    info['categories'] = [x.text for x in elements('dc:subject')]
    info['publisher'] = [x.text for x in elements('dc:publisher')]
    info['language'] = [get_language(x.text) for x in elements('dc:language')]

    v = elements('gbs:viewability')
    if _id and v and v[0].attrib.get('value') != 'http://schemas.google.com/books/2008#view_no_pages':
        info['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id

    format_ = ''.join(texts('dc:format'))
    pages = re.search(r'\d+', format_)
    if pages:
        info['pages'] = int(pages.group(0))

    for x in elements('dc:identifier'):
        t = (x.text or '').strip()
        if t[:5].upper() == 'ISBN:':
            t = to_isbn13(t[5:])
            if t:
                info['isbn'] = t
                break
    info = decode_html_data(info)
    return info

def find(title=None, author=None):
    '''
    Search the Google Books volumes feed by title and/or author.

    title and author are sent as plain keywords in a single query
    (no intitle:/inauthor: operators) and up to 20 results are requested.

    Returns a list of parse_entry() dicts, one per distinct ISBN in feed
    order; entries without an ISBN are dropped.
    '''
    q = ' '.join(part for part in (title, author) if part)
    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
        'q': q.strip(),
        'max-results': 20,
        'start-index':1,
        'min-viewability':'none',
    })
    logger.debug('find url %s', url)
    data = read_url(url)
    feed = etree.fromstring(data,
        parser=etree.XMLParser(recover=True, no_network=True))
    results = []
    isbns = set()
    for entry_ in XPath('//atom:entry')(feed):
        entry = parse_entry(entry_)
        isbn = entry.get('isbn')
        # keep only the first entry for each ISBN
        if isbn is not None and isbn not in isbns:
            results.append(entry)
            isbns.add(isbn)
    return results

def info(isbn):
    '''
    Look up a single book by ISBN in the Google Books volumes feed.

    Returns the parse_entry() dict of the first entry in the feed with
    its 'isbn' key set to the requested isbn, or {} when the feed has no
    entries.
    '''
    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
        # 'isbn:' restricts the search to the ISBN field
        'q': 'isbn:' + isbn,
        'max-results':1,
        'start-index':1,
        'min-viewability':'none',
    })
    data = read_url(url)
    feed = etree.fromstring(data,
        parser=etree.XMLParser(recover=True, no_network=True))
    for entry_ in XPath('//atom:entry')(feed):
        entry = parse_entry(entry_)
        # report the ISBN that was asked for, not the one in the feed
        entry['isbn'] = isbn
        return entry
    return {}

def info_newapi(value):
    '''
    Look up a book by ISBN through the googleapis.com books/v1 endpoint.

    The 'google_api_key' from settings.server is appended to the request
    when set. Requests are throttled through api_limit: the call sleeps
    in one second steps until a token is available.

    Returns a dict, passed through decode_html_data, built from the first
    item of the response with the keys author, categories, description,
    pages, date, publisher (always a list), title (with subtitle
    appended), cover, isbn and language, each only when available.
    Returns {} when the response has no items.

    Raises IOError while api_limit is in its error state, and when the
    response contains an 'error' key; in that case the cached response is
    deleted and api_limit is put into its error state.
    '''
    key = 'isbn'
    url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)
    api_key = settings.server.get('google_api_key')
    if api_key:
        url += '&key=' + api_key
    if api_limit.error:
        raise IOError(url)
    while not api_limit.consume(1):
        logger.debug('hitting google api to fast, waiting 1 second')
        sleep(1)
    r = get_json(url, timeout=-1)
    if 'error' in r:
        logger.debug('got google api error, dont call for 10 minutes')
        # do not keep the error response in the cache
        store.delete(url)
        api_limit.error = True
        raise IOError(url, r)
    if 'items' not in r:
        logger.debug('unknown %s: %s [%s]', key, value, r)
        return {}
    item = r['items'][0]
    _data = item['volumeInfo']
    _id = item['id']

    # API field names that are stored under a different key
    renamed = {
        'authors': 'author',
        'pageCount': 'pages',
        'publishedDate': 'date',
    }
    data = {}
    for field in (
            'authors',
            'categories',
            'description',
            'pageCount',
            'publishedDate',
            'publisher',
            'title',
        ):
        if field in _data:
            data[renamed.get(field, field)] = _data[field]

    if 'subtitle' in _data and _data['subtitle'].strip():
        data['title'] = '{title}: {subtitle}'.format(**_data)

    # prefer the front cover image unless no pages are viewable (or the
    # viewability is not reported), then fall back to the largest imageLink
    viewability = item.get('accessInfo', {}).get('viewability')
    if viewability and viewability != 'NO_PAGES':
        data['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id
    elif 'imageLinks' in _data:
        for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):
            if size in _data['imageLinks']:
                data['cover'] = _data['imageLinks'][size]
                break

    candidates = []
    for k in _data.get('industryIdentifiers', []):
        if k['type'].startswith('ISBN'):
            candidates.append(k['identifier'])
        else:
            logger.debug('unknown identifier %s', k)
    # use the first identifier that to_isbn13 accepts
    for candidate in candidates:
        isbn = to_isbn13(candidate)
        if isbn:
            data['isbn'] = isbn
            break

    if 'publisher' in data and isinstance(data['publisher'], str):
        data['publisher'] = [data['publisher']]
    if 'language' in _data:
        data['language'] = [get_language(_data['language'])]
    data = decode_html_data(data)
    return data

class Limit(object):
    '''
    Token bucket rate limiter with a temporary error flag.

    The bucket starts full with capacity tokens and is refilled with
    fill_rate tokens per second, never above capacity.

    The error property can be set to a true value to mark a failure; it
    then reads True for error_timeout seconds and False afterwards.
    Setting it to a false value clears it immediately.
    '''
    # seconds the error flag stays set after it was raised
    error_timeout = 10 * 60
    # False, or the time() at which the error flag was set
    _error = False

    def __init__(self, fill_rate, capacity):
        self.timestamp = time()
        self.fill_rate = fill_rate
        self.capacity = capacity
        self._tokens = capacity

    def consume(self, tokens):
        '''Take tokens out of the bucket; return False if not enough are left.'''
        if tokens > self.tokens:
            return False
        self._tokens -= tokens
        return True

    def get_tokens(self):
        '''Refill the bucket for the time passed and return the token count.'''
        now = time()
        if self._tokens < self.capacity:
            delta = self.fill_rate * (now - self.timestamp)
            self._tokens = min(self.capacity, self._tokens + delta)
        self.timestamp = now
        return self._tokens
    tokens = property(get_tokens)

    def get_error(self):
        '''Return True while an error was flagged within error_timeout seconds.'''
        if self._error and self._error < (time() - self.error_timeout):
            self._error = False
        return self._error is not False

    def set_error(self, value):
        '''Flag an error now if value is true, otherwise clear the flag.'''
        self._error = time() if value else False
    error = property(get_error, set_error)

# Module-wide limiter consulted by info_newapi before each request:
# a bucket of at most 25 tokens, refilled at 0.5 tokens per second.
api_limit = Limit(fill_rate=0.5, capacity=25)