openmedialibrary/oml/meta/google.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

from time import time, sleep
from urllib.parse import urlencode
import re
from functools import partial

from ox.cache import get_json, store, read_url
import ox.web.google
import stdnum.isbn
from lxml import etree

from .utils import find_isbns, get_language, decode_html_data, to_isbn13
import settings

import logging
logger = logging.getLogger(__name__)

# XML namespace prefixes available to the XPath expressions in this module
# (presumably those declared by the Google Books volumes feed -- TODO confirm).
NAMESPACES = {
  'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
  'atom' : 'http://www.w3.org/2005/Atom',
  'dc'   : 'http://purl.org/dc/terms',
  'gbs'  : 'http://schemas.google.com/books/2008',
  'gd'   : 'http://schemas.google.com/g/2005'
}
# XPath factory with the prefixes above pre-bound, so expressions such as
# '//atom:entry' can be compiled without passing namespaces every time.
XPath = partial(etree.XPath, namespaces=NAMESPACES)

def find_(query):
    '''
    Search the web for query and return the ISBNs found in the results.

    ' isbn' is appended to the query, the query is passed to
    ox.web.google.find and ISBNs are extracted from the joined fields of
    every result.

    Returns a list of {'isbn': ..., 'primaryid': ['isbn', ...]} dicts in
    order of first appearance. An ISBN is skipped if it, or its
    ISBN-10/ISBN-13 counterpart, was already added.
    '''
    logger.debug('find %s', query)
    query += ' isbn'
    isbns = []
    for r in ox.web.google.find(query):
        isbns += find_isbns(' '.join(r))
    # the message needs a placeholder for its argument, otherwise the
    # logging module reports a formatting error instead of the list
    logger.debug('isbns %s', isbns)
    results = []
    done = set()
    for isbn in isbns:
        if isbn in done:
            continue
        results.append({
            'isbn': isbn,
            'primaryid': ['isbn', isbn]
        })
        done.add(isbn)
        # remember the other form too, so the same book is not listed twice
        if len(isbn) == 10:
            done.add(stdnum.isbn.to_isbn13(isbn))
        elif len(isbn) == 13 and isbn.startswith('978'):
            done.add(stdnum.isbn.to_isbn10(isbn))
    return results

def parse_entry(entry_):
    '''
    Extract book metadata from one feed entry element.

    Reads the atom:id, dc:* and gbs:viewability descendants of entry_ and
    returns the collected values after passing them through
    decode_html_data. Keys that are always set: title, description, date,
    categories, publisher, language. Keys set only when available: author,
    cover (a viewability element exists and is not view_no_pages), pages
    (first number in dc:format) and isbn (first dc:identifier starting with
    'ISBN:' that to_isbn13 accepts).

    Elements without text are ignored, so an empty element can not break
    the string joins below.
    '''
    def texts(name):
        # text of every matching descendant, skipping elements without text
        return [x.text for x in XPath('descendant::' + name)(entry_) if x.text]

    id_url = XPath('descendant::atom:id')(entry_)[0].text
    # the volume id is the last path segment of the entry id url
    _id = id_url.split('/')[-1]
    info = {}
    info['title'] = ': '.join(texts('dc:title')).strip()
    authors = [author.strip() for author in texts('dc:creator')]
    if authors:
        info['author'] = authors
    info['description'] = '\n\n'.join(texts('dc:description')).strip()
    info['date'] = ''.join(texts('dc:date')).strip()
    info['categories'] = texts('dc:subject')
    info['publisher'] = texts('dc:publisher')
    info['language'] = [get_language(code) for code in texts('dc:language')]
    v = XPath('descendant::gbs:viewability')(entry_)
    if v and v[0].attrib.get('value') != 'http://schemas.google.com/books/2008#view_no_pages':
        info['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id
    # the first number found in the format text is used as the page count
    pages = re.search(r'\d+', ''.join(texts('dc:format')))
    if pages:
        info['pages'] = int(pages.group())
    for t in texts('dc:identifier'):
        t = t.strip()
        if t[:5].upper() == 'ISBN:':
            isbn = to_isbn13(t[5:])
            if isbn:
                info['isbn'] = isbn
                break
    info = decode_html_data(info)
    return info

def find(title=None, author=None):
    '''
    Search the Google Books volumes feed for title and/or author.

    Both values are sent as one plain text query. Returns a list with the
    parse_entry result of every feed entry that has an ISBN, in feed order,
    at most one entry per ISBN.
    '''
    q = ' '.join(part for part in (title, author) if part)
    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
        'q': q.strip(),
        'max-results': 20,
        'start-index':1,
        'min-viewability':'none',
    })
    data = read_url(url)
    feed = etree.fromstring(data,
        parser=etree.XMLParser(recover=True, no_network=True))
    results = []
    isbns = set()
    for entry_ in XPath('//atom:entry')(feed):
        entry = parse_entry(entry_)
        isbn = entry.get('isbn')
        # compare the ISBN value itself, testing for the literal string
        # 'isbn' would never filter a duplicate
        if isbn and isbn not in isbns:
            results.append(entry)
            isbns.add(isbn)
    return results

def info(isbn):
    '''
    Look up one ISBN in the Google Books volumes feed.

    Returns the parse_entry result of the first feed entry with its 'isbn'
    key set to the requested isbn, or an empty dict if the feed has no
    entries.
    '''
    url = 'http://books.google.com/books/feeds/volumes?' + urlencode({
        # 'isbn:' is the search keyword that restricts the query to the ISBN
        'q': 'isbn:' + isbn,
        'max-results':1,
        'start-index':1,
        'min-viewability':'none',
    })
    data = read_url(url)
    feed = etree.fromstring(data,
        parser=etree.XMLParser(recover=True, no_network=True))
    entries = XPath('//atom:entry')(feed)
    if not entries:
        return {}
    result = parse_entry(entries[0])
    # report the ISBN that was asked for, not the one listed in the entry
    result['isbn'] = isbn
    return result

def info_newapi(value):
    '''
    Look up the ISBN value with the Google Books v1 JSON API.

    The request goes through api_limit: one token is consumed per request,
    sleeping in one second steps until a token is available.

    Returns a dict built from the first returned item (author, categories,
    description, pages, date, publisher, title, cover, isbn and language,
    each only when available) after passing it through decode_html_data,
    or an empty dict if the response has no items.

    Raises IOError if api_limit is in its error state or if the response
    contains an 'error' key; the latter also puts api_limit into its error
    state and removes the response from the cache.
    '''
    key = 'isbn'
    url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)
    api_key = settings.server.get('google_api_key')
    if api_key:
        url += '&key=' + api_key
    if api_limit.error:
        raise IOError(url)
    while not api_limit.consume(1):
        logger.debug('hitting google api to fast, waiting 1 second')
        sleep(1)
    r = get_json(url, timeout=-1)
    if 'error' in r:
        logger.debug('got google api error, dont call for 10 minutes')
        # do not keep the error response in the cache
        store.delete(url)
        api_limit.error = True
        raise IOError(url, r)
    # a missing and an empty item list both mean the ISBN is unknown
    if not r.get('items'):
        logger.debug('unknown %s: %s [%s]', key, value, r)
        return {}
    item = r['items'][0]
    _data = item['volumeInfo']
    _id = item['id']

    # api field name -> name used in the returned dict
    renamed = {
        'authors': 'author',
        'pageCount': 'pages',
        'publishedDate': 'date',
    }
    data = {}
    for field in (
            'authors',
            'categories',
            'description',
            'pageCount',
            'publishedDate',
            'publisher',
            'title',
        ):
        if field in _data:
            data[renamed.get(field, field)] = _data[field]

    subtitle = _data.get('subtitle') or ''
    if 'title' in _data and subtitle.strip():
        data['title'] = '{title}: {subtitle}'.format(**_data)

    # without viewability information fall back to the image links
    viewability = item.get('accessInfo', {}).get('viewability')
    if viewability is not None and viewability != 'NO_PAGES':
        data['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id
    elif 'imageLinks' in _data:
        # use the largest available image
        for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):
            if size in _data['imageLinks']:
                data['cover'] = _data['imageLinks'][size]
                break

    isbns = []
    for k in _data.get('industryIdentifiers', []):
        if k['type'].startswith('ISBN'):
            isbns.append(k['identifier'])
        else:
            logger.debug('unknown identifier %s', k)
    # use the first identifier that to_isbn13 accepts instead of blindly
    # taking the first one, which may not convert
    for candidate in isbns:
        isbn13 = to_isbn13(candidate)
        if isbn13:
            data['isbn'] = isbn13
            break

    if 'publisher' in data and isinstance(data['publisher'], str):
        data['publisher'] = [data['publisher']]
    if 'language' in _data:
        data['language'] = [get_language(_data['language'])]
    data = decode_html_data(data)
    return data

class Limit(object):
    '''
    Token bucket rate limiter with a temporary error state.

    The bucket starts with capacity tokens and is refilled with fill_rate
    tokens per second, never exceeding capacity. Setting error to a true
    value makes error read as True for error_timeout seconds; setting it
    to a false value clears it immediately.
    '''
    # False while no error is recorded, otherwise the time() it was set at
    _error = False
    # seconds until a recorded error expires
    error_timeout = 10 * 60

    def __init__(self, fill_rate, capacity):
        '''
        fill_rate: tokens added per second
        capacity: maximum (and initial) number of tokens
        '''
        self.timestamp = time()
        self.fill_rate = fill_rate
        self.capacity = capacity
        self._tokens = capacity

    def consume(self, tokens):
        '''Take tokens out of the bucket, return False if there are not enough.'''
        if tokens > self.tokens:
            return False
        self._tokens -= tokens
        return True

    def get_tokens(self):
        '''Refill the bucket for the time passed and return the token count.'''
        now = time()
        if self._tokens < self.capacity:
            refill = self.fill_rate * (now - self.timestamp)
            self._tokens = min(self.capacity, self._tokens + refill)
        self.timestamp = now
        return self._tokens
    tokens = property(get_tokens)

    def get_error(self):
        '''Return True while an error newer than error_timeout is recorded.'''
        if self._error and self._error < (time() - self.error_timeout):
            self._error = False
        return bool(self._error)

    def set_error(self, value):
        '''Record an error now if value is true, clear the error otherwise.'''
        self._error = time() if value else False
    error = property(get_error, set_error)

# Shared limiter for Google API requests: a burst of up to 25 requests,
# refilled at 0.5 tokens per second (one request every two seconds).
api_limit = Limit(fill_rate=0.5, capacity=25)
find 2014-05-16 08:06:11 +00:00			`# -- coding: utf-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`
port to python3 2014-09-02 22:32:44 +00:00
rate limit google requests 2016-01-23 12:36:40 +00:00			`from time import time, sleep`
use old google api 2016-02-02 19:30:40 +00:00			`from urllib.parse import urlencode`
			`import re`
			`from functools import partial`
find 2014-05-16 08:06:11 +00:00
use old google api 2016-02-02 19:30:40 +00:00			`from ox.cache import get_json, store, read_url`
find 2014-05-16 08:06:11 +00:00			`import ox.web.google`
			`import stdnum.isbn`
use old google api 2016-02-02 19:30:40 +00:00			`from lxml import etree`
find 2014-05-16 08:06:11 +00:00
store metadata per user. remove primaryid. only store isbn13 2016-01-11 13:43:54 +00:00			`from .utils import find_isbns, get_language, decode_html_data, to_isbn13`
optionally send api key 2016-01-25 11:08:57 +00:00			`import settings`
find 2014-05-16 08:06:11 +00:00
use python logging 2014-05-17 14:26:59 +00:00			`import logging`
use logging.getLogger(__name__) 2015-11-29 14:56:38 +00:00			`logger = logging.getLogger(__name__)`
use python logging 2014-05-17 14:26:59 +00:00
use old google api 2016-02-02 19:30:40 +00:00			`NAMESPACES = {`
			`'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',`
			`'atom' : 'http://www.w3.org/2005/Atom',`
			`'dc' : 'http://purl.org/dc/terms',`
			`'gbs' : 'http://schemas.google.com/books/2008',`
			`'gd' : 'http://schemas.google.com/g/2005'`
			`}`
			`XPath = partial(etree.XPath, namespaces=NAMESPACES)`
find 2014-05-16 08:06:11 +00:00
use old google api 2016-02-02 19:30:40 +00:00			`def find_(query):`
lots of stuff 2014-05-21 00:02:21 +00:00			`logger.debug('find %s', query)`
find 2014-05-16 08:06:11 +00:00			`query += ' isbn'`
			`isbns = []`
			`for r in ox.web.google.find(query):`
			`isbns += find_isbns(' '.join(r))`
use python logging 2014-05-17 14:26:59 +00:00			`logger.debug('isbns', isbns)`
find 2014-05-16 08:06:11 +00:00			`results = []`
			`done = set()`
			`for isbn in isbns:`
			`if isbn not in done:`
			`r = {`
lots of stuff 2014-05-21 00:02:21 +00:00			`'isbn': isbn,`
			`'primaryid': ['isbn', isbn]`
find 2014-05-16 08:06:11 +00:00			`}`
			`results.append(r)`
			`done.add(isbn)`
			`if len(isbn) == 10:`
			`done.add(stdnum.isbn.to_isbn13(isbn))`
lots of stuff 2014-05-21 00:02:21 +00:00			`if len(isbn) == 13 and isbn.startswith('978'):`
ddg 2014-05-17 09:19:32 +00:00			`done.add(stdnum.isbn.to_isbn10(isbn))`
find 2014-05-16 08:06:11 +00:00			`return results`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00
use old google api 2016-02-02 19:30:40 +00:00			`def parse_entry(entry_):`
			`entry_id = XPath('descendant::atom:id')`
			`creator = XPath('descendant::dc:creator')`
			`date = XPath('descendant::dc:date')`
			`description = XPath('descendant::dc:description')`
			`_format = XPath('descendant::dc:format')`
			`identifier = XPath('descendant::dc:identifier')`
			`language = XPath('descendant::dc:language')`
			`publisher = XPath('descendant::dc:publisher')`
			`subject = XPath('descendant::dc:subject')`
			`title = XPath('descendant::dc:title')`
			`viewability = XPath('descendant::gbs:viewability')`
			`id_url = entry_id(entry_)[0].text`
			`_id = id_url.split('/')[-1]`
			`info = {}`
			`info['title'] = ': '.join([x.text for x in title(entry_)]).strip()`
			`authors = [x.text.strip() for x in creator(entry_) if x.text]`
			`if authors:`
			`info['author'] = authors`
			`info['description'] = '\n\n'.join([x.text for x in description(entry_)]).strip()`
			`info['date'] = ''.join([x.text for x in date(entry_)]).strip()`
			`info['categories'] = [x.text for x in subject(entry_)]`
			`info['publisher'] = [x.text for x in publisher(entry_)]`
			`info['language'] = [get_language(x.text) for x in language(entry_)]`
			`v = viewability(entry_)`
			`if v and v[0].attrib.get('value') != 'http://schemas.google.com/books/2008#view_no_pages':`
			`info['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id`
			`format_ = ''.join([x.text for x in _format(entry_)])`
			`if format_:`
			`pages = re.compile('\d+').findall(format_)`
			`if pages:`
			`info['pages'] = int(pages[0])`
			`for x in identifier(entry_):`
			`t = str(x.text).strip()`
			`if t[:5].upper() == 'ISBN:':`
			`t = to_isbn13(t[5:])`
			`if t:`
			`info['isbn'] = t`
			`break`
			`info = decode_html_data(info)`
			`return info`

			`def find(title=None, author=None):`
			`'''`
			`parts = []`
			`if title:`
			`parts.append(' '.join(['intitle:%s' % p for p in title.split(' ')]))`
			`if author:`
			`parts.append(' '.join(['inauthor:%s' % p for p in author.split(' ')]))`
			`q = '+'.join(parts)`
			`'''`
			`q = ''`
			`if title:`
			`q += title + ' '`
			`if author:`
			`q += author`
			`url = 'http://books.google.com/books/feeds/volumes?' + urlencode({`
			`'q': q.strip(),`
			`'max-results': 20,`
			`'start-index':1,`
			`'min-viewability':'none',`
			`})`
			`data = read_url(url)`
			`feed = etree.fromstring(data,`
			`parser=etree.XMLParser(recover=True, no_network=True))`
			`results = []`
			`isbns = set()`
			`for entry_ in XPath('//atom:entry')(feed):`
			`info = parse_entry(entry_)`
			`if 'isbn' in info and not 'isbn' in isbns:`
			`results.append(info)`
			`isbns.add(info['isbn'])`
			`return results`

			`def info(isbn):`
			`url = 'http://books.google.com/books/feeds/volumes?' + urlencode({`
			`'q': 'isnb:' + isbn,`
			`'max-results':1,`
			`'start-index':1,`
			`'min-viewability':'none',`
			`})`
			`data = read_url(url)`
			`feed = etree.fromstring(data,`
			`parser=etree.XMLParser(recover=True, no_network=True))`
			`for entry_ in XPath('//atom:entry')(feed):`
			`info = parse_entry(entry_)`
			`info['isbn'] = isbn`
			`return info`
			`return {}`

			`def info_newapi(value):`
			`key = 'isbn'`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)`
optionally send api key 2016-01-25 11:08:57 +00:00			`api_key = settings.server.get('google_api_key')`
			`if api_key:`
			`url += '&key=' + api_key`
futher google api tuning 2016-01-23 13:26:36 +00:00			`if api_limit.error:`
			`raise IOError(url)`
rate limit google requests 2016-01-23 12:36:40 +00:00			`while not api_limit.consume(1):`
			`logger.debug('hitting google api to fast, waiting 1 second')`
			`sleep(1)`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`r = get_json(url, timeout=-1)`
			`if 'error' in r:`
futher google api tuning 2016-01-23 13:26:36 +00:00			`logger.debug('got google api error, dont call for 10 minutes')`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`store.delete(url)`
futher google api tuning 2016-01-23 13:26:36 +00:00			`api_limit.error = True`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`raise IOError(url, r)`
			`if not 'items' in r:`
catch more errors in debug log 2016-01-24 09:11:40 +00:00			`logger.debug('unknown %s: %s [%s]', key, value, r)`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`return {}`
			`_data = r['items'][0]['volumeInfo']`
better cover link 2016-01-09 14:09:01 +00:00			`_id = r['items'][0]['id']`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`data = {}`
			`for key in [`
			`'authors',`
store metadata per user. remove primaryid. only store isbn13 2016-01-11 13:43:54 +00:00			`'categories',`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`'description',`
			`'pageCount',`
			`'publishedDate',`
			`'publisher',`
			`'title',`
			`]:`
			`if key in _data:`
			`data[{`
			`'authors': 'author',`
			`'pageCount': 'pages',`
			`'publishedDate': 'date',`
			`}.get(key,key)] = _data[key]`

dont add empty subtitle 2016-01-09 10:41:04 +00:00			`if 'subtitle' in _data and _data['subtitle'].strip():`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`data['title'] = '{title}: {subtitle}'.format(**_data)`
			`if r['items'][0]['accessInfo']['viewability'] != 'NO_PAGES':`
better cover link 2016-01-09 14:09:01 +00:00			`#data['cover'] = 'https://books.google.com/books?id=%s&pg=PP1&img=1&zoom=0&hl=en' % _id`
			`data['cover'] = 'https://books.google.com/books/content/images/frontcover/%s?fife=w600-rw' % _id`

inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`elif 'imageLinks' in _data:`
			`for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):`
			`if size in _data['imageLinks']:`
			`data['cover'] = _data['imageLinks'][size]`
			`break`
			`if 'industryIdentifiers' in _data:`
			`for k in _data['industryIdentifiers']:`
			`if k['type'].startswith('ISBN'):`
			`if not 'isbn' in data:`
			`data['isbn'] = []`
			`data['isbn'].append(k['identifier'])`
			`else:`
catch more errors in debug log 2016-01-24 09:11:40 +00:00			`logger.debug('unknown identifier %s', k)`
store metadata per user. remove primaryid. only store isbn13 2016-01-11 13:43:54 +00:00			`if 'isbn' in data:`
			`data['isbn'] = [to_isbn13(i) for i in data['isbn']][0]`

fail 2016-01-07 10:12:48 +00:00			`if 'publisher' in data and isinstance(data['publisher'], str):`
			`data['publisher'] = [data['publisher']]`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`if 'language' in _data:`
fail 2016-01-07 10:12:48 +00:00			`data['language'] = [get_language(_data['language'])]`
import text not html 2016-01-08 10:22:07 +00:00			`data = decode_html_data(data)`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`return data`

rate limit google requests 2016-01-23 12:36:40 +00:00			`class Limit(object):`
futher google api tuning 2016-01-23 13:26:36 +00:00			`_error = False`
rate limit google requests 2016-01-23 12:36:40 +00:00
			`def __init__(self, fill_rate, capacity):`
			`self.timestamp = time()`
			`self.fill_rate = fill_rate`
			`self.capacity = capacity`
			`self._tokens = capacity`

			`def consume(self, tokens):`
			`if tokens <= self.tokens:`
			`self._tokens -= tokens`
			`else:`
			`return False`
			`return True`

			`def get_tokens(self):`
			`now = time()`
			`if self._tokens < self.capacity:`
			`delta = self.fill_rate * (now - self.timestamp)`
			`self._tokens = min(self.capacity, self._tokens + delta)`
			`self.timestamp = now`
			`return self._tokens`
			`tokens = property(get_tokens)`

futher google api tuning 2016-01-23 13:26:36 +00:00			`def get_error(self):`
			`if self._error and self._error < (time() - 10*60):`
			`self._error = False`
			`return self._error != False`

			`def set_error(self, value):`
			`self._error = time()`
			`error = property(get_error, set_error)`

			`api_limit = Limit(fill_rate=0.5, capacity=25)`