openmedialibrary/oml/meta/google.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4


from ox.cache import get_json, store
import ox.web.google
import stdnum.isbn

from .utils import find_isbns, get_language, decode_html_data

import logging
logger = logging.getLogger(__name__)


def find(query):
    """Search Google for ``query`` and return candidate ISBN records.

    The string ' isbn' is appended to the query. The fields of every result
    yielded by ``ox.web.google.find`` are joined with spaces and scanned
    with ``find_isbns``.

    Returns:
        A list of dicts of the form
        ``{'isbn': isbn, 'primaryid': ['isbn', isbn]}`` in order of first
        appearance. An ISBN is skipped when it, or its ISBN-10/ISBN-13
        counterpart, has already been emitted.
    """
    logger.debug('find %s', query)
    query += ' isbn'
    isbns = []
    for r in ox.web.google.find(query):
        isbns += find_isbns(' '.join(r))
    # The format string needs a placeholder for the argument; without it
    # the logging module reports a formatting error instead of the list.
    logger.debug('isbns %s', isbns)
    results = []
    # Every ISBN already emitted, plus its converted 10/13 digit form, so
    # both spellings of one book collapse into a single result.
    done = set()
    for isbn in isbns:
        if isbn not in done:
            r = {
                'isbn': isbn,
                'primaryid': ['isbn', isbn]
            }
            results.append(r)
            done.add(isbn)
            if len(isbn) == 10:
                done.add(stdnum.isbn.to_isbn13(isbn))
            # Only 978-prefixed ISBN-13s are converted back to ISBN-10.
            if len(isbn) == 13 and isbn.startswith('978'):
                done.add(stdnum.isbn.to_isbn10(isbn))
    return results

def info(key, value):
    """Look up a book via the Google Books volumes API.

    Args:
        key: identifier type; must be 'isbn', 'lccn' or 'oclc'.
        value: identifier value; sent to the API as the query ``key:value``.

    Returns:
        A dict built from the first item of the response, passed through
        ``decode_html_data``. Depending on what the response provides it
        may hold 'author', 'description', 'pages', 'date', 'publisher'
        (list), 'title' (with ': subtitle' appended when a non-blank
        subtitle exists), 'cover' (URL), 'isbn' (list) and 'language'
        (list). An empty dict is returned when the response has no items.

    Raises:
        IOError: if ``key`` is not supported, or if the response contains
            an 'error' entry.
    """
    if key not in ('isbn', 'lccn', 'oclc'):
        raise IOError('unknown key %s' % key)
    url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)
    r = get_json(url, timeout=-1)
    if 'error' in r:
        # NOTE(review): presumably removes the cached error response so a
        # later call fetches again - confirm against ox.cache.
        store.delete(url)
        raise IOError(url, r)
    # Treat an empty 'items' list like a missing one instead of failing
    # with IndexError below.
    if 'items' not in r or not r['items']:
        logger.debug('unknown %s: %s [%s]', key, value, r)
        return {}
    item = r['items'][0]
    _data = item['volumeInfo']

    # Google field name -> local field name; unlisted fields keep their name.
    renamed = {
        'authors': 'author',
        'pageCount': 'pages',
        'publishedDate': 'date',
    }
    data = {}
    for field in (
        'authors',
        'description',
        'pageCount',
        'publishedDate',
        'publisher',
        'title',
    ):
        if field in _data:
            data[renamed.get(field, field)] = _data[field]

    # Append the subtitle only when it is non-blank and there is a title
    # to attach it to.
    subtitle = _data.get('subtitle')
    if subtitle and subtitle.strip() and 'title' in _data:
        data['title'] = '{title}: {subtitle}'.format(
            title=_data['title'], subtitle=subtitle)

    # Use the front-cover page image unless the volume has no viewable
    # pages; missing access info is treated as 'NO_PAGES' so the
    # imageLinks fallback is used.
    viewability = item.get('accessInfo', {}).get('viewability', 'NO_PAGES')
    if viewability != 'NO_PAGES':
        data['cover'] = 'https://books.google.com/books?id=%s&pg=PP1&img=1&zoom=0&hl=en' % item['id']
    elif 'imageLinks' in _data:
        # Take the first available size, largest first.
        for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):
            if size in _data['imageLinks']:
                data['cover'] = _data['imageLinks'][size]
                break

    for ident in _data.get('industryIdentifiers', []):
        if ident['type'].startswith('ISBN'):
            data.setdefault('isbn', []).append(ident['identifier'])
        else:
            logger.debug('unknown identifier %s', ident)

    # Publisher is always returned as a list.
    if 'publisher' in data and isinstance(data['publisher'], str):
        data['publisher'] = [data['publisher']]
    if 'language' in _data:
        data['language'] = [get_language(_data['language'])]
    data = decode_html_data(data)
    return data
find 2014-05-16 08:06:11 +00:00			`# -- coding: utf-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`
port to python3 2014-09-02 22:32:44 +00:00
find 2014-05-16 08:06:11 +00:00
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`from ox.cache import get_json, store`
find 2014-05-16 08:06:11 +00:00			`import ox.web.google`
			`import stdnum.isbn`

import text not html 2016-01-08 10:22:07 +00:00			`from .utils import find_isbns, get_language, decode_html_data`
find 2014-05-16 08:06:11 +00:00
use python logging 2014-05-17 14:26:59 +00:00			`import logging`
use logging.getLogger(__name__) 2015-11-29 14:56:38 +00:00			`logger = logging.getLogger(__name__)`
use python logging 2014-05-17 14:26:59 +00:00
find 2014-05-16 08:06:11 +00:00
lots of stuff 2014-05-21 00:02:21 +00:00			`def find(query):`
			`logger.debug('find %s', query)`
find 2014-05-16 08:06:11 +00:00			`query += ' isbn'`
			`isbns = []`
			`for r in ox.web.google.find(query):`
			`isbns += find_isbns(' '.join(r))`
use python logging 2014-05-17 14:26:59 +00:00			`logger.debug('isbns', isbns)`
find 2014-05-16 08:06:11 +00:00			`results = []`
			`done = set()`
			`for isbn in isbns:`
			`if isbn not in done:`
			`r = {`
lots of stuff 2014-05-21 00:02:21 +00:00			`'isbn': isbn,`
			`'primaryid': ['isbn', isbn]`
find 2014-05-16 08:06:11 +00:00			`}`
			`results.append(r)`
			`done.add(isbn)`
			`if len(isbn) == 10:`
			`done.add(stdnum.isbn.to_isbn13(isbn))`
lots of stuff 2014-05-21 00:02:21 +00:00			`if len(isbn) == 13 and isbn.startswith('978'):`
ddg 2014-05-17 09:19:32 +00:00			`done.add(stdnum.isbn.to_isbn10(isbn))`
find 2014-05-16 08:06:11 +00:00			`return results`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00
			`def info(key, value):`
			`if key not in ('isbn', 'lccn', 'oclc'):`
			`raise IOError('unknwon key %s' % key)`
			`url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)`
			`r = get_json(url, timeout=-1)`
			`if 'error' in r:`
			`store.delete(url)`
			`raise IOError(url, r)`
			`if not 'items' in r:`
fail 2016-01-07 10:12:48 +00:00			`print('unknown %s: %s [%s]' % (key, value, r))`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`return {}`
			`_data = r['items'][0]['volumeInfo']`
			`data = {}`
			`for key in [`
			`'authors',`
			`'description',`
			`'pageCount',`
			`'publishedDate',`
			`'publisher',`
			`'title',`
			`]:`
			`if key in _data:`
			`data[{`
			`'authors': 'author',`
			`'pageCount': 'pages',`
			`'publishedDate': 'date',`
			`}.get(key,key)] = _data[key]`

dont add empty subtitle 2016-01-09 10:41:04 +00:00			`if 'subtitle' in _data and _data['subtitle'].strip():`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`data['title'] = '{title}: {subtitle}'.format(**_data)`
			`if r['items'][0]['accessInfo']['viewability'] != 'NO_PAGES':`
			`data['cover'] = 'https://books.google.com/books?id=%s&pg=PP1&img=1&zoom=0&hl=en' % r['items'][0]['id']`
			`elif 'imageLinks' in _data:`
			`for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):`
			`if size in _data['imageLinks']:`
			`data['cover'] = _data['imageLinks'][size]`
			`break`
			`if 'industryIdentifiers' in _data:`
			`for k in _data['industryIdentifiers']:`
			`if k['type'].startswith('ISBN'):`
			`if not 'isbn' in data:`
			`data['isbn'] = []`
			`data['isbn'].append(k['identifier'])`
			`else:`
			`print('unknown identifier', k)`
fail 2016-01-07 10:12:48 +00:00			`if 'publisher' in data and isinstance(data['publisher'], str):`
			`data['publisher'] = [data['publisher']]`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`if 'language' in _data:`
fail 2016-01-07 10:12:48 +00:00			`data['language'] = [get_language(_data['language'])]`
import text not html 2016-01-08 10:22:07 +00:00			`data = decode_html_data(data)`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`return data`