openmedialibrary/oml/meta/openlibrary.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4


from datetime import datetime
from urllib.parse import urlencode
import json

from ox.cache import read_url

from .dewey import get_classification
from .marc_countries import COUNTRIES
from .utils import normalize_isbn

import logging
logger = logging.getLogger(__name__)


KEYS = {
    'authors': 'author',
    'covers': 'cover',
    'dewey_decimal_class': 'classification',
    'isbn_10': 'isbn',
    'isbn_13': 'isbn',
    'lccn': 'lccn',
    'number_of_pages': 'pages',
    'languages': 'language',
    'oclc_numbers': 'oclc',
    'publish_country': 'country',
    'publish_date': 'date',
    'publishers': 'publisher',
    'publish_places': 'place',
    'series': 'series',
    'title': 'title',
}

def find(query):
    query = query.strip()
    logger.debug('find %s', query)
    r = api.search(query)
    results = []
    ids = [b for b in r.get('result', []) if b.startswith('/books')]
    books = api.get_many(ids).get('result', [])
    for olid, value in books.items():
        olid = olid.split('/')[-1]
        book = format(value)
        book['olid'] = [olid]
        book['primaryid'] = ['olid', olid]
        results.append(book)
    return results


def get_ids(key, value):
    ids = []
    if key == 'olid':
        data = lookup(value)
        for id in ('isbn', 'lccn', 'oclc'):
            if id in data:
                for v in data[id]:
                    if (id, v) not in ids:
                        ids.append((id, v))
    elif key in ('isbn', 'oclc', 'lccn'):
        logger.debug('get_ids %s %s', key, value)
        if key == 'isbn':
            key = 'isbn_%s'%len(value)
        r = api.things({'type': '/type/edition', key: value})
        for b in r.get('result', []):
            if b.startswith('/books'):
                olid = b.split('/')[-1]
                for kv in [('olid', olid)] + get_ids('olid', olid):
                    if kv not in ids:
                        ids.append(kv)
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids

def lookup(id, return_all=False):
    logger.debug('lookup %s', id)
    info = api.get('/books/' + id).get('result', {})
    #url = 'https://openlibrary.org/books/%s.json' % id
    #info = json.loads(read_url(url).decode('utf-8'))
    data = format(info, return_all)
    if 'olid' not in data:
        data['olid'] = []
    if id not in data['olid']:
        data['olid'] = [id]
    logger.debug('lookup %s => %s', id, list(data.keys()))
    return data

def get_type(obj):
    type_ = obj.get('type')
    if isinstance(type_, dict):
        type_ = type_['key']
    return type_

def parse_date(s):
    #"January 1, 1998"
    for pattern, fmt in (('%B %d, %Y', '%Y-%m-%d'), ('%B %Y', '%Y-%m')):
        try:
            d = datetime.strptime(s, pattern)
            s = d.strftime(fmt)
            return s
        except:
            pass
    return s

def format(info, return_all=False):
    data = {}
    if 'works' in info:
        work = api.get(info['works'][0]['key'])['result']
    else:
        work = None
    for key in KEYS:
        if key in info:
            value = info[key]
            if key == 'authors':
                if work:
                    value = resolve_names([r['author']
                        for r in work.get('authors', []) if get_type(r) == '/type/author_role'])
                else:
                    value = resolve_names(value)
            elif key == 'publish_country':
                value = value.strip()
                value = COUNTRIES.get(value, value)
            elif key == 'covers':
                value = 'https://covers.openlibrary.org/b/id/%s.jpg' % value[0]
            elif key == 'languages':
                value = resolve_names(value)
            elif key in ('isbn_10', 'isbn_13'):
                if not isinstance(value, list):
                    value = [value]
                value = list(map(normalize_isbn, value))
                if KEYS[key] in data:
                    value = data[KEYS[key]] + value
            elif isinstance(value, list) and key not in ('publish_places', 'lccn', 'oclc_numbers'):
                value = value[0]
            if key == 'publish_date':
                value = parse_date(value)
            data[KEYS[key]] = value
    if 'subtitle' in info:
        data['title'] += ' ' + info['subtitle']
    if 'classification' in data:
        value = data['classification']
        if isinstance(value, list):
            value = value[0]
        data['classification'] = get_classification(value.split('/')[0])
    return data

def resolve_names(objects, key='name'):
    r = []
    data = api.get_many([k['key'] for k in objects]).get('result', {})
    for k, value in data.items():
        if 'location' in value and value.get('type', {}).get('key') == '/type/redirect':
            value = api.get(value['location']).get('result', {})
        r.append(value[key])
    return r

class API(object):
    base = 'https://openlibrary.org/api'

    def _request(self, action, data, timeout=None):
        for key in data:
            if not isinstance(data[key], str):
                data[key] = json.dumps(data[key])
        url = self.base + '/' + action + '?' + urlencode(data)
        if timeout is None:
            r = read_url(url).decode('utf-8')
            if '504 Gateway Time-out' in r:
                r = read_url(url, timeout=-1).decode('utf-8')
            result = json.loads(r)
        else:
            r = read_url(url, timeout).decode('utf-8')
            if '504 Gateway Time-out' in r:
                r = read_url(url, timeout=-1).decode('utf-8')
            result = json.loads(r)
        if 'status' in result and result['status'] == 'error' or 'error' in result:
            logger.info('FAILED %s %s', action, data)
            logger.info('URL %s', url)
        return result

    def get(self, key):
        data = self._request('get', {'key': key})
        return data

    def get_many(self, keys):
        data = self._request('get_many', {'keys': keys})
        return data

    def search(self, query):
        if isinstance(query, str):
            query = {
                'query': query
            }
        data = self._request('search', {'q': query})
        if 'status' in data and data['status'] == 'error':
            logger.info('FAILED %s', query)
        return data

    def things(self, query):
        data = self._request('things', {'query': query})
        return data

api = API()