openmedialibrary/oml/meta/openlibrary.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division

from urllib import urlencode
from ox.cache import read_url
import json

from marc_countries import COUNTRIES
from utils import normalize_isbn

# Maps field names of an openlibrary edition record (left) to the metadata
# keys used in the dicts this module returns (right). format() copies only
# the fields listed here.
KEYS = {
    'authors': 'author',
    'covers': 'cover',
    'dewey_decimal_class': 'classification',
    'isbn_10': 'isbn10',
    'isbn_13': 'isbn13',
    'languages': 'language',
    'lccn': 'lccn',
    'number_of_pages': 'pages',
    'oclc_numbers': 'oclc',
    'publish_country': 'country',
    'publish_date': 'date',
    'publishers': 'publisher',
    'publish_places': 'place',
    'series': 'series',
    'title': 'title',
}

def find(*args, **kargs):
    args = [a.replace(':', ' ') for a in args]
    for k in ('date', 'publisher'):
        if k in kargs:
            print 'ignoring %s on openlibrary' % k, kargs[k]
            del kargs[k]
    for k, v in kargs.iteritems():
        key = KEYS.keys()[KEYS.values().index(k)]
        if v:
            if not isinstance(v, list):
                v = [v]
            #v = ['%s:"%s"' % (key, value.replace(':', '\:')) for value in v]
            v = ['"%s"' % value.replace(':', ' ') for value in v]
            args += v
    query = ' '.join(args)
    query = query.strip()
    print 'openlibrary.find', query
    r = api.search(query)
    results = []
    ids = [b for b in r.get('result', []) if b.startswith('/books')]
    books = api.get_many(ids).get('result', [])
    for olid, value in books.iteritems():
        olid = olid.split('/')[-1]
        book = format(value)
        book['olid'] = olid
        results.append(book)
    return results


def get_ids(key, value):
    ids = []
    if key == 'olid':
        data = lookup(value, True)
        for id in ('isbn10', 'isbn13', 'lccn', 'oclc'):
            if id in data:
                for v in data[id]:
                    if (id, v) not in ids:
                        ids.append((id, v))
    elif key in ('isbn10', 'isbn13', 'oclc', 'lccn'):
        print 'openlibraryid.get_ids', key, value
        r = api.things({'type': '/type/edition', key.replace('isbn', 'isbn_'): value})
        for b in r.get('result', []):
            if b.startswith('/books'):
                olid = b.split('/')[-1]
                for kv in [('olid', olid)] + get_ids('olid', olid):
                    if kv not in ids:
                        ids.append(kv)
    if ids:
        print 'openlibraryid.get_ids', key, value
        print ids
    return ids

def lookup(id, return_all=False):
    #print 'openlibrary.lookup', id
    info = api.get('/books/' + id).get('result', {})
    #url = 'https://openlibrary.org/books/%s.json' % id
    #info = json.loads(read_url(url))
    data = format(info, return_all)
    data['olid'] = id
    print 'openlibrary.lookup', id, data.keys()
    return data

def format(info, return_all=False):
    """Convert a raw openlibrary record into a dict keyed by local names.

    Only fields listed in KEYS are copied, stored under their KEYS name:
    - authors and languages are replaced by the names of the referenced
      records (see resolve_names),
    - publish_country is stripped and translated through COUNTRIES when a
      translation exists,
    - covers becomes the cover image URL of the first cover id,
    - other list values are reduced to their first element unless
      return_all is true; publish_places always stays a list,
    - isbn_10 / isbn_13 values are passed through normalize_isbn.

    Fields whose list is empty are skipped where a first element is needed.
    """
    data = {}
    for key in KEYS:
        if key not in info:
            continue
        value = info[key]
        if key in ('authors', 'languages'):
            value = resolve_names(value)
        elif key == 'publish_country':
            value = value.strip()
            value = COUNTRIES.get(value, value)
        elif key == 'covers':
            if not value:
                # no cover id available, nothing to build a URL from
                continue
            value = 'https://covers.openlibrary.org/b/id/%s.jpg' % value[0]
        elif not return_all and isinstance(value, list) and key != 'publish_places':
            if not value:
                # empty list has no first element to reduce to
                continue
            value = value[0]
        if key in ('isbn_10', 'isbn_13'):
            if isinstance(value, list):
                value = [normalize_isbn(v) for v in value]
            else:
                value = normalize_isbn(value)
        data[KEYS[key]] = value
    return data

def resolve_names(objects, key='name'):
    """Return the `key` value of every record referenced in objects.

    objects is a list of dicts that each have a 'key' entry naming a
    record. The records are fetched with one get_many call; records of
    type '/type/redirect' are followed once via their 'location'.

    Values are returned in the order of objects, with duplicate references
    collapsed. Records that could not be fetched or that have no `key`
    entry are skipped.
    """
    keys = []
    for obj in objects:
        if obj['key'] not in keys:
            keys.append(obj['key'])
    if not keys:
        # nothing to resolve, avoid a pointless get_many request
        return []
    data = api.get_many(keys).get('result', {})
    # requested order first; records returned under a key that was not
    # requested are appended at the end
    ordered = []
    for k in keys + list(data):
        if k in data and k not in ordered:
            ordered.append(k)
    names = []
    for k in ordered:
        value = data[k]
        if 'location' in value and value.get('type', {}).get('key') == '/type/redirect':
            value = api.get(value['location']).get('result', {})
        if key in value:
            names.append(value[key])
    return names

class API(object):
    base = 'https://openlibrary.org/api'

    def _request(self, action, data):
        for key in data:
            if not isinstance(data[key], basestring):
                data[key] = json.dumps(data[key])
        url = self.base + '/' + action + '?' + urlencode(data)
        result = json.loads(read_url(url))
        if 'status' in result and result['status'] == 'error' or 'error' in result:
            print 'FAILED', action, data
            print 'URL', url
        return result

    def get(self, key):
        data = self._request('get', {'key': key})
        return data

    def get_many(self, keys):
        data = self._request('get_many', {'keys': keys})
        return data

    def search(self, query):
        if isinstance(query, basestring):
            query = {
                'query': query
            }
        data = self._request('search', {'q': query})
        if 'status' in data and data['status'] == 'error':
            print 'FAILED', query
        return data

    def things(self, query):
        data = self._request('things', {'query': query})
        return data

# Shared module-level client; find, get_ids, lookup and resolve_names all
# talk to openlibrary through this instance.
api = API()