2014-05-04 17:26:43 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
|
|
from __future__ import division
|
|
|
|
|
2014-05-14 18:46:31 +00:00
|
|
|
from urllib import urlencode
|
2014-05-04 17:26:43 +00:00
|
|
|
import json
|
2014-05-26 08:23:10 +00:00
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
from ox.cache import read_url
|
2014-05-04 17:26:43 +00:00
|
|
|
|
|
|
|
from marc_countries import COUNTRIES
|
2014-05-21 00:02:21 +00:00
|
|
|
from dewey import get_classification
|
2014-05-14 09:57:11 +00:00
|
|
|
from utils import normalize_isbn
|
2014-05-04 17:26:43 +00:00
|
|
|
|
2014-05-17 14:26:59 +00:00
|
|
|
import logging
|
|
|
|
# Module-level logger; every debug/info message in this file goes through it.
logger = logging.getLogger('meta.openlibrary')
|
|
|
|
|
2014-05-14 18:46:31 +00:00
|
|
|
# Maps Open Library edition field names (left) to the local metadata key
# (right). Both ISBN variants deliberately share the single local key 'isbn'.
KEYS = {
    # contributors and artwork
    'authors': 'author',
    'covers': 'cover',

    # classification and identifiers
    'dewey_decimal_class': 'classification',
    'isbn_10': 'isbn',
    'isbn_13': 'isbn',
    'lccn': 'lccn',

    # physical / linguistic description
    'number_of_pages': 'pages',
    'languages': 'language',
    'oclc_numbers': 'oclc',

    # publication details
    'publish_country': 'country',
    'publish_date': 'date',
    'publishers': 'publisher',
    'publish_places': 'place',

    # titles
    'series': 'series',
    'title': 'title',
}
|
|
|
|
|
2014-05-21 00:02:21 +00:00
|
|
|
def find(query):
    """Search Open Library for editions matching ``query``.

    ``query`` is stripped and handed to ``api.search``. Every returned key
    that starts with ``/books`` is fetched with a single ``api.get_many``
    call and converted with ``format``.

    Returns a list of metadata dicts. Each dict additionally carries
    ``olid`` (a one-element list holding the edition id) and ``primaryid``
    (``['olid', <edition id>]``). The list is empty when nothing matched.
    """
    query = query.strip()
    logger.debug('find %s', query)
    response = api.search(query)
    ids = [b for b in response.get('result', []) if b.startswith('/books')]
    if not ids:
        # No edition keys to fetch, so skip the get_many round trip.
        return []
    # The result is iterated as key -> record pairs below, so the fallback
    # for a response without 'result' has to be a dict, not a list.
    books = api.get_many(ids).get('result', {})
    results = []
    for olid, value in books.items():
        # Keys look like '/books/<olid>'; keep only the trailing id.
        olid = olid.split('/')[-1]
        book = format(value)
        book['olid'] = [olid]
        book['primaryid'] = ['olid', olid]
        results.append(book)
    return results
|
|
|
|
|
|
|
|
|
2014-05-14 09:57:11 +00:00
|
|
|
def get_ids(key, value):
    """Collect identifiers related to the identifier ``(key, value)``.

    * ``key == 'olid'``: the edition is fetched with ``lookup`` and every
      value found under its ``isbn``, ``lccn`` and ``oclc`` entries is
      returned.
    * ``key`` in ``('isbn', 'oclc', 'lccn')``: editions carrying that
      identifier are queried through ``api.things``; for each matching
      ``/books`` key the edition id plus everything the ``olid`` branch
      yields for it is returned.
    * any other ``key``: nothing is looked up.

    Returns a list of unique ``(name, value)`` tuples in discovery order.
    """
    ids = []
    if key == 'olid':
        data = lookup(value)
        for name in ('isbn', 'lccn', 'oclc'):
            for v in data.get(name, []):
                if (name, v) not in ids:
                    ids.append((name, v))
    elif key in ('isbn', 'oclc', 'lccn'):
        logger.debug('get_ids %s %s', key, value)
        # Translate the local identifier name into the Open Library edition
        # field to query. KEYS maps 'isbn_10'/'isbn_13' -> 'isbn' and
        # 'oclc_numbers' -> 'oclc', so those two need the reverse mapping;
        # 'lccn' is named the same on both sides.
        if key == 'isbn':
            field = 'isbn_%s' % len(value)
        elif key == 'oclc':
            field = 'oclc_numbers'
        else:
            field = key
        r = api.things({'type': '/type/edition', field: value})
        for b in r.get('result', []):
            if b.startswith('/books'):
                olid = b.split('/')[-1]
                for kv in [('olid', olid)] + get_ids('olid', olid):
                    if kv not in ids:
                        ids.append(kv)
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids
|
2014-05-04 17:26:43 +00:00
|
|
|
|
2014-05-14 09:57:11 +00:00
|
|
|
def lookup(id, return_all=False):
    """Fetch the edition ``id`` from Open Library and return its metadata.

    The record at ``/books/<id>`` is requested through ``api.get`` and run
    through ``format`` (``return_all`` is forwarded unchanged). The returned
    dict always ends up with an ``olid`` list that contains ``id``.
    """
    logger.debug('lookup %s', id)
    response = api.get('/books/' + id)
    record = response.get('result', {})
    data = format(record, return_all)
    # A missing 'olid' entry, or one that lacks this id, is replaced by [id].
    if id not in data.get('olid', []):
        data['olid'] = [id]
    logger.debug('lookup %s => %s', id, data.keys())
    return data
|
|
|
|
|
2014-05-21 11:07:41 +00:00
|
|
|
def get_type(obj):
    """Return the ``type`` of ``obj`` as a plain value.

    ``obj['type']`` may be given either directly or wrapped in a dict; in
    the wrapped case the dict's ``'key'`` entry is returned. ``None`` is
    returned when ``obj`` has no ``'type'`` entry.
    """
    kind = obj.get('type')
    return kind['key'] if isinstance(kind, dict) else kind
|
|
|
|
|
2014-05-26 08:23:10 +00:00
|
|
|
def parse_date(s):
    """Normalise an English long-form date string to ISO notation.

    Recognised inputs:

    * ``"January 1, 1998"`` -> ``"1998-01-01"``
    * ``"January 1998"``    -> ``"1998-01"``

    Anything else (including non-string values) is returned unchanged.
    """
    # (strptime pattern, number of leading characters of YYYY-MM-DD to keep)
    for pattern, length in (('%B %d, %Y', 10), ('%B %Y', 7)):
        try:
            d = datetime.strptime(s, pattern)
        except (TypeError, ValueError):
            # ValueError: s does not match this pattern.
            # TypeError: s is not a string at all.
            continue
        # isoformat() is used instead of strftime() because strftime()
        # rejects years before 1900 on Python 2, which would leave old
        # publication dates unparsed.
        return d.date().isoformat()[:length]
    return s
|
|
|
|
|
2014-05-14 18:46:31 +00:00
|
|
|
def format(info, return_all=False):
    """Convert a raw Open Library edition record into a local metadata dict.

    Every field of ``info`` that is listed in ``KEYS`` is stored under its
    local name, after field-specific handling:

    * ``authors``: resolved to names via ``resolve_names``; when the
      edition references a work that could be fetched, the work's
      ``/type/author_role`` entries are used instead of the edition's own
      author list.
    * ``publish_country``: stripped and translated through ``COUNTRIES``
      (unknown codes are kept as-is).
    * ``covers``: turned into a cover URL built from the first cover id.
    * ``languages``: resolved to names via ``resolve_names``.
    * ``isbn_10`` / ``isbn_13``: normalised with ``normalize_isbn`` and
      merged into one ``isbn`` list.
    * ``publish_places``, ``lccn``, ``oclc_numbers``: kept as lists.
    * any other list value: reduced to its first element.
    * ``publish_date``: passed through ``parse_date``.

    A ``subtitle`` is appended to the title, and a classification is passed
    through ``get_classification`` (only the part before the first ``/``).

    ``return_all`` is accepted for interface compatibility; this function
    does not use it.
    """
    data = {}
    works = info.get('works')
    # A failed work request has no 'result'; in that case (or when the
    # edition lists no works) fall back to the edition's own author list.
    work = api.get(works[0]['key']).get('result') if works else None
    for key in KEYS:
        if key not in info:
            continue
        value = info[key]
        if key == 'authors':
            if work:
                value = resolve_names([
                    r['author'] for r in work.get('authors', [])
                    if get_type(r) == '/type/author_role'
                ])
            else:
                value = resolve_names(value)
        elif key == 'publish_country':
            value = value.strip()
            value = COUNTRIES.get(value, value)
        elif key == 'covers':
            if not value:
                # empty cover list: nothing to build a URL from
                continue
            value = 'https://covers.openlibrary.org/b/id/%s.jpg' % value[0]
        elif key == 'languages':
            value = resolve_names(value)
        elif key in ('isbn_10', 'isbn_13'):
            if not isinstance(value, list):
                value = [value]
            # Build a real list (not a lazy map) so it can be concatenated
            # with the ISBNs already collected from the other ISBN field.
            value = [normalize_isbn(v) for v in value]
            if KEYS[key] in data:
                value = data[KEYS[key]] + value
        elif isinstance(value, list) and key not in ('publish_places', 'lccn', 'oclc_numbers'):
            if not value:
                # empty list: there is no first element to keep
                continue
            value = value[0]
        if key == 'publish_date':
            value = parse_date(value)
        data[KEYS[key]] = value
    # Only append the subtitle when a title was actually copied above.
    if 'subtitle' in info and 'title' in data:
        data['title'] += ' ' + info['subtitle']
    if 'classification' in data:
        value = data['classification']
        if isinstance(value, list):
            value = value[0]
        data['classification'] = get_classification(value.split('/')[0])
    return data
|
|
|
|
|
|
|
|
def resolve_names(objects, key='name'):
    """Fetch the records referenced by ``objects`` and return one field of each.

    objects -- sequence of dicts, each with a ``'key'`` entry naming the
               record to fetch
    key     -- field to read from every fetched record (default ``'name'``)

    All records are fetched with one ``api.get_many`` call. A record that
    is a redirect (``/type/redirect`` with a ``location``) is replaced by
    the record at its location. Values are returned in the order the keys
    were requested; records that lack ``key`` are skipped.
    """
    keys = [o['key'] for o in objects]
    if not keys:
        # nothing to resolve, so no request is needed
        return []
    data = api.get_many(keys).get('result', {})
    # Walk the result in request order (each key once) instead of dict
    # order, so e.g. author names keep the order given by the record.
    ordered = []
    seen = set()
    for k in keys:
        if k in data and k not in seen:
            seen.add(k)
            ordered.append(k)
    # Keep any returned entries that did not match a requested key verbatim.
    ordered.extend(k for k in data if k not in seen)
    names = []
    for k in ordered:
        value = data[k]
        # get_type copes with 'type' being either a plain value or a dict
        if 'location' in value and get_type(value) == '/type/redirect':
            value = api.get(value['location']).get('result', {})
        if key in value:
            names.append(value[key])
        else:
            logger.debug('resolve_names %s has no %s', k, key)
    return names
|
|
|
|
|
2014-05-14 18:46:31 +00:00
|
|
|
class API(object):
    """Small client for the JSON endpoints below ``base``."""

    base = 'https://openlibrary.org/api'

    def _request(self, action, data, timeout=None):
        """Call ``<base>/<action>`` with ``data`` as query string.

        Values of ``data`` that are not strings are JSON-encoded first.
        ``timeout`` is forwarded to ``read_url`` only when it is given.
        The decoded JSON response is returned as-is; error responses are
        logged, not raised.
        """
        # Serialise into a fresh dict so the caller's ``data`` stays untouched.
        params = {}
        for key, value in data.items():
            if not isinstance(value, basestring):
                value = json.dumps(value)
            params[key] = value
        url = self.base + '/' + action + '?' + urlencode(params)
        kwargs = {} if timeout is None else {'timeout': timeout}
        result = json.loads(read_url(url, **kwargs))
        if ('status' in result and result['status'] == 'error') or 'error' in result:
            logger.info('FAILED %s %s', action, params)
            logger.info('URL %s', url)
        return result

    def get(self, key):
        """Return the response of the ``get`` action for one record key."""
        data = self._request('get', {'key': key})
        return data

    def get_many(self, keys):
        """Return the response of the ``get_many`` action for a list of keys."""
        data = self._request('get_many', {'keys': keys})
        return data

    def search(self, query):
        """Return the response of the ``search`` action.

        A plain string is wrapped as ``{'query': <string>}``; any other
        value is sent unchanged.
        """
        if isinstance(query, basestring):
            query = {
                'query': query
            }
        data = self._request('search', {'q': query})
        if 'status' in data and data['status'] == 'error':
            logger.info('FAILED %s', query)
        return data

    def things(self, query):
        """Return the response of the ``things`` action for a query dict."""
        data = self._request('things', {'query': query})
        return data
|
2014-05-14 09:57:11 +00:00
|
|
|
|
2014-05-14 18:46:31 +00:00
|
|
|
# Shared client instance used by the module-level functions in this file.
api = API()
|