openmedialibrary/oml/meta/worldcat.py

109 lines
3.3 KiB
Python
Raw Normal View History

2014-05-14 09:57:11 +00:00
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
2014-09-02 22:32:44 +00:00
2014-05-14 09:57:11 +00:00
import re
2014-05-25 21:27:03 +00:00
import hashlib
2014-08-12 08:16:57 +00:00
from ox.cache import read_url
import lxml.html
2014-05-14 09:57:11 +00:00
import stdnum.isbn
2014-08-12 08:16:57 +00:00
from .utils import normalize_isbn
2014-05-17 14:26:59 +00:00
import logging
# Module-level logger, registered under the name 'meta.worldcat'.
logger = logging.getLogger('meta.worldcat')
2014-08-12 08:16:57 +00:00
2014-05-14 09:57:11 +00:00
# Root URL that the search and record URLs in this module are built from.
base_url = 'http://www.worldcat.org'
def get_ids(key, value):
    """Return related identifiers for one identifier, as (key, value) pairs.

    For key == 'isbn' the WorldCat book search is queried with ``value``;
    the first OCLC number found in the result page is looked up and
    returned as ``('oclc', number)``, followed by an ``('isbn', x)`` pair
    for every ISBN of that record that differs from ``value``.

    For key == 'oclc' the record is looked up directly and every ISBN it
    lists is returned as ``('isbn', x)``.

    Any other key yields an empty list.
    """
    ids = []
    if key == 'isbn':
        url = '%s/search?qt=worldcat_org_bks&q=%s' % (base_url, value)
        html = read_url(url).decode('utf-8')
        # Raw string: '\d' in a plain literal is an invalid string escape.
        match = re.search(r'/title.*?oclc/(\d+).*?"', html)
        if match:
            oclc = match.group(1)
            # Look the record up first so a failing lookup adds nothing.
            info = lookup(oclc)
            ids.append(('oclc', oclc))
            ids.extend(
                ('isbn', isbn)
                for isbn in info.get('isbn', [])
                if isbn != value
            )
    elif key == 'oclc':
        info = lookup(value)
        # Use a separate loop name: rebinding ``value`` here made the debug
        # message below report the last ISBN instead of the OCLC number.
        ids.extend(('isbn', isbn) for isbn in info.get('isbn', []))
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids
# SHA-1 digests of cover images that are discarded instead of returned.
# NOTE(review): presumably WorldCat's generic "no cover" placeholders -- confirm.
_IGNORED_COVER_SHA1 = (
    'd2e9ab0c87193d69a7d3a3c21ae4aa550f7dcf00',
    '70f16d3e077cdd47ef6b331001dbb1963677fa04',
)


def _parse_note(text):
    """Parse the 'Key: value' lines of the second paragraph of ``text``.

    Paragraphs are separated by a blank line. Keys are lower-cased and
    values stripped. Returns an empty dict when ``text`` is empty or has
    no second paragraph; lines without a ':' are skipped.
    """
    if not text:
        return {}
    paragraphs = text.split('\n\n')
    if len(paragraphs) < 2:
        return {}
    fields = {}
    for line in paragraphs[1].split('\n'):
        name, sep, content = line.partition(':')
        if sep:
            fields[name.lower()] = content.strip()
    return fields


def _split_publisher(data):
    """Split data['publisher'] into 'place', 'publisher' and 'date' in place.

    Handles 'Place : Publisher, YYYY' and, as a fallback, any string that
    contains a ':' ('Place: Publisher[, ...]'); other strings are left
    untouched.
    """
    combined = data['publisher']
    match = re.search(r'(.+) : (.+), (\d{4})', combined)
    if match:
        place, publisher, date = match.groups()
        data['publisher'] = publisher
        data['date'] = date
        data['place'] = [place]
    elif ':' in combined:
        # maxsplit=1 always yields exactly two parts; the previous maxsplit
        # of 2 raised ValueError when the string contained a second colon.
        place, rest = combined.split(':', 1)
        data['place'] = [place.strip()]
        data['publisher'] = rest.split(',')[0].strip()
        year = re.search(r'\d{4}', rest)
        if year:
            data['date'] = year.group(0)


def lookup(id):
    """Fetch the WorldCat page for OCLC number ``id`` and return its metadata.

    The returned dict starts as ``{'oclc': [id]}`` and is then filled from:

    * every element whose ``id`` attribute contains 'bibtip' (key is that
      attribute with 'bibtip_' removed, value is the element's text),
    * the 'Key: value' lines in the ``util-em-note`` textarea, if present.

    The keys 'id', 'instance', 'mediatype', 'reclist' and 'shorttitle' are
    dropped. 'isxn' is replaced by 'isbn', a list of the valid, normalized,
    de-duplicated ISBNs it contained. 'cover' is the cover image URL, unless
    the image matches one of the ignored digests. 'author' becomes a list
    (split on '; '), ' : ' in 'title' becomes ': ', and 'publisher' is split
    into 'place' (a one-element list), 'publisher' and 'date' where possible.
    """
    data = {
        'oclc': [id]
    }
    url = '%s/oclc/%s' % (base_url, id)
    html = read_url(url).decode('utf-8')
    doc = lxml.html.document_fromstring(html)

    for element in doc.xpath("//*[contains(@id, 'bibtip')]"):
        data[element.attrib['id'].replace('bibtip_', '')] = element.text_content()

    note = doc.xpath('//textarea[@id="util-em-note"]')
    if note:
        data.update(_parse_note(note[0].text))

    for key in ('id', 'instance', 'mediatype', 'reclist', 'shorttitle'):
        data.pop(key, None)

    if 'isxn' in data:
        for candidate in data.pop('isxn').split(' '):
            isbn = normalize_isbn(candidate)
            if stdnum.isbn.is_valid(isbn):
                known = data.setdefault('isbn', [])
                if isbn not in known:
                    known.append(isbn)

    cover = doc.xpath('//img[@class="cover"]')
    if cover:
        src = cover[0].attrib['src']
        if src.startswith('//'):
            # Protocol-relative URL: give it an explicit scheme.
            src = 'http:' + src
        # Download the image so it can be compared by digest.
        if hashlib.sha1(read_url(src)).hexdigest() not in _IGNORED_COVER_SHA1:
            data['cover'] = src

    if 'author' in data:
        data['author'] = data['author'].split('; ')
    if 'title' in data:
        data['title'] = data['title'].replace(' : ', ': ')
    if 'publisher' in data:
        _split_publisher(data)

    logger.debug('lookup %s => %s', id, list(data.keys()))
    return data
# Alias: lookup() is also available under the module-level name `info`.
info = lookup
def find(title, author, year):
    """Search by title, author and year; this source returns no results.

    All three arguments are accepted and ignored. A new empty list is
    returned on every call.
    """
    no_results = []
    return no_results