# Source: openmedialibrary/oml/meta/worldcat.py
# (repository viewer metadata: 109 lines, 3.3 KiB, Python)

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import hashlib
from ox.cache import read_url
import lxml.html
import stdnum.isbn
from .utils import normalize_isbn
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Root URL from which the search and record URLs in this module are built.
base_url = 'http://www.worldcat.org'
def get_ids(key, value):
    """Return related identifiers found on WorldCat for one identifier.

    Parameters:
        key: identifier type; only 'isbn' and 'oclc' are handled, any
            other key yields an empty list without any network access.
        value: the identifier itself (an ISBN or an OCLC number).

    Returns:
        A list of (type, value) tuples, where type is 'oclc' or 'isbn'.
        For an ISBN query the first search hit contributes its OCLC
        number plus every ISBN of that record except the queried one.
        For an OCLC query all ISBNs of the record are returned.
    """
    ids = []
    if key == 'isbn':
        url = '%s/search?qt=worldcat_org_bks&q=%s' % (base_url, value)
        html = read_url(url).decode('utf-8')
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        matches = re.compile(r'/title.*?oclc/(\d+).*?"').findall(html)
        if matches:
            # Only the first search hit is looked up.
            oclc = matches[0]
            record = lookup(oclc)
            ids.append(('oclc', oclc))
            ids.extend(
                ('isbn', isbn)
                for isbn in record.get('isbn', [])
                if isbn != value
            )
    elif key == 'oclc':
        record = lookup(value)
        # A separate loop name keeps `value` intact, so the debug message
        # below reports the id that was actually queried.
        ids.extend(('isbn', isbn) for isbn in record.get('isbn', []))
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids
def lookup(id):
    """Fetch the WorldCat page for an OCLC number and extract its metadata.

    Parameters:
        id: OCLC number; it is interpolated into '<base_url>/oclc/<id>'.

    Returns:
        A dict that always contains 'oclc' ([id]). Further keys come from
        the page: one entry per element whose id contains 'bibtip' and one
        per 'Key: value' line in the 'util-em-note' textarea (keys
        lower-cased). Post-processing may add or rewrite:
            'isbn'      list of valid, de-duplicated ISBNs (from 'isxn')
            'cover'     cover image URL
            'author'    list of names split on '; '
            'title'     title with ' : ' collapsed to ': '
            'publisher', 'place' (list), 'date' (four-digit string)

    Errors raised by read_url or by the HTML parser are not caught here.
    """
    data = {'oclc': [id]}
    url = '%s/oclc/%s' % (base_url, id)
    html = read_url(url).decode('utf-8')
    doc = lxml.html.document_fromstring(html)

    # Elements with ids like 'bibtip_<name>' become data['<name>'].
    for element in doc.xpath("//*[contains(@id, 'bibtip')]"):
        name = element.attrib['id'].replace('bibtip_', '')
        data[name] = element.text_content()

    # The textarea holds blank-line separated sections; the second one is
    # read as 'Key: value' lines. An empty textarea, a missing second
    # section and lines without a colon are skipped instead of raising.
    note = doc.xpath('//textarea[@id="util-em-note"]')
    if note:
        sections = (note[0].text or '').split('\n\n')
        if len(sections) > 1:
            fields = dict(
                line.split(':', 1)
                for line in sections[1].split('\n')
                if ':' in line
            )
            for name, text in fields.items():
                data[name.lower()] = text.strip()

    # Drop page-internal fields that are not part of the returned metadata.
    for key in ('id', 'instance', 'mediatype', 'reclist', 'shorttitle'):
        data.pop(key, None)

    # 'isxn' is a space separated list; keep only valid ISBNs, once each.
    if 'isxn' in data:
        for candidate in data.pop('isxn').split(' '):
            isbn = normalize_isbn(candidate)
            if stdnum.isbn.is_valid(isbn):
                isbns = data.setdefault('isbn', [])
                if isbn not in isbns:
                    isbns.append(isbn)

    cover = doc.xpath('//img[@class="cover"]')
    if cover:
        src = cover[0].attrib['src']
        if src.startswith('//'):
            # Protocol-relative URL: make it absolute.
            src = 'http:' + src
        # The image is downloaded and hashed; covers matching these two
        # SHA-1 digests are discarded.
        # NOTE(review): presumably generic "no cover" placeholders - confirm.
        digest = hashlib.sha1(read_url(src)).hexdigest()
        if digest not in (
            'd2e9ab0c87193d69a7d3a3c21ae4aa550f7dcf00',
            '70f16d3e077cdd47ef6b331001dbb1963677fa04',
        ):
            data['cover'] = src

    if 'author' in data:
        data['author'] = data['author'].split('; ')
    if 'title' in data:
        data['title'] = data['title'].replace(' : ', ': ')

    if 'publisher' in data:
        raw = data['publisher']
        # Preferred form: 'Place : Publisher, YYYY'.
        m = re.findall(r'(.+) : (.+), (\d{4})', raw)
        if m:
            place, publisher, date = m[0]
            data['publisher'] = publisher
            data['date'] = date
            data['place'] = [place]
        elif ':' in raw:
            # Looser form: 'Place:Publisher[, ...]' with an optional year
            # somewhere after the colon.
            place, publisher = raw.split(':', 1)
            data['place'] = [place.strip()]
            data['publisher'] = publisher.split(',')[0].strip()
            years = re.findall(r'\d{4}', publisher)
            if years:
                data['date'] = years[0]

    logger.debug('lookup %s => %s', id, list(data.keys()))
    return data
# Alias: lookup() is also available under the name info.
info = lookup
def find(title, author, year):
    """Search by title, author and year; currently a stub.

    The arguments are accepted but not used, and no request is made.

    Returns:
        Always an empty list.
    """
    return []