openmedialibrary/oml/meta/worldcat.py

72 lines
2.1 KiB
Python
Raw Normal View History

2014-05-14 09:57:11 +00:00
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division
from ox.cache import read_url
import lxml.html
import re
from utils import normalize_isbn
import stdnum.isbn
2014-05-17 14:26:59 +00:00
import logging
logger = logging.getLogger('meta.worldcat')
2014-05-14 09:57:11 +00:00
base_url = 'http://www.worldcat.org'
def get_ids(key, value):
ids = []
if key in ['isbn10', 'isbn13']:
url = '%s/search?qt=worldcat_org_bks&q=%s' % (base_url, value)
html = read_url(url)
matches = re.compile('/title.*?oclc/(\d+).*?"').findall(html)
if matches:
info = lookup(matches[0])
ids.append(('oclc', matches[0]))
for k in ['isbn10', 'isbn13']:
if k in info and k != key:
ids.append((k, info[k]))
elif key == 'oclc':
info = lookup(value)
for k in ['isbn10', 'isbn13']:
if k in info:
ids.append((k, info[k]))
if ids:
2014-05-17 14:26:59 +00:00
logger.debug('get_ids %s %s', key, value)
logger.debug('%s', ids)
2014-05-14 09:57:11 +00:00
return ids
def lookup(id):
data = {
'oclc': id
}
url = '%s/oclc/%s' % (base_url, id)
html = read_url(url).decode('utf-8')
doc = lxml.html.document_fromstring(html)
for e in doc.xpath("//*[contains(@id, 'bibtip')]"):
key = e.attrib['id'].replace('bibtip_', '')
value = e.text_content()
data[key] = value
info = doc.xpath('//textarea[@id="util-em-note"]')[0].text
info = dict([i.split(':', 1) for i in info.split('\n\n')[1].split('\n')])
for key in info:
k = key.lower()
data[k] = info[key].strip()
for key in ('id', 'instance', 'mediatype', 'reclist', 'shorttitle'):
if key in data:
del data[key]
if 'isxn' in data:
for isbn in data.pop('isxn').split(' '):
isbn = normalize_isbn(isbn)
if stdnum.isbn.is_valid(isbn):
data['isbn%d'%len(isbn)] = isbn
if 'author' in data:
data['author'] = [data['author']]
2014-05-17 14:26:59 +00:00
logger.debug('lookup %s => %s', id, data.keys())
2014-05-14 09:57:11 +00:00
return data
info = lookup
def find(title, author, year):
return []