2014-05-14 09:57:11 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
2014-09-02 22:32:44 +00:00
|
|
|
|
2014-05-14 09:57:11 +00:00
|
|
|
|
|
|
|
import re
|
2014-05-25 21:27:03 +00:00
|
|
|
import hashlib
|
2014-08-12 08:16:57 +00:00
|
|
|
|
|
|
|
from ox.cache import read_url
|
|
|
|
import lxml.html
|
2014-05-14 09:57:11 +00:00
|
|
|
import stdnum.isbn
|
|
|
|
|
2014-08-12 08:16:57 +00:00
|
|
|
from .utils import normalize_isbn
|
|
|
|
|
2014-05-17 14:26:59 +00:00
|
|
|
import logging
|
|
|
|
# Module-level logger, registered under the name 'meta.worldcat'.
logger = logging.getLogger('meta.worldcat')
|
|
|
|
|
2014-08-12 08:16:57 +00:00
|
|
|
|
2014-05-14 09:57:11 +00:00
|
|
|
# Root URL that the search and record URLs built in this module are based on.
base_url = 'http://www.worldcat.org'
|
|
|
|
|
|
|
|
def get_ids(key, value):
    """Return other identifiers WorldCat associates with an identifier.

    Args:
        key: Kind of identifier in ``value``. Only ``'isbn'`` and ``'oclc'``
            are handled; any other key yields an empty list without any
            network access.
        value: The identifier itself (an ISBN or an OCLC number).

    Returns:
        A list of ``(key, value)`` tuples. For an ISBN this is the OCLC
        number of the first search hit followed by every *other* ISBN found
        on that record. For an OCLC number it is every ISBN found on the
        record. The list is empty when nothing was found.
    """
    ids = []
    if key == 'isbn':
        url = '%s/search?qt=worldcat_org_bks&q=%s' % (base_url, value)
        html = read_url(url).decode('utf-8')
        # Only the first hit is used, so stop at the first match instead of
        # collecting every match on the page.
        match = re.search(r'/title.*?oclc/(\d+).*?"', html)
        if match:
            oclc = match.group(1)
            info = lookup(oclc)
            ids.append(('oclc', oclc))
            # Do not report the ISBN we were asked about back to the caller.
            ids.extend(
                ('isbn', isbn) for isbn in info.get('isbn', []) if isbn != value
            )
    elif key == 'oclc':
        info = lookup(value)
        # A distinct loop name keeps ``value`` intact for the log line below.
        ids.extend(('isbn', isbn) for isbn in info.get('isbn', []))
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids
|
|
|
|
|
|
|
|
# SHA-1 digests of cover images that are discarded rather than reported.
# NOTE(review): presumably WorldCat's generic "no cover" placeholders - confirm.
_IGNORED_COVER_SHA1 = (
    'd2e9ab0c87193d69a7d3a3c21ae4aa550f7dcf00',
    '70f16d3e077cdd47ef6b331001dbb1963677fa04',
)

_PLACE_PUBLISHER_DATE = re.compile(r'(.+) : (.+), (\d{4})')
_FOUR_DIGITS = re.compile(r'\d{4}')


def _parse_note_fields(text):
    """Parse the "Key: value" lines of the note's second section into a dict.

    Sections are separated by a blank line. Keys are lower-cased and values
    stripped. Returns an empty dict when the text is missing or has no second
    section; lines without a colon are skipped.
    """
    if not text:
        return {}
    sections = text.split('\n\n')
    if len(sections) < 2:
        return {}
    pairs = dict(
        line.split(':', 1) for line in sections[1].split('\n') if ':' in line
    )
    return {name.lower(): content.strip() for name, content in pairs.items()}


def _split_publisher(data):
    """Split data['publisher'] into 'place', 'publisher' and 'date' in place."""
    raw = data['publisher']
    match = _PLACE_PUBLISHER_DATE.search(raw)
    if match:
        place, publisher, date = match.groups()
        data['publisher'] = publisher
        data['date'] = date
        data['place'] = [place]
    elif ':' in raw:
        # Looser fallback: everything before the first colon is the place,
        # the publisher ends at the first comma, and the first four-digit
        # run after the colon (if any) is taken as the date.
        place, rest = raw.split(':', 1)
        data['place'] = [place.strip()]
        data['publisher'] = rest.split(',')[0].strip()
        year = _FOUR_DIGITS.search(rest)
        if year:
            data['date'] = year.group(0)


def lookup(id):
    """Fetch the WorldCat page for an OCLC number and extract its metadata.

    Args:
        id: OCLC number of the record. The name shadows the builtin but is
            kept so that existing keyword callers keep working.

    Returns:
        A dict that starts out as ``{'oclc': [id]}`` and is then filled from
        the page. Page fields are written after the initial entry and can
        overwrite it:

        * every element whose ``id`` contains ``bibtip``, keyed by that id
          with the ``bibtip_`` prefix removed;
        * the "Key: value" lines of the ``util-em-note`` textarea, keyed by
          the lower-cased key;
        * ``'isbn'``: list of unique valid ISBNs taken from the ``isxn``
          field;
        * ``'cover'``: cover image URL, unless the image matches one of the
          ignored digests;
        * ``'author'`` split into a list, ``'title'`` with `` : `` collapsed
          to ``: ``, and ``'publisher'`` split into ``'place'``,
          ``'publisher'`` and ``'date'`` where it can be parsed.
    """
    data = {
        'oclc': [id]
    }
    url = '%s/oclc/%s' % (base_url, id)
    html = read_url(url).decode('utf-8')
    doc = lxml.html.document_fromstring(html)

    for element in doc.xpath("//*[contains(@id, 'bibtip')]"):
        field = element.attrib['id'].replace('bibtip_', '')
        data[field] = element.text_content()

    # The note textarea is optional; when present its fields take precedence
    # over the bibtip values collected above.
    note = doc.xpath('//textarea[@id="util-em-note"]')
    if note:
        data.update(_parse_note_fields(note[0].text))

    # Fields that are dropped from the result.
    for field in ('id', 'instance', 'mediatype', 'reclist', 'shorttitle'):
        data.pop(field, None)

    if 'isxn' in data:
        for candidate in data.pop('isxn').split(' '):
            isbn = normalize_isbn(candidate)
            if stdnum.isbn.is_valid(isbn):
                known = data.setdefault('isbn', [])
                if isbn not in known:
                    known.append(isbn)

    cover = doc.xpath('//img[@class="cover"]')
    if cover:
        data['cover'] = cover[0].attrib['src']
        # Protocol-relative URLs need a scheme before they can be fetched.
        if data['cover'].startswith('//'):
            data['cover'] = 'http:' + data['cover']
        cdata = read_url(data['cover'])
        if hashlib.sha1(cdata).hexdigest() in _IGNORED_COVER_SHA1:
            del data['cover']

    if 'author' in data:
        data['author'] = data['author'].split('; ')
    if 'title' in data:
        data['title'] = data['title'].replace(' : ', ': ')
    if 'publisher' in data:
        _split_publisher(data)

    logger.debug('lookup %s => %s', id, list(data.keys()))
    return data
|
|
|
|
|
|
|
|
# Alias: `info` is the same callable as `lookup`.
# NOTE(review): presumably kept so callers can use either name - confirm.
info = lookup
|
|
|
|
|
|
|
|
def find(title, author, year):
    """Search by title, author and year; currently a stub.

    None of the arguments are used: every call returns a new empty list.
    """
    return list()
|
|
|
|
|