2014-08-12 08:16:57 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
2014-09-02 22:32:44 +00:00
|
|
|
|
2014-08-12 08:16:57 +00:00
|
|
|
|
|
|
|
import re
|
|
|
|
|
2014-05-14 09:57:11 +00:00
|
|
|
from ox.cache import read_url
|
2014-05-14 18:46:31 +00:00
|
|
|
from ox import find_re, strip_tags, decode_html
|
|
|
|
import stdnum.isbn
|
2014-05-14 09:57:11 +00:00
|
|
|
|
2014-09-02 22:32:44 +00:00
|
|
|
from .utils import find_isbns
|
2014-05-21 00:02:21 +00:00
|
|
|
|
2014-05-17 14:26:59 +00:00
|
|
|
import logging
|
|
|
|
# Module-level logger; get_ids() and lookup() emit their debug output through it.
logger = logging.getLogger('meta.lookupbyisbn')
|
|
|
|
|
2014-08-12 08:16:57 +00:00
|
|
|
|
2014-05-14 09:57:11 +00:00
|
|
|
# Root URL of the lookupbyisbn.com site; search and lookup paths are appended to it.
base = 'http://www.lookupbyisbn.com'
|
|
|
|
|
|
|
|
def get_ids(key, value):
    """Collect identifiers related to ``value`` via lookupbyisbn.com.

    Only the keys ``'isbn'`` and ``'asin'`` are handled; any other key yields
    an empty list without touching the network.

    Args:
        key: kind of identifier passed in (``'isbn'`` or ``'asin'``).
        value: the identifier itself; it is inserted into the search URL.

    Returns:
        A list of ``(kind, identifier)`` tuples, where ``kind`` is ``'asin'``
        or ``'isbn'``. The list may be empty and may contain duplicates.
    """
    found = []

    def _append_counterpart(code):
        # Add the other ISBN form: 10 -> 13, and 978-prefixed 13 -> 10.
        size = len(code)
        if size == 10:
            found.append(('isbn', stdnum.isbn.to_isbn13(code)))
        if size == 13 and code.startswith('978'):
            found.append(('isbn', stdnum.isbn.to_isbn10(code)))

    if key not in ('isbn', 'asin'):
        return found

    # Search the site and take the identifier from the first result link.
    search_url = '%s/Search/Book/%s/1' % (base, value)
    page = read_url(search_url).decode('utf-8')
    links = re.findall('href="(/Lookup/Book/[^"]+?)"', page)
    if links:
        # Third path segment from the end of the first matching href.
        candidate = links[0].split('/')[-3]
        # A candidate that is itself a valid ISBN is not reported as an ASIN.
        if not stdnum.isbn.is_valid(candidate):
            found.append(('asin', candidate))

    if key == 'isbn':
        _append_counterpart(value)
    else:
        # key == 'asin'
        if stdnum.isbn.is_valid(value):
            # The given ASIN is itself a valid ISBN: report it in both forms.
            found.append(('isbn', value))
            _append_counterpart(value)
        else:
            # Otherwise fall back to the ISBNs found on the Amazon page.
            for isbn in amazon_lookup(value):
                if stdnum.isbn.is_valid(isbn):
                    found.append(('isbn', isbn))
                    _append_counterpart(isbn)

    if found:
        logger.debug('get_ids %s, %s => %s', key, value, found)
    return found
|
|
|
|
|
|
|
|
def lookup(id):
    """Scrape book metadata for ``id`` from lookupbyisbn.com.

    Requests ``<base>/Lookup/Book/<id>/<id>/1`` and extracts fields from the
    returned HTML with regular expressions.

    Args:
        id: identifier inserted (twice) into the lookup URL; it is also
            returned as the single entry of ``'asin'``.

    Returns:
        ``{}`` when the first ``<h2>`` of the page is ``'Error!'``. Otherwise
        a dict with ``'asin'`` (list), ``'title'``, ``'description'``,
        ``'cover'`` and ``'author'`` (list, possibly empty), plus whichever of
        ``'publisher'``, ``'date'``, ``'edition'``, ``'binding'``,
        ``'volume'`` and ``'pages'`` (int) could be extracted.
    """
    logger.debug('lookup %s', id)
    r = {
        'asin': [id]
    }
    url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
    data = read_url(url).decode('utf-8')
    r["title"] = find_re(data, "<h2>(.*?)</h2>")
    if r["title"] == 'Error!':
        return {}

    # Result key -> label shown on the page in front of the value.
    keys = {
        'author': 'Author(s)',
        'publisher': 'Publisher',
        'date': 'Publication date',
        'edition': 'Edition',
        'binding': 'Binding',
        'volume': 'Volume(s)',
        'pages': 'Pages',
    }
    for key, label in keys.items():
        value = find_re(
            data,
            '<span class="title">%s:</span>(.*?)</li>' % re.escape(label))
        # Empty matches and the '--' marker are treated as "no value".
        if not value or value == '--':
            continue
        if key == 'pages':
            try:
                value = int(value)
            except ValueError:
                # A non-numeric page count must not abort the whole lookup;
                # drop the field like any other missing value.
                logger.debug('lookup %s: ignoring invalid pages %r', id, value)
                continue
        r[key] = value

    # Raw string: '\/' is not a valid escape in a normal string literal.
    desc = find_re(data, r'<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
    r['description'] = decode_html(strip_tags(desc))
    # Removing the '._SL160_' marker from the cover URL (presumably a
    # thumbnail size suffix -- TODO confirm).
    r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')

    # Strip markup, decode entities and trim every string field.
    for key in r:
        if isinstance(r[key], str):
            r[key] = decode_html(strip_tags(r[key])).strip()

    # 'author' is always a list: one entry when found, otherwise empty.
    if 'author' in r and isinstance(r['author'], str) and r['author']:
        r['author'] = [r['author']]
    else:
        r['author'] = []

    # The site's "no description" notice is reported as an empty description.
    if r['description'].lower() == 'Description of this item is not available at this time.'.lower():
        r['description'] = ''
    return r
|
|
|
|
|
2014-05-21 00:02:21 +00:00
|
|
|
def amazon_lookup(asin):
    """Return the distinct ISBNs found on the Amazon product page for ``asin``.

    Fetches ``http://www.amazon.com/dp/<asin>``, cuts out the part of the page
    between ``Formats</h3>`` and the next ``</table`` and passes it to
    ``find_isbns``.

    Args:
        asin: identifier appended to the Amazon ``/dp/`` URL.

    Returns:
        A list of the unique values produced by ``find_isbns``; the order is
        not defined because duplicates are removed through a set.
    """
    page_url = 'http://www.amazon.com/dp/%s' % asin
    # Undecodable bytes are dropped rather than raising.
    markup = read_url(page_url).decode('utf-8', 'ignore')
    formats_section = find_re(markup, 'Formats</h3>.*?</table')
    unique_isbns = set(find_isbns(formats_section))
    return list(unique_isbns)
|