better metadata lookup

This commit is contained in:
j 2015-11-03 23:36:19 +01:00
parent 2298fe68b9
commit 7e37713c95
3 changed files with 20 additions and 10 deletions

View File

@ -21,6 +21,7 @@ providers = [
('openlibrary', 'olid'),
('loc', 'lccn'),
('worldcat', 'oclc'),
('worldcat', 'isbn'),
('lookupbyisbn', 'asin'),
('lookupbyisbn', 'isbn'),
('abebooks', 'isbn')
@ -36,23 +37,31 @@ def find(query):
'''
return results
def lookup_provider(arg):
    '''
    Collect the ids a single provider reports for a set of known ids.

    arg is a 5-tuple (provider, id, ids, key, value):
      provider - name of a provider module, resolved through globals()
      id       - the id key this provider is queried with
      ids      - iterable of (key, value) pairs known so far
      key, value - accepted for the tuple shape only; they are not used,
                   every pair is taken from ids instead

    Returns a set with everything the provider's get_ids(key, value)
    yielded for the pairs that were queried.
    '''
    provider, wanted_key, known_ids, _, _ = arg
    found = set()
    for id_key, id_value in known_ids:
        # Only query with ids whose key matches the provider's key;
        # openlibrary is the exception and is asked about every known id.
        if id_key != wanted_key and provider not in ('openlibrary', ):
            continue
        found.update(globals()[provider].get_ids(id_key, id_value))
    return found
def lookup(key, value):
if not isvalid_id(key, value):
return {}
data = {key: [value]}
ids = [(key, value)]
ids = set([(key, value)])
provider_data = {}
done = False
while not done:
done = True
for provider, id in providers:
for key, value in ids:
for kv in globals()[provider].get_ids(key, value):
if not kv in ids:
ids.append(kv)
done = False
result = lookup_provider((provider, id, ids, key, value))
done = not result - ids
ids.update(result)
logger.debug('FIXME: sort ids')
ids.sort(key=lambda i: ox.sort_string(''.join(i)))
ids = sorted(ids, key=lambda i: ox.sort_string(''.join(i)))
logger.debug('IDS %s', ids)
for k, v in ids:
for provider, id in providers:

View File

@ -15,7 +15,7 @@ base = 'http://www.abebooks.com'
def get_ids(key, value):
ids = []
if key == 'isbn':
url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, value)
data = read_url(url, unicode=True)
urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
if urls:

View File

@ -31,7 +31,7 @@ def get_ids(key, value):
m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
if m:
asin = m[0].split('/')[-3]
if not stdnum.isbn.is_valid(asin):
if stdnum.isbn.to_isbn10(asin) or not stdnum.isbn.is_valid(asin):
ids.append(('asin', asin))
if key == 'isbn':
add_other_isbn(value)
@ -89,5 +89,6 @@ def lookup(id):
return r
def amazon_lookup(asin):
    '''
    Look up the ISBNs listed on the Amazon product page of asin.

    The page at http://www.amazon.com/dp/<asin> is fetched, the part
    matching 'Formats</h3>.*?</table' is handed to find_isbns and the
    result is de-duplicated.

    Returns a list of the distinct values find_isbns produced; since it
    is built from a set, the order is unspecified.
    '''
    url = 'http://www.amazon.com/dp/%s' % asin
    # Fetch the page exactly once. A second, earlier read_url call without
    # the timeout argument used to run first and had its result thrown away.
    # NOTE(review): timeout=-1 is passed through to read_url unchanged;
    # presumably it controls cache expiry - confirm against read_url.
    html = read_url(url, timeout=-1).decode('utf-8', 'ignore')
    return list(set(find_isbns(find_re(html, 'Formats</h3>.*?</table'))))