From 7e37713c95bcdb0630b7deab92142583977c817e Mon Sep 17 00:00:00 2001 From: j Date: Tue, 3 Nov 2015 23:36:19 +0100 Subject: [PATCH] better metadata lookup --- oml/meta/__init__.py | 23 ++++++++++++++++------- oml/meta/abebooks.py | 2 +- oml/meta/lookupbyisbn.py | 5 +++-- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/oml/meta/__init__.py b/oml/meta/__init__.py index 97805cb..f42fa29 100644 --- a/oml/meta/__init__.py +++ b/oml/meta/__init__.py @@ -21,6 +21,7 @@ providers = [ ('openlibrary', 'olid'), ('loc', 'lccn'), ('worldcat', 'oclc'), + ('worldcat', 'isbn'), ('lookupbyisbn', 'asin'), ('lookupbyisbn', 'isbn'), ('abebooks', 'isbn') @@ -36,23 +37,31 @@ def find(query): ''' return results +def lookup_provider(arg): + provider, id, ids, key, value = arg + values = set() + for key, value in ids: + if key == id or provider in ('openlibrary', ): + for kv in globals()[provider].get_ids(key, value): + values.add(kv) + return values + def lookup(key, value): if not isvalid_id(key, value): return {} data = {key: [value]} - ids = [(key, value)] + ids = set([(key, value)]) provider_data = {} done = False + while not done: done = True for provider, id in providers: - for key, value in ids: - for kv in globals()[provider].get_ids(key, value): - if not kv in ids: - ids.append(kv) - done = False + result = lookup_provider((provider, id, ids, key, value)) + done = not result - ids + ids.update(result) logger.debug('FIXME: sort ids') - ids.sort(key=lambda i: ox.sort_string(''.join(i))) + ids = sorted(ids, key=lambda i: ox.sort_string(''.join(i))) logger.debug('IDS %s', ids) for k, v in ids: for provider, id in providers: diff --git a/oml/meta/abebooks.py b/oml/meta/abebooks.py index ebb1352..b7054f6 100644 --- a/oml/meta/abebooks.py +++ b/oml/meta/abebooks.py @@ -15,7 +15,7 @@ base = 'http://www.abebooks.com' def get_ids(key, value): ids = [] if key == 'isbn': - url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id) + url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, value) data = read_url(url, unicode=True) urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data) if urls: diff --git a/oml/meta/lookupbyisbn.py b/oml/meta/lookupbyisbn.py index 2eb5365..2a38174 100644 --- a/oml/meta/lookupbyisbn.py +++ b/oml/meta/lookupbyisbn.py @@ -31,7 +31,7 @@ def get_ids(key, value): m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data) if m: asin = m[0].split('/')[-3] - if not stdnum.isbn.is_valid(asin): + if stdnum.isbn.to_isbn10(asin) or not stdnum.isbn.is_valid(asin): ids.append(('asin', asin)) if key == 'isbn': add_other_isbn(value) @@ -89,5 +89,6 @@ def lookup(id): return r def amazon_lookup(asin): - html = read_url('http://www.amazon.com/dp/%s' % asin).decode('utf-8', 'ignore') + url = 'http://www.amazon.com/dp/%s' % asin + html = read_url(url, timeout=-1).decode('utf-8', 'ignore') return list(set(find_isbns(find_re(html, 'Formats.*?