better metadata lookup

j 2015-11-03 23:36:19 +01:00
parent 2298fe68b9
commit 7e37713c95
3 changed files with 20 additions and 10 deletions


@@ -21,6 +21,7 @@ providers = [
     ('openlibrary', 'olid'),
     ('loc', 'lccn'),
     ('worldcat', 'oclc'),
+    ('worldcat', 'isbn'),
     ('lookupbyisbn', 'asin'),
     ('lookupbyisbn', 'isbn'),
     ('abebooks', 'isbn')
@@ -36,23 +37,31 @@ def find(query):
     '''
     return results

+def lookup_provider(arg):
+    provider, id, ids, key, value = arg
+    values = set()
+    for key, value in ids:
+        if key == id or provider in ('openlibrary', ):
+            for kv in globals()[provider].get_ids(key, value):
+                values.add(kv)
+    return values
+
 def lookup(key, value):
     if not isvalid_id(key, value):
         return {}
     data = {key: [value]}
-    ids = [(key, value)]
+    ids = set([(key, value)])
     provider_data = {}
     done = False
     while not done:
         done = True
         for provider, id in providers:
-            for key, value in ids:
-                for kv in globals()[provider].get_ids(key, value):
-                    if not kv in ids:
-                        ids.append(kv)
-                        done = False
+            result = lookup_provider((provider, id, ids, key, value))
+            done = not result - ids
+            ids.update(result)
     logger.debug('FIXME: sort ids')
-    ids.sort(key=lambda i: ox.sort_string(''.join(i)))
+    ids = sorted(ids, key=lambda i: ox.sort_string(''.join(i)))
     logger.debug('IDS %s', ids)
     for k, v in ids:
         for provider, id in providers:
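
Note on the refactor above: the nested id-collection loop moves into lookup_provider, which takes a single tuple argument (a shape that would also suit handing the per-provider calls to something like multiprocessing.Pool.map later), and ids becomes a set so membership checks and deduplication come for free. Below is a minimal, self-contained sketch of the same fixed-point iteration, with made-up in-memory stubs standing in for the real provider modules; unlike the line `done = not result - ids` above, the sketch only clears the flag inside the provider loop, so progress found by an earlier provider cannot be overwritten by a later one:

# Minimal sketch of the convergence loop with fake in-memory providers;
# names and identifiers below are purely illustrative.
STUB_DATA = {
    'provider_a': {('isbn', '9780262510875'): {('oclc', '76792372')}},
    'provider_b': {('oclc', '76792372'): {('asin', '0262510871')}},
}
providers = [('provider_a', 'isbn'), ('provider_b', 'oclc')]

def lookup_provider(arg):
    # Same tuple-argument shape as in the commit; the stub just looks the
    # known ids up in a dict instead of scraping a website.
    provider, id, ids, key, value = arg
    values = set()
    for key, value in ids:
        if key == id:
            values.update(STUB_DATA[provider].get((key, value), set()))
    return values

def lookup(key, value):
    ids = set([(key, value)])
    done = False
    while not done:
        done = True
        for provider, id in providers:
            result = lookup_provider((provider, id, ids, key, value))
            if result - ids:     # any identifier we have not seen before?
                done = False     # keep iterating until a fixed point is reached
            ids.update(result)
    return sorted(ids)

print(lookup('isbn', '9780262510875'))
# [('asin', '0262510871'), ('isbn', '9780262510875'), ('oclc', '76792372')]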


@@ -15,7 +15,7 @@ base = 'http://www.abebooks.com'
 def get_ids(key, value):
     ids = []
     if key == 'isbn':
-        url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
+        url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, value)
         data = read_url(url, unicode=True)
         urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
         if urls:
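
The one-word fix above matters because, inside get_ids(key, value), the name id resolves to Python's builtin function, so the old code silently built a junk query string instead of raising. A quick illustration (the sample ISBN is arbitrary):

base = 'http://www.abebooks.com'

# "id" is the builtin function here, so the old formatting produced a
# nonsense search URL rather than an error:
print('%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id))
# http://www.abebooks.com/servlet/SearchResults?isbn=<built-in function id>&sts=t

# With the actual value the provider gets a usable query:
print('%s/servlet/SearchResults?isbn=%s&sts=t' % (base, '9780262510875'))
# http://www.abebooks.com/servlet/SearchResults?isbn=9780262510875&sts=t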


@@ -31,7 +31,7 @@ def get_ids(key, value):
         m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
         if m:
             asin = m[0].split('/')[-3]
-            if not stdnum.isbn.is_valid(asin):
+            if stdnum.isbn.to_isbn10(asin) or not stdnum.isbn.is_valid(asin):
                 ids.append(('asin', asin))
     if key == 'isbn':
         add_other_isbn(value)
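
Context for the loosened condition above: Amazon reuses ISBN-10s as the ASINs of printed books, so an identifier scraped from these URLs can be a checksum-valid ISBN-10 as well as an opaque "B00..." code, and the old check only kept the latter. A small illustration with stdnum.isbn.is_valid (the sample numbers are arbitrary but checksum-valid; this is not the same check the code performs with to_isbn10):

import stdnum.isbn

# Amazon's ASINs for printed books are ISBN-10s; Kindle and non-book items
# get opaque codes instead.  Sample values are arbitrary.
print(stdnum.isbn.is_valid('0262510871'))     # True  - ISBN-10, usable as both isbn and asin
print(stdnum.isbn.is_valid('B00005N5PF'))     # False - Amazon-only identifier
print(stdnum.isbn.is_valid('9780262510875'))  # True  - ISBN-13 form of the same book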
@@ -89,5 +89,6 @@ def lookup(id):
     return r

 def amazon_lookup(asin):
-    html = read_url('http://www.amazon.com/dp/%s' % asin).decode('utf-8', 'ignore')
+    url = 'http://www.amazon.com/dp/%s' % asin
+    html = read_url(url, timeout=-1).decode('utf-8', 'ignore')
     return list(set(find_isbns(find_re(html, 'Formats</h3>.*?</table'))))
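
The last hunk pulls the Amazon URL into a variable and passes timeout=-1 to read_url; with ox.cache.read_url the timeout governs how long a cached response stays valid, and the negative value is presumably meant to keep reusing the cached product page indefinitely (an assumption about the ox semantics, not stated in the diff). A self-contained sketch of the helper under those assumptions, with a rough regex standing in for the repository's find_isbns:

import re

from ox import find_re            # first match of a regex in a string
from ox.cache import read_url     # cached HTTP fetch; timeout = max cache age

# Rough stand-in for the repo's find_isbns() helper: anything ISBN-shaped.
ISBN_RE = re.compile(r'\b(?:97[89][\d-]{10,16}|\d{9}[\dX])\b')

def amazon_lookup(asin):
    # Build the URL once so it can be logged or reused, and pass timeout=-1
    # so an already-cached copy of the product page is not re-fetched.
    url = 'http://www.amazon.com/dp/%s' % asin
    html = read_url(url, timeout=-1).decode('utf-8', 'ignore')
    # The "Formats" table on the page lists other editions of the same book;
    # pull anything ISBN-shaped out of that fragment.
    formats = find_re(html, 'Formats</h3>.*?</table')
    return list(set(ISBN_RE.findall(formats)))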