better metadata lookup
This commit is contained in:
parent
2298fe68b9
commit
7e37713c95
3 changed files with 20 additions and 10 deletions
|
@ -21,6 +21,7 @@ providers = [
|
||||||
('openlibrary', 'olid'),
|
('openlibrary', 'olid'),
|
||||||
('loc', 'lccn'),
|
('loc', 'lccn'),
|
||||||
('worldcat', 'oclc'),
|
('worldcat', 'oclc'),
|
||||||
|
('worldcat', 'isbn'),
|
||||||
('lookupbyisbn', 'asin'),
|
('lookupbyisbn', 'asin'),
|
||||||
('lookupbyisbn', 'isbn'),
|
('lookupbyisbn', 'isbn'),
|
||||||
('abebooks', 'isbn')
|
('abebooks', 'isbn')
|
||||||
|
@ -36,23 +37,31 @@ def find(query):
|
||||||
'''
|
'''
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
def lookup_provider(arg):
|
||||||
|
provider, id, ids, key, value = arg
|
||||||
|
values = set()
|
||||||
|
for key, value in ids:
|
||||||
|
if key == id or provider in ('openlibrary', ):
|
||||||
|
for kv in globals()[provider].get_ids(key, value):
|
||||||
|
values.add(kv)
|
||||||
|
return values
|
||||||
|
|
||||||
def lookup(key, value):
|
def lookup(key, value):
|
||||||
if not isvalid_id(key, value):
|
if not isvalid_id(key, value):
|
||||||
return {}
|
return {}
|
||||||
data = {key: [value]}
|
data = {key: [value]}
|
||||||
ids = [(key, value)]
|
ids = set([(key, value)])
|
||||||
provider_data = {}
|
provider_data = {}
|
||||||
done = False
|
done = False
|
||||||
|
|
||||||
while not done:
|
while not done:
|
||||||
done = True
|
done = True
|
||||||
for provider, id in providers:
|
for provider, id in providers:
|
||||||
for key, value in ids:
|
result = lookup_provider((provider, id, ids, key, value))
|
||||||
for kv in globals()[provider].get_ids(key, value):
|
done = not result - ids
|
||||||
if not kv in ids:
|
ids.update(result)
|
||||||
ids.append(kv)
|
|
||||||
done = False
|
|
||||||
logger.debug('FIXME: sort ids')
|
logger.debug('FIXME: sort ids')
|
||||||
ids.sort(key=lambda i: ox.sort_string(''.join(i)))
|
ids = sorted(ids, key=lambda i: ox.sort_string(''.join(i)))
|
||||||
logger.debug('IDS %s', ids)
|
logger.debug('IDS %s', ids)
|
||||||
for k, v in ids:
|
for k, v in ids:
|
||||||
for provider, id in providers:
|
for provider, id in providers:
|
||||||
|
|
|
@ -15,7 +15,7 @@ base = 'http://www.abebooks.com'
|
||||||
def get_ids(key, value):
|
def get_ids(key, value):
|
||||||
ids = []
|
ids = []
|
||||||
if key == 'isbn':
|
if key == 'isbn':
|
||||||
url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
|
url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, value)
|
||||||
data = read_url(url, unicode=True)
|
data = read_url(url, unicode=True)
|
||||||
urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
|
urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
|
||||||
if urls:
|
if urls:
|
||||||
|
|
|
@ -31,7 +31,7 @@ def get_ids(key, value):
|
||||||
m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
|
m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
|
||||||
if m:
|
if m:
|
||||||
asin = m[0].split('/')[-3]
|
asin = m[0].split('/')[-3]
|
||||||
if not stdnum.isbn.is_valid(asin):
|
if stdnum.isbn.to_isbn10(asin) or not stdnum.isbn.is_valid(asin):
|
||||||
ids.append(('asin', asin))
|
ids.append(('asin', asin))
|
||||||
if key == 'isbn':
|
if key == 'isbn':
|
||||||
add_other_isbn(value)
|
add_other_isbn(value)
|
||||||
|
@ -89,5 +89,6 @@ def lookup(id):
|
||||||
return r
|
return r
|
||||||
|
|
||||||
def amazon_lookup(asin):
|
def amazon_lookup(asin):
|
||||||
html = read_url('http://www.amazon.com/dp/%s' % asin).decode('utf-8', 'ignore')
|
url = 'http://www.amazon.com/dp/%s' % asin
|
||||||
|
html = read_url(url, timeout=-1).decode('utf-8', 'ignore')
|
||||||
return list(set(find_isbns(find_re(html, 'Formats</h3>.*?</table'))))
|
return list(set(find_isbns(find_re(html, 'Formats</h3>.*?</table'))))
|
||||||
|
|
Loading…
Reference in a new issue