meta
This commit is contained in:
parent
edd42dfd76
commit
d385853186
48 changed files with 1344 additions and 488 deletions
|
|
@ -0,0 +1,50 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import division
|
||||
|
||||
import abebooks
|
||||
import loc
|
||||
import lookupbyisbn
|
||||
import openlibrary
|
||||
import worldcat
|
||||
|
||||
# Metadata providers in lookup order: (module name, primary id key that
# provider's lookup() accepts). The module name is resolved at runtime
# via globals()[provider] in lookup() below, so each entry must match an
# imported module above.
providers = [
    ('openlibrary', 'olid'),
    ('loc', 'lccn'),
    ('worldcat', 'oclc'),
    ('lookupbyisbn', 'asin'),
    ('abebooks', 'isbn10')
]
|
||||
|
||||
def find(title, author=None, publisher=None, year=None):
    """Search all providers for items matching the query.

    Stub: searching is not implemented yet, so no results are returned.
    """
    matches = []
    return matches
|
||||
|
||||
def lookup(key, value):
    """Aggregate metadata for one id by chaining every provider.

    Starting from the seed (key, value) pair, repeatedly asks each
    provider to map every known id to further ids until no new pair
    appears (fixed point). Then fetches the full record from each
    provider whose primary id was found and merges the records into one
    dict, never overwriting a field that is already set.
    """
    data = {key: value}
    ids = [(key, value)]
    provider_data = {}
    done = False
    while not done:
        done = True
        for provider, id in providers:
            # NOTE: this loop shadows the key/value parameters (data was
            # already seeded above, so that is harmless) and appends to
            # ids while iterating it; Python list iteration picks up the
            # appended pairs, and the done flag re-runs until stable.
            for key, value in ids:
                for kv in globals()[provider].get_ids(key, value):
                    if not kv in ids:
                        ids.append(kv)
                        done = False
    print ids
    # Fetch the full record from each provider whose primary id we hold.
    for k, v in ids:
        for provider, id in providers:
            if id == k:
                provider_data[provider] = globals()[provider].lookup(v)
    # Merge records, largest first, so the provider with the most fields
    # wins: existing keys are never overwritten.
    for provider in sorted(
        provider_data.keys(),
        key=lambda x: -len(provider_data[x])
    ):
        print provider, len(provider_data[provider])
        for k_, v_ in provider_data[provider].iteritems():
            if not k_ in data:
                data[k_] = v_
    return data
|
||||
|
||||
|
||||
38
oml/meta/abebooks.py
Normal file
38
oml/meta/abebooks.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
from ox.cache import read_url
|
||||
import re
|
||||
import lxml.html
|
||||
|
||||
def get_ids(key, value):
|
||||
ids = []
|
||||
if key in ('isbn10', 'isbn13'):
|
||||
base = 'http://www.abebooks.com'
|
||||
url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
|
||||
data = read_url(url)
|
||||
urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
|
||||
if urls:
|
||||
ids.append((key, value))
|
||||
if ids:
|
||||
print 'abebooks.get_ids', key, value
|
||||
print ids
|
||||
return ids
|
||||
|
||||
def lookup(id):
    """Fetch full metadata for an AbeBooks id.

    Stub: logs the call and returns an empty record for now.
    """
    print 'abebooks.lookup', id
    return {}
|
||||
|
||||
def get_data(id):
    """Scrape book details from AbeBooks for one ISBN.

    Searches by ISBN, follows the first BookDetails result page, and
    collects every element whose DOM id starts with 'biblio-' into a
    dict keyed by the id suffix. Returns {} when nothing matches.
    """
    info = {}
    base = 'http://www.abebooks.com'
    url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
    data = read_url(url)
    urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
    if urls:
        details = '%s%s' % (base, urls[0])
        data = read_url(details)
        doc = lxml.html.document_fromstring(data)
        for e in doc.xpath("//*[contains(@id, 'biblio')]"):
            key = e.attrib['id'].replace('biblio-', '')
            value = e.text_content()
            # skip listing-specific fields that vary per seller copy
            if value and key not in ('bookcondition', 'binding'):
                info[key] = value
    return info
|
||||
|
|
@ -4,18 +4,35 @@ from __future__ import division
|
|||
|
||||
import ox
|
||||
from ox.cache import read_url
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from utils import normalize_isbn
|
||||
from marc_countries import COUNTRIES
|
||||
|
||||
def info(id):
|
||||
def get_ids(key, value):
    """Map an ISBN to a Library of Congress LCCN via the loc.gov search.

    Returns at most one ('lccn', number) pair; keys other than
    'isbn10'/'isbn13' yield an empty list.
    """
    ids = []
    if key in ['isbn10', 'isbn13']:
        url = 'http://www.loc.gov/search/?q=%s&all=true' % value
        html = ox.cache.read_url(url)
        # result pages link records as "http://lccn.loc.gov/<lccn>"
        match = re.search('"http://lccn.loc.gov/(\d+)"', html)
        if match:
            ids.append(('lccn', match.group(1)))
    if ids:
        print 'loc.get_ids', key, value
        print ids
    return ids
|
||||
|
||||
def lookup(id):
|
||||
print 'loc.lookup', id
|
||||
ns = '{http://www.loc.gov/mods/v3}'
|
||||
url = 'http://lccn.loc.gov/%s/mods' % id
|
||||
data = read_url(url)
|
||||
mods = ET.fromstring(data)
|
||||
|
||||
info = {}
|
||||
info = {
|
||||
'lccn': id
|
||||
}
|
||||
info['title'] = ''.join([e.text for e in mods.findall(ns + 'titleInfo')[0]])
|
||||
origin = mods.findall(ns + 'originInfo')
|
||||
if origin:
|
||||
|
|
@ -28,7 +45,9 @@ def info(id):
|
|||
elif terms and terms[0].attrib['type'] == 'code':
|
||||
e = terms[0]
|
||||
info['country'] = COUNTRIES.get(e.text, e.text)
|
||||
info['publisher'] = ''.join([e.text for e in origin[0].findall(ns + 'publisher')])
|
||||
publisher = [e.text for e in origin[0].findall(ns + 'publisher')]
|
||||
if publisher:
|
||||
info['publisher'] = publisher[0]
|
||||
info['date'] = ''.join([e.text for e in origin[0].findall(ns + 'dateIssued')])
|
||||
for i in mods.findall(ns + 'identifier'):
|
||||
if i.attrib['type'] == 'oclc':
|
||||
|
|
@ -43,10 +62,12 @@ def info(id):
|
|||
info['classification'] = i.text
|
||||
info['author'] = []
|
||||
for a in mods.findall(ns + 'name'):
|
||||
if a.attrib['usage'] == 'primary':
|
||||
info['author'].append(''.join([e.text for e in a.findall(ns + 'namePart')]))
|
||||
info['author'] = [ox.normalize_name(a[:-1]) for a in info['author']]
|
||||
if a.attrib.get('usage') == 'primary':
|
||||
info['author'].append(' '.join([e.text for e in a.findall(ns + 'namePart') if not e.attrib.get('type') in ('date', )]))
|
||||
info['author'] = [ox.normalize_name(a) for a in info['author']]
|
||||
for key in info.keys():
|
||||
if not info[key]:
|
||||
del info[key]
|
||||
return info
|
||||
|
||||
info = lookup
|
||||
53
oml/meta/lookupbyisbn.py
Normal file
53
oml/meta/lookupbyisbn.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
import re
|
||||
|
||||
base = 'http://www.lookupbyisbn.com'
|
||||
|
||||
def get_ids(key, value):
    """Resolve an ISBN or ASIN to the site's ASIN via its search page.

    Returns [('asin', asin)] when a /Lookup/Book/ result is found,
    otherwise an empty list.
    """
    ids = []
    if key in ('isbn10', 'isbn13', 'asin'):
        url = '%s/Search/Book/%s/1' % (base, value)
        data = read_url(url).decode('utf-8')
        m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
        if m:
            # result hrefs look like /Lookup/Book/<asin>/<isbn>/1,
            # so the ASIN is the third-from-last path segment
            asin = m[0].split('/')[-3]
            ids.append(('asin', asin))
    if ids:
        print 'lookupbyisbn.get_ids', key, value
        print ids
    return ids
|
||||
|
||||
def lookup(id):
    """Scrape full book metadata from lookupbyisbn.com for one ASIN.

    Returns a dict with 'asin' plus title/author/publisher/date/edition/
    binding/volume/pages/description/cover as found on the page; missing
    fields come back as empty strings, 'pages' is converted to int, and
    'author' is normalized to a one-element list.
    """
    print 'lookupbyisbn.lookup', id
    r = {
        'asin': id
    }
    url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
    data = read_url(url).decode('utf-8')
    r["title"] = find_re(data, "<h2>(.*?)</h2>")
    # our field name -> label used in the page's <span class="title"> rows
    keys = {
        'author': 'Author(s)',
        'publisher': 'Publisher',
        'date': 'Publication date',
        'edition': 'Edition',
        'binding': 'Binding',
        'volume': 'Volume(s)',
        'pages': 'Pages',
    }
    for key in keys:
        r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>'% re.escape(keys[key]))
        # the site renders missing fields as '--'
        if r[key] == '--':
            r[key] = ''
        if key == 'pages' and r[key]:
            r[key] = int(r[key])
    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
    r['description'] = strip_tags(desc).strip()
    # the site's placeholder text means "no description"
    if r['description'] == u'Description of this item is not available at this time.':
        r['description'] = ''
    # drop Amazon's thumbnail-size suffix to get the full-size cover image
    r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    if 'author' in r and isinstance(r['author'], basestring):
        r['author'] = [r['author']]
    return r
|
||||
|
||||
|
|
@ -5,30 +5,39 @@ from __future__ import division
|
|||
from ox.cache import read_url
|
||||
import json
|
||||
|
||||
from utils import normalize_isbn
|
||||
from marc_countries import COUNTRIES
|
||||
from utils import normalize_isbn
|
||||
|
||||
def find(query):
|
||||
url = 'https://openlibrary.org/search.json?q=%s' % query
|
||||
data = json.loads(read_url(url))
|
||||
return data
|
||||
def get_ids(key, value):
    """Expand an Open Library id or ISBN into all related ids.

    For 'olid', pulls isbn10/isbn13/lccn/oclc lists out of the full
    record. For an ISBN, searches Open Library, takes the most recently
    modified matching edition, and recurses through its olid.
    """
    ids = []
    if key == 'olid':
        # return_all=True keeps list-valued fields as lists so the
        # loop below can iterate every alternative id
        data = lookup(value, True)
        for id in ('isbn10', 'isbn13', 'lccn', 'oclc'):
            if id in data:
                for v in data[id]:
                    if (id, v) not in ids:
                        ids.append((id, v))
    elif key in ('isbn10', 'isbn13'):
        print 'openlibraryid.get_ids', key, value
        r = find('isbn:%s' % value)
        # newest edition first (last_modified_i is a unix timestamp)
        for d in sorted(r.get('docs', []), key=lambda d: -d['last_modified_i']):
            if 'edition_key' in d:
                v = d['edition_key']
                if isinstance(v, list):
                    v = v[0]
                # include the olid itself plus everything it expands to
                for kv in [('olid', v)] + get_ids('olid', v):
                    if kv not in ids:
                        ids.append(kv)
    if ids:
        print 'openlibraryid.get_ids', key, value
        print ids
    return ids
|
||||
|
||||
def authors(authors):
|
||||
return resolve_names(authors)
|
||||
|
||||
def resolve_names(objects, key='name'):
|
||||
r = []
|
||||
for o in objects:
|
||||
url = 'https://openlibrary.org%s.json' % o['key']
|
||||
data = json.loads(read_url(url))
|
||||
r.append(data[key])
|
||||
return r
|
||||
|
||||
def languages(languages):
|
||||
return resolve_names(languages)
|
||||
|
||||
def info(id):
|
||||
data = {}
|
||||
def lookup(id, return_all=False):
|
||||
#print 'openlibrary.lookup', id
|
||||
data = {
|
||||
'olid': id
|
||||
}
|
||||
url = 'https://openlibrary.org/books/%s.json' % id
|
||||
info = json.loads(read_url(url))
|
||||
keys = {
|
||||
|
|
@ -58,10 +67,34 @@ def info(id):
|
|||
value = COUNTRIES.get(value, value)
|
||||
elif key == 'languages':
|
||||
value = languages(value)
|
||||
elif isinstance(value, list) and key not in ('publish_places'):
|
||||
elif not return_all and isinstance(value, list) and key not in ('publish_places'):
|
||||
value = value[0]
|
||||
if key in ('isbn_10', 'isbn_13'):
|
||||
value = normalize_isbn(value)
|
||||
if isinstance(value, list):
|
||||
value = map(normalize_isbn, value)
|
||||
else:
|
||||
value = normalize_isbn(value)
|
||||
data[keys[key]] = value
|
||||
return data
|
||||
|
||||
info = lookup
|
||||
|
||||
def find(query):
    """Run a full-text query against the Open Library search API and
    return the decoded JSON response."""
    search_url = 'https://openlibrary.org/search.json?q=%s' % query
    return json.loads(read_url(search_url))
|
||||
|
||||
def authors(authors):
    """Resolve a list of Open Library author references to their names."""
    return resolve_names(authors, key='name')
|
||||
|
||||
def resolve_names(objects, key='name'):
    """Fetch each referenced Open Library object and collect its *key* field.

    Each element of *objects* is a dict with a 'key' entry holding an
    Open Library path (e.g. '/authors/OL1A').
    """
    resolved = []
    for obj in objects:
        record_url = 'https://openlibrary.org%s.json' % obj['key']
        record = json.loads(read_url(record_url))
        resolved.append(record[key])
    return resolved
|
||||
|
||||
def languages(languages):
    """Resolve a list of Open Library language references to their names."""
    return resolve_names(languages, key='name')
|
||||
|
||||
|
|
@ -4,7 +4,7 @@ import ox.web.lookupbyisbn
|
|||
|
||||
from utils import normalize_isbn
|
||||
|
||||
import ol
|
||||
import openlibrary as ol
|
||||
|
||||
def add_lookupbyisbn(item):
|
||||
isbn = item.meta.get('isbn10', item.meta.get('isbn13'))
|
||||
|
|
|
|||
5
oml/meta/utils.py
Normal file
5
oml/meta/utils.py
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
|
||||
|
||||
def normalize_isbn(value):
    """Strip separators and labels from an ISBN string.

    Keeps only digits and the ISBN-10 check character 'X'. A lowercase
    'x' check digit (common in the wild) is uppercased so equivalent
    ISBNs normalize to the same string; previously it was dropped.
    """
    return ''.join([c.upper() for c in value if c.isdigit() or c in 'xX'])
|
||||
|
||||
69
oml/meta/worldcat.py
Normal file
69
oml/meta/worldcat.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import division
|
||||
|
||||
from ox.cache import read_url
|
||||
import lxml.html
|
||||
import re
|
||||
from utils import normalize_isbn
|
||||
import stdnum.isbn
|
||||
|
||||
base_url = 'http://www.worldcat.org'
|
||||
|
||||
def get_ids(key, value):
    """Cross-map ids via WorldCat.

    ISBN -> ('oclc', number) plus the sibling ISBN form if the record
    has it; OCLC -> the record's ISBNs. Other keys yield an empty list.
    """
    ids = []
    if key in ['isbn10', 'isbn13']:
        url = '%s/search?qt=worldcat_org_bks&q=%s' % (base_url, value)
        html = read_url(url)
        # result links contain /title/.../oclc/<number>
        matches = re.compile('/title.*?oclc/(\d+).*?"').findall(html)
        if matches:
            info = lookup(matches[0])
            ids.append(('oclc', matches[0]))
            # also report the other ISBN form found in the full record
            for k in ['isbn10', 'isbn13']:
                if k in info and k != key:
                    ids.append((k, info[k]))
    elif key == 'oclc':
        info = lookup(value)
        for k in ['isbn10', 'isbn13']:
            if k in info:
                ids.append((k, info[k]))
    if ids:
        print 'worldcat.get_ids', key, value
        print ids
    return ids
|
||||
|
||||
def lookup(id):
    """Scrape one WorldCat record (by OCLC number) into a flat dict.

    Collects all elements with a 'bibtip_*' DOM id, then parses extra
    'Key: value' lines out of the citation textarea, lowercasing keys.
    ISBNs from the 'isxn' field are normalized and validated into
    'isbn10'/'isbn13'; site-internal keys are dropped.
    """
    data = {
        'oclc': id
    }
    url = '%s/oclc/%s' % (base_url, id)
    html = read_url(url).decode('utf-8')
    doc = lxml.html.document_fromstring(html)
    for e in doc.xpath("//*[contains(@id, 'bibtip')]"):
        key = e.attrib['id'].replace('bibtip_', '')
        value = e.text_content()
        data[key] = value
    # NOTE(review): xpath(...)[0] raises IndexError if the citation
    # textarea is missing -- presumably every record page has it; verify.
    info = doc.xpath('//textarea[@id="util-em-note"]')[0].text
    # citation body: header block, blank line, then 'Key: value' lines
    info = dict([i.split(':', 1) for i in info.split('\n\n')[1].split('\n')])
    for key in info:
        k = key.lower()
        data[k] = info[key].strip()
    # drop page-internal bookkeeping fields
    for key in ('id', 'instance', 'mediatype', 'reclist', 'shorttitle'):
        if key in data:
            del data[key]
    # 'isxn' lists ISBNs space-separated; keep only ones that validate,
    # keyed by their length (isbn10 / isbn13)
    if 'isxn' in data:
        for isbn in data.pop('isxn').split(' '):
            isbn = normalize_isbn(isbn)
            if stdnum.isbn.is_valid(isbn):
                data['isbn%d'%len(isbn)] = isbn
    if 'author' in data:
        data['author'] = [data['author']]
    print 'worldcat.lookup', id
    print data.keys()
    return data

# generic alias so callers can use the module-standard info() name
info = lookup
|
||||
|
||||
def find(title, author, year):
    """Search WorldCat by title/author/year.

    Stub: searching is not implemented, so the result list is empty.
    """
    results = []
    return results
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue