This commit is contained in:
j 2014-05-16 10:06:11 +02:00
commit e41942ea99
28 changed files with 240 additions and 84 deletions

View file

@ -7,6 +7,7 @@ import loc
import lookupbyisbn
import openlibrary
import worldcat
import google
providers = [
('openlibrary', 'olid'),
@ -17,9 +18,12 @@ providers = [
]
def find(title, author=None, publisher=None, date=None):
    """Search for book candidates matching the given bibliographic fields.

    All four arguments are forwarded as keyword arguments to ``google.find``
    and its return value is handed back unchanged.
    """
    # Disabled openlibrary-based search, kept for reference:
    #   results = openlibrary.find(title=title, author=author, publisher=publisher, date=date)
    #   for r in results:
    #       r['mainid'] = 'olid'
    search_args = dict(title=title, author=author, publisher=publisher, date=date)
    return google.find(**search_args)
def lookup(key, value):

38
oml/meta/google.py Normal file
View file

@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division
import ox.web.google
import stdnum.isbn
from .utils import find_isbns
def find(title, author=None, publisher=None, date=None):
    """Search Google for ISBNs matching a title and optional author.

    The query is the title, followed by the author(s) when given, followed
    by the word "isbn".  Every ISBN that ``find_isbns`` extracts from the
    search results is returned once, in order of first appearance.

    Args:
        title: Title to search for.
        author: Optional author; either a string or a list of strings
            (a list is joined with spaces).
        publisher: Not used in the query; only echoed in the trace output.
        date: Not used in the query; only echoed in the trace output.

    Returns:
        A list of dicts shaped like ``{'isbn10': value, 'mainid': 'isbn10'}``
        or ``{'isbn13': value, 'mainid': 'isbn13'}``.
    """
    # Single-argument form so the trace line reads the same whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print('google.find %s %s %s %s' % (title, author, publisher, date))

    query = title
    if author:
        if isinstance(author, list):
            author = ' '.join(author)
        query += ' ' + author
    query += ' isbn'

    isbns = []
    for result in ox.web.google.find(query):
        # Each result is assumed to be a sequence of strings, joined here
        # into one text blob -- TODO confirm against ox.web.google.find.
        isbns.extend(find_isbns(' '.join(result)))

    results = []
    seen = set()
    for isbn in isbns:
        # Track every ISBN by its ISBN-13 form so the ISBN-10 and ISBN-13
        # spellings of one book count as duplicates no matter which of the
        # two shows up first (previously only the 10-then-13 order was
        # caught).
        if len(isbn) == 10:
            canonical = stdnum.isbn.to_isbn13(isbn)
        else:
            canonical = isbn
        if canonical in seen:
            continue
        seen.add(canonical)
        key = 'isbn%d' % len(isbn)
        results.append({key: isbn, 'mainid': key})
    return results

View file

@ -33,7 +33,10 @@ def lookup(id):
info = {
'lccn': id
}
info['title'] = ''.join([e.text for e in mods.findall(ns + 'titleInfo')[0]])
title = mods.findall(ns + 'titleInfo')
if not title:
return {}
info['title'] = ''.join([e.text for e in title[0]])
origin = mods.findall(ns + 'originInfo')
if origin:
info['place'] = []

View file

@ -14,6 +14,8 @@ def get_ids(key, value):
if m:
asin = m[0].split('/')[-3]
ids.append(('asin', asin))
if key == 'isbn10':
ids.append(('isbn13', stdnum.isbn.to_isbn13(value)))
if key == 'asin':
if stdnum.isbn.is_valid(value):
ids.append(('isbn10', value))
@ -47,14 +49,16 @@ def lookup(id):
r[key] = int(r[key])
desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
r['description'] = desc
if r['description'] == u'Description of this item is not available at this time.':
r['description'] = ''
r['description'] = decode_html(strip_tags(desc))
r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
for key in r:
if isinstance(r[key], basestring):
r[key] = decode_html(strip_tags(r[key])).strip()
if 'author' in r and isinstance(r['author'], basestring):
if 'author' in r and isinstance(r['author'], basestring) and r['author']:
r['author'] = [r['author']]
else:
r['author'] = []
if r['description'].lower() == u'Description of this item is not available at this time.'.lower():
r['description'] = ''
return r

View file

@ -1,5 +1,16 @@
import re
import stdnum.isbn
def normalize_isbn(value):
    """Strip *value* down to its digits and uppercase ``X`` characters.

    Every other character (hyphens, spaces, letters, including a lowercase
    ``x``) is dropped; the relative order of the kept characters is
    preserved.
    """
    kept = []
    for char in value:
        if char == 'X' or char.isdigit():
            kept.append(char)
    return ''.join(kept)
def find_isbns(text):
    """Return the valid ISBN-10 / ISBN-13 numbers found in *text*.

    Candidates are runs that start with a digit and continue with digits,
    hyphens, spaces or ``X``.  Each candidate is passed through
    ``normalize_isbn`` and kept only if the result is 10 or 13 characters
    long, is not all zeros and passes ``stdnum.isbn.is_valid``.

    Returns:
        A list of normalized ISBN strings in order of appearance; duplicates
        are not removed.
    """
    # Raw string: the pattern uses backslash sequences (\d, \-, "\ ") that
    # are not valid string escapes and trigger warnings in a plain literal.
    candidates = re.findall(r'\d[\d\-X\ ]+', text)
    placeholders = ('0' * 10, '0' * 13)
    isbns = []
    for candidate in candidates:
        isbn = normalize_isbn(candidate)
        # Cheap structural checks first; the checksum validation runs last.
        if len(isbn) not in (10, 13):
            continue
        if isbn in placeholders:
            continue
        if stdnum.isbn.is_valid(isbn):
            isbns.append(isbn)
    return isbns