find

2014-05-16 10:06:11 +02:00 · 2014-05-16 10:06:11 +02:00 · e41942ea99
commit e41942ea99
parent a9c5fb43fe
28 changed files with 240 additions and 84 deletions
--- a/oml/meta/init.py
+++ b/oml/meta/init.py
@ -7,6 +7,7 @@ import loc
 import lookupbyisbn
 import openlibrary
 import worldcat
+import google

 providers = [
    ('openlibrary', 'olid'),
@ -17,9 +18,12 @@ providers = [
 ]

 def find(title, author=None, publisher=None, date=None):
+    # Search through the Google provider; google.find already sets 'mainid' on every result.
+    # The previous openlibrary lookup below is disabled by wrapping it in a bare string literal.
+    results = google.find(title=title, author=author, publisher=publisher, date=date)
+    '''
    results = openlibrary.find(title=title, author=author, publisher=publisher, date=date)
    for r in results:
        r['mainid'] = 'olid'
+    '''
    return results

 def lookup(key, value):
--- a/oml/meta/google.py
+++ b/oml/meta/google.py
@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+from __future__ import division
+
+import ox.web.google
+import stdnum.isbn
+
+from .utils import find_isbns
+
+
+def find(title, author=None, publisher=None, date=None):
+    print 'google.find', title, author, publisher, date
+    query = title
+    if author:
+        if isinstance(author, list):
+            author = ' '.join(author)
+        query += ' ' + author
+    query += ' isbn'
+    isbns = []
+    for r in ox.web.google.find(query):
+        isbns += find_isbns(' '.join(r))
+
+    results = []
+    done = set()
+    for isbn in isbns:
+        if isbn not in done:
+            key = 'isbn%d'%len(isbn)
+            #r = lookup(key, isbn)
+            #r['mainid'] = key
+            r = {
+                key: isbn,
+                'mainid': key
+            }
+            results.append(r)
+            done.add(isbn)
+            if len(isbn) == 10:
+                done.add(stdnum.isbn.to_isbn13(isbn))
+    return results
--- a/oml/meta/loc.py
+++ b/oml/meta/loc.py
@ -33,7 +33,10 @@ def lookup(id):
    info = {
        'lccn': id
    }
-    info['title'] = ''.join([e.text for e in mods.findall(ns + 'titleInfo')[0]])
+    title = mods.findall(ns + 'titleInfo')
+    if not title:
+        return {}
+    info['title'] = ''.join([e.text for e in title[0]])
    origin = mods.findall(ns + 'originInfo')
    if origin:
        info['place'] = []
--- a/oml/meta/lookupbyisbn.py
+++ b/oml/meta/lookupbyisbn.py
@ -14,6 +14,8 @@ def get_ids(key, value):
        if m:
            asin = m[0].split('/')[-3]
            ids.append(('asin', asin))
+    if key == 'isbn10':
+        ids.append(('isbn13', stdnum.isbn.to_isbn13(value)))
    if key == 'asin':
        if stdnum.isbn.is_valid(value):
            ids.append(('isbn10', value))
@ -47,14 +49,16 @@ def lookup(id):
            r[key] = int(r[key])
    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
-    r['description'] = desc
-    if r['description'] == u'Description of this item is not available at this time.':
-        r['description'] = ''
+    r['description'] = decode_html(strip_tags(desc))
    r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    for key in r:
        if isinstance(r[key], basestring):
            r[key] = decode_html(strip_tags(r[key])).strip()
-    if 'author' in r and isinstance(r['author'], basestring):
+    if 'author' in r and isinstance(r['author'], basestring) and r['author']:
        r['author'] = [r['author']]
+    else:
+        r['author'] = []
+    if r['description'].lower() == u'Description of this item is not available at this time.'.lower():
+        r['description'] = ''
    return r

--- a/oml/meta/utils.py
+++ b/oml/meta/utils.py
@ -1,5 +1,16 @@
-
+import re
+import stdnum.isbn

 def normalize_isbn(value):
     """Strip everything from ``value`` except digits and the letter ``X``."""
     kept = (char for char in value if char.isdigit() or char == 'X')
     return ''.join(kept)

+def find_isbns(text):
+    matches = re.compile('\d[\d\-X\ ]+').findall(text)
+    matches = [normalize_isbn(value) for value in matches]
+    return [isbn for isbn in matches if stdnum.isbn.is_valid(isbn)
+        and len(isbn) in (10, 13)
+        and isbn not in (
+        '0' * 10,
+        '0' * 13,
+    )]
+