import/lists/autocompleteFolder

2014-05-19 01:24:04 +02:00 · 2014-05-19 01:24:04 +02:00 · d6f350e5a1
commit d6f350e5a1
parent 94443ee667
42 changed files with 955 additions and 436 deletions
--- a/oml/meta/__init__.py
+++ b/oml/meta/__init__.py
@@ -2,8 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4
 from __future__ import division

-import logging
-logger = logging.getLogger('meta')
+import stdnum.isbn

 import abebooks
 import loc
@@ -13,6 +12,10 @@ import worldcat
 import google
 import duckduckgo

+import logging
+logger = logging.getLogger('meta')
+
+
 providers = [
    ('openlibrary', 'olid'),
    ('loc', 'lccn'),
@@ -32,6 +35,8 @@ def find(title, author=None, publisher=None, date=None):
    return results

 def lookup(key, value):
+    if not isvalid_id(key, value):
+        return {}
    data = {key: value}
    ids = [(key, value)]
    provider_data = {}
@@ -59,4 +64,13 @@ def lookup(key, value):
                data[k_] = v_
    return data

+def isvalid_id(key, value):
+    if key in ('isbn10', 'isbn13'):
+        if 'isbn%d'%len(value) != key or not stdnum.isbn.is_valid(value):
+            return False
+    if key == 'asin' and len(value) != 10:
+        return False
+    if key == 'olid' and not (value.startswith('OL') and value.endswith('M')):
+        return False
+    return True

--- a/oml/meta/abebooks.py
+++ b/oml/meta/abebooks.py
@@ -9,10 +9,11 @@ import lxml.html
 import logging
 logger = logging.getLogger('meta.abebooks')

+base = 'http://www.abebooks.com'
+
 def get_ids(key, value):
    ids = []
    if key in ('isbn10', 'isbn13'):
-        base = 'http://www.abebooks.com'
        url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
        data = read_url(url)
        urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
@@ -24,21 +25,20 @@ def get_ids(key, value):

 def lookup(id):
    logger.debug('lookup %s', id)
-    return {}
-
-def get_data(id):
-    info = {}
-    base = 'http://www.abebooks.com'
+    data = {}
    url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
-    data = read_url(url)
-    urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
+    html = read_url(url)
+    urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(html)
+    keys = {
+        'pubdate': 'date'
+    }
    if urls:
        details = '%s%s' % (base, urls[0])
-        data = read_url(details)
-        doc = lxml.html.document_fromstring(data)
+        html = read_url(details)
+        doc = lxml.html.document_fromstring(html)
        for e in doc.xpath("//*[contains(@id, 'biblio')]"):
            key = e.attrib['id'].replace('biblio-', '')
            value = e.text_content()
-            if value and key not in ('bookcondition', 'binding'):
-                info[key] = value
-    return info
+            if value and key not in ('bookcondition', 'binding', 'edition-amz'):
+                data[keys.get(key, key)] = value
+    return data
--- a/oml/meta/duckduckgo.py
+++ b/oml/meta/duckduckgo.py
@@ -37,6 +37,6 @@ def find(title, author=None, publisher=None, date=None):
            done.add(isbn)
            if len(isbn) == 10:
                done.add(stdnum.isbn.to_isbn13(isbn))
-            if len(isbn) == 13:
+            if len(isbn) == 13 and isbn.startswith('978'):
                done.add(stdnum.isbn.to_isbn10(isbn))
    return results
--- a/oml/meta/lookupbyisbn.py
+++ b/oml/meta/lookupbyisbn.py
@@ -45,9 +45,9 @@ def lookup(id):
    }
    for key in keys:
        r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>'% re.escape(keys[key]))
-        if r[key] == '--':
-            r[key] = ''
-        if key == 'pages' and r[key]:
+        if r[key] == '--' or not r[key]:
+            del r[key]
+        if key == 'pages' and key in r:
            r[key] = int(r[key])
    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')