store metadata per user. remove primaryid. only store isbn13

2016-01-11 19:13:54 +05:30 · 2016-01-11 19:13:54 +05:30 · 02e040d9f5
commit 02e040d9f5
parent 90648f9e65
16 changed files with 245 additions and 192 deletions
--- a/oml/meta/amazon.py
+++ b/oml/meta/amazon.py
@ -46,11 +46,9 @@ def info(key, value):
            info['publisher'], info['edition'] = info['publisher'].split('; ', 1)

    if 'ISBN-13' in content_info:
-        if not 'isbn' in info: info['isbn'] = []
+        info['isbn'] = content_info['ISBN-13'].replace('-', '')  # a single ISBN-13 string now, no longer a list
-        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
-    if 'ISBN-10' in content_info:
-        if not 'isbn' in info: info['isbn'] = []
-        info['isbn'].append(content_info['ISBN-10'])
+    elif 'ISBN-10' in content_info:
+        info['isbn'] = stdnum.isbn.to_isbn13(content_info['ISBN-10'])

    a = doc.xpath('//span[@class="a-size-medium"]')
    if a:
--- a/oml/meta/duckduckgo.py
+++ b/oml/meta/duckduckgo.py
@ -21,14 +21,13 @@ def find(query):
    done = set()
    for isbn in isbns:
        if isbn not in done:
+            isbn = stdnum.isbn.to_isbn13(isbn)
            r = {
                'isbn': [isbn],
                'primaryid': ['isbn', isbn]
            }
            results.append(r)
            done.add(isbn)
-            if len(isbn) == 10:
-                done.add(stdnum.isbn.to_isbn13(isbn))
            if len(isbn) == 13 and isbn.startswith('978'):
                done.add(stdnum.isbn.to_isbn10(isbn))
    return results
--- a/oml/meta/google.py
+++ b/oml/meta/google.py
@ -6,7 +6,7 @@ from ox.cache import get_json, store
 import ox.web.google
 import stdnum.isbn

-from .utils import find_isbns, get_language, decode_html_data
+from .utils import find_isbns, get_language, decode_html_data, to_isbn13

 import logging
 logger = logging.getLogger(__name__)
@ -51,6 +51,7 @@ def info(key, value):
    data = {}
    for key in [
            'authors',
+            'categories',
            'description',
            'pageCount',
            'publishedDate',
@ -83,6 +84,9 @@ def info(key, value):
                data['isbn'].append(k['identifier'])
            else:
                print('unknown identifier', k)
+        if 'isbn' in data:
+            data['isbn'] = [to_isbn13(i) for i in data['isbn']][0]
+
    if 'publisher' in data and isinstance(data['publisher'], str):
        data['publisher'] = [data['publisher']]
    if 'language' in _data:
--- a/oml/meta/utils.py
+++ b/oml/meta/utils.py
@ -6,6 +6,16 @@ import re
 import stdnum.isbn

 import ox
+import ox.iso
+
+def to_isbn13(isbn):
+    try:
+        isbn = stdnum.isbn.validate(isbn, True)  # second argument is stdnum's convert flag: result is the ISBN-13 form
+        if isbn[:2] != '97':  # anything not starting with '97' is rejected
+            isbn = None
+    except Exception:  # invalid input yields None; KeyboardInterrupt/SystemExit are no longer swallowed
+        isbn = None
+    return isbn

 def normalize_isbn(value):
    return ''.join([s for s in value if s.isdigit() or s == 'X'])
@ -13,14 +23,11 @@ def normalize_isbn(value):
 def find_isbns(text):
    if isinstance(text, bytes):
        text = text.decode()
-    matches = re.compile('\d[\d\-X\ ]+').findall(text)
+    matches = re.compile(r'\d[\d\-X\u2013\ ]+').findall(text)  # raw string avoids invalid-escape warnings; re expands \u2013 (en dash)
    matches = [normalize_isbn(value) for value in matches]
-    return [isbn for isbn in matches if stdnum.isbn.is_valid(isbn)
-        and len(isbn) in (10, 13)
-        and isbn not in (
-        '0' * 10,
-        '0' * 13,
-    )]
+    matches = [to_isbn13(value) for value in matches]  # candidates that fail validation become None
+    matches = list(set([value for value in matches if value]))  # drop None and duplicates; set() does not keep order
+    return matches

 def get_language(lang):
    return ox.iso.codeToLang(lang.split('-')[0]) or lang