initial Google Books and Amazon parser

2016-01-05 12:58:30 +05:30 · 2016-01-05 12:58:30 +05:30 · c3de15587c
commit c3de15587c
parent 75fbb88a78
2 changed files with 130 additions and 0 deletions
--- a/oml/meta/amazon.py
+++ b/oml/meta/amazon.py
@ -0,0 +1,78 @@
+from ox.cache import read_url
+from ox import decode_html, strip_tags, find_re
+import json
+import re
+from urllib.parse import unquote
+import lxml.html
+import stdnum.isbn
+
+def info(key, value):
+    if key not in ('isbn',):
+        raise IOError('unknwon key %s' % key)
+    if len(value) == 13:
+        value = stdnum.isbn.to_isbn10(value)
+    if len(value) != 10:
+        raise IOError('invalid isbn %s' % value)
+
+    url = 'http://www.amazon.com/dp/' + value
+    data = read_url(url).decode()
+    doc = lxml.html.document_fromstring(data)
+    info = {}
+    if '<title>404 - Document Not Found</title>' in data:
+        return info
+    for l in doc.xpath('//link[@rel="canonical" and @href]'):
+        info['asin'] = [l.get('href').rpartition('/')[-1]]
+        break
+    info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
+    info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
+    content = doc.xpath('//div[@class="content"]')[0]
+    content_info = {}
+    for li in content.xpath('.//li'):
+        v = li.text_content()
+        if ': ' in v:
+            k, v = li.text_content().split(': ', 1)
+            content_info[k.strip()] = v.strip()
+    if 'Language' in content_info:
+        info['language'] = content_info['Language']
+    if 'Publisher' in content_info:
+        if ' (' in content_info['Publisher']:
+            info['date'] = find_re(content_info['Publisher'].split(' (')[-1], '\d{4}')
+        info['publisher'] = content_info['Publisher'].split(' (')[0]
+        if '; ' in info['publisher']:
+            info['publisher'], info['edition'] = info['publisher'].split('; ', 1)
+
+    if 'ISBN-13' in content_info:
+        if not 'isbn' in info: info['isbn'] = []
+        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
+    if 'ISBN-10' in content_info:
+        if not 'isbn' in info: info['isbn'] = []
+        info['isbn'].append(content_info['ISBN-10'])
+
+    a = doc.xpath('//span[@class="a-size-medium"]')
+    if a:
+        for span in a:
+            r = span.getchildren()[0].text.strip()
+            if 'Translator' in r:
+                role = 'translator'
+            else:
+                role = 'author'
+            if not role in info: info[role] = []
+            info[role].append(span.text.strip())
+    else:
+        for span in doc.xpath('//span[@class="author notFaded"]'):
+            author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
+            if 'Translator' in author[-1]:
+                role = 'translator'
+            else:
+                role = 'author'
+            if not role in info: info[role] = []
+            info[role].append(author[0])
+
+    covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
+    covers = json.loads(decode_html(covers))
+    last = [0,0]
+    for url in covers:
+        if covers[url] > last:
+            last = covers[url]
+            info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
+    return info
--- a/oml/meta/google.py
+++ b/oml/meta/google.py
@ -2,9 +2,11 @@
 # vi:si:et:sw=4:sts=4:ts=4


+from ox.cache import get_json, store
 import ox.web.google
 import stdnum.isbn

+from utils import get_language
 from .utils import find_isbns

 import logging
@ -33,3 +35,53 @@ def find(query):
            if len(isbn) == 13 and isbn.startswith('978'):
                done.add(stdnum.isbn.to_isbn10(isbn))
    return results
+
+def info(key, value):
+    if key not in ('isbn', 'lccn', 'oclc'):
+        raise IOError('unknwon key %s' % key)
+    url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)
+    r = get_json(url, timeout=-1)
+    if 'error' in r:
+        store.delete(url)
+        raise IOError(url, r)
+    if not 'items' in r:
+        print('unkown %s: %s [%s]' % (key, value, r))
+        return {}
+    _data = r['items'][0]['volumeInfo']
+    data = {}
+    for key in [
+            'authors',
+            'description',
+            'pageCount',
+            'publishedDate',
+            'publisher',
+            'title',
+        ]:
+        if key in _data:
+            data[{
+                'authors': 'author',
+                'pageCount': 'pages',
+                'publishedDate': 'date',
+            }.get(key,key)] = _data[key]
+
+    if 'subtitle' in _data:
+        data['title'] = '{title}: {subtitle}'.format(**_data)
+    if r['items'][0]['accessInfo']['viewability'] != 'NO_PAGES':
+        data['cover'] = 'https://books.google.com/books?id=%s&pg=PP1&img=1&zoom=0&hl=en' % r['items'][0]['id']
+    elif 'imageLinks' in _data:
+        for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):
+            if size in _data['imageLinks']:
+                data['cover'] = _data['imageLinks'][size]
+                break
+    if 'industryIdentifiers' in _data:
+        for k in _data['industryIdentifiers']:
+            if k['type'].startswith('ISBN'):
+                if not 'isbn' in data:
+                    data['isbn'] = []
+                data['isbn'].append(k['identifier'])
+            else:
+                print('unknown identifier', k)
+    if 'language' in _data:
+        data['language'] = get_language(_data['language'])
+    return data
+