initial google books and amazon parser
parent 75fbb88a78
commit c3de15587c
2 changed files with 130 additions and 0 deletions
oml/meta/amazon.py (new file, 78 lines)
@@ -0,0 +1,78 @@
from ox.cache import read_url
from ox import decode_html, strip_tags, find_re
import json
import re
from urllib.parse import unquote

import lxml.html
import stdnum.isbn


def info(key, value):
    # look up a book on Amazon by ISBN and scrape title, description,
    # publisher, language, authors/translators, ISBNs and cover image
    if key not in ('isbn',):
        raise IOError('unknown key %s' % key)
    if len(value) == 13:
        value = stdnum.isbn.to_isbn10(value)
    if len(value) != 10:
        raise IOError('invalid isbn %s' % value)

    url = 'http://www.amazon.com/dp/' + value
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    info = {}
    if '<title>404 - Document Not Found</title>' in data:
        return info
    for l in doc.xpath('//link[@rel="canonical" and @href]'):
        info['asin'] = [l.get('href').rpartition('/')[-1]]
        break
    info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
    info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
    content = doc.xpath('//div[@class="content"]')[0]
    content_info = {}
    for li in content.xpath('.//li'):
        v = li.text_content()
        if ': ' in v:
            k, v = li.text_content().split(': ', 1)
            content_info[k.strip()] = v.strip()
    if 'Language' in content_info:
        info['language'] = content_info['Language']
    if 'Publisher' in content_info:
        if ' (' in content_info['Publisher']:
            info['date'] = find_re(content_info['Publisher'].split(' (')[-1], r'\d{4}')
        info['publisher'] = content_info['Publisher'].split(' (')[0]
        if '; ' in info['publisher']:
            info['publisher'], info['edition'] = info['publisher'].split('; ', 1)

    if 'ISBN-13' in content_info:
        if 'isbn' not in info:
            info['isbn'] = []
        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
    if 'ISBN-10' in content_info:
        if 'isbn' not in info:
            info['isbn'] = []
        info['isbn'].append(content_info['ISBN-10'])

    a = doc.xpath('//span[@class="a-size-medium"]')
    if a:
        for span in a:
            r = span.getchildren()[0].text.strip()
            if 'Translator' in r:
                role = 'translator'
            else:
                role = 'author'
            if role not in info:
                info[role] = []
            info[role].append(span.text.strip())
    else:
        for span in doc.xpath('//span[@class="author notFaded"]'):
            author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
            if 'Translator' in author[-1]:
                role = 'translator'
            else:
                role = 'author'
            if role not in info:
                info[role] = []
            info[role].append(author[0])

    # pick the largest cover from the data-a-dynamic-image map (url -> [width, height])
    covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
    covers = json.loads(decode_html(covers))
    last = [0, 0]
    for url in covers:
        if covers[url] > last:
            last = covers[url]
            info['cover'] = re.sub(r'(\._SX.+?_\.)', '.', url)
    return info
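A minimal usage sketch for the Amazon parser above, assuming oml.meta is an importable package and that ox, lxml and python-stdnum are installed; the lookup hits the live Amazon product page, and the ISBN below is only an illustration:

    from oml.meta import amazon

    # ISBN-13 values are converted to ISBN-10 before building the /dp/ URL
    book = amazon.info('isbn', '9780140449136')
    print(book.get('title'), book.get('author'), book.get('isbn'))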
@@ -2,9 +2,11 @@
# vi:si:et:sw=4:sts=4:ts=4


from ox.cache import get_json, store
import ox.web.google
import stdnum.isbn

from utils import get_language
from .utils import find_isbns

import logging

@@ -33,3 +35,53 @@ def find(query):
        if len(isbn) == 13 and isbn.startswith('978'):
            done.add(stdnum.isbn.to_isbn10(isbn))
    return results

def info(key, value):
    # fetch volume metadata from the Google Books API by isbn, lccn or oclc
    if key not in ('isbn', 'lccn', 'oclc'):
        raise IOError('unknown key %s' % key)
    url = 'https://www.googleapis.com/books/v1/volumes?q=%s:%s' % (key, value)
    r = get_json(url, timeout=-1)
    if 'error' in r:
        store.delete(url)
        raise IOError(url, r)
    if 'items' not in r:
        print('unknown %s: %s [%s]' % (key, value, r))
        return {}
    _data = r['items'][0]['volumeInfo']
    data = {}
    # copy selected volumeInfo fields, renaming some keys
    for key in [
        'authors',
        'description',
        'pageCount',
        'publishedDate',
        'publisher',
        'title',
    ]:
        if key in _data:
            data[{
                'authors': 'author',
                'pageCount': 'pages',
                'publishedDate': 'date',
            }.get(key, key)] = _data[key]

    if 'subtitle' in _data:
        data['title'] = '{title}: {subtitle}'.format(**_data)
    if r['items'][0]['accessInfo']['viewability'] != 'NO_PAGES':
        data['cover'] = 'https://books.google.com/books?id=%s&pg=PP1&img=1&zoom=0&hl=en' % r['items'][0]['id']
    elif 'imageLinks' in _data:
        for size in ('extraLarge', 'large', 'medium', 'small', 'thumbnail', 'smallThumbnail'):
            if size in _data['imageLinks']:
                data['cover'] = _data['imageLinks'][size]
                break
    if 'industryIdentifiers' in _data:
        for k in _data['industryIdentifiers']:
            if k['type'].startswith('ISBN'):
                if 'isbn' not in data:
                    data['isbn'] = []
                data['isbn'].append(k['identifier'])
            else:
                print('unknown identifier', k)
    if 'language' in _data:
        data['language'] = get_language(_data['language'])
    return data
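A similar sketch for the Google Books lookup above; the import path is hypothetical since the diff does not show the second file's name, and results are cached through the ox.cache layer imported in that module:

    # 'google' is a hypothetical module name for the second file in this commit
    from oml.meta import google

    # key may be 'isbn', 'lccn' or 'oclc'
    book = google.info('isbn', '9780140449136')
    print(book.get('title'), book.get('date'), book.get('cover'))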