openmedialibrary/oml/meta/amazon.py

from ox.cache import read_url
from ox import decode_html, strip_tags, find_re, fix_bad_unicode
import json
import re
from urllib.parse import unquote
import lxml.html
import stdnum.isbn

def _product_details(doc):
    # Collect the "Label: value" list items of the first div.content element.
    details = {}
    sections = doc.xpath('//div[@class="content"]')
    if not sections:
        return details
    for li in sections[0].xpath('.//li'):
        label, sep, text = li.text_content().partition(': ')
        if sep:
            details[label.strip()] = text.strip()
    return details


def _add_contributors(doc, result):
    # Append contributor names to result under the role get_role() assigns.
    spans = doc.xpath('//span[@class="a-size-medium"]')
    if spans:
        for span in spans:
            # The role label is the text of the span's first child and the
            # name is the span's own leading text; skip spans lacking either.
            if len(span) == 0 or span[0].text is None or span.text is None:
                continue
            role = get_role(span[0].text.strip())
            result.setdefault(role, []).append(span.text.strip())
    else:
        for span in doc.xpath('//span[@class="author notFaded"]'):
            parts = [
                x.strip()
                for x in span.text_content().strip().split('\n')
                if x.strip()
            ]
            if not parts:
                continue
            # First non-empty line is the name, last one the role label.
            role = get_role(parts[-1])
            result.setdefault(role, []).append(parts[0])


def _largest_cover(data):
    # Return the URL of the largest image listed in data-a-dynamic-image.
    match = re.search('data-a-dynamic-image="({.+?})"', data)
    if match is None:
        return None
    covers = json.loads(decode_html(match.group(1)))
    cover = None
    largest = [0, 0]
    for cover_url, size in covers.items():
        # Sizes are compared as lists, i.e. first element first.
        if size > largest:
            largest = size
            # Drop the "._SX..._." part of the file name from the URL.
            cover = re.sub(r'(\._SX.+?_\.)', '.', cover_url)
    return cover


def info(key, value):
    """Scrape book metadata for an ISBN from its amazon.com product page.

    Args:
        key: identifier type; only 'isbn' is accepted.
        value: the ISBN string. A 13-character value is converted with
            stdnum.isbn.to_isbn10 before the page is requested.

    Returns:
        A dict holding whichever of these keys could be extracted: 'asin',
        'title', 'description', 'language', 'date', 'publisher', 'edition',
        'isbn', 'cover', plus one list of names per contributor role as
        returned by get_role(). The dict is empty when the response is the
        404 page or the automated-access notice. Fields whose markup is not
        present on the page are left out instead of raising.

    Raises:
        IOError: if key is not 'isbn' or the value is not 10 characters long
            after conversion.
    """
    if key not in ('isbn',):
        raise IOError('unknown key %s' % key)
    if len(value) == 13:
        value = stdnum.isbn.to_isbn10(value)
    if len(value) != 10:
        raise IOError('invalid isbn %s' % value)
    url = 'http://www.amazon.com/dp/' + value
    data = read_url(url).decode()
    result = {}
    # Both checks only need the raw text, so do them before parsing.
    if '<title>404 - Document Not Found</title>' in data:
        return result
    if 'To discuss automated access to Amazon data please' in data:
        return result
    doc = lxml.html.document_fromstring(data)

    canonical = doc.xpath('//link[@rel="canonical" and @href]')
    if canonical:
        # The last path segment of the canonical URL is stored as the ASIN.
        result['asin'] = [canonical[0].get('href').rpartition('/')[-1]]

    title_nodes = doc.xpath('//span[@id="productTitle"]')
    if title_nodes and title_nodes[0].text is not None:
        title = strip_tags(decode_html(title_nodes[0].text))
        # Strip series suffixes such as " (... Classics)" from the title.
        title = re.sub(r' \([^\)]+? Classics\)', '', title)
        title = re.sub(r' \([^\)]+? Collection\)', '', title)
        result['title'] = title

    # The description is embedded URL-quoted in an inline script.
    encoded = re.search(r'''encodedDescription' : "(.*?)",''', data)
    if encoded is not None:
        description = strip_tags(decode_html(unquote(encoded.group(1))))
        result['description'] = fix_bad_unicode(description)

    details = _product_details(doc)
    if 'Language' in details:
        result['language'] = details['Language']
    if 'Publisher' in details:
        # Expected shape: "Name; Edition (… YYYY)"; every part is optional.
        if ' (' in details['Publisher']:
            result['date'] = find_re(details['Publisher'].split(' (')[-1], r'\d{4}')
        result['publisher'] = details['Publisher'].split(' (')[0]
        if '; ' in result['publisher']:
            result['publisher'], result['edition'] = result['publisher'].split('; ', 1)

    if 'ISBN-13' in details:
        result.setdefault('isbn', []).append(details['ISBN-13'].replace('-', ''))
    if 'ISBN-10' in details:
        result.setdefault('isbn', []).append(details['ISBN-10'])

    _add_contributors(doc, result)

    cover = _largest_cover(data)
    if cover is not None:
        result['cover'] = cover
    return result

def get_role(value):
    """Map a contributor label to 'translator', 'editor' or 'author'.

    The label is searched for the substrings 'Translator' and then 'Editor'
    (case-sensitive); the first one found decides the role. Anything else is
    treated as an author.
    """
    markers = (
        ('Translator', 'translator'),
        ('Editor', 'editor'),
    )
    for marker, role in markers:
        if marker in value:
            return role
    return 'author'
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`from ox.cache import read_url`
dont include editors in authors 2016-01-05 07:46:30 +00:00			`from ox import decode_html, strip_tags, find_re, fix_bad_unicode`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`import json`
			`import re`
			`from urllib.parse import unquote`
			`import lxml.html`
			`import stdnum.isbn`

			`def info(key, value):`
			`if key not in ('isbn',):`
			`raise IOError('unknwon key %s' % key)`
			`if len(value) == 13:`
			`value = stdnum.isbn.to_isbn10(value)`
			`if len(value) != 10:`
			`raise IOError('invalid isbn %s' % value)`
			`url = 'http://www.amazon.com/dp/' + value`
			`data = read_url(url).decode()`
			`doc = lxml.html.document_fromstring(data)`
			`info = {}`
			`if '<title>404 - Document Not Found</title>' in data:`
			`return info`
catch more 404s 2016-01-05 08:30:25 +00:00			`if 'To discuss automated access to Amazon data please' in data:`
			`return info`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`for l in doc.xpath('//link[@rel="canonical" and @href]'):`
			`info['asin'] = [l.get('href').rpartition('/')[-1]]`
			`break`
			`info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))`
dont include editors in authors 2016-01-05 07:46:30 +00:00			`info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])`
			`info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))`
dont include editors in authors 2016-01-05 07:46:30 +00:00			`info['description'] = fix_bad_unicode(info['description'])`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`content = doc.xpath('//div[@class="content"]')[0]`
			`content_info = {}`
			`for li in content.xpath('.//li'):`
			`v = li.text_content()`
			`if ': ' in v:`
			`k, v = li.text_content().split(': ', 1)`
			`content_info[k.strip()] = v.strip()`
			`if 'Language' in content_info:`
			`info['language'] = content_info['Language']`
			`if 'Publisher' in content_info:`
			`if ' (' in content_info['Publisher']:`
			`info['date'] = find_re(content_info['Publisher'].split(' (')[-1], '\d{4}')`
			`info['publisher'] = content_info['Publisher'].split(' (')[0]`
			`if '; ' in info['publisher']:`
			`info['publisher'], info['edition'] = info['publisher'].split('; ', 1)`

			`if 'ISBN-13' in content_info:`
			`if not 'isbn' in info: info['isbn'] = []`
			`info['isbn'].append(content_info['ISBN-13'].replace('-', ''))`
			`if 'ISBN-10' in content_info:`
			`if not 'isbn' in info: info['isbn'] = []`
			`info['isbn'].append(content_info['ISBN-10'])`

			`a = doc.xpath('//span[@class="a-size-medium"]')`
			`if a:`
			`for span in a:`
			`r = span.getchildren()[0].text.strip()`
dont include editors in authors 2016-01-05 07:46:30 +00:00			`role = get_role(r)`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`if not role in info: info[role] = []`
			`info[role].append(span.text.strip())`
			`else:`
			`for span in doc.xpath('//span[@class="author notFaded"]'):`
			`author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]`
dont include editors in authors 2016-01-05 07:46:30 +00:00			`role = get_role(author[-1])`
inital google books and amazon parser 2016-01-05 07:28:30 +00:00			`if not role in info: info[role] = []`
			`info[role].append(author[0])`

			`covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]`
			`covers = json.loads(decode_html(covers))`
			`last = [0,0]`
			`for url in covers:`
			`if covers[url] > last:`
			`last = covers[url]`
			`info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)`
			`return info`
dont include editors in authors 2016-01-05 07:46:30 +00:00
			`def get_role(value):`
			`if 'Translator' in value:`
			`role = 'translator'`
			`elif 'Editor' in value:`
			`role = 'editor'`
			`else:`
			`role = 'author'`
			`return role`