az fixes

2017-07-30 11:37:44 +02:00 · 2017-07-30 11:37:44 +02:00 · 6211253675
commit 6211253675
parent e0c2ccbb24
1 changed files with 28 additions and 6 deletions
--- a/oml/meta/amazon.py
+++ b/oml/meta/amazon.py
@@ -27,8 +27,13 @@ def info(key, value):
    info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
    info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
    info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
-    info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
+    d = re.compile('encodedDescription\' : "(.*?)",').findall(data)
    if d:
        info['description'] = strip_tags(decode_html(unquote(d[0])))
        info['description'] = fix_bad_unicode(info['description'])
    else:
        info['description'] = ''
    content = doc.xpath('//div[@class="content"]')[0]
    content_info = {}
    for li in content.xpath('.//li'):
@@ -47,7 +52,6 @@ def info(key, value):
    if 'ISBN-13' in content_info:
        info['isbn'] = content_info['ISBN-13'].replace('-', '')
        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
    elif 'ISBN-10' in content_info:
        info['isbn'] = stdnum.isbn.to_isbn13(content_info['ISBN-10'])
@@ -56,24 +60,42 @@ def info(key, value):
        for span in a:
            r = span.getchildren()[0].text.strip()
            role = get_role(r)
-            if not role in info: info[role] = []
+            if role not in info:
                info[role] = []
            info[role].append(span.text.strip())
    else:
        for span in doc.xpath('//span[@class="author notFaded"]'):
            author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
            role = get_role(author[-1])
-            if not role in info: info[role] = []
+            if role not in info:
                info[role] = []
            info[role].append(author[0])
    covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
    covers = json.loads(decode_html(covers))
-    last = [0,0]
+    last = [0, 0]
    for url in covers:
        if covers[url] > last:
            last = covers[url]
            info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
    return info
 def get_price(asin, currency='EUR'):
    """Return the first parseable price found on the Amazon page for *asin*.

    The page URL is built on amazon.de when ``currency`` is ``'EUR'`` and
    on amazon.com for any other value, and is read through ``read_url``.
    Every ``<span>`` whose class contains ``a-color-price`` is examined in
    document order.

    Returns the price as a ``float``, or ``None`` when no matching span
    holds text that parses as a number.
    """
    if currency == 'EUR':
        url = 'http://www.amazon.de/dp/' + asin
    else:
        url = 'http://www.amazon.com/dp/' + asin
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    for span in doc.xpath("//span[contains(@class, 'a-color-price')]"):
        text = span.text_content().strip()
        if currency == 'EUR':
            # Drop the currency prefix and turn the decimal comma into a point.
            text = text.replace('EUR ', '').replace(',', '.')
        else:
            text = text.replace('$', '').strip()
        try:
            return float(text)
        except ValueError:
            # This span is not a plain number; try the next match rather
            # than aborting the whole lookup.
            continue
    return None
 def get_role(value):
    if 'Translator' in value:
        role = 'translator'