This commit is contained in:
j 2017-07-30 11:37:44 +02:00
parent e0c2ccbb24
commit 6211253675

View file

@ -27,8 +27,13 @@ def info(key, value):
info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text)) info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title']) info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title']) info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0]))) d = re.compile('encodedDescription\' : "(.*?)",').findall(data)
if d:
info['description'] = strip_tags(decode_html(unquote(d[0])))
info['description'] = fix_bad_unicode(info['description']) info['description'] = fix_bad_unicode(info['description'])
else:
info['description'] = ''
content = doc.xpath('//div[@class="content"]')[0] content = doc.xpath('//div[@class="content"]')[0]
content_info = {} content_info = {}
for li in content.xpath('.//li'): for li in content.xpath('.//li'):
@ -47,7 +52,6 @@ def info(key, value):
if 'ISBN-13' in content_info: if 'ISBN-13' in content_info:
info['isbn'] = content_info['ISBN-13'].replace('-', '') info['isbn'] = content_info['ISBN-13'].replace('-', '')
info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
elif 'ISBN-10' in content_info: elif 'ISBN-10' in content_info:
info['isbn'] = stdnum.isbn.to_isbn13(content_info['ISBN-10']) info['isbn'] = stdnum.isbn.to_isbn13(content_info['ISBN-10'])
@ -56,24 +60,42 @@ def info(key, value):
for span in a: for span in a:
r = span.getchildren()[0].text.strip() r = span.getchildren()[0].text.strip()
role = get_role(r) role = get_role(r)
if not role in info: info[role] = [] if role not in info:
info[role] = []
info[role].append(span.text.strip()) info[role].append(span.text.strip())
else: else:
for span in doc.xpath('//span[@class="author notFaded"]'): for span in doc.xpath('//span[@class="author notFaded"]'):
author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()] author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
role = get_role(author[-1]) role = get_role(author[-1])
if not role in info: info[role] = [] if role not in info:
info[role] = []
info[role].append(author[0]) info[role].append(author[0])
covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0] covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
covers = json.loads(decode_html(covers)) covers = json.loads(decode_html(covers))
last = [0,0] last = [0, 0]
for url in covers: for url in covers:
if covers[url] > last: if covers[url] > last:
last = covers[url] last = covers[url]
info['cover'] = re.sub('(\._SX.+?_\.)', '.', url) info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
return info return info
def _parse_price(text, currency):
    # Convert a single price label such as 'EUR 1.234,56' or '$1,234.56'
    # into a float; raises ValueError when the label is not numeric.
    text = text.strip()
    if currency == 'EUR':
        text = text.replace('EUR ', '').strip()
        if ',' in text:
            # German notation: '.' groups thousands and ',' marks decimals.
            # Only drop the dots when a decimal comma is present, so a
            # plain '12.99' keeps parsing exactly as it did before.
            text = text.replace('.', '')
        text = text.replace(',', '.')
    else:
        # US notation: ',' groups thousands and must go before float().
        text = text.replace('$', '').replace(',', '').strip()
    return float(text)


def get_price(asin, currency='EUR'):
    """Return the first parseable price found on the Amazon page of *asin*.

    With ``currency='EUR'`` the page is fetched from amazon.de and prices
    are read in German notation ('EUR 12,99'); any other value fetches
    amazon.com and reads US dollar notation ('$12.99').

    Every ``<span>`` whose class contains ``a-color-price`` is tried in
    document order. Spans whose text is not a number are skipped instead
    of aborting the lookup.

    Returns the price as a float, or None when no span holds a price.
    """
    if currency == 'EUR':
        url = 'http://www.amazon.de/dp/' + asin
    else:
        url = 'http://www.amazon.com/dp/' + asin
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    for span in doc.xpath("//span[contains(@class, 'a-color-price')]"):
        try:
            return _parse_price(span.text_content(), currency)
        except ValueError:
            # This span matched the class but did not contain a number;
            # keep looking at the remaining candidates.
            continue
    return None
def get_role(value): def get_role(value):
if 'Translator' in value: if 'Translator' in value:
role = 'translator' role = 'translator'