az fixes
This commit is contained in:
parent
e0c2ccbb24
commit
6211253675
1 changed files with 28 additions and 6 deletions
|
@ -27,8 +27,13 @@ def info(key, value):
|
|||
info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
|
||||
info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
|
||||
info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
|
||||
info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
|
||||
info['description'] = fix_bad_unicode(info['description'])
|
||||
d = re.compile('encodedDescription\' : "(.*?)",').findall(data)
|
||||
if d:
|
||||
info['description'] = strip_tags(decode_html(unquote(d[0])))
|
||||
info['description'] = fix_bad_unicode(info['description'])
|
||||
else:
|
||||
info['description'] = ''
|
||||
|
||||
content = doc.xpath('//div[@class="content"]')[0]
|
||||
content_info = {}
|
||||
for li in content.xpath('.//li'):
|
||||
|
@ -47,7 +52,6 @@ def info(key, value):
|
|||
|
||||
if 'ISBN-13' in content_info:
|
||||
info['isbn'] = content_info['ISBN-13'].replace('-', '')
|
||||
info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
|
||||
elif 'ISBN-10' in content_info:
|
||||
info['isbn'] = stdnum.isbn.to_isbn13(content_info['ISBN-10'])
|
||||
|
||||
|
@ -56,24 +60,42 @@ def info(key, value):
|
|||
for span in a:
|
||||
r = span.getchildren()[0].text.strip()
|
||||
role = get_role(r)
|
||||
if not role in info: info[role] = []
|
||||
if role not in info:
|
||||
info[role] = []
|
||||
info[role].append(span.text.strip())
|
||||
else:
|
||||
for span in doc.xpath('//span[@class="author notFaded"]'):
|
||||
author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
|
||||
role = get_role(author[-1])
|
||||
if not role in info: info[role] = []
|
||||
if role not in info:
|
||||
info[role] = []
|
||||
info[role].append(author[0])
|
||||
|
||||
covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
|
||||
covers = json.loads(decode_html(covers))
|
||||
last = [0,0]
|
||||
last = [0, 0]
|
||||
for url in covers:
|
||||
if covers[url] > last:
|
||||
last = covers[url]
|
||||
info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
|
||||
return info
|
||||
|
||||
def get_price(asin, currency='EUR'):
    """Return the first parseable price on an Amazon product page.

    The page is requested from amazon.de when ``currency`` is ``'EUR'``
    and from amazon.com for any other value. Every ``<span>`` whose class
    contains ``a-color-price`` is treated as a price candidate.

    Parameters:
        asin: product identifier, appended to the ``/dp/`` URL as-is.
        currency: ``'EUR'`` (default) selects amazon.de and the
            ``EUR 12,34`` text format; anything else selects amazon.com
            and the ``$12.34`` format.

    Returns:
        The price as a ``float``, or ``None`` when no candidate span
        contains a parseable number.
    """
    if currency == 'EUR':
        url = 'http://www.amazon.de/dp/' + asin
    else:
        url = 'http://www.amazon.com/dp/' + asin
    # NOTE(review): read_url is defined elsewhere; presumably it returns the
    # response body as bytes, since the result is decoded here -- confirm.
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    for span in doc.xpath("//span[contains(@class, 'a-color-price')]"):
        text = span.text_content().strip()
        if currency == 'EUR':
            text = text.replace('EUR ', '')
            if ',' in text:
                # Decimal comma present: any '.' is a thousands separator
                # ("1.234,56" -> "1234,56") and must go before the swap.
                text = text.replace('.', '')
            text = text.replace(',', '.')
        else:
            # In the dollar format ',' only ever groups thousands.
            text = text.replace('$', '').replace(',', '').strip()
        try:
            return float(text)
        except ValueError:
            # This span is not a bare number (e.g. a label or a price
            # range); try the next candidate rather than aborting.
            continue
    return None
|
||||
|
||||
def get_role(value):
|
||||
if 'Translator' in value:
|
||||
role = 'translator'
|
||||
|
|
Loading…
Reference in a new issue