fix amazon parser

This commit is contained in:
j 2014-04-03 01:34:15 +02:00
parent cc72dc96d3
commit 9c844d0ce7

View file

@@ -6,6 +6,8 @@ from urllib import quote
from ox import find_re, strip_tags, decode_html from ox import find_re, strip_tags, decode_html
from ox.cache import read_url from ox.cache import read_url
import lxml
def findISBN(title, author): def findISBN(title, author):
q = '%s %s' % (title, author) q = '%s %s' % (title, author)
@@ -28,12 +30,22 @@ def get_data(id):
r = {} r = {}
r['amazon'] = url r['amazon'] = url
r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span') r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data) r['authors'] = []
r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']]) doc = lxml.html.document_fromstring(data)
t = re.compile('>(.*?)</a> \(Translator\)').findall(data) for e in doc.xpath("//span[contains(@class, 'author')]"):
if t: print e
r['translator'] = t for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
if 'Author' in secondary.text:
author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
if author:
r['authors'].append(author[0].text.strip())
else:
r['authors'].append(e.xpath('.//a')[0].text.strip())
break
elif 'Translator' in secondary.text:
r['translator'] = [e.xpath('.//a')[0].text]
break
r['publisher'] = find_data('Publisher') r['publisher'] = find_data('Publisher')
r['language'] = find_data('Language') r['language'] = find_data('Language')
r['isbn-10'] = find_data('ISBN-10') r['isbn-10'] = find_data('ISBN-10')
@@ -46,7 +58,11 @@ def get_data(id):
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip() r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip() for e in doc.xpath('//noscript'):
for c in e.getchildren():
if c.tag == 'div':
r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
break
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data) r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
if r['cover']: if r['cover']: