From 9c844d0ce755826914c56ffc3e869b11a38dbab8 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Thu, 3 Apr 2014 01:34:15 +0200 Subject: [PATCH] fix amazon parser --- ox/web/amazon.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/ox/web/amazon.py b/ox/web/amazon.py index 64289c8..c4490ff 100644 --- a/ox/web/amazon.py +++ b/ox/web/amazon.py @@ -6,6 +6,8 @@ from urllib import quote from ox import find_re, strip_tags, decode_html from ox.cache import read_url +import lxml + def findISBN(title, author): q = '%s %s' % (title, author) @@ -28,12 +30,22 @@ def get_data(id): r = {} r['amazon'] = url - r['title'] = find_re(data, '(.*?)(.*?).*?\(Author\)', re.DOTALL).findall(data) - r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']]) - t = re.compile('>(.*?) \(Translator\)').findall(data) - if t: - r['translator'] = t + r['title'] = find_re(data, '(.*?)') + r['authors'] = [] + doc = lxml.html.document_fromstring(data) + for e in doc.xpath("//span[contains(@class, 'author')]"): + print e + for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"): + if 'Author' in secondary.text: + author = e.xpath(".//span[contains(@class, 'a-size-medium')]") + if author: + r['authors'].append(author[0].text.strip()) + else: + r['authors'].append(e.xpath('.//a')[0].text.strip()) + break + elif 'Translator' in secondary.text: + r['translator'] = [e.xpath('.//a')[0].text] + break r['publisher'] = find_data('Publisher') r['language'] = find_data('Language') r['isbn-10'] = find_data('ISBN-10') @@ -46,7 +58,11 @@ def get_data(id): r['review'] = strip_tags(find_re(data, '

Review

.*?
(.*?)
').replace('
', '\n')).strip() - r['description'] = strip_tags(find_re(data, '

Product Description

.*?
(.*?)
').replace('
', '\n')).strip() + for e in doc.xpath('//noscript'): + for c in e.getchildren(): + if c.tag == 'div': + r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip() + break r['cover'] = re.findall('src="(.*?)" id="prodImage"', data) if r['cover']: