fix amazon parser

2014-04-03 01:34:15 +02:00 · 2014-04-03 01:34:15 +02:00 · 9c844d0ce7
commit 9c844d0ce7
parent cc72dc96d3
1 changed file with 23 additions and 7 deletions
--- a/ox/web/amazon.py
+++ b/ox/web/amazon.py
@@ -6,6 +6,8 @@ from urllib import quote
 from ox import find_re, strip_tags, decode_html
 from ox.cache import read_url
 import lxml
 def findISBN(title, author):
    q = '%s %s' % (title, author)
@@ -28,12 +30,22 @@ def get_data(id):
    r = {}
    r['amazon'] = url
-    r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')
+    r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
-    r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
+    r['authors'] = []
-    r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']])
+    doc = lxml.html.document_fromstring(data)
-    t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
+    for e in doc.xpath("//span[contains(@class, 'author')]"):
-    if t:
+        print e
-        r['translator'] = t
+        for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
            if 'Author' in secondary.text:
                author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
                if author:
                    r['authors'].append(author[0].text.strip())
                else:
                    r['authors'].append(e.xpath('.//a')[0].text.strip())
                break
            elif 'Translator' in secondary.text:
                r['translator'] = [e.xpath('.//a')[0].text]
                break
    r['publisher'] = find_data('Publisher')
    r['language'] = find_data('Language')
    r['isbn-10'] = find_data('ISBN-10')
@@ -46,7 +58,11 @@ def get_data(id):
    r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
-    r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+    for e in doc.xpath('//noscript'):
        for c in e.getchildren():
            if c.tag == 'div':
                r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
                break
    r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
    if r['cover']: