From 9c844d0ce755826914c56ffc3e869b11a38dbab8 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Thu, 3 Apr 2014 01:34:15 +0200
Subject: [PATCH] fix amazon parser

---
 ox/web/amazon.py | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)
diff --git a/ox/web/amazon.py b/ox/web/amazon.py
index 64289c8..c4490ff 100644
--- a/ox/web/amazon.py
+++ b/ox/web/amazon.py
@@ -6,6 +6,8 @@ from urllib import quote
 from ox import find_re, strip_tags, decode_html
 from ox.cache import read_url
 
+import lxml.html
+
 
 def findISBN(title, author):
     q = '%s %s' % (title, author)
@@ -28,12 +30,22 @@ def get_data(id):
 
     r = {}
     r['amazon'] = url
-    r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')
-    r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
-    r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']])
-    t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
-    if t:
-        r['translator'] = t
+    r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
+    r['authors'] = []
+    doc = lxml.html.document_fromstring(data)
+    for e in doc.xpath("//span[contains(@class, 'author')]"):
+        for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
+            role = secondary.text or ''
+            if 'Author' in role:
+                author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
+                if author:
+                    r['authors'].append(author[0].text.strip())
+                else:
+                    r['authors'].append(e.xpath('.//a')[0].text.strip())
+                break
+            elif 'Translator' in role:
+                r['translator'] = [e.xpath('.//a')[0].text]
+                break
     r['publisher'] = find_data('Publisher')
     r['language'] = find_data('Language')
     r['isbn-10'] = find_data('ISBN-10')
@@ -46,7 +58,11 @@ def get_data(id):
 
     r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
 
-    r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+    r['description'] = ''
+    for e in doc.xpath('//noscript'):
+        # keep the first <div> child of each <noscript>; a later match overwrites
+        for c in e.xpath('./div[1]'):
+            r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
 
     r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
     if r['cover']: