Fix Amazon parser
This commit is contained in:
parent
cc72dc96d3
commit
9c844d0ce7
1 changed file with 23 additions and 7 deletions
|
@ -6,6 +6,8 @@ from urllib import quote
|
|||
from ox import find_re, strip_tags, decode_html
|
||||
from ox.cache import read_url
|
||||
|
||||
import lxml
|
||||
|
||||
|
||||
def findISBN(title, author):
|
||||
q = '%s %s' % (title, author)
|
||||
|
@ -28,12 +30,22 @@ def get_data(id):
|
|||
|
||||
r = {}
|
||||
r['amazon'] = url
|
||||
r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')
|
||||
r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
|
||||
r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']])
|
||||
t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
|
||||
if t:
|
||||
r['translator'] = t
|
||||
r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
|
||||
r['authors'] = []
|
||||
doc = lxml.html.document_fromstring(data)
|
||||
for e in doc.xpath("//span[contains(@class, 'author')]"):
|
||||
print e
|
||||
for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
|
||||
if 'Author' in secondary.text:
|
||||
author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
|
||||
if author:
|
||||
r['authors'].append(author[0].text.strip())
|
||||
else:
|
||||
r['authors'].append(e.xpath('.//a')[0].text.strip())
|
||||
break
|
||||
elif 'Translator' in secondary.text:
|
||||
r['translator'] = [e.xpath('.//a')[0].text]
|
||||
break
|
||||
r['publisher'] = find_data('Publisher')
|
||||
r['language'] = find_data('Language')
|
||||
r['isbn-10'] = find_data('ISBN-10')
|
||||
|
@ -46,7 +58,11 @@ def get_data(id):
|
|||
|
||||
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||
|
||||
r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||
for e in doc.xpath('//noscript'):
|
||||
for c in e.getchildren():
|
||||
if c.tag == 'div':
|
||||
r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
|
||||
break
|
||||
|
||||
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
|
||||
if r['cover']:
|
||||
|
|
Loading…
Reference in a new issue