Fix Amazon parser (title, authors, translator, and description extraction)
This commit is contained in:
parent
cc72dc96d3
commit
9c844d0ce7
1 changed file with 23 additions and 7 deletions
|
@ -6,6 +6,8 @@ from urllib import quote
|
||||||
from ox import find_re, strip_tags, decode_html
|
from ox import find_re, strip_tags, decode_html
|
||||||
from ox.cache import read_url
|
from ox.cache import read_url
|
||||||
|
|
||||||
|
import lxml
|
||||||
|
|
||||||
|
|
||||||
def findISBN(title, author):
|
def findISBN(title, author):
|
||||||
q = '%s %s' % (title, author)
|
q = '%s %s' % (title, author)
|
||||||
|
@ -28,12 +30,22 @@ def get_data(id):
|
||||||
|
|
||||||
r = {}
|
r = {}
|
||||||
r['amazon'] = url
|
r['amazon'] = url
|
||||||
r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')
|
r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
|
||||||
r['authors'] = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
|
r['authors'] = []
|
||||||
r['authors'] = filter(lambda x: len(x)>1, [decode_html(a) for a in r['authors']])
|
doc = lxml.html.document_fromstring(data)
|
||||||
t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
|
for e in doc.xpath("//span[contains(@class, 'author')]"):
|
||||||
if t:
|
print e
|
||||||
r['translator'] = t
|
for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
|
||||||
|
if 'Author' in secondary.text:
|
||||||
|
author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
|
||||||
|
if author:
|
||||||
|
r['authors'].append(author[0].text.strip())
|
||||||
|
else:
|
||||||
|
r['authors'].append(e.xpath('.//a')[0].text.strip())
|
||||||
|
break
|
||||||
|
elif 'Translator' in secondary.text:
|
||||||
|
r['translator'] = [e.xpath('.//a')[0].text]
|
||||||
|
break
|
||||||
r['publisher'] = find_data('Publisher')
|
r['publisher'] = find_data('Publisher')
|
||||||
r['language'] = find_data('Language')
|
r['language'] = find_data('Language')
|
||||||
r['isbn-10'] = find_data('ISBN-10')
|
r['isbn-10'] = find_data('ISBN-10')
|
||||||
|
@ -46,7 +58,11 @@ def get_data(id):
|
||||||
|
|
||||||
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||||
|
|
||||||
r['description'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
for e in doc.xpath('//noscript'):
|
||||||
|
for c in e.getchildren():
|
||||||
|
if c.tag == 'div':
|
||||||
|
r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
|
||||||
|
break
|
||||||
|
|
||||||
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
|
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
|
||||||
if r['cover']:
|
if r['cover']:
|
||||||
|
|
Loading…
Reference in a new issue