python-ox/ox/web/amazon.py

79 lines
2.8 KiB
Python
Raw Permalink Normal View History

2010-10-26 17:33:32 +00:00
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
2014-09-30 19:27:26 +00:00
from __future__ import print_function
2010-10-26 17:33:32 +00:00
import re
2023-07-27 11:07:13 +00:00
from urllib.parse import quote
2010-10-26 17:33:32 +00:00
from ox import find_re, strip_tags, decode_html
from ox.cache import read_url
2010-10-26 17:33:32 +00:00
2015-11-03 22:16:34 +00:00
import lxml.html
2014-04-02 23:34:15 +00:00
2010-10-26 17:33:32 +00:00
def findISBN(title, author):
    """Search Amazon books for *title* by *author* and return its metadata.

    Performs an Amazon book search, takes the first product result, scrapes
    its detail page via get_data(), and returns that dict only if *author*
    appears among the scraped authors. Returns {} when the search yields no
    product links or the author does not match.
    """
    q = '%s %s' % (title, author)
    url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
    data = read_url(url, unicode=True)
    # Product links in the result page look like
    # http://www.amazon.com/<slug>/dp/<asin>/ — the ASIN is the product id.
    links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
    if not links:
        # no search results — avoid IndexError on links[0]
        return {}
    asin = find_re(links[0], '/dp/(.*?)/')
    result = get_data(asin)
    if author in result['authors']:
        return result
    return {}
2012-08-15 15:15:40 +00:00
def get_data(id):
    """Scrape an Amazon book product page and return its metadata.

    id -- the Amazon ASIN / ISBN-10 used to build the /dp/ product URL.

    Returns a dict with keys: amazon (the URL), title, authors, publisher,
    language, isbn-10, isbn-13, dimensions, pages, review, and — when
    present in the page — translator, description, and cover. Missing
    fields scraped via find_re come back as empty strings.
    """
    url = "http://www.amazon.com/title/dp/%s/" % id
    data = read_url(url, unicode=True)

    def find_data(key):
        # product details are rendered as <li><b>Key:</b> value</li>
        return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()

    r = {}
    r['amazon'] = url
    r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
    r['authors'] = []
    doc = lxml.html.document_fromstring(data)
    for e in doc.xpath("//span[contains(@class, 'author')]"):
        for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
            # secondary.text is None when the span has no direct text node;
            # guard it so the membership test below cannot raise TypeError
            if not secondary.text:
                continue
            if 'Author' in secondary.text:
                author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
                if author:
                    r['authors'].append(author[0].text.strip())
                else:
                    r['authors'].append(e.xpath('.//a')[0].text.strip())
                break
            elif 'Translator' in secondary.text:
                r['translator'] = [e.xpath('.//a')[0].text]
                break
    r['publisher'] = find_data('Publisher')
    r['language'] = find_data('Language')
    r['isbn-10'] = find_data('ISBN-10')
    r['isbn-13'] = find_data('ISBN-13').replace('-', '')
    r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')

    # page count is listed under the binding type, so try both
    r['pages'] = find_data('Paperback')
    if not r['pages']:
        r['pages'] = find_data('Hardcover')
    r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()

    # the plain-text description lives in the first <div> inside <noscript>
    for e in doc.xpath('//noscript'):
        for c in e.getchildren():
            if c.tag == 'div':
                r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
                break

    r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
    if r['cover']:
        # strip the ._BO2… size/styling suffix to get the base image URL
        r['cover'] = r['cover'][0].split('._BO2')[0]
        if not r['cover'].endswith('.jpg'):
            r['cover'] = r['cover'] + '.jpg'
        if 'no-image-avail-img' in r['cover']:
            del r['cover']
    else:
        del r['cover']
    return r