2010-10-26 17:33:32 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
|
|
import re
|
|
|
|
from urllib import quote
|
|
|
|
|
2012-08-14 14:12:43 +00:00
|
|
|
from ox import find_re, strip_tags, decode_html
|
2012-08-14 13:58:05 +00:00
|
|
|
from ox.cache import read_url
|
2010-10-26 17:33:32 +00:00
|
|
|
|
|
|
|
|
|
|
|
def findISBN(title, author):
    """Search Amazon's book listings and return data for the first hit.

    Builds a keyword search from ``title`` and ``author``, takes the first
    product link on the result page, and loads its details via ``get_data``.

    Returns the dict produced by ``get_data`` when ``author`` is one of the
    listed authors of that product. Returns an empty dict when the search
    yields no product link, when no product id can be extracted from the
    link, or when the author does not match.
    """
    q = '%s %s' % (title, author)
    # Only the second literal is %-formatted (% binds tighter than +), so the
    # literal "%3D" in the first part is left untouched.
    url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
    data = read_url(url, unicode=True)
    links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
    if not links:
        # No product links on the result page: report "not found" the same
        # way as an author mismatch instead of raising IndexError.
        return {}
    asin = find_re(links[0], '/dp/(.*?)/')
    if not asin:
        # The link did not contain a "/dp/<id>/" segment; fetching a product
        # page with a blank id would be pointless.
        return {}
    data = get_data(asin)
    if author in data['authors']:
        return data
    return {}
|
|
|
|
|
2012-08-15 15:15:40 +00:00
|
|
|
def get_data(id):
    """Fetch an Amazon product page and extract book metadata from its HTML.

    ``id`` is interpolated into ``http://www.amazon.com/title/dp/<id>/``.

    Returns a dict with the keys ``amazon`` (the page URL), ``title``,
    ``authors``, ``publisher``, ``language``, ``isbn-10``, ``isbn-13``
    (dashes removed), ``dimensions``, ``pages``, ``review`` and
    ``description``. ``translator`` is present only when a translator
    credit is found, and ``cover`` only when a usable cover image URL is
    found.
    """
    url = "http://www.amazon.com/title/dp/%s/" % id
    data = read_url(url, unicode=True)

    def detail(key):
        # Value of a "<li><b>key:</b>value</li>" product-details entry.
        match = find_re(data, '<li><b>%s:</b>(.*?)</li>'% key)
        return match.strip()

    def section_text(pattern):
        # Captured HTML fragment as plain text, <br /> turned into newlines.
        fragment = find_re(data, pattern)
        fragment = fragment.replace('<br />', '\n')
        return strip_tags(fragment).strip()

    r = {'amazon': url}
    r['title'] = find_re(data, '<span id="btAsinTitle" style="">(.*?)<span')

    author_re = re.compile('<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL)
    decoded = [decode_html(name) for name in author_re.findall(data)]
    # Drop empty / single-character leftovers.
    r['authors'] = filter(lambda name: len(name)>1, decoded)

    translators = re.compile('>(.*?)</a> \(Translator\)').findall(data)
    if translators:
        r['translator'] = translators

    r['publisher'] = detail('Publisher')
    r['language'] = detail('Language')
    r['isbn-10'] = detail('ISBN-10')
    r['isbn-13'] = detail('ISBN-13').replace('-', '')
    r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')

    # The page count is listed under the binding name; try paperback first.
    pages = detail('Paperback')
    if not pages:
        pages = detail('Hardcover')
    r['pages'] = pages

    r['review'] = section_text('<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>')
    r['description'] = section_text('<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>')

    # Only record a cover when an image is present and it is not the
    # "no image available" placeholder.
    covers = re.findall('src="(.*?)" id="prodImage"', data)
    if covers:
        cover = covers[0].split('._BO2')[0]
        if not cover.endswith('.jpg'):
            cover = cover + '.jpg'
        if 'no-image-avail-img' not in cover:
            r['cover'] = cover
    return r
|
|
|
|
|