# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
#
# Recovered from the git patch "add amazon"
# (commit bcd0c528fd8e4a36b7a0398d1114af814ce9c705,
#  From: j <0x006A@0x2620.org>, Date: Tue, 26 Oct 2010 19:33:32 +0200),
# which creates ox/web/amazon.py.
#
# NOTE(review): the copy of the patch this file was recovered from had gone
# through an HTML-to-text conversion that collapsed the line breaks and
# removed everything that looked like an HTML tag inside the regex literals.
# Every pattern marked "reconstructed" below is a best guess derived from the
# shape of what was left over -- confirm each one against the upstream file
# before relying on it.
"""Scrape book metadata from amazon.com search and product pages."""
import re
from urllib import quote

from ox import findRe, stripTags, decodeHtml
from ox.cache import readUrlUnicode


_SEARCH_URL = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords="
_PRODUCT_URL = "http://www.amazon.com/title/dp/%s/"

# Absolute product links on a search result page (pattern survived intact).
_PRODUCT_LINK_RE = re.compile(r'href="(http://www.amazon.com/.*?/dp/.*?)"')

# NOTE(review): reconstructed -- tag/attribute names were lost in extraction.
_TITLE_PATTERN = r'<span id="btAsinTitle"[^>]*>(.*?)<span'
_AUTHORS_RE = re.compile(r'<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL)
# NOTE(review): reconstructed -- the closing tag before " (Translator)" is assumed.
_TRANSLATOR_RE = re.compile(r'>(.*?)</a> \(Translator\)')
# NOTE(review): reconstructed -- "<li><b>Key:</b>value</li>" detail rows.
_DETAIL_PATTERN = r'<li><b>%s:</b>(.*?)</li>'
_DIMENSIONS_PATTERN = r'<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>'
# NOTE(review): reconstructed -- heading/wrapper class names are assumed.
_SECTION_PATTERN = (r'<h3 class="productDescriptionSource">%s</h3>'
                    r'.*?<div class="productDescriptionWrapper">(.*?)</div>')
_LINE_BREAK = '<br />'  # NOTE(review): reconstructed

# Cover image tag on the product page (pattern survived intact).
_COVER_RE = re.compile(r'src="(.*?)" id="prodImage"')


def findISBN(title, author):
    """Search amazon.com books for *title* / *author* and return its metadata.

    The first product link on the search result page is followed and parsed
    with getData().  The parsed record is returned only when *author* is one
    of the authors listed on that product page; in every other case (no
    search hit, no product id in the link, author mismatch) an empty dict is
    returned.
    """
    query = '%s %s' % (title, author)
    # Only the second literal is %-formatted; the first one contains a
    # literal "%3D" that must not be treated as a format directive.
    url = _SEARCH_URL + "%s&x=0&y=0" % quote(query)
    data = readUrlUnicode(url)

    links = _PRODUCT_LINK_RE.findall(data)
    if not links:
        # Previously this indexed [0] unconditionally and raised IndexError.
        return {}

    # Presumably findRe returns '' when nothing matches -- verify in ox.
    product_id = findRe(links[0], '/dp/(.*?)/')
    if not product_id:
        return {}

    book = getData(product_id)
    if author in book['authors']:
        return book
    return {}


def getData(id):
    """Fetch the amazon.com product page for *id* and parse it into a dict.

    Keys always present: 'amazon' (the product URL), 'title', 'authors'
    (list), 'publisher', 'language', 'isbn-10', 'isbn-13' (dashes removed),
    'dimensions', 'pages', 'review' and 'description'.
    Keys present only when found: 'translator' (list) and 'cover' (image URL;
    omitted when the page shows Amazon's "no image available" placeholder).
    """
    url = _PRODUCT_URL % id
    data = readUrlUnicode(url)

    def findData(key):
        # Value of one "Key: value" row in the product details list.
        return findRe(data, _DETAIL_PATTERN % key).strip()

    def findSection(heading):
        # Plain-text body of a product description section.
        html = findRe(data, _SECTION_PATTERN % heading)
        return stripTags(html.replace(_LINE_BREAK, '\n')).strip()

    r = {}
    r['amazon'] = url
    r['title'] = findRe(data, _TITLE_PATTERN)

    authors = [decodeHtml(a) for a in _AUTHORS_RE.findall(data)]
    # Single-character (or empty) captures are dropped, as in the original.
    r['authors'] = [a for a in authors if len(a) > 1]

    translators = _TRANSLATOR_RE.findall(data)
    if translators:
        r['translator'] = translators

    r['publisher'] = findData('Publisher')
    r['language'] = findData('Language')
    r['isbn-10'] = findData('ISBN-10')
    r['isbn-13'] = findData('ISBN-13').replace('-', '')
    r['dimensions'] = findRe(data, _DIMENSIONS_PATTERN)

    # The page count is listed under the binding name.
    r['pages'] = findData('Paperback') or findData('Hardcover')

    r['review'] = findSection('Review')
    r['description'] = findSection('Product Description')

    covers = _COVER_RE.findall(data)
    if covers:
        # Drop the image-variant suffix to get the base image URL.
        cover = covers[0].split('._BO2')[0]
        if not cover.endswith('.jpg'):
            cover += '.jpg'
        if 'no-image-avail-img' not in cover:
            r['cover'] = cover
    return r