# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
"""Look up book metadata by scraping amazon.com search and product pages."""
import re

try:
    from urllib import quote  # Python 2
except ImportError:
    from urllib.parse import quote  # Python 3

from ox import findRe, stripTags, decodeHtml
from ox.cache import readUrlUnicode

# NOTE(review): the HTML markup inside the scraping patterns of this module
# was lost when the file was extracted (tags were stripped out of the string
# literals).  The element names (li, b, h3, div, br, a, span) follow from
# what survived; the id/class attribute values are a best-effort
# reconstruction -- confirm them against the upstream file / a live product
# page before relying on them.

# Absolute link to a product page as it appears in the search results.
_PRODUCT_LINK = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"')


def findISBN(title, author):
    """Search amazon.com books for *title* and *author*.

    The first product link of the search result page is followed and its
    data is fetched with getData().

    Returns the dict produced by getData() if *author* is one of the
    authors listed on that product page, otherwise an empty dict (also
    when the search yields no product link at all).
    """
    q = '%s %s' % (title, author)
    # quote() on Python 2 raises KeyError for non-ASCII unicode input, so
    # hand it UTF-8 encoded bytes instead.
    if not isinstance(q, bytes):
        q = q.encode('utf-8')
    url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
    data = readUrlUnicode(url)

    links = _PRODUCT_LINK.findall(data)
    if not links:
        # No hit: previously this raised IndexError on links[0].
        return {}
    product_id = findRe(links[0], '/dp/(.*?)/')
    if not product_id:
        return {}

    data = getData(product_id)
    if author in data['authors']:
        return data
    return {}


def _findSection(data, heading):
    """Return the plain text of the product description section *heading*."""
    # NOTE(review): class names reconstructed, see module-level note.
    html = findRe(data, '<h3 class="productDescriptionSource">%s</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>' % heading)
    # Keep line breaks as newlines before the remaining tags are removed.
    return stripTags(html.replace('<br />', '\n')).strip()


def getData(id):
    """Fetch the amazon.com product page for *id* and extract book data.

    *id* is the path component that follows ``/dp/`` in a product URL (the
    parameter name shadows the builtin but is kept for existing callers).

    Returns a dict with the keys 'amazon' (page URL), 'title', 'authors'
    (list), 'publisher', 'language', 'isbn-10', 'isbn-13' (dashes removed),
    'dimensions', 'pages', 'review' and 'description'.  'translator' (list)
    is only present if a translator is found, and 'cover' (image URL) only
    if the page has a product image that is not the "no image" placeholder.
    """
    url = "http://www.amazon.com/title/dp/%s/" % id
    data = readUrlUnicode(url)

    def findData(key):
        # Entries of the product details list look like "<li><b>Key:</b> value</li>".
        return findRe(data, '<li><b>%s:</b>(.*?)</li>' % key).strip()

    r = {}
    r['amazon'] = url
    # NOTE(review): attribute values in the next two patterns are
    # reconstructed, see module-level note.
    r['title'] = findRe(data, '<span id="btAsinTitle" style="">(.*?)<span')
    authors = re.compile(r'<b class="h3color">(.*?)</b>.*?\(Author\)', re.DOTALL).findall(data)
    # Decode entities and drop matches that are empty or a single character.
    r['authors'] = [a for a in (decodeHtml(name) for name in authors) if len(a) > 1]
    translators = re.compile(r'>(.*?)</a> \(Translator\)').findall(data)
    if translators:
        r['translator'] = translators

    r['publisher'] = findData('Publisher')
    r['language'] = findData('Language')
    r['isbn-10'] = findData('ISBN-10')
    r['isbn-13'] = findData('ISBN-13').replace('-', '')
    r['dimensions'] = findRe(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')

    # The page count is listed under the binding type.
    r['pages'] = findData('Paperback')
    if not r['pages']:
        r['pages'] = findData('Hardcover')

    r['review'] = _findSection(data, 'Review')
    r['description'] = _findSection(data, 'Product Description')

    covers = re.findall('src="(.*?)" id="prodImage"', data)
    if covers:
        # Cut the image URL at the '._BO2' marker and make sure it still
        # ends in '.jpg'.
        cover = covers[0].split('._BO2')[0]
        if not cover.endswith('.jpg'):
            cover = cover + '.jpg'
        # Skip amazon's "no image available" placeholder.
        if 'no-image-avail-img' not in cover:
            r['cover'] = cover
    return r