# python-ox/ox/web/abebooks.py

from ox.cache import read_url
import re
import lxml.html

def get_data(id):
    """Collect bibliographic fields for a book from abebooks.com.

    The value of ``id`` is placed in the ``isbn`` query parameter of the
    AbeBooks search URL. The first ``/servlet/BookDetailsPL`` link found in
    the search results is then fetched and parsed with lxml.

    Returns a dict. For every element of the details page whose ``id``
    attribute contains ``biblio``, the key is that ``id`` with the
    ``biblio-`` prefix text removed and the value is the element's text
    content. Elements with empty text and the ``bookcondition`` and
    ``binding`` keys are left out. The dict is empty when the search page
    has no details link.
    """
    site = 'http://www.abebooks.com'
    skipped = ('bookcondition', 'binding')
    result = {}

    search_page = read_url('%s/servlet/SearchResults?isbn=%s&sts=t' % (site, id))
    hit = re.search('href="(/servlet/BookDetailsPL[^"]+)"', search_page)
    if hit is None:
        # No details link in the search results: nothing to scrape.
        return result

    # Only the first details link on the search page is followed.
    detail_page = read_url('%s%s' % (site, hit.group(1)))
    tree = lxml.html.document_fromstring(detail_page)
    for node in tree.xpath("//*[contains(@id, 'biblio')]"):
        field = node.attrib['id'].replace('biblio-', '')
        text = node.text_content()
        if not text or field in skipped:
            continue
        result[field] = text
    return result