from ox.cache import read_url
import re
import lxml.html


def get_data(id):
    """Look up a book on AbeBooks by ISBN and return its bibliographic fields.

    Fetches the abebooks.com search results for ``id``, follows the first
    ``/servlet/BookDetailsPL`` link found there and collects every element
    of the details page whose ``id`` attribute contains ``biblio``.

    Args:
        id: ISBN to search for. It is interpolated into the search URL
            as-is. (The name shadows the builtin but is kept so existing
            keyword callers keep working.)

    Returns:
        dict mapping each element id, with the ``biblio-`` prefix removed,
        to that element's text with surrounding whitespace stripped.
        ``bookcondition`` and ``binding`` are skipped, as are elements
        whose text is empty or whitespace only. The dict is empty when the
        search results contain no details link.
    """
    info = {}
    base = 'http://www.abebooks.com'
    url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
    data = read_url(url)

    # Only the first details link is used, so stop at the first match
    # instead of collecting every link on the page.
    match = re.search('href="(/servlet/BookDetailsPL[^"]+)"', data)
    if match is None:
        return info

    details = '%s%s' % (base, match.group(1))
    doc = lxml.html.document_fromstring(read_url(details))
    for e in doc.xpath("//*[contains(@id, 'biblio')]"):
        key = e.attrib['id'].replace('biblio-', '')
        # text_content() returns the raw text including the page's layout
        # whitespace; strip it so whitespace-only elements are dropped and
        # stored values carry no leading/trailing padding.
        value = e.text_content().strip()
        if value and key not in ('bookcondition', 'binding'):
            info[key] = value
    return info