2014-05-17 14:26:59 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
2014-09-02 22:32:44 +00:00
|
|
|
|
2014-05-17 14:26:59 +00:00
|
|
|
|
2014-05-14 09:57:11 +00:00
|
|
|
import re
|
2014-08-12 08:16:57 +00:00
|
|
|
|
|
|
|
from ox.cache import read_url
|
2014-05-14 09:57:11 +00:00
|
|
|
import lxml.html
|
|
|
|
|
2014-05-17 14:26:59 +00:00
|
|
|
import logging
|
|
|
|
# Module-level logger, registered under the 'meta.abebooks' name.
logger = logging.getLogger('meta.abebooks')
|
|
|
|
|
2014-05-18 23:24:04 +00:00
|
|
|
# Scheme and host prepended to every request path built in this module.
base = 'http://www.abebooks.com'
|
|
|
|
|
2014-05-14 09:57:11 +00:00
|
|
|
def get_ids(key, value):
    """Return ``[(key, value)]`` when the site search finds the given ISBN.

    Only ``key == 'isbn'`` triggers a lookup; every other key yields an
    empty list without any request being made.  For an ISBN, the search
    results page for ``value`` is fetched with ``read_url`` and the pair is
    reported only if that page links to at least one book details page.

    :param key: identifier type, e.g. ``'isbn'``
    :param value: identifier value to search for
    :returns: list that is either empty or holds the single ``(key, value)``
    """
    ids = []
    if key == 'isbn':
        # The query must carry the ISBN passed in as ``value``; the builtin
        # ``id`` was interpolated here before, so the search never matched
        # the requested book.
        url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, value)
        data = read_url(url, unicode=True)
        # A single details link is enough to confirm the ISBN is known.
        if re.search('href="(/servlet/BookDetailsPL[^"]+)"', data):
            ids.append((key, value))
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids
|
|
|
|
|
|
|
|
def lookup(id):
    """Collect bibliographic fields for the ISBN ``id`` from the site.

    The search results page for ``id`` is fetched first.  If it links to no
    book details page, an empty dict is returned.  Otherwise the first
    linked details page is fetched and every element whose ``id`` attribute
    contains ``biblio`` contributes one entry: the attribute value with
    ``biblio-`` removed is the key (``pubdate`` is stored as ``date``) and
    the stripped text content is the value.

    :param id: ISBN to look up
    :returns: dict mapping field names to their text values
    """
    logger.debug('lookup %s', id)
    search_url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
    listing = read_url(search_url, unicode=True)
    matches = re.findall('href="(/servlet/BookDetailsPL[^"]+)"', listing)
    if not matches:
        return {}

    # Site field names that are stored under a different key.
    renamed = {'pubdate': 'date'}
    # Text that is only the field's label, i.e. the field has no real value.
    label_only = {
        'date': 'Publication Date:',
        'publisher': 'Publisher:',
    }
    # Fields that are never copied into the result.
    ignored = ('bookcondition', 'binding', 'edition-amz')

    page = read_url('%s%s' % (base, matches[0]), unicode=True)
    tree = lxml.html.document_fromstring(page)
    result = {}
    for node in tree.xpath("//*[contains(@id, 'biblio')]"):
        raw_key = node.attrib['id'].replace('biblio-', '')
        text = node.text_content().strip()
        field = renamed.get(raw_key, raw_key)
        if not text or label_only.get(field) == text:
            continue
        if raw_key in ignored:
            continue
        result[field] = text
    return result
|