openmedialibrary/oml/meta/abebooks.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division

from ox.cache import read_url
import re
import lxml.html

import logging
# Module-level logger; records are emitted under the 'meta.abebooks' name.
logger = logging.getLogger('meta.abebooks')

# Scheme and host that every AbeBooks URL built in this module is prefixed with.
base = 'http://www.abebooks.com'

def get_ids(key, value):
    """Return ``[(key, value)]`` if an AbeBooks ISBN search finds a book.

    Only ``key == 'isbn'`` triggers a search; for any other key the
    function returns an empty list without calling ``read_url``.

    :param key: identifier type; only ``'isbn'`` is handled.
    :param value: identifier value, interpolated into the search URL.
    :returns: ``[(key, value)]`` when the search result page contains at
        least one ``/servlet/BookDetailsPL`` link, otherwise ``[]``.
    """
    ids = []
    if key == 'isbn':
        # The ISBN to search for is ``value``. There is no local ``id`` in
        # this function, so using that name would format the builtin
        # function object into the query string.
        url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, value)
        data = read_url(url)
        # A link to a book details page means the ISBN is known to the site.
        urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
        if urls:
            ids.append((key, value))
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids

def lookup(id):
    """Scrape bibliographic fields for ``id`` from AbeBooks.

    Runs the ISBN search for ``id``, follows the first
    ``/servlet/BookDetailsPL`` link found on the result page, and collects
    the text of every element whose ``id`` attribute contains ``biblio``.

    :param id: value placed in the ``isbn`` query parameter
        (presumably an ISBN string -- confirm against callers).
    :returns: dict mapping field name (the element id with ``biblio-``
        removed, ``pubdate`` renamed to ``date``) to its stripped text.
        Empty when the search page has no details link.
    """
    logger.debug('lookup %s', id)
    result = {}
    search_url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
    link_pattern = re.compile('href="(/servlet/BookDetailsPL[^"]+)"')
    detail_links = link_pattern.findall(read_url(search_url))
    if not detail_links:
        return result

    # Page field name -> key used in the returned dict.
    renamed = {
        'pubdate': 'date'
    }
    # Text that is only the field's label, i.e. the field carries no value.
    label_only = {
        'date': 'Publication Date:',
        'publisher': 'Publisher:',
    }
    # Page fields that are never copied into the result.
    ignored = ('bookcondition', 'binding', 'edition-amz')

    detail_page = read_url('%s%s' % (base, detail_links[0]))
    tree = lxml.html.document_fromstring(detail_page)
    for node in tree.xpath("//*[contains(@id, 'biblio')]"):
        field = node.attrib['id'].replace('biblio-', '')
        text = node.text_content().strip()
        target = renamed.get(field, field)
        if label_only.get(target) == text:
            text = ''
        if text and field not in ignored:
            result[target] = text
    return result
use python logging 2014-05-17 14:26:59 +00:00			`# -- coding: utf-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`from __future__ import division`

meta 2014-05-14 09:57:11 +00:00			`from ox.cache import read_url`
			`import re`
			`import lxml.html`

use python logging 2014-05-17 14:26:59 +00:00			`import logging`
			`logger = logging.getLogger('meta.abebooks')`

import/lists/autocompleteFolder 2014-05-18 23:24:04 +00:00			`base = 'http://www.abebooks.com'`

meta 2014-05-14 09:57:11 +00:00			`def get_ids(key, value):`
			`ids = []`
lots of stuff 2014-05-21 00:02:21 +00:00			`if key == 'isbn':`
meta 2014-05-14 09:57:11 +00:00			`url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)`
			`data = read_url(url)`
			`urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)`
			`if urls:`
			`ids.append((key, value))`
			`if ids:`
use python logging 2014-05-17 14:26:59 +00:00			`logger.debug('get_ids %s %s => %s', key, value, ids)`
meta 2014-05-14 09:57:11 +00:00			`return ids`

			`def lookup(id):`
use python logging 2014-05-17 14:26:59 +00:00			`logger.debug('lookup %s', id)`
import/lists/autocompleteFolder 2014-05-18 23:24:04 +00:00			`data = {}`
meta 2014-05-14 09:57:11 +00:00			`url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)`
import/lists/autocompleteFolder 2014-05-18 23:24:04 +00:00			`html = read_url(url)`
			`urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(html)`
			`keys = {`
			`'pubdate': 'date'`
			`}`
meta 2014-05-14 09:57:11 +00:00			`if urls:`
			`details = '%s%s' % (base, urls[0])`
import/lists/autocompleteFolder 2014-05-18 23:24:04 +00:00			`html = read_url(details)`
			`doc = lxml.html.document_fromstring(html)`
meta 2014-05-14 09:57:11 +00:00			`for e in doc.xpath("//*[contains(@id, 'biblio')]"):`
			`key = e.attrib['id'].replace('biblio-', '')`
cleanup meta parser 2014-05-26 08:23:10 +00:00			`value = e.text_content().strip()`
			`k = keys.get(key, key)`
			`if k == 'date' and value == 'Publication Date:':`
			`value = ''`
			`elif k == 'publisher' and value == 'Publisher:':`
			`value = ''`
import/lists/autocompleteFolder 2014-05-18 23:24:04 +00:00			`if value and key not in ('bookcondition', 'binding', 'edition-amz'):`
cleanup meta parser 2014-05-26 08:23:10 +00:00			`data[k] = value`
import/lists/autocompleteFolder 2014-05-18 23:24:04 +00:00			`return data`