diff --git a/oml/changelog.py b/oml/changelog.py index 9712369..51d0d43 100644 --- a/oml/changelog.py +++ b/oml/changelog.py @@ -161,6 +161,9 @@ class Changelog(db.Model): def action_edititem(self, user, timestamp, itemid, meta): from item.models import Item i = Item.get(itemid) + if not i: + logger.debug('ignore edititem for unknown item %s %s', timestamp, itemid) + return True if i.timestamp > timestamp: logger.debug('ignore edititem change %s %s %s', timestamp, itemid, meta) return True diff --git a/oml/media/__init__.py b/oml/media/__init__.py index 8b1fcf4..3cc0b8f 100644 --- a/oml/media/__init__.py +++ b/oml/media/__init__.py @@ -47,9 +47,19 @@ def metadata(f): if 'isbn' in data: data['primaryid'] = ['isbn', data['isbn'][0]] + if 'author' in data: + if isinstance(data['author'], basestring): + data['author'] = data['author'].split('; ') + if data['author'] in (['Administrator'], ['Default'], ['user']): + del data['author'] if not 'title' in data: data['title'] = os.path.splitext(os.path.basename(f))[0] - if 'author' in data and isinstance(data['author'], basestring): - data['author'] = [data['author']] + if data['title'].startswith('Microsoft Word - '): + data['title'] = data['title'][len('Microsoft Word - '):] + for postfix in ('.doc', 'docx', '.qxd', '.indd'): + if data['title'].endswith(postfix): + data['title'] = data['title'][:-len(postfix)] + if not data['title'].strip(): + del data['title'] return data diff --git a/oml/meta/abebooks.py b/oml/meta/abebooks.py index 3090ed3..7f4b4ed 100644 --- a/oml/meta/abebooks.py +++ b/oml/meta/abebooks.py @@ -38,7 +38,12 @@ def lookup(id): doc = lxml.html.document_fromstring(html) for e in doc.xpath("//*[contains(@id, 'biblio')]"): key = e.attrib['id'].replace('biblio-', '') - value = e.text_content() + value = e.text_content().strip() + k = keys.get(key, key) + if k == 'date' and value == 'Publication Date:': + value = '' + elif k == 'publisher' and value == 'Publisher:': + value = '' if value and key not in ('bookcondition', 'binding', 'edition-amz'): - data[keys.get(key, key)] = value + data[k] = value return data diff --git a/oml/meta/lookupbyisbn.py b/oml/meta/lookupbyisbn.py index 00476ad..2bfaff4 100644 --- a/oml/meta/lookupbyisbn.py +++ b/oml/meta/lookupbyisbn.py @@ -50,6 +50,8 @@ def lookup(id): url = '%s/Lookup/Book/%s/%s/1' % (base, id, id) data = read_url(url).decode('utf-8') r["title"] = find_re(data, "

(.*?)

") + if r["title"] == 'Error!': + return {} keys = { 'author': 'Author(s)', 'publisher': 'Publisher', diff --git a/oml/meta/openlibrary.py b/oml/meta/openlibrary.py index 045948d..86ecd8c 100644 --- a/oml/meta/openlibrary.py +++ b/oml/meta/openlibrary.py @@ -3,8 +3,10 @@ from __future__ import division from urllib import urlencode -from ox.cache import read_url import json +from datetime import datetime + +from ox.cache import read_url from marc_countries import COUNTRIES from dewey import get_classification @@ -90,6 +92,17 @@ def get_type(obj): type_ = type_['key'] return type_ +def parse_date(s): + #"January 1, 1998" + for pattern, fmt in (('%B %d, %Y', '%Y-%m-%d'), ('%B %Y', '%Y-%m')): + try: + d = datetime.strptime(s, pattern) + s = d.strftime(fmt) + return s + except: + pass + return s + def format(info, return_all=False): data = {} if 'works' in info: @@ -120,6 +133,8 @@ def format(info, return_all=False): value = data[KEYS[key]] + value elif isinstance(value, list) and key not in ('publish_places', 'lccn', 'oclc_numbers'): value = value[0] + if key == 'publish_date': + value = parse_date(value) data[KEYS[key]] = value if 'subtitle' in info: data['title'] += ' ' + info['subtitle'] diff --git a/oml/meta/worldcat.py b/oml/meta/worldcat.py index d05a57b..bf7d9b8 100644 --- a/oml/meta/worldcat.py +++ b/oml/meta/worldcat.py @@ -75,9 +75,17 @@ def lookup(id): del data['cover'] if 'author' in data: - data['author'] = [data['author']] + data['author'] = data['author'].split('; ') if 'title' in data: data['title'] = data['title'].replace(' : ', ': ') + if 'publisher' in data: + m = re.compile('(.+) : (.+), (\d{4})').findall(data['publisher']) + if m: + place, publisher, date = m[0] + data['publisher'] = publisher + data['date'] = date + data['places'] = [place] + logger.debug('lookup %s => %s', id, data.keys()) return data