cleanup meta parser

2014-05-26 10:23:10 +02:00 · 2014-05-26 10:23:10 +02:00 · 67d1814192
commit 67d1814192
parent fc7b3ee049
6 changed files with 49 additions and 6 deletions
--- a/oml/changelog.py
+++ b/oml/changelog.py
@@ -161,6 +161,9 @@ class Changelog(db.Model):
    def action_edititem(self, user, timestamp, itemid, meta):
        from item.models import Item
        i = Item.get(itemid)
+        if not i:
+            logger.debug('ignore edititem for unknown item %s %s', timestamp, itemid)
+            return True
        if i.timestamp > timestamp:
            logger.debug('ignore edititem change %s %s %s', timestamp, itemid, meta)
            return True
--- a/oml/media/__init__.py
+++ b/oml/media/__init__.py
@@ -47,9 +47,19 @@ def metadata(f):

    if 'isbn' in data:
        data['primaryid'] = ['isbn', data['isbn'][0]]
+    if 'author' in data:
+        if isinstance(data['author'], basestring):
+            data['author'] = data['author'].split('; ')
+        if data['author'] in (['Administrator'], ['Default'], ['user']):
+            del data['author']
    if not 'title' in data:
        data['title'] = os.path.splitext(os.path.basename(f))[0]
-    if 'author' in data and isinstance(data['author'], basestring):
-        data['author'] = [data['author']]
+        if data['title'].startswith('Microsoft Word - '):
+            data['title'] = data['title'][len('Microsoft Word - '):]
+        for postfix in ('.doc', 'docx', '.qxd', '.indd'):
+            if data['title'].endswith(postfix):
+                data['title'] = data['title'][:-len(postfix)]
+        if not data['title'].strip():
+            del data['title']
    return data

--- a/oml/meta/abebooks.py
+++ b/oml/meta/abebooks.py
@@ -38,7 +38,12 @@ def lookup(id):
        doc = lxml.html.document_fromstring(html)
        for e in doc.xpath("//*[contains(@id, 'biblio')]"):
            key = e.attrib['id'].replace('biblio-', '')
-            value = e.text_content()
+            value = e.text_content().strip()
+            k = keys.get(key, key)
+            if k == 'date' and value == 'Publication Date:':
+                value = ''
+            elif k == 'publisher' and value == 'Publisher:':
+                value = ''
            if value and key not in ('bookcondition', 'binding', 'edition-amz'):
-                data[keys.get(key, key)] = value
+                data[k] = value
    return data
--- a/oml/meta/lookupbyisbn.py
+++ b/oml/meta/lookupbyisbn.py
@@ -50,6 +50,8 @@ def lookup(id):
    url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
    data = read_url(url).decode('utf-8')
    r["title"] = find_re(data, "<h2>(.*?)</h2>")
+    if r["title"] == 'Error!':
+        return {}
    keys = {
        'author': 'Author(s)',
        'publisher': 'Publisher',
--- a/oml/meta/openlibrary.py
+++ b/oml/meta/openlibrary.py
@@ -3,8 +3,10 @@
 from __future__ import division

 from urllib import urlencode
-from ox.cache import read_url
 import json
+from datetime import datetime
+
+from ox.cache import read_url

 from marc_countries import COUNTRIES
 from dewey import get_classification
@@ -90,6 +92,17 @@ def get_type(obj):
        type_ = type_['key']
    return type_

+def parse_date(s):
+    """Convert "January 1, 1998" to "1998-01-01" and "January 1998" to
+    "1998-01"; any other value is returned unchanged."""
+    for pattern, fmt in (('%B %d, %Y', '%Y-%m-%d'), ('%B %Y', '%Y-%m')):
+        try:
+            return datetime.strptime(s, pattern).strftime(fmt)
+        except (TypeError, ValueError):
+            # no match for this pattern (or non-string input): try the next
+            continue
+    return s
+
 def format(info, return_all=False):
    data = {}
    if 'works' in info:
@@ -120,6 +133,8 @@ def format(info, return_all=False):
                    value = data[KEYS[key]] + value
            elif isinstance(value, list) and key not in ('publish_places', 'lccn', 'oclc_numbers'):
                value = value[0]
+            if key == 'publish_date':
+                value = parse_date(value)
            data[KEYS[key]] = value
    if 'subtitle' in info:
        data['title'] += ' ' + info['subtitle']
--- a/oml/meta/worldcat.py
+++ b/oml/meta/worldcat.py
@@ -75,9 +75,17 @@ def lookup(id):
            del data['cover']

    if 'author' in data:
-        data['author'] = [data['author']]
+        data['author'] = data['author'].split('; ')
    if 'title' in data:
        data['title'] = data['title'].replace(' : ', ': ')
+    if 'publisher' in data:
+        m = re.compile('(.+) : (.+), (\d{4})').findall(data['publisher'])
+        if m:
+            place, publisher, date = m[0]
+            data['publisher'] = publisher
+            data['date'] = date
+            data['places'] = [place]
+
    logger.debug('lookup %s => %s', id, data.keys())
    return data