cleanup meta parser

parent fc7b3ee049
commit 67d1814192

6 changed files with 49 additions and 6 deletions
@@ -161,6 +161,9 @@ class Changelog(db.Model):
     def action_edititem(self, user, timestamp, itemid, meta):
         from item.models import Item
         i = Item.get(itemid)
+        if not i:
+            logger.debug('ignore edititem for unknown item %s %s', timestamp, itemid)
+            return True
         if i.timestamp > timestamp:
             logger.debug('ignore edititem change %s %s %s', timestamp, itemid, meta)
             return True

@@ -47,9 +47,19 @@ def metadata(f):
 
     if 'isbn' in data:
         data['primaryid'] = ['isbn', data['isbn'][0]]
+    if 'author' in data:
+        if isinstance(data['author'], basestring):
+            data['author'] = data['author'].split('; ')
+        if data['author'] in (['Administrator'], ['Default'], ['user']):
+            del data['author']
     if not 'title' in data:
         data['title'] = os.path.splitext(os.path.basename(f))[0]
-    if 'author' in data and isinstance(data['author'], basestring):
-        data['author'] = [data['author']]
+        if data['title'].startswith('Microsoft Word - '):
+            data['title'] = data['title'][len('Microsoft Word - '):]
+        for postfix in ('.doc', 'docx', '.qxd', '.indd'):
+            if data['title'].endswith(postfix):
+                data['title'] = data['title'][:-len(postfix)]
+        if not data['title'].strip():
+            del data['title']
     return data
 

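A minimal sketch of what the new title cleanup does when the title falls back to the filename; the path below is hypothetical:

    import os

    f = '/books/Microsoft Word - essay.doc.pdf'         # hypothetical path
    title = os.path.splitext(os.path.basename(f))[0]    # 'Microsoft Word - essay.doc'
    if title.startswith('Microsoft Word - '):
        title = title[len('Microsoft Word - '):]        # 'essay.doc'
    for postfix in ('.doc', 'docx', '.qxd', '.indd'):
        if title.endswith(postfix):
            title = title[:-len(postfix)]               # 'essay'
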
@@ -38,7 +38,12 @@ def lookup(id):
     doc = lxml.html.document_fromstring(html)
     for e in doc.xpath("//*[contains(@id, 'biblio')]"):
         key = e.attrib['id'].replace('biblio-', '')
-        value = e.text_content()
+        value = e.text_content().strip()
+        k = keys.get(key, key)
+        if k == 'date' and value == 'Publication Date:':
+            value = ''
+        elif k == 'publisher' and value == 'Publisher:':
+            value = ''
         if value and key not in ('bookcondition', 'binding', 'edition-amz'):
-            data[keys.get(key, key)] = value
+            data[k] = value
     return data

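For context, a small illustration of why label-only values are blanked before being stored; the markup below is hypothetical and the real page structure may differ:

    import lxml.html

    # the site appears to render the field label even when the value is empty
    html = '<span id="biblio-publisher">Publisher:</span>'
    doc = lxml.html.document_fromstring(html)
    e = doc.xpath("//*[contains(@id, 'biblio')]")[0]
    print(e.text_content().strip())   # 'Publisher:', caught by the new check and never stored
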
@@ -50,6 +50,8 @@ def lookup(id):
     url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
     data = read_url(url).decode('utf-8')
     r["title"] = find_re(data, "<h2>(.*?)</h2>")
+    if r["title"] == 'Error!':
+        return {}
     keys = {
         'author': 'Author(s)',
         'publisher': 'Publisher',

@@ -3,8 +3,10 @@
 from __future__ import division
 
 from urllib import urlencode
-from ox.cache import read_url
 import json
+from datetime import datetime
+
+from ox.cache import read_url
 
 from marc_countries import COUNTRIES
 from dewey import get_classification

@@ -90,6 +92,17 @@ def get_type(obj):
         type_ = type_['key']
     return type_
 
+def parse_date(s):
+    #"January 1, 1998"
+    for pattern, fmt in (('%B %d, %Y', '%Y-%m-%d'), ('%B %Y', '%Y-%m')):
+        try:
+            d = datetime.strptime(s, pattern)
+            s = d.strftime(fmt)
+            return s
+        except:
+            pass
+    return s
+
 def format(info, return_all=False):
     data = {}
     if 'works' in info:

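Expected behaviour of the new parse_date() helper, with illustrative inputs:

    parse_date('January 1, 1998')   # -> '1998-01-01'
    parse_date('March 2005')        # -> '2005-03'
    parse_date('1998')              # no pattern matches, value returned unchanged
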
@@ -120,6 +133,8 @@ def format(info, return_all=False):
                 value = data[KEYS[key]] + value
             elif isinstance(value, list) and key not in ('publish_places', 'lccn', 'oclc_numbers'):
                 value = value[0]
+            if key == 'publish_date':
+                value = parse_date(value)
             data[KEYS[key]] = value
     if 'subtitle' in info:
         data['title'] += ' ' + info['subtitle']

@@ -75,9 +75,17 @@ def lookup(id):
         del data['cover']
 
     if 'author' in data:
-        data['author'] = [data['author']]
+        data['author'] = data['author'].split('; ')
     if 'title' in data:
         data['title'] = data['title'].replace(' : ', ': ')
+    if 'publisher' in data:
+        m = re.compile('(.+) : (.+), (\d{4})').findall(data['publisher'])
+        if m:
+            place, publisher, date = m[0]
+            data['publisher'] = publisher
+            data['date'] = date
+            data['places'] = [place]
+
     logger.debug('lookup %s => %s', id, data.keys())
     return data
 
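Sketch of the new publisher parsing on a typical 'place : publisher, year' string; the input value is hypothetical:

    import re

    publisher = 'London : Verso, 2010'   # hypothetical value
    m = re.compile(r'(.+) : (.+), (\d{4})').findall(publisher)
    if m:
        place, publisher, date = m[0]    # ('London', 'Verso', '2010')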