openmedialibrary/oml/meta/loc.py

82 lines
2.8 KiB
Python
Raw Normal View History

2014-05-04 17:26:43 +00:00
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division
import ox
from ox.cache import read_url
2014-05-14 09:57:11 +00:00
import re
2014-05-04 17:26:43 +00:00
import xml.etree.ElementTree as ET
from utils import normalize_isbn
from marc_countries import COUNTRIES
2014-05-17 14:26:59 +00:00
import logging
# Module-level logger; records from this module are emitted under 'meta.loc'.
logger = logging.getLogger('meta.loc')
2014-05-14 09:57:11 +00:00
def get_ids(key, value):
    """Find Library of Congress identifiers for an ISBN.

    When ``key`` is ``'isbn10'`` or ``'isbn13'``, the loc.gov search page
    for ``value`` is fetched and the first quoted
    ``http://lccn.loc.gov/<digits>`` link found in it is extracted.

    Returns a list of ``(name, value)`` tuples: either empty or a single
    ``('lccn', <digits>)`` entry. For any other ``key`` no request is made
    and an empty list is returned.
    """
    ids = []
    if key in ('isbn10', 'isbn13'):
        url = 'http://www.loc.gov/search/?q=%s&all=true' % value
        html = ox.cache.read_url(url)
        # Raw string so ``\d`` is the regex digit class rather than an
        # invalid string escape; dots are escaped so they match only '.'.
        match = re.search(r'"http://lccn\.loc\.gov/(\d+)"', html)
        if match:
            ids.append(('lccn', match.group(1)))
    if ids:
        logger.debug('get_ids %s,%s => %s', key, value, ids)
    return ids
def lookup(id):
    """Fetch the MODS record for an LCCN and return it as a metadata dict.

    ``id`` is interpolated into ``http://lccn.loc.gov/<id>/mods``; the
    response is parsed as XML in the MODS v3 namespace.

    Returns a dict that may contain the keys ``lccn``, ``title``, ``place``
    (list), ``country``, ``publisher``, ``date``, ``oclc``, ``isbn<N>``
    (N being the length of the normalized ISBN), ``classification``,
    ``author`` (list) and ``description``. Keys whose value is empty are
    removed before returning. If the record has no ``titleInfo`` element an
    empty dict is returned.

    Elements without text and elements missing the ``type``/``authority``
    attribute are skipped instead of raising.
    """
    logger.debug('lookup %s', id)
    ns = '{http://www.loc.gov/mods/v3}'
    url = 'http://lccn.loc.gov/%s/mods' % id
    data = read_url(url)
    mods = ET.fromstring(data)

    info = {
        'lccn': id
    }

    title = mods.findall(ns + 'titleInfo')
    if not title:
        return {}
    # Join the children of the first titleInfo; a subTitle is attached with
    # ': ', every other part with a single space.
    title_parts = []
    for e in title[0]:
        if e.text is None:
            # Empty element (e.g. <title/>): nothing to contribute.
            continue
        prefix = ': ' if e.tag == ns + 'subTitle' else ' '
        title_parts.append(prefix + e.text.strip())
    info['title'] = ''.join(title_parts).strip()

    origin = mods.findall(ns + 'originInfo')
    if origin:
        info['place'] = []
        for place in origin[0].findall(ns + 'place'):
            terms = place.findall(ns + 'placeTerm')
            if not terms:
                continue
            term = terms[0]
            # .get() so a placeTerm without a type attribute is ignored
            # rather than raising KeyError.
            term_type = term.attrib.get('type')
            if term_type == 'text':
                info['place'].append(term.text)
            elif term_type == 'code':
                # Map the country code to a name, falling back to the code.
                info['country'] = COUNTRIES.get(term.text, term.text)
        publisher = [e.text for e in origin[0].findall(ns + 'publisher')]
        if publisher:
            info['publisher'] = publisher[0]
        info['date'] = ''.join(
            [e.text or '' for e in origin[0].findall(ns + 'dateIssued')]
        )

    for i in mods.findall(ns + 'identifier'):
        value = i.text
        if value is None:
            # Skip empty identifiers; this also keeps an empty lccn element
            # from overwriting the requested id with None.
            continue
        id_type = i.attrib.get('type')
        if id_type == 'oclc':
            info['oclc'] = value.replace('ocn', '')
        elif id_type == 'lccn':
            info['lccn'] = value
        elif id_type == 'isbn':
            isbn = normalize_isbn(value)
            if isbn:
                info['isbn%s' % len(isbn)] = isbn

    for i in mods.findall(ns + 'classification'):
        if i.attrib.get('authority') == 'ddc':
            info['classification'] = i.text

    # Only names flagged usage="primary" are treated as authors; nameParts
    # of type "date" are left out of the joined name.
    authors = []
    for a in mods.findall(ns + 'name'):
        if a.attrib.get('usage') == 'primary':
            name_parts = [
                e.text for e in a.findall(ns + 'namePart')
                if e.attrib.get('type') != 'date' and e.text is not None
            ]
            authors.append(' '.join(name_parts))
    info['author'] = [ox.normalize_name(a) for a in authors]

    toc = mods.findall(ns + 'tableOfContents')
    if toc and toc[0].text is not None:
        info['description'] = toc[0].text.strip()

    # Build a filtered copy instead of deleting keys while iterating, which
    # raises RuntimeError on Python 3.
    return {key: value for key, value in info.items() if value}
2014-05-14 09:57:11 +00:00
# Module-level alias: ``info`` is bound to the same function object as
# ``lookup``. NOTE(review): presumably the entry-point name callers of this
# module expect -- confirm against the callers.
info = lookup