# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

import logging
import re
import xml.etree.ElementTree as ET

import ox
from ox.cache import read_url

from .dewey import get_classification
from .marc_countries import COUNTRIES
from .utils import normalize_isbn

logger = logging.getLogger(__name__)


def get_ids(key, value):
    # Map one identifier to related ones: an ISBN is searched on
    # loc.gov to find its LCCN; an LCCN is resolved via lookup()
    # to collect the record's OCLC and ISBN ids.
    ids = []
    if key == 'isbn':
        url = 'http://www.loc.gov/search/?q=%s&all=true' % value
        html = read_url(url).decode('utf-8', 'ignore')
        # raw string, so '\d' is not an invalid escape sequence
        match = re.search(r'"http://lccn.loc.gov/(\d+)"', html)
        if match:
            ids.append(('lccn', match.group(1)))
    elif key == 'lccn':
        info = lookup(value)
        # use separate loop variables so the function arguments are
        # not shadowed (the debug log below reports the original ones)
        for k in ('oclc', 'isbn'):
            if k in info:
                for v in info[k]:
                    ids.append((k, v))
    if ids:
        logger.debug('get_ids %s %s => %s', key, value, ids)
    return ids
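
# Illustrative call shapes (ids are placeholders, not real records):
#   get_ids('isbn', '0123456789')  ->  [('lccn', '12345678')]
#   get_ids('lccn', '12345678')    ->  [('oclc', ...), ('isbn', ...)]

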
def lookup(id):
    # Fetch and parse the MODS record for an LCCN from lccn.loc.gov.
    logger.debug('lookup %s', id)
    ns = '{http://www.loc.gov/mods/v3}'
    url = 'http://lccn.loc.gov/%s/mods' % id
    info = {
        'lccn': [id]
    }
    try:
        data = read_url(url).decode('utf-8')
        mods = ET.fromstring(data)
    except Exception:
        try:
            # retry once with timeout=0, which bypasses the cached copy
            data = read_url(url, timeout=0).decode('utf-8')
            mods = ET.fromstring(data)
        except Exception:
            logger.debug('lookup for %s url: %s failed', id, url, exc_info=True)
            return info

    title = mods.findall(ns + 'titleInfo')
    if not title:
        return {}
    # join title parts; subtitles are separated with ': '
    info['title'] = ''.join([
        ': ' + e.text.strip() if e.tag == ns + 'subTitle' else ' ' + e.text.strip()
        for e in title[0]
    ]).strip()
    origin = mods.findall(ns + 'originInfo')
    if origin:
        info['place'] = []
        for place in origin[0].findall(ns + 'place'):
            terms = place.findall(ns + 'placeTerm')
            # 'text' terms are readable place names,
            # 'code' terms are MARC country codes
            if terms and terms[0].attrib['type'] == 'text':
                info['place'].append(terms[0].text)
            elif terms and terms[0].attrib['type'] == 'code':
                info['country'] = COUNTRIES.get(terms[0].text, terms[0].text)
        publisher = [e.text for e in origin[0].findall(ns + 'publisher')]
        if publisher:
            info['publisher'] = publisher[0]
        info['date'] = ''.join([
            e.text
            for e in origin[0].findall(ns + 'dateIssued')
            if e.attrib.get('encoding') == 'marc'
        ])
    for i in mods.findall(ns + 'identifier'):
        key = i.attrib['type']
        value = i.text
        if key in ('oclc', 'lccn', 'isbn'):
            if key == 'oclc':
                # strip OCLC control number prefixes
                value = value.replace('ocn', '').replace('ocm', '')
            if key == 'isbn':
                value = normalize_isbn(value)
            if key not in info:
                info[key] = []
            if value not in info[key]:
                info[key].append(value)
    for i in mods.findall(ns + 'classification'):
        if i.attrib['authority'] == 'ddc':
            # map the leading Dewey decimal number to its classification
            info['classification'] = get_classification(i.text.split('/')[0])
    info['author'] = []
    for a in mods.findall(ns + 'name'):
        if a.attrib.get('usage') == 'primary':
            # join all name parts except dates (birth/death years)
            info['author'].append(' '.join([
                e.text for e in a.findall(ns + 'namePart')
                if e.attrib.get('type') not in ('date',)
            ]))
    info['author'] = [ox.normalize_name(a) for a in info['author']]
    toc = mods.findall(ns + 'tableOfContents')
    if toc:
        info['description'] = toc[0].text.strip()
    # drop empty values; iterate over a copy since keys are deleted
    for key in list(info.keys()):
        if not info[key]:
            del info[key]
    return info


# alias: expose lookup() under the generic name 'info'
info = lookup
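

if __name__ == '__main__':
    # Minimal usage sketch, assuming network access; the LCCN below
    # is a placeholder, not a known record.
    import json
    print(json.dumps(lookup('12345678'), indent=4))
    print(get_ids('lccn', '12345678'))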