openmedialibrary/oml/meta/loc.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division

import ox
from ox.cache import read_url
import re
import xml.etree.ElementTree as ET

from utils import normalize_isbn
from marc_countries import COUNTRIES
from dewey import get_classification

import logging
logger = logging.getLogger('meta.loc')

def get_ids(key, value):
    '''
    Find further identifiers for a known identifier via the Library of Congress.

    key is the identifier type, value the identifier itself:

    - 'isbn': the loc.gov search page for value is fetched and the first
      quoted http://lccn.loc.gov/<digits> link found in it is returned
      as ('lccn', digits).
    - 'lccn': lookup(value) is called and every 'oclc' and 'isbn' entry
      of its result is returned.

    Any other key yields no ids.

    Returns a (possibly empty) list of (key, value) tuples.
    '''
    ids = []
    if key == 'isbn':
        url = 'http://www.loc.gov/search/?q=%s&all=true' % value
        html = ox.cache.read_url(url)
        # raw string so \d is a regex class, not a string escape;
        # dots are escaped so they only match literal dots
        match = re.search(r'"http://lccn\.loc\.gov/(\d+)"', html)
        if match:
            ids.append(('lccn', match.group(1)))
    elif key == 'lccn':
        data = lookup(value)
        # dedicated loop names: reusing key/value here would overwrite the
        # arguments and make the debug message below report the wrong input
        for id_key in ('oclc', 'isbn'):
            for id_value in data.get(id_key, []):
                ids.append((id_key, id_value))
    if ids:
        logger.debug('get_ids %s,%s => %s', key, value, ids)
    return ids

def lookup(id):
    '''
    Fetch the MODS record for an LCCN and return its metadata as a dict.

    The record is read from http://lccn.loc.gov/<id>/mods and parsed with
    ElementTree. Possible keys of the result:

    - 'lccn', 'oclc', 'isbn': lists of identifiers ('lccn' always starts
      with id itself)
    - 'title': text of the first titleInfo children, a subTitle is joined
      with ': ', other parts with ' '
    - 'place': list of textual places from the first originInfo
    - 'country': place code mapped through COUNTRIES (the raw code if it
      is not in COUNTRIES)
    - 'publisher': first publisher of the first originInfo
    - 'date': concatenated dateIssued values with encoding="marc"
    - 'classification': get_classification() of the ddc classification
    - 'author': normalized names of all name elements with usage="primary"
    - 'description': stripped text of the first tableOfContents

    Keys with empty values are removed. If the record has no titleInfo
    an empty dict is returned.
    '''
    logger.debug('lookup %s', id)
    ns = '{http://www.loc.gov/mods/v3}'
    url = 'http://lccn.loc.gov/%s/mods' % id
    data = read_url(url)
    mods = ET.fromstring(data)

    info = {
        'lccn': [id]
    }
    title = mods.findall(ns + 'titleInfo')
    if not title:
        return {}
    title_parts = []
    for e in title[0]:
        # ElementTree gives None as text of an empty element, skip those
        text = (e.text or '').strip()
        if not text:
            continue
        title_parts.append((': ' if e.tag == ns + 'subTitle' else ' ') + text)
    info['title'] = ''.join(title_parts).strip()

    origin = mods.findall(ns + 'originInfo')
    if origin:
        info['place'] = []
        for place in origin[0].findall(ns + 'place'):
            terms = place.findall(ns + 'placeTerm')
            if not terms:
                continue
            term = terms[0]
            # 'type' may be missing, .get() avoids a KeyError in that case
            term_type = term.attrib.get('type')
            if term_type == 'text':
                if term.text:
                    info['place'].append(term.text)
            elif term_type == 'code':
                info['country'] = COUNTRIES.get(term.text, term.text)
        publisher = [e.text for e in origin[0].findall(ns + 'publisher')]
        if publisher:
            info['publisher'] = publisher[0]
        info['date'] = ''.join([e.text or ''
            for e in origin[0].findall(ns + 'dateIssued') if e.attrib.get('encoding') == 'marc'])

    # identifiers, classification and authors are read from the record
    # itself, not from originInfo, so they are collected even if the
    # record has no originInfo element
    for i in mods.findall(ns + 'identifier'):
        key = i.attrib.get('type')
        value = i.text
        if key not in ('oclc', 'lccn', 'isbn') or not value:
            continue
        if key == 'oclc':
            value = value.replace('ocn', '').replace('ocm', '')
        elif key == 'isbn':
            value = normalize_isbn(value)
        if key not in info:
            info[key] = []
        if value not in info[key]:
            info[key].append(value)
    for i in mods.findall(ns + 'classification'):
        if i.attrib.get('authority') == 'ddc' and i.text:
            # only the part before the first '/' is passed on
            info['classification'] = get_classification(i.text.split('/')[0])
    info['author'] = []
    for a in mods.findall(ns + 'name'):
        if a.attrib.get('usage') == 'primary':
            # name parts of type "date" are not part of the name
            info['author'].append(' '.join([
                e.text for e in a.findall(ns + 'namePart')
                if e.text and e.attrib.get('type') != 'date'
            ]))
    info['author'] = [ox.normalize_name(a) for a in info['author']]

    toc = mods.findall(ns + 'tableOfContents')
    if toc:
        info['description'] = (toc[0].text or '').strip()
    # iterate over a copy of the keys, entries are deleted inside the loop
    for key in list(info):
        if not info[key]:
            del info[key]
    return info

# module-level alias: `info` is bound to the same function object as lookup
info = lookup
Open Media Library 2014-05-04 17:26:43 +00:00			`# -- coding: utf-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`from __future__ import division`

			`import ox`
			`from ox.cache import read_url`
meta 2014-05-14 09:57:11 +00:00			`import re`
Open Media Library 2014-05-04 17:26:43 +00:00			`import xml.etree.ElementTree as ET`

			`from utils import normalize_isbn`
			`from marc_countries import COUNTRIES`
lots of stuff 2014-05-21 00:02:21 +00:00			`from dewey import get_classification`
Open Media Library 2014-05-04 17:26:43 +00:00
use python logging 2014-05-17 14:26:59 +00:00			`import logging`
			`logger = logging.getLogger('meta.loc')`

meta 2014-05-14 09:57:11 +00:00			`def get_ids(key, value):`
			`ids = []`
lots of stuff 2014-05-21 00:02:21 +00:00			`if key == 'isbn':`
meta 2014-05-14 09:57:11 +00:00			`url = 'http://www.loc.gov/search/?q=%s&all=true' % value`
			`html = ox.cache.read_url(url)`
			`match = re.search('"http://lccn.loc.gov/(\d+)"', html)`
			`if match:`
			`ids.append(('lccn', match.group(1)))`
lots of stuff 2014-05-21 00:02:21 +00:00			`elif key == 'lccn':`
			`info = lookup(value)`
			`for key in ('oclc', 'isbn'):`
			`if key in info:`
			`for value in info[key]:`
			`ids.append((key, value))`
meta 2014-05-14 09:57:11 +00:00			`if ids:`
use python logging 2014-05-17 14:26:59 +00:00			`logger.debug('get_ids %s,%s => %s', key, value, ids)`
meta 2014-05-14 09:57:11 +00:00			`return ids`

			`def lookup(id):`
use python logging 2014-05-17 14:26:59 +00:00			`logger.debug('lookup %s', id)`
Open Media Library 2014-05-04 17:26:43 +00:00			`ns = '{http://www.loc.gov/mods/v3}'`
			`url = 'http://lccn.loc.gov/%s/mods' % id`
			`data = read_url(url)`
			`mods = ET.fromstring(data)`

meta 2014-05-14 09:57:11 +00:00			`info = {`
lots of stuff 2014-05-21 00:02:21 +00:00			`'lccn': [id]`
meta 2014-05-14 09:57:11 +00:00			`}`
find 2014-05-16 08:06:11 +00:00			`title = mods.findall(ns + 'titleInfo')`
			`if not title:`
			`return {}`
allow custom metadata 2014-05-19 20:58:00 +00:00			`info['title'] = ''.join([': ' + e.text.strip() if e.tag == ns + 'subTitle' else ' ' + e.text.strip() for e in title[0]]).strip()`
Open Media Library 2014-05-04 17:26:43 +00:00			`origin = mods.findall(ns + 'originInfo')`
			`if origin:`
			`info['place'] = []`
			`for place in origin[0].findall(ns + 'place'):`
			`terms = place.findall(ns + 'placeTerm')`
			`if terms and terms[0].attrib['type'] == 'text':`
			`e = terms[0]`
			`info['place'].append(e.text)`
			`elif terms and terms[0].attrib['type'] == 'code':`
			`e = terms[0]`
			`info['country'] = COUNTRIES.get(e.text, e.text)`
meta 2014-05-14 09:57:11 +00:00			`publisher = [e.text for e in origin[0].findall(ns + 'publisher')]`
			`if publisher:`
			`info['publisher'] = publisher[0]`
date 2014-05-23 12:52:36 +00:00			`info['date'] = ''.join([e.text`
			`for e in origin[0].findall(ns + 'dateIssued') if e.attrib.get('encoding') == 'marc'])`
Open Media Library 2014-05-04 17:26:43 +00:00			`for i in mods.findall(ns + 'identifier'):`
lots of stuff 2014-05-21 00:02:21 +00:00			`key = i.attrib['type']`
			`value = i.text`
			`if key in ('oclc', 'lccn', 'isbn'):`
			`if i.attrib['type'] == 'oclc':`
			`value = value.replace('ocn', '').replace('ocm', '')`
			`if i.attrib['type'] == 'isbn':`
			`value = normalize_isbn(i.text)`
			`if not key in info:`
			`info[key] = []`
			`if value not in info[key]:`
			`info[key].append(value)`
Open Media Library 2014-05-04 17:26:43 +00:00			`for i in mods.findall(ns + 'classification'):`
			`if i.attrib['authority'] == 'ddc':`
lots of stuff 2014-05-21 00:02:21 +00:00			`info['classification'] = get_classification(i.text.split('/')[0])`
Open Media Library 2014-05-04 17:26:43 +00:00			`info['author'] = []`
			`for a in mods.findall(ns + 'name'):`
meta 2014-05-14 09:57:11 +00:00			`if a.attrib.get('usage') == 'primary':`
			`info['author'].append(' '.join([e.text for e in a.findall(ns + 'namePart') if not e.attrib.get('type') in ('date', )]))`
			`info['author'] = [ox.normalize_name(a) for a in info['author']]`
allow custom metadata 2014-05-19 20:58:00 +00:00			`toc = mods.findall(ns + 'tableOfContents')`
			`if toc:`
			`info['description'] = toc[0].text.strip()`
Open Media Library 2014-05-04 17:26:43 +00:00			`for key in info.keys():`
			`if not info[key]:`
			`del info[key]`
			`return info`
meta 2014-05-14 09:57:11 +00:00
			`info = lookup`