openmedialibrary/oml/meta/utils.py

55 lines
1.4 KiB
Python
Raw Normal View History

2014-08-12 08:16:57 +00:00
# -*- coding: utf-8 -*-
2014-09-02 22:32:44 +00:00
2014-08-12 08:16:57 +00:00
2014-05-16 08:06:11 +00:00
import re
import stdnum.isbn
2014-05-14 09:57:11 +00:00
import ox
import ox.iso
def to_isbn13(isbn):
    """Return *isbn* as a validated ISBN-13 string, or None if unusable.

    The value is passed to ``stdnum.isbn.validate`` with its second
    argument set to True (per the python-stdnum documentation this
    requests conversion to ISBN-13). None is returned when:

    * validation fails for any reason (best-effort: errors are not raised),
    * the validated number does not start with '97',
    * the number is '9781111111113', which is explicitly rejected
      (presumably a placeholder value -- confirm with the data source).
    """
    try:
        isbn13 = stdnum.isbn.validate(isbn, True)
    except Exception:
        # Deliberately broad so any bad input just yields None, but unlike
        # a bare ``except`` this lets KeyboardInterrupt/SystemExit through.
        return None
    if isbn13[:2] != '97':
        return None
    if isbn13 == '9781111111113':
        return None
    return isbn13
2014-08-12 08:16:57 +00:00
2014-05-14 09:57:11 +00:00
def normalize_isbn(value):
    """Return *value* reduced to its digit characters and uppercase 'X'.

    Every other character (hyphens, spaces, dashes, letters, ...) is
    dropped; the relative order of the kept characters is preserved.
    """
    def _is_isbn_char(char):
        return char.isdigit() or char == 'X'

    return ''.join(filter(_is_isbn_char, value))
2014-05-16 08:06:11 +00:00
def find_isbns(text):
    """Return the distinct ISBN-13 numbers found in *text*.

    *text* may be ``str`` or ``bytes``; bytes are decoded with the default
    codec of ``bytes.decode()``. Candidates are runs that start with a
    digit and continue with digits, '-', 'X', an en dash (U+2013) or
    spaces. Each candidate is cleaned with ``normalize_isbn`` and checked
    with ``to_isbn13``; candidates that come back falsy are discarded.

    The result is a list without duplicates, in no particular order.
    """
    if isinstance(text, bytes):
        text = text.decode()
    # Raw string: ``\d`` and ``\-`` are regex escapes, not Python string
    # escapes. ``\u2013`` is interpreted by the ``re`` module itself.
    candidates = re.findall(r'\d[\d\-X\u2013 ]+', text)
    isbns = (to_isbn13(normalize_isbn(candidate)) for candidate in candidates)
    return list({isbn for isbn in isbns if isbn})
2014-05-16 08:06:11 +00:00
def get_language(lang):
    """Map a language code to the value returned by ``ox.iso.codeToLang``.

    Only the part of *lang* before the first '-' is looked up (so a value
    such as 'en-US' is looked up as 'en'). If the lookup result is falsy,
    the original *lang* is returned unchanged.
    """
    primary_code = lang.split('-', 1)[0]
    language = ox.iso.codeToLang(primary_code)
    if language:
        return language
    return lang
2016-01-08 10:22:07 +00:00
def decode_html_data(data):
    """Recursively apply ``ox.decode_html`` to every string inside *data*.

    * dict: each value is processed and stored back under its key; the
      same dict object is mutated in place and returned (keys untouched).
    * list: a new list of processed items is returned.
    * str: the result of ``ox.decode_html(data)`` is returned.
    * anything else is returned unchanged.
    """
    if isinstance(data, dict):
        # Only existing keys are reassigned, so the dict never changes
        # size while it is being iterated.
        for key, value in data.items():
            data[key] = decode_html_data(value)
        return data
    if isinstance(data, list):
        return [decode_html_data(item) for item in data]
    if isinstance(data, str):
        return ox.decode_html(data)
    return data
2016-02-04 09:55:27 +00:00
def strip_tags_data(data):
    """Recursively apply ``ox.strip_tags`` to every string inside *data*.

    * dict: each value is processed and stored back under its key; the
      same dict object is mutated in place and returned (keys untouched).
    * list: a new list of processed items is returned.
    * str: the result of ``ox.strip_tags(data)`` is returned.
    * anything else is returned unchanged.
    """
    if isinstance(data, dict):
        # Only existing keys are reassigned, so the dict never changes
        # size while it is being iterated.
        for key, value in data.items():
            data[key] = strip_tags_data(value)
        return data
    if isinstance(data, list):
        return [strip_tags_data(item) for item in data]
    if isinstance(data, str):
        return ox.strip_tags(data)
    return data