openmedialibrary/oml/meta/utils.py

55 lines
1.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import re
import stdnum.isbn
import ox
import ox.iso
def to_isbn13(isbn):
    """Return ``isbn`` as a validated ISBN-13 string, or ``None``.

    The value is handed to ``stdnum.isbn.validate`` with its second
    argument set to ``True`` (conversion to ISBN-13). ``None`` is returned
    when validation raises, when the validated number does not start with
    ``'97'``, or when it equals ``'9781111111113'``.
    """
    try:
        isbn = stdnum.isbn.validate(isbn, True)
    except Exception:
        # Best-effort: anything that fails validation is "not an ISBN".
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
    # NOTE(review): '9781111111113' is presumably a dummy/placeholder ISBN
    # (it is what 1111111111 converts to) -- confirm with the data source.
    if isbn[:2] != '97' or isbn == '9781111111113':
        return None
    return isbn
def normalize_isbn(value):
    """Return ``value`` reduced to its digit characters and uppercase ``X``.

    Every other character (separators, letters, lowercase ``x``) is dropped;
    the relative order of the kept characters is preserved.
    """
    kept = []
    for char in value:
        if char == 'X' or char.isdigit():
            kept.append(char)
    return ''.join(kept)
# Candidate runs: one digit followed by one or more digits, hyphens,
# en dashes (U+2013), spaces or the character 'X'. Written as a raw string
# so the backslashes reach the regex engine instead of being (invalid)
# string escapes.
_ISBN_CANDIDATE_RE = re.compile(r'\d[\d\-X\u2013 ]+')


def find_isbns(text):
    """Return the distinct ISBN-13 values found in ``text``.

    ``text`` may be ``str`` or ``bytes``; bytes are decoded with the default
    codec of ``bytes.decode()``. Each candidate run matched by the pattern is
    cleaned with ``normalize_isbn`` and passed to ``to_isbn13``; candidates
    for which that returns a falsy value are discarded.

    Returns a list without duplicates, ordered by first occurrence in
    ``text``.
    """
    if isinstance(text, bytes):
        text = text.decode()
    converted = (
        to_isbn13(normalize_isbn(candidate))
        for candidate in _ISBN_CANDIDATE_RE.findall(text)
    )
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # result is deterministic (a set would yield an arbitrary order).
    return list(dict.fromkeys(isbn for isbn in converted if isbn))
def get_language(lang):
    """Look up ``lang`` via ``ox.iso.codeToLang``, falling back to ``lang``.

    Only the part of ``lang`` before the first ``'-'`` is passed to the
    lookup. If the lookup result is falsy, the original ``lang`` value is
    returned unchanged.
    """
    # NOTE(review): presumably this drops a region subtag such as the 'US'
    # in 'en-US' -- confirm against callers.
    primary = lang.partition('-')[0]
    name = ox.iso.codeToLang(primary)
    if name:
        return name
    return lang
def decode_html_data(data):
    """Recursively apply ``ox.decode_html`` to every string inside ``data``.

    - dict: each value is replaced in place by its processed form and the
      same dict object is returned (keys are not touched).
    - list: a new list of processed items is returned.
    - str: the result of ``ox.decode_html(data)`` is returned.
    - anything else is returned unchanged.
    """
    if isinstance(data, dict):
        # Only existing keys are reassigned, so the dict's size is stable
        # during iteration.
        for key, value in data.items():
            data[key] = decode_html_data(value)
        return data
    if isinstance(data, list):
        return list(map(decode_html_data, data))
    if isinstance(data, str):
        return ox.decode_html(data)
    return data
def strip_tags_data(data):
    """Recursively apply ``ox.strip_tags`` to every string inside ``data``.

    - dict: each value is replaced in place by its processed form and the
      same dict object is returned (keys are not touched).
    - list: a new list of processed items is returned.
    - str: the result of ``ox.strip_tags(data)`` is returned.
    - anything else is returned unchanged.
    """
    if isinstance(data, dict):
        # Only existing keys are reassigned, so the dict's size is stable
        # during iteration.
        for key, value in data.items():
            data[key] = strip_tags_data(value)
        return data
    if isinstance(data, list):
        return list(map(strip_tags_data, data))
    if isinstance(data, str):
        return ox.strip_tags(data)
    return data