54 lines
1.4 KiB
Python
54 lines
1.4 KiB
Python
# -*- coding: utf-8 -*-
|
||
|
||
|
||
import re
|
||
import stdnum.isbn
|
||
|
||
import ox
|
||
import ox.iso
|
||
|
||
def to_isbn13(isbn):
    """Return *isbn* as a validated ISBN-13 string, or None if it is unusable.

    The value is handed to ``stdnum.isbn.validate(isbn, True)``. The second
    positional argument is stdnum's ``convert`` flag, which according to the
    python-stdnum documentation makes ISBN-10 input come back as ISBN-13.

    None is returned when:

    * validation (or the prefix check) raises any ``Exception``,
    * the validated number does not start with ``'97'``,
    * the validated number is ``'9781111111113'``, which is rejected
      explicitly (presumably a known placeholder value -- confirm).
    """
    try:
        isbn = stdnum.isbn.validate(isbn, True)
        if isbn[:2] != '97':
            return None
    except Exception:
        # Best-effort: any failure means "not an ISBN". Catching Exception
        # instead of using a bare ``except:`` lets KeyboardInterrupt and
        # SystemExit propagate to the caller.
        return None
    if isbn == '9781111111113':
        return None
    return isbn
|
||
|
||
def normalize_isbn(value):
    """Strip *value* down to its digits and uppercase ``'X'`` characters.

    Every other character (hyphens, dashes, spaces, letters, lowercase
    ``'x'``) is dropped; the kept characters stay in their original order.
    """
    kept = []
    for char in value:
        if char == 'X' or char.isdigit():
            kept.append(char)
    return ''.join(kept)
|
||
|
||
def find_isbns(text):
    """Return the distinct ISBN-13 numbers that can be extracted from *text*.

    Args:
        text: A ``str``, or ``bytes`` which are decoded with the default
            codec of ``bytes.decode()`` before searching.

    Returns:
        A list of unique values for which ``to_isbn13`` returned a truthy
        result. The list is built from a set, so its order is not defined.
    """
    if isinstance(text, bytes):
        text = text.decode()
    # Candidate: a digit followed by one or more digits, hyphens, 'X',
    # en dashes or spaces. A raw string is used so that ``\d``, ``\-`` and
    # ``\ `` reach the regex engine without triggering Python's
    # invalid-escape-sequence warning; ``\u2013`` is resolved by ``re``.
    candidates = re.findall(r'\d[\d\-X\u2013–\ ]+', text)
    converted = (to_isbn13(normalize_isbn(candidate)) for candidate in candidates)
    return list({isbn for isbn in converted if isbn})
|
||
|
||
def get_language(lang):
    """Map a language tag to the value returned by ``ox.iso.codeToLang``.

    Only the part of *lang* before the first ``'-'`` is looked up. If the
    lookup yields a falsy result, *lang* itself is returned unchanged.
    """
    primary_code = lang.split('-')[0]
    language = ox.iso.codeToLang(primary_code)
    if language:
        return language
    return lang
|
||
|
||
def decode_html_data(data):
    """Recursively run ``ox.decode_html`` over every string inside *data*.

    * ``str``  -> the result of ``ox.decode_html(data)``
    * ``list`` -> a new list with each element processed recursively
    * ``dict`` -> the same dict object, with each value replaced in place
    * anything else is returned untouched
    """
    if isinstance(data, str):
        return ox.decode_html(data)
    if isinstance(data, list):
        return [decode_html_data(item) for item in data]
    if isinstance(data, dict):
        # Only existing keys are reassigned, so the dict's size never
        # changes while it is being iterated.
        for name in data:
            data[name] = decode_html_data(data[name])
    return data
|
||
|
||
def strip_tags_data(data):
    """Recursively run ``ox.strip_tags`` over every string inside *data*.

    * ``str``  -> the result of ``ox.strip_tags(data)``
    * ``list`` -> a new list with each element processed recursively
    * ``dict`` -> the same dict object, with each value replaced in place
    * anything else is returned untouched
    """
    if isinstance(data, str):
        return ox.strip_tags(data)
    if isinstance(data, list):
        return [strip_tags_data(item) for item in data]
    if isinstance(data, dict):
        # Only existing keys are reassigned, so the dict's size never
        # changes while it is being iterated.
        for name in data:
            data[name] = strip_tags_data(data[name])
    return data
|