2014-08-12 08:16:57 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
2014-09-02 22:32:44 +00:00
|
|
|
|
2014-08-12 08:16:57 +00:00
|
|
|
|
2014-05-16 08:06:11 +00:00
|
|
|
import re
|
|
|
|
import stdnum.isbn
|
2014-05-14 09:57:11 +00:00
|
|
|
|
2016-01-08 04:32:24 +00:00
|
|
|
import ox
|
2014-08-12 08:16:57 +00:00
|
|
|
|
2014-05-14 09:57:11 +00:00
|
|
|
def normalize_isbn(value):
    """Strip everything from *value* except digits and the capital letter X.

    Characters are kept in their original order; anything else (hyphens,
    spaces, letters, a lowercase ``x``, ...) is dropped.  Returns the
    remaining characters joined into a single string.
    """
    kept = []
    for char in value:
        if char.isdigit() or char == 'X':
            kept.append(char)
    return ''.join(kept)
|
|
|
|
|
2014-05-16 08:06:11 +00:00
|
|
|
# Candidate runs: one digit followed by one or more digits, hyphens, 'X' or
# spaces.  Written as a raw string so the backslashes reach the regex engine
# untouched instead of being parsed as (invalid) Python string escapes.
_ISBN_CANDIDATE_RE = re.compile(r'\d[\d\-X\ ]+')

# All-zero values that are rejected explicitly, whatever the validator says
# (presumably placeholder values - confirm against callers).
_ISBN_PLACEHOLDERS = ('0' * 10, '0' * 13)


def find_isbns(text):
    """Return the ISBN-looking strings found in *text*.

    *text* may be ``str`` or ``bytes``; bytes are decoded with the default
    codec first (a ``UnicodeDecodeError`` propagates if that fails).

    Every candidate run is normalized with ``normalize_isbn`` and kept only
    if ``stdnum.isbn.is_valid`` accepts it, it is 10 or 13 characters long
    and it is not made up entirely of zeros.  Results are returned in the
    order they occur in the text; duplicates are not removed.

    NOTE: spaces are part of a candidate run, so two numbers separated only
    by spaces are treated as a single candidate.
    """
    if isinstance(text, bytes):
        text = text.decode()
    candidates = [
        normalize_isbn(value) for value in _ISBN_CANDIDATE_RE.findall(text)
    ]
    return [
        isbn for isbn in candidates
        if stdnum.isbn.is_valid(isbn)
        and len(isbn) in (10, 13)
        and isbn not in _ISBN_PLACEHOLDERS
    ]
|
|
|
|
|
2016-01-08 04:32:24 +00:00
|
|
|
def get_language(lang):
    """Map a language tag to the value ``ox.iso.codeToLang`` gives for it.

    Only the part of *lang* before the first ``-`` is looked up.  If the
    lookup result is falsy, *lang* itself is returned unchanged.
    """
    primary_subtag = lang.partition('-')[0]
    language = ox.iso.codeToLang(primary_subtag)
    if language:
        return language
    return lang
|
2016-01-08 10:22:07 +00:00
|
|
|
|
|
|
|
def decode_html_data(data):
    """Recursively run ``ox.decode_html`` over every string inside *data*.

    - ``str``: returns the result of ``ox.decode_html(data)``.
    - ``list``: returns a new list with each element processed.
    - ``dict``: values are processed and written back into the same dict,
      which is then returned (keys are left untouched).
    - anything else is returned as is.
    """
    if isinstance(data, str):
        return ox.decode_html(data)
    if isinstance(data, list):
        return [decode_html_data(item) for item in data]
    if isinstance(data, dict):
        # Only existing keys are reassigned, so the dict never changes size
        # while it is being iterated.
        for key in data:
            current = data[key]
            data[key] = decode_html_data(current)
    return data
|