# -*- coding: utf-8 -*-
"""Helpers for extracting ISBNs from text and cleaning HTML-laden data."""
import re

import stdnum.isbn

import ox
import ox.iso

# Candidate ISBN runs: a digit followed by digits, hyphens, 'X', en dashes
# (U+2013, given both escaped and literally) or spaces.  Compiled once at
# import time; a raw string keeps the backslashes for the regex engine.
_ISBN_CANDIDATE_RE = re.compile(r'\d[\d\-X\u2013–\ ]+')


def to_isbn13(isbn):
    """Return *isbn* as a validated ISBN-13 string, or ``None``.

    The value is passed to ``stdnum.isbn.validate`` with its second
    positional argument (stdnum's ``convert`` flag) set to ``True``.
    ``None`` is returned when validation raises, when the result does not
    start with ``'97'``, or when the result is ``'9781111111113'``.
    """
    try:
        isbn = stdnum.isbn.validate(isbn, True)
        if isbn[:2] != '97':
            isbn = None
    except Exception:
        # Best effort: any validation failure (or an unusable input type)
        # means "not an ISBN".  Unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit propagate.
        isbn = None
    # This specific number is explicitly rejected -- presumably a
    # placeholder value seen in real data (TODO confirm).
    if isbn == '9781111111113':
        isbn = None
    return isbn


def normalize_isbn(value):
    """Return *value* reduced to its digit characters and uppercase 'X'."""
    return ''.join(s for s in value if s.isdigit() or s == 'X')


def find_isbns(text):
    """Return the distinct valid ISBN-13s found in *text*.

    ``bytes`` input is decoded with the default codec first.  Every
    candidate run matched by the module pattern is normalized and passed
    through ``to_isbn13``; failures are dropped.  Duplicates are removed via
    a set, so the order of the returned list is unspecified.
    """
    if isinstance(text, bytes):
        text = text.decode()
    candidates = (
        to_isbn13(normalize_isbn(match))
        for match in _ISBN_CANDIDATE_RE.findall(text)
    )
    return list({isbn for isbn in candidates if isbn})


def get_language(lang):
    """Return the language for code *lang*, or *lang* itself if unknown.

    Only the part before the first ``'-'`` is looked up with
    ``ox.iso.codeToLang``; a falsy lookup result falls back to the
    unmodified input.
    """
    return ox.iso.codeToLang(lang.split('-')[0]) or lang


def _apply_to_strings(data, func):
    """Recursively apply *func* to every ``str`` inside *data*.

    Dicts are updated in place (values only, keys untouched), lists are
    rebuilt as new lists, strings are replaced by ``func(string)`` and any
    other value is returned unchanged.
    """
    if isinstance(data, dict):
        # Only existing keys are reassigned, so the dict's size never
        # changes while it is being iterated.
        for key in data:
            data[key] = _apply_to_strings(data[key], func)
    elif isinstance(data, list):
        data = [_apply_to_strings(item, func) for item in data]
    elif isinstance(data, str):
        data = func(data)
    return data


def decode_html_data(data):
    """Run ``ox.decode_html`` over every string nested in *data*.

    Dicts are modified in place and returned; lists are returned as new
    lists; non-string leaves are left as they are.
    """
    return _apply_to_strings(data, ox.decode_html)


def strip_tags_data(data):
    """Run ``ox.strip_tags`` over every string nested in *data*.

    Dicts are modified in place and returned; lists are returned as new
    lists; non-string leaves are left as they are.
    """
    return _apply_to_strings(data, ox.strip_tags)