python-ox/ox/text.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import gzip
import math
import re
import unicodedata
from io import BytesIO

from functools import reduce

# Leading articles, grouped by language, that get_sort_title() moves to the
# end of a title. Articles ending in an apostrophe are matched without a
# following space. Duplicates across languages are collapsed by the set().
ARTICLES = list(set([
    # def sg, def pl, indef sg, indef pl (each m/f/n)
    'der', 'die', 'das', 'ein', 'eine',  # de
    'the', 'a', 'an',  # en
    'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas',  # es
    'le', "l'", 'la', 'les', 'un', 'une', 'des',  # fr
    # a comma was missing between "l'" and 'la', which silently produced
    # the single bogus article "l'la"
    'il', 'lo', "l'", 'la', '_i', 'gli', 'le',  # it
    'de', 'het', 'een',  # nl
    'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas'  # pt
    # some _disabled because of collisions
]))
# every given name in 0xDB that matches Xxxx-yyyy Lastname
# Used by is_asian_name() and get_sort_name(), which compare lower-cased name
# parts against this list, so entries are kept in lower case.
ASIAN_FIRST_NAMES = [
    'a', 'ae', 'aeng', 'ah', 'ai', 'an', 'back', 'bae', 'ban', 'bang', 'bao',
    'beom', 'bi', 'bin', 'bo', 'bok', 'bon', 'bong', 'bu', 'bum', 'byeong',
    'byoung', 'byung', 'cai', 'chae', 'chan', 'chang', 'chao', 'cheal', 'chen',
    'cheng', 'cheol', 'cheon', 'cheong', 'cheul', 'chi', 'chia', 'chiao',
    'chieh', 'chien', 'chih', 'chin', 'ching', 'cho', 'choi', 'chong', 'choo',
    'chu', 'chuan', 'chuen', 'chul', 'chun', 'chung', 'chuo', 'chyi', 'da',
    'dae', 'dah', 'dal', 'dan', 'deok', 'do', 'dong', 'doo', 'duek', 'duk',
    'e', 'el', 'en', 'eui', 'eul', 'eun', 'eung', 'fai', 'fan', 'fang', 'fei',
    'fen', 'feng', 'fo', 'foo', 'fu', 'ga', 'gae', 'gam', 'gang', 'ge', 'gen',
    'geon', 'geun', 'gi', 'gil', 'gin', 'gnad', 'gok', 'goo', 'gook', 'gu',
    'gun', 'gwan', 'gye', 'gyeong', 'gyu', 'gyun', 'ha', 'hae', 'hak', 'han',
    'hang', 'hao', 'he', 'hee', 'heng', 'heon', 'hie', 'ho', 'hoi', 'hong',
    'hoo', 'hoon', 'hou', 'hsi', 'hsiang', 'hsiao', 'hsieh', 'hsien', 'hsin',
    'hsing', 'hsiung', 'hu', 'hua', 'huai', 'huang', 'hue', 'hui', 'hun',
    'hung', 'hwa', 'hwan', 'hwang', 'hye', 'hyeok', 'hyeon', 'hyeong', 'hyo',
    'hyuk', 'hyun', 'hyung', 'i', 'ik', 'il', 'in', 'ja', 'jae', 'jan', 'jang',
    'je', 'jee', 'jen', 'jeok', 'jeong', 'jeung', 'ji', 'jia', 'jian', 'jik',
    'jin', 'jing', 'jo', 'jong', 'joo', 'joon', 'ju', 'juan', 'jun', 'jung',
    'ka', 'kai', 'kam', 'kan', 'kang', 'kap', 'kar', 'ke', 'kee', 'kei',
    'keng', 'keum', 'keung', 'ki', 'kil', 'kin', 'kit', 'kot', 'ku', 'kua',
    'kuan', 'kuang', 'kuen', 'kun', 'kuo', 'kwang', 'kwok', 'kwon', 'kwong',
    'kyeong', 'kyo', 'kyoon', 'kyou', 'kyoung', 'kyu', 'kyun', 'kyung', 'lai',
    'lau', 'lee', 'lei', 'leng', 'leung', 'li', 'liang', 'lien', 'lin', 'ling',
    'lock', 'long', 'lun', 'lung', 'maeng', 'man', 'mei', 'mi', 'miao', 'min',
    'ming', 'mo', 'mok', 'moo', 'mook', 'moon', 'mu', 'mun', 'myeong',
    'myoeng', 'myong', 'myung', 'na', 'nae', 'nai', 'nam', 'nan', 'neung',
    'ngaru', 'ni', 'no', 'nyeo', 'oh', 'ok', 'ou', 'pai', 'pei', 'pen', 'peng',
    'pi', 'pil', 'pin', 'ping', 'po', 'pui', 'pyo', 'pyung', 'qing', 'qun',
    'ra', 'rak', 'ram', 'ran', 'reum', 'ri', 'rim', 'rin', 'roe', 'rok', 'ru',
    'rui', 'ryeon', 'ryol', 'ryong', 'sa', 'sae', 'san', 'sang', 'se', 'seo',
    'seob', 'seok', 'seol', 'seon', 'seong', 'seung', 'shan', 'shen', 'sheng',
    'shi', 'shia', 'shiang', 'shih', 'shik', 'shim', 'shin', 'shing', 'shou',
    'shu', 'shun', 'si', 'sik', 'sin', 'siu', 'so', 'song', 'soo', 'sook',
    'soon', 'su', 'suk', 'sun', 'sung', 'sup', 'szu', "t'ien", 'ta', 'tae',
    'taek', 'tai', 'tak', 'te', 'ti', 'tian', 'ting', 'to', 'toa', 'tsai',
    'tsan', 'tse', 'tso', 'tsui', 'tung', 'tzu', 'ua', 'ui', 'un', 'wah',
    'wai', 'wan', 'wei', 'wen', 'weon', 'wing', 'wit', 'wol', 'won', 'woo',
    'wook', 'woon', 'woong', 'wuk', 'xiao', 'ya', 'yan', 'yang', 'yao', 'ye',
    'yea', 'yee', 'yeh', 'yen', 'yeo', 'yeol', 'yeon', 'yeong', 'yeop', 'yi',
    'yin', 'ying', 'yiu', 'yoeng', 'yong', 'yoo', 'yoon', 'you', 'young', 'yu',
    'yuan', 'yue', 'yuen', 'yuk', 'yull', 'yun', 'yune', 'yung', 'zhi',
    'zhong', 'zhu'
]
# see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
# and http://en.wikipedia.org/wiki/List_of_Korean_family_names
# Entries must be lower case: is_asian_name() and get_sort_name() only ever
# look up lower-cased name parts ('Wong' was capitalised and never matched).
# NOTE(review): '_park' presumably follows the "_" = disabled convention
# used in ARTICLES -- confirm.
ASIAN_LAST_NAMES = [
    'chan', 'chang', 'chao',
    'chen', 'cheong', 'cheung',
    'chong', 'choo',
    'chu', 'chun',
    'hou', 'hsieh', 'hsu', 'hu', 'huang',
    'kuo',
    'li', 'liang', 'lin', 'liu',
    '_park',
    'sun', 'sung',
    'tsao',
    'wang', 'wong',
    'yang', 'yeong', 'yeung'
]
# Lower-case name particles that get_sort_name() keeps attached to the last
# name (e.g. "van der Keuken", "De Palma").
PREFIXES = (
    'al bin da de del dem den der di dos du '
    'e el la san the van vom von y zu'
).split()
# Connecting words: when one precedes the last name, get_sort_name() moves it
# and the word before it into the last name as well.
MIDFIXES = 'und'.split()
# Trailing suffixes that get_sort_name() keeps attached to the last name.
SUFFIXES = 'ii iii jr jr. ph.d. phd sr sr.'.split()

# Per category: canonical name -> regular expression (one capture group) of
# user-agent tokens. parse_useragent() rewrites these tokens before applying
# UA_REGEXPS: for 'browser' the token is replaced by the canonical name, for
# the other categories by the canonical name followed by the matched token.
UA_ALIASES = {
    'browser': {
        'Chrome': '(CriOS|CrMo)',
        'Firefox': '(Fennec|Firebird|Iceweasel|Minefield|Namoroka|Phoenix|SeaMonkey|Shiretoko)',
        'Nokia Browser': '(OviBrowser)'
    },
    'robot': {},
    'system': {
        'BSD': '(FreeBSD|NetBSD|OpenBSD)',
        'Linux': '(CrOS|MeeGo|webOS)',
        'Unix': '(AIX|HP-UX|IRIX|SunOS)'
    }
}
# Per category: raw name captured by a UA_REGEXPS pattern -> display name
# reported by parse_useragent(). Names not listed here are reported as-is.
UA_NAMES = {
    'browser': {
        'chromeframe': 'Chrome Frame',
        'FBForIPhone': 'WebKit',
        'Gecko': 'Mozilla',
        'IEMobile': 'Internet Explorer',
        'konqueror': 'Konqueror',
        'Mozilla': 'Netscape',
        'MSIE': 'Internet Explorer',
        'NokiaBrowser': 'Nokia Browser',
        'Trident': 'Internet Explorer'
    },
    'robot': {},
    'system': {
        'BB': 'BlackBerry',
        'CPU OS': 'iOS',
        'iPhone': 'iOS',
        'iPhone OS': 'iOS',
        'J2ME/MIDP': 'Java',
        'Mac_PowerPC': 'Mac OS',
        'Mac_PPC': 'Mac OS',
        'Macintosh': 'Mac OS',
        'PLAYSTATION': 'PlayStation',
        'S': 'Nokia',
        'Series': 'Nokia',
        'Win': 'Windows',
        'Windows Phone OS': 'Windows Phone',
        'X11': 'Linux'
    }
}
# Per category: ordered list of patterns tried by parse_useragent(). The
# first pattern that matches wins, so more specific patterns must come before
# more general ones. Each pattern captures a name and, optionally, a version.
UA_REGEXPS = {
    'browser': [
        r'(Camino)\/(\d+)',
        r'(Chimera)\/(\d+)',
        r'(chromeframe)\/(\d+)',
        r'(Edge)\/(\d+)',
        r'(Epiphany)\/(\d+)',  # before Chrome, Chromium and Safari
        r'(Chromium)\/(\d+)',  # before Chrome
        r'(Chrome)\/(\d+)',
        r'(FBForIPhone)',
        r'(Firefox)\/(\d+)',
        r'(Galeon)\/(\d+)',
        r'(IEMobile)\/(\d+)',
        r'(iCab) (\d+)',
        r'(iCab)\/(\d+)',
        r'(konqueror)\/(\d+)',
        r'(Konqueror)\/(\d+)',
        r'(Lynx)\/(\d+)',
        r'(Netscape)\d?\/(\d+)',
        r'(NokiaBrowser)\/(\d+)',
        r'(OmniWeb)\/(\d+)',
        r'(Opera)\/.+Version\/(\d+)',
        r'(OviBrowser)\/(\d+)',
        r'Version\/(\d+).+(Safari)',
        r'(WebKit)\/(\d+)',
        r'(MSIE) (\d\d?(?!\d))',  # last, since Opera used to mask as MSIE
        r'(Trident)\/.*?rv:(\d+)',
        r'(Gecko)',
        r'(Mozilla)\/(3|4)'
    ],
    'robot': [
        r'(BingPreview)\/(\d+)',
        r'(Google Web Preview).+Chrome\/(\d+)',
        r'(Googlebot)\/(\d+)',
        r'(WebCrawler)\/(\d+)',
        r'(Yahoo! Slurp)\/(\d+)',
        r'(YandexBot)\/([\d\.]+)',
        r'(YandexMobileBot)\/([\d\.]+)',
    ],
    'system': [
        r'(Android) (\d+)',
        r'(Android)',
        r'(BB)(\d+)',
        r'(BeOS)',
        r'(BlackBerry) (\d+)',
        r'(BlackBerry)',
        r'(Darwin)',
        r'(BSD) (FreeBSD|NetBSD|OpenBSD)',
        r'(CPU OS) (\d+)',
        r'(iPhone OS) (\d+)',
        r'(iPhone)',  # Opera
        r'(J2ME\/MIDP)',
        r'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
        r'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
        r'(Linux)',
        # NOTE(review): the "." is left unescaped, presumably so that the
        # "10_6" spelling matches too (parse_useragent turns "_" into ".")
        r'(Mac OS X) (10.\d+)',
        r'(Mac OS X)',
        r'(Mac_PowerPC)',
        r'(Mac_PPC)',
        r'(Macintosh)',
        r'Nintendo (Wii).+NX\/(\d+)',
        r'(PLAYSTATION) (\d+)',
        r'(PlayStation) Vita (\d+)',
        r'(RIM Tablet OS) (\d+)',
        r'(S)(60);',
        r'(Series) ?(40|60)',
        r'(Symbian OS)',
        r'(SymbianOS)\/(\d+)',
        r'(SymbOS)',
        r'(OS\/2)',
        r'(Unix) (AIX|HP-UX|IRIX|SunOS)',
        r'(Unix)',
        # \d+ (not \d) for the major version: "NT 10.0" must match here,
        # otherwise it falls through to the bare "NT" pattern below and is
        # reported as NT 4.0
        r'(Windows) (NT \d+\.\d)',
        r'(Windows Phone) (\d+)',
        r'(Windows Phone OS) (\d+)',
        r'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)',  # Opera
        r'(Win) (9x 4\.90)',  # Firefox
        r'(Win)(16)',  # Firefox
        r'(Win)(9\d)',  # Firefox
        r'(Win)(NT)',  # Firefox
        r'(Win)(NT4\.0)',  # Firefox
        r'(X11)'
    ]
}
# Per category: raw version captured by a UA_REGEXPS pattern -> display
# version reported by parse_useragent(). Versions not listed are reported
# as-is.
UA_VERSIONS = {
    'browser': {},
    'robot': {},
    'system': {
        '10.0': '10.0 (Cheetah)',
        '10.1': '10.1 (Puma)',
        '10.2': '10.2 (Jaguar)',
        '10.3': '10.3 (Panther)',
        '10.4': '10.4 (Tiger)',
        '10.5': '10.5 (Leopard)',
        '10.6': '10.6 (Snow Leopard)',
        '10.7': '10.7 (Lion)',
        '10.8': '10.8 (Mountain Lion)',
        '10.9': '10.9 (Mavericks)',
        '10.10': '10.10 (Yosemite)',
        '10.11': '10.11 (El Capitan)',
        '40': 'Series 40',
        '60': 'Series 60',
        'NT 3.1': 'NT 3.1 (3.1)',
        'NT 3.5': 'NT 3.5 (NT)',
        'NT 4.0': 'NT 4.0 (NT)',
        'NT 4.1': 'NT 4.1 (98)',
        '9x 4.90': 'NT 4.9 (ME)',
        'NT 5.0': 'NT 5.0 (2000)',
        'NT 5.1': 'NT 5.1 (XP)',
        'NT 5.2': 'NT 5.2 (2003)',
        'NT 6.0': 'NT 6.0 (Vista)',
        'NT 6.1': 'NT 6.1 (7)',
        'NT 6.2': 'NT 6.2 (8)',
        'NT 6.3': 'NT 6.3 (8.1)',
        'NT 6.4': 'NT 6.4 (10)',
        'NT 10.0': 'NT 10.0 (10)',
        '16': 'NT 3.1 (3.1)',
        '3.1': 'NT 3.1 (3.1)',
        '95': 'NT 4.0 (95)',
        'NT': 'NT 4.0 (NT)',
        'NT4.0': 'NT 4.0 (NT)',
        '98': 'NT 4.1 (98)',
        'ME': 'NT 4.9 (ME)',
        '2000': 'NT 5.0 (2000)',
        'XP': 'NT 5.1 (XP)',
        '2003': 'NT 5.2 (2003)'
    }
}

def get_sort_name(name):
    """
    Return a person's name in sortable form, normally "Last, First".

    Names without a space, or that already contain ", ", are returned
    unchanged. Names starting with "the " are passed to get_sort_title().
    Names recognized by is_asian_name() are returned as "Last First",
    separated by a space instead of a comma.

    >>> get_sort_name('Alfred Hitchcock')
    'Hitchcock, Alfred'

    >>> get_sort_name('Jean-Luc Godard')
    'Godard, Jean-Luc'

    >>> get_sort_name('Rainer Werner Fassbinder')
    'Fassbinder, Rainer Werner'

    >>> get_sort_name('Brian De Palma')
    'De Palma, Brian'

    >>> get_sort_name('Johan van der Keuken')
    'van der Keuken, Johan'

    >>> get_sort_name('Edward D. Wood Jr.')
    'Wood Jr., Edward D.'

    >>> get_sort_name('Bing Wang')
    'Wang Bing'

    >>> get_sort_name('Frank Capra III')
    'Capra III, Frank'

    >>> get_sort_name('The Queen of England')
    'Queen of England, The'

    >>> get_sort_name('Sham 69')
    'Sham 69'

    >>> get_sort_name('Scorsese, Martin')
    'Scorsese, Martin'
    """
    # single word, or already in "Last, First" form
    if ' ' not in name or ', ' in name:
        return name
    if name.lower().startswith('the '):
        return get_sort_title(name)

    def add_name():
        # move the last remaining first-name token to the front of last_names
        if len(first_names):
            last_names.insert(0, first_names.pop())

    def find_name(names):
        # truthy if the last remaining first-name token (lower-cased) is in names
        return len(first_names) and first_names[-1].lower() in names

    if is_asian_name(name):
        # hyphens are treated like spaces when counting name parts
        names = name.replace('-', ' ').split(' ')
        if len(names) == 2:
            if names[0].lower() in ASIAN_LAST_NAMES:
                lastname, firstname = names
            else:
                firstname, lastname = names
        else:
            # names_ keeps the hyphens, to see which two parts were joined
            names_ = name.split(' ')
            if '-' in names_[0]:
                # "Xxx-yyy Last": hyphenated given name comes first
                lastname, firstname = [names[2], names[0] + '-' + names[1].lower()]
            elif '-' in names_[1]:
                # "Last Xxx-yyy": hyphenated given name comes second
                lastname, firstname = [names[0], names[1] + '-' + names[2].lower()]
            elif names[0].lower() in ASIAN_FIRST_NAMES and names[2].lower() not in ASIAN_FIRST_NAMES:
                lastname, firstname = [names[2], names[0] + ' ' + names[1]]
            elif names[0].lower() not in ASIAN_FIRST_NAMES and names[2].lower() in ASIAN_FIRST_NAMES:
                lastname, firstname = [names[0], names[1] + ' ' + names[2]]
            elif names[0].lower() in ASIAN_LAST_NAMES:
                lastname, firstname = [names[0], names[1] + ' ' + names[2]]
            else:
                # ambiguous: fall back to treating the final part as the last name
                lastname, firstname = [names[2], names[0] + ' ' + names[1]]
        return lastname + ' ' + firstname

    # All tokens start out as first names; tokens are then peeled off the end
    # into last_names. The order of the steps below matters.
    first_names = name.split(' ')
    last_names = []
    # trailing number ("Sham 69")
    if re.search(r'^[0-9]+$', first_names[-1]):
        add_name()
    # trailing parenthesized or bracketed token
    if re.search(r'[(\[].+?[)\]]$', first_names[-1]):
        add_name()
    # trailing suffix ("Jr.", "III")
    if find_name(SUFFIXES):
        add_name()
    # the last name proper
    add_name()
    # connecting word: take it and the word before it
    if find_name(MIDFIXES):
        add_name()
        add_name()
    # any number of particles ("van der", "De")
    while find_name(PREFIXES):
        add_name()
    name = ' '.join(last_names)
    if len(first_names):
        separator = ' ' if last_names[0].lower() in ASIAN_LAST_NAMES else ', '
        name += separator + ' '.join(first_names)
    return name

def get_sort_title(title):
    """
    Move a leading article (see ARTICLES) to the end of a title.

    The match is case-insensitive; the article keeps its original spelling.
    Titles that do not start with a known article are returned unchanged.

    >>> get_sort_title('Themroc')
    'Themroc'

    >>> get_sort_title('Die Hard')
    'Hard, Die'

    >>> get_sort_title("L'atalante")
    "atalante, L'"

    """
    lowered = title.lower()
    for article in ARTICLES:
        # elided articles (l') attach directly to the following word,
        # all others must be followed by a space
        prefix = article if article.endswith("'") else article + ' '
        if lowered.startswith(prefix):
            return title[len(prefix):] + ', ' + title[:len(article)]
    return title

def find_re(string, regexp):
    """
    Return the first match of regexp in string, stripped of whitespace.

    The pattern is compiled with re.DOTALL. If it contains groups, the first
    group of the first match is returned, otherwise the whole match.
    Returns '' when nothing matches.
    """
    # search() stops at the first match instead of collecting all of them
    match = re.compile(regexp, re.DOTALL).search(string)
    if match is None:
        return ''
    # Patterns with several groups used to crash, because findall() returns
    # tuples for them; always take the first group when there is one.
    result = match.group(1 if match.re.groups else 0)
    # a group that did not participate in the match is None
    return (result or '').strip()

def find_string(string, string0='', string1=''):
    """Return the string between string0 and string1.

    If string0 or string1 is left out, the beginning or end of string is
    used instead.

    >>> find_string('i am not there', string1=' not there')
    'i am'

    >>> find_string('i am not there', 'i am ', ' there')
    'not'

    >>> find_string('i am not there', 'i am not t')
    'here'

    """
    # delimiters are matched literally; a missing one becomes an anchor
    start = re.escape(string0) if string0 else '^'
    end = re.escape(string1) if string1 else '$'
    return find_re(string, start + '(.*?)' + end)

def is_asian_name(name):
    """
    Return True if name looks like an Asian name, based on the
    ASIAN_FIRST_NAMES and ASIAN_LAST_NAMES lists.

    Hyphens count as separators. A two-part name (without hyphen) must pair a
    known first name with a known last name, in either order. A three-part
    name must have a known first name in the middle and another known first
    name next to it.
    """
    parts = name.replace('-', ' ').lower().split(' ')
    if len(parts) == 2 and '-' not in name:
        first, second = parts
        if first in ASIAN_FIRST_NAMES and second in ASIAN_LAST_NAMES:
            return True
        if first in ASIAN_LAST_NAMES and second in ASIAN_FIRST_NAMES:
            return True
    if len(parts) == 3:
        return parts[1] in ASIAN_FIRST_NAMES and (
            parts[0] in ASIAN_FIRST_NAMES or parts[2] in ASIAN_FIRST_NAMES
        )
    return False

def parse_useragent(useragent):
    """
    Extract browser, robot and system information from a user-agent string.

    Returns a dict with one entry per key of UA_REGEXPS. Each entry is a dict
    with the keys 'name', 'version' and 'string' (name plus version); all
    three are '' if no pattern of that category matched.
    """
    data = {}
    for key, regexps in UA_REGEXPS.items():
        # rewrite aliased tokens to their canonical names before matching;
        # non-browser aliases keep the original token after the name
        for alias, regexp in UA_ALIASES[key].items():
            alias = alias if key == 'browser' else alias + ' \\1'
            useragent = re.sub(regexp, alias, useragent)
        # Default result, set once per category (it used to be re-created for
        # every pattern, and was missing for a category without patterns).
        data[key] = {'name': '', 'version': '', 'string': ''}
        for regexp in regexps:
            match = re.search(regexp, useragent)
            if not match:
                continue
            matches = list(match.groups())
            if len(matches) == 1:
                # pattern without a version group
                matches.append('')
            # some patterns capture (version, name) rather than (name, version)
            swap = re.match(r'^\d', matches[0]) or matches[1] == 'Linux'
            name = matches[1 if swap else 0]
            version = matches[0 if swap else 1].replace('_', '.')
            name = UA_NAMES[key].get(name, name)
            version = UA_VERSIONS[key].get(version, version)
            string = name
            if version:
                # for these systems the "version" is a distribution/flavor
                if name in ('BSD', 'Linux', 'Unix'):
                    string += ' (' + version + ')'
                else:
                    string += ' ' + version
            data[key] = {
                'name': name,
                'version': version,
                'string': string
            }
            # first matching pattern wins
            break
    return data

def remove_special_characters(text):
    """
    Replaces special characters inserted by Word with plain ASCII ones.
    """
    replacements = (
        ('\u2013', '-'),    # en dash
        ('\u2026O', "'"),
        ('\u2019', "'"),    # right single quotation mark
        # The next three search strings were empty/unprintable literals in
        # the source; replacing '' inserts the replacement between every
        # character. NOTE(review): presumed to be the Windows-1252 quotes and
        # dash (0x91, 0x92, 0x96) as C1 control characters -- confirm.
        ('\x91', "'"),
        ('\x92', "'"),
        ('\x96', '-'),
    )
    for special, plain in replacements:
        text = text.replace(special, plain)
    return text

def wrap(text, width):
    """
    Word-wrap text to the given width, preserving existing line breaks and
    most spaces. Existing line breaks are expected to be POSIX newlines.
    See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
    """
    pieces = text.split(' ')
    wrapped = pieces[0]
    for piece in pieces[1:]:
        # length of the line currently being filled
        current_line = wrapped[wrapped.rfind('\n') + 1:]
        # only the part of the piece before its own first newline counts
        first_part = piece.split('\n', 1)[0]
        if len(current_line) + len(first_part) >= width:
            glue = '\n'
        else:
            glue = ' '
        wrapped = wrapped + glue + piece
    return wrapped

def wrap_string(string, length=80, separator='\n', balance=False):
    '''
    Wrap string into lines of at most length characters (not counting the
    trailing space kept at the end of each line), joined by separator.
    Words longer than length are broken up. With balance=True, the shortest
    line length that still produces the same number of lines is used.

    >>> wrap_string("Anticonstitutionellement, Paris s'eveille", 16)
    "Anticonstitution\\nellement, Paris \\ns'eveille"
    >>> wrap_string(u'All you can eat', 12, '\\n', True)
    'All you \\ncan eat'
    '''
    words = string.split(' ')
    if balance:
        # balance lines: test if same number of lines
        # can be achieved with a shorter line length
        lines = wrap_string(string, length, separator, False).split(separator)
        if len(lines) > 1:
            longest = max(len(word) for word in words)
            while length > longest:
                length -= 1
                if len(wrap_string(string, length, separator, False).split(separator)) > len(lines):
                    length += 1
                    break
    lines = ['']
    for word in words:
        if len(lines[-1]) + len(word) <= length:
            # word fits in current line
            lines[-1] += word + ' '
        elif len(word) <= length:
            # word fits in next line
            lines.append(word + ' ')
        else:
            # word is longer than line: fill up the current line, then
            # continue in chunks. The current line can already be one longer
            # than length (trailing space), so clamp at 0; a negative
            # position used to mangle the word.
            position = max(0, length - len(lines[-1]))
            lines[-1] += word[:position]
            for i in range(position, len(word), length):
                lines.append(word[i:i + length])
            lines[-1] += ' '
    return separator.join(lines).strip()

def truncate_string(string, length, padding='...', position='right'):
    """
    Shorten string to length characters, marking the cut with padding.

    position selects where the cut is made: 'left', 'center' or 'right'.
    Strings that already fit, and unknown positions, are returned unchanged.

    >>> truncate_string('anticonstitutionellement', 16, '...', 'left')
    '...utionellement'
    >>> truncate_string('anticonstitutionellement', 16, '...', 'center')
    'anticon...lement'
    >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
    'anticonstitut...'
    """
    total = len(string)
    pad = len(padding)
    if total <= length:
        return string
    # number of characters of the original string that are kept
    keep = length - pad
    if position == 'left':
        return '%s%s' % (padding, string[total - keep:])
    if position == 'center':
        # with an odd number of kept characters, the head gets the extra one
        half = float(keep) / 2
        head = int(math.ceil(half))
        tail = int(total - math.floor(half))
        return '%s%s%s' % (string[:head], padding, string[tail:])
    if position == 'right':
        return '%s%s' % (string[:keep], padding)
    return string

def truncate_words(s, num):
    """Truncates a string after a certain number of characters, but only at
    a word boundary; '...' is appended if words were dropped.

    >>> truncate_words('Truncates a string after a certain number of chacters, but ends with a word', 23)
    'Truncates a string...'
    >>> truncate_words('Truncates a string', 23)
    'Truncates a string'

    """
    limit = int(num)
    if len(s) <= limit:
        return s
    tokens = s.split()
    kept = []
    # running length, counting one separating space per kept word
    used = 0
    for token in tokens:
        if used + len(token) >= limit:
            break
        kept.append(token)
        used += len(token) + 1
    result = ' '.join(kept)
    if len(kept) < len(tokens):
        result += '...'
    return result

def trim_string(string, num):
    """Shortens a string longer than num characters to num characters by
    replacing its middle with '...'; the last 10 characters are always kept.

    >>> trim_string('Truncates a string after a certain number of chacters', 23)
    'Truncates ...f chacters'
    >>> trim_string('Truncates a string', 23)
    'Truncates a string'
    """
    if len(string) <= num:
        return string
    # 13 = 3 characters of '...' plus the 10 characters kept at the end
    head, tail = string[:num - 13], string[-10:]
    return head + '...' + tail

def get_valid_filename(s):
    """
    Returns the given string converted to a string that can be used for a
    clean filename. Leading and trailing whitespace is removed, spaces and
    all other characters that are not filename-safe are replaced with
    underscores, and short runs of underscores are collapsed.

    >>> get_valid_filename("john's portrait in 2004.jpg")
    'john_s_portrait_in_2004.jpg'
    """
    cleaned = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s.strip().replace(' ', '_'))
    # two passes: fully collapses runs of up to four underscores
    for _ in range(2):
        cleaned = cleaned.replace('__', '_')
    return cleaned

def get_text_list(list_, last_word='or'):
    """
    Join the items of list_ into readable text: items are separated by
    commas, except the last two, which are joined by last_word.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    'a and b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([])
    ''
    """
    count = len(list_)
    if count == 0:
        return ''
    if count == 1:
        return list_[0]
    leading = ', '.join(list(list_)[:-1])
    return '%s %s %s' % (leading, last_word, list_[-1])

def get_list_text(text, last_word='or'):
    """
    Split text as produced by get_text_list() back into a list of items.

    >>> get_list_text('a, b, c or d')
    ['a', 'b', 'c', 'd']
    >>> get_list_text('a, b and c', 'and')
    ['a', 'b', 'c']
    >>> get_list_text('a and b', 'and')
    ['a', 'b']
    >>> get_list_text('a')
    ['a']
    >>> get_list_text('')
    []
    """
    if not text:
        return []
    items = text.split(', ')
    # Split the final item on last_word only where it stands as a separate
    # word (it used to match inside words, so 'corn' became 'c', 'n'), and
    # only at its last occurrence, which is where get_text_list() puts it.
    last = items[-1].rsplit(' %s ' % last_word, 1)
    if len(last) == 2:
        items[-1:] = [part.strip() for part in last]
    return items

def normalize_newlines(text):
    """Convert Windows (CR LF) and old Mac (CR) line endings to LF."""
    # CR LF first, so that it becomes one newline instead of two
    return text.replace('\r\n', '\n').replace('\r', '\n')

def recapitalize(text):
    """
    Recapitalizes text: everything is lower-cased, then the first letter of
    the text and each a-z letter following '. ', '? ' or '! ' is capitalized.
    """
    def upcase(match):
        return match.group(1).upper()

    sentence_start = r'(?:^|(?<=[\.\?\!] ))([a-z])'
    return re.sub(sentence_start, upcase, text.lower())

def phone2numeric(phone):
    """
    Converts a phone number with letters into its numeric equivalent.

    Letters are mapped case-insensitively to their telephone keypad digit;
    all other characters are left unchanged.
    """
    # Standard keypad layout. Q and Z were missing from the original mapping
    # and were left unconverted.
    keypad = {
        '2': 'abc', '3': 'def', '4': 'ghi', '5': 'jkl',
        '6': 'mno', '7': 'pqrs', '8': 'tuv', '9': 'wxyz',
    }
    table = {}
    for digit, letters in keypad.items():
        for letter in letters:
            table[ord(letter)] = digit
            table[ord(letter.upper())] = digit
    return phone.translate(table)

def compress_string(s):
    """Return the bytes s gzip-compressed (compression level 6)."""
    zbuf = BytesIO()
    # leaving the with-block closes the gzip stream, which writes the trailer
    with gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) as zfile:
        zfile.write(s)
    return zbuf.getvalue()

# Matches one token: a double-quoted phrase, a single-quoted phrase (both may
# contain backslash escapes), or a run of non-whitespace characters.
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')

def smart_split(text):
    """
    Generator that splits a string by whitespace, keeping quoted phrases
    together. Both single and double quotes are supported, and quotes can be
    escaped with backslashes. Quoted phrases are yielded with their
    surrounding quote marks, with escaped quotes and backslashes unescaped.
    >>> list(smart_split('This is "a person\\'s" test.'))
    ['This', 'is', '"a person\\'s"', 'test.']
    """
    for match in smart_split_re.finditer(text):
        token = match.group(0)
        quote = token[0]
        if quote in ('"', "'"):
            # unescape the enclosing quote type first, then backslashes
            inner = token[1:-1].replace('\\' + quote, quote).replace('\\\\', '\\')
            yield quote + inner + quote
        else:
            yield token

def words(text):
    """
        returns the whitespace-separated words in text, removing one
        trailing punctuation mark (. ! ? : - _) or possessive 's per word
    """
    # "-" is placed last in the character class so that it is a literal
    # hyphen. Written as ":-_" it was a range from ":" to "_", which includes
    # A-Z, so trailing capital letters were stripped ("USA" became "US").
    trailing = re.compile(r"(?:[.!?:_-]|'s)$")
    return [trailing.sub('', word) for word in text.split()]

def sort_string(string):
    """
    Return a sort key for string: a few letters are transliterated, thousands
    separators are removed from numbers, numbers are zero-padded to 10 digits
    so that they sort numerically, and the result is NFKD-normalized.
    """
    string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th')

    # pad numbered titles
    # Remove thousands separators first. Lookarounds are used so that every
    # comma is matched on its own: a consuming pattern handled only the first
    # separator of numbers like 1,000,000.
    string = re.sub(r'(?<=\d),(?=\d{3})', '', string)
    string = re.sub(r'\d+', lambda match: '%010d' % int(match.group(0)), string)
    return unicodedata.normalize('NFKD', string)

def sorted_strings(strings, key=None):
    """
    Return a new sorted list of strings, ordered by key; when key is
    omitted (or falsy), sort_string is used as the key function.
    """
    return sorted(strings, key=key or sort_string)
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
+								# -*- coding: utf-8 -*-
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:21:21 +00:00
+								# vi:si:et:sw=4:sts=4:ts=4
-												move and rename some

											
										
										
											2008-07-06 13:00:06 +00:00
+								# GPL 2008
-												drop six and python2 support

											
										
										
											2023-07-27 11:07:13 +00:00
+								import gzip
-												truncateString

											
										
										
											2008-07-06 15:34:29 +00:00
+								import math
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
+								import re
-												add sort_string, sorted_strings

											
										
										
											2012-05-16 10:29:52 +00:00
+								import unicodedata
-												drop six and python2 support

											
										
										
											2023-07-27 11:07:13 +00:00
+								from io import BytesIO
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
-												drop six and python2 support

											
										
										
											2023-07-27 11:07:13 +00:00
+								from functools import reduce
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								ARTICLES = list(set([
 								    # def sg, def pl, indef sg, indef pl (each m/f/n)
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
+								    'der', 'die', 'das', 'ein', 'eine',  # de
 								    'the', 'a', 'an',  # en
 								    'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas',  # es
 								    'le', "l'", 'la', 'les', 'un', 'une', 'des',  # fr
 								    'il', 'lo', "l'" 'la', '_i', 'gli', 'le',  # it
 								    'de', 'het', 'een',  # nl
 								    'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas'  # pt
 								    # some _disabled because of collisions
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								]))
-												add is_asian_name

											
										
										
											2017-08-02 16:39:51 +00:00
+								# every given name in 0xDB that matches Xxxx-yyyy Lastname
 								ASIAN_FIRST_NAMES = [
 								    'a', 'ae', 'aeng', 'ah', 'ai', 'an', 'back', 'bae', 'ban', 'bang', 'bao',
 								    'beom', 'bi', 'bin', 'bo', 'bok', 'bon', 'bong', 'bu', 'bum', 'byeong',
 								    'byoung', 'byung', 'cai', 'chae', 'chan', 'chang', 'chao', 'cheal', 'chen',
 								    'cheng', 'cheol', 'cheon', 'cheong', 'cheul', 'chi', 'chia', 'chiao',
 								    'chieh', 'chien', 'chih', 'chin', 'ching', 'cho', 'choi', 'chong', 'choo',
 								    'chu', 'chuan', 'chuen', 'chul', 'chun', 'chung', 'chuo', 'chyi', 'da',
 								    'dae', 'dah', 'dal', 'dan', 'deok', 'do', 'dong', 'doo', 'duek', 'duk',
 								    'e', 'el', 'en', 'eui', 'eul', 'eun', 'eung', 'fai', 'fan', 'fang', 'fei',
 								    'fen', 'feng', 'fo', 'foo', 'fu', 'ga', 'gae', 'gam', 'gang', 'ge', 'gen',
 								    'geon', 'geun', 'gi', 'gil', 'gin', 'gnad', 'gok', 'goo', 'gook', 'gu',
 								    'gun', 'gwan', 'gye', 'gyeong', 'gyu', 'gyun', 'ha', 'hae', 'hak', 'han',
 								    'hang', 'hao', 'he', 'hee', 'heng', 'heon', 'hie', 'ho', 'hoi', 'hong',
 								    'hoo', 'hoon', 'hou', 'hsi', 'hsiang', 'hsiao', 'hsieh', 'hsien', 'hsin',
 								    'hsing', 'hsiung', 'hu', 'hua', 'huai', 'huang', 'hue', 'hui', 'hun',
 								    'hung', 'hwa', 'hwan', 'hwang', 'hye', 'hyeok', 'hyeon', 'hyeong', 'hyo',
 								    'hyuk', 'hyun', 'hyung', 'i', 'ik', 'il', 'in', 'ja', 'jae', 'jan', 'jang',
 								    'je', 'jee', 'jen', 'jeok', 'jeong', 'jeung', 'ji', 'jia', 'jian', 'jik',
 								    'jin', 'jing', 'jo', 'jong', 'joo', 'joon', 'ju', 'juan', 'jun', 'jung',
 								    'ka', 'kai', 'kam', 'kan', 'kang', 'kap', 'kar', 'ke', 'kee', 'kei',
 								    'keng', 'keum', 'keung', 'ki', 'kil', 'kin', 'kit', 'kot', 'ku', 'kua',
 								    'kuan', 'kuang', 'kuen', 'kun', 'kuo', 'kwang', 'kwok', 'kwon', 'kwong',
 								    'kyeong', 'kyo', 'kyoon', 'kyou', 'kyoung', 'kyu', 'kyun', 'kyung', 'lai',
 								    'lau', 'lee', 'lei', 'leng', 'leung', 'li', 'liang', 'lien', 'lin', 'ling',
 								    'lock', 'long', 'lun', 'lung', 'maeng', 'man', 'mei', 'mi', 'miao', 'min',
 								    'ming', 'mo', 'mok', 'moo', 'mook', 'moon', 'mu', 'mun', 'myeong',
 								    'myoeng', 'myong', 'myung', 'na', 'nae', 'nai', 'nam', 'nan', 'neung',
 								    'ngaru', 'ni', 'no', 'nyeo', 'oh', 'ok', 'ou', 'pai', 'pei', 'pen', 'peng',
 								    'pi', 'pil', 'pin', 'ping', 'po', 'pui', 'pyo', 'pyung', 'qing', 'qun',
 								    'ra', 'rak', 'ram', 'ran', 'reum', 'ri', 'rim', 'rin', 'roe', 'rok', 'ru',
 								    'rui', 'ryeon', 'ryol', 'ryong', 'sa', 'sae', 'san', 'sang', 'se', 'seo',
 								    'seob', 'seok', 'seol', 'seon', 'seong', 'seung', 'shan', 'shen', 'sheng',
 								    'shi', 'shia', 'shiang', 'shih', 'shik', 'shim', 'shin', 'shing', 'shou',
 								    'shu', 'shun', 'si', 'sik', 'sin', 'siu', 'so', 'song', 'soo', 'sook',
 								    'soon', 'su', 'suk', 'sun', 'sung', 'sup', 'szu', "t'ien", 'ta', 'tae',
 								    'taek', 'tai', 'tak', 'te', 'ti', 'tian', 'ting', 'to', 'toa', 'tsai',
 								    'tsan', 'tse', 'tso', 'tsui', 'tung', 'tzu', 'ua', 'ui', 'un', 'wah',
 								    'wai', 'wan', 'wei', 'wen', 'weon', 'wing', 'wit', 'wol', 'won', 'woo',
 								    'wook', 'woon', 'woong', 'wuk', 'xiao', 'ya', 'yan', 'yang', 'yao', 'ye',
 								    'yea', 'yee', 'yeh', 'yen', 'yeo', 'yeol', 'yeon', 'yeong', 'yeop', 'yi',
 								    'yin', 'ying', 'yiu', 'yoeng', 'yong', 'yoo', 'yoon', 'you', 'young', 'yu',
 								    'yuan', 'yue', 'yuen', 'yuk', 'yull', 'yun', 'yune', 'yung', 'zhi',
 								    'zhong', 'zhu'
 								]
-												add some support for sorting asian names

											
										
										
											2011-10-11 14:48:51 +00:00
+								# see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
 								# and http://en.wikipedia.org/wiki/List_of_Korean_family_names
-												better is_asian_name

											
										
										
											2017-08-02 17:46:40 +00:00
+								ASIAN_LAST_NAMES = [
-												handle some more corner cases when sorting names

											
										
										
											2011-10-11 15:19:31 +00:00
+								    'chan', 'chang', 'chao',
 								    'chen', 'cheong', 'cheung',
 								    'chong', 'choo',
 								    'chu', 'chun',
 								    'hou', 'hsieh', 'hsu', 'hu', 'huang',
 								    'kuo',
 								    'li', 'liang', 'lin', 'liu',
 								    '_park',
 								    'sun', 'sung',
 								    'tsao',
 								    'wang', 'Wong',
 								    'yang', 'yeong', 'yeung'
-												add some support for sorting asian names

											
										
										
											2011-10-11 14:48:51 +00:00
+								]
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								PREFIXES = [
-												add JSONC module

											
										
										
											2011-10-12 10:19:57 +00:00
+								    'al', 'bin', 'da', 'de', 'del', 'dem', 'den', 'der', 'di', 'dos', 'du',
-												add assertion

											
										
										
											2011-10-11 19:19:54 +00:00
+								    'e', 'el', 'la', 'san', 'the', 'van', 'vom', 'von', 'y', 'zu'
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								]
 								MIDFIXES = ['und']
-												please romans

											
										
										
											2011-10-11 19:10:36 +00:00
+								SUFFIXES = ['ii', 'iii', 'jr', 'jr.', 'ph.d.', 'phd', 'sr', 'sr.']
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								UA_ALIASES = {
 								    'browser': {
-												ua parser: detect chrome on ios

											
										
										
											2012-11-09 22:07:31 +00:00
+								        'Chrome': '(CriOS|CrMo)',
-												update parse_useragent

											
										
										
											2012-10-27 16:51:39 +00:00
+								        'Firefox': '(Fennec|Firebird|Iceweasel|Minefield|Namoroka|Phoenix|SeaMonkey|Shiretoko)',
 								        'Nokia Browser': '(OviBrowser)'
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								    },
-												handle webkit; make robot a dict (since robots may run on specific systems or emulate specific browsers, they should be returned separately)

											
										
										
											2012-03-27 10:20:22 +00:00
+								    'robot': {},
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								    'system': {
 								        'BSD': '(FreeBSD|NetBSD|OpenBSD)',
 								        'Linux': '(CrOS|MeeGo|webOS)',
 								        'Unix': '(AIX|HP-UX|IRIX|SunOS)'
 								    }
 								}
 								UA_NAMES = {
 								    'browser': {
 								        'chromeframe': 'Chrome Frame',
-												update parse_useragent

											
										
										
											2012-10-27 16:51:39 +00:00
+								        'FBForIPhone': 'WebKit',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        'Gecko': 'Mozilla',
-												update ua parser

											
										
										
											2013-07-29 16:22:22 +00:00
+								        'IEMobile': 'Internet Explorer',
-												improve parse_useragent

											
										
										
											2012-10-27 19:59:40 +00:00
+								        'konqueror': 'Konqueror',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        'Mozilla': 'Netscape',
-												update parse_useragent

											
										
										
											2012-10-27 16:51:39 +00:00
+								        'MSIE': 'Internet Explorer',
-												detect IE11, part of ticket #1917

											
										
										
											2013-10-23 22:24:13 +00:00
+								        'NokiaBrowser': 'Nokia Browser',
 								        'Trident': 'Internet Explorer'
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								    },
-												handle webkit; make robot a dict (since robots may run on specific systems or emulate specific browsers, they should be returned separately)

											
										
										
											2012-03-27 10:20:22 +00:00
+								    'robot': {},
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								    'system': {
-												update ua parser

											
										
										
											2013-07-29 17:03:46 +00:00
+								        'BB': 'BlackBerry',
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								        'CPU OS': 'iOS',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        'iPhone': 'iOS',
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								        'iPhone OS': 'iOS',
-												update parse_useragent

											
										
										
											2012-10-27 16:51:39 +00:00
+								        'J2ME/MIDP': 'Java',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        'Mac_PowerPC': 'Mac OS',
 								        'Mac_PPC': 'Mac OS',
 								        'Macintosh': 'Mac OS',
-												update parse_useragent

											
										
										
											2012-10-27 16:51:39 +00:00
+								        'PLAYSTATION': 'PlayStation',
 								        'S': 'Nokia',
 								        'Series': 'Nokia',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        'Win': 'Windows',
-												fix typos

											
										
										
											2013-07-30 13:22:23 +00:00
+								        'Windows Phone OS': 'Windows Phone',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        'X11': 'Linux'
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								    }
 								}
 								UA_REGEXPS = {
 								    'browser': [
-												use r'' for regex strings

											
										
										
											2024-06-30 08:52:59 +00:00
+								        r'(Camino)\/(\d+)',
 								        r'(Chimera)\/(\d+)',
 								        r'(chromeframe)\/(\d+)',
 								        r'(Edge)\/(\d+)',
 								        r'(Epiphany)\/(\d+)',  # before Chrome, Chromium and Safari
 								        r'(Chromium)\/(\d+)',  # before Chrome
 								        r'(Chrome)\/(\d+)',
 								        r'(FBForIPhone)',
 								        r'(Firefox)\/(\d+)',
 								        r'(Galeon)\/(\d+)',
 								        r'(IEMobile)\/(\d+)',
 								        r'(iCab) (\d+)',
 								        r'(iCab)\/(\d+)',
 								        r'(konqueror)\/(\d+)',
 								        r'(Konqueror)\/(\d+)',
 								        r'(Lynx)\/(\d+)',
 								        r'(Netscape)\d?\/(\d+)',
 								        r'(NokiaBrowser)\/(\d+)',
 								        r'(OmniWeb)\/(\d+)',
 								        r'(Opera)\/.+Version\/(\d+)',
 								        r'(OviBrowser)\/(\d+)',
 								        r'Version\/(\d+).+(Safari)',
 								        r'(WebKit)\/(\d+)',
 								        r'(MSIE) (\d\d?(?!\d))',  # last, since Opera used to mask as MSIE
 								        r'(Trident)\/.*?rv:(\d+)',
 								        r'(Gecko)',
 								        r'(Mozilla)\/(3|4)'
-												handle webkit; make robot a dict (since robots may run on specific systems or emulate specific browsers, they should be returned separately)

											
										
										
											2012-03-27 10:20:22 +00:00
+								    ],
 								    'robot': [
-												use r'' for regex strings

											
										
										
											2024-06-30 08:52:59 +00:00
+								        r'(BingPreview)\/(\d+)',
 								        r'(Google Web Preview).+Chrome\/(\d+)',
 								        r'(Googlebot)\/(\d+)',
 								        r'(WebCrawler)\/(\d+)',
 								        r'(Yahoo! Slurp)\/(\d+)',
 								        r'(YandexBot)\/([\d\.]+)',
 								        r'(YandexMobileBot)\/([\d\.]+)',
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								    ],
 								    'system': [
-												use r'' for regex strings

											
										
										
											2024-06-30 08:52:59 +00:00
+								        r'(Android) (\d+)',
 								        r'(Android)',
 								        r'(BB)(\d+)',
 								        r'(BeOS)',
 								        r'(BlackBerry) (\d+)',
 								        r'(BlackBerry)',
 								        r'(Darwin)',
 								        r'(BSD) (FreeBSD|NetBSD|OpenBSD)',
 								        r'(CPU OS) (\d+)',
 								        r'(iPhone OS) (\d+)',
 								        r'(iPhone)',  # Opera
 								        r'(J2ME\/MIDP)',
 								        r'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
 								        r'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
 								        r'(Linux)',
 								        r'(Mac OS X) (10.\d+)',
 								        r'(Mac OS X)',
 								        r'(Mac_PowerPC)',
 								        r'(Mac_PPC)',
 								        r'(Macintosh)',
 								        r'Nintendo (Wii).+NX\/(\d+)',
 								        r'(PLAYSTATION) (\d+)',
 								        r'(PlayStation) Vita (\d+)',
 								        r'(RIM Tablet OS) (\d+)',
 								        r'(S)(60);',
 								        r'(Series) ?(40|60)',
 								        r'(Symbian OS)',
 								        r'(SymbianOS)\/(\d+)',
 								        r'(SymbOS)',
 								        r'(OS\/2)',
 								        r'(Unix) (AIX|HP-UX|IRIX|SunOS)',
 								        r'(Unix)',
 								        r'(Windows) (NT \d\.\d)',
 								        r'(Windows Phone) (\d+)',
 								        r'(Windows Phone OS) (\d+)',
 								        r'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)',  # Opera
 								        r'(Win) (9x 4\.90)',  # Firefox
 								        r'(Win)(16)',  # Firefox
 								        r'(Win)(9\d)',  # Firefox
 								        r'(Win)(NT)',  # Firefox
 								        r'(Win)(NT4\.0)',  # Firefox
 								        r'(X11)'
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								    ]
 								}
 								UA_VERSIONS = {
 								    'browser': {},
-												handle webkit; make robot a dict (since robots may run on specific systems or emulate specific browsers, they should be returned separately)

											
										
										
											2012-03-27 10:20:22 +00:00
+								    'robot': {},
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								    'system': {
-												update parse_useragent

											
										
										
											2012-03-21 07:47:04 +00:00
+								        '10.0': '10.0 (Cheetah)',
 								        '10.1': '10.1 (Puma)',
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								        '10.2': '10.2 (Jaguar)',
 								        '10.3': '10.3 (Panther)',
 								        '10.4': '10.4 (Tiger)',
 								        '10.5': '10.5 (Leopard)',
 								        '10.6': '10.6 (Snow Leopard)',
 								        '10.7': '10.7 (Lion)',
-												update parse_useragent

											
										
										
											2012-03-21 07:47:04 +00:00
+								        '10.8': '10.8 (Mountain Lion)',
-												fix typos

											
										
										
											2013-07-30 13:22:23 +00:00
+								        '10.9': '10.9 (Mavericks)',
-												parse_useragent: add Mac OS X 10.10 Yosemite

											
										
										
											2014-09-04 16:50:34 +00:00
+								        '10.10': '10.10 (Yosemite)',
-												UA strings: Edge+El Capitan

											
										
										
											2015-08-04 17:23:47 +00:00
+								        '10.11': '10.11 (El Capitan)',
-												update parse_useragent

											
										
										
											2012-10-27 16:51:39 +00:00
+								        '40': 'Series 40',
 								        '60': 'Series 60',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        'NT 3.1': 'NT 3.1 (3.1)',
 								        'NT 3.5': 'NT 3.5 (NT)',
-												Change the string for versions of Microsoft Windows from 'NT X.Y (Windows ABC)' to 'NT X.Y (ABC)'

											
										
										
											2012-03-22 20:17:14 +00:00
+								        'NT 4.0': 'NT 4.0 (NT)',
 								        'NT 4.1': 'NT 4.1 (98)',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        '9x 4.90': 'NT 4.9 (ME)',
-												Change the string for versions of Microsoft Windows from 'NT X.Y (Windows ABC)' to 'NT X.Y (ABC)'

											
										
										
											2012-03-22 20:17:14 +00:00
+								        'NT 5.0': 'NT 5.0 (2000)',
 								        'NT 5.1': 'NT 5.1 (XP)',
 								        'NT 5.2': 'NT 5.2 (2003)',
 								        'NT 6.0': 'NT 6.0 (Vista)',
 								        'NT 6.1': 'NT 6.1 (7)',
 								        'NT 6.2': 'NT 6.2 (8)',
-												update UA parser

											
										
										
											2013-07-30 17:06:01 +00:00
+								        'NT 6.3': 'NT 6.3 (8.1)',
-												update UA_VERSIONS.system

											
										
										
											2014-11-21 09:46:12 +00:00
+								        'NT 6.4': 'NT 6.4 (10)',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        '16': 'NT 3.1 (3.1)',
 								        '3.1': 'NT 3.1 (3.1)',
-												Change the string for versions of Microsoft Windows from 'NT X.Y (Windows ABC)' to 'NT X.Y (ABC)'

											
										
										
											2012-03-22 20:17:14 +00:00
+								        '95': 'NT 4.0 (95)',
 								        'NT': 'NT 4.0 (NT)',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        'NT4.0': 'NT 4.0 (NT)',
-												Change the string for versions of Microsoft Windows from 'NT X.Y (Windows ABC)' to 'NT X.Y (ABC)'

											
										
										
											2012-03-22 20:17:14 +00:00
+								        '98': 'NT 4.1 (98)',
 								        'ME': 'NT 4.9 (ME)',
 								        '2000': 'NT 5.0 (2000)',
 								        'XP': 'NT 5.1 (XP)',
-												update user agent parser

											
										
										
											2012-08-15 15:58:46 +00:00
+								        '2003': 'NT 5.2 (2003)'
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
+								    }
 								}
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								def get_sort_name(name):
 								    """
 								    >>> get_sort_name('Alfred Hitchcock')
 								    'Hitchcock, Alfred'
 								    >>> get_sort_name('Jean-Luc Godard')
 								    'Godard, Jean-Luc'
 								    >>> get_sort_name('Rainer Werner Fassbinder')
 								    'Fassbinder, Rainer Werner'
 								    >>> get_sort_name('Brian De Palma')
 								    'De Palma, Brian'
 								    >>> get_sort_name('Johan van der Keuken')
 								    'van der Keuken, Johan'
 								    >>> get_sort_name('Edward D. Wood Jr.')
 								    'Wood Jr., Edward D.'
-												add some support for sorting asian names

											
										
										
											2011-10-11 14:48:51 +00:00
+								    >>> get_sort_name('Bing Wang')
 								    'Wang Bing'
-												add assertion

											
										
										
											2011-10-11 19:19:54 +00:00
+								    >>> get_sort_name('Frank Capra III')
 								    'Capra III, Frank'
-												handle some more corner cases when sorting names

											
										
										
											2011-10-11 15:19:31 +00:00
+								    >>> get_sort_name('The Queen of England')
 								    'Queen of England, The'
 								    >>> get_sort_name('Sham 69')
 								    'Sham 69'
-												add some support for sorting asian names

											
										
										
											2011-10-11 14:48:51 +00:00
+								    >>> get_sort_name('Scorsese, Martin')
 								    'Scorsese, Martin'
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								    """
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
+								    if ' ' not in name or ', ' in name:
-												add some support for sorting asian names

											
										
										
											2011-10-11 14:48:51 +00:00
+								        return name
-												handle some more corner cases when sorting names

											
										
										
											2011-10-11 15:19:31 +00:00
+								    if name.lower().startswith('the '):
 								        return get_sort_title(name)
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								    def add_name():
 								        if len(first_names):
 								            last_names.insert(0, first_names.pop())
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								    def find_name(names):
 								        return len(first_names) and first_names[-1].lower() in names
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
-												add is_asian_name

											
										
										
											2017-08-02 16:39:51 +00:00
+								    if is_asian_name(name):
-												fix is_asian_name

											
										
										
											2017-08-02 17:51:14 +00:00
+								        names = name.replace('-', ' ').split(' ')
-												better is_asian_name

											
										
										
											2017-08-02 17:46:40 +00:00
+								        if len(names) == 2:
 								            if names[0].lower() in ASIAN_LAST_NAMES:
 								                lastname, firstname = names
 								            else:
 								                firstname, lastname = names
 								        else:
 								            names_ = name.split(' ')
 								            if '-' in names_[0]:
 								                lastname, firstname = [names[2], names[0] + '-' + names[1].lower()]
 								            elif '-' in names_[1]:
-												fix is_asian_name

											
										
										
											2017-08-02 17:49:49 +00:00
+								                lastname, firstname = [names[0], names[1] + '-' + names[2].lower()]
-												fix is_asian_name

											
										
										
											2017-08-02 17:52:06 +00:00
+								            elif names[0].lower() in ASIAN_FIRST_NAMES and names[2].lower() not in ASIAN_FIRST_NAMES:
-												better is_asian_name

											
										
										
											2017-08-02 17:46:40 +00:00
+								                lastname, firstname = [names[2], names[0] + ' ' + names[1]]
-												fix is_asian_name

											
										
										
											2017-08-02 17:52:06 +00:00
+								            elif names[0].lower() not in ASIAN_FIRST_NAMES and names[2].lower() in ASIAN_FIRST_NAMES:
-												better is_asian_name

											
										
										
											2017-08-02 17:46:40 +00:00
+								                lastname, firstname = [names[0], names[1] + ' ' + names[2]]
-												fix is_asian_name

											
										
										
											2017-08-02 17:52:06 +00:00
+								            elif names[0].lower() in ASIAN_LAST_NAMES:
-												better is_asian_name

											
										
										
											2017-08-02 17:46:40 +00:00
+								                lastname, firstname = [names[0], names[1] + ' ' + names[2]]
 								            else:
 								                lastname, firstname = [names[2], names[0] + ' ' + names[1]]
-												fix is_asian_name

											
										
										
											2017-08-02 17:52:40 +00:00
+								        return lastname + ' ' + firstname
-												add is_asian_name

											
										
										
											2017-08-02 16:39:51 +00:00
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								    first_names = name.split(' ')
 								    last_names = []
-												use r'' for regex strings

											
										
										
											2024-06-30 08:52:59 +00:00
+								    if re.search(r'^[0-9]+$', first_names[-1]):
-												handle some more corner cases when sorting names

											
										
										
											2011-10-11 15:19:31 +00:00
+								        add_name()
-												use r'' for regex strings

											
										
										
											2024-06-30 08:52:59 +00:00
+								    if re.search(r'[(\[].+?[)\]]$', first_names[-1]):
-												when sorting names, handle trailing (...) and [...]

											
										
										
											2014-12-16 18:11:14 +00:00
+								        add_name()
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								    if find_name(SUFFIXES):
 								        add_name()
 								    add_name()
 								    if find_name(MIDFIXES):
 								        add_name()
 								        add_name()
 								    while find_name(PREFIXES):
 								        add_name()
-												handle some more corner cases when sorting names

											
										
										
											2011-10-11 15:19:31 +00:00
+								    name = ' '.join(last_names)
 								    if len(first_names):
-												better is_asian_name

											
										
										
											2017-08-02 17:46:40 +00:00
+								        separator = ' ' if last_names[0].lower() in ASIAN_LAST_NAMES else ', '
-												handle some more corner cases when sorting names

											
										
										
											2011-10-11 15:19:31 +00:00
+								        name += separator + ' '.join(first_names)
 								    return name
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
 								def get_sort_title(title):
 								    """
 								    >>> get_sort_title('Themroc')
 								    'Themroc'
 								    >>> get_sort_title('Die Hard')
 								    'Hard, Die'
-												fix a bug in get_sort_title with articles that end with a single quote

											
										
										
											2011-10-11 16:51:28 +00:00
+								    >>> get_sort_title("L'atalante")
 								    "atalante, L'"
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								    """
 								    for article in ARTICLES:
-												fix a bug in get_sort_title with articles that end with a single quote

											
										
										
											2011-10-11 16:51:28 +00:00
+								        spaces = 0 if article.endswith("'") else 1
 								        if title.lower().startswith(article + ' ' * spaces):
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								            length = len(article)
-												fix a bug in get_sort_title with articles that end with a single quote

											
										
										
											2011-10-11 16:51:28 +00:00
+								            return title[length + spaces:] + ', ' + title[:length]
-												update file module, add get_sort_name and get_sort_title to text module

											
										
										
											2011-10-11 14:14:29 +00:00
+								    return title
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
+								def find_re(string, regexp):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:21:21 +00:00
+								    result = re.compile(regexp, re.DOTALL).findall(string)
 								    if result:
 								        return result[0].strip()
 								    return ''
-												make findRegexp do more

											
										
										
											2008-04-29 13:34:27 +00:00
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
def find_string(string, string0='', string1=''):
    """Return the string between string0 and string1.

    If string0 or string1 is left out, beginning or end of string is used.

    >>> find_string('i am not there', string1=' not there')
    'i am'

    >>> find_string('i am not there', 'i am ', ' there')
    'not'

    >>> find_string('i am not there', 'i am not t')
    'here'

    """
    # both delimiters are matched literally; a missing one becomes an anchor
    start = re.escape(string0) if string0 else '^'
    end = re.escape(string1) if string1 else '$'
    return find_re(string, start + '(.*?)' + end)
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
-												add is_asian_name

											
										
										
											2017-08-02 16:39:51 +00:00
def is_asian_name(name):
    """
    Return True if name looks like an Asian name, judged by looking up its
    lowercased parts in ASIAN_FIRST_NAMES and ASIAN_LAST_NAMES.

    Hyphens are treated like spaces when splitting the name into parts.
    Two-part names must not contain a hyphen and must consist of one known
    first name and one known last name, in either order. Three-part names
    qualify when the middle part and at least one neighbour are known first
    names.
    """
    parts = name.replace('-', ' ').lower().split(' ')
    if len(parts) == 2 and '-' not in name:
        first, second = parts
        if first in ASIAN_FIRST_NAMES and second in ASIAN_LAST_NAMES:
            return True
        if first in ASIAN_LAST_NAMES and second in ASIAN_FIRST_NAMES:
            return True
    if len(parts) == 3:
        return parts[1] in ASIAN_FIRST_NAMES and (
            parts[0] in ASIAN_FIRST_NAMES or parts[2] in ASIAN_FIRST_NAMES
        )
    return False
-												add is_asian_name

											
										
										
											2017-08-02 16:39:51 +00:00
-												add parse_useragent

											
										
										
											2012-03-21 07:44:24 +00:00
def parse_useragent(useragent):
    """
    Extract name and version information from a user agent string.

    For every key in UA_REGEXPS the aliases from UA_ALIASES[key] are first
    substituted into the user agent string (the rewritten string is carried
    over to the following keys), then the patterns in UA_REGEXPS[key] are
    tried in order and the first one that matches is used.

    Returns a dict that maps each key having at least one pattern to a dict
    with the entries 'name', 'version' and 'string'; all three are empty
    strings if none of the patterns matched.
    """
    data = {}
    for key in UA_REGEXPS:
        for alias, regexp in UA_ALIASES[key].items():
            # for every key except 'browser' the first captured group is
            # kept after the alias
            alias = alias if key == 'browser' else alias + ' \\1'
            useragent = re.sub(regexp, alias, useragent)
        for regexp in UA_REGEXPS[key]:
            # default entry, replaced below as soon as a pattern matches
            data[key] = {'name': '', 'version': '', 'string': ''}
            match = re.compile(regexp).search(useragent)
            if match:
                matches = list(match.groups())
                if len(matches) == 1:
                    # only one group captured: use '' for the second value
                    matches.append('')
                # groups are read as (name, version) unless the first group
                # starts with a digit or the second group is 'Linux', in
                # which case the two are swapped
                swap = re.match(r'^\d', matches[0]) or matches[1] == 'Linux'
                name = matches[1 if swap else 0]
                version = matches[0 if swap else 1].replace('_', '.')
                # translate through the lookup tables where an entry exists
                name = UA_NAMES[key][name] if name in UA_NAMES[key] else name
                version = UA_VERSIONS[key][version] if version in UA_VERSIONS[key] else version
                string = name
                if version:
                    # versions of BSD, Linux and Unix are shown in parentheses
                    string = string + ' ' + (
                        '(' + version + ')' if name in ['BSD', 'Linux', 'Unix'] else version
                    )
                data[key] = {
                    'name': name,
                    'version': version,
                    'string': string
                }
                # first matching pattern wins
                break
    return data
-												adding findString()

											
										
										
											2008-04-29 11:26:42 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
def remove_special_characters(text):
    """
    Replaces special characters inserted by Word with plain ASCII.

    Dashes become '-' and typographic single quotes become "'". Besides the
    proper Unicode code points this also covers the C1 control characters
    U+0091, U+0092 and U+0096, which is what the Windows-1252 quotes and en
    dash turn into when the text was decoded as Latin-1.
    """
    # The C1 characters are written as escapes on purpose: as raw invisible
    # characters they are easily lost, and replacing an empty string would
    # insert the replacement between every character of the text.
    replacements = (
        ('\u2013', '-'),    # en dash
        ('\u2026O', "'"),   # ellipsis followed by 'O', kept as in the original
        ('\u2019', "'"),    # right single quotation mark
        ('\x91', "'"),      # Windows-1252 left single quote read as Latin-1
        ('\x92', "'"),      # Windows-1252 right single quote read as Latin-1
        ('\x96', '-'),      # Windows-1252 en dash read as Latin-1
    )
    for special, plain in replacements:
        text = text.replace(special, plain)
    return text
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
 								def wrap(text, width):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:21:21 +00:00
+								    """
 								    A word-wrap function that preserves existing line breaks and most spaces in
 								    the text. Expects that existing line breaks are posix newlines (\n).
 								    See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
 								    """
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
 								    def reduce_line(line, word):
 								        return '%s%s%s' % (
 								            line,
 								            ' \n'[
 								                (len(line[line.rfind('\n')+1:]) + len(word.split('\n', 1)[0]) >= width)
 								            ],
 								            word
 								        )
 								    return reduce(reduce_line, text.split(' '))
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
def wrap_string(string, length=80, separator='\n', balance=False):
    '''
    Wrap string into lines of at most length characters (not counting the
    space kept at the end of a wrapped line), joined with separator.

    Words longer than length are broken into chunks. With balance=True the
    shortest line length that still yields the same number of lines is used,
    so that the lines end up more evenly filled.

    >>> wrap_string("Anticonstitutionellement, Paris s'eveille", 16)
    "Anticonstitution\\nellement, Paris \\ns'eveille"
    >>> wrap_string('All you can eat', 12, '\\n', True)
    'All you \\ncan eat'
    >>> wrap_string('abc defghij', 3)
    'abc \\ndef\\nghi\\nj'
    '''
    words = string.split(' ')
    if balance:
        # balance lines: test if same number of lines
        # can be achieved with a shorter line length
        line_count = len(wrap_string(string, length, separator, False).split(separator))
        if line_count > 1:
            longest = max(len(word) for word in words)
            while length > longest:
                length -= 1
                shorter = wrap_string(string, length, separator, False)
                if len(shorter.split(separator)) > line_count:
                    length += 1
                    break
    lines = ['']
    for word in words:
        if len(lines[-1]) + len(word) <= length:
            # word fits in current line
            lines[-1] += word + ' '
        elif len(word) <= length:
            # word fits in next line
            lines.append(word + ' ')
        else:
            # word is longer than line: fill up the current line, then chunk.
            # A full line already holds length + 1 characters because of its
            # trailing space, so the remaining room must not go below zero;
            # a negative value would slice the word from the end.
            position = max(0, length - len(lines[-1]))
            lines[-1] += word[:position]
            for i in range(position, len(word), length):
                lines.append(word[i:i + length])
            lines[-1] += ' '
    return separator.join(lines).strip()
-												wrapString

											
										
										
											2008-07-06 09:43:06 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
def truncate_string(string, length, padding='...', position='right'):
    """
    Shorten string to length characters, putting padding where text was cut.

    position selects where the cut happens: 'left', 'center' or 'right'.
    Strings that are not longer than length, and unknown positions, are
    returned unchanged.

    >>> truncate_string('anticonstitutionellement', 16, '...', 'left')
    '...utionellement'
    >>> truncate_string('anticonstitutionellement', 16, '...', 'center')
    'anticon...lement'
    >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
    'anticonstitut...'
    """
    total = len(string)
    if total <= length:
        return string
    # number of characters of the original string that survive
    keep = length - len(padding)
    if position == 'left':
        return '%s%s' % (padding, string[total - keep:])
    if position == 'center':
        # the odd character, if any, goes to the left half
        head = int(math.ceil(float(keep) / 2))
        tail = int(total - math.floor(float(keep) / 2))
        return '%s%s%s' % (string[:head], padding, string[tail:])
    if position == 'right':
        return '%s%s' % (string[:keep], padding)
    return string
-												truncateString

											
										
										
											2008-07-06 15:18:16 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
def truncate_words(s, num):
    """Truncates a string after a certain number of characters, but ends with a word

    Strings that already fit are returned unchanged; otherwise whitespace is
    collapsed to single spaces and '...' is appended if words were dropped.

    >>> truncate_words('Truncates a string after a certain number of chacters, but ends with a word', 23)
    'Truncates a string...'

    >>> truncate_words('Truncates a string', 23)
    'Truncates a string'

    """
    limit = int(num)
    if len(s) <= limit:
        return s
    tokens = s.split()
    used = 0    # characters consumed so far, one separator per kept word
    kept = 0    # number of leading words that fit
    for token in tokens:
        if used + len(token) >= limit:
            break
        used += 1 + len(token)
        kept += 1
    result = ' '.join(tokens[:kept])
    if kept < len(tokens):
        result += '...'
    return result
-												add htmldecode, trimString, import missing chardet

											
										
										
											2008-04-28 09:50:34 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
def trim_string(string, num):
    """Truncates a string after a certain number of characters, adding ... at -10 characters

    The last 10 characters are always kept; the cut-out middle is replaced
    by '...'.

    >>> trim_string('Truncates a string after a certain number of chacters', 23)
    'Truncates ...f chacters'

    >>> trim_string('Truncates a string', 23)
    'Truncates a string'

    """
    if len(string) <= num:
        return string
    # 13 = 3 characters for the ellipsis + 10 characters of tail
    head = string[:num - 13]
    tail = string[-10:]
    return head + '...' + tail
-												add htmldecode, trimString, import missing chardet

											
										
										
											2008-04-28 09:50:34 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
def get_valid_filename(s):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Specifically, leading and trailing spaces are removed;
    spaces and all non-filename-safe characters are replaced with
    underscores, and doubled underscores are merged in two passes.

    >>> get_valid_filename("john's portrait in 2004.jpg")
    'john_s_portrait_in_2004.jpg'
    """
    underscored = s.strip().replace(' ', '_')
    cleaned = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', underscored)
    # exactly two passes, so very long runs of underscores are shortened but
    # not necessarily reduced to a single one
    for _ in range(2):
        cleaned = cleaned.replace('__', '_')
    return cleaned
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
def get_text_list(list_, last_word='or'):
    """
    Join the items of list_ into an enumeration: items are separated by
    ', ', except for the last two, which are joined by last_word.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    'a and b'
    >>> get_text_list(['a'])
    'a'
    >>> get_text_list([])
    ''
    """
    size = len(list_)
    if size == 0:
        return ''
    if size == 1:
        return list_[0]
    leading = ', '.join(list(list_)[:-1])
    return '%s %s %s' % (leading, last_word, list_[-1])
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
def get_list_text(text, last_word='or'):
    """
    Inverse of get_text_list: split an enumeration into its items.

    Items are separated by ', '; the last two items are separated by
    last_word. last_word is only recognized as a separate word surrounded by
    whitespace, so items that merely contain it ('orange', 'doctor') stay
    intact. If last_word occurs more than once in the final segment the
    segment is ambiguous and is left unsplit.

    >>> get_list_text('a, b, c or d')
    ['a', 'b', 'c', 'd']
    >>> get_list_text('a, b and c', 'and')
    ['a', 'b', 'c']
    >>> get_list_text('a and b', 'and')
    ['a', 'b']
    >>> get_list_text('a')
    ['a']
    >>> get_list_text('apple, pear or orange')
    ['apple', 'pear', 'orange']
    >>> get_list_text('')
    []
    """
    if not text:
        return []
    items = text.split(', ')
    # split the final segment on the whole word only, not on the bare
    # substring, which would also match inside ordinary words
    tail = re.split(r'\s+%s\s+' % re.escape(last_word), items[-1])
    if len(tail) == 2:
        items[-1:] = [part.strip() for part in tail]
    return items
-												getListText inverse of getTextList

											
										
										
											2008-05-06 11:37:40 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
def normalize_newlines(text):
    """Convert Windows (\\r\\n) and old Mac (\\r) line breaks to posix newlines."""
    # \r\n has to be handled first so that it does not turn into two newlines
    return text.replace('\r\n', '\n').replace('\r', '\n')
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
 								def recapitalize(text):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:21:21 +00:00
+								    "Recapitalizes text, placing caps after end-of-sentence punctuation."
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
+								    # capwords = ()
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:21:21 +00:00
+								    text = text.lower()
 								    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
 								    text = capsRE.sub(lambda x: x.group(1).upper(), text)
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
+								    # for capword in capwords:
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:21:21 +00:00
+								    #    capwordRE = re.compile(r'\b%s\b' % capword, re.I)
 								    #    text = capwordRE.sub(capword, text)
 								    return text
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
 								def phone2numeric(phone):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:21:21 +00:00
+								    "Converts a phone number with letters into its numeric equivalent."
 								    letters = re.compile(r'[A-PR-Y]', re.I)
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
 								    def char2number(m):
 								        return {
 								            'a': '2', 'c': '2', 'b': '2', 'e': '3',
 								            'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
 								            'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
 								            's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
 								            'y': '9', 'x': '9'
 								        }.get(m.group(0).lower())
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:21:21 +00:00
+								    return letters.sub(char2number, phone)
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
def compress_string(s):
    """Return the bytes s as gzip data (compression level 6)."""
    buffer = BytesIO()
    # leaving the with block closes the gzip stream, which writes the trailer
    with gzip.GzipFile(mode='wb', compresslevel=6, fileobj=buffer) as archive:
        archive.write(s)
    return buffer.getvalue()
-												add some functions

											
										
										
											2008-04-27 16:54:37 +00:00
 								smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
+								def smart_split(text):
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:21:21 +00:00
+								    """
 								    Generator that splits a string by spaces, leaving quoted phrases together.
 								    Supports both single and double quotes, and supports escaping quotes with
 								    backslashes. In the output, strings will keep their initial and trailing
 								    quote marks.
-												replace all CammelCase with under_score in ox


											
										
										
											2012-08-14 14:12:43 +00:00
+								    >>> list(smart_split('This is "a person\\'s" test.'))
-												vi:si:et:sw=4:sts=4:ts=4

											
										
										
											2008-06-19 09:21:21 +00:00
+								    ['This', 'is', '"a person\\'s"', 'test.']
 								    """
 								    for bit in smart_split_re.finditer(text):
 								        bit = bit.group(0)
 								        if bit[0] == '"':
 								            yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
 								        elif bit[0] == "'":
 								            yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
 								        else:
 								            yield bit
-												make findRegexp do more

											
										
										
											2008-04-29 13:34:27 +00:00
-												words

											
										
										
											2011-10-30 11:54:59 +00:00
def words(text):
    """
        returns words in text, removing punctuation

        Only one trailing '.', '!', '?', ':', '-', '_' or a trailing
        possessive "'s" is removed from each word.
    """
    # the hyphen is placed last in the character class so that it is a
    # literal; in the middle (':-_') it forms a range that also covers
    # ';<=>@', the uppercase letters and '[\\]^'
    trailing = re.compile(r"([.!?:_-]|'s)$")
    return [trailing.sub('', word) for word in text.split()]
-												add sort_string, sorted_strings

											
										
										
											2012-05-16 10:29:52 +00:00
 								def sort_string(string):
-												get rid of u string literal

											
										
										
											2023-07-27 16:12:13 +00:00
+								    string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th')
-												add sort_string, sorted_strings

											
										
										
											2012-05-16 10:29:52 +00:00
-												fix python3 ox.text

											
										
										
											2016-06-08 10:27:55 +00:00
+								    # pad numbered titles
-												use r'' for regex strings

											
										
										
											2024-06-30 08:52:59 +00:00
+								    string = re.sub(r'(\d),(\d{3})', '\\1\\2', string)
 								    string = re.sub(r'(\d+)', lambda x: '%010d' % int(x.group(0)), string)
-												add sort_string, sorted_strings

											
										
										
											2012-05-16 10:29:52 +00:00
+								    return unicodedata.normalize('NFKD', string)
-												key not keys

											
										
										
											2012-08-20 18:34:23 +00:00
def sorted_strings(strings, key=None):
    """Return strings as a sorted list, using sort_string as key unless another key function is given."""
    return sorted(strings, key=key or sort_string)