fix python3 ox.text

2016-06-08 12:27:55 +02:00 · 2016-06-08 12:27:55 +02:00 · 51da4fd809
commit 51da4fd809
parent ac2e829016
1 changed files with 68 additions and 52 deletions
--- a/ox/text.py
+++ b/ox/text.py
@@ -5,16 +5,18 @@ import math
 import re
 import unicodedata

+from six.moves import reduce
+
 ARTICLES = list(set([
    # def sg, def pl, indef sg, indef pl (each m/f/n)
-    'der', 'die', 'das', 'ein', 'eine', # de
-    'the', 'a', 'an', # en
-    'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas', # es
-    'le', "l'", 'la', 'les', 'un', 'une', 'des', # fr
-    'il', 'lo', "l'" 'la', '_i', 'gli', 'le', # it
-    'de', 'het', 'een', # nl
-     'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas' # pt
-     # some _disabled because of collisions
+    'der', 'die', 'das', 'ein', 'eine',  # de
+    'the', 'a', 'an',  # en
+    'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas',  # es
+    'le', "l'", 'la', 'les', 'un', 'une', 'des',  # fr
+    'il', 'lo', "l'" 'la', '_i', 'gli', 'le',  # it
+    'de', 'het', 'een',  # nl
+    'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas'  # pt
+    # some _disabled because of collisions
 ]))
 # see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
 # and http://en.wikipedia.org/wiki/List_of_Korean_family_names
@@ -88,8 +90,8 @@ UA_REGEXPS = {
        '(Chimera)\/(\d+)',
        '(chromeframe)\/(\d+)',
        '(Edge)\/(\d+)',
-        '(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
-        '(Chromium)\/(\d+)', # before Chrome
+        '(Epiphany)\/(\d+)',  # before Chrome, Chromium and Safari
+        '(Chromium)\/(\d+)',  # before Chrome
        '(Chrome)\/(\d+)',
        '(FBForIPhone)',
        '(Firefox)\/(\d+)',
@@ -107,7 +109,7 @@ UA_REGEXPS = {
        '(OviBrowser)\/(\d+)',
        'Version\/(\d+).+(Safari)',
        '(WebKit)\/(\d+)',
-        '(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
+        '(MSIE) (\d\d?(?!\d))',  # last, since Opera used to mask as MSIE
        '(Trident)\/.*?rv:(\d+)',
        '(Gecko)',
        '(Mozilla)\/(3|4)'
@@ -130,7 +132,7 @@ UA_REGEXPS = {
        '(BSD) (FreeBSD|NetBSD|OpenBSD)',
        '(CPU OS) (\d+)',
        '(iPhone OS) (\d+)',
-        '(iPhone)', # Opera
+        '(iPhone)',  # Opera
        '(J2ME\/MIDP)',
        '(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
        '(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
@@ -155,12 +157,12 @@ UA_REGEXPS = {
        '(Windows) (NT \d\.\d)',
        '(Windows Phone) (\d+)',
        '(Windows Phone OS) (\d+)',
-        '(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
-        '(Win) (9x 4\.90)', # Firefox
-        '(Win)(16)', # Firefox
-        '(Win)(9\d)', # Firefox
-        '(Win)(NT)', # Firefox
-        '(Win)(NT4\.0)', # Firefox
+        '(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)',  # Opera
+        '(Win) (9x 4\.90)',  # Firefox
+        '(Win)(16)',  # Firefox
+        '(Win)(9\d)',  # Firefox
+        '(Win)(NT)',  # Firefox
+        '(Win)(NT4\.0)',  # Firefox
        '(X11)'
    ]
 }
@@ -244,15 +246,18 @@ def get_sort_name(name):
    >>> get_sort_name('Scorsese, Martin')
    'Scorsese, Martin'
    """
-    if not ' ' in name or ', ' in name:
+    if ' ' not in name or ', ' in name:
        return name
    if name.lower().startswith('the '):
        return get_sort_title(name)
+
    def add_name():
        if len(first_names):
            last_names.insert(0, first_names.pop())
+
    def find_name(names):
        return len(first_names) and first_names[-1].lower() in names
+
    first_names = name.split(' ')
    last_names = []
    if re.search('^[0-9]+$', first_names[-1]):
@@ -299,8 +304,8 @@ def find_re(string, regexp):
        return result[0].strip()
    return ''

-def find_string(string, string0='', string1 = ''):
-    """Return the string between string0 and string1. 
+def find_string(string, string0='', string1=''):
+    """Return the string between string0 and string1.

    If string0 or string1 is left out, begining or end of string is used.

@@ -329,7 +334,7 @@ def parse_useragent(useragent):
    for key in UA_REGEXPS:
        for alias, regexp in UA_ALIASES[key].items():
            alias = alias if key == 'browser' else alias + ' \\1'
-            useragent = re.sub(regexp, alias, useragent)                    
+            useragent = re.sub(regexp, alias, useragent)
        for regexp in UA_REGEXPS[key]:
            data[key] = {'name': '', 'version': '', 'string': ''}
            match = re.compile(regexp).search(useragent)
@@ -352,7 +357,7 @@ def parse_useragent(useragent):
                    'version': version,
                    'string': string
                }
-                break;
+                break
    return data

 def remove_special_characters(text):
@@ -373,14 +378,17 @@ def wrap(text, width):
    the text. Expects that existing line breaks are posix newlines (\n).
    See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
    """
-    return reduce(lambda line, word, width=width: '%s%s%s' %
-                  (line,
-                    ' \n'[(len(line[line.rfind('\n')+1:])
-                          + len(word.split('\n',1)[0]
-                              ) >= width)],
-                    word),
-                  text.split(' ')
-                  )
+
+    def reduce_line(line, word):
+        return '%s%s%s' % (
+            line,
+            ' \n'[
+                (len(line[line.rfind('\n')+1:]) + len(word.split('\n', 1)[0]) >= width)
+            ],
+            word
+        )
+
+    return reduce(reduce_line, text.split(' '))

 def wrap_string(string, length=80, separator='\n', balance=False):
    '''
@@ -404,7 +412,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
    for word in words:
        if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
            # word fits in current line
-            lines[len(lines) - 1] += word + u' ';
+            lines[len(lines) - 1] += word + u' '
        else:
            if len(word) <= length:
                # word fits in next line
@@ -414,7 +422,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
                position = length - len(lines[len(lines) - 1])
                lines[len(lines) - 1] += word[0:position]
                for i in range(position, len(word), length):
-                    lines.append(word[i:i+length]);
+                    lines.append(word[i:i+length])
                lines[len(lines) - 1] += u' '
    return separator.join(lines).strip()

@@ -425,7 +433,7 @@ def truncate_string(string, length, padding='...', position='right'):
    #  'anticon...lement'
    #  >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
    #  'anticonstitut...'
-    stringLength = len(string);
+    stringLength = len(string)
    paddingLength = len(padding)
    if stringLength > length:
        if position == 'left':
@@ -436,7 +444,7 @@ def truncate_string(string, length, padding='...', position='right'):
            string = '%s%s%s' % (string[:left], padding, string[right:])
        elif position == 'right':
            string = '%s%s' % (string[:length - paddingLength], padding)
-    return string;
+    return string

 def truncate_words(s, num):
    """Truncates a string after a certain number of chacters, but ends with a word
@@ -473,7 +481,7 @@ def trim_string(string, num):
 def get_valid_filename(s):
    """
    Returns the given string converted to a string that can be used for a clean
-    filename. Specifically, leading and trailing spaces are removed; 
+    filename. Specifically, leading and trailing spaces are removed;
    all non-filename-safe characters are removed.

    >>> get_valid_filename("john's portrait in 2004.jpg")
@@ -498,9 +506,11 @@ def get_text_list(list_, last_word='or'):
    >>> get_text_list([])
    ''
    """
-    if len(list_) == 0: return ''
-    if len(list_) == 1: return list_[0]
-    return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])
+    if len(list_) == 0:
+        return ''
+    if len(list_) == 1:
+        return list_[0]
+    return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1])

 def get_list_text(text, last_word='or'):
    """
@@ -519,7 +529,7 @@ def get_list_text(text, last_word='or'):
    if text:
        list_ = text.split(u', ')
        if list_:
-            i=len(list_)-1
+            i = len(list_)-1
            last = list_[i].split(last_word)
            if len(last) == 2:
                list_[i] = last[0].strip()
@@ -531,11 +541,11 @@ def normalize_newlines(text):

 def recapitalize(text):
    "Recapitalizes text, placing caps after end-of-sentence punctuation."
-    #capwords = ()
+    # capwords = ()
    text = text.lower()
    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
    text = capsRE.sub(lambda x: x.group(1).upper(), text)
-    #for capword in capwords:
+    # for capword in capwords:
    #    capwordRE = re.compile(r'\b%s\b' % capword, re.I)
    #    text = capwordRE.sub(capword, text)
    return text
@@ -543,22 +553,28 @@ def recapitalize(text):
 def phone2numeric(phone):
    "Converts a phone number with letters into its numeric equivalent."
    letters = re.compile(r'[A-PR-Y]', re.I)
-    char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
-          'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
-          'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
-          's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
-          'y': '9', 'x': '9'}.get(m.group(0).lower())
+
+    def char2number(m):
+        return {
+            'a': '2', 'c': '2', 'b': '2', 'e': '3',
+            'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
+            'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
+            's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
+            'y': '9', 'x': '9'
+        }.get(m.group(0).lower())
    return letters.sub(char2number, phone)

 def compress_string(s):
-    import cStringIO, gzip
-    zbuf = cStringIO.StringIO()
+    import gzip
+    from six import BytesIO
+    zbuf = BytesIO()
    zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()

 smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
+
 def smart_split(text):
    """
    Generator that splits a string by spaces, leaving quoted phrases together.
@@ -582,17 +598,17 @@ def words(text):
        returns words in text, removing punctuation
    """
    text = text.split()
-    return map(lambda x: re.sub("(([.!?:-_]|'s)$)", '', x), text)
+    return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text]

 def sort_string(string):
    string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')

-    #pad numbered titles
+    # pad numbered titles
    string = re.sub('(\d),(\d{3})', '\\1\\2', string)
    string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string)
    return unicodedata.normalize('NFKD', string)

 def sorted_strings(strings, key=None):
    if not key:
-        key = lambda k: sort_string(k)
+        key = sort_string
    return sorted(strings, key=key)