fix python3 ox.text

This commit is contained in:
j 2016-06-08 12:27:55 +02:00
parent ac2e829016
commit 51da4fd809

View file

@ -5,16 +5,18 @@ import math
import re
import unicodedata
from six.moves import reduce
# Articles per language; order within a language is
# def sg, def pl, indef sg, indef pl (each m/f/n).
# Entries starting with "_" are disabled because of collisions.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python (e.g. "l'" 'la' would become "l'la").
ARTICLES = list(set([
    'der', 'die', 'das', 'ein', 'eine',  # de
    'the', 'a', 'an',  # en
    'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas',  # es
    'le', "l'", 'la', 'les', 'un', 'une', 'des',  # fr
    'il', 'lo', "l'", 'la', '_i', 'gli', 'le',  # it
    'de', 'het', 'een',  # nl
    'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas',  # pt
]))
# see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
# and http://en.wikipedia.org/wiki/List_of_Korean_family_names
@ -88,8 +90,8 @@ UA_REGEXPS = {
'(Chimera)\/(\d+)',
'(chromeframe)\/(\d+)',
'(Edge)\/(\d+)',
'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
'(Chromium)\/(\d+)', # before Chrome
'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
'(Chromium)\/(\d+)', # before Chrome
'(Chrome)\/(\d+)',
'(FBForIPhone)',
'(Firefox)\/(\d+)',
@ -107,7 +109,7 @@ UA_REGEXPS = {
'(OviBrowser)\/(\d+)',
'Version\/(\d+).+(Safari)',
'(WebKit)\/(\d+)',
'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
'(Trident)\/.*?rv:(\d+)',
'(Gecko)',
'(Mozilla)\/(3|4)'
@ -130,7 +132,7 @@ UA_REGEXPS = {
'(BSD) (FreeBSD|NetBSD|OpenBSD)',
'(CPU OS) (\d+)',
'(iPhone OS) (\d+)',
'(iPhone)', # Opera
'(iPhone)', # Opera
'(J2ME\/MIDP)',
'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
@ -155,12 +157,12 @@ UA_REGEXPS = {
'(Windows) (NT \d\.\d)',
'(Windows Phone) (\d+)',
'(Windows Phone OS) (\d+)',
'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
'(Win) (9x 4\.90)', # Firefox
'(Win)(16)', # Firefox
'(Win)(9\d)', # Firefox
'(Win)(NT)', # Firefox
'(Win)(NT4\.0)', # Firefox
'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
'(Win) (9x 4\.90)', # Firefox
'(Win)(16)', # Firefox
'(Win)(9\d)', # Firefox
'(Win)(NT)', # Firefox
'(Win)(NT4\.0)', # Firefox
'(X11)'
]
}
@ -244,15 +246,18 @@ def get_sort_name(name):
>>> get_sort_name('Scorsese, Martin')
'Scorsese, Martin'
"""
if not ' ' in name or ', ' in name:
if ' ' not in name or ', ' in name:
return name
if name.lower().startswith('the '):
return get_sort_title(name)
def add_name():
if len(first_names):
last_names.insert(0, first_names.pop())
def find_name(names):
return len(first_names) and first_names[-1].lower() in names
first_names = name.split(' ')
last_names = []
if re.search('^[0-9]+$', first_names[-1]):
@ -299,7 +304,7 @@ def find_re(string, regexp):
return result[0].strip()
return ''
def find_string(string, string0='', string1 = ''):
def find_string(string, string0='', string1=''):
"""Return the string between string0 and string1.
If string0 or string1 is left out, the beginning or end of the string is used.
@ -352,7 +357,7 @@ def parse_useragent(useragent):
'version': version,
'string': string
}
break;
break
return data
def remove_special_characters(text):
@ -373,14 +378,17 @@ def wrap(text, width):
the text. Expects that existing line breaks are posix newlines (\n).
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
"""
return reduce(lambda line, word, width=width: '%s%s%s' %
(line,
' \n'[(len(line[line.rfind('\n')+1:])
+ len(word.split('\n',1)[0]
) >= width)],
word),
text.split(' ')
)
def reduce_line(line, word):
return '%s%s%s' % (
line,
' \n'[
(len(line[line.rfind('\n')+1:]) + len(word.split('\n', 1)[0]) >= width)
],
word
)
return reduce(reduce_line, text.split(' '))
def wrap_string(string, length=80, separator='\n', balance=False):
'''
@ -404,7 +412,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
for word in words:
if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
# word fits in current line
lines[len(lines) - 1] += word + u' ';
lines[len(lines) - 1] += word + u' '
else:
if len(word) <= length:
# word fits in next line
@ -414,7 +422,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
position = length - len(lines[len(lines) - 1])
lines[len(lines) - 1] += word[0:position]
for i in range(position, len(word), length):
lines.append(word[i:i+length]);
lines.append(word[i:i+length])
lines[len(lines) - 1] += u' '
return separator.join(lines).strip()
@ -425,7 +433,7 @@ def truncate_string(string, length, padding='...', position='right'):
# 'anticon...lement'
# >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
# 'anticonstitut...'
stringLength = len(string);
stringLength = len(string)
paddingLength = len(padding)
if stringLength > length:
if position == 'left':
@ -436,7 +444,7 @@ def truncate_string(string, length, padding='...', position='right'):
string = '%s%s%s' % (string[:left], padding, string[right:])
elif position == 'right':
string = '%s%s' % (string[:length - paddingLength], padding)
return string;
return string
def truncate_words(s, num):
"""Truncates a string after a certain number of chacters, but ends with a word
@ -498,9 +506,11 @@ def get_text_list(list_, last_word='or'):
>>> get_text_list([])
''
"""
if len(list_) == 0: return ''
if len(list_) == 1: return list_[0]
return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])
if len(list_) == 0:
return ''
if len(list_) == 1:
return list_[0]
return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1])
def get_list_text(text, last_word='or'):
"""
@ -519,7 +529,7 @@ def get_list_text(text, last_word='or'):
if text:
list_ = text.split(u', ')
if list_:
i=len(list_)-1
i = len(list_)-1
last = list_[i].split(last_word)
if len(last) == 2:
list_[i] = last[0].strip()
@ -531,11 +541,11 @@ def normalize_newlines(text):
def recapitalize(text):
    """Recapitalize text, placing caps after end-of-sentence punctuation.

    The text is lower-cased first. Then the first character of the text and
    every character that directly follows ``.``, ``?`` or ``!`` plus exactly
    one space is upper-cased, provided it is an ASCII letter (a-z).

    >>> recapitalize('HELLO WORLD. this IS a test!')
    'Hello world. This is a test!'
    """
    text = text.lower()
    # Group 1 is the letter to upper-case; the lookbehind keeps the
    # punctuation and the space out of the match so they are left untouched.
    caps_re = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
    return caps_re.sub(lambda match: match.group(1).upper(), text)
@ -543,22 +553,28 @@ def recapitalize(text):
def phone2numeric(phone):
    """Convert a phone number with letters into its numeric equivalent.

    Letters A-P and R-Y (either case) are replaced by their keypad digit;
    every other character, including Q and Z, is returned unchanged.

    >>> phone2numeric('1-800-COLLECT')
    '1-800-2655328'
    """
    # Built once per call instead of once per matched letter.
    keypad = {
        'a': '2', 'b': '2', 'c': '2',
        'd': '3', 'e': '3', 'f': '3',
        'g': '4', 'h': '4', 'i': '4',
        'j': '5', 'k': '5', 'l': '5',
        'm': '6', 'n': '6', 'o': '6',
        'p': '7', 'r': '7', 's': '7',
        't': '8', 'u': '8', 'v': '8',
        'w': '9', 'x': '9', 'y': '9',
    }
    letters = re.compile(r'[A-PR-Y]', re.I)

    def char2number(match):
        # The pattern only matches letters that are keys of ``keypad``.
        return keypad[match.group(0).lower()]

    return letters.sub(char2number, phone)
def compress_string(s):
    """Return ``s`` gzip-compressed (compression level 6) as a byte string.

    ``s`` may be a byte string or a text string; text is encoded as UTF-8
    before compression, because the gzip stream only accepts bytes.
    """
    import gzip
    from io import BytesIO

    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(s, type(u'')):
        s = s.encode('utf-8')
    zbuf = BytesIO()
    # Leaving the ``with`` block closes the gzip stream, which flushes the
    # trailer into ``zbuf``; the underlying buffer itself stays open.
    with gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) as zfile:
        zfile.write(s)
    return zbuf.getvalue()
# Matches one token: a double-quoted phrase, a single-quoted phrase (both may
# contain backslash-escaped characters), or a run of non-whitespace characters.
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text):
"""
Generator that splits a string by spaces, leaving quoted phrases together.
@ -582,17 +598,17 @@ def words(text):
returns words in text, removing punctuation
"""
text = text.split()
return map(lambda x: re.sub("(([.!?:-_]|'s)$)", '', x), text)
return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text]
def sort_string(string):
    """Return a normalized version of ``string`` suitable as a sort key.

    - Replaces the letters AE-ligature, O-with-stroke and Thorn with
      'AE', 'O' and 'Th'.
    - Removes thousands separators inside numbers ('1,000,000' -> '1000000').
    - Zero-pads every run of digits to 10 places so numbers sort numerically.
    - Applies Unicode NFKD normalization to the result.
    """
    string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')
    # pad numbered titles
    # Lookarounds leave the digits unconsumed, so every separator of a
    # multi-group number is removed, not just every other one.
    string = re.sub(r'(?<=\d),(?=\d{3})', '', string)
    string = re.sub(r'\d+', lambda match: '%010d' % int(match.group(0)), string)
    return unicodedata.normalize('NFKD', string)
def sorted_strings(strings, key=None):
    """Return a new sorted list of ``strings``.

    ``key`` is an optional one-argument function that computes the sort key
    of an item; when it is omitted (or falsy), ``sort_string`` is used.
    """
    return sorted(strings, key=key or sort_string)