fix python3 ox.text

This commit is contained in:
j 2016-06-08 12:27:55 +02:00
parent ac2e829016
commit 51da4fd809

View file

@ -5,6 +5,8 @@ import math
import re import re
import unicodedata import unicodedata
from six.moves import reduce
ARTICLES = list(set([ ARTICLES = list(set([
# def sg, def pl, indef sg, indef pl (each m/f/n) # def sg, def pl, indef sg, indef pl (each m/f/n)
'der', 'die', 'das', 'ein', 'eine', # de 'der', 'die', 'das', 'ein', 'eine', # de
@ -244,15 +246,18 @@ def get_sort_name(name):
>>> get_sort_name('Scorsese, Martin') >>> get_sort_name('Scorsese, Martin')
'Scorsese, Martin' 'Scorsese, Martin'
""" """
if not ' ' in name or ', ' in name: if ' ' not in name or ', ' in name:
return name return name
if name.lower().startswith('the '): if name.lower().startswith('the '):
return get_sort_title(name) return get_sort_title(name)
def add_name(): def add_name():
if len(first_names): if len(first_names):
last_names.insert(0, first_names.pop()) last_names.insert(0, first_names.pop())
def find_name(names): def find_name(names):
return len(first_names) and first_names[-1].lower() in names return len(first_names) and first_names[-1].lower() in names
first_names = name.split(' ') first_names = name.split(' ')
last_names = [] last_names = []
if re.search('^[0-9]+$', first_names[-1]): if re.search('^[0-9]+$', first_names[-1]):
@ -352,7 +357,7 @@ def parse_useragent(useragent):
'version': version, 'version': version,
'string': string 'string': string
} }
break; break
return data return data
def remove_special_characters(text): def remove_special_characters(text):
@ -373,15 +378,18 @@ def wrap(text, width):
the text. Expects that existing line breaks are posix newlines (\n). the text. Expects that existing line breaks are posix newlines (\n).
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061 See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
""" """
return reduce(lambda line, word, width=width: '%s%s%s' %
(line, def reduce_line(line, word):
' \n'[(len(line[line.rfind('\n')+1:]) return '%s%s%s' % (
+ len(word.split('\n',1)[0] line,
) >= width)], ' \n'[
word), (len(line[line.rfind('\n')+1:]) + len(word.split('\n', 1)[0]) >= width)
text.split(' ') ],
word
) )
return reduce(reduce_line, text.split(' '))
def wrap_string(string, length=80, separator='\n', balance=False): def wrap_string(string, length=80, separator='\n', balance=False):
''' '''
>>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16) >>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
@ -404,7 +412,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
for word in words: for word in words:
if len(lines[len(lines) - 1] + word + u' ') <= length + 1: if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
# word fits in current line # word fits in current line
lines[len(lines) - 1] += word + u' '; lines[len(lines) - 1] += word + u' '
else: else:
if len(word) <= length: if len(word) <= length:
# word fits in next line # word fits in next line
@ -414,7 +422,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
position = length - len(lines[len(lines) - 1]) position = length - len(lines[len(lines) - 1])
lines[len(lines) - 1] += word[0:position] lines[len(lines) - 1] += word[0:position]
for i in range(position, len(word), length): for i in range(position, len(word), length):
lines.append(word[i:i+length]); lines.append(word[i:i+length])
lines[len(lines) - 1] += u' ' lines[len(lines) - 1] += u' '
return separator.join(lines).strip() return separator.join(lines).strip()
@ -425,7 +433,7 @@ def truncate_string(string, length, padding='...', position='right'):
# 'anticon...lement' # 'anticon...lement'
# >>> truncate_string('anticonstitutionellement', 16, '...', 'right') # >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
# 'anticonstitut...' # 'anticonstitut...'
stringLength = len(string); stringLength = len(string)
paddingLength = len(padding) paddingLength = len(padding)
if stringLength > length: if stringLength > length:
if position == 'left': if position == 'left':
@ -436,7 +444,7 @@ def truncate_string(string, length, padding='...', position='right'):
string = '%s%s%s' % (string[:left], padding, string[right:]) string = '%s%s%s' % (string[:left], padding, string[right:])
elif position == 'right': elif position == 'right':
string = '%s%s' % (string[:length - paddingLength], padding) string = '%s%s' % (string[:length - paddingLength], padding)
return string; return string
def truncate_words(s, num): def truncate_words(s, num):
"""Truncates a string after a certain number of characters, but ends with a word """Truncates a string after a certain number of characters, but ends with a word
@ -498,9 +506,11 @@ def get_text_list(list_, last_word='or'):
>>> get_text_list([]) >>> get_text_list([])
'' ''
""" """
if len(list_) == 0: return '' if len(list_) == 0:
if len(list_) == 1: return list_[0] return ''
return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1]) if len(list_) == 1:
return list_[0]
return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1])
def get_list_text(text, last_word='or'): def get_list_text(text, last_word='or'):
""" """
@ -543,22 +553,28 @@ def recapitalize(text):
def phone2numeric(phone): def phone2numeric(phone):
"Converts a phone number with letters into its numeric equivalent." "Converts a phone number with letters into its numeric equivalent."
letters = re.compile(r'[A-PR-Y]', re.I) letters = re.compile(r'[A-PR-Y]', re.I)
char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
def char2number(m):
return {
'a': '2', 'c': '2', 'b': '2', 'e': '3',
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5', 'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7', 'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8', 's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
'y': '9', 'x': '9'}.get(m.group(0).lower()) 'y': '9', 'x': '9'
}.get(m.group(0).lower())
return letters.sub(char2number, phone) return letters.sub(char2number, phone)
def compress_string(s): def compress_string(s):
import cStringIO, gzip import gzip
zbuf = cStringIO.StringIO() from six import BytesIO
zbuf = BytesIO()
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
zfile.write(s) zfile.write(s)
zfile.close() zfile.close()
return zbuf.getvalue() return zbuf.getvalue()
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)') smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text): def smart_split(text):
""" """
Generator that splits a string by spaces, leaving quoted phrases together. Generator that splits a string by spaces, leaving quoted phrases together.
@ -582,7 +598,7 @@ def words(text):
returns words in text, removing punctuation returns words in text, removing punctuation
""" """
text = text.split() text = text.split()
return map(lambda x: re.sub("(([.!?:-_]|'s)$)", '', x), text) return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text]
def sort_string(string): def sort_string(string):
string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th') string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')
@ -594,5 +610,5 @@ def sort_string(string):
def sorted_strings(strings, key=None): def sorted_strings(strings, key=None):
if not key: if not key:
key = lambda k: sort_string(k) key = sort_string
return sorted(strings, key=key) return sorted(strings, key=key)