fix python3 ox.text

This commit is contained in:
j 2016-06-08 12:27:55 +02:00
parent ac2e829016
commit 51da4fd809

View file

@ -5,6 +5,8 @@ import math
import re import re
import unicodedata import unicodedata
from six.moves import reduce
ARTICLES = list(set([ ARTICLES = list(set([
# def sg, def pl, indef sg, indef pl (each m/f/n) # def sg, def pl, indef sg, indef pl (each m/f/n)
'der', 'die', 'das', 'ein', 'eine', # de 'der', 'die', 'das', 'ein', 'eine', # de
@ -244,15 +246,18 @@ def get_sort_name(name):
>>> get_sort_name('Scorsese, Martin') >>> get_sort_name('Scorsese, Martin')
'Scorsese, Martin' 'Scorsese, Martin'
""" """
if not ' ' in name or ', ' in name: if ' ' not in name or ', ' in name:
return name return name
if name.lower().startswith('the '): if name.lower().startswith('the '):
return get_sort_title(name) return get_sort_title(name)
def add_name(): def add_name():
if len(first_names): if len(first_names):
last_names.insert(0, first_names.pop()) last_names.insert(0, first_names.pop())
def find_name(names): def find_name(names):
return len(first_names) and first_names[-1].lower() in names return len(first_names) and first_names[-1].lower() in names
first_names = name.split(' ') first_names = name.split(' ')
last_names = [] last_names = []
if re.search('^[0-9]+$', first_names[-1]): if re.search('^[0-9]+$', first_names[-1]):
@ -299,7 +304,7 @@ def find_re(string, regexp):
return result[0].strip() return result[0].strip()
return '' return ''
def find_string(string, string0='', string1 = ''): def find_string(string, string0='', string1=''):
"""Return the string between string0 and string1. """Return the string between string0 and string1.
If string0 or string1 is left out, begining or end of string is used. If string0 or string1 is left out, begining or end of string is used.
@ -352,7 +357,7 @@ def parse_useragent(useragent):
'version': version, 'version': version,
'string': string 'string': string
} }
break; break
return data return data
def remove_special_characters(text): def remove_special_characters(text):
@ -373,15 +378,18 @@ def wrap(text, width):
the text. Expects that existing line breaks are posix newlines (\n). the text. Expects that existing line breaks are posix newlines (\n).
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061 See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
""" """
return reduce(lambda line, word, width=width: '%s%s%s' %
(line, def reduce_line(line, word):
' \n'[(len(line[line.rfind('\n')+1:]) return '%s%s%s' % (
+ len(word.split('\n',1)[0] line,
) >= width)], ' \n'[
word), (len(line[line.rfind('\n')+1:]) + len(word.split('\n', 1)[0]) >= width)
text.split(' ') ],
word
) )
return reduce(reduce_line, text.split(' '))
def wrap_string(string, length=80, separator='\n', balance=False): def wrap_string(string, length=80, separator='\n', balance=False):
''' '''
>>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16) >>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
@ -404,7 +412,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
for word in words: for word in words:
if len(lines[len(lines) - 1] + word + u' ') <= length + 1: if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
# word fits in current line # word fits in current line
lines[len(lines) - 1] += word + u' '; lines[len(lines) - 1] += word + u' '
else: else:
if len(word) <= length: if len(word) <= length:
# word fits in next line # word fits in next line
@ -414,7 +422,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
position = length - len(lines[len(lines) - 1]) position = length - len(lines[len(lines) - 1])
lines[len(lines) - 1] += word[0:position] lines[len(lines) - 1] += word[0:position]
for i in range(position, len(word), length): for i in range(position, len(word), length):
lines.append(word[i:i+length]); lines.append(word[i:i+length])
lines[len(lines) - 1] += u' ' lines[len(lines) - 1] += u' '
return separator.join(lines).strip() return separator.join(lines).strip()
@ -425,7 +433,7 @@ def truncate_string(string, length, padding='...', position='right'):
# 'anticon...lement' # 'anticon...lement'
# >>> truncate_string('anticonstitutionellement', 16, '...', 'right') # >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
# 'anticonstitut...' # 'anticonstitut...'
stringLength = len(string); stringLength = len(string)
paddingLength = len(padding) paddingLength = len(padding)
if stringLength > length: if stringLength > length:
if position == 'left': if position == 'left':
@ -436,7 +444,7 @@ def truncate_string(string, length, padding='...', position='right'):
string = '%s%s%s' % (string[:left], padding, string[right:]) string = '%s%s%s' % (string[:left], padding, string[right:])
elif position == 'right': elif position == 'right':
string = '%s%s' % (string[:length - paddingLength], padding) string = '%s%s' % (string[:length - paddingLength], padding)
return string; return string
def truncate_words(s, num): def truncate_words(s, num):
"""Truncates a string after a certain number of chacters, but ends with a word """Truncates a string after a certain number of chacters, but ends with a word
@ -498,9 +506,11 @@ def get_text_list(list_, last_word='or'):
>>> get_text_list([]) >>> get_text_list([])
'' ''
""" """
if len(list_) == 0: return '' if len(list_) == 0:
if len(list_) == 1: return list_[0] return ''
return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1]) if len(list_) == 1:
return list_[0]
return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1])
def get_list_text(text, last_word='or'): def get_list_text(text, last_word='or'):
""" """
@ -519,7 +529,7 @@ def get_list_text(text, last_word='or'):
if text: if text:
list_ = text.split(u', ') list_ = text.split(u', ')
if list_: if list_:
i=len(list_)-1 i = len(list_)-1
last = list_[i].split(last_word) last = list_[i].split(last_word)
if len(last) == 2: if len(last) == 2:
list_[i] = last[0].strip() list_[i] = last[0].strip()
@ -531,11 +541,11 @@ def normalize_newlines(text):
def recapitalize(text): def recapitalize(text):
"Recapitalizes text, placing caps after end-of-sentence punctuation." "Recapitalizes text, placing caps after end-of-sentence punctuation."
#capwords = () # capwords = ()
text = text.lower() text = text.lower()
capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])') capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
text = capsRE.sub(lambda x: x.group(1).upper(), text) text = capsRE.sub(lambda x: x.group(1).upper(), text)
#for capword in capwords: # for capword in capwords:
# capwordRE = re.compile(r'\b%s\b' % capword, re.I) # capwordRE = re.compile(r'\b%s\b' % capword, re.I)
# text = capwordRE.sub(capword, text) # text = capwordRE.sub(capword, text)
return text return text
@ -543,22 +553,28 @@ def recapitalize(text):
def phone2numeric(phone): def phone2numeric(phone):
"Converts a phone number with letters into its numeric equivalent." "Converts a phone number with letters into its numeric equivalent."
letters = re.compile(r'[A-PR-Y]', re.I) letters = re.compile(r'[A-PR-Y]', re.I)
char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
def char2number(m):
return {
'a': '2', 'c': '2', 'b': '2', 'e': '3',
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5', 'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7', 'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8', 's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
'y': '9', 'x': '9'}.get(m.group(0).lower()) 'y': '9', 'x': '9'
}.get(m.group(0).lower())
return letters.sub(char2number, phone) return letters.sub(char2number, phone)
def compress_string(s): def compress_string(s):
import cStringIO, gzip import gzip
zbuf = cStringIO.StringIO() from six import BytesIO
zbuf = BytesIO()
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
zfile.write(s) zfile.write(s)
zfile.close() zfile.close()
return zbuf.getvalue() return zbuf.getvalue()
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)') smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text): def smart_split(text):
""" """
Generator that splits a string by spaces, leaving quoted phrases together. Generator that splits a string by spaces, leaving quoted phrases together.
@ -582,17 +598,17 @@ def words(text):
returns words in text, removing punctuation returns words in text, removing punctuation
""" """
text = text.split() text = text.split()
return map(lambda x: re.sub("(([.!?:-_]|'s)$)", '', x), text) return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text]
def sort_string(string): def sort_string(string):
string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th') string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')
#pad numbered titles # pad numbered titles
string = re.sub('(\d),(\d{3})', '\\1\\2', string) string = re.sub('(\d),(\d{3})', '\\1\\2', string)
string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string) string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string)
return unicodedata.normalize('NFKD', string) return unicodedata.normalize('NFKD', string)
def sorted_strings(strings, key=None): def sorted_strings(strings, key=None):
if not key: if not key:
key = lambda k: sort_string(k) key = sort_string
return sorted(strings, key=key) return sorted(strings, key=key)