python-ox/ox/text.py

677 lines
22 KiB
Python
Raw Normal View History

2008-04-27 16:54:37 +00:00
# -*- coding: utf-8 -*-
2008-06-19 09:21:21 +00:00
# vi:si:et:sw=4:sts=4:ts=4
2008-07-06 13:00:06 +00:00
# GPL 2008
2008-07-06 15:34:29 +00:00
import math
2008-04-27 16:54:37 +00:00
import re
2012-05-16 10:29:52 +00:00
import unicodedata
2008-04-27 16:54:37 +00:00
2016-06-08 10:27:55 +00:00
from six.moves import reduce
# Leading articles recognised by get_sort_title(), de-duplicated across
# languages. Entries prefixed with '_' are disabled (they never match a real
# title) because of collisions.
ARTICLES = list(set([
    # def sg, def pl, indef sg, indef pl (each m/f/n)
    'der', 'die', 'das', 'ein', 'eine',  # de
    'the', 'a', 'an',  # en
    'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas',  # es
    'le', "l'", 'la', 'les', 'un', 'une', 'des',  # fr
    # the comma after "l'" was missing, which silently produced "l'la"
    'il', 'lo', "l'", 'la', '_i', 'gli', 'le',  # it
    'de', 'het', 'een',  # nl
    'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas',  # pt
]))
2017-08-02 16:39:51 +00:00
# every given name in 0xDB that matches Xxxx-yyyy Lastname
# (lower-case syllables, one row per initial letter)
ASIAN_FIRST_NAMES = [
    'a', 'ae', 'aeng', 'ah', 'ai', 'an',
    'back', 'bae', 'ban', 'bang', 'bao', 'beom', 'bi', 'bin', 'bo', 'bok',
    'bon', 'bong', 'bu', 'bum', 'byeong', 'byoung', 'byung',
    'cai', 'chae', 'chan', 'chang', 'chao', 'cheal', 'chen', 'cheng',
    'cheol', 'cheon', 'cheong', 'cheul', 'chi', 'chia', 'chiao', 'chieh',
    'chien', 'chih', 'chin', 'ching', 'cho', 'choi', 'chong', 'choo', 'chu',
    'chuan', 'chuen', 'chul', 'chun', 'chung', 'chuo', 'chyi',
    'da', 'dae', 'dah', 'dal', 'dan', 'deok', 'do', 'dong', 'doo', 'duek',
    'duk',
    'e', 'el', 'en', 'eui', 'eul', 'eun', 'eung',
    'fai', 'fan', 'fang', 'fei', 'fen', 'feng', 'fo', 'foo', 'fu',
    'ga', 'gae', 'gam', 'gang', 'ge', 'gen', 'geon', 'geun', 'gi', 'gil',
    'gin', 'gnad', 'gok', 'goo', 'gook', 'gu', 'gun', 'gwan', 'gye',
    'gyeong', 'gyu', 'gyun',
    'ha', 'hae', 'hak', 'han', 'hang', 'hao', 'he', 'hee', 'heng', 'heon',
    'hie', 'ho', 'hoi', 'hong', 'hoo', 'hoon', 'hou', 'hsi', 'hsiang',
    'hsiao', 'hsieh', 'hsien', 'hsin', 'hsing', 'hsiung', 'hu', 'hua',
    'huai', 'huang', 'hue', 'hui', 'hun', 'hung', 'hwa', 'hwan', 'hwang',
    'hye', 'hyeok', 'hyeon', 'hyeong', 'hyo', 'hyuk', 'hyun', 'hyung',
    'i', 'ik', 'il', 'in',
    'ja', 'jae', 'jan', 'jang', 'je', 'jee', 'jen', 'jeok', 'jeong', 'jeung',
    'ji', 'jia', 'jian', 'jik', 'jin', 'jing', 'jo', 'jong', 'joo', 'joon',
    'ju', 'juan', 'jun', 'jung',
    'ka', 'kai', 'kam', 'kan', 'kang', 'kap', 'kar', 'ke', 'kee', 'kei',
    'keng', 'keum', 'keung', 'ki', 'kil', 'kin', 'kit', 'kot', 'ku', 'kua',
    'kuan', 'kuang', 'kuen', 'kun', 'kuo', 'kwang', 'kwok', 'kwon', 'kwong',
    'kyeong', 'kyo', 'kyoon', 'kyou', 'kyoung', 'kyu', 'kyun', 'kyung',
    'lai', 'lau', 'lee', 'lei', 'leng', 'leung', 'li', 'liang', 'lien',
    'lin', 'ling', 'lock', 'long', 'lun', 'lung',
    'maeng', 'man', 'mei', 'mi', 'miao', 'min', 'ming', 'mo', 'mok', 'moo',
    'mook', 'moon', 'mu', 'mun', 'myeong', 'myoeng', 'myong', 'myung',
    'na', 'nae', 'nai', 'nam', 'nan', 'neung', 'ngaru', 'ni', 'no', 'nyeo',
    'oh', 'ok', 'ou',
    'pai', 'pei', 'pen', 'peng', 'pi', 'pil', 'pin', 'ping', 'po', 'pui',
    'pyo', 'pyung',
    'qing', 'qun',
    'ra', 'rak', 'ram', 'ran', 'reum', 'ri', 'rim', 'rin', 'roe', 'rok',
    'ru', 'rui', 'ryeon', 'ryol', 'ryong',
    'sa', 'sae', 'san', 'sang', 'se', 'seo', 'seob', 'seok', 'seol', 'seon',
    'seong', 'seung', 'shan', 'shen', 'sheng', 'shi', 'shia', 'shiang',
    'shih', 'shik', 'shim', 'shin', 'shing', 'shou', 'shu', 'shun', 'si',
    'sik', 'sin', 'siu', 'so', 'song', 'soo', 'sook', 'soon', 'su', 'suk',
    'sun', 'sung', 'sup', 'szu',
    "t'ien", 'ta', 'tae', 'taek', 'tai', 'tak', 'te', 'ti', 'tian', 'ting',
    'to', 'toa', 'tsai', 'tsan', 'tse', 'tso', 'tsui', 'tung', 'tzu',
    'ua', 'ui', 'un',
    'wah', 'wai', 'wan', 'wei', 'wen', 'weon', 'wing', 'wit', 'wol', 'won',
    'woo', 'wook', 'woon', 'woong', 'wuk',
    'xiao',
    'ya', 'yan', 'yang', 'yao', 'ye', 'yea', 'yee', 'yeh', 'yen', 'yeo',
    'yeol', 'yeon', 'yeong', 'yeop', 'yi', 'yin', 'ying', 'yiu', 'yoeng',
    'yong', 'yoo', 'yoon', 'you', 'young', 'yu', 'yuan', 'yue', 'yuen',
    'yuk', 'yull', 'yun', 'yune', 'yung',
    'zhi', 'zhong', 'zhu',
]
# see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
# and http://en.wikipedia.org/wiki/List_of_Korean_family_names
ASIAN_NAMES = [
'chan', 'chang', 'chao',
'chen', 'cheong', 'cheung',
'chong', 'choo',
'chu', 'chun',
'hou', 'hsieh', 'hsu', 'hu', 'huang',
'kuo',
'li', 'liang', 'lin', 'liu',
'_park',
'sun', 'sung',
'tsao',
'wang', 'Wong',
'yang', 'yeong', 'yeung'
]
# Lower-case name particles used by get_sort_name().
# PREFIXES: words directly before the last name that belong to it
# ("van der Keuken", "De Palma").
PREFIXES = [
    'al',
    'bin',
    'da', 'de', 'del', 'dem', 'den', 'der', 'di', 'dos', 'du',
    'e', 'el',
    'la',
    'san',
    'the',
    'van', 'vom', 'von',
    'y',
    'zu'
]
# MIDFIXES: connecting words; the word before a midfix is also pulled into
# the last name.
MIDFIXES = ['und']
# SUFFIXES: trailing tokens that stay attached to the last name
# ("Wood Jr.", "Capra III").
SUFFIXES = [
    'ii', 'iii',
    'jr', 'jr.',
    'ph.d.', 'phd',
    'sr', 'sr.'
]
2012-03-21 07:44:24 +00:00
# Canonical name -> regular expression of alternative tokens.
# parse_useragent() rewrites every alias found in a user agent string before
# the UA_REGEXPS patterns are applied.
UA_ALIASES = {
    'browser': {
        'Chrome': '(CriOS|CrMo)',
        'Firefox': '(Fennec|Firebird|Iceweasel|Minefield|Namoroka|Phoenix|SeaMonkey|Shiretoko)',
        'Nokia Browser': '(OviBrowser)',
    },
    'robot': {},
    'system': {
        'BSD': '(FreeBSD|NetBSD|OpenBSD)',
        'Linux': '(CrOS|MeeGo|webOS)',
        'Unix': '(AIX|HP-UX|IRIX|SunOS)',
    },
}
# Token captured by a UA_REGEXPS pattern -> display name reported by
# parse_useragent(). Tokens without an entry are reported unchanged.
UA_NAMES = {
    'browser': {
        'chromeframe': 'Chrome Frame',
        'FBForIPhone': 'WebKit',
        'Gecko': 'Mozilla',
        'IEMobile': 'Internet Explorer',
        'konqueror': 'Konqueror',
        'Mozilla': 'Netscape',
        'MSIE': 'Internet Explorer',
        'NokiaBrowser': 'Nokia Browser',
        'Trident': 'Internet Explorer',
    },
    'robot': {},
    'system': {
        'BB': 'BlackBerry',
        'CPU OS': 'iOS',
        'iPhone': 'iOS',
        'iPhone OS': 'iOS',
        'J2ME/MIDP': 'Java',
        'Mac_PowerPC': 'Mac OS',
        'Mac_PPC': 'Mac OS',
        'Macintosh': 'Mac OS',
        'PLAYSTATION': 'PlayStation',
        'S': 'Nokia',
        'Series': 'Nokia',
        'Win': 'Windows',
        'Windows Phone OS': 'Windows Phone',
        'X11': 'Linux',
    },
}
# Patterns tried in order by parse_useragent(); the first match wins, so
# more specific patterns must come before more general ones.
# Each pattern captures (name) or (name, version); patterns that capture the
# version first are swapped by parse_useragent().
# Raw strings are used so that regex escapes such as \d and \/ are not
# (invalid) Python string escapes.
UA_REGEXPS = {
    'browser': [
        r'(Camino)\/(\d+)',
        r'(Chimera)\/(\d+)',
        r'(chromeframe)\/(\d+)',
        r'(Edge)\/(\d+)',
        r'(Epiphany)\/(\d+)',  # before Chrome, Chromium and Safari
        r'(Chromium)\/(\d+)',  # before Chrome
        r'(Chrome)\/(\d+)',
        r'(FBForIPhone)',
        r'(Firefox)\/(\d+)',
        r'(Galeon)\/(\d+)',
        r'(IEMobile)\/(\d+)',
        r'(iCab) (\d+)',
        r'(iCab)\/(\d+)',
        r'(konqueror)\/(\d+)',
        r'(Konqueror)\/(\d+)',
        r'(Lynx)\/(\d+)',
        r'(Netscape)\d?\/(\d+)',
        r'(NokiaBrowser)\/(\d+)',
        r'(OmniWeb)\/(\d+)',
        r'(Opera)\/.+Version\/(\d+)',
        r'(OviBrowser)\/(\d+)',
        r'Version\/(\d+).+(Safari)',
        r'(WebKit)\/(\d+)',
        r'(MSIE) (\d\d?(?!\d))',  # last, since Opera used to mask as MSIE
        r'(Trident)\/.*?rv:(\d+)',
        r'(Gecko)',
        r'(Mozilla)\/(3|4)'
    ],
    'robot': [
        r'(BingPreview)\/(\d+)',
        r'(Google Web Preview).+Chrome\/(\d+)',
        r'(Googlebot)\/(\d+)',
        r'(WebCrawler)\/(\d+)',
        r'(Yahoo! Slurp)\/(\d+)',
        r'(YandexBot)\/([\d\.]+)',
        r'(YandexMobileBot)\/([\d\.]+)',
    ],
    'system': [
        r'(Android) (\d+)',
        r'(Android)',
        r'(BB)(\d+)',
        r'(BeOS)',
        r'(BlackBerry) (\d+)',
        r'(BlackBerry)',
        r'(Darwin)',
        r'(BSD) (FreeBSD|NetBSD|OpenBSD)',
        r'(CPU OS) (\d+)',
        r'(iPhone OS) (\d+)',
        r'(iPhone)',  # Opera
        r'(J2ME\/MIDP)',
        r'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
        r'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
        r'(Linux)',
        # NOTE(review): the unescaped '.' also matches the '_' of "10_9";
        # parse_useragent() turns '_' into '.' afterwards - presumably
        # intentional, confirm before tightening.
        r'(Mac OS X) (10.\d+)',
        r'(Mac OS X)',
        r'(Mac_PowerPC)',
        r'(Mac_PPC)',
        r'(Macintosh)',
        r'Nintendo (Wii).+NX\/(\d+)',
        r'(PLAYSTATION) (\d+)',
        r'(PlayStation) Vita (\d+)',
        r'(RIM Tablet OS) (\d+)',
        r'(S)(60);',
        r'(Series) ?(40|60)',
        r'(Symbian OS)',
        r'(SymbianOS)\/(\d+)',
        r'(SymbOS)',
        r'(OS\/2)',
        r'(Unix) (AIX|HP-UX|IRIX|SunOS)',
        r'(Unix)',
        # \d+ so that "NT 10.0" is captured instead of falling through to
        # the bare "NT" pattern below (which is reported as NT 4.0)
        r'(Windows) (NT \d+\.\d)',
        r'(Windows Phone) (\d+)',
        r'(Windows Phone OS) (\d+)',
        r'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)',  # Opera
        r'(Win) (9x 4\.90)',  # Firefox
        r'(Win)(16)',  # Firefox
        r'(Win)(9\d)',  # Firefox
        r'(Win)(NT)',  # Firefox
        r'(Win)(NT4\.0)',  # Firefox
        r'(X11)'
    ]
}
# Raw version token captured by a UA_REGEXPS pattern -> display version
# reported by parse_useragent(). Tokens without an entry are reported as-is.
UA_VERSIONS = {
    'browser': {},
    'robot': {},
    'system': {
        # Mac OS X / macOS
        '10.0': '10.0 (Cheetah)',
        '10.1': '10.1 (Puma)',
        '10.2': '10.2 (Jaguar)',
        '10.3': '10.3 (Panther)',
        '10.4': '10.4 (Tiger)',
        '10.5': '10.5 (Leopard)',
        '10.6': '10.6 (Snow Leopard)',
        '10.7': '10.7 (Lion)',
        '10.8': '10.8 (Mountain Lion)',
        '10.9': '10.9 (Mavericks)',
        '10.10': '10.10 (Yosemite)',
        '10.11': '10.11 (El Capitan)',
        '10.12': '10.12 (Sierra)',
        '10.13': '10.13 (High Sierra)',
        '10.14': '10.14 (Mojave)',
        '10.15': '10.15 (Catalina)',
        # Nokia
        '40': 'Series 40',
        '60': 'Series 60',
        # Windows, as "NT x.y" tokens
        'NT 3.1': 'NT 3.1 (3.1)',
        'NT 3.5': 'NT 3.5 (NT)',
        'NT 4.0': 'NT 4.0 (NT)',
        'NT 4.1': 'NT 4.1 (98)',
        '9x 4.90': 'NT 4.9 (ME)',
        'NT 5.0': 'NT 5.0 (2000)',
        'NT 5.1': 'NT 5.1 (XP)',
        'NT 5.2': 'NT 5.2 (2003)',
        'NT 6.0': 'NT 6.0 (Vista)',
        'NT 6.1': 'NT 6.1 (7)',
        'NT 6.2': 'NT 6.2 (8)',
        'NT 6.3': 'NT 6.3 (8.1)',
        'NT 6.4': 'NT 6.4 (10)',
        'NT 10.0': 'NT 10.0 (10)',
        # Windows, as marketing names / short tokens
        '16': 'NT 3.1 (3.1)',
        '3.1': 'NT 3.1 (3.1)',
        '95': 'NT 4.0 (95)',
        'NT': 'NT 4.0 (NT)',
        'NT4.0': 'NT 4.0 (NT)',
        '98': 'NT 4.1 (98)',
        'ME': 'NT 4.9 (ME)',
        '2000': 'NT 5.0 (2000)',
        'XP': 'NT 5.1 (XP)',
        '2003': 'NT 5.2 (2003)'
    }
}
def get_sort_name(name):
    """
    Return ``name`` in sortable "Lastname, Firstname" form.

    Names without a space, or already containing ', ', are returned
    unchanged. Names starting with "The " are handled like titles. Trailing
    numbers, bracketed parts and suffixes stay with the last name; prefixes
    ("van der", "De") are pulled into it. Asian names are joined with a space
    instead of a comma.

    >>> get_sort_name('Alfred Hitchcock')
    'Hitchcock, Alfred'

    >>> get_sort_name('Jean-Luc Godard')
    'Godard, Jean-Luc'

    >>> get_sort_name('Rainer Werner Fassbinder')
    'Fassbinder, Rainer Werner'

    >>> get_sort_name('Brian De Palma')
    'De Palma, Brian'

    >>> get_sort_name('Johan van der Keuken')
    'van der Keuken, Johan'

    >>> get_sort_name('Edward D. Wood Jr.')
    'Wood Jr., Edward D.'

    >>> get_sort_name('Bing Wang')
    'Wang Bing'

    >>> get_sort_name('Kar-Wai Wong')
    'Wong Kar-wai'

    >>> get_sort_name('Frank Capra III')
    'Capra III, Frank'

    >>> get_sort_name('The Queen of England')
    'Queen of England, The'

    >>> get_sort_name('Sham 69')
    'Sham 69'

    >>> get_sort_name('Scorsese, Martin')
    'Scorsese, Martin'
    """
    if ' ' not in name or ', ' in name:
        return name
    if name.lower().startswith('the '):
        return get_sort_title(name)

    if is_asian_name(name):
        names = name.split(' ')
        if '-' in names[0]:
            firstnames = names[0].split('-')
            # str.join takes a single iterable; passing two arguments
            # raised TypeError for every hyphenated given name
            names[0] = '-'.join([firstnames[0], firstnames[1].lower()])
        return names[-1] + ' ' + ' '.join(names[:-1])

    first_names = name.split(' ')
    last_names = []

    def add_name():
        # move the right-most remaining first name to the front of last_names
        if first_names:
            last_names.insert(0, first_names.pop())

    def find_name(names):
        # is the right-most remaining first name one of ``names``?
        return len(first_names) and first_names[-1].lower() in names

    # trailing number ("Sham 69")
    if re.search(r'^[0-9]+$', first_names[-1]):
        add_name()
    # trailing bracketed part ("(I)", "[us]")
    if re.search(r'[(\[].+?[)\]]$', first_names[-1]):
        add_name()
    if find_name(SUFFIXES):
        add_name()
    # the actual last name
    add_name()
    if find_name(MIDFIXES):
        add_name()
        add_name()
    while find_name(PREFIXES):
        add_name()

    name = ' '.join(last_names)
    if first_names:
        separator = ' ' if last_names[0].lower() in ASIAN_NAMES else ', '
        name += separator + ' '.join(first_names)
    return name
def get_sort_title(title):
    """
    Move a leading article (see ARTICLES) to the end of ``title``.

    Articles ending in an apostrophe are matched without a following space.
    Titles that do not start with a known article are returned unchanged.

    >>> get_sort_title('Themroc')
    'Themroc'

    >>> get_sort_title('Die Hard')
    'Hard, Die'

    >>> get_sort_title("L'atalante")
    "atalante, L'"
    """
    lowered = title.lower()
    for article in ARTICLES:
        gap = '' if article.endswith("'") else ' '
        if not lowered.startswith(article + gap):
            continue
        cut = len(article)
        return title[cut + len(gap):] + ', ' + title[:cut]
    return title
2008-04-27 16:54:37 +00:00
def find_re(string, regexp):
    """
    Return the first match of ``regexp`` in ``string``, stripped of
    surrounding whitespace, or '' if there is no match.

    The pattern is applied with re.DOTALL. If it contains one group, the
    group's text is returned.
    """
    found = re.findall(regexp, string, re.DOTALL)
    return found[0].strip() if found else ''
2008-04-29 13:34:27 +00:00
2016-06-08 10:27:55 +00:00
def find_string(string, string0='', string1=''):
    """Return the string between string0 and string1.

    If string0 or string1 is left out, the beginning or end of string is used.

    >>> find_string('i am not there', string1=' not there')
    'i am'

    >>> find_string('i am not there', 'i am ', ' there')
    'not'

    >>> find_string('i am not there', 'i am not t')
    'here'

    """
    # delimiters are matched literally; a missing one becomes an anchor
    start = re.escape(string0) if string0 else '^'
    end = re.escape(string1) if string1 else '$'
    return find_re(string, start + '(.*?)' + end)
2012-03-21 07:44:24 +00:00
2017-08-02 16:39:51 +00:00
def is_asian_name(name):
    """
    Return True if ``name`` consists of exactly three parts (separated by
    spaces or hyphens) and the first two are listed in ASIAN_FIRST_NAMES.
    """
    parts = name.replace('-', ' ').lower().split(' ')
    return (
        len(parts) == 3
        and parts[0] in ASIAN_FIRST_NAMES
        and parts[1] in ASIAN_FIRST_NAMES
    )
2012-03-21 07:44:24 +00:00
def parse_useragent(useragent):
    """
    Extract browser, robot and system information from a user agent string.

    Returns a dict with one entry per key of UA_REGEXPS, each of the form
    ``{'name': ..., 'version': ..., 'string': ...}``. All three values are
    '' when no pattern for that key matches.
    """
    data = {}
    for key in UA_REGEXPS:
        # rewrite aliases to their canonical token first; for non-browser
        # keys the original token is kept after the canonical name
        for alias, regexp in UA_ALIASES[key].items():
            alias = alias if key == 'browser' else alias + ' \\1'
            useragent = re.sub(regexp, alias, useragent)
        # default result, set once so the key exists even without patterns
        data[key] = {'name': '', 'version': '', 'string': ''}
        for regexp in UA_REGEXPS[key]:
            match = re.search(regexp, useragent)
            if not match:
                continue
            matches = list(match.groups())
            if len(matches) == 1:
                matches.append('')
            # some patterns capture (version, name) or (distribution, Linux)
            swap = re.match(r'^\d', matches[0]) or matches[1] == 'Linux'
            name = matches[1 if swap else 0]
            version = matches[0 if swap else 1].replace('_', '.')
            name = UA_NAMES[key].get(name, name)
            version = UA_VERSIONS[key].get(version, version)
            string = name
            if version:
                string = string + ' ' + (
                    '(' + version + ')' if name in ['BSD', 'Linux', 'Unix'] else version
                )
            data[key] = {
                'name': name,
                'version': version,
                'string': string
            }
            # first matching pattern wins
            break
    return data
2008-04-29 11:26:42 +00:00
def remove_special_characters(text):
    """
    Removes special characters inserted by Word.

    Typographic dashes and quotes are replaced by their ASCII equivalents.
    """
    replacements = (
        (u'\u2013', '-'),   # en dash
        (u'\u2026O', "'"),  # ellipsis + 'O', kept as in the original code
        (u'\u2019', "'"),   # right single quotation mark
        # The next three needles were empty string literals, which inserts
        # the replacement between every character.
        # NOTE(review): presumably they were raw Windows-1252 control
        # characters (0x91, 0x92, 0x96) that got lost - confirm against
        # the repository.
        (u'\x91', "'"),
        (u'\x92', "'"),
        (u'\x96', '-'),
    )
    for needle, plain in replacements:
        text = text.replace(needle, plain)
    return text
2008-04-27 16:54:37 +00:00
def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks and most spaces in
    the text. Expects that existing line breaks are posix newlines (\\n).

    See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
    """
    pieces = text.split(' ')
    wrapped = pieces[0]
    for word in pieces[1:]:
        # length of the line currently being filled
        current = len(wrapped[wrapped.rfind('\n') + 1:])
        # only the part of the word before its first newline counts
        incoming = len(word.split('\n', 1)[0])
        if current + incoming >= width:
            glue = '\n'
        else:
            glue = ' '
        wrapped = '%s%s%s' % (wrapped, glue, word)
    return wrapped
2008-04-27 16:54:37 +00:00
def wrap_string(string, length=80, separator='\n', balance=False):
    '''
    Wrap ``string`` into lines of at most ``length`` characters (plus a
    trailing space), joined by ``separator``. Words longer than ``length``
    are broken. With ``balance`` the shortest line length that yields the
    same number of lines is used.

    >>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
    u"Anticonstitution\\nellement, Paris \\ns'eveille"
    >>> wrap_string(u'All you can eat', 12, '\\n', True)
    u'All you \\ncan eat'
    '''
    words = string.split(' ')
    if balance:
        # balance lines: test if same number of lines
        # can be achieved with a shorter line length
        lines = wrap_string(string, length, separator, False).split(separator)
        if len(lines) > 1:
            longest = max(len(word) for word in words)
            while length > longest:
                length -= 1
                shorter = wrap_string(string, length, separator, False)
                if len(shorter.split(separator)) > len(lines):
                    length += 1
                    break
    lines = ['']
    for word in words:
        if len(lines[-1] + word + u' ') <= length + 1:
            # word fits in current line
            lines[-1] += word + u' '
        elif len(word) <= length:
            # word fits in next line
            lines.append(word + u' ')
        else:
            # word is longer than line: fill up the current line, then
            # continue in chunks of ``length``.
            # A full line is length + 1 long (trailing space); clamp to 0 so
            # a negative position cannot duplicate parts of the word.
            position = max(0, length - len(lines[-1]))
            lines[-1] += word[0:position]
            for i in range(position, len(word), length):
                lines.append(word[i:i + length])
            lines[-1] += u' '
    return separator.join(lines).strip()
2008-07-06 09:43:06 +00:00
def truncate_string(string, length, padding='...', position='right'):
    """
    Shorten ``string`` to ``length`` characters, replacing the removed part
    at ``position`` ('left', 'center' or 'right') with ``padding``.

    Strings that already fit, and unknown positions, are returned unchanged.

    >>> truncate_string('anticonstitutionellement', 16, '...', 'left')
    '...utionellement'
    >>> truncate_string('anticonstitutionellement', 16, '...', 'center')
    'anticon...lement'
    >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
    'anticonstitut...'
    """
    total = len(string)
    if total <= length:
        return string
    # number of original characters that survive
    keep = length - len(padding)
    if position == 'left':
        return '%s%s' % (padding, string[total - keep:])
    if position == 'center':
        head = int(math.ceil(float(keep) / 2))
        tail = int(total - math.floor(float(keep) / 2))
        return '%s%s%s' % (string[:head], padding, string[tail:])
    if position == 'right':
        return '%s%s' % (string[:keep], padding)
    return string
2008-07-06 15:18:16 +00:00
def truncate_words(s, num):
    """Truncates a string after a certain number of characters, but ends with a word

    >>> truncate_words('Truncates a string after a certain number of chacters, but ends with a word', 23)
    'Truncates a string...'

    >>> truncate_words('Truncates a string', 23)
    'Truncates a string'
    """
    limit = int(num)
    if len(s) <= limit:
        return s
    tokens = s.split()
    kept = ""
    index = 0
    # take whole words while they still fit below the limit
    while index < len(tokens) and len(kept) + len(tokens[index]) < limit:
        kept += " " + tokens[index]
        index += 1
    if index < len(tokens):
        # something was cut off
        kept += "..."
    return kept.strip()
def trim_string(string, num):
    """Truncates a string after a certain number of characters, adding ... at -10 characters

    The last 10 characters are always kept; the head is cut to make room
    for them and the '...'.

    >>> trim_string('Truncates a string after a certain number of chacters', 23)
    'Truncates ...f chacters'
    >>> trim_string('Truncates a string', 23)
    'Truncates a string'
    """
    if len(string) <= num:
        return string
    head = string[:num - 13]
    tail = string[-10:]
    return head + '...' + tail
def get_valid_filename(s):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Specifically, leading and trailing spaces are removed; spaces
    and all non-filename-safe characters are replaced by underscores, and
    repeated underscores are collapsed in two passes.

    >>> get_valid_filename("john's portrait in 2004.jpg")
    'john_s_portrait_in_2004.jpg'
    """
    cleaned = s.strip().replace(' ', '_')
    cleaned = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', cleaned)
    for _ in range(2):
        cleaned = cleaned.replace('__', '_')
    return cleaned
2008-04-27 16:54:37 +00:00
def get_text_list(list_, last_word='or'):
    """
    Join the items of ``list_`` with commas, using ``last_word`` before the
    final item.

    >>> get_text_list([u'a', u'b', u'c', u'd'])
    u'a, b, c or d'
    >>> get_text_list([u'a', u'b', u'c'], 'and')
    u'a, b and c'
    >>> get_text_list([u'a', u'b'], 'and')
    u'a and b'
    >>> get_text_list([u'a'])
    u'a'
    >>> get_text_list([])
    ''
    """
    count = len(list_)
    if count == 0:
        return ''
    if count == 1:
        return list_[0]
    head = u', '.join(list(list_)[:-1])
    return u'%s %s %s' % (head, last_word, list_[-1])
2008-04-27 16:54:37 +00:00
def get_list_text(text, last_word='or'):
    """
    Split a text such as 'a, b, c or d' into its items.

    Items are separated by ', '; the last two items are separated by
    ``last_word`` surrounded by spaces.

    >>> get_list_text(u'a, b, c or d')
    [u'a', u'b', u'c', u'd']
    >>> get_list_text(u'a, b and c', u'and')
    [u'a', u'b', u'c']
    >>> get_list_text(u'a and b', u'and')
    [u'a', u'b']
    >>> get_list_text(u'a')
    [u'a']
    >>> get_list_text(u'orange')
    [u'orange']
    >>> get_list_text(u'')
    []
    """
    if not text:
        return []
    list_ = text.split(u', ')
    # Split on the whole word only (surrounded by spaces), at its last
    # occurrence, so items that merely contain the letters of last_word
    # ('orange', 'sand') are left intact.
    last = list_[-1].rsplit(u' %s ' % last_word, 1)
    if len(last) == 2:
        list_[-1] = last[0].strip()
        list_.append(last[1].strip())
    return list_
2008-05-06 11:37:40 +00:00
def normalize_newlines(text):
    """Convert Windows (\\r\\n) and old Mac (\\r) line endings to \\n."""
    line_break = re.compile(r'\r\n|\r|\n')
    return line_break.sub('\n', text)
2008-04-27 16:54:37 +00:00
def recapitalize(text):
    """
    Recapitalizes text, placing caps after end-of-sentence punctuation.

    The text is lower-cased first; then the first letter of the text and
    every a-z letter following '. ', '? ' or '! ' is upper-cased.
    """
    lowered = text.lower()
    return re.sub(
        r'(?:^|(?<=[\.\?\!] ))([a-z])',
        lambda found: found.group(1).upper(),
        lowered
    )
2008-04-27 16:54:37 +00:00
def phone2numeric(phone):
    """
    Converts a phone number with letters into its numeric equivalent.

    Q and Z are not part of the mapping and are left untouched.
    """
    keypad = (
        ('2', 'abc'), ('3', 'def'), ('4', 'ghi'), ('5', 'jkl'),
        ('6', 'mno'), ('7', 'prs'), ('8', 'tuv'), ('9', 'wxy'),
    )
    digit_for = {}
    for digit, group in keypad:
        for letter in group:
            digit_for[letter] = digit

    def to_digit(found):
        return digit_for.get(found.group(0).lower())

    return re.compile(r'[A-PR-Y]', re.I).sub(to_digit, phone)
2008-04-27 16:54:37 +00:00
def compress_string(s):
    """
    Return the bytes ``s`` gzip-compressed (compression level 6).
    """
    import gzip
    from io import BytesIO
    zbuf = BytesIO()
    # the context manager closes (and thereby finalises) the gzip stream
    # even if write() raises
    with gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) as zfile:
        zfile.write(s)
    return zbuf.getvalue()
2008-04-27 16:54:37 +00:00
# Matches, in this order: a double-quoted phrase, a single-quoted phrase
# (both may contain backslash-escaped characters), or a run of non-whitespace.
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')


def smart_split(text):
    """
    Generator that splits a string by spaces, leaving quoted phrases together.
    Supports both single and double quotes, and supports escaping quotes with
    backslashes. In the output, strings will keep their initial and trailing
    quote marks.

    >>> list(smart_split('This is "a person\\'s" test.'))
    ['This', 'is', '"a person\\'s"', 'test.']
    """
    for found in smart_split_re.finditer(text):
        token = found.group(0)
        quote = token[0]
        if quote not in ('"', "'"):
            yield token
            continue
        # un-escape the quote character first, then escaped backslashes
        inner = token[1:-1].replace('\\' + quote, quote).replace('\\\\', '\\')
        yield quote + inner + quote
2008-04-29 13:34:27 +00:00
2011-10-30 11:54:59 +00:00
def words(text):
    """
    returns words in text, removing punctuation

    The text is split on whitespace; one trailing '.', '!', '?', ':', '-',
    '_' or possessive "'s" is removed from each word.
    """
    # the hyphen must be last in the character class: written as ':-_' it
    # is a range that also covers ';', '@' and every upper-case letter
    trailing = re.compile(r"(([.!?:_-]|'s)$)")
    return [trailing.sub('', word) for word in text.split()]
2012-05-16 10:29:52 +00:00
def sort_string(string):
    """
    Return a normalised version of ``string`` for use as a sort key.

    Æ, Ø and Þ are transliterated, thousands separators are removed,
    numbers are zero-padded to 10 digits so that they sort numerically, and
    the result is NFKD-normalised.
    """
    string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')
    # pad numbered titles
    # (raw strings: '\d' is not a valid Python string escape)
    string = re.sub(r'(\d),(\d{3})', r'\1\2', string)
    string = re.sub(r'(\d+)', lambda x: '%010d' % int(x.group(0)), string)
    return unicodedata.normalize('NFKD', string)
2012-08-20 18:34:23 +00:00
def sorted_strings(strings, key=None):
    """
    Return ``strings`` as a sorted list, ordered by ``key`` or, if no key is
    given, by sort_string.
    """
    return sorted(strings, key=key or sort_string)