diff --git a/ox/text.py b/ox/text.py index d650262..dec6b3d 100644 --- a/ox/text.py +++ b/ox/text.py @@ -133,86 +133,86 @@ UA_NAMES = { } UA_REGEXPS = { 'browser': [ - '(Camino)\/(\d+)', - '(Chimera)\/(\d+)', - '(chromeframe)\/(\d+)', - '(Edge)\/(\d+)', - '(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari - '(Chromium)\/(\d+)', # before Chrome - '(Chrome)\/(\d+)', - '(FBForIPhone)', - '(Firefox)\/(\d+)', - '(Galeon)\/(\d+)', - '(IEMobile)\/(\d+)', - '(iCab) (\d+)', - '(iCab)\/(\d+)', - '(konqueror)\/(\d+)', - '(Konqueror)\/(\d+)', - '(Lynx)\/(\d+)', - '(Netscape)\d?\/(\d+)', - '(NokiaBrowser)\/(\d+)', - '(OmniWeb)\/(\d+)', - '(Opera)\/.+Version\/(\d+)', - '(OviBrowser)\/(\d+)', - 'Version\/(\d+).+(Safari)', - '(WebKit)\/(\d+)', - '(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE - '(Trident)\/.*?rv:(\d+)', - '(Gecko)', - '(Mozilla)\/(3|4)' + r'(Camino)\/(\d+)', + r'(Chimera)\/(\d+)', + r'(chromeframe)\/(\d+)', + r'(Edge)\/(\d+)', + r'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari + r'(Chromium)\/(\d+)', # before Chrome + r'(Chrome)\/(\d+)', + r'(FBForIPhone)', + r'(Firefox)\/(\d+)', + r'(Galeon)\/(\d+)', + r'(IEMobile)\/(\d+)', + r'(iCab) (\d+)', + r'(iCab)\/(\d+)', + r'(konqueror)\/(\d+)', + r'(Konqueror)\/(\d+)', + r'(Lynx)\/(\d+)', + r'(Netscape)\d?\/(\d+)', + r'(NokiaBrowser)\/(\d+)', + r'(OmniWeb)\/(\d+)', + r'(Opera)\/.+Version\/(\d+)', + r'(OviBrowser)\/(\d+)', + r'Version\/(\d+).+(Safari)', + r'(WebKit)\/(\d+)', + r'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE + r'(Trident)\/.*?rv:(\d+)', + r'(Gecko)', + r'(Mozilla)\/(3|4)' ], 'robot': [ - '(BingPreview)\/(\d+)', - '(Google Web Preview).+Chrome\/(\d+)', - '(Googlebot)\/(\d+)', - '(WebCrawler)\/(\d+)', - '(Yahoo! Slurp)\/(\d+)', - '(YandexBot)\/([\d\.]+)', - '(YandexMobileBot)\/([\d\.]+)', + r'(BingPreview)\/(\d+)', + r'(Google Web Preview).+Chrome\/(\d+)', + r'(Googlebot)\/(\d+)', + r'(WebCrawler)\/(\d+)', + r'(Yahoo! Slurp)\/(\d+)', + r'(YandexBot)\/([\d\.]+)', + r'(YandexMobileBot)\/([\d\.]+)', ], 'system': [ - '(Android) (\d+)', - '(Android)', - '(BB)(\d+)', - '(BeOS)', - '(BlackBerry) (\d+)', - '(BlackBerry)', - '(Darwin)', - '(BSD) (FreeBSD|NetBSD|OpenBSD)', - '(CPU OS) (\d+)', - '(iPhone OS) (\d+)', - '(iPhone)', # Opera - '(J2ME\/MIDP)', - '(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)', - '(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)', - '(Linux)', - '(Mac OS X) (10.\d+)', - '(Mac OS X)', - '(Mac_PowerPC)', - '(Mac_PPC)', - '(Macintosh)', - 'Nintendo (Wii).+NX\/(\d+)', - '(PLAYSTATION) (\d+)', - '(PlayStation) Vita (\d+)', - '(RIM Tablet OS) (\d+)', - '(S)(60);', - '(Series) ?(40|60)', - '(Symbian OS)', - '(SymbianOS)\/(\d+)', - '(SymbOS)', - '(OS\/2)', - '(Unix) (AIX|HP-UX|IRIX|SunOS)', - '(Unix)', - '(Windows) (NT \d\.\d)', - '(Windows Phone) (\d+)', - '(Windows Phone OS) (\d+)', - '(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera - '(Win) (9x 4\.90)', # Firefox - '(Win)(16)', # Firefox - '(Win)(9\d)', # Firefox - '(Win)(NT)', # Firefox - '(Win)(NT4\.0)', # Firefox - '(X11)' + r'(Android) (\d+)', + r'(Android)', + r'(BB)(\d+)', + r'(BeOS)', + r'(BlackBerry) (\d+)', + r'(BlackBerry)', + r'(Darwin)', + r'(BSD) (FreeBSD|NetBSD|OpenBSD)', + r'(CPU OS) (\d+)', + r'(iPhone OS) (\d+)', + r'(iPhone)', # Opera + r'(J2ME\/MIDP)', + r'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)', + r'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)', + r'(Linux)', + r'(Mac OS X) (10.\d+)', + r'(Mac OS X)', + r'(Mac_PowerPC)', + r'(Mac_PPC)', + r'(Macintosh)', + r'Nintendo (Wii).+NX\/(\d+)', + r'(PLAYSTATION) (\d+)', + r'(PlayStation) Vita (\d+)', + r'(RIM Tablet OS) (\d+)', + r'(S)(60);', + r'(Series) ?(40|60)', + r'(Symbian OS)', + r'(SymbianOS)\/(\d+)', + r'(SymbOS)', + r'(OS\/2)', + r'(Unix) (AIX|HP-UX|IRIX|SunOS)', + r'(Unix)', + r'(Windows) (NT \d\.\d)', + r'(Windows Phone) (\d+)', + r'(Windows Phone OS) (\d+)', + r'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera + r'(Win) (9x 4\.90)', # Firefox + r'(Win)(16)', # Firefox + r'(Win)(9\d)', # Firefox + r'(Win)(NT)', # Firefox + r'(Win)(NT4\.0)', # Firefox + r'(X11)' ] } UA_VERSIONS = { @@ -332,9 +332,9 @@ def get_sort_name(name): first_names = name.split(' ') last_names = [] - if re.search('^[0-9]+$', first_names[-1]): + if re.search(r'^[0-9]+$', first_names[-1]): add_name() - if re.search('[(\[].+?[)\]]$', first_names[-1]): + if re.search(r'[(\[].+?[)\]]$', first_names[-1]): add_name() if find_name(SUFFIXES): add_name() @@ -425,7 +425,7 @@ def parse_useragent(useragent): matches = list(match.groups()) if len(matches) == 1: matches.append('') - swap = re.match('^\d', matches[0]) or matches[1] == 'Linux' + swap = re.match(r'^\d', matches[0]) or matches[1] == 'Linux' name = matches[1 if swap else 0] version = matches[0 if swap else 1].replace('_', '.') name = UA_NAMES[key][name] if name in UA_NAMES[key] else name @@ -685,8 +685,8 @@ def sort_string(string): string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th') # pad numbered titles - string = re.sub('(\d),(\d{3})', '\\1\\2', string) - string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string) + string = re.sub(r'(\d),(\d{3})', '\\1\\2', string) + string = re.sub(r'(\d+)', lambda x: '%010d' % int(x.group(0)), string) return unicodedata.normalize('NFKD', string) def sorted_strings(strings, key=None):