From ae10c5c9b9025c63fed049f4b1edd4a4c987fd2c Mon Sep 17 00:00:00 2001 From: j Date: Fri, 30 Aug 2024 13:30:47 +0200 Subject: [PATCH] more raw regexp strings --- ox/format.py | 4 ++-- ox/html.py | 20 ++++++++++---------- ox/movie.py | 30 +++++++++++++++--------------- ox/normalize.py | 2 +- ox/web/allmovie.py | 2 +- ox/web/apple.py | 4 ++-- ox/web/arsenalberlin.py | 2 +- ox/web/google.py | 2 +- ox/web/piratecinema.py | 2 +- ox/web/wikipedia.py | 20 ++++++++++---------- setup.py | 2 +- 11 files changed, 45 insertions(+), 45 deletions(-) diff --git a/ox/format.py b/ox/format.py index 83756c1..2aa0868 100644 --- a/ox/format.py +++ b/ox/format.py @@ -236,7 +236,7 @@ def int_value(strValue, default=''): '' """ try: - val = re.compile('(\d+)').findall(str(strValue).strip())[0] + val = re.compile(r'(\d+)').findall(str(strValue).strip())[0] except: val = default return val @@ -253,7 +253,7 @@ def float_value(strValue, default=''): '' """ try: - val = re.compile('([\d.]+)').findall(str(strValue).strip())[0] + val = re.compile(r'([\d.]+)').findall(str(strValue).strip())[0] except: val = default return val diff --git a/ox/html.py b/ox/html.py index f7ca816..06ae96f 100644 --- a/ox/html.py +++ b/ox/html.py @@ -178,10 +178,10 @@ def highlight(text, query, hlClass="hl"): """ if query: text = text.replace('
', '|') - query = re.escape(query).replace('\ ', '.') + query = re.escape(query).replace(r'\ ', '.') m = re.compile("(%s)" % query, re.IGNORECASE).findall(text) for i in m: - text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '\\1' % hlClass, text) + text = re.sub(r"(%s)" % re.escape(i).replace(r'\ ', '.'), r'\\1' % hlClass, text) text = text.replace('|', '
') return text @@ -234,7 +234,7 @@ def sanitize_html(html, tags=None, global_attributes=[]): ' ' ''' if not tags: - valid_url = '^((https?:\/\/|\/|mailto:).*?)' + valid_url = r'^((https?:\/\/|\/|mailto:).*?)' tags = [ # inline formatting {'name': 'b'}, @@ -300,8 +300,8 @@ def sanitize_html(html, tags=None, global_attributes=[]): 'optional': ['width', 'height'], 'required': ['src'], 'validation': { - 'width': '^\d+$', - 'height': '^\d+$', + 'width': r'^\d+$', + 'height': r'^\d+$', 'src': valid_url } }, @@ -310,8 +310,8 @@ def sanitize_html(html, tags=None, global_attributes=[]): 'optional': ['width', 'height'], 'required': ['src'], 'validation': { - 'width': '^\d+$', - 'height': '^\d+$', + 'width': r'^\d+$', + 'height': r'^\d+$', 'src': valid_url }, }, @@ -319,8 +319,8 @@ def sanitize_html(html, tags=None, global_attributes=[]): {'name': 'figcaption'} ] - tag_re = re.compile('<(/)?([^\ /]+)(.*?)(/)?>') - attr_re = re.compile('([^=\ ]+)="([^"]+)"') + tag_re = re.compile(r'<(/)?([^\ /]+)(.*?)(/)?>') + attr_re = re.compile(r'([^=\ ]+)="([^"]+)"') escaped = {} level = 0 @@ -338,7 +338,7 @@ def sanitize_html(html, tags=None, global_attributes=[]): if '[]' in validation: html = re.sub( - re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE), + re.compile(r'\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE), '\\3', html) parts = split_tags(html) diff --git a/ox/movie.py b/ox/movie.py index 54ede0c..314df47 100644 --- a/ox/movie.py +++ b/ox/movie.py @@ -25,7 +25,7 @@ The Title[ ([SXX][EYY[+ZZ|-ZZ]])[ Episode Title]][.Version][.Part XY[.Part Title def format_path(data, directory_key='director'): def format_underscores(string): - return re.sub('^\.|\.$|:|/|\?|<|>', '_', string) + return re.sub(r'^\.|\.$|:|/|\?|<|>', '_', string) director = data['directorSort'] or ['Unknown Director'] title = data['seriesTitle' if data['isEpisode'] else 'title'] or 'Untitled' year = data['seriesYear' if data['isEpisode'] else 'year'] or None @@ -199,14 +199,14 @@ def parse_path(path, directory_key='director'): string = re.sub('^_', '.', string) string = re.sub('_$', '.', string) # '_.foo$' or '_ (' is '?' - string = re.sub(re.compile('_(?=(\.\w+$| \())', re.U), '?', string) + string = re.sub(re.compile(r'_(?=(\.\w+$| \())', re.U), '?', string) # ' _..._ ' is '<...>' string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string) # 'foo_bar' or 'foo _ bar' is '/' - string = re.sub(re.compile('(?<=\w)_(?=\w)', re.U), '/', string) + string = re.sub(re.compile(r'(?<=\w)_(?=\w)', re.U), '/', string) string = re.sub(' _ ', ' / ', string) # 'foo_ ' is ':' - string = re.sub(re.compile('(?<=[\w\)\]])_ ', re.U), ': ', string) + string = re.sub(re.compile(r'(?<=[\w\)\]])_ ', re.U), ': ', string) string = unicodedata.normalize('NFD', string) return string @@ -238,14 +238,14 @@ def parse_path(path, directory_key='director'): # title, year data['title'] = data['year'] = None if title: - match = re.search(' \(\d{4}(-(\d{4})?)?\)$', title) + match = re.search(r' \(\d{4}(-(\d{4})?)?\)$', title) data['title'] = title[:-len(match.group(0))] if match else title data['year'] = match.group(0)[2:-1] if match else None file_title = re.sub('[/:]', '_', data['title']) # (remove title from beginning of filename if the rest contains a dot) file = re.sub('^' + re.escape(file_title) + '(?=.*\.)', '', file) # (split by nospace+dot+word, but remove spaces preceding extension) - parts = re.split('(?(.*?)')).strip() data['themes'] = parse_list(html, 'themes') data['types'] = parse_list(html, 'types') - data['year'] = find_re(html, '.*?(\d+)') + data['year'] = find_re(html, r'.*?(\d+)') #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('(.*?)', '\\1', data) - for a in re.compile('(.*?).*?(.*?)<\/span>').findall(data): + for a in re.compile(r'(.*?).*?(.*?)<\/span>').findall(data): results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2])))) if len(results) >= max_results: break diff --git a/ox/web/piratecinema.py b/ox/web/piratecinema.py index c452f04..e896c9e 100644 --- a/ox/web/piratecinema.py +++ b/ox/web/piratecinema.py @@ -8,7 +8,7 @@ from ox.net import read_url def get_poster_url(id): url = 'http://piratecinema.org/posters/' html = read_url(url).decode('utf-8') - results = re.compile('src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html) + results = re.compile(r'src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html) for result in results: if result[1] == id: return url + result[0] diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py index 5d86655..aad0aba 100644 --- a/ox/web/wikipedia.py +++ b/ox/web/wikipedia.py @@ -81,36 +81,36 @@ def get_movie_data(wikipedia_url): if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit(): del filmbox['amg_id'] if 'Allmovie movie' in data: - filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)') + filmbox['amg_id'] = find_re(data, r'Allmovie movie\|.*?(\d+)') elif 'Allmovie title' in data: - filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)') + filmbox['amg_id'] = find_re(data, r'Allmovie title\|.*?(\d+)') if 'Official website' in data: - filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip() + filmbox['website'] = find_re(data, r'Official website\|(.*?)}').strip() - r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data) + r = re.compile(r'{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data) if r: filmbox['imdb_id'] = r[0] else: - r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data) + r = re.compile(r'{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data) if r: filmbox['imdb_id'] = r[0] - r = re.compile('{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data) + r = re.compile(r'{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data) if r: filmbox['archiveorg_id'] = r[0] - r = re.compile('{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data) + r = re.compile(r'{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data) if r: filmbox['mojo_id'] = r[0].replace('id=', '') - r = re.compile('{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data) + r = re.compile(r'{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data) if r: filmbox['rottentomatoes_id'] = r[0].replace('id=', '') if 'google video' in data: - filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]') + filmbox['google_video_id'] = find_re(data, r'google video\|.*?(\d*?)[\|}]') if 'DEFAULTSORT' in data: - filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''') + filmbox['title_sort'] = find_re(data, r'''\{\{DEFAULTSORT:(.*?)\}\}''') return filmbox def get_image_url(name): diff --git a/setup.py b/setup.py index fd7e507..4e9cd48 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def get_version(): f = open(changelog) head = f.read().strip().split('\n')[0] f.close() - rev = re.compile('\d+\.\d+\.(\d+)').findall(head) + rev = re.compile(r'\d+\.\d+\.(\d+)').findall(head) if rev: return '3.0.%s' % rev[0] return '3.0.x'