From 301babd1dd436a5557374076eaa6a50a46b2748b Mon Sep 17 00:00:00 2001 From: j Date: Mon, 8 Jul 2024 13:33:07 +0100 Subject: [PATCH] more raw regexp strings --- ox/web/imdb.py | 76 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 1b93459..7698dbf 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -43,8 +43,8 @@ def reference_section(id): return { 'page': 'reference', 're': [ - '

'.format(id=id), - ''.format(id=id), + r'' + label + '.*?', - '(.*?)' + r'_label">' + label + '.*?', + r'(.*?)' ], 'type': 'list', } @@ -67,7 +67,7 @@ def zebra_table(label, more=None, type='string'): conditions = { 'page': 'reference', 're': [ - '_label">' + label + '.*?(.*?)', + r'_label">' + label + '.*?(.*?)', ], 'type': type, } @@ -97,9 +97,9 @@ def technical(label): return { 'page': 'technical', 're': [ - '\s*?%s\s*?.*?\s*?(.*?)\s*?' % label, + r'\s*?%s\s*?.*?\s*?(.*?)\s*?' % label, lambda data: [ - re.sub('\s+', ' ', d.strip()) for d in data.strip().split('
') + re.sub(r'\s+', ' ', d.strip()) for d in data.strip().split('
') ] if data else [] ], 'type': 'list' @@ -258,13 +258,13 @@ class Imdb(SiteParser): 'aspectratio': { 'page': 'reference', 're': [ - 'Aspect Ratio.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)', + r'Aspect Ratio.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)', parse_aspectratio, ], 'type': 'float', }, 'budget': zebra_table('Budget', more=[ - lambda data: find_re(decode_html(data).replace(',', ''), '\d+') + lambda data: find_re(decode_html(data).replace(',', ''), r'\d+') ], type='int'), 'cast': { 'page': 'reference', @@ -287,12 +287,12 @@ class Imdb(SiteParser): }, 'genre': zebra_list('Genres', more=['(.*?)
']), 'gross': zebra_table('Cumulative Worldwide Gross', more=[ - lambda data: find_re(decode_html(data).replace(',', ''), '\d+') + lambda data: find_re(decode_html(data).replace(',', ''), r'\d+') ], type='int'), 'language': zebra_list('Language', more=['(.*?)']), 'originalTitle': { 'page': 'releaseinfo', - 're': '

', 'type': 'string' }, 'trivia': { 'page': 'trivia', 're': [ - '
(.*?)<(br|/div)', + r'
(.*?)<(br|/div)', lambda data: data[0] ], 'type': 'list', @@ -382,7 +382,7 @@ class Imdb(SiteParser): 'votes': { 'page': 'reference', 're': [ - 'class="ipl-rating-star__total-votes">\((.*?)\)', + r'class="ipl-rating-star__total-votes">\((.*?)\)', lambda r: r.replace(',', '') ], 'type': 'string' @@ -391,8 +391,8 @@ class Imdb(SiteParser): 'year': { 'page': 'reference', 're': [ - '(.*?)', - '(\d+)', + r'(.*?)', + r'(\d+)', ], 'type': 'int' }, @@ -400,7 +400,7 @@ class Imdb(SiteParser): 'page': 'fullcredits', 're': [ lambda data: data.split('(.*?).*?()', + r'>(.*?).*?()', lambda data: [d for d in data if d] ], 'type': 'list' @@ -468,7 +468,7 @@ class Imdb(SiteParser): title = title[1:-1] if title.startswith("'") and title.endswith("'"): title = title[1:-1] - title = re.sub('\(\#[.\d]+\)', '', title) + title = re.sub(r'\(\#[.\d]+\)', '', title) return title.strip() for t in ('title', 'originalTitle'): @@ -518,7 +518,7 @@ class Imdb(SiteParser): self['actor'] = [c[0] for c in self['cast']] def cleanup_character(c): c = c.replace('(uncredited)', '').strip() - c = re.sub('\s+', ' ', c) + c = re.sub(r'\s+', ' ', c) return c self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])} for x in self['cast']] @@ -528,7 +528,7 @@ class Imdb(SiteParser): del self['isSeries'] self['isSeries'] = True if 'episodeTitle' in self: - self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle']) + self['episodeTitle'] = re.sub(r'Episode \#\d+\.\d+', '', self['episodeTitle']) #make lists unique but keep order @@ -790,7 +790,7 @@ def get_movie_by_title(title, timeout=-1): url = "http://www.imdb.com/find?" + params data = read_url(url, timeout=timeout, unicode=True) #if search results in redirect, get id of current page - r = '' + r = r'' results = re.compile(r).findall(data) if results: return results[0] @@ -869,12 +869,12 @@ def get_movie_id(title, director='', year='', timeout=-1): data = read_url(url, timeout=timeout, unicode=True) #if search results in redirect, get id of current page - r = '' + r = r'' results = re.compile(r).findall(data) if results: return results[0] #otherwise get first result - r = '.*?.*?.*?
S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): + for e in re.compile(r'
.*?
S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0] else: data = cache.read_url(url) - match = re.compile('Season (\d+)').findall(data) + match = re.compile(r'Season (\d+)').findall(data) if match: for season in range(1, int(match[0]) + 1): episodes.update(get_episodes(imdbId, season)) @@ -927,7 +927,7 @@ def max_votes(): data = cache.read_url(url).decode('utf-8', 'ignore') votes = max([ int(v.replace(',', '')) - for v in re.compile('Votes.*?([\d,]+)', re.DOTALL).findall(data) + for v in re.compile(r'Votes.*?([\d,]+)', re.DOTALL).findall(data) ]) return votes