Use raw string literals for more regexp patterns

This commit is contained in:
j 2024-08-30 13:30:47 +02:00
parent 29a309f15e
commit ae10c5c9b9
11 changed files with 45 additions and 45 deletions

View file

@@ -236,7 +236,7 @@ def int_value(strValue, default=''):
'' ''
""" """
try: try:
val = re.compile('(\d+)').findall(str(strValue).strip())[0] val = re.compile(r'(\d+)').findall(str(strValue).strip())[0]
except: except:
val = default val = default
return val return val
@@ -253,7 +253,7 @@ def float_value(strValue, default=''):
'' ''
""" """
try: try:
val = re.compile('([\d.]+)').findall(str(strValue).strip())[0] val = re.compile(r'([\d.]+)').findall(str(strValue).strip())[0]
except: except:
val = default val = default
return val return val

View file

@@ -178,10 +178,10 @@ def highlight(text, query, hlClass="hl"):
""" """
if query: if query:
text = text.replace('<br />', '|') text = text.replace('<br />', '|')
query = re.escape(query).replace('\ ', '.') query = re.escape(query).replace(r'\ ', '.')
m = re.compile("(%s)" % query, re.IGNORECASE).findall(text) m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
for i in m: for i in m:
text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text) text = re.sub(r"(%s)" % re.escape(i).replace(r'\ ', '.'), r'<span class="%s">\\1</span>' % hlClass, text)
text = text.replace('|', '<br />') text = text.replace('|', '<br />')
return text return text
@@ -234,7 +234,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
' ' ' '
''' '''
if not tags: if not tags:
valid_url = '^((https?:\/\/|\/|mailto:).*?)' valid_url = r'^((https?:\/\/|\/|mailto:).*?)'
tags = [ tags = [
# inline formatting # inline formatting
{'name': 'b'}, {'name': 'b'},
@@ -300,8 +300,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
'optional': ['width', 'height'], 'optional': ['width', 'height'],
'required': ['src'], 'required': ['src'],
'validation': { 'validation': {
'width': '^\d+$', 'width': r'^\d+$',
'height': '^\d+$', 'height': r'^\d+$',
'src': valid_url 'src': valid_url
} }
}, },
@@ -310,8 +310,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
'optional': ['width', 'height'], 'optional': ['width', 'height'],
'required': ['src'], 'required': ['src'],
'validation': { 'validation': {
'width': '^\d+$', 'width': r'^\d+$',
'height': '^\d+$', 'height': r'^\d+$',
'src': valid_url 'src': valid_url
}, },
}, },
@@ -319,8 +319,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
{'name': 'figcaption'} {'name': 'figcaption'}
] ]
tag_re = re.compile('<(/)?([^\ /]+)(.*?)(/)?>') tag_re = re.compile(r'<(/)?([^\ /]+)(.*?)(/)?>')
attr_re = re.compile('([^=\ ]+)="([^"]+)"') attr_re = re.compile(r'([^=\ ]+)="([^"]+)"')
escaped = {} escaped = {}
level = 0 level = 0
@@ -338,7 +338,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
if '[]' in validation: if '[]' in validation:
html = re.sub( html = re.sub(
re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE), re.compile(r'\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
'<a href="\\1">\\3</a>', html) '<a href="\\1">\\3</a>', html)
parts = split_tags(html) parts = split_tags(html)

View file

@@ -25,7 +25,7 @@ The Title[ ([SXX][EYY[+ZZ|-ZZ]])[ Episode Title]][.Version][.Part XY[.Part Title
def format_path(data, directory_key='director'): def format_path(data, directory_key='director'):
def format_underscores(string): def format_underscores(string):
return re.sub('^\.|\.$|:|/|\?|<|>', '_', string) return re.sub(r'^\.|\.$|:|/|\?|<|>', '_', string)
director = data['directorSort'] or ['Unknown Director'] director = data['directorSort'] or ['Unknown Director']
title = data['seriesTitle' if data['isEpisode'] else 'title'] or 'Untitled' title = data['seriesTitle' if data['isEpisode'] else 'title'] or 'Untitled'
year = data['seriesYear' if data['isEpisode'] else 'year'] or None year = data['seriesYear' if data['isEpisode'] else 'year'] or None
@@ -199,14 +199,14 @@ def parse_path(path, directory_key='director'):
string = re.sub('^_', '.', string) string = re.sub('^_', '.', string)
string = re.sub('_$', '.', string) string = re.sub('_$', '.', string)
# '_.foo$' or '_ (' is '?' # '_.foo$' or '_ (' is '?'
string = re.sub(re.compile('_(?=(\.\w+$| \())', re.U), '?', string) string = re.sub(re.compile(r'_(?=(\.\w+$| \())', re.U), '?', string)
# ' _..._ ' is '<...>' # ' _..._ ' is '<...>'
string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string) string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string)
# 'foo_bar' or 'foo _ bar' is '/' # 'foo_bar' or 'foo _ bar' is '/'
string = re.sub(re.compile('(?<=\w)_(?=\w)', re.U), '/', string) string = re.sub(re.compile(r'(?<=\w)_(?=\w)', re.U), '/', string)
string = re.sub(' _ ', ' / ', string) string = re.sub(' _ ', ' / ', string)
# 'foo_ ' is ':' # 'foo_ ' is ':'
string = re.sub(re.compile('(?<=[\w\)\]])_ ', re.U), ': ', string) string = re.sub(re.compile(r'(?<=[\w\)\]])_ ', re.U), ': ', string)
string = unicodedata.normalize('NFD', string) string = unicodedata.normalize('NFD', string)
return string return string
@@ -238,14 +238,14 @@ def parse_path(path, directory_key='director'):
# title, year # title, year
data['title'] = data['year'] = None data['title'] = data['year'] = None
if title: if title:
match = re.search(' \(\d{4}(-(\d{4})?)?\)$', title) match = re.search(r' \(\d{4}(-(\d{4})?)?\)$', title)
data['title'] = title[:-len(match.group(0))] if match else title data['title'] = title[:-len(match.group(0))] if match else title
data['year'] = match.group(0)[2:-1] if match else None data['year'] = match.group(0)[2:-1] if match else None
file_title = re.sub('[/:]', '_', data['title']) file_title = re.sub('[/:]', '_', data['title'])
# (remove title from beginning of filename if the rest contains a dot) # (remove title from beginning of filename if the rest contains a dot)
file = re.sub('^' + re.escape(file_title) + '(?=.*\.)', '', file) file = re.sub('^' + re.escape(file_title) + '(?=.*\.)', '', file)
# (split by nospace+dot+word, but remove spaces preceding extension) # (split by nospace+dot+word, but remove spaces preceding extension)
parts = re.split('(?<!\s)\.(?=\w)', re.sub('\s+(?=.\w+$)', '', file)) parts = re.split(r'(?<!\s)\.(?=\w)', re.sub(r'\s+(?=.\w+$)', '', file))
title, parts, extension = [ title, parts, extension = [
parts[0], parts[0],
parts[1:-1], parts[1:-1],
@@ -256,7 +256,7 @@ def parse_path(path, directory_key='director'):
# season, episode, episodes, episodeTitle # season, episode, episodes, episodeTitle
data['season'] = data['episode'] = data['episodeTitle'] = None data['season'] = data['episode'] = data['episodeTitle'] = None
data['episodes'] = [] data['episodes'] = []
match = re.search(' \((S\d{2})?(E\d{2}([+-]\d{2})?)?\)(.+)?', title) match = re.search(r' \((S\d{2})?(E\d{2}([+-]\d{2})?)?\)(.+)?', title)
if match: if match:
if match.group(1): if match.group(1):
data['season'] = int(match.group(1)[1:]) data['season'] = int(match.group(1)[1:])
@@ -267,7 +267,7 @@ def parse_path(path, directory_key='director'):
data['episodes'] = range(int(match.group(2)[1:3]), int(match.group(2)[-2:]) + 1) data['episodes'] = range(int(match.group(2)[1:3]), int(match.group(2)[-2:]) + 1)
if match.group(4): if match.group(4):
data['episodeTitle'] = match.group(4)[1:] data['episodeTitle'] = match.group(4)[1:]
while data['episodeTitle'] and len(parts) and re.search('^\w+\.*$', parts[0]) and not re.search('^[a-z]{2}$', parts[0]): while data['episodeTitle'] and len(parts) and re.search(r'^\w+\.*$', parts[0]) and not re.search(r'^[a-z]{2}$', parts[0]):
data['episodeTitle'] += '.%s' % parts.pop(0) data['episodeTitle'] += '.%s' % parts.pop(0)
# isEpisode, seriesTitle, seriesYear # isEpisode, seriesTitle, seriesYear
data['isEpisode'] = False data['isEpisode'] = False
@@ -343,14 +343,14 @@ def parse_movie_path(path):
if title.startswith('_'): if title.startswith('_'):
title = '.' + title[1:] title = '.' + title[1:]
year = find_re(title, '(\(\d{4}\))') year = find_re(title, r'(\(\d{4}\))')
if not year: if not year:
year = find_re(title, '(\(\d{4}-\d*\))') year = find_re(title, r'(\(\d{4}-\d*\))')
if year and title.endswith(year): if year and title.endswith(year):
title = title[:-len(year)].strip() title = title[:-len(year)].strip()
year = year[1:-1] year = year[1:-1]
if '-' in year: if '-' in year:
year = find_re(year, '\d{4}') year = find_re(year, r'\d{4}')
#director #director
if len(parts) == 4: if len(parts) == 4:
@@ -373,7 +373,7 @@ def parse_movie_path(path):
language = '' language = ''
#season/episode/episodeTitle #season/episode/episodeTitle
match = re.compile('(.+?) \((S(\d+))?(E(\d+))?\)( (.+?))?\.').match(parts[-1]) match = re.compile(r'(.+?) \((S(\d+))?(E(\d+))?\)( (.+?))?\.').match(parts[-1])
if match: if match:
seriesTitle = match.group(1) seriesTitle = match.group(1)
season = match.group(3) season = match.group(3)
@@ -386,13 +386,13 @@ def parse_movie_path(path):
if episode and not season: if episode and not season:
season = 1 season = 1
else: else:
season = find_re(parts[-1], '\.Season (\d+)\.') season = find_re(parts[-1], r'\.Season (\d+)\.')
if season: if season:
season = int(season) season = int(season)
else: else:
season = None season = None
episode = find_re(parts[-1], '\.Episode[s]* ([\d+]+)\.') episode = find_re(parts[-1], r'\.Episode[s]* ([\d+]+)\.')
if episode: if episode:
episode = episode.split('+')[0] episode = episode.split('+')[0]
episode = int(episode) episode = int(episode)
@@ -422,7 +422,7 @@ def parse_movie_path(path):
title = u'%s %s' % (title, episodeTitle) title = u'%s %s' % (title, episodeTitle)
#part #part
part = find_re(parts[-1], '\.Part (\d+)\.') part = find_re(parts[-1], r'\.Part (\d+)\.')
if part: if part:
part = int(part) part = int(part)
else: else:

View file

@@ -102,7 +102,7 @@ def normalize_imdbid(imdbId):
'0159206' '0159206'
""" """
if isinstance(imdbId, str): if isinstance(imdbId, str):
imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId) imdbId = re.sub(r'.*(\d{7}).*', '\\1', imdbId)
elif isinstance(imdbId, int): elif isinstance(imdbId, int):
imdbId = "%07d" % imdbId imdbId = "%07d" % imdbId
return imdbId return imdbId

View file

@@ -43,7 +43,7 @@ def get_data(id):
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip() data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['themes'] = parse_list(html, 'themes') data['themes'] = parse_list(html, 'themes')
data['types'] = parse_list(html, 'types') data['types'] = parse_list(html, 'types')
data['year'] = find_re(html, '<span class="year">.*?(\d+)') data['year'] = find_re(html, r'<span class="year">.*?(\d+)')
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)] #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html) data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True) #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)

View file

@@ -51,11 +51,11 @@ def get_movie_data(title, director):
'User-Agent': USER_AGENT 'User-Agent': USER_AGENT
} }
html = read_url(url, headers=headers, unicode=True) html = read_url(url, headers=headers, unicode=True)
results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html) results = re.compile(r'"(' + host + r'.*?poster\.jpg)"').findall(html)
if results: if results:
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg') data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True) html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
results = re.compile('"(' + host + '\S+\.mov)"').findall(html) results = re.compile(r'"(' + host + r'\S+\.mov)"').findall(html)
if results: if results:
data['trailer'] = results[-1] data['trailer'] = results[-1]
return data return data

View file

@@ -28,7 +28,7 @@ def get_data(id, language='en'):
if m: if m:
data['director'] = m[0] data['director'] = m[0]
m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html) m = re.compile(r"caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
if m: if m:
data['image'] = m[0] data['image'] = m[0]

View file

@@ -60,7 +60,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
url += '&start=%d' % offset url += '&start=%d' % offset
data = read_url(url, timeout=timeout) data = read_url(url, timeout=timeout)
data = re.sub('<span class="f">(.*?)</span>', '\\1', data) data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data): for a in re.compile(r'<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2])))) results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
if len(results) >= max_results: if len(results) >= max_results:
break break

View file

@@ -8,7 +8,7 @@ from ox.net import read_url
def get_poster_url(id): def get_poster_url(id):
url = 'http://piratecinema.org/posters/' url = 'http://piratecinema.org/posters/'
html = read_url(url).decode('utf-8') html = read_url(url).decode('utf-8')
results = re.compile('src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html) results = re.compile(r'src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
for result in results: for result in results:
if result[1] == id: if result[1] == id:
return url + result[0] return url + result[0]

View file

@@ -81,36 +81,36 @@ def get_movie_data(wikipedia_url):
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit(): if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
del filmbox['amg_id'] del filmbox['amg_id']
if 'Allmovie movie' in data: if 'Allmovie movie' in data:
filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)') filmbox['amg_id'] = find_re(data, r'Allmovie movie\|.*?(\d+)')
elif 'Allmovie title' in data: elif 'Allmovie title' in data:
filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)') filmbox['amg_id'] = find_re(data, r'Allmovie title\|.*?(\d+)')
if 'Official website' in data: if 'Official website' in data:
filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip() filmbox['website'] = find_re(data, r'Official website\|(.*?)}').strip()
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data) r = re.compile(r'{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
if r: if r:
filmbox['imdb_id'] = r[0] filmbox['imdb_id'] = r[0]
else: else:
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data) r = re.compile(r'{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
if r: if r:
filmbox['imdb_id'] = r[0] filmbox['imdb_id'] = r[0]
r = re.compile('{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data) r = re.compile(r'{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
if r: if r:
filmbox['archiveorg_id'] = r[0] filmbox['archiveorg_id'] = r[0]
r = re.compile('{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data) r = re.compile(r'{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
if r: if r:
filmbox['mojo_id'] = r[0].replace('id=', '') filmbox['mojo_id'] = r[0].replace('id=', '')
r = re.compile('{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data) r = re.compile(r'{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
if r: if r:
filmbox['rottentomatoes_id'] = r[0].replace('id=', '') filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
if 'google video' in data: if 'google video' in data:
filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]') filmbox['google_video_id'] = find_re(data, r'google video\|.*?(\d*?)[\|}]')
if 'DEFAULTSORT' in data: if 'DEFAULTSORT' in data:
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''') filmbox['title_sort'] = find_re(data, r'''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox return filmbox
def get_image_url(name): def get_image_url(name):

View file

@@ -32,7 +32,7 @@ def get_version():
f = open(changelog) f = open(changelog)
head = f.read().strip().split('\n')[0] head = f.read().strip().split('\n')[0]
f.close() f.close()
rev = re.compile('\d+\.\d+\.(\d+)').findall(head) rev = re.compile(r'\d+\.\d+\.(\d+)').findall(head)
if rev: if rev:
return '3.0.%s' % rev[0] return '3.0.%s' % rev[0]
return '3.0.x' return '3.0.x'