more raw regexp strings

This commit is contained in:
j 2024-08-30 13:30:47 +02:00
commit ae10c5c9b9
11 changed files with 45 additions and 45 deletions

View file

@@ -43,7 +43,7 @@ def get_data(id):
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['themes'] = parse_list(html, 'themes')
data['types'] = parse_list(html, 'types')
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
data['year'] = find_re(html, r'<span class="year">.*?(\d+)')
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)

View file

@@ -51,11 +51,11 @@ def get_movie_data(title, director):
'User-Agent': USER_AGENT
}
html = read_url(url, headers=headers, unicode=True)
results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
results = re.compile(r'"(' + host + r'.*?poster\.jpg)"').findall(html)
if results:
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
results = re.compile(r'"(' + host + r'\S+\.mov)"').findall(html)
if results:
data['trailer'] = results[-1]
return data

View file

@@ -28,7 +28,7 @@ def get_data(id, language='en'):
if m:
data['director'] = m[0]
m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
m = re.compile(r"caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
if m:
data['image'] = m[0]

View file

@@ -60,7 +60,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
url += '&start=%d' % offset
data = read_url(url, timeout=timeout)
data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
for a in re.compile(r'<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
if len(results) >= max_results:
break

View file

@@ -8,7 +8,7 @@ from ox.net import read_url
def get_poster_url(id):
url = 'http://piratecinema.org/posters/'
html = read_url(url).decode('utf-8')
results = re.compile('src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
results = re.compile(r'src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
for result in results:
if result[1] == id:
return url + result[0]

View file

@@ -81,36 +81,36 @@ def get_movie_data(wikipedia_url):
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
del filmbox['amg_id']
if 'Allmovie movie' in data:
filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
filmbox['amg_id'] = find_re(data, r'Allmovie movie\|.*?(\d+)')
elif 'Allmovie title' in data:
filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
filmbox['amg_id'] = find_re(data, r'Allmovie title\|.*?(\d+)')
if 'Official website' in data:
filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
filmbox['website'] = find_re(data, r'Official website\|(.*?)}').strip()
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
r = re.compile(r'{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
if r:
filmbox['imdb_id'] = r[0]
else:
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
r = re.compile(r'{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
if r:
filmbox['imdb_id'] = r[0]
r = re.compile('{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
r = re.compile(r'{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['archiveorg_id'] = r[0]
r = re.compile('{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
r = re.compile(r'{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['mojo_id'] = r[0].replace('id=', '')
r = re.compile('{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
r = re.compile(r'{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
if 'google video' in data:
filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
filmbox['google_video_id'] = find_re(data, r'google video\|.*?(\d*?)[\|}]')
if 'DEFAULTSORT' in data:
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
filmbox['title_sort'] = find_re(data, r'''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox
def get_image_url(name):