more raw regexp strings
parent 29a309f15e
commit ae10c5c9b9
11 changed files with 45 additions and 45 deletions
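The change throughout is mechanical: regexp patterns written as plain string literals are rewritten as raw strings, so sequences such as \d, \S or \( are no longer invalid string escapes. CPython passes unrecognized escapes through unchanged, but has emitted a DeprecationWarning for them since Python 3.6 (a SyntaxWarning in later releases), so the raw-string form behaves identically today and is future-proof. A minimal sketch of that equivalence, reusing the pattern from the first hunk below (the sample html value is hypothetical, not part of the commit):

    import re

    # '\d' in a plain literal is an invalid escape; writing '\\d' or using a
    # raw string both produce the same two-character pattern token.
    year_plain = re.compile('<span class="year">.*?(\\d+)')   # explicit escaping
    year_raw   = re.compile(r'<span class="year">.*?(\d+)')   # raw string, as in this commit

    html = '<span class="year">c. 1972</span>'
    assert year_plain.findall(html) == year_raw.findall(html) == ['1972']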
@@ -43,7 +43,7 @@ def get_data(id):
     data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
     data['themes'] = parse_list(html, 'themes')
     data['types'] = parse_list(html, 'types')
-    data['year'] = find_re(html, '<span class="year">.*?(\d+)')
+    data['year'] = find_re(html, r'<span class="year">.*?(\d+)')
     #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
     data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
     #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
@@ -51,11 +51,11 @@ def get_movie_data(title, director):
         'User-Agent': USER_AGENT
     }
     html = read_url(url, headers=headers, unicode=True)
-    results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
+    results = re.compile(r'"(' + host + r'.*?poster\.jpg)"').findall(html)
     if results:
         data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
     html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
-    results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
+    results = re.compile(r'"(' + host + r'\S+\.mov)"').findall(html)
     if results:
         data['trailer'] = results[-1]
     return data
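In the hunk above the pattern is assembled from several string literals around host; the r prefix applies per literal, not to the concatenated result, which is why both fragments are prefixed. A small sketch of that point, with a hypothetical host value standing in for whatever the surrounding function actually uses:

    import re

    host = 'http://movies.apple.com/'  # hypothetical value for illustration only

    # Each literal carries its own prefix; only the fragment containing
    # backslashes strictly needs it, but prefixing both keeps the line uniform.
    pattern = re.compile(r'"(' + host + r'\S+\.mov)"')
    assert pattern.findall('src="http://movies.apple.com/trailers/clip.mov"') == \
        ['http://movies.apple.com/trailers/clip.mov']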
@@ -28,7 +28,7 @@ def get_data(id, language='en'):
     if m:
         data['director'] = m[0]

-    m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
+    m = re.compile(r"caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
     if m:
         data['image'] = m[0]

@@ -60,7 +60,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
             url += '&start=%d' % offset
         data = read_url(url, timeout=timeout)
         data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
-        for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
+        for a in re.compile(r'<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
             results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
             if len(results) >= max_results:
                 break
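One context line in the hunk above stays as '\\1': in a plain literal '\1' is an octal escape (chr(1)), so the backreference has to be spelled either '\\1' or r'\1'. A short check of that equivalence (the sample span is made up for illustration):

    import re

    assert '\\1' == r'\1'    # same two-character string
    assert '\1' == chr(1)    # octal escape, not a backreference
    assert re.sub('<span class="f">(.*?)</span>', r'\1',
                  '<span class="f">term</span>') == 'term'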
@@ -8,7 +8,7 @@ from ox.net import read_url
 def get_poster_url(id):
     url = 'http://piratecinema.org/posters/'
     html = read_url(url).decode('utf-8')
-    results = re.compile('src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
+    results = re.compile(r'src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
     for result in results:
         if result[1] == id:
             return url + result[0]
@@ -81,36 +81,36 @@ def get_movie_data(wikipedia_url):
     if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
         del filmbox['amg_id']
     if 'Allmovie movie' in data:
-        filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
+        filmbox['amg_id'] = find_re(data, r'Allmovie movie\|.*?(\d+)')
     elif 'Allmovie title' in data:
-        filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
+        filmbox['amg_id'] = find_re(data, r'Allmovie title\|.*?(\d+)')

     if 'Official website' in data:
-        filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
+        filmbox['website'] = find_re(data, r'Official website\|(.*?)}').strip()

-    r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
+    r = re.compile(r'{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
     if r:
         filmbox['imdb_id'] = r[0]
     else:
-        r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
+        r = re.compile(r'{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
         if r:
             filmbox['imdb_id'] = r[0]

-    r = re.compile('{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
+    r = re.compile(r'{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
     if r:
         filmbox['archiveorg_id'] = r[0]

-    r = re.compile('{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
+    r = re.compile(r'{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
     if r:
         filmbox['mojo_id'] = r[0].replace('id=', '')

-    r = re.compile('{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
+    r = re.compile(r'{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
     if r:
         filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
     if 'google video' in data:
-        filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
+        filmbox['google_video_id'] = find_re(data, r'google video\|.*?(\d*?)[\|}]')
     if 'DEFAULTSORT' in data:
-        filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
+        filmbox['title_sort'] = find_re(data, r'''\{\{DEFAULTSORT:(.*?)\}\}''')
     return filmbox

 def get_image_url(name):
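The DEFAULTSORT line in the hunk above also shows that the r prefix combines with any quoting style, including the triple quotes already used there. A quick sketch with a made-up sample string:

    import re

    # The r prefix works with single, double, or triple quotes alike, so the
    # DEFAULTSORT pattern keeps its triple-quoted form unchanged.
    pat = re.compile(r'''\{\{DEFAULTSORT:(.*?)\}\}''')
    assert pat.findall('{{DEFAULTSORT:Godfather, The}}') == ['Godfather, The']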