escape strings
This commit is contained in:
parent ae10c5c9b9
commit 41edea1862

20 changed files with 74 additions and 74 deletions
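The pattern repeated across these hunks is the same one-line fix: regex patterns written as plain string literals become raw strings, because escapes like `\d` or `\(` are not valid Python string escapes. A minimal sketch of why this matters (pattern and input are illustrative):

```python
import re

# '\d' is an invalid escape in a plain string literal: Python 3.6+
# emits a DeprecationWarning when compiling the literal, and the
# backslash only survives by historical accident. The raw string
# makes the intent explicit:
plain = '<b>Spine #(\d+)'    # warns on newer Pythons
raw = r'<b>Spine #(\d+)'     # backslash guaranteed literal

assert plain == raw          # same value today; only the raw form is future-proof
print(re.search(raw, '<b>Spine #123').group(1))  # -> 123
```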
@@ -181,7 +181,7 @@ def highlight(text, query, hlClass="hl"):
     query = re.escape(query).replace(r'\ ', '.')
     m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
     for i in m:
-        text = re.sub(r"(%s)" % re.escape(i).replace(r'\ ', '.'), r'<span class="%s">\\1</span>' % hlClass, text)
+        text = re.sub(r"(%s)" % re.escape(i).replace(r'\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
     text = text.replace('|', '<br />')
     return text

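The change above is the one place in this commit where a raw-string prefix is removed rather than added: in a replacement string, `r'\\1'` reaches `re.sub` as two backslashes and is emitted as a literal `\1` instead of group 1. A quick illustration with a toy pattern:

```python
import re

print(re.sub(r"(hl)", r'<b>\\1</b>', "hl"))  # -> <b>\1</b>   literal backslash + 1, wrong
print(re.sub(r"(hl)", '<b>\\1</b>', "hl"))   # -> <b>hl</b>   group reference
print(re.sub(r"(hl)", r'<b>\1</b>', "hl"))   # -> <b>hl</b>   raw string, single backslash
```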
@@ -201,7 +201,7 @@ def parse_path(path, directory_key='director'):
     # '_.foo$' or '_ (' is '?'
     string = re.sub(re.compile(r'_(?=(\.\w+$| \())', re.U), '?', string)
     # ' _..._ ' is '<...>'
-    string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string)
+    string = re.sub(r'(?<= )_(.+)_(?= )', r'<\g<1>>', string)
     # 'foo_bar' or 'foo _ bar' is '/'
     string = re.sub(re.compile(r'(?<=\w)_(?=\w)', re.U), '/', string)
     string = re.sub(' _ ', ' / ', string)

@@ -243,7 +243,7 @@ def parse_path(path, directory_key='director'):
     data['year'] = match.group(0)[2:-1] if match else None
     file_title = re.sub('[/:]', '_', data['title'])
     # (remove title from beginning of filename if the rest contains a dot)
-    file = re.sub('^' + re.escape(file_title) + '(?=.*\.)', '', file)
+    file = re.sub(r'^' + re.escape(file_title) + r'(?=.*\.)', '', file)
     # (split by nospace+dot+word, but remove spaces preceding extension)
     parts = re.split(r'(?<!\s)\.(?=\w)', re.sub(r'\s+(?=.\w+$)', '', file))
     title, parts, extension = [

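The comments in the first parse_path hunk describe how underscores in a path encode characters that are unsafe in filenames. Assembled from the context lines above into a runnable sketch (the sample strings are made up):

```python
import re

def decode_underscores(string):
    # '_.foo$' or '_ (' is '?'
    string = re.sub(re.compile(r'_(?=(\.\w+$| \())', re.U), '?', string)
    # ' _..._ ' is '<...>'
    string = re.sub(r'(?<= )_(.+)_(?= )', r'<\g<1>>', string)
    # 'foo_bar' or 'foo _ bar' is '/'
    string = re.sub(re.compile(r'(?<=\w)_(?=\w)', re.U), '/', string)
    string = re.sub(' _ ', ' / ', string)
    return string

print(decode_underscores('Who Framed Roger Rabbit_ (1988)'))  # -> ...Rabbit? (1988)
print(decode_underscores('8_2'))                              # -> 8/2
```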
@@ -36,7 +36,7 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
     except:
         html = read_url(data["url"], timeout=timeout).decode('utf-8', 'ignore')

-    data["number"] = find_re(html, "<b>Spine #(\d+)")
+    data["number"] = find_re(html, r"<b>Spine #(\d+)")

     data["title"] = decode_html(find_re(html, "<h1 class=\"header__primarytitle\".*?>(.*?)</h1>"))
     data["title"] = data["title"].split(' \u2014 The Television Version')[0].strip()

@@ -77,7 +77,7 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
         data["posters"] = [result.replace("_w100", "")]
     else:
         data["posters"] = []
-    data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
+    data['posters'] = [re.sub(r'(\?\d+)$', '', p) for p in data['posters']]
     data['posters'] = [p for p in data['posters'] if p]

     posters = find_re(html, '<div class="product-box-art".*?>(.*?)</div>')

@@ -103,12 +103,12 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
 def get_ids(page=None):
     ids = []
     html = read_url("https://www.criterion.com/shop/browse/list?sort=spine_number", unicode=True)
-    results = re.compile("films/(\d+)-").findall(html)
+    results = re.compile(r"films/(\d+)-").findall(html)
     ids += results
-    results = re.compile("boxsets/(.*?)\"").findall(html)
+    results = re.compile(r"boxsets/(.*?)\"").findall(html)
     for result in results:
         html = read_url("https://www.criterion.com/boxsets/" + result, unicode=True)
-        results = re.compile("films/(\d+)-").findall(html)
+        results = re.compile(r"films/(\d+)-").findall(html)
         ids += results
     return sorted(set(ids), key=int)

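find_re, used throughout these modules, comes from the ox helpers. For readers following along, a minimal stand-in, assuming its usual behavior in this codebase (first match, first captured group, empty string when nothing matches):

```python
import re

def find_re(string, regexp):
    # stand-in for ox.find_re: first match of the pattern, first
    # captured group if there is one, '' otherwise
    result = re.compile(regexp, re.DOTALL).findall(string)
    if result:
        result = result[0]
        if isinstance(result, tuple):   # multiple groups -> tuple per match
            result = result[0]
        return result.strip()
    return ''

print(find_re('<b>Spine #123</b>', r'<b>Spine #(\d+)'))  # -> 123
print(find_re('no match here', r'<b>Spine #(\d+)'))      # -> ''
```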
@@ -25,10 +25,10 @@ def get_show_data(url):
     data = read_url(url, unicode=True)
     r = {}
     r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
-    r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
+    r['imdb'] = find_re(data, r'<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
     r['episodes'] = {}
     #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
-    for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
+    for episode in re.compile(r'(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
         air_date = episode[3].strip()
         #'22 Sep 04' -> 2004-09-22
         try:

@@ -42,7 +42,7 @@ def get_show_data(url):
                 'prod code': episode[2],
                 'air date': air_date,
                 'url': episode[4],
-                'title':episode[5],
+                'title': episode[5],
             }
         except:
             print("oxweb.epguides failed,", url)

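The '22 Sep 04' -> 2004-09-22 comment above describes the date normalization done inside the try block; time.strptime handles the conversion (the format string is inferred from that comment, and %b assumes an English locale):

```python
import time

air_date = '22 Sep 04'
parsed = time.strptime(air_date, '%d %b %y')   # %y maps 04 -> 2004
print(time.strftime('%Y-%m-%d', parsed))       # -> 2004-09-22
```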
@@ -11,7 +11,7 @@ def get_ids():
     for i in string.ascii_uppercase:
         url = "http://www.filmsdivision.org/search.php?title=%s" % i
         data = ox.cache.read_url(url)
-        links = re.compile('view_video.php\?movId=(.*?)[\'"]', re.DOTALL).findall(data)
+        links = re.compile(r'view_video.php\?movId=(.*?)[\'"]', re.DOTALL).findall(data)
         result += links
     return list(set(result))

@@ -20,20 +20,20 @@ def get_data(id):
     url = "http://www.filmsdivision.org/view_video.php?movId=%s" % id
     data = ox.cache.read_url(url)
     result['title'] = re.compile('<td.*?class="vdoheadtxt".*?>(.*?)</td>').findall(data)[0]
-    result['year'] = re.compile('Release: (\d{4})').findall(data)[0]
-    result['duration'] = int(re.compile('Duration: (\d+)mins').findall(data)[0]) * 60
-    result['producer'] = re.compile('Producer: (.*?)\t').findall(data)[0].strip()
+    result['year'] = re.compile(r'Release: (\d{4})').findall(data)[0]
+    result['duration'] = int(re.compile(r'Duration: (\d+)mins').findall(data)[0]) * 60
+    result['producer'] = re.compile(r'Producer: (.*?)\t').findall(data)[0].strip()
     if 'Director:' in data:
-        result['director'] = re.compile('Director: (.*?)\t').findall(data)[0].strip()
+        result['director'] = re.compile(r'Director: (.*?)\t').findall(data)[0].strip()
     else:
         result['director'] = "Unknown Director"
-    result['url'] = re.compile('value="(.*?.wmv)"').findall(data)[0]
+    result['url'] = re.compile(r'value="(.*?.wmv)"').findall(data)[0]
     return result

 def download_video(url, filename):
     dirname = os.path.dirname(filename)
     if not os.path.exists(dirname):
         os.makedirs(dirname)
-    p = subprocess.Popen(['gst-launch', 'mmssrc', 'location=%s'%url, '!', 'filesink', 'locaiton='%filename])
+    p = subprocess.Popen(['gst-launch', 'mmssrc', 'location=%s' % url, '!', 'filesink', 'location=%s' % filename])
     p.wait()
     return p.returncode == 0

@@ -30,13 +30,13 @@ class Freebase(dict):
             'metacritic': '/source/metacritic/movie',
         }
         for key in keys:
-            links = filter(lambda x: x['namespace'] == keys[key],data['ids'])
+            links = filter(lambda x: x['namespace'] == keys[key], data['ids'])
             if links:
                 self[key] = links[0]['uri']

         if 'nytimes' in self:
             self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
-            self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')
+            self['amgId'] = find_re(self['nytimes'], r'movie/(\d+)/')

@@ -31,13 +31,13 @@ def get_data(id):
         'url': get_url(id)
     }
     html = read_url(data['url'])
-    data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
+    data['imdbId'] = find_re(html, r'imdb.com/title/tt(\d{7})')
     if not data['imdbId']:
         data['imdbId'] = _id_map.get(id, '')
-    data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
-    data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
+    data['title'] = strip_tags(find_re(html, r'<p class="name white">(.*?) \(<a href="alpha1.html">'))
+    data['year'] = find_re(html, r'\(<a href="alpha1.html">(.*?)</a>\)')
     data['posters'] = []
-    poster = find_re(html, '<img src="(posters.*?)"')
+    poster = find_re(html, r'<img src="(posters.*?)"')
     if poster:
         poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
         data['posters'].append(poster)

@@ -46,7 +46,7 @@ def get_data(id):
         result = result.replace('_xlg.html', '.html')
         url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
         html = read_url(url)
-        result = find_re(html, '<a href = (\w*?_xlg.html)')
+        result = find_re(html, r'<a href = (\w*?_xlg.html)')
         if result:
             url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
             html = read_url(url)

@@ -62,7 +62,7 @@ def get_id(url):
     split = split[4][:-5].split('_')
     if split[-1] == 'xlg':
         split.pop()
-    if find_re(split[-1], 'ver\d+$'):
+    if find_re(split[-1], r'ver\d+$'):
         split.pop()
     id = '%s/%s' % (year, '_'.join(split))
     return id

@@ -72,7 +72,7 @@ def get_ids(page=None):
     ids = []
     if page:
         html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1)
-        results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
+        results = re.compile(r'<a href = \.\./(.*?)>', re.DOTALL).findall(html)
         for result in results:
             url = 'http://impawards.com/%s' % result
             ids.append(get_id(url))

@@ -97,8 +97,8 @@ def parse_movies(xml, title):
         strings.pop()
         for string in strings:
             list.append({
-                'id': find_re(string, 'viewMovie\?id=(.*?)&'),
-                'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
+                'id': find_re(string, r'viewMovie\?id=(.*?)&'),
+                'title': find_re(string, r'<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
             })
         return list
     except:

@@ -115,7 +115,7 @@ class ItunesAlbum:
     def get_id(self):
         url = compose_url('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
         xml = read_url(url, headers = ITUNES_HEADERS)
-        id = find_re(xml, 'viewAlbum\?id=(.*?)&')
+        id = find_re(xml, r'viewAlbum\?id=(.*?)&')
         return id

     def get_data(self):

@@ -146,7 +146,7 @@ class ItunesMovie:
     def get_id(self):
         url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
         xml = read_url(url, headers = ITUNES_HEADERS)
-        id = find_re(xml, 'viewMovie\?id=(.*?)&')
+        id = find_re(xml, r'viewMovie\?id=(.*?)&')
         return id

     def get_data(self):

@@ -170,7 +170,7 @@ class ItunesMovie:
         data['releaseDate'] = find_re(xml, 'Released(.*?)<')
         data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
         data['screenwriters'] = parse_cast(xml, 'screenwriters')
-        data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
+        data['soundtrackId'] = find_re(xml, r'viewAlbum\?id=(.*?)&')
         data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
         return data

@@ -32,7 +32,7 @@ def get_data(isbn):
             r[key] = ''
         if key == 'pages' and r[key]:
             r[key] = int(r[key])
-    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
+    desc = find_re(data, r'<h2>Description:<\/h2>(.*?)<div ')
     desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
     r['description'] = strip_tags(desc).strip()
     if r['description'] == u'Description of this item is not available at this time.':

@@ -12,7 +12,7 @@ def get_lyrics(title, artist):
     key = find_re(html, '<font color=green><b>(.*?)</b></font>')
     url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
     xml = read_url(url)
-    lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
+    lyrics = find_re(xml, r'<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
     lyrics = lyrics.replace('\n', '').replace('\r', '')
     lyrics = lyrics.replace('[br]', '\n').strip()
     lyrics.replace('\n\n\n', '\n\n')

@@ -23,7 +23,7 @@ def get_show_url(title):
     title = quote(title)
     url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
     data = read_url(url)
-    return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
+    return find_re(data, r'(http://www.metacritic.com/tv/shows/.*?)\?')

 def get_data(url):
     data = read_url(url, unicode=True)

@@ -28,13 +28,13 @@ def get_posters(url, group=True, timeout=-1):
     html = read_url(url, timeout=timeout, unicode=True)
     if url in html:
         if group:
-            results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
+            results = re.compile(r'<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
             for result in results:
                 posters += get_posters(result, False)
-        results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
+        results = re.compile(r'<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
         for result in results:
             html = read_url(result, timeout=timeout, unicode=True)
-            posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
+            posters.append(find_re(html, r'"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
     return posters

 def get_url(id):

@@ -24,8 +24,8 @@ def get_data(url):
     r = {}
     r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
     if '(' in r['title']:
-        r['year'] = find_re(r['title'], '\((\d*?)\)')
-        r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
+        r['year'] = find_re(r['title'], r'\((\d*?)\)')
+        r['title'] = strip_tags(re.sub(r'\((\d*?)\)', '', r['title'])).strip()
     r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
     r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
     if not r['summary']:

@@ -35,9 +35,9 @@ def get_data(url):
     meter = [m for m in meter if m[1].isdigit()]
     if meter:
         r['tomatometer'] = meter[0][1]
-    r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
-    r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
-    r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
+    r['rating'] = find_re(data, r'Average Rating: <span>([\d.]+)/10</span>')
+    r['user_score'] = find_re(data, r'<span class="meter popcorn numeric ">(\d+)</span>')
+    r['user_rating'] = find_re(data, r'Average Rating: ([\d.]+)/5')
     poster = get_og(data, 'image')
     if poster and not 'poster_default.gif' in poster:
         r['posters'] = [poster]

@@ -106,7 +106,7 @@ def get_issue(year, week):
     url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
     contents = []
     data = ox.cache.read_url(url)
-    items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
+    items = re.compile(r'<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
     for item in items:
         item = item[1]
         page = int(re.compile('&SE=(.*?)"').findall(item)[0])

@@ -50,10 +50,10 @@ def find_movies(query=None, imdb=None, max_results=10):
 def get_id(piratebayId):
     if piratebayId.startswith('http://torrents.thepiratebay.org/'):
         piratebayId = piratebayId.split('org/')[1]
-    d = find_re(piratebayId, "tor/(\d+)")
+    d = find_re(piratebayId, r"tor/(\d+)")
     if d:
         piratebayId = d
-    d = find_re(piratebayId, "torrent/(\d+)")
+    d = find_re(piratebayId, r"torrent/(\d+)")
     if d:
         piratebayId = d
     return piratebayId

@@ -72,26 +72,26 @@ def get_data(piratebayId):
     }
     piratebayId = get_id(piratebayId)
     torrent = dict()
-    torrent[u'id'] = piratebayId
-    torrent[u'domain'] = 'thepiratebay.org'
-    torrent[u'comment_link'] = baseurl + 'torrent/%s' % piratebayId
+    torrent['id'] = piratebayId
+    torrent['domain'] = 'thepiratebay.org'
+    torrent['comment_link'] = baseurl + 'torrent/%s' % piratebayId

     data = read_url(torrent['comment_link'], unicode=True)
-    torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
-    if not torrent[u'title']:
+    torrent['title'] = find_re(data, r'<title>(.*?) \(download torrent\) - TPB</title>')
+    if not torrent['title']:
         return None
-    torrent[u'title'] = decode_html(torrent[u'title']).strip()
-    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
+    torrent['title'] = decode_html(torrent['title']).strip()
+    torrent['imdbId'] = find_re(data, r'title/tt(\d{7})')
     title = quote(torrent['title'].encode('utf-8'))
-    torrent[u'magent_link']= find_re(data, '"(magnet:.*?)"')
-    torrent[u'infohash'] = find_re(torrent[u'magent_link'], "btih:(.*?)&")
-    for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
+    torrent['magent_link'] = find_re(data, r'"(magnet:.*?)"')
+    torrent['infohash'] = find_re(torrent['magent_link'], "btih:(.*?)&")
+    for d in re.compile(r'dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
         value = decode_html(strip_tags(d[1].strip()))
-        if not '<' in key:
+        if '<' not in key:
            torrent[key] = value
-    torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
-    if torrent[u'description']:
+    torrent['description'] = find_re(data, '<div class="nfo">(.*?)</div>')
+    if torrent['description']:
         torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
     return torrent

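Alongside the raw-string escapes, this hunk drops the u'' prefixes: in Python 3 every str literal is already unicode, so the prefix is inert and both spellings produce the same dict key:

```python
d = {}
d[u'title'] = 'The Title'
assert u'title' == 'title'
assert d['title'] == 'The Title'   # same key either way
```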
@@ -22,7 +22,7 @@ def get_episode_data(url):
     #episode score
     r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')

-    match = re.compile('Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
+    match = re.compile(r'Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
     if match:
         r['season'] = int(match[0][1])
         r['episode'] = int(match[0][0])

@@ -24,10 +24,10 @@ def find(query=None, user=None, timeout=60):
         user = re.compile('data-name="(.*?)"').findall(t)[0]
         user = ox.decode_html(ox.strip_tags(user)).strip()
         tweets.append({
-            'id': re.compile('data-tweet-id="(\d+)"').findall(t)[0],
-            'user-id': re.compile('data-user-id="(\d+)"').findall(t)[0],
-            'name': re.compile('data-screen-name="(.*?)"').findall(t)[0],
-            'time': datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),
+            'id': re.compile(r'data-tweet-id="(\d+)"').findall(t)[0],
+            'user-id': re.compile(r'data-user-id="(\d+)"').findall(t)[0],
+            'name': re.compile(r'data-screen-name="(.*?)"').findall(t)[0],
+            'time': datetime.fromtimestamp(int(re.compile(r'data-time="(\d+)"').findall(t)[0])),
             'user': user,
             'text': text,
             'html': html,

@@ -43,7 +43,7 @@ def get_data(url):
     if not 'url' in m:
         print(url, 'missing')
     if 'title' in m:
-        m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])
+        m['title'] = re.sub(r'(.*?) \(\d{4}\)$', '\\1', m['title'])

     if not 'title' in m:
         match = re.compile('<span id="ubuwork">(.*?)</span>').findall(data)

@@ -52,7 +52,7 @@ def get_data(url):
     if not 'title' in m:
         match = re.compile("<title>.*?&(.*?)</title>", re.DOTALL).findall(data)
         if match:
-            m['title'] = re.sub('\s+', ' ', match[0]).strip()
+            m['title'] = re.sub(r'\s+', ' ', match[0]).strip()
             if ' - ' in m['title']:
                 m['title'] = m['title'].split(' - ', 1)[-1]
     if 'title' in m:

@@ -83,7 +83,7 @@ def get_data(url):
     if len(txt) > 1 and txt[0].strip() == m.get('title'):
         txt = txt[1:]
     m['description'] = '\n\n'.join(txt).split('RESOURCES')[0].split('RELATED')[0].strip()
-    y = re.compile('\((\d{4})\)').findall(data)
+    y = re.compile(r'\((\d{4})\)').findall(data)
     if y:
         m['year'] = int(y[0])
     d = re.compile('Director: (.+)').findall(data)

@@ -98,7 +98,7 @@ def get_data(url):
     if a:
         m['artist'] = strip_tags(decode_html(a[0][1])).strip()
     else:
-        a = re.compile('<b>(.*?)\(b\..*?\d{4}\)').findall(data)
+        a = re.compile(r'<b>(.*?)\(b\..*?\d{4}\)').findall(data)
         if a:
             m['artist'] = strip_tags(decode_html(a[0])).strip()
     elif m['id'] == 'film/lawder_color':

@@ -125,11 +125,11 @@ def get_ids():
     data = read_url('http://www.ubu.com/film/')
     ids = []
     author_urls = []
-    for url, author in re.compile('<a href="(\./.*?)">(.*?)</a>').findall(data):
+    for url, author in re.compile(r'<a href="(\./.*?)">(.*?)</a>').findall(data):
         url = 'http://www.ubu.com/film' + url[1:]
         data = read_url(url)
         author_urls.append(url)
-        for u, title in re.compile('<a href="(.*?)">(.*?)</a>').findall(data):
+        for u, title in re.compile(r'<a href="(.*?)">(.*?)</a>').findall(data):
             if not u.startswith('http'):
                 if u == '../../sound/burroughs.html':
                     u = 'http://www.ubu.com/sound/burroughs.html'

@@ -145,7 +145,7 @@ def get_ids():
 def get_sound_ids():
     data = read_url('http://www.ubu.com/sound/')
     ids = []
-    for url, author in re.compile('<a href="(\./.*?)">(.*?)</a>').findall(data):
+    for url, author in re.compile(r'<a href="(\./.*?)">(.*?)</a>').findall(data):
         url = 'http://www.ubu.com/sound' + url[1:]
         ids.append(url)
     ids = [get_id(url) for url in sorted(set(ids))]

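The loops above turn relative links like './author.html' into absolute URLs by string slicing; as an aside (not what the module uses), urllib.parse.urljoin gives the same result:

```python
from urllib.parse import urljoin

print(urljoin('http://www.ubu.com/film/', './lawder.html'))
# -> http://www.ubu.com/film/lawder.html
print('http://www.ubu.com/film' + './lawder.html'[1:])   # the slicing used above
# -> http://www.ubu.com/film/lawder.html
```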
@@ -50,7 +50,7 @@ def get_movie_data(wikipedia_url):
     if not wikipedia_url.startswith('http'):
         wikipedia_url = get_url(wikipedia_url)
     data = get_wiki_data(wikipedia_url)
-    filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
+    filmbox_data = find_re(data, r'''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
     filmbox = {}
     _box = filmbox_data.strip().split('|')
     for row in _box:

@@ -46,7 +46,7 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):
 def get_video_info(id):
     eurl = get_url(id)
     data = read_url(eurl).decode('utf-8')
-    t = re.compile('\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
+    t = re.compile(r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
     if t:
         t = t[0]
     else:

@@ -162,7 +162,7 @@ def videos(id, format=''):
 def playlist(url):
     data = read_url(url).decode('utf-8')
     items = []
-    for i in list(set(re.compile('<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
+    for i in list(set(re.compile(r'<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
         items.append({
             'title': i[1],
             'url': 'http://www.youtube.com' + i[0].split('&')[0]