diff --git a/ox/html.py b/ox/html.py index 06ae96f..50d91bf 100644 --- a/ox/html.py +++ b/ox/html.py @@ -181,7 +181,7 @@ def highlight(text, query, hlClass="hl"): query = re.escape(query).replace(r'\ ', '.') m = re.compile("(%s)" % query, re.IGNORECASE).findall(text) for i in m: - text = re.sub(r"(%s)" % re.escape(i).replace(r'\ ', '.'), r'\\1' % hlClass, text) + text = re.sub(r"(%s)" % re.escape(i).replace(r'\ ', '.'), '\\1' % hlClass, text) text = text.replace('|', '
') return text diff --git a/ox/movie.py b/ox/movie.py index 314df47..1f9a824 100644 --- a/ox/movie.py +++ b/ox/movie.py @@ -201,7 +201,7 @@ def parse_path(path, directory_key='director'): # '_.foo$' or '_ (' is '?' string = re.sub(re.compile(r'_(?=(\.\w+$| \())', re.U), '?', string) # ' _..._ ' is '<...>' - string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string) + string = re.sub(r'(?<= )_(.+)_(?= )', r'<\g<1>>', string) # 'foo_bar' or 'foo _ bar' is '/' string = re.sub(re.compile(r'(?<=\w)_(?=\w)', re.U), '/', string) string = re.sub(' _ ', ' / ', string) @@ -243,7 +243,7 @@ def parse_path(path, directory_key='director'): data['year'] = match.group(0)[2:-1] if match else None file_title = re.sub('[/:]', '_', data['title']) # (remove title from beginning of filename if the rest contains a dot) - file = re.sub('^' + re.escape(file_title) + '(?=.*\.)', '', file) + file = re.sub(r'^' + re.escape(file_title) + r'(?=.*\.)', '', file) # (split by nospace+dot+word, but remove spaces preceding extension) parts = re.split(r'(?Spine #(\d+)") + data["number"] = find_re(html, r"Spine #(\d+)") data["title"] = decode_html(find_re(html, "

(.*?)

")) data["title"] = data["title"].split(' \u2014 The Television Version')[0].strip() @@ -77,7 +77,7 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): data["posters"] = [result.replace("_w100", "")] else: data["posters"] = [] - data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']] + data['posters'] = [re.sub(r'(\?\d+)$', '', p) for p in data['posters']] data['posters'] = [p for p in data['posters'] if p] posters = find_re(html, '
(.*?)
') @@ -103,12 +103,12 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): def get_ids(page=None): ids = [] html = read_url("https://www.criterion.com/shop/browse/list?sort=spine_number", unicode=True) - results = re.compile("films/(\d+)-").findall(html) + results = re.compile(r"films/(\d+)-").findall(html) ids += results - results = re.compile("boxsets/(.*?)\"").findall(html) + results = re.compile(r"boxsets/(.*?)\"").findall(html) for result in results: html = read_url("https://www.criterion.com/boxsets/" + result, unicode=True) - results = re.compile("films/(\d+)-").findall(html) + results = re.compile(r"films/(\d+)-").findall(html) ids += results return sorted(set(ids), key=int) diff --git a/ox/web/epguides.py b/ox/web/epguides.py index 65670e7..97ce82a 100644 --- a/ox/web/epguides.py +++ b/ox/web/epguides.py @@ -25,10 +25,10 @@ def get_show_data(url): data = read_url(url, unicode=True) r = {} r['title'] = strip_tags(find_re(data, '

(.*?)

')) - r['imdb'] = find_re(data, '

.*?

') + r['imdb'] = find_re(data, r'

.*?

') r['episodes'] = {} #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear - for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) (.*?)').findall(data): + for episode in re.compile(r'(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) (.*?)').findall(data): air_date = episode[3].strip() #'22 Sep 04' -> 2004-09-22 try: @@ -42,7 +42,7 @@ def get_show_data(url): 'prod code': episode[2], 'air date': air_date, 'url': episode[4], - 'title':episode[5], + 'title': episode[5], } except: print("oxweb.epguides failed,", url) diff --git a/ox/web/filmsdivision.py b/ox/web/filmsdivision.py index d3f9185..36c5c62 100644 --- a/ox/web/filmsdivision.py +++ b/ox/web/filmsdivision.py @@ -11,7 +11,7 @@ def get_ids(): for i in string.ascii_uppercase: url = "http://www.filmsdivision.org/search.php?title=%s" % i data = ox.cache.read_url(url) - links = re.compile('view_video.php\?movId=(.*?)[\'"]', re.DOTALL).findall(data) + links = re.compile(r'view_video.php\?movId=(.*?)[\'"]', re.DOTALL).findall(data) result += links return list(set(result)) @@ -20,20 +20,20 @@ def get_data(id): url = "http://www.filmsdivision.org/view_video.php?movId=%s" % id data = ox.cache.read_url(url) result['title'] = re.compile('(.*?)').findall(data)[0] - result['year'] = re.compile('Release: (\d{4})').findall(data)[0] - result['duration'] = int(re.compile('Duration: (\d+)mins').findall(data)[0]) * 60 - result['producer'] = re.compile('Producer: (.*?)\t').findall(data)[0].strip() + result['year'] = re.compile(r'Release: (\d{4})').findall(data)[0] + result['duration'] = int(re.compile(r'Duration: (\d+)mins').findall(data)[0]) * 60 + result['producer'] = re.compile(r'Producer: (.*?)\t').findall(data)[0].strip() if 'Director:' in data: - result['director'] = re.compile('Director: (.*?)\t').findall(data)[0].strip() + result['director'] = re.compile(r'Director: (.*?)\t').findall(data)[0].strip() else: result['director'] = "Unknown Director" - result['url'] = re.compile('value="(.*?.wmv)"').findall(data)[0] + 
result['url'] = re.compile(r'value="(.*?.wmv)"').findall(data)[0] return result def download_video(url, filename): dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) - p = subprocess.Popen(['gst-launch', 'mmssrc', 'location=%s'%url, '!', 'filesink', 'locaiton='%filename]) + p = subprocess.Popen(['gst-launch', 'mmssrc', 'location=%s' % url, '!', 'filesink', 'location=%s' % filename]) p.wait() return p.returncode == 0 diff --git a/ox/web/freebase.py b/ox/web/freebase.py index c1cf37e..4458eee 100644 --- a/ox/web/freebase.py +++ b/ox/web/freebase.py @@ -30,13 +30,13 @@ class Freebase(dict): 'metacritic': '/source/metacritic/movie', } for key in keys: - links = filter(lambda x: x['namespace'] == keys[key],data['ids']) + links = filter(lambda x: x['namespace'] == keys[key], data['ids']) if links: self[key] = links[0]['uri'] if 'nytimes' in self: self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-')) - self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/') + self['amgId'] = find_re(self['nytimes'], r'movie/(\d+)/') diff --git a/ox/web/impawards.py b/ox/web/impawards.py index 28e9d64..d497ad0 100644 --- a/ox/web/impawards.py +++ b/ox/web/impawards.py @@ -31,13 +31,13 @@ def get_data(id): 'url': get_url(id) } html = read_url(data['url']) - data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})') + data['imdbId'] = find_re(html, r'imdb.com/title/tt(\d{7})') if not data['imdbId']: data['imdbId'] = _id_map.get(id, '') - data['title'] = strip_tags(find_re(html, '

(.*?) \(')) - data['year'] = find_re(html, '\((.*?)\)') + data['title'] = strip_tags(find_re(html, r'

(.*?) \(')) + data['year'] = find_re(html, r'\((.*?)\)') data['posters'] = [] - poster = find_re(html, '', re.DOTALL).findall(html) + results = re.compile(r'', re.DOTALL).findall(html) for result in results: url = 'http://impawards.com/%s' % result ids.append(get_id(url)) diff --git a/ox/web/itunes.py b/ox/web/itunes.py index bb85952..bc33361 100644 --- a/ox/web/itunes.py +++ b/ox/web/itunes.py @@ -97,8 +97,8 @@ def parse_movies(xml, title): strings.pop() for string in strings: list.append({ - 'id': find_re(string, 'viewMovie\?id=(.*?)&'), - 'title': find_re(string, '(.*?)') + 'id': find_re(string, r'viewMovie\?id=(.*?)&'), + 'title': find_re(string, r'(.*?)') }) return list except: @@ -115,7 +115,7 @@ class ItunesAlbum: def get_id(self): url = compose_url('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist}) xml = read_url(url, headers = ITUNES_HEADERS) - id = find_re(xml, 'viewAlbum\?id=(.*?)&') + id = find_re(xml, r'viewAlbum\?id=(.*?)&') return id def get_data(self): @@ -146,7 +146,7 @@ class ItunesMovie: def get_id(self): url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director}) xml = read_url(url, headers = ITUNES_HEADERS) - id = find_re(xml, 'viewMovie\?id=(.*?)&') + id = find_re(xml, r'viewMovie\?id=(.*?)&') return id def get_data(self): @@ -170,7 +170,7 @@ class ItunesMovie: data['releaseDate'] = find_re(xml, 'Released(.*?)<') data['runTime'] = find_re(xml, 'Run Time:(.*?)<') data['screenwriters'] = parse_cast(xml, 'screenwriters') - data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&') + data['soundtrackId'] = find_re(xml, r'viewAlbum\?id=(.*?)&') data['trailerUrl'] = find_re(xml, 'autoplay="." 
url="(.*?)"') return data diff --git a/ox/web/lookupbyisbn.py b/ox/web/lookupbyisbn.py index b4c2dcb..f2a4fbb 100644 --- a/ox/web/lookupbyisbn.py +++ b/ox/web/lookupbyisbn.py @@ -32,7 +32,7 @@ def get_data(isbn): r[key] = '' if key == 'pages' and r[key]: r[key] = int(r[key]) - desc = find_re(data, '

Description:<\/h2>(.*?)
Description:<\/h2>(.*?)

', ' ').replace('
', ' ').replace('
', ' ') r['description'] = strip_tags(desc).strip() if r['description'] == u'Description of this item is not available at this time.': diff --git a/ox/web/lyricsfly.py b/ox/web/lyricsfly.py index b69cda9..253b0a4 100644 --- a/ox/web/lyricsfly.py +++ b/ox/web/lyricsfly.py @@ -12,7 +12,7 @@ def get_lyrics(title, artist): key = find_re(html, '(.*?)') url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title) xml = read_url(url) - lyrics = find_re(xml, '(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com') + lyrics = find_re(xml, r'(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com') lyrics = lyrics.replace('\n', '').replace('\r', '') lyrics = lyrics.replace('[br]', '\n').strip() lyrics.replace('\n\n\n', '\n\n') diff --git a/ox/web/metacritic.py b/ox/web/metacritic.py index 8c59998..226e937 100644 --- a/ox/web/metacritic.py +++ b/ox/web/metacritic.py @@ -23,7 +23,7 @@ def get_show_url(title): title = quote(title) url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title data = read_url(url) - return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?') + return find_re(data, r'(http://www.metacritic.com/tv/shows/.*?)\?') def get_data(url): data = read_url(url, unicode=True) diff --git a/ox/web/movieposterdb.py b/ox/web/movieposterdb.py index eb90910..73d6f7c 100644 --- a/ox/web/movieposterdb.py +++ b/ox/web/movieposterdb.py @@ -28,13 +28,13 @@ def get_posters(url, group=True, timeout=-1): html = read_url(url, timeout=timeout, unicode=True) if url in html: if group: - results = re.compile('
', re.DOTALL).findall(html) + results = re.compile(r'', re.DOTALL).findall(html) for result in results: posters += get_posters(result, False) - results = re.compile('', re.DOTALL).findall(html) + results = re.compile(r'', re.DOTALL).findall(html) for result in results: html = read_url(result, timeout=timeout, unicode=True) - posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"')) + posters.append(find_re(html, r'"(http://www.movieposterdb.com/posters/.+?\.jpg)"')) return posters def get_url(id): diff --git a/ox/web/rottentomatoes.py b/ox/web/rottentomatoes.py index 605f313..4b4285a 100644 --- a/ox/web/rottentomatoes.py +++ b/ox/web/rottentomatoes.py @@ -24,8 +24,8 @@ def get_data(url): r = {} r['title'] = find_re(data, '

(.*?)

') if '(' in r['title']: - r['year'] = find_re(r['title'], '\((\d*?)\)') - r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip() + r['year'] = find_re(r['title'], r'\((\d*?)\)') + r['title'] = strip_tags(re.sub(r'\((\d*?)\)', '', r['title'])).strip() r['summary'] = strip_tags(find_re(data, '

(.*?)

')).strip() r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace(' ', ' ').replace(' ', ' ') if not r['summary']: @@ -35,9 +35,9 @@ def get_data(url): meter = [m for m in meter if m[1].isdigit()] if meter: r['tomatometer'] = meter[0][1] - r['rating'] = find_re(data, 'Average Rating: ([\d.]+)/10') - r['user_score'] = find_re(data, '(\d+)') - r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5') + r['rating'] = find_re(data, r'Average Rating: ([\d.]+)/10') + r['user_score'] = find_re(data, r'(\d+)') + r['user_rating'] = find_re(data, r'Average Rating: ([\d.]+)/5') poster = get_og(data, 'image') if poster and not 'poster_default.gif' in poster: r['posters'] = [poster] diff --git a/ox/web/spiegel.py b/ox/web/spiegel.py index 455aec8..77acd30 100644 --- a/ox/web/spiegel.py +++ b/ox/web/spiegel.py @@ -106,7 +106,7 @@ def get_issue(year, week): url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week) contents = [] data = ox.cache.read_url(url) - items = re.compile('(.*?)
').findall(data) + items = re.compile(r'(.*?)').findall(data) for item in items: item = item[1] page = int(re.compile('&SE=(.*?)"').findall(item)[0]) diff --git a/ox/web/thepiratebay.py b/ox/web/thepiratebay.py index cbbdf56..28656c8 100644 --- a/ox/web/thepiratebay.py +++ b/ox/web/thepiratebay.py @@ -50,10 +50,10 @@ def find_movies(query=None, imdb=None, max_results=10): def get_id(piratebayId): if piratebayId.startswith('http://torrents.thepiratebay.org/'): piratebayId = piratebayId.split('org/')[1] - d = find_re(piratebayId, "tor/(\d+)") + d = find_re(piratebayId, r"tor/(\d+)") if d: piratebayId = d - d = find_re(piratebayId, "torrent/(\d+)") + d = find_re(piratebayId, r"torrent/(\d+)") if d: piratebayId = d return piratebayId @@ -72,26 +72,26 @@ def get_data(piratebayId): } piratebayId = get_id(piratebayId) torrent = dict() - torrent[u'id'] = piratebayId - torrent[u'domain'] = 'thepiratebay.org' - torrent[u'comment_link'] = baseurl + 'torrent/%s' % piratebayId + torrent['id'] = piratebayId + torrent['domain'] = 'thepiratebay.org' + torrent['comment_link'] = baseurl + 'torrent/%s' % piratebayId data = read_url(torrent['comment_link'], unicode=True) - torrent[u'title'] = find_re(data, '(.*?) \(download torrent\) - TPB') - if not torrent[u'title']: + torrent['title'] = find_re(data, r'(.*?) 
\(download torrent\) - TPB') + if not torrent['title']: return None - torrent[u'title'] = decode_html(torrent[u'title']).strip() - torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})') + torrent['title'] = decode_html(torrent['title']).strip() + torrent['imdbId'] = find_re(data, r'title/tt(\d{7})') title = quote(torrent['title'].encode('utf-8')) - torrent[u'magent_link']= find_re(data, '"(magnet:.*?)"') - torrent[u'infohash'] = find_re(torrent[u'magent_link'], "btih:(.*?)&") - for d in re.compile('dt>(.*?):.*?(.*?)', re.DOTALL).findall(data): + torrent['magent_link'] = find_re(data, r'"(magnet:.*?)"') + torrent['infohash'] = find_re(torrent['magent_link'], "btih:(.*?)&") + for d in re.compile(r'dt>(.*?):.*?(.*?)', re.DOTALL).findall(data): key = d[0].lower().strip() key = _key_map.get(key, key) value = decode_html(strip_tags(d[1].strip())) - if not '<' in key: + if '<' not in key: torrent[key] = value - torrent[u'description'] = find_re(data, '
(.*?)
') - if torrent[u'description']: + torrent['description'] = find_re(data, '
(.*?)
') + if torrent['description']: torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip() return torrent diff --git a/ox/web/tv.py b/ox/web/tv.py index 33e3399..24427a3 100644 --- a/ox/web/tv.py +++ b/ox/web/tv.py @@ -22,7 +22,7 @@ def get_episode_data(url): #episode score r['episode score'] = find_re(data, '(.*?)') - match = re.compile('Episode Number: (\d*?)    Season Num: (\d*?)    First Aired: (.*?)  ').findall(data) + match = re.compile(r'Episode Number: (\d*?)    Season Num: (\d*?)    First Aired: (.*?)  ').findall(data) if match: r['season'] = int(match[0][1]) r['episode'] = int(match[0][0]) diff --git a/ox/web/twitter.py b/ox/web/twitter.py index 619c458..959ea3b 100644 --- a/ox/web/twitter.py +++ b/ox/web/twitter.py @@ -24,10 +24,10 @@ def find(query=None, user=None, timeout=60): user = re.compile('data-name="(.*?)"').findall(t)[0] user = ox.decode_html(ox.strip_tags(user)).strip() tweets.append({ - 'id': re.compile('data-tweet-id="(\d+)"').findall(t)[0], - 'user-id': re.compile('data-user-id="(\d+)"').findall(t)[0], - 'name': re.compile('data-screen-name="(.*?)"').findall(t)[0], - 'time': datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])), + 'id': re.compile(r'data-tweet-id="(\d+)"').findall(t)[0], + 'user-id': re.compile(r'data-user-id="(\d+)"').findall(t)[0], + 'name': re.compile(r'data-screen-name="(.*?)"').findall(t)[0], + 'time': datetime.fromtimestamp(int(re.compile(r'data-time="(\d+)"').findall(t)[0])), 'user': user, 'text': text, 'html': html, diff --git a/ox/web/ubu.py b/ox/web/ubu.py index 2d532f1..32b4cfb 100644 --- a/ox/web/ubu.py +++ b/ox/web/ubu.py @@ -43,7 +43,7 @@ def get_data(url): if not 'url' in m: print(url, 'missing') if 'title' in m: - m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title']) + m['title'] = re.sub(r'(.*?) 
\(\d{4}\)$', '\\1', m['title']) if not 'title' in m: match = re.compile('(.*?)').findall(data) @@ -52,7 +52,7 @@ def get_data(url): if not 'title' in m: match = re.compile(".*?&(.*?)", re.DOTALL).findall(data) if match: - m['title'] = re.sub('\s+', ' ', match[0]).strip() + m['title'] = re.sub(r'\s+', ' ', match[0]).strip() if ' - ' in m['title']: m['title'] = m['title'].split(' - ', 1)[-1] if 'title' in m: @@ -83,7 +83,7 @@ def get_data(url): if len(txt) > 1 and txt[0].strip() == m.get('title'): txt = txt[1:] m['description'] = '\n\n'.join(txt).split('RESOURCES')[0].split('RELATED')[0].strip() - y = re.compile('\((\d{4})\)').findall(data) + y = re.compile(r'\((\d{4})\)').findall(data) if y: m['year'] = int(y[0]) d = re.compile('Director: (.+)').findall(data) @@ -98,7 +98,7 @@ def get_data(url): if a: m['artist'] = strip_tags(decode_html(a[0][1])).strip() else: - a = re.compile('(.*?)\(b\..*?\d{4}\)').findall(data) + a = re.compile(r'(.*?)\(b\..*?\d{4}\)').findall(data) if a: m['artist'] = strip_tags(decode_html(a[0])).strip() elif m['id'] == 'film/lawder_color': @@ -125,11 +125,11 @@ def get_ids(): data = read_url('http://www.ubu.com/film/') ids = [] author_urls = [] - for url, author in re.compile('(.*?)').findall(data): + for url, author in re.compile(r'(.*?)').findall(data): url = 'http://www.ubu.com/film' + url[1:] data = read_url(url) author_urls.append(url) - for u, title in re.compile('(.*?)').findall(data): + for u, title in re.compile(r'(.*?)').findall(data): if not u.startswith('http'): if u == '../../sound/burroughs.html': u = 'http://www.ubu.com/sound/burroughs.html' @@ -145,7 +145,7 @@ def get_ids(): def get_sound_ids(): data = read_url('http://www.ubu.com/sound/') ids = [] - for url, author in re.compile('(.*?)').findall(data): + for url, author in re.compile(r'(.*?)').findall(data): url = 'http://www.ubu.com/sound' + url[1:] ids.append(url) ids = [get_id(url) for url in sorted(set(ids))] diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py index 
aad0aba..783e2f6 100644 --- a/ox/web/wikipedia.py +++ b/ox/web/wikipedia.py @@ -50,7 +50,7 @@ def get_movie_data(wikipedia_url): if not wikipedia_url.startswith('http'): wikipedia_url = get_url(wikipedia_url) data = get_wiki_data(wikipedia_url) - filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''') + filmbox_data = find_re(data, r'''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''') filmbox = {} _box = filmbox_data.strip().split('|') for row in _box: diff --git a/ox/web/youtube.py b/ox/web/youtube.py index 0f59b80..3dfd91a 100644 --- a/ox/web/youtube.py +++ b/ox/web/youtube.py @@ -46,7 +46,7 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout): def get_video_info(id): eurl = get_url(id) data = read_url(eurl).decode('utf-8') - t = re.compile('\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data) + t = re.compile(r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data) if t: t = t[0] else: @@ -162,7 +162,7 @@ def videos(id, format=''): def playlist(url): data = read_url(url).decode('utf-8') items = [] - for i in list(set(re.compile('