From b7979779fee60b41319da3041994fc968ad73216 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 22 Jul 2019 10:38:14 +0200 Subject: [PATCH 01/19] .rec files --- ox/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/file.py b/ox/file.py index ab789a3..956f215 100644 --- a/ox/file.py +++ b/ox/file.py @@ -32,7 +32,7 @@ EXTENSIONS = { 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf', 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD - 'mxf', 'ts' + 'mxf', 'ts', 'rec', ], } From 8c14d28aa26e86002fe3a05b69534d7e6e9dfe77 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 22 Jul 2019 17:11:46 +0200 Subject: [PATCH 02/19] remove rec again, not a real format --- ox/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/file.py b/ox/file.py index 956f215..ab789a3 100644 --- a/ox/file.py +++ b/ox/file.py @@ -32,7 +32,7 @@ EXTENSIONS = { 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf', 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD - 'mxf', 'ts', 'rec', + 'mxf', 'ts' ], } From fb8b33d916660e59aaefedb844a6b64e285979ac Mon Sep 17 00:00:00 2001 From: j Date: Tue, 23 Jul 2019 16:09:07 +0200 Subject: [PATCH 03/19] fix variable name --- ox/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/api.py b/ox/api.py index 51742c2..159aec7 100644 --- a/ox/api.py +++ b/ox/api.py @@ -201,7 +201,7 @@ class API(object): return False if data['status']['code'] != 200: print("request returned error, will try again in 5 seconds") - if DEBUG: + if self.DEBUG: print(data) time.sleep(5) if data and data.get('result') == 1: From 9c90aaa5f8e6f17f22dc71f3eeba0a4bf40d2f15 Mon Sep 17 00:00:00 2001 From: j Date: Tue, 23 Jul 2019 16:24:06 +0200 Subject: [PATCH 04/19] imdb can also be 8 digits --- ox/web/imdb.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 4821b0c..db745bf 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -267,7 +267,7 @@ class Imdb(SiteParser): }, 'series': { 'page': 'reference', - 're': '

.*?.*?(.*?)').findall(data) + #cc[rel] = re.compile('(.*?)').findall(data) def get_conn(c): r = { 'id': c[0], @@ -432,7 +432,7 @@ class Imdb(SiteParser): if len(description) == 2 and description[-1].strip() != '-': r['description'] = description[-1].strip() return r - cc[rel] = list(map(get_conn, re.compile('(.*?)(.*?)<\/div', re.DOTALL).findall(data))) + cc[rel] = list(map(get_conn, re.compile('(.*?)(.*?)<\/div', re.DOTALL).findall(data))) self['connections'] = cc @@ -618,7 +618,7 @@ def get_movie_by_title(title, timeout=-1): url = "http://akas.imdb.com/find?" + params data = read_url(url, timeout=timeout, unicode=True) #if search results in redirect, get id of current page - r = '' + r = '' results = re.compile(r).findall(data) if results: return results[0] @@ -697,12 +697,12 @@ def get_movie_id(title, director='', year='', timeout=-1): data = read_url(url, timeout=timeout, unicode=True) #if search results in redirect, get id of current page - r = '' + r = '' results = re.compile(r).findall(data) if results: return results[0] #otherwise get first result - r = '.*?.*?
S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): + for e in re.compile('
.*?
S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0] else: data = cache.read_url(url) From d632cd3803c8f42f75593fc74bfa626422fcf3ba Mon Sep 17 00:00:00 2001 From: j Date: Tue, 23 Jul 2019 16:42:20 +0200 Subject: [PATCH 05/19] match as many digits as possible --- ox/web/imdb.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index db745bf..7d91dc7 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -267,7 +267,7 @@ class Imdb(SiteParser): }, 'series': { 'page': 'reference', - 're': '

.*?.*?(.*?)').findall(data) + #cc[rel] = re.compile('(.*?)').findall(data) def get_conn(c): r = { 'id': c[0], @@ -432,7 +432,7 @@ class Imdb(SiteParser): if len(description) == 2 and description[-1].strip() != '-': r['description'] = description[-1].strip() return r - cc[rel] = list(map(get_conn, re.compile('(.*?)(.*?)<\/div', re.DOTALL).findall(data))) + cc[rel] = list(map(get_conn, re.compile('(.*?)(.*?)<\/div', re.DOTALL).findall(data))) self['connections'] = cc @@ -618,7 +618,7 @@ def get_movie_by_title(title, timeout=-1): url = "http://akas.imdb.com/find?" + params data = read_url(url, timeout=timeout, unicode=True) #if search results in redirect, get id of current page - r = '' + r = '' results = re.compile(r).findall(data) if results: return results[0] @@ -697,12 +697,12 @@ def get_movie_id(title, director='', year='', timeout=-1): data = read_url(url, timeout=timeout, unicode=True) #if search results in redirect, get id of current page - r = '' + r = '' results = re.compile(r).findall(data) if results: return results[0] #otherwise get first result - r = '.*?.*?
S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): + for e in re.compile('
.*?
S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0] else: data = cache.read_url(url) From b49acd47dc9d8da00226ba7f7eef712f2c5fdae2 Mon Sep 17 00:00:00 2001 From: j Date: Thu, 1 Aug 2019 16:28:00 +0200 Subject: [PATCH 06/19] load subtitle info --- ox/file.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ox/file.py b/ox/file.py index ab789a3..58f2a49 100644 --- a/ox/file.py +++ b/ox/file.py @@ -293,6 +293,18 @@ def ffprobe(filename): 'sample_aspect_ratio': 'pixel_aspect_ratio', }.get(key, key)] = fix_value(key, s[key]) info[s['codec_type']].append(stream) + elif s.get('codec_type') == 'subtitle': + info['subtitles'] = info.get('subtitles', []) + stream = {} + for key in ( + 'codec_name', 'language' + ): + if key in s: + stream[{ + 'codec_name': 'codec', + + }.get(key, key)] = s[key] + info['subtitles'].append(stream) else: pass # print s From 23a641189ca5d4b5ea592547ac5e712ba4d6e406 Mon Sep 17 00:00:00 2001 From: j Date: Thu, 1 Aug 2019 20:54:04 +0200 Subject: [PATCH 07/19] fix subtitle language --- ox/file.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ox/file.py b/ox/file.py index 58f2a49..177ca1f 100644 --- a/ox/file.py +++ b/ox/file.py @@ -296,13 +296,17 @@ def ffprobe(filename): elif s.get('codec_type') == 'subtitle': info['subtitles'] = info.get('subtitles', []) stream = {} + if language and language != 'und': + stream['language'] = language for key in ( - 'codec_name', 'language' + 'codec_name', + 'language', + 'width', + 'height', ): if key in s: stream[{ 'codec_name': 'codec', - }.get(key, key)] = s[key] info['subtitles'].append(stream) else: From e78519998da1ea897c9767fb0dcc40f28c3c2738 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 2 Aug 2019 14:23:07 +0200 Subject: [PATCH 08/19] use requests session --- ox/cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ox/cache.py b/ox/cache.py index 904f31d..b5f9a9e 100644 --- a/ox/cache.py +++ b/ox/cache.py @@ -16,6 +16,7 @@ from six import PY2 try: import requests USE_REQUESTS = True + requests_session = requests.Session() except: USE_REQUESTS = False @@ -101,7 +102,7 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un url_headers = {} if not result: if USE_REQUESTS: - r = requests.get(url, headers=headers) + r = requests_session.get(url, headers=headers) for key in r.headers: url_headers[key.lower()] = r.headers[key] result = r.content From d84503055748c6675ff6938d498dc58eb952a7f8 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 2 Aug 2019 16:26:22 +0200 Subject: [PATCH 09/19] 8 digit imdb ids --- ox/web/piratecinema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/web/piratecinema.py b/ox/web/piratecinema.py index 4ed946b..c452f04 100644 --- a/ox/web/piratecinema.py +++ b/ox/web/piratecinema.py @@ -8,7 +8,7 @@ from ox.net import read_url def get_poster_url(id): url = 'http://piratecinema.org/posters/' html = read_url(url).decode('utf-8') - results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html) + results = re.compile('src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html) for result in results: if result[1] == id: return url + result[0] From cc1bad76cdcac8af5f53b3f12f67ed8c948f3037 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 3 Aug 2019 23:35:16 +0200 Subject: [PATCH 10/19] update user agent --- ox/net.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/net.py b/ox/net.py index 2a5e71b..59e6abe 100644 --- a/ox/net.py +++ 
b/ox/net.py @@ -21,7 +21,7 @@ from chardet.universaldetector import UniversalDetector DEBUG = False # Default headers for HTTP requests. DEFAULT_HEADERS = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4', From 388f33ebb629c56751dc5bef6c6bbe2d33f60876 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 3 Aug 2019 23:38:31 +0200 Subject: [PATCH 11/19] cache imdb urls in parallel --- ox/web/siteparser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py index 61a79bd..8c212bf 100644 --- a/ox/web/siteparser.py +++ b/ox/web/siteparser.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 import re +from multiprocessing.pool import ThreadPool from six import string_types @@ -28,6 +29,7 @@ def cleanup(key, data, data_type): class SiteParser(dict): baseUrl = '' regex = {} + pool = ThreadPool(8) def get_url(self, page): return "%s%s" % (self.baseUrl, page) @@ -39,6 +41,9 @@ def __init__(self, timeout=-1): self._cache = {} + urls = list(set(self.get_url(self.regex[key]['page']) for key in self.regex)) + self.pool.map(lambda url: self.read_url(url, timeout), urls) + for key in self.regex: url = self.get_url(self.regex[key]['page']) data = self.read_url(url, timeout) From 665a4038b2df1222b9584ec950c3c35f1fe81a01 Mon Sep 17 00:00:00 2001 From: j Date: Thu, 8 Aug 2019 17:08:13 +0200 Subject: [PATCH 12/19] space --- ox/web/wikipedia.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py index cb73758..de8b064 100644 --- a/ox/web/wikipedia.py +++ b/ox/web/wikipedia.py @@ -17,7 +17,7 @@ def get_id(url): def get_url(id=None, imdb=None, allmovie=None): if imdb: - query = '"%s"'% imdb + query = '"%s"' % imdb result = find(query) if result: url = result[0][1] @@ -26,7 +26,7 @@ return url return "" if allmovie: - query = '"amg_id = 1:%s"'% allmovie + query = '"amg_id = 1:%s"' % allmovie result = find(query) if result: url = result[0][1] @@ -140,7 +140,7 @@ def get_allmovie_id(wikipedia_url): return data.get('amg_id', '') def find(query, max_results=10): - query = {'action': 'query', 'list':'search', 'format': 'json', + query = {'action': 'query', 'list': 'search', 'format': 'json', 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')} url = "http://en.wikipedia.org/w/api.php?" 
+ urllib.parse.urlencode(query) data = read_url(url) From cef85fc4defb57be8442bf83bdd3fbc1751e3ce1 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 15 Nov 2019 14:51:13 +0100 Subject: [PATCH 13/19] depend on lxml --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b7509ec..51c3f99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ chardet six>=1.5.2 +lxml From 03c119155081f7b9f65e1f55d3a58708c9dc6704 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 15 Nov 2019 14:51:32 +0100 Subject: [PATCH 14/19] fall back to storyline for summary --- ox/web/imdb.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 7d91dc7..2f14e33 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -199,6 +199,11 @@ class Imdb(SiteParser): 'summary': zebra_table('Plot Summary', more=[ '

(.*?)Storyline

.*?

(.*?)

', + 'type': 'string' + }, 'posterId': { 'page': 'reference', 're': '', @@ -517,10 +522,13 @@ class Imdb(SiteParser): ]) if self['releasedate'] == 'x': del self['releasedate'] + + if 'summary' not in self and 'storyline' in self: + self['summary'] = self.pop('storyline') if 'summary' in self: if isinstance(self['summary'], list): self['summary'] = self['summary'][0] - self['summary'] = self['summary'].split(' Date: Sat, 21 Dec 2019 20:18:19 +0200 Subject: [PATCH 15/19] srt fixes --- ox/srt.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/ox/srt.py b/ox/srt.py index 5191a55..c29ae8b 100644 --- a/ox/srt.py +++ b/ox/srt.py @@ -63,10 +63,6 @@ def load(filename, offset=0): Returns list with dicts that have in, out, value and id ''' srt = [] - - def parse_time(t): - return offset + ox.time2ms(t.replace(',', '.')) / 1000 - with open(filename, 'rb') as f: encoding = _detect_encoding(f) data = f.read() @@ -77,7 +73,21 @@ def load(filename, offset=0): data = data.decode('latin-1') except: print("failed to detect encoding, giving up") - return srt + return [] + return loads(data, offset) + +def loads(data, offset=0): + '''Parses an srt file + + filename: path to an srt file + offset (float, seconds): shift all in/out points by offset + + Returns list with dicts that have in, out, value and id + ''' + srt = [] + + def parse_time(t): + return offset + ox.time2ms(t.replace(',', '.')) / 1000 data = data.replace('\r\n', '\n') if not data.endswith('\n\n'): From 3574be2975bca8fbc1a42634e79e4ee9a088054a Mon Sep 17 00:00:00 2001 From: j Date: Sat, 21 Dec 2019 20:29:44 +0200 Subject: [PATCH 16/19] don't fall back to ffmpeg2theora --- ox/file.py | 45 ++------------------------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/ox/file.py b/ox/file.py index 177ca1f..ec9da4b 100644 --- a/ox/file.py +++ b/ox/file.py @@ -159,51 +159,10 @@ def avinfo(filename, cached=True): if os.path.getsize(filename): if find_executable('ffprobe'): return ffprobe(filename) - ffmpeg2theora = cmd('ffmpeg2theora') - p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, error = p.communicate() - stdout = stdout.decode('utf-8') - version = stdout.split('\n')[0].split(' - ')[0].split(' ')[-1] - if version < '0.27': - raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version) - p = subprocess.Popen([ffmpeg2theora, '--info', filename], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, error = p.communicate() - stdout = stdout.decode('utf-8') - try: - info = json.loads(stdout) - except: - # remove metadata, can be broken - reg = re.compile('"metadata": {.*?},', re.DOTALL) - stdout = re.sub(reg, '', stdout) - info = json.loads(stdout) - if 'video' in info: - for v in info['video']: - if 'display_aspect_ratio' not in v and 'width' in v: - v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height']) - v['pixel_aspect_ratio'] = '1:1' - if len(info.get('audio', [])) > 1: - if 'metadata' in info['audio'][0]: - for stream in info['audio']: - language = stream.get('metadata', {}).get('language') - if language and language != 'und': - stream['language'] = language[0] - else: - ffmpeg = cmd('ffmpeg') - p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - stderr = stderr.decode('utf-8') - languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l] - if 
len(languages) == len(info['audio']): - for i, stream in enumerate(info['audio']): - language = languages[i] - if language and language[0] != 'und': - stream['language'] = language[0] - fix_coverart(info) - return info - + raise EnvironmentError('could not find ffprobe. please install ffmpeg') return {'path': filename, 'size': 0} + def ffprobe(filename): p = subprocess.Popen([ cmd('ffprobe'), From da51407e7d066b8efd621acb7a9f3c997cc75b51 Mon Sep 17 00:00:00 2001 From: j Date: Wed, 5 Feb 2020 16:51:28 +0100 Subject: [PATCH 17/19] fix alternative titles --- ox/web/imdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 2f14e33..7ccc63d 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -124,8 +124,8 @@ class Imdb(SiteParser): 'alternativeTitles': { 'page': 'releaseinfo', 're': [ - '<table[^>]*?id="akas"[^>]*?>(.*?)</table>', - "td>(.*?)</td>.*?<td>(.*?)</td>" + '<table[^>]*?id="akas"[^>]*?>(.*?)</table>', + "td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>" ], 'type': 'list' }, From 09e0a521af7adf01c6d362250decd1fd55d06558 Mon Sep 17 00:00:00 2001 From: j Date: Tue, 18 Feb 2020 16:27:25 +0100 Subject: [PATCH 18/19] add Accept-Language, use akas --- ox/web/imdb.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 7ccc63d..272185b 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -23,6 +23,8 @@ def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cac headers = headers.copy() # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau headers['X-Forwarded-For'] = '72.21.206.80' + headers['Accept-Language'] = 'en' + return url, data, headers, timeout, unicode def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False): @@ -34,7 +36,7 @@ def delete_url(url, data=None, headers=cache.DEFAULT_HEADERS): cache.store.delete(url, data, headers) def get_url(id): - return "http://www.imdb.com/title/tt%s/" % id + return "http://akas.imdb.com/title/tt%s/" % id def reference_section(id): From 926b8ad2550a6bcde54bbd8f421172969f4b16a6 Mon Sep 17 00:00:00 2001 From: j Date: Tue, 18 Feb 2020 16:59:08 +0100 Subject: [PATCH 19/19] fight geolocalization --- ox/web/imdb.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 272185b..fb109be 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -379,6 +379,9 @@ class Imdb(SiteParser): if 'alternativeTitles' in self: alt = {} + for t in self['alternativeTitles']: + if t[0].strip() in ('World-wide (English title)', ): + self['title'] = cleanup_title(t[1]) for t in self['alternativeTitles']: title = cleanup_title(t[1]) if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
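
Note on PATCH 16: with the ffmpeg2theora and ffmpeg fallbacks removed, avinfo() depends entirely on ffprobe's JSON output, and systems without ffmpeg now fail loudly with EnvironmentError instead of probing via ffmpeg2theora. The sketch below illustrates the kind of probing this implies, assuming only that ffmpeg's ffprobe binary is on PATH; probe() and the input filename are hypothetical, for illustration, not part of ox.file:

    import json
    import subprocess

    def probe(filename):
        # Ask ffprobe for container-level and per-stream metadata as JSON;
        # ox.file.ffprobe() post-processes output of this shape.
        out = subprocess.check_output([
            'ffprobe', '-v', 'error',
            '-show_format', '-show_streams',
            '-print_format', 'json',
            filename,
        ])
        return json.loads(out)

    info = probe('movie.mkv')  # hypothetical input file
    for s in info.get('streams', []):
        if s.get('codec_type') == 'subtitle':
            # In ffprobe's JSON layout the stream language, if present, lives
            # in the stream's tags; 'und' marks an undetermined language,
            # which is why patches 06 and 07 skip it when they build
            # info['subtitles'].
            language = s.get('tags', {}).get('language')
            print(s.get('codec_name'), language)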