From 887760acc170daeff4c77d926893769dd2d5279e Mon Sep 17 00:00:00 2001 From: j Date: Fri, 18 Jun 2021 12:23:10 +0100 Subject: [PATCH 01/41] e.read() returns bytes --- ox/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/api.py b/ox/api.py index b784788..894e26d 100644 --- a/ox/api.py +++ b/ox/api.py @@ -100,7 +100,7 @@ class API(object): if self.DEBUG: import webbrowser if e.code >= 500: - with open('/tmp/error.html', 'w') as f: + with open('/tmp/error.html', 'wb') as f: f.write(e.read()) webbrowser.open_new_tab('/tmp/error.html') From 2172bcb3fb6a3ab5c3b8290878e931b41237ddc7 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 7 Aug 2021 11:30:23 +0200 Subject: [PATCH 02/41] fix criterion parser --- ox/web/criterion.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ox/web/criterion.py b/ox/web/criterion.py index 6cef01e..d7914be 100644 --- a/ox/web/criterion.py +++ b/ox/web/criterion.py @@ -43,8 +43,12 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): results = find_re(html, '') info = re.compile('
  • (.*?)
  • ', re.DOTALL).findall(results) info = {k: strip_tags(v).strip() for k, v in info} + meta = re.compile('.*?src="(.*?)"', re.DOTALL).findall(html) + #result = find_re(html, "\"Film Date: Sun, 29 Aug 2021 13:43:33 +0200 Subject: [PATCH 03/41] parse google infobox --- ox/web/google.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ox/web/google.py b/ox/web/google.py index 72aa32f..0842d01 100644 --- a/ox/web/google.py +++ b/ox/web/google.py @@ -17,6 +17,31 @@ def quote_plus(s): s = s.encode('utf-8') return urllib.parse.quote_plus(s) + +def infobox(query, timeout=DEFAULT_TIMEOUT): + import lxml.html + data = read_url(url, timeout=timeout) + doc = lxml.html.document_fromstring(data) + k = 'kp-wholepage' + wholepage = doc.cssselect('.' + k) + infobox = {} + if wholepage: + page = wholepage[0] + for a in page.cssselect('a'): + if a.attrib.get('href', '').startswith('http'): + domain = '.'.join(a.attrib['href'].split('/')[2].split('.')[-2:]) + infobox[domain] = a.attrib['href'] + for e in page.cssselect('*[data-attrid]'): + key = e.attrib['data-attrid'] + value = e.text_content() + if value and key not in ( + 'kc:/film/film:media_actions_wholepage', + 'action:watch_film' + ): + infobox[key] = value + return infobox + + def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): """ Return max_results tuples with title, url, description From 67c6c2413110dc2979e651e1027e40e96a6019fb Mon Sep 17 00:00:00 2001 From: j Date: Wed, 22 Sep 2021 18:56:25 +0200 Subject: [PATCH 04/41] add m2v --- ox/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/file.py b/ox/file.py index f12aee7..ccaa838 100644 --- a/ox/file.py +++ b/ox/file.py @@ -29,7 +29,7 @@ EXTENSIONS = { ], 'video': [ '3gp', - 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4', + 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm2v', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf', 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD 'mxf', 'ts', From 868a401553004d6e350b8be2d6bd62c410bb3b5c Mon Sep 17 00:00:00 2001 From: j Date: Sun, 14 Nov 2021 13:35:26 +0000 Subject: [PATCH 05/41] detect add real media files --- ox/file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ox/file.py b/ox/file.py index ccaa838..7a48cd6 100644 --- a/ox/file.py +++ b/ox/file.py @@ -19,7 +19,8 @@ __all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs', 'iexists'] EXTENSIONS = { 'audio': [ 'aac', 'aif', 'aiff', 'amr', - 'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus' + 'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus', + 'ra', # Real Audio ], 'image': [ 'bmp', 'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp' @@ -34,6 +35,7 @@ EXTENSIONS = { 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD 'mxf', 'ts', 'dat', # VOD files + 'rm', # Real Media ], } From 373ff6ee0ffc20ad154dd5b5339dbcff97a72487 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 1 Jan 2022 14:31:33 +0100 Subject: [PATCH 06/41] split real media --- ox/file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ox/file.py b/ox/file.py index 7a48cd6..e4fedcd 100644 --- a/ox/file.py +++ b/ox/file.py @@ -31,11 +31,11 @@ EXTENSIONS = { 'video': [ '3gp', 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm2v', 'm4v', 'mkv', 'mov', 'mp4', - 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf', + 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'vob', 'webm', 'wmv', 'asf', 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD 'mxf', 'ts', 'dat', # VOD files - 'rm', # Real Media + 'rm', 'rmvb', # Real Media ], } From 6d968d54cc6065b1b78de80845bab217e6b9406a Mon Sep 17 00:00:00 2001 From: j Date: Mon, 18 Apr 2022 22:59:16 +0100 Subject: [PATCH 07/41] fix series creator --- ox/web/imdb.py | 105 +++++++++++++++++++++++++------------------------ 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index ac12c83..316b926 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -158,15 +158,6 @@ class Imdb(SiteParser): 'type': 'list' }, 'country': zebra_list('Country', more=['(.*?)']), - 'creator': { - 'page': '', - 're': [ - '
    .*?Creator.?:(.*?)
    ', - ' 10: - series['creator'] = series['director'][:1] - - for key in ['creator', 'country']: - if key in series: - self[key] = series[key] - - if 'year' in series: - self['seriesYear'] = series['year'] - if 'year' not in self: - self['year'] = series['year'] - - if 'year' in self: - self['episodeYear'] = self['year'] - if 'creator' in self: - self['seriesDirector'] = self['creator'] - if 'originalTitle' in self: - del self['originalTitle'] - else: - for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'): - if key in self: - del self[key] - if 'creator' in self: - if 'director' in self: - self['episodeDirector'] = self['director'] - self['director'] = self['creator'] #make lists unique but keep order for key in ('director', 'language'): @@ -581,6 +531,57 @@ class Imdb(SiteParser): series_credit = [c for c in self['credits'] if c.get('deparment') == deparment] if series_credit: self[key] = [c['name'] for c in series_credit] + creator = [] + for c in self.get('credits', []): + if '(created by)' in c['roles'] and c['name'] not in creator: + creator.append(c['name']) + if creator: + self['creator'] = creator + + if 'series' in self: + series = Imdb(self['series'], timeout=timeout) + self['seriesTitle'] = series['title'] + if 'episodeTitle' in self: + self['seriesTitle'] = series['title'] + if 'season' in self and 'episode' in self: + self['title'] = "%s (S%02dE%02d) %s" % ( + self['seriesTitle'], self['season'], self['episode'], self['episodeTitle']) + else: + self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle']) + self['season'] = 1 + self['title'] = self['title'].strip() + if 'director' in self: + self['episodeDirector'] = self['director'] + + if 'creator' not in series and 'director' in series: + series['creator'] = series['director'] + if len(series['creator']) > 10: + series['creator'] = series['director'][:1] + + for key in ['creator', 'country']: + if key in series: + self[key] = series[key] + + if 'year' in series: + self['seriesYear'] = series['year'] + if 'year' not in self: + self['year'] = series['year'] + + if 'year' in self: + self['episodeYear'] = self['year'] + if 'creator' in self: + self['seriesDirector'] = self['creator'] + if 'originalTitle' in self: + del self['originalTitle'] + else: + for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'): + if key in self: + del self[key] + if 'creator' in self: + if 'director' in self: + self['episodeDirector'] = self['director'] + self['director'] = self['creator'] + class ImdbCombined(Imdb): def __init__(self, id, timeout=-1): From d9870232cb050f1bbfea23e500e5a7771849d9e6 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 18 Apr 2022 23:00:11 +0100 Subject: [PATCH 08/41] add debug --- ox/web/siteparser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py index 8c212bf..6aa9e15 100644 --- a/ox/web/siteparser.py +++ b/ox/web/siteparser.py @@ -30,6 +30,7 @@ class SiteParser(dict): baseUrl = '' regex = {} pool = ThreadPool(8) + debug = False def get_url(self, page): return "%s%s" % (self.baseUrl, page) From a1a3de685cef3dd5cbcebeda773b9719fc581bd3 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 18 Apr 2022 23:23:01 +0100 Subject: [PATCH 09/41] more creators --- ox/web/imdb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 316b926..06e3e9d 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -535,6 +535,8 @@ class Imdb(SiteParser): for c in self.get('credits', []): if '(created by)' in c['roles'] and c['name'] not in creator: creator.append(c['name']) + if '(creator)' in c['roles'] and c['name'] not in creator: + creator.append(c['name']) if creator: self['creator'] = creator From 8e6bea8972be3439522e20e2cd57861b2dc97118 Mon Sep 17 00:00:00 2001 From: j Date: Tue, 14 Jun 2022 22:29:47 +0200 Subject: [PATCH 10/41] flip display_aspect_ratio if rotated --- ox/file.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ox/file.py b/ox/file.py index e4fedcd..885a982 100644 --- a/ox/file.py +++ b/ox/file.py @@ -278,6 +278,8 @@ def ffprobe(filename): if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-180, -90, 90, 180): v['width'], v['height'] = v['height'], v['width'] k = 'display_aspect_ratio' + if k in v: + v[k] = ':'.join(reversed(v[k].split(':'))) if k not in v and 'width' in v \ or (k in v and v[k] == '0:1'): v[k] = '%d:%d' % (v['width'], v['height']) From 5919345d3dea34050ce151acd6499472da6b62da Mon Sep 17 00:00:00 2001 From: j Date: Sat, 22 Oct 2022 11:50:46 +0200 Subject: [PATCH 11/41] fix aspect ratio --- ox/file.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ox/file.py b/ox/file.py index 885a982..d60fb3c 100644 --- a/ox/file.py +++ b/ox/file.py @@ -275,11 +275,11 @@ def ffprobe(filename): pass # print s for v in info['video']: + k = 'display_aspect_ratio' if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-180, -90, 90, 180): v['width'], v['height'] = v['height'], v['width'] - k = 'display_aspect_ratio' - if k in v: - v[k] = ':'.join(reversed(v[k].split(':'))) + if k in v: + v[k] = ':'.join(reversed(v[k].split(':'))) if k not in v and 'width' in v \ or (k in v and v[k] == '0:1'): v[k] = '%d:%d' % (v['width'], v['height']) From e1657994ca5cc9abb553ca244d3ccd4e7aca3b28 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 3 Feb 2023 16:28:05 +0100 Subject: [PATCH 12/41] add type json --- ox/web/siteparser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py index 6aa9e15..b8b78f8 100644 --- a/ox/web/siteparser.py +++ b/ox/web/siteparser.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 import re +import json from multiprocessing.pool import ThreadPool from six import string_types @@ -77,6 +78,10 @@ class SiteParser(dict): elif self.regex[key]['type'] == 'date': parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d') data = apply_f(parse_date, data) + elif self.regex[key]['type'] == 'json': + if isinstance(data, list) and len(data) == 1: + data = data[0] + data = json.loads(data) if data: self[key] = data From a3cef06ad73a1419c01c3552842b52948b178c9b Mon Sep 17 00:00:00 2001 From: j Date: Fri, 3 Feb 2023 18:28:49 +0100 Subject: [PATCH 13/41] fix imdb parsing --- ox/web/imdb.py | 186 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 123 insertions(+), 63 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 06e3e9d..d683973 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -2,12 +2,13 @@ # vi:si:et:sw=4:sts=4:ts=4 from __future__ import print_function +import json import re import time import unicodedata from six.moves.urllib.parse import urlencode -from six import text_type, string_types +from six import string_types from .. import find_re, strip_tags, decode_html from .. import cache @@ -106,6 +107,89 @@ def technical(label): } +def tech_spec(metadata): + tech = {} + for row in metadata['props']['pageProps']['contentData']['section']['items']: + title = { + 'aspect ratio': 'aspectratio', + 'sound mix': 'sound', + }.get(row['rowTitle'].lower(), row['rowTitle'].lower()) + tech[title] = [] + for content in row['listContent']: + value = content['text'] + tech[title].append(value) + return tech + + +def movie_connections(metadata): + connections = {} + for row in metadata['props']['pageProps']['contentData']['categories']: + title = { + }.get(row['name'], row['name']) + if title not in connections: + connections[title] = [] + + for item in row['section']['items']: + item_ = { + 'id': item['id'][2:], + } + + item_['title'] = re.compile('(.*?)').findall(item['listContent'][0]['html'])[0] + if len(item['listContent']) >=2: + item_['description'] = strip_tags(item['listContent'][1]['html']) + connections[title].append(item_) + return connections + + +def get_category_by_id(metadata, id): + for category in metadata['props']['pageProps']['contentData']['categories']: + if category['id'] == id: + return category + + +def get_release_date(metadata): + releases = get_category_by_id(metadata, 'releases') + def parse_date(d): + parsed = None + for fmt in ( + '%B %d, %Y', + '%d %B %Y', + '%B %Y', + ): + try: + parsed = datetime.strptime(d, fmt) + break + except: + pass + if not parsed: + return None + return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day) + + dates = [] + for item in releases['section']['items']: + content = item['listContent'][0] + date = parse_date(content['text']) + if date: + dates.append(date) + + if dates: + return min(dates) + + +def alternative_titles(metadata): + titles = [] + akas = get_category_by_id(metadata, 'akas') + for row in akas['section']['items']: + content = row['listContent'][0] + titles.append({ + 'title': content['text'], + 'country': row['rowTitle'], + }) + if content.get('subText'): + titles[-1]['subText'] = content['subText'] + return titles + + ''' 'posterIds': { 'page': 'posters', @@ -116,18 +200,17 @@ def technical(label): class Imdb(SiteParser): ''' - >>> Imdb('0068646')['title'] == text_type(u'The Godfather') + >>> Imdb('0068646')['title'] == 'The Godfather' True - >>> Imdb('0133093')['title'] == text_type(u'The Matrix') + >>> Imdb('0133093')['title'] == 'The Matrix' True ''' regex = { 'alternativeTitles': { 'page': 'releaseinfo', 're': [ - ']*?id="akas"[^>]*?>(.*?)', - "td[^>]*?>(.*?).*?]*?>(.*?)" + '