From 887760acc170daeff4c77d926893769dd2d5279e Mon Sep 17 00:00:00 2001
From: j
Date: Fri, 18 Jun 2021 12:23:10 +0100
Subject: [PATCH 01/41] e.read() returns bytes
---
ox/api.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ox/api.py b/ox/api.py
index b784788..894e26d 100644
--- a/ox/api.py
+++ b/ox/api.py
@@ -100,7 +100,7 @@ class API(object):
if self.DEBUG:
import webbrowser
if e.code >= 500:
- with open('/tmp/error.html', 'w') as f:
+ with open('/tmp/error.html', 'wb') as f:
f.write(e.read())
webbrowser.open_new_tab('/tmp/error.html')
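In Python 3, HTTPError.read() returns bytes, so the dump file has to be opened in binary mode; writing bytes to a file opened with 'w' raises TypeError. A minimal sketch of the same pattern outside the library (the URL is hypothetical):

    import urllib.error
    import urllib.request

    try:
        urllib.request.urlopen('https://example.com/broken-endpoint')
    except urllib.error.HTTPError as e:
        body = e.read()                      # bytes under Python 3
        with open('/tmp/error.html', 'wb') as f:
            f.write(body)                    # 'w' (text mode) would fail here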
From 2172bcb3fb6a3ab5c3b8290878e931b41237ddc7 Mon Sep 17 00:00:00 2001
From: j
Date: Sat, 7 Aug 2021 11:30:23 +0200
Subject: [PATCH 02/41] fix criterion parser
---
ox/web/criterion.py | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/ox/web/criterion.py b/ox/web/criterion.py
index 6cef01e..d7914be 100644
--- a/ox/web/criterion.py
+++ b/ox/web/criterion.py
@@ -43,8 +43,12 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
results = find_re(html, '')
info = re.compile('(.*?)', re.DOTALL).findall(results)
info = {k: strip_tags(v).strip() for k, v in info}
+ meta = re.compile('.*?src="(.*?)"', re.DOTALL).findall(html)
+ #result = find_re(html, "
Date: Sun, 29 Aug 2021 13:43:33 +0200
Subject: [PATCH 03/41] parse google infobox
---
ox/web/google.py | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/ox/web/google.py b/ox/web/google.py
index 72aa32f..0842d01 100644
--- a/ox/web/google.py
+++ b/ox/web/google.py
@@ -17,6 +17,31 @@ def quote_plus(s):
s = s.encode('utf-8')
return urllib.parse.quote_plus(s)
+
+def infobox(query, timeout=DEFAULT_TIMEOUT):
+ import lxml.html
+ data = read_url(url, timeout=timeout)
+ doc = lxml.html.document_fromstring(data)
+ k = 'kp-wholepage'
+ wholepage = doc.cssselect('.' + k)
+ infobox = {}
+ if wholepage:
+ page = wholepage[0]
+ for a in page.cssselect('a'):
+ if a.attrib.get('href', '').startswith('http'):
+ domain = '.'.join(a.attrib['href'].split('/')[2].split('.')[-2:])
+ infobox[domain] = a.attrib['href']
+ for e in page.cssselect('*[data-attrid]'):
+ key = e.attrib['data-attrid']
+ value = e.text_content()
+ if value and key not in (
+ 'kc:/film/film:media_actions_wholepage',
+ 'action:watch_film'
+ ):
+ infobox[key] = value
+ return infobox
+
+
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
"""
Return max_results tuples with title, url, description
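A rough usage sketch for the new infobox() helper. The hunk reads `url` without showing where it is built, so this assumes it is derived from `query` the same way find() builds its search URL; result keys are either link domains or Google data-attrid strings (the data-attrid below is illustrative only):

    from ox.web import google

    box = google.infobox('The Matrix 1999')
    print(box.get('imdb.com'))                       # external link, if present
    print(box.get('kc:/film/film:release_date'))     # example data-attrid key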
From 67c6c2413110dc2979e651e1027e40e96a6019fb Mon Sep 17 00:00:00 2001
From: j
Date: Wed, 22 Sep 2021 18:56:25 +0200
Subject: [PATCH 04/41] add m2v
---
ox/file.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ox/file.py b/ox/file.py
index f12aee7..ccaa838 100644
--- a/ox/file.py
+++ b/ox/file.py
@@ -29,7 +29,7 @@ EXTENSIONS = {
],
'video': [
'3gp',
- 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4',
+ 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm2v', 'm4v', 'mkv', 'mov', 'mp4',
'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf',
'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD
'mxf', 'ts',
From 868a401553004d6e350b8be2d6bd62c410bb3b5c Mon Sep 17 00:00:00 2001
From: j
Date: Sun, 14 Nov 2021 13:35:26 +0000
Subject: [PATCH 05/41] detect additional real media files

---
ox/file.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/ox/file.py b/ox/file.py
index ccaa838..7a48cd6 100644
--- a/ox/file.py
+++ b/ox/file.py
@@ -19,7 +19,8 @@ __all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs', 'iexists']
EXTENSIONS = {
'audio': [
'aac', 'aif', 'aiff', 'amr',
- 'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus'
+ 'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus',
+ 'ra', # Real Audio
],
'image': [
'bmp', 'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp'
@@ -34,6 +35,7 @@ EXTENSIONS = {
'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD
'mxf', 'ts',
'dat', # VOD files
+ 'rm', # Real Media
],
}
From 373ff6ee0ffc20ad154dd5b5339dbcff97a72487 Mon Sep 17 00:00:00 2001
From: j
Date: Sat, 1 Jan 2022 14:31:33 +0100
Subject: [PATCH 06/41] split real media
---
ox/file.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/ox/file.py b/ox/file.py
index 7a48cd6..e4fedcd 100644
--- a/ox/file.py
+++ b/ox/file.py
@@ -31,11 +31,11 @@ EXTENSIONS = {
'video': [
'3gp',
'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm2v', 'm4v', 'mkv', 'mov', 'mp4',
- 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf',
+ 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'vob', 'webm', 'wmv', 'asf',
'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD
'mxf', 'ts',
'dat', # VOD files
- 'rm', # Real Media
+ 'rm', 'rmvb', # Real Media
],
}
From 6d968d54cc6065b1b78de80845bab217e6b9406a Mon Sep 17 00:00:00 2001
From: j
Date: Mon, 18 Apr 2022 22:59:16 +0100
Subject: [PATCH 07/41] fix series creator
---
ox/web/imdb.py | 105 +++++++++++++++++++++++++------------------------
1 file changed, 53 insertions(+), 52 deletions(-)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index ac12c83..316b926 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -158,15 +158,6 @@ class Imdb(SiteParser):
'type': 'list'
},
'country': zebra_list('Country', more=['(.*?)']),
- 'creator': {
- 'page': '',
- 're': [
- '.*?
Creator.?:(.*?)
',
- ' 10:
- series['creator'] = series['director'][:1]
-
- for key in ['creator', 'country']:
- if key in series:
- self[key] = series[key]
-
- if 'year' in series:
- self['seriesYear'] = series['year']
- if 'year' not in self:
- self['year'] = series['year']
-
- if 'year' in self:
- self['episodeYear'] = self['year']
- if 'creator' in self:
- self['seriesDirector'] = self['creator']
- if 'originalTitle' in self:
- del self['originalTitle']
- else:
- for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
- if key in self:
- del self[key]
- if 'creator' in self:
- if 'director' in self:
- self['episodeDirector'] = self['director']
- self['director'] = self['creator']
#make lists unique but keep order
for key in ('director', 'language'):
@@ -581,6 +531,57 @@ class Imdb(SiteParser):
series_credit = [c for c in self['credits'] if c.get('deparment') == deparment]
if series_credit:
self[key] = [c['name'] for c in series_credit]
+ creator = []
+ for c in self.get('credits', []):
+ if '(created by)' in c['roles'] and c['name'] not in creator:
+ creator.append(c['name'])
+ if creator:
+ self['creator'] = creator
+
+ if 'series' in self:
+ series = Imdb(self['series'], timeout=timeout)
+ self['seriesTitle'] = series['title']
+ if 'episodeTitle' in self:
+ self['seriesTitle'] = series['title']
+ if 'season' in self and 'episode' in self:
+ self['title'] = "%s (S%02dE%02d) %s" % (
+ self['seriesTitle'], self['season'], self['episode'], self['episodeTitle'])
+ else:
+ self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle'])
+ self['season'] = 1
+ self['title'] = self['title'].strip()
+ if 'director' in self:
+ self['episodeDirector'] = self['director']
+
+ if 'creator' not in series and 'director' in series:
+ series['creator'] = series['director']
+ if len(series['creator']) > 10:
+ series['creator'] = series['director'][:1]
+
+ for key in ['creator', 'country']:
+ if key in series:
+ self[key] = series[key]
+
+ if 'year' in series:
+ self['seriesYear'] = series['year']
+ if 'year' not in self:
+ self['year'] = series['year']
+
+ if 'year' in self:
+ self['episodeYear'] = self['year']
+ if 'creator' in self:
+ self['seriesDirector'] = self['creator']
+ if 'originalTitle' in self:
+ del self['originalTitle']
+ else:
+ for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
+ if key in self:
+ del self[key]
+ if 'creator' in self:
+ if 'director' in self:
+ self['episodeDirector'] = self['director']
+ self['director'] = self['creator']
+
class ImdbCombined(Imdb):
def __init__(self, id, timeout=-1):
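The creator detection above keys purely on the '(created by)' role string and de-duplicates by name; the same logic in isolation, with made-up credit records:

    credits = [
        {'name': 'David Lynch', 'roles': ['(created by)']},
        {'name': 'Mark Frost', 'roles': ['(created by)']},
        {'name': 'David Lynch', 'roles': ['(created by)', 'director']},
    ]

    creator = []
    for c in credits:
        if '(created by)' in c['roles'] and c['name'] not in creator:
            creator.append(c['name'])

    print(creator)  # ['David Lynch', 'Mark Frost']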
From d9870232cb050f1bbfea23e500e5a7771849d9e6 Mon Sep 17 00:00:00 2001
From: j
Date: Mon, 18 Apr 2022 23:00:11 +0100
Subject: [PATCH 08/41] add debug
---
ox/web/siteparser.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py
index 8c212bf..6aa9e15 100644
--- a/ox/web/siteparser.py
+++ b/ox/web/siteparser.py
@@ -30,6 +30,7 @@ class SiteParser(dict):
baseUrl = ''
regex = {}
pool = ThreadPool(8)
+ debug = False
def get_url(self, page):
return "%s%s" % (self.baseUrl, page)
From a1a3de685cef3dd5cbcebeda773b9719fc581bd3 Mon Sep 17 00:00:00 2001
From: j
Date: Mon, 18 Apr 2022 23:23:01 +0100
Subject: [PATCH 09/41] more creators
---
ox/web/imdb.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index 316b926..06e3e9d 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -535,6 +535,8 @@ class Imdb(SiteParser):
for c in self.get('credits', []):
if '(created by)' in c['roles'] and c['name'] not in creator:
creator.append(c['name'])
+ if '(creator)' in c['roles'] and c['name'] not in creator:
+ creator.append(c['name'])
if creator:
self['creator'] = creator
From 8e6bea8972be3439522e20e2cd57861b2dc97118 Mon Sep 17 00:00:00 2001
From: j
Date: Tue, 14 Jun 2022 22:29:47 +0200
Subject: [PATCH 10/41] flip display_aspect_ratio if rotated
---
ox/file.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ox/file.py b/ox/file.py
index e4fedcd..885a982 100644
--- a/ox/file.py
+++ b/ox/file.py
@@ -278,6 +278,8 @@ def ffprobe(filename):
if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-180, -90, 90, 180):
v['width'], v['height'] = v['height'], v['width']
k = 'display_aspect_ratio'
+ if k in v:
+ v[k] = ':'.join(reversed(v[k].split(':')))
if k not in v and 'width' in v \
or (k in v and v[k] == '0:1'):
v[k] = '%d:%d' % (v['width'], v['height'])
From 5919345d3dea34050ce151acd6499472da6b62da Mon Sep 17 00:00:00 2001
From: j
Date: Sat, 22 Oct 2022 11:50:46 +0200
Subject: [PATCH 11/41] fix aspect ratio
---
ox/file.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/ox/file.py b/ox/file.py
index 885a982..d60fb3c 100644
--- a/ox/file.py
+++ b/ox/file.py
@@ -275,11 +275,11 @@ def ffprobe(filename):
pass
# print s
for v in info['video']:
+ k = 'display_aspect_ratio'
if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-180, -90, 90, 180):
v['width'], v['height'] = v['height'], v['width']
- k = 'display_aspect_ratio'
- if k in v:
- v[k] = ':'.join(reversed(v[k].split(':')))
+ if k in v:
+ v[k] = ':'.join(reversed(v[k].split(':')))
if k not in v and 'width' in v \
or (k in v and v[k] == '0:1'):
v[k] = '%d:%d' % (v['width'], v['height'])
From e1657994ca5cc9abb553ca244d3ccd4e7aca3b28 Mon Sep 17 00:00:00 2001
From: j
Date: Fri, 3 Feb 2023 16:28:05 +0100
Subject: [PATCH 12/41] add type json
---
ox/web/siteparser.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py
index 6aa9e15..b8b78f8 100644
--- a/ox/web/siteparser.py
+++ b/ox/web/siteparser.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
+import json
from multiprocessing.pool import ThreadPool
from six import string_types
@@ -77,6 +78,10 @@ class SiteParser(dict):
elif self.regex[key]['type'] == 'date':
parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
data = apply_f(parse_date, data)
+ elif self.regex[key]['type'] == 'json':
+ if isinstance(data, list) and len(data) == 1:
+ data = data[0]
+ data = json.loads(data)
if data:
self[key] = data
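A hedged sketch of how a regex entry using the new 'json' type might look in a SiteParser subclass; the class, page name and pattern are invented, only the 'type': 'json' handling comes from the hunk above (a single match is passed through json.loads()):

    from ox.web.siteparser import SiteParser

    class Example(SiteParser):
        baseUrl = 'https://example.com/title/'
        regex = {
            'pageData': {
                'page': 'technical',
                're': '<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
                'type': 'json',   # parsed into a dict instead of kept as a string
            },
        }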
From a3cef06ad73a1419c01c3552842b52948b178c9b Mon Sep 17 00:00:00 2001
From: j
Date: Fri, 3 Feb 2023 18:28:49 +0100
Subject: [PATCH 13/41] fix imdb parsing
---
ox/web/imdb.py | 186 ++++++++++++++++++++++++++++++++-----------------
1 file changed, 123 insertions(+), 63 deletions(-)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index 06e3e9d..d683973 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -2,12 +2,13 @@
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
+import json
import re
import time
import unicodedata
from six.moves.urllib.parse import urlencode
-from six import text_type, string_types
+from six import string_types
from .. import find_re, strip_tags, decode_html
from .. import cache
@@ -106,6 +107,89 @@ def technical(label):
}
+def tech_spec(metadata):
+ tech = {}
+ for row in metadata['props']['pageProps']['contentData']['section']['items']:
+ title = {
+ 'aspect ratio': 'aspectratio',
+ 'sound mix': 'sound',
+ }.get(row['rowTitle'].lower(), row['rowTitle'].lower())
+ tech[title] = []
+ for content in row['listContent']:
+ value = content['text']
+ tech[title].append(value)
+ return tech
+
+
+def movie_connections(metadata):
+ connections = {}
+ for row in metadata['props']['pageProps']['contentData']['categories']:
+ title = {
+ }.get(row['name'], row['name'])
+ if title not in connections:
+ connections[title] = []
+
+ for item in row['section']['items']:
+ item_ = {
+ 'id': item['id'][2:],
+ }
+
+ item_['title'] = re.compile('(.*?)').findall(item['listContent'][0]['html'])[0]
+ if len(item['listContent']) >=2:
+ item_['description'] = strip_tags(item['listContent'][1]['html'])
+ connections[title].append(item_)
+ return connections
+
+
+def get_category_by_id(metadata, id):
+ for category in metadata['props']['pageProps']['contentData']['categories']:
+ if category['id'] == id:
+ return category
+
+
+def get_release_date(metadata):
+ releases = get_category_by_id(metadata, 'releases')
+ def parse_date(d):
+ parsed = None
+ for fmt in (
+ '%B %d, %Y',
+ '%d %B %Y',
+ '%B %Y',
+ ):
+ try:
+ parsed = datetime.strptime(d, fmt)
+ break
+ except:
+ pass
+ if not parsed:
+ return None
+ return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day)
+
+ dates = []
+ for item in releases['section']['items']:
+ content = item['listContent'][0]
+ date = parse_date(content['text'])
+ if date:
+ dates.append(date)
+
+ if dates:
+ return min(dates)
+
+
+def alternative_titles(metadata):
+ titles = []
+ akas = get_category_by_id(metadata, 'akas')
+ for row in akas['section']['items']:
+ content = row['listContent'][0]
+ titles.append({
+ 'title': content['text'],
+ 'country': row['rowTitle'],
+ })
+ if content.get('subText'):
+ titles[-1]['subText'] = content['subText']
+ return titles
+
+
'''
'posterIds': {
'page': 'posters',
@@ -116,18 +200,17 @@ def technical(label):
class Imdb(SiteParser):
'''
- >>> Imdb('0068646')['title'] == text_type(u'The Godfather')
+ >>> Imdb('0068646')['title'] == 'The Godfather'
True
- >>> Imdb('0133093')['title'] == text_type(u'The Matrix')
+ >>> Imdb('0133093')['title'] == 'The Matrix'
True
'''
regex = {
'alternativeTitles': {
'page': 'releaseinfo',
're': [
- ']*?id="akas"[^>]*?>(.*?)',
- "td[^>]*?>(.*?).*?| ]*?>(.*?) | "
+ '
Date: Fri, 10 Mar 2023 17:39:31 +0100
Subject: [PATCH 14/41] imdb fixes
---
ox/web/imdb.py | 38 ++++++++++++++++++++++++++++----------
1 file changed, 28 insertions(+), 10 deletions(-)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index d683973..b541b38 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
-from __future__ import print_function
+from collections import defaultdict
import json
import re
@@ -17,7 +17,7 @@ from .. import cache
from . siteparser import SiteParser
from . import duckduckgo
from ..utils import datetime
-from ..geo import normalize_country_name
+from ..geo import normalize_country_name, get_country_name
def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
@@ -176,18 +176,36 @@ def get_release_date(metadata):
return min(dates)
+def get_entity_metadata(metadata):
+ data = {}
+ entity = metadata['props']['pageProps']['contentData']['entityMetadata']
+ data['title'] = entity['titleText']['text']
+ data['originalTitle'] = entity['originalTitleText']['text']
+ data['year'] = entity['releaseYear']['year']
+ data['plot'] = entity['plot']['plotText']['plainText']
+ data['country'] = [get_country_name(c['id']) for c in entity['countriesOfOrigin']['countries']]
+ data['poster'] = metadata['props']['pageProps']['contentData']['posterData']['image']['url']
+ return data
+
+
def alternative_titles(metadata):
- titles = []
+ titles = defaultdict(list)
akas = get_category_by_id(metadata, 'akas')
+
+ skip = [
+ metadata['props']['pageProps']['contentData']['entityMetadata']['titleText']['text'],
+ metadata['props']['pageProps']['contentData']['entityMetadata']['originalTitleText']['text']
+ ]
for row in akas['section']['items']:
content = row['listContent'][0]
- titles.append({
- 'title': content['text'],
- 'country': row['rowTitle'],
- })
- if content.get('subText'):
- titles[-1]['subText'] = content['subText']
- return titles
+ title = content['text']
+ country = row['rowTitle']
+ if title in skip:
+ continue
+ titles[title].append(country)
+ #if content.get('subText'):
+ # titles[-1]['subText'] = content['subText']
+ return [kv for kv in titles.items()]
'''
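get_entity_metadata() and alternative_titles() assume the embedded page JSON has roughly this shape; the snippet below is heavily trimmed and the values are invented, it only illustrates the keys the code above reads:

    metadata = {'props': {'pageProps': {'contentData': {
        'entityMetadata': {
            'titleText': {'text': 'The Matrix'},
            'originalTitleText': {'text': 'The Matrix'},
            'releaseYear': {'year': 1999},
            'plot': {'plotText': {'plainText': 'A computer hacker learns ...'}},
            'countriesOfOrigin': {'countries': [{'id': 'US'}]},
        },
        'posterData': {'image': {'url': 'https://example.com/poster.jpg'}},
    }}}}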
From baec9c4ea67e2f2c6a11f334bbc20d82b30bc4a3 Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 6 Jul 2023 17:07:11 +0530
Subject: [PATCH 15/41] 180/-180 rotation does not change width/height
---
ox/file.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ox/file.py b/ox/file.py
index d60fb3c..01623ca 100644
--- a/ox/file.py
+++ b/ox/file.py
@@ -276,7 +276,7 @@ def ffprobe(filename):
# print s
for v in info['video']:
k = 'display_aspect_ratio'
- if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-180, -90, 90, 180):
+ if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-90, 90):
v['width'], v['height'] = v['height'], v['width']
if k in v:
v[k] = ':'.join(reversed(v[k].split(':')))
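Only a ±90° rotation swaps the frame dimensions, and when it does the stored display_aspect_ratio has to be flipped as well; the flip in isolation, on a hypothetical stream dict:

    v = {'width': 1920, 'height': 1080, 'display_aspect_ratio': '16:9'}
    rotate = 90   # as it would appear in info['metadata']['rotate']

    if rotate in (-90, 90):
        v['width'], v['height'] = v['height'], v['width']
        k = 'display_aspect_ratio'
        if k in v:
            v[k] = ':'.join(reversed(v[k].split(':')))

    print(v)  # {'width': 1080, 'height': 1920, 'display_aspect_ratio': '9:16'}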
From e6782b3c171eae34a41d7700482841cb2b7441df Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 6 Jul 2023 18:13:26 +0530
Subject: [PATCH 16/41] not all movies have connections
---
ox/web/imdb.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index b541b38..e9f1973 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -123,6 +123,8 @@ def tech_spec(metadata):
def movie_connections(metadata):
connections = {}
+ if 'props' not in metadata:
+ return connections
for row in metadata['props']['pageProps']['contentData']['categories']:
title = {
}.get(row['name'], row['name'])
From d630f4b19c63808fadd76bc774982f27d1f82d70 Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 6 Jul 2023 18:32:45 +0530
Subject: [PATCH 17/41] parse keywords
---
ox/web/imdb.py | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index e9f1973..755a63e 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -178,6 +178,17 @@ def get_release_date(metadata):
return min(dates)
+def get_keywords(metadata):
+ try:
+ keywords = [
+ row['rowTitle']
+ for row in metadata['props']['pageProps']['contentData']['section']['items']
+ ]
+ except:
+ keywords = []
+ return keywords
+
+
def get_entity_metadata(metadata):
data = {}
entity = metadata['props']['pageProps']['contentData']['entityMetadata']
@@ -276,11 +287,6 @@ class Imdb(SiteParser):
'gross': zebra_table('Cumulative Worldwide Gross', more=[
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
], type='int'),
- 'keyword': {
- 'page': 'keywords',
- 're': 'data-item-keyword="(.*?)"',
- 'type': 'list'
- },
'language': zebra_list('Language', more=['(.*?)']),
'originalTitle': {
'page': 'releaseinfo',
@@ -543,6 +549,11 @@ class Imdb(SiteParser):
if releasedate:
self['releasedate'] = releasedate
+ metadata = self.get_page_data('keywords')
+ keywords = get_keywords(metadata)
+ if keywords:
+ self['keywords'] = keywords
+
if 'summary' not in self and 'storyline' in self:
self['summary'] = self.pop('storyline')
if 'summary' in self:
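get_keywords() simply collects the rowTitle of every item in the keywords page data; a trimmed, invented example of the structure it expects:

    metadata = {'props': {'pageProps': {'contentData': {'section': {'items': [
        {'rowTitle': 'artificial reality'},
        {'rowTitle': 'dystopia'},
    ]}}}}}

    keywords = [
        row['rowTitle']
        for row in metadata['props']['pageProps']['contentData']['section']['items']
    ]
    print(keywords)  # ['artificial reality', 'dystopia']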
From 4feacb4a97aafd442cc9ec77162b1b7efaa83182 Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 6 Jul 2023 18:34:08 +0530
Subject: [PATCH 18/41] don't pass X-Forwarded-For header
---
ox/web/imdb.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index 755a63e..dffb6e4 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -23,7 +23,7 @@ from ..geo import normalize_country_name, get_country_name
def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
# https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
- headers['X-Forwarded-For'] = '72.21.206.80'
+ #headers['X-Forwarded-For'] = '72.21.206.80'
headers['Accept-Language'] = 'en'
return url, data, headers, timeout, unicode
From 4b531c55aafa5c11bc4e8aa8540e74f9f32929c0 Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 6 Jul 2023 18:35:13 +0530
Subject: [PATCH 19/41] use requests for api
---
ox/api.py | 73 +++++++++++++++++++-----------------------------
requirements.txt | 1 +
2 files changed, 30 insertions(+), 44 deletions(-)
diff --git a/ox/api.py b/ox/api.py
index 894e26d..639fec0 100644
--- a/ox/api.py
+++ b/ox/api.py
@@ -4,6 +4,7 @@
from __future__ import print_function
from types import MethodType
import gzip
+import mimetypes
import os
import shutil
import sys
@@ -13,6 +14,7 @@ from six.moves import http_cookiejar as cookielib
from six import BytesIO, PY2
from six.moves import urllib
from six.moves.urllib.parse import urlparse
+import requests
from . import __version__
from .utils import json
@@ -37,12 +39,13 @@ class API(object):
self._cj = cj
else:
self._cj = cookielib.CookieJar()
- self._opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self._cj),
- urllib.request.HTTPHandler(debuglevel=self.debuglevel))
- self._opener.addheaders = [
- ('User-Agent', '%s/%s' % (self.__name__, self.__version__))
- ]
+ self._requests_session = requests.Session()
+ self._requests_session.cookies = self._cj
+ self._requests_session.headers = {
+ 'User-Agent': '%s/%s' % (self.__name__, self.__version__),
+ 'Accept-Encoding': 'gzip, deflate',
+ }
self.url = url
r = self._request('api', {'docs': True})
self._properties = r['data']['actions']
@@ -76,26 +79,12 @@ class API(object):
method.func_name = action
self._add_method(method, action)
- def _json_request(self, url, form):
+ def _json_request(self, url, data, files=None):
result = {}
try:
- body = form.body()
- if PY2:
- if not isinstance(url, bytes):
- url = url.encode('utf-8')
- request = urllib.request.Request(url)
- request.add_data(body)
- else:
- request = urllib.request.Request(url, data=body, method='POST')
- request.add_header('Content-Type', form.get_content_type())
- request.add_header('Content-Length', str(len(body)))
- request.add_header('Accept-Encoding', 'gzip, deflate')
- f = self._opener.open(request)
- result = f.read()
- if f.headers.get('content-encoding', None) == 'gzip':
- result = gzip.GzipFile(fileobj=BytesIO(result)).read()
- result = result.decode('utf-8')
- return json.loads(result)
+ request = self._requests_session.post(url, data=data, files=files)
+ result = request.json()
+ return result
except urllib.error.HTTPError as e:
if self.DEBUG:
import webbrowser
@@ -125,17 +114,15 @@ class API(object):
raise
def _request(self, action, data=None):
- form = MultiPartForm()
- form.add_field('action', action)
+ form = {
+ 'action': action
+ }
if data:
- form.add_field('data', json.dumps(data))
+ form['data'] = json.dumps(data)
return self._json_request(self.url, form)
def get_url(self, url):
- request = urllib.request.Request(url, method='GET')
- f = self._opener.open(request)
- result = f.read()
- return result
+ return self._requests_session.get(url).content
def save_url(self, url, filename, overwrite=False):
chunk_size = 16 * 1024
@@ -143,21 +130,15 @@ class API(object):
dirname = os.path.dirname(filename)
if dirname and not os.path.exists(dirname):
os.makedirs(dirname)
- request = urllib.request.Request(url, method='GET')
tmpname = filename + '.tmp'
with open(tmpname, 'wb') as fd:
- u = self._opener.open(request)
- for chunk in iter(lambda: u.read(chunk_size), b''):
+ r = self._requests_session.get(url)
+ for chunk in iter(lambda: r.read(chunk_size), b''):
fd.write(chunk)
shutil.move(tmpname, filename)
-
def upload_chunks(self, url, filename, data=None, silent=False):
- form = MultiPartForm()
- if data:
- for key in data:
- form.add_field(key, data[key])
- data = self._json_request(url, form)
+ data = self._json_request(url, data)
def full_url(path):
if path.startswith('/'):
@@ -178,16 +159,20 @@ class API(object):
resume_offset = 0
chunk = f.read(CHUNK_SIZE)
fname = os.path.basename(filename)
+ mime_type = mimetypes.guess_type(fname)[0] or 'application/octet-stream'
if not isinstance(fname, bytes):
fname = fname.encode('utf-8')
while chunk:
- form = MultiPartForm()
- form.add_file('chunk', fname, chunk)
+ meta = {
+ 'offset': str(done)
+ }
if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
- form.add_field('done', '1')
- form.add_field('offset', str(done))
+ meta['done'] = '1'
+ files = [
+ ('chunk', (fname, chunk, mime_type))
+ ]
try:
- data = self._json_request(uploadUrl, form)
+ data = self._json_request(uploadUrl, meta, files=files)
except KeyboardInterrupt:
if not slient:
print("\ninterrupted by user.")
diff --git a/requirements.txt b/requirements.txt
index 51c3f99..4e7d966 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
chardet
six>=1.5.2
lxml
+requests
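With requests, each chunk is sent as one multipart/form-data POST: plain form fields go in data= and the file part in files=; a minimal standalone sketch of the request shape used above (URL and payload invented):

    import requests

    session = requests.Session()
    meta = {'offset': '0'}                 # plus 'done': '1' on the last chunk
    files = [('chunk', ('movie.mkv', b'\x00' * 16, 'application/octet-stream'))]

    # requests builds the multipart body and Content-Type header itself
    r = session.post('https://example.com/api/upload', data=meta, files=files)
    print(r.json())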
From 16f1c35875da0a628706deb13ee9707a22377d78 Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 6 Jul 2023 18:37:23 +0530
Subject: [PATCH 20/41] keywords=>keyword
---
ox/web/imdb.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index dffb6e4..c9844fa 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -552,7 +552,7 @@ class Imdb(SiteParser):
metadata = self.get_page_data('keywords')
keywords = get_keywords(metadata)
if keywords:
- self['keywords'] = keywords
+ self['keyword'] = keywords
if 'summary' not in self and 'storyline' in self:
self['summary'] = self.pop('storyline')
From 2d5171bb3f3cb441c028b95962af7b99f969c849 Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 6 Jul 2023 18:44:32 +0530
Subject: [PATCH 21/41] fix filmingLocations
---
ox/web/imdb.py | 23 +++++++++++++++--------
1 file changed, 15 insertions(+), 8 deletions(-)
diff --git a/ox/web/imdb.py b/ox/web/imdb.py
index c9844fa..5da8795 100644
--- a/ox/web/imdb.py
+++ b/ox/web/imdb.py
@@ -177,6 +177,16 @@ def get_release_date(metadata):
if dates:
return min(dates)
+def get_locations(metadata):
+ try:
+ keywords = [
+ row['cardText']
+ for row in metadata['props']['pageProps']['contentData']['categories'][0]['section']['items']
+ ]
+ except:
+ keywords = []
+ return keywords
+
def get_keywords(metadata):
try:
@@ -275,14 +285,6 @@ class Imdb(SiteParser):
're': '