From b7979779fee60b41319da3041994fc968ad73216 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 22 Jul 2019 10:38:14 +0200 Subject: [PATCH 01/19] .rec files --- ox/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/file.py b/ox/file.py index ab789a3..956f215 100644 --- a/ox/file.py +++ b/ox/file.py @@ -32,7 +32,7 @@ EXTENSIONS = { 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf', 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD - 'mxf', 'ts' + 'mxf', 'ts', 'rec', ], } From 8c14d28aa26e86002fe3a05b69534d7e6e9dfe77 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 22 Jul 2019 17:11:46 +0200 Subject: [PATCH 02/19] remove rec again, not a real format --- ox/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/file.py b/ox/file.py index 956f215..ab789a3 100644 --- a/ox/file.py +++ b/ox/file.py @@ -32,7 +32,7 @@ EXTENSIONS = { 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf', 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD - 'mxf', 'ts', 'rec', + 'mxf', 'ts' ], } From fb8b33d916660e59aaefedb844a6b64e285979ac Mon Sep 17 00:00:00 2001 From: j Date: Tue, 23 Jul 2019 16:09:07 +0200 Subject: [PATCH 03/19] fix variable name --- ox/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/api.py b/ox/api.py index 51742c2..159aec7 100644 --- a/ox/api.py +++ b/ox/api.py @@ -201,7 +201,7 @@ class API(object): return False if data['status']['code'] != 200: print("request returned error, will try again in 5 seconds") - if DEBUG: + if self.DEBUG: print(data) time.sleep(5) if data and data.get('result') == 1: From 9c90aaa5f8e6f17f22dc71f3eeba0a4bf40d2f15 Mon Sep 17 00:00:00 2001 From: j Date: Tue, 23 Jul 2019 16:24:06 +0200 Subject: [PATCH 04/19] imdb can also be 8 digits --- ox/web/imdb.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 4821b0c..db745bf 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -267,7 +267,7 @@ class Imdb(SiteParser): }, 'series': { 'page': 'reference', - 're': '

.*?.*?(.*?)').findall(data) + #cc[rel] = re.compile('(.*?)').findall(data) def get_conn(c): r = { 'id': c[0], @@ -432,7 +432,7 @@ class Imdb(SiteParser): if len(description) == 2 and description[-1].strip() != '-': r['description'] = description[-1].strip() return r - cc[rel] = list(map(get_conn, re.compile('(.*?)(.*?)<\/div', re.DOTALL).findall(data))) + cc[rel] = list(map(get_conn, re.compile('(.*?)(.*?)<\/div', re.DOTALL).findall(data))) self['connections'] = cc @@ -618,7 +618,7 @@ def get_movie_by_title(title, timeout=-1): url = "http://akas.imdb.com/find?" + params data = read_url(url, timeout=timeout, unicode=True) #if search results in redirect, get id of current page - r = '' + r = '' results = re.compile(r).findall(data) if results: return results[0] @@ -697,12 +697,12 @@ def get_movie_id(title, director='', year='', timeout=-1): data = read_url(url, timeout=timeout, unicode=True) #if search results in redirect, get id of current page - r = '' + r = '' results = re.compile(r).findall(data) if results: return results[0] #otherwise get first result - r = '.*?.*?
S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): + for e in re.compile('
.*?
S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0] else: data = cache.read_url(url) From d632cd3803c8f42f75593fc74bfa626422fcf3ba Mon Sep 17 00:00:00 2001 From: j Date: Tue, 23 Jul 2019 16:42:20 +0200 Subject: [PATCH 05/19] match as many digits as possible --- ox/web/imdb.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index db745bf..7d91dc7 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -267,7 +267,7 @@ class Imdb(SiteParser): }, 'series': { 'page': 'reference', - 're': '

.*?.*?(.*?)').findall(data) + #cc[rel] = re.compile('(.*?)').findall(data) def get_conn(c): r = { 'id': c[0], @@ -432,7 +432,7 @@ class Imdb(SiteParser): if len(description) == 2 and description[-1].strip() != '-': r['description'] = description[-1].strip() return r - cc[rel] = list(map(get_conn, re.compile('(.*?)(.*?)<\/div', re.DOTALL).findall(data))) + cc[rel] = list(map(get_conn, re.compile('(.*?)(.*?)<\/div', re.DOTALL).findall(data))) self['connections'] = cc @@ -618,7 +618,7 @@ def get_movie_by_title(title, timeout=-1): url = "http://akas.imdb.com/find?" + params data = read_url(url, timeout=timeout, unicode=True) #if search results in redirect, get id of current page - r = '' + r = '' results = re.compile(r).findall(data) if results: return results[0] @@ -697,12 +697,12 @@ def get_movie_id(title, director='', year='', timeout=-1): data = read_url(url, timeout=timeout, unicode=True) #if search results in redirect, get id of current page - r = '' + r = '' results = re.compile(r).findall(data) if results: return results[0] #otherwise get first result - r = '.*?.*?
S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): + for e in re.compile('
.*?
S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0] else: data = cache.read_url(url) From b49acd47dc9d8da00226ba7f7eef712f2c5fdae2 Mon Sep 17 00:00:00 2001 From: j Date: Thu, 1 Aug 2019 16:28:00 +0200 Subject: [PATCH 06/19] load subtitle info --- ox/file.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ox/file.py b/ox/file.py index ab789a3..58f2a49 100644 --- a/ox/file.py +++ b/ox/file.py @@ -293,6 +293,18 @@ def ffprobe(filename): 'sample_aspect_ratio': 'pixel_aspect_ratio', }.get(key, key)] = fix_value(key, s[key]) info[s['codec_type']].append(stream) + elif s.get('codec_type') == 'subtitle': + info['subtitles'] = info.get('subtitles', []) + stream = {} + for key in ( + 'codec_name', 'language' + ): + if key in s: + stream[{ + 'codec_name': 'codec', + + }.get(key, key)] = s[key] + info['subtitles'].append(stream) else: pass # print s From 23a641189ca5d4b5ea592547ac5e712ba4d6e406 Mon Sep 17 00:00:00 2001 From: j Date: Thu, 1 Aug 2019 20:54:04 +0200 Subject: [PATCH 07/19] fix subtitle language --- ox/file.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ox/file.py b/ox/file.py index 58f2a49..177ca1f 100644 --- a/ox/file.py +++ b/ox/file.py @@ -296,13 +296,17 @@ def ffprobe(filename): elif s.get('codec_type') == 'subtitle': info['subtitles'] = info.get('subtitles', []) stream = {} + if language and language != 'und': + stream['language'] = language for key in ( - 'codec_name', 'language' + 'codec_name', + 'language', + 'width', + 'height', ): if key in s: stream[{ 'codec_name': 'codec', - }.get(key, key)] = s[key] info['subtitles'].append(stream) else: From e78519998da1ea897c9767fb0dcc40f28c3c2738 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 2 Aug 2019 14:23:07 +0200 Subject: [PATCH 08/19] use requests session --- ox/cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ox/cache.py b/ox/cache.py index 904f31d..b5f9a9e 100644 --- a/ox/cache.py +++ b/ox/cache.py @@ -16,6 +16,7 @@ from six import PY2 try: import requests USE_REQUESTS = True + requests_session = requests.Session() except: USE_REQUESTS = False @@ -101,7 +102,7 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un url_headers = {} if not result: if USE_REQUESTS: - r = requests.get(url, headers=headers) + r = requests_session.get(url, headers=headers) for key in r.headers: url_headers[key.lower()] = r.headers[key] result = r.content From d84503055748c6675ff6938d498dc58eb952a7f8 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 2 Aug 2019 16:26:22 +0200 Subject: [PATCH 09/19] 8 digit imdb ids --- ox/web/piratecinema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/web/piratecinema.py b/ox/web/piratecinema.py index 4ed946b..c452f04 100644 --- a/ox/web/piratecinema.py +++ b/ox/web/piratecinema.py @@ -8,7 +8,7 @@ from ox.net import read_url def get_poster_url(id): url = 'http://piratecinema.org/posters/' html = read_url(url).decode('utf-8') - results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html) + results = re.compile('src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html) for result in results: if result[1] == id: return url + result[0] From cc1bad76cdcac8af5f53b3f12f67ed8c948f3037 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 3 Aug 2019 23:35:16 +0200 Subject: [PATCH 10/19] update user agent --- ox/net.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ox/net.py b/ox/net.py index 2a5e71b..59e6abe 100644 --- a/ox/net.py +++ 
b/ox/net.py @@ -21,7 +21,7 @@ from chardet.universaldetector import UniversalDetector DEBUG = False # Default headers for HTTP requests. DEFAULT_HEADERS = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4', From 388f33ebb629c56751dc5bef6c6bbe2d33f60876 Mon Sep 17 00:00:00 2001 From: j Date: Sat, 3 Aug 2019 23:38:31 +0200 Subject: [PATCH 11/19] cache imdb urls in parallel --- ox/web/siteparser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py index 61a79bd..8c212bf 100644 --- a/ox/web/siteparser.py +++ b/ox/web/siteparser.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 import re +from multiprocessing.pool import ThreadPool from six import string_types @@ -28,6 +29,7 @@ def cleanup(key, data, data_type): class SiteParser(dict): baseUrl = '' regex = {} + pool = ThreadPool(8) def get_url(self, page): return "%s%s" % (self.baseUrl, page) @@ -39,6 +41,9 @@ def __init__(self, timeout=-1): self._cache = {} + urls = list(set(self.get_url(self.regex[key]['page']) for key in self.regex)) + self.pool.map(lambda url: self.read_url(url, timeout), urls) + for key in self.regex: url = self.get_url(self.regex[key]['page']) data = self.read_url(url, timeout) From 665a4038b2df1222b9584ec950c3c35f1fe81a01 Mon Sep 17 00:00:00 2001 From: j Date: Thu, 8 Aug 2019 17:08:13 +0200 Subject: [PATCH 12/19] space --- ox/web/wikipedia.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py index cb73758..de8b064 100644 --- a/ox/web/wikipedia.py +++ b/ox/web/wikipedia.py @@ -17,7 +17,7 @@ def get_id(url): def get_url(id=None, imdb=None, allmovie=None): if imdb: - query = '"%s"'% imdb + query = '"%s"' % imdb result = find(query) if result: url = result[0][1] @@ -26,7 +26,7 @@ return url return "" if allmovie: - query = '"amg_id = 1:%s"'% allmovie + query = '"amg_id = 1:%s"' % allmovie result = find(query) if result: url = result[0][1] @@ -140,7 +140,7 @@ def get_allmovie_id(wikipedia_url): return data.get('amg_id', '') def find(query, max_results=10): - query = {'action': 'query', 'list':'search', 'format': 'json', + query = {'action': 'query', 'list': 'search', 'format': 'json', 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')} url = "http://en.wikipedia.org/w/api.php?" 
+ urllib.parse.urlencode(query) data = read_url(url) From cef85fc4defb57be8442bf83bdd3fbc1751e3ce1 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 15 Nov 2019 14:51:13 +0100 Subject: [PATCH 13/19] depend on lxml --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index b7509ec..51c3f99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ chardet six>=1.5.2 +lxml From 03c119155081f7b9f65e1f55d3a58708c9dc6704 Mon Sep 17 00:00:00 2001 From: j Date: Fri, 15 Nov 2019 14:51:32 +0100 Subject: [PATCH 14/19] fall back to storyline for summary --- ox/web/imdb.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 7d91dc7..2f14e33 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -199,6 +199,11 @@ class Imdb(SiteParser): 'summary': zebra_table('Plot Summary', more=[ '

(.*?)Storyline

.*?

(.*?)

', + 'type': 'string' + }, 'posterId': { 'page': 'reference', 're': '', @@ -517,10 +522,13 @@ class Imdb(SiteParser): ]) if self['releasedate'] == 'x': del self['releasedate'] + + if 'summary' not in self and 'storyline' in self: + self['summary'] = self.pop('storyline') if 'summary' in self: if isinstance(self['summary'], list): self['summary'] = self['summary'][0] - self['summary'] = self['summary'].split(' Date: Sat, 21 Dec 2019 20:18:19 +0200 Subject: [PATCH 15/19] srt fixes --- ox/srt.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/ox/srt.py b/ox/srt.py index 5191a55..c29ae8b 100644 --- a/ox/srt.py +++ b/ox/srt.py @@ -63,10 +63,6 @@ def load(filename, offset=0): Returns list with dicts that have in, out, value and id ''' srt = [] - - def parse_time(t): - return offset + ox.time2ms(t.replace(',', '.')) / 1000 - with open(filename, 'rb') as f: encoding = _detect_encoding(f) data = f.read() @@ -77,7 +73,21 @@ def load(filename, offset=0): data = data.decode('latin-1') except: print("failed to detect encoding, giving up") - return srt + return [] + return loads(data, offset) + +def loads(data, offset=0): + '''Parses an srt file + + filename: path to an srt file + offset (float, seconds): shift all in/out points by offset + + Returns list with dicts that have in, out, value and id + ''' + srt = [] + + def parse_time(t): + return offset + ox.time2ms(t.replace(',', '.')) / 1000 data = data.replace('\r\n', '\n') if not data.endswith('\n\n'): From 3574be2975bca8fbc1a42634e79e4ee9a088054a Mon Sep 17 00:00:00 2001 From: j Date: Sat, 21 Dec 2019 20:29:44 +0200 Subject: [PATCH 16/19] don't fall back to ffmpeg2theora --- ox/file.py | 45 ++------------------------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/ox/file.py b/ox/file.py index 177ca1f..ec9da4b 100644 --- a/ox/file.py +++ b/ox/file.py @@ -159,51 +159,10 @@ def avinfo(filename, cached=True): if os.path.getsize(filename): if find_executable('ffprobe'): return ffprobe(filename) - ffmpeg2theora = cmd('ffmpeg2theora') - p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, error = p.communicate() - stdout = stdout.decode('utf-8') - version = stdout.split('\n')[0].split(' - ')[0].split(' ')[-1] - if version < '0.27': - raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version) - p = subprocess.Popen([ffmpeg2theora, '--info', filename], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, error = p.communicate() - stdout = stdout.decode('utf-8') - try: - info = json.loads(stdout) - except: - # remove metadata, can be broken - reg = re.compile('"metadata": {.*?},', re.DOTALL) - stdout = re.sub(reg, '', stdout) - info = json.loads(stdout) - if 'video' in info: - for v in info['video']: - if 'display_aspect_ratio' not in v and 'width' in v: - v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height']) - v['pixel_aspect_ratio'] = '1:1' - if len(info.get('audio', [])) > 1: - if 'metadata' in info['audio'][0]: - for stream in info['audio']: - language = stream.get('metadata', {}).get('language') - if language and language != 'und': - stream['language'] = language[0] - else: - ffmpeg = cmd('ffmpeg') - p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - stderr = stderr.decode('utf-8') - languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l] - if 
len(languages) == len(info['audio']): - for i, stream in enumerate(info['audio']): - language = languages[i] - if language and language[0] != 'und': - stream['language'] = language[0] - fix_coverart(info) - return info - + raise EnvironmentError('could not find ffprobe. please install ffmpeg') return {'path': filename, 'size': 0} + def ffprobe(filename): p = subprocess.Popen([ cmd('ffprobe'), From da51407e7d066b8efd621acb7a9f3c997cc75b51 Mon Sep 17 00:00:00 2001 From: j Date: Wed, 5 Feb 2020 16:51:28 +0100 Subject: [PATCH 17/19] fix alternative titles --- ox/web/imdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 2f14e33..7ccc63d 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -124,8 +124,8 @@ class Imdb(SiteParser): 'alternativeTitles': { 'page': 'releaseinfo', 're': [ - '<table[^>]*?id="akas"[^>]*?>(.*?)</table>', - "td>(.*?)</td>.*?<td>(.*?)</td>" + '<table[^>]*?id="akas"[^>]*?>(.*?)</table>', + "td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>" ], 'type': 'list' }, From 09e0a521af7adf01c6d362250decd1fd55d06558 Mon Sep 17 00:00:00 2001 From: j Date: Tue, 18 Feb 2020 16:27:25 +0100 Subject: [PATCH 18/19] add Accept-Language, use akas --- ox/web/imdb.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 7ccc63d..272185b 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -23,6 +23,8 @@ def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cac headers = headers.copy() # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau headers['X-Forwarded-For'] = '72.21.206.80' + headers['Accept-Language'] = 'en' + return url, data, headers, timeout, unicode def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False): @@ -34,7 +36,7 @@ def delete_url(url, data=None, headers=cache.DEFAULT_HEADERS): cache.store.delete(url, data, headers) def get_url(id): - return "http://www.imdb.com/title/tt%s/" % id + return "http://akas.imdb.com/title/tt%s/" % id def reference_section(id): From 926b8ad2550a6bcde54bbd8f421172969f4b16a6 Mon Sep 17 00:00:00 2001 From: j Date: Tue, 18 Feb 2020 16:59:08 +0100 Subject: [PATCH 19/19] fight geolocalization --- ox/web/imdb.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 272185b..fb109be 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -379,6 +379,9 @@ class Imdb(SiteParser): if 'alternativeTitles' in self: alt = {} + for t in self['alternativeTitles']: + if t[0].strip() in ('World-wide (English title)', ): + self['title'] = cleanup_title(t[1]) for t in self['alternativeTitles']: title = cleanup_title(t[1]) if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
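
Note on PATCH 16: with the ffmpeg2theora and ffmpeg fallbacks removed, avinfo() depends entirely on ffprobe's JSON output, and systems without ffmpeg now fail loudly with EnvironmentError instead of probing via ffmpeg2theora. The sketch below illustrates the kind of probing this implies, assuming only that ffmpeg's ffprobe binary is on PATH; probe() and the input filename are hypothetical, for illustration, not part of ox.file:

    import json
    import subprocess

    def probe(filename):
        # Ask ffprobe for container-level and per-stream metadata as JSON;
        # ox.file.ffprobe() post-processes output of this shape.
        out = subprocess.check_output([
            'ffprobe', '-v', 'error',
            '-show_format', '-show_streams',
            '-print_format', 'json',
            filename,
        ])
        return json.loads(out)

    info = probe('movie.mkv')  # hypothetical input file
    for s in info.get('streams', []):
        if s.get('codec_type') == 'subtitle':
            # In ffprobe's JSON layout the stream language, if present, lives
            # in the stream's tags; 'und' marks an undetermined language,
            # which is why patches 06 and 07 skip it when they build
            # info['subtitles'].
            language = s.get('tags', {}).get('language')
            print(s.get('codec_name'), language)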