Compare commits
19 commits
2026b64faf ... 926b8ad255
| SHA1 |
|---|
| 926b8ad255 |
| 09e0a521af |
| da51407e7d |
| 3574be2975 |
| 75cfe74877 |
| 03c1191550 |
| cef85fc4de |
| 665a4038b2 |
| 388f33ebb6 |
| cc1bad76cd |
| d845030557 |
| e78519998d |
| 23a641189c |
| b49acd47dc |
| d632cd3803 |
| 9c90aaa5f8 |
| fb8b33d916 |
| 8c14d28aa2 |
| b7979779fe |
10 changed files with 72 additions and 67 deletions
@@ -201,7 +201,7 @@ class API(object):
             return False
         if data['status']['code'] != 200:
             print("request returned error, will try again in 5 seconds")
-            if DEBUG:
+            if self.DEBUG:
                 print(data)
             time.sleep(5)
         if data and data.get('result') == 1:
@@ -16,6 +16,7 @@ from six import PY2
 try:
     import requests
     USE_REQUESTS = True
+    requests_session = requests.Session()
 except:
     USE_REQUESTS = False

@@ -101,7 +102,7 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un
     url_headers = {}
     if not result:
         if USE_REQUESTS:
-            r = requests.get(url, headers=headers)
+            r = requests_session.get(url, headers=headers)
             for key in r.headers:
                 url_headers[key.lower()] = r.headers[key]
             result = r.content
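The two hunks above replace per-call `requests.get` with a shared, module-level `requests.Session()`, which pools TCP connections (and carries cookies) across calls instead of reconnecting for every request. A minimal sketch of the pattern — the `fetch` helper and URL are illustrative, not part of the diff:

```python
import requests

# One shared session: connections to the same host are kept alive
# and reused across calls.
session = requests.Session()

def fetch(url, headers=None):
    # Illustrative helper; repeated calls to one host hit the session's
    # connection pool rather than re-establishing TCP/TLS each time.
    r = session.get(url, headers=headers)
    r.raise_for_status()
    return r.content
```

For a scraper that reads many pages from one site, this saves a handshake per request.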
ox/file.py (61 lines changed)
@@ -159,51 +159,10 @@ def avinfo(filename, cached=True):
     if os.path.getsize(filename):
         if find_executable('ffprobe'):
             return ffprobe(filename)
-        ffmpeg2theora = cmd('ffmpeg2theora')
-        p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        stdout, error = p.communicate()
-        stdout = stdout.decode('utf-8')
-        version = stdout.split('\n')[0].split(' - ')[0].split(' ')[-1]
-        if version < '0.27':
-            raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version)
-        p = subprocess.Popen([ffmpeg2theora, '--info', filename],
-            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        stdout, error = p.communicate()
-        stdout = stdout.decode('utf-8')
-        try:
-            info = json.loads(stdout)
-        except:
-            # remove metadata, can be broken
-            reg = re.compile('"metadata": {.*?},', re.DOTALL)
-            stdout = re.sub(reg, '', stdout)
-            info = json.loads(stdout)
-        if 'video' in info:
-            for v in info['video']:
-                if 'display_aspect_ratio' not in v and 'width' in v:
-                    v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height'])
-                    v['pixel_aspect_ratio'] = '1:1'
-        if len(info.get('audio', [])) > 1:
-            if 'metadata' in info['audio'][0]:
-                for stream in info['audio']:
-                    language = stream.get('metadata', {}).get('language')
-                    if language and language != 'und':
-                        stream['language'] = language[0]
-            else:
-                ffmpeg = cmd('ffmpeg')
-                p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-                stdout, stderr = p.communicate()
-                stderr = stderr.decode('utf-8')
-                languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l]
-                if len(languages) == len(info['audio']):
-                    for i, stream in enumerate(info['audio']):
-                        language = languages[i]
-                        if language and language[0] != 'und':
-                            stream['language'] = language[0]
-        fix_coverart(info)
-        return info
+        raise EnvironmentError('could to find ffprobe. please install ffmpeg')
     return {'path': filename, 'size': 0}


 def ffprobe(filename):
     p = subprocess.Popen([
         cmd('ffprobe'),
@@ -293,6 +252,22 @@ def ffprobe(filename):
                         'sample_aspect_ratio': 'pixel_aspect_ratio',
                     }.get(key, key)] = fix_value(key, s[key])
             info[s['codec_type']].append(stream)
+        elif s.get('codec_type') == 'subtitle':
+            info['subtitles'] = info.get('subtitles', [])
+            stream = {}
+            if language and language != 'und':
+                stream['language'] = language
+            for key in (
+                'codec_name',
+                'language',
+                'width',
+                'height',
+            ):
+                if key in s:
+                    stream[{
+                        'codec_name': 'codec',
+                    }.get(key, key)] = s[key]
+            info['subtitles'].append(stream)
         else:
             pass
             # print s
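The new `elif` branch collects subtitle streams (`codec_type == 'subtitle'`) from ffprobe's stream list instead of silently dropping them. For context, a standalone sketch of the underlying ffprobe call — the flags are standard ffprobe options, the helper name is illustrative:

```python
import json
import subprocess

def subtitle_streams(path):
    # Dump stream metadata as JSON and keep only subtitle streams,
    # i.e. entries whose codec_type is 'subtitle'.
    out = subprocess.check_output([
        'ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_streams', path
    ])
    return [s for s in json.loads(out).get('streams', [])
            if s.get('codec_type') == 'subtitle']
```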
@@ -21,7 +21,7 @@ from chardet.universaldetector import UniversalDetector
 DEBUG = False
 # Default headers for HTTP requests.
 DEFAULT_HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
ox/srt.py (20 lines changed)
@@ -63,10 +63,6 @@ def load(filename, offset=0):
     Returns list with dicts that have in, out, value and id
     '''
     srt = []
-
-    def parse_time(t):
-        return offset + ox.time2ms(t.replace(',', '.')) / 1000
-
     with open(filename, 'rb') as f:
         encoding = _detect_encoding(f)
         data = f.read()
@@ -77,7 +73,21 @@ def load(filename, offset=0):
             data = data.decode('latin-1')
         except:
             print("failed to detect encoding, giving up")
-            return srt
+            return []
+    return loads(data, offset)
+
+def loads(data, offset=0):
+    '''Parses an srt file
+
+    filename: path to an srt file
+    offset (float, seconds): shift all in/out points by offset
+
+    Returns list with dicts that have in, out, value and id
+    '''
+    srt = []
+
+    def parse_time(t):
+        return offset + ox.time2ms(t.replace(',', '.')) / 1000

     data = data.replace('\r\n', '\n')
     if not data.endswith('\n\n'):
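This refactor moves the parsing body into a new `loads()`, leaving `load()` to read and decode the file and then delegate — so subtitles can now also be parsed from text already in memory. A hedged usage sketch, assuming the package imports as `ox` and using made-up sample data:

```python
import ox.srt

# As before: read and parse an srt file from disk.
subs = ox.srt.load('movie.srt')

# New: parse already-decoded srt text directly.
data = '1\n00:00:01,000 --> 00:00:02,500\nHello\n\n'
subs = ox.srt.loads(data)
# Each entry is a dict with 'in', 'out', 'value' and 'id', per the docstring.
```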
@@ -23,6 +23,8 @@ def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cac
     headers = headers.copy()
     # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
     headers['X-Forwarded-For'] = '72.21.206.80'
+    headers['Accept-Language'] = 'en'
+
     return url, data, headers, timeout, unicode

 def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
@@ -34,7 +36,7 @@ def delete_url(url, data=None, headers=cache.DEFAULT_HEADERS):
     cache.store.delete(url, data, headers)

 def get_url(id):
-    return "http://www.imdb.com/title/tt%s/" % id
+    return "http://akas.imdb.com/title/tt%s/" % id


 def reference_section(id):
@@ -124,8 +126,8 @@ class Imdb(SiteParser):
         'alternativeTitles': {
             'page': 'releaseinfo',
             're': [
-                '<table[^>]*?id="akas"[^>]*?>(.*?)</table>',
-                "td>(.*?)</td>.*?<td>(.*?)</td>"
+                '<h4[^>]*?id="akas"[^>]*?>(.*?)</table>',
+                "td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>"
             ],
             'type': 'list'
         },
@@ -199,6 +201,11 @@ class Imdb(SiteParser):
         'summary': zebra_table('Plot Summary', more=[
             '<p>(.*?)<em'
         ]),
+        'storyline': {
+            'page': '',
+            're': '<h2>Storyline</h2>.*?<p>(.*?)</p>',
+            'type': 'string'
+        },
         'posterId': {
             'page': 'reference',
             're': '<img.*?class="titlereference-primary-image".*?src="(.*?)".*?>',
@@ -267,7 +274,7 @@ class Imdb(SiteParser):
         },
         'series': {
             'page': 'reference',
-            're': '<h4 itemprop="name">.*?<a href="/title/tt(\d{7})',
+            're': '<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
             'type': 'string'
         },
         'isSeries': {
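This hunk and several below relax `tt(\d{7})` to `tt(\d+)`: IMDb title IDs are no longer always seven digits, so a fixed-width pattern truncates or misses the longer ones. A quick illustration — the sample HTML and IDs are hypothetical:

```python
import re

html = '<a href="/title/tt0133093/">old</a> <a href="/title/tt10872600/">newer, 8-digit id</a>'
print(re.findall(r'tt(\d{7})', html))  # ['0133093', '1087260'] - second id silently truncated
print(re.findall(r'tt(\d+)', html))    # ['0133093', '10872600']
```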
@@ -372,6 +379,9 @@ class Imdb(SiteParser):

         if 'alternativeTitles' in self:
             alt = {}
+            for t in self['alternativeTitles']:
+                if t[0].strip() in ('World-wide (English title)', ):
+                    self['title'] = cleanup_title(t[1])
             for t in self['alternativeTitles']:
                 title = cleanup_title(t[1])
                 if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
@@ -422,7 +432,7 @@ class Imdb(SiteParser):
         for rel, data, _ in self['connections']:
             if isinstance(rel, bytes):
                 rel = rel.decode('utf-8')
-            #cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
+            #cc[rel] = re.compile('<a href="/title/tt(\d+)/">(.*?)</a>').findall(data)
             def get_conn(c):
                 r = {
                     'id': c[0],
@@ -432,7 +442,7 @@ class Imdb(SiteParser):
                 if len(description) == 2 and description[-1].strip() != '-':
                     r['description'] = description[-1].strip()
                 return r
-            cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
+            cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d+)/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))

         self['connections'] = cc

@@ -517,10 +527,13 @@ class Imdb(SiteParser):
         ])
         if self['releasedate'] == 'x':
             del self['releasedate']
+
+        if 'summary' not in self and 'storyline' in self:
+            self['summary'] = self.pop('storyline')
         if 'summary' in self:
             if isinstance(self['summary'], list):
                 self['summary'] = self['summary'][0]
-            self['summary'] = self['summary'].split('</p')[0].strip()
+            self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip()

         if 'credits' in self:
             credits = [
@@ -618,7 +631,7 @@ def get_movie_by_title(title, timeout=-1):
     url = "http://akas.imdb.com/find?" + params
     data = read_url(url, timeout=timeout, unicode=True)
     #if search results in redirect, get id of current page
-    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
+    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
     results = re.compile(r).findall(data)
     if results:
         return results[0]
@@ -697,12 +710,12 @@ def get_movie_id(title, director='', year='', timeout=-1):

     data = read_url(url, timeout=timeout, unicode=True)
     #if search results in redirect, get id of current page
-    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
+    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
     results = re.compile(r).findall(data)
     if results:
         return results[0]
     #otherwise get first result
-    r = '<td valign="top">.*?<a href="/title/tt(\d{7})/"'
+    r = '<td valign="top">.*?<a href="/title/tt(\d+)/"'
     results = re.compile(r).findall(data)
     if results:
         return results[0]
@@ -713,7 +726,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
     results = duckduckgo.find(google_query, timeout=timeout)
     if results:
         for r in results[:2]:
-            imdbId = find_re(r[1], 'title/tt(\d{7})')
+            imdbId = find_re(r[1], 'title/tt(\d+)')
             if imdbId:
                 return imdbId
     #or nothing
@@ -740,7 +753,7 @@ def get_episodes(imdbId, season=None):
     if season:
         url += '?season=%d' % season
         data = cache.read_url(url).decode()
-        for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
+        for e in re.compile('<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
            episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
     else:
         data = cache.read_url(url)
@@ -8,7 +8,7 @@ from ox.net import read_url
 def get_poster_url(id):
     url = 'http://piratecinema.org/posters/'
     html = read_url(url).decode('utf-8')
-    results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
+    results = re.compile('src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
     for result in results:
         if result[1] == id:
             return url + result[0]
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 import re
+from multiprocessing.pool import ThreadPool

 from six import string_types

@@ -28,6 +29,7 @@ def cleanup(key, data, data_type):
 class SiteParser(dict):
     baseUrl = ''
     regex = {}
+    pool = ThreadPool(8)

     def get_url(self, page):
         return "%s%s" % (self.baseUrl, page)
@@ -39,6 +41,9 @@ class SiteParser(dict):

     def __init__(self, timeout=-1):
         self._cache = {}
+        urls = list(set(self.get_url(self.regex[key]['page']) for key in self.regex))
+        self.pool.map(self.get_url, urls)
+
         for key in self.regex:
             url = self.get_url(self.regex[key]['page'])
             data = self.read_url(url, timeout)
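For reference, `multiprocessing.pool.ThreadPool.map` runs a function over an iterable on worker threads and blocks until every item is done — the usual way to overlap blocking I/O such as page fetches before a sequential pass like the `for key in self.regex:` loop above. A minimal sketch of the pattern; the `fetch` function and URLs are illustrative, not the parser's actual fetch path:

```python
from multiprocessing.pool import ThreadPool

def fetch(url):
    # Stand-in for a blocking page fetch; with 8 workers, up to 8
    # of these run concurrently.
    print('fetching', url)
    return url

pool = ThreadPool(8)  # same worker count as the diff
urls = ['http://example.com/page/%d' % i for i in range(4)]
pool.map(fetch, urls)  # blocks until every url has been processed
```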
@@ -1,2 +1,3 @@
 chardet
 six>=1.5.2
+lxml