Compare commits

...

19 commits

Author  SHA1        Message                               Date
j       926b8ad255  fight geolocalization                 2020-02-18 16:59:08 +01:00
j       09e0a521af  add Accept-Language, use akas         2020-02-18 16:45:17 +01:00
j       da51407e7d  fix alternative titles                2020-02-05 16:51:28 +01:00
j       3574be2975  don't fall back to ffmpeg2theora      2019-12-21 20:29:44 +02:00
j       75cfe74877  srt fixes                             2019-12-21 20:18:19 +02:00
j       03c1191550  fall back to storyline for summary    2019-11-15 14:51:32 +01:00
j       cef85fc4de  depend on lxml                        2019-11-15 14:51:13 +01:00
j       665a4038b2  space                                 2019-08-08 17:08:13 +02:00
j       388f33ebb6  cache imdb urls in parallel           2019-08-03 23:38:31 +02:00
j       cc1bad76cd  update user agent                     2019-08-03 23:35:16 +02:00
j       d845030557  8 digit imdb ids                      2019-08-02 16:26:22 +02:00
j       e78519998d  use requests session                  2019-08-02 14:23:07 +02:00
j       23a641189c  fix subtitle language                 2019-08-01 20:54:04 +02:00
j       b49acd47dc  load subtitle info                    2019-08-01 16:28:00 +02:00
j       d632cd3803  match as many digits as possible      2019-07-23 16:42:20 +02:00
j       9c90aaa5f8  imdb can also be 8 digits             2019-07-23 16:24:06 +02:00
j       fb8b33d916  fix variable name                     2019-07-23 16:09:07 +02:00
j       8c14d28aa2  remove rec again, not a real format   2019-07-22 17:11:46 +02:00
j       b7979779fe  .rec files                            2019-07-22 10:38:14 +02:00
10 changed files with 72 additions and 67 deletions

View file

@@ -201,7 +201,7 @@ class API(object):
                     return False
                 if data['status']['code'] != 200:
                     print("request returned error, will try again in 5 seconds")
-                    if DEBUG:
+                    if self.DEBUG:
                         print(data)
                     time.sleep(5)
             if data and data.get('result') == 1:

View file

@@ -16,6 +16,7 @@ from six import PY2
 try:
     import requests
     USE_REQUESTS = True
+    requests_session = requests.Session()
 except:
     USE_REQUESTS = False
@@ -101,7 +102,7 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un
     url_headers = {}
     if not result:
         if USE_REQUESTS:
-            r = requests.get(url, headers=headers)
+            r = requests_session.get(url, headers=headers)
             for key in r.headers:
                 url_headers[key.lower()] = r.headers[key]
             result = r.content
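
Reusing a single requests.Session across cache misses keeps one connection pool per host, so consecutive uncached fetches against the same site skip repeated TCP/TLS handshakes. A minimal sketch of the same pattern outside ox (the fetch() helper and timeout value are illustrative, not part of the library):

    import requests

    # Create the session once at module level and reuse it everywhere;
    # Session keeps per-host connection pools and persists cookies.
    session = requests.Session()

    def fetch(url, headers=None):
        # Reuses an already-open connection when the host was seen before.
        r = session.get(url, headers=headers or {}, timeout=30)
        r.raise_for_status()
        return r.content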

View file

@@ -159,51 +159,10 @@ def avinfo(filename, cached=True):
     if os.path.getsize(filename):
         if find_executable('ffprobe'):
             return ffprobe(filename)
-        ffmpeg2theora = cmd('ffmpeg2theora')
-        p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        stdout, error = p.communicate()
-        stdout = stdout.decode('utf-8')
-        version = stdout.split('\n')[0].split(' - ')[0].split(' ')[-1]
-        if version < '0.27':
-            raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version)
-        p = subprocess.Popen([ffmpeg2theora, '--info', filename],
-                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        stdout, error = p.communicate()
-        stdout = stdout.decode('utf-8')
-        try:
-            info = json.loads(stdout)
-        except:
-            # remove metadata, can be broken
-            reg = re.compile('"metadata": {.*?},', re.DOTALL)
-            stdout = re.sub(reg, '', stdout)
-            info = json.loads(stdout)
-        if 'video' in info:
-            for v in info['video']:
-                if 'display_aspect_ratio' not in v and 'width' in v:
-                    v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height'])
-                    v['pixel_aspect_ratio'] = '1:1'
-        if len(info.get('audio', [])) > 1:
-            if 'metadata' in info['audio'][0]:
-                for stream in info['audio']:
-                    language = stream.get('metadata', {}).get('language')
-                    if language and language != 'und':
-                        stream['language'] = language[0]
-            else:
-                ffmpeg = cmd('ffmpeg')
-                p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-                stdout, stderr = p.communicate()
-                stderr = stderr.decode('utf-8')
-                languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l]
-                if len(languages) == len(info['audio']):
-                    for i, stream in enumerate(info['audio']):
-                        language = languages[i]
-                        if language and language[0] != 'und':
-                            stream['language'] = language[0]
-        fix_coverart(info)
-        return info
+        raise EnvironmentError('could to find ffprobe. please install ffmpeg')
     return {'path': filename, 'size': 0}


 def ffprobe(filename):
     p = subprocess.Popen([
         cmd('ffprobe'),
@@ -293,6 +252,22 @@ def ffprobe(filename):
                     'sample_aspect_ratio': 'pixel_aspect_ratio',
                 }.get(key, key)] = fix_value(key, s[key])
             info[s['codec_type']].append(stream)
+        elif s.get('codec_type') == 'subtitle':
+            info['subtitles'] = info.get('subtitles', [])
+            stream = {}
+            if language and language != 'und':
+                stream['language'] = language
+            for key in (
+                'codec_name',
+                'language',
+                'width',
+                'height',
+            ):
+                if key in s:
+                    stream[{
+                        'codec_name': 'codec',
+                    }.get(key, key)] = s[key]
+            info['subtitles'].append(stream)
         else:
             pass
             # print s
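
ffprobe reports every stream under "streams" with a codec_type of "video", "audio" or "subtitle"; the new elif branch simply collects the subtitle entries alongside the existing audio/video handling. A rough standalone sketch of that idea, assuming ffprobe is on the PATH (subtitle_streams() is an illustrative helper, not the ox implementation):

    import json
    import subprocess

    def subtitle_streams(path):
        # Ask ffprobe for machine-readable stream information.
        out = subprocess.check_output([
            'ffprobe', '-v', 'error', '-show_streams',
            '-print_format', 'json', path
        ])
        subtitles = []
        for s in json.loads(out).get('streams', []):
            if s.get('codec_type') != 'subtitle':
                continue
            stream = {'codec': s.get('codec_name')}
            language = s.get('tags', {}).get('language')
            if language and language != 'und':
                stream['language'] = language
            subtitles.append(stream)
        return subtitles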

View file

@@ -21,7 +21,7 @@ from chardet.universaldetector import UniversalDetector
 DEBUG = False
 # Default headers for HTTP requests.
 DEFAULT_HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',

View file

@@ -63,10 +63,6 @@ def load(filename, offset=0):
     Returns list with dicts that have in, out, value and id
     '''
     srt = []
-
-    def parse_time(t):
-        return offset + ox.time2ms(t.replace(',', '.')) / 1000
-
     with open(filename, 'rb') as f:
         encoding = _detect_encoding(f)
         data = f.read()
@@ -77,7 +73,21 @@ def load(filename, offset=0):
             data = data.decode('latin-1')
         except:
             print("failed to detect encoding, giving up")
-            return srt
+            return []
+    return loads(data, offset)
+
+
+def loads(data, offset=0):
+    '''Parses an srt file
+    filename: path to an srt file
+    offset (float, seconds): shift all in/out points by offset
+
+    Returns list with dicts that have in, out, value and id
+    '''
+    srt = []
+
+    def parse_time(t):
+        return offset + ox.time2ms(t.replace(',', '.')) / 1000
+
     data = data.replace('\r\n', '\n')
     if not data.endswith('\n\n'):
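
Splitting the parser into loads() means subtitle data that never touches disk (for example, text fetched over HTTP) can be parsed directly, while load() keeps the file handling and encoding detection. A small usage sketch against the new function (the cues below are made up):

    from ox import srt

    # Two made-up cues in SRT syntax.
    data = '\n'.join([
        '1',
        '00:00:01,000 --> 00:00:03,500',
        'Hello world.',
        '',
        '2',
        '00:00:04,000 --> 00:00:06,000',
        'Second cue.',
        '',
    ])

    # Parse from a string, shifting every cue 10 seconds later.
    for cue in srt.loads(data, offset=10):
        print(cue['in'], cue['out'], cue['value'])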

View file

@@ -23,6 +23,8 @@ def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cac
     headers = headers.copy()
     # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
     headers['X-Forwarded-For'] = '72.21.206.80'
+    headers['Accept-Language'] = 'en'
     return url, data, headers, timeout, unicode

 def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
@@ -34,7 +36,7 @@ def delete_url(url, data=None, headers=cache.DEFAULT_HEADERS):
     cache.store.delete(url, data, headers)

 def get_url(id):
-    return "http://www.imdb.com/title/tt%s/" % id
+    return "http://akas.imdb.com/title/tt%s/" % id

 def reference_section(id):
@@ -124,8 +126,8 @@ class Imdb(SiteParser):
         'alternativeTitles': {
             'page': 'releaseinfo',
             're': [
-                '<table[^>]*?id="akas"[^>]*?>(.*?)</table>',
-                "td>(.*?)</td>.*?<td>(.*?)</td>"
+                '<h4[^>]*?id="akas"[^>]*?>(.*?)</table>',
+                "td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>"
             ],
             'type': 'list'
         },
@@ -199,6 +201,11 @@ class Imdb(SiteParser):
         'summary': zebra_table('Plot Summary', more=[
             '<p>(.*?)<em'
         ]),
+        'storyline': {
+            'page': '',
+            're': '<h2>Storyline</h2>.*?<p>(.*?)</p>',
+            'type': 'string'
+        },
         'posterId': {
             'page': 'reference',
             're': '<img.*?class="titlereference-primary-image".*?src="(.*?)".*?>',
@@ -267,7 +274,7 @@ class Imdb(SiteParser):
         },
         'series': {
             'page': 'reference',
-            're': '<h4 itemprop="name">.*?<a href="/title/tt(\d{7})',
+            're': '<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
             'type': 'string'
         },
         'isSeries': {
@@ -372,6 +379,9 @@ class Imdb(SiteParser):
         if 'alternativeTitles' in self:
             alt = {}
+            for t in self['alternativeTitles']:
+                if t[0].strip() in ('World-wide (English title)', ):
+                    self['title'] = cleanup_title(t[1])
             for t in self['alternativeTitles']:
                 title = cleanup_title(t[1])
                 if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
@@ -422,7 +432,7 @@ class Imdb(SiteParser):
             for rel, data, _ in self['connections']:
                 if isinstance(rel, bytes):
                     rel = rel.decode('utf-8')
-                #cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
+                #cc[rel] = re.compile('<a href="/title/tt(\d+)/">(.*?)</a>').findall(data)
                 def get_conn(c):
                     r = {
                         'id': c[0],
@@ -432,7 +442,7 @@ class Imdb(SiteParser):
                     if len(description) == 2 and description[-1].strip() != '-':
                         r['description'] = description[-1].strip()
                     return r
-                cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
+                cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d+)/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))

             self['connections'] = cc
@@ -517,10 +527,13 @@ class Imdb(SiteParser):
             ])
         if self['releasedate'] == 'x':
             del self['releasedate']
+        if 'summary' not in self and 'storyline' in self:
+            self['summary'] = self.pop('storyline')
+
         if 'summary' in self:
             if isinstance(self['summary'], list):
                 self['summary'] = self['summary'][0]
-            self['summary'] = self['summary'].split('</p')[0].strip()
+            self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip()

         if 'credits' in self:
             credits = [
@@ -618,7 +631,7 @@ def get_movie_by_title(title, timeout=-1):
     url = "http://akas.imdb.com/find?" + params
     data = read_url(url, timeout=timeout, unicode=True)
     #if search results in redirect, get id of current page
-    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
+    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
     results = re.compile(r).findall(data)
     if results:
         return results[0]
@@ -697,12 +710,12 @@ def get_movie_id(title, director='', year='', timeout=-1):
     data = read_url(url, timeout=timeout, unicode=True)
     #if search results in redirect, get id of current page
-    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
+    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
     results = re.compile(r).findall(data)
     if results:
         return results[0]
     #otherwise get first result
-    r = '<td valign="top">.*?<a href="/title/tt(\d{7})/"'
+    r = '<td valign="top">.*?<a href="/title/tt(\d+)/"'
     results = re.compile(r).findall(data)
     if results:
         return results[0]
@@ -713,7 +726,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
     results = duckduckgo.find(google_query, timeout=timeout)
     if results:
         for r in results[:2]:
-            imdbId = find_re(r[1], 'title/tt(\d{7})')
+            imdbId = find_re(r[1], 'title/tt(\d+)')
             if imdbId:
                 return imdbId
     #or nothing
@@ -740,7 +753,7 @@ def get_episodes(imdbId, season=None):
     if season:
         url += '?season=%d' % season
         data = cache.read_url(url).decode()
-        for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
+        for e in re.compile('<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
             episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
     else:
         data = cache.read_url(url)
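
Most of the regex churn in this file is the same one-character fix: IMDb identifiers have grown past tt9999999, so patterns hard-coded to seven digits (\d{7}) are loosened to \d+. A quick illustration of why the old pattern breaks on newer titles (the ID below is only an example):

    import re

    # Old ox patterns assumed exactly seven digits after "tt".
    OLD = re.compile(r'title/tt(\d{7})')
    NEW = re.compile(r'title/tt(\d+)')

    url = 'http://www.imdb.com/title/tt10627720/'
    print(OLD.findall(url))  # ['1062772'] - silently drops the eighth digit
    print(NEW.findall(url))  # ['10627720']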

View file

@@ -8,7 +8,7 @@ from ox.net import read_url
 def get_poster_url(id):
     url = 'http://piratecinema.org/posters/'
     html = read_url(url).decode('utf-8')
-    results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
+    results = re.compile('src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
     for result in results:
         if result[1] == id:
             return url + result[0]

View file

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 import re
+from multiprocessing.pool import ThreadPool

 from six import string_types
@@ -28,6 +29,7 @@ def cleanup(key, data, data_type):
 class SiteParser(dict):
     baseUrl = ''
     regex = {}
+    pool = ThreadPool(8)

     def get_url(self, page):
         return "%s%s" % (self.baseUrl, page)
@@ -39,6 +41,9 @@ class SiteParser(dict):
     def __init__(self, timeout=-1):
         self._cache = {}
+        urls = list(set(self.get_url(self.regex[key]['page']) for key in self.regex))
+        self.pool.map(self.get_url, urls)
+
         for key in self.regex:
             url = self.get_url(self.regex[key]['page'])
             data = self.read_url(url, timeout)
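
The ThreadPool warms the URL cache concurrently before the sequential regex loop runs, so each page is downloaded at most once and the network round-trips overlap. A rough sketch of the pattern, assuming the fetch function caches its results the way ox.cache.read_url does (fetch_all() and the worker count are illustrative, not library API):

    from multiprocessing.pool import ThreadPool

    def fetch_all(read_url, urls, workers=8):
        # Download (and thereby cache) every page concurrently; later
        # sequential reads of the same URLs are then served from the cache.
        pool = ThreadPool(workers)
        try:
            pool.map(read_url, urls)
        finally:
            pool.close()
            pool.join()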

View file

@@ -1,2 +1,3 @@
 chardet
 six>=1.5.2
+lxml