Compare commits
No commits in common. "ae10c5c9b9025c63fed049f4b1edd4a4c987fd2c" and "e00d23e35ca89e29556c73f524c7f4321681b8ac" have entirely different histories.
ae10c5c9b9
...
e00d23e35c
16 changed files with 179 additions and 199 deletions
12
ox/api.py
12
ox/api.py
|
@ -126,7 +126,7 @@ class API(object):
|
||||||
tmpname = filename + '.tmp'
|
tmpname = filename + '.tmp'
|
||||||
with open(tmpname, 'wb') as fd:
|
with open(tmpname, 'wb') as fd:
|
||||||
r = self._requests_session.get(url)
|
r = self._requests_session.get(url)
|
||||||
for chunk in r.iter_content(chunk_size=chunk_size):
|
for chunk in iter(lambda: r.read(chunk_size), b''):
|
||||||
fd.write(chunk)
|
fd.write(chunk)
|
||||||
shutil.move(tmpname, filename)
|
shutil.move(tmpname, filename)
|
||||||
|
|
||||||
|
@ -167,22 +167,22 @@ class API(object):
|
||||||
try:
|
try:
|
||||||
data = self._json_request(uploadUrl, meta, files=files)
|
data = self._json_request(uploadUrl, meta, files=files)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
if not silent:
|
if not slient:
|
||||||
print("\ninterrupted by user.")
|
print("\ninterrupted by user.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
except:
|
except:
|
||||||
if not silent:
|
if not slient:
|
||||||
print("uploading chunk failed, will try again in 5 seconds\r", end='')
|
print("uploading chunk failed, will try again in 5 seconds\r", end='')
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
data = {'result': -1}
|
data = {'result': -1}
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
if data and 'status' in data:
|
if data and 'status' in data:
|
||||||
if data['status']['code'] == 403:
|
if data['status']['code'] == 403:
|
||||||
if not silent:
|
if not slient:
|
||||||
print("login required")
|
print("login required")
|
||||||
return False
|
return False
|
||||||
if data['status']['code'] != 200:
|
if data['status']['code'] != 200:
|
||||||
if not silent:
|
if not slient:
|
||||||
print("request returned error, will try again in 5 seconds")
|
print("request returned error, will try again in 5 seconds")
|
||||||
if self.DEBUG:
|
if self.DEBUG:
|
||||||
print(data)
|
print(data)
|
||||||
|
@ -190,7 +190,7 @@ class API(object):
|
||||||
if data and data.get('result') == 1:
|
if data and data.get('result') == 1:
|
||||||
done += len(chunk)
|
done += len(chunk)
|
||||||
if data.get('offset') not in (None, done):
|
if data.get('offset') not in (None, done):
|
||||||
if not silent:
|
if not slient:
|
||||||
print('server offset out of sync, continue from', data['offset'])
|
print('server offset out of sync, continue from', data['offset'])
|
||||||
done = data['offset']
|
done = data['offset']
|
||||||
f.seek(done)
|
f.seek(done)
|
||||||
|
|
25
ox/file.py
25
ox/file.py
|
@ -2,6 +2,7 @@
|
||||||
# vi:si:et:sw=4:sts=4:ts=4
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
# GPL 2008
|
# GPL 2008
|
||||||
from __future__ import division, print_function
|
from __future__ import division, print_function
|
||||||
|
from distutils.spawn import find_executable
|
||||||
from glob import glob
|
from glob import glob
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
import os
|
||||||
|
@ -38,24 +39,6 @@ EXTENSIONS = {
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
def is_exe(fpath):
|
|
||||||
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
|
|
||||||
|
|
||||||
def which(program):
|
|
||||||
local = os.path.expanduser('~/.ox/bin/%s' % program)
|
|
||||||
if os.path.exists(local):
|
|
||||||
return local
|
|
||||||
fpath, fname = os.path.split(program)
|
|
||||||
if fpath:
|
|
||||||
if is_exe(program):
|
|
||||||
return program
|
|
||||||
else:
|
|
||||||
for path in os.environ.get("PATH", "").split(os.pathsep):
|
|
||||||
exe_file = os.path.join(path, program)
|
|
||||||
if is_exe(exe_file):
|
|
||||||
return exe_file
|
|
||||||
return None
|
|
||||||
|
|
||||||
def cmd(program):
|
def cmd(program):
|
||||||
local = os.path.expanduser('~/.ox/bin/%s' % program)
|
local = os.path.expanduser('~/.ox/bin/%s' % program)
|
||||||
if os.path.exists(local):
|
if os.path.exists(local):
|
||||||
|
@ -177,11 +160,12 @@ def avinfo(filename, cached=True):
|
||||||
if cached:
|
if cached:
|
||||||
return cache(filename, 'info')
|
return cache(filename, 'info')
|
||||||
if os.path.getsize(filename):
|
if os.path.getsize(filename):
|
||||||
if which('ffprobe'):
|
if find_executable('ffprobe'):
|
||||||
return ffprobe(filename)
|
return ffprobe(filename)
|
||||||
raise EnvironmentError('could to find ffprobe. please install ffmpeg')
|
raise EnvironmentError('could to find ffprobe. please install ffmpeg')
|
||||||
return {'path': filename, 'size': 0}
|
return {'path': filename, 'size': 0}
|
||||||
|
|
||||||
|
|
||||||
def ffprobe(filename):
|
def ffprobe(filename):
|
||||||
p = subprocess.Popen([
|
p = subprocess.Popen([
|
||||||
cmd('ffprobe'),
|
cmd('ffprobe'),
|
||||||
|
@ -274,9 +258,6 @@ def ffprobe(filename):
|
||||||
'pix_fmt': 'pixel_format',
|
'pix_fmt': 'pixel_format',
|
||||||
'sample_aspect_ratio': 'pixel_aspect_ratio',
|
'sample_aspect_ratio': 'pixel_aspect_ratio',
|
||||||
}.get(key, key)] = fix_value(key, s[key])
|
}.get(key, key)] = fix_value(key, s[key])
|
||||||
if 'avg_frame_rate' in s and stream.get('framerate') == "90000:1":
|
|
||||||
stream['framerate'] = fix_value('r_frame_rate', s['avg_frame_rate'])
|
|
||||||
stream['force_framerate'] = True
|
|
||||||
info[s['codec_type']].append(stream)
|
info[s['codec_type']].append(stream)
|
||||||
elif s.get('codec_type') == 'subtitle':
|
elif s.get('codec_type') == 'subtitle':
|
||||||
info['subtitles'] = info.get('subtitles', [])
|
info['subtitles'] = info.get('subtitles', [])
|
||||||
|
|
|
@ -236,7 +236,7 @@ def int_value(strValue, default=''):
|
||||||
''
|
''
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
val = re.compile(r'(\d+)').findall(str(strValue).strip())[0]
|
val = re.compile('(\d+)').findall(str(strValue).strip())[0]
|
||||||
except:
|
except:
|
||||||
val = default
|
val = default
|
||||||
return val
|
return val
|
||||||
|
@ -253,7 +253,7 @@ def float_value(strValue, default=''):
|
||||||
''
|
''
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
val = re.compile(r'([\d.]+)').findall(str(strValue).strip())[0]
|
val = re.compile('([\d.]+)').findall(str(strValue).strip())[0]
|
||||||
except:
|
except:
|
||||||
val = default
|
val = default
|
||||||
return val
|
return val
|
||||||
|
|
24
ox/html.py
24
ox/html.py
|
@ -16,7 +16,7 @@ TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '>', "'", '"']
|
||||||
DOTS = ['·', '*', '\xe2\x80\xa2', '•', '•', '•']
|
DOTS = ['·', '*', '\xe2\x80\xa2', '•', '•', '•']
|
||||||
|
|
||||||
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
|
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
|
||||||
word_split_re = re.compile(r'(\s+|<br>)')
|
word_split_re = re.compile(r'(\s+)')
|
||||||
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % (
|
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % (
|
||||||
'|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
|
'|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
|
||||||
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
|
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
|
||||||
|
@ -178,10 +178,10 @@ def highlight(text, query, hlClass="hl"):
|
||||||
"""
|
"""
|
||||||
if query:
|
if query:
|
||||||
text = text.replace('<br />', '|')
|
text = text.replace('<br />', '|')
|
||||||
query = re.escape(query).replace(r'\ ', '.')
|
query = re.escape(query).replace('\ ', '.')
|
||||||
m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
|
m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
|
||||||
for i in m:
|
for i in m:
|
||||||
text = re.sub(r"(%s)" % re.escape(i).replace(r'\ ', '.'), r'<span class="%s">\\1</span>' % hlClass, text)
|
text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
|
||||||
text = text.replace('|', '<br />')
|
text = text.replace('|', '<br />')
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
@ -234,7 +234,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
||||||
' '
|
' '
|
||||||
'''
|
'''
|
||||||
if not tags:
|
if not tags:
|
||||||
valid_url = r'^((https?:\/\/|\/|mailto:).*?)'
|
valid_url = '^((https?:\/\/|\/|mailto:).*?)'
|
||||||
tags = [
|
tags = [
|
||||||
# inline formatting
|
# inline formatting
|
||||||
{'name': 'b'},
|
{'name': 'b'},
|
||||||
|
@ -300,8 +300,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
||||||
'optional': ['width', 'height'],
|
'optional': ['width', 'height'],
|
||||||
'required': ['src'],
|
'required': ['src'],
|
||||||
'validation': {
|
'validation': {
|
||||||
'width': r'^\d+$',
|
'width': '^\d+$',
|
||||||
'height': r'^\d+$',
|
'height': '^\d+$',
|
||||||
'src': valid_url
|
'src': valid_url
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -310,8 +310,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
||||||
'optional': ['width', 'height'],
|
'optional': ['width', 'height'],
|
||||||
'required': ['src'],
|
'required': ['src'],
|
||||||
'validation': {
|
'validation': {
|
||||||
'width': r'^\d+$',
|
'width': '^\d+$',
|
||||||
'height': r'^\d+$',
|
'height': '^\d+$',
|
||||||
'src': valid_url
|
'src': valid_url
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -319,8 +319,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
||||||
{'name': 'figcaption'}
|
{'name': 'figcaption'}
|
||||||
]
|
]
|
||||||
|
|
||||||
tag_re = re.compile(r'<(/)?([^\ /]+)(.*?)(/)?>')
|
tag_re = re.compile('<(/)?([^\ /]+)(.*?)(/)?>')
|
||||||
attr_re = re.compile(r'([^=\ ]+)="([^"]+)"')
|
attr_re = re.compile('([^=\ ]+)="([^"]+)"')
|
||||||
|
|
||||||
escaped = {}
|
escaped = {}
|
||||||
level = 0
|
level = 0
|
||||||
|
@ -338,7 +338,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
||||||
|
|
||||||
if '[]' in validation:
|
if '[]' in validation:
|
||||||
html = re.sub(
|
html = re.sub(
|
||||||
re.compile(r'\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
|
re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
|
||||||
'<a href="\\1">\\3</a>', html)
|
'<a href="\\1">\\3</a>', html)
|
||||||
|
|
||||||
parts = split_tags(html)
|
parts = split_tags(html)
|
||||||
|
@ -392,8 +392,8 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
||||||
else:
|
else:
|
||||||
parts[i] = escape_html(decode_html(part))
|
parts[i] = escape_html(decode_html(part))
|
||||||
html = ''.join(parts)
|
html = ''.join(parts)
|
||||||
|
html = html.replace('\n\n', '<br/><br/>')
|
||||||
html = add_links(html)
|
html = add_links(html)
|
||||||
html = html.replace('\n\n', '<br><br>')
|
|
||||||
return sanitize_fragment(html)
|
return sanitize_fragment(html)
|
||||||
|
|
||||||
def split_tags(string):
|
def split_tags(string):
|
||||||
|
|
30
ox/movie.py
30
ox/movie.py
|
@ -25,7 +25,7 @@ The Title[ ([SXX][EYY[+ZZ|-ZZ]])[ Episode Title]][.Version][.Part XY[.Part Title
|
||||||
|
|
||||||
def format_path(data, directory_key='director'):
|
def format_path(data, directory_key='director'):
|
||||||
def format_underscores(string):
|
def format_underscores(string):
|
||||||
return re.sub(r'^\.|\.$|:|/|\?|<|>', '_', string)
|
return re.sub('^\.|\.$|:|/|\?|<|>', '_', string)
|
||||||
director = data['directorSort'] or ['Unknown Director']
|
director = data['directorSort'] or ['Unknown Director']
|
||||||
title = data['seriesTitle' if data['isEpisode'] else 'title'] or 'Untitled'
|
title = data['seriesTitle' if data['isEpisode'] else 'title'] or 'Untitled'
|
||||||
year = data['seriesYear' if data['isEpisode'] else 'year'] or None
|
year = data['seriesYear' if data['isEpisode'] else 'year'] or None
|
||||||
|
@ -199,14 +199,14 @@ def parse_path(path, directory_key='director'):
|
||||||
string = re.sub('^_', '.', string)
|
string = re.sub('^_', '.', string)
|
||||||
string = re.sub('_$', '.', string)
|
string = re.sub('_$', '.', string)
|
||||||
# '_.foo$' or '_ (' is '?'
|
# '_.foo$' or '_ (' is '?'
|
||||||
string = re.sub(re.compile(r'_(?=(\.\w+$| \())', re.U), '?', string)
|
string = re.sub(re.compile('_(?=(\.\w+$| \())', re.U), '?', string)
|
||||||
# ' _..._ ' is '<...>'
|
# ' _..._ ' is '<...>'
|
||||||
string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string)
|
string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string)
|
||||||
# 'foo_bar' or 'foo _ bar' is '/'
|
# 'foo_bar' or 'foo _ bar' is '/'
|
||||||
string = re.sub(re.compile(r'(?<=\w)_(?=\w)', re.U), '/', string)
|
string = re.sub(re.compile('(?<=\w)_(?=\w)', re.U), '/', string)
|
||||||
string = re.sub(' _ ', ' / ', string)
|
string = re.sub(' _ ', ' / ', string)
|
||||||
# 'foo_ ' is ':'
|
# 'foo_ ' is ':'
|
||||||
string = re.sub(re.compile(r'(?<=[\w\)\]])_ ', re.U), ': ', string)
|
string = re.sub(re.compile('(?<=[\w\)\]])_ ', re.U), ': ', string)
|
||||||
string = unicodedata.normalize('NFD', string)
|
string = unicodedata.normalize('NFD', string)
|
||||||
return string
|
return string
|
||||||
|
|
||||||
|
@ -238,14 +238,14 @@ def parse_path(path, directory_key='director'):
|
||||||
# title, year
|
# title, year
|
||||||
data['title'] = data['year'] = None
|
data['title'] = data['year'] = None
|
||||||
if title:
|
if title:
|
||||||
match = re.search(r' \(\d{4}(-(\d{4})?)?\)$', title)
|
match = re.search(' \(\d{4}(-(\d{4})?)?\)$', title)
|
||||||
data['title'] = title[:-len(match.group(0))] if match else title
|
data['title'] = title[:-len(match.group(0))] if match else title
|
||||||
data['year'] = match.group(0)[2:-1] if match else None
|
data['year'] = match.group(0)[2:-1] if match else None
|
||||||
file_title = re.sub('[/:]', '_', data['title'])
|
file_title = re.sub('[/:]', '_', data['title'])
|
||||||
# (remove title from beginning of filename if the rest contains a dot)
|
# (remove title from beginning of filename if the rest contains a dot)
|
||||||
file = re.sub('^' + re.escape(file_title) + '(?=.*\.)', '', file)
|
file = re.sub('^' + re.escape(file_title) + '(?=.*\.)', '', file)
|
||||||
# (split by nospace+dot+word, but remove spaces preceding extension)
|
# (split by nospace+dot+word, but remove spaces preceding extension)
|
||||||
parts = re.split(r'(?<!\s)\.(?=\w)', re.sub(r'\s+(?=.\w+$)', '', file))
|
parts = re.split('(?<!\s)\.(?=\w)', re.sub('\s+(?=.\w+$)', '', file))
|
||||||
title, parts, extension = [
|
title, parts, extension = [
|
||||||
parts[0],
|
parts[0],
|
||||||
parts[1:-1],
|
parts[1:-1],
|
||||||
|
@ -256,7 +256,7 @@ def parse_path(path, directory_key='director'):
|
||||||
# season, episode, episodes, episodeTitle
|
# season, episode, episodes, episodeTitle
|
||||||
data['season'] = data['episode'] = data['episodeTitle'] = None
|
data['season'] = data['episode'] = data['episodeTitle'] = None
|
||||||
data['episodes'] = []
|
data['episodes'] = []
|
||||||
match = re.search(r' \((S\d{2})?(E\d{2}([+-]\d{2})?)?\)(.+)?', title)
|
match = re.search(' \((S\d{2})?(E\d{2}([+-]\d{2})?)?\)(.+)?', title)
|
||||||
if match:
|
if match:
|
||||||
if match.group(1):
|
if match.group(1):
|
||||||
data['season'] = int(match.group(1)[1:])
|
data['season'] = int(match.group(1)[1:])
|
||||||
|
@ -267,7 +267,7 @@ def parse_path(path, directory_key='director'):
|
||||||
data['episodes'] = range(int(match.group(2)[1:3]), int(match.group(2)[-2:]) + 1)
|
data['episodes'] = range(int(match.group(2)[1:3]), int(match.group(2)[-2:]) + 1)
|
||||||
if match.group(4):
|
if match.group(4):
|
||||||
data['episodeTitle'] = match.group(4)[1:]
|
data['episodeTitle'] = match.group(4)[1:]
|
||||||
while data['episodeTitle'] and len(parts) and re.search(r'^\w+\.*$', parts[0]) and not re.search(r'^[a-z]{2}$', parts[0]):
|
while data['episodeTitle'] and len(parts) and re.search('^\w+\.*$', parts[0]) and not re.search('^[a-z]{2}$', parts[0]):
|
||||||
data['episodeTitle'] += '.%s' % parts.pop(0)
|
data['episodeTitle'] += '.%s' % parts.pop(0)
|
||||||
# isEpisode, seriesTitle, seriesYear
|
# isEpisode, seriesTitle, seriesYear
|
||||||
data['isEpisode'] = False
|
data['isEpisode'] = False
|
||||||
|
@ -343,14 +343,14 @@ def parse_movie_path(path):
|
||||||
if title.startswith('_'):
|
if title.startswith('_'):
|
||||||
title = '.' + title[1:]
|
title = '.' + title[1:]
|
||||||
|
|
||||||
year = find_re(title, r'(\(\d{4}\))')
|
year = find_re(title, '(\(\d{4}\))')
|
||||||
if not year:
|
if not year:
|
||||||
year = find_re(title, r'(\(\d{4}-\d*\))')
|
year = find_re(title, '(\(\d{4}-\d*\))')
|
||||||
if year and title.endswith(year):
|
if year and title.endswith(year):
|
||||||
title = title[:-len(year)].strip()
|
title = title[:-len(year)].strip()
|
||||||
year = year[1:-1]
|
year = year[1:-1]
|
||||||
if '-' in year:
|
if '-' in year:
|
||||||
year = find_re(year, r'\d{4}')
|
year = find_re(year, '\d{4}')
|
||||||
|
|
||||||
#director
|
#director
|
||||||
if len(parts) == 4:
|
if len(parts) == 4:
|
||||||
|
@ -373,7 +373,7 @@ def parse_movie_path(path):
|
||||||
language = ''
|
language = ''
|
||||||
|
|
||||||
#season/episode/episodeTitle
|
#season/episode/episodeTitle
|
||||||
match = re.compile(r'(.+?) \((S(\d+))?(E(\d+))?\)( (.+?))?\.').match(parts[-1])
|
match = re.compile('(.+?) \((S(\d+))?(E(\d+))?\)( (.+?))?\.').match(parts[-1])
|
||||||
if match:
|
if match:
|
||||||
seriesTitle = match.group(1)
|
seriesTitle = match.group(1)
|
||||||
season = match.group(3)
|
season = match.group(3)
|
||||||
|
@ -386,13 +386,13 @@ def parse_movie_path(path):
|
||||||
if episode and not season:
|
if episode and not season:
|
||||||
season = 1
|
season = 1
|
||||||
else:
|
else:
|
||||||
season = find_re(parts[-1], r'\.Season (\d+)\.')
|
season = find_re(parts[-1], '\.Season (\d+)\.')
|
||||||
if season:
|
if season:
|
||||||
season = int(season)
|
season = int(season)
|
||||||
else:
|
else:
|
||||||
season = None
|
season = None
|
||||||
|
|
||||||
episode = find_re(parts[-1], r'\.Episode[s]* ([\d+]+)\.')
|
episode = find_re(parts[-1], '\.Episode[s]* ([\d+]+)\.')
|
||||||
if episode:
|
if episode:
|
||||||
episode = episode.split('+')[0]
|
episode = episode.split('+')[0]
|
||||||
episode = int(episode)
|
episode = int(episode)
|
||||||
|
@ -422,7 +422,7 @@ def parse_movie_path(path):
|
||||||
title = u'%s %s' % (title, episodeTitle)
|
title = u'%s %s' % (title, episodeTitle)
|
||||||
|
|
||||||
#part
|
#part
|
||||||
part = find_re(parts[-1], r'\.Part (\d+)\.')
|
part = find_re(parts[-1], '\.Part (\d+)\.')
|
||||||
if part:
|
if part:
|
||||||
part = int(part)
|
part = int(part)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -18,7 +18,7 @@ from chardet.universaldetector import UniversalDetector
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
# Default headers for HTTP requests.
|
# Default headers for HTTP requests.
|
||||||
DEFAULT_HEADERS = {
|
DEFAULT_HEADERS = {
|
||||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0',
|
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
|
||||||
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
|
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||||
'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
|
'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
|
||||||
|
@ -113,15 +113,14 @@ get_url = read_url
|
||||||
def save_url(url, filename, overwrite=False):
|
def save_url(url, filename, overwrite=False):
|
||||||
if not os.path.exists(filename) or overwrite:
|
if not os.path.exists(filename) or overwrite:
|
||||||
dirname = os.path.dirname(filename)
|
dirname = os.path.dirname(filename)
|
||||||
os.makedirs(dirname, exist_ok=True)
|
if dirname and not os.path.exists(dirname):
|
||||||
|
os.makedirs(dirname)
|
||||||
headers = DEFAULT_HEADERS.copy()
|
headers = DEFAULT_HEADERS.copy()
|
||||||
r = requests.get(url, headers=headers, stream=True)
|
r = requests.get(url, headers=headers, stream=True)
|
||||||
filename_tmp = filename + '~'
|
with open(filename, 'wb') as f:
|
||||||
with open(filename_tmp, 'wb') as f:
|
|
||||||
for chunk in r.iter_content(chunk_size=1024):
|
for chunk in r.iter_content(chunk_size=1024):
|
||||||
if chunk: # filter out keep-alive new chunks
|
if chunk: # filter out keep-alive new chunks
|
||||||
f.write(chunk)
|
f.write(chunk)
|
||||||
os.rename(filename_tmp, filename)
|
|
||||||
|
|
||||||
def _get_size(url):
|
def _get_size(url):
|
||||||
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
|
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
|
||||||
|
|
|
@ -102,7 +102,7 @@ def normalize_imdbid(imdbId):
|
||||||
'0159206'
|
'0159206'
|
||||||
"""
|
"""
|
||||||
if isinstance(imdbId, str):
|
if isinstance(imdbId, str):
|
||||||
imdbId = re.sub(r'.*(\d{7}).*', '\\1', imdbId)
|
imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
|
||||||
elif isinstance(imdbId, int):
|
elif isinstance(imdbId, int):
|
||||||
imdbId = "%07d" % imdbId
|
imdbId = "%07d" % imdbId
|
||||||
return imdbId
|
return imdbId
|
||||||
|
|
162
ox/text.py
162
ox/text.py
|
@ -133,86 +133,86 @@ UA_NAMES = {
|
||||||
}
|
}
|
||||||
UA_REGEXPS = {
|
UA_REGEXPS = {
|
||||||
'browser': [
|
'browser': [
|
||||||
r'(Camino)\/(\d+)',
|
'(Camino)\/(\d+)',
|
||||||
r'(Chimera)\/(\d+)',
|
'(Chimera)\/(\d+)',
|
||||||
r'(chromeframe)\/(\d+)',
|
'(chromeframe)\/(\d+)',
|
||||||
r'(Edge)\/(\d+)',
|
'(Edge)\/(\d+)',
|
||||||
r'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
|
'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
|
||||||
r'(Chromium)\/(\d+)', # before Chrome
|
'(Chromium)\/(\d+)', # before Chrome
|
||||||
r'(Chrome)\/(\d+)',
|
'(Chrome)\/(\d+)',
|
||||||
r'(FBForIPhone)',
|
'(FBForIPhone)',
|
||||||
r'(Firefox)\/(\d+)',
|
'(Firefox)\/(\d+)',
|
||||||
r'(Galeon)\/(\d+)',
|
'(Galeon)\/(\d+)',
|
||||||
r'(IEMobile)\/(\d+)',
|
'(IEMobile)\/(\d+)',
|
||||||
r'(iCab) (\d+)',
|
'(iCab) (\d+)',
|
||||||
r'(iCab)\/(\d+)',
|
'(iCab)\/(\d+)',
|
||||||
r'(konqueror)\/(\d+)',
|
'(konqueror)\/(\d+)',
|
||||||
r'(Konqueror)\/(\d+)',
|
'(Konqueror)\/(\d+)',
|
||||||
r'(Lynx)\/(\d+)',
|
'(Lynx)\/(\d+)',
|
||||||
r'(Netscape)\d?\/(\d+)',
|
'(Netscape)\d?\/(\d+)',
|
||||||
r'(NokiaBrowser)\/(\d+)',
|
'(NokiaBrowser)\/(\d+)',
|
||||||
r'(OmniWeb)\/(\d+)',
|
'(OmniWeb)\/(\d+)',
|
||||||
r'(Opera)\/.+Version\/(\d+)',
|
'(Opera)\/.+Version\/(\d+)',
|
||||||
r'(OviBrowser)\/(\d+)',
|
'(OviBrowser)\/(\d+)',
|
||||||
r'Version\/(\d+).+(Safari)',
|
'Version\/(\d+).+(Safari)',
|
||||||
r'(WebKit)\/(\d+)',
|
'(WebKit)\/(\d+)',
|
||||||
r'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
|
'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
|
||||||
r'(Trident)\/.*?rv:(\d+)',
|
'(Trident)\/.*?rv:(\d+)',
|
||||||
r'(Gecko)',
|
'(Gecko)',
|
||||||
r'(Mozilla)\/(3|4)'
|
'(Mozilla)\/(3|4)'
|
||||||
],
|
],
|
||||||
'robot': [
|
'robot': [
|
||||||
r'(BingPreview)\/(\d+)',
|
'(BingPreview)\/(\d+)',
|
||||||
r'(Google Web Preview).+Chrome\/(\d+)',
|
'(Google Web Preview).+Chrome\/(\d+)',
|
||||||
r'(Googlebot)\/(\d+)',
|
'(Googlebot)\/(\d+)',
|
||||||
r'(WebCrawler)\/(\d+)',
|
'(WebCrawler)\/(\d+)',
|
||||||
r'(Yahoo! Slurp)\/(\d+)',
|
'(Yahoo! Slurp)\/(\d+)',
|
||||||
r'(YandexBot)\/([\d\.]+)',
|
'(YandexBot)\/([\d\.]+)',
|
||||||
r'(YandexMobileBot)\/([\d\.]+)',
|
'(YandexMobileBot)\/([\d\.]+)',
|
||||||
],
|
],
|
||||||
'system': [
|
'system': [
|
||||||
r'(Android) (\d+)',
|
'(Android) (\d+)',
|
||||||
r'(Android)',
|
'(Android)',
|
||||||
r'(BB)(\d+)',
|
'(BB)(\d+)',
|
||||||
r'(BeOS)',
|
'(BeOS)',
|
||||||
r'(BlackBerry) (\d+)',
|
'(BlackBerry) (\d+)',
|
||||||
r'(BlackBerry)',
|
'(BlackBerry)',
|
||||||
r'(Darwin)',
|
'(Darwin)',
|
||||||
r'(BSD) (FreeBSD|NetBSD|OpenBSD)',
|
'(BSD) (FreeBSD|NetBSD|OpenBSD)',
|
||||||
r'(CPU OS) (\d+)',
|
'(CPU OS) (\d+)',
|
||||||
r'(iPhone OS) (\d+)',
|
'(iPhone OS) (\d+)',
|
||||||
r'(iPhone)', # Opera
|
'(iPhone)', # Opera
|
||||||
r'(J2ME\/MIDP)',
|
'(J2ME\/MIDP)',
|
||||||
r'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
|
'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
|
||||||
r'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
|
'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
|
||||||
r'(Linux)',
|
'(Linux)',
|
||||||
r'(Mac OS X) (10.\d+)',
|
'(Mac OS X) (10.\d+)',
|
||||||
r'(Mac OS X)',
|
'(Mac OS X)',
|
||||||
r'(Mac_PowerPC)',
|
'(Mac_PowerPC)',
|
||||||
r'(Mac_PPC)',
|
'(Mac_PPC)',
|
||||||
r'(Macintosh)',
|
'(Macintosh)',
|
||||||
r'Nintendo (Wii).+NX\/(\d+)',
|
'Nintendo (Wii).+NX\/(\d+)',
|
||||||
r'(PLAYSTATION) (\d+)',
|
'(PLAYSTATION) (\d+)',
|
||||||
r'(PlayStation) Vita (\d+)',
|
'(PlayStation) Vita (\d+)',
|
||||||
r'(RIM Tablet OS) (\d+)',
|
'(RIM Tablet OS) (\d+)',
|
||||||
r'(S)(60);',
|
'(S)(60);',
|
||||||
r'(Series) ?(40|60)',
|
'(Series) ?(40|60)',
|
||||||
r'(Symbian OS)',
|
'(Symbian OS)',
|
||||||
r'(SymbianOS)\/(\d+)',
|
'(SymbianOS)\/(\d+)',
|
||||||
r'(SymbOS)',
|
'(SymbOS)',
|
||||||
r'(OS\/2)',
|
'(OS\/2)',
|
||||||
r'(Unix) (AIX|HP-UX|IRIX|SunOS)',
|
'(Unix) (AIX|HP-UX|IRIX|SunOS)',
|
||||||
r'(Unix)',
|
'(Unix)',
|
||||||
r'(Windows) (NT \d\.\d)',
|
'(Windows) (NT \d\.\d)',
|
||||||
r'(Windows Phone) (\d+)',
|
'(Windows Phone) (\d+)',
|
||||||
r'(Windows Phone OS) (\d+)',
|
'(Windows Phone OS) (\d+)',
|
||||||
r'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
|
'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
|
||||||
r'(Win) (9x 4\.90)', # Firefox
|
'(Win) (9x 4\.90)', # Firefox
|
||||||
r'(Win)(16)', # Firefox
|
'(Win)(16)', # Firefox
|
||||||
r'(Win)(9\d)', # Firefox
|
'(Win)(9\d)', # Firefox
|
||||||
r'(Win)(NT)', # Firefox
|
'(Win)(NT)', # Firefox
|
||||||
r'(Win)(NT4\.0)', # Firefox
|
'(Win)(NT4\.0)', # Firefox
|
||||||
r'(X11)'
|
'(X11)'
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
UA_VERSIONS = {
|
UA_VERSIONS = {
|
||||||
|
@ -332,9 +332,9 @@ def get_sort_name(name):
|
||||||
|
|
||||||
first_names = name.split(' ')
|
first_names = name.split(' ')
|
||||||
last_names = []
|
last_names = []
|
||||||
if re.search(r'^[0-9]+$', first_names[-1]):
|
if re.search('^[0-9]+$', first_names[-1]):
|
||||||
add_name()
|
add_name()
|
||||||
if re.search(r'[(\[].+?[)\]]$', first_names[-1]):
|
if re.search('[(\[].+?[)\]]$', first_names[-1]):
|
||||||
add_name()
|
add_name()
|
||||||
if find_name(SUFFIXES):
|
if find_name(SUFFIXES):
|
||||||
add_name()
|
add_name()
|
||||||
|
@ -425,7 +425,7 @@ def parse_useragent(useragent):
|
||||||
matches = list(match.groups())
|
matches = list(match.groups())
|
||||||
if len(matches) == 1:
|
if len(matches) == 1:
|
||||||
matches.append('')
|
matches.append('')
|
||||||
swap = re.match(r'^\d', matches[0]) or matches[1] == 'Linux'
|
swap = re.match('^\d', matches[0]) or matches[1] == 'Linux'
|
||||||
name = matches[1 if swap else 0]
|
name = matches[1 if swap else 0]
|
||||||
version = matches[0 if swap else 1].replace('_', '.')
|
version = matches[0 if swap else 1].replace('_', '.')
|
||||||
name = UA_NAMES[key][name] if name in UA_NAMES[key] else name
|
name = UA_NAMES[key][name] if name in UA_NAMES[key] else name
|
||||||
|
@ -685,8 +685,8 @@ def sort_string(string):
|
||||||
string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th')
|
string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th')
|
||||||
|
|
||||||
# pad numbered titles
|
# pad numbered titles
|
||||||
string = re.sub(r'(\d),(\d{3})', '\\1\\2', string)
|
string = re.sub('(\d),(\d{3})', '\\1\\2', string)
|
||||||
string = re.sub(r'(\d+)', lambda x: '%010d' % int(x.group(0)), string)
|
string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string)
|
||||||
return unicodedata.normalize('NFKD', string)
|
return unicodedata.normalize('NFKD', string)
|
||||||
|
|
||||||
def sorted_strings(strings, key=None):
|
def sorted_strings(strings, key=None):
|
||||||
|
|
|
@ -43,7 +43,7 @@ def get_data(id):
|
||||||
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||||
data['themes'] = parse_list(html, 'themes')
|
data['themes'] = parse_list(html, 'themes')
|
||||||
data['types'] = parse_list(html, 'types')
|
data['types'] = parse_list(html, 'types')
|
||||||
data['year'] = find_re(html, r'<span class="year">.*?(\d+)')
|
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
|
||||||
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
|
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
|
||||||
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
|
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
|
||||||
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
|
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
|
||||||
|
|
|
@ -51,11 +51,11 @@ def get_movie_data(title, director):
|
||||||
'User-Agent': USER_AGENT
|
'User-Agent': USER_AGENT
|
||||||
}
|
}
|
||||||
html = read_url(url, headers=headers, unicode=True)
|
html = read_url(url, headers=headers, unicode=True)
|
||||||
results = re.compile(r'"(' + host + r'.*?poster\.jpg)"').findall(html)
|
results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
|
||||||
if results:
|
if results:
|
||||||
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
|
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
|
||||||
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
|
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
|
||||||
results = re.compile(r'"(' + host + r'\S+\.mov)"').findall(html)
|
results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
|
||||||
if results:
|
if results:
|
||||||
data['trailer'] = results[-1]
|
data['trailer'] = results[-1]
|
||||||
return data
|
return data
|
||||||
|
|
|
@ -28,7 +28,7 @@ def get_data(id, language='en'):
|
||||||
if m:
|
if m:
|
||||||
data['director'] = m[0]
|
data['director'] = m[0]
|
||||||
|
|
||||||
m = re.compile(r"caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
|
m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
|
||||||
if m:
|
if m:
|
||||||
data['image'] = m[0]
|
data['image'] = m[0]
|
||||||
|
|
||||||
|
|
|
@ -60,7 +60,7 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
|
||||||
url += '&start=%d' % offset
|
url += '&start=%d' % offset
|
||||||
data = read_url(url, timeout=timeout)
|
data = read_url(url, timeout=timeout)
|
||||||
data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
|
data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
|
||||||
for a in re.compile(r'<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
|
for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
|
||||||
results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
|
results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
|
||||||
if len(results) >= max_results:
|
if len(results) >= max_results:
|
||||||
break
|
break
|
||||||
|
|
|
@ -43,8 +43,8 @@ def reference_section(id):
|
||||||
return {
|
return {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
r'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
|
'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
|
||||||
r'<a href="/name/.*?>(.*?)</a>'
|
'<a href="/name/.*?>(.*?)</a>'
|
||||||
],
|
],
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
}
|
}
|
||||||
|
@ -54,8 +54,8 @@ def zebra_list(label, more=None):
|
||||||
conditions = {
|
conditions = {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
r'_label">' + label + '</td>.*?<ul(.*?)</ul>',
|
'_label">' + label + '</td>.*?<ul(.*?)</ul>',
|
||||||
r'<li.*?>(.*?)</li>'
|
'<li.*?>(.*?)</li>'
|
||||||
],
|
],
|
||||||
'type': 'list',
|
'type': 'list',
|
||||||
}
|
}
|
||||||
|
@ -67,7 +67,7 @@ def zebra_table(label, more=None, type='string'):
|
||||||
conditions = {
|
conditions = {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
r'_label">' + label + '</td>.*?<td>(.*?)</td>',
|
'_label">' + label + '</td>.*?<td>(.*?)</td>',
|
||||||
],
|
],
|
||||||
'type': type,
|
'type': type,
|
||||||
}
|
}
|
||||||
|
@ -97,9 +97,9 @@ def technical(label):
|
||||||
return {
|
return {
|
||||||
'page': 'technical',
|
'page': 'technical',
|
||||||
're': [
|
're': [
|
||||||
r'<td class="label">\s*?%s\s*?</td>.*?<td>\s*?(.*?)\s*?</td>' % label,
|
'<td class="label">\s*?%s\s*?</td>.*?<td>\s*?(.*?)\s*?</td>' % label,
|
||||||
lambda data: [
|
lambda data: [
|
||||||
re.sub(r'\s+', ' ', d.strip()) for d in data.strip().split('<br>')
|
re.sub('\s+', ' ', d.strip()) for d in data.strip().split('<br>')
|
||||||
] if data else []
|
] if data else []
|
||||||
],
|
],
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
|
@ -258,13 +258,13 @@ class Imdb(SiteParser):
|
||||||
'aspectratio': {
|
'aspectratio': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
r'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
|
'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
|
||||||
parse_aspectratio,
|
parse_aspectratio,
|
||||||
],
|
],
|
||||||
'type': 'float',
|
'type': 'float',
|
||||||
},
|
},
|
||||||
'budget': zebra_table('Budget', more=[
|
'budget': zebra_table('Budget', more=[
|
||||||
lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')
|
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||||
], type='int'),
|
], type='int'),
|
||||||
'cast': {
|
'cast': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
|
@ -287,12 +287,12 @@ class Imdb(SiteParser):
|
||||||
},
|
},
|
||||||
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
|
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
|
||||||
'gross': zebra_table('Cumulative Worldwide Gross', more=[
|
'gross': zebra_table('Cumulative Worldwide Gross', more=[
|
||||||
lambda data: find_re(decode_html(data).replace(',', ''), r'\d+')
|
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||||
], type='int'),
|
], type='int'),
|
||||||
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
|
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
|
||||||
'originalTitle': {
|
'originalTitle': {
|
||||||
'page': 'releaseinfo',
|
'page': 'releaseinfo',
|
||||||
're': r'<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
|
're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'summary': zebra_table('Plot Summary', more=[
|
'summary': zebra_table('Plot Summary', more=[
|
||||||
|
@ -300,7 +300,7 @@ class Imdb(SiteParser):
|
||||||
]),
|
]),
|
||||||
'storyline': {
|
'storyline': {
|
||||||
'page': '',
|
'page': '',
|
||||||
're': r'<h2>Storyline</h2>.*?<p>(.*?)</p>',
|
're': '<h2>Storyline</h2>.*?<p>(.*?)</p>',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'posterId': {
|
'posterId': {
|
||||||
|
@ -312,16 +312,16 @@ class Imdb(SiteParser):
|
||||||
'productionCompany': {
|
'productionCompany': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
r'Production Companies.*?<ul(.*?)</ul>',
|
'Production Companies.*?<ul(.*?)</ul>',
|
||||||
r'<a href="/company/.*?/">(.*?)</a>'
|
'<a href="/company/.*?/">(.*?)</a>'
|
||||||
],
|
],
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
},
|
},
|
||||||
'rating': {
|
'rating': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
r'<div class="ipl-rating-star ">(.*?)</div>',
|
'<div class="ipl-rating-star ">(.*?)</div>',
|
||||||
r'ipl-rating-star__rating">([\d,.]+?)</span>',
|
'ipl-rating-star__rating">([\d,.]+?)</span>',
|
||||||
],
|
],
|
||||||
'type': 'float'
|
'type': 'float'
|
||||||
},
|
},
|
||||||
|
@ -343,38 +343,38 @@ class Imdb(SiteParser):
|
||||||
'season': {
|
'season': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
r'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
||||||
r'Season (\d+)',
|
'Season (\d+)',
|
||||||
],
|
],
|
||||||
'type': 'int'
|
'type': 'int'
|
||||||
},
|
},
|
||||||
'episode': {
|
'episode': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
r'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
||||||
r'Episode (\d+)',
|
'Episode (\d+)',
|
||||||
],
|
],
|
||||||
'type': 'int'
|
'type': 'int'
|
||||||
},
|
},
|
||||||
'series': {
|
'series': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': r'<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
|
're': '<h4 itemprop="name">.*?<a href="/title/tt(\d+)',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'isSeries': {
|
'isSeries': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': r'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
|
're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'title': {
|
'title': {
|
||||||
'page': 'releaseinfo',
|
'page': 'releaseinfo',
|
||||||
're': r'<h2.*?>(.*?)</h2>',
|
're': '<h2.*?>(.*?)</h2>',
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
},
|
},
|
||||||
'trivia': {
|
'trivia': {
|
||||||
'page': 'trivia',
|
'page': 'trivia',
|
||||||
're': [
|
're': [
|
||||||
r'<div class="sodatext">(.*?)<(br|/div)',
|
'<div class="sodatext">(.*?)<(br|/div)',
|
||||||
lambda data: data[0]
|
lambda data: data[0]
|
||||||
],
|
],
|
||||||
'type': 'list',
|
'type': 'list',
|
||||||
|
@ -382,7 +382,7 @@ class Imdb(SiteParser):
|
||||||
'votes': {
|
'votes': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
r'class="ipl-rating-star__total-votes">\((.*?)\)',
|
'class="ipl-rating-star__total-votes">\((.*?)\)',
|
||||||
lambda r: r.replace(',', '')
|
lambda r: r.replace(',', '')
|
||||||
],
|
],
|
||||||
'type': 'string'
|
'type': 'string'
|
||||||
|
@ -391,8 +391,8 @@ class Imdb(SiteParser):
|
||||||
'year': {
|
'year': {
|
||||||
'page': 'reference',
|
'page': 'reference',
|
||||||
're': [
|
're': [
|
||||||
r'<span class="titlereference-title-year">(.*?)</span>',
|
'<span class="titlereference-title-year">(.*?)</span>',
|
||||||
r'<a.*?>(\d+)',
|
'<a.*?>(\d+)',
|
||||||
],
|
],
|
||||||
'type': 'int'
|
'type': 'int'
|
||||||
},
|
},
|
||||||
|
@ -400,7 +400,7 @@ class Imdb(SiteParser):
|
||||||
'page': 'fullcredits',
|
'page': 'fullcredits',
|
||||||
're': [
|
're': [
|
||||||
lambda data: data.split('<h4'),
|
lambda data: data.split('<h4'),
|
||||||
r'>(.*?)</h4>.*?(<table.*?</table>)',
|
'>(.*?)</h4>.*?(<table.*?</table>)',
|
||||||
lambda data: [d for d in data if d]
|
lambda data: [d for d in data if d]
|
||||||
],
|
],
|
||||||
'type': 'list'
|
'type': 'list'
|
||||||
|
@ -468,7 +468,7 @@ class Imdb(SiteParser):
|
||||||
title = title[1:-1]
|
title = title[1:-1]
|
||||||
if title.startswith("'") and title.endswith("'"):
|
if title.startswith("'") and title.endswith("'"):
|
||||||
title = title[1:-1]
|
title = title[1:-1]
|
||||||
title = re.sub(r'\(\#[.\d]+\)', '', title)
|
title = re.sub('\(\#[.\d]+\)', '', title)
|
||||||
return title.strip()
|
return title.strip()
|
||||||
|
|
||||||
for t in ('title', 'originalTitle'):
|
for t in ('title', 'originalTitle'):
|
||||||
|
@ -518,7 +518,7 @@ class Imdb(SiteParser):
|
||||||
self['actor'] = [c[0] for c in self['cast']]
|
self['actor'] = [c[0] for c in self['cast']]
|
||||||
def cleanup_character(c):
|
def cleanup_character(c):
|
||||||
c = c.replace('(uncredited)', '').strip()
|
c = c.replace('(uncredited)', '').strip()
|
||||||
c = re.sub(r'\s+', ' ', c)
|
c = re.sub('\s+', ' ', c)
|
||||||
return c
|
return c
|
||||||
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
||||||
for x in self['cast']]
|
for x in self['cast']]
|
||||||
|
@ -528,7 +528,7 @@ class Imdb(SiteParser):
|
||||||
del self['isSeries']
|
del self['isSeries']
|
||||||
self['isSeries'] = True
|
self['isSeries'] = True
|
||||||
if 'episodeTitle' in self:
|
if 'episodeTitle' in self:
|
||||||
self['episodeTitle'] = re.sub(r'Episode \#\d+\.\d+', '', self['episodeTitle'])
|
self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])
|
||||||
|
|
||||||
|
|
||||||
#make lists unique but keep order
|
#make lists unique but keep order
|
||||||
|
@ -790,7 +790,7 @@ def get_movie_by_title(title, timeout=-1):
|
||||||
url = "http://www.imdb.com/find?" + params
|
url = "http://www.imdb.com/find?" + params
|
||||||
data = read_url(url, timeout=timeout, unicode=True)
|
data = read_url(url, timeout=timeout, unicode=True)
|
||||||
#if search results in redirect, get id of current page
|
#if search results in redirect, get id of current page
|
||||||
r = r'<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
|
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
|
||||||
results = re.compile(r).findall(data)
|
results = re.compile(r).findall(data)
|
||||||
if results:
|
if results:
|
||||||
return results[0]
|
return results[0]
|
||||||
|
@ -869,12 +869,12 @@ def get_movie_id(title, director='', year='', timeout=-1):
|
||||||
|
|
||||||
data = read_url(url, timeout=timeout, unicode=True)
|
data = read_url(url, timeout=timeout, unicode=True)
|
||||||
#if search results in redirect, get id of current page
|
#if search results in redirect, get id of current page
|
||||||
r = r'<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
|
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d+)/" />'
|
||||||
results = re.compile(r).findall(data)
|
results = re.compile(r).findall(data)
|
||||||
if results:
|
if results:
|
||||||
return results[0]
|
return results[0]
|
||||||
#otherwise get first result
|
#otherwise get first result
|
||||||
r = r'<td valign="top">.*?<a href="/title/tt(\d+)/"'
|
r = '<td valign="top">.*?<a href="/title/tt(\d+)/"'
|
||||||
results = re.compile(r).findall(data)
|
results = re.compile(r).findall(data)
|
||||||
if results:
|
if results:
|
||||||
return results[0]
|
return results[0]
|
||||||
|
@ -885,7 +885,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
|
||||||
results = duckduckgo.find(google_query, timeout=timeout)
|
results = duckduckgo.find(google_query, timeout=timeout)
|
||||||
if results:
|
if results:
|
||||||
for r in results[:2]:
|
for r in results[:2]:
|
||||||
imdbId = find_re(r[1], r'title/tt(\d+)')
|
imdbId = find_re(r[1], 'title/tt(\d+)')
|
||||||
if imdbId:
|
if imdbId:
|
||||||
return imdbId
|
return imdbId
|
||||||
#or nothing
|
#or nothing
|
||||||
|
@ -912,11 +912,11 @@ def get_episodes(imdbId, season=None):
|
||||||
if season:
|
if season:
|
||||||
url += '?season=%d' % season
|
url += '?season=%d' % season
|
||||||
data = cache.read_url(url).decode()
|
data = cache.read_url(url).decode()
|
||||||
for e in re.compile(r'<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
|
for e in re.compile('<div data-const="tt(\d+)".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
|
||||||
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
|
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
|
||||||
else:
|
else:
|
||||||
data = cache.read_url(url)
|
data = cache.read_url(url)
|
||||||
match = re.compile(r'<strong>Season (\d+)</strong>').findall(data)
|
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
|
||||||
if match:
|
if match:
|
||||||
for season in range(1, int(match[0]) + 1):
|
for season in range(1, int(match[0]) + 1):
|
||||||
episodes.update(get_episodes(imdbId, season))
|
episodes.update(get_episodes(imdbId, season))
|
||||||
|
@ -927,7 +927,7 @@ def max_votes():
|
||||||
data = cache.read_url(url).decode('utf-8', 'ignore')
|
data = cache.read_url(url).decode('utf-8', 'ignore')
|
||||||
votes = max([
|
votes = max([
|
||||||
int(v.replace(',', ''))
|
int(v.replace(',', ''))
|
||||||
for v in re.compile(r'Votes</span>.*?([\d,]+)', re.DOTALL).findall(data)
|
for v in re.compile('<span name="nv" data-value="(\d+)"').findall(data)
|
||||||
])
|
])
|
||||||
return votes
|
return votes
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ from ox.net import read_url
|
||||||
def get_poster_url(id):
|
def get_poster_url(id):
|
||||||
url = 'http://piratecinema.org/posters/'
|
url = 'http://piratecinema.org/posters/'
|
||||||
html = read_url(url).decode('utf-8')
|
html = read_url(url).decode('utf-8')
|
||||||
results = re.compile(r'src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
|
results = re.compile('src="(.+)" title=".+\((\d{6}\d+)\)"').findall(html)
|
||||||
for result in results:
|
for result in results:
|
||||||
if result[1] == id:
|
if result[1] == id:
|
||||||
return url + result[0]
|
return url + result[0]
|
||||||
|
|
|
@ -81,36 +81,36 @@ def get_movie_data(wikipedia_url):
|
||||||
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
|
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
|
||||||
del filmbox['amg_id']
|
del filmbox['amg_id']
|
||||||
if 'Allmovie movie' in data:
|
if 'Allmovie movie' in data:
|
||||||
filmbox['amg_id'] = find_re(data, r'Allmovie movie\|.*?(\d+)')
|
filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
|
||||||
elif 'Allmovie title' in data:
|
elif 'Allmovie title' in data:
|
||||||
filmbox['amg_id'] = find_re(data, r'Allmovie title\|.*?(\d+)')
|
filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
|
||||||
|
|
||||||
if 'Official website' in data:
|
if 'Official website' in data:
|
||||||
filmbox['website'] = find_re(data, r'Official website\|(.*?)}').strip()
|
filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
|
||||||
|
|
||||||
r = re.compile(r'{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
|
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
|
||||||
if r:
|
if r:
|
||||||
filmbox['imdb_id'] = r[0]
|
filmbox['imdb_id'] = r[0]
|
||||||
else:
|
else:
|
||||||
r = re.compile(r'{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
|
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
|
||||||
if r:
|
if r:
|
||||||
filmbox['imdb_id'] = r[0]
|
filmbox['imdb_id'] = r[0]
|
||||||
|
|
||||||
r = re.compile(r'{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
|
r = re.compile('{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
|
||||||
if r:
|
if r:
|
||||||
filmbox['archiveorg_id'] = r[0]
|
filmbox['archiveorg_id'] = r[0]
|
||||||
|
|
||||||
r = re.compile(r'{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
|
r = re.compile('{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
|
||||||
if r:
|
if r:
|
||||||
filmbox['mojo_id'] = r[0].replace('id=', '')
|
filmbox['mojo_id'] = r[0].replace('id=', '')
|
||||||
|
|
||||||
r = re.compile(r'{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
|
r = re.compile('{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
|
||||||
if r:
|
if r:
|
||||||
filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
|
filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
|
||||||
if 'google video' in data:
|
if 'google video' in data:
|
||||||
filmbox['google_video_id'] = find_re(data, r'google video\|.*?(\d*?)[\|}]')
|
filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
|
||||||
if 'DEFAULTSORT' in data:
|
if 'DEFAULTSORT' in data:
|
||||||
filmbox['title_sort'] = find_re(data, r'''\{\{DEFAULTSORT:(.*?)\}\}''')
|
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
|
||||||
return filmbox
|
return filmbox
|
||||||
|
|
||||||
def get_image_url(name):
|
def get_image_url(name):
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def get_version():
|
||||||
f = open(changelog)
|
f = open(changelog)
|
||||||
head = f.read().strip().split('\n')[0]
|
head = f.read().strip().split('\n')[0]
|
||||||
f.close()
|
f.close()
|
||||||
rev = re.compile(r'\d+\.\d+\.(\d+)').findall(head)
|
rev = re.compile('\d+\.\d+\.(\d+)').findall(head)
|
||||||
if rev:
|
if rev:
|
||||||
return '3.0.%s' % rev[0]
|
return '3.0.%s' % rev[0]
|
||||||
return '3.0.x'
|
return '3.0.x'
|
||||||
|
|
Loading…
Reference in a new issue