280 lines
No EOL
8.8 KiB
Python
280 lines
No EOL
8.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
# -*- Mode: Python; -*-
|
|
# vi:si:et:sw=2:sts=2:ts=2
|
|
|
|
import re
|
|
import os
|
|
from os.path import abspath, join, dirname
|
|
import shutil
|
|
import time
|
|
|
|
import chardet
|
|
|
|
# file extension (without the dot) of the frame images written by extract_frame
img_extension = 'jpg'
|
|
|
|
def srt2txt(srt, encoding = "utf-8"):
  '''return only the text of all subtitles in an srt string

  the texts are taken in numeric order of the cue numbers and
  separated by an empty line; whitespace around the result is
  stripped. encoding is handed to srt2dict unchanged.
  '''
  cues = srt2dict(srt, encoding)
  numbers = [int(number) for number in cues.keys()]
  numbers.sort()
  paragraphs = ["%s\n\n" % cues["%s" % number]['text'] for number in numbers]
  return ''.join(paragraphs).strip()
|
|
|
|
def srt2dict(srt, encoding = "utf-8"):
  '''convert srt string into a dict in the form
  dict(num = dict(start, stop, text))

  num is the cue number as a string without leading zeros, start and
  stop are the time strings left and right of ' --> ' and text is the
  remaining lines of the cue joined with newlines.

  blocks that are not a complete cue (fewer than three lines, no
  ' --> ' in the second line or a first line that is not a number)
  are skipped. encoding is accepted but not used.
  '''
  subdict = {}
  srt = srt.replace('\r', '').strip()
  # the line between two cues may hold stray spaces or tabs; splitting on
  # '\n\n' only would glue such cues together into one
  for block in re.split(r'\n(?:[ \t]*\n)+', srt):
    lines = block.strip().split('\n')
    if len(lines) < 3:
      continue
    start_stop = lines[1].split(' --> ')
    if len(start_stop) < 2:
      continue
    try:
      num = u"%s" % int(lines[0])
    except ValueError:
      # not a cue number, do not let one broken block abort the whole file
      continue
    subdict[num] = {
      'start': start_stop[0],
      'stop': start_stop[1],
      'text': u'\n'.join(lines[2:]),
    }
  return subdict
|
|
|
|
def dict2srt(subtitles, encoding = "utf-8"):
  '''serialize subtitles in the form dict(num = dict(start, stop, text))
  into srt

  cues are written in numeric order of num, lines end with CRLF and
  cues are separated by an empty line. whitespace around the result is
  stripped and the result is returned encoded with encoding.
  '''
  cue_template = "%s\r\n%s --> %s\r\n%s\r\n\r\n"
  chunks = []
  for number in sorted(int(key) for key in subtitles.keys()):
    key = "%s" % number
    cue = subtitles[key]
    chunks.append(cue_template % (key, cue['start'], cue['stop'], cue['text']))
  return ''.join(chunks).strip().encode(encoding)
|
|
|
|
def time_str2msec(time_string):
  '''convert a time string like 01:50:52,123 into milliseconds

  the part after the comma is optional and is read as the decimal
  fraction of a second, so ",5" means 500 milliseconds.
  raises ValueError if the part before the comma is not made of
  three numbers separated by colons.
  '''
  parts = time_string.split(',')
  if len(parts) > 1:
    msec = int(round(float("0." + parts[-1]) * 1000))
  else:
    msec = 0
  # plain integer arithmetic: going through time.mktime and float
  # subtraction depends on the local timezone and can lose a millisecond
  # when the result is truncated
  hours, minutes, seconds = [int(value) for value in parts[0].split(':')]
  return ((hours * 60 + minutes) * 60 + seconds) * 1000 + msec
|
|
|
|
def msec2time_str(msec):
  '''convert milliseconds into a time string like 01:50:52,123

  hours, minutes and seconds are zero padded to two digits, the
  milliseconds to three digits. fractions of a millisecond are dropped.
  negative values are clamped to 00:00:00,000, hours are not wrapped
  at 24.
  '''
  # without padding 5 msec would come out as ",5", which reads back as
  # half a second
  total = max(int(msec), 0)
  seconds, milliseconds = divmod(total, 1000)
  minutes, seconds = divmod(seconds, 60)
  hours, minutes = divmod(minutes, 60)
  return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, milliseconds)
|
|
|
|
def shift_time(offset, time_string):
  '''shift time_string by offset milliseconds and return the
  result as a time string again

  time_string is expected in the form 01:50:52,123
  '''
  return msec2time_str(time_str2msec(time_string) + offset)
|
|
|
|
def shift_subtitles(offset, offset_num, subtitles):
  '''
  return a shifted and renumbered copy of subtitles

  offset is passed to shift_time (milliseconds) for start and stop of
  every cue, offset_num is added to every cue number. subtitles has the
  form dict(num = dict(start, stop, text)); it is not modified.
  '''
  sdict = {}
  for k in sorted([int(k) for k in subtitles.keys()]):
    # work on a copy, otherwise the cues of the caller get shifted too
    cue = dict(subtitles["%s" % k])
    cue['start'] = shift_time(offset, cue['start'])
    cue['stop'] = shift_time(offset, cue['stop'])
    sdict["%s" % (k + offset_num)] = cue
  return sdict
|
|
|
|
def merge_subtitles(subtitles):
  '''
  merge several srt parts into one srt subtitle

  subtitles is a dict whose values have the form dict(txt, length):
  txt is the srt text of one part, length is added to the time offset
  (milliseconds) of all parts that follow. parts are processed in
  sorted order of their keys. the result is what dict2srt returns for
  the merged cues.
  '''
  subs = {}
  offset = 0
  for k in sorted(subtitles.keys()):
    sdict = srt2dict(subtitles[k]['txt'])
    # continue after the highest number used so far. counting the cues
    # instead would reuse numbers as soon as the numbering has gaps
    last_num = max([int(num) for num in subs] or [0])
    if offset or last_num:
      sdict = shift_subtitles(offset, last_num, sdict)
    subs.update(sdict)
    offset += subtitles[k]['length']
  return dict2srt(subs)
|
|
|
|
def split_subtitle(subtitles, offset):
  '''
  split subtitles at offset

  subtitles has the form dict(num = dict(start, stop, text)), offset is
  in milliseconds. returns the tuple (one, two):
  one holds copies of all cues that stop before offset, unchanged.
  two holds the remaining cues, shifted back by offset and renumbered
  so that the first of them becomes number 1.

  NOTE(review): a cue that starts before offset but stops after it goes
  to two, its shifted start is then before zero; what comes out for it
  depends on msec2time_str.
  '''
  one = {}
  two = {}
  for k in sorted([int(k) for k in subtitles.keys()]):
    key = "%s" % k
    # copies, so the cues of the caller are never touched
    cue = dict(subtitles[key])
    if time_str2msec(cue['stop']) < offset:
      one[key] = cue
    else:
      two[key] = cue
  if two:
    first = min([int(k) for k in two])
    two = shift_subtitles(-offset, 1 - first, two)
  return one, two
|
|
|
|
def extract_flash_ng(movie_file, flash_file, inpoint, outpoint, width=128, height=96, offset = 0):
  '''
  extract the part between inpoint and outpoint of movie_file into
  flash_file by calling tools/extract_clip.py (next to this file)

  inpoint and outpoint are time strings like 01:50:52,123. if offset
  (milliseconds) is set both are shifted back by offset first.
  the script is called with movie_file, flash_file, the inpoint in
  milliseconds and the duration in milliseconds.
  files ending in sub or srt are skipped. width and height are not used.
  '''
  try:
    from shlex import quote
  except ImportError:
    from pipes import quote

  ext = movie_file.split('.')[-1]
  if ext in ('sub', 'srt'):
    print("this is not a movie file, will not try to extract frames")
    return
  if offset:
    shifted_inpoint = shift_time(-offset, inpoint)
    outpoint = shift_time(-offset, outpoint)
    print("Inpoint  %s  becomes  %s" % (inpoint, shifted_inpoint))
    inpoint = shifted_inpoint

  print("extracting %s -> %s" % (inpoint, outpoint))
  duration = time_str2msec(outpoint) - time_str2msec(inpoint)
  inpoint = time_str2msec(inpoint)
  extractClipScript = abspath(join(dirname(__file__), "tools/extract_clip.py"))

  # the command goes through the shell: quote every path so that spaces,
  # quotes or $ in a file name can not break or change the command
  cmd = "%s %s %s %s %s" % (
    quote(extractClipScript),
    quote(movie_file),
    quote(flash_file),
    inpoint,
    duration)
  os.system(cmd.encode('utf-8'))
|
|
|
|
def extract_flash(movie_file, flash_file, inpoint, outpoint, width=128, height=96, offset = 0):
  '''
  extract the part between inpoint and outpoint of movie_file into
  flash_file

  mencoder copies the part (audio and video stream copy) into a
  temporary avi, ffmpeg then converts that avi into flash_file with an
  audio rate of 44100. the part is one second longer than
  outpoint - inpoint.

  inpoint and outpoint are time strings like 01:50:52,123. if offset
  (milliseconds) is set both are shifted back by offset first.
  files ending in sub, srt or mkv are skipped. width and height are
  not used. the working directory of the process is not changed.
  '''
  import tempfile
  try:
    from shlex import quote
  except ImportError:
    from pipes import quote

  ext = movie_file.split('.')[-1]
  if ext in ('sub', 'srt', 'mkv'):
    print("this is not a movie file, will not try to extract frames")
    return
  if offset:
    shifted_inpoint = shift_time(-offset, inpoint)
    outpoint = shift_time(-offset, outpoint)
    print("Inpoint  %s  becomes  %s" % (inpoint, shifted_inpoint))
    inpoint = shifted_inpoint
  print("extracting %s -> %s" % (inpoint, outpoint))
  # from here on outpoint is the length of the clip in seconds
  outpoint = float(time_str2msec(outpoint) - time_str2msec(inpoint)) / 1000 + 1

  if not os.path.exists(movie_file):
    print("update the cache %s missing" % movie_file.encode('utf-8'))
    return

  audiorate = "44100"
  # mkdtemp instead of os.tempnam: the directory is created safely.
  # the temporary avi is addressed by its full path, so there is no need
  # to chdir and relative movie_file / flash_file keep working
  framedir = tempfile.mkdtemp()
  clip_avi = join(framedir, 'tempfile.avi')
  try:
    mencoder_options = ''
    mencoder_options += " %s" % quote(movie_file)
    mencoder_options += " -ss %s -endpos %0.2f" % (quote(inpoint), outpoint)
    mencoder_options += " -ovc copy -oac copy -o %s " % quote(clip_avi)
    mencoder = "mencoder %s >/dev/null 2>&1" % mencoder_options
    os.system(mencoder.encode('utf-8'))

    ffmpeg_options = ''
    ffmpeg_options += " -y -i %s" % quote(clip_avi)
    ffmpeg_options += " -ar %s -b 128000 %s" % (audiorate, quote(flash_file))
    ffmpeg = "ffmpeg %s >/dev/null 2>&1" % ffmpeg_options
    os.system(ffmpeg.encode('utf-8'))
  finally:
    # remove the temporary directory even if something above failed
    shutil.rmtree(framedir)
|
|
|
|
def extract_frame(movie_file, timestamp, img_folder, width=128, offset = 0, redo = False):
  '''
  extract the frame at timestamp from movie_file with mplayer and
  store it in img_folder

  timestamp is a time string like 01:50:52,123; the image is named
  after it with ':' replaced by '.' plus img_extension. if offset
  (milliseconds) is set, the position looked up in the movie is
  timestamp shifted back by offset, the file name still uses timestamp.
  width is passed to mplayer as -xy.
  an existing image is only replaced if redo is set.
  files ending in sub or srt are skipped. the working directory of the
  process is not changed.
  '''
  import tempfile
  try:
    from shlex import quote
  except ImportError:
    from pipes import quote

  ext = movie_file.split('.')[-1]
  if ext in ('sub', 'srt'):
    print("this is not a movie file, will not try to extract frames")
    return
  if offset:
    timestamp_in_file = shift_time(-offset, timestamp)
  else:
    timestamp_in_file = timestamp
  if not os.path.exists(movie_file):
    print("update the cache %s missing" % movie_file)
    return
  frame = os.path.join(img_folder, "%s.%s" % (timestamp.replace(':', '.'), img_extension))
  if not redo and os.path.exists(frame):
    return

  # mkdtemp instead of os.tempnam: the directory is created safely
  framedir = tempfile.mkdtemp()
  try:
    mplayer_options = ''
    # mplayer runs inside framedir, so the movie needs its full path
    mplayer_options += " %s" % quote(abspath(movie_file))
    mplayer_options += " -ss %s -frames 2" % quote(timestamp_in_file)
    mplayer_options += " -vo jpeg:quality=90 -vf scale -zoom -xy %d " % width
    mplayer_options += " -ao null"
    # the jpeg output writes into the current directory; cd only inside
    # the shell that runs mplayer instead of chdir for the whole process
    mplayer = "cd %s && mplayer %s >/dev/null 2>&1" % (quote(framedir), mplayer_options)
    print(mplayer.encode('utf-8'))
    os.system(mplayer.encode('utf-8'))
    # listdir has no defined order, sort to really get the last frame
    files = sorted(os.listdir(framedir))
    if files:
      print("creating frame  %s" % frame)
      shutil.move(os.path.join(framedir, files[-1]), frame)
    # NOTE(review): pause kept from the original code, reason not visible here
    time.sleep(0.1)
  finally:
    # also removes the frames that were not used
    shutil.rmtree(framedir)
|
|
|
|
|
|
def extract_subtitles(movie_file, srt, img_folder, width=128, offset = 0, redo = False):
  '''extract one frame of movie_file for every subtitle in srt

  for each cue, in numeric order of the cue numbers, extract_frame is
  called with the start time of the cue; movie_file, img_folder, width,
  offset and redo are passed through unchanged.
  '''
  cues = srt2dict(srt)
  numbers = [int(number) for number in cues.keys()]
  numbers.sort()
  for number in numbers:
    start = cues["%s" % number]['start']
    extract_frame(movie_file, start, img_folder, width, offset, redo)
|
|
|
|
def detectEncoding(fp):
  '''
  guess the encoding of the open file fp

  if the file starts with a byte order mark the matching codec name is
  returned (utf_32_be, utf_32_le, utf_16_be, utf_16_le or utf_8).
  otherwise the whole file is given to chardet; if chardet has no
  answer latin-1 is returned.
  the read position of fp is the same after the call as before.
  NOTE(review): assumes fp.read() returns bytes, i.e. a file opened in
  binary mode - confirm against callers.
  '''
  bomDict={ # bytepattern : name
    (0x00, 0x00, 0xFE, 0xFF) : "utf_32_be",
    (0xFF, 0xFE, 0x00, 0x00) : "utf_32_le",
    (0xFE, 0xFF, None, None) : "utf_16_be",
    (0xFF, 0xFE, None, None) : "utf_16_le",
    (0xEF, 0xBB, 0xBF, None) : "utf_8",
  }

  oldFP = fp.tell()
  try:
    # go to beginning of file and get the first 4 bytes
    fp.seek(0)
    # iterating the data gives 1-char strings or ints depending on the
    # python version, turn both into numbers
    head = [c if isinstance(c, int) else ord(c) for c in fp.read(4)]
    # shorter files are padded so they can still hold a 2 or 3 byte mark
    head += [None] * (4 - len(head))
    (byte1, byte2, byte3, byte4) = head

    # try bom detection using 4 bytes, 3 bytes, or 2 bytes
    bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
    if not bomDetection:
      bomDetection = bomDict.get((byte1, byte2, byte3, None))
    if not bomDetection:
      bomDetection = bomDict.get((byte1, byte2, None, None))

    # if BOM detected, we're done :-)
    if bomDetection:
      return bomDetection

    # more character detecting magick using http://chardet.feedparser.org/
    fp.seek(0)
    rawdata = fp.read()
    # chardet reports None if it can not tell, fall back to latin-1 then
    return chardet.detect(rawdata)['encoding'] or 'latin-1'
  finally:
    fp.seek(oldFP)
|
|
|
|
def loadSrt(fname):
  '''
  read the subtitle file fname and return its content as unicode

  the encoding is taken from detectEncoding; if the data can not be
  decoded with it, latin-1 is used instead. a byte order mark at the
  start of the text is removed.
  '''
  # binary mode: the raw bytes are needed for detection and decoding
  f = open(fname, 'rb')
  try:
    encoding = detectEncoding(f)
    data = f.read()
  finally:
    f.close()
  try:
    udata = data.decode(encoding)
  except (UnicodeError, LookupError, TypeError):
    # wrong guess, unknown codec name or no encoding at all.
    # latin-1 maps every byte to a character, so this can not fail
    udata = data.decode('latin-1')
  if udata.startswith(u'\ufeff'):
    udata = udata[1:]
  return udata
|
|
|