oxdbarchive/oxdbarchive/subtitles.py
2007-07-12 19:47:19 +00:00

280 lines
No EOL
8.8 KiB
Python

# -*- coding: utf-8 -*-
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
import os
from os.path import abspath, join, dirname
import shutil
import time
import chardet
# file name extension given to the frame images that extract_frame stores
img_extension = 'jpg'
def srt2txt(srt, encoding = "utf-8"):
  '''return only the text of the subtitles in an srt string,
     ordered by subtitle number, each followed by a blank line;
     surrounding whitespace is stripped from the result
  '''
  cues = srt2dict(srt, encoding)
  numbers = [int(num) for num in cues.keys()]
  numbers.sort()
  pieces = ["%s\n\n" % cues["%s" % num]['text'] for num in numbers]
  return ''.join(pieces).strip()
def srt2dict(srt, encoding = "utf-8"):
  '''parse an srt string into a dict of the form
       dict(num = dict(start, stop, text))
     num is the subtitle number as a string, start and stop are the
     timestamps as written in the srt, text is the remaining lines
     joined with newlines.
     blocks with fewer than three lines are skipped.
     encoding is accepted but not used here.
  '''
  parsed = {}
  for block in srt.replace('\r', '').strip().split('\n\n'):
    lines = block.strip().split('\n')
    if len(lines) <= 2:
      # not a complete subtitle (number, timing, text)
      continue
    timing = lines[1].split(' --> ')
    # int() normalizes the number, "007" is stored as "7"
    number = u"%s" % int(lines[0])
    parsed[number] = dict(
      start = timing[0],
      stop = timing[1],
      text = u'\n'.join(lines[2:]),
    )
  return parsed
def dict2srt(subtitles, encoding = "utf-8"):
  '''serialize a dict of the form dict(num = dict(start, stop, text))
     to srt: subtitles are written in numerical order with CRLF line
     ends, surrounding whitespace is stripped and the result is
     returned encoded with encoding
  '''
  blocks = []
  numbers = [int(num) for num in subtitles.keys()]
  numbers.sort()
  for num in numbers:
    entry = subtitles["%s" % num]
    blocks.append("%s\r\n%s --> %s\r\n%s\r\n\r\n" % (
      num, entry['start'], entry['stop'], entry['text']))
  return ''.join(blocks).strip().encode(encoding)
def time_str2msec(time_string):
  '''convert a timestamp like 01:50:52,123 into milliseconds (int)

     the part after the comma is optional and is read as the decimal
     fraction of a second, so "00:00:01,5" is 1500 msec.
     hours are not limited to 23.
     raises ValueError if the string is not of the form
     HH:MM:SS[,fraction]
  '''
  parts = time_string.split(',')
  if len(parts) > 1:
    # round, do not truncate: float("0.001") * 1000 is not exactly 1
    # and plain int() would turn such values into one msec less
    msec = int(round(float("0." + parts[-1]) * 1000))
  else:
    msec = 0
  # plain integer arithmetic, no detour through local time
  hours, minutes, seconds = [int(p) for p in parts[0].split(':')]
  return ((hours * 60 + minutes) * 60 + seconds) * 1000 + msec
def msec2time_str(msec):
  '''convert milliseconds into a timestamp like 01:50:52,123

     the millisecond part is always three digits (5 msec is ",005"),
     so that the result reads back correctly with time_str2msec.
     hours are not wrapped at 24. negative values can not be written
     as a timestamp and are clamped to 00:00:00,000.
  '''
  total = max(int(msec), 0)
  seconds, ms = divmod(total, 1000)
  minutes, seconds = divmod(seconds, 60)
  hours, minutes = divmod(minutes, 60)
  return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, ms)
def shift_time(offset, time_string):
  ''' add offset milliseconds to a timestamp and return the result
      as a timestamp string again
      the timestamp is expected to look like 01:50:52,123
  '''
  return msec2time_str(time_str2msec(time_string) + offset)
def shift_subtitles(offset, offset_num, subtitles):
  '''
    return a new dict of subtitles in which every subtitle is moved by
    offset milliseconds and renumbered by adding offset_num to its
    number.
    subtitles is a dict in the form dict(num = dict(start, stop, text));
    the dict and its entries are left unchanged.
  '''
  shifted = {}
  for num in sorted([int(k) for k in subtitles.keys()]):
    # work on a copy of the entry so the times in the caller's dict
    # are not shifted as a side effect
    entry = dict(subtitles["%s" % num])
    entry['start'] = shift_time(offset, entry['start'])
    entry['stop'] = shift_time(offset, entry['stop'])
    shifted["%s" % (num + offset_num)] = entry
  return shifted
def merge_subtitles(subtitles):
  '''
    join several srt parts into one srt
    subtitles maps a sortable key to dict(txt, length); the parts are
    taken in key order. txt is the srt string of a part, length is
    added to the time offset used for all parts after it (the offset
    is handed to shift_subtitles, so presumably milliseconds).
    parts after a non-zero offset are renumbered by the number of
    subtitles merged so far.
  '''
  merged = {}
  elapsed = 0
  for part in sorted(subtitles):
    cues = srt2dict(subtitles[part]['txt'])
    if elapsed:
      cues = shift_subtitles(elapsed, len(merged), cues)
    merged.update(cues)
    elapsed += subtitles[part]['length']
  return dict2srt(merged)
def split_subtitle(subtitles, offset):
  '''
    split subtitles at offset (milliseconds, as in shift_time)

    subtitles is a dict in the form dict(num = dict(start, stop, text)).
    returns a tuple (one, two):
    one holds the subtitles that stop before offset, unchanged.
    two holds all other subtitles, moved back by offset and renumbered
    by subtracting the number of subtitles in one. a subtitle that
    starts before offset but stops after it goes into two and has its
    start clamped to zero.
  '''
  one = {}
  rest = []
  for num in sorted([int(k) for k in subtitles.keys()]):
    key = "%s" % num
    # compare as numbers, timestamps of different width do not
    # compare correctly as strings
    if time_str2msec(subtitles[key]['stop']) < offset:
      one[key] = subtitles[key]
    else:
      rest.append(num)
  two = {}
  for num in rest:
    # copy, the entries of the dict passed in stay as they are
    entry = dict(subtitles["%s" % num])
    for field in ('start', 'stop'):
      moved = time_str2msec(entry[field]) - offset
      entry[field] = msec2time_str(max(moved, 0))
    two["%s" % (num - len(one))] = entry
  return one, two
def extract_flash_ng(movie_file, flash_file, inpoint, outpoint, width=128, height=96, offset = 0):
  '''
    extract the clip between inpoint and outpoint of movie_file into
    flash_file by running tools/extract_clip.py (next to this module)
    with: movie_file flash_file inpoint duration, both in milliseconds

    inpoint and outpoint are timestamps like 01:50:52,123; if offset
    (milliseconds) is given both are moved back by it first.
    width and height are accepted but not used.
    files ending in sub or srt are refused.
  '''
  import subprocess
  ext = movie_file.split('.')[-1]
  if ext in ('sub', 'srt'):
    print("this is not a movie file, will not try to extract frames")
    return
  if offset:
    shifted = shift_time(-offset, inpoint)
    outpoint = shift_time(-offset, outpoint)
    print("Inpoint  %s  becomes  %s" % (inpoint, shifted))
    inpoint = shifted
  print("extracting %s -> %s" % (inpoint, outpoint))
  start = time_str2msec(inpoint)
  duration = time_str2msec(outpoint) - start
  extractClipScript = abspath(join(dirname(__file__), "tools/extract_clip.py"))
  # pass the arguments as a list and not through the shell: file names
  # with spaces or quotes reach the script unchanged
  cmd = [extractClipScript, movie_file, flash_file, "%s" % start, "%s" % duration]
  try:
    subprocess.call([arg.encode('utf-8') for arg in cmd])
  except OSError:
    # script missing or not executable, keep going like os.system did
    print("could not run %s" % extractClipScript)
def extract_flash(movie_file, flash_file, inpoint, outpoint, width=128, height=96, offset = 0):
  '''
    extract the clip between inpoint and outpoint of movie_file into
    flash_file: mencoder copies the clip into a temporary avi, ffmpeg
    converts that into flash_file. output of both tools is discarded.

    inpoint and outpoint are timestamps like 01:50:52,123; if offset
    (milliseconds) is given both are moved back by it first.
    width and height are accepted but not used.
    files ending in sub, srt or mkv are refused; if movie_file does not
    exist only a message is printed.
  '''
  import subprocess
  import tempfile
  ext = movie_file.split('.')[-1]
  if ext in ('sub', 'srt', 'mkv'):
    print("this is not a movie file, will not try to extract frames")
    return
  if offset:
    shifted = shift_time(-offset, inpoint)
    outpoint = shift_time(-offset, outpoint)
    print("Inpoint  %s  becomes  %s" % (inpoint, shifted))
    inpoint = shifted
  print("extracting %s -> %s" % (inpoint, outpoint))
  # length of the clip in seconds, plus one extra second
  duration = float(time_str2msec(outpoint) - time_str2msec(inpoint)) / 1000 + 1
  audiorate = "44100"
  if not os.path.exists(movie_file):
    print("update the cache %s missing" % movie_file.encode('utf-8'))
    return
  # the tools run inside the temporary folder, so they need absolute paths
  movie_file = abspath(movie_file)
  flash_file = abspath(flash_file)
  mencoder = ["mencoder", movie_file, "-ss", inpoint, "-endpos", "%0.2f" % duration,
              "-ovc", "copy", "-oac", "copy", "-o", "tempfile.avi"]
  ffmpeg = ["ffmpeg", "-y", "-i", "tempfile.avi",
            "-ar", audiorate, "-b", "128000", flash_file]
  # mkdtemp creates the folder itself (no race as with os.tempnam) and
  # cwd= below replaces os.chdir, which left the whole process inside
  # a folder that was deleted afterwards
  framedir = tempfile.mkdtemp()
  devnull = open(os.devnull, 'w')
  try:
    for command in (mencoder, ffmpeg):
      try:
        # argument lists, no shell: quotes in file names are harmless
        subprocess.call([arg.encode('utf-8') for arg in command],
                        cwd=framedir, stdout=devnull, stderr=devnull)
      except OSError:
        print("could not run %s" % command[0])
        break
  finally:
    # clean up even if something above fails
    devnull.close()
    shutil.rmtree(framedir)
def extract_frame(movie_file, timestamp, img_folder, width=128, offset = 0, redo = False):
  '''
    store the frame of movie_file at timestamp as an image in
    img_folder, named after the timestamp (":" replaced by ".") with
    img_extension as extension. mplayer writes two frames into a
    temporary folder, the last one is kept.

    timestamp looks like 01:50:52,123; if offset (milliseconds) is
    given the position in the file is moved back by it, the name of
    the image is not. width is passed to mplayer as -xy.
    an existing image is only replaced if redo is set.
    files ending in sub or srt are refused; if movie_file does not
    exist only a message is printed.
  '''
  import subprocess
  import tempfile
  ext = movie_file.split('.')[-1]
  if ext in ('sub', 'srt'):
    print("this is not a movie file, will not try to extract frames")
    return
  if offset:
    timestamp_in_file = shift_time(-offset, timestamp)
  else:
    timestamp_in_file = timestamp
  if not os.path.exists(movie_file):
    print("update the cache %s missing" % movie_file)
    return
  frame = os.path.join(img_folder, "%s.%s" % (timestamp.replace(':', '.'), img_extension))
  if os.path.exists(frame) and not redo:
    return
  # mplayer runs inside the temporary folder, so it needs an absolute path
  mplayer = ["mplayer", abspath(movie_file), "-ss", timestamp_in_file, "-frames", "2",
             "-vo", "jpeg:quality=90", "-vf", "scale", "-zoom", "-xy", "%d" % width,
             "-ao", "null"]
  print(" ".join(mplayer).encode('utf-8'))
  # mkdtemp creates the folder itself (no race as with os.tempnam) and
  # cwd= below replaces os.chdir, which left the whole process inside
  # a folder that was deleted afterwards
  framedir = tempfile.mkdtemp()
  devnull = open(os.devnull, 'w')
  try:
    try:
      # argument list, no shell: quotes in file names are harmless
      subprocess.call([arg.encode('utf-8') for arg in mplayer],
                      cwd=framedir, stdout=devnull, stderr=devnull)
    except OSError:
      print("could not run mplayer")
    # os.listdir returns names in arbitrary order; sort them so that
    # the last name is the last frame mplayer wrote
    files = sorted(os.listdir(framedir))
    if files:
      print("creating frame  %s" % frame)
      shutil.move(os.path.join(framedir, files[-1]), frame)
  finally:
    # removes the remaining frames as well, even if something above fails
    devnull.close()
    shutil.rmtree(framedir)
def extract_subtitles(movie_file, srt, img_folder, width=128, offset = 0, redo = False):
  '''call extract_frame for the start time of every subtitle in the
     srt string, in order of subtitle number; all other arguments are
     handed on to extract_frame unchanged
  '''
  cues = srt2dict(srt)
  numbers = [int(num) for num in cues.keys()]
  numbers.sort()
  for num in numbers:
    extract_frame(movie_file, cues["%s" % num]['start'], img_folder, width, offset, redo)
def detectEncoding(fp):
  '''guess the encoding of the file object fp and return its name

     a byte order mark at the start of the file decides (utf_32_be,
     utf_32_le, utf_16_be, utf_16_le or utf_8); without one the whole
     file is given to chardet. if the file is empty or chardet has no
     answer, latin-1 is returned.
     fp has to be seekable and return bytes; its position is the same
     after the call as before.
  '''
  import codecs
  # the UTF-32 marks have to be tried first, the UTF-32 LE mark
  # starts with the bytes of the UTF-16 LE one
  boms = (
    (codecs.BOM_UTF32_BE, "utf_32_be"),
    (codecs.BOM_UTF32_LE, "utf_32_le"),
    (codecs.BOM_UTF8, "utf_8"),
    (codecs.BOM_UTF16_BE, "utf_16_be"),
    (codecs.BOM_UTF16_LE, "utf_16_le"),
  )
  oldFP = fp.tell()
  try:
    fp.seek(0)
    # may be less than 4 bytes for very short files, startswith copes
    head = fp.read(4)
    for bom, name in boms:
      if head.startswith(bom):
        ## if BOM detected, we're done :-)
        return name
    #more character detecting magick using http://chardet.feedparser.org/
    fp.seek(0)
    rawdata = fp.read()
  finally:
    fp.seek(oldFP)
  if not rawdata:
    return 'latin-1'
  # chardet reports None as encoding if it can not decide
  return chardet.detect(rawdata)['encoding'] or 'latin-1'
def loadSrt(fname):
  '''read the file fname and return its content as a unicode string

     the encoding is taken from detectEncoding; if that encoding is
     unknown or does not fit the data, the file is read as latin-1.
     a byte order mark at the start is removed.
  '''
  # binary mode: the raw bytes are needed to detect the encoding
  f = open(fname, 'rb')
  try:
    encoding = detectEncoding(f)
    data = f.read()
  finally:
    f.close()
  try:
    udata = data.decode(encoding)
  except (LookupError, ValueError, TypeError):
    # encoding unknown, None, or wrong for this data.
    # latin-1 maps every byte to a character, so this can not fail
    udata = data.decode('latin-1')
  if udata.startswith(u'\ufeff'):
    udata = udata[1:]
  return udata