oxdbarchive/oxdbarchive/tools/subtitles.py

140 lines
3.6 KiB
Python

#!/usr/bin/env python
# depends on
# subtitleripper - http://subtitleripper.sourceforge.net
# unrar
# tesseract-ocr - http://tesseract-ocr.googlecode.com
#
import Image
import os
import sys
import shutil
from glob import glob
import warnings
warnings.filterwarnings("ignore", "tempnam")
# Candidate colour settings, tried one after another. Each string is handed
# to vobsub2pgm through its -c option and, with the commas turned into
# dashes, also names the per-setting output directory inside the work
# directory.
colors = ('0,255,255,255', '255,0,255,255', '255,255,0,255', '255,255,255,0')
def readFile(fname):
    """Return the entire contents of the file ``fname`` as a string.

    The file is opened in the default (text) mode. The handle is closed
    on every path, including when reading raises.

    Raises:
        IOError: if the file cannot be opened or read (``OSError`` on
            Python 3, where the two names are aliases).
    """
    with open(fname) as f:
        return f.read()
def getColorChangeCount(image_name):
    """Score an image by the most background re-entries in any pixel column.

    The pixel at (0, 0) is taken as the background value. Every column is
    walked from top to bottom and each transition from a run of
    non-background pixels back to a background pixel is counted. The
    largest count found in any single column is returned.

    A run that reaches the bottom edge without returning to the
    background is not counted.

    Args:
        image_name: path of the image to open with ``Image.open``.

    Returns:
        The maximum per-column count, or 0 when the top-left pixel is not
        255 (such images are not analysed at all).
    """
    i = Image.open(image_name)
    background = i.getpixel((0, 0))
    if background != 255:
        return 0

    width, height = i.size
    max_line_count = 0
    for x in range(width):
        in_run = False  # currently inside a run of non-background pixels
        line_count = 0
        for y in range(height):
            if i.getpixel((x, y)) == background:
                if in_run:
                    # a non-background run has just ended
                    line_count += 1
                in_run = False
            else:
                in_run = True
        max_line_count = max(line_count, max_line_count)
    return max_line_count
def getBestMask(filename):
    """Select the colour setting with the lowest non-zero score for ``filename``.

    For every entry of the module-level ``colors`` the image
    ``<workdir>/<entry with commas as dashes>/<filename>`` is scored with
    ``getColorChangeCount``. Entries scoring 0 are ignored. Among the rest
    the lowest score wins; on a tie the entry appearing later in
    ``colors`` wins.

    Args:
        filename: base name of the image to score in every output
            directory.

    Returns:
        A dict with the keys ``'output'`` (the winning output directory)
        and ``'input_file'`` (the scored image inside it).

    Raises:
        ValueError: if no colour setting produced a non-zero score.
    """
    best = None
    best_count = None
    for c in colors:
        output = os.path.join(workdir, c.replace(',', '-'))
        input_file = os.path.join(output, filename)
        change_count = getColorChangeCount(input_file)
        if not change_count:
            continue
        # "<=" so that a later entry replaces an earlier one with the same
        # score.
        if best_count is None or change_count <= best_count:
            best_count = change_count
            best = dict(
                output=output,
                input_file=input_file,
            )
    if best is None:
        raise ValueError(
            "no colour mask produced a usable image for %r" % filename)
    return best
#main
#
# Usage: subtitles.py <base name>
#
# <base name> is the path of a VobSub subtitle without its extension; a
# trailing '.' is dropped. Next to it there has to be <base>.idx and either
# <base>.sub or <base>.rar (an archive containing a .sub file).
#
# Steps:
#   1. if only the .rar exists, unpack it in a temporary work directory
#   2. run vobsub2pgm once per entry of `colors`
#   3. pick the best output directory with getBestMask
#   4. convert every english*.pgm to TIFF and run tesseract on it
#   5. let srttool turn english.srtx into <base>.srt
#   6. patch common tesseract misreadings in the .srt with sed
# External commands are run on a best-effort basis: their exit status is
# not checked.
import tempfile


def _sh_quote(arg):
    """Return ``arg`` as one single-quoted word for a /bin/sh command line."""
    # Inside single quotes nothing is special except the quote itself,
    # which is written as '\'' (close quote, escaped quote, reopen quote).
    return "'%s'" % arg.replace("'", "'\\''")


if len(sys.argv) < 2:
    sys.stderr.write("usage: %s <subtitle base name>\n" % sys.argv[0])
    sys.exit(1)

input_base = sys.argv[1]
if input_base.endswith('.'):
    input_base = input_base[:-1]
input_base = os.path.abspath(input_base)

# mkdtemp creates the directory itself, without the race of os.tempnam.
# `workdir` stays a module-level name because getBestMask reads it.
workdir = tempfile.mkdtemp()
try:
    os.chdir(workdir)

    input_files = glob("%s*" % input_base)
    sub_file = "%s.sub" % input_base
    rar_file = "%s.rar" % input_base
    idx_file = "%s.idx" % input_base
    srt_file = "%s.srt" % input_base

    working_base = input_base
    if sub_file not in input_files and rar_file in input_files:
        # Only the archive is there: work on copies inside the work dir.
        working_base = os.path.join(workdir, os.path.basename(input_base))
        shutil.copy(rar_file, "%s.rar" % working_base)
        rar_file = "%s.rar" % working_base
        sub_file = "%s.sub" % working_base
        shutil.copy(idx_file, "%s.idx" % working_base)
        idx_file = "%s.idx" % working_base
        cmd = "unrar x %s > /dev/null 2>&1" % _sh_quote(rar_file)
        os.system(cmd)
        subs = glob("*.sub")
        if subs:
            os.rename(subs[0], sub_file)
        else:
            print("no sub file found")
            # the work directory is removed by the finally clause below
            sys.exit(1)

    # Look up the stream index of the wanted language in the .idx file;
    # the last matching line wins. Without a match no -t option is passed.
    sub_lang = "en"
    language = ''
    for line in readFile(idx_file).split('\n'):
        if line.startswith('id: %s' % sub_lang):
            index = line.split('index: ')[-1].strip()
            # quoted because it comes straight from the file's contents
            language = "-t %s" % _sh_quote(index)

    for c in colors:
        output = os.path.join(workdir, c.replace(',', '-'))
        if not os.path.exists(output):
            os.makedirs(output)
        cmd = "vobsub2pgm %s -c %s %s %s >/dev/null 2>&1" % (
            language,
            c,
            _sh_quote(working_base),
            _sh_quote(os.path.join(output, 'english')),
        )
        os.system(cmd)

    # Decide on one colour setting by looking at a single sample image.
    best_output = getBestMask("english0010.pgm")
    pgms = glob("%s/english*.pgm" % best_output['output'])
    subtitle_tif = os.path.join(workdir, 'subtitle.tif')
    for pgm in sorted(pgms):
        #input_pgm = getBestMask(os.path.basename(pgm))['input_file']
        input_pgm = pgm
        # The .pgm path is passed to tesseract as its output base name.
        cmd = "convert %s %s;tesseract %s %s >/dev/null 2>&1" % (
            _sh_quote(input_pgm),
            _sh_quote(subtitle_tif),
            _sh_quote(subtitle_tif),
            _sh_quote(pgm),
        )
        os.system(cmd)

    #FIXME what about adding ispell here, interactive again
    # (it would be run as: ispell <best output dir>/english*.txt)
    cmd = "srttool -s -w < %s > %s" % (
        _sh_quote(os.path.join(best_output['output'], 'english.srtx')),
        _sh_quote(srt_file),
    )
    os.system(cmd)

    #correct some common mistakes of tesseract
    sed_script = os.path.join(workdir, 'fix_sed_script')
    # The file has to be closed (flushed) before sed reads it. The raw
    # string hands the backslashes to sed exactly as written here; the
    # fourth expression is meant to rewrite the character sequence \/\l
    # as W (NOTE(review): intent inferred from the expression, confirm).
    with open(sed_script, 'w') as f:
        f.write(r'''s/Idn'/ldn'/g
s/Id'v/ld'v/g
s/ldn'\!/ldn't/g
s/\\\/\\l/W/g
s/V\\l/W/g
s/eII/ell/g
s/></x/g
''')
    cmd = "sed -f %s -i %s" % (_sh_quote(sed_script), _sh_quote(srt_file))
    os.system(cmd)
finally:
    #cleanup, also on errors and on sys.exit
    shutil.rmtree(workdir)