140 lines
3.6 KiB
Python
140 lines
3.6 KiB
Python
#!/usr/bin/env python
|
|
# depends on
|
|
# subtitleripper - http://subtitleripper.sourceforge.net
|
|
# unrar
|
|
# tesseract-ocr - http://tesseract-ocr.googlecode.com
|
|
#
|
|
|
|
import Image
|
|
import os
|
|
import sys
|
|
import shutil
|
|
from glob import glob
|
|
import warnings
|
|
|
|
# Suppress any warning whose message starts with "tempnam".
warnings.filterwarnings("ignore", "tempnam")

# Candidate color settings; each one is handed to ``vobsub2pgm -c`` and gets
# its own output directory (commas replaced by dashes).
colors = (
    '0,255,255,255',
    '255,0,255,255',
    '255,255,0,255',
    '255,255,255,0',
)
|
|
|
|
def readFile(fname):
    """Return the whole content of the file named *fname*.

    The file is opened with the default mode of ``open`` and is closed
    again even when reading raises.
    """
    with open(fname) as f:
        return f.read()
|
|
|
|
def getColorChangeCount(image_name):
    """Return the largest number of non-background runs in any pixel column.

    The pixel at (0, 0) is taken as the background value.  Every column is
    scanned from top to bottom; a "run" is a stretch of consecutive pixels
    that differ from the background.  A run is only counted once a
    background pixel follows it, so a run that reaches the bottom edge of
    the image is not counted.

    Returns 0 without scanning when the background pixel is not 255.
    """
    img = Image.open(image_name)
    background = img.getpixel((0, 0))
    max_line_count = 0

    # Only images whose top-left pixel equals 255 are analysed.
    if background != 255:
        return max_line_count

    width, height = img.size
    for x in range(width):
        in_run = False      # currently inside a non-background stretch?
        line_count = 0      # completed runs in this column
        for y in range(height):
            if img.getpixel((x, y)) == background:
                if in_run:
                    line_count += 1
                in_run = False
            else:
                in_run = True
        max_line_count = max(line_count, max_line_count)
    return max_line_count
|
|
|
|
def getBestMask(filename):
    """Pick the color setting whose rendering of *filename* scores lowest.

    For every entry of ``colors`` the image ``<workdir>/<color>/<filename>``
    (commas in the color replaced by dashes) is scored with
    ``getColorChangeCount``.  Images scoring 0 are ignored.  When two color
    settings produce the same score, the later one replaces the earlier one.

    Returns a dict with the keys ``output`` (the directory of the winning
    color setting) and ``input_file`` (the path of the scored image).

    Raises ValueError when no image produced a non-zero score.
    """
    outputs = {}
    for c in colors:
        output = os.path.join(workdir, c.replace(',', '-'))
        input_file = os.path.join(output, filename)
        change_count = getColorChangeCount(input_file)
        if change_count:
            outputs[change_count] = dict(
                output=output,
                input_file=input_file,
            )
    if not outputs:
        # min() on an empty dict would raise an unhelpful ValueError; keep
        # the exception type but say what actually went wrong.
        raise ValueError("no usable color mask found for %r" % filename)
    return outputs[min(outputs)]
|
|
|
|
#main
# Converts the VobSub subtitle pair <base>.idx / <base>.sub (the .sub may be
# packed in <base>.rar) into <base>.srt by rendering the subtitles to images,
# running OCR on them and assembling the recognized text.
import tempfile  # only needed by the script part below

if len(sys.argv) < 2:
    sys.stderr.write("usage: %s <subtitle-base-name>\n" % sys.argv[0])
    sys.exit(2)

input_base = sys.argv[1]
# Drop one trailing dot so that "movie." and "movie" name the same file set.
if input_base.endswith('.'):
    input_base = input_base[:-1]
input_base = os.path.abspath(input_base)

# mkdtemp() creates the directory itself, avoiding the race between picking
# a name and creating it.
workdir = tempfile.mkdtemp()
try:
    os.chdir(workdir)

    input_files = glob("%s*" % input_base)
    sub_file = "%s.sub" % input_base
    rar_file = "%s.rar" % input_base
    idx_file = "%s.idx" % input_base
    srt_file = "%s.srt" % input_base

    working_base = input_base
    if sub_file not in input_files and rar_file in input_files:
        # No plain .sub next to the input: work on copies inside workdir and
        # extract the .sub from the archive there.
        working_base = os.path.join(workdir, os.path.basename(input_base))
        shutil.copy(rar_file, "%s.rar" % working_base)
        rar_file = "%s.rar" % working_base
        sub_file = "%s.sub" % working_base
        shutil.copy(idx_file, "%s.idx" % working_base)
        idx_file = "%s.idx" % working_base

        cmd = "unrar x '%s' > /dev/null 2>&1" % rar_file
        os.system(cmd)
        # The current directory is workdir, so this sees the extracted files.
        subs = glob("*.sub")
        if subs:
            os.rename(subs[0], sub_file)
        else:
            print("no sub file found")
            # workdir is removed by the finally clause below
            sys.exit(1)

    # Find the stream index of the wanted language in the .idx file; when
    # several lines match, the last one wins.
    sub_lang = "en"
    language = ''
    for line in readFile(idx_file).split('\n'):
        if line.startswith('id: %s' % sub_lang):
            language = "-t %s" % line.split('index: ')[-1].strip()

    # Render the subtitles once per color setting, each into its own directory.
    for c in colors:
        output = os.path.join(workdir, c.replace(',', '-'))
        if not os.path.exists(output):
            os.makedirs(output)
        cmd = "vobsub2pgm %s -c %s '%s' %s/english >/dev/null 2>&1" % (language, c, working_base, output)
        os.system(cmd)

    # A single frame decides which color setting is used for every image.
    best_output = getBestMask("english0010.pgm")
    pgms = glob("%s/english*.pgm" % best_output['output'])
    subtitle_tif = os.path.join(workdir, 'subtitle.tif')
    for pgm in sorted(pgms):
        #input_pgm = getBestMask(os.path.basename(pgm))['input_file']
        input_pgm = pgm
        # tesseract is given the .pgm path as output base, so the text is
        # expected next to the image -- presumably as <pgm>.txt.
        cmd = "convert %s %s;tesseract %s %s >/dev/null 2>&1" % (input_pgm, subtitle_tif, subtitle_tif, pgm)
        os.system(cmd)

    #FIXME what about adding ispell here, interactive again
    #      e.g. "ispell %s/english*.txt" % best_output['output']

    # english.srtx is presumably written by vobsub2pgm -- TODO confirm.
    cmd = "srttool -s -w < %s/english.srtx > '%s'" % (best_output['output'], srt_file)
    os.system(cmd)

    #correct some common mistakes of tesseract
    sed_script = os.path.join(workdir, 'fix_sed_script')
    # The script must be completely written and closed before sed reads it.
    # A raw string keeps the backslashes exactly as sed has to see them.
    with open(sed_script, 'w') as f:
        f.write(r'''s/Idn'/ldn'/g
s/Id'v/ld'v/g
s/ldn'\!/ldn't/g
s/\\\/\\l/W/g
s/V\\l/W/g
s/eII/ell/g
s/></x/g
''')

    cmd = """sed -f %s -i '%s'""" % (sed_script, srt_file)
    os.system(cmd)
finally:
    #cleanup
    shutil.rmtree(workdir, ignore_errors=True)
|