oxdbarchive/oxdbarchive/tools/subtitles.py

#!/usr/bin/env python
# depends on
# subtitleripper - http://subtitleripper.sourceforge.net
# unrar
# tesseract-ocr - http://tesseract-ocr.googlecode.com
#

import Image
import os
import sys
import shutil
from glob import glob
import warnings

# Ignore warnings whose message starts with "tempnam" -- presumably the
# RuntimeWarning that os.tempnam() emits about being insecure.
warnings.filterwarnings("ignore", "tempnam")

# Candidate values for vobsub2pgm's -c option; every entry has a single 0
# among three 255s, in a different position.  Each entry is tried in turn and
# also names (with ',' replaced by '-') the directory its images go to.
colors = ('0,255,255,255', '255,0,255,255', '255,255,0,255', '255,255,255,0')

def readFile(fname):
  """Return the entire contents of the file at path *fname*.

  The file is opened in the default (text, read-only) mode.  Any error from
  opening or reading propagates to the caller; the handle is closed in
  either case.
  """
  f = open(fname)
  try:
    # try/finally so the handle is released even when read() fails
    return f.read()
  finally:
    f.close()

def getColorChangeCount(image_name):
  """Return the largest number of non-background runs found in any column.

  The image at *image_name* is opened with Image.open and the pixel at (0,0)
  is taken as the background value.  Unless that value equals 255 the image
  is not scanned at all and 0 is returned.

  Otherwise every column is walked from top to bottom.  A run of pixels that
  differ from the background is counted at the moment a background pixel
  follows it, so a run that touches the bottom edge is not counted.  The
  result is the maximum count over all columns.
  """
  img = Image.open(image_name)
  background = img.getpixel((0,0))
  if background != 255:
    return 0

  width, height = img.size[0], img.size[1]
  most_runs = 0
  for x in range(0, width):
    runs = 0
    in_run = False
    for y in range(0, height):
      if img.getpixel((x,y)) == background:
        # a background pixel closes whatever run was in progress
        if in_run:
          runs += 1
        in_run = False
      else:
        in_run = True
    if runs > most_runs:
      most_runs = runs
  return most_runs

def getBestMask(filename):
  """Pick the colour mask whose rendering of *filename* has the lowest count.

  For every entry of the module-level ``colors`` the image
  ``<workdir>/<color with ',' -> '-'>/<filename>`` is measured with
  getColorChangeCount().  Images with a count of 0 are ignored.  When two
  masks share a count, the one later in ``colors`` wins.

  Returns a dict with the keys ``output`` (the mask's directory) and
  ``input_file`` (the measured image path).

  Raises ValueError if no mask produced a non-zero count.
  """
  outputs = {}
  for color in colors:
    output = os.path.join(workdir, color.replace(',', '-'))
    input_file = os.path.join(output, filename)
    change_count = getColorChangeCount(input_file)
    if change_count:
      outputs[change_count] = dict(
                                output=output,
                                input_file=input_file,
                              )
  if not outputs:
    # min() of an empty dict would raise a ValueError with an unhelpful
    # message; keep the exception type but say what actually went wrong.
    raise ValueError("no usable subtitle mask found for %s" % filename)
  return outputs[min(outputs)]

#main
# Usage: subtitles.py BASE
#
# BASE is the path of the subtitle files without extension (a trailing '.'
# is tolerated).  BASE.idx is read for the language index; the images come
# from BASE.sub, or from the first *.sub unpacked out of BASE.rar when
# BASE.sub does not exist.  The resulting subtitles are written to BASE.srt.
import tempfile

if len(sys.argv) < 2:
  sys.stderr.write("usage: %s BASE\n" % sys.argv[0])
  sys.exit(1)

input_base = sys.argv[1]
if input_base.endswith('.'):
  input_base = input_base[:-1]
# Resolve before the chdir below so that a relative BASE keeps working.
input_base = os.path.abspath(input_base)

# mkdtemp creates the directory itself, avoiding the race between picking a
# name and creating it.
workdir = tempfile.mkdtemp()
try:
  os.chdir(workdir)

  input_files = glob("%s*" % input_base)
  sub_file = "%s.sub" % input_base
  rar_file = "%s.rar" % input_base
  idx_file = "%s.idx" % input_base
  srt_file = "%s.srt" % input_base

  working_base = input_base
  if sub_file not in input_files and rar_file in input_files:
    # Only a rar archive is available: work on copies inside workdir.
    working_base = os.path.join(workdir, os.path.basename(input_base))
    shutil.copy(rar_file, "%s.rar" % working_base)
    rar_file = "%s.rar" % working_base
    sub_file = "%s.sub" % working_base
    shutil.copy(idx_file, "%s.idx" % working_base)
    idx_file = "%s.idx" % working_base

    cmd="unrar x '%s' > /dev/null 2>&1" % rar_file
    os.system(cmd)
    subs = glob("*.sub")
    if subs:
      os.rename(subs[0], sub_file)
    else:
      print("no sub file found")
      # the finally clause below removes workdir
      sys.exit(1)

  # Look up the stream index of the wanted language in the .idx file; if
  # several lines match, the last one wins.
  sub_lang = "en"
  language = ''
  for line in readFile(idx_file).split('\n'):
    if line.startswith('id: %s' % sub_lang):
      language = "-t %s" % line.split('index: ')[-1].strip()

  # Render the subtitles once per colour mask, each into its own directory.
  for c in colors:
    output = os.path.join(workdir, c.replace(',', '-'))
    if not os.path.exists(output):
      os.makedirs(output)
    cmd = "vobsub2pgm %s -c %s '%s' %s/english >/dev/null 2>&1" % (language, c, working_base, output)
    os.system(cmd)

  # The mask is chosen once, judged on image number 10, and used for all.
  best_output = getBestMask("english0010.pgm")
  pgms = glob("%s/english*.pgm" % best_output['output'])
  subtitle_tif = os.path.join(workdir, 'subtitle.tif')
  for pgm in sorted(pgms):
    #input_pgm = getBestMask(os.path.basename(pgm))['input_file']
    input_pgm = pgm
    cmd = "convert %s %s;tesseract %s %s >/dev/null 2>&1" %(input_pgm, subtitle_tif, subtitle_tif, pgm)
    os.system(cmd)

  #FIXME what about adding ispell here, interactive again
  # e.g. "ispell %s/english*.txt" % best_output['output'] (never executed)

  cmd = "srttool -s -w < %s/english.srtx > '%s'" % (best_output['output'], srt_file)
  os.system(cmd)

  #correct some common mistakes of tesseract
  # Raw strings: every backslash below reaches sed exactly as written.  The
  # fourth rule rewrites the text \/\l as W and the fifth rewrites V\l as W.
  sed_rules = [
    r"s/Idn'/ldn'/g",
    r"s/Id'v/ld'v/g",
    r"s/ldn'\!/ldn't/g",
    r"s/\\\/\\l/W/g",
    r"s/V\\l/W/g",
    r"s/eII/ell/g",
    r"s/></x/g",
  ]
  sed_script = os.path.join(workdir, 'fix_sed_script')
  f = open(sed_script, 'w')
  try:
    f.write("\n".join(sed_rules) + "\n")
  finally:
    # must be closed (flushed) before sed reads it, or sed sees an empty file
    f.close()

  cmd = """sed -f %s -i '%s'""" % (sed_script, srt_file)
  os.system(cmd)
finally:
  #cleanup
  shutil.rmtree(workdir)