oxdbarchive/oxdbarchive/subtitles.py

# -*- coding: utf-8 -*-
# -*- Mode: Python; -*-
# vi:si:et:sw=2:sts=2:ts=2

import re
import os
from os.path import abspath, join, dirname
import shutil
import time

import chardet

# file extension used by extract_frame when naming the extracted frame images
img_extension = 'jpg'

def srt2txt(srt, encoding = "utf-8"):
  '''return only the text of all subtitles in an srt string,
     in numerical order, separated by blank lines
  '''
  parsed = srt2dict(srt, encoding)
  numbers = [int(num) for num in parsed.keys()]
  numbers.sort()
  chunks = ["%s\n\n" % parsed["%s" % num]['text'] for num in numbers]
  return ''.join(chunks).strip()

def srt2dict(srt, encoding = "utf-8"):
  '''convert srt string into a dict in the form
     dict(num = dict(start, stop, text))

     keys are the subtitle numbers as strings ("1", "2", ...).
     blocks that are not valid subtitles (fewer than three lines,
     non-numeric index, timing line without "-->") are skipped so
     one broken block does not abort parsing of the whole file.
     encoding is accepted for interface compatibility but not used,
     srt is expected to be decoded already.
  '''
  subdict = {}
  srt = srt.replace('\r', '').strip()
  for block in srt.split('\n\n'):
    lines = block.strip().split('\n')
    if len(lines) < 3:
      continue
    # tolerate missing or extra spaces around the arrow
    start_stop = lines[1].split('-->')
    if len(start_stop) < 2:
      continue
    try:
      # normalise the index, "01" becomes "1"
      num = u"%s" % int(lines[0])
    except ValueError:
      continue
    subdict[num] = {
      'start': start_stop[0].strip(),
      'stop': start_stop[1].strip(),
      'text': u'\n'.join(lines[2:]),
      }
  return subdict

def dict2srt(subtitles, encoding = "utf-8"):
  '''serialise dict(num = dict(start, stop, text)) as srt data

     entries are written in numerical order with \\r\\n line endings,
     the result is encoded with the given encoding
  '''
  numbers = [int(num) for num in subtitles.keys()]
  numbers.sort()
  blocks = []
  for num in numbers:
    key = "%s" % num
    entry = subtitles[key]
    blocks.append("%s\r\n%s --> %s\r\n%s\r\n\r\n" % (
      key, entry['start'], entry['stop'], entry['text']))
  return ''.join(blocks).strip().encode(encoding)

def time_str2msec(time_string):
  '''convert a time string (HH:MM:SS or HH:MM:SS,mmm) into milliseconds

     the part after the last comma is read as a decimal fraction of
     a second, "01:02:03,5" is 3723500. raises ValueError if the
     string is not in that format.
     computed with integer arithmetic only: going through
     time.mktime and floats could lose a millisecond when the
     result was truncated, and rejected hours above 23.
  '''
  parts = time_string.split(',')
  if len(parts) > 1:
    # pad/cut the fraction to exactly three digits: "5" -> 500, "1234" -> 123
    msec = int((parts[-1].strip() + '000')[:3])
  else:
    msec = 0
  hours, minutes, seconds = [int(value) for value in parts[0].split(':')]
  return ((hours * 60 + minutes) * 60 + seconds) * 1000 + msec

def msec2time_str(msec):
  '''convert milliseconds into a time string in the form HH:MM:SS,mmm

     the millisecond part is always three digits (5 -> "00:00:00,005"),
     otherwise reading the string back would turn ",5" into 500 ms.
     msec is truncated to an integer, negative values are clamped to
     0 since a negative timestamp cannot be written in this format.
     hours are not wrapped at 24.
  '''
  msec = max(int(msec), 0)
  seconds, msec = divmod(msec, 1000)
  minutes, seconds = divmod(seconds, 60)
  hours, minutes = divmod(minutes, 60)
  return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, msec)

def shift_time(offset, time_string):
  ''' move time_string by offset milliseconds and return the new time string
      time_string is expected to look like 01:50:52,123
  '''
  return msec2time_str(time_str2msec(time_string) + offset)

def shift_subtitles(offset, offset_num, subtitles):
  '''
    return a new subtitle dict with every entry moved in time and renumbered

    offset      milliseconds added to each start and stop time
    offset_num  number added to each subtitle number
    subtitles   dict in the form dict(num = dict(start, stop, text))

    the entries are copied, so the dicts passed in are not modified
  '''
  sdict = {}
  for key in subtitles:
    # copy so the caller's entry keeps its original times
    entry = dict(subtitles[key])
    entry['start'] = shift_time(offset, entry['start'])
    entry['stop'] = shift_time(offset, entry['stop'])
    sdict["%s" % (int(key) + offset_num)] = entry
  return sdict

def merge_subtitles(subtitles):
  '''
    converts a list of subtitles / dict(txt, length)
    into one srt subtitle

    subtitles is a dict, its values hold the srt text ('txt') and
    the length of that part ('length'), parts are processed in
    sorted key order. each part is moved in time by the summed
    length of the parts before it; length is passed on to shift_time
    and so is presumably in milliseconds (TODO confirm with callers).
    a part is renumbered to continue directly after the highest
    number merged so far, so entries are not overwritten when a part
    has length 0 or its numbering does not run 1..n.
    returns the merged subtitles as produced by dict2srt.
  '''
  subs = {}
  offset = 0
  for k in sorted(subtitles.keys()):
    sdict = srt2dict(subtitles[k]['txt'])
    if subs and sdict:
      # equals len(subs) for the usual 1..n numbering
      last_num = max([int(num) for num in subs])
      first_num = min([int(num) for num in sdict])
      offset_num = last_num + 1 - first_num
    else:
      offset_num = 0
    if offset or offset_num:
      sdict = shift_subtitles(offset, offset_num, sdict)
    subs.update(sdict)
    offset += subtitles[k]['length']
  return dict2srt(subs)

def split_subtitle(subtitles, offset):
  '''
    split subtitles at offset

    subtitles  dict in the form dict(num = dict(start, stop, text))
    offset     split point in milliseconds, the unit used by
               shift_subtitles / shift_time

    returns a tuple (one, two): one holds the subtitles that stop
    before offset, unchanged. two holds the remaining subtitles as
    copies, moved back in time by offset and renumbered to start at 1.

    NOTE(review): a subtitle that starts before offset but stops
    after it goes to the second part and gets a start time before 0,
    callers may want to treat that case separately.
  '''
  one = {}
  two = {}
  for key in subtitles:
    if time_str2msec(subtitles[key]['stop']) < offset:
      one[key] = subtitles[key]
    else:
      # copy, the entries of the second part get new times
      two[key] = dict(subtitles[key])
  if two:
    renumber = 1 - min([int(key) for key in two])
    two = shift_subtitles(-offset, renumber, two)
  return one, two

def extract_flash_ng(movie_file, flash_file, inpoint, outpoint, width=128, height=96, offset = 0):
  ext = movie_file.split('.')[-1]
  if ext in ('sub', 'srt'):
    print "this is not a movie file, will not try to extract frames"
    return
  if offset:
    print "Inpoint ", inpoint,
    inpoint = shift_time(-offset, inpoint)
    outpoint = shift_time(-offset, outpoint)
    print " becomes ", inpoint

  print "extracting %s -> %s" % (inpoint, outpoint)
  duration = time_str2msec(outpoint) - time_str2msec(inpoint)
  inpoint = time_str2msec(inpoint)
  extractClipScript = abspath(join(dirname(__file__), "tools/extract_clip.py"))

  cmd = '''%s "%s" %s %s %s''' % (extractClipScript, movie_file, flash_file, inpoint, duration)
  os.system(cmd.encode('utf-8'))

def extract_flash(movie_file, flash_file, inpoint, outpoint, width=128, height=96, offset = 0):
  import warnings
  warnings.filterwarnings("ignore", "tempnam")
  ext = movie_file.split('.')[-1]
  if ext in ('sub', 'srt', 'mkv'):
    print "this is not a movie file, will not try to extract frames"
    return
  framedir = os.tempnam()
  os.mkdir(framedir)
  os.chdir(framedir)
  if offset:
    print "Inpoint ", inpoint,
    inpoint = shift_time(-offset, inpoint)
    outpoint = shift_time(-offset, outpoint)
    print " becomes ", inpoint
  print "extracting %s -> %s" % (inpoint, outpoint)
  outpoint = float(time_str2msec(outpoint) - time_str2msec(inpoint)) / 1000 + 1

  audiorate = "44100"
  if os.path.exists(movie_file):
    mencoder_options = ''
    mencoder_options += " '%s'" % movie_file
    mencoder_options += " -ss '%s' -endpos %0.2f" % (inpoint, outpoint)
    mencoder_options += ' -ovc copy -oac copy -o tempfile.avi '
    mencoder = "mencoder %s >/dev/null 2>&1" % mencoder_options
    #print mencoder.encode('utf-8')
    os.system(mencoder.encode('utf-8'))

    ffmpeg_options = ''
    #ffmpeg_options += " -ss '%s' -t %0.2f" % (inpoint, outpoint)
    ffmpeg_options += " -y -i 'tempfile.avi'"
    ffmpeg_options += " -ar %s -b 128000 '%s'" % (audiorate, flash_file)
    ffmpeg = "ffmpeg %s >/dev/null 2>&1" % ffmpeg_options
    #print ffmpeg.encode('utf-8')
    os.system(ffmpeg.encode('utf-8'))
  else:
    print "update the cache %s missing" % movie_file.encode('utf-8')
  shutil.rmtree(framedir)

def extract_frame(movie_file, timestamp, img_folder, width=128, offset = 0, redo = False):
  import warnings
  warnings.filterwarnings("ignore", "tempnam")
  ext = movie_file.split('.')[-1]
  if ext in ('sub', 'srt'):
    print "this is not a movie file, will not try to extract frames"
    return
  framedir = os.tempnam()

  os.mkdir(framedir)
  os.chdir(framedir)
  if offset:
    timestamp_in_file = shift_time(-offset, timestamp)
  else:
    timestamp_in_file = timestamp
  if os.path.exists(movie_file):
    mplayer_options = ''
    mplayer_options += " '%s'" % movie_file
    mplayer_options += " -ss '%s' -frames 2" % (timestamp_in_file)
    mplayer_options += " -vo jpeg:quality=90 -vf scale -zoom -xy %d " % width
    mplayer_options += " -ao null"
    mplayer = "mplayer %s >/dev/null 2>&1" % mplayer_options
    frame = os.path.join(img_folder, "%s.%s" % (timestamp.replace(':', '.'), img_extension))
    if redo or not os.path.exists(frame):
      print mplayer.encode('utf-8')
      os.system (mplayer.encode('utf-8'))
      files = os.listdir(framedir)
      if files:
        print "creating frame ", frame
        shutil.move(os.path.join(framedir,files[-1]), frame)
        if len(files)>1:
          for f in files[:-2]:
            print "unlink", f
            os.unlink(f)
    time.sleep(0.1)
  else:
    print "update the cache %s missing" % movie_file
  shutil.rmtree(framedir)


def extract_subtitles(movie_file, srt, img_folder, width=128, offset = 0, redo = False):
  '''extract one frame per subtitle in srt, taken at the start time
     of the subtitle, going through the subtitles in numerical order
  '''
  parsed = srt2dict(srt)
  numbers = [int(num) for num in parsed.keys()]
  numbers.sort()
  for num in numbers:
    start = parsed["%s" % num]['start']
    extract_frame(movie_file, start, img_folder, width, offset, redo)

def detectEncoding(fp):
    '''return the name of the encoding used in the file object fp

       fp has to be seekable and return bytes (opened in binary mode).
       a byte order mark at the start of the file decides the
       encoding, otherwise chardet is asked to guess from the whole
       content. 'latin-1' is returned for empty files and when
       chardet has no guess, so the result is always usable as a
       codec name candidate. the file position of fp is restored.
    '''
    # longest marks first: the utf-32-le mark starts with the utf-16-le mark
    boms = (
        (b'\x00\x00\xfe\xff', "utf_32_be"),
        (b'\xff\xfe\x00\x00', "utf_32_le"),
        (b'\xef\xbb\xbf', "utf_8"),
        (b'\xfe\xff', "utf_16_be"),
        (b'\xff\xfe', "utf_16_le"),
    )

    oldFP = fp.tell()
    try:
        fp.seek(0)
        # may be shorter than 4 bytes for tiny files
        head = fp.read(4)
        for bom, name in boms:
            if head.startswith(bom):
                return name

        # more character detecting magick using http://chardet.feedparser.org/
        fp.seek(0)
        rawdata = fp.read()
    finally:
        fp.seek(oldFP)

    if not rawdata:
        return 'latin-1'
    return chardet.detect(rawdata)['encoding'] or 'latin-1'

def loadSrt(fname):
  '''read the file fname and return its content as a unicode string

     the encoding is taken from detectEncoding. if the content can
     not be decoded with it, latin-1 is used, which accepts any
     byte sequence. a leading byte order mark is removed.
  '''
  # binary mode: the raw bytes are needed for detection and decoding
  f = open(fname, 'rb')
  try:
    encoding = detectEncoding(f)
    data = f.read()
  finally:
    f.close()
  try:
    udata = data.decode(encoding)
  except (UnicodeError, LookupError, TypeError):
    # wrong guess, codec unknown to python, or no encoding name at all
    udata = data.decode('latin-1')
  if udata.startswith(u'\ufeff'):
    udata = udata[1:]
  return udata