From 755eaec501a223498dee592d9ee7254fab48055d Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Mon, 2 Jan 2012 22:14:10 +0530
Subject: [PATCH] add srt.load

---
 ox/__init__.py |  1 +
 ox/srt.py      | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 ox/srt.py

diff --git a/ox/__init__.py b/ox/__init__.py
index 3fe735f..f8cf598 100644
--- a/ox/__init__.py
+++ b/ox/__init__.py
@@ -7,6 +7,7 @@ import cache
 import js
 import jsonc
 import net
+import srt
 
 from api import *
 from file import *
diff --git a/ox/srt.py b/ox/srt.py
new file mode 100644
index 0000000..c43a966
--- /dev/null
+++ b/ox/srt.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
+from __future__ import with_statement, division
+import chardet
+import re
+
+import ox
+
+
+__all__ = []
+
+
+def _detectEncoding(fp):
+    bomDict={ # bytepattern : name
+              (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
+              (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
+              (0xFE, 0xFF, None, None): "utf_16_be",
+              (0xFF, 0xFE, None, None): "utf_16_le",
+              (0xEF, 0xBB, 0xBF, None): "utf_8",
+            }
+
+    # go to beginning of file and get the first 4 bytes
+    oldFP = fp.tell()
+    fp.seek(0)
+    (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
+
+    # try bom detection using 4 bytes, 3 bytes, or 2 bytes
+    bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
+    if not bomDetection:
+        bomDetection = bomDict.get((byte1, byte2, byte3, None))
+        if not bomDetection:
+            bomDetection = bomDict.get((byte1, byte2, None, None))
+    ## if BOM detected, we're done :-)
+    fp.seek(oldFP)
+    if bomDetection:
+        return bomDetection
+
+    encoding = 'latin-1'
+    #more character detecting magick using http://chardet.feedparser.org/
+    fp.seek(0)
+    rawdata = fp.read()
+    encoding = chardet.detect(rawdata)['encoding']
+    fp.seek(oldFP)
+    return encoding
+
+
+def load(filename, offset=0):
+    '''
+        filename path to an srt file
+        offset in seconds shift all in/out points by offset
+
+        returns list with objects that have in,out,value and id
+    '''
+    srt = []
+
+    def parse_time(t):
+        return offset + ox.time2ms(t.replace(',', '.')) / 1000
+
+    with open(filename) as f:
+        encoding = _detectEncoding(f)
+        data = f.read()
+    try:
+        data = unicode(data, encoding)
+    except:
+        try:
+            data = unicode(data, 'latin-1')
+        except:
+            print "failed to detect encoding, giving up"
+            return srt
+
+    data = data.replace('\r\n', '\n')
+    srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*-->\s*(\d\d:\d\d:\d\d[,.]\d\d\d)\s*(.+?)\n\n', re.DOTALL)
+    i = 0
+    for s in srts.findall(data):
+        _s = {'id': str(i),
+              'in': parse_time(s[0]),
+              'out': parse_time(s[1]),
+              'value': s[2].strip()
+        }
+        srt.append(_s)
+        i += 1
+    return srt