fix python3 ox.srt

2016-06-08 11:36:55 +02:00 · 2016-06-08 11:36:55 +02:00 · ac2e829016
commit ac2e829016
parent 1e3d2d24bb
1 changed file with 29 additions and 23 deletions
--- a/ox/srt.py
+++ b/ox/srt.py
@@ -1,10 +1,11 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 from __future__ import with_statement, division, print_function
-import chardet
-import re
 import codecs
+import re

+import chardet
+from six import PY2
 import ox


@@ -23,7 +24,10 @@ def _detect_encoding(fp):
    # go to beginning of file and get the first 4 bytes
    oldFP = fp.tell()
    fp.seek(0)
-    (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
+    if PY2:
+        (byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)]
+    else:
+        (byte1, byte2, byte3, byte4) = fp.read(4)

    # try bom detection using 4 bytes, 3 bytes, or 2 bytes
    bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
@@ -31,7 +35,7 @@ def _detect_encoding(fp):
        bomDetection = bomDict.get((byte1, byte2, byte3, None))
        if not bomDetection:
            bomDetection = bomDict.get((byte1, byte2, None, None))
-    ## if BOM detected, we're done :-)
+    # if BOM detected, we're done :-)
    fp.seek(oldFP)
    if bomDetection:
        return bomDetection
@@ -42,7 +46,7 @@ def _detect_encoding(fp):
    # if data can be decoded as utf-8 use that, try chardet otherwise
    # chardet detects utf-8 as ISO-8859-2 most of the time
    try:
-        data = unicode(rawdata, 'utf-8')
+        rawdata.decode('utf-8')
        encoding = 'utf-8'
    except:
        encoding = chardet.detect(rawdata)['encoding']
@@ -63,23 +67,25 @@ def load(filename, offset=0):
    def parse_time(t):
        return offset + ox.time2ms(t.replace(',', '.')) / 1000

-    with open(filename) as f:
+    with open(filename, 'rb') as f:
        encoding = _detect_encoding(f)
        data = f.read()
    try:
-        data = unicode(data, encoding)
+        data = data.decode(encoding)
    except:
        try:
-            data = unicode(data, 'latin-1')
+            data = data.decode('latin-1')
        except:
            print("failed to detect encoding, giving up")
            return srt

    data = data.replace('\r\n', '\n')
-    srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n', re.DOTALL)
+    regexp = r'(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n'
+    srts = re.compile(regexp, re.DOTALL)
    i = 0
    for s in srts.findall(data):
-        _s = {'id': str(i),
+        _s = {
+            'id': str(i),
            'in': parse_time(s[0]),
            'out': parse_time(s[1]),
            'value': s[2].strip()