diff --git a/ox/srt.py b/ox/srt.py
index 1b3f8b3..1eb912f 100644
--- a/ox/srt.py
+++ b/ox/srt.py
@@ -1,10 +1,11 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 from __future__ import with_statement, division, print_function
 
-import chardet
-import re
 import codecs
+import re
+import chardet
+from six import PY2
 
 import ox
 
@@ -12,18 +13,21 @@ __all__ = []
 
 
 def _detect_encoding(fp):
-    bomDict={ # bytepattern : name
-             (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
-             (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
-             (0xFE, 0xFF, None, None): "utf_16_be",
-             (0xFF, 0xFE, None, None): "utf_16_le",
-             (0xEF, 0xBB, 0xBF, None): "utf_8",
-             }
+    bomDict = { # bytepattern : name
+        (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
+        (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
+        (0xFE, 0xFF, None, None): "utf_16_be",
+        (0xFF, 0xFE, None, None): "utf_16_le",
+        (0xEF, 0xBB, 0xBF, None): "utf_8",
+    }
 
     # go to beginning of file and get the first 4 bytes
     oldFP = fp.tell()
     fp.seek(0)
-    (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
+    if PY2:
+        (byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)]
+    else:
+        (byte1, byte2, byte3, byte4) = fp.read(4)
 
     # try bom detection using 4 bytes, 3 bytes, or 2 bytes
     bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
@@ -31,18 +35,18 @@ def _detect_encoding(fp):
         bomDetection = bomDict.get((byte1, byte2, byte3, None))
         if not bomDetection:
             bomDetection = bomDict.get((byte1, byte2, None, None))
-    ## if BOM detected, we're done :-)
+    # if BOM detected, we're done :-)
     fp.seek(oldFP)
     if bomDetection:
        return bomDetection
     encoding = 'latin-1'
-    #more character detecting magick using http://chardet.feedparser.org/
+    # more character detecting magick using http://chardet.feedparser.org/
     fp.seek(0)
     rawdata = fp.read()
-    #if data can be decoded as utf-8 use that, try chardet otherwise
-    #chardet detects utf-8 as ISO-8859-2 most of the time
+    # if data can be decoded as utf-8 use that, try chardet otherwise
+    # chardet detects utf-8 as ISO-8859-2 most of the time
     try:
-        data = unicode(rawdata, 'utf-8')
+        rawdata.decode('utf-8')
         encoding = 'utf-8'
     except:
         encoding = chardet.detect(rawdata)['encoding']
@@ -63,26 +67,28 @@ def load(filename, offset=0):
     def parse_time(t):
         return offset + ox.time2ms(t.replace(',', '.')) / 1000
 
-    with open(filename) as f:
+    with open(filename, 'rb') as f:
         encoding = _detect_encoding(f)
         data = f.read()
     try:
-        data = unicode(data, encoding)
+        data = data.decode(encoding)
     except:
         try:
-            data = unicode(data, 'latin-1')
+            data = data.decode('latin-1')
         except:
             print("failed to detect encoding, giving up")
             return srt
 
     data = data.replace('\r\n', '\n')
-    srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n', re.DOTALL)
+    regexp = r'(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n'
+    srts = re.compile(regexp, re.DOTALL)
     i = 0
     for s in srts.findall(data):
-        _s = {'id': str(i),
-              'in': parse_time(s[0]),
-              'out': parse_time(s[1]),
-              'value': s[2].strip()
+        _s = {
+            'id': str(i),
+            'in': parse_time(s[0]),
+            'out': parse_time(s[1]),
+            'value': s[2].strip()
         }
         srt.append(_s)
         i += 1
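
Not part of the patch itself: a minimal smoke-test sketch for the updated loader, assuming python-ox and six are installed. 'sample.srt' is a placeholder path, not a file shipped with this change; it only illustrates the dict shape that load() returns on both Python 2 and 3.

    # Hypothetical smoke test for the patched loader.
    # 'sample.srt' is a placeholder path, not part of this change.
    from __future__ import print_function

    import ox.srt

    cues = ox.srt.load('sample.srt', offset=0.0)
    for cue in cues[:3]:
        # load() returns dicts with 'id', 'in', 'out' (seconds) and 'value'
        print('%s --> %s: %s' % (cue['in'], cue['out'], cue['value']))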