fix python3 ox.srt

This commit is contained in:
j 2016-06-08 11:36:55 +02:00
parent 1e3d2d24bb
commit ac2e829016

View file

@@ -1,10 +1,11 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement, division, print_function from __future__ import with_statement, division, print_function
import chardet
import re
import codecs import codecs
import re
import chardet
from six import PY2
import ox import ox
@@ -12,7 +13,7 @@ __all__ = []
def _detect_encoding(fp): def _detect_encoding(fp):
bomDict={ # bytepattern : name bomDict = { # bytepattern : name
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be", (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le", (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
(0xFE, 0xFF, None, None): "utf_16_be", (0xFE, 0xFF, None, None): "utf_16_be",
@@ -23,7 +24,10 @@ def _detect_encoding(fp):
# go to beginning of file and get the first 4 bytes # go to beginning of file and get the first 4 bytes
oldFP = fp.tell() oldFP = fp.tell()
fp.seek(0) fp.seek(0)
(byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4))) if PY2:
(byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)]
else:
(byte1, byte2, byte3, byte4) = fp.read(4)
# try bom detection using 4 bytes, 3 bytes, or 2 bytes # try bom detection using 4 bytes, 3 bytes, or 2 bytes
bomDetection = bomDict.get((byte1, byte2, byte3, byte4)) bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
@@ -31,18 +35,18 @@ def _detect_encoding(fp):
bomDetection = bomDict.get((byte1, byte2, byte3, None)) bomDetection = bomDict.get((byte1, byte2, byte3, None))
if not bomDetection: if not bomDetection:
bomDetection = bomDict.get((byte1, byte2, None, None)) bomDetection = bomDict.get((byte1, byte2, None, None))
## if BOM detected, we're done :-) # if BOM detected, we're done :-)
fp.seek(oldFP) fp.seek(oldFP)
if bomDetection: if bomDetection:
return bomDetection return bomDetection
encoding = 'latin-1' encoding = 'latin-1'
#more character detecting magick using http://chardet.feedparser.org/ # more character detecting magick using http://chardet.feedparser.org/
fp.seek(0) fp.seek(0)
rawdata = fp.read() rawdata = fp.read()
#if data can be decoded as utf-8 use that, try chardet otherwise # if data can be decoded as utf-8 use that, try chardet otherwise
#chardet detects utf-8 as ISO-8859-2 most of the time # chardet detects utf-8 as ISO-8859-2 most of the time
try: try:
data = unicode(rawdata, 'utf-8') rawdata.decode('utf-8')
encoding = 'utf-8' encoding = 'utf-8'
except: except:
encoding = chardet.detect(rawdata)['encoding'] encoding = chardet.detect(rawdata)['encoding']
@@ -63,23 +67,25 @@ def load(filename, offset=0):
def parse_time(t): def parse_time(t):
return offset + ox.time2ms(t.replace(',', '.')) / 1000 return offset + ox.time2ms(t.replace(',', '.')) / 1000
with open(filename) as f: with open(filename, 'rb') as f:
encoding = _detect_encoding(f) encoding = _detect_encoding(f)
data = f.read() data = f.read()
try: try:
data = unicode(data, encoding) data = data.decode(encoding)
except: except:
try: try:
data = unicode(data, 'latin-1') data = data.decode('latin-1')
except: except:
print("failed to detect encoding, giving up") print("failed to detect encoding, giving up")
return srt return srt
data = data.replace('\r\n', '\n') data = data.replace('\r\n', '\n')
srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n', re.DOTALL) regexp = r'(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n'
srts = re.compile(regexp, re.DOTALL)
i = 0 i = 0
for s in srts.findall(data): for s in srts.findall(data):
_s = {'id': str(i), _s = {
'id': str(i),
'in': parse_time(s[0]), 'in': parse_time(s[0]),
'out': parse_time(s[1]), 'out': parse_time(s[1]),
'value': s[2].strip() 'value': s[2].strip()