fix python3 ox.srt
This commit is contained in:
parent
1e3d2d24bb
commit
ac2e829016
1 changed files with 29 additions and 23 deletions
34
ox/srt.py
34
ox/srt.py
|
@ -1,10 +1,11 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import with_statement, division, print_function
|
||||
import chardet
|
||||
import re
|
||||
import codecs
|
||||
import re
|
||||
|
||||
import chardet
|
||||
from six import PY2
|
||||
import ox
|
||||
|
||||
|
||||
|
@ -12,7 +13,7 @@ __all__ = []
|
|||
|
||||
|
||||
def _detect_encoding(fp):
|
||||
bomDict={ # bytepattern : name
|
||||
bomDict = { # bytepattern : name
|
||||
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
|
||||
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
|
||||
(0xFE, 0xFF, None, None): "utf_16_be",
|
||||
|
@ -23,7 +24,10 @@ def _detect_encoding(fp):
|
|||
# go to beginning of file and get the first 4 bytes
|
||||
oldFP = fp.tell()
|
||||
fp.seek(0)
|
||||
(byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
|
||||
if PY2:
|
||||
(byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)]
|
||||
else:
|
||||
(byte1, byte2, byte3, byte4) = fp.read(4)
|
||||
|
||||
# try bom detection using 4 bytes, 3 bytes, or 2 bytes
|
||||
bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
|
||||
|
@ -31,18 +35,18 @@ def _detect_encoding(fp):
|
|||
bomDetection = bomDict.get((byte1, byte2, byte3, None))
|
||||
if not bomDetection:
|
||||
bomDetection = bomDict.get((byte1, byte2, None, None))
|
||||
## if BOM detected, we're done :-)
|
||||
# if BOM detected, we're done :-)
|
||||
fp.seek(oldFP)
|
||||
if bomDetection:
|
||||
return bomDetection
|
||||
encoding = 'latin-1'
|
||||
#more character detecting magick using http://chardet.feedparser.org/
|
||||
# more character detecting magick using http://chardet.feedparser.org/
|
||||
fp.seek(0)
|
||||
rawdata = fp.read()
|
||||
#if data can be decoded as utf-8 use that, try chardet otherwise
|
||||
#chardet detects utf-8 as ISO-8859-2 most of the time
|
||||
# if data can be decoded as utf-8 use that, try chardet otherwise
|
||||
# chardet detects utf-8 as ISO-8859-2 most of the time
|
||||
try:
|
||||
data = unicode(rawdata, 'utf-8')
|
||||
rawdata.decode('utf-8')
|
||||
encoding = 'utf-8'
|
||||
except:
|
||||
encoding = chardet.detect(rawdata)['encoding']
|
||||
|
@ -63,23 +67,25 @@ def load(filename, offset=0):
|
|||
def parse_time(t):
|
||||
return offset + ox.time2ms(t.replace(',', '.')) / 1000
|
||||
|
||||
with open(filename) as f:
|
||||
with open(filename, 'rb') as f:
|
||||
encoding = _detect_encoding(f)
|
||||
data = f.read()
|
||||
try:
|
||||
data = unicode(data, encoding)
|
||||
data = data.decode(encoding)
|
||||
except:
|
||||
try:
|
||||
data = unicode(data, 'latin-1')
|
||||
data = data.decode('latin-1')
|
||||
except:
|
||||
print("failed to detect encoding, giving up")
|
||||
return srt
|
||||
|
||||
data = data.replace('\r\n', '\n')
|
||||
srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n', re.DOTALL)
|
||||
regexp = r'(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n'
|
||||
srts = re.compile(regexp, re.DOTALL)
|
||||
i = 0
|
||||
for s in srts.findall(data):
|
||||
_s = {'id': str(i),
|
||||
_s = {
|
||||
'id': str(i),
|
||||
'in': parse_time(s[0]),
|
||||
'out': parse_time(s[1]),
|
||||
'value': s[2].strip()
|
||||
|
|
Loading…
Reference in a new issue