srt parsing moved to ox.srt.load
This commit is contained in:
parent
4c25291c88
commit
7757582aaf
1 changed file with 5 additions and 61 deletions
|
@ -171,67 +171,11 @@ class File(models.Model):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def srt(self, offset=0):
|
def srt(self, offset=0):
|
||||||
|
srt = ox.load_srt(self.data.path)
|
||||||
def _detectEncoding(fp):
|
#subtitles should not overlap
|
||||||
bomDict={ # bytepattern : name
|
for i in range(1, len(srt)):
|
||||||
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
|
if srt[i-1]['out'] > srt[i]['in']:
|
||||||
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
|
srt[i-1]['out'] = srt[i]['in']
|
||||||
(0xFE, 0xFF, None, None): "utf_16_be",
|
|
||||||
(0xFF, 0xFE, None, None): "utf_16_le",
|
|
||||||
(0xEF, 0xBB, 0xBF, None): "utf_8",
|
|
||||||
}
|
|
||||||
|
|
||||||
# go to beginning of file and get the first 4 bytes
|
|
||||||
oldFP = fp.tell()
|
|
||||||
fp.seek(0)
|
|
||||||
(byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
|
|
||||||
|
|
||||||
# try bom detection using 4 bytes, 3 bytes, or 2 bytes
|
|
||||||
bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
|
|
||||||
if not bomDetection:
|
|
||||||
bomDetection = bomDict.get((byte1, byte2, byte3, None))
|
|
||||||
if not bomDetection:
|
|
||||||
bomDetection = bomDict.get((byte1, byte2, None, None))
|
|
||||||
## if BOM detected, we're done :-)
|
|
||||||
fp.seek(oldFP)
|
|
||||||
if bomDetection:
|
|
||||||
return bomDetection
|
|
||||||
|
|
||||||
encoding = 'latin-1'
|
|
||||||
#more character detecting magick using http://chardet.feedparser.org/
|
|
||||||
fp.seek(0)
|
|
||||||
rawdata = fp.read()
|
|
||||||
encoding = chardet.detect(rawdata)['encoding']
|
|
||||||
fp.seek(oldFP)
|
|
||||||
return encoding
|
|
||||||
|
|
||||||
def parseTime(t):
|
|
||||||
return offset + ox.time2ms(t.replace(',', '.')) / 1000
|
|
||||||
|
|
||||||
srt = []
|
|
||||||
|
|
||||||
f = open(self.data.path)
|
|
||||||
encoding = _detectEncoding(f)
|
|
||||||
data = f.read()
|
|
||||||
f.close()
|
|
||||||
try:
|
|
||||||
data = unicode(data, encoding)
|
|
||||||
except:
|
|
||||||
try:
|
|
||||||
data = unicode(data, 'latin-1')
|
|
||||||
except:
|
|
||||||
print "failed to detect encoding, giving up"
|
|
||||||
return srt
|
|
||||||
data = data.replace('\r\n', '\n')
|
|
||||||
srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*-->\s*(\d\d:\d\d:\d\d[,.]\d\d\d)\s*(.+?)\n\n', re.DOTALL)
|
|
||||||
i = 0
|
|
||||||
for s in srts.findall(data):
|
|
||||||
_s = {'id': str(i),
|
|
||||||
'in': parseTime(s[0]), 'out': parseTime(s[1]), 'value': s[2].strip()}
|
|
||||||
if srt and srt[-1]['out'] > _s['in']:
|
|
||||||
srt[-1]['out'] = _s['in']
|
|
||||||
srt.append(_s)
|
|
||||||
i += 1
|
|
||||||
return srt
|
return srt
|
||||||
|
|
||||||
def editable(self, user):
|
def editable(self, user):
|
||||||
|
|
Loading…
Reference in a new issue