python-ox/ox/srt.py

132 lines
3.5 KiB
Python
Raw Permalink Normal View History

2012-01-02 16:44:10 +00:00
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division, print_function
2012-03-02 21:37:35 +00:00
import codecs
2016-06-08 09:36:55 +00:00
import re
2012-01-02 16:44:10 +00:00
2016-06-08 09:36:55 +00:00
import chardet
2012-01-02 16:44:10 +00:00
import ox
__all__ = []
def _detect_encoding(fp):
2016-06-08 09:36:55 +00:00
bomDict = { # bytepattern : name
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
(0xFE, 0xFF, None, None): "utf_16_be",
(0xFF, 0xFE, None, None): "utf_16_le",
(0xEF, 0xBB, 0xBF, None): "utf_8",
}
2012-01-02 16:44:10 +00:00
# go to beginning of file and get the first 4 bytes
oldFP = fp.tell()
fp.seek(0)
2023-07-27 11:07:13 +00:00
(byte1, byte2, byte3, byte4) = fp.read(4)
2012-01-02 16:44:10 +00:00
# try bom detection using 4 bytes, 3 bytes, or 2 bytes
bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
if not bomDetection:
bomDetection = bomDict.get((byte1, byte2, byte3, None))
if not bomDetection:
bomDetection = bomDict.get((byte1, byte2, None, None))
2016-06-08 09:36:55 +00:00
# if BOM detected, we're done :-)
2012-01-02 16:44:10 +00:00
fp.seek(oldFP)
if bomDetection:
return bomDetection
encoding = 'latin-1'
2016-06-08 09:36:55 +00:00
# more character detecting magick using http://chardet.feedparser.org/
2012-01-02 16:44:10 +00:00
fp.seek(0)
rawdata = fp.read()
2016-06-08 09:36:55 +00:00
# if data can be decoded as utf-8 use that, try chardet otherwise
# chardet detects utf-8 as ISO-8859-2 most of the time
2012-01-28 08:26:42 +00:00
try:
2016-06-08 09:36:55 +00:00
rawdata.decode('utf-8')
2012-01-28 08:26:42 +00:00
encoding = 'utf-8'
except:
encoding = chardet.detect(rawdata)['encoding']
2012-01-02 16:44:10 +00:00
fp.seek(oldFP)
return encoding
def load(filename, offset=0):
    '''Parses an srt file

    filename: path to an srt file
    offset (float, seconds): shift all in/out points by offset

    The file is decoded with the encoding reported by _detect_encoding,
    falling back to latin-1 if that encoding can not be used.

    Returns list with dicts that have in, out, value and id
    '''
    with open(filename, 'rb') as f:
        encoding = _detect_encoding(f)
        data = f.read()
    try:
        data = data.decode(encoding)
    except (UnicodeError, LookupError, TypeError):
        # wrong guess, unknown codec name or no guess at all (None);
        # latin-1 maps every byte to a character, so this can not fail
        data = data.decode('latin-1')
    return loads(data, offset)
def loads(data, offset=0):
    '''Parses the content of an srt file

    data: decoded text of an srt file
    offset (float, seconds): shift all in/out points by offset

    Returns list with dicts that have in, out, value and id,
    id is the position of the subtitle in the file as string, starting at '0'
    '''
    def to_seconds(timecode):
        # timecodes may use ',' as decimal separator, ox.time2ms gets '.'
        return offset + ox.time2ms(timecode.replace(',', '.')) / 1000

    text = data.replace('\r\n', '\n')
    # the pattern needs a blank line after each subtitle, including the last
    if not text.endswith('\n\n'):
        text += '\n\n'

    subtitle = re.compile(
        r'(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n',
        re.DOTALL
    )
    return [
        {
            'id': str(index),
            'in': to_seconds(start),
            'out': to_seconds(end),
            'value': value.strip()
        }
        for index, (start, end, value) in enumerate(subtitle.findall(text))
    ]
2012-01-02 17:07:17 +00:00
2016-03-11 12:10:32 +00:00
def _srt_timecode(t):
    '''Formats a position as timecode for srt files

    t: position, multiplied by 1000 before it is passed to
       ox.format_duration (presumably seconds to milliseconds)

    Returns the formatted duration with '.' replaced by ','
    '''
    milliseconds = t * 1000
    duration = ox.format_duration(milliseconds, years=False)
    return duration.replace('.', ',')
2012-01-02 17:07:17 +00:00
def encode(data):
    """Encodes subtitles into SRT format

    data: list of dicts with 'in', 'out': float and 'value': unicode

    Subtitles are numbered starting at 1, line breaks inside a value are
    written as \\r\\n and the output starts with a UTF-8 byte order mark.

    Returns: a UTF-8-encoded bytestring

    >>> encode([{'in': 1.25, 'out': 60 * 60 + 1, 'value': u'touch\\u00E9'}])
    b'\\xef\\xbb\\xbf1\\r\\n00:00:01,250 --> 01:00:01,000\\r\\ntouch\\xc3\\xa9\\r\\n\\r\\n'
    """
    blocks = []
    for i, s in enumerate(data, 1):
        # normalize to \n first, otherwise an existing \r\n in the value
        # would end up as \r\r\n
        value = s['value'].replace('\r\n', '\n').replace('\n', '\r\n').strip()
        blocks.append(u'%d\r\n%s --> %s\r\n%s\r\n\r\n' % (
            i,
            _srt_timecode(s['in']),
            _srt_timecode(s['out']),
            value
        ))
    # join once instead of growing a string inside the loop
    return codecs.BOM_UTF8 + u''.join(blocks).encode('utf-8')