python-ox/ox/srt.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement, division, print_function
import codecs
import re

import chardet
from six import PY2
import ox


__all__ = []


def _detect_encoding(fp):
    """Guess the text encoding of an open binary file

    fp: seekable file object opened in binary mode

    The first bytes are checked for a UTF-32, UTF-16 or UTF-8 byte order
    mark. Without a BOM the whole file is read: if it decodes as UTF-8,
    'utf-8' is returned, otherwise chardet's guess is used, falling back to
    'latin-1' when chardet cannot name an encoding.

    The file position is the same on return as it was on entry.

    Returns: codec name (str)
    """
    bom_table = {  # bytepattern : name
        (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
        (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
        (0xFE, 0xFF, None, None): "utf_16_be",
        (0xFF, 0xFE, None, None): "utf_16_le",
        (0xEF, 0xBB, 0xBF, None): "utf_8",
    }

    start_pos = fp.tell()
    try:
        # go to beginning of file and get (up to) the first 4 bytes;
        # bytearray yields ints on both Python 2 and Python 3
        fp.seek(0)
        head = list(bytearray(fp.read(4)))
        # files shorter than 4 bytes (including empty ones) are padded so
        # the lookups below never fail on a short read
        head += [None] * (4 - len(head))

        # try bom detection using 4 bytes, 3 bytes, or 2 bytes
        for width in (4, 3, 2):
            key = tuple(head[:width]) + (None,) * (4 - width)
            bom_encoding = bom_table.get(key)
            if bom_encoding:
                # if BOM detected, we're done :-)
                return bom_encoding

        fp.seek(0)
        rawdata = fp.read()
    finally:
        # always hand the file back at the position the caller left it
        fp.seek(start_pos)

    # if data can be decoded as utf-8 use that, try chardet otherwise
    # chardet detects utf-8 as ISO-8859-2 most of the time
    try:
        rawdata.decode('utf-8')
    except UnicodeDecodeError:
        # chardet reports None as encoding when it has no guess
        return chardet.detect(rawdata)['encoding'] or 'latin-1'
    return 'utf-8'


def load(filename, offset=0):
    '''Parses an srt file

    filename: path to an srt file
    offset (float, seconds): shift all in/out points by offset

    The file is decoded with the encoding reported by _detect_encoding; if
    that encoding is missing, unknown or does not fit the data, latin-1 is
    used instead.

    Returns list with dicts that have in, out, value and id
    (id is the zero-based position of the subtitle, as a string)
    '''

    def parse_time(t):
        # time2ms result is divided by 1000 to match the offset, in seconds
        return offset + ox.time2ms(t.replace(',', '.')) / 1000

    with open(filename, 'rb') as f:
        encoding = _detect_encoding(f)
        data = f.read()
    try:
        data = data.decode(encoding)
    except (LookupError, TypeError, ValueError):
        # LookupError: codec name unknown to Python
        # TypeError: no encoding name at all (None)
        # ValueError: includes UnicodeDecodeError (wrong guess)
        # latin-1 maps every byte to a character, so this cannot fail
        data = data.decode('latin-1')

    # the pattern below needs a blank line after each subtitle; append one
    # so the last subtitle is not lost when the file ends without it
    data = data.replace('\r\n', '\n') + '\n\n'
    regexp = r'(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n'
    srts = re.compile(regexp, re.DOTALL)
    return [
        {
            'id': str(i),
            'in': parse_time(start),
            'out': parse_time(end),
            'value': text.strip()
        }
        for i, (start, end, text) in enumerate(srts.findall(data))
    ]


def _srt_timecode(t):
    """Format t as an SRT timecode string

    t is multiplied by 1000 (presumably seconds to milliseconds) and passed
    to ox.format_duration with years=False; '.' in the result is replaced
    by ',', the decimal separator used in SRT timecodes.
    """
    duration = ox.format_duration(t * 1000, years=False)
    return duration.replace('.', ',')


def encode(data):
    """Serializes subtitles as an SRT document

    data: list of dicts with 'in', 'out': float and 'value': unicode

    Subtitles are numbered from 1, lines end with CRLF and the result
    starts with the UTF-8 byte order mark.

    Returns: a UTF-8-encoded bytestring

    >>> encode([{'in': 1.25, 'out': 60 * 60 + 1, 'value': u'touch\\u00E9'}])
    '\\xef\\xbb\\xbf1\\r\\n00:00:01,250 --> 01:00:01,000\\r\\ntouch\\xc3\\xa9\\r\\n\\r\\n'
    """
    template = '%d\r\n%s --> %s\r\n%s\r\n\r\n'
    cues = []
    for number, cue in enumerate(data, 1):
        start = _srt_timecode(cue['in'])
        end = _srt_timecode(cue['out'])
        # newlines inside the text become CRLF like the rest of the file
        text = cue['value'].replace('\n', '\r\n').strip()
        cues.append(template % (number, start, end, text))

    body = u''.join(cues)
    return codecs.BOM_UTF8 + body.encode('utf-8')
add srt.load 2012-01-02 16:44:10 +00:00			`# -- coding: utf-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`from __future__ import with_statement, division, print_function`
add utf-8 BOM to srt 2012-03-02 21:37:35 +00:00			`import codecs`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`import re`
add srt.load 2012-01-02 16:44:10 +00:00
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`import chardet`
			`from six import PY2`
add srt.load 2012-01-02 16:44:10 +00:00			`import ox`


			`__all__ = []`


replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`def _detect_encoding(fp):`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`bomDict = { # bytepattern : name`
			`(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",`
			`(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",`
			`(0xFE, 0xFF, None, None): "utf_16_be",`
			`(0xFF, 0xFE, None, None): "utf_16_le",`
			`(0xEF, 0xBB, 0xBF, None): "utf_8",`
			`}`
add srt.load 2012-01-02 16:44:10 +00:00
			`# go to beginning of file and get the first 4 bytes`
			`oldFP = fp.tell()`
			`fp.seek(0)`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`if PY2:`
			`(byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)]`
			`else:`
			`(byte1, byte2, byte3, byte4) = fp.read(4)`
add srt.load 2012-01-02 16:44:10 +00:00
			`# try bom detection using 4 bytes, 3 bytes, or 2 bytes`
			`bomDetection = bomDict.get((byte1, byte2, byte3, byte4))`
			`if not bomDetection:`
			`bomDetection = bomDict.get((byte1, byte2, byte3, None))`
			`if not bomDetection:`
			`bomDetection = bomDict.get((byte1, byte2, None, None))`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`# if BOM detected, we're done :-)`
add srt.load 2012-01-02 16:44:10 +00:00			`fp.seek(oldFP)`
			`if bomDetection:`
			`return bomDetection`
			`encoding = 'latin-1'`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`# more character detecting magick using http://chardet.feedparser.org/`
add srt.load 2012-01-02 16:44:10 +00:00			`fp.seek(0)`
			`rawdata = fp.read()`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`# if data can be decoded as utf-8 use that, try chardet otherwise`
			`# chardet detects utf-8 as ISO-8859-2 most of the time`
utf-8 srts are more common 2012-01-28 08:26:42 +00:00			`try:`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`rawdata.decode('utf-8')`
utf-8 srts are more common 2012-01-28 08:26:42 +00:00			`encoding = 'utf-8'`
			`except:`
			`encoding = chardet.detect(rawdata)['encoding']`
add srt.load 2012-01-02 16:44:10 +00:00			`fp.seek(oldFP)`
			`return encoding`


			`def load(filename, offset=0):`
srt: neater docstrings, some cleanup 2016-03-11 12:10:32 +00:00			`'''Parses an srt file`

			`filename: path to an srt file`
			`offset (float, seconds): shift all in/out points by offset`
add srt.load 2012-01-02 16:44:10 +00:00
srt: neater docstrings, some cleanup 2016-03-11 12:10:32 +00:00			`Returns list with dicts that have in, out, value and id`
add srt.load 2012-01-02 16:44:10 +00:00			`'''`
			`srt = []`

			`def parse_time(t):`
			`return offset + ox.time2ms(t.replace(',', '.')) / 1000`

fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`with open(filename, 'rb') as f:`
replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`encoding = _detect_encoding(f)`
add srt.load 2012-01-02 16:44:10 +00:00			`data = f.read()`
			`try:`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`data = data.decode(encoding)`
add srt.load 2012-01-02 16:44:10 +00:00			`except:`
			`try:`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`data = data.decode('latin-1')`
add srt.load 2012-01-02 16:44:10 +00:00			`except:`
use six to support python 2 and 3 2014-09-30 19:04:46 +00:00			`print("failed to detect encoding, giving up")`
add srt.load 2012-01-02 16:44:10 +00:00			`return srt`

			`data = data.replace('\r\n', '\n')`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`regexp = r'(\d\d:\d\d:\d\d[,.]\d\d\d)\s?-->\s?(\d\d:\d\d:\d\d[,.]\d\d\d).?\n(.?)\n\n'`
			`srts = re.compile(regexp, re.DOTALL)`
add srt.load 2012-01-02 16:44:10 +00:00			`i = 0`
			`for s in srts.findall(data):`
fix python3 ox.srt 2016-06-08 09:36:55 +00:00			`_s = {`
			`'id': str(i),`
			`'in': parse_time(s[0]),`
			`'out': parse_time(s[1]),`
			`'value': s[2].strip()`
add srt.load 2012-01-02 16:44:10 +00:00			`}`
			`srt.append(_s)`
			`i += 1`
			`return srt`
add srt.encode 2012-01-02 17:07:17 +00:00
srt: neater docstrings, some cleanup 2016-03-11 12:10:32 +00:00
			`def _srt_timecode(t):`
			`return ox.format_duration(t * 1000, years=False).replace('.', ',')`


add srt.encode 2012-01-02 17:07:17 +00:00			`def encode(data):`
srt: neater docstrings, some cleanup 2016-03-11 12:10:32 +00:00			`"""Encodes subtitles into SRT format`

			`data: list of dicts with 'in', 'out': float and 'value': unicode`

			`Returns: a UTF-8-encoded bytestring`

			`>>> encode([{'in': 1.25, 'out': 60 * 60 + 1, 'value': u'touch\\u00E9'}])`
			`'\\xef\\xbb\\xbf1\\r\\n00:00:01,250 --> 01:00:01,000\\r\\ntouch\\xc3\\xa9\\r\\n\\r\\n'`
			`"""`

add srt.encode 2012-01-02 17:07:17 +00:00			`srt = u''`
srt: neater docstrings, some cleanup 2016-03-11 12:10:32 +00:00
			`for i, s in enumerate(data, 1):`
add srt.encode 2012-01-02 17:07:17 +00:00			`srt += '%d\r\n%s --> %s\r\n%s\r\n\r\n' % (`
			`i,`
srt: neater docstrings, some cleanup 2016-03-11 12:10:32 +00:00			`_srt_timecode(s['in']),`
			`_srt_timecode(s['out']),`
add srt.encode 2012-01-02 17:07:17 +00:00			`s['value'].replace('\n', '\r\n').strip()`
			`)`

srt: neater docstrings, some cleanup 2016-03-11 12:10:32 +00:00			`return codecs.BOM_UTF8 + srt.encode('utf-8')`