srt parsing moved to ox.srt.load

2012-01-02 22:15:01 +05:30 · 2012-01-02 22:15:01 +05:30 · 7757582aaf
commit 7757582aaf
parent 4c25291c88
1 changed file with 5 additions and 61 deletions
--- a/pandora/archive/models.py
+++ b/pandora/archive/models.py
@@ -171,67 +171,11 @@ class File(models.Model):
        return None
    def srt(self, offset=0):
-
+        srt = ox.load_srt(self.data.path)
-        def _detectEncoding(fp):
+        #subtitles should not overlap
-            bomDict={ # bytepattern : name
+        for i in range(1, len(srt)):
-                      (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
+            if srt[i-1]['out'] > srt[i]['in']:
-                      (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
+                srt[i-1]['out'] = srt[i]['in']
                      (0xFE, 0xFF, None, None): "utf_16_be",
                      (0xFF, 0xFE, None, None): "utf_16_le",
                      (0xEF, 0xBB, 0xBF, None): "utf_8",
                    }
            # go to beginning of file and get the first 4 bytes
            oldFP = fp.tell()
            fp.seek(0)
            (byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
            # try bom detection using 4 bytes, 3 bytes, or 2 bytes
            bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
            if not bomDetection:
                bomDetection = bomDict.get((byte1, byte2, byte3, None))
                if not bomDetection:
                    bomDetection = bomDict.get((byte1, byte2, None, None))
            ## if BOM detected, we're done :-)
            fp.seek(oldFP)
            if bomDetection:
                return bomDetection
            encoding = 'latin-1'
            #more character detecting magick using http://chardet.feedparser.org/
            fp.seek(0)
            rawdata = fp.read()
            encoding = chardet.detect(rawdata)['encoding']
            fp.seek(oldFP)
            return encoding
        def parseTime(t):
            return offset + ox.time2ms(t.replace(',', '.')) / 1000
        srt = []
        f = open(self.data.path)
        encoding = _detectEncoding(f)
        data = f.read()
        f.close()
        try:
            data = unicode(data, encoding)
        except:
            try:
                data = unicode(data, 'latin-1')
            except:
                print "failed to detect encoding, giving up"
                return srt
        data = data.replace('\r\n', '\n')
        srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*-->\s*(\d\d:\d\d:\d\d[,.]\d\d\d)\s*(.+?)\n\n', re.DOTALL)
        i = 0
        for s in srts.findall(data):
            _s = {'id': str(i),
                  'in': parseTime(s[0]), 'out': parseTime(s[1]), 'value': s[2].strip()}
            if srt and srt[-1]['out'] > _s['in']:
                srt[-1]['out'] = _s['in']
            srt.append(_s)
            i += 1
        return srt
    def editable(self, user):