subtitles

2010-09-18 16:44:35 +02:00 · 2010-09-18 16:44:35 +02:00 · e64dd48ee2
commit e64dd48ee2
parent c5b74c6f77
4 changed files with 115 additions and 85 deletions
--- a/pandora/archive/models.py
+++ b/pandora/archive/models.py
@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
+from __future__ import division
 from datetime import datetime
 import os.path
 import random
@ -19,6 +20,7 @@ import ox
 from ox import stripTags
 from ox.normalize import canonicalTitle, canonicalName
 from firefogg import Firefogg
+import chardet

 from backend import utils
 from pandora.backend.models import Movie
@ -149,6 +151,65 @@ class File(models.Model):
            return self.data.read()
        return None

+    def srt(self):
+        """Parse the file's data as SubRip (.srt) subtitles.
+
+        The file at self.data.path is read as bytes and decoded using a
+        byte order mark if one is present, otherwise chardet's guess,
+        with latin-1 as the last resort.
+
+        Returns a list of dicts in file order, each with the keys 'in'
+        and 'out' (ox.time2ms of the timestamp, divided by 1000) and
+        'text' (the cue text with surrounding whitespace stripped). The
+        list is empty when no cue is found.
+        """
+        def _detectEncoding(raw):
+            # Byte order marks, 4-byte marks first: the UTF-32 LE mark
+            # begins with the UTF-16 LE mark and must be tested before it.
+            boms = (
+                (b'\x00\x00\xfe\xff', 'utf_32_be'),
+                (b'\xff\xfe\x00\x00', 'utf_32_le'),
+                (b'\xef\xbb\xbf', 'utf_8'),
+                (b'\xfe\xff', 'utf_16_be'),
+                (b'\xff\xfe', 'utf_16_le'),
+            )
+            # startswith() also copes with data shorter than a mark
+            for bom, name in boms:
+                if raw.startswith(bom):
+                    return name
+            # more character detecting magick using http://chardet.feedparser.org/
+            # detect() can report None as encoding when it has no guess
+            return chardet.detect(raw)['encoding'] or 'latin-1'
+
+        def parseTime(t):
+            # timestamps may use ',' before the milliseconds; normalize to
+            # '.' before handing them to ox.time2ms. The float divisor keeps
+            # the result independent of `from __future__ import division`.
+            return ox.time2ms(t.replace(',', '.')) / 1000.0
+
+        # binary mode: the raw bytes are needed for encoding detection;
+        # read once and make sure the handle is closed even on error
+        with open(self.data.path, 'rb') as f:
+            raw = f.read()
+
+        encoding = _detectEncoding(raw)
+        try:
+            data = raw.decode(encoding)
+        except (UnicodeError, LookupError):
+            # wrong or unknown guess; latin-1 maps every byte value,
+            # so this fallback cannot fail
+            data = raw.decode('latin-1')
+
+        # accept Windows (\r\n) and old Mac (\r) line endings as well
+        data = data.replace('\r\n', '\n').replace('\r', '\n')
+
+        # a cue ends at a blank line, or at the end of the data for the
+        # last cue when the file has no trailing blank line
+        srts = re.compile(
+            r'(\d\d:\d\d:\d\d[,.]\d\d\d)\s*-->\s*(\d\d:\d\d:\d\d[,.]\d\d\d)'
+            r'\s*(.+?)(?:\n\n|\Z)', re.DOTALL)
+        return [
+            {'in': parseTime(start), 'out': parseTime(end), 'text': text.strip()}
+            for start, end, text in srts.findall(data)
+        ]
+
+
    def editable(self, user):
        #FIXME: check that user has instance of this file
        return True