Try UTF-8 before chardet when detecting SRT encoding (UTF-8 SRTs are more common)

2012-01-28 13:56:42 +05:30 · 2012-01-28 13:56:42 +05:30 · ca7bfd06d5
commit ca7bfd06d5
parent b7bb4a4e80
1 changed files with 7 additions and 2 deletions
--- a/ox/srt.py
+++ b/ox/srt.py
@ -34,12 +34,17 @@ def _detectEncoding(fp):
    fp.seek(oldFP)
    if bomDetection:
        return bomDetection
-
    encoding = 'latin-1'
    #more character detecting magick using http://chardet.feedparser.org/
    fp.seek(0)
    rawdata = fp.read()
-    encoding = chardet.detect(rawdata)['encoding']
+    #if data can be decoded as utf-8 use that, try chardet otherwise
+    #chardet detects utf-8 as ISO-8859-2 most of the time
+    try:
+        data = unicode(rawdata, 'utf-8')
+        encoding = 'utf-8'
+    except:
+        encoding = chardet.detect(rawdata)['encoding']
    fp.seek(oldFP)
    return encoding