faster and more reliable encoding detection of html content

j 2013-06-01 13:29:24 +02:00
parent 3165e3a8b1
commit f535b82e7b

@@ -3,6 +3,7 @@
 # GPL 2008
 import os
 import gzip
+import re
 import StringIO
 import struct
 import urllib
@@ -70,11 +71,12 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unic
     return result
 
 def detect_encoding(data):
-    if 'content="text/html; charset=utf-8"' in data.lower() or \
-            'meta charset="utf-8"' in data.lower():
-        return 'utf-8'
-    elif 'content="text/html; charset=iso-8859-1"' in data:
-        return 'iso-8859-1'
+    data_lower = data.lower()
+    charset = re.compile('content="text/html; charset=(.*?)"').findall(data)
+    if not charset:
+        charset = re.compile('meta charset="(.*?)"').findall(data)
+    if charset:
+        return charset[0].lower()
     detector = UniversalDetector()
     for line in data.split('\n'):
         detector.feed(line)
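
For reference, a minimal standalone sketch of the regex-based detection this commit introduces (not the module's exact code): the tail of the fallback lies outside the shown hunk, so completing it with chardet's standard UniversalDetector pattern is an assumption here, and the two sample calls at the end are illustrative inputs only.

import re
from chardet.universaldetector import UniversalDetector

def detect_encoding(data):
    # Prefer a charset declared in the page's own meta tags.
    charset = re.compile('content="text/html; charset=(.*?)"').findall(data)
    if not charset:
        charset = re.compile('meta charset="(.*?)"').findall(data)
    if charset:
        return charset[0].lower()
    # No declared charset: fall back to statistical detection with chardet.
    # In the original Python 2 module, data is a byte string, which is what
    # chardet's feed() expects; the done/close/result steps are assumed here,
    # since they are not part of the shown hunk.
    detector = UniversalDetector()
    for line in data.split('\n'):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']

# Illustrative inputs, not part of the commit:
print(detect_encoding('<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">'))  # iso-8859-1
print(detect_encoding('<meta charset="UTF-8">'))  # utf-8

Compared with the hard-coded utf-8/iso-8859-1 checks it replaces, this extracts whatever charset the page declares and only pays for chardet's line-by-line scan when no declaration is found.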