detect iso-8859-1 in html header

This commit is contained in:
j 2009-07-15 15:53:40 +02:00
parent 7a531dc8ef
commit 8b2d9c5a87

View file

@ -57,16 +57,6 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
return dict(f.headers), data return dict(f.headers), data
return data return data
def saveUrl(url, filename, overwrite=False):
    """Download ``url`` with getUrl() and store the response body in ``filename``.

    Nothing is fetched or written when ``filename`` already exists, unless
    ``overwrite`` is true. Missing parent directories of ``filename`` are
    created first.

    url       -- address passed unchanged to getUrl()
    filename  -- destination path; may be a bare file name without a directory
    overwrite -- when true, re-download and replace an existing file

    Returns None.
    """
    if os.path.exists(filename) and not overwrite:
        return
    dirname = os.path.dirname(filename)
    # dirname is '' for a bare file name; os.makedirs('') would raise.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    data = getUrl(url)
    # Binary mode keeps the payload byte-for-byte (text mode may translate
    # newlines on some platforms); assumes getUrl returns the raw body as a
    # byte string -- TODO confirm against getUrl.
    # The with-block closes the file even if write() raises.
    with open(filename, 'wb') as f:
        f.write(data)
def getUrlUnicode(url): def getUrlUnicode(url):
data = getUrl(url) data = getUrl(url)
encoding = getEncoding(data) encoding = getEncoding(data)
@ -77,6 +67,8 @@ def getUrlUnicode(url):
def getEncoding(data): def getEncoding(data):
if 'content="text/html; charset=utf-8"' in data: if 'content="text/html; charset=utf-8"' in data:
return 'utf-8' return 'utf-8'
elif 'content="text/html; charset=iso-8859-1"' in data:
return 'iso-8859-1'
detector = UniversalDetector() detector = UniversalDetector()
for line in data.split('\n'): for line in data.split('\n'):
detector.feed(line) detector.feed(line)
@ -85,3 +77,13 @@ def getEncoding(data):
detector.close() detector.close()
return detector.result['encoding'] return detector.result['encoding']
def saveUrl(url, filename, overwrite=False):
    """Download ``url`` with getUrl() and store the response body in ``filename``.

    Nothing is fetched or written when ``filename`` already exists, unless
    ``overwrite`` is true. Missing parent directories of ``filename`` are
    created first.

    url       -- address passed unchanged to getUrl()
    filename  -- destination path; may be a bare file name without a directory
    overwrite -- when true, re-download and replace an existing file

    Returns None.
    """
    if os.path.exists(filename) and not overwrite:
        return
    dirname = os.path.dirname(filename)
    # dirname is '' for a bare file name; os.makedirs('') would raise.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    data = getUrl(url)
    # Binary mode keeps the payload byte-for-byte (text mode may translate
    # newlines on some platforms); assumes getUrl returns the raw body as a
    # byte string -- TODO confirm against getUrl.
    # The with-block closes the file even if write() raises.
    with open(filename, 'wb') as f:
        f.write(data)