detect iso-8859-1 in html header
This commit is contained in:
parent
7a531dc8ef
commit
8b2d9c5a87
1 changed files with 12 additions and 10 deletions
22
oxlib/net.py
22
oxlib/net.py
|
@ -57,16 +57,6 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
|
|||
return dict(f.headers), data
|
||||
return data
|
||||
|
||||
def saveUrl(url, filename, overwrite=False):
    """Download ``url`` and store the response body in ``filename``.

    Nothing is fetched or written when ``filename`` already exists,
    unless ``overwrite`` is true. Missing parent directories of
    ``filename`` are created first.

    url       -- address passed on to getUrl()
    filename  -- destination path; may be a bare name in the current directory
    overwrite -- replace an existing file when true
    """
    if os.path.exists(filename) and not overwrite:
        return
    dirname = os.path.dirname(filename)
    # dirname is '' for a bare filename; os.makedirs('') raises, so only
    # create directories when there actually is a directory component.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    data = getUrl(url)
    # Binary mode keeps the downloaded bytes intact on every platform
    # (text mode would translate line endings on Windows); the with-block
    # closes the handle even if the write fails.
    with open(filename, 'wb') as f:
        f.write(data)
|
||||
|
||||
def getUrlUnicode(url):
|
||||
data = getUrl(url)
|
||||
encoding = getEncoding(data)
|
||||
|
@ -77,6 +67,8 @@ def getUrlUnicode(url):
|
|||
def getEncoding(data):
|
||||
if 'content="text/html; charset=utf-8"' in data:
|
||||
return 'utf-8'
|
||||
elif 'content="text/html; charset=iso-8859-1"' in data:
|
||||
return 'iso-8859-1'
|
||||
detector = UniversalDetector()
|
||||
for line in data.split('\n'):
|
||||
detector.feed(line)
|
||||
|
@ -85,3 +77,13 @@ def getEncoding(data):
|
|||
detector.close()
|
||||
return detector.result['encoding']
|
||||
|
||||
def saveUrl(url, filename, overwrite=False):
    """Download ``url`` and store the response body in ``filename``.

    Nothing is fetched or written when ``filename`` already exists,
    unless ``overwrite`` is true. Missing parent directories of
    ``filename`` are created first.

    url       -- address passed on to getUrl()
    filename  -- destination path; may be a bare name in the current directory
    overwrite -- replace an existing file when true
    """
    if os.path.exists(filename) and not overwrite:
        return
    dirname = os.path.dirname(filename)
    # dirname is '' for a bare filename; os.makedirs('') raises, so only
    # create directories when there actually is a directory component.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    data = getUrl(url)
    # Binary mode keeps the downloaded bytes intact on every platform
    # (text mode would translate line endings on Windows); the with-block
    # closes the handle even if the write fails.
    with open(filename, 'wb') as f:
        f.write(data)
|
||||
|
||||
|
|
Loading…
Reference in a new issue