detect iso-8859-1 in html header
This commit is contained in:
parent
7a531dc8ef
commit
8b2d9c5a87
1 changed files with 12 additions and 10 deletions
22
oxlib/net.py
22
oxlib/net.py
|
@ -57,16 +57,6 @@ def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
|
||||||
return dict(f.headers), data
|
return dict(f.headers), data
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def saveUrl(url, filename, overwrite=False):
|
|
||||||
if not os.path.exists(filename) or overwrite:
|
|
||||||
dirname = os.path.dirname(filename)
|
|
||||||
if not os.path.exists(dirname):
|
|
||||||
os.makedirs(dirname)
|
|
||||||
data = getUrl(url)
|
|
||||||
f = open(filename, 'w')
|
|
||||||
f.write(data)
|
|
||||||
f.close()
|
|
||||||
|
|
||||||
def getUrlUnicode(url):
|
def getUrlUnicode(url):
|
||||||
data = getUrl(url)
|
data = getUrl(url)
|
||||||
encoding = getEncoding(data)
|
encoding = getEncoding(data)
|
||||||
|
@ -77,6 +67,8 @@ def getUrlUnicode(url):
|
||||||
def getEncoding(data):
|
def getEncoding(data):
|
||||||
if 'content="text/html; charset=utf-8"' in data:
|
if 'content="text/html; charset=utf-8"' in data:
|
||||||
return 'utf-8'
|
return 'utf-8'
|
||||||
|
elif 'content="text/html; charset=iso-8859-1"' in data:
|
||||||
|
return 'iso-8859-1'
|
||||||
detector = UniversalDetector()
|
detector = UniversalDetector()
|
||||||
for line in data.split('\n'):
|
for line in data.split('\n'):
|
||||||
detector.feed(line)
|
detector.feed(line)
|
||||||
|
@ -85,3 +77,13 @@ def getEncoding(data):
|
||||||
detector.close()
|
detector.close()
|
||||||
return detector.result['encoding']
|
return detector.result['encoding']
|
||||||
|
|
||||||
|
def saveUrl(url, filename, overwrite=False):
|
||||||
|
if not os.path.exists(filename) or overwrite:
|
||||||
|
dirname = os.path.dirname(filename)
|
||||||
|
if not os.path.exists(dirname):
|
||||||
|
os.makedirs(dirname)
|
||||||
|
data = getUrl(url)
|
||||||
|
f = open(filename, 'w')
|
||||||
|
f.write(data)
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue