utf-8 foobar
This commit is contained in:
parent
c434187130
commit
d23720b1ff
1 changed file with 8 additions and 1 deletion
|
@@ -14,6 +14,7 @@ from google import google
|
||||||
from utils import stripTags, htmldecode
|
from utils import stripTags, htmldecode
|
||||||
|
|
||||||
import utils
|
import utils
|
||||||
|
import chardet
|
||||||
|
|
||||||
cache_base = "/var/cache/scrapeit/cache/"
|
cache_base = "/var/cache/scrapeit/cache/"
|
||||||
|
|
||||||
|
@@ -26,16 +27,22 @@ def read_url_utf8(url):
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
f = open(path)
|
f = open(path)
|
||||||
data = f.read()
|
data = f.read()
|
||||||
|
encoding = chardet.detect(data)['encoding']
|
||||||
|
if not encoding: encoding = 'latin-1'
|
||||||
f.close()
|
f.close()
|
||||||
|
data = unicode(data, encoding)
|
||||||
return data
|
return data
|
||||||
else:
|
else:
|
||||||
data = utils.read_url_utf8(url)
|
data = utils.read_url(url)
|
||||||
folder = os.path.dirname(path)
|
folder = os.path.dirname(path)
|
||||||
if not os.path.exists(folder):
|
if not os.path.exists(folder):
|
||||||
os.makedirs(folder)
|
os.makedirs(folder)
|
||||||
f = open(path, 'w')
|
f = open(path, 'w')
|
||||||
f.write(data)
|
f.write(data)
|
||||||
f.close()
|
f.close()
|
||||||
|
encoding = chardet.detect(data)['encoding']
|
||||||
|
if not encoding: encoding = 'latin-1'
|
||||||
|
data = unicode(data, encoding)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def read_url(url):
|
def read_url(url):
|
||||||
|
|
Loading…
Reference in a new issue