utf-8 foobar

This commit is contained in:
j 2007-07-15 13:35:13 +00:00
parent c434187130
commit d23720b1ff

View file

@@ -14,6 +14,7 @@ from google import google
from utils import stripTags, htmldecode from utils import stripTags, htmldecode
import utils import utils
import chardet
cache_base = "/var/cache/scrapeit/cache/" cache_base = "/var/cache/scrapeit/cache/"
@@ -26,16 +27,22 @@ def read_url_utf8(url):
if os.path.exists(path): if os.path.exists(path):
f = open(path) f = open(path)
data = f.read() data = f.read()
encoding = chardet.detect(data)['encoding']
if not encoding: encoding = 'latin-1'
f.close() f.close()
data = unicode(data, encoding)
return data return data
else: else:
data = utils.read_url_utf8(url) data = utils.read_url(url)
folder = os.path.dirname(path) folder = os.path.dirname(path)
if not os.path.exists(folder): if not os.path.exists(folder):
os.makedirs(folder) os.makedirs(folder)
f = open(path, 'w') f = open(path, 'w')
f.write(data) f.write(data)
f.close() f.close()
encoding = chardet.detect(data)['encoding']
if not encoding: encoding = 'latin-1'
data = unicode(data, encoding)
return data return data
def read_url(url): def read_url(url):