utf-8 foobar

This commit is contained in:
j 2007-07-15 13:35:13 +00:00
parent c434187130
commit d23720b1ff
1 changed file with 8 additions and 1 deletion

View File

@@ -14,6 +14,7 @@ from google import google
from utils import stripTags, htmldecode
import utils
import chardet
cache_base = "/var/cache/scrapeit/cache/"
@@ -26,16 +27,22 @@ def read_url_utf8(url):
if os.path.exists(path):
f = open(path)
data = f.read()
encoding = chardet.detect(data)['encoding']
if not encoding: encoding = 'latin-1'
f.close()
data = unicode(data, encoding)
return data
else:
data = utils.read_url_utf8(url)
data = utils.read_url(url)
folder = os.path.dirname(path)
if not os.path.exists(folder):
os.makedirs(folder)
f = open(path, 'w')
f.write(data)
f.close()
encoding = chardet.detect(data)['encoding']
if not encoding: encoding = 'latin-1'
data = unicode(data, encoding)
return data
def read_url(url):