From d23720b1ffed7e54ff11a5dc3db8d005efb2fa3d Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Sun, 15 Jul 2007 13:35:13 +0000 Subject: [PATCH] utf-8 foobar --- scrapeit/imdb.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py index ef59fb1..0dc0c31 100644 --- a/scrapeit/imdb.py +++ b/scrapeit/imdb.py @@ -14,6 +14,7 @@ from google import google from utils import stripTags, htmldecode import utils +import chardet cache_base = "/var/cache/scrapeit/cache/" @@ -26,16 +27,22 @@ def read_url_utf8(url): if os.path.exists(path): f = open(path) data = f.read() + encoding = chardet.detect(data)['encoding'] + if not encoding: encoding = 'latin-1' f.close() + data = unicode(data, encoding) return data else: - data = utils.read_url_utf8(url) + data = utils.read_url(url) folder = os.path.dirname(path) if not os.path.exists(folder): os.makedirs(folder) f = open(path, 'w') f.write(data) f.close() + encoding = chardet.detect(data)['encoding'] + if not encoding: encoding = 'latin-1' + data = unicode(data, encoding) return data def read_url(url):