diff --git a/scrapeit/imdb.py b/scrapeit/imdb.py index ef59fb1..0dc0c31 100644 --- a/scrapeit/imdb.py +++ b/scrapeit/imdb.py @@ -14,6 +14,7 @@ from google import google from utils import stripTags, htmldecode import utils +import chardet cache_base = "/var/cache/scrapeit/cache/" @@ -26,16 +27,22 @@ def read_url_utf8(url): if os.path.exists(path): f = open(path) data = f.read() + encoding = chardet.detect(data)['encoding'] + if not encoding: encoding = 'latin-1' f.close() + data = unicode(data, encoding) return data else: - data = utils.read_url_utf8(url) + data = utils.read_url(url) folder = os.path.dirname(path) if not os.path.exists(folder): os.makedirs(folder) f = open(path, 'w') f.write(data) f.close() + encoding = chardet.detect(data)['encoding'] + if not encoding: encoding = 'latin-1' + data = unicode(data, encoding) return data def read_url(url):