utf-8 foobar
This commit is contained in:
parent
c434187130
commit
d23720b1ff
1 changed file with 8 additions and 1 deletion
|
@@ -14,6 +14,7 @@ from google import google
|
|||
from utils import stripTags, htmldecode
|
||||
|
||||
import utils
|
||||
import chardet
|
||||
|
||||
cache_base = "/var/cache/scrapeit/cache/"
|
||||
|
||||
|
@@ -26,16 +27,22 @@ def read_url_utf8(url):
|
|||
if os.path.exists(path):
|
||||
f = open(path)
|
||||
data = f.read()
|
||||
encoding = chardet.detect(data)['encoding']
|
||||
if not encoding: encoding = 'latin-1'
|
||||
f.close()
|
||||
data = unicode(data, encoding)
|
||||
return data
|
||||
else:
|
||||
data = utils.read_url_utf8(url)
|
||||
data = utils.read_url(url)
|
||||
folder = os.path.dirname(path)
|
||||
if not os.path.exists(folder):
|
||||
os.makedirs(folder)
|
||||
f = open(path, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
encoding = chardet.detect(data)['encoding']
|
||||
if not encoding: encoding = 'latin-1'
|
||||
data = unicode(data, encoding)
|
||||
return data
|
||||
|
||||
def read_url(url):
|
||||
|
|
Loading…
Reference in a new issue