cache read_url_utf8 too
This commit is contained in:
parent
23e27d1b36
commit
c434187130
1 changed file with 25 additions and 3 deletions
|
@ -11,13 +11,35 @@ from elementtree.ElementTree import parse, tostring
|
|||
from BeautifulSoup import BeautifulSoup
|
||||
|
||||
from google import google
|
||||
from utils import stripTags, read_url_utf8, htmldecode
|
||||
from utils import stripTags, htmldecode
|
||||
|
||||
import utils
|
||||
|
||||
# Root directory of the on-disk page cache; URL paths are mirrored below it.
cache_base = "/var/cache/scrapeit/cache/"
|
||||
|
||||
def read_url_utf8(url):
    """Return the body of *url*, caching it on disk under ``cache_base``.

    The URL (minus the leading ``http://``) is mapped to a file path below
    ``cache_base``; directory-like URLs are stored as ``.../index.html``.
    On a cache hit the stored file content is returned; on a miss the page
    is fetched via ``utils.read_url_utf8``, written to the cache, and
    returned.

    NOTE(review): despite the name, the cache file is opened with no
    explicit encoding -- presumably ``utils.read_url_utf8`` returns data
    that plain ``open(..., 'w')`` can write; confirm against utils before
    changing this.
    """
    path = os.path.join(cache_base, url.replace('http://', ''))
    # Directory-like URLs get an index.html filename so they map to a file.
    if path.endswith('/'):
        path = "%sindex.html" % path
    if os.path.isdir(path):
        path = "%s/index.html" % path
    if os.path.exists(path):
        # Cache hit: serve the stored copy.  ``with`` guarantees the handle
        # is closed even if read() raises (the original leaked it on error).
        with open(path) as f:
            return f.read()
    # Cache miss: fetch the page, then store it for next time.
    data = utils.read_url_utf8(url)
    folder = os.path.dirname(path)
    if not os.path.exists(folder):
        os.makedirs(folder)
    with open(path, 'w') as f:
        f.write(data)
    return data
|
||||
|
||||
def read_url(url):
|
||||
base = "/var/cache/scrapeit/cache/"
|
||||
path = os.path.join(base, url.replace('http://',''))
|
||||
path = os.path.join(cache_base, url.replace('http://',''))
|
||||
if path.endswith('/'):
|
||||
path = "%sindex.html" % path
|
||||
if os.path.isdir(path):
|
||||
|
|
Loading…
Reference in a new issue