Refactor cache and make it self-expire after 30 days

This commit is contained in:
j 2008-03-18 14:58:01 +00:00
parent 6bedcaa9d6
commit 36a70bb365
3 changed files with 66 additions and 41 deletions

View File

@ -30,26 +30,29 @@ def torrentsWeLike(link):
return True
return False
def movieType(title):
  """Classify a torrent release *title* into a source/quality tag.

  Checks are ordered from least to most desirable source so that the
  most specific match wins (e.g. 'dvdrip' is tested before the looser
  'dvdr'). Returns '' when no known tag is found.
  """
  title = title.lower()
  if 'trailer' in title:
    return 'trailer'
  if 'cam' in title:
    return 'cam'
  if 'vcd' in title:
    return 'vcd'
  # Telesync/telecine releases hide under several abbreviations.
  for key in ('telesync', 'telecine', '.ts', '.tc', ' tc ', ' ts', 'ts-screener'):
    if key in title:
      return 'telecine'
  for key in ('dvdrip', 'dvdscrs', 'dvdscr', 'dvd rip'):
    if key in title:
      return 'dvdrip'
  if 'screener' in title:
    return 'screener'
  if 'xvid' in title:
    return 'Xvid'
  if '1080p' in title:
    return '1080p'
  if '720p' in title:
    return '720p'
  # Must come after the 'dvdrip' loop: 'dvdrip' also contains 'dvdr'.
  if 'dvdr' in title:
    return 'DVDR'
  return ''
@ -61,7 +64,7 @@ def filterMovies(movies):
movie['imdb'] = imdb_id[0]
else:
movie['imdb'] = ''
movie['source_type'] = movieType(movie)
movie['source_type'] = movieType(movie['title'])
m2.append(movie)
return m2

50
scrapeit/cache.py Normal file
View File

@ -0,0 +1,50 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import urllib2
from urllib import quote
import re, time
import os
import time
import utils
import chardet
import imdbpy_utils
cache_base = "/var/cache/scrapeit/cache/"
cache_timeout = 30*24*60*60 # 30 days
def read_url(url):
  """Return the body of *url*, using an on-disk cache under cache_base.

  A cached copy is reused while it is younger than cache_timeout
  (30 days); otherwise the URL is fetched via utils.read_url and the
  cache file is (re)written.
  """
  # Map the URL onto a file path inside the cache tree.
  cache_file = os.path.join(cache_base, url.replace('http://',''))
  if cache_file.endswith('/'):
    cache_file = "%sindex.html" % cache_file
  if os.path.isdir(cache_file):
    cache_file = os.path.join(cache_file, "index.html")
  # Check existence BEFORE stat(): a cold cache must not raise OSError.
  if os.path.exists(cache_file):
    ctime = os.stat(cache_file).st_ctime
    file_age = time.mktime(time.localtime()) - ctime
    if file_age < cache_timeout:
      f = open(cache_file)
      data = f.read()
      f.close()
      return data
  # Cache miss or stale entry: fetch a fresh copy and store it.
  data = utils.read_url(url)
  folder = os.path.dirname(cache_file)
  if not os.path.exists(folder):
    os.makedirs(folder)
  f = open(cache_file, 'w')
  f.write(data)
  f.close()
  return data
def read_url_utf8(url):
  """Fetch *url* through the cache and decode the bytes to unicode.

  chardet guesses the encoding; latin-1 (which can decode any byte
  string) is the fallback when detection is inconclusive.
  """
  raw = read_url(url)
  guessed = chardet.detect(raw)['encoding']
  return unicode(raw, guessed or 'latin-1')

View File

@ -18,36 +18,8 @@ import utils
import chardet
import imdbpy_utils
cache_base = "/var/cache/scrapeit/cache/"
from cache import read_url, read_url_utf8
def read_url_utf8(url):
  """Read *url* via the caching read_url and return it as unicode.

  Uses chardet to guess the charset, defaulting to latin-1 when the
  guess comes back empty.
  """
  raw = read_url(url)
  guess = chardet.detect(raw)['encoding']
  if not guess:
    guess = 'latin-1'
  return unicode(raw, guess)
def read_url(url):
  """Return the body of *url*, caching it on disk under cache_base.

  A previously cached copy is reused unconditionally (no expiry);
  otherwise the URL is fetched via utils.read_url and written to the
  cache before being returned.
  """
  # Derive the cache file path from the URL.
  path = os.path.join(cache_base, url.replace('http://',''))
  if path.endswith('/'):
    path = "%sindex.html" % path
  if os.path.isdir(path):
    path = "%s/index.html" % path
  if not os.path.exists(path):
    # First request for this URL: fetch it and populate the cache.
    data = utils.read_url(url)
    folder = os.path.dirname(path)
    if not os.path.exists(folder):
      os.makedirs(folder)
    f = open(path, 'w')
    f.write(data)
    f.close()
    return data
  # Cache hit: serve the stored copy.
  f = open(path)
  data = f.read()
  f.close()
  return data
def _get_data(url):
data = None
try: