From 0bd7980f98bc038394c0a836048f2f4a4a24c0a6 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Sun, 16 Oct 2011 20:24:17 +0200
Subject: [PATCH] faster check for known ids
---
oxdata/movie/models.py | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/oxdata/movie/models.py b/oxdata/movie/models.py
index 5ba880b..6f848d3 100644
--- a/oxdata/movie/models.py
+++ b/oxdata/movie/models.py
@@ -94,15 +94,18 @@ class Imdb(models.Model):
return j
def get_new_ids(timeout=-1):
+ known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
robot = ox.cache.readUrl('http://www.imdb.com/robots.txt', timeout=timeout)
sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
sitemap = ox.cache.readUrl(sitemap_url, timeout=timeout)
urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
for url in sorted(urls, reverse=True):
- print url
s = ox.cache.readUrl(url, timeout=timeout)
ids = re.compile('http://www.imdb.com/title/tt(\d{7})/combined').findall(s)
- for i in ids:
- m, created = Imdb.objects.get_or_create(imdb=i)
- if created:
- m.update()
+ added = 0
+ for i in frozenset(ids) - known_ids:
+ m= Imdb(imdb=i)
+ m.update()
+ added += 1
+ if added:
+ print url, added