def get_new_ids(timeout=-1):
    """Scan IMDb's sitemaps for title ids that have no ``Imdb`` row yet.

    Reads ``robots.txt``, follows its first ``Sitemap:`` entry to the
    sitemap index, and then reads every sitemap listed there in
    reverse-sorted URL order.  Each ``/title/tt<7 digits>/combined`` link
    whose id is not already stored is turned into a new ``Imdb`` instance
    and ``update()`` is called on it.
    NOTE(review): ``update()`` is defined elsewhere; presumably it fetches
    the record and saves it -- confirm.

    For every sitemap that produced at least one new id, the sitemap URL
    and the number of ids added are printed.

    ``timeout`` is passed unchanged to every ``ox.cache.readUrl`` call.

    Raises ``IndexError`` when ``robots.txt`` contains no ``Sitemap:`` line.
    """
    # Mutable on purpose: ids handled during this run are recorded too, so
    # an id listed in more than one sitemap file is only imported once.
    known_ids = set(row['imdb'] for row in Imdb.objects.all().values('imdb'))

    robot = ox.cache.readUrl('http://www.imdb.com/robots.txt', timeout=timeout)
    sitemap_url = re.compile(r'\nSitemap: (http.+)').findall(robot)[0]
    sitemap = ox.cache.readUrl(sitemap_url, timeout=timeout)

    # Sitemap index entries carry their URL inside <loc> elements.  A bare
    # '(.+?)' would match every single character of the document instead.
    urls = re.compile(r'<loc>(.+?)</loc>').findall(sitemap)

    # Compiled once here rather than on every loop iteration.
    title_id = re.compile(r'http://www.imdb.com/title/tt(\d{7})/combined')

    for url in sorted(urls, reverse=True):
        s = ox.cache.readUrl(url, timeout=timeout)
        added = 0
        for imdb_id in title_id.findall(s):
            if imdb_id in known_ids:
                continue
            known_ids.add(imdb_id)
            m = Imdb(imdb=imdb_id)
            m.update()
            added += 1
        if added:
            # Same output as ``print url, added``; also valid on Python 3.
            print('%s %s' % (url, added))