faster check for known ids

This commit is contained in:
j 2011-10-16 20:24:17 +02:00
parent a465654353
commit 0bd7980f98

View file

@ -94,15 +94,18 @@ class Imdb(models.Model):
return j return j
def get_new_ids(timeout=-1):
    """Create an Imdb record for every sitemap title id not yet stored.

    Reads the sitemap index named in IMDb's robots.txt, walks the listed
    sitemap files in reverse-sorted URL order, and for each 7-digit title
    id that is not already known builds ``Imdb(imdb=id)`` and calls its
    ``update()`` method (presumably this fetches and saves the record;
    confirm against ``Imdb.update``).

    timeout -- passed unchanged to every ``ox.cache.readUrl`` call.

    Raises IndexError if robots.txt contains no ``Sitemap:`` line.
    Prints "<url> <count>" for each sitemap file that yielded new ids.
    """
    # Load all stored ids once so membership is an in-memory set lookup
    # instead of one get_or_create query per id.
    known_ids = set(Imdb.objects.values_list('imdb', flat=True))
    robot = ox.cache.readUrl('http://www.imdb.com/robots.txt', timeout=timeout)
    sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
    sitemap = ox.cache.readUrl(sitemap_url, timeout=timeout)
    urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
    # Compiled once; the pattern does not change between sitemap files.
    title_id = re.compile(
        r'<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>')
    for url in sorted(urls, reverse=True):
        s = ox.cache.readUrl(url, timeout=timeout)
        added = 0
        for i in frozenset(title_id.findall(s)) - known_ids:
            m = Imdb(imdb=i)
            m.update()
            # Remember the id so a later sitemap file listing the same
            # title does not create it a second time.
            known_ids.add(i)
            added += 1
        if added:
            # Same output as ``print url, added`` on Python 2, and still
            # valid syntax on Python 3.
            print('%s %s' % (url, added))