faster check for known ids
This commit is contained in:
parent
a465654353
commit
0bd7980f98
1 changed file with 8 additions and 5 deletions
|
@@ -94,15 +94,18 @@ class Imdb(models.Model):
|
||||||
return j
|
return j
|
||||||
|
|
||||||
def get_new_ids(timeout=-1):
|
def get_new_ids(timeout=-1):
|
||||||
|
known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
|
||||||
robot = ox.cache.readUrl('http://www.imdb.com/robots.txt', timeout=timeout)
|
robot = ox.cache.readUrl('http://www.imdb.com/robots.txt', timeout=timeout)
|
||||||
sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
|
sitemap_url = re.compile('\nSitemap: (http.+)').findall(robot)[0]
|
||||||
sitemap = ox.cache.readUrl(sitemap_url, timeout=timeout)
|
sitemap = ox.cache.readUrl(sitemap_url, timeout=timeout)
|
||||||
urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
|
urls = re.compile('<loc>(.+?)</loc>').findall(sitemap)
|
||||||
for url in sorted(urls, reverse=True):
|
for url in sorted(urls, reverse=True):
|
||||||
print url
|
|
||||||
s = ox.cache.readUrl(url, timeout=timeout)
|
s = ox.cache.readUrl(url, timeout=timeout)
|
||||||
ids = re.compile('<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>').findall(s)
|
ids = re.compile('<loc>http://www.imdb.com/title/tt(\d{7})/combined</loc>').findall(s)
|
||||||
for i in ids:
|
added = 0
|
||||||
m, created = Imdb.objects.get_or_create(imdb=i)
|
for i in frozenset(ids) - known_ids:
|
||||||
if created:
|
m= Imdb(imdb=i)
|
||||||
m.update()
|
m.update()
|
||||||
|
added += 1
|
||||||
|
if added:
|
||||||
|
print url, added
|
||||||
|
|
Loading…
Reference in a new issue