diff --git a/oxdata/movie/imdbids.py b/oxdata/movie/imdbids.py index 2cf36b0..abf4671 100644 --- a/oxdata/movie/imdbids.py +++ b/oxdata/movie/imdbids.py @@ -7,6 +7,7 @@ import json import os import re import sys +import time from django.conf import settings import lxml.html @@ -20,6 +21,14 @@ DAY = 24 * 60 * 60 TIMEOUT = 90 * DAY DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb') +def read_url(url, timeout): + data = ox.web.imdb.read_url(url, unicode=True, timeout=timeout) + while '>500 Error - IMDb<' in data: + print('Error', url) + time.sleep(10) + data = ox.web.imdb.read_url(url, unicode=True, timeout=0) + return data + def get_range(from_, to): base_url = 'http://www.imdb.com' url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to) @@ -44,7 +53,7 @@ def get_film_count(year, month=None, day=None): url = get_month(year, month) else: url = get_year(year) - data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT) + data = read_url(url, timeout=TIMEOUT) total = re.compile('50.*?of (.*?) titles', re.DOTALL).findall(data) if not total: total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data) @@ -149,7 +158,7 @@ def update_ids(year, month=None, day=None, sort=None): else: urls = [url] for url in urls: - data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT) + data = read_url(url, timeout=TIMEOUT) n = True page = 2 while n: @@ -163,6 +172,7 @@ def update_ids(year, month=None, day=None, sort=None): article = article[0] else: print('no article on', '%s&page=%s' % (url, page-2)) + ox.web.imdb.delete_url('%s&page=%s' % (url, page-2)) break for content in article.find_class('lister-item-content'): header = content.find_class('lister-item-header')[0] @@ -221,7 +231,7 @@ def update_ids(year, month=None, day=None, sort=None): #print(key, len(films), 'films') if n: #print(n) - data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT) + data = read_url(n, timeout=TIMEOUT) path = get_path('ids/%s.json' % key) with open(path, 'w') as fd: json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)