fix imdb parser
This commit is contained in:
parent
031439ccd5
commit
ef8bc68f79
2 changed files with 36 additions and 25 deletions
|
@ -21,6 +21,13 @@ DAY = 24 * 60 * 60
|
||||||
TIMEOUT = 90 * DAY
|
TIMEOUT = 90 * DAY
|
||||||
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
|
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
|
||||||
|
|
||||||
|
DEBUG = False
|
||||||
|
|
||||||
|
def debug(*args, **kwargs):
    """Print the given arguments only when the module-level DEBUG flag is truthy.

    All positional and keyword arguments are forwarded unchanged to the
    built-in print(); when DEBUG is falsy the call does nothing.
    """
    if not DEBUG:
        return
    print(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def read_url(url, timeout):
|
def read_url(url, timeout):
|
||||||
data = ox.web.imdb.read_url(url, unicode=True, timeout=timeout)
|
data = ox.web.imdb.read_url(url, unicode=True, timeout=timeout)
|
||||||
while '>500 Error - IMDb<' in data:
|
while '>500 Error - IMDb<' in data:
|
||||||
|
@ -85,11 +92,9 @@ def update_year(year, film_counts):
|
||||||
if film_count != film_counts.get(key):
|
if film_count != film_counts.get(key):
|
||||||
print_info(key, film_count, film_counts)
|
print_info(key, film_count, film_counts)
|
||||||
film_counts[key] = film_count
|
film_counts[key] = film_count
|
||||||
update_month(year, month, film_counts)
|
update_month(year, month, film_counts, expected=film_count)
|
||||||
else:
|
else:
|
||||||
r = update_ids(year)
|
r = update_ids(year)
|
||||||
if r != film_counts[key]:
|
|
||||||
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
|
||||||
save_film_counts(film_counts)
|
save_film_counts(film_counts)
|
||||||
|
|
||||||
def update_month(year, month, film_counts):
|
def update_month(year, month, film_counts):
|
||||||
|
@ -106,23 +111,17 @@ def update_month(year, month, film_counts):
|
||||||
print_info(key, film_count, film_counts)
|
print_info(key, film_count, film_counts)
|
||||||
film_counts[key] = film_count
|
film_counts[key] = film_count
|
||||||
if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
|
if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
|
||||||
r = update_ids(year, month, day, sort='alpha')
|
r = update_ids(year, month, day, sort='alpha', expected=film_count)
|
||||||
if r != film_counts[key]:
|
|
||||||
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
|
||||||
save_film_counts(film_counts)
|
save_film_counts(film_counts)
|
||||||
elif film_count > MAX_PER_RANGE:
|
elif film_count > MAX_PER_RANGE:
|
||||||
print(key, '!!!to many per day')
|
print(key, '!!!to many per day', film_count, key)
|
||||||
else:
|
else:
|
||||||
r = update_ids(year, month, day)
|
r = update_ids(year, month, day, expected=film_count)
|
||||||
if r != film_counts[key]:
|
|
||||||
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
|
||||||
save_film_counts(film_counts)
|
save_film_counts(film_counts)
|
||||||
if days_total != month_total:
|
if days_total != month_total:
|
||||||
print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
|
print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
|
||||||
else:
|
else:
|
||||||
r = update_ids(year, month)
|
r = update_ids(year, month, expected=film_count)
|
||||||
if r != film_counts[key]:
|
|
||||||
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
|
||||||
save_film_counts(film_counts)
|
save_film_counts(film_counts)
|
||||||
|
|
||||||
def parse_cast(string):
|
def parse_cast(string):
|
||||||
|
@ -139,7 +138,7 @@ def parse_cast(string):
|
||||||
results[rkey] = cast[key].split(', \n')
|
results[rkey] = cast[key].split(', \n')
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def update_ids(year, month=None, day=None, sort=None):
|
def update_ids(year, month=None, day=None, sort=None, expexted=None):
|
||||||
films = {}
|
films = {}
|
||||||
if day is not None:
|
if day is not None:
|
||||||
url = get_day(year, month, day)
|
url = get_day(year, month, day)
|
||||||
|
@ -157,16 +156,23 @@ def update_ids(year, month=None, day=None, sort=None):
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
urls = [url]
|
urls = [url]
|
||||||
|
|
||||||
|
if not expexted:
|
||||||
|
expexted = get_film_count(year, month, day)
|
||||||
|
|
||||||
for url in urls:
|
for url in urls:
|
||||||
data = read_url(url, timeout=TIMEOUT)
|
data = read_url(url, timeout=TIMEOUT)
|
||||||
n = True
|
n = True
|
||||||
step = 50
|
step = 50
|
||||||
start = 1
|
start = 1
|
||||||
while n:
|
while n:
|
||||||
n = re.compile('Next »</a>', re.DOTALL).findall(data)
|
|
||||||
if n:
|
|
||||||
n = '%s&start=%s' % (url, start)
|
|
||||||
start += step
|
start += step
|
||||||
|
next_link = 'start=%s&ref_=adv_nxt"' % (start)
|
||||||
|
has_next = re.compile(next_link).findall(data)
|
||||||
|
if has_next:
|
||||||
|
n = '%s&start=%s' % (url, start)
|
||||||
|
else:
|
||||||
|
n = False
|
||||||
doc = lxml.html.fromstring(data)
|
doc = lxml.html.fromstring(data)
|
||||||
article = doc.find_class('article')
|
article = doc.find_class('article')
|
||||||
if article:
|
if article:
|
||||||
|
@ -228,21 +234,26 @@ def update_ids(year, month=None, day=None, sort=None):
|
||||||
films[id]['genre'] = genre
|
films[id]['genre'] = genre
|
||||||
if cast:
|
if cast:
|
||||||
films[id].update(cast)
|
films[id].update(cast)
|
||||||
|
if expected and len(films) == expected and sort == 'alpha':
|
||||||
#print(key, len(films), 'films')
|
n = False
|
||||||
|
debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
|
||||||
if n:
|
if n:
|
||||||
#print(n)
|
|
||||||
data = read_url(n, timeout=TIMEOUT)
|
data = read_url(n, timeout=TIMEOUT)
|
||||||
path = get_path('ids/%s.json' % key)
|
path = get_path('ids/%s.json' % key)
|
||||||
with open(path, 'w') as fd:
|
with open(path, 'w') as fd:
|
||||||
json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
|
json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
|
||||||
return len(films)
|
r = len(films)
|
||||||
|
if r != expected:
|
||||||
|
print('%s: got %s, expected %s' % (key, r, expexted))
|
||||||
|
return r
|
||||||
|
|
||||||
def save_film_counts(film_counts):
|
def save_film_counts(film_counts):
|
||||||
with open(get_path('film_counts.json'), 'w') as fd:
|
with open(get_path('film_counts.json'), 'w') as fd:
|
||||||
json.dump(film_counts, fd, indent=4, sort_keys=True)
|
json.dump(film_counts, fd, indent=4, sort_keys=True)
|
||||||
|
|
||||||
def update_index():
|
def update_index(from_year=None):
|
||||||
|
if from_year is None:
|
||||||
|
from_year = 1874
|
||||||
film_counts_json = get_path('film_counts.json')
|
film_counts_json = get_path('film_counts.json')
|
||||||
if os.path.exists(film_counts_json):
|
if os.path.exists(film_counts_json):
|
||||||
with open(film_counts_json) as fd:
|
with open(film_counts_json) as fd:
|
||||||
|
@ -250,7 +261,7 @@ def update_index():
|
||||||
else:
|
else:
|
||||||
film_counts = {}
|
film_counts = {}
|
||||||
|
|
||||||
for year in range(1894, datetime.now().year+1):
|
for year in range(from_year, datetime.now().year+1):
|
||||||
film_count = get_film_count(year)
|
film_count = get_film_count(year)
|
||||||
key = '%s' % year
|
key = '%s' % year
|
||||||
if film_count != film_counts.get(key):
|
if film_count != film_counts.get(key):
|
||||||
|
|
|
@ -255,13 +255,13 @@ class Imdb(models.Model):
|
||||||
j['year'] = int(j['year'])
|
j['year'] = int(j['year'])
|
||||||
return j
|
return j
|
||||||
|
|
||||||
def get_new_ids(timeout=-1):
|
def get_new_ids(timeout=-1, from_year=None):
|
||||||
new_ids_cache = '/tmp/missing.json'
|
new_ids_cache = '/tmp/missing.json'
|
||||||
if os.path.exists(new_ids_cache):
|
if os.path.exists(new_ids_cache):
|
||||||
with open(new_ids_cache) as fd:
|
with open(new_ids_cache) as fd:
|
||||||
new_ids = set(json.load(fd))
|
new_ids = set(json.load(fd))
|
||||||
else:
|
else:
|
||||||
update_index()
|
update_index(from_year)
|
||||||
known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
|
known_ids = frozenset([i['imdb'] for i in Imdb.objects.all().values('imdb')])
|
||||||
new_ids = get_unknown_ids(known_ids)
|
new_ids = get_unknown_ids(known_ids)
|
||||||
if new_ids:
|
if new_ids:
|
||||||
|
|
Loading…
Reference in a new issue