ignore errors

j 2018-06-03 14:04:02 +02:00
parent 501fe8cd3e
commit e076b43904

@@ -7,6 +7,7 @@ import json
 import os
 import re
 import sys
+import time
 
 from django.conf import settings
 import lxml.html
@ -20,6 +21,14 @@ DAY = 24 * 60 * 60
TIMEOUT = 90 * DAY TIMEOUT = 90 * DAY
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb') DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
def read_url(url, timeout):
data = ox.web.imdb.read_url(url, unicode=True, timeout=timeout)
while '>500 Error - IMDb<' in data:
print('Error', url)
time.sleep(10)
data = ox.web.imdb.read_url(url, unicode=True, timeout=0)
return data
def get_range(from_, to): def get_range(from_, to):
base_url = 'http://www.imdb.com' base_url = 'http://www.imdb.com'
url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to) url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
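
The new read_url wrapper retries indefinitely whenever the cached response is IMDb's 500 error page, bypassing the cache (timeout=0) on each retry. A minimal sketch of the same idea with a bounded retry count, in case the error page never goes away (the max_retries cap and the RuntimeError are illustrative additions, not part of this commit):

import time
import ox.web.imdb

def read_url_bounded(url, timeout, max_retries=5):
    # First attempt honors the normal cache timeout.
    data = ox.web.imdb.read_url(url, unicode=True, timeout=timeout)
    retries = 0
    while '>500 Error - IMDb<' in data:
        if retries >= max_retries:
            raise RuntimeError('IMDb keeps returning 500 for %s' % url)
        print('Error', url)
        time.sleep(10)
        # timeout=0 skips the cached copy and fetches the page again.
        data = ox.web.imdb.read_url(url, unicode=True, timeout=0)
        retries += 1
    return data
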
@@ -44,7 +53,7 @@ def get_film_count(year, month=None, day=None):
         url = get_month(year, month)
     else:
         url = get_year(year)
-    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
+    data = read_url(url, timeout=TIMEOUT)
     total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
    if not total:
         total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
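
For reference, the two patterns in get_film_count cover IMDb's two result-count layouts: a pager line ("50 ... of N titles") and a plain "N titles" string for short result lists. A quick illustration against made-up markup (both input strings are hypothetical, trimmed to what the regexes match):

import re

paged = '<span class="lister-current-last-item">50</span> of 1,234 titles'
print(re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(paged))
# ['1,234']

short = ' 42 titles\n'
print(re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(short))
# ['42']
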
@@ -149,7 +158,7 @@ def update_ids(year, month=None, day=None, sort=None):
     else:
         urls = [url]
     for url in urls:
-        data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
+        data = read_url(url, timeout=TIMEOUT)
         n = True
         page = 2
         while n:
@@ -163,6 +172,7 @@ def update_ids(year, month=None, day=None, sort=None):
                 article = article[0]
             else:
                 print('no article on', '%s&page=%s' % (url, page-2))
+                ox.web.imdb.delete_url('%s&page=%s' % (url, page-2))
                 break
         for content in article.find_class('lister-item-content'):
             header = content.find_class('lister-item-header')[0]
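
A note on the added delete_url call: read_url caches pages for up to TIMEOUT (90 days), so a page with no article element would otherwise be re-parsed from the cache on every run; evicting it forces a fresh download next time. The same invalidate-on-parse-failure pattern in isolation (fetch_article, the xpath check, and the inline TIMEOUT constant are illustrative, not the commit's code):

import lxml.html
import ox.web.imdb

TIMEOUT = 90 * 24 * 60 * 60  # mirror the module's 90-day cache window

def fetch_article(url):
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    article = lxml.html.document_fromstring(data).xpath('//article')
    if not article:
        # Drop the unusable cached copy so the next run re-fetches it.
        ox.web.imdb.delete_url(url)
        return None
    return article[0]
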
@@ -221,7 +231,7 @@ def update_ids(year, month=None, day=None, sort=None):
             #print(key, len(films), 'films')
         if n:
             #print(n)
-            data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
+            data = read_url(n, timeout=TIMEOUT)
         path = get_path('ids/%s.json' % key)
         with open(path, 'w') as fd:
             json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)