ignore errors
This commit is contained in:
parent
501fe8cd3e
commit
e076b43904
1 changed file with 13 additions and 3 deletions
|
@ -7,6 +7,7 @@ import json
|
|||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
from django.conf import settings
|
||||
import lxml.html
|
||||
|
@ -20,6 +21,14 @@ DAY = 24 * 60 * 60
|
|||
TIMEOUT = 90 * DAY
|
||||
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
|
||||
|
||||
def read_url(url, timeout):
    """Fetch *url* through ``ox.web.imdb.read_url`` and retry on IMDb error pages.

    The first request is made with the caller's ``timeout``. As long as the
    returned text contains the marker ``'>500 Error - IMDb<'``, the URL is
    printed, the function sleeps for 10 seconds, and the request is repeated
    with ``timeout=0``. There is no upper bound on the number of retries.

    NOTE(review): ``timeout`` is handed straight to ``ox.web.imdb.read_url``;
    ``timeout=0`` presumably forces a fresh download instead of reusing a
    cached copy of the error page -- confirm against the ox cache semantics.

    Returns the first response text that does not contain the error marker.
    """
    page = ox.web.imdb.read_url(url, unicode=True, timeout=timeout)
    while True:
        # Anything other than the IMDb 500 page is handed back unchanged.
        if '>500 Error - IMDb<' not in page:
            return page
        print('Error', url)
        # Fixed pause before asking again.
        time.sleep(10)
        page = ox.web.imdb.read_url(url, unicode=True, timeout=0)
|
||||
|
||||
def get_range(from_, to):
|
||||
base_url = 'http://www.imdb.com'
|
||||
url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
|
||||
|
@ -44,7 +53,7 @@ def get_film_count(year, month=None, day=None):
|
|||
url = get_month(year, month)
|
||||
else:
|
||||
url = get_year(year)
|
||||
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
|
||||
data = read_url(url, timeout=TIMEOUT)
|
||||
total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
|
||||
if not total:
|
||||
total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
|
||||
|
@ -149,7 +158,7 @@ def update_ids(year, month=None, day=None, sort=None):
|
|||
else:
|
||||
urls = [url]
|
||||
for url in urls:
|
||||
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
|
||||
data = read_url(url, timeout=TIMEOUT)
|
||||
n = True
|
||||
page = 2
|
||||
while n:
|
||||
|
@ -163,6 +172,7 @@ def update_ids(year, month=None, day=None, sort=None):
|
|||
article = article[0]
|
||||
else:
|
||||
print('no article on', '%s&page=%s' % (url, page-2))
|
||||
ox.web.imdb.delete_url('%s&page=%s' % (url, page-2))
|
||||
break
|
||||
for content in article.find_class('lister-item-content'):
|
||||
header = content.find_class('lister-item-header')[0]
|
||||
|
@ -221,7 +231,7 @@ def update_ids(year, month=None, day=None, sort=None):
|
|||
#print(key, len(films), 'films')
|
||||
if n:
|
||||
#print(n)
|
||||
data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
|
||||
data = read_url(n, timeout=TIMEOUT)
|
||||
path = get_path('ids/%s.json' % key)
|
||||
with open(path, 'w') as fd:
|
||||
json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
|
||||
|
|
Loading…
Reference in a new issue