ignore errors
This commit is contained in:
parent
501fe8cd3e
commit
e076b43904
1 changed files with 13 additions and 3 deletions
|
@ -7,6 +7,7 @@ import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
import lxml.html
|
import lxml.html
|
||||||
|
@ -20,6 +21,14 @@ DAY = 24 * 60 * 60
|
||||||
TIMEOUT = 90 * DAY
|
TIMEOUT = 90 * DAY
|
||||||
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
|
DATA_ROOT = os.path.join(settings.MEDIA_ROOT, 'imdb')
|
||||||
|
|
||||||
|
def read_url(url, timeout, max_retries=None, retry_delay=10):
    """Fetch *url* via ``ox.web.imdb.read_url``, retrying on IMDb's 500 page.

    The response is requested as text (``unicode=True``). While the body
    contains the marker ``'>500 Error - IMDb<'`` the failure is printed,
    the function sleeps, and the URL is requested again.

    Args:
        url: Address to fetch.
        timeout: Passed through as ``timeout`` on the first request only.
        max_retries: Upper bound on the number of re-requests. ``None``
            (the default) retries until a non-error page is returned,
            which is the historical behaviour.
        retry_delay: Seconds to sleep before each re-request.

    Returns:
        The first response body that does not contain the error marker.

    Raises:
        IOError: If ``max_retries`` is set and every attempt returned the
            error page.
    """
    data = ox.web.imdb.read_url(url, unicode=True, timeout=timeout)
    attempts = 0
    while '>500 Error - IMDb<' in data:
        if max_retries is not None and attempts >= max_retries:
            raise IOError('IMDb kept returning a 500 error page for %s' % url)
        attempts += 1
        print('Error', url)
        time.sleep(retry_delay)
        # NOTE(review): retries always pass timeout=0 instead of the caller's
        # value -- presumably so a stored copy of the error page is not
        # served again; confirm against ox's cache semantics.
        data = ox.web.imdb.read_url(url, unicode=True, timeout=0)
    return data
|
||||||
|
|
||||||
def get_range(from_, to):
|
def get_range(from_, to):
|
||||||
base_url = 'http://www.imdb.com'
|
base_url = 'http://www.imdb.com'
|
||||||
url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
|
url = '%s/search/title?adult=include&release_date=%s,%s&sort=release_date,asc&count=50' % (base_url, from_, to)
|
||||||
|
@ -44,7 +53,7 @@ def get_film_count(year, month=None, day=None):
|
||||||
url = get_month(year, month)
|
url = get_month(year, month)
|
||||||
else:
|
else:
|
||||||
url = get_year(year)
|
url = get_year(year)
|
||||||
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
|
data = read_url(url, timeout=TIMEOUT)
|
||||||
total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
|
total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
|
||||||
if not total:
|
if not total:
|
||||||
total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
|
total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
|
||||||
|
@ -149,7 +158,7 @@ def update_ids(year, month=None, day=None, sort=None):
|
||||||
else:
|
else:
|
||||||
urls = [url]
|
urls = [url]
|
||||||
for url in urls:
|
for url in urls:
|
||||||
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
|
data = read_url(url, timeout=TIMEOUT)
|
||||||
n = True
|
n = True
|
||||||
page = 2
|
page = 2
|
||||||
while n:
|
while n:
|
||||||
|
@ -163,6 +172,7 @@ def update_ids(year, month=None, day=None, sort=None):
|
||||||
article = article[0]
|
article = article[0]
|
||||||
else:
|
else:
|
||||||
print('no article on', '%s&page=%s' % (url, page-2))
|
print('no article on', '%s&page=%s' % (url, page-2))
|
||||||
|
ox.web.imdb.delete_url('%s&page=%s' % (url, page-2))
|
||||||
break
|
break
|
||||||
for content in article.find_class('lister-item-content'):
|
for content in article.find_class('lister-item-content'):
|
||||||
header = content.find_class('lister-item-header')[0]
|
header = content.find_class('lister-item-header')[0]
|
||||||
|
@ -221,7 +231,7 @@ def update_ids(year, month=None, day=None, sort=None):
|
||||||
#print(key, len(films), 'films')
|
#print(key, len(films), 'films')
|
||||||
if n:
|
if n:
|
||||||
#print(n)
|
#print(n)
|
||||||
data = ox.web.imdb.read_url(n, unicode=True, timeout=TIMEOUT)
|
data = read_url(n, timeout=TIMEOUT)
|
||||||
path = get_path('ids/%s.json' % key)
|
path = get_path('ids/%s.json' % key)
|
||||||
with open(path, 'w') as fd:
|
with open(path, 'w') as fd:
|
||||||
json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
|
json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
|
||||||
|
|
Loading…
Reference in a new issue