Compare commits

...

2 Commits

Author SHA1 Message Date
j 785550c753 refactor 2019-08-05 13:18:39 +02:00
j ee9e430ef8 typos 2019-08-05 12:29:25 +02:00
2 changed files with 81 additions and 80 deletions

View File

@ -110,10 +110,6 @@ def update_month(year, month, film_counts):
if film_count != film_counts.get(key):
print_info(key, film_count, film_counts)
film_counts[key] = film_count
if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
r = update_ids(year, month, day, sort='alpha', expected=film_count)
save_film_counts(film_counts)
else:
r = update_ids(year, month, day, expected=film_count)
save_film_counts(film_counts)
if days_total != month_total:
@ -136,53 +132,14 @@ def parse_cast(string):
results[rkey] = cast[key].split(', \n')
return results
def update_ids(year, month=None, day=None, sort=None, expexted=None):
films = {}
if day is not None:
url = get_day(year, month, day)
key = '%04d-%02d-%02d' % (year, month, day)
elif month is not None:
url = get_month(year, month)
key = '%04d-%02d' % (year, month)
else:
url = get_year(year)
key = '%04d' % year
if sort == 'alpha':
urls = [
url.replace('sort=release_date,asc', 'sort=alpha,asc'),
url.replace('sort=release_date,asc', 'sort=alpha,desc'),
]
else:
urls = [url]
if not expexted:
expexted = get_film_count(year, month, day)
for url in urls:
data = read_url(url, timeout=TIMEOUT)
n = True
step = 50
start = 1
while n:
start += step
next_link = 'start=%s&ref_=adv_nxt"' % (start)
after_link = 'href="(.*?after=.*?&ref_=adv_nxt)"'
has_next = re.compile(next_link).findall(data)
has_after = re.compile(after_link).findall(data)
if has_next:
n = '%s&start=%s' % (url, start)
elif sort != 'alpha' and start > MAX_PER_RANGE and has_after:
n = '%s%s' % (base_url, has_after[0])
else:
n = False
def get_films(data):
films = []
doc = lxml.html.fromstring(data)
article = doc.find_class('article')
if article:
article = article[0]
else:
print('no article on', '%s&start=%s' % (url, start - 2*step))
ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step))
break
return films
for content in article.find_class('lister-item-content'):
header = content.find_class('lister-item-header')[0]
a = header.xpath('.//a')
@ -225,19 +182,56 @@ def update_ids(year, month=None, day=None, sort=None, expexted=None):
if cast:
cast = parse_cast(cast[0].text_content())
if id not in films:
films[id] = {
film = {
'title': title,
'year': y
}
if plot:
films[id]['plot'] = plot
film['plot'] = plot
if genre:
films[id]['genre'] = genre
film['genre'] = genre
if cast:
films[id].update(cast)
if expected and len(films) == expected and sort == 'alpha':
film.update(cast)
films.append((id, film))
return films
def update_ids(year, month=None, day=None, sort=None, expected=None):
films = {}
if day is not None:
url = get_day(year, month, day)
key = '%04d-%02d-%02d' % (year, month, day)
elif month is not None:
url = get_month(year, month)
key = '%04d-%02d' % (year, month)
else:
url = get_year(year)
key = '%04d' % year
urls = [url]
if not expected:
expected = get_film_count(year, month, day)
for url in urls:
data = read_url(url, timeout=TIMEOUT)
n = True
step = 50
start = 1
while n:
start += step
next_link = 'start=%s&ref_=adv_nxt"' % (start)
after_link = 'href="(.*?after=.*?&ref_=adv_nxt)"'
has_next = re.compile(next_link).findall(data)
has_after = re.compile(after_link).findall(data)
if has_next:
n = '%s&start=%s' % (url, start)
elif has_after:
n = '%s%s' % (base_url, has_after[0])
else:
n = False
for id, film in get_films(data):
if id not in films:
films[id] = film
debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
if n:
data = read_url(n, timeout=TIMEOUT)
@ -246,7 +240,7 @@ def update_ids(year, month=None, day=None, sort=None, expexted=None):
json.dump(films, fd, indent=4, ensure_ascii=False, sort_keys=True)
r = len(films)
if r != expected:
print('%s: got %s, expected %s' % (key, r, expexted))
print('%s: got %s, expected %s' % (key, r, expected))
return r
def save_film_counts(film_counts):

View File

@ -3,6 +3,7 @@
from django.core.management.base import BaseCommand
import movie.models
import movie.imdbids
class Command(BaseCommand):
"""
@ -11,6 +12,12 @@ class Command(BaseCommand):
help = 'load ids from sites that dont support search.'
args = ''
def add_arguments(self, parser):
    """Register the command-line options understood by this command."""
    # Single boolean switch; stays False unless --debug is given.
    parser.add_argument(
        '--debug',
        action='store_true',
        dest='debug',
        default=False,
        help='print debug info',
    )
def handle(self, **options):
    """Run the command: pull ids from sites that lack search support."""
    # One month expressed in seconds, passed through as the cache timeout.
    month_in_seconds = 60 * 60 * 24 * 30
    if options.get('debug'):
        # Flip the module-level flag consumed by movie.imdbids.
        movie.imdbids.DEBUG = True
    movie.models.get_new_ids(month_in_seconds)