refactor
This commit is contained in:
parent
ee9e430ef8
commit
785550c753
1 changed file with 70 additions and 76 deletions
|
@ -110,10 +110,6 @@ def update_month(year, month, film_counts):
|
|||
if film_count != film_counts.get(key):
|
||||
print_info(key, film_count, film_counts)
|
||||
film_counts[key] = film_count
|
||||
if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
|
||||
r = update_ids(year, month, day, sort='alpha', expected=film_count)
|
||||
save_film_counts(film_counts)
|
||||
else:
|
||||
r = update_ids(year, month, day, expected=film_count)
|
||||
save_film_counts(film_counts)
|
||||
if days_total != month_total:
|
||||
|
@ -136,53 +132,14 @@ def parse_cast(string):
|
|||
results[rkey] = cast[key].split(', \n')
|
||||
return results
|
||||
|
||||
def update_ids(year, month=None, day=None, sort=None, expected=None):
|
||||
films = {}
|
||||
if day is not None:
|
||||
url = get_day(year, month, day)
|
||||
key = '%04d-%02d-%02d' % (year, month, day)
|
||||
elif month is not None:
|
||||
url = get_month(year, month)
|
||||
key = '%04d-%02d' % (year, month)
|
||||
else:
|
||||
url = get_year(year)
|
||||
key = '%04d' % year
|
||||
if sort == 'alpha':
|
||||
urls = [
|
||||
url.replace('sort=release_date,asc', 'sort=alpha,asc'),
|
||||
url.replace('sort=release_date,asc', 'sort=alpha,desc'),
|
||||
]
|
||||
else:
|
||||
urls = [url]
|
||||
|
||||
if not expected:
|
||||
expected = get_film_count(year, month, day)
|
||||
|
||||
for url in urls:
|
||||
data = read_url(url, timeout=TIMEOUT)
|
||||
n = True
|
||||
step = 50
|
||||
start = 1
|
||||
while n:
|
||||
start += step
|
||||
next_link = 'start=%s&ref_=adv_nxt"' % (start)
|
||||
after_link = 'href="(.*?after=.*?&ref_=adv_nxt)"'
|
||||
has_next = re.compile(next_link).findall(data)
|
||||
has_after = re.compile(after_link).findall(data)
|
||||
if has_next:
|
||||
n = '%s&start=%s' % (url, start)
|
||||
elif sort != 'alpha' and has_after:
|
||||
n = '%s%s' % (base_url, has_after[0])
|
||||
else:
|
||||
n = False
|
||||
def get_films(data):
|
||||
films = []
|
||||
doc = lxml.html.fromstring(data)
|
||||
article = doc.find_class('article')
|
||||
if article:
|
||||
article = article[0]
|
||||
else:
|
||||
print('no article on', '%s&start=%s' % (url, start - 2*step))
|
||||
ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step))
|
||||
break
|
||||
return films
|
||||
for content in article.find_class('lister-item-content'):
|
||||
header = content.find_class('lister-item-header')[0]
|
||||
a = header.xpath('.//a')
|
||||
|
@ -225,19 +182,56 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
|
|||
if cast:
|
||||
cast = parse_cast(cast[0].text_content())
|
||||
|
||||
if id not in films:
|
||||
films[id] = {
|
||||
film = {
|
||||
'title': title,
|
||||
'year': y
|
||||
}
|
||||
if plot:
|
||||
films[id]['plot'] = plot
|
||||
film['plot'] = plot
|
||||
if genre:
|
||||
films[id]['genre'] = genre
|
||||
film['genre'] = genre
|
||||
if cast:
|
||||
films[id].update(cast)
|
||||
if expected and len(films) == expected and sort == 'alpha':
|
||||
film.update(cast)
|
||||
films.append((id, film))
|
||||
return films
|
||||
|
||||
def update_ids(year, month=None, day=None, sort=None, expected=None):
|
||||
films = {}
|
||||
if day is not None:
|
||||
url = get_day(year, month, day)
|
||||
key = '%04d-%02d-%02d' % (year, month, day)
|
||||
elif month is not None:
|
||||
url = get_month(year, month)
|
||||
key = '%04d-%02d' % (year, month)
|
||||
else:
|
||||
url = get_year(year)
|
||||
key = '%04d' % year
|
||||
|
||||
urls = [url]
|
||||
|
||||
if not expected:
|
||||
expected = get_film_count(year, month, day)
|
||||
|
||||
for url in urls:
|
||||
data = read_url(url, timeout=TIMEOUT)
|
||||
n = True
|
||||
step = 50
|
||||
start = 1
|
||||
while n:
|
||||
start += step
|
||||
next_link = 'start=%s&ref_=adv_nxt"' % (start)
|
||||
after_link = 'href="(.*?after=.*?&ref_=adv_nxt)"'
|
||||
has_next = re.compile(next_link).findall(data)
|
||||
has_after = re.compile(after_link).findall(data)
|
||||
if has_next:
|
||||
n = '%s&start=%s' % (url, start)
|
||||
elif has_after:
|
||||
n = '%s%s' % (base_url, has_after[0])
|
||||
else:
|
||||
n = False
|
||||
for id, film in get_films(data):
|
||||
if id not in films:
|
||||
films[id] = film
|
||||
debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
|
||||
if n:
|
||||
data = read_url(n, timeout=TIMEOUT)
|
||||
|
|
Loading…
Reference in a new issue