refactor
This commit is contained in:
parent
ee9e430ef8
commit
785550c753
1 changed file with 70 additions and 76 deletions
|
@ -110,12 +110,8 @@ def update_month(year, month, film_counts):
|
||||||
if film_count != film_counts.get(key):
|
if film_count != film_counts.get(key):
|
||||||
print_info(key, film_count, film_counts)
|
print_info(key, film_count, film_counts)
|
||||||
film_counts[key] = film_count
|
film_counts[key] = film_count
|
||||||
if film_count > MAX_PER_RANGE and film_count < 2*MAX_PER_RANGE:
|
r = update_ids(year, month, day, expected=film_count)
|
||||||
r = update_ids(year, month, day, sort='alpha', expected=film_count)
|
save_film_counts(film_counts)
|
||||||
save_film_counts(film_counts)
|
|
||||||
else:
|
|
||||||
r = update_ids(year, month, day, expected=film_count)
|
|
||||||
save_film_counts(film_counts)
|
|
||||||
if days_total != month_total:
|
if days_total != month_total:
|
||||||
print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
|
print('!! month and days don\'t add up: %s month vs %s days total' % (month_total, days_total))
|
||||||
else:
|
else:
|
||||||
|
@ -136,6 +132,69 @@ def parse_cast(string):
|
||||||
results[rkey] = cast[key].split(', \n')
|
results[rkey] = cast[key].split(', \n')
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
def get_films(data, year=None):
    """Parse one listing page and return the films found on it.

    The page is parsed with ``lxml.html``. Every element with class
    ``lister-item-content`` inside the first element with class ``article``
    yields one entry.

    Args:
        data: HTML source of the listing page.
        year: Value stored as the film's ``'year'`` when the listing entry
            carries no parsable year text. Defaults to ``None``.
            NOTE(review): this used to be a free variable left over from
            ``update_ids``; callers that want the old fallback should pass
            their own ``year`` here.

    Returns:
        A list of ``(film_id, film)`` tuples. ``film_id`` is the digit string
        captured from the ``title/tt<digits>`` part of the entry's link.
        ``film`` is a dict with ``'title'`` and ``'year'``, plus ``'plot'``
        and ``'genre'`` when present, plus whatever keys ``parse_cast``
        returns for the entry's cast paragraph. The list is empty when the
        page has no ``article`` element.

    Raises:
        IndexError: If an entry lacks the expected header, link, year element
            or second ``text-muted`` paragraph.
        ValueError: If the extracted year text is not an integer.
    """
    films = []
    doc = lxml.html.fromstring(data)
    article = doc.find_class('article')
    if not article:
        return films
    article = article[0]

    # Compiled once instead of once per entry.
    id_pattern = re.compile(r'title/tt(\d+)')

    for content in article.find_class('lister-item-content'):
        header = content.find_class('lister-item-header')[0]
        links = header.xpath('.//a')
        small_texts = [e.text_content() for e in header.xpath(".//small")]
        if 'Episode:' in small_texts and len(links) > 1:
            # Episode entries have two links: the first link's text is used
            # as a title prefix, the second link is the entry itself.
            title = links[0].text_content().strip() + ': '
            link = links[1]
        else:
            title = ''
            link = links[0]
        film_id = id_pattern.findall(link.attrib['href'])[0]
        title += link.text_content().strip()

        raw_year = header.find_class('lister-item-year')[0].text_content()
        # Drop parenthesised groups that contain no digits, then keep the
        # first token of the last parenthesised group, cut at an en dash.
        y = re.sub(r'\([^\d]+\)', '', raw_year)
        y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
        if y:
            try:
                y = int(y)
            except ValueError:
                # Show the offending text before propagating the error.
                print(raw_year)
                raise
        else:
            y = year

        text = content.xpath(".//p[contains(@class, 'text-muted')]")
        # The plot is taken from the second 'text-muted' paragraph.
        plot = text[1].text_content().strip()
        plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
        if plot == 'Add a Plot':
            plot = ''

        genre = content.find_class('genre')
        if genre:
            genre = genre[0].text_content().strip().split(', ')
        else:
            genre = []

        # The cast paragraph is the <p> whose class attribute is exactly ''.
        cast = content.xpath(".//p[contains(@class, '')]")
        cast = [t for t in cast if t.attrib.get('class') == '']
        if cast:
            cast = parse_cast(cast[0].text_content())

        film = {
            'title': title,
            'year': y
        }
        if plot:
            film['plot'] = plot
        if genre:
            film['genre'] = genre
        if cast:
            film.update(cast)
        films.append((film_id, film))
    return films
|
||||||
|
|
||||||
def update_ids(year, month=None, day=None, sort=None, expected=None):
|
def update_ids(year, month=None, day=None, sort=None, expected=None):
|
||||||
films = {}
|
films = {}
|
||||||
if day is not None:
|
if day is not None:
|
||||||
|
@ -147,13 +206,8 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
|
||||||
else:
|
else:
|
||||||
url = get_year(year)
|
url = get_year(year)
|
||||||
key = '%04d' % year
|
key = '%04d' % year
|
||||||
if sort == 'alpha':
|
|
||||||
urls = [
|
urls = [url]
|
||||||
url.replace('sort=release_date,asc', 'sort=alpha,asc'),
|
|
||||||
url.replace('sort=release_date,asc', 'sort=alpha,desc'),
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
urls = [url]
|
|
||||||
|
|
||||||
if not expected:
|
if not expected:
|
||||||
expected = get_film_count(year, month, day)
|
expected = get_film_count(year, month, day)
|
||||||
|
@ -171,73 +225,13 @@ def update_ids(year, month=None, day=None, sort=None, expected=None):
|
||||||
has_after = re.compile(after_link).findall(data)
|
has_after = re.compile(after_link).findall(data)
|
||||||
if has_next:
|
if has_next:
|
||||||
n = '%s&start=%s' % (url, start)
|
n = '%s&start=%s' % (url, start)
|
||||||
elif sort != 'alpha' and has_after:
|
elif has_after:
|
||||||
n = '%s%s' % (base_url, has_after[0])
|
n = '%s%s' % (base_url, has_after[0])
|
||||||
else:
|
else:
|
||||||
n = False
|
n = False
|
||||||
doc = lxml.html.fromstring(data)
|
for id, film in get_films(data):
|
||||||
article = doc.find_class('article')
|
|
||||||
if article:
|
|
||||||
article = article[0]
|
|
||||||
else:
|
|
||||||
print('no article on', '%s&start=%s' % (url, start - 2*step))
|
|
||||||
ox.web.imdb.delete_url('%s&start=%s' % (url, start - 2*step))
|
|
||||||
break
|
|
||||||
for content in article.find_class('lister-item-content'):
|
|
||||||
header = content.find_class('lister-item-header')[0]
|
|
||||||
a = header.xpath('.//a')
|
|
||||||
if 'Episode:' in [
|
|
||||||
e.text_content()
|
|
||||||
for e in header.xpath(".//small")
|
|
||||||
] and len(a) > 1:
|
|
||||||
title = a[0].text_content().strip() + ': '
|
|
||||||
a = a[1]
|
|
||||||
else:
|
|
||||||
title = ''
|
|
||||||
a = a[0]
|
|
||||||
id = re.compile('title/tt(\d+)').findall(a.attrib['href'])[0]
|
|
||||||
title += a.text_content().strip()
|
|
||||||
try:
|
|
||||||
y = header.find_class('lister-item-year')[0].text_content()
|
|
||||||
y = re.sub('\([^\d]+\)', '', y)
|
|
||||||
y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
|
|
||||||
if not y:
|
|
||||||
y = year
|
|
||||||
else:
|
|
||||||
y = int(y)
|
|
||||||
except:
|
|
||||||
print(n)
|
|
||||||
print(header.find_class('lister-item-year')[0].text_content())
|
|
||||||
raise
|
|
||||||
|
|
||||||
text = content.xpath(".//p[contains(@class, 'text-muted')]")
|
|
||||||
plot = text[1].text_content().strip()
|
|
||||||
plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
|
|
||||||
if plot == 'Add a Plot':
|
|
||||||
plot = ''
|
|
||||||
genre = content.find_class('genre')
|
|
||||||
if genre:
|
|
||||||
genre = genre[0].text_content().strip().split(', ')
|
|
||||||
else:
|
|
||||||
genre = []
|
|
||||||
cast = content.xpath(".//p[contains(@class, '')]")
|
|
||||||
cast = [t for t in cast if t.attrib.get('class') == '']
|
|
||||||
if cast:
|
|
||||||
cast = parse_cast(cast[0].text_content())
|
|
||||||
|
|
||||||
if id not in films:
|
if id not in films:
|
||||||
films[id] = {
|
films[id] = film
|
||||||
'title': title,
|
|
||||||
'year': y
|
|
||||||
}
|
|
||||||
if plot:
|
|
||||||
films[id]['plot'] = plot
|
|
||||||
if genre:
|
|
||||||
films[id]['genre'] = genre
|
|
||||||
if cast:
|
|
||||||
films[id].update(cast)
|
|
||||||
if expected and len(films) == expected and sort == 'alpha':
|
|
||||||
n = False
|
|
||||||
debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
|
debug('%s: %s of %s films - next: %s' % (key, len(films), expected, n))
|
||||||
if n:
|
if n:
|
||||||
data = read_url(n, timeout=TIMEOUT)
|
data = read_url(n, timeout=TIMEOUT)
|
||||||
|
|
Loading…
Reference in a new issue