Compare commits
4 commits: fe06a8c664...501fe8cd3e
| Author | SHA1 | Date |
|---|---|---|
| | 501fe8cd3e | |
| | 844d25008b | |
| | e22d5c5ad0 | |
| | ca50d091a8 | |
3 changed files with 54 additions and 4 deletions
```diff
@@ -43,7 +43,7 @@ def get_ids():
             print('missing impawards', ox.web.impawards.get_url(id))

     for id in ox.web.criterion.get_ids():
-        if id in ('626', '835'):
+        if id in ('626', '835', '1079', '28907'):
             continue
         if models.MovieId.objects.all().filter(criterion_id=id).count() == 0:
             print('criterion', id)
```
```diff
@@ -46,6 +46,8 @@ def get_film_count(year, month=None, day=None):
     url = get_year(year)
     data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
     total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
+    if not total:
+        total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
     if total:
         return int(total[0].replace(',', ''))
     print('no movies', url)
```
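For context, a rough sketch of what the new fallback regex matches when the `lister-current-last-item` span is missing from the page. The `data` string below is a made-up stand-in for the text returned by `ox.web.imdb.read_url`; only the `" N,NNN titles"` fragment matters:

```python
import re

# Made-up stand-in for the IMDb search-page text.
data = 'Showing 1-50 of 12,345 titles\n'

# Same fallback pattern as added in this commit.
total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
if total:
    print(int(total[0].replace(',', '')))  # -> 12345
```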
```diff
@@ -114,6 +116,19 @@ def update_month(year, month, film_counts):
                 print('%s: count %s, got ids %s' % (key, film_counts[key], r))
                 save_film_counts(film_counts)

+def parse_cast(string):
+    results = {}
+    for part in string.split('|'):
+        cast = iter([t.strip() for t in part.split(':\n')])
+        cast = dict(zip(cast, cast))
+        for key in cast:
+            rkey = key.lower()
+            rkey = {
+                'director': 'directors',
+                'star': 'stars',
+            }.get(rkey, rkey)
+            results[rkey] = cast[key].split(', \n')
+    return results

 def update_ids(year, month=None, day=None, sort=None):
     films = {}
```
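A minimal, self-contained sketch of how the new `parse_cast` helper behaves. The sample string is invented, but it mirrors the `':\n'` and `', \n'` separators the function splits on:

```python
def parse_cast(string):
    # Same logic as the parse_cast added in this diff: split the lister-item
    # cast text into "Label:\nvalue" parts and map singular labels to plural keys.
    results = {}
    for part in string.split('|'):
        cast = iter([t.strip() for t in part.split(':\n')])
        cast = dict(zip(cast, cast))
        for key in cast:
            rkey = key.lower()
            rkey = {
                'director': 'directors',
                'star': 'stars',
            }.get(rkey, rkey)
            results[rkey] = cast[key].split(', \n')
    return results

# Invented input shaped like an IMDb lister-item cast paragraph.
sample = "Director:\nJane Doe| \n    Stars:\nActor One, \nActor Two"
print(parse_cast(sample))
# -> {'directors': ['Jane Doe'], 'stars': ['Actor One', 'Actor Two']}
```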
```diff
@@ -149,10 +164,20 @@ def update_ids(year, month=None, day=None, sort=None):
            else:
                print('no article on', '%s&page=%s' % (url, page-2))
                break
-            for header in article.find_class('lister-item-header'):
-                a = header.xpath('.//a')[0]
+            for content in article.find_class('lister-item-content'):
+                header = content.find_class('lister-item-header')[0]
+                a = header.xpath('.//a')
+                if 'Episode:' in [
+                    e.text_content()
+                    for e in header.xpath(".//small")
+                ] and len(a) > 1:
+                    title = a[0].text_content().strip() + ': '
+                    a = a[1]
+                else:
+                    title = ''
+                    a = a[0]
                 id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                title = a.text_content().strip()
+                title += a.text_content().strip()
                 try:
                     y = header.find_class('lister-item-year')[0].text_content()
                     y = re.sub('\([^\d]+\)', '', y)
```
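To make the episode handling concrete, here is a small sketch of the new header logic run against made-up lister-item markup (the ids and titles below are placeholders, not real IMDb entries):

```python
import lxml.html

# Invented markup mirroring the shape of an IMDb lister item for a TV episode:
# a <small>Episode:</small> marker and two links (series, then episode).
content = lxml.html.fromstring(
    '<div class="lister-item-content">'
    '<h3 class="lister-item-header">'
    '<a href="/title/tt1234567/">Some Series</a> '
    '<small>Episode:</small> '
    '<a href="/title/tt7654321/">Some Episode</a>'
    '</h3>'
    '</div>'
)
header = content.find_class('lister-item-header')[0]
a = header.xpath('.//a')
if 'Episode:' in [
    e.text_content()
    for e in header.xpath(".//small")
] and len(a) > 1:
    # Episode entry: prefix the series title, then use the episode link.
    title = a[0].text_content().strip() + ': '
    a = a[1]
else:
    title = ''
    a = a[0]
title += a.text_content().strip()
print(title)  # -> Some Series: Some Episode
```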
```diff
@@ -165,11 +190,34 @@ def update_ids(year, month=None, day=None, sort=None):
                     print(n)
                     print(header.find_class('lister-item-year')[0].text_content())
                     raise
+
+                text = content.xpath(".//p[contains(@class, 'text-muted')]")
+                plot = text[1].text_content().strip()
+                plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
+                if plot == 'Add a Plot':
+                    plot = ''
+                genre = content.find_class('genre')
+                if genre:
+                    genre = genre[0].text_content().strip().split(', ')
+                else:
+                    genre = []
+                cast = content.xpath(".//p[contains(@class, '')]")
+                cast = [t for t in cast if t.attrib.get('class') == '']
+                if cast:
+                    cast = parse_cast(cast[0].text_content())
+
                 if id not in films:
                     films[id] = {
                         'title': title,
                         'year': y
                     }
+                    if plot:
+                        films[id]['plot'] = plot
+                    if genre:
+                        films[id]['genre'] = genre
+                    if cast:
+                        films[id].update(cast)
+
             #print(key, len(films), 'films')
             if n:
                 #print(n)
```
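A small illustration of the plot cleanup added above; both sample strings are invented, but they follow the two cases the code checks for (a 'See full summary »' suffix and the 'Add a Plot' placeholder):

```python
# Invented sample values shaped like IMDb lister-item plot text.
for raw in ('A short synopsis. See full summary\xa0»', 'Add a Plot'):
    plot = raw.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
    if plot == 'Add a Plot':
        plot = ''
    print(repr(plot))
# -> 'A short synopsis.'
# -> ''
```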
```diff
@@ -28,6 +28,8 @@ actions.register(getIds)
 def getData(request, data):
     response = json_response()
     id = data['id']
+    if isinstance(id, int):
+        id = str(id)
     if len(id) == 7:
         i, created = models.Imdb.objects.get_or_create(imdb=id)
         if created:
```
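For reference, a minimal sketch of what the new isinstance check does: clients may send the IMDb id as a bare integer rather than a string, and coercing it keeps the existing `len(id) == 7` branch working. The payloads below are hypothetical:

```python
# Hypothetical request payloads; the view only needs id to be a 7-character string.
for data in ({'id': '0123456'}, {'id': 1234567}):
    id = data['id']
    if isinstance(id, int):
        id = str(id)
    print(repr(id), len(id) == 7)
# -> '0123456' True
# -> '1234567' True
```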