Compare commits
No commits in common. "501fe8cd3e1bd33b29eac6ffce85b4404eb46b61" and "fe06a8c6645396f6c6149b896f9509e3c556ef7d" have entirely different histories.
501fe8cd3e
...
fe06a8c664
3 changed files with 4 additions and 54 deletions
|
|
@@ -43,7 +43,7 @@ def get_ids():
|
||||||
print('missing impawards', ox.web.impawards.get_url(id))
|
print('missing impawards', ox.web.impawards.get_url(id))
|
||||||
|
|
||||||
for id in ox.web.criterion.get_ids():
|
for id in ox.web.criterion.get_ids():
|
||||||
if id in ('626', '835', '1079', '28907'):
|
if id in ('626', '835'):
|
||||||
continue
|
continue
|
||||||
if models.MovieId.objects.all().filter(criterion_id=id).count() == 0:
|
if models.MovieId.objects.all().filter(criterion_id=id).count() == 0:
|
||||||
print('criterion', id)
|
print('criterion', id)
|
||||||
|
|
|
||||||
|
|
@@ -46,8 +46,6 @@ def get_film_count(year, month=None, day=None):
|
||||||
url = get_year(year)
|
url = get_year(year)
|
||||||
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
|
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
|
||||||
total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
|
total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
|
||||||
if not total:
|
|
||||||
total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
|
|
||||||
if total:
|
if total:
|
||||||
return int(total[0].replace(',', ''))
|
return int(total[0].replace(',', ''))
|
||||||
print('no movies', url)
|
print('no movies', url)
|
||||||
|
|
@@ -116,19 +114,6 @@ def update_month(year, month, film_counts):
|
||||||
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
print('%s: count %s, got ids %s' % (key, film_counts[key], r))
|
||||||
save_film_counts(film_counts)
|
save_film_counts(film_counts)
|
||||||
|
|
||||||
def parse_cast(string):
    """Parse a credits text blob into a mapping of role -> list of names.

    The input is split into sections on '|'.  Inside a section, labels and
    values are separated by a colon followed by a newline, and the names
    inside a value are separated by a comma, a space and a newline.

    Labels are lower-cased; the singular labels 'director' and 'star' are
    mapped to 'directors' and 'stars'.  Any other label is kept as-is
    (lower-cased).

    Each name is stripped of surrounding whitespace, and empty names are
    dropped.  A label that ends up with no names at all is omitted, so an
    empty section never produces a list containing an empty string.

    Returns a dict; an input without any label/value pair gives {}.
    """
    # Built once per call instead of once per label.
    aliases = {
        'director': 'directors',
        'star': 'stars',
    }
    results = {}
    for part in string.split('|'):
        fields = [t.strip() for t in part.split(':\n')]
        # Consecutive fields form (label, value) pairs; a trailing field
        # without a partner is ignored.
        for label, value in zip(fields[::2], fields[1::2]):
            names = [name.strip() for name in value.split(', \n')]
            names = [name for name in names if name]
            if not names:
                continue
            key = label.lower()
            results[aliases.get(key, key)] = names
    return results
|
|
||||||
|
|
||||||
def update_ids(year, month=None, day=None, sort=None):
|
def update_ids(year, month=None, day=None, sort=None):
|
||||||
films = {}
|
films = {}
|
||||||
|
|
@@ -164,20 +149,10 @@ def update_ids(year, month=None, day=None, sort=None):
|
||||||
else:
|
else:
|
||||||
print('no article on', '%s&page=%s' % (url, page-2))
|
print('no article on', '%s&page=%s' % (url, page-2))
|
||||||
break
|
break
|
||||||
for content in article.find_class('lister-item-content'):
|
for header in article.find_class('lister-item-header'):
|
||||||
header = content.find_class('lister-item-header')[0]
|
a = header.xpath('.//a')[0]
|
||||||
a = header.xpath('.//a')
|
|
||||||
if 'Episode:' in [
|
|
||||||
e.text_content()
|
|
||||||
for e in header.xpath(".//small")
|
|
||||||
] and len(a) > 1:
|
|
||||||
title = a[0].text_content().strip() + ': '
|
|
||||||
a = a[1]
|
|
||||||
else:
|
|
||||||
title = ''
|
|
||||||
a = a[0]
|
|
||||||
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
|
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
|
||||||
title += a.text_content().strip()
|
title = a.text_content().strip()
|
||||||
try:
|
try:
|
||||||
y = header.find_class('lister-item-year')[0].text_content()
|
y = header.find_class('lister-item-year')[0].text_content()
|
||||||
y = re.sub('\([^\d]+\)', '', y)
|
y = re.sub('\([^\d]+\)', '', y)
|
||||||
|
|
@@ -190,34 +165,11 @@ def update_ids(year, month=None, day=None, sort=None):
|
||||||
print(n)
|
print(n)
|
||||||
print(header.find_class('lister-item-year')[0].text_content())
|
print(header.find_class('lister-item-year')[0].text_content())
|
||||||
raise
|
raise
|
||||||
|
|
||||||
text = content.xpath(".//p[contains(@class, 'text-muted')]")
|
|
||||||
plot = text[1].text_content().strip()
|
|
||||||
plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
|
|
||||||
if plot == 'Add a Plot':
|
|
||||||
plot = ''
|
|
||||||
genre = content.find_class('genre')
|
|
||||||
if genre:
|
|
||||||
genre = genre[0].text_content().strip().split(', ')
|
|
||||||
else:
|
|
||||||
genre = []
|
|
||||||
cast = content.xpath(".//p[contains(@class, '')]")
|
|
||||||
cast = [t for t in cast if t.attrib.get('class') == '']
|
|
||||||
if cast:
|
|
||||||
cast = parse_cast(cast[0].text_content())
|
|
||||||
|
|
||||||
if id not in films:
|
if id not in films:
|
||||||
films[id] = {
|
films[id] = {
|
||||||
'title': title,
|
'title': title,
|
||||||
'year': y
|
'year': y
|
||||||
}
|
}
|
||||||
if plot:
|
|
||||||
films[id]['plot'] = plot
|
|
||||||
if genre:
|
|
||||||
films[id]['genre'] = genre
|
|
||||||
if cast:
|
|
||||||
films[id].update(cast)
|
|
||||||
|
|
||||||
#print(key, len(films), 'films')
|
#print(key, len(films), 'films')
|
||||||
if n:
|
if n:
|
||||||
#print(n)
|
#print(n)
|
||||||
|
|
|
||||||
|
|
@@ -28,8 +28,6 @@ actions.register(getIds)
|
||||||
def getData(request, data):
|
def getData(request, data):
|
||||||
response = json_response()
|
response = json_response()
|
||||||
id = data['id']
|
id = data['id']
|
||||||
if isinstance(id, int):
|
|
||||||
id = str(id)
|
|
||||||
if len(id) == 7:
|
if len(id) == 7:
|
||||||
i, created = models.Imdb.objects.get_or_create(imdb=id)
|
i, created = models.Imdb.objects.get_or_create(imdb=id)
|
||||||
if created:
|
if created:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue