Compare commits

...

4 commits

Author SHA1 Message Date
j
501fe8cd3e parse more info from list 2018-06-03 14:00:03 +02:00
j
844d25008b int or str 2018-06-03 13:59:35 +02:00
j
e22d5c5ad0 ignore 2018-06-03 13:58:53 +02:00
j
ca50d091a8 get episode ids 2018-05-03 15:35:02 +02:00
3 changed files with 54 additions and 4 deletions

View file

@ -43,7 +43,7 @@ def get_ids():
print('missing impawards', ox.web.impawards.get_url(id)) print('missing impawards', ox.web.impawards.get_url(id))
for id in ox.web.criterion.get_ids(): for id in ox.web.criterion.get_ids():
if id in ('626', '835'): if id in ('626', '835', '1079', '28907'):
continue continue
if models.MovieId.objects.all().filter(criterion_id=id).count() == 0: if models.MovieId.objects.all().filter(criterion_id=id).count() == 0:
print('criterion', id) print('criterion', id)

View file

@ -46,6 +46,8 @@ def get_film_count(year, month=None, day=None):
url = get_year(year) url = get_year(year)
data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT) data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data) total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
if not total:
total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
if total: if total:
return int(total[0].replace(',', '')) return int(total[0].replace(',', ''))
print('no movies', url) print('no movies', url)
@ -114,6 +116,19 @@ def update_month(year, month, film_counts):
print('%s: count %s, got ids %s' % (key, film_counts[key], r)) print('%s: count %s, got ids %s' % (key, film_counts[key], r))
save_film_counts(film_counts) save_film_counts(film_counts)
def parse_cast(string):
    """Parse a '|'-separated credits string into a dict of name lists.

    Each '|'-separated part is split on ':\n' and the resulting pieces are
    paired up as (label, value).  The label is lower-cased, and the singular
    labels 'director' and 'star' are stored under 'directors' and 'stars';
    any other label is kept as is.  The value is split into individual
    names at each comma that is followed by a line break.

    Example input:  'Director:\nJane Doe | Stars:\nA One, \nB Two'
    Example output: {'directors': ['Jane Doe'], 'stars': ['A One', 'B Two']}

    Parts that contain no ':\n' are ignored, and a label without any names
    is left out of the result (instead of being stored as ['']).

    Returns a dict mapping the normalized label to a list of name strings;
    the dict is empty when nothing could be parsed.
    """
    # Singular labels are stored under the same key as their plural form.
    plural = {
        'director': 'directors',
        'star': 'stars',
    }
    results = {}
    for part in string.split('|'):
        # Consuming a single iterator twice per step pairs consecutive
        # pieces as (label, value); an unpaired trailing piece is dropped.
        pieces = iter([t.strip() for t in part.split(':\n')])
        for label, value in zip(pieces, pieces):
            key = label.lower()
            key = plural.get(key, key)
            # Names are separated by a comma plus line break; tolerate any
            # amount of whitespace around either so that markup indentation
            # changes do not glue names together or leave them padded.
            names = [
                name.strip()
                for name in re.split(r'\s*,\s*\n\s*', value)
                if name.strip()
            ]
            if names:
                results[key] = names
    return results
def update_ids(year, month=None, day=None, sort=None): def update_ids(year, month=None, day=None, sort=None):
films = {} films = {}
@ -149,10 +164,20 @@ def update_ids(year, month=None, day=None, sort=None):
else: else:
print('no article on', '%s&page=%s' % (url, page-2)) print('no article on', '%s&page=%s' % (url, page-2))
break break
for header in article.find_class('lister-item-header'): for content in article.find_class('lister-item-content'):
a = header.xpath('.//a')[0] header = content.find_class('lister-item-header')[0]
a = header.xpath('.//a')
if 'Episode:' in [
e.text_content()
for e in header.xpath(".//small")
] and len(a) > 1:
title = a[0].text_content().strip() + ': '
a = a[1]
else:
title = ''
a = a[0]
id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0] id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
title = a.text_content().strip() title += a.text_content().strip()
try: try:
y = header.find_class('lister-item-year')[0].text_content() y = header.find_class('lister-item-year')[0].text_content()
y = re.sub('\([^\d]+\)', '', y) y = re.sub('\([^\d]+\)', '', y)
@ -165,11 +190,34 @@ def update_ids(year, month=None, day=None, sort=None):
print(n) print(n)
print(header.find_class('lister-item-year')[0].text_content()) print(header.find_class('lister-item-year')[0].text_content())
raise raise
text = content.xpath(".//p[contains(@class, 'text-muted')]")
plot = text[1].text_content().strip()
plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
if plot == 'Add a Plot':
plot = ''
genre = content.find_class('genre')
if genre:
genre = genre[0].text_content().strip().split(', ')
else:
genre = []
cast = content.xpath(".//p[contains(@class, '')]")
cast = [t for t in cast if t.attrib.get('class') == '']
if cast:
cast = parse_cast(cast[0].text_content())
if id not in films: if id not in films:
films[id] = { films[id] = {
'title': title, 'title': title,
'year': y 'year': y
} }
if plot:
films[id]['plot'] = plot
if genre:
films[id]['genre'] = genre
if cast:
films[id].update(cast)
#print(key, len(films), 'films') #print(key, len(films), 'films')
if n: if n:
#print(n) #print(n)

View file

@ -28,6 +28,8 @@ actions.register(getIds)
def getData(request, data): def getData(request, data):
response = json_response() response = json_response()
id = data['id'] id = data['id']
if isinstance(id, int):
id = str(id)
if len(id) == 7: if len(id) == 7:
i, created = models.Imdb.objects.get_or_create(imdb=id) i, created = models.Imdb.objects.get_or_create(imdb=id)
if created: if created: