parse more info from list

int or str
ignore
2018-06-03 14:00:03 +02:00 · 2018-06-03 13:59:35 +02:00 · 2018-06-03 13:58:53 +02:00 · 2018-05-03 15:35:02 +02:00
3 changed files with 54 additions and 4 deletions
--- a/oxdata/lookup/cache.py
+++ b/oxdata/lookup/cache.py
@ -43,7 +43,7 @@ def get_ids():
                print('missing impawards', ox.web.impawards.get_url(id))
    for id in ox.web.criterion.get_ids():
-        if id in ('626', '835'):
+        if id in ('626', '835', '1079', '28907'):
            continue
        if models.MovieId.objects.all().filter(criterion_id=id).count() == 0:
            print('criterion', id)
--- a/oxdata/movie/imdbids.py
+++ b/oxdata/movie/imdbids.py
@ -46,6 +46,8 @@ def get_film_count(year, month=None, day=None):
        url = get_year(year)
    data = ox.web.imdb.read_url(url, unicode=True, timeout=TIMEOUT)
    total = re.compile('<span class="lister-current-last-item">50</span>.*?of (.*?) titles', re.DOTALL).findall(data)
    if not total:
        total = re.compile(' ([\d+,]+) titles\n', re.DOTALL).findall(data)
    if total:
        return int(total[0].replace(',', ''))
    print('no movies', url)
@ -114,6 +116,19 @@ def update_month(year, month, film_counts):
            print('%s: count %s, got ids %s' % (key, film_counts[key], r))
        save_film_counts(film_counts)
 def parse_cast(string):
    """Parse a '|'-separated credits string into a dict of name lists.

    Each '|'-delimited section is split on ':\n' into alternating
    label / value tokens (whitespace-stripped). Labels are lower-cased,
    with 'director' and 'star' mapped to their plural forms; each value
    is split on ', \n' into a list of names. A trailing token without a
    partner is dropped.
    """
    # Singular labels are folded into the plural key.
    plural = {'director': 'directors', 'star': 'stars'}
    results = {}
    for section in string.split('|'):
        tokens = [token.strip() for token in section.split(':\n')]
        # Even positions are labels, odd positions their values; zip stops
        # at the shorter slice, so an unpaired trailing token is ignored.
        credits = dict(zip(tokens[0::2], tokens[1::2]))
        for label, names in credits.items():
            normalized = label.lower()
            results[plural.get(normalized, normalized)] = names.split(', \n')
    return results
 def update_ids(year, month=None, day=None, sort=None):
    films = {}
@ -149,10 +164,20 @@ def update_ids(year, month=None, day=None, sort=None):
            else:
                print('no article on', '%s&page=%s' % (url, page-2))
                break
-            for header in article.find_class('lister-item-header'):
+            for content in article.find_class('lister-item-content'):
-                a = header.xpath('.//a')[0]
+                header = content.find_class('lister-item-header')[0]
                a = header.xpath('.//a')
                if 'Episode:' in [
                    e.text_content()
                    for e in header.xpath(".//small")
                ] and len(a) > 1:
                    title = a[0].text_content().strip() + ': '
                    a = a[1]
                else:
                    title = ''
                    a = a[0]
                id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
-                title = a.text_content().strip()
+                title += a.text_content().strip()
                try:
                    y = header.find_class('lister-item-year')[0].text_content()
                    y = re.sub('\([^\d]+\)', '', y)
@ -165,11 +190,34 @@ def update_ids(year, month=None, day=None, sort=None):
                    print(n)
                    print(header.find_class('lister-item-year')[0].text_content())
                    raise
                text = content.xpath(".//p[contains(@class, 'text-muted')]")
                plot = text[1].text_content().strip()
                plot = plot.replace('See full summary »', '').replace('See full summary\xa0»', '').strip()
                if plot == 'Add a Plot':
                    plot = ''
                genre = content.find_class('genre')
                if genre:
                    genre = genre[0].text_content().strip().split(', ')
                else:
                    genre = []
                cast = content.xpath(".//p[contains(@class, '')]")
                cast = [t for t in cast if t.attrib.get('class') == '']
                if cast:
                    cast = parse_cast(cast[0].text_content())
                if id not in films:
                    films[id] = {
                        'title': title,
                        'year': y
                    }
                    if plot:
                        films[id]['plot'] = plot
                    if genre:
                        films[id]['genre'] = genre
                    if cast:
                        films[id].update(cast)
            #print(key, len(films), 'films')
            if n:
                #print(n)
--- a/oxdata/movie/views.py
+++ b/oxdata/movie/views.py
@ -28,6 +28,8 @@ actions.register(getIds)
 def getData(request, data):
    response = json_response()
    id = data['id']
    if isinstance(id, int):
        id = str(id)
    if len(id) == 7:
        i, created = models.Imdb.objects.get_or_create(imdb=id)
        if created:
Author	SHA1	Message	Date
j	501fe8cd3e	parse more info from list	2018-06-03 14:00:03 +02:00
j	844d25008b	int or str	2018-06-03 13:59:35 +02:00
j	e22d5c5ad0	ignore	2018-06-03 13:58:53 +02:00
j	ca50d091a8	get episode ids	2018-05-03 15:35:02 +02:00