parse year from index, fix import

2018-04-06 16:00:45 +05:30 · 2018-04-06 16:00:45 +05:30 · b876eef0d0
commit b876eef0d0
parent 322d63f234
3 changed files with 62 additions and 30 deletions
--- a/add_metadata.py
+++ b/add_metadata.py
@@ -17,11 +17,20 @@ def add_metadata(films, country, output):
        with open(output) as fd:
            meta = json.load(fd)

-    known_ids = set([f['imdbId'] for f in meta])
+    ignore = output + '.ignored'
+    if os.path.exists(ignore):
+        with open(ignore) as fd:
+            ignored = fd.read().strip().split('\n')
+    else:
+        ignored = []
+
+    known_ids = set([f['imdbId'] for f in meta] + ignored)

    def save():
        with codecs.open(output, 'w', encoding='utf-8') as fd:
            json.dump(meta, fd, indent=1, ensure_ascii=False)
+        with open(ignore, 'w') as fd:
+            fd.write('\n'.join(ignored))

    for info in films:
        if info['imdbId'] in known_ids:
@@ -53,12 +62,12 @@ def add_metadata(films, country, output):
        if y:
            y = int(y)
        if '(????)' in info.get('title', '') or not y or y >= current_year:
-            info['delete'] = True
+            ignored.append(info['imdbId'])
            print('skip unknown or current year', info['imdbId'], info.get('title'), info.get('year'))
            continue
        if 'isSeries' in extra or ('country' in extra and country not in extra['country']):
-            info['delete'] = True
-            print('deleting', info['imdbId'], info.get('title'))
+            ignored.append(info['imdbId'])
+            print('ignoring', info['imdbId'], info.get('title'))
            continue
        if 'originalTitle' in extra:
            info['alternativeTitles'] = [[info['title'], '']]
--- a/films_by_country.py
+++ b/films_by_country.py
@@ -21,10 +21,11 @@ def reset_url(url):

 def write(films, filename):
    data = []
-    for id, title in films.items():
+    for id, film in films.items():
        data.append({
            'imdbId': id,
-            'title': title
+            'title': film[0],
+            'year': film[1],
        })

    with codecs.open(filename, 'w', encoding='utf-8') as fd:
@@ -54,6 +55,7 @@ if __name__ == '__main__':
    added = 0

    while year < datetime.now().year:
+        print('<<', year)
        url = '%s/search/title?release_date=%s-01-01,%s-01-01&countries=%s&sort=release_date,asc' % (base_url, year, year + 1, country)

        data = ox.web.imdb.read_url(url, unicode=True)
@@ -70,6 +72,25 @@ if __name__ == '__main__':
                article = article[0]
            else:
                n = None
+            for header in article.find_class('lister-item-header'):
+                a = header.xpath('.//a')[0]
+                id = re.compile('title/tt(\d{7})').findall(a.attrib['href'])[0]
+                title = a.text_content()
+                try:
+                    fully = y = header.find_class('lister-item-year')[0].text_content()
+                    y = y.rsplit('(', 1)[-1].split(')')[0].split('–')[0].split(' ')[0].strip()
+                    if not y:
+                        y = year
+                    else:
+                        y = int(y)
+                except:
+                    print(n)
+                    print(header.find_class('lister-item-year')[0].text_content())
+                    raise
+                if id not in films:
+                    films[id] = (title, y)
+                    added += 1
+            '''
            for a in article.xpath('.//a'):
                if '/title/tt' in a.attrib['href']:
                    img = a.xpath('.//img')
@@ -80,6 +101,7 @@ if __name__ == '__main__':
                            title = ox.decode_html(title)
                            films[id] = title
                            added += 1
+            '''
            print(len(films), 'films')
            if n:
                data = ox.web.imdb.read_url(n, unicode=True)
@@ -89,7 +111,7 @@ if __name__ == '__main__':
            if added > 1000:
                added = 0
                write(films, filename)
-        year += 1
        print('>> year', year)
+        year += 1

    write(films, filename)
--- a/import_json.py
+++ b/import_json.py
@@ -12,29 +12,30 @@ def load(data_json):
    import item.models as models
    import archive.models
    import os
-    archive.models.File.objects.all().delete()
-    archive.models.Instance.objects.all().delete()
-    archive.models.Volume.objects.all().delete()
-    models.Item.objects.all().delete()
-    reset_table(archive.models.File._meta.db_table)
-    reset_table(archive.models.Instance._meta.db_table)
-    reset_table(archive.models.Volume._meta.db_table)
-    reset_table(models.Item._meta.db_table)
-    transaction.commit_unless_managed()
-    os.system('rm -r /srv/pandora/data/media')
-    os.system('rm -r /srv/pandora/data/items')
+    with transaction.atomic():
+        archive.models.File.objects.all().delete()
+        archive.models.Instance.objects.all().delete()
+        archive.models.Volume.objects.all().delete()
+        models.Item.objects.all().delete()
+        reset_table(archive.models.File._meta.db_table)
+        reset_table(archive.models.Instance._meta.db_table)
+        reset_table(archive.models.Volume._meta.db_table)
+        reset_table(models.Item._meta.db_table)
+    with transaction.atomic():
+        os.system('rm -r /srv/pandora/data/media')
+        os.system('rm -r /srv/pandora/data/items')
+
+        films = json.load(open(data_json))
+        for data in sorted(films, key=lambda f: (f['year'], f['title'], f.get('director', []))):
+            item = models.Item()
+            item.data = data
+            item.save()
+            item.make_poster()
+            item.make_icon()
+            item.level = 2
+            item.save()
+            print(item)

-    films = json.load(open(data_json))
-    for data in films:
-        item = models.Item()
-        item.data = data
-        item.save()
-        item.make_poster(True)
-        item.make_icon()
-        item.level = 2
-        item.save()
-        print item

 if __name__ == '__main__':
-    print 'please import from ./manage.py and run import_json.load(path_to_json)'
-
+    print('please import from ./manage.py and run import_json.load(path_to_json)')