# pandora_t_for_time/management/commands/import_subtitles.py

import json
import os
import subprocess

import ox

from django.core.management.base import BaseCommand
from django.conf import settings

from item.models import Item
from annotation.models import Annotation


class Command(BaseCommand):
    """Import translated subtitles for items from a text file or URL.

    The input is split into sections on lines starting with ``## ``. The last
    space-separated token of each section heading is used as the item's
    ``public_id``; the section body holds the translated subtitles, separated
    by blank lines. The translated lines are paired, in order of ``start``,
    with the item's non-empty ``subtitles`` annotations that have no language
    set, and stored as new annotations wrapped in ``<span lang="...">``.
    """
    help = 'import translated subtitles from a file or url'

    def add_arguments(self, parser):
        parser.add_argument('--lang', action='store', dest='lang', default=None, help='subtitle language')
        parser.add_argument('--test', action='store_true', dest='test', default=False, help='test run')
        parser.add_argument('args', metavar='args', type=str, nargs='*', help='file or url')

    def handle(self, filename, **options):
        """Import the subtitles found in ``filename`` for language ``--lang``.

        filename: path of a local UTF-8 text file, or an http(s) URL.
        options:  parsed command line options; ``lang`` is required, ``test``
                  only validates the input without writing anything.

        Sections whose item cannot be found, or whose number of translated
        lines differs from the number of source subtitles, are reported and
        skipped; all other sections are still processed.
        """
        if not options["lang"]:
            print("--lang is required")
            return
        lang = options["lang"]

        # Only treat real http(s) URLs as remote; a local file whose name
        # merely starts with "http" must still be read from disk.
        if filename.startswith(("http://", "https://")):
            data = ox.net.read_url(filename).decode()
        else:
            # Read as UTF-8 to match the URL branch (bytes.decode() defaults
            # to UTF-8) instead of depending on the locale's encoding.
            with open(filename, encoding="utf-8") as fd:
                data = fd.read()

        # Prefix a newline so that a file starting directly with "## " does
        # not lose its first section; anything before the first heading is
        # still discarded.
        sections = ('\n' + data.strip()).split('\n## ')[1:]

        for section in sections:
            # partition() tolerates a heading without any body.
            title, _, body = section.partition('\n')
            title = title.strip()
            body = body.strip()
            item_id = title.split(' ')[-1]
            try:
                item = Item.objects.get(public_id=item_id)
            except Item.DoesNotExist:
                # Report and go on, like a count mismatch, rather than
                # aborting after earlier items were already updated.
                print('%s: item %s not found, skipping' % (title, item_id))
                continue

            subtitles_en = item.annotations.filter(layer="subtitles", languages=None).exclude(value='')
            # An empty body means zero translated lines, not one empty line.
            lines = body.split('\n\n') if body else []
            expected = subtitles_en.count()
            if len(lines) != expected:
                print('%s: number of subtitles does not match, en: %s vs %s: %s' % (title, expected, lang, len(lines)))
                continue

            if options["test"]:
                print('%s: valid %s subtitles' % (title, len(lines)))
                continue

            # Replace any previously imported subtitles for this language.
            item.annotations.filter(layer="subtitles", languages=lang).delete()
            for sub_en, text in zip(subtitles_en.order_by('start'), lines):
                sub = Annotation()
                sub.item = sub_en.item
                sub.user = sub_en.user
                sub.layer = sub_en.layer
                sub.start = sub_en.start
                sub.end = sub_en.end
                # NOTE(review): `languages` is not set explicitly; presumably
                # Annotation.save() derives it from the lang attribute of the
                # span - confirm against the Annotation model.
                sub.value = '<span lang="%s">%s</span>' % (lang, text)
                sub.save()