pandora_cdosea/subtitles.py

#!/usr/bin/python3
from argparse import ArgumentParser
from collections import defaultdict
from copy import deepcopy
from glob import glob
import json
import os
import re
import string
import subprocess
import sys

import ox
import ox.web.auth


# Root URL of the server this script talks to; get_api() appends '/api/'
# to it to build the API endpoint.
base_url = 'http://127.0.0.1:2620'

# NOTE(review): FRAME_DURATION and MAX_DURATION are not referenced in the
# visible part of this file. Presumably both are in seconds (one frame at
# 60 fps and a 40 second cap) -- confirm before relying on them.
FRAME_DURATION = 1/60
MAX_DURATION = 40

# NOTE(review): not referenced in the visible part of this file; presumably
# tag names to be hidden by some other part of the project -- verify usage.
HIDDEN_TAGS = [
    "women with white males",
    "gene z hanrahan"
]

# Items that should not be used at all.
# NOTE(review): not referenced in the visible part of this file either.
BLACKLIST = [
    'XN'
]

# Shared API client; stays None until get_api() creates it on first use.
api = None

def get_api():
    """Create and sign in the shared module-level API client on first use.

    Does nothing when the global ``api`` already holds a truthy client.
    Nothing is returned; callers read the ``api`` global directly.
    """
    global api
    if api:
        return
    api = ox.API(base_url + '/api/')
    # Credentials are looked up under the 'cdosea' key via ox.web.auth.
    api.signin(**ox.web.auth.get('cdosea'))


def update_subtitles():
    """Fetch all items tagged 'Vocal' together with their subtitle layer.

    Makes sure the API client exists, asks for the id and title of up to
    1000 items whose ``tags`` equal 'Vocal', then issues one extra request
    per item to attach its ``layers.subtitles`` under the 'subtitles' key.

    Returns:
        The list of item dicts, each with 'id', 'title' and 'subtitles'.
    """
    get_api()
    request = {
        'query': {
            'conditions': [{'key': 'tags', 'value': 'Vocal', 'operator': '=='}]
        },
        'keys': ['id', 'title'],
        'range': [0, 1000],
    }
    found = api.find(request)['data']['items']
    for entry in found:
        layers = api.get({'id': entry['id'], 'keys': ['layers']})['data']['layers']
        entry['subtitles'] = layers['subtitles']
    return found

def get_subtitles(items, id):
    """Return a deep copy of the subtitles of the first item matching *id*.

    An item matches when its 'title' starts with *id*. The copy lets the
    caller sort or modify the result without touching *items*.

    Returns:
        The copied subtitle list, or None when no title matches.
    """
    matching = (
        deepcopy(entry['subtitles'])
        for entry in items
        if entry['title'].startswith(id)
    )
    return next(matching, None)

def render_subtitles(item_json, output_json, output_srt, lang):
    """Build the subtitle track for one rendered item and write it to disk.

    The item JSON lists the vocal clips of the item in playback order. For
    every non-blank clip the matching subtitles are looked up, shifted by
    the clip's start position and collected per (in, out) time range. In
    multi-language mode, partially overlapping ranges are then split so
    that each resulting range can carry one line per language. Finally,
    overlapping neighbours are merged or trimmed and the result is written
    out.

    Args:
        item_json: path of a JSON file with a 'vocals' list; every clip
            needs a 'duration' and, unless it is marked 'blank', a 'path'.
        output_json: path the final subtitle list is dumped to as JSON.
        output_srt: path for the SRT rendering; skipped when falsy.
        lang: either a single language code (str) or a list of codes. The
            overlap handling indexes lang[0] and lang[1], so a list is
            assumed to hold exactly two codes, primary language first.

    Returns:
        None; the results are the files written.

    NOTE(review): this function reads a module-level ``items`` list that
    this file only assigns inside the ``__main__`` block, so it fails with
    NameError when imported and called from elsewhere. It also assumes
    get_subtitles() always finds a match (it returns None otherwise).
    """
    with open(item_json) as fd:
        item = json.load(fd)

    subtitles = []
    # Running start time of the current clip within the rendered item.
    position = 0
    # Maps 'in-out' (3 decimals) to the subtitle entry for that time range.
    subs = {}
    for clip in item['vocals']:
        if not clip.get('blank'):
            # The first two characters of the file name identify the item,
            # e.g. vocals/A/A4_chaton.wav -> 'A4'.
            id = clip['path'].split('/')[-1][:2]
            clip_subtitles = get_subtitles(items, id)
            clip_subtitles.sort(key=lambda c: (c['in'], c['out'], c['id']))

            for sub in clip_subtitles:
                # Shift into item time and round to milliseconds so equal
                # ranges produce the same dictionary key.
                sub_in = float('%0.3f' % (sub['in'] + position))
                sub_out = float('%0.3f' % (sub['out'] + position))
                sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
                if sub_id not in subs:
                    subs[sub_id] = {
                        'in': sub_in,
                        'out': sub_out,
                        'value': [],
                    }
                    # Multi-language mode additionally tracks which source
                    # subtitles and languages ended up in this range.
                    if isinstance(lang, list):
                        subs[sub_id]['ids'] = []
                        subs[sub_id]['languages'] = []

                # The language is taken from a <span lang="xx"> wrapper;
                # text without such a wrapper is treated as English.
                slang = re.compile('span lang="(..)"').findall(sub['value'])
                value = sub['value'].replace('<br>', '').strip()
                if slang:
                    slang = slang[0]
                    value = value.replace('<span lang="' + slang + '">', '').replace('</span>', '').strip()
                else:
                    slang = 'en'
                # just use strip_tags?
                # value = ox.strip_tags(ox.decode_html(sub['value']))
                if isinstance(lang, list) and slang in lang:
                    # The first requested language always goes on top.
                    if lang.index(slang) == 0:
                        subs[sub_id]['value'].insert(0, value)
                    else:
                        subs[sub_id]['value'].append(value)
                    subs[sub_id]['ids'].append(sub['id'])
                    subs[sub_id]['languages'].append(slang)
                elif slang == lang:
                    subs[sub_id]['value'].append(value)
        # Blank clips carry no subtitles but still advance the timeline.
        position += clip['duration']

    if isinstance(lang, list):
        # Ranges that have text but not exactly the requested set of
        # languages; their presence triggers the overlap splitting below.
        #fixme = [sub for sub in subs.values() if [s for s in list(subs.values())]]
        fixme = [sub for sub in subs.values() if set(sub['languages']) != set(lang) and sub['value']]
        if fixme:
            # Keys of the original ranges that were split into segments.
            remove = []
            # The outer loop walks a snapshot of the entries; segments added
            # during the pass are not split themselves, but the inner
            # snapshot is rebuilt each time, so they do act as cut points.
            for key, sub in list(subs.items()):
                intersections = []
                for s in list(subs.values()):
                    intersections += overlaps(sub, s)
                if intersections:
                    # Cut the range at every boundary of an overlapping range.
                    points = list(sorted(set([sub['in'], sub['out']] + intersections)))
                    #print(points, sub['value'])
                    sub_in = points[0]
                    for sub_out in points[1:]:
                        sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
                        if sub_id not in subs:
                            subs[sub_id] = {
                                'in': sub_in,
                                'out': sub_out,
                                'value': [],
                                'ids': [],
                                'languages': []
                            }
                        # NOTE(review): this `continue` also skips the
                        # `sub_in = sub_out` advance at the end of the loop,
                        # so for a range without text every segment starts
                        # at points[0] -- confirm this is intended.
                        if not sub['value']:
                            continue
                        # Only fill segments that do not yet hold every
                        # requested language.
                        if set(subs[sub_id]['languages']) != set(lang):
                            if not subs[sub_id]['languages']:
                                # Empty segment: take over everything.
                                subs[sub_id]['value'] += sub['value']
                                subs[sub_id]['languages'] += sub['languages']
                                subs[sub_id]['ids'] += sub['ids']
                            elif subs[sub_id]['languages'] == [lang[0]] \
                                    and sub['languages'][0] not in subs[sub_id]['languages']:
                                # Segment has only the primary language: add
                                # the new line underneath it.
                                subs[sub_id]['value'].append(sub['value'][0])
                                subs[sub_id]['languages'].append(sub['languages'][0])
                                subs[sub_id]['ids'] += sub['ids']
                            elif subs[sub_id]['languages'] == [lang[1]] \
                                    and sub['languages'][0] not in subs[sub_id]['languages']:
                                # Segment has only the secondary language:
                                # put the new line on top of it.
                                subs[sub_id]['value'].insert(0, sub['value'][0])
                                subs[sub_id]['languages'].insert(0, sub['languages'][0])
                                subs[sub_id]['ids'] += sub['ids']
                            #else:
                            #    print('WTF', sub['languages'], subs[sub_id]['languages'])

                        sub_in = sub_out
                    remove.append(key)
            #for key, sub in list(subs.items()):
            #    if len(sub['languages']) == 1:
            #        del subs[key]
            # Drop split ranges that still hold exactly one language.
            for key in remove:
                if len(subs[key]['languages']) == 1:
                    del subs[key]
            # Drop slivers of at most 0.04 s (plus float tolerance) left
            # over from splitting.
            for key, sub in list(subs.items()):
                if abs(sub['out'] - sub['in']) <= 0.040001:
                    del subs[key]
    # Order by time, join each range's lines with newlines and keep only
    # ranges that have visible text.
    subs = sorted(subs.values(), key=lambda c: (c['in'], c['out']))
    for sub in subs:
        sub['value'] = '\n'.join(sub['value'])
        if sub['value'].strip():
            subtitles.append(sub)

    # Resolve remaining overlaps between consecutive entries: identical text
    # is merged into the previous entry, different text cuts the previous
    # entry short where the next one starts.
    merged = []
    p = None
    for sub in subtitles:
        if not p:
            merged.append(sub)
            p = sub
        else:
            if p['out'] > sub['in']:
                if p['value'] == sub['value']:
                    p['out'] = max(p['out'], sub['out'])
                else:
                    p['out'] = sub['in']
                    merged.append(sub)
                    p = sub
            else:
                merged.append(sub)
                p = sub
    subtitles = merged

    if output_srt:
        # The file is opened in binary mode and receives the return value
        # of ox.srt.encode() as is.
        with open(output_srt, 'wb') as fd:
            fd.write(ox.srt.encode(subtitles))
    with open(output_json, 'w') as fd:
        json.dump(subtitles, fd, indent=4, ensure_ascii=False, sort_keys=True)


def overlaps(src, other):
    """Return the boundaries of *other* that fall within *src*.

    All comparisons use times rounded to two decimals, so ranges that
    differ only by rounding noise count as the same range and yield no
    points. The returned values are the unrounded 'in' and/or 'out' of
    *other*, in that order.

    Args:
        src: dict with numeric 'in' and 'out'.
        other: dict with numeric 'in' and 'out'.

    Returns:
        A list with zero, one or two boundary times of *other*.
    """
    def _snap(seconds):
        # Round via string formatting, to two decimals.
        return float('%0.2f' % seconds)

    lo = _snap(src['in'])
    hi = _snap(src['out'])
    start = _snap(other['in'])
    end = _snap(other['out'])

    # Same range after rounding: nothing to split.
    if lo == start and hi == end:
        return []

    found = []
    # The start of other lies in [lo, hi).
    if lo <= start < hi:
        found.append(other['in'])
    # The end of other lies in (lo, hi].
    if lo < end <= hi:
        found.append(other['out'])
    return found


if __name__ == '__main__':
    # Command line: an optional version prefix plus any number of item JSON
    # files. Without explicit files, every output/*/*.json below the prefix
    # is rendered, except paths containing 'gong'.
    usage = "usage: %(prog)s [options] json"
    parser = ArgumentParser(usage=usage)
    parser.add_argument('-p', '--prefix', dest='prefix',
                    help='version prefix', default='.')
    parser.add_argument('files', metavar='path', type=str, nargs='*', help='json files')
    opts = parser.parse_args()

    # `items` has to remain a module-level name: render_subtitles() reads it
    # as a global. subtitles.json in the working directory acts as a cache
    # of the server's answer; delete it to force a fresh download.
    if os.path.exists('subtitles.json'):
        # Context manager so the cache file is closed again right away.
        with open('subtitles.json') as fd:
            items = json.load(fd)
    else:
        items = update_subtitles()
        with open('subtitles.json', 'w') as fd:
            json.dump(items, fd, indent=4, ensure_ascii=False, sort_keys=True)

    files = opts.files
    if not files:
        files = glob(os.path.join(opts.prefix, 'output/*/*.json'))
        files = [f for f in files if 'gong' not in f]

    # Output name suffix -> language pair (first language is shown on top).
    bilingual_outputs = (
        ('1080p', ('ko', 'en')),
        ('no-en', ('no', 'en')),
        ('ch-en', ('ch', 'en')),
    )
    single_languages = ('en', 'ko', 'no', 'ch')

    for item_json in files:
        # Output name: lower-cased first letter of the file name followed by
        # the name of the directory that contains it.
        path_parts = item_json.split('/')
        prefix = 'public/' + path_parts[-1][0].lower() + path_parts[-2] + '.'
        prefix = os.path.join(opts.prefix, prefix)
        for suffix, pair in bilingual_outputs:
            output_json = prefix + suffix + '.json'
            output_srt = prefix + suffix + '.srt'
            # render_subtitles() switches to multi-language mode only for
            # real lists, so hand it a fresh list each time.
            render_subtitles(item_json, output_json, output_srt, list(pair))
        for lang in single_languages:
            output_json = prefix + lang + '.json'
            output_srt = prefix + lang + '.srt'
            render_subtitles(item_json, output_json, output_srt, lang)
render subtitles 2017-05-16 12:59:32 +00:00			`#!/usr/bin/python3`
fix import, pass prefix to render_mlt.py too 2017-10-02 12:39:36 +00:00			`from argparse import ArgumentParser`
			`from collections import defaultdict`
			`from copy import deepcopy`
			`from glob import glob`
render subtitles 2017-05-16 12:59:32 +00:00			`import json`
fix import, pass prefix to render_mlt.py too 2017-10-02 12:39:36 +00:00			`import os`
multi lingual 2017-08-25 09:12:32 +00:00			`import re`
render subtitles 2017-05-16 12:59:32 +00:00			`import string`
fix import, pass prefix to render_mlt.py too 2017-10-02 12:39:36 +00:00			`import subprocess`
			`import sys`
render subtitles 2017-05-16 12:59:32 +00:00
			`import ox`
			`import ox.web.auth`


			`base_url = 'http://127.0.0.1:2620'`

			`FRAME_DURATION = 1/60`
			`MAX_DURATION = 40`

			`HIDDEN_TAGS = [`
			`"women with white males",`
			`"gene z hanrahan"`
			`]`

			`# items to not use at all`
			`BLACKLIST = [`
			`'XN'`
			`]`

			`api = None`

			`def get_api():`
			`global api`
			`if not api:`
			`api = ox.API(base_url + '/api/')`
			`api.signin(**ox.web.auth.get('cdosea'))`


typos 2017-05-20 15:32:51 +00:00			`def update_subtitles():`
render subtitles 2017-05-16 12:59:32 +00:00			`get_api()`
			`items = api.find({`
			`'query': {`
			`'conditions': [{'key': 'tags', 'value': 'Vocal', 'operator': '=='}]`
			`},`
			`'keys': ['id', 'title'],`
			`'range': [0, 1000]})['data']['items']`
			`for item in items:`
			`'''`
			`info = api.findMedia({`
			`'query': {`
			`'conditions': [`
			`{'key': 'id', 'operator': '==', 'value': item['id']}`
			`]`
			`},`
			`'keys': ['id', 'extension'],`
			`'range': [0, 1]`
			`})['data']['items'][0]`
			`'''`
			`item['subtitles'] = api.get({'id': item['id'], 'keys': ['layers']})['data']['layers']['subtitles']`

			`return items`

			`def get_subtitles(items, id):`
			`for item in items:`
			`if item['title'].startswith(id):`
			`return deepcopy(item['subtitles'])`

multi lingual 2017-08-25 09:12:32 +00:00			`def render_subtitles(item_json, output_json, output_srt, lang):`
render subtitles 2017-05-16 12:59:32 +00:00			`with open(item_json) as fd:`
			`item = json.load(fd)`

			`subtitles = []`
			`position = 0`
subs 2017-05-21 11:55:48 +00:00			`subs = {}`
render subtitles 2017-05-16 12:59:32 +00:00			`for clip in item['vocals']:`
			`if not clip.get('blank'):`
			`# vocals/A/A4_chaton.wav`
			`id = clip['path'].split('/')[-1][:2]`
			`clip_subtitles = get_subtitles(items, id)`
subs 2017-05-21 11:55:48 +00:00			`clip_subtitles.sort(key=lambda c: (c['in'], c['out'], c['id']))`

render subtitles 2017-05-16 12:59:32 +00:00			`for sub in clip_subtitles:`
korean 2017-05-21 20:48:54 +00:00			`sub_in = float('%0.3f' % (sub['in'] + position))`
subs 2017-05-21 11:55:48 +00:00			`sub_out = float('%0.3f' % (sub['out'] + position))`
			`sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)`
			`if sub_id not in subs:`
			`subs[sub_id] = {`
			`'in': sub_in,`
			`'out': sub_out,`
			`'value': [],`
			`}`
multi lingual 2017-08-25 09:12:32 +00:00			`if isinstance(lang, list):`
less 2017-06-17 16:02:14 +00:00			`subs[sub_id]['ids'] = []`
multi lingual 2017-08-25 09:12:32 +00:00			`subs[sub_id]['languages'] = []`

			`slang = re.compile('span lang="(..)"').findall(sub['value'])`
			`value = sub['value'].replace('<br>', '').strip()`
			`if slang:`
			`slang = slang[0]`
			`value = value.replace('<span lang="' + slang + '">', '').replace('</span>', '').strip()`
			`else:`
			`slang = 'en'`
korean 2017-05-21 20:48:54 +00:00			`# just use strip_tags?`
			`# value = ox.strip_tags(ox.decode_html(sub['value']))`
multi lingual 2017-08-25 09:12:32 +00:00			`if isinstance(lang, list) and slang in lang:`
			`if lang.index(slang) == 0:`
refactor subtitles 2017-06-17 16:00:24 +00:00			`subs[sub_id]['value'].insert(0, value)`
			`else:`
			`subs[sub_id]['value'].append(value)`
			`subs[sub_id]['ids'].append(sub['id'])`
multi lingual 2017-08-25 09:12:32 +00:00			`subs[sub_id]['languages'].append(slang)`
			`elif slang == lang:`
korean first 2017-05-21 21:14:15 +00:00			`subs[sub_id]['value'].append(value)`
render subtitles 2017-05-16 12:59:32 +00:00			`position += clip['duration']`

multi lingual 2017-08-25 09:12:32 +00:00			`if isinstance(lang, list):`
overlapping subtitles 2017-08-29 14:59:15 +00:00			`#fixme = [sub for sub in subs.values() if [s for s in list(subs.values())]]`
			`fixme = [sub for sub in subs.values() if set(sub['languages']) != set(lang) and sub['value']]`
multi lingual 2017-08-25 09:12:32 +00:00			`if fixme:`
overlapping subtitles 2017-08-29 14:59:15 +00:00			`remove = []`
			`for key, sub in list(subs.items()):`
			`intersections = []`
			`for s in list(subs.values()):`
			`intersections += overlaps(sub, s)`
			`if intersections:`
			`points = list(sorted(set([sub['in'], sub['out']] + intersections)))`
			`#print(points, sub['value'])`
			`sub_in = points[0]`
			`for sub_out in points[1:]:`
			`sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)`
			`if sub_id not in subs:`
			`subs[sub_id] = {`
			`'in': sub_in,`
			`'out': sub_out,`
			`'value': [],`
			`'ids': [],`
			`'languages': []`
			`}`
merge subs 2017-08-31 20:08:03 +00:00			`if not sub['value']:`
			`continue`
overlapping subtitles 2017-08-29 14:59:15 +00:00			`if set(subs[sub_id]['languages']) != set(lang):`
			`if not subs[sub_id]['languages']:`
			`subs[sub_id]['value'] += sub['value']`
			`subs[sub_id]['languages'] += sub['languages']`
			`subs[sub_id]['ids'] += sub['ids']`
			`elif subs[sub_id]['languages'] == [lang[0]] \`
			`and sub['languages'][0] not in subs[sub_id]['languages']:`
			`subs[sub_id]['value'].append(sub['value'][0])`
			`subs[sub_id]['languages'].append(sub['languages'][0])`
			`subs[sub_id]['ids'] += sub['ids']`
			`elif subs[sub_id]['languages'] == [lang[1]] \`
			`and sub['languages'][0] not in subs[sub_id]['languages']:`
			`subs[sub_id]['value'].insert(0, sub['value'][0])`
			`subs[sub_id]['languages'].insert(0, sub['languages'][0])`
			`subs[sub_id]['ids'] += sub['ids']`
			`#else:`
			`# print('WTF', sub['languages'], subs[sub_id]['languages'])`

			`sub_in = sub_out`
			`remove.append(key)`
			`#for key, sub in list(subs.items()):`
			`# if len(sub['languages']) == 1:`
			`# del subs[key]`
			`for key in remove:`
			`if len(subs[key]['languages']) == 1:`
			`del subs[key]`
			`for key, sub in list(subs.items()):`
			`if abs(sub['out'] - sub['in']) <= 0.040001:`
			`del subs[key]`
			`subs = sorted(subs.values(), key=lambda c: (c['in'], c['out']))`
			`for sub in subs:`
			`sub['value'] = '\n'.join(sub['value'])`
			`if sub['value'].strip():`
			`subtitles.append(sub)`
multi lingual 2017-08-25 09:12:32 +00:00
merge subs 2017-08-31 20:08:03 +00:00			`merged = []`
			`p = None`
			`for sub in subtitles:`
			`if not p:`
			`merged.append(sub)`
			`p = sub`
			`else:`
			`if p['out'] > sub['in']:`
			`if p['value'] == sub['value']:`
			`p['out'] = max(p['out'], sub['out'])`
			`else:`
			`p['out'] = sub['in']`
			`merged.append(sub)`
			`p = sub`
			`else:`
			`merged.append(sub)`
			`p = sub`
			`subtitles = merged`

refactor subtitles 2017-06-17 16:00:24 +00:00			`if output_srt:`
			`with open(output_srt, 'wb') as fd:`
			`fd.write(ox.srt.encode(subtitles))`
render subtitles 2017-05-16 12:59:32 +00:00			`with open(output_json, 'w') as fd:`
sort_keys=True 2017-06-17 16:03:13 +00:00			`json.dump(subtitles, fd, indent=4, ensure_ascii=False, sort_keys=True)`
render subtitles 2017-05-16 12:59:32 +00:00
korean 2017-05-21 20:48:54 +00:00
overlapping subtitles 2017-08-29 14:59:15 +00:00			`def overlaps(src, other):`
rounding issues 2017-08-30 14:10:35 +00:00			`src_in = float('%0.2f' % src['in'])`
			`src_out = float('%0.2f' % src['out'])`
			`other_in = float('%0.2f' % other['in'])`
			`other_out = float('%0.2f' % other['out'])`
overlapping subtitles 2017-08-29 14:59:15 +00:00			`points = []`
			`if src_in != other_in or src_out != other_out:`
			`# src inside`
			`if other_in >= src_in and other_in < src_out:`
			`points += [other['in']]`
			`if other_out > src_in and other_out <= src_out:`
			`points += [other['out']]`
			`return points`

rounding issues 2017-08-30 14:10:35 +00:00
render subtitles 2017-05-16 12:59:32 +00:00			`if __name__ == '__main__':`
add prefix 2017-10-02 12:28:40 +00:00			`usage = "usage: %(prog)s [options] json"`
			`parser = ArgumentParser(usage=usage)`
			`parser.add_argument('-p', '--prefix', dest='prefix',`
			`help='version prefix', default='.')`
			`parser.add_argument('files', metavar='path', type=str, nargs='*', help='json files')`
			`opts = parser.parse_args()`

render subtitles 2017-05-16 12:59:32 +00:00			`if os.path.exists('subtitles.json'):`
			`items = json.load(open('subtitles.json'))`
			`else:`
typos 2017-05-20 15:32:51 +00:00			`items = update_subtitles()`
render subtitles 2017-05-16 12:59:32 +00:00			`with open('subtitles.json', 'w') as fd:`
sort_keys=True 2017-06-17 16:03:13 +00:00			`json.dump(items, fd, indent=4, ensure_ascii=False, sort_keys=True)`
render subtitles 2017-05-16 12:59:32 +00:00
add prefix 2017-10-02 12:28:40 +00:00			`files = opts.files`
			`if not files:`
			`files = glob(os.path.join(opts.prefix, 'output//.json'))`
write out srt files for each language too 2017-10-04 11:04:19 +00:00			`files = [f for f in files if not 'gong' in f]`
update subtitles 2017-05-17 10:15:28 +00:00			`for item_json in files:`
refactor subtitles 2017-06-17 16:00:24 +00:00			`prefix = 'public/' + item_json.split('/')[-1][0].lower() + item_json.split('/')[-2] + '.'`
add prefix 2017-10-02 12:28:40 +00:00			`prefix = os.path.join(opts.prefix, prefix)`
refactor subtitles 2017-06-17 16:00:24 +00:00			`output_json = prefix + '1080p.json'`
multi lingual 2017-08-25 09:12:32 +00:00			`output_srt = prefix + '1080p.srt'`
			`render_subtitles(item_json, output_json, output_srt, ['ko', 'en'])`
			`output_json = prefix + 'no-en.json'`
			`output_srt = prefix + 'no-en.srt'`
			`render_subtitles(item_json, output_json, output_srt, ['no', 'en'])`
add chinese subtitles 2018-07-03 14:55:16 +00:00			`output_json = prefix + 'ch-en.json'`
			`output_srt = prefix + 'ch-en.srt'`
			`render_subtitles(item_json, output_json, output_srt, ['ch', 'en'])`
			`for lang in ('en', 'ko', 'no', 'ch'):`
refactor subtitles 2017-06-17 16:00:24 +00:00			`output_json = prefix + lang + '.json'`
write out srt files for each language too 2017-10-04 11:04:19 +00:00			`output_srt = prefix + lang + '.srt'`
			`render_subtitles(item_json, output_json, output_srt, lang)`