pandora_cdosea/subtitles.py

#!/usr/bin/python3
import os
import sys
import json
import re
import subprocess
from collections import defaultdict
import string
from glob import glob
from copy import deepcopy

import ox
import ox.web.auth


# Base URL of the server; get_api() appends '/api/' to build the API endpoint.
base_url = 'http://127.0.0.1:2620'

# NOTE(review): FRAME_DURATION and MAX_DURATION are not referenced anywhere in
# the visible part of this file. Presumably seconds per frame (60 fps) and a
# maximum length in seconds -- TODO confirm against the other scripts.
FRAME_DURATION = 1/60
MAX_DURATION = 40

# NOTE(review): not referenced in the visible part of this file; presumably
# tag names that other scripts filter out -- TODO confirm.
HIDDEN_TAGS = [
    "women with white males",
    "gene z hanrahan"
]

# items to not use at all
# NOTE(review): not referenced in the visible part of this file either.
BLACKLIST = [
    'XN'
]

# Shared API client; stays None until get_api() creates it and signs in.
api = None

def get_api():
    """Create the module-level ``api`` client on first use.

    When ``api`` is already set (truthy) nothing happens. Otherwise an
    ``ox.API`` client for ``base_url + '/api/'`` is stored in the global
    ``api`` and signed in with the credentials that ``ox.web.auth.get``
    returns for 'cdosea'. The function itself returns nothing; callers use
    the global.
    """
    global api
    if api:
        # Already initialised by an earlier call.
        return
    endpoint = base_url + '/api/'
    api = ox.API(endpoint)
    credentials = ox.web.auth.get('cdosea')
    api.signin(**credentials)


def update_subtitles():
    """Fetch all items tagged 'Vocal' together with their subtitle layer.

    Makes sure the shared API client exists, asks it for the 'id' and 'title'
    of the items whose 'tags' equal 'Vocal' (result range 0-1000), and then
    stores each item's ``layers['subtitles']`` under the item's 'subtitles'
    key.

    Returns the list of item dicts, each with 'id', 'title' and 'subtitles'.
    """
    get_api()
    query = {
        'query': {
            'conditions': [{'key': 'tags', 'value': 'Vocal', 'operator': '=='}]
        },
        'keys': ['id', 'title'],
        'range': [0, 1000],
    }
    found = api.find(query)['data']['items']
    for entry in found:
        # NOTE: a per-item api.findMedia lookup (keys 'id' and 'extension')
        # used to sit here but was disabled; only the layers are fetched.
        response = api.get({'id': entry['id'], 'keys': ['layers']})
        entry['subtitles'] = response['data']['layers']['subtitles']

    return found

def get_subtitles(items, id):
    """Return a deep copy of the subtitles of the first item matching ``id``.

    items: list of dicts carrying at least 'title' and 'subtitles'.
    id: prefix the item's title has to start with.

    Returns an independent copy of the matching item's 'subtitles' value, so
    callers can modify it freely, or None when no title starts with ``id``.
    """
    match = next(
        (entry for entry in items if entry['title'].startswith(id)),
        None,
    )
    if match is None:
        return None
    return deepcopy(match['subtitles'])

def render_subtitles(item_json, output_json, output_srt, lang):
    """Render the subtitles of one item to JSON and, optionally, SRT.

    The item description is loaded from ``item_json``. For every entry of its
    'vocals' list that is not marked 'blank', the subtitles are looked up with
    ``get_subtitles`` (keyed by the first two characters of the clip's file
    name) and shifted by the running position of the clip inside the item.
    Subtitles sharing the same in/out times are collected into one entry.

    NOTE: the lookup reads the module-level ``items`` list, which this file
    only assigns in its ``__main__`` section.

    item_json: path of the JSON file describing the item; it needs a 'vocals'
        list whose clips have 'duration' and, unless 'blank', a 'path'.
    output_json: path the resulting list of subtitles is dumped to.
    output_srt: path for the ``ox.srt.encode`` rendering, or a false value to
        skip it.
    lang: a single language code (str) to keep only that language, or a list
        of language codes to combine them; the first code of the list is
        placed on top. The overlap merging below reads ``lang[0]`` and
        ``lang[1]``, i.e. it is written for a list of two codes.

    Subtitle values without a ``<span lang="..">`` wrapper count as 'en'.
    """
    with open(item_json) as fd:
        item = json.load(fd)

    # Compiled once here instead of once per subtitle.
    lang_pattern = re.compile('span lang="(..)"')
    multilingual = isinstance(lang, list)

    subtitles = []
    position = 0
    subs = {}
    for clip in item['vocals']:
        if not clip.get('blank'):
            # vocals/A/A4_chaton.wav
            clip_id = clip['path'].split('/')[-1][:2]
            # get_subtitles returns None when no title matches; treat such a
            # clip as having no subtitles instead of failing on None.sort().
            clip_subtitles = get_subtitles(items, clip_id) or []
            clip_subtitles.sort(key=lambda c: (c['in'], c['out'], c['id']))

            for sub in clip_subtitles:
                # Millisecond-rounded times double as the grouping key.
                sub_in = float('%0.3f' % (sub['in'] + position))
                sub_out = float('%0.3f' % (sub['out'] + position))
                sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
                if sub_id not in subs:
                    subs[sub_id] = {
                        'in': sub_in,
                        'out': sub_out,
                        'value': [],
                    }
                    if multilingual:
                        subs[sub_id]['ids'] = []
                        subs[sub_id]['languages'] = []

                slang = lang_pattern.findall(sub['value'])
                value = sub['value'].replace('<br>', '').strip()
                if slang:
                    slang = slang[0]
                    value = value.replace('<span lang="' + slang + '">', '').replace('</span>', '').strip()
                else:
                    slang = 'en'
                # just use strip_tags?
                # value = ox.strip_tags(ox.decode_html(sub['value']))
                if multilingual and slang in lang:
                    # Text of the first requested language goes on top.
                    if lang.index(slang) == 0:
                        subs[sub_id]['value'].insert(0, value)
                    else:
                        subs[sub_id]['value'].append(value)
                    subs[sub_id]['ids'].append(sub['id'])
                    subs[sub_id]['languages'].append(slang)
                elif slang == lang:
                    subs[sub_id]['value'].append(value)
        # Blank clips carry no subtitles but still take up time.
        position += clip['duration']

    if multilingual:
        # Entries that have text but not in every requested language: their
        # timing differs between languages, so overlapping entries are split
        # at each other's boundaries and merged per segment.
        fixme = [sub for sub in subs.values() if set(sub['languages']) != set(lang) and sub['value']]
        if fixme:
            remove = []
            for key, sub in list(subs.items()):
                intersections = []
                # Fresh snapshot: segments added below take part as well.
                for s in list(subs.values()):
                    intersections += overlaps(sub, s)
                if intersections:
                    points = sorted(set([sub['in'], sub['out']] + intersections))
                    sub_in = points[0]
                    for sub_out in points[1:]:
                        sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)
                        if sub_id not in subs:
                            subs[sub_id] = {
                                'in': sub_in,
                                'out': sub_out,
                                'value': [],
                                'ids': [],
                                'languages': []
                            }
                        if set(subs[sub_id]['languages']) != set(lang):
                            if not subs[sub_id]['languages']:
                                subs[sub_id]['value'] += sub['value']
                                subs[sub_id]['languages'] += sub['languages']
                                subs[sub_id]['ids'] += sub['ids']
                            # The ``sub['languages'] and`` guards skip entries
                            # without text in any requested language; without
                            # them ``sub['languages'][0]`` raises IndexError.
                            elif sub['languages'] \
                                    and subs[sub_id]['languages'] == [lang[0]] \
                                    and sub['languages'][0] not in subs[sub_id]['languages']:
                                subs[sub_id]['value'].append(sub['value'][0])
                                subs[sub_id]['languages'].append(sub['languages'][0])
                                subs[sub_id]['ids'] += sub['ids']
                            elif sub['languages'] \
                                    and subs[sub_id]['languages'] == [lang[1]] \
                                    and sub['languages'][0] not in subs[sub_id]['languages']:
                                subs[sub_id]['value'].insert(0, sub['value'][0])
                                subs[sub_id]['languages'].insert(0, sub['languages'][0])
                                subs[sub_id]['ids'] += sub['ids']

                        sub_in = sub_out
                    remove.append(key)
            # Entries that were split and still hold a single language are
            # dropped; their text lives on in the segments.
            for key in remove:
                if len(subs[key]['languages']) == 1:
                    del subs[key]
            # Drop slivers of at most ~40 ms left over from the splitting.
            for key, sub in list(subs.items()):
                if abs(sub['out'] - sub['in']) <= 0.040001:
                    del subs[key]
    subs = sorted(subs.values(), key=lambda c: (c['in'], c['out']))
    for sub in subs:
        sub['value'] = '\n'.join(sub['value'])
        # Entries without any text in the requested language(s) are left out.
        if sub['value'].strip():
            subtitles.append(sub)

    if output_srt:
        with open(output_srt, 'wb') as fd:
            fd.write(ox.srt.encode(subtitles))
    with open(output_json, 'w') as fd:
        json.dump(subtitles, fd, indent=4, ensure_ascii=False, sort_keys=True)


def overlaps(src, other):
    """Return the boundaries of ``other`` that fall inside ``src``.

    Both arguments are dicts with numeric 'in' and 'out'. The comparison is
    done on values rounded to two decimals, while the returned points are the
    unrounded values taken from ``other``.

    Returns an empty list when both intervals are equal after rounding;
    otherwise a list with ``other['in']`` if it lies in [src in, src out)
    and ``other['out']`` if it lies in (src in, src out], in that order.
    """
    def rounded(value):
        # Two-decimal rounding hides small timing differences.
        return float('%0.2f' % value)

    start, end = rounded(src['in']), rounded(src['out'])
    other_start, other_end = rounded(other['in']), rounded(other['out'])

    if start == other_start and end == other_end:
        # Same interval: nothing to split.
        return []

    cuts = []
    if start <= other_start < end:
        cuts.append(other['in'])
    if start < other_end <= end:
        cuts.append(other['out'])
    return cuts


if __name__ == '__main__':
    # ``items`` is deliberately a module-level name: render_subtitles() reads
    # it. Reuse the cached dump when it exists, otherwise fetch the subtitles
    # through the API and cache them for the next run.
    if os.path.exists('subtitles.json'):
        # Context manager so the file handle is closed after loading.
        with open('subtitles.json') as fd:
            items = json.load(fd)
    else:
        items = update_subtitles()
        with open('subtitles.json', 'w') as fd:
            json.dump(items, fd, indent=4, ensure_ascii=False, sort_keys=True)

    # Item files come from the command line, or default to everything below
    # output/.
    if len(sys.argv) > 1:
        files = sys.argv[1:]
    else:
        files = glob('output/*/*.json')
    for item_json in files:
        # output/<dir>/<Name>.json -> public/<first letter, lowercased><dir>.
        parts = item_json.split('/')
        prefix = 'public/' + parts[-1][0].lower() + parts[-2] + '.'
        # Two-language renderings, each as JSON plus SRT.
        output_json = prefix + '1080p.json'
        output_srt = prefix + '1080p.srt'
        render_subtitles(item_json, output_json, output_srt, ['ko', 'en'])
        output_json = prefix + 'no-en.json'
        output_srt = prefix + 'no-en.srt'
        render_subtitles(item_json, output_json, output_srt, ['no', 'en'])
        # Single-language renderings, JSON only.
        for lang in ('en', 'ko', 'no'):
            output_json = prefix + lang + '.json'
            render_subtitles(item_json, output_json, None, lang)
render subtitles 2017-05-16 12:59:32 +00:00			`#!/usr/bin/python3`
			`import os`
			`import sys`
			`import json`
multi lingual 2017-08-25 09:12:32 +00:00			`import re`
render subtitles 2017-05-16 12:59:32 +00:00			`import subprocess`
			`from collections import defaultdict`
			`import string`
			`from glob import glob`
			`from copy import deepcopy`

			`import ox`
			`import ox.web.auth`


			`base_url = 'http://127.0.0.1:2620'`

			`FRAME_DURATION = 1/60`
			`MAX_DURATION = 40`

			`HIDDEN_TAGS = [`
			`"women with white males",`
			`"gene z hanrahan"`
			`]`

			`# items to not use at all`
			`BLACKLIST = [`
			`'XN'`
			`]`

			`api = None`

			`def get_api():`
			`global api`
			`if not api:`
			`api = ox.API(base_url + '/api/')`
			`api.signin(**ox.web.auth.get('cdosea'))`


typos 2017-05-20 15:32:51 +00:00			`def update_subtitles():`
render subtitles 2017-05-16 12:59:32 +00:00			`get_api()`
			`items = api.find({`
			`'query': {`
			`'conditions': [{'key': 'tags', 'value': 'Vocal', 'operator': '=='}]`
			`},`
			`'keys': ['id', 'title'],`
			`'range': [0, 1000]})['data']['items']`
			`for item in items:`
			`'''`
			`info = api.findMedia({`
			`'query': {`
			`'conditions': [`
			`{'key': 'id', 'operator': '==', 'value': item['id']}`
			`]`
			`},`
			`'keys': ['id', 'extension'],`
			`'range': [0, 1]`
			`})['data']['items'][0]`
			`'''`
			`item['subtitles'] = api.get({'id': item['id'], 'keys': ['layers']})['data']['layers']['subtitles']`

			`return items`

			`def get_subtitles(items, id):`
			`for item in items:`
			`if item['title'].startswith(id):`
			`return deepcopy(item['subtitles'])`

multi lingual 2017-08-25 09:12:32 +00:00			`def render_subtitles(item_json, output_json, output_srt, lang):`
render subtitles 2017-05-16 12:59:32 +00:00			`with open(item_json) as fd:`
			`item = json.load(fd)`

			`subtitles = []`
			`position = 0`
subs 2017-05-21 11:55:48 +00:00			`subs = {}`
render subtitles 2017-05-16 12:59:32 +00:00			`for clip in item['vocals']:`
			`if not clip.get('blank'):`
			`# vocals/A/A4_chaton.wav`
			`id = clip['path'].split('/')[-1][:2]`
			`clip_subtitles = get_subtitles(items, id)`
subs 2017-05-21 11:55:48 +00:00			`clip_subtitles.sort(key=lambda c: (c['in'], c['out'], c['id']))`

render subtitles 2017-05-16 12:59:32 +00:00			`for sub in clip_subtitles:`
korean 2017-05-21 20:48:54 +00:00			`sub_in = float('%0.3f' % (sub['in'] + position))`
subs 2017-05-21 11:55:48 +00:00			`sub_out = float('%0.3f' % (sub['out'] + position))`
			`sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)`
			`if sub_id not in subs:`
			`subs[sub_id] = {`
			`'in': sub_in,`
			`'out': sub_out,`
			`'value': [],`
			`}`
multi lingual 2017-08-25 09:12:32 +00:00			`if isinstance(lang, list):`
less 2017-06-17 16:02:14 +00:00			`subs[sub_id]['ids'] = []`
multi lingual 2017-08-25 09:12:32 +00:00			`subs[sub_id]['languages'] = []`

			`slang = re.compile('span lang="(..)"').findall(sub['value'])`
			`value = sub['value'].replace('<br>', '').strip()`
			`if slang:`
			`slang = slang[0]`
			`value = value.replace('<span lang="' + slang + '">', '').replace('</span>', '').strip()`
			`else:`
			`slang = 'en'`
korean 2017-05-21 20:48:54 +00:00			`# just use strip_tags?`
			`# value = ox.strip_tags(ox.decode_html(sub['value']))`
multi lingual 2017-08-25 09:12:32 +00:00			`if isinstance(lang, list) and slang in lang:`
			`if lang.index(slang) == 0:`
refactor subtitles 2017-06-17 16:00:24 +00:00			`subs[sub_id]['value'].insert(0, value)`
			`else:`
			`subs[sub_id]['value'].append(value)`
			`subs[sub_id]['ids'].append(sub['id'])`
multi lingual 2017-08-25 09:12:32 +00:00			`subs[sub_id]['languages'].append(slang)`
			`elif slang == lang:`
korean first 2017-05-21 21:14:15 +00:00			`subs[sub_id]['value'].append(value)`
render subtitles 2017-05-16 12:59:32 +00:00			`position += clip['duration']`

multi lingual 2017-08-25 09:12:32 +00:00			`if isinstance(lang, list):`
overlapping subtitles 2017-08-29 14:59:15 +00:00			`#fixme = [sub for sub in subs.values() if [s for s in list(subs.values())]]`
			`fixme = [sub for sub in subs.values() if set(sub['languages']) != set(lang) and sub['value']]`
multi lingual 2017-08-25 09:12:32 +00:00			`if fixme:`
overlapping subtitles 2017-08-29 14:59:15 +00:00			`remove = []`
			`for key, sub in list(subs.items()):`
			`intersections = []`
			`for s in list(subs.values()):`
			`intersections += overlaps(sub, s)`
			`if intersections:`
			`points = list(sorted(set([sub['in'], sub['out']] + intersections)))`
			`#print(points, sub['value'])`
			`sub_in = points[0]`
			`for sub_out in points[1:]:`
			`sub_id = '%0.3f-%0.3f' % (sub_in, sub_out)`
			`if sub_id not in subs:`
			`subs[sub_id] = {`
			`'in': sub_in,`
			`'out': sub_out,`
			`'value': [],`
			`'ids': [],`
			`'languages': []`
			`}`
			`if set(subs[sub_id]['languages']) != set(lang):`
			`if not subs[sub_id]['languages']:`
			`subs[sub_id]['value'] += sub['value']`
			`subs[sub_id]['languages'] += sub['languages']`
			`subs[sub_id]['ids'] += sub['ids']`
			`elif subs[sub_id]['languages'] == [lang[0]] \`
			`and sub['languages'][0] not in subs[sub_id]['languages']:`
			`subs[sub_id]['value'].append(sub['value'][0])`
			`subs[sub_id]['languages'].append(sub['languages'][0])`
			`subs[sub_id]['ids'] += sub['ids']`
			`elif subs[sub_id]['languages'] == [lang[1]] \`
			`and sub['languages'][0] not in subs[sub_id]['languages']:`
			`subs[sub_id]['value'].insert(0, sub['value'][0])`
			`subs[sub_id]['languages'].insert(0, sub['languages'][0])`
			`subs[sub_id]['ids'] += sub['ids']`
			`#else:`
			`# print('WTF', sub['languages'], subs[sub_id]['languages'])`

			`sub_in = sub_out`
			`remove.append(key)`
			`#for key, sub in list(subs.items()):`
			`# if len(sub['languages']) == 1:`
			`# del subs[key]`
			`for key in remove:`
			`if len(subs[key]['languages']) == 1:`
			`del subs[key]`
			`for key, sub in list(subs.items()):`
			`if abs(sub['out'] - sub['in']) <= 0.040001:`
			`del subs[key]`
			`subs = sorted(subs.values(), key=lambda c: (c['in'], c['out']))`
			`for sub in subs:`
			`sub['value'] = '\n'.join(sub['value'])`
			`if sub['value'].strip():`
			`subtitles.append(sub)`
multi lingual 2017-08-25 09:12:32 +00:00
refactor subtitles 2017-06-17 16:00:24 +00:00			`if output_srt:`
			`with open(output_srt, 'wb') as fd:`
			`fd.write(ox.srt.encode(subtitles))`
render subtitles 2017-05-16 12:59:32 +00:00			`with open(output_json, 'w') as fd:`
sort_keys=True 2017-06-17 16:03:13 +00:00			`json.dump(subtitles, fd, indent=4, ensure_ascii=False, sort_keys=True)`
render subtitles 2017-05-16 12:59:32 +00:00
korean 2017-05-21 20:48:54 +00:00
overlapping subtitles 2017-08-29 14:59:15 +00:00			`def overlaps(src, other):`
rounding issues 2017-08-30 14:10:35 +00:00			`src_in = float('%0.2f' % src['in'])`
			`src_out = float('%0.2f' % src['out'])`
			`other_in = float('%0.2f' % other['in'])`
			`other_out = float('%0.2f' % other['out'])`
overlapping subtitles 2017-08-29 14:59:15 +00:00			`points = []`
			`if src_in != other_in or src_out != other_out:`
			`# src inside`
			`if other_in >= src_in and other_in < src_out:`
			`points += [other['in']]`
			`if other_out > src_in and other_out <= src_out:`
			`points += [other['out']]`
			`return points`

rounding issues 2017-08-30 14:10:35 +00:00
render subtitles 2017-05-16 12:59:32 +00:00			`if __name__ == '__main__':`
			`if os.path.exists('subtitles.json'):`
			`items = json.load(open('subtitles.json'))`
			`else:`
typos 2017-05-20 15:32:51 +00:00			`items = update_subtitles()`
render subtitles 2017-05-16 12:59:32 +00:00			`with open('subtitles.json', 'w') as fd:`
sort_keys=True 2017-06-17 16:03:13 +00:00			`json.dump(items, fd, indent=4, ensure_ascii=False, sort_keys=True)`
render subtitles 2017-05-16 12:59:32 +00:00
update subtitles 2017-05-17 10:15:28 +00:00			`if len(sys.argv) > 1:`
korean 2017-05-21 20:48:54 +00:00			`files = sys.argv[1:]`
update subtitles 2017-05-17 10:15:28 +00:00			`else:`
			`files = glob('output//.json')`
			`for item_json in files:`
refactor subtitles 2017-06-17 16:00:24 +00:00			`prefix = 'public/' + item_json.split('/')[-1][0].lower() + item_json.split('/')[-2] + '.'`
			`output_json = prefix + '1080p.json'`
multi lingual 2017-08-25 09:12:32 +00:00			`output_srt = prefix + '1080p.srt'`
			`render_subtitles(item_json, output_json, output_srt, ['ko', 'en'])`
			`output_json = prefix + 'no-en.json'`
			`output_srt = prefix + 'no-en.srt'`
			`render_subtitles(item_json, output_json, output_srt, ['no', 'en'])`
			`for lang in ('en', 'ko', 'no'):`
refactor subtitles 2017-06-17 16:00:24 +00:00			`output_json = prefix + lang + '.json'`
			`render_subtitles(item_json, output_json, None, lang)`