2017-02-25 10:24:02 +00:00
|
|
|
#!/usr/bin/python3
|
2017-02-25 00:55:24 +00:00
|
|
|
import json
|
|
|
|
import itertools
|
|
|
|
from collections import Counter
|
|
|
|
|
|
|
|
import ox
|
|
|
|
import ox.web.auth
|
|
|
|
|
|
|
|
from keywords import KEYWORDS
|
|
|
|
|
|
|
|
def splitint(number, by):
    """Split the integer ``number`` into ``by`` integer parts.

    The parts differ by at most one and always sum to ``number``; the
    larger parts are placed at the end of the list.

    >>> splitint(23, 10)
    [2, 2, 2, 2, 2, 2, 2, 3, 3, 3]

    :param number: integer to distribute
    :param by: number of parts (raises ZeroDivisionError when 0)
    :return: list of ``by`` integers
    """
    # divmod keeps everything in integer arithmetic; int(number / by) goes
    # through a float and loses precision for large values (and truncates
    # toward zero for negatives, which disagrees with ``%``).
    div, mod = divmod(number, by)
    # the last ``mod`` parts absorb the remainder, one unit each
    return [div + 1 if i >= by - mod else div for i in range(by)]
|
|
|
|
|
|
|
|
|
|
|
|
# Flatten every value of KEYWORDS into a single set of tags.
known_tags = {tag for group in KEYWORDS.values() for tag in group}

# Open the API and sign in with whatever ox.web.auth.get('cdosea') returns
# (passed through as keyword arguments).
api = ox.API('https://cdosea.0x2620.org/api/')
credentials = ox.web.auth.get('cdosea')
api.signin(**credentials)
|
|
|
|
|
|
|
|
# Select annotations whose layer is 'keywords'.
layer_condition = {'key': 'layer', 'operator': '==', 'value': 'keywords'}
query = {'conditions': [layer_condition], 'operator': '&'}

# Without 'keys' the response's data.items is used as a number below --
# presumably the count of matching annotations; confirm against the API.
count_response = api.findAnnotations({'query': query})
total = count_response['data']['items']

# Fetch the annotations page by page, `chunk` entries per request.
chunk = 1000
offset = 0
clips = []
while offset < total:
    page = api.findAnnotations({
        'query': query,
        'keys': ['id', 'in', 'out', 'value'],
        'range': [offset, offset + chunk],
    })
    clips.extend(page['data']['items'])
    offset += chunk
|
|
|
|
|
|
|
|
# Separate annotations whose in and out points are equal (reported at the
# end of the script) from the ones that span some duration.
clips0 = [c for c in clips if c['out'] == c['in']]
clips = [c for c in clips if c['out'] != c['in']]

# Tags that are actually used by the remaining clips.
tags = {c['value'] for c in clips}

# Sorted so the printed report is stable between runs; plain set iteration
# order changes with string hash randomization.
unknown_tags = sorted(tags - known_tags)
missing_tags = sorted(known_tags - tags)

if missing_tags:
    print('missing tags\n', ', '.join(missing_tags))
|
|
|
|
|
|
|
|
# Per-letter report: how many clips/tags exist for each key of KEYWORDS and
# how the clips spread over ten buckets ordered by clip length.
for letter in sorted(KEYWORDS):
    print('\n\n')
    letter_keywords = set(KEYWORDS[letter])
    letter_clips = [c for c in clips if c['value'] in letter_keywords]
    letter_tags = {c['value'] for c in letter_clips}
    # Sum over every clip: collecting the lengths in a set would collapse
    # clips of equal length and under-report the total.
    total_duration = sum(c['out'] - c['in'] for c in letter_clips)
    duration = ox.format_duration(total_duration * 1000)
    print('%s: %s clips with %s of %s tags total duration: %s' % (letter, len(letter_clips), len(letter_tags), len(KEYWORDS[letter]), duration))
    if len(letter_tags) != len(KEYWORDS[letter]):
        print('missing tags:', ', '.join(sorted(letter_keywords - letter_tags)))

    # Distribute the clips, shortest first, over buckets 1..10.
    letter_clips.sort(key=lambda c: c['out'] - c['in'])
    sizes = splitint(len(letter_clips), 10)
    buckets = {}
    p = 0
    for i, count in enumerate(sizes):
        buckets[i + 1] = letter_clips[p:p + count]
        p += count

    for number in sorted(buckets):
        bucket = buckets[number]
        bucket_tags = {c['value'] for c in bucket}
        print(number)
        if bucket:
            durations = [c['out'] - c['in'] for c in bucket]
            span = 'durations from %.3f to %.3f' % (min(durations), max(durations))
        else:
            # Fewer than ten clips leaves some buckets empty; min()/max()
            # would raise ValueError on them.
            span = 'no durations'
        print('\t', len(bucket), 'clips', len(bucket_tags), 'tags', span)

        # Tags of this letter that do not occur in this bucket.
        unused_tags = letter_tags - bucket_tags
        if unused_tags:
            print('\t', 'used tags:', ', '.join(sorted(bucket_tags)))
            print('\t', 'missing tags:', ', '.join(sorted(unused_tags)))
|
|
|
|
|
|
|
|
|
|
|
|
# Per-tag report: clip count, total duration and a histogram of clip lengths
# rounded to whole numbers.
# Group the clips by tag once instead of rescanning all clips for every tag.
clips_by_tag = {}
for c in clips:
    clips_by_tag.setdefault(c['value'], []).append(c)

for tag in sorted(known_tags):
    tag_clips = clips_by_tag.get(tag, [])
    if tag_clips:
        # A list, not a set: clips of identical length must each count
        # towards the total and towards the distribution below.
        durations = [c['out'] - c['in'] for c in tag_clips]
        duration = ox.format_duration(sum(durations) * 1000)
        print('\n\n%s - %d clips total duration: %s' % (tag, len(tag_clips), duration))

        distribution = Counter(round(d) for d in durations)
        distribution = '\n\t'.join('% 2d - %s clips' % (length, count) for length, count in sorted(distribution.items()))
        print('\t' + distribution)
|
|
|
|
|
|
|
|
# List the annotations whose in and out points coincide, one URL per line.
if clips0:
    base_url = 'https://cdosea.0x2620.org/'
    zero_lines = [base_url + c['id'] + ' ' + c['value'] for c in clips0]
    zero_lines.sort()
    print('\n\n%d clips with 0 duration' % len(clips0))
    print('\t' + '\n\t'.join(zero_lines))

# Tags used by clips but absent from KEYWORDS.
if unknown_tags:
    joined_unknown = ', '.join(unknown_tags)
    print('\n\nunknown tags\n', joined_unknown)
|