103 lines
3.4 KiB
Executable file
103 lines
3.4 KiB
Executable file
import json
import itertools
from collections import Counter
import ox
import ox.web.auth
from keywords import KEYWORDS
def splitint(number, by):
    """Split the integer *number* into *by* integer parts that sum to it.

    The parts differ by at most one and the larger parts come last, e.g.
    ``splitint(10, 3) == [3, 3, 4]``.

    Raises ZeroDivisionError when *by* is 0.
    """
    # divmod keeps the arithmetic in exact integers. The previous
    # int(number / by) went through a float, which loses precision for
    # large values and truncates toward zero for negative ones, so the
    # parts no longer added up to *number*.
    div, mod = divmod(number, by)
    # The last `mod` positions each take one extra unit of the remainder.
    return [div + 1 if i > (by - 1 - mod) else div for i in range(by)]
# Every tag listed under any key of KEYWORDS, flattened into one set.
known_tags = set(itertools.chain.from_iterable(KEYWORDS.values()))
# API client created with an empty string argument.
# NOTE(review): presumably the API URL belongs here — confirm before running.
api = ox.API('')
# Search query: a single condition matching layer == 'keywords'.
query = {
'conditions': [
{'key': 'layer', 'operator': '==', 'value': 'keywords'},
# NOTE(review): the closing bracket of the conditions list is not visible
# in this view — confirm against the original file.
'operator': '&'
# NOTE(review): the closing brace of the query dict is not visible here.
# Request issued with the query only (no 'keys' or 'range').
# NOTE(review): `total` is later compared with a numeric offset, so it is
# presumably the number of matching annotations; the end of this statement
# (closing brackets and any subscript on the response) is not visible in
# this view — confirm.
total = api.findAnnotations({
'query': query
# Page through all matching annotations, asking for `chunk` items per request.
chunk = 1000
clips = []
offset = 0
while offset < total:
    page = api.findAnnotations({
        'query': query,
        'keys': ['id', 'in', 'out', 'value'],
        'range': [offset, offset + chunk],
    })
    clips += page['data']['items']
    offset += chunk

# Set aside annotations whose in and out points are equal; only the
# remaining ones stay in `clips`.
clips0 = []
timed_clips = []
for c in clips:
    if c['out'] == c['in']:
        clips0.append(c)
    else:
        timed_clips.append(c)
clips = timed_clips
# Compare the tags found on clips with the tags configured in KEYWORDS.
tags = set()
for c in clips:
    tags.add(c['value'])

# Tags on clips that KEYWORDS does not list, and listed tags no clip uses.
unknown_tags = sorted(tags - known_tags)
missing_tags = sorted(known_tags - tags)

if missing_tags:
    print('missing tags\n', ', '.join(missing_tags))
# Per-letter report: overall clip/tag counts and total duration, followed by
# the same statistics for ten duration-ordered buckets of that letter's clips.
for letter in sorted(KEYWORDS):
    letter_keywords = set(KEYWORDS[letter])
    letter_clips = [c for c in clips if c['value'] in letter_keywords]
    letter_tags = {c['value'] for c in letter_clips}
    # A list, not a set: clips of equal length must each count toward the
    # total, and a set would silently collapse them into one.
    durations = [c['out'] - c['in'] for c in letter_clips]
    # NOTE(review): the * 1000 presumably converts seconds to the
    # milliseconds ox.format_duration expects — confirm.
    duration = ox.format_duration(sum(durations) * 1000)
    print('%s: %s clips with %s of %s tags total duration: %s' % (
        letter, len(letter_clips), len(letter_tags), len(KEYWORDS[letter]), duration))
    unused_keywords = letter_keywords - letter_tags
    if unused_keywords:
        # Sorted so the output is stable, like the other tag listings.
        print('missing tags:', ', '.join(sorted(unused_keywords)))

    # Order clips from shortest to longest and cut them into ten buckets of
    # nearly equal size.
    letter_clips.sort(key=lambda c: c['out'] - c['in'])
    buckets = []
    start = 0
    for bucket_size in splitint(len(letter_clips), 10):
        buckets.append(letter_clips[start:start + bucket_size])
        start += bucket_size

    for bucket in buckets:
        bucket_tags = Counter(c['value'] for c in bucket)
        if bucket:
            bucket_durations = [c['out'] - c['in'] for c in bucket]
            span = 'durations from %.3f to %.3f' % (
                min(bucket_durations), max(bucket_durations))
        else:
            # With fewer than ten clips some buckets are empty; min()/max()
            # would raise ValueError on them.
            span = 'no durations'
        print('\t', len(bucket), 'clips', len(bucket_tags), 'tags', span)
        # Most frequent tags first, ties broken alphabetically.
        used_tags = [
            '%s (%d)' % (t, bucket_tags[t])
            for t in sorted(bucket_tags, key=lambda t: (-bucket_tags[t], t))
        ]
        print('\t', 'used tags:', ', '.join(used_tags))
        absent_tags = letter_tags - set(bucket_tags)
        if absent_tags:
            print('\t', 'missing tags:', ', '.join(sorted(absent_tags)))
# Per-tag report: clip count, total duration and a histogram of clip
# durations rounded to whole numbers.
# Group the durations by tag in a single pass instead of rescanning every
# clip for every tag.
durations_by_tag = {}
for c in clips:
    durations_by_tag.setdefault(c['value'], []).append(c['out'] - c['in'])

for tag in sorted(known_tags):
    # Kept as a list: a set would merge clips of identical length and so
    # understate both the total duration and the per-duration clip counts.
    durations = durations_by_tag.get(tag)
    if not durations:
        continue
    duration = ox.format_duration(sum(durations) * 1000)
    print('\n\n%s - %d clips total duration: %s' % (tag, len(durations), duration))
    distribution = Counter(round(d) for d in durations)
    distribution = '\n\t'.join(
        '% 2d - %s clips' % (rounded, count)
        for rounded, count in sorted(distribution.items())
    )
    print('\t' + distribution)
# List every zero-duration clip as "<url> <tag>", one per line, sorted.
if clips0:
    print('\n\n%d clips with 0 duration' % len(clips0))
    entries = []
    for c in clips0:
        entries.append('https://cdosea.0x2620.org/' + c['id'] + ' ' + c['value'])
    entries.sort()
    print('\t' + '\n\t'.join(entries))

# Tags found on clips that are not listed in KEYWORDS.
if unknown_tags:
    listing = ', '.join(sorted(unknown_tags))
    print('\n\nunknown tags\n', listing)