"""Print an overview of the keyword annotations on cdosea.0x2620.org.

The script signs in to the site API, downloads every annotation of the
'keywords' layer and prints:

* known tags (from ``KEYWORDS``) that are not used by any clip,
* per ``KEYWORDS`` key: clip/tag counts, total duration and a breakdown of
  the clips into 10 buckets ordered by clip duration,
* per known tag: clip count, total duration and a histogram of rounded
  clip durations,
* clips with zero duration and tags that are not listed in ``KEYWORDS``.
"""
import json
import itertools
from collections import Counter

import ox
import ox.web.auth

from keywords import KEYWORDS


def splitint(number, by):
    """Split the integer *number* into *by* integer parts that sum to it.

    The parts differ by at most one; the ``number % by`` larger parts are
    placed at the end of the returned list.

    >>> splitint(23, 10)
    [2, 2, 2, 2, 2, 2, 2, 3, 3, 3]
    >>> splitint(3, 10)
    [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]
    """
    # divmod keeps everything in integer arithmetic (no float rounding).
    div, mod = divmod(number, by)
    return [div + 1 if i >= by - mod else div for i in range(by)]


known_tags = set(itertools.chain.from_iterable(KEYWORDS.values()))

api = ox.API('https://cdosea.0x2620.org/api/')
api.signin(**ox.web.auth.get('cdosea'))

query = {
    'conditions': [
        {'key': 'layer', 'operator': '==', 'value': 'keywords'},
    ],
    'operator': '&'
}

# NOTE(review): without 'keys' the response's data.items is used as the number
# of matching annotations (it is compared against the offset below) -- confirm
# against the API documentation.
total = api.findAnnotations({
    'query': query
})['data']['items']

# Fetch all annotations page by page.
# NOTE(review): assumes 'range' is [start, end) so consecutive pages do not
# overlap -- confirm against the API documentation.
chunk = 1000
clips = []
for offset in range(0, total, chunk):
    clips += api.findAnnotations({
        'query': query,
        'keys': ['id', 'in', 'out', 'value'],
        'range': [offset, offset + chunk]})['data']['items']

# Zero-length clips are reported separately and excluded from all statistics.
clips0 = [c for c in clips if c['out'] == c['in']]
clips = [c for c in clips if c['out'] != c['in']]

tags = {c['value'] for c in clips}

# Sorted so the report is reproducible between runs.
unknown_tags = sorted(t for t in tags if t not in known_tags)
missing_tags = sorted(t for t in known_tags if t not in tags)

if missing_tags:
    print('missing tags\n', ', '.join(missing_tags))

for letter in sorted(KEYWORDS):
    print('\n\n')
    letter_keywords = set(KEYWORDS[letter])
    letter_clips = [c for c in clips if c['value'] in letter_keywords]
    letter_tags = {c['value'] for c in letter_clips}
    # A list, not a set: clips of equal length must each count towards the sum.
    durations = [c['out'] - c['in'] for c in letter_clips]
    duration = ox.format_duration(sum(durations) * 1000)
    print('%s: %s clips with %s of %s tags total duration: %s' % (
        letter, len(letter_clips), len(letter_tags), len(KEYWORDS[letter]), duration))
    if len(letter_tags) != len(KEYWORDS[letter]):
        print('missing tags:', ', '.join(sorted(letter_keywords - letter_tags)))

    if not letter_clips:
        # Nothing to distribute into buckets.
        continue

    # Distribute the clips, ordered by duration, over 10 buckets of
    # (almost) equal size; buckets are numbered 1..10.
    letter_clips.sort(key=lambda c: c['out'] - c['in'])
    sizes = splitint(len(letter_clips), 10)
    buckets = {}
    p = 0
    for number, size in enumerate(sizes, start=1):
        buckets[number] = letter_clips[p:p + size]
        p += size

    for number, bucket in buckets.items():
        print(number)
        if not bucket:
            # With fewer than 10 clips some buckets stay empty; min()/max()
            # would raise ValueError on them.
            print('\t', 0, 'clips', 0, 'tags')
            continue
        bucket_tags = {c['value'] for c in bucket}
        bucket_durations = [c['out'] - c['in'] for c in bucket]
        dmin = min(bucket_durations)
        dmax = max(bucket_durations)
        print('\t', len(bucket), 'clips', len(bucket_tags), 'tags', 'durations from %.3f to %.3f' % (dmin, dmax))

        unused_tags = letter_tags - bucket_tags
        if unused_tags:
            print('\t', 'used tags:', ', '.join(sorted(bucket_tags)))
            print('\t', 'missing tags:', ', '.join(sorted(unused_tags)))


for tag in sorted(known_tags):
    tag_clips = [c for c in clips if c['value'] == tag]
    if tag_clips:
        # A list, not a set: every clip has to show up in the total and in
        # the histogram, even when several clips share the same duration.
        durations = [c['out'] - c['in'] for c in tag_clips]
        duration = ox.format_duration(sum(durations) * 1000)
        print('\n\n%s - %d clips total duration: %s' % (tag, len(tag_clips), duration))

        # Histogram: rounded duration -> number of clips.
        distribution = Counter(round(d) for d in durations)
        distribution = '\n\t'.join(
            '% 2d - %s clips' % (rounded, count)
            for rounded, count in sorted(distribution.items()))
        print('\t' + distribution)

if clips0:
    print('\n\n%d clips with 0 duration' % len(clips0))
    print('\t' + '\n\t'.join(sorted(['https://cdosea.0x2620.org/' + c['id'] + ' ' + c['value'] for c in clips0])))
if unknown_tags:
    print('\n\nunknown tags\n', ', '.join(unknown_tags))