#!/usr/bin/python3
"""Print statistics about the annotations in the 'keywords' layer.

The script signs in to a local API endpoint through the ``ox`` client,
fetches every annotation of the 'keywords' layer and prints:

* known tags (from ``KEYWORDS``) that no clip uses,
* per letter of ``KEYWORDS``: clip/tag counts, total duration and a
  breakdown of the clips into 10 buckets ordered by duration,
* per known tag: clip count, total duration and a histogram of
  durations rounded to whole numbers,
* clips whose in and out points are equal,
* tags used by clips that are not listed in ``KEYWORDS``.
"""
import json
import itertools
from collections import Counter

import ox
import ox.web.auth

from keywords import KEYWORDS


def splitint(number, by):
    """Split ``number`` into ``by`` integer parts that add up to ``number``.

    The parts differ by at most one; the remainder is handed out to the
    last parts of the list, e.g. ``splitint(13, 10)`` returns seven 1s
    followed by three 2s.
    """
    # divmod keeps the arithmetic in integers; int(number / by) went
    # through a float and loses precision for very large values.
    div, mod = divmod(number, by)
    return [div + 1 if i > (by - 1 - mod) else div for i in range(by)]


def _clip_duration(clip):
    """Return the length of a clip: its 'out' minus its 'in' value."""
    return clip['out'] - clip['in']


known_tags = set(itertools.chain.from_iterable(KEYWORDS.values()))

api = ox.API('http://127.0.0.1:2620/api/')
api.signin(**ox.web.auth.get('cdosea'))

query = {
    'conditions': [
        {'key': 'layer', 'operator': '==', 'value': 'keywords'},
    ],
    'operator': '&'
}
# Without 'keys'/'range' the response's 'items' is used as the total
# number of matching annotations.
total = api.findAnnotations({
    'query': query
})['data']['items']

# Fetch the annotations page by page.
# NOTE(review): assumes the end of 'range' is exclusive - confirm against
# the API, otherwise consecutive pages overlap by one item.
offset = 0
chunk = 1000
clips = []
while offset < total:
    clips += api.findAnnotations({
        'query': query,
        'keys': ['id', 'in', 'out', 'value'],
        'range': [offset, offset + chunk]
    })['data']['items']
    offset += chunk

# Zero-length clips are reported separately and kept out of the statistics.
clips0 = [c for c in clips if c['out'] == c['in']]
clips = [c for c in clips if c['out'] != c['in']]

tags = {c['value'] for c in clips}
unknown_tags = [t for t in tags if t not in known_tags]
missing_tags = [t for t in known_tags if t not in tags]

if missing_tags:
    print('missing tags\n', ', '.join(sorted(missing_tags)))

for letter in sorted(KEYWORDS):
    print('\n\n')
    letter_clips = [c for c in clips if c['value'] in KEYWORDS[letter]]
    letter_tags = {c['value'] for c in letter_clips}
    # A list, not a set: clips of equal length must each count towards
    # the total duration.
    durations = [_clip_duration(c) for c in letter_clips]
    duration = ox.format_duration(sum(durations) * 1000)
    print('%s: %s clips with %s of %s tags total duration: %s' % (
        letter, len(letter_clips), len(letter_tags),
        len(KEYWORDS[letter]), duration))
    unused_letter_tags = set(KEYWORDS[letter]) - letter_tags
    if unused_letter_tags:
        print('missing tags:', ', '.join(sorted(unused_letter_tags)))

    # Distribute the clips, shortest first, over 10 buckets of (almost)
    # equal size, numbered 1 to 10.
    buckets = {}
    letter_clips.sort(key=_clip_duration)
    sizes = splitint(len(letter_clips), 10)
    p = 0
    for i, bucket_size in enumerate(sizes):
        buckets[i + 1] = letter_clips[p:p + bucket_size]
        p += bucket_size

    for number, bucket_clips in buckets.items():
        print(number)
        if not bucket_clips:
            # With fewer than 10 clips some buckets stay empty; min() and
            # max() below would raise ValueError on them.
            print('\t', '0 clips')
            continue
        bucket_tags = Counter(c['value'] for c in bucket_clips)
        bucket_durations = [_clip_duration(c) for c in bucket_clips]
        dmin = min(bucket_durations)
        dmax = max(bucket_durations)
        print('\t', len(bucket_clips), 'clips', len(bucket_tags), 'tags',
              'durations from %.3f to %.3f' % (dmin, dmax))
        # Most used tags first, ties in alphabetical order.
        used_tags = [
            '%s (%d)' % (t, bucket_tags[t])
            for t in sorted(bucket_tags, key=lambda t: (-bucket_tags[t], t))
        ]
        print('\t', 'used tags:', ', '.join(used_tags))
        unused_bucket_tags = letter_tags - set(bucket_tags)
        if unused_bucket_tags:
            print('\t', 'missing tags:', ', '.join(sorted(unused_bucket_tags)))

for tag in sorted(known_tags):
    tag_clips = [c for c in clips if c['value'] == tag]
    if tag_clips:
        # A list again, so that both the total and the histogram count
        # every clip instead of every distinct duration.
        durations = [_clip_duration(c) for c in tag_clips]
        duration = ox.format_duration(sum(durations) * 1000)
        print('\n\n%s - %d clips total duration: %s' % (tag, len(tag_clips), duration))
        distribution = Counter(round(d) for d in durations)
        distribution = '\n\t'.join(
            '% 2d - %s clips' % (rounded, count)
            for rounded, count in sorted(distribution.items()))
        print('\t' + distribution)

if clips0:
    print('\n\n%d clips with 0 duration' % len(clips0))
    print('\t' + '\n\t'.join(sorted([
        'https://cdosea.0x2620.org/' + c['id'] + ' ' + c['value']
        for c in clips0
    ])))

if unknown_tags:
    print('\n\nunknown tags\n', ', '.join(sorted(unknown_tags)))