From ebf2ba4dbd354536c348a212b6b9060930202abd Mon Sep 17 00:00:00 2001
From: j
Date: Thu, 15 Nov 2018 15:30:49 +0000
Subject: [PATCH] load annotations in chunks

---
Review notes (this section is not part of the commit message):
- A chunk response without 'data' now stops the script with exit status 1
  right after the 'failed' diagnostic. Previously the check fell through to
  r['data']['items'] and died with a KeyError.
- Leading whitespace was reconstructed as 4-space indentation; confirm it
  against the target file before applying.
- The commit id and the post-image blob id in the "index" line are those of
  the original revision; regenerate the patch to refresh them.

 ontology/update_keywords.py | 40 ++++++++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/ontology/update_keywords.py b/ontology/update_keywords.py
index 1b8621e..f829b48 100755
--- a/ontology/update_keywords.py
+++ b/ontology/update_keywords.py
@@ -9,24 +9,40 @@ site = 'pandora.cinemusespace.com'
 api = ox.api.signin('https://%s/api/' % site)
 
 keywords = collections.Counter()
-for annotation in api.findAnnotations({
-    'query': {
+query = {
         'conditions': [{
             'key': 'layer',
             'value': 'keywords',
             'operator': '=='
         }],
         'operator': '&'
-    },
-    'keys': ['id', 'in', 'out', 'value', 'user', 'created'],
-    'range': [0, 500000]
-})['data']['items']:
-    if annotation['id'].startswith('BA/'):
-        continue
-    keyword = annotation['value']
-    if ': ' not in keyword:
-        keyword = 'other: ' + keyword
-    keywords[keyword] += 1
+}
+# 'items' from a query without 'keys' is used as the total to page through.
+count = api.findAnnotations({'query': query})['data']['items']
+position = 0
+chunk = 1000
+
+# Fetch fixed-size ranges instead of one request for everything.
+while position < count:
+    r = api.findAnnotations({
+        'query': query,
+        'keys': ['id', 'in', 'out', 'value', 'user', 'created'],
+        'sort': [{'key': 'public_id', 'operator': '+'}],
+        'range': [position, position+chunk]
+    })
+    if 'data' not in r:
+        print('failed', r)
+        # Abort explicitly: r['data'] below would raise KeyError, and
+        # keywords.json must not be written from partial counts.
+        raise SystemExit(1)
+    for annotation in r['data']['items']:
+        if annotation['id'].startswith('BA/'):
+            continue
+        keyword = annotation['value']
+        if ': ' not in keyword:
+            keyword = 'other: ' + keyword
+        keywords[keyword] += 1
+    position += chunk
 
 with open('keywords.json', 'w') as fd:
     json.dump(keywords, fd, indent=4, ensure_ascii=False, sort_keys=True)