load annotations in chunks

2018-11-15 15:30:49 +00:00 · 2018-11-15 15:30:49 +00:00 · ebf2ba4dbd
commit ebf2ba4dbd
parent 0b5d56ed94
1 changed file with 23 additions and 12 deletions
--- a/ontology/update_keywords.py
+++ b/ontology/update_keywords.py
@@ -9,24 +9,35 @@ site = 'pandora.cinemusespace.com'
 api = ox.api.signin('https://%s/api/' % site)

 keywords = collections.Counter()
-for annotation in api.findAnnotations({
-    'query': {
+query = {
        'conditions': [{
            'key': 'layer',
            'value': 'keywords',
            'operator': '=='
        }],
        'operator': '&'
-    },
-    'keys': ['id', 'in', 'out', 'value', 'user', 'created'],
-    'range': [0, 500000]
-})['data']['items']:
-    if annotation['id'].startswith('BA/'):
-        continue
-    keyword = annotation['value']
-    if ': ' not in keyword:
-        keyword = 'other: ' + keyword
-    keywords[keyword] += 1
+}
+count = api.findAnnotations({'query': query})['data']['items']
+position = 0
+chunk = 1000
+
+while position < count:
+    r = api.findAnnotations({
+        'query': query,
+        'keys': ['id', 'in', 'out', 'value', 'user', 'created'],
+        'sort': [{'key': 'public_id', 'operator': '+'}],
+        'range': [position, position+chunk]
+    })
+    if 'data' not in r:
+        print('failed', r)
+    for annotation in r['data']['items']:
+        if annotation['id'].startswith('BA/'):
+            continue
+        keyword = annotation['value']
+        if ': ' not in keyword:
+            keyword = 'other: ' + keyword
+        keywords[keyword] += 1
+    position += chunk

 with open('keywords.json', 'w') as fd:
    json.dump(keywords, fd, indent=4, ensure_ascii=False, sort_keys=True)