diff --git a/pandora_client/__init__.py b/pandora_client/__init__.py index 8e00e85..1ded002 100755 --- a/pandora_client/__init__.py +++ b/pandora_client/__init__.py @@ -784,28 +784,48 @@ class Client(object): print(r) def _get_documents(self): - files = self.api.findMedia({ - 'query': { - 'conditions': [ - {'key': 'filename', 'operator': '', 'value': value} - for value in DOCUMENT_FORMATS - ], - 'operator': '|' - }, - 'keys': ['item', 'id', 'extension'], - 'range': [0, 5000] - })['data']['items'] - d = self.api.findDocuments({ - 'query': { - 'conditions': [ - {'key': 'oshash', 'operator': '==', 'value': f['id']} - for f in files - ], - 'operator': '|' - }, - 'keys': ['id', 'oshash', 'extension'], - 'range': [0, len(files)] - })['data']['items'] + query = { + 'conditions': [ + {'key': 'filename', 'operator': '', 'value': value} + for value in DOCUMENT_FORMATS + ], + 'operator': '|' + } + n = self.api.findMedia({'query': query})['data']['items'] + if n: + o = 0 + chunk = 5000 + files = [] + while o < n: + files += self.api.findMedia({ + 'query': { + 'conditions': [ + {'key': 'filename', 'operator': '', 'value': value} + for value in DOCUMENT_FORMATS + ], + 'operator': '|' + }, + 'keys': ['item', 'id', 'extension'], + 'range': [o, o+chunk] + })['data']['items'] + o += chunk + d = [] + o = 0 + while o < len(files): + d += self.api.findDocuments({ + 'query': { + 'conditions': [ + {'key': 'oshash', 'operator': '==', 'value': f['id']} + for f in files[o:o+chunk] + ], + 'operator': '|' + }, + 'keys': ['id', 'oshash', 'extension'], + 'range': [0, chunk] + })['data']['items'] + o += chunk + else: + d = [] available = set(f['oshash'] for f in d if f['extension'] in DOCUMENT_FORMATS) missing = [(f['id'], f['item']) for f in files