#!/usr/bin/python3
"""Derive document dates from document titles and store them via the API.

The script signs in to https://archive.leftove.rs/api/, loads the list of
documents (``id``, ``title``, ``date``) -- from the local ``titles.json``
cache when it exists, otherwise from the API, writing the cache afterwards --
and tries to recognise a date inside every title.  A document is updated
when it has no date yet, or when the date found in the title is longer
(i.e. more precise) than the stored one.
"""
import os
import re
import json
import urllib.error
from datetime import datetime
import sys

import ox
import dateparser
import ox.api

api = ox.api.signin('https://archive.leftove.rs/api/')

if os.path.exists('titles.json'):
    # Reuse the cached listing; the context manager closes the file handle.
    with open('titles.json') as fd:
        titles = json.load(fd)
else:
    titles = api.findDocuments({
        'query': {},
        'keys': ['id', 'title', 'date'],
        'range': [0, 15000]
    })['data']['items']
    print('request titles', len(titles))
    with open('titles.json', 'w') as fd:
        json.dump(titles, fd)

# French month names mapped to the English names that strptime's %B parses.
fr_months = {
    'Janvier': 'January',
    'Février': 'February',
    'Mars': 'March',
    'Avril': 'April',
    'Mai': 'May',
    'Juin': 'June',
    'Juillet': 'July',
    'Août': 'August',
    'Septembre': 'September',
    'Octobre': 'October',
    'Novembre': 'November',
    'Décembre': 'December'
}

# (compiled pattern, strptime format of group 1, strftime output format).
# Order matters: the first pattern whose first match parses wins, so the
# more specific day-level formats come before month- and year-only ones.
# NOTE(review): patterns starting with [^\d] need one non-digit character
# in front of the date, so a date at the very start of a title is skipped
# by them -- confirm this is intended.
DATE_PATTERNS = tuple(
    (re.compile(regexp), fmt, outfmt)
    for regexp, fmt, outfmt in (
        (r'[^\d](\d\d?, [^,]+, \d{4})', '%d, %B, %Y', '%Y-%m-%d'),
        (r'[^\d](\d\d?, [^,]{3}, \d{4})', '%d, %b, %Y', '%Y-%m-%d'),
        (r'(\d{4}-\d{2}-\d{2})', '%Y-%m-%d', '%Y-%m-%d'),
        (r'[^\d]([^, (-]+? \d+, \d{4})', '%B %d, %Y', '%Y-%m-%d'),
        (r'[^\d]([^,]{3}, \d{4})', '%b, %Y', '%Y-%m'),
        (r'[^\d]([^,]+?, \d{4})', '%B, %Y', '%Y-%m'),
        (r'[^\d]([^ (]+?, \d{4})', '%B, %Y', '%Y-%m'),
        (r'[^\d]([^ (-]+? \d{4})', '%B %Y', '%Y-%m'),
        (r'[^\d]([^ (-]{3} \d{4})', '%b %Y', '%Y-%m'),
        (r'(\d{4})\)', '%Y', '%Y'),
    )
)


def en_title(title):
    """Return *title* with every French month name replaced by its English one."""
    for fr, en in fr_months.items():
        title = title.replace(fr, en)
    return title


def _find_date(title):
    """Return the first date recognised in *title*, or None.

    Each entry of DATE_PATTERNS is tried in order.  Only the first match of
    a pattern is considered; if it cannot be parsed with the associated
    strptime format the next pattern is tried.  The result is a string in
    the pattern's output format ('%Y-%m-%d', '%Y-%m' or '%Y').
    """
    for pattern, fmt, outfmt in DATE_PATTERNS:
        match = pattern.search(title)
        if match is None:
            continue
        try:
            return datetime.strptime(match.group(1), fmt).strftime(outfmt)
        except ValueError:
            # The text looked like a date but is not one in this format.
            continue
    return None


for doc in titles:
    date = _find_date(en_title(doc['title']))
    if not date:
        continue
    current = doc.get('date')
    update = False
    if current and len(current) < len(date):
        # A longer date string carries more precision (e.g. day vs. month).
        print('update', doc['id'], doc['title'], doc['date'], '=>', date)
        update = True
    elif not current:
        print(doc['id'], doc['title'], '==', date)
        update = True
    if update:
        api.editDocument({'id': doc['id'], 'date': date})