#!/usr/bin/python3
"""Derive document dates from their titles and store them via the API.

Recovered from the patch "cleanup dates" (commit a4f4af02, 2020-05-09),
which added this script as cleanup/add_date.py.

For every document the script looks for a date inside the title (French
month names are translated to English first), normalises it to ``YYYY``,
``YYYY-MM`` or ``YYYY-MM-DD`` and writes it back with ``editDocument`` when
the document has no date yet or only a shorter one.

The document list is cached in ``titles.json`` in the working directory;
delete that file to force a fresh request.
"""
import os
import re
import json
import urllib.error
from datetime import datetime
import sys
from typing import Optional

API_URL = 'https://leftove.rs/api/'
TITLES_CACHE = 'titles.json'

# French month names as they appear in titles -> English names that
# ``datetime.strptime`` understands with %B.
fr_months = {
    'Janvier': 'January',
    'Février': 'February',
    'Mars': 'March',
    'Avril': 'April',
    'Mai': 'May',
    'Juin': 'June',
    'Juillet': 'July',
    'Août': 'August',
    'Septembre': 'September',
    'Octobre': 'October',
    'Novembre': 'November',
    'Décembre': 'December',
}

# (pattern, strptime format, output format), tried in this order: most
# specific (full dates) first, bare year last. Compiled once instead of
# once per document.
DATE_PATTERNS = tuple(
    (re.compile(pattern), fmt, outfmt)
    for pattern, fmt, outfmt in (
        (r'[^\d](\d\d?, [^,]+, \d{4})', '%d, %B, %Y', '%Y-%m-%d'),
        (r'[^\d](\d\d?, [^,]{3}, \d{4})', '%d, %b, %Y', '%Y-%m-%d'),
        (r'(\d{4}-\d{2}-\d{2})', '%Y-%m-%d', '%Y-%m-%d'),

        # NOTE(review): the separator between month and day could not be
        # recovered exactly from the patch; \s+ accepts a plain space as
        # well as any other whitespace, which strptime tolerates too.
        (r'[^\d]([^, (-]+?\s+\d+, \d{4})', '%B %d, %Y', '%Y-%m-%d'),

        (r'[^\d]([^,]{3}, \d{4})', '%b, %Y', '%Y-%m'),
        (r'[^\d]([^,]+?, \d{4})', '%B, %Y', '%Y-%m'),
        (r'[^\d]([^ (]+?, \d{4})', '%B, %Y', '%Y-%m'),
        (r'[^\d]([^ (-]+? \d{4})', '%B %Y', '%Y-%m'),
        (r'[^\d]([^ (-]{3} \d{4})', '%b %Y', '%Y-%m'),
        (r'(\d{4})\)', '%Y', '%Y'),
    )
)


def en_title(title: str) -> str:
    """Return *title* with French month names replaced by English ones.

    The replacement is a plain, case-sensitive substring substitution of
    every key in ``fr_months``.
    """
    for fr, en in fr_months.items():
        title = title.replace(fr, en)
    return title


def extract_date(title: str) -> Optional[str]:
    """Return the first date found in *title*, or None.

    The title is passed through ``en_title`` and then matched against
    ``DATE_PATTERNS`` in order. For each pattern only its first match is
    considered; if that text does not parse with the pattern's strptime
    format, the next pattern is tried. The result is formatted as
    ``YYYY-MM-DD``, ``YYYY-MM`` or ``YYYY`` depending on the pattern.
    """
    title = en_title(title)
    for pattern, fmt, outfmt in DATE_PATTERNS:
        matches = pattern.findall(title)
        if not matches:
            continue
        try:
            return datetime.strptime(matches[0], fmt).strftime(outfmt)
        except ValueError:
            # The regex matched something that is not a real date (e.g. a
            # word followed by a year); fall through to the next pattern.
            continue
    return None


def load_titles(api):
    """Return the list of documents (id, title, date), using the cache.

    Reads ``titles.json`` when it exists; otherwise requests up to 15000
    documents through *api* and writes the result to ``titles.json``.
    """
    if os.path.exists(TITLES_CACHE):
        with open(TITLES_CACHE) as fd:
            return json.load(fd)
    titles = api.findDocuments({
        'query': {},
        'keys': ['id', 'title', 'date'],
        'range': [0, 15000]
    })['data']['items']
    print('request titles', len(titles))
    with open(TITLES_CACHE, 'w') as fd:
        json.dump(titles, fd)
    return titles


def main():
    """Sign in, then set a date on every document whose title yields one."""
    # Third-party imports are deferred so the pure helpers above can be
    # imported without these packages installed.
    import ox.api
    import dateparser  # noqa: F401 -- currently unused; kept from the original script

    api = ox.api.signin(API_URL)
    for doc in load_titles(api):
        date = extract_date(doc['title'])
        if not date:
            continue
        current = doc.get('date')
        if current and len(current) < len(date):
            # A longer string is presumably a more precise date
            # (YYYY < YYYY-MM < YYYY-MM-DD); verify against stored data.
            print('update', doc['id'], doc['title'], doc['date'], '=>', date)
        elif not current:
            print(doc['id'], doc['title'], '==', date)
        else:
            # Existing date is at least as long as the parsed one; keep it.
            continue
        api.editDocument({'id': doc['id'], 'date': date})


if __name__ == '__main__':
    main()