cleanup dates

This commit is contained in:
j 2020-05-09 17:14:58 +02:00
parent 116f4ef951
commit a4f4af02e7

94
cleanup/add_date.py Normal file
View file

@ -0,0 +1,94 @@
#!/usr/bin/python3
import os
import re
import json
import urllib.error
from datetime import datetime
import sys
import ox
import dateparser
# derive dates from document titles and store them as document metadata
import ox.api
# API handle for the site; used to list documents here and to edit them later
# in this script.
api = ox.api.signin('https://leftove.rs/api/')

# The document list is cached in titles.json so repeated runs do not have to
# query the server again. The cache is never refreshed automatically: delete
# titles.json to force a new request.
if os.path.exists('titles.json'):
    # Read through a context manager so the file handle is closed promptly.
    with open('titles.json') as fd:
        titles = json.load(fd)
else:
    # Only the first 15000 documents are requested; anything beyond that
    # range is not seen by this script.
    titles = api.findDocuments({
        'query': {},
        'keys': ['id', 'title', 'date'],
        'range': [0, 15000]
    })['data']['items']
    print('request titles', len(titles))
    with open('titles.json', 'w') as fd:
        json.dump(titles, fd)
# French month names (capitalised, as matched in titles) mapped to the
# English names used when the dates are parsed with datetime.strptime.
fr_months = {
    'Janvier': 'January',
    'Février': 'February',
    'Mars': 'March',
    'Avril': 'April',
    'Mai': 'May',
    'Juin': 'June',
    'Juillet': 'July',
    'Août': 'August',
    'Septembre': 'September',
    'Octobre': 'October',
    'Novembre': 'November',
    'Décembre': 'December'
}

# Matches any French month name, but only as a whole word, so that words
# which merely contain one ("Marseille", "Maison") are left untouched.
_fr_month_re = re.compile(r'\b(?:%s)\b' % '|'.join(map(re.escape, fr_months)))


def en_title(title):
    """Return ``title`` with French month names replaced by English ones.

    Only whole-word, case-sensitive occurrences of the keys of ``fr_months``
    are replaced; every other part of the title is returned unchanged.

    :param title: document title as a string
    :returns: the title with month names translated to English
    """
    return _fr_month_re.sub(lambda match: fr_months[match.group(0)], title)
# (compiled pattern, strptime format, output format), tried in this order:
# day-precision dates first, then month-precision, then a bare year followed
# by a closing parenthesis. Most patterns start with a non-digit character,
# so a date at the very beginning of a title is not matched by them.
# Compiled once here instead of once per document and pattern.
_DATE_PATTERNS = tuple(
    (re.compile(regexp), fmt, outfmt)
    for regexp, fmt, outfmt in (
        (r'[^\d](\d\d?, [^,]+, \d{4})', '%d, %B, %Y', '%Y-%m-%d'),
        (r'[^\d](\d\d?, [^,]{3}, \d{4})', '%d, %b, %Y', '%Y-%m-%d'),
        (r'(\d{4}-\d{2}-\d{2})', '%Y-%m-%d', '%Y-%m-%d'),
        (r'[^\d]([^, (-]+? \d+, \d{4})', '%B %d, %Y', '%Y-%m-%d'),
        (r'[^\d]([^,]{3}, \d{4})', '%b, %Y', '%Y-%m'),
        (r'[^\d]([^,]+?, \d{4})', '%B, %Y', '%Y-%m'),
        (r'[^\d]([^ (]+?, \d{4})', '%B, %Y', '%Y-%m'),
        (r'[^\d]([^ (-]+? \d{4})', '%B %Y', '%Y-%m'),
        (r'[^\d]([^ (-]{3} \d{4})', '%b %Y', '%Y-%m'),
        (r'(\d{4})\)', '%Y', '%Y'),
    )
)


def _extract_date(title):
    """Return the first date found in ``title``, or None.

    Each pattern is tried in order. Only the first match of a pattern is
    parsed; if that text is not a valid date for the pattern's format, the
    next pattern is tried.

    :param title: title with month names already in English
    :returns: a string formatted as YYYY, YYYY-MM or YYYY-MM-DD, or None
        when no pattern yields a parseable date
    """
    for pattern, fmt, outfmt in _DATE_PATTERNS:
        found = pattern.findall(title)
        if not found:
            continue
        try:
            return datetime.strptime(found[0], fmt).strftime(outfmt)
        except ValueError:
            # The text matched the pattern's shape but is not a real date
            # for this format (e.g. an unknown month name).
            continue
    return None


for doc in titles:
    date = _extract_date(en_title(doc['title']))
    if not date:
        continue
    current = doc.get('date')
    if not current:
        print(doc['id'], doc['title'], '==', date)
    elif len(current) < len(date):
        # A longer string means a more precise date (YYYY < YYYY-MM <
        # YYYY-MM-DD); assumes the stored date uses the same layout.
        print('update', doc['id'], doc['title'], doc['date'], '=>', date)
    else:
        # The stored date is already at least as precise; leave it alone.
        continue
    api.editDocument({'id': doc['id'], 'date': date})