cleanup dates
This commit is contained in:
parent
116f4ef951
commit
a4f4af02e7
1 changed files with 94 additions and 0 deletions
94
cleanup/add_date.py
Normal file
94
cleanup/add_date.py
Normal file
|
@ -0,0 +1,94 @@
|
|||
#!/usr/bin/python3
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import urllib.error
|
||||
from datetime import datetime
|
||||
import sys
|
||||
|
||||
import ox
|
||||
import dateparser
|
||||
|
||||
|
||||
# derive document dates from their titles and store them via the API
|
||||
import ox.api
|
||||
api = ox.api.signin('https://leftove.rs/api/')

# Reuse the cached document list in titles.json when it exists, so repeated
# runs do not query the API again; otherwise fetch id/title/date for up to
# 15000 documents and write them to the cache file.
if os.path.exists('titles.json'):
    # Context manager so the cache file handle is closed deterministically.
    with open('titles.json') as fd:
        titles = json.load(fd)
else:
    titles = api.findDocuments({
        'query': {},
        'keys': ['id', 'title', 'date'],
        'range': [0, 15000]
    })['data']['items']
    print('request titles', len(titles))
    with open('titles.json', 'w') as fd:
        json.dump(titles, fd)
|
||||
|
||||
# French month names (capitalized) mapped to their English equivalents, so
# that titles can be parsed with English strptime month directives.
fr_months = {
    'Janvier': 'January',
    'Février': 'February',
    'Mars': 'March',
    'Avril': 'April',
    'Mai': 'May',
    'Juin': 'June',
    'Juillet': 'July',
    'Août': 'August',
    'Septembre': 'September',
    'Octobre': 'October',
    'Novembre': 'November',
    'Décembre': 'December'
}


def en_title(title):
    """Return *title* with French month names replaced by English ones.

    Replacement is a plain, case-sensitive substring substitution driven by
    ``fr_months``: only the capitalized spellings listed there are replaced,
    and a month name embedded in a longer word is replaced as well.
    """
    for fr, en in fr_months.items():
        title = title.replace(fr, en)
    return title
|
||||
|
||||
# Date patterns searched for in a title, tried in order. Each entry is
# (compiled regex, strptime format of the captured text, output format).
# Compiled once here instead of once per document and pattern.
_DATE_PATTERNS = tuple(
    (re.compile(regexp), fmt, outfmt)
    for regexp, fmt, outfmt in (
        (r'[^\d](\d\d?, [^,]+, \d{4})', '%d, %B, %Y', '%Y-%m-%d'),
        (r'[^\d](\d\d?, [^,]{3}, \d{4})', '%d, %b, %Y', '%Y-%m-%d'),
        (r'(\d{4}-\d{2}-\d{2})', '%Y-%m-%d', '%Y-%m-%d'),

        (r'[^\d]([^, (-]+? \d+, \d{4})', '%B %d, %Y', '%Y-%m-%d'),

        (r'[^\d]([^,]{3}, \d{4})', '%b, %Y', '%Y-%m'),
        (r'[^\d]([^,]+?, \d{4})', '%B, %Y', '%Y-%m'),
        (r'[^\d]([^ (]+?, \d{4})', '%B, %Y', '%Y-%m'),
        (r'[^\d]([^ (-]+? \d{4})', '%B %Y', '%Y-%m'),
        (r'[^\d]([^ (-]{3} \d{4})', '%b %Y', '%Y-%m'),
        (r'(\d{4})\)', '%Y', '%Y'),
    )
)


def _extract_date(title):
    """Return the first date found in *title* as a formatted string, or None.

    Patterns are tried in order. For each pattern only its first match in the
    title is considered; if that text cannot be parsed with the pattern's
    strptime format, the next pattern is tried.
    """
    for pattern, fmt, outfmt in _DATE_PATTERNS:
        found = pattern.findall(title)
        if not found:
            continue
        try:
            return datetime.strptime(found[0], fmt).strftime(outfmt)
        except ValueError:
            # The regex matched text that is not a valid date in this format
            # (e.g. an unknown month name); fall through to the next pattern.
            continue
    return None


for doc in titles:
    # Month names are translated first so the English strptime directives
    # (%B / %b) can parse French titles.
    date = _extract_date(en_title(doc['title']))
    if not date:
        continue

    current = doc.get('date')
    if current and len(current) < len(date):
        # The stored date string is shorter than the parsed one -- presumably
        # less precise (e.g. year only vs. year-month-day); replace it.
        print('update', doc['id'], doc['title'], doc['date'], '=>', date)
    elif not current:
        # No date stored yet: set it.
        print(doc['id'], doc['title'], '==', date)
    else:
        # Stored date is at least as long as the parsed one: keep it.
        continue
    api.editDocument({'id': doc['id'], 'date': date})
|
||||
|
Loading…
Reference in a new issue