107 lines
2.7 KiB
Python
107 lines
2.7 KiB
Python
#!/usr/bin/python3
|
|
import os
|
|
import re
|
|
import json
|
|
import urllib.error
|
|
from datetime import datetime
|
|
import sys
|
|
|
|
import ox
|
|
import dateparser
|
|
|
|
|
|
# Parse dates out of document titles and write them back to the archive as 'date' metadata
|
|
import ox.api
|
|
# API handle for the archive; used below for findDocuments and editDocument.
# NOTE(review): how ox.api.signin obtains credentials is not visible in this file.
api = ox.api.signin('https://archive.leftove.rs/api/')
|
|
|
|
# Build the document query: with no command-line argument every document is
# selected, otherwise only those whose 'collection' equals the first argument.
if len(sys.argv) <= 1:
    query = {}
else:
    collection = sys.argv[1]
    collection_condition = {
        'key': 'collection',
        'operator': '==',
        'value': collection,
    }
    query = {'conditions': [collection_condition]}
|
|
|
|
|
|
# Load the list of documents (id, title, date) to process.
# titles.json in the current directory acts as a cache: if it exists it is
# used as-is and no request is made; otherwise the list is fetched through
# the API and written to titles.json for subsequent runs.
# NOTE(review): the cache is reused regardless of which collection was given
# on the command line -- delete titles.json when switching collections.
if os.path.exists('titles.json'):
    # Use a context manager so the file handle is closed deterministically.
    with open('titles.json') as fd:
        titles = json.load(fd)
else:
    titles = api.findDocuments({
        'query': query,
        'keys': ['id', 'title', 'date'],
        # Only the first 15000 results are requested.
        'range': [0, 15000]
    })['data']['items']
    print('request titles', len(titles))
    with open('titles.json', 'w') as fd:
        json.dump(titles, fd)
|
|
|
|
# French month names (capitalised, exactly as they are matched) mapped to
# their English equivalents.
fr_months = {
    'Janvier': 'January',
    'Février': 'February',
    'Mars': 'March',
    'Avril': 'April',
    'Mai': 'May',
    'Juin': 'June',
    'Juillet': 'July',
    'Août': 'August',
    'Septembre': 'September',
    'Octobre': 'October',
    'Novembre': 'November',
    'Décembre': 'December'
}


def en_title(title):
    """Return *title* with French month names replaced by English ones.

    Every occurrence of a key of ``fr_months`` is substituted with its
    English value. The replacement is a plain, case-sensitive substring
    replacement, so lower-case month names are left untouched and a month
    name embedded in a longer word is replaced as well.

    title -- the document title as a string
    Returns the translated title; the input string is not modified.
    """
    for fr, en in fr_months.items():
        title = title.replace(fr, en)
    return title
|
|
|
|
# Date patterns tried in order against a title, most precise first.
# Each entry is (compiled regex with one capture group, strptime format for
# the captured text, strftime format of the date that gets stored).
# Raw strings are used so that \d and \) reach the regex engine unchanged
# instead of being treated as (invalid) string escape sequences.
# NOTE(review): most patterns start with [^\d], i.e. they need one non-digit
# character in front of the date, so a date at the very start of a title is
# not matched by them.
_DATE_PATTERNS = tuple(
    (re.compile(regexp), fmt, outfmt)
    for regexp, fmt, outfmt in (
        (r'[^\d](\d\d?, [^,]+, \d{4})', '%d, %B, %Y', '%Y-%m-%d'),
        (r'[^\d](\d\d?, [^,]{3}, \d{4})', '%d, %b, %Y', '%Y-%m-%d'),
        (r'(\d{4}-\d{2}-\d{2})', '%Y-%m-%d', '%Y-%m-%d'),

        (r'[^\d]([^, (-]+? \d+, \d{4})', '%B %d, %Y', '%Y-%m-%d'),

        (r'([^\d]{3} \d+ \d{4})', '%b %d %Y', '%Y-%m-%d'),

        (r'[^\d]([^,]{3}, \d{4})', '%b, %Y', '%Y-%m'),
        (r'[^\d]([^,]+?, \d{4})', '%B, %Y', '%Y-%m'),
        (r'[^\d]([^ (]+?, \d{4})', '%B, %Y', '%Y-%m'),
        (r'[^\d]([^ (-]+? \d{4})', '%B %Y', '%Y-%m'),
        (r'[^\d]([^ (-]{3} \d{4})', '%b %Y', '%Y-%m'),
        (r'(\d{4})\)', '%Y', '%Y'),
    )
)


def _extract_date(title):
    """Return the first date found in *title* as a string, or None.

    The patterns in ``_DATE_PATTERNS`` are tried in order. For each pattern
    only its first match in the title is considered; if that text cannot be
    parsed with the pattern's strptime format, the next pattern is tried.
    The result is formatted as '%Y-%m-%d', '%Y-%m' or '%Y' depending on the
    pattern that succeeded.
    """
    for pattern, fmt, outfmt in _DATE_PATTERNS:
        match = pattern.search(title)
        if match is None:
            continue
        try:
            return datetime.strptime(match.group(1), fmt).strftime(outfmt)
        except ValueError:
            # The regex matched something that is not a valid date in this
            # format; fall through to the next, less specific pattern.
            continue
    return None


# For every document, derive a date from its (month-translated) title and
# store it when the document has no date yet, or when the stored date string
# is shorter (i.e. less precise: '%Y' < '%Y-%m' < '%Y-%m-%d') than the new one.
for doc in titles:
    title = en_title(doc['title'])
    date = _extract_date(title)
    if not date:
        continue

    current = doc.get('date')
    update = False
    if current and len(current) < len(date):
        print('update', doc['id'], doc['title'], doc['date'], '=>', date)
        update = True
    elif not current:
        print(doc['id'], doc['title'], '==', date)
        update = True
    if update:
        api.editDocument({'id': doc['id'], 'date': date})
|
|
|