event-timeline/app/event/management/commands/extract_urls.py

41 lines
1.1 KiB
Python
Raw Normal View History

2022-04-22 17:20:42 +00:00
import csv
import re
import sys
import ox
from django.core.management.base import BaseCommand
from django.conf import settings
from ... import models
class Command(BaseCommand):
    """Dump every URL referenced from event bodies to stdout as CSV.

    Scans ``body`` of every ``models.Event`` for ``href="..."`` and
    ``src="..."`` attributes, de-duplicates the captured values and writes
    one CSV row per URL with the columns ``url``, ``pandora`` and
    ``archive``:

    * ``pandora`` is ``'y'`` when the URL contains ``youtube`` or ``vimeo``,
      otherwise empty.
    * ``archive`` is the URL prefixed with the web.archive.org wildcard
      lookup address.

    URLs that start with ``/`` or ``#`` and empty attribute values are
    skipped.
    """

    help = 'extract urls'
    args = ''

    # Compiled once at class creation instead of once per event.
    # Only double-quoted attribute values are matched.
    _ATTRIBUTE_PATTERNS = (
        re.compile(r'href="(.*?)"'),
        re.compile(r'src="(.*?)"'),
    )

    def add_arguments(self, parser):
        """Register the ``--debug`` flag (accepted, but not read by ``handle``)."""
        parser.add_argument('--debug', action='store_true', dest='debug',
                            default=False, help='debug something')

    def handle(self, **options):
        """Collect the URLs from all events and write the CSV report to stdout."""
        urls = set()
        for event in models.Event.objects.all():
            # NOTE(review): guards against a NULL body; confirm whether the
            # field is nullable in the Event model.
            body = event.body or ''
            for pattern in self._ATTRIBUTE_PATTERNS:
                urls.update(pattern.findall(body))

        writer = csv.writer(sys.stdout)
        writer.writerow(['url', 'pandora', 'archive'])
        for url in sorted(urls):
            # presumably decodes HTML entities such as &amp; -- verify against ox
            url = ox.decode_html(url)
            # href="" / src="" capture an empty string; indexing url[0] on it
            # would raise IndexError, so test emptiness and use startswith().
            if not url or url.startswith(('/', '#')):
                continue
            p = 'y' if 'youtube' in url or 'vimeo' in url else ''
            writer.writerow([url, p, 'https://web.archive.org/web/*/' + url])