import csv
import re
import sys

import ox
from django.core.management.base import BaseCommand
from django.conf import settings

from ... import models

# Compiled once at import time instead of once per event. The two patterns
# are intentionally kept separate: each one scans the body independently, so
# a match of one attribute cannot swallow text the other would have matched.
_HREF_RE = re.compile('href="(.*?)"')
_SRC_RE = re.compile('src="(.*?)"')


class Command(BaseCommand):
    """Write a CSV of external URLs referenced in event bodies to stdout.

    Every ``href="..."`` and ``src="..."`` value found in ``Event.body`` is
    collected, de-duplicated, sorted, and emitted as one CSV row with the
    columns ``url``, ``pandora`` and ``archive``.
    """

    help = 'extract urls'
    args = ''

    def add_arguments(self, parser):
        """Register the command-line options of this command.

        NOTE: ``--debug`` is accepted but ``handle`` does not currently
        read it.
        """
        parser.add_argument(
            '--debug',
            action='store_true',
            dest='debug',
            default=False,
            help='debug something',
        )

    def handle(self, **options):
        """Collect URLs from all events and print them as CSV.

        Output columns:

        * ``url`` -- the attribute value after ``ox.decode_html``
          (presumably HTML-entity decoding -- confirm against python-ox).
        * ``pandora`` -- ``'y'`` when the URL contains ``youtube`` or
          ``vimeo``, otherwise empty.
        * ``archive`` -- ``https://web.archive.org/web/*/`` followed by
          the URL.

        URLs that are empty or start with ``/`` or ``#`` are skipped.
        """
        urls = set()
        for event in models.Event.objects.all():
            # Guard against a missing body; re.findall() raises TypeError on
            # None. (Whether Event.body is nullable is not visible here.)
            body = event.body or ''
            urls.update(_HREF_RE.findall(body))
            urls.update(_SRC_RE.findall(body))

        writer = csv.writer(sys.stdout)
        writer.writerow(['url', 'pandora', 'archive'])
        # Sorting is done on the raw (still-encoded) attribute values, as in
        # the original implementation, so row order is unchanged.
        for url in sorted(urls):
            url = ox.decode_html(url)
            # An empty attribute (href="" / src="") yields an empty string;
            # indexing url[0] on it used to raise IndexError and abort the
            # whole export. Site-relative paths and fragments are skipped.
            if not url or url.startswith(('/', '#')):
                continue
            p = 'y' if 'youtube' in url or 'vimeo' in url else ''
            writer.writerow([url, p, 'https://web.archive.org/web/*/' + url])