41 lines
1.1 KiB
Python
41 lines
1.1 KiB
Python
|
import csv
|
||
|
import re
|
||
|
import sys
|
||
|
import ox
|
||
|
|
||
|
from django.core.management.base import BaseCommand
|
||
|
from django.conf import settings
|
||
|
|
||
|
from ... import models
|
||
|
|
||
|
|
||
|
class Command(BaseCommand):
    """Print every external URL referenced in event bodies as CSV on stdout.

    The bodies of all ``models.Event`` rows are scanned for ``href="..."``
    and ``src="..."`` attributes.  Each distinct URL found is written as one
    CSV row with three columns:

    * ``url``     -- the URL after passing through ``ox.decode_html``.
    * ``pandora`` -- ``'y'`` when the URL contains ``youtube`` or ``vimeo``,
      otherwise empty.
    * ``archive`` -- the URL prefixed with
      ``https://web.archive.org/web/*/``.

    URLs that are empty or start with ``/`` or ``#`` are skipped.
    """

    help = 'extract urls'
    args = ''

    # Compiled once at class creation instead of once per event.  The
    # non-greedy group stops at the first closing double quote, so only
    # double-quoted attribute values are matched.
    _LINK_PATTERNS = (
        re.compile(r'href="(.*?)"'),
        re.compile(r'src="(.*?)"'),
    )

    def add_arguments(self, parser):
        """Register the ``--debug`` flag (not read by ``handle``)."""
        parser.add_argument('--debug', action='store_true', dest='debug',
                            default=False, help='debug something')

    def handle(self, **options):
        """Collect the URLs from all events and write the CSV to stdout."""
        urls = set()
        for event in models.Event.objects.all():
            # A missing body would make findall() raise TypeError; treat it
            # as containing no links.
            body = event.body or ''
            for pattern in self._LINK_PATTERNS:
                urls.update(pattern.findall(body))

        writer = csv.writer(sys.stdout)
        writer.writerow(['url', 'pandora', 'archive'])
        for url in sorted(urls):
            # NOTE(review): ox.decode_html presumably unescapes HTML entities
            # such as &amp; -- confirm against the ox library.
            url = ox.decode_html(url)
            # Empty attributes (href="") yield an empty string; indexing it
            # with url[0] raised IndexError, so check for emptiness first.
            if not url or url.startswith(('/', '#')):
                continue
            p = 'y' if 'youtube' in url or 'vimeo' in url else ''
            writer.writerow([url, p, 'https://web.archive.org/web/*/' + url])
|
||
|
|
||
|
|