add import scripts

parent 925136e79b
commit c5cd5cb8cb

2 changed files with 189 additions and 0 deletions

download.sh (new executable file, 2 lines)
@@ -0,0 +1,2 @@
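# -r: recurse into linked pages, -c: resume partial downloads, -np: never
# ascend above the start directory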
wget -r -c -np https://www.indymedia.org.uk/media/
wget -r -c -np https://www.indymedia.org.uk/content/

video_import.py (new file, 187 lines)
@@ -0,0 +1,187 @@
#!/usr/bin/python
from datetime import datetime
import json
import os
import re
import sqlite3
import sys

import ox
import pandora_client
import lxml.html
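
# Import Indymedia UK articles that contain video into a pan.do/ra instance:
# walk a local wget mirror of www.indymedia.org.uk (see download.sh), parse
# each article page, then create items and upload the linked video files via
# pandora_client. Written for Python 2 (note the str/bytes handling).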
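
# Return sanitized inner HTML of the first element carrying the given class,
# with trailing <br> runs removed and site-relative links made absolute.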
def get_html(doc, classname):
    element = '*'
    d = doc.xpath("//%s[contains(@class, '%s')]" % (element, classname))
    if not len(d):
        return ''
    d = d[0]
    html = lxml.html.tostring(d, pretty_print=True)
    html = ox.sanitize_html(html).replace('<a></a>', '').strip()
    # drop trailing <br><br> runs; str.rstrip('<br><br>') strips characters,
    # not the substring, so cut the literal suffix instead
    while html.endswith('<br><br>'):
        html = html[:-len('<br><br>')].strip()
    html = '\n'.join(html.split('\n')[1:-1]).strip()
    html = re.sub('<img src="/img/.*?">', '', html)
    html = html.replace('href="/', 'href="https://www.indymedia.org.uk/')
    html = html.replace('src="/', 'src="https://www.indymedia.org.uk/')
    return html
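
# Parse one mirrored article page into a metadata dict: title, canonical URL,
# date, themes, location, linked media files, inline images and a summary.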
def parse_content(path):
    info = {}
    with open(path) as fd:
        data = fd.read().decode('utf-8')
    doc = lxml.html.fromstring(data)
    title = doc.xpath("//a[contains(@class, 'arttitle')]")[0].text_content().strip()
    url = 'https://' + path[path.index('www.indymedia.org.uk'):].replace('/content/', '/en/')
    info['links'] = [url]
    info['title'] = title
    p = doc.xpath("//p[contains(@class, 'date')]")[0]
    date = p.text_content().strip()
    date = re.compile(r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}').findall(date)
    if date:
        info['date'] = datetime.strptime(date[0], '%d.%m.%Y %H:%M').strftime('%Y-%m-%d %H:%M')
    info['themes'] = [a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/topics')]
    info['themes'] += [a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/actions')]
    l = ', '.join([a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/regions')])
    if l:
        info['location'] = l
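
    # collect linked /media/ files (relative and absolute URLs), keeping
    # document order and dropping duplicates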
    mids = set()
    media = []
    for id, title in re.compile('href="(/media/.*?)">(.*?)</a>', re.DOTALL).findall(data) \
            + re.compile('href="http://www.indymedia.org.uk(/media/.*?)">(.*?)</a>', re.DOTALL).findall(data):
        id = ox.decode_html(re.sub(r'\s+', ' ', id.strip()).replace('//', '/'))
        title = ox.decode_html(re.sub(r'\s+', ' ', title.strip()).replace('//', '/'))
        if id not in mids:
            mids.add(id)
            media.append([id, title])

    info['media'] = media
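
    # keep only the media entries whose file extension ox classifies as video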
    info['video'] = [m for m in media if m[0].split('.')[-1] in ox.file.EXTENSIONS['video']]
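
    # inline images and their alt-text captions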
    images = re.compile('src="(/images/.*?)".*?alt="(.*?)">', re.DOTALL).findall(data)
    images = [[ox.decode_html(re.sub(r'\s+', ' ', e.strip()).replace('//', '/')) for e in r] for r in images]
    info['images'] = images

    info['abstract'] = get_html(doc, 'intro')
    info['content'] = get_html(doc, 'articlecontent')
    try:
        d = doc.xpath("//p[contains(@class, 'creator')]")[0]
        info['creator'] = {
            'name': d.xpath('.//strong')[0].text_content()
        }
        info['depositor'] = info['creator']['name']
        info['creator']['url'] = d.xpath('.//a')[0].attrib['href']
    except (IndexError, KeyError):
        # page has no creator paragraph, or the creator has no profile link
        pass
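
    # assemble the summary: abstract, image figures, article body, creator credit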
    content = []
    content.append(info['abstract'])
    images = []
    for image in info['images']:
        img = u'<figure><img src="{url}"><figcaption>{info}</figcaption></figure>'.format(
            url='https://www.indymedia.org.uk' + image[0], info=image[1])
        images.append(img)
    content.append(u'\n'.join(images))
    content.append(info['content'])
    if 'creator' in info and 'url' in info['creator']:
        content.append(u'Creator: <a href="{url}">{name}</a>'.format(**info['creator']))
    info['summary'] = u'\n\n'.join(content).strip()
    # collapse blank-line runs; re.sub takes (pattern, replacement, string),
    # the original had the arguments reversed
    info['summary'] = re.sub('\n\n+', '\n\n', info['summary'])
    return info
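

# Sign in to the pan.do/ra API, walk the mirrored /content/ pages, create or
# look up one item per article, then attach and upload its video files.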
if __name__ == '__main__':
    api_url = 'https://urg.0x2620.org/api/'
    api = pandora_client.API(api_url)
    api.signin(username='import', password='indyport')

    failed = []
    missing = []
    invalid = []
    fileids = set()
    for root, folders, files in os.walk('www.indymedia.org.uk/content/', topdown=True):
        folders.sort()
        for f in sorted(files):
            if f.endswith('.html'):
                id = f.split('/')[-1].split('.')[0]
                url = os.path.join(root, f).replace('www.indymedia.org.uk/content/', '')
                path = os.path.join(root, f)
                url = 'https://www.indymedia.org.uk/en/' + url
                with open(path) as fd:
                    data = fd.read()
                if '<!-- content -->' in data:
                    try:
                        p = parse_content(path)
                    except:
                        # only record failures for pages that link media files
                        if '/media/' in data:
                            failed.append(url)
                        continue
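                    # only import articles that link at least one video file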
                    if p['video']:
                        print(url)
                        r = api.find({
                            'query': {'conditions': [{'key': 'links', 'value': url, 'operator': '=='}]},
                            'keys': ['id'],
                            'range': [0, 100]
                        })
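                        # copy the parsed metadata into the edit payload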
                        info = {}
                        for key in ('title', 'summary', 'links', 'themes', 'location', 'date', 'depositor'):
                            if key in p:
                                info[key] = p[key]
                        if r['data']['items']:
                            item_id = r['data']['items'][0]['id']
                            '''
                            info['id'] = item_id
                            r = api.edit(**info)
                            if r['status']['code'] != 200:
                                print(r)
                            assert(r['status']['code'] == 200)
                            # fixme, sync videos
                            '''
                        else:
                            item_id = api.add(title=p['title'])['data']['id']
                            info['id'] = item_id
                            r = api.edit(**info)
                            if r['status']['code'] != 200:
                                print(item_id)
                                print(info)
                                print(r)
                            assert(r['status']['code'] == 200)
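                        # register each video file with the item, then upload its data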
                        for id, title in p['video']:
                            # note: f and info are reused here, shadowing the
                            # outer filename and the edit payload
                            f = 'www.indymedia.org.uk' + id
                            f = os.path.abspath(f)
                            if not os.path.exists(f):
                                missing.append(f)
                                continue
                            info = ox.avinfo(f)
                            if 'error' in info or 'oshash' not in info:
                                invalid.append(f)
                                continue
                            oshash = info['oshash']
                            if oshash in fileids:
                                print('WTF', oshash, 'known')
                                continue
                            fileids.add(oshash)
                            if 'path' in info:
                                del info['path']
                            # print('adding', f, info)
                            r = api.addMedia({
                                'id': oshash,
                                'item': item_id,
                                'filename': id
                            })
                            print(r)
                            assert(r['status']['code'] == 200)
                            # upload media file
                            url = '%supload/direct/' % api_url
                            r = api.upload_chunks(url, f, {
                                'id': oshash
                            })
                            print('chunk upload', oshash, r)
                            assert(r)
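
    # record everything that could not be imported for later inspection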
    with open('import_failed.json', 'w') as fd:
        json.dump(failed, fd, indent=4)
    with open('import_missing.json', 'w') as fd:
        json.dump(missing, fd, indent=4)
    with open('import_invalid.json', 'w') as fd:
        json.dump(invalid, fd, indent=4)