#!/usr/bin/python
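"""Import video articles from a local www.indymedia.org.uk mirror into a
pan.do/ra instance via pandora_client.

For each saved article page the script extracts metadata, creates the
matching item through the API and uploads the referenced video files.
Pages that fail to parse and media files that are missing or unreadable
are written to import_*.json report files.
"""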
# print() is called with multiple arguments below; keep it consistent on Python 2.
from __future__ import print_function

from datetime import datetime
import json
import os
import re
import sqlite3
import sys

import ox
import pandora_client
import lxml.html


def get_html(doc, classname):
    # Return the sanitized inner HTML of the first element with the given class.
    element = '*'
    d = doc.xpath("//%s[contains(@class, '%s')]" % (element, classname))
    if not len(d):
        return ''
    d = d[0]
    html = lxml.html.tostring(d, pretty_print=True)
    html = ox.sanitize_html(html).replace('<a></a>', '').strip()
    # Drop trailing <br><br> pairs.
    while html.endswith('<br><br>'):
        html = html[:-len('<br><br>')].strip()
    # Drop the first and last line, i.e. the element's own opening/closing tag.
    html = '\n'.join(html.split('\n')[1:-1]).strip()
    # Remove site chrome images and make relative links/images absolute.
    html = re.sub('<img src="/img/.*?">', '', html)
    html = html.replace('href="/', 'href="https://www.indymedia.org.uk/')
    html = html.replace('src="/', 'src="https://www.indymedia.org.uk/')
    return html

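
# parse_content() returns a dict with title, links, date, themes, location,
# media, video, images, abstract, content, creator/depositor and an
# assembled HTML 'summary' for a single saved article page.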
def parse_content(path):
    info = {}
    with open(path) as fd:
        data = fd.read().decode('utf-8')
    doc = lxml.html.fromstring(data)
    title = doc.xpath("//a[contains(@class, 'arttitle')]")[0].text_content().strip()
    # Public URL of the article, derived from the mirror path.
    url = 'https://' + path[path.index('www.indymedia.org.uk'):].replace('/content/', '/en/')
    info['links'] = [url]
    info['title'] = title
    # Publication date from the 'date' paragraph; themes and location from
    # topic, action and region links.
    p = doc.xpath("//p[contains(@class, 'date')]")[0]
    date = p.text_content().strip()
    date = re.compile(r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}').findall(date)
    if date:
        info['date'] = datetime.strptime(date[0], '%d.%m.%Y %H:%M').strftime('%Y-%m-%d %H:%M')
    info['themes'] = [a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/topics')]
    info['themes'] += [a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/actions')]
    l = ', '.join([a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/regions')])
    if l:
        info['location'] = l

    mids = set()
    media = []
    for id, title in re.compile('href="(/media/.*?)">(.*?)</a>', re.DOTALL).findall(data) \
            + re.compile('href="http://www.indymedia.org.uk(/media/.*?)">(.*?)</a>', re.DOTALL).findall(data):
        id = ox.decode_html(re.sub(r'\s+', ' ', id.strip()).replace('//', '/'))
        title = ox.decode_html(re.sub(r'\s+', ' ', title.strip()).replace('//', '/'))
        if id not in mids:
            mids.add(id)
            media.append([id, title])

    info['media'] = media

    info['video'] = [m for m in media if m[0].split('.')[-1] in ox.file.EXTENSIONS['video']]

    images = re.compile('src="(/images/.*?)".*?alt="(.*?)">', re.DOTALL).findall(data)
    images = [[ox.decode_html(re.sub(r'\s+', ' ', e.strip()).replace('//', '/')) for e in r] for r in images]
    info['images'] = images

    info['abstract'] = get_html(doc, 'intro')
    info['content'] = get_html(doc, 'articlecontent')
    # Creator paragraph is optional; ignore pages without one.
    try:
        d = doc.xpath("//p[contains(@class, 'creator')]")[0]
        info['creator'] = {
            'name': d.xpath('.//strong')[0].text_content()
        }
        info['depositor'] = info['creator']['name']
        info['creator']['url'] = d.xpath('.//a')[0].attrib['href']
    except:
        pass
    # Assemble the summary: abstract, inline images, article body and creator credit.
    content = []
    content.append(info['abstract'])
    images = []
    for image in info['images']:
        img = u'<figure><img src="{url}"><figcaption>{info}</figcaption></figure>'.format(
            url='https://www.indymedia.org.uk' + image[0], info=image[1])
        images.append(img)
    content.append(u'\n'.join(images))
    content.append(info['content'])
    if 'creator' in info and 'url' in info['creator']:
        content.append(u'Creator: <a href="{url}">{name}</a>'.format(**info['creator']))
    info['summary'] = u'\n\n'.join(content).strip()
    # Collapse runs of blank lines.
    info['summary'] = re.sub(r'\n\n+', '\n\n', info['summary'])
    return info

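
# The import itself runs below: sign in to the API, walk the mirror,
# create items for video articles and upload their files, then write
# import_failed/missing/invalid JSON reports.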
if __name__ == '__main__':
    api_url = 'https://urg.0x2620.org/api/'
    api = pandora_client.API(api_url)
    api.signin(username='import', password='indyport')

    failed = []
    missing = []
    invalid = []
    fileids = set()
    for root, folders, files in os.walk('www.indymedia.org.uk/content/', topdown=True):
        folders.sort()
        for f in sorted(files):
            if f.endswith('.html'):
                id = f.split('/')[-1].split('.')[0]
                url = os.path.join(root, f).replace('www.indymedia.org.uk/content/', '')
                path = os.path.join(root, f)
                url = 'https://www.indymedia.org.uk/en/' + url
                with open(path) as fd:
                    data = fd.read()
                if '<!-- content -->' in data:
                    try:
                        p = parse_content(path)
                    except:
                        # Only record parse failures for pages that reference media; skip either way.
                        if '/media/' in data:
                            failed.append(url)
                        continue
                    # Only handle articles that link video files.
                    if p['video']:
                        print(url)
                        # Check whether the article is already in the database, matched by source link.
                        r = api.find({
                            'query': {'conditions': [{'key': 'links', 'value': url, 'operator': '=='}]},
                            'keys': ['id'],
                            'range': [0, 100]
                        })
                        info = {}
                        for key in ('title', 'summary', 'links', 'themes', 'location', 'date', 'depositor'):
                            if key in p:
                                info[key] = p[key]
                        if r['data']['items']:
                            item_id = r['data']['items'][0]['id']
                            '''
                            info['id'] = item_id
                            r = api.edit(**info)
                            if r['status']['code'] != 200:
                                print(r)
                            assert(r['status']['code'] == 200)
                            # fixme, sync videos
                            '''
                        else:
                            # New article: create the item and set its metadata.
                            item_id = api.add(title=p['title'])['data']['id']
                            info['id'] = item_id
                            r = api.edit(**info)
                            if r['status']['code'] != 200:
                                print(item_id)
                                print(info)
                                print(r)
                            assert(r['status']['code'] == 200)
                            for id, title in p['video']:
                                # f, url and info are reused for the media file here; they are
                                # recomputed for the next page.
                                f = 'www.indymedia.org.uk' + id
                                f = os.path.abspath(f)
                                if not os.path.exists(f):
                                    missing.append(f)
                                    continue
                                info = ox.avinfo(f)
                                if 'error' in info or 'oshash' not in info:
                                    invalid.append(f)
                                    continue
                                oshash = info['oshash']
                                if oshash in fileids:
                                    print('WTF', oshash, 'known')
                                    continue
                                fileids.add(oshash)
                                if 'path' in info:
                                    del info['path']
                                # print('adding', f, info)
                                # Register the file with the item, then upload its contents in chunks.
                                r = api.addMedia({
                                    'id': oshash,
                                    'item': item_id,
                                    'filename': id
                                })
                                print(r)
                                assert(r['status']['code'] == 200)
                                # upload media file
                                url = '%supload/direct/' % api_url
                                r = api.upload_chunks(url, f, {
                                    'id': oshash
                                })
                                print('chunk upload', oshash, r)
                                assert(r)

    with open('import_failed.json', 'w') as fd:
        json.dump(failed, fd, indent=4)
    with open('import_missing.json', 'w') as fd:
        json.dump(missing, fd, indent=4)
    with open('import_invalid.json', 'w') as fd:
        json.dump(invalid, fd, indent=4)