pandora_amp/video_import.py
#!/usr/bin/python
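"""Import articles with video attachments from a local mirror of
www.indymedia.org.uk into a pan.do/ra instance via pandora_client."""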
from datetime import datetime
import json
import os
import re
import sqlite3
import sys

import ox
import pandora_client
import lxml.html


def get_html(doc, classname):
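    """Return sanitized HTML for the first element with the given class,
    with site-relative links and image sources made absolute."""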
    element = '*'
    d = doc.xpath("//%s[contains(@class, '%s')]" % (element, classname))
    if not len(d):
        return ''
    d = d[0]
    html = lxml.html.tostring(d, pretty_print=True)
    html = ox.sanitize_html(html).replace('<a></a>', '').strip()
    while html.endswith('<br><br>'):
        html = html[:-len('<br><br>')].strip()
    html = '\n'.join(html.split('\n')[1:-1]).strip()
    html = re.sub('<img src="/img/.*?">', '', html)
    html = html.replace('href="/', 'href="https://www.indymedia.org.uk/')
    html = html.replace('src="/', 'src="https://www.indymedia.org.uk/')
    return html


def parse_content(path):
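    """Scrape one mirrored article page and return its metadata:
    title, link, date, themes, location, media/video/image lists and an
    HTML summary built from abstract, images, body and creator credit."""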
    info = {}
    with open(path) as fd:
        data = fd.read().decode('utf-8')
    doc = lxml.html.fromstring(data)
    title = doc.xpath("//a[contains(@class, 'arttitle')]")[0].text_content().strip()
    url = 'https://' + path[path.index('www.indymedia.org.uk'):].replace('/content/', '/en/')
    info['links'] = [url]
    info['title'] = title
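    # the 'date' paragraph carries the publication date; note that '//a' is an
    # absolute XPath, so theme/action/region links are collected from the whole page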
    p = doc.xpath("//p[contains(@class, 'date')]")[0]
    date = p.text_content().strip()
    date = re.compile(r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}').findall(date)
    if date:
        info['date'] = datetime.strptime(date[0], '%d.%m.%Y %H:%M').strftime('%Y-%m-%d %H:%M')
    info['themes'] = [a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/topics')]
    info['themes'] += [a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/actions')]
    l = ', '.join([a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/regions')])
    if l:
        info['location'] = l
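    # collect all linked /media/ files (relative and absolute links), deduplicated by path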
    mids = set()
    media = []
    for id, title in re.compile('href="(/media/.*?)">(.*?)</a>', re.DOTALL).findall(data) \
            + re.compile('href="http://www.indymedia.org.uk(/media/.*?)">(.*?)</a>', re.DOTALL).findall(data):
        id = ox.decode_html(re.sub(r'\s+', ' ', id.strip()).replace('//', '/'))
        title = ox.decode_html(re.sub(r'\s+', ' ', title.strip()).replace('//', '/'))
        if id not in mids:
            mids.add(id)
            media.append([id, title])
    info['media'] = media
    info['video'] = [m for m in media if m[0].split('.')[-1] in ox.file.EXTENSIONS['video']]
    images = re.compile('src="(/images/.*?)".*?alt="(.*?)">', re.DOTALL).findall(data)
    images = [[ox.decode_html(re.sub(r'\s+', ' ', e.strip()).replace('//', '/')) for e in r] for r in images]
    info['images'] = images
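    # intro and article body as sanitized HTML, plus the creator credit when present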
    info['abstract'] = get_html(doc, 'intro')
    info['content'] = get_html(doc, 'articlecontent')
    try:
        d = doc.xpath("//p[contains(@class, 'creator')]")[0]
        info['creator'] = {
            'name': d.xpath('.//strong')[0].text_content()
        }
        info['depositor'] = info['creator']['name']
        info['creator']['url'] = d.xpath('.//a')[0].attrib['href']
    except:
        pass
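    # assemble a single HTML summary: abstract, figures, article body, creator link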
    content = []
    content.append(info['abstract'])
    images = []
    for image in info['images']:
        img = u'<figure><img src="{url}"><figcaption>{info}</figcaption></figure>'.format(
            url='https://www.indymedia.org.uk' + image[0], info=image[1])
        images.append(img)
    content.append(u'\n'.join(images))
    content.append(info['content'])
    if 'creator' in info and 'url' in info['creator']:
        content.append(u'Creator: <a href="{url}">{name}</a>'.format(**info['creator']))
    info['summary'] = u'\n\n'.join(content).strip()
    info['summary'] = re.sub(r'\n\n+', '\n\n', info['summary'])
    return info


if __name__ == '__main__':
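    # target pan.do/ra instance; metadata is set via the API and files are uploaded in chunks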
    api_url = 'https://urg.0x2620.org/api/'
    api = pandora_client.API(api_url)
    api.signin(username='import', password='indyport')
    failed = []
    missing = []
    invalid = []
    fileids = set()
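    # walk the mirrored /content/ tree and inspect every article page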
    for root, folders, files in os.walk('www.indymedia.org.uk/content/', topdown=True):
        folders.sort()
        for f in sorted(files):
            if f.endswith('.html'):
                id = f.split('/')[-1].split('.')[0]
                url = os.path.join(root, f).replace('www.indymedia.org.uk/content/', '')
                path = os.path.join(root, f)
                url = 'https://www.indymedia.org.uk/en/' + url
                with open(path) as fd:
                    data = fd.read()
                if '<!-- content -->' in data:
                    try:
                        p = parse_content(path)
                    except:
                        if '/media/' in data:
                            failed.append(url)
                        continue
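                    # only articles that reference at least one video file are imported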
                    if p['video']:
                        print(url)
                        r = api.find({
                            'query': {'conditions': [{'key': 'links', 'value': url, 'operator': '=='}]},
                            'keys': ['id'],
                            'range': [0, 100]
                        })
                        info = {}
                        for key in ('title', 'summary', 'links', 'themes', 'location', 'date', 'depositor'):
                            if key in p:
                                info[key] = p[key]
                        if r['data']['items']:
                            item_id = r['data']['items'][0]['id']
                            '''
                            info['id'] = item_id
                            r = api.edit(**info)
                            if r['status']['code'] != 200:
                                print(r)
                            assert(r['status']['code'] == 200)
                            # fixme, sync videos
                            '''
                        else:
                            item_id = api.add(title=p['title'])['data']['id']
                            info['id'] = item_id
                            r = api.edit(**info)
                            if r['status']['code'] != 200:
                                print(item_id)
                                print(info)
                                print(r)
                            assert(r['status']['code'] == 200)
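                        # register each referenced video file with the item and upload it in chunks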
                        for id, title in p['video']:
                            f = 'www.indymedia.org.uk' + id
                            f = os.path.abspath(f)
                            if not os.path.exists(f):
                                missing.append(f)
                                continue
                            info = ox.avinfo(f)
                            if 'error' in info or 'oshash' not in info:
                                invalid.append(f)
                                continue
                            oshash = info['oshash']
                            if oshash in fileids:
                                print('WTF', oshash, 'known')
                                continue
                            fileids.add(oshash)
                            if 'path' in info:
                                del info['path']
                            # print('adding', f, info)
                            r = api.addMedia({
                                'id': oshash,
                                'item': item_id,
                                'filename': id
                            })
                            print(r)
                            assert(r['status']['code'] == 200)
                            # upload media file
                            url = '%supload/direct/' % api_url
                            r = api.upload_chunks(url, f, {
                                'id': oshash
                            })
                            print('chunk upload', oshash, r)
                            assert(r)
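    # write out what could not be parsed, found on disk or probed, for later inspection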
    with open('import_failed.json', 'w') as fd:
        json.dump(failed, fd, indent=4)
    with open('import_missing.json', 'w') as fd:
        json.dump(missing, fd, indent=4)
    with open('import_invalid.json', 'w') as fd:
        json.dump(invalid, fd, indent=4)