#!/usr/bin/python
from datetime import datetime
import json
import os
import re
import sqlite3
import sys

import ox
import pandora_client
import lxml.html


def get_html(doc, classname):
    # return the sanitized inner HTML of the first element with the given class
    element = '*'
    d = doc.xpath("//%s[contains(@class, '%s')]" % (element, classname))
    if not len(d):
        return ''
    d = d[0]
    html = lxml.html.tostring(d, pretty_print=True)
    html = ox.sanitize_html(html).replace('', '').strip()
    # strip trailing line breaks
    while html.endswith('<br>\n\n'):
        html = html.rstrip('<br>\n\n').strip()
    # drop the wrapping element's own opening and closing tags
    html = '\n'.join(html.split('\n')[1:-1]).strip()
    html = re.sub('', '', html)
    # make links and image sources absolute
    html = html.replace('href="/', 'href="https://www.indymedia.org.uk/')
    html = html.replace('src="/', 'src="https://www.indymedia.org.uk/')
    return html


def parse_content(path):
    # extract metadata, media links and summary HTML from a saved article page
    info = {}
    with open(path) as fd:
        data = fd.read().decode('utf-8')
    doc = lxml.html.fromstring(data)
    title = doc.xpath("//a[contains(@class, 'arttitle')]")[0].text_content().strip()
    url = 'https://' + path[path.index('www.indymedia.org.uk'):].replace('/content/', '/en/')
    info['links'] = [url]
    info['title'] = title
    p = doc.xpath("//p[contains(@class, 'date')]")[0]
    date = p.text_content().strip()
    date = re.compile(r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}').findall(date)
    if date:
        info['date'] = datetime.strptime(date[0], '%d.%m.%Y %H:%M').strftime('%Y-%m-%d %H:%M')
    info['themes'] = [
        a.text_content() for a in p.xpath('//a')
        if a.attrib.get('href', '').startswith('/en/topics')
    ]
    info['themes'] += [
        a.text_content() for a in p.xpath('//a')
        if a.attrib.get('href', '').startswith('/en/actions')
    ]
    l = ', '.join([
        a.text_content() for a in p.xpath('//a')
        if a.attrib.get('href', '').startswith('/en/regions')
    ])
    if l:
        info['location'] = l
    # collect linked media files, deduplicated by path
    mids = set()
    media = []
    for id, title in re.compile('href="(/media/.*?)">(.*?)</a>', re.DOTALL).findall(data) \
            + re.compile('href="http://www.indymedia.org.uk(/media/.*?)">(.*?)</a>', re.DOTALL).findall(data):
        id = ox.decode_html(re.sub(r'\s+', ' ', id.strip()).replace('//', '/'))
        title = ox.decode_html(re.sub(r'\s+', ' ', title.strip()).replace('//', '/'))
        if id not in mids:
            mids.add(id)
            media.append([id, title])
    info['media'] = media
    info['video'] = [m for m in media if m[0].split('.')[-1] in ox.file.EXTENSIONS['video']]
    images = re.compile('src="(/images/.*?)".*?alt="(.*?)">', re.DOTALL).findall(data)
    images = [
        [ox.decode_html(re.sub(r'\s+', ' ', e.strip()).replace('//', '/')) for e in r]
        for r in images
    ]
    info['images'] = images
    info['abstract'] = get_html(doc, 'intro')
    info['content'] = get_html(doc, 'articlecontent')
    try:
        d = doc.xpath("//p[contains(@class, 'creator')]")[0]
        info['creator'] = {
            'name': d.xpath('.//strong')[0].text_content()
        }
        info['depositor'] = info['creator']['name']
        info['creator']['url'] = d.xpath('.//a')[0].attrib['href']
    except:
        pass
    # assemble the summary: abstract, images, article body, creator credit
    content = []
    content.append(info['abstract'])
    images = []
    for image in info['images']:
        # wrap each image with its alt text as a caption
        img = u'<img src="{url}">\n{info}\n'.format(
            url='https://www.indymedia.org.uk' + image[0],
            info=image[1])
        images.append(img)
    content.append(u'\n'.join(images))
    content.append(info['content'])
    if 'creator' in info and 'url' in info['creator']:
        content.append(u'Creator: <a href="{url}">{name}</a>'.format(**info['creator']))
    info['summary'] = u'\n\n'.join(content).strip()
    # collapse runs of blank lines
    info['summary'] = re.sub('\n\n+', '\n\n', info['summary'])
    return info


if __name__ == '__main__':
    api_url = 'https://urg.0x2620.org/api/'
    api = pandora_client.API(api_url)
    api.signin(username='import', password='indyport')

    failed = []
    missing = []
    invalid = []
    fileids = set()
    # walk the local mirror and import articles that link to video files
    for root, folders, files in os.walk('www.indymedia.org.uk/content/', topdown=True):
        folders.sort()
        for f in sorted(files):
            if f.endswith('.html'):
                id = f.split('/')[-1].split('.')[0]
                url = os.path.join(root, f).replace('www.indymedia.org.uk/content/', '')
                path = os.path.join(root, f)
                url = 'https://www.indymedia.org.uk/en/' + url
                with open(path) as fd:
                    data = fd.read()
                if '' in data:
                    try:
                        p = parse_content(path)
                    except:
                        if '/media/' in data:
                            failed.append(url)
                        continue
                    if p['video']:
                        print(url)
                        # look up the item by its source link
                        r = api.find({
                            'query': {'conditions': [{'key': 'links', 'value': url, 'operator': '=='}]},
                            'keys': ['id'],
                            'range': [0, 100]
                        })
                        info = {}
                        for key in ('title', 'summary', 'links', 'themes', 'location', 'date', 'depositor'):
                            if key in p:
                                info[key] = p[key]
                        if r['data']['items']:
                            item_id = r['data']['items'][0]['id']
                            '''
                            info['id'] = item_id
                            r = api.edit(**info)
                            if r['status']['code'] != 200:
                                print(r)
                            assert(r['status']['code'] == 200)
                            # fixme, sync videos
                            '''
                        else:
                            # create the item and upload its video files
                            item_id = api.add(title=p['title'])['data']['id']
                            info['id'] = item_id
                            r = api.edit(**info)
                            if r['status']['code'] != 200:
                                print(item_id)
                                print(info)
                                print(r)
                            assert(r['status']['code'] == 200)
                            for id, title in p['video']:
                                f = 'www.indymedia.org.uk' + id
                                f = os.path.abspath(f)
                                if not os.path.exists(f):
                                    missing.append(f)
                                    continue
                                info = ox.avinfo(f)
                                if 'error' in info or 'oshash' not in info:
                                    invalid.append(f)
                                    continue
                                oshash = info['oshash']
                                if oshash in fileids:
                                    print('WTF', oshash, 'known')
                                    continue
                                fileids.add(oshash)
                                if 'path' in info:
                                    del info['path']
                                # print('adding', f, info)
                                r = api.addMedia({
                                    'id': oshash,
                                    'item': item_id,
                                    'filename': id
                                })
                                print(r)
                                assert(r['status']['code'] == 200)
                                # upload media file
                                url = '%supload/direct/' % api_url
                                r = api.upload_chunks(url, f, {
                                    'id': oshash
                                })
                                print('chunk upload', oshash, r)
                                assert(r)
    with open('import_failed.json', 'w') as fd:
        json.dump(failed, fd, indent=4)
    with open('import_missing.json', 'w') as fd:
        json.dump(missing, fd, indent=4)
    with open('import_invalid.json', 'w') as fd:
        json.dump(invalid, fd, indent=4)