#!/usr/bin/python3
"""Scrape cached Indymedia London video pages and import them into a pan.do/ra
instance (amp.0x2620.org) via the ox API."""
from datetime import datetime
import os
import re

import ox
import lxml.html


def parse_info(f):
    """Parse one cached video HTML page into a metadata dict.

    f: path to a cached .html page under the shared/ tree.
    Returns a dict with keys mp4, oshash, title, links, summary, depositor,
    collection, date and optionally themes/location — or None when the page
    has no MP4 download link or the referenced video file is missing on disk.
    """
    info = {}
    with open(f) as fd:
        html = fd.read()
    doc = lxml.html.document_fromstring(html)
    # The download anchor's visible text is literally '"MP4"' (quotes included).
    mp4 = [a for a in doc.xpath('.//a') if a.text_content() == '"MP4"']
    if not mp4:
        return None
    mp4 = mp4[0].attrib['href']
    # href is site-absolute; prefix with the local mirror root to get a file path.
    info['mp4'] = 'london.indymedia.org/shared' + mp4
    if os.path.exists(info['mp4']):
        info['oshash'] = ox.oshash(info['mp4'])
    else:
        print('video missing', info['mp4'])
        return None
    info['title'] = doc.xpath('.//h1[contains(@class, "title")]')[0].text_content().strip()
    info['links'] = ['http://london.indymedia.org/videos/%s' % f.split('/')[-1].split('.')[0]]
    # Last paragraph of the post body; [3:-4] strips the surrounding <p>...</p> tags.
    info['summary'] = lxml.html.tostring(
        doc.xpath('.//div[contains(@class, "singlepost")]')[0].xpath('.//p')[-1]
    ).decode().strip()[3:-4]
    byline = doc.xpath('.//p[contains(@class, "byline")]')[0]
    uploader = byline.xpath('.//strong')[0].text_content().strip()
    info['summary'] += '\n\nCreator: %s' % uploader
    info['depositor'] = uploader
    info['collection'] = 'Indymedia London'
    # Byline <small> looks like 'Published: March 1, 2010 12:34'.
    published = byline.xpath('.//small')[0].text_content().strip()
    published = published.replace('Published:', '').strip()
    published = datetime.strptime(published, '%B %d, %Y %H:%M').strftime('%Y-%m-%d %H:%M')
    info['date'] = published
    group = [a for a in doc.xpath('.//a') if '/groups/' in a.attrib.get('href', '')]
    if group:
        group = group[0].text_content().strip()
        info['summary'] += '\n\nGroup: %s' % group
    tags = [a for a in doc.xpath('.//a') if '/about/' in a.attrib.get('href', '')]
    if tags:
        info['themes'] = [t.text_content().strip().replace('_', ' ') for t in tags]
        # These two tags are too generic to be useful as themes.
        info['themes'] = [t for t in info['themes'] if t not in ('repression', 'solidarity')]
        if not info['themes']:
            del info['themes']
    tags = [a for a in doc.xpath('.//a') if '/in/' in a.attrib.get('href', '')]
    if tags:
        info['location'] = ', '.join([t.text_content().strip().replace('_', ' ') for t in tags])
    return info


def parse_folder(base='london.indymedia.org/shared/system/cache/videos'):
    """Walk the cache tree, parse every .html page, and return the list of infos."""
    index = []
    for root, folders, files in os.walk(base):
        for f in files:
            f = os.path.join(root, f)
            if f.endswith('.html'):
                #print(f)
                info = parse_info(f)
                if info:
                    index.append(info)
    return index


def import_video(api, info):
    """Upload one video and its metadata unless it already exists.

    Existing items are matched by oshash. Returns False when the chunked
    upload fails, True otherwise (including the already-imported case).
    Mutates info: on success, oshash/mp4 are removed and id is set.
    """
    if not api.findMedia({
        'query': {
            'conditions': [{'key': 'oshash', 'value': info['oshash']}]
        }
    })['data']['items']:
        filename = info['mp4']
        oshash = info['oshash']
        avinfo = ox.avinfo(filename)
        # Do not leak the local filesystem path to the server.
        if 'path' in avinfo:
            del avinfo['path']
        r = api.addMedia({
            'id': oshash,
            'filename': os.path.basename(filename),
            'info': avinfo
        })
        item_id = r['data']['item']
        url = '%supload/direct/' % api.url
        if api.upload_chunks(url, filename, {'id': oshash}):
            # oshash/mp4 are local bookkeeping, not item metadata.
            del info['oshash']
            del info['mp4']
            info['id'] = item_id
            api.edit(info)
        else:
            return False
    return True


if __name__ == '__main__':
    index = parse_folder()
    api = ox.api.signin('https://amp.0x2620.org/api/')
    for i in index:
        if not import_video(api, i):
            print('!! failed', i['links'])