#!/usr/bin/python3
"""Import Indymedia London videos from a local site mirror.

The script walks a local copy of the cached video pages under
``london.indymedia.org/shared/system/cache/videos``, extracts metadata from
each HTML page, and uploads every video that is not yet known to the API at
https://amp.0x2620.org/api/.

Provenance: added as ``import_scripts/london_imc_import.py`` in commit
81264667522cd9b98660deab71f2de44e3bfca88 ("add london imc import script",
2 May 2019).
"""
from datetime import datetime
import os
import re

import ox
import lxml.html


def _anchors_with_href(anchors, marker):
    """Return the anchors from *anchors* whose ``href`` contains *marker*."""
    return [a for a in anchors if marker in a.attrib.get('href', '')]


def parse_info(f):
    """Extract the metadata of one cached video page.

    Args:
        f: Path of the cached HTML page.

    Returns:
        A dict with the keys ``mp4``, ``oshash``, ``title``, ``links``,
        ``summary``, ``depositor``, ``collection`` and ``date``, plus
        ``themes`` and ``location`` when the page links to any. Returns
        ``None`` when the page has no "MP4" link or the linked video file is
        missing on disk.

    Raises:
        IndexError: If the page lacks the title, post body or byline
            elements this parser expects.
        ValueError: If the publication date does not match
            ``'%B %d, %Y %H:%M'``.
    """
    info = {}
    # NOTE(review): the page is decoded with the platform default encoding;
    # confirm the mirrored pages match it.
    with open(f) as fd:
        html = fd.read()
    doc = lxml.html.document_fromstring(html)
    # Collect the links once; they are filtered several times below.
    anchors = doc.xpath('.//a')

    mp4 = [a for a in anchors if a.text_content() == '"MP4"']
    if not mp4:
        return None
    mp4 = mp4[0].attrib['href']
    # The href is appended verbatim to the local mirror root.
    info['mp4'] = 'london.indymedia.org/shared' + mp4
    if os.path.exists(info['mp4']):
        info['oshash'] = ox.oshash(info['mp4'])
    else:
        print('video missing', info['mp4'])
        return None

    info['title'] = doc.xpath('.//h1[contains(@class, "title")]')[0].text_content().strip()
    # The public URL is derived from the file name up to its first dot.
    # os.path.basename keeps this correct for any path separator.
    page_id = os.path.basename(f).split('.')[0]
    info['links'] = ['http://london.indymedia.org/videos/%s' % page_id]

    post = doc.xpath('.//div[contains(@class, "singlepost")]')[0]
    last_paragraph = post.xpath('.//p')[-1]
    # with_tail=False keeps text following the paragraph out of the output,
    # so the slice below only ever removes the paragraph's own tags.
    # [3:-4] drops "<p>" and "</p>"; this assumes the tag has no attributes.
    info['summary'] = lxml.html.tostring(
        last_paragraph, with_tail=False
    ).decode().strip()[3:-4]

    byline = doc.xpath('.//p[contains(@class, "byline")]')[0]
    uploader = byline.xpath('.//strong')[0].text_content().strip()

    # NOTE(review): the "<br>" tags are reconstructed; the literal was split
    # across lines in the mangled source. Confirm against the original commit.
    info['summary'] += '<br>\n<br>\nCreator: %s' % uploader
    info['depositor'] = uploader
    info['collection'] = 'Indymedia London'

    published = byline.xpath('.//small')[0].text_content().strip()
    published = published.replace('Published:', '').strip()
    published = datetime.strptime(published, '%B %d, %Y %H:%M').strftime('%Y-%m-%d %H:%M')
    info['date'] = published

    group = _anchors_with_href(anchors, '/groups/')
    if group:
        # Only the first group link is recorded.
        group = group[0].text_content().strip()
        info['summary'] += '<br>\n<br>\nGroup: %s' % group

    # 'repression' and 'solidarity' are skipped as themes; the reason is not
    # recorded in the script.
    themes = [
        a.text_content().strip().replace('_', ' ')
        for a in _anchors_with_href(anchors, '/about/')
    ]
    themes = [t for t in themes if t not in ('repression', 'solidarity')]
    if themes:
        info['themes'] = themes

    places = _anchors_with_href(anchors, '/in/')
    if places:
        info['location'] = ', '.join(
            [a.text_content().strip().replace('_', ' ') for a in places]
        )

    return info


def parse_folder(base='london.indymedia.org/shared/system/cache/videos'):
    """Parse every ``.html`` file below *base*.

    Args:
        base: Directory that is walked recursively.

    Returns:
        A list with the ``parse_info`` result of every page that yielded
        one; pages for which ``parse_info`` returned ``None`` are skipped.
    """
    index = []
    for root, folders, files in os.walk(base):
        for name in files:
            if not name.endswith('.html'):
                continue
            info = parse_info(os.path.join(root, name))
            if info:
                index.append(info)
    return index


def import_video(api, info):
    """Upload one video and its metadata unless it is already known.

    Args:
        api: Signed-in API client, as returned by ``ox.api.signin``.
        info: Metadata dict as produced by ``parse_info``. It is not
            modified.

    Returns:
        ``True`` if media with this oshash already exists or the upload
        succeeded, ``False`` if the upload failed.
    """
    oshash = info['oshash']
    existing = api.findMedia({
        'query': {
            'conditions': [{'key': 'oshash', 'value': oshash}]
        }
    })['data']['items']
    if existing:
        return True

    filename = info['mp4']
    avinfo = ox.avinfo(filename)
    # The local path is removed before the media info is sent.
    avinfo.pop('path', None)
    r = api.addMedia({
        'id': oshash,
        'filename': os.path.basename(filename),
        'info': avinfo
    })
    item_id = r['data']['item']
    url = '%supload/direct/' % api.url
    if not api.upload_chunks(url, filename, {'id': oshash}):
        return False

    # Send everything except the local bookkeeping keys, without mutating the
    # caller's dict.
    metadata = {k: v for k, v in info.items() if k not in ('oshash', 'mp4')}
    metadata['id'] = item_id
    api.edit(metadata)
    return True


if __name__ == '__main__':
    index = parse_folder()
    api = ox.api.signin('https://amp.0x2620.org/api/')
    for i in index:
        if not import_video(api, i):
            print('!! failed', i['links'])