add london imc import script

2019-05-02 21:31:46 +02:00 · 2019-05-02 21:31:46 +02:00 · 8126466752
commit 8126466752
parent 23b0bd5f28
1 changed files with 105 additions and 0 deletions
--- a/import_scripts/london_imc_import.py
+++ b/import_scripts/london_imc_import.py
@ -0,0 +1,105 @@
+#!/usr/bin/python3
+from datetime import datetime
+import os
+import re
+
+import ox
+import lxml.html
+
+def parse_info(f):
+    info = {}
+    with open(f) as fd:
+        html = fd.read()
+    doc = lxml.html.document_fromstring(html)
+
+    mp4 = [a for a in doc.xpath('.//a') if a.text_content() == '"MP4"']
+    if not mp4:
+        return None
+    mp4 = mp4[0].attrib['href']
+    info['mp4'] = 'london.indymedia.org/shared' + mp4
+    if os.path.exists(info['mp4']):
+        info['oshash'] = ox.oshash(info['mp4'])
+    else:
+        print('video missing', info['mp4'])
+        return None
+    info['title'] = doc.xpath('.//h1[contains(@class, "title")]')[0].text_content().strip()
+    info['links'] = ['http://london.indymedia.org/videos/%s' % f.split('/')[-1].split('.')[0]]
+    info['summary'] = lxml.html.tostring(doc.xpath('.//div[contains(@class, "singlepost")]')[0].xpath('.//p')[-1]).decode().strip()[3:-4]
+
+    byline = doc.xpath('.//p[contains(@class, "byline")]')[0]
+    uploader = byline.xpath('.//strong')[0].text_content().strip()
+
+    info['summary'] += '<br>\n<br>\nCreator: %s' % uploader
+    info['depositor'] = uploader
+    info['collection'] = 'Indymedia London'
+
+    published = byline.xpath('.//small')[0].text_content().strip()
+    published = published.replace('Published:', '').strip()
+    published = datetime.strptime(published, '%B %d, %Y %H:%M').strftime('%Y-%m-%d %H:%M')
+    info['date'] = published
+
+    group = [a for a in doc.xpath('.//a') if '/groups/' in a.attrib.get('href', '')]
+    if group:
+        group = group[0].text_content().strip()
+        info['summary'] += '<br>\n<br>\nGroup: %s' % group
+
+    tags = [a for a in doc.xpath('.//a') if '/about/' in a.attrib.get('href', '')]
+    if tags:
+        info['themes'] = [t.text_content().strip().replace('_', ' ') for t in tags]
+        info['themes'] = [t for t in info['themes'] if t not in ('repression', 'solidarity')]
+        if not info['themes']:
+            del info['themes']
+
+    tags = [a for a in doc.xpath('.//a') if '/in/' in a.attrib.get('href', '')]
+    if tags:
+        info['location'] = ', '.join([t.text_content().strip().replace('_', ' ') for t in tags])
+
+    return info
+
+def parse_folder(base='london.indymedia.org/shared/system/cache/videos'):
+    index = []
+    for root, folders, files in os.walk(base):
+        for f in files:
+            f = os.path.join(root, f)
+            if f.endswith('.html'):
+                #print(f)
+                info = parse_info(f)
+                if info:
+                    index.append(info)
+    return index
+
+
+def import_video(api, info):
+    if not api.findMedia({
+        'query': {
+            'conditions': [{'key': 'oshash', 'value': info['oshash']}]
+        }
+    })['data']['items']:
+        filename = info['mp4']
+        oshash = info['oshash']
+        avinfo = ox.avinfo(filename)
+        if 'path' in avinfo:
+            del avinfo['path']
+        r = api.addMedia({
+            'id': oshash,
+            'filename': os.path.basename(filename),
+            'info': avinfo
+        })
+        item_id = r['data']['item']
+        url = '%supload/direct/' % api.url
+        if api.upload_chunks(url, filename, {'id': oshash}):
+            del info['oshash']
+            del info['mp4']
+            info['id'] = item_id
+            api.edit(info)
+        else:
+            return False
+    return True
+
+
+if __name__ == '__main__':
+    index = parse_folder()
+    api = ox.api.signin('https://amp.0x2620.org/api/')
+    for i in index:
+        if not import_video(api, i):
+            print('!! failed', i['links'])