add import scripts

parent 925136e79b
commit c5cd5cb8cb

2 changed files with 189 additions and 0 deletions

download.sh (new executable file, 2 lines)
@@ -0,0 +1,2 @@
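# -r: recurse into linked pages, -c: resume partial downloads, -np: never
# ascend above the start directory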
wget -r -c -np https://www.indymedia.org.uk/media/
wget -r -c -np https://www.indymedia.org.uk/content/

video_import.py (new file, 187 lines)
@@ -0,0 +1,187 @@
#!/usr/bin/python
from datetime import datetime
import json
import os
import re
import sqlite3
import sys

import ox
import pandora_client
import lxml.html
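
# Import Indymedia UK articles that contain video into a pan.do/ra instance:
# walk a local wget mirror of www.indymedia.org.uk (see download.sh), parse
# each article page, then create items and upload the linked video files via
# pandora_client. Written for Python 2 (note the str/bytes handling).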
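
# Return sanitized inner HTML of the first element carrying the given class,
# with trailing <br> runs removed and site-relative links made absolute.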
def get_html(doc, classname):
    element = '*'
    d = doc.xpath("//%s[contains(@class, '%s')]" % (element, classname))
    if not len(d):
        return ''
    d = d[0]
    html = lxml.html.tostring(d, pretty_print=True)
    html = ox.sanitize_html(html).replace('<a></a>', '').strip()
    # drop trailing <br><br> runs; str.rstrip('<br><br>') strips characters,
    # not the substring, so cut the literal suffix instead
    while html.endswith('<br><br>'):
        html = html[:-len('<br><br>')].strip()
    html = '\n'.join(html.split('\n')[1:-1]).strip()
    html = re.sub('<img src="/img/.*?">', '', html)
    html = html.replace('href="/', 'href="https://www.indymedia.org.uk/')
    html = html.replace('src="/', 'src="https://www.indymedia.org.uk/')
    return html
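
# Parse one mirrored article page into a metadata dict: title, canonical URL,
# date, themes, location, linked media files, inline images and a summary.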
def parse_content(path):
    info = {}
    with open(path) as fd:
        data = fd.read().decode('utf-8')
    doc = lxml.html.fromstring(data)
    title = doc.xpath("//a[contains(@class, 'arttitle')]")[0].text_content().strip()
    url = 'https://' + path[path.index('www.indymedia.org.uk'):].replace('/content/', '/en/')
    info['links'] = [url]
    info['title'] = title
    p = doc.xpath("//p[contains(@class, 'date')]")[0]
    date = p.text_content().strip()
    date = re.compile(r'\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}').findall(date)
    if date:
        info['date'] = datetime.strptime(date[0], '%d.%m.%Y %H:%M').strftime('%Y-%m-%d %H:%M')
    info['themes'] = [a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/topics')]
    info['themes'] += [a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/actions')]
    l = ', '.join([a.text_content() for a in p.xpath('//a') if a.attrib.get('href', '').startswith('/en/regions')])
    if l:
        info['location'] = l
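
    # collect linked /media/ files (relative and absolute URLs), keeping
    # document order and dropping duplicates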
    mids = set()
    media = []
    for id, title in re.compile('href="(/media/.*?)">(.*?)</a>', re.DOTALL).findall(data) \
            + re.compile('href="http://www.indymedia.org.uk(/media/.*?)">(.*?)</a>', re.DOTALL).findall(data):
        id = ox.decode_html(re.sub(r'\s+', ' ', id.strip()).replace('//', '/'))
        title = ox.decode_html(re.sub(r'\s+', ' ', title.strip()).replace('//', '/'))
        if id not in mids:
            mids.add(id)
            media.append([id, title])

    info['media'] = media
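
    # keep only the media entries whose file extension ox classifies as video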
    info['video'] = [m for m in media if m[0].split('.')[-1] in ox.file.EXTENSIONS['video']]
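
    # inline images and their alt-text captions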
    images = re.compile('src="(/images/.*?)".*?alt="(.*?)">', re.DOTALL).findall(data)
    images = [[ox.decode_html(re.sub(r'\s+', ' ', e.strip()).replace('//', '/')) for e in r] for r in images]
    info['images'] = images

    info['abstract'] = get_html(doc, 'intro')
    info['content'] = get_html(doc, 'articlecontent')
    try:
        d = doc.xpath("//p[contains(@class, 'creator')]")[0]
        info['creator'] = {
            'name': d.xpath('.//strong')[0].text_content()
        }
        info['depositor'] = info['creator']['name']
        info['creator']['url'] = d.xpath('.//a')[0].attrib['href']
    except (IndexError, KeyError):
        # page has no creator paragraph, or the creator has no profile link
        pass
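
    # assemble the summary: abstract, image figures, article body, creator credit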
    content = []
    content.append(info['abstract'])
    images = []
    for image in info['images']:
        img = u'<figure><img src="{url}"><figcaption>{info}</figcaption></figure>'.format(
            url='https://www.indymedia.org.uk' + image[0], info=image[1])
        images.append(img)
    content.append(u'\n'.join(images))
    content.append(info['content'])
    if 'creator' in info and 'url' in info['creator']:
        content.append(u'Creator: <a href="{url}">{name}</a>'.format(**info['creator']))
    info['summary'] = u'\n\n'.join(content).strip()
    # collapse blank-line runs; re.sub takes (pattern, replacement, string),
    # the original had the arguments reversed
    info['summary'] = re.sub('\n\n+', '\n\n', info['summary'])
    return info
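

# Sign in to the pan.do/ra API, walk the mirrored /content/ pages, create or
# look up one item per article, then attach and upload its video files.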
if __name__ == '__main__':
    api_url = 'https://urg.0x2620.org/api/'
    api = pandora_client.API(api_url)
    api.signin(username='import', password='indyport')

    failed = []
    missing = []
    invalid = []
    fileids = set()
    for root, folders, files in os.walk('www.indymedia.org.uk/content/', topdown=True):
        folders.sort()
        for f in sorted(files):
            if f.endswith('.html'):
                id = f.split('/')[-1].split('.')[0]
                url = os.path.join(root, f).replace('www.indymedia.org.uk/content/', '')
                path = os.path.join(root, f)
                url = 'https://www.indymedia.org.uk/en/' + url
                with open(path) as fd:
                    data = fd.read()
                if '<!-- content -->' in data:
                    try:
                        p = parse_content(path)
                    except:
                        # only record failures for pages that link media files
                        if '/media/' in data:
                            failed.append(url)
                        continue
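                    # only import articles that link at least one video file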
                    if p['video']:
                        print(url)
                        r = api.find({
                            'query': {'conditions': [{'key': 'links', 'value': url, 'operator': '=='}]},
                            'keys': ['id'],
                            'range': [0, 100]
                        })
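                        # copy the parsed metadata into the edit payload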
                        info = {}
                        for key in ('title', 'summary', 'links', 'themes', 'location', 'date', 'depositor'):
                            if key in p:
                                info[key] = p[key]
                        if r['data']['items']:
                            item_id = r['data']['items'][0]['id']
                            '''
                            info['id'] = item_id
                            r = api.edit(**info)
                            if r['status']['code'] != 200:
                                print(r)
                            assert(r['status']['code'] == 200)
                            # fixme, sync videos
                            '''
                        else:
                            item_id = api.add(title=p['title'])['data']['id']
                            info['id'] = item_id
                            r = api.edit(**info)
                            if r['status']['code'] != 200:
                                print(item_id)
                                print(info)
                                print(r)
                            assert(r['status']['code'] == 200)
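                        # register each video file with the item, then upload its data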
                        for id, title in p['video']:
                            # note: f and info are reused here, shadowing the
                            # outer filename and the edit payload
                            f = 'www.indymedia.org.uk' + id
                            f = os.path.abspath(f)
                            if not os.path.exists(f):
                                missing.append(f)
                                continue
                            info = ox.avinfo(f)
                            if 'error' in info or 'oshash' not in info:
                                invalid.append(f)
                                continue
                            oshash = info['oshash']
                            if oshash in fileids:
                                print('WTF', oshash, 'known')
                                continue
                            fileids.add(oshash)
                            if 'path' in info:
                                del info['path']
                            # print('adding', f, info)
                            r = api.addMedia({
                                'id': oshash,
                                'item': item_id,
                                'filename': id
                            })
                            print(r)
                            assert(r['status']['code'] == 200)
                            # upload media file
                            url = '%supload/direct/' % api_url
                            r = api.upload_chunks(url, f, {
                                'id': oshash
                            })
                            print('chunk upload', oshash, r)
                            assert(r)
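
    # record everything that could not be imported for later inspection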
    with open('import_failed.json', 'w') as fd:
        json.dump(failed, fd, indent=4)
    with open('import_missing.json', 'w') as fd:
        json.dump(missing, fd, indent=4)
    with open('import_invalid.json', 'w') as fd:
        json.dump(invalid, fd, indent=4)