add london imc import script
This commit is contained in:
parent
23b0bd5f28
commit
8126466752
1 changed files with 105 additions and 0 deletions
105
import_scripts/london_imc_import.py
Normal file
105
import_scripts/london_imc_import.py
Normal file
|
@ -0,0 +1,105 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
from datetime import datetime
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
import ox
|
||||||
|
import lxml.html
|
||||||
|
|
||||||
|
def _inner_html(element):
    """Return the serialized markup of a ``<p>`` element without its outer tag."""
    # with_tail=False keeps text that follows the closing tag out of the
    # output, so the closing-tag pattern below always matches at the end.
    markup = lxml.html.tostring(element, with_tail=False).decode().strip()
    # Strip the tags by pattern rather than by fixed offsets so that an
    # opening tag carrying attributes is removed completely as well.
    markup = re.sub(r'^<p\b[^>]*>', '', markup)
    return re.sub(r'</p>$', '', markup)


def parse_info(f):
    """Extract the metadata of one video page.

    f is the path of an HTML file. The page is parsed with lxml and the
    following keys are collected:

    - mp4: 'london.indymedia.org/shared' + href of the link whose text is
      '"MP4"' (a path relative to the current working directory)
    - oshash: ox.oshash() of that file
    - title: text of the first h1 whose class contains "title"
    - links: one URL built from the file name up to its first '.'
    - summary: inner HTML of the last <p> inside the first "singlepost" div,
      followed by "Creator: ..." and, if a /groups/ link exists, "Group: ..."
    - depositor: text of the first <strong> in the byline paragraph
    - collection: always 'Indymedia London'
    - date: the byline date reformatted as 'YYYY-MM-DD HH:MM'
    - themes: texts of links containing '/about/' (underscores replaced by
      spaces, 'repression' and 'solidarity' removed); omitted when empty
    - location: comma separated texts of links containing '/in/'; omitted
      when there are none

    Returns the dict, or None if the page has no "MP4" link or the video
    file does not exist on disk.

    Raises IndexError if the title, singlepost or byline elements are
    missing and ValueError if the published date has an unexpected format.
    """
    with open(f) as fd:
        html = fd.read()
    doc = lxml.html.document_fromstring(html)
    # All link based lookups below filter the same list, so query it once.
    anchors = doc.xpath('.//a')

    mp4_links = [a for a in anchors if a.text_content() == '"MP4"']
    if not mp4_links:
        return None

    info = {}
    info['mp4'] = 'london.indymedia.org/shared' + mp4_links[0].attrib['href']
    if not os.path.exists(info['mp4']):
        print('video missing', info['mp4'])
        return None
    info['oshash'] = ox.oshash(info['mp4'])

    info['title'] = doc.xpath('.//h1[contains(@class, "title")]')[0].text_content().strip()
    info['links'] = ['http://london.indymedia.org/videos/%s' % f.split('/')[-1].split('.')[0]]
    post = doc.xpath('.//div[contains(@class, "singlepost")]')[0]
    info['summary'] = _inner_html(post.xpath('.//p')[-1])

    byline = doc.xpath('.//p[contains(@class, "byline")]')[0]
    uploader = byline.xpath('.//strong')[0].text_content().strip()

    info['summary'] += '<br>\n<br>\nCreator: %s' % uploader
    info['depositor'] = uploader
    info['collection'] = 'Indymedia London'

    published = byline.xpath('.//small')[0].text_content().strip()
    published = published.replace('Published:', '').strip()
    # NOTE: %B is matched against the month names of the current locale.
    published = datetime.strptime(published, '%B %d, %Y %H:%M')
    info['date'] = published.strftime('%Y-%m-%d %H:%M')

    groups = [a for a in anchors if '/groups/' in a.attrib.get('href', '')]
    if groups:
        info['summary'] += '<br>\n<br>\nGroup: %s' % groups[0].text_content().strip()

    themes = [
        a.text_content().strip().replace('_', ' ')
        for a in anchors
        if '/about/' in a.attrib.get('href', '')
    ]
    themes = [t for t in themes if t not in ('repression', 'solidarity')]
    if themes:
        info['themes'] = themes

    places = [
        a.text_content().strip().replace('_', ' ')
        for a in anchors
        if '/in/' in a.attrib.get('href', '')
    ]
    if places:
        info['location'] = ', '.join(places)

    return info
|
||||||
|
|
||||||
|
def parse_folder(base='london.indymedia.org/shared/system/cache/videos'):
    """Collect the metadata of all video pages found below base.

    The directory tree is walked with os.walk(); every file whose path ends
    in '.html' is passed to parse_info() and falsy results are skipped.

    Returns the list of remaining results in the order they were visited.
    """
    parsed = []
    for dirpath, _dirnames, filenames in os.walk(base):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            if not path.endswith('.html'):
                continue
            entry = parse_info(path)
            if entry:
                parsed.append(entry)
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def import_video(api, info):
    """Upload one video and its metadata unless the server already has it.

    api is a signed in ox API client, info a dict as returned by
    parse_info(); it must contain 'oshash' and 'mp4'.

    If findMedia reports items for the oshash nothing is uploaded. Otherwise
    the media is registered with addMedia, the file is sent with
    upload_chunks and the remaining metadata is written with edit.

    info itself is left untouched, so the same dict can be passed again
    (e.g. to retry after an error).

    Returns False if the chunked upload failed, True otherwise (including
    the case where the media already exists).
    """
    oshash = info['oshash']
    existing = api.findMedia({
        'query': {
            'conditions': [{'key': 'oshash', 'value': oshash}]
        }
    })['data']['items']
    if existing:
        return True

    filename = info['mp4']
    avinfo = ox.avinfo(filename)
    # the local path is not sent to the server
    avinfo.pop('path', None)
    r = api.addMedia({
        'id': oshash,
        'filename': os.path.basename(filename),
        'info': avinfo
    })
    item_id = r['data']['item']
    url = '%supload/direct/' % api.url
    if not api.upload_chunks(url, filename, {'id': oshash}):
        return False

    # Send a copy without the file specific keys instead of deleting them
    # from the caller's dict.
    metadata = {
        key: value for key, value in info.items()
        if key not in ('oshash', 'mp4')
    }
    metadata['id'] = item_id
    api.edit(metadata)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Scan the default folder first, then sign in and import the videos
    # one at a time, reporting every entry that could not be uploaded.
    videos = parse_folder()
    api = ox.api.signin('https://amp.0x2620.org/api/')
    for video in videos:
        if import_video(api, video):
            continue
        print('!! failed', video['links'])
|
Loading…
Reference in a new issue