From 60ad26d201fc28b41cbc98805b35e9f9be9b3026 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Sun, 15 Mar 2015 01:07:34 +0530 Subject: [PATCH] update ubu/archive --- ox/web/archive.py | 11 ++++++++--- ox/web/ubu.py | 23 +++++++++++++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/ox/web/archive.py b/ox/web/archive.py index 7f4b572..52ac487 100644 --- a/ox/web/archive.py +++ b/ox/web/archive.py @@ -15,9 +15,14 @@ def get_data(id): details = cache.read_url('%s?output=json' % url) details = json.loads(details) for key in ('title', 'description', 'runtime'): - data[key] = details['metadata'][key] - if isinstance(data[key], list): - data[key] = data[key][0] + if key in details['metadata']: + data[key] = details['metadata'][key] + if isinstance(data[key], list): + data[key] = data[key][0] + if isinstance(data[key], basestring): + data[key] = data[key].strip() + if data[key][0] == '[' and data[key][-1] == ']': + data[key] = data[key][1:-1] data['url'] = url data['image'] = 'http://archive.org/download/%s/format=thumbnail' % id data['ogg'] = 'http://archive.org/download/%s/format=Ogg+video' % id diff --git a/ox/web/ubu.py b/ox/web/ubu.py index 7286234..5870bc1 100644 --- a/ox/web/ubu.py +++ b/ox/web/ubu.py @@ -3,12 +3,14 @@ from __future__ import print_function import re -from ox import find_re, strip_tags, decode_html +import lxml.html + +from ox import strip_tags, decode_html from ox.cache import read_url def get_id(url): - return url.replace('http://www.ubu.com/', '').split('.html')[0] + return url.replace('http://www.ubu.com/', '').split('.html')[0].replace('/./', '/') def get_url(id): return 'http://www.ubu.com/%s.html' % id @@ -41,6 +43,22 @@ def get_data(url): m['flv'] = match[0] m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20') + match = re.compile('''src=(.*?) type="video/mp4"''').findall(data) + if match: + m['mp4'] = match[0].strip('"').strip("'") + + doc = lxml.html.document_fromstring(read_url(url)) + desc = doc.xpath("//div[contains(@id, 'ubudesc')]") + if len(desc): + txt = [] + for part in desc[0].text_content().split('\n\n'): + if part == 'RESOURCES:': + break + txt.append(part) + if txt: + if len(txt) > 1: + txt = txt[1:] + m['description'] = '\n\n'.join(txt).strip() y = re.compile('\((\d{4})\)').findall(data) if y: m['year'] = int(y[0]) @@ -61,6 +79,7 @@ def get_data(url): m['artist'] = strip_tags(decode_html(a[0])).strip() elif m['id'] == 'film/lawder_color': m['artist'] = 'Standish Lawder' + if 'artist' in m: m['artist'] = m['artist'].replace('in UbuWeb Film', '') m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()