From 60ad26d201fc28b41cbc98805b35e9f9be9b3026 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Sun, 15 Mar 2015 01:07:34 +0530
Subject: [PATCH] update ubu/archive: tolerate missing metadata, add mp4 and description

---
 ox/web/archive.py | 11 ++++++++---
 ox/web/ubu.py     | 23 +++++++++++++++++++++--
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/ox/web/archive.py b/ox/web/archive.py
index 7f4b572..52ac487 100644
--- a/ox/web/archive.py
+++ b/ox/web/archive.py
@@ -15,9 +15,14 @@ def get_data(id):
     details = cache.read_url('%s?output=json' % url)
     details = json.loads(details)
     for key in ('title', 'description', 'runtime'):
-        data[key] = details['metadata'][key]
-        if isinstance(data[key], list):
-            data[key] = data[key][0]
+        if key in details['metadata']:  # each of these metadata keys is optional
+            data[key] = details['metadata'][key]
+            if isinstance(data[key], list):  # multi-valued field: keep the first entry
+                data[key] = data[key][0]
+            if isinstance(data[key], basestring):
+                data[key] = data[key].strip()
+                if data[key].startswith('[') and data[key].endswith(']'):  # safe on ''
+                    data[key] = data[key][1:-1]
     data['url'] = url
     data['image'] = 'http://archive.org/download/%s/format=thumbnail' % id
     data['ogg'] = 'http://archive.org/download/%s/format=Ogg+video' % id
diff --git a/ox/web/ubu.py b/ox/web/ubu.py
index 7286234..5870bc1 100644
--- a/ox/web/ubu.py
+++ b/ox/web/ubu.py
@@ -3,12 +3,14 @@
 from __future__ import print_function
 import re
 
-from ox import find_re, strip_tags, decode_html
+import lxml.html
+
+from ox import strip_tags, decode_html
 from ox.cache import read_url
 
 
 def get_id(url):
-    return url.replace('http://www.ubu.com/', '').split('.html')[0]
+    return url.replace('http://www.ubu.com/', '').split('.html')[0].replace('/./', '/')
 
 def get_url(id):
     return 'http://www.ubu.com/%s.html' % id
@@ -41,6 +43,22 @@ def get_data(url):
         m['flv'] = match[0]
         m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')
 
+    match = re.compile('''src=(.*?) type="video/mp4"''').findall(data)
+    if match:
+        m['mp4'] = match[0].strip('"').strip("'")  # drop quotes around the src value
+
+    doc = lxml.html.document_fromstring(read_url(url))  # NOTE(review): second read_url(url); presumably cached - confirm
+    desc = doc.xpath("//div[contains(@id, 'ubudesc')]")
+    if len(desc):
+        txt = []
+        for part in desc[0].text_content().split('\n\n'):  # blank-line separated paragraphs
+            if part == 'RESOURCES:':  # keep only the text before the RESOURCES: paragraph
+                break
+            txt.append(part)
+        if txt:
+            if len(txt) > 1:  # several paragraphs: drop the first (presumably a heading - confirm)
+                txt = txt[1:]
+            m['description'] = '\n\n'.join(txt).strip()
     y = re.compile('\((\d{4})\)').findall(data)
     if y:
         m['year'] = int(y[0])
@@ -61,6 +79,7 @@ def get_data(url):
                 m['artist'] = strip_tags(decode_html(a[0])).strip()
             elif m['id'] == 'film/lawder_color':
                 m['artist'] = 'Standish Lawder'
+
     if 'artist' in m:
         m['artist'] = m['artist'].replace('in UbuWeb Film', '')
         m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()