better ubu parser

2015-04-24 19:02:25 +02:00 · 2015-04-24 19:02:25 +02:00 · 5c883e19e6
commit 5c883e19e6
parent 47bdf3c897
1 changed file with 6 additions and 2 deletions
--- a/ox/web/ubu.py
+++ b/ox/web/ubu.py
@@ -45,6 +45,8 @@ def get_data(url):
            m['title'] = match[0].strip()
            if ' - ' in m['title']:
                m['title'] = m['title'].split(' - ', 1)[-1]
+    if 'title' in m:
+        m['title'] = strip_tags(decode_html(m['title']).strip())
    match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
    if match:
        m['flv'] = match[0]
@@ -52,8 +54,10 @@ def get_data(url):

    match = re.compile('''src=(.*?) type="video/mp4"''').findall(data)
    if match:
-        m['mp4'] = match[0].strip('"').strip("'")
-    elif 'video' in m and m['video'].endswith('.mp4'):
+        m['mp4'] = match[0].strip('"').strip("'").replace(' ', '%20')
+        if not m['mp4'].startswith('http'):
+            m['mp4'] = 'http://ubumexico.centro.org.mx/video/' + m['mp4']
+    elif 'video' in m and (m['video'].endswith('.mp4') or m['video'].endswith('.m4v')):
        m['mp4'] = m['video']

    doc = lxml.html.document_fromstring(read_url(url))