ubu cleanup

2015-04-26 15:29:32 +02:00 · 2015-04-26 15:29:32 +02:00 · b147c61f5c
commit b147c61f5c
parent 5c883e19e6
1 changed file with 5 additions and 1 deletion
--- a/ox/web/ubu.py
+++ b/ox/web/ubu.py
@@ -39,6 +39,10 @@ def get_data(url):
    if 'title' in m:
        m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])

+    if not 'title' in m:
+        match = re.compile('<span id="ubuwork">(.*?)</span>').findall(data)
+        if match:
+            m['title'] = strip_tags(decode_html(match[0])).strip()
    if not 'title' in m:
        match = re.compile("<title>.*?&amp;(.*?)</title>").findall(data)
        if match:
@@ -72,7 +76,7 @@ def get_data(url):
        if txt:
            if len(txt) > 1 and txt[0].strip() == m.get('title'):
                txt = txt[1:]
-            m['description'] = '\n\n'.join(txt).split('RESOURCES')[0].strip()
+            m['description'] = '\n\n'.join(txt).split('RESOURCES')[0].split('RELATED')[0].strip()
    y = re.compile('\((\d{4})\)').findall(data)
    if y:
        m['year'] = int(y[0])