ubu cleanup

This commit is contained in:
j 2015-04-26 15:29:32 +02:00
parent 5c883e19e6
commit b147c61f5c

View file

@@ -39,6 +39,10 @@ def get_data(url):
if 'title' in m: if 'title' in m:
m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title']) m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])
if not 'title' in m:
match = re.compile('<span id="ubuwork">(.*?)</span>').findall(data)
if match:
m['title'] = strip_tags(decode_html(match[0])).strip()
if not 'title' in m: if not 'title' in m:
match = re.compile("<title>.*?&amp;(.*?)</title>").findall(data) match = re.compile("<title>.*?&amp;(.*?)</title>").findall(data)
if match: if match:
@@ -72,7 +76,7 @@ def get_data(url):
if txt: if txt:
if len(txt) > 1 and txt[0].strip() == m.get('title'): if len(txt) > 1 and txt[0].strip() == m.get('title'):
txt = txt[1:] txt = txt[1:]
m['description'] = '\n\n'.join(txt).split('RESOURCES')[0].strip() m['description'] = '\n\n'.join(txt).split('RESOURCES')[0].split('RELATED')[0].strip()
y = re.compile('\((\d{4})\)').findall(data) y = re.compile('\((\d{4})\)').findall(data)
if y: if y:
m['year'] = int(y[0]) m['year'] = int(y[0])