Better ubu parser: clean up titles, normalize mp4 URLs, accept .m4v videos

This commit is contained in:
j 2015-04-24 19:02:25 +02:00
parent 47bdf3c897
commit 5c883e19e6

View file

@@ -45,6 +45,8 @@ def get_data(url):
m['title'] = match[0].strip() m['title'] = match[0].strip()
if ' - ' in m['title']: if ' - ' in m['title']:
m['title'] = m['title'].split(' - ', 1)[-1] m['title'] = m['title'].split(' - ', 1)[-1]
if 'title' in m:
m['title'] = strip_tags(decode_html(m['title']).strip())
match = re.compile("flashvars','file=(.*?.flv)'").findall(data) match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
if match: if match:
m['flv'] = match[0] m['flv'] = match[0]
@@ -52,8 +54,10 @@ def get_data(url):
match = re.compile('''src=(.*?) type="video/mp4"''').findall(data) match = re.compile('''src=(.*?) type="video/mp4"''').findall(data)
if match: if match:
m['mp4'] = match[0].strip('"').strip("'") m['mp4'] = match[0].strip('"').strip("'").replace(' ', '%20')
elif 'video' in m and m['video'].endswith('.mp4'): if not m['mp4'].startswith('http'):
m['mp4'] = 'http://ubumexico.centro.org.mx/video/' + m['mp4']
elif 'video' in m and (m['video'].endswith('.mp4') or m['video'].endswith('.m4v')):
m['mp4'] = m['video'] m['mp4'] = m['video']
doc = lxml.html.document_fromstring(read_url(url)) doc = lxml.html.document_fromstring(read_url(url))