update ubu/archive
parent 7f7b0c3ee8
commit 60ad26d201
2 changed files with 29 additions and 5 deletions
@@ -15,9 +15,14 @@ def get_data(id):
     details = cache.read_url('%s?output=json' % url)
     details = json.loads(details)
     for key in ('title', 'description', 'runtime'):
-        data[key] = details['metadata'][key]
-        if isinstance(data[key], list):
-            data[key] = data[key][0]
+        if key in details['metadata']:
+            data[key] = details['metadata'][key]
+            if isinstance(data[key], list):
+                data[key] = data[key][0]
+            if isinstance(data[key], basestring):
+                data[key] = data[key].strip()
+                if data[key][0] == '[' and data[key][-1] == ']':
+                    data[key] = data[key][1:-1]
     data['url'] = url
     data['image'] = 'http://archive.org/download/%s/format=thumbnail' % id
     data['ogg'] = 'http://archive.org/download/%s/format=Ogg+video' % id
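The archive.org hunk above makes the metadata loop defensive: keys missing from details['metadata'] are now skipped, and string values are stripped of whitespace and of surrounding square brackets that some items carry. A minimal standalone sketch of that normalization, using a made-up details dict; str stands in here for the Python 2 basestring used in the module:

details = {'metadata': {
    'title': ['[Example Title]'],   # list wrapping, as some items return
    'runtime': ' 12:34 ',           # stray whitespace
}}                                  # 'description' is deliberately absent

data = {}
for key in ('title', 'description', 'runtime'):
    if key in details['metadata']:              # new guard: missing keys are skipped
        data[key] = details['metadata'][key]
        if isinstance(data[key], list):         # unwrap single-element lists
            data[key] = data[key][0]
        if isinstance(data[key], str):          # basestring in the Python 2 original
            data[key] = data[key].strip()
            if data[key][0] == '[' and data[key][-1] == ']':
                data[key] = data[key][1:-1]     # drop the surrounding brackets

print(data)   # {'title': 'Example Title', 'runtime': '12:34'}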
@@ -3,12 +3,14 @@
 from __future__ import print_function
 import re
 
-from ox import find_re, strip_tags, decode_html
+import lxml.html
+
+from ox import strip_tags, decode_html
 from ox.cache import read_url
 
 
 def get_id(url):
-    return url.replace('http://www.ubu.com/', '').split('.html')[0]
+    return url.replace('http://www.ubu.com/', '').split('.html')[0].replace('/./', '/')
 
 def get_url(id):
     return 'http://www.ubu.com/%s.html' % id
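The ubu hunk above drops the now-unused find_re import, pulls in lxml.html (used further down), and normalizes ids so that a './' path segment in the source URL no longer leaks into the id. A quick sketch of the round trip; the example URL is made up, though 'film/lawder_color' is an id the module itself special-cases:

def get_id(url):
    return url.replace('http://www.ubu.com/', '').split('.html')[0].replace('/./', '/')

def get_url(id):
    return 'http://www.ubu.com/%s.html' % id

# a hypothetical link containing a './' segment
print(get_id('http://www.ubu.com/film/./lawder_color.html'))   # film/lawder_color
print(get_url('film/lawder_color'))                            # http://www.ubu.com/film/lawder_color.html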
@@ -41,6 +43,22 @@ def get_data(url):
         m['flv'] = match[0]
         m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')
 
+    match = re.compile('''src=(.*?) type="video/mp4"''').findall(data)
+    if match:
+        m['mp4'] = match[0].strip('"').strip("'")
+
+    doc = lxml.html.document_fromstring(read_url(url))
+    desc = doc.xpath("//div[contains(@id, 'ubudesc')]")
+    if len(desc):
+        txt = []
+        for part in desc[0].text_content().split('\n\n'):
+            if part == 'RESOURCES:':
+                break
+            txt.append(part)
+        if txt:
+            if len(txt) > 1:
+                txt = txt[1:]
+            m['description'] = '\n\n'.join(txt).strip()
     y = re.compile('\((\d{4})\)').findall(data)
     if y:
         m['year'] = int(y[0])
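This hunk adds two things to get_data: an mp4 source picked out of the page with a regex, and a description built by parsing the page with lxml, taking the text of the div whose id contains 'ubudesc', dropping the leading title block and everything from the 'RESOURCES:' marker on. A sketch of the description logic run over an inline HTML fragment; the fragment itself is invented for illustration:

import lxml.html

# made-up fragment standing in for read_url(url); real pages keep the text
# in a div whose id contains 'ubudesc'
html = '''<div id="ubudesc">Film Title (1970)

A short description paragraph.

RESOURCES:

Links that should not end up in the description.</div>'''

m = {}
doc = lxml.html.document_fromstring(html)
desc = doc.xpath("//div[contains(@id, 'ubudesc')]")
if len(desc):
    txt = []
    for part in desc[0].text_content().split('\n\n'):
        if part == 'RESOURCES:':    # stop before the resources list
            break
        txt.append(part)
    if txt:
        if len(txt) > 1:
            txt = txt[1:]           # the first block is the title line, not the description
        m['description'] = '\n\n'.join(txt).strip()

print(m['description'])             # A short description paragraph.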
@@ -61,6 +79,7 @@ def get_data(url):
         m['artist'] = strip_tags(decode_html(a[0])).strip()
     elif m['id'] == 'film/lawder_color':
         m['artist'] = 'Standish Lawder'
+
     if 'artist' in m:
         m['artist'] = m['artist'].replace('in UbuWeb Film', '')
         m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()
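The last hunk only adds a separating blank line before the artist cleanup, but the surrounding context shows what that cleanup does: the artist string scraped from the page often carries an 'in UbuWeb Film' or 'on UbuWeb Film' suffix, which the two replace calls remove. A tiny sketch with an invented input:

artist = 'Standish Lawder in UbuWeb Film'    # hypothetical scraped byline
artist = artist.replace('in UbuWeb Film', '')
artist = artist.replace('on UbuWeb Film', '').strip()
print(artist)   # Standish Lawder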