youtube video urls

This commit is contained in:
j 2007-06-08 19:19:22 +00:00
parent 3942d76b6e
commit b2c51e03b2
2 changed files with 58 additions and 2 deletions

View File

@ -47,6 +47,11 @@ def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
data = unicode(data, charset)
return data
def open_url(url, headers=DEFAULT_HEADERS):
    """
    Open given str URL and return the response object from urllib2.urlopen.

    Spaces in the URL are replaced with %20 before the request is built;
    `headers` are sent along with the request. The caller is responsible
    for reading from and closing the returned object.
    """
    request = urllib2.Request(url.replace(' ', '%20'), None, headers)
    return urllib2.urlopen(request)
def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
"""
Read str contents of given str URL.
@ -57,12 +62,12 @@ def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
at which point the str page contents is yielded.
"""
url = url.replace(' ', '%20')
req = urllib2.Request(url, None, headers)
f = urllib2.urlopen(req)
f = open_url(url, headers)
data = f.read()
f.close()
return data
def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
"""
opens given str URL and returns the url after redirection.

51
scrapeit/youtube.py Normal file
View File

@ -0,0 +1,51 @@
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
import re
import urllib2
from urllib import quote

from utils import get_url, open_url, read_url
def get_video_url(id, get_redirected=False):
  """
  Build the get_video.php URL for a YouTube video.

  Fetches the watch page for `id` and searches it for the value embedded
  after "video_id=" in the player2.swf URL. That value is passed on as the
  `t` parameter of the resulting get_video.php URL.

  id             -- video id as used in watch?v=<id>
  get_redirected -- when true, the URL is passed through get_url() and the
                    result of that call is returned instead

  Returns the URL, or None when no player2.swf match is found on the page.
  """
  watch_url = 'http://www.youtube.com/watch?v=%s' % id
  page = read_url(watch_url)
  # Raw string: identical pattern, but no longer depends on Python leaving
  # the unknown string escapes \? and \& untouched.
  tokens = re.findall(r'player2.swf\?.*video_id=(.*?)\&', page)
  if not tokens:
    return None
  video_url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (id, tokens[0])
  if get_redirected:
    video_url = get_url(video_url)
  return video_url
def reg_search(reg, data):
  """
  Return the first findall() hit of pattern `reg` in `data`, stripped of
  surrounding whitespace, or '' when the pattern does not match.
  """
  matches = re.findall(reg, data)
  return matches[0].strip() if matches else ''
def search(query, video_url_base=None):
  """
  Search YouTube for `query` and scrape the result page.

  query          -- search terms; URL-quoted before being sent
  video_url_base -- optional base URL. When given, each result gets a
                    'video_link' of "<video_url_base>/<id>". Otherwise
                    get_video_url() is called for every result and its
                    return value is stored under 'video_url'.

  Returns a list of dicts with the keys 'id', 'link', 'title',
  'description', 'thumbnail' and one of 'video_link' / 'video_url'.
  'description' and 'thumbnail' are '' when nothing matches on the page.
  """
  url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
  data = read_url(url)
  # The '.' in "watch.v" is a regex wildcard standing in for the literal '?'.
  id_title = re.findall(r'''<a href="/watch.v=(.*?)">(.*?)</a><br/>''', data)
  # Newlines are replaced by spaces so that '.' in the description pattern
  # can match across what were separate lines.
  data_flat = data.replace('\n', ' ')
  videos = []
  for video_id, title in id_title:
    # The id comes straight from the page; escape it before using it
    # inside another pattern.
    id_pattern = re.escape(video_id)
    v = dict()
    v['id'] = video_id
    # Was "http//youtube.com/watch.v=%s": missing ':' and a regex '.' copied
    # into a literal URL instead of '?'.
    v['link'] = "http://youtube.com/watch?v=%s" % video_id
    v['title'] = title.strip()
    # NOTE(review): the two branches use different keys ('video_link' vs
    # 'video_url'); kept as-is because callers may rely on either.
    if video_url_base:
      v['video_link'] = "%s/%s" % (video_url_base, video_id)
    else:
      v['video_url'] = get_video_url(video_id)
    v['description'] = reg_search('''BeginvidDesc%s">(.*?)</span>''' % id_pattern, data_flat)
    v['thumbnail'] = reg_search('<img src="(.*?)" class="vimg120" alt="%s" />' % id_pattern, data)
    videos.append(v)
  return videos