youtube video urls
This commit is contained in:
parent
3942d76b6e
commit
b2c51e03b2
2 changed files with 58 additions and 2 deletions
|
@ -47,6 +47,11 @@ def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
|
|||
data = unicode(data, charset)
|
||||
return data
|
||||
|
||||
def open_url(url, headers=DEFAULT_HEADERS):
  """
  Open given str URL and return the response object from urllib2.urlopen.

  Spaces in the URL are replaced with %20 before the request is built;
  the given headers are sent with the request.
  """
  request = urllib2.Request(url.replace(' ', '%20'), None, headers)
  return urllib2.urlopen(request)
|
||||
|
||||
def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
|
||||
"""
|
||||
Read str contents of given str URL.
|
||||
|
@ -57,12 +62,12 @@ def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
|
|||
at which point the str page contents is yielded.
|
||||
"""
|
||||
url = url.replace(' ', '%20')
|
||||
req = urllib2.Request(url, None, headers)
|
||||
f = urllib2.urlopen(req)
|
||||
f = open_url(url, headers)
|
||||
data = f.read()
|
||||
f.close()
|
||||
return data
|
||||
|
||||
|
||||
def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
|
||||
"""
|
||||
opens given str URL and returns the url after redirection.
|
||||
|
|
51
scrapeit/youtube.py
Normal file
51
scrapeit/youtube.py
Normal file
|
@ -0,0 +1,51 @@
|
|||
# -*- Mode: Python; -*-
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=2:sts=2:ts=2
|
||||
|
||||
import re
import urllib2
from urllib import quote

from utils import read_url, open_url, get_url
|
||||
|
||||
|
||||
def get_video_url(id, get_redirected=False):
  """
  Build the get_video.php URL for the YouTube video with the given id.

  The watch page for the id is fetched with read_url() and searched for
  the player2.swf parameter string.

  Returns the get_video.php URL as a str, or None when the player
  parameters are not found in the page. If get_redirected is true, the
  URL is passed through get_url() and that result is returned instead.
  """
  # `id` shadows the builtin, but the name is part of the public signature.
  watch_url = 'http://www.youtube.com/watch?v=%s' % id
  data = read_url(watch_url)
  # Raw string so the regex escapes are not interpreted as str escapes;
  # the dot in "player2.swf" is escaped so it only matches a literal dot.
  video = re.compile(r'player2\.swf\?.*video_id=(.*?)&').findall(data)
  if not video:
    return None
  # NOTE(review): the value captured after "video_id=" is sent as the `t`
  # parameter here - confirm this is the intended token.
  url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (id, video[0])
  if get_redirected:
    url = get_url(url)
  return url
|
||||
|
||||
def reg_search(reg, data):
  """
  Return the first match of regular expression reg in data, stripped of
  surrounding whitespace, or an empty str if there is no match.
  """
  matches = re.findall(reg, data)
  if not matches:
    return ''
  return matches[0].strip()
|
||||
|
||||
def search(query, video_url_base=None):
  """
  Search YouTube for query and return a list of dicts, one per result.

  Each dict has the keys 'id', 'link', 'title', 'description' and
  'thumbnail'. If video_url_base is given, 'video_link' is set to
  "<video_url_base>/<id>"; otherwise 'video_url' is set to the result of
  get_video_url() for that id, which fetches one more page per result.
  """
  url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
  data = read_url(url)
  regx = re.compile('''<a href="/watch.v=(.*?)">(.*?)</a><br/>''')
  # "." does not match newlines, so the description pattern is run against
  # a copy of the page with newlines replaced by spaces.
  data_flat = data.replace('\n', ' ')
  videos = []
  for video_id, title in regx.findall(data):
    v = dict()
    v['id'] = video_id
    # Was "http//youtube.com/watch.v=%s", which is not a valid URL.
    v['link'] = "http://youtube.com/watch?v=%s" % video_id
    v['title'] = title.strip()
    # NOTE(review): the two branches store under different keys
    # ('video_link' vs 'video_url'); kept as-is for existing callers -
    # confirm which key is intended.
    if video_url_base:
      v['video_link'] = "%s/%s" % (video_url_base, video_id)
    else:
      v['video_url'] = get_video_url(video_id)
    # The id is inserted into regular expressions below, so escape it to
    # keep any regex metacharacters in it literal.
    id_pattern = re.escape(video_id)
    v['description'] = reg_search('''BeginvidDesc%s">(.*?)</span>''' % id_pattern, data_flat)
    v['thumbnail'] = reg_search('<img src="(.*?)" class="vimg120" alt="%s" />' % id_pattern, data)
    videos.append(v)
  return videos
|
||||
|
Loading…
Reference in a new issue