From b2c51e03b2eb986b0b6941ee4038ff0058680414 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Fri, 8 Jun 2007 19:19:22 +0000 Subject: [PATCH] youtube video urls --- scrapeit/utils.py | 9 ++++++-- scrapeit/youtube.py | 51 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 scrapeit/youtube.py diff --git a/scrapeit/utils.py b/scrapeit/utils.py index 5fd7a73..6bcc2ff 100644 --- a/scrapeit/utils.py +++ b/scrapeit/utils.py @@ -47,6 +47,11 @@ def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True): data = unicode(data, charset) return data +def open_url(url, headers=DEFAULT_HEADERS): + url = url.replace(' ', '%20') + req = urllib2.Request(url, None, headers) + return urllib2.urlopen(req) + def read_url(url, headers=DEFAULT_HEADERS, blocking=True): """ Read str contents of given str URL. @@ -57,12 +62,12 @@ def read_url(url, headers=DEFAULT_HEADERS, blocking=True): at which point the str page contents is yielded. """ url = url.replace(' ', '%20') - req = urllib2.Request(url, None, headers) - f = urllib2.urlopen(req) + f = open_url(url, headers) data = f.read() f.close() return data + def get_url(url, headers=DEFAULT_HEADERS, blocking=True): """ opens given str URL and returns the url after redirection. 
diff --git a/scrapeit/youtube.py b/scrapeit/youtube.py
new file mode 100644
index 0000000..99125a9
--- /dev/null
+++ b/scrapeit/youtube.py
@@ -0,0 +1,72 @@
+# -*- Mode: Python; -*-
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=2:sts=2:ts=2
+
+import re
+import urllib2
+from urllib import quote
+
+from utils import read_url, open_url, get_url
+
+
+def get_video_url(id, get_redirected=False):
+  """
+  Return the get_video.php URL for the YouTube video id, or None.
+
+  The watch page is fetched and searched for the player2.swf parameters;
+  None is returned when they are not found. If get_redirected is true the
+  URL is passed through get_url() and the URL after redirection is returned.
+  """
+  url = 'http://www.youtube.com/watch?v=%s' % id
+  data = read_url(url)
+  video = re.compile(r'player2.swf\?.*video_id=(.*?)\&').findall(data)
+  if not video:
+    return None
+  # the value captured from the player parameters is sent as the t argument
+  url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (id, video[0])
+  if get_redirected:
+    url = get_url(url)
+  return url
+
+def reg_search(reg, data):
+  """
+  Return the first match of the regular expression reg in data with
+  surrounding whitespace stripped, or '' if nothing matches.
+  """
+  result = re.compile(reg).findall(data)
+  if result:
+    return result[0].strip()
+  return ''
+
+def search(query, video_url_base=None):
+  """
+  Search YouTube for query and return a list with one dict per result.
+
+  Every dict has the keys 'id', 'link', 'title', 'description' and
+  'thumbnail'. If video_url_base is given, 'video_link' is set to
+  video_url_base + '/' + id; otherwise 'video_url' is set to the result of
+  get_video_url(), which fetches one more page per result.
+  """
+  url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
+  data = read_url(url)
+  # NOTE(review): this pattern has one group, but the loop below reads
+  # video[0] and video[1] as id and title; it looks like part of the
+  # pattern is missing - confirm against the result page markup.
+  regx = re.compile('''(.*?)
+''')
+  id_title = regx.findall(data)
+  data_flat = data.replace('\n', ' ')
+  videos = []
+  for video in id_title:
+    v = dict()
+    v['id'] = video[0]
+    v['link'] = "http://youtube.com/watch?v=%s" % v['id']
+    v['title'] = video[1].strip()
+    if video_url_base:
+      v['video_link'] = "%s/%s" % (video_url_base, v['id'])
+    else:
+      v['video_url'] = get_video_url(v['id'])
+    v['description'] = reg_search('''BeginvidDesc%s">(.*?)''' % v['id'], data_flat)
+    v['thumbnail'] = reg_search('%s' % v['id'], data)
+    videos.append(v)
+  return videos