From b2c51e03b2eb986b0b6941ee4038ff0058680414 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Fri, 8 Jun 2007 19:19:22 +0000
Subject: [PATCH] youtube video urls
---
scrapeit/utils.py | 9 ++++++--
scrapeit/youtube.py | 51 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 58 insertions(+), 2 deletions(-)
create mode 100644 scrapeit/youtube.py
diff --git a/scrapeit/utils.py b/scrapeit/utils.py
index 5fd7a73..6bcc2ff 100644
--- a/scrapeit/utils.py
+++ b/scrapeit/utils.py
@@ -47,6 +47,11 @@ def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
data = unicode(data, charset)
return data
+def open_url(url, headers=DEFAULT_HEADERS):
+  """
+    Open given str URL, sending headers, and return the response object
+    produced by urllib2.urlopen (the caller reads and closes it).
+    Spaces in the URL are replaced with %20 before the request is built.
+  """
+  request = urllib2.Request(url.replace(' ', '%20'), None, headers)
+  return urllib2.urlopen(request)
+
def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
"""
Read str contents of given str URL.
@@ -57,12 +62,12 @@ def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
at which point the str page contents is yielded.
"""
url = url.replace(' ', '%20')
- req = urllib2.Request(url, None, headers)
- f = urllib2.urlopen(req)
+ f = open_url(url, headers)
data = f.read()
f.close()
return data
+
def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
"""
opens given str URL and returns the url after redirection.
diff --git a/scrapeit/youtube.py b/scrapeit/youtube.py
new file mode 100644
index 0000000..99125a9
--- /dev/null
+++ b/scrapeit/youtube.py
@@ -0,0 +1,51 @@
+# -*- Mode: Python; -*-
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=2:sts=2:ts=2
+
+import re
+import urllib2
+from urllib import quote
+
+from utils import read_url, open_url, get_url
+
+
+def get_video_url(id, get_redirected = False):
+  """
+    Return the get_video.php URL for the YouTube video with the given id.
+
+    The watch page of the video is fetched and searched for the player2.swf
+    embed; the value captured after `video_id=` is sent on as the `t`
+    parameter of the returned URL.
+
+    If get_redirected is true, the URL is passed through get_url and the
+    result of that call is returned instead.
+
+    Returns None when the watch page contains no matching embed.
+  """
+  watch_url = 'http://www.youtube.com/watch?v=%s' % id
+  data = read_url(watch_url)
+  # Raw string so the regex escapes reach the re module unchanged.
+  match = re.search(r'player2.swf\?.*video_id=(.*?)\&', data)
+  if match is None:
+    return None
+  url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (id, match.group(1))
+  if get_redirected:
+    url = get_url(url)
+  return url
+
+def reg_search(reg, data):
+  """
+    Return the first re.findall result of pattern reg in data, with
+    surrounding whitespace stripped, or '' if the pattern does not match.
+  """
+  matches = re.findall(reg, data)
+  if not matches:
+    return ''
+  return matches[0].strip()
+
+def search(query, video_url_base = None):
+  """
+    Search YouTube for query and return a list of dicts, one per result.
+
+    Every dict carries the keys 'id', 'link', 'title', 'description' and
+    'thumbnail'. If video_url_base is given, 'video_link' is set to
+    "<video_url_base>/<id>"; otherwise 'video_url' is set to the result of
+    get_video_url(id), which fetches the watch page of each result.
+  """
+  url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
+  data = read_url(url)
+  # NOTE(review): this pattern looks truncated -- the loop below indexes two
+  # groups (id, title) but only one is left here. Restore before relying on it.
+  regx = re.compile('''(.*?)
''')
+  id_title = regx.findall(data)
+  # Descriptions are matched against a newline-free copy of the page.
+  data_flat = data.replace('\n', ' ')
+  videos = []
+  for video in id_title:
+    v = dict()
+    v['id'] = video[0]
+    # Was "http//youtube.com/watch.v=%s": missing ':' and '?' made the link unusable.
+    v['link'] = "http://youtube.com/watch?v=%s" % v['id']
+    v['title'] = video[1].strip()
+    # NOTE(review): the two branches store under different keys
+    # ('video_link' vs 'video_url'); kept as-is for existing callers.
+    if video_url_base:
+      v['video_link'] = "%s/%s" % (video_url_base, v['id'])
+    else:
+      v['video_url'] = get_video_url(v['id'])
+    v['description'] = reg_search('''BeginvidDesc%s">(.*?)''' % v['id'], data_flat)
+    # NOTE(review): the thumbnail pattern is empty, so the % formatting below
+    # has no placeholder for the id -- presumably stripped; restore it.
+    v['thumbnail'] = reg_search('' % v['id'], data)
+    videos.append(v)
+  return videos
+