From b2c51e03b2eb986b0b6941ee4038ff0058680414 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Fri, 8 Jun 2007 19:19:22 +0000
Subject: [PATCH] youtube video urls

---
 scrapeit/utils.py   |  9 ++++++--
 scrapeit/youtube.py | 51 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 2 deletions(-)
 create mode 100644 scrapeit/youtube.py
diff --git a/scrapeit/utils.py b/scrapeit/utils.py
index 5fd7a73..6bcc2ff 100644
--- a/scrapeit/utils.py
+++ b/scrapeit/utils.py
@@ -47,6 +47,11 @@ def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
   data = unicode(data, charset)
   return data
 
+def open_url(url, headers=DEFAULT_HEADERS):
+  # Percent-encode spaces, then open the URL with the given request headers.
+  request = urllib2.Request(url.replace(' ', '%20'), None, headers)
+  return urllib2.urlopen(request)
+
 def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
   """
   Read str contents of given str URL.
@@ -57,12 +62,12 @@ def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
   at which point the str page contents is yielded.
   """
   url = url.replace(' ', '%20')
-  req = urllib2.Request(url, None, headers)
-  f = urllib2.urlopen(req)
+  f = open_url(url, headers)
   data = f.read()
   f.close()
   return data
 
+
 def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
   """
   opens given str URL and returns the url after redirection.
diff --git a/scrapeit/youtube.py b/scrapeit/youtube.py
new file mode 100644
index 0000000..99125a9
--- /dev/null
+++ b/scrapeit/youtube.py
@@ -0,0 +1,51 @@
+# -*- Mode: Python; -*-
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=2:sts=2:ts=2
+
+import re
+import urllib2
+from urllib import quote
+
+from utils import read_url, open_url, get_url
+
+
+def get_video_url(id, get_redirected = False):
+  """Return the get_video.php URL for video id, or None if the watch page has no match."""
+  page = read_url('http://www.youtube.com/watch?v=%s' % id)
+  tokens = re.findall('player2.swf\?.*video_id=(.*?)\&', page)
+  if not tokens:
+    return None
+  url = "http://youtube.com/get_video.php?video_id=%s&t=%s"%(id, tokens[0])
+  if get_redirected:
+    url = get_url(url)  # replace with the URL reached after redirection
+  return url
+
+def reg_search(reg, data):
+  """Return the first match of pattern reg in data, stripped; '' when there is none."""
+  matches = re.findall(reg, data)
+  if not matches:
+    return ''
+  first = matches[0]
+  return first.strip()
+
+def search(query, video_url_base = None):
+  """Scrape YouTube search results for query; return a list of dicts, one per video found."""
+  url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
+  data = read_url(url)
+  regx = re.compile('''<a href="/watch.v=(.*?)">(.*?)</a><br/>''')
+  data_flat = data.replace('\n', ' ')  # newlines removed so the description pattern can span line breaks
+  videos = []
+  for video_id, title in regx.findall(data):
+    v = {'id': video_id}
+    v['link'] = "http://youtube.com/watch?v=%s" % video_id  # fixed: was "http//youtube.com/watch.v=%s"
+    v['title'] = title.strip()
+    if video_url_base:
+      v['video_link'] = "%s/%s" % (video_url_base, video_id)
+    else:
+      # NOTE(review): key is 'video_url' here but 'video_link' above - confirm which one callers expect.
+      v['video_url'] = get_video_url(video_id)
+    v['description'] = reg_search('''BeginvidDesc%s">(.*?)</span>''' % video_id, data_flat)
+    v['thumbnail'] = reg_search('<img src="(.*?)" class="vimg120" alt="%s" />' % video_id, data)
+    videos.append(v)
+  return videos
+