youtube video urls

2007-06-08 19:19:22 +00:00 · 2007-06-08 19:19:22 +00:00 · b2c51e03b2
commit b2c51e03b2
parent 3942d76b6e
2 changed files with 58 additions and 2 deletions
--- a/scrapeit/utils.py
+++ b/scrapeit/utils.py
@ -47,6 +47,11 @@ def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
  data = unicode(data, charset)
  return data

+def open_url(url, headers=DEFAULT_HEADERS):
+  """
+  Open given str URL and return the urllib2 response object.
+
+  Spaces in the URL are replaced with %20 before the request is built,
+  and headers are passed on to urllib2.Request. The response is returned
+  as is: it is neither read nor closed here.
+  """
+  request = urllib2.Request(url.replace(' ', '%20'), None, headers)
+  response = urllib2.urlopen(request)
+  return response
+
 def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read str contents of given str URL.
@ -57,12 +62,12 @@ def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
  at which point the str page contents is yielded.
  """
  url = url.replace(' ', '%20')
-  req = urllib2.Request(url, None, headers)
-  f = urllib2.urlopen(req)
+  f = open_url(url, headers)
  data = f.read()
  f.close()
  return data

+
 def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  opens given str URL and returns the url after redirection.
--- a/scrapeit/youtube.py
+++ b/scrapeit/youtube.py
@ -0,0 +1,51 @@
+# -*- Mode: Python; -*-
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=2:sts=2:ts=2
+
+import re
+import urllib2
+from urllib import quote
+
+from utils import read_url, open_url, get_url
+
+
+def get_video_url(id, get_redirected = False):
+  """
+  Build the get_video.php URL for the given YouTube video id.
+
+  Reads the watch page for id and searches it for the player2.swf
+  parameters. Returns None when that pattern is not found in the page.
+  If get_redirected is true, the built URL is passed through get_url and
+  its result is returned instead.
+  """
+  watch_url = 'http://www.youtube.com/watch?v=%s' % id
+  page = read_url(watch_url)
+  # Raw string so the escapes reach the re module untouched; the dot in
+  # "player2.swf" is escaped so it only matches a literal dot.
+  match = re.search(r'player2\.swf\?.*video_id=(.*?)&', page)
+  if match is None:
+    return None
+  # NOTE(review): the captured group is the video_id value from the page,
+  # yet it is sent as the "t" parameter below - confirm this is intended.
+  url = "http://youtube.com/get_video.php?video_id=%s&t=%s" % (id, match.group(1))
+  if get_redirected:
+    url = get_url(url)
+  return url
+
+def reg_search(reg, data):
+  """
+  Return the first result of re.findall(reg, data) with surrounding
+  whitespace stripped, or '' when reg does not match anywhere in data.
+  """
+  matches = re.findall(reg, data)
+  if not matches:
+    return ''
+  return matches[0].strip()
+
+def search(query, video_url_base = None):
+  """
+  Search YouTube for query and return a list of dicts, one per result.
+
+  Every dict carries the keys id, link, title, description and thumbnail.
+  If video_url_base is given, the key video_link is set to
+  "<video_url_base>/<id>"; otherwise the key video_url is set to the
+  result of get_video_url(id), which reads one more page per result.
+  """
+  url = "http://youtube.com/results?search_query=%s&search=Search" % quote(query)
+  data = read_url(url)
+  # Newline-free copy used for the description lookup; presumably so that
+  # (.*?) can run across what were separate lines.
+  data_flat = data.replace('\n', ' ')
+  # The unescaped dot in "/watch.v=" matches the "?" of the href.
+  regx = re.compile('''<a href="/watch.v=(.*?)">(.*?)</a><br/>''')
+  videos = []
+  for video_id, title in regx.findall(data):
+    v = dict()
+    v['id'] = video_id
+    # Was "http//youtube.com/watch.v=%s": missing ":" and "." instead of "?".
+    v['link'] = "http://youtube.com/watch?v=%s" % video_id
+    v['title'] = title.strip()
+    # NOTE(review): the two branches use different keys (video_link vs
+    # video_url); left unchanged so existing callers keep working.
+    if video_url_base:
+      v['video_link'] = "%s/%s" % (video_url_base, video_id)
+    else:
+      v['video_url'] = get_video_url(video_id)
+    # Escape the id so it is matched literally inside the patterns below.
+    id_pattern = re.escape(video_id)
+    v['description'] = reg_search('''BeginvidDesc%s">(.*?)</span>''' % id_pattern, data_flat)
+    v['thumbnail'] = reg_search('<img src="(.*?)" class="vimg120" alt="%s" />' % id_pattern, data)
+    videos.append(v)
+  return videos
+