update ox.web.youtube

2014-02-19 14:09:54 +05:30 · 2014-02-19 14:09:54 +05:30 · 075e735cd1
commit 075e735cd1
parent 1c871f4d31
1 changed files with 42 additions and 16 deletions
--- a/ox/web/youtube.py
+++ b/ox/web/youtube.py
@ -5,19 +5,26 @@ import urllib2
 import cookielib
 import re
 from xml.dom.minidom import parseString
+import json

 import feedparser
 import ox
 from ox.cache import read_url, cache_timeout


+def get_id(url):
+    """
+        url - youtube watch url
+        returns the value of the v= parameter, None if the url has none
+    """
+    # stop at '&' (next parameter) and at '#' (fragment, e.g. #t=30s),
+    # so that only the id itself is returned and never trailing url parts
+    match = re.search(r'v=([^&#\n]+)', url)
+    if match:
+        return match.group(1)
+    return None
+
+def get_url(id):
+    """
+        id - youtube video id
+        returns the url of the watch page for that id
+    """
+    watch_url = 'http://www.youtube.com/watch?v=%s'
+    return watch_url % id
+
 def video_url(youtubeId, format='mp4', timeout=cache_timeout):
    """
        youtubeId - if of video
        format - video format, options: webm, 1080p, 720p, mp4, high
    """
-    def get_url(stream):
-        return '%s&signature=%s' % (stream['url'], stream['sig'])
    fmt = None
    if format == '4k':
        fmt=38
@ -31,11 +38,11 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):
        fmt=35
    elif format == 'webm':
        streams = videos(youtubeId, 'webm')
-        return get_url(streams[max(streams.keys())])
+        return streams[max(streams.keys())]['stream_url']

    streams = videos(youtubeId)
    if str(fmt) in streams:
-        return get_url(streams[str(fmt)])
+        return streams[str(fmt)]['stream_url']

 def find(query, max_results=10, offset=1, orderBy='relevance'):
    query = quote(query)
@ -54,10 +61,15 @@ def find(query, max_results=10, offset=1, orderBy='relevance'):

 def info(id):
    info = {}
+    if id.startswith('http'):
+        id = get_id(id)
+        if not id:
+            return info
    url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
    data = read_url(url)
    xml = parseString(data)
-    info['url'] = 'http://www.youtube.com/watch?v=%s' % id
+    info['id'] = id
+    info['url'] = get_url(id)
    info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
    info['description'] = xml.getElementsByTagName('media:description')[0].firstChild.data
    info['date'] = xml.getElementsByTagName('published')[0].firstChild.data.split('T')[0]
@ -70,14 +82,13 @@ def info(id):
    k = xml.getElementsByTagName('media:keywords')[0].firstChild
    if k:
        info['keywords'] = k.data.split(', ')
-    url = "http://www.youtube.com/watch?v=%s" % id
-    data = read_url(url)
+    data = read_url(info['url'])
    match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
    if match:
        info['license'] = match[0].strip()
        info['license'] = re.sub('<.+?>', '', info['license']).strip()

-    url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1"%id
+    url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
    data = read_url(url)
    xml = parseString(data)
    languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
@ -113,14 +124,17 @@ def videos(id, format=''):
    data = read_url(url)
    match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
    streams = {}
-    for x in match[0].split(','):
-        stream = {}
-        for s in x.split('\\u0026'):
-            key, value = s.split('=')
-            value = unquote_plus(value)
-            stream[key] = value
-        if not stream_type or stream['type'].startswith(stream_type):
-            streams[stream['itag']] = stream
+    if match:
+        for x in match[0].split(','):
+            stream = {}
+            for s in x.split('\\u0026'):
+                key, value = s.split('=')
+                value = unquote_plus(value)
+                stream[key] = value
+            if 'url' in stream and 'sig' in stream:
+                stream['stream_url'] = '%s&signature=%s' % (stream['url'], stream['sig'])
+            if not stream_type or stream['type'].startswith(stream_type):
+                streams[stream['itag']] = stream
    return streams

 def playlist(url):
@ -172,3 +186,15 @@ def download_webm(id, filename):
    f.close()
    u.close()
    return filename
+
+def get_config(id):
+    """
+        id - youtube video id, or a url starting with http (fetched as is)
+        returns the decoded ytplayer.config object found in the page,
+        or an empty dict if the page does not contain one
+    """
+    if id.startswith('http'):
+        url = id
+    else:
+        url = get_url(id)
+    data = read_url(url)
+    # default result, so a page without player config does not end in an
+    # UnboundLocalError on return
+    config = {}
+    match = re.compile('ytplayer.config = (.*?);<').findall(data)
+    if match:
+        # match[0] is a string: json.loads parses strings, while json.load
+        # expects a file-like object with a read() method
+        config = json.loads(match[0])
+    return config
+