python-ox/ox/web/youtube.py


# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib.parse import quote, unquote_plus
# urllib.request is what download_webm() actually needs for build_opener
import urllib.request
from http import cookiejar as cookielib
import re
from xml.dom.minidom import parseString
import json
import ox
from ox.cache import read_url, cache_timeout


def get_id(url):
    match = re.compile('v=(.+?)($|&)').findall(url)
    if match:
        return match[0][0]

def get_url(id):
    return 'http://www.youtube.com/watch?v=%s' % id
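
# Illustrative round-trip with a made-up id (hypothetical, for doc purposes):
#   get_id('http://www.youtube.com/watch?v=abc123&feature=related') -> 'abc123'
#   get_url('abc123') -> 'http://www.youtube.com/watch?v=abc123'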

def video_url(youtubeId, format='mp4', timeout=cache_timeout):
    """
    youtubeId - id of video
    format - video format, options: 4k, 1080p, 720p, mp4, high, webm
    """
    fmt = None
    if format == '4k':
        fmt = 38
    elif format == '1080p':
        fmt = 37
    elif format == '720p':
        fmt = 22
    elif format == 'mp4':
        fmt = 18
    elif format == 'high':
        fmt = 35
    elif format == 'webm':
        streams = videos(youtubeId, 'webm')
        return streams[max(streams.keys())]['url']

    streams = videos(youtubeId)
    if str(fmt) in streams:
        return streams[str(fmt)]['url']
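
# Usage sketch: the itag table above (38/37/22/18/35) follows YouTube's legacy
# format ids; whether these streams still resolve depends on the endpoint.
#   url = video_url('abc123', format='720p')  # 'abc123' is a hypothetical id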

def get_video_info(id):
    eurl = get_url(id)
    data = read_url(eurl).decode('utf-8')
    t = re.compile(r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
    if t:
        t = t[0]
    else:
        raise IOError
    url = "http://www.youtube.com/get_video_info?&video_id=%s&el=$el&ps=default&eurl=%s&hl=en_US&t=%s" % (id, quote(eurl), quote(t))
    data = read_url(url).decode('utf-8')
    info = {}
    for part in data.split('&'):
        key, value = part.split('=')
        info[key] = unquote_plus(value).replace('+', ' ')
    return info
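
# The get_video_info response is a single urlencoded query string, which the
# loop above flattens into a dict. Shape sketch (keys are not guaranteed):
#   info = get_video_info('abc123')  # hypothetical id
#   stream_map = info.get('url_encoded_fmt_stream_map')  # comma-separated streams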

def find(query, max_results=10, offset=1, orderBy='relevance'):
    import feedparser
    query = quote(query)
    url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
    data = read_url(url)
    fd = feedparser.parse(data)
    videos = []
    for item in fd.entries:
        id = item['id'].split('/')[-1]
        title = item['title']
        description = item['description']
        videos.append((title, id, description))
        if len(videos) >= max_results:
            return videos
    return videos
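
# Returns (title, id, description) tuples. Note the gdata v2 feed this queries
# was retired by YouTube in 2015, so treat this as a historical sketch:
#   for title, id, description in find('pixelache', max_results=5):
#       print(id, title)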

def info(id, timeout=cache_timeout):
    info = {}
    if id.startswith('http'):
        id = get_id(id)
    if not id:
        return info
    url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    info['id'] = id
    info['url'] = get_url(id)
    info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
    info['description'] = xml.getElementsByTagName('media:description')[0].firstChild.data
    info['date'] = xml.getElementsByTagName('published')[0].firstChild.data.split('T')[0]
    info['author'] = "http://www.youtube.com/user/%s" % xml.getElementsByTagName('name')[0].firstChild.data
    info['categories'] = []
    for cat in xml.getElementsByTagName('media:category'):
        info['categories'].append(cat.firstChild.data)
    k = xml.getElementsByTagName('media:keywords')[0].firstChild
    if k:
        info['keywords'] = k.data.split(', ')
    data = read_url(info['url'], timeout=timeout).decode('utf-8')
    match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
    if match:
        info['license'] = match[0].strip()
        info['license'] = re.sub('<.+?>', '', info['license']).strip()

    subs = subtitles(id, timeout)
    if subs:
        info['subtitles'] = subs
    return info
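
# Sketch of the dict info() builds (fields depend on what the feed returned):
#   {'id': ..., 'url': ..., 'title': ..., 'description': ..., 'date': ...,
#    'author': ..., 'categories': [...], 'keywords': [...],
#    'license': ..., 'subtitles': {...}}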

def subtitles(id, timeout=cache_timeout):
    url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
    subtitles = {}
    if languages:
        for language in languages:
            url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind" % (id, language)
            data = read_url(url, timeout=timeout)
            xml = parseString(data)
            subs = []
            for t in xml.getElementsByTagName('text'):
                start = float(t.getAttribute('start'))
                duration = t.getAttribute('dur')
                if not duration:
                    duration = '2'
                end = start + float(duration)
                if t.firstChild:
                    text = t.firstChild.data
                    subs.append({
                        'in': start,
                        'out': end,
                        'value': ox.decode_html(text),
                    })
            subtitles[language] = subs
    return subtitles
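
# Each language code maps to a list of cues in ox's subtitle format:
#   {'en': [{'in': 0.0, 'out': 2.0, 'value': 'first caption'}, ...], ...}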

def videos(id, format=''):
    stream_type = {
        'flv': 'video/x-flv',
        'webm': 'video/webm',
        'mp4': 'video/mp4'
    }.get(format)
    info = get_video_info(id)
    stream_map = info['url_encoded_fmt_stream_map']
    streams = {}
    for x in stream_map.split(','):
        stream = {}
        #for s in x.split('\\u0026'):
        for s in x.split('&'):
            key, value = s.split('=')
            value = unquote_plus(value)
            stream[key] = value
        if 'url' in stream and 'sig' in stream:
            stream['url'] = '%s&signature=%s' % (stream['url'], stream['sig'])
        if not stream_type or stream['type'].startswith(stream_type):
            streams[stream['itag']] = stream
    return streams
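
# Picking the highest itag, as video_url() does for format='webm' (note that
# itags are compared as strings here, matching the original behavior):
#   streams = videos('abc123', 'webm')  # hypothetical id
#   best_url = streams[max(streams.keys())]['url']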


def playlist(url):
    data = read_url(url).decode('utf-8')
    items = []
    for i in list(set(re.compile(r'<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
        items.append({
            'title': i[1],
            'url': 'http://www.youtube.com' + i[0].split('&amp;')[0]
        })
    return items
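
# Result shape, one dict per unique watch link scraped from the page markup
# (fragile by construction, breaks whenever the markup changes):
#   [{'title': ..., 'url': 'http://www.youtube.com/watch?v=...'}, ...]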

def download_webm(id, filename):
    stream_type = 'video/webm'
    url = "http://www.youtube.com/watch?v=%s" % id
    cj = cookielib.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [
        ('User-Agent',
         'Mozilla/5.0 (X11; Linux i686; rv:2.0) Gecko/20100101 Firefox/4.0'),
        ('Accept-Language', 'en-us, en;q=0.50')
    ]
    u = opener.open(url)
    data = u.read().decode('utf-8')
    u.close()
    match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
    streams = {}
    for x in match[0].split(','):
        stream = {}
        # the stream map is embedded in page javascript, so '&' appears as '\u0026'
        for s in x.split('\\u0026'):
            key, value = s.split('=')
            value = unquote_plus(value)
            stream[key] = value
        if stream['type'].startswith(stream_type):
            streams[stream['itag']] = stream
    if streams:
        s = max(streams.keys())
        url = streams[s]['url']
        if 'sig' in streams[s]:
            url += '&signature=' + streams[s]['sig']
    else:
        return None
    # download video and save to file
    u = opener.open(url)
    f = open(filename, 'wb')
    data = True
    while data:
        data = u.read(4096)
        f.write(data)
    f.close()
    u.close()
    return filename
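
# Usage sketch; relies on the legacy "url_encoded_fmt_stream_map" still being
# embedded in the watch page, which current YouTube no longer guarantees:
#   download_webm('abc123', '/tmp/video.webm')  # hypothetical id and path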

def get_config(id):
    if id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    data = read_url(url).decode('utf-8')
    match = re.compile('ytplayer.config = (.*?);<').findall(data)
    if match:
        config = json.loads(match[0])
        return config
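
# The ytplayer.config blob, when present, is a JSON object carrying the player
# arguments, e.g.:
#   config = get_config('abc123')  # hypothetical id
#   if config:
#       args = config.get('args', {})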