python-ox/ox/web/youtube.py


# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib.parse import quote, unquote_plus
# urllib.request is what download_webm() actually needs for build_opener
import urllib.request
from http import cookiejar as cookielib
import re
from xml.dom.minidom import parseString
import json
import ox
from ox.cache import read_url, cache_timeout


def get_id(url):
    match = re.compile('v=(.+?)($|&)').findall(url)
    if match:
        return match[0][0]

def get_url(id):
    return 'http://www.youtube.com/watch?v=%s' % id
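
# Illustrative round-trip with a made-up id (hypothetical, for doc purposes):
#   get_id('http://www.youtube.com/watch?v=abc123&feature=related') -> 'abc123'
#   get_url('abc123') -> 'http://www.youtube.com/watch?v=abc123'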

def video_url(youtubeId, format='mp4', timeout=cache_timeout):
    """
    youtubeId - id of video
    format - video format, options: 4k, 1080p, 720p, mp4, high, webm
    """
    fmt = None
    if format == '4k':
        fmt = 38
    elif format == '1080p':
        fmt = 37
    elif format == '720p':
        fmt = 22
    elif format == 'mp4':
        fmt = 18
    elif format == 'high':
        fmt = 35
    elif format == 'webm':
        streams = videos(youtubeId, 'webm')
        return streams[max(streams.keys())]['url']

    streams = videos(youtubeId)
    if str(fmt) in streams:
        return streams[str(fmt)]['url']
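
# Usage sketch: the itag table above (38/37/22/18/35) follows YouTube's legacy
# format ids; whether these streams still resolve depends on the endpoint.
#   url = video_url('abc123', format='720p')  # 'abc123' is a hypothetical id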

def get_video_info(id):
    eurl = get_url(id)
    data = read_url(eurl).decode('utf-8')
    t = re.compile(r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
    if t:
        t = t[0]
    else:
        raise IOError
    url = "http://www.youtube.com/get_video_info?&video_id=%s&el=$el&ps=default&eurl=%s&hl=en_US&t=%s" % (id, quote(eurl), quote(t))
    data = read_url(url).decode('utf-8')
    info = {}
    for part in data.split('&'):
        key, value = part.split('=')
        info[key] = unquote_plus(value).replace('+', ' ')
    return info
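
# The get_video_info response is a single urlencoded query string, which the
# loop above flattens into a dict. Shape sketch (keys are not guaranteed):
#   info = get_video_info('abc123')  # hypothetical id
#   stream_map = info.get('url_encoded_fmt_stream_map')  # comma-separated streams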

def find(query, max_results=10, offset=1, orderBy='relevance'):
    import feedparser
    query = quote(query)
    url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
    data = read_url(url)
    fd = feedparser.parse(data)
    videos = []
    for item in fd.entries:
        id = item['id'].split('/')[-1]
        title = item['title']
        description = item['description']
        videos.append((title, id, description))
        if len(videos) >= max_results:
            return videos
    return videos
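
# Returns (title, id, description) tuples. Note the gdata v2 feed this queries
# was retired by YouTube in 2015, so treat this as a historical sketch:
#   for title, id, description in find('pixelache', max_results=5):
#       print(id, title)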

def info(id, timeout=cache_timeout):
    info = {}
    if id.startswith('http'):
        id = get_id(id)
    if not id:
        return info
    url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    info['id'] = id
    info['url'] = get_url(id)
    info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
    info['description'] = xml.getElementsByTagName('media:description')[0].firstChild.data
    info['date'] = xml.getElementsByTagName('published')[0].firstChild.data.split('T')[0]
    info['author'] = "http://www.youtube.com/user/%s" % xml.getElementsByTagName('name')[0].firstChild.data
    info['categories'] = []
    for cat in xml.getElementsByTagName('media:category'):
        info['categories'].append(cat.firstChild.data)
    k = xml.getElementsByTagName('media:keywords')[0].firstChild
    if k:
        info['keywords'] = k.data.split(', ')
    data = read_url(info['url'], timeout=timeout).decode('utf-8')
    match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
    if match:
        info['license'] = match[0].strip()
        info['license'] = re.sub('<.+?>', '', info['license']).strip()

    subs = subtitles(id, timeout)
    if subs:
        info['subtitles'] = subs
    return info
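
# Sketch of the dict info() builds (fields depend on what the feed returned):
#   {'id': ..., 'url': ..., 'title': ..., 'description': ..., 'date': ...,
#    'author': ..., 'categories': [...], 'keywords': [...],
#    'license': ..., 'subtitles': {...}}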

def subtitles(id, timeout=cache_timeout):
    url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
    subtitles = {}
    if languages:
        for language in languages:
            url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind" % (id, language)
            data = read_url(url, timeout=timeout)
            xml = parseString(data)
            subs = []
            for t in xml.getElementsByTagName('text'):
                start = float(t.getAttribute('start'))
                duration = t.getAttribute('dur')
                if not duration:
                    duration = '2'
                end = start + float(duration)
                if t.firstChild:
                    text = t.firstChild.data
                    subs.append({
                        'in': start,
                        'out': end,
                        'value': ox.decode_html(text),
                    })
            subtitles[language] = subs
    return subtitles
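
# Each language code maps to a list of cues in ox's subtitle format:
#   {'en': [{'in': 0.0, 'out': 2.0, 'value': 'first caption'}, ...], ...}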

def videos(id, format=''):
    stream_type = {
        'flv': 'video/x-flv',
        'webm': 'video/webm',
        'mp4': 'video/mp4'
    }.get(format)
    info = get_video_info(id)
    stream_map = info['url_encoded_fmt_stream_map']
    streams = {}
    for x in stream_map.split(','):
        stream = {}
        #for s in x.split('\\u0026'):
        for s in x.split('&'):
            key, value = s.split('=')
            value = unquote_plus(value)
            stream[key] = value
        if 'url' in stream and 'sig' in stream:
            stream['url'] = '%s&signature=%s' % (stream['url'], stream['sig'])
        if not stream_type or stream['type'].startswith(stream_type):
            streams[stream['itag']] = stream
    return streams
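
# Picking the highest itag, as video_url() does for format='webm' (note that
# itags are compared as strings here, matching the original behavior):
#   streams = videos('abc123', 'webm')  # hypothetical id
#   best_url = streams[max(streams.keys())]['url']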


def playlist(url):
    data = read_url(url).decode('utf-8')
    items = []
    for i in list(set(re.compile(r'<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
        items.append({
            'title': i[1],
            'url': 'http://www.youtube.com' + i[0].split('&amp;')[0]
        })
    return items
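
# Result shape, one dict per unique watch link scraped from the page markup
# (fragile by construction, breaks whenever the markup changes):
#   [{'title': ..., 'url': 'http://www.youtube.com/watch?v=...'}, ...]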

def download_webm(id, filename):
    stream_type = 'video/webm'
    url = "http://www.youtube.com/watch?v=%s" % id
    cj = cookielib.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [
        ('User-Agent',
         'Mozilla/5.0 (X11; Linux i686; rv:2.0) Gecko/20100101 Firefox/4.0'),
        ('Accept-Language', 'en-us, en;q=0.50')
    ]
    u = opener.open(url)
    data = u.read().decode('utf-8')
    u.close()
    match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
    streams = {}
    for x in match[0].split(','):
        stream = {}
        # the stream map is embedded in page javascript, so '&' appears as '\u0026'
        for s in x.split('\\u0026'):
            key, value = s.split('=')
            value = unquote_plus(value)
            stream[key] = value
        if stream['type'].startswith(stream_type):
            streams[stream['itag']] = stream
    if streams:
        s = max(streams.keys())
        url = streams[s]['url']
        if 'sig' in streams[s]:
            url += '&signature=' + streams[s]['sig']
    else:
        return None
    # download video and save to file
    u = opener.open(url)
    f = open(filename, 'wb')
    data = True
    while data:
        data = u.read(4096)
        f.write(data)
    f.close()
    u.close()
    return filename
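
# Usage sketch; relies on the legacy "url_encoded_fmt_stream_map" still being
# embedded in the watch page, which current YouTube no longer guarantees:
#   download_webm('abc123', '/tmp/video.webm')  # hypothetical id and path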

def get_config(id):
    if id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    data = read_url(url).decode('utf-8')
    match = re.compile('ytplayer.config = (.*?);<').findall(data)
    if match:
        config = json.loads(match[0])
        return config
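
# The ytplayer.config blob, when present, is a JSON object carrying the player
# arguments, e.g.:
#   config = get_config('abc123')  # hypothetical id
#   if config:
#       args = config.get('args', {})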