[youtube] fix 403 error and throttling (close #2950)

This commit is contained in:
Mort Yao 2024-06-23 10:17:02 +02:00
parent 29f513821d
commit 567d1059fc
No known key found for this signature in database
GPG Key ID: 07DA00CB78203251
3 changed files with 88 additions and 97 deletions

View File

@ -56,7 +56,8 @@ setup(
entry_points = {'console_scripts': proj_info['console_scripts']}, entry_points = {'console_scripts': proj_info['console_scripts']},
extras_require={ install_requires = ['dukpy'],
extras_require = {
'socks': ['PySocks'], 'socks': ['PySocks'],
} }
) )

View File

@ -3,6 +3,8 @@
from ..common import * from ..common import *
from ..extractor import VideoExtractor from ..extractor import VideoExtractor
import dukpy
from urllib.parse import urlparse, parse_qs, urlencode
from xml.dom.minidom import parseString from xml.dom.minidom import parseString
class YouTube(VideoExtractor): class YouTube(VideoExtractor):
@ -68,45 +70,32 @@ class YouTube(VideoExtractor):
'audio_encoding': 'AAC', 'audio_bitrate': '24'}, 'audio_encoding': 'AAC', 'audio_bitrate': '24'},
] ]
def dethrottle(js, url):
    """Rewrite the ``n`` query parameter of a stream URL using the player code.

    The value of ``n`` is passed through a JavaScript function extracted from
    the player source and the URL is rebuilt with the transformed value.

    Parameters:
        js:  text of the player's base.js.
        url: stream URL whose query string may carry an ``n`` parameter.

    Returns:
        The URL with ``n`` replaced by the transformed value, or the URL
        unchanged when it has no ``n`` parameter.

    NOTE(review): ``match1`` is defined outside this block; it presumably
    returns the first captured group or None when nothing matches. If the
    player code does not match the patterns below, the lookup fails with an
    exception rather than returning a URL -- confirm that is the intended
    behaviour for callers.
    """
    def n_to_n(js, n):
        # Examples:
        #   yma - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js
        #   Xka - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js
        # Name of the function the player applies next to its a.set("n", b) call.
        f1 = match1(js, r'a\.set\("n",b\),[$\w]+\.length\|\|([$\w]+)\(""\)')
        # Source text of that function, taken from its `name=function(...)` assignment.
        f1def = match1(js, r'\W%s=(function\(\w+\).+?\)});' % re.escape(f1))
        # Evaluate the extracted function in the JS interpreter with the
        # current value as its single string argument.
        return dukpy.evaljs('(%s)("%s")' % (f1def, n))

    u = urlparse(url)
    qs = parse_qs(u.query)
    if 'n' not in qs:
        # No "n" parameter means there is nothing to transform; previously
        # this case raised KeyError. Return the URL exactly as received
        # instead of re-encoding its query string.
        return url
    qs['n'] = [n_to_n(js, qs['n'][0])]
    return u._replace(query=urlencode(qs, doseq=True)).geturl()
def s_to_sig(js, s): def s_to_sig(js, s):
# Examples: # Examples:
# - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js # BPa - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js
# - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js # Xva - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js
# - https://www.youtube.com/yts/jsbin/player-vfls4aurX/da_DK/base.js js_code = ''
# - https://www.youtube.com/yts/jsbin/player_ias-vfl_RGK2l/en_US/base.js f1 = match1(js, r'=([$\w]+)\(decodeURIComponent\(')
# - https://www.youtube.com/yts/jsbin/player-vflRjqq_w/da_DK/base.js f1def = match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
# - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) # remove . prefix
# - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js
# - https://www.youtube.com/s/player/50e823fc/player_ias.vflset/sv_SE/base.js
# - https://www.youtube.com/s/player/3b5d5649/player_ias.vflset/sv_SE/base.js
# - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js
def tr_js(code):
code = re.sub(r'function', r'def', code)
# add prefix '_sig_' to prevent namespace pollution
code = re.sub(r'(\W)([$\w][$\w][$\w]?)\(', r'\1_sig_\2(', code)
code = re.sub(r'\$', '_dollar', code)
code = re.sub(r'\{', r': ', code)
code = re.sub(r'\}', r'\n', code)
code = re.sub(r'var\s+', r'', code)
code = re.sub(r'(\w+).join\(""\)', r'"".join(\1)', code)
code = re.sub(r'(\w+).length', r'len(\1)', code)
code = re.sub(r'(\w+).slice\((\w+)\)', r'\1[\2:]', code)
code = re.sub(r'(\w+).splice\((\w+),(\w+)\)', r'del \1[\2:\2+\3]', code)
code = re.sub(r'(\w+).split\(""\)', r'list(\1)', code)
return code
js = js.replace('\n', ' ')
f1 = match1(js, r'\.set\(\w+\.sp,encodeURIComponent\(([$\w]+)') or \
match1(js, r'\.set\(\w+\.sp,\(0,window\.encodeURIComponent\)\(([$\w]+)') or \
match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \
match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)') or \
match1(js, r'=([$\w]+)\(decodeURIComponent\(')
f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \
match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def)
f1def = 'function %s%s' % (f1, f1def) f1def = 'function %s%s' % (f1, f1def)
code = tr_js(f1def) f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def)) # find all invoked function names
f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def))
for f2 in f2s: for f2 in f2s:
f2e = re.escape(f2) f2e = re.escape(f2)
f2def = re.search(r'[^$\w]%s:function\((\w+,\w+)\)(\{[^\{\}]+\})' % f2e, js) f2def = re.search(r'[^$\w]%s:function\((\w+,\w+)\)(\{[^\{\}]+\})' % f2e, js)
@ -115,13 +104,10 @@ class YouTube(VideoExtractor):
else: else:
f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js) f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js)
f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2)) f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2))
f2 = re.sub(r'\$', '_dollar', f2) # replace dollar sign js_code += f2def + ';'
code = code + 'global _sig_%s\n' % f2 + tr_js(f2def) js_code += f1def + ';%s("%s")' % (f1, s)
sig = dukpy.evaljs(js_code)
f1 = re.sub(r'\$', '_dollar', f1) # replace dollar sign return sig
code = code + '_sig=_sig_%s(s)' % f1
exec(code, globals(), locals())
return locals()['_sig']
def chunk_by_range(url, size): def chunk_by_range(url, size):
urls = [] urls = []
@ -209,6 +195,7 @@ class YouTube(VideoExtractor):
raise raise
elif video_info['status'] == ['ok']: elif video_info['status'] == ['ok']:
if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']: if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']:
# FIXME: this is basically dead code, use_cipher_signature is always true
self.title = parse.unquote_plus(json.loads(video_info["player_response"][0])["videoDetails"]["title"]) self.title = parse.unquote_plus(json.loads(video_info["player_response"][0])["videoDetails"]["title"])
# Parse video page (for DASH) # Parse video page (for DASH)
video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid)
@ -259,23 +246,30 @@ class YouTube(VideoExtractor):
self.html5player = None self.html5player = None
else: else:
# Parse video page instead # Extract from video page
logging.debug('Extracting from the video page...')
video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid)
try: # FIXME: we should extract ytInitialPlayerResponse more reliably try:
ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});</script>', video_page).group(1)) jsUrl = re.search('([^"]*/base\.js)"', video_page).group(1)
except: except:
ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) log.wtf('[Failed] Unable to find base.js on the video page')
# FIXME: do we still need this?
jsUrl = jsUrl.replace('\/', '/') # unescape URL (for age-restricted videos)
self.html5player = 'https://www.youtube.com' + jsUrl
logging.debug('Retrieving the player code...')
self.js = get_content(self.html5player).replace('\n', ' ')
logging.debug('Loading ytInitialPlayerResponse...')
ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n|</script>)', video_page).group(1))
# Get the video title
self.title = ytInitialPlayerResponse["videoDetails"]["title"] self.title = ytInitialPlayerResponse["videoDetails"]["title"]
if re.search('([^"]*/base\.js)"', video_page):
self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1)
else:
self.html5player = None
stream_list = ytInitialPlayerResponse['streamingData']['formats'] stream_list = ytInitialPlayerResponse['streamingData']['formats']
elif video_info['status'] == ['fail']: elif video_info['status'] == ['fail']:
# FIXME: this is basically dead code, status is always ok
logging.debug('ERRORCODE: %s' % video_info['errorcode'][0]) logging.debug('ERRORCODE: %s' % video_info['errorcode'][0])
if video_info['errorcode'] == ['150']: if video_info['errorcode'] == ['150']:
# FIXME: still relevant? # FIXME: still relevant?
@ -327,7 +321,7 @@ class YouTube(VideoExtractor):
log.wtf('[Failed] Invalid status.', exit_code=None) log.wtf('[Failed] Invalid status.', exit_code=None)
raise raise
# YouTube Live # FIXME: YouTube Live
if ytplayer_config and (ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1'): if ytplayer_config and (ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1'):
if 'hlsvp' in ytplayer_config['args']: if 'hlsvp' in ytplayer_config['args']:
hlsvp = ytplayer_config['args']['hlsvp'] hlsvp = ytplayer_config['args']['hlsvp']
@ -343,6 +337,7 @@ class YouTube(VideoExtractor):
for stream in stream_list: for stream in stream_list:
if isinstance(stream, str): if isinstance(stream, str):
# FIXME: dead code?
metadata = parse.parse_qs(stream) metadata = parse.parse_qs(stream)
stream_itag = metadata['itag'][0] stream_itag = metadata['itag'][0]
self.streams[stream_itag] = { self.streams[stream_itag] = {
@ -357,22 +352,29 @@ class YouTube(VideoExtractor):
'container': mime_to_container(metadata['type'][0].split(';')[0]), 'container': mime_to_container(metadata['type'][0].split(';')[0]),
} }
else: else:
stream_itag = str(stream['itag']) if 'signatureCipher' in stream:
self.streams[stream_itag] = { logging.debug('Parsing signatureCipher for itag=%s...' % stream['itag'])
qs = parse_qs(stream['signatureCipher'])
#logging.debug(qs)
sp = qs['sp'][0]
sig = self.__class__.s_to_sig(self.js, qs['s'][0])
url = qs['url'][0] + '&{}={}'.format(sp, sig)
elif 'url' in stream:
url = stream['url']
else:
log.wtf('No signatureCipher or url for itag=%s' % stream['itag'])
url = self.__class__.dethrottle(self.js, url)
self.streams[str(stream['itag'])] = {
'itag': str(stream['itag']), 'itag': str(stream['itag']),
'url': stream['url'] if 'url' in stream else None, 'url': url,
'sig': None,
's': None,
'quality': stream['quality'], 'quality': stream['quality'],
'type': stream['mimeType'], 'type': stream['mimeType'],
'mime': stream['mimeType'].split(';')[0], 'mime': stream['mimeType'].split(';')[0],
'container': mime_to_container(stream['mimeType'].split(';')[0]), 'container': mime_to_container(stream['mimeType'].split(';')[0]),
} }
if 'signatureCipher' in stream:
self.streams[stream_itag].update(dict([(_.split('=')[0], parse.unquote(_.split('=')[1]))
for _ in stream['signatureCipher'].split('&')]))
# Prepare caption tracks # FIXME: Prepare caption tracks
try: try:
try: try:
caption_tracks = json.loads(ytplayer_config['args']['player_response'])['captions']['playerCaptionsTracklistRenderer']['captionTracks'] caption_tracks = json.loads(ytplayer_config['args']['player_response'])['captions']['playerCaptionsTracklistRenderer']['captionTracks']
@ -408,6 +410,7 @@ class YouTube(VideoExtractor):
# Prepare DASH streams (NOTE: not every video has DASH streams!) # Prepare DASH streams (NOTE: not every video has DASH streams!)
try: try:
# FIXME: dead code?
dashmpd = ytplayer_config['args']['dashmpd'] dashmpd = ytplayer_config['args']['dashmpd']
dash_xml = parseString(get_content(dashmpd)) dash_xml = parseString(get_content(dashmpd))
for aset in dash_xml.getElementsByTagName('AdaptationSet'): for aset in dash_xml.getElementsByTagName('AdaptationSet'):
@ -473,12 +476,8 @@ class YouTube(VideoExtractor):
'size': int(dash_size) + int(dash_webm_a_size) 'size': int(dash_size) + int(dash_webm_a_size)
} }
except: except:
# VEVO
if not self.html5player: return
self.html5player = self.html5player.replace('\/', '/') # unescape URL (for age-restricted videos)
self.js = get_content(self.html5player)
try: try:
# FIXME: dead code?
# Video info from video page (not always available) # Video info from video page (not always available)
streams = [dict([(i.split('=')[0], streams = [dict([(i.split('=')[0],
parse.unquote(i.split('=')[1])) parse.unquote(i.split('=')[1]))
@ -486,6 +485,7 @@ class YouTube(VideoExtractor):
for afmt in ytplayer_config['args']['adaptive_fmts'].split(',')] for afmt in ytplayer_config['args']['adaptive_fmts'].split(',')]
except: except:
if 'adaptive_fmts' in video_info: if 'adaptive_fmts' in video_info:
# FIXME: dead code?
streams = [dict([(i.split('=')[0], streams = [dict([(i.split('=')[0],
parse.unquote(i.split('=')[1])) parse.unquote(i.split('=')[1]))
for i in afmt.split('&')]) for i in afmt.split('&')])
@ -493,12 +493,15 @@ class YouTube(VideoExtractor):
else: else:
try: try:
try: try:
# FIXME: dead code?
streams = json.loads(video_info['player_response'][0])['streamingData']['adaptiveFormats'] streams = json.loads(video_info['player_response'][0])['streamingData']['adaptiveFormats']
except: except:
streams = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] streams = ytInitialPlayerResponse['streamingData']['adaptiveFormats']
except: # no DASH stream at all except: # no DASH stream at all
# FIXME: dead code?
return return
# FIXME: dead code?
# streams without contentLength got broken urls, just remove them (#2767) # streams without contentLength got broken urls, just remove them (#2767)
streams = [stream for stream in streams if 'contentLength' in stream] streams = [stream for stream in streams if 'contentLength' in stream]
@ -523,34 +526,33 @@ class YouTube(VideoExtractor):
del stream['contentLength'] del stream['contentLength']
del stream['initRange'] del stream['initRange']
del stream['indexRange'] del stream['indexRange']
if 'signatureCipher' in stream:
stream.update(dict([(_.split('=')[0], parse.unquote(_.split('=')[1]))
for _ in stream['signatureCipher'].split('&')]))
del stream['signatureCipher']
for stream in streams: # get over speed limiting if 'signatureCipher' in stream:
stream['url'] += '&ratebypass=yes' logging.debug('Parsing signatureCipher for itag=%s...' % stream['itag'])
qs = parse_qs(stream['signatureCipher'])
#logging.debug(qs)
sp = qs['sp'][0]
sig = self.__class__.s_to_sig(self.js, qs['s'][0])
url = qs['url'][0] + '&ratebypass=yes&{}={}'.format(sp, sig)
elif 'url' in stream:
url = stream['url']
else:
log.wtf('No signatureCipher or url for itag=%s' % stream['itag'])
url = self.__class__.dethrottle(self.js, url)
stream['url'] = url
for stream in streams: # audio for stream in streams: # audio
if stream['type'].startswith('audio/mp4'): if stream['type'].startswith('audio/mp4'):
dash_mp4_a_url = stream['url'] dash_mp4_a_url = stream['url']
if 's' in stream:
sig = self.__class__.s_to_sig(self.js, stream['s'])
dash_mp4_a_url += '&sig={}'.format(sig)
dash_mp4_a_size = stream['clen'] dash_mp4_a_size = stream['clen']
elif stream['type'].startswith('audio/webm'): elif stream['type'].startswith('audio/webm'):
dash_webm_a_url = stream['url'] dash_webm_a_url = stream['url']
if 's' in stream:
sig = self.__class__.s_to_sig(self.js, stream['s'])
dash_webm_a_url += '&sig={}'.format(sig)
dash_webm_a_size = stream['clen'] dash_webm_a_size = stream['clen']
for stream in streams: # video for stream in streams: # video
if 'size' in stream: if 'size' in stream:
if stream['type'].startswith('video/mp4'): if stream['type'].startswith('video/mp4'):
mimeType = 'video/mp4' mimeType = 'video/mp4'
dash_url = stream['url'] dash_url = stream['url']
if 's' in stream:
sig = self.__class__.s_to_sig(self.js, stream['s'])
dash_url += '&sig={}'.format(sig)
dash_size = stream['clen'] dash_size = stream['clen']
itag = stream['itag'] itag = stream['itag']
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
@ -567,9 +569,6 @@ class YouTube(VideoExtractor):
elif stream['type'].startswith('video/webm'): elif stream['type'].startswith('video/webm'):
mimeType = 'video/webm' mimeType = 'video/webm'
dash_url = stream['url'] dash_url = stream['url']
if 's' in stream:
sig = self.__class__.s_to_sig(self.js, stream['s'])
dash_url += '&sig={}'.format(sig)
dash_size = stream['clen'] dash_size = stream['clen']
itag = stream['itag'] itag = stream['itag']
audio_url = None audio_url = None
@ -610,15 +609,6 @@ class YouTube(VideoExtractor):
if stream_id in self.streams: if stream_id in self.streams:
src = self.streams[stream_id]['url'] src = self.streams[stream_id]['url']
if self.streams[stream_id]['sig'] is not None:
sig = self.streams[stream_id]['sig']
src += '&sig={}'.format(sig)
elif self.streams[stream_id]['s'] is not None:
if not hasattr(self, 'js'):
self.js = get_content(self.html5player)
s = self.streams[stream_id]['s']
sig = self.__class__.s_to_sig(self.js, s)
src += '&sig={}'.format(sig)
self.streams[stream_id]['src'] = [src] self.streams[stream_id]['src'] = [src]
self.streams[stream_id]['size'] = urls_size(self.streams[stream_id]['src']) self.streams[stream_id]['size'] = urls_size(self.streams[stream_id]['src'])

View File

@ -36,9 +36,9 @@ class YouGetTests(unittest.TestCase):
# 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa # 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa
# info_only=True # info_only=True
#) #)
#youtube.download( youtube.download(
# 'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True 'https://www.youtube.com/watch?v=oRdxUFDoQe0', info_only=True
#) )
def test_acfun(self): def test_acfun(self):
acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True) acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True)