From 567d1059fce22fe790e059af9812ab7cd12135db Mon Sep 17 00:00:00 2001
From: Mort Yao <soi@mort.ninja>
Date: Sun, 23 Jun 2024 10:17:02 +0200
Subject: [PATCH] [youtube] fix 403 error and throttling (close #2950)

---
 setup.py                          |   3 +-
 src/you_get/extractors/youtube.py | 176 ++++++++++++++----------------
 tests/test.py                     |   6 +-
 3 files changed, 88 insertions(+), 97 deletions(-)

diff --git a/setup.py b/setup.py
index 470c99ed..0804ae33 100755
--- a/setup.py
+++ b/setup.py
@@ -56,7 +56,8 @@ setup(
 
     entry_points = {'console_scripts': proj_info['console_scripts']},
 
-    extras_require={
+    install_requires = ['dukpy'],
+    extras_require = {
         'socks': ['PySocks'],
     }
 )
diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py
index ddf12be9..ee30644b 100644
--- a/src/you_get/extractors/youtube.py
+++ b/src/you_get/extractors/youtube.py
@@ -3,6 +3,8 @@
 from ..common import *
 from ..extractor import VideoExtractor
 
+import dukpy
+from urllib.parse import urlparse, parse_qs, urlencode
 from xml.dom.minidom import parseString
 
 class YouTube(VideoExtractor):
@@ -68,45 +70,32 @@ class YouTube(VideoExtractor):
          'audio_encoding': 'AAC', 'audio_bitrate': '24'},
     ]
 
+    def dethrottle(js, url):  # returns url with its "n" query value replaced by the result of n_to_n(js, n)
+        def n_to_n(js, n):  # compute the replacement for n by running a function extracted from the js text
+            # Examples:
+            #   yma - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js
+            #   Xka - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js
+            f1 = match1(js, r'a\.set\("n",b\),[$\w]+\.length\|\|([$\w]+)\(""\)')  # <name> in: a.set("n",b),<x>.length||<name>("")
+            f1def = match1(js, r'\W%s=(function\(\w+\).+?\)});' % re.escape(f1))  # function expression assigned to that name
+            n = dukpy.evaljs('(%s)("%s")' % (f1def, n))  # evaluate (function...)("<n>") as JavaScript via dukpy
+            return n
+
+        u = urlparse(url)
+        qs = parse_qs(u.query)  # dict mapping each query key to a list of values
+        n = n_to_n(js, qs['n'][0])  # KeyError if url carries no "n" parameter
+        qs['n'] = [n]
+        return u._replace(query=urlencode(qs, doseq=True)).geturl()  # doseq=True writes each list element as its own key=value pair
+
     def s_to_sig(js, s):
         # Examples:
-        # - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js
-        # - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js
-        # - https://www.youtube.com/yts/jsbin/player-vfls4aurX/da_DK/base.js
-        # - https://www.youtube.com/yts/jsbin/player_ias-vfl_RGK2l/en_US/base.js
-        # - https://www.youtube.com/yts/jsbin/player-vflRjqq_w/da_DK/base.js
-        # - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js
-        # - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js
-        # - https://www.youtube.com/s/player/50e823fc/player_ias.vflset/sv_SE/base.js
-        # - https://www.youtube.com/s/player/3b5d5649/player_ias.vflset/sv_SE/base.js
-        # - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js
-        def tr_js(code):
-            code = re.sub(r'function', r'def', code)
-            # add prefix '_sig_' to prevent namespace pollution
-            code = re.sub(r'(\W)([$\w][$\w][$\w]?)\(', r'\1_sig_\2(', code)
-            code = re.sub(r'\$', '_dollar', code)
-            code = re.sub(r'\{', r': ', code)
-            code = re.sub(r'\}', r'\n', code)
-            code = re.sub(r'var\s+', r'', code)
-            code = re.sub(r'(\w+).join\(""\)', r'"".join(\1)', code)
-            code = re.sub(r'(\w+).length', r'len(\1)', code)
-            code = re.sub(r'(\w+).slice\((\w+)\)', r'\1[\2:]', code)
-            code = re.sub(r'(\w+).splice\((\w+),(\w+)\)', r'del \1[\2:\2+\3]', code)
-            code = re.sub(r'(\w+).split\(""\)', r'list(\1)', code)
-            return code
-
-        js = js.replace('\n', ' ')
-        f1 = match1(js, r'\.set\(\w+\.sp,encodeURIComponent\(([$\w]+)') or \
-            match1(js, r'\.set\(\w+\.sp,\(0,window\.encodeURIComponent\)\(([$\w]+)') or \
-            match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \
-            match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)') or \
-            match1(js, r'=([$\w]+)\(decodeURIComponent\(')
-        f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \
-                match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
-        f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def)
+        #   BPa - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js
+        #   Xva - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js
+        js_code = ''
+        f1 = match1(js, r'=([$\w]+)\(decodeURIComponent\(')
+        f1def = match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
+        f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def)  # remove . prefix
         f1def = 'function %s%s' % (f1, f1def)
-        code = tr_js(f1def)
-        f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def))
+        f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def))  # find all invoked function names
         for f2 in f2s:
             f2e = re.escape(f2)
             f2def = re.search(r'[^$\w]%s:function\((\w+,\w+)\)(\{[^\{\}]+\})' % f2e, js)
@@ -115,13 +104,10 @@ class YouTube(VideoExtractor):
             else:
                 f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js)
                 f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2))
-            f2 = re.sub(r'\$', '_dollar', f2)  # replace dollar sign
-            code = code + 'global _sig_%s\n' % f2 + tr_js(f2def)
-
-        f1 = re.sub(r'\$', '_dollar', f1)  # replace dollar sign
-        code = code + '_sig=_sig_%s(s)' % f1
-        exec(code, globals(), locals())
-        return locals()['_sig']
+            js_code += f2def + ';'
+        js_code += f1def + ';%s("%s")' % (f1, s)
+        sig = dukpy.evaljs(js_code)
+        return sig
 
     def chunk_by_range(url, size):
         urls = []
@@ -209,6 +195,7 @@ class YouTube(VideoExtractor):
             raise
         elif video_info['status'] == ['ok']:
             if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']:
+                # FIXME: this is basically dead code, use_cipher_signature is always true
                 self.title = parse.unquote_plus(json.loads(video_info["player_response"][0])["videoDetails"]["title"])
                 # Parse video page (for DASH)
                 video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid)
@@ -259,23 +246,30 @@ class YouTube(VideoExtractor):
                         self.html5player = None
 
             else:
-                # Parse video page instead
+                # Extract from video page
+                logging.debug('Extracting from the video page...')
                 video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid)
 
-                try:  # FIXME: we should extract ytInitialPlayerResponse more reliably
-                    ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});</script>', video_page).group(1))
+                try:
+                    jsUrl = re.search('([^"]*/base\.js)"', video_page).group(1)
                 except:
-                    ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1))
+                    log.wtf('[Failed] Unable to find base.js on the video page')
+                # FIXME: do we still need this?
+                jsUrl = jsUrl.replace('\/', '/')  # unescape URL (for age-restricted videos)
+                self.html5player = 'https://www.youtube.com' + jsUrl
+                logging.debug('Retrieving the player code...')
+                self.js = get_content(self.html5player).replace('\n', ' ')
 
+                logging.debug('Loading ytInitialPlayerResponse...')
+                ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n|</script>)', video_page).group(1))
+
+                # Get the video title
                 self.title = ytInitialPlayerResponse["videoDetails"]["title"]
-                if re.search('([^"]*/base\.js)"', video_page):
-                    self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1)
-                else:
-                    self.html5player = None
 
                 stream_list = ytInitialPlayerResponse['streamingData']['formats']
 
         elif video_info['status'] == ['fail']:
+            # FIXME: this is basically dead code, status is always ok
             logging.debug('ERRORCODE: %s' % video_info['errorcode'][0])
             if video_info['errorcode'] == ['150']:
                 # FIXME: still relevant?
@@ -327,7 +321,7 @@ class YouTube(VideoExtractor):
             log.wtf('[Failed] Invalid status.', exit_code=None)
             raise
 
-        # YouTube Live
+        # FIXME: YouTube Live
         if ytplayer_config and (ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1'):
             if 'hlsvp' in ytplayer_config['args']:
                 hlsvp = ytplayer_config['args']['hlsvp']
@@ -343,6 +337,7 @@ class YouTube(VideoExtractor):
 
         for stream in stream_list:
             if isinstance(stream, str):
+                # FIXME: dead code?
                 metadata = parse.parse_qs(stream)
                 stream_itag = metadata['itag'][0]
                 self.streams[stream_itag] = {
@@ -357,22 +352,29 @@ class YouTube(VideoExtractor):
                     'container': mime_to_container(metadata['type'][0].split(';')[0]),
                 }
             else:
-                stream_itag = str(stream['itag'])
-                self.streams[stream_itag] = {
+                if 'signatureCipher' in stream:
+                    logging.debug('Parsing signatureCipher for itag=%s...' % stream['itag'])
+                    qs = parse_qs(stream['signatureCipher'])
+                    #logging.debug(qs)
+                    sp = qs['sp'][0]
+                    sig = self.__class__.s_to_sig(self.js, qs['s'][0])
+                    url = qs['url'][0] + '&{}={}'.format(sp, sig)
+                elif 'url' in stream:
+                    url = stream['url']
+                else:
+                    log.wtf('No signatureCipher or url for itag=%s' % stream['itag'])
+                url = self.__class__.dethrottle(self.js, url)
+
+                self.streams[str(stream['itag'])] = {
                     'itag': str(stream['itag']),
-                    'url': stream['url'] if 'url' in stream else None,
-                    'sig': None,
-                    's': None,
+                    'url': url,
                     'quality': stream['quality'],
                     'type': stream['mimeType'],
                     'mime': stream['mimeType'].split(';')[0],
                     'container': mime_to_container(stream['mimeType'].split(';')[0]),
                 }
-                if 'signatureCipher' in stream:
-                    self.streams[stream_itag].update(dict([(_.split('=')[0], parse.unquote(_.split('=')[1]))
-                                                           for _ in stream['signatureCipher'].split('&')]))
 
-        # Prepare caption tracks
+        # FIXME: Prepare caption tracks
         try:
             try:
                 caption_tracks = json.loads(ytplayer_config['args']['player_response'])['captions']['playerCaptionsTracklistRenderer']['captionTracks']
@@ -408,6 +410,7 @@ class YouTube(VideoExtractor):
 
         # Prepare DASH streams (NOTE: not every video has DASH streams!)
         try:
+            # FIXME: dead code?
             dashmpd = ytplayer_config['args']['dashmpd']
             dash_xml = parseString(get_content(dashmpd))
             for aset in dash_xml.getElementsByTagName('AdaptationSet'):
@@ -473,12 +476,8 @@ class YouTube(VideoExtractor):
                             'size': int(dash_size) + int(dash_webm_a_size)
                         }
         except:
-            # VEVO
-            if not self.html5player: return
-            self.html5player = self.html5player.replace('\/', '/') # unescape URL (for age-restricted videos)
-            self.js = get_content(self.html5player)
-
             try:
+                # FIXME: dead code?
                 # Video info from video page (not always available)
                 streams = [dict([(i.split('=')[0],
                                   parse.unquote(i.split('=')[1]))
@@ -486,6 +485,7 @@ class YouTube(VideoExtractor):
                            for afmt in ytplayer_config['args']['adaptive_fmts'].split(',')]
             except:
                 if 'adaptive_fmts' in video_info:
+                    # FIXME: dead code?
                     streams = [dict([(i.split('=')[0],
                                       parse.unquote(i.split('=')[1]))
                                      for i in afmt.split('&')])
@@ -493,12 +493,15 @@ class YouTube(VideoExtractor):
                 else:
                     try:
                         try:
+                            # FIXME: dead code?
                             streams = json.loads(video_info['player_response'][0])['streamingData']['adaptiveFormats']
                         except:
                             streams = ytInitialPlayerResponse['streamingData']['adaptiveFormats']
                     except:  # no DASH stream at all
+                        # FIXME: dead code?
                         return
 
+                    # FIXME: dead code?
                     # streams without contentLength got broken urls, just remove them (#2767)
                     streams = [stream for stream in streams if 'contentLength' in stream]
 
@@ -523,34 +526,33 @@ class YouTube(VideoExtractor):
                         del stream['contentLength']
                         del stream['initRange']
                         del stream['indexRange']
-                        if 'signatureCipher' in stream:
-                            stream.update(dict([(_.split('=')[0], parse.unquote(_.split('=')[1]))
-                                                for _ in stream['signatureCipher'].split('&')]))
-                            del stream['signatureCipher']
 
-            for stream in streams: # get over speed limiting
-                stream['url'] += '&ratebypass=yes'
+                        if 'signatureCipher' in stream:
+                            logging.debug('Parsing signatureCipher for itag=%s...' % stream['itag'])
+                            qs = parse_qs(stream['signatureCipher'])
+                            #logging.debug(qs)
+                            sp = qs['sp'][0]
+                            sig = self.__class__.s_to_sig(self.js, qs['s'][0])
+                            url = qs['url'][0] + '&ratebypass=yes&{}={}'.format(sp, sig)
+                        elif 'url' in stream:
+                            url = stream['url']
+                        else:
+                            log.wtf('No signatureCipher or url for itag=%s' % stream['itag'])
+                        url = self.__class__.dethrottle(self.js, url)
+                        stream['url'] = url
+
             for stream in streams: # audio
                 if stream['type'].startswith('audio/mp4'):
                     dash_mp4_a_url = stream['url']
-                    if 's' in stream:
-                        sig = self.__class__.s_to_sig(self.js, stream['s'])
-                        dash_mp4_a_url += '&sig={}'.format(sig)
                     dash_mp4_a_size = stream['clen']
                 elif stream['type'].startswith('audio/webm'):
                     dash_webm_a_url = stream['url']
-                    if 's' in stream:
-                        sig = self.__class__.s_to_sig(self.js, stream['s'])
-                        dash_webm_a_url += '&sig={}'.format(sig)
                     dash_webm_a_size = stream['clen']
             for stream in streams: # video
                 if 'size' in stream:
                     if stream['type'].startswith('video/mp4'):
                         mimeType = 'video/mp4'
                         dash_url = stream['url']
-                        if 's' in stream:
-                            sig = self.__class__.s_to_sig(self.js, stream['s'])
-                            dash_url += '&sig={}'.format(sig)
                         dash_size = stream['clen']
                         itag = stream['itag']
                         dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
@@ -567,9 +569,6 @@ class YouTube(VideoExtractor):
                     elif stream['type'].startswith('video/webm'):
                         mimeType = 'video/webm'
                         dash_url = stream['url']
-                        if 's' in stream:
-                            sig = self.__class__.s_to_sig(self.js, stream['s'])
-                            dash_url += '&sig={}'.format(sig)
                         dash_size = stream['clen']
                         itag = stream['itag']
                         audio_url = None
@@ -610,15 +609,6 @@ class YouTube(VideoExtractor):
 
         if stream_id in self.streams:
             src = self.streams[stream_id]['url']
-            if self.streams[stream_id]['sig'] is not None:
-                sig = self.streams[stream_id]['sig']
-                src += '&sig={}'.format(sig)
-            elif self.streams[stream_id]['s'] is not None:
-                if not hasattr(self, 'js'):
-                    self.js = get_content(self.html5player)
-                s = self.streams[stream_id]['s']
-                sig = self.__class__.s_to_sig(self.js, s)
-                src += '&sig={}'.format(sig)
 
             self.streams[stream_id]['src'] = [src]
             self.streams[stream_id]['size'] = urls_size(self.streams[stream_id]['src'])
diff --git a/tests/test.py b/tests/test.py
index e8a378f5..8d348fbc 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -36,9 +36,9 @@ class YouGetTests(unittest.TestCase):
         #    'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare',  # noqa
         #    info_only=True
         #)
-        #youtube.download(
-        #    'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True
-        #)
+        youtube.download(
+            'https://www.youtube.com/watch?v=oRdxUFDoQe0', info_only=True
+        )
 
     def test_acfun(self):
         acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True)