1
0
mirror of https://github.com/ytdl-org/youtube-dl synced 2024-12-16 14:36:47 +00:00

Compare commits

...

8 Commits

Author SHA1 Message Date
dirkf
87ba614c4c
Merge 3d649843fe into c5098961b0 2024-12-13 03:15:44 +00:00
dirkf
3d649843fe [YouTube] Simplify pattern for nsig function name extraction 2024-12-13 03:15:05 +00:00
dirkf
9bc8fcf23c [YouTube] Pass nsig value as return hook, fixes player 3bb1f723 2024-12-13 03:15:05 +00:00
dirkf
a93fff3052 [YouTube] Handle player 3bb1f723
* fix signature code extraction
* raise if n function returns input value
* add new tests from yt-dlp

Co-authored-by: bashonly
2024-12-13 03:15:05 +00:00
dirkf
2968201ae0 [jsinterp] Add return hook for player 3bb1f723
* set var `_ytdl_do_not_return` to a specific value in the scope of a function
* if an expression to be returned has that value, `return` becomes `void`
2024-12-13 03:15:05 +00:00
dirkf
067ad7439c [jsinterp] Strip /* comments */ when parsing
* NB: _separate() is looking creaky
2024-12-13 03:15:05 +00:00
dirkf
7ab85d88d6 [jsinterp] Fix and improve "methods"
* push, unshift return new length
* improve edge cases for push/pop, shift/unshift, forEach, indexOf, charCodeAt
* increase test coverage
2024-12-13 03:15:04 +00:00
dirkf
4bf85ca5ba [jsinterp] Fix and improve split/join
* improve split/join edge cases
* correctly implement regex split (not like re.split)
2024-12-13 03:15:04 +00:00
4 changed files with 238 additions and 89 deletions

View File

@ -160,7 +160,6 @@ class TestJSInterpreter(unittest.TestCase):
self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51) self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51)
self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11) self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11)
@unittest.skip('Not yet fully implemented')
def test_comments(self): def test_comments(self):
self._test(''' self._test('''
function f() { function f() {
@ -179,6 +178,15 @@ class TestJSInterpreter(unittest.TestCase):
} }
''', 3) ''', 3)
self._test('''
function f() {
var x = ( /* 1 + */ 2 +
/* 30 * 40 */
50);
return x;
}
''', 52)
def test_precedence(self): def test_precedence(self):
self._test(''' self._test('''
function f() { function f() {
@ -483,6 +491,13 @@ class TestJSInterpreter(unittest.TestCase):
self._test(jsi, 't-e-s-t', args=[test_input, '-']) self._test(jsi, 't-e-s-t', args=[test_input, '-'])
self._test(jsi, '', args=[[], '-']) self._test(jsi, '', args=[[], '-'])
self._test('function f(){return '
'[1, 1.0, "abc", {a: 1}, null, undefined, Infinity, NaN].join()}',
'1,1,abc,[object Object],,,Infinity,NaN')
self._test('function f(){return '
'[1, 1.0, "abc", {a: 1}, null, undefined, Infinity, NaN].join("~")}',
'1~1~abc~[object Object]~~~Infinity~NaN')
def test_split(self): def test_split(self):
test_result = list('test') test_result = list('test')
tests = [ tests = [
@ -496,6 +511,18 @@ class TestJSInterpreter(unittest.TestCase):
self._test(jsi, test_result, args=['t-e-s-t', '-']) self._test(jsi, test_result, args=['t-e-s-t', '-'])
self._test(jsi, [''], args=['', '-']) self._test(jsi, [''], args=['', '-'])
self._test(jsi, [], args=['', '']) self._test(jsi, [], args=['', ''])
# RegExp split
self._test('function f(){return "test".split(/(?:)/)}',
['t', 'e', 's', 't'])
self._test('function f(){return "t-e-s-t".split(/[es-]+/)}',
['t', 't'])
# from MDN: surrogate pairs aren't handled: case 1 fails
# self._test('function f(){return "😄😄".split(/(?:)/)}',
# ['\ud83d', '\ude04', '\ud83d', '\ude04'])
# case 2 beats Py3.2: it gets the case 1 result
if sys.version_info >= (2, 6) and not ((3, 0) <= sys.version_info < (3, 3)):
self._test('function f(){return "😄😄".split(/(?:)/u)}',
['😄', '😄'])
def test_slice(self): def test_slice(self):
self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice()}', [0, 1, 2, 3, 4, 5, 6, 7, 8]) self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice()}', [0, 1, 2, 3, 4, 5, 6, 7, 8])
@ -525,6 +552,40 @@ class TestJSInterpreter(unittest.TestCase):
self._test('function f(){return "012345678".slice(-1, 1)}', '') self._test('function f(){return "012345678".slice(-1, 1)}', '')
self._test('function f(){return "012345678".slice(-3, -1)}', '67') self._test('function f(){return "012345678".slice(-3, -1)}', '67')
def test_pop(self):
# pop
self._test('function f(){var a = [0, 1, 2, 3, 4, 5, 6, 7, 8]; return [a.pop(), a]}',
[8, [0, 1, 2, 3, 4, 5, 6, 7]])
self._test('function f(){return [].pop()}', JS_Undefined)
# push
self._test('function f(){var a = [0, 1, 2]; return [a.push(3, 4), a]}',
[5, [0, 1, 2, 3, 4]])
self._test('function f(){var a = [0, 1, 2]; return [a.push(), a]}',
[3, [0, 1, 2]])
def test_shift(self):
# shift
self._test('function f(){var a = [0, 1, 2, 3, 4, 5, 6, 7, 8]; return [a.shift(), a]}',
[0, [1, 2, 3, 4, 5, 6, 7, 8]])
self._test('function f(){return [].shift()}', JS_Undefined)
# unshift
self._test('function f(){var a = [0, 1, 2]; return [a.unshift(3, 4), a]}',
[5, [3, 4, 0, 1, 2]])
self._test('function f(){var a = [0, 1, 2]; return [a.unshift(), a]}',
[3, [0, 1, 2]])
def test_forEach(self):
self._test('function f(){var ret = []; var l = [4, 2]; '
'var log = function(e,i,a){ret.push([e,i,a]);}; '
'l.forEach(log); '
'return [ret.length, ret[0][0], ret[1][1], ret[0][2]]}',
[2, 4, 1, [4, 2]])
self._test('function f(){var ret = []; var l = [4, 2]; '
'var log = function(e,i,a){this.push([e,i,a]);}; '
'l.forEach(log, ret); '
'return [ret.length, ret[0][0], ret[1][1], ret[0][2]]}',
[2, 4, 1, [4, 2]])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
@ -12,6 +13,7 @@ import re
import string import string
from youtube_dl.compat import ( from youtube_dl.compat import (
compat_contextlib_suppress,
compat_open as open, compat_open as open,
compat_str, compat_str,
compat_urlretrieve, compat_urlretrieve,
@ -50,23 +52,33 @@ _SIG_TESTS = [
( (
'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js',
84, 84,
'123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>' '123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>',
), ),
( (
'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl9FYC6l.js', 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl9FYC6l.js',
83, 83,
'123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F' '123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F',
), ),
( (
'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflCGk6yw/html5player.js', 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflCGk6yw/html5player.js',
'4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288', '4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288',
'82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B' '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B',
), ),
( (
'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js',
'312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12',
'112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3',
) ),
(
'https://www.youtube.com/s/player/6ed0d907/player_ias.vflset/en_US/base.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'AOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL2QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0',
),
(
'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'MyOSJXtKI3m-uME_jv7-pT12gOFC02RFkGoqWpzE0Cs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
),
] ]
_NSIG_TESTS = [ _NSIG_TESTS = [
@ -142,6 +154,10 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js',
'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ', 'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ',
), ),
(
'https://www.youtube.com/s/player/7a062b77/player_ias.vflset/en_US/base.js',
'NRcE3y3mVtm_cV-W', 'VbsCYUATvqlt5w',
),
( (
'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js',
'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A', 'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A',
@ -154,6 +170,10 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js',
'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ', 'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ',
), ),
(
'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js',
'1wWCVpRR96eAmMI87L', 'KSkWAVv1ZQxC3A',
),
( (
'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js',
'_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ', '_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ',
@ -182,6 +202,14 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/b12cc44b/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/b12cc44b/player_ias.vflset/en_US/base.js',
'keLa5R2U00sR9SQK', 'N1OGyujjEwMnLw', 'keLa5R2U00sR9SQK', 'N1OGyujjEwMnLw',
), ),
(
'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js',
'gK15nzVyaXE9RsMP3z', 'ZFFWFLPWx9DEgQ',
),
(
'https://www.youtube.com/s/player/f8f53e1a/player_ias.vflset/en_US/base.js',
'VTQOUOv0mCIeJ7i8kZB', 'kcfD8wy0sNLyNQ',
),
] ]
@ -216,11 +244,9 @@ class TestSignature(unittest.TestCase):
os.mkdir(self.TESTDATA_DIR) os.mkdir(self.TESTDATA_DIR)
def tearDown(self): def tearDown(self):
try: with compat_contextlib_suppress(OSError):
for f in os.listdir(self.TESTDATA_DIR): for f in os.listdir(self.TESTDATA_DIR):
os.remove(f) os.remove(f)
except OSError:
pass
def t_factory(name, sig_func, url_pattern): def t_factory(name, sig_func, url_pattern):
@ -254,11 +280,12 @@ def signature(jscode, sig_input):
def n_sig(jscode, sig_input): def n_sig(jscode, sig_input):
funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode)
return JSInterpreter(jscode).call_function(funcname, sig_input) return JSInterpreter(jscode).call_function(
funcname, sig_input, _ytdl_do_not_return=sig_input)
make_sig_test = t_factory( make_sig_test = t_factory(
'signature', signature, re.compile(r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$')) 'signature', signature, re.compile(r'.*(?:-|/player/)(?P<id>[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$'))
for test_spec in _SIG_TESTS: for test_spec in _SIG_TESTS:
make_sig_test(*test_spec) make_sig_test(*test_spec)

View File

@ -1579,19 +1579,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self.to_screen('Extracted signature function:\n' + code) self.to_screen('Extracted signature function:\n' + code)
def _parse_sig_js(self, jscode): def _parse_sig_js(self, jscode):
# Examples where `sig` is funcname:
# sig=function(a){a=a.split(""); ... ;return a.join("")};
# ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a};
# {var l=f,m=h.sp,n=sig(decodeURIComponent(h.s));l.set(m,encodeURIComponent(n))}
# sig=function(J){J=J.split(""); ... ;return J.join("")};
# ;N&&(N=sig(decodeURIComponent(N)),J.set(R,encodeURIComponent(N)));return J};
# {var H=u,k=f.sp,v=sig(decodeURIComponent(f.s));H.set(k,encodeURIComponent(v))}
funcname = self._search_regex( funcname = self._search_regex(
(r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', (r'\b(?P<var>[a-zA-Z0-9$]+)&&\((?P=var)=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\((?P=var)\)\)',
r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*(?P<arg>[a-zA-Z0-9$]+)\s*\)\s*{\s*(?P=arg)\s*=\s*(?P=arg)\.split\(\s*""\s*\)\s*;\s*[^}]+;\s*return\s+(?P=arg)\.join\(\s*""\s*\)',
r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?',
# Old patterns
r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)',
r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)',
r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?',
r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
# Obsolete patterns # Obsolete patterns
r'("|\')signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'("|\')signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
jscode, 'Initial JS player signature function name', group='sig') jscode, 'Initial JS player signature function name', group='sig')
@ -1658,36 +1665,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_n_function_name(self, jscode): def _extract_n_function_name(self, jscode):
func_name, idx = self._search_regex( func_name, idx = self._search_regex(
# new: (b=String.fromCharCode(110),c=a.get(b))&&c=nfunc[idx](c) # (y=NuD(),Mw(k),q=k.Z[y]||null)&&(q=narray[idx](q),k.set(y,q),k.V||NuD(''))}};
# or: (b="nn"[+a.D],c=a.get(b))&&(c=nfunc[idx](c) # (R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}};
# or: (PL(a),b=a.j.n||null)&&(b=nfunc[idx](b) # or: (b=String.fromCharCode(110),c=a.get(b))&&c=narray[idx](c)
# or: (b="nn"[+a.D],c=a.get(b))&&(c=narray[idx](c)
# or: (PL(a),b=a.j.n||null)&&(b=narray[idx](b)
# or: (b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") # or: (b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("")
# old: (b=a.get("n"))&&(b=nfunc[idx](b)(?P<c>[a-z])\s*=\s*[a-z]\s* # old: (b=a.get("n"))&&(b=narray[idx](b)(?P<c>[a-z])\s*=\s*[a-z]\s*
# older: (b=a.get("n"))&&(b=nfunc(b) # older: (b=a.get("n"))&&(b=nfunc(b)
r'''(?x) r'''(?x)
\((?:[\w$()\s]+,)*?\s* # ( # (expr, ...,
(?P<b>[a-z])\s*=\s* # b= \((?:(?:\s*[\w$]+\s*=)?(?:[\w$"+\.\s(\[]+(?:[)\]]\s*)?),)*
(?: # b=...
(?: # expect ,c=a.get(b) (etc) (?P<b>[\w$]+)\s*=\s*(?!(?P=b)[^\w$])[\w$]+\s*(?:(?:
String\s*\.\s*fromCharCode\s*\(\s*110\s*\)| \.\s*[\w$]+ |
"n+"\[\s*\+?s*[\w$.]+\s*] \[\s*[\w$]+\s*\] |
)\s*(?:,[\w$()\s]+(?=,))*| \.\s*get\s*\(\s*[\w$"]+\s*\)
(?P<old>[\w$]+) # a (old[er]) )\s*){,2}(?:\s*\|\|\s*null(?=\s*\)))?\s*
)\s* \)\s*&&\s*\( # ...)&&(
(?(old) # b = nfunc, b = narray[idx]
# b.get("n") (?P=b)\s*=\s*(?P<nfunc>[\w$]+)\s*
(?:\.\s*[\w$]+\s*|\[\s*[\w$]+\s*]\s*)*? (?:\[\s*(?P<idx>[\w$]+)\s*\]\s*)?
(?:\.\s*n|\[\s*"n"\s*]|\.\s*get\s*\(\s*"n"\s*\)) # (...)
| # ,c=a.get(b) \(\s*[\w$]+\s*\)
,\s*(?P<c>[a-z])\s*=\s*[a-z]\s*
(?:\.\s*[\w$]+\s*|\[\s*[\w$]+\s*]\s*)*?
(?:\[\s*(?P=b)\s*]|\.\s*get\s*\(\s*(?P=b)\s*\))
)
# interstitial junk
\s*(?:\|\|\s*null\s*)?(?:\)\s*)?&&\s*(?:\(\s*)?
(?(c)(?P=c)|(?P=b))\s*=\s* # [c|b]=
# nfunc|nfunc[idx]
(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\s*\[(?P<idx>\d+)\])?\s*\(\s*[\w$]+\s*\)
''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'), ''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'),
default=(None, None)) default=(None, None))
# thx bashonly: yt-dlp/yt-dlp/pull/10611 # thx bashonly: yt-dlp/yt-dlp/pull/10611
@ -1697,15 +1697,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'''(?xs) r'''(?xs)
(?:(?<=[^\w$])|^) # instead of \b, which ignores $ (?:(?<=[^\w$])|^) # instead of \b, which ignores $
(?P<name>(?!\d)[a-zA-Z\d_$]+)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\) (?P<name>(?!\d)[a-zA-Z\d_$]+)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\)
\s*\{(?:(?!};).)+?["']enhanced_except_ \s*\{(?:(?!};).)+?(?:
["']enhanced_except_ |
return\s*(?P<q>"|')[a-zA-Z\d-]+_w8_(?P=q)\s*\+\s*[a-zA-Z0-9_$]+
)
''', jscode, 'Initial JS player n function name', group='name') ''', jscode, 'Initial JS player n function name', group='name')
if not idx: if not idx:
return func_name return func_name
return self._parse_json(self._search_regex( return self._search_json(
r'var\s+{0}\s*=\s*(\[.+?\])\s*[,;]'.format(re.escape(func_name)), jscode, r'var\s+{0}\s*='.format(re.escape(func_name)), jscode,
'Initial JS player n function list ({0}.{1})'.format(func_name, idx)), 'Initial JS player n function list ({0}.{1})'.format(func_name, idx),
func_name, transform_source=js_to_json)[int(idx)] func_name, contains_pattern=r'\[[\s\S]+\]', end_pattern='[,;]',
transform_source=js_to_json)[int(idx)]
def _extract_n_function_code(self, video_id, player_url): def _extract_n_function_code(self, video_id, player_url):
player_id = self._extract_player_info(player_url) player_id = self._extract_player_info(player_url)
@ -1728,13 +1732,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def extract_nsig(s): def extract_nsig(s):
try: try:
ret = func([s]) ret = func([s], kwargs={'_ytdl_do_not_return': s})
except JSInterpreter.Exception: except JSInterpreter.Exception:
raise raise
except Exception as e: except Exception as e:
raise JSInterpreter.Exception(traceback.format_exc(), cause=e) raise JSInterpreter.Exception(traceback.format_exc(), cause=e)
if ret.startswith('enhanced_except_'): if ret.startswith('enhanced_except_') or ret.endswith(s):
raise JSInterpreter.Exception('Signature function returned an exception') raise JSInterpreter.Exception('Signature function returned an exception')
return ret return ret

View File

@ -368,7 +368,7 @@ class Debugger(object):
raise raise
if cls.ENABLED and stmt.strip(): if cls.ENABLED and stmt.strip():
if should_ret or repr(ret) != stmt: if should_ret or repr(ret) != stmt:
cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion) cls.write(['->', '=>'][bool(should_ret)], repr(ret), '<-|', stmt, level=allow_recursion)
return ret, should_ret return ret, should_ret
return interpret_statement return interpret_statement
@ -397,6 +397,9 @@ class JSInterpreter(object):
RE_FLAGS = { RE_FLAGS = {
# special knowledge: Python's re flags are bitmask values, current max 128 # special knowledge: Python's re flags are bitmask values, current max 128
# invent new bitmask values well above that for literal parsing # invent new bitmask values well above that for literal parsing
# JS 'u' flag is effectively always set (surrogate pairs aren't seen),
# but \u{...} and \p{...} escapes aren't handled; no additional JS 'v'
# features are supported
# TODO: execute matches with these flags (remaining: d, y) # TODO: execute matches with these flags (remaining: d, y)
'd': 1024, # Generate indices for substring matches 'd': 1024, # Generate indices for substring matches
'g': 2048, # Global search 'g': 2048, # Global search
@ -404,6 +407,7 @@ class JSInterpreter(object):
'm': re.M, # Multi-line search 'm': re.M, # Multi-line search
's': re.S, # Allows . to match newline characters 's': re.S, # Allows . to match newline characters
'u': re.U, # Treat a pattern as a sequence of unicode code points 'u': re.U, # Treat a pattern as a sequence of unicode code points
'v': re.U, # Like 'u' with extended character class and \p{} syntax
'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string
} }
@ -484,9 +488,18 @@ class JSInterpreter(object):
skipping = 0 skipping = 0
if skip_delims: if skip_delims:
skip_delims = variadic(skip_delims) skip_delims = variadic(skip_delims)
skip_txt = None
for idx, char in enumerate(expr): for idx, char in enumerate(expr):
if skip_txt and idx <= skip_txt[1]:
continue
paren_delta = 0 paren_delta = 0
if not in_quote: if not in_quote:
if char == '/' and expr[idx:idx + 2] == '/*':
# skip a comment
skip_txt = expr[idx:].find('*/', 2)
skip_txt = [idx, idx + skip_txt + 1] if skip_txt >= 2 else None
if skip_txt:
continue
if char in _MATCHING_PARENS: if char in _MATCHING_PARENS:
counters[_MATCHING_PARENS[char]] += 1 counters[_MATCHING_PARENS[char]] += 1
paren_delta = 1 paren_delta = 1
@ -519,12 +532,19 @@ class JSInterpreter(object):
if pos < delim_len: if pos < delim_len:
pos += 1 pos += 1
continue continue
yield expr[start: idx - delim_len] if skip_txt and skip_txt[0] >= start and skip_txt[1] <= idx - delim_len:
yield expr[start:skip_txt[0]] + expr[skip_txt[1] + 1: idx - delim_len]
else:
yield expr[start: idx - delim_len]
skip_txt = None
start, pos = idx + 1, 0 start, pos = idx + 1, 0
splits += 1 splits += 1
if max_split and splits >= max_split: if max_split and splits >= max_split:
break break
yield expr[start:] if skip_txt and skip_txt[0] >= start:
yield expr[start:skip_txt[0]] + expr[skip_txt[1] + 1:]
else:
yield expr[start:]
@classmethod @classmethod
def _separate_at_paren(cls, expr, delim=None): def _separate_at_paren(cls, expr, delim=None):
@ -583,7 +603,7 @@ class JSInterpreter(object):
# used below # used below
_VAR_RET_THROW_RE = re.compile(r'''(?x) _VAR_RET_THROW_RE = re.compile(r'''(?x)
(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["'])|$)|(?P<throw>throw\s+) (?:(?P<var>var|const|let)\s+|(?P<ret>return)(?:\s+|(?=["'])|$)|(?P<throw>throw)\s+)
''') ''')
_COMPOUND_RE = re.compile(r'''(?x) _COMPOUND_RE = re.compile(r'''(?x)
(?P<try>try)\s*\{| (?P<try>try)\s*\{|
@ -663,7 +683,7 @@ class JSInterpreter(object):
expr = stmt[len(m.group(0)):].strip() expr = stmt[len(m.group(0)):].strip()
if m.group('throw'): if m.group('throw'):
raise JS_Throw(self.interpret_expression(expr, local_vars, allow_recursion)) raise JS_Throw(self.interpret_expression(expr, local_vars, allow_recursion))
should_return = not m.group('var') should_return = 'return' if m.group('ret') else False
if not expr: if not expr:
return None, should_return return None, should_return
@ -948,14 +968,20 @@ class JSInterpreter(object):
return _Infinity, should_return return _Infinity, should_return
elif md.get('return'): elif md.get('return'):
return local_vars[m.group('name')], should_return ret = local_vars[m.group('name')]
# challenge may try to force returning the original value
# use an optional internal var to block this
if should_return == 'return':
if '_ytdl_do_not_return' not in local_vars:
return ret, True
return (ret, True) if ret != local_vars['_ytdl_do_not_return'] else (ret, False)
else:
return ret, should_return
try: with compat_contextlib_suppress(ValueError):
ret = json.loads(js_to_json(expr)) # strict=True) ret = json.loads(js_to_json(expr)) # strict=True)
if not md.get('attribute'): if not md.get('attribute'):
return ret, should_return return ret, should_return
except ValueError:
pass
if md.get('indexing'): if md.get('indexing'):
val = local_vars[m.group('in')] val = local_vars[m.group('in')]
@ -1047,13 +1073,47 @@ class JSInterpreter(object):
raise self.Exception('Unsupported Math method ' + member, expr=expr) raise self.Exception('Unsupported Math method ' + member, expr=expr)
if member == 'split': if member == 'split':
assertion(argvals, 'takes one or more arguments') assertion(len(argvals) <= 2, 'takes at most two arguments')
assertion(len(argvals) == 1, 'with limit argument is not implemented') if len(argvals) > 1:
return obj.split(argvals[0]) if argvals[0] else list(obj) limit = argvals[1]
assertion(isinstance(limit, int) and limit >= 0, 'integer limit >= 0')
if limit == 0:
return []
else:
limit = 0
if len(argvals) == 0:
argvals = [JS_Undefined]
elif isinstance(argvals[0], self.JS_RegExp):
# avoid re.split(), similar but not enough
def where():
for m in argvals[0].finditer(obj):
yield m.span(0)
yield (None, None)
def splits(limit=limit):
i = 0
for j, jj in where():
if j == jj == 0:
continue
if j is None and i >= len(obj):
break
yield obj[i:j]
if jj is None or limit == 1:
break
limit -= 1
i = jj
return list(splits())
return (
obj.split(argvals[0], limit - 1) if argvals[0] and argvals[0] != JS_Undefined
else list(obj)[:limit or None])
elif member == 'join': elif member == 'join':
assertion(isinstance(obj, list), 'must be applied on a list') assertion(isinstance(obj, list), 'must be applied on a list')
assertion(len(argvals) == 1, 'takes exactly one argument') assertion(len(argvals) <= 1, 'takes at most one argument')
return argvals[0].join(obj) return (',' if len(argvals) == 0 else argvals[0]).join(
('' if x in (None, JS_Undefined) else _js_toString(x))
for x in obj)
elif member == 'reverse': elif member == 'reverse':
assertion(not argvals, 'does not take any arguments') assertion(not argvals, 'does not take any arguments')
obj.reverse() obj.reverse()
@ -1075,37 +1135,31 @@ class JSInterpreter(object):
index, how_many = map(int, (argvals + [len(obj)])[:2]) index, how_many = map(int, (argvals + [len(obj)])[:2])
if index < 0: if index < 0:
index += len(obj) index += len(obj)
add_items = argvals[2:] res = [obj.pop(index)
res = [] for _ in range(index, min(index + how_many, len(obj)))]
for _ in range(index, min(index + how_many, len(obj))): obj[index:index] = argvals[2:]
res.append(obj.pop(index))
for i, item in enumerate(add_items):
obj.insert(index + i, item)
return res return res
elif member == 'unshift': elif member in ('shift', 'pop'):
assertion(isinstance(obj, list), 'must be applied on a list')
assertion(argvals, 'takes one or more arguments')
for item in reversed(argvals):
obj.insert(0, item)
return obj
elif member == 'pop':
assertion(isinstance(obj, list), 'must be applied on a list') assertion(isinstance(obj, list), 'must be applied on a list')
assertion(not argvals, 'does not take any arguments') assertion(not argvals, 'does not take any arguments')
if not obj: return obj.pop(0 if member == 'shift' else -1) if len(obj) > 0 else JS_Undefined
return elif member == 'unshift':
return obj.pop() assertion(isinstance(obj, list), 'must be applied on a list')
# not enforced: assertion(argvals, 'takes one or more arguments')
obj[0:0] = argvals
return len(obj)
elif member == 'push': elif member == 'push':
assertion(argvals, 'takes one or more arguments') # not enforced: assertion(argvals, 'takes one or more arguments')
obj.extend(argvals) obj.extend(argvals)
return obj return len(obj)
elif member == 'forEach': elif member == 'forEach':
assertion(argvals, 'takes one or more arguments') assertion(argvals, 'takes one or more arguments')
assertion(len(argvals) <= 2, 'takes at-most 2 arguments') assertion(len(argvals) <= 2, 'takes at most 2 arguments')
f, this = (argvals + [''])[:2] f, this = (argvals + [''])[:2]
return [f((item, idx, obj), {'this': this}, allow_recursion) for idx, item in enumerate(obj)] return [f((item, idx, obj), {'this': this}, allow_recursion) for idx, item in enumerate(obj)]
elif member == 'indexOf': elif member == 'indexOf':
assertion(argvals, 'takes one or more arguments') assertion(argvals, 'takes one or more arguments')
assertion(len(argvals) <= 2, 'takes at-most 2 arguments') assertion(len(argvals) <= 2, 'takes at most 2 arguments')
idx, start = (argvals + [0])[:2] idx, start = (argvals + [0])[:2]
try: try:
return obj.index(idx, start) return obj.index(idx, start)
@ -1114,7 +1168,7 @@ class JSInterpreter(object):
elif member == 'charCodeAt': elif member == 'charCodeAt':
assertion(isinstance(obj, compat_str), 'must be applied on a string') assertion(isinstance(obj, compat_str), 'must be applied on a string')
# assertion(len(argvals) == 1, 'takes exactly one argument') # but not enforced # assertion(len(argvals) == 1, 'takes exactly one argument') # but not enforced
idx = argvals[0] if isinstance(argvals[0], int) else 0 idx = argvals[0] if len(argvals) > 0 and isinstance(argvals[0], int) else 0
if idx >= len(obj): if idx >= len(obj):
return None return None
return ord(obj[idx]) return ord(obj[idx])
@ -1165,7 +1219,7 @@ class JSInterpreter(object):
yield self.interpret_expression(v, local_vars, allow_recursion) yield self.interpret_expression(v, local_vars, allow_recursion)
def extract_object(self, objname): def extract_object(self, objname):
_FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' _FUNC_NAME_RE = r'''(?:{n}|"{n}"|'{n}')'''.format(n=_NAME_RE)
obj = {} obj = {}
fields = next(filter(None, ( fields = next(filter(None, (
obj_m.group('fields') for obj_m in re.finditer( obj_m.group('fields') for obj_m in re.finditer(
@ -1224,6 +1278,7 @@ class JSInterpreter(object):
def extract_function_from_code(self, argnames, code, *global_stack): def extract_function_from_code(self, argnames, code, *global_stack):
local_vars = {} local_vars = {}
while True: while True:
mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code) mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code)
if mobj is None: if mobj is None:
@ -1234,10 +1289,11 @@ class JSInterpreter(object):
[x.strip() for x in mobj.group('args').split(',')], [x.strip() for x in mobj.group('args').split(',')],
body, local_vars, *global_stack)) body, local_vars, *global_stack))
code = code[:start] + name + remaining code = code[:start] + name + remaining
return self.build_function(argnames, code, local_vars, *global_stack) return self.build_function(argnames, code, local_vars, *global_stack)
def call_function(self, funcname, *args): def call_function(self, funcname, *args, **kw_global_vars):
return self.extract_function(funcname)(args) return self.extract_function(funcname)(args, kw_global_vars)
@classmethod @classmethod
def build_arglist(cls, arg_text): def build_arglist(cls, arg_text):
@ -1256,8 +1312,9 @@ class JSInterpreter(object):
global_stack = list(global_stack) or [{}] global_stack = list(global_stack) or [{}]
argnames = tuple(argnames) argnames = tuple(argnames)
def resf(args, kwargs={}, allow_recursion=100): def resf(args, kwargs=None, allow_recursion=100):
global_stack[0].update(zip_longest(argnames, args, fillvalue=None)) kwargs = kwargs or {}
global_stack[0].update(zip_longest(argnames, args, fillvalue=JS_Undefined))
global_stack[0].update(kwargs) global_stack[0].update(kwargs)
var_stack = LocalNameSpace(*global_stack) var_stack = LocalNameSpace(*global_stack)
ret, should_abort = self.interpret_statement(code.replace('\n', ' '), var_stack, allow_recursion - 1) ret, should_abort = self.interpret_statement(code.replace('\n', ' '), var_stack, allow_recursion - 1)