From e02b214fdff4b41719ac25737c3e928db1e6f3fe Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 7 Dec 2024 03:37:39 +0000 Subject: [PATCH 1/2] [jsinterp] Implement `typeof` operator --- test/test_jsinterp.py | 15 +++- youtube_dl/jsinterp.py | 156 ++++++++++++++++++++++++++++------------- 2 files changed, 121 insertions(+), 50 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c7a4f2cbf..d063bbd36 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -266,7 +266,20 @@ class TestJSInterpreter(unittest.TestCase): self._test('function f() { return (l=[0,1,2,3], function(a, b){return a+b})((l[1], l[2]), l[3]) }', 5) def test_void(self): - self._test('function f() { return void 42; }', None) + self._test('function f() { return void 42; }', JS_Undefined) + + def test_typeof(self): + self._test('function f() { return typeof undefined; }', 'undefined') + self._test('function f() { return typeof NaN; }', 'number') + self._test('function f() { return typeof Infinity; }', 'number') + self._test('function f() { return typeof true; }', 'boolean') + self._test('function f() { return typeof null; }', 'object') + self._test('function f() { return typeof "a string"; }', 'string') + self._test('function f() { return typeof 42; }', 'number') + self._test('function f() { return typeof 42.42; }', 'number') + self._test('function f() { var g = function(){}; return typeof g; }', 'function') + self._test('function f() { return typeof {key: "value"}; }', 'object') + # not yet implemented: Symbol, BigInt def test_return_function(self): jsi = JSInterpreter(''' diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a616ad070..2e6a3f56b 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -5,7 +5,7 @@ import json import operator import re -from functools import update_wrapper +from functools import update_wrapper, wraps from .utils import ( error_to_compat_str, @@ -23,6 +23,7 @@ from .compat import ( compat_filter as filter, compat_itertools_zip_longest as zip_longest, compat_map as map, + compat_numeric_types, compat_str, ) @@ -138,6 +139,43 @@ def _js_ternary(cndn, if_true=True, if_false=False): return if_true +def _js_unary_op(op): + + @wraps_op(op) + def wrapped(_, a): + return op(a) + + return wrapped + + +# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/typeof +def _js_typeof(expr): + try: + result = { + JS_Undefined: 'undefined', + _NaN: 'number', + _Infinity: 'number', + True: 'boolean', + False: 'boolean', + None: 'object', + }[expr] + except (TypeError, KeyError): + result = None + if result is None: + for t, n in ( + (compat_basestring, 'string'), + (compat_numeric_types, 'number'), + ): + if isinstance(expr, t): + result = n + break + else: + if callable(expr): + result = 'function' + # TODO: Symbol, BigInt + return 'object' if result is None else result + + # (op, definition) in order of binding priority, tightest first # avoid dict to maintain order # definition None => Defined in JSInterpreter._operator @@ -176,6 +214,11 @@ _SC_OPERATORS = ( ('&&', None), ) +_UNARY_OPERATORS_X = ( + ('void', _js_unary_op(lambda _: JS_Undefined)), + ('typeof', _js_unary_op(_js_typeof)), +) + _OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS)) _NAME_RE = r'[a-zA-Z_$][\w$]*' @@ -242,6 +285,7 @@ class Debugger(object): @classmethod def wrap_interpreter(cls, f): + @wraps(f) def interpret_statement(self, stmt, local_vars, allow_recursion, *args, **kwargs): if cls.ENABLED and stmt.strip(): cls.write(stmt, level=allow_recursion) @@ -347,6 +391,8 @@ class JSInterpreter(object): def __op_chars(cls): op_chars = set(';,[') for op in cls._all_operators(): + if op[0].isalpha(): + continue op_chars.update(op[0]) return op_chars @@ -425,7 +471,7 @@ class JSInterpreter(object): if not _cached: _cached.extend(itertools.chain( # Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence - _SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS)) + _SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS, _UNARY_OPERATORS_X)) return _cached def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion): @@ -479,6 +525,52 @@ class JSInterpreter(object): _FINALLY_RE = re.compile(r'finally\s*\{') _SWITCH_RE = re.compile(r'switch\s*\(') + def handle_operators(self, expr, local_vars, allow_recursion): + + for op, _ in self._all_operators(): + # hackety: have higher priority than <>, but don't confuse them + skip_delim = (op + op) if op in '<>*?' else None + if op == '?': + skip_delim = (skip_delim, '?.') + separated = list(self._separate(expr, op, skip_delims=skip_delim)) + if len(separated) < 2: + continue + + right_expr = separated.pop() + # handle operators that are both unary and binary, minimal BODMAS + if op in ('+', '-'): + # simplify/adjust consecutive instances of these operators + undone = 0 + separated = [s.strip() for s in separated] + while len(separated) > 1 and not separated[-1]: + undone += 1 + separated.pop() + if op == '-' and undone % 2 != 0: + right_expr = op + right_expr + elif op == '+': + while len(separated) > 1 and set(separated[-1]) <= self.OP_CHARS: + right_expr = separated.pop() + right_expr + if separated[-1][-1:] in self.OP_CHARS: + right_expr = separated.pop() + right_expr + # hanging op at end of left => unary + (strip) or - (push right) + left_val = separated[-1] if separated else '' + for dm_op in ('*', '%', '/', '**'): + bodmas = tuple(self._separate(left_val, dm_op, skip_delims=skip_delim)) + if len(bodmas) > 1 and not bodmas[-1].strip(): + expr = op.join(separated) + op + right_expr + if len(separated) > 1: + separated.pop() + right_expr = op.join((left_val, right_expr)) + else: + separated = [op.join((left_val, right_expr))] + right_expr = None + break + if right_expr is None: + continue + + left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion) + return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), True + @Debugger.wrap_interpreter def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: @@ -533,9 +625,15 @@ class JSInterpreter(object): else: raise self.Exception('Unsupported object {obj:.100}'.format(**locals()), expr=expr) - if expr.startswith('void '): - left = self.interpret_expression(expr[5:], local_vars, allow_recursion) - return None, should_return + for op, _ in _UNARY_OPERATORS_X: + if not expr.startswith(op): + continue + operand = expr[len(op):] + if not operand or operand[0] != ' ': + continue + op_result = self.handle_operators(expr, local_vars, allow_recursion) + if op_result: + return op_result[0], should_return if expr.startswith('{'): inner, outer = self._separate_at_paren(expr) @@ -582,7 +680,7 @@ class JSInterpreter(object): if_expr, expr = self._separate_at_paren(expr) else: # may lose ... else ... because of ll.368-374 - if_expr, expr = self._separate_at_paren(expr, delim=';') + if_expr, expr = self._separate_at_paren(' %s;' % (expr,), delim=';') else_expr = None m = re.match(r'else\s*(?P\{)?', expr) if m: @@ -790,49 +888,9 @@ class JSInterpreter(object): idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion) return self._index(val, idx), should_return - for op, _ in self._all_operators(): - # hackety: have higher priority than <>, but don't confuse them - skip_delim = (op + op) if op in '<>*?' else None - if op == '?': - skip_delim = (skip_delim, '?.') - separated = list(self._separate(expr, op, skip_delims=skip_delim)) - if len(separated) < 2: - continue - - right_expr = separated.pop() - # handle operators that are both unary and binary, minimal BODMAS - if op in ('+', '-'): - # simplify/adjust consecutive instances of these operators - undone = 0 - separated = [s.strip() for s in separated] - while len(separated) > 1 and not separated[-1]: - undone += 1 - separated.pop() - if op == '-' and undone % 2 != 0: - right_expr = op + right_expr - elif op == '+': - while len(separated) > 1 and set(separated[-1]) <= self.OP_CHARS: - right_expr = separated.pop() + right_expr - if separated[-1][-1:] in self.OP_CHARS: - right_expr = separated.pop() + right_expr - # hanging op at end of left => unary + (strip) or - (push right) - left_val = separated[-1] if separated else '' - for dm_op in ('*', '%', '/', '**'): - bodmas = tuple(self._separate(left_val, dm_op, skip_delims=skip_delim)) - if len(bodmas) > 1 and not bodmas[-1].strip(): - expr = op.join(separated) + op + right_expr - if len(separated) > 1: - separated.pop() - right_expr = op.join((left_val, right_expr)) - else: - separated = [op.join((left_val, right_expr))] - right_expr = None - break - if right_expr is None: - continue - - left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion) - return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), should_return + op_result = self.handle_operators(expr, local_vars, allow_recursion) + if op_result: + return op_result[0], should_return if md.get('attribute'): variable, member, nullish = m.group('var', 'member', 'nullish') From 938e7e199399361e982129bfb8b575e5bb33dd5e Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 7 Dec 2024 03:39:44 +0000 Subject: [PATCH 2/2] [YouTube] Handle player `3bb1f723` * fix signature code extraction * raise if n function returns input value * add new tests from yt-dlp Co-authored-by: bashonly --- test/test_youtube_signature.py | 37 +++++++++++++++++++++++++------- youtube_dl/extractor/youtube.py | 38 ++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 56e92fac5..7d1ff90ba 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -12,6 +12,7 @@ import re import string from youtube_dl.compat import ( + compat_contextlib_suppress, compat_open as open, compat_str, compat_urlretrieve, @@ -50,23 +51,33 @@ _SIG_TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', 84, - '123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>' + '123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl9FYC6l.js', 83, - '123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F' + '123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflCGk6yw/html5player.js', '4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288', - '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B' + '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', - ) + ), + ( + 'https://www.youtube.com/s/player/6ed0d907/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'AOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL2QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', + ), + ( + 'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js', + '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + 'MyOSJXtKI3m-uME_jv7-pT12gOFC02RFkGoqWpzE0Cs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', + ), ] _NSIG_TESTS = [ @@ -142,6 +153,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', 'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ', ), + ( + 'https://www.youtube.com/s/player/7a062b77/player_ias.vflset/en_US/base.js', + 'NRcE3y3mVtm_cV-W', 'VbsCYUATvqlt5w', + ), ( 'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js', 'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A', @@ -154,6 +169,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', 'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ', ), + ( + 'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js', + '1wWCVpRR96eAmMI87L', 'KSkWAVv1ZQxC3A', + ), ( 'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js', '_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ', @@ -182,6 +201,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/b12cc44b/player_ias.vflset/en_US/base.js', 'keLa5R2U00sR9SQK', 'N1OGyujjEwMnLw', ), + ( + 'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js', + 'gK15nzVyaXE9RsMP3z', 'ZFFWFLPWx9DEgQ', + ), ] @@ -216,11 +239,9 @@ class TestSignature(unittest.TestCase): os.mkdir(self.TESTDATA_DIR) def tearDown(self): - try: + with compat_contextlib_suppress(OSError): for f in os.listdir(self.TESTDATA_DIR): os.remove(f) - except OSError: - pass def t_factory(name, sig_func, url_pattern): @@ -258,7 +279,7 @@ def n_sig(jscode, sig_input): make_sig_test = t_factory( - 'signature', signature, re.compile(r'.*-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$')) + 'signature', signature, re.compile(r'.*(?:-|/player/)(?P[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$')) for test_spec in _SIG_TESTS: make_sig_test(*test_spec) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6fe520e9a..d633032ae 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1579,19 +1579,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen('Extracted signature function:\n' + code) def _parse_sig_js(self, jscode): + # Examples where `sig` is funcname: + # sig=function(a){a=a.split(""); ... ;return a.join("")}; + # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a}; + # {var l=f,m=h.sp,n=sig(decodeURIComponent(h.s));l.set(m,encodeURIComponent(n))} + # sig=function(J){J=J.split(""); ... ;return J.join("")}; + # ;N&&(N=sig(decodeURIComponent(N)),J.set(R,encodeURIComponent(N)));return J}; + # {var H=u,k=f.sp,v=sig(decodeURIComponent(f.s));H.set(k,encodeURIComponent(v))} funcname = self._search_regex( - (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + (r'\b(?P[a-zA-Z0-9$]+)&&\((?P=var)=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\((?P=var)\)\)', + r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*(?P[a-zA-Z0-9$]+)\s*\)\s*{\s*(?P=arg)\s*=\s*(?P=arg)\.split\(\s*""\s*\)\s*;\s*[^}]+;\s*return\s+(?P=arg)\.join\(\s*""\s*\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?', + # Old patterns + r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\bm=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', - r'\bc&&\(c=(?P[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', - r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?', - r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'("|\')signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') @@ -1658,6 +1665,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_n_function_name(self, jscode): func_name, idx = self._search_regex( + # (R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}}; # new: (b=String.fromCharCode(110),c=a.get(b))&&c=nfunc[idx](c) # or: (b="nn"[+a.D],c=a.get(b))&&(c=nfunc[idx](c) # or: (PL(a),b=a.j.n||null)&&(b=nfunc[idx](b) @@ -1666,7 +1674,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # older: (b=a.get("n"))&&(b=nfunc(b) r'''(?x) \((?:[\w$()\s]+,)*?\s* # ( - (?P[a-z])\s*=\s* # b= + (?P[a-zA-Z])\s*=\s* # b=, R= (?: (?: # expect ,c=a.get(b) (etc) String\s*\.\s*fromCharCode\s*\(\s*110\s*\)| @@ -1679,7 +1687,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:\.\s*[\w$]+\s*|\[\s*[\w$]+\s*]\s*)*? (?:\.\s*n|\[\s*"n"\s*]|\.\s*get\s*\(\s*"n"\s*\)) | # ,c=a.get(b) - ,\s*(?P[a-z])\s*=\s*[a-z]\s* + ,\s*(?P[a-zA-Z])\s*=\s*[a-zA-Z]\s* (?:\.\s*[\w$]+\s*|\[\s*[\w$]+\s*]\s*)*? (?:\[\s*(?P=b)\s*]|\.\s*get\s*\(\s*(?P=b)\s*\)) ) @@ -1697,15 +1705,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'''(?xs) (?:(?<=[^\w$])|^) # instead of \b, which ignores $ (?P(?!\d)[a-zA-Z\d_$]+)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\) - \s*\{(?:(?!};).)+?["']enhanced_except_ + \s*\{(?:(?!};).)+?(?: + ["']enhanced_except_ | + return\s*(?P"|')[a-zA-Z\d-]+_w8_(?P=q)\s*\+\s*[a-zA-Z0-9_$]+ + ) ''', jscode, 'Initial JS player n function name', group='name') if not idx: return func_name - return self._parse_json(self._search_regex( - r'var\s+{0}\s*=\s*(\[.+?\])\s*[,;]'.format(re.escape(func_name)), jscode, - 'Initial JS player n function list ({0}.{1})'.format(func_name, idx)), - func_name, transform_source=js_to_json)[int(idx)] + return self._search_json( + r'var\s+{0}\s*='.format(re.escape(func_name)), jscode, + 'Initial JS player n function list ({0}.{1})'.format(func_name, idx), + func_name, contains_pattern=r'\[[\s\S]+\]', end_pattern='[,;]', + transform_source=js_to_json)[int(idx)] def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) @@ -1734,7 +1746,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): except Exception as e: raise JSInterpreter.Exception(traceback.format_exc(), cause=e) - if ret.startswith('enhanced_except_'): + if ret.startswith('enhanced_except_') or ret.endswith(s): raise JSInterpreter.Exception('Signature function returned an exception') return ret