From 14ef89a8dab4f6ba6185d6f5bf0317a705d7b842 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 1 Feb 2023 09:39:49 +0530 Subject: [PATCH 001/156] Support `if` statements Fix for yt-dlp/yt_dlp#6131 Closes #31509 --- test/test_jsinterp.py | 32 ++++++++++++++++++++++++++++++++ test/test_youtube_signature.py | 4 ++++ youtube_dl/jsinterp.py | 21 ++++++++++++++++++--- 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 5121c8cf8..c47def737 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -158,6 +158,38 @@ class TestJSInterpreter(unittest.TestCase): self.assertEqual(jsi.call_function('z'), 5) self.assertEqual(jsi.call_function('y'), 2) + def test_if(self): + jsi = JSInterpreter(''' + function x() { + let a = 9; + if (0==0) {a++} + return a + }''') + self.assertEqual(jsi.call_function('x'), 10) + + jsi = JSInterpreter(''' + function x() { + if (0==0) {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + + jsi = JSInterpreter(''' + function x() { + if (0!=0) {return 1} + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + + """ # Unsupported + jsi = JSInterpreter(''' + function x() { + if (0!=0) {return 1} + else if (1==0) {return 2} + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + """ + def test_for_loop(self): # function x() { a=0; for (i=0; i-10; i++) {a++} a } jsi = JSInterpreter(''' diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 4e678cae0..ac37ffa45 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -135,6 +135,10 @@ _NSIG_TESTS = [ 'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', 'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ', ), + ( + 'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js', + 'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A', + ), ] diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 530a705b4..9a3b8d7f2 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -214,7 +214,7 @@ class JSInterpreter(object): def __init__(self, msg, *args, **kwargs): expr = kwargs.pop('expr', None) if expr is not None: - msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr) + msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100]) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) @classmethod @@ -268,7 +268,7 @@ class JSInterpreter(object): elif in_quote == '/' and char in '[]': in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' - after_op = not in_quote and (char in cls.OP_CHARS or char == '[' or (char.isspace() and after_op)) + after_op = not in_quote and (char in cls.OP_CHARS or (char.isspace() and after_op)) if char != delim[pos] or any(counters.values()) or in_quote: pos = skipping = 0 @@ -301,7 +301,7 @@ class JSInterpreter(object): separated = list(cls._separate(expr, delim, 1)) if len(separated) < 2: - raise cls.Exception('No terminating paren {delim} in {expr:.100}'.format(**locals())) + raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals())) return separated[0][1:].strip(), separated[1].strip() @staticmethod @@ -428,10 +428,25 @@ class JSInterpreter(object): m = re.match(r'''(?x) (?Ptry)\s*\{| + (?Pif)\s*\(| (?Pswitch)\s*\(| (?Pfor)\s*\( ''', expr) md = m.groupdict() if m else {} + if md.get('if'): + cndn, expr = self._separate_at_paren(expr[m.end() - 1:]) + if_expr, expr = self._separate_at_paren(expr.lstrip()) + # TODO: "else if" is not handled + else_expr = None + m = re.match(r'else\s*{', expr) + if m: + else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)) + ret, should_abort = self.interpret_statement( + if_expr if cndn else else_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + if md.get('try'): try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) err = None From 295736c9cba714fb5de7d1c3dd31d86e50091cf8 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 2 Feb 2023 14:28:32 +0000 Subject: [PATCH 002/156] [jsinterp] Improve parsing * support subset `... else if ...` * support `while` * add `RegExp` class * generalise `new` support * limited more debug strings * matching test changes --- test/test_jsinterp.py | 53 +++++++++++++- youtube_dl/jsinterp.py | 156 +++++++++++++++++++++++++++-------------- 2 files changed, 154 insertions(+), 55 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c47def737..b5962356c 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -11,8 +11,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import math import re -from youtube_dl.compat import compat_re_Pattern - from youtube_dl.jsinterp import JS_Undefined, JSInterpreter @@ -140,15 +138,23 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertTrue(math.isnan(jsi.call_function('x'))) + def test_Date(self): jsi = JSInterpreter(''' function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } ''') self.assertEqual(jsi.call_function('x'), 86000) + jsi = JSInterpreter(''' function x(dt) { return new Date(dt) - 0; } ''') self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) + # date format m/d/y + jsi = JSInterpreter(''' + function x() { return new Date('12/31/1969 18:01:26 MDT') - 0; } + ''') + self.assertEqual(jsi.call_function('x'), 86000) + def test_call(self): jsi = JSInterpreter(''' function x() { return 2; } @@ -181,6 +187,15 @@ class TestJSInterpreter(unittest.TestCase): self.assertEqual(jsi.call_function('x'), 10) """ # Unsupported + jsi = JSInterpreter(''' + function x() { + if (0!=0) return 1; + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + """ + + def test_elseif(self): jsi = JSInterpreter(''' function x() { if (0!=0) {return 1} @@ -188,6 +203,16 @@ class TestJSInterpreter(unittest.TestCase): else {return 10} }''') self.assertEqual(jsi.call_function('x'), 10) + + """ # Unsupported + jsi = JSInterpreter(''' + function x() { + if (0!=0) return 1; + else if (1==0) {return 2} + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + # etc """ def test_for_loop(self): @@ -197,6 +222,13 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('x'), 10) + def test_while_loop(self): + # function x() { a=0; while (a<10) {a++} a } + jsi = JSInterpreter(''' + function x() { a=0; while (a<10) {a++} return a } + ''') + self.assertEqual(jsi.call_function('x'), 10) + def test_switch(self): jsi = JSInterpreter(''' function x(f) { switch(f){ @@ -415,13 +447,28 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/; return a; } ''') - self.assertIsInstance(jsi.call_function('x'), compat_re_Pattern) + attrs = set(('findall', 'finditer', 'flags', 'groupindex', + 'groups', 'match', 'pattern', 'scanner', + 'search', 'split', 'sub', 'subn')) + self.assertTrue(set(dir(jsi.call_function('x'))) > attrs) jsi = JSInterpreter(''' function x() { let a=/,,[/,913,/](,)}/i; return a; } ''') self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I) + jsi = JSInterpreter(r''' + function x() { let a=[/[)\\]/]; return a[0]; } + ''') + self.assertEqual(jsi.call_function('x').pattern, r'[)\\]') + + """ # fails + jsi = JSInterpreter(r''' + function x() { let a=100; a/=/[0-9]+/.exec('divide by 20 today')[0]; } + ''') + self.assertEqual(jsi.call_function('x'), 5) + """ + def test_char_code_at(self): jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}') self.assertEqual(jsi.call_function('x', 0), 116) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 9a3b8d7f2..1e7b342ac 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -187,19 +187,6 @@ class LocalNameSpace(ChainMap): class JSInterpreter(object): __named_object_counter = 0 - _RE_FLAGS = { - # special knowledge: Python's re flags are bitmask values, current max 128 - # invent new bitmask values well above that for literal parsing - # TODO: new pattern class to execute matches with these flags - 'd': 1024, # Generate indices for substring matches - 'g': 2048, # Global search - 'i': re.I, # Case-insensitive search - 'm': re.M, # Multi-line search - 's': re.S, # Allows . to match newline characters - 'u': re.U, # Treat a pattern as a sequence of unicode code points - 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string - } - _OBJ_NAME = '__youtube_dl_jsinterp_obj' OP_CHARS = None @@ -217,9 +204,48 @@ class JSInterpreter(object): msg = '{0} in: {1!r}'.format(msg.rstrip(), expr[:100]) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) + class JS_RegExp(object): + _RE_FLAGS = { + # special knowledge: Python's re flags are bitmask values, current max 128 + # invent new bitmask values well above that for literal parsing + # TODO: new pattern class to execute matches with these flags + 'd': 1024, # Generate indices for substring matches + 'g': 2048, # Global search + 'i': re.I, # Case-insensitive search + 'm': re.M, # Multi-line search + 's': re.S, # Allows . to match newline characters + 'u': re.U, # Treat a pattern as a sequence of unicode code points + 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string + } + + def __init__(self, pattern_txt, flags=''): + if isinstance(flags, compat_str): + flags, _ = self.regex_flags(flags) + # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern + # First, avoid https://github.com/python/cpython/issues/74534 + self.__self = re.compile(pattern_txt.replace('[[', r'[\['), flags) + for name in dir(self.__self): + # Only these? Obviously __class__, __init__. + # PyPy creates a __weakref__ attribute with value None + # that can't be setattr'd but also can't need to be copied. + if name in ('__class__', '__init__', '__weakref__'): + continue + setattr(self, name, getattr(self.__self, name)) + + @classmethod + def regex_flags(cls, expr): + flags = 0 + if not expr: + return flags, expr + for idx, ch in enumerate(expr): + if ch not in cls._RE_FLAGS: + break + flags |= cls._RE_FLAGS[ch] + return flags, expr[idx + 1:] + @classmethod def __op_chars(cls): - op_chars = set(';,') + op_chars = set(';,[') for op in cls._all_operators(): for c in op[0]: op_chars.add(c) @@ -231,17 +257,6 @@ class JSInterpreter(object): namespace[name] = obj return name - @classmethod - def _regex_flags(cls, expr): - flags = 0 - if not expr: - return flags, expr - for idx, ch in enumerate(expr): - if ch not in cls._RE_FLAGS: - break - flags |= cls._RE_FLAGS[ch] - return flags, expr[idx + 1:] - @classmethod def _separate(cls, expr, delim=',', max_split=None, skip_delims=None): if not expr: @@ -328,7 +343,7 @@ class JSInterpreter(object): try: return opfunc(left_val, right_val) except Exception as e: - raise self.Exception('Failed to evaluate {left_val!r} {op} {right_val!r}'.format(**locals()), expr, cause=e) + raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e) def _index(self, obj, idx, allow_undefined=False): if idx == 'length': @@ -338,7 +353,7 @@ class JSInterpreter(object): except Exception as e: if allow_undefined: return JS_Undefined - raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e) + raise self.Exception('Cannot get index {idx:.100}'.format(**locals()), expr=repr(obj), cause=e) def _dump(self, obj, namespace): try: @@ -352,6 +367,7 @@ class JSInterpreter(object): allow_recursion -= 1 should_return = False + # fails on (eg) if (...) stmt1; else stmt2; sub_statements = list(self._separate(stmt, ';')) or [''] expr = stmt = sub_statements.pop().strip() for sub_stmt in sub_statements: @@ -371,25 +387,30 @@ class JSInterpreter(object): if expr[0] in _QUOTES: inner, outer = self._separate(expr, expr[0], 1) if expr[0] == '/': - flags, outer = self._regex_flags(outer) - inner = re.compile(inner[1:], flags=flags) # , strict=True)) + flags, outer = self.JS_RegExp.regex_flags(outer) + inner = self.JS_RegExp(inner[1:], flags=flags) else: inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) if not outer: return inner, should_return expr = self._named_object(local_vars, inner) + outer - if expr.startswith('new '): - obj = expr[4:] - if obj.startswith('Date('): - left, right = self._separate_at_paren(obj[4:]) - expr = unified_timestamp( - self.interpret_expression(left, local_vars, allow_recursion), False) + new_kw, _, obj = expr.partition('new ') + if not new_kw: + for klass, konstr in (('Date', lambda x: int(unified_timestamp(x, False) * 1000)), + ('RegExp', self.JS_RegExp), + ('Error', self.Exception)): + if not obj.startswith(klass + '('): + continue + left, right = self._separate_at_paren(obj[len(klass):]) + argvals = self.interpret_iter(left, local_vars, allow_recursion) + expr = konstr(*argvals) if not expr: - raise self.Exception('Failed to parse date {left!r}'.format(**locals()), expr=expr) - expr = self._dump(int(expr * 1000), local_vars) + right + raise self.Exception('Failed to parse {klass} {left!r:.100}'.format(**locals()), expr=expr) + expr = self._dump(expr, local_vars) + right + break else: - raise self.Exception('Unsupported object {obj}'.format(**locals()), expr=expr) + raise self.Exception('Unsupported object {obj:.100}'.format(**locals()), expr=expr) if expr.startswith('void '): left = self.interpret_expression(expr[5:], local_vars, allow_recursion) @@ -430,24 +451,45 @@ class JSInterpreter(object): (?Ptry)\s*\{| (?Pif)\s*\(| (?Pswitch)\s*\(| - (?Pfor)\s*\( + (?Pfor)\s*\(| + (?Pwhile)\s*\( ''', expr) md = m.groupdict() if m else {} if md.get('if'): cndn, expr = self._separate_at_paren(expr[m.end() - 1:]) - if_expr, expr = self._separate_at_paren(expr.lstrip()) - # TODO: "else if" is not handled + if expr.startswith('{'): + if_expr, expr = self._separate_at_paren(expr) + else: + # may lose ... else ... because of ll.368-374 + if_expr, expr = self._separate_at_paren(expr, delim=';') else_expr = None - m = re.match(r'else\s*{', expr) + m = re.match(r'else\s*(?P\{)?', expr) if m: - else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + if m.group('block'): + else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + else: + # handle subset ... else if (...) {...} else ... + # TODO: make interpret_statement do this properly, if possible + exprs = list(self._separate(expr[m.end():], delim='}', max_split=2)) + if len(exprs) > 1: + if re.match(r'\s*if\s*\(', exprs[0]) and re.match(r'\s*else\b', exprs[1]): + else_expr = exprs[0] + '}' + exprs[1] + expr = (exprs[2] + '}') if len(exprs) == 3 else None + else: + else_expr = exprs[0] + exprs.append('') + expr = '}'.join(exprs[1:]) + else: + else_expr = exprs[0] + expr = None + else_expr = else_expr.lstrip() + '}' cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)) ret, should_abort = self.interpret_statement( if_expr if cndn else else_expr, local_vars, allow_recursion) if should_abort: return ret, True - if md.get('try'): + elif md.get('try'): try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) err = None try: @@ -484,8 +526,8 @@ class JSInterpreter(object): if err: raise err - elif md.get('for'): - constructor, remaining = self._separate_at_paren(expr[m.end() - 1:]) + elif md.get('for') or md.get('while'): + init_or_cond, remaining = self._separate_at_paren(expr[m.end() - 1:]) if remaining.startswith('{'): body, expr = self._separate_at_paren(remaining) else: @@ -496,11 +538,12 @@ class JSInterpreter(object): body = 'switch(%s){%s}' % (switch_val, body) else: body, expr = remaining, '' - start, cndn, increment = self._separate(constructor, ';') - self.interpret_expression(start, local_vars, allow_recursion) - while True: - if not _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): - break + if md.get('for'): + start, cndn, increment = self._separate(init_or_cond, ';') + self.interpret_expression(start, local_vars, allow_recursion) + else: + cndn, increment = init_or_cond, None + while _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): try: ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) if should_abort: @@ -509,7 +552,8 @@ class JSInterpreter(object): break except JS_Continue: pass - self.interpret_expression(increment, local_vars, allow_recursion) + if increment: + self.interpret_expression(increment, local_vars, allow_recursion) elif md.get('switch'): switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:]) @@ -764,6 +808,10 @@ class JSInterpreter(object): if idx >= len(obj): return None return ord(obj[idx]) + elif member == 'replace': + assertion(isinstance(obj, compat_str), 'must be applied on a string') + assertion(len(argvals) == 2, 'takes exactly two arguments') + return re.sub(argvals[0], argvals[1], obj) idx = int(member) if isinstance(obj, list) else member return obj[idx](argvals, allow_recursion=allow_recursion) @@ -795,6 +843,10 @@ class JSInterpreter(object): raise self.Exception('Cannot return from an expression', expr) return ret + def interpret_iter(self, list_txt, local_vars, allow_recursion): + for v in self._separate(list_txt): + yield self.interpret_expression(v, local_vars, allow_recursion) + def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} From 37cbdfa0e7c9d00d450af32dc9cdaf93cbfc4576 Mon Sep 17 00:00:00 2001 From: Brian Marks Date: Thu, 2 Feb 2023 11:58:21 -0500 Subject: [PATCH 003/156] [americastestkitchen] Add support for downloading entire series (#31493) Also * support new sites and URL patterns * back-port from yt-dlp Co-authored-by: dirkf --- youtube_dl/extractor/americastestkitchen.py | 115 +++++++++++++++----- 1 file changed, 88 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index be960c0f9..08d3604e9 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -15,7 +15,7 @@ from ..utils import ( class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?Pepisode|videos)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', @@ -23,15 +23,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5b400b9ee338f922cb06450c', 'title': 'Japanese Suppers', 'ext': 'mp4', + 'display_id': 'weeknight-japanese-suppers', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', - 'thumbnail': r're:^https?://', - 'timestamp': 1523318400, - 'upload_date': '20180410', - 'release_date': '20180410', + 'timestamp': 1523304000, + 'upload_date': '20180409', + 'release_date': '20180409', 'series': "America's Test Kitchen", + 'season': 'Season 18', 'season_number': 18, 'episode': 'Japanese Suppers', 'episode_number': 15, + 'duration': 1376, + 'thumbnail': r're:^https?://', + 'average_rating': 0, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -44,15 +49,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5fbe8c61bda2010001c6763b', 'title': 'Simple Chicken Dinner', 'ext': 'mp4', + 'display_id': 'atktv_2103_simple-chicken-dinner_full-episode_web-mp4', 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', - 'thumbnail': r're:^https?://', - 'timestamp': 1610755200, - 'upload_date': '20210116', - 'release_date': '20210116', + 'timestamp': 1610737200, + 'upload_date': '20210115', + 'release_date': '20210115', 'series': "America's Test Kitchen", + 'season': 'Season 21', 'season_number': 21, 'episode': 'Simple Chicken Dinner', 'episode_number': 3, + 'duration': 1397, + 'thumbnail': r're:^https?://', + 'view_count': int, + 'average_rating': 0, }, 'params': { 'skip_download': True, @@ -60,6 +70,12 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cookscountry/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', + 'only_matching': True, }, { 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', 'only_matching': True, @@ -94,7 +110,7 @@ class AmericasTestKitchenIE(InfoExtractor): class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|cookscountry)\.com/episodes/browse/season_(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|(?Pcooks(?:country|illustrated)))\.com(?:(?:/(?Pcooks(?:country|illustrated)))?(?:/?$|(?\d+)))' _TESTS = [{ # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', @@ -105,48 +121,93 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): 'playlist_count': 13, }, { # Cooks Country Season - 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'url': 'https://www.americastestkitchen.com/cookscountry/episodes/browse/season_12', 'info_dict': { 'id': 'season_12', 'title': 'Season 12', }, 'playlist_count': 13, + }, { + # America's Test Kitchen Series + 'url': 'https://www.americastestkitchen.com/', + 'info_dict': { + 'id': 'americastestkitchen', + 'title': 'America\'s Test Kitchen', + }, + 'playlist_count': 558, + }, { + # Cooks Country Series + 'url': 'https://www.americastestkitchen.com/cookscountry', + 'info_dict': { + 'id': 'cookscountry', + 'title': 'Cook\'s Country', + }, + 'playlist_count': 199, + }, { + 'url': 'https://www.americastestkitchen.com/cookscountry/', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com', + 'only_matching': True, }] def _real_extract(self, url): - show_name, season_number = re.match(self._VALID_URL, url).groups() - season_number = int(season_number) + match = re.match(self._VALID_URL, url).groupdict() + show = match.get('show2') + show_path = ('/' + show) if show else '' + show = show or match['show'] + season_number = int_or_none(match.get('season')) - slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + slug, title = { + 'americastestkitchen': ('atk', 'America\'s Test Kitchen'), + 'cookscountry': ('cco', 'Cook\'s Country'), + 'cooksillustrated': ('cio', 'Cook\'s Illustrated'), + }[show] - season = 'Season %d' % season_number + facet_filters = [ + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ] + + if season_number: + playlist_id = 'season_%d' % season_number + playlist_title = 'Season %d' % season_number + facet_filters.append('search_season_list:' + playlist_title) + else: + playlist_id = show + playlist_title = title season_search = self._download_json( 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, - season, headers={ - 'Origin': 'https://www.%s.com' % show_name, + playlist_id, headers={ + 'Origin': 'https://www.americastestkitchen.com', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ - 'facetFilters': json.dumps([ - 'search_season_list:' + season, - 'search_document_klass:episode', - 'search_show_slug:' + slug, - ]), - 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'facetFilters': json.dumps(facet_filters), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug, 'attributesToHighlight': '', 'hitsPerPage': 1000, }) def entries(): for episode in (season_search.get('hits') or []): - search_url = episode.get('search_url') + search_url = episode.get('search_url') # always formatted like '/episode/123-title-of-episode' if not search_url: continue yield { '_type': 'url', - 'url': 'https://www.%s.com%s' % (show_name, search_url), - 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'url': 'https://www.americastestkitchen.com%s%s' % (show_path, search_url), + 'id': try_get(episode, lambda e: e['objectID'].rsplit('_', 1)[-1]), 'title': episode.get('title'), 'description': episode.get('description'), 'timestamp': unified_timestamp(episode.get('search_document_date')), @@ -156,4 +217,4 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): } return self.playlist_result( - entries(), 'season_%d' % season_number, season) + entries(), playlist_id, playlist_title) From 297fbff23b347612a5f6002b40adba9dfad85413 Mon Sep 17 00:00:00 2001 From: Rodrigo Dias Date: Thu, 2 Feb 2023 17:10:09 +0000 Subject: [PATCH 004/156] [doc] Fixed typo appearing to promise an example (#31489) Resolves #31425 Co-authored-by: dirkf --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cd888c731..6e07ddb1c 100644 --- a/README.md +++ b/README.md @@ -632,7 +632,7 @@ To use percent literals in an output template use `%%`. To output to stdout use The current default template is `%(title)s-%(id)s.%(ext)s`. -In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title: +In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title. #### Output template and Windows batch files From 807e593a32a1ace8fa0be8129fc5071d86516c99 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Thu, 2 Feb 2023 13:12:36 -0400 Subject: [PATCH 005/156] [cammodels] fix and improve extractor (#31453) Co-authored-by: dirkf --- youtube_dl/extractor/cammodels.py | 34 +++++++++---------------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 1eb81b75e..d2e860b24 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - ExtractorError, int_or_none, url_or_none, ) @@ -20,32 +19,11 @@ class CamModelsIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage( - url, user_id, headers=self.geo_verification_headers()) - - manifest_root = self._html_search_regex( - r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) - - if not manifest_root: - ERRORS = ( - ("I'm offline, but let's stay connected", 'This user is currently offline'), - ('in a private show', 'This user is in a private show'), - ('is currently performing LIVE', 'This model is currently performing live'), - ) - for pattern, message in ERRORS: - if pattern in webpage: - error = message - expected = True - break - else: - error = 'Unable to find manifest URL root' - expected = False - raise ExtractorError(error, expected=expected) - manifest = self._download_json( - '%s%s.json' % (manifest_root, user_id), user_id) + 'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id) formats = [] + thumbnails = [] for format_id, format_dict in manifest['formats'].items(): if not isinstance(format_dict, dict): continue @@ -85,6 +63,13 @@ class CamModelsIE(InfoExtractor): 'preference': -1, }) else: + if format_id == 'jpeg': + thumbnails.append({ + 'url': f['url'], + 'width': f['width'], + 'height': f['height'], + 'format_id': f['format_id'], + }) continue formats.append(f) self._sort_formats(formats) @@ -92,6 +77,7 @@ class CamModelsIE(InfoExtractor): return { 'id': user_id, 'title': self._live_title(user_id), + 'thumbnails': thumbnails, 'is_live': True, 'formats': formats, 'age_limit': 18 From e9611a2a3603ee201d0c1ba99e8bfd8ec1e697cd Mon Sep 17 00:00:00 2001 From: Leon Etienne <40911701+Leonetienne@users.noreply.github.com> Date: Thu, 2 Feb 2023 18:13:39 +0100 Subject: [PATCH 006/156] [pr0gramm] implement InfoExtractor, Resolves #31433 (#31434) * [pr0gramm] implement infoextractor * [pr0gramm] remove misplaced comment, uncapture regex-group * [pr0gramm]: specify utf-8 coding * [pr0gramm]: add trailing comma to lists for maintainability * [pr0gramm]: ie only sets upload_date attribute * [pr0gramm]: add video_id to title * [pr0gramm]: more forgiving _valid_url regex * [pr0gramm]: add uploader to title, if set * Discriminate URL pattern --------- Co-authored-by: dirkf --- youtube_dl/extractor/extractors.py | 4 ++ youtube_dl/extractor/pr0gramm.py | 105 +++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 youtube_dl/extractor/pr0gramm.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 947cbe8fd..cf0388ed2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1667,3 +1667,7 @@ from .zingmp3 import ( ) from .zoom import ZoomIE from .zype import ZypeIE +from .pr0gramm import ( + Pr0grammIE, + Pr0grammStaticIE, +) diff --git a/youtube_dl/extractor/pr0gramm.py b/youtube_dl/extractor/pr0gramm.py new file mode 100644 index 000000000..b68224fd5 --- /dev/null +++ b/youtube_dl/extractor/pr0gramm.py @@ -0,0 +1,105 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re +from ..utils import ( + merge_dicts, +) + + +class Pr0grammStaticIE(InfoExtractor): + # Possible urls: + # https://pr0gramm.com/static/5466437 + _VALID_URL = r'https?://pr0gramm\.com/static/(?P[0-9]+)' + _TEST = { + 'url': 'https://pr0gramm.com/static/5466437', + 'md5': '52fa540d70d3edc286846f8ca85938aa', + 'info_dict': { + 'id': '5466437', + 'ext': 'mp4', + 'title': 'pr0gramm-5466437 by g11st', + 'uploader': 'g11st', + 'upload_date': '20221221', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Fetch media sources + entries = self._parse_html5_media_entries(url, webpage, video_id) + media_info = entries[0] + + # this raises if there are no formats + self._sort_formats(media_info.get('formats') or []) + + # Fetch author + uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader') + + # Fetch approx upload timestamp from filename + # Have None-defaults in case the extraction fails + uploadDay = None + uploadMon = None + uploadYear = None + uploadTimestr = None + # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4) + m = re.search(r'//img\.pr0gramm\.com/(?P[\d]+)/(?P[\d]+)/(?P[\d]+)/\w+\.\w{,4}', webpage) + + if (m): + # Up to a day of accuracy should suffice... + uploadDay = m.groupdict().get('day') + uploadMon = m.groupdict().get('mon') + uploadYear = m.groupdict().get('year') + uploadTimestr = uploadYear + uploadMon + uploadDay + + return merge_dicts({ + 'id': video_id, + 'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''), + 'uploader': uploader, + 'upload_date': uploadTimestr + }, media_info) + + +# This extractor is for the primary url (used for sharing, and appears in the +# location bar) Since this page loads the DOM via JS, yt-dl can't find any +# video information here. So let's redirect to a compatibility version of +# the site, which does contain the