From 64d6dd64c8b7a35a87655d27fc83f2e98ef6ce13 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 23 Apr 2023 22:58:35 +0100 Subject: [PATCH 1/5] [YouTube] Support Releases tab --- youtube_dl/extractor/youtube.py | 114 +++++++++++++++++++------------- youtube_dl/utils.py | 9 ++- 2 files changed, 74 insertions(+), 49 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 80fff7ada..0411c49f1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -31,6 +31,7 @@ from ..utils import ( extract_attributes, get_element_by_attribute, int_or_none, + join_nonempty, js_to_json, LazyList, merge_dicts, @@ -45,6 +46,7 @@ from ..utils import ( str_to_int, traverse_obj, try_get, + txt_or_none, unescapeHTML, unified_strdate, unsmuggle_url, @@ -2608,6 +2610,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'uploader_id': '@lexwill718', }, 'playlist_mincount': 75, + }, { + # Releases tab + 'url': 'https://www.youtube.com/@daftpunk/releases', + 'info_dict': { + 'id': 'UC_kRDKYrUlrbtrSiyu5Tflg', + 'title': 'Daft Punk - Releases', + 'description': 'Daft Punk (1993 - 2021) - Official YouTube Channel', + 'uploader_id': '@daftpunk', + 'uploader': 'Daft Punk', + }, + 'playlist_mincount': 36, }, { 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'only_matching': True, @@ -2822,6 +2835,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continue return renderer + @staticmethod + def _get_text(r, k): + return traverse_obj( + r, (k, 'runs', 0, 'text'), (k, 'simpleText'), + expected_type=txt_or_none) + def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: if not isinstance(item, dict): @@ -2829,9 +2848,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): renderer = self._extract_grid_item_renderer(item) if not isinstance(renderer, dict): continue - title = try_get( - renderer, (lambda x: x['title']['runs'][0]['text'], - lambda x: x['title']['simpleText']), compat_str) + title = 
self._get_text(renderer, 'title') # playlist playlist_id = renderer.get('playlistId') if playlist_id: @@ -2848,8 +2865,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): # channel channel_id = renderer.get('channelId') if channel_id: - title = try_get( - renderer, lambda x: x['title']['simpleText'], compat_str) + title = self._get_text(renderer, 'title') yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) @@ -2958,15 +2974,26 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _rich_grid_entries(self, contents): for content in contents: - video_renderer = try_get( - content, - (lambda x: x['richItemRenderer']['content']['videoRenderer'], - lambda x: x['richItemRenderer']['content']['reelItemRenderer']), - dict) + content = traverse_obj( + content, ('richItemRenderer', 'content'), + expected_type=dict) or {} + video_renderer = traverse_obj( + content, 'videoRenderer', 'reelItemRenderer', + expected_type=dict) if video_renderer: entry = self._video_entry(video_renderer) if entry: yield entry + # playlist + renderer = traverse_obj( + content, 'playlistRenderer', expected_type=dict) or {} + title = self._get_text(renderer, 'title') + playlist_id = renderer.get('playlistId') + if playlist_id: + yield self.url_result( + 'https://www.youtube.com/playlist?list=%s' % playlist_id, + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, + video_title=title) @staticmethod def _build_continuation_query(continuation, ctp=None): @@ -3071,6 +3098,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): return for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []): yield entry + continuation = self._extract_continuation(rich_grid_renderer) ytcfg = self._extract_ytcfg(item_id, webpage) @@ -3213,50 +3241,41 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): uploader['channel'] = uploader['uploader'] return uploader - @staticmethod - def _extract_alert(data): + @classmethod + def _extract_alert(cls, data): 
alerts = [] - for alert in try_get(data, lambda x: x['alerts'], list) or []: - if not isinstance(alert, dict): - continue - alert_text = try_get( - alert, lambda x: x['alertRenderer']['text'], dict) + for alert in traverse_obj(data, ('alerts', Ellipsis), expected_type=dict): + alert_text = traverse_obj( + alert, (None, lambda x: x['alertRenderer']['text']), get_all=False) if not alert_text: continue - text = try_get( - alert_text, - (lambda x: x['simpleText'], lambda x: x['runs'][0]['text']), - compat_str) + text = cls._get_text(alert_text, 'text') if text: alerts.append(text) return '\n'.join(alerts) def _extract_from_tabs(self, item_id, webpage, data, tabs): selected_tab = self._extract_selected_tab(tabs) - renderer = try_get( - data, lambda x: x['metadata']['channelMetadataRenderer'], dict) + renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), + expected_type=dict) or {} playlist_id = item_id title = description = None if renderer: - channel_title = renderer.get('title') or item_id - tab_title = selected_tab.get('title') - title = channel_title or item_id - if tab_title: - title += ' - %s' % tab_title - if selected_tab.get('expandedText'): - title += ' - %s' % selected_tab['expandedText'] - description = renderer.get('description') - playlist_id = renderer.get('externalId') + channel_title = txt_or_none(renderer.get('title')) or item_id + tab_title = txt_or_none(selected_tab.get('title')) + title = join_nonempty( + channel_title or item_id, tab_title, + txt_or_none(selected_tab.get('expandedText')), + delim=' - ') + description = txt_or_none(renderer.get('description')) + playlist_id = txt_or_none(renderer.get('externalId')) or playlist_id else: - renderer = try_get( - data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) - if renderer: - title = renderer.get('title') - else: - renderer = try_get( - data, lambda x: x['header']['hashtagHeaderRenderer'], dict) - if renderer: - title = try_get(renderer, lambda x: 
x['hashtag']['simpleText']) + renderer = traverse_obj(data, + ('metadata', 'playlistMetadataRenderer'), + ('header', 'hashtagHeaderRenderer'), + expected_type=dict) or {} + title = traverse_obj(renderer, 'title', ('hashtag', 'simpleText'), + expected_type=txt_or_none) playlist = self.playlist_result( self._entries(selected_tab, item_id, webpage), playlist_id=playlist_id, playlist_title=title, @@ -3264,15 +3283,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): return merge_dicts(playlist, self._extract_uploader(renderer, data)) def _extract_from_playlist(self, item_id, url, data, playlist): - title = playlist.get('title') or try_get( - data, lambda x: x['titleText']['simpleText'], compat_str) - playlist_id = playlist.get('playlistId') or item_id + title = traverse_obj((playlist, data), + (0, 'title'), (1, 'titleText', 'simpleText'), + expected_type=txt_or_none) + playlist_id = txt_or_none(playlist.get('playlistId')) or item_id # Inline playlist rendition continuation does not always work # at Youtube side, so delegating regular tab-based playlist URL # processing whenever possible. 
- playlist_url = urljoin(url, try_get( - playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str)) + playlist_url = urljoin(url, traverse_obj( + playlist, ('endpoint', 'commandMetadata', 'webCommandMetadata', 'url'), + expected_type=url_or_none)) if playlist_url and playlist_url != url: return self.url_result( playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d80ceb007..65ddb3b0f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3753,6 +3753,11 @@ def strip_or_none(v, default=None): return v.strip() if isinstance(v, compat_str) else default +def txt_or_none(v, default=None): + """ Combine str/strip_or_none, disallow blank value (for traverse_obj) """ + return default if v is None else (compat_str(v).strip() or default) + + def url_or_none(url): if not url or not isinstance(url, compat_str): return None @@ -4096,8 +4101,8 @@ def escape_url(url): ).geturl() -def parse_qs(url): - return compat_parse_qs(compat_urllib_parse.urlparse(url).query) +def parse_qs(url, **kwargs): + return compat_parse_qs(compat_urllib_parse.urlparse(url).query, **kwargs) def read_batch_urls(batch_fd): From 11cc3f3ad03a88d6cb1eab18a8e5dd6bf148ac54 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 11 May 2023 20:53:07 +0100 Subject: [PATCH 2/5] [utils] Fix `compiled_regex_type` in 249f2b6 --- youtube_dl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 65ddb3b0f..584581b6a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -56,6 +56,7 @@ from .compat import ( compat_kwargs, compat_os_name, compat_re_Match, + compat_re_Pattern, compat_shlex_quote, compat_str, compat_struct_pack, @@ -86,7 +87,7 @@ def register_socks_protocols(): # Unfavoured alias -compiled_regex_type = compat_re_Match +compiled_regex_type = compat_re_Pattern def random_user_agent(): From 
a85a875fef2e9b097c3f6f93f1d0cead06f84e43 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 11 May 2023 20:59:30 +0100 Subject: [PATCH 3/5] [jsinterp] Handle NaN in bitwise operators * also add _NaN * also pull function naming from yt-dlp --- test/test_jsinterp.py | 11 +++++++++++ youtube_dl/jsinterp.py | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index e121358d7..a8f312fde 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -18,6 +18,7 @@ class TestJSInterpreter(unittest.TestCase): def test_basic(self): jsi = JSInterpreter('function x(){;}') self.assertEqual(jsi.call_function('x'), None) + self.assertEqual(repr(jsi.extract_function('x')), 'F<x>') jsi = JSInterpreter('function x3(){return 42;}') self.assertEqual(jsi.call_function('x3'), 42) @@ -505,6 +506,16 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function x(){return 1236566549 << 5}') self.assertEqual(jsi.call_function('x'), 915423904) + def test_bitwise_operators_madness(self): + jsi = JSInterpreter('function x(){return null << 5}') + self.assertEqual(jsi.call_function('x'), 0) + + jsi = JSInterpreter('function x(){return undefined >> 5}') + self.assertEqual(jsi.call_function('x'), 0) + + jsi = JSInterpreter('function x(){return 42 << NaN}') + self.assertEqual(jsi.call_function('x'), 42) + def test_32066(self): jsi = JSInterpreter("function x(){return Math.pow(3, 5) + new Date('1970-01-01T08:01:42.000+08:00') / 1000 * -239 - -24205;}") self.assertEqual(jsi.call_function('x'), 70) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a06fc4ff5..bb406647a 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -1,12 +1,13 @@ from __future__ import unicode_literals -from functools import update_wrapper import itertools import json import math import operator import re +from functools import update_wrapper + from .utils import ( 
error_to_compat_str, ExtractorError, @@ -24,6 +25,22 @@ from .compat import ( ) +# name JS functions +class function_with_repr(object): + # from yt_dlp/utils.py, but in this module + # repr_ is always set + def __init__(self, func, repr_): + update_wrapper(self, func) + self.func, self.__repr = func, repr_ + + def __call__(self, *args, **kwargs): + return self.func(*args, **kwargs) + + def __repr__(self): + return self.__repr + + +# name JS operators def wraps_op(op): def update_and_rename_wrapper(w): @@ -35,10 +52,13 @@ def wraps_op(op): return update_and_rename_wrapper +_NaN = float('nan') + + def _js_bit_op(op): def zeroise(x): - return 0 if x in (None, JS_Undefined) else x + return 0 if x in (None, JS_Undefined, _NaN) else x @wraps_op(op) def wrapped(a, b): @@ -52,7 +72,7 @@ def _js_arith_op(op): @wraps_op(op) def wrapped(a, b): if JS_Undefined in (a, b): - return float('nan') + return _NaN return op(a or 0, b or 0) return wrapped @@ -60,13 +80,13 @@ def _js_arith_op(op): def _js_div(a, b): if JS_Undefined in (a, b) or not (a and b): - return float('nan') + return _NaN return operator.truediv(a or 0, b) if b else float('inf') def _js_mod(a, b): if JS_Undefined in (a, b) or not b: - return float('nan') + return _NaN return (a or 0) % b @@ -74,7 +94,7 @@ def _js_exp(a, b): if not b: return 1 # even 0 ** 0 !! 
elif JS_Undefined in (a, b): - return float('nan') + return _NaN return (a or 0) ** b @@ -285,6 +305,8 @@ class JSInterpreter(object): def _named_object(self, namespace, obj): self.__named_object_counter += 1 name = '%s%d' % (self._OBJ_NAME, self.__named_object_counter) + if callable(obj) and not isinstance(obj, function_with_repr): + obj = function_with_repr(obj, 'F<%s>' % (self.__named_object_counter, )) namespace[name] = obj return name @@ -693,7 +715,7 @@ class JSInterpreter(object): elif expr == 'undefined': return JS_Undefined, should_return elif expr == 'NaN': - return float('NaN'), should_return + return _NaN, should_return elif md.get('return'): return local_vars[m.group('name')], should_return @@ -953,7 +975,9 @@ class JSInterpreter(object): return self.build_arglist(func_m.group('args')), code def extract_function(self, funcname): - return self.extract_function_from_code(*self.extract_function_code(funcname)) + return function_with_repr( + self.extract_function_from_code(*self.extract_function_code(funcname)), + 'F<%s>' % (funcname, )) def extract_function_from_code(self, argnames, code, *global_stack): local_vars = {} @@ -988,7 +1012,6 @@ class JSInterpreter(object): def build_function(self, argnames, code, *global_stack): global_stack = list(global_stack) or [{}] argnames = tuple(argnames) - # import pdb; pdb.set_trace() def resf(args, kwargs={}, allow_recursion=100): global_stack[0].update( From 6ed34338285f722d0da312ce0af3a15a077a3e2a Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 11 May 2023 21:02:01 +0100 Subject: [PATCH 4/5] [jsinterp] Add short-cut evaluation for common expression * special handling for (d%e.length+e.length)%e.length speeds up ~6% --- youtube_dl/jsinterp.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index bb406647a..f837865c4 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -502,8 +502,15 @@ class 
JSInterpreter(object): expr = self._dump(inner, local_vars) + outer if expr.startswith('('): - inner, outer = self._separate_at_paren(expr) - inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) + + m = re.match(r'\((?P<d>[a-z])%(?P<e>[a-z])\.length\+(?P=e)\.length\)%(?P=e)\.length', expr) + if m: + # short-cut eval of frequently used `(d%e.length+e.length)%e.length`, worth ~6% on `pytest -k test_nsig` + outer = None + inner, should_abort = self._offset_e_by_d(m.group('d'), m.group('e'), local_vars) + else: + inner, outer = self._separate_at_paren(expr) + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) if not outer or should_abort: return inner, should_abort or should_return else: @@ -957,6 +964,17 @@ class JSInterpreter(object): return obj + @staticmethod + def _offset_e_by_d(d, e, local_vars): + """ Short-cut eval: (d%e.length+e.length)%e.length """ + try: + d = local_vars[d] + e = local_vars[e] + e = len(e) + return _js_mod(_js_mod(d, e) + e, e), False + except Exception: + return None, True + def extract_function_code(self, funcname): """ @returns argnames, code """ func_m = re.search( From d1c6c5c4d618fa950813c0c71aede34a5ac851e9 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 11 May 2023 21:17:31 +0100 Subject: [PATCH 5/5] [core] Improve platform debug log, based on yt-dlp --- youtube_dl/YoutubeDL.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 117f1c513..212c04298 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -25,6 +25,7 @@ import tokenize import traceback import random +from ssl import OPENSSL_VERSION from string import ascii_letters from .compat import ( @@ -66,6 +67,7 @@ from .utils import ( HEADRequest, int_or_none, ISO3166Utils, + join_nonempty, locked_file, LazyList, make_HTTPS_handler, @@ -2395,9 +2397,20 @@ class YoutubeDL(object): return impl_name + ' version %d.%d.%d' % 
sys.pypy_version_info[:3] return impl_name - self._write_string('[debug] Python version %s (%s) - %s\n' % ( - platform.python_version(), python_implementation(), - platform_name())) + def libc_ver(): + try: + return platform.libc_ver() + except OSError: # We may not have access to the executable + return [] + + self._write_string('[debug] Python %s (%s %s) - %s (%s%s)\n' % ( + platform.python_version(), + python_implementation(), + platform.architecture()[0], + platform_name(), + OPENSSL_VERSION, + ', %s' % (join_nonempty(*libc_ver(), delim=' ') or '-'), + )) exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version()