From ab7c61ca29ed1d1216d463d01794eb112a9144d5 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 6 Jan 2025 01:22:16 +0000 Subject: [PATCH 1/8] [YouTube] Apply code style changes, trailing commas, etc --- youtube_dl/extractor/youtube.py | 60 +++++++++++++++++---------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7ea30fd40..e9603d155 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -9,6 +9,7 @@ import json import os.path import random import re +import string import time import traceback @@ -67,6 +68,7 @@ from ..utils import ( class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" + _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' @@ -138,7 +140,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], - 1, [None, None, []], None, None, None, True + 1, [None, None, []], None, None, None, True, ], username, ] @@ -160,7 +162,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): None, 1, None, [1, None, None, None, [password, None, True]], [ None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], - 1, [None, None, []], None, None, None, True + 1, [None, None, []], None, None, None, True, ]] challenge_results = req( @@ -213,7 +215,7 @@ class 
YoutubeBaseInfoExtractor(InfoExtractor): user_hash, None, 2, None, [ 9, None, None, None, None, None, None, None, - [None, tfa_code, True, 2] + [None, tfa_code, True, 2], ]] tfa_results = req( @@ -284,7 +286,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'client': { 'clientName': 'WEB', 'clientVersion': '2.20201021.03.00', - } + }, }, } @@ -385,7 +387,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'client': { 'clientName': 'WEB', 'clientVersion': '2.20201021.03.00', - } + }, }, 'query': query, } @@ -462,7 +464,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # (HTML, videodetails, metadata, renderers) 'name': ('content', 'author', (('ownerChannelName', None), 'title'), ['text']), 'url': ('href', 'ownerProfileUrl', 'vanityChannelUrl', - ['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl']) + ['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl']), } if any((videodetails, metadata, renderers)): result = ( @@ -671,7 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', 'uploader': '8KVIDEO', - 'title': 'UHDTV TEST 8K VIDEO.mp4' + 'title': 'UHDTV TEST 8K VIDEO.mp4', }, 'params': { 'youtube_include_dash_manifest': True, @@ -711,7 +713,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@theamazingatheist', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', - } + }, }, # Age-gated videos { @@ -839,7 +841,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'expected_warnings': [ 'DASH manifest missing', - ] + ], }, # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) { @@ -1820,8 +1822,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # cpn generation algorithm is reverse engineered from base.js. 
# In fact it works even with dummy cpn. - CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' - cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)) + CPN_ALPHABET = string.ascii_letters + string.digits + '-_' + cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(16)) # more consistent results setting it to right before the end qs = parse_qs(playback_url) @@ -1881,8 +1883,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) if mobj is None: raise ExtractorError('Invalid URL: %s' % url) - video_id = mobj.group(2) - return video_id + return mobj.group(2) def _extract_chapters_from_json(self, data, video_id, duration): chapters_list = try_get( @@ -2035,7 +2036,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): headers = { 'X-YouTube-Client-Name': '85', 'X-YouTube-Client-Version': '2.0', - 'Origin': 'https://www.youtube.com' + 'Origin': 'https://www.youtube.com', } video_info = self._call_api('player', query, video_id, fatal=False, headers=headers) @@ -2064,8 +2065,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)]) search_meta = ( - lambda x: self._html_search_meta(x, webpage, default=None)) \ - if webpage else lambda x: None + (lambda x: self._html_search_meta(x, webpage, default=None)) + if webpage else lambda _: None) video_details = player_response.get('videoDetails') or {} microformat = try_get( @@ -2137,7 +2138,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def build_fragments(f): return LazyList({ 'url': update_url_query(f['url'], { - 'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize'])) + 'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize'])), }) } for range_start in range(0, f['filesize'], CHUNK_SIZE)) @@ -2236,7 +2237,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'protocol': 'http_dash_segments', 
'fragments': build_fragments(dct), } if dct['filesize'] else { - 'downloader_options': {'http_chunk_size': CHUNK_SIZE} # No longer useful? + 'downloader_options': {'http_chunk_size': CHUNK_SIZE}, # No longer useful? }) formats.append(dct) @@ -2454,7 +2455,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info['subtitles'] = subtitles parsed_url = compat_urllib_parse_urlparse(url) - for component in [parsed_url.fragment, parsed_url.query]: + for component in (parsed_url.fragment, parsed_url.query): query = compat_parse_qs(component) for k, v in query.items(): for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]: @@ -2684,7 +2685,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Super Cooper Shorts - Shorts', 'uploader': 'Super Cooper Shorts', 'uploader_id': '@SuperCooperShorts', - } + }, }, { # Channel that does not have a Shorts tab. Test should just download videos on Home tab instead 'url': 'https://www.youtube.com/@emergencyawesome/shorts', @@ -2738,7 +2739,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'description': 'md5:609399d937ea957b0f53cbffb747a14c', 'uploader': 'ThirstForScience', 'uploader_id': '@ThirstForScience', - } + }, }, { 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', 'only_matching': True, @@ -3037,7 +3038,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'uploader': '3Blue1Brown', 'uploader_id': '@3blue1brown', 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', - } + }, }] @classmethod @@ -3335,7 +3336,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'client': { 'clientName': 'WEB', 'clientVersion': client_version, - } + }, } visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str) @@ -3354,7 +3355,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): headers['x-goog-visitor-id'] = visitor_data data['continuation'] = continuation['continuation'] data['clickTracking'] = { - 'clickTrackingParams': continuation['itct'] + 'clickTrackingParams': continuation['itct'], } count = 0 retries = 3 @@ 
-3613,7 +3614,7 @@ class YoutubePlaylistIE(InfoExtractor): 'uploader': 'milan', 'uploader_id': '@milan5503', 'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw', - } + }, }, { 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', 'playlist_mincount': 455, @@ -3623,7 +3624,7 @@ class YoutubePlaylistIE(InfoExtractor): 'uploader': 'LBK', 'uploader_id': '@music_king', 'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA', - } + }, }, { 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', 'only_matching': True, @@ -3734,7 +3735,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'youtube-dl test video', 'title': 'youtube-dl test video', - } + }, }] def _get_n_results(self, query, n): @@ -3754,7 +3755,7 @@ class YoutubeSearchDateIE(YoutubeSearchIE): 'info_dict': { 'id': 'youtube-dl test video', 'title': 'youtube-dl test video', - } + }, }] @@ -3769,7 +3770,7 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): 'id': 'youtube-dl test video', 'title': 'youtube-dl test video', }, - 'params': {'playlistend': 5} + 'params': {'playlistend': 5}, }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True, @@ -3785,6 +3786,7 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): class YoutubeFeedsInfoExtractor(YoutubeTabIE): """ Base class for feed extractors + Subclasses must define the _FEED_NAME property. 
""" _LOGIN_REQUIRED = True From 00ad2b8ca12d4f9b830ed83876d0d1ab3d698675 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 6 Jan 2025 01:24:30 +0000 Subject: [PATCH 2/8] [YouTube] Refactor subtitle processing * move to internal function * use `traverse-obj()` --- youtube_dl/extractor/youtube.py | 46 +++++++++++++++++---------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e9603d155..56957a661 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2415,9 +2415,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'is_live': is_live, } - pctr = try_get( + pctr = traverse_obj( player_response, - lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) + ('captions', 'playerCaptionsTracklistRenderer', T(dict))) if pctr: def process_language(container, base_url, lang_code, query): lang_subs = [] @@ -2431,28 +2431,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }) container[lang_code] = lang_subs - subtitles = {} - for caption_track in (pctr.get('captionTracks') or []): - base_url = caption_track.get('baseUrl') - if not base_url: - continue - if caption_track.get('kind') != 'asr': - lang_code = caption_track.get('languageCode') - if not lang_code: + def process_subtitles(): + subtitles = {} + for caption_track in traverse_obj(pctr, ( + 'captionTracks', lambda _, v: v.get('baseUrl'))): + if not base_url: continue - process_language( - subtitles, base_url, lang_code, {}) - continue - automatic_captions = {} - for translation_language in (pctr.get('translationLanguages') or []): - translation_language_code = translation_language.get('languageCode') - if not translation_language_code: + if caption_track.get('kind') != 'asr': + lang_code = caption_track.get('languageCode') + if not lang_code: + continue + process_language( + subtitles, base_url, lang_code, {}) continue - process_language( - automatic_captions, base_url, translation_language_code, - 
{'tlang': translation_language_code}) - info['automatic_captions'] = automatic_captions - info['subtitles'] = subtitles + automatic_captions = {} + for translation_language in traverse_obj(pctr, ( + 'translationLanguages', lambda _, v: v.get('languageCode'))): + translation_language_code = translation_language['languageCode'] + process_language( + automatic_captions, base_url, translation_language_code, + {'tlang': translation_language_code}) + info['automatic_captions'] = automatic_captions + info['subtitles'] = subtitles + + process_subtitles() parsed_url = compat_urllib_parse_urlparse(url) for component in (parsed_url.fragment, parsed_url.query): From 1036478d130c5f2001eca2d7d12558abe601d933 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 6 Jan 2025 01:39:04 +0000 Subject: [PATCH 3/8] [YouTube] Ensure subtitle URLs are complete * WEB URLs are, MWEB not * resolves #33017 --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 56957a661..6171df84a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2435,6 +2435,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): subtitles = {} for caption_track in traverse_obj(pctr, ( 'captionTracks', lambda _, v: v.get('baseUrl'))): + base_url = self._yt_urljoin(caption_track['baseUrl']) if not base_url: continue if caption_track.get('kind') != 'asr': From 21fff051217d1c14a7d50fa752052eadbcafee4e Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 15 Jan 2025 03:19:15 +0000 Subject: [PATCH 4/8] [YouTube] Switch to TV API client * thx yt-dlp/yt-dlp#12059 --- youtube_dl/extractor/youtube.py | 102 ++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6171df84a..1424277ac 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -31,7 +31,9 @@ from ..utils import ( 
dict_get, error_to_compat_str, ExtractorError, + filter_dict, float_or_none, + get_first, extract_attributes, get_element_by_attribute, int_or_none, @@ -82,6 +84,34 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' + _INNERTUBE_CLIENTS = { + # mweb has 'ultralow' formats + # See: https://github.com/yt-dlp/yt-dlp/pull/557 + 'mweb': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'MWEB', + 'clientVersion': '2.20241202.07.00', + # mweb previously did not require PO Token with this UA + 'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, + 'REQUIRE_PO_TOKEN': True, + 'SUPPORTS_COOKIES': True, + }, + 'tv': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'TVHTML5', + 'clientVersion': '7.20241201.18.00', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 7, + 'SUPPORTS_COOKIES': True, + }, + } + def _login(self): """ Attempt to log in to YouTube. 
@@ -321,19 +351,24 @@ class YoutubeBaseInfoExtractor(InfoExtractor): '{0} {1} {2}'.format(time_now, self._SAPISID, origin).encode('utf-8')).hexdigest() return 'SAPISIDHASH {0}_{1}'.format(time_now, sapisidhash) - def _call_api(self, ep, query, video_id, fatal=True, headers=None): + def _call_api(self, ep, query, video_id, fatal=True, headers=None, + note='Downloading API JSON'): data = self._DEFAULT_API_DATA.copy() data.update(query) real_headers = {'content-type': 'application/json'} if headers: real_headers.update(headers) + # was: 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + api_key = self.get_param('youtube_innertube_key') return self._download_json( 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id, - note='Downloading API JSON', errnote='Unable to download API page', + note=note, errnote='Unable to download API page', data=json.dumps(data).encode('utf8'), fatal=fatal, - headers=real_headers, - query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'}) + headers=real_headers, query=filter_dict({ + 'key': api_key, + 'prettyPrint': 'false', + })) def _extract_yt_initial_data(self, video_id, webpage): return self._parse_json( @@ -342,6 +377,22 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), video_id) + def _extract_visitor_data(self, *args): + """ + Extract visitorData from an API response or ytcfg + + Appears to be used to track session state + """ + visitor_data = self.get_param('youtube_visitor_data') + if visitor_data: + return visitor_data + + return get_first( + args, (('VISITOR_DATA', + ('INNERTUBE_CONTEXT', 'client', 'visitorData'), + ('responseContext', 'visitorData')), + T(compat_str))) + def _extract_ytcfg(self, video_id, webpage): return self._parse_json( self._search_regex( @@ -1957,6 +2008,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if sts: pb_context['signatureTimestamp'] = sts + client = traverse_obj(self._INNERTUBE_CLIENTS, ( + lambda _, v: not v.get('REQUIRE_PO_TOKEN')), 
+ get_all=False) + query = { 'playbackContext': { 'contentPlaybackContext': pb_context, @@ -1964,30 +2019,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'racyCheckOk': True, }, 'context': { - 'client': { - 'clientName': 'MWEB', - 'clientVersion': '2.20241202.07.00', - 'hl': 'en', - 'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)', - 'timeZone': 'UTC', - 'utcOffsetMinutes': 0, - }, + 'client': merge_dicts( + traverse_obj(client, ('INNERTUBE_CONTEXT', 'client')), { + 'hl': 'en', + 'timeZone': 'UTC', + 'utcOffsetMinutes': 0, + }), }, 'videoId': video_id, } - headers = { - 'X-YouTube-Client-Name': '2', - 'X-YouTube-Client-Version': '2.20241202.07.00', - 'Origin': origin, + + headers = merge_dicts({ 'Sec-Fetch-Mode': 'navigate', - 'User-Agent': query['context']['client']['userAgent'], - } + 'Origin': origin, + # 'X-Goog-Visitor-Id': self._extract_visitor_data(ytcfg) or '', + }, traverse_obj(client, { + 'X-YouTube-Client-Name': 'INNERTUBE_CONTEXT_CLIENT_NAME', + 'X-YouTube-Client-Version': ( + 'INNERTUBE_CONTEXT', 'client', 'clientVersion'), + 'User-Agent': ( + 'INNERTUBE_CONTEXT', 'client', 'userAgent'), + })) + auth = self._generate_sapisidhash_header(origin) if auth is not None: headers['Authorization'] = auth headers['X-Origin'] = origin - player_response = self._call_api('player', query, video_id, fatal=False, headers=headers) + player_response = self._call_api( + 'player', query, video_id, fatal=False, headers=headers, + note=join_nonempty( + 'Downloading', traverse_obj(query, ( + 'context', 'client', 'clientName')), + 'API JSON', delim=' ')) def is_agegated(playability): if not isinstance(playability, dict): From 55ad8a24cacee03a91fe70d8d48aa9a02cc0ab11 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 15 Jan 2025 03:22:56 +0000 Subject: [PATCH 5/8] [YouTube] Support `... 
/feeds/videos.xml?playlist_id={pl_id}` --- youtube_dl/extractor/youtube.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1424277ac..f0406b357 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3601,10 +3601,23 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _real_extract(self, url): item_id = self._match_id(url) url = update_url(url, netloc='www.youtube.com') - # Handle both video/playlist URLs qs = parse_qs(url) - video_id = qs.get('v', [None])[0] - playlist_id = qs.get('list', [None])[0] + + def qs_get(key, default=None): + return qs.get(key, [default])[-1] + + # Go around for /feeds/videos.xml?playlist_id={pl_id} + if item_id == 'feeds' and '/feeds/videos.xml?' in url: + playlist_id = qs_get('playlist_id') + if playlist_id: + return self.url_result( + update_url_query('https://www.youtube.com/playlist', { + 'list': playlist_id, + }), ie=self.ie_key(), video_id=playlist_id) + + # Handle both video/playlist URLs + video_id = qs_get('v') + playlist_id = qs_get('list') if video_id and playlist_id: if self._downloader.params.get('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) From b09442a2f4a8d255569abf0bb6b4867c53d0c2e9 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 19 Jan 2025 01:18:34 +0000 Subject: [PATCH 6/8] [YouTube] Also use ios client when is_live --- youtube_dl/extractor/youtube.py | 118 ++++++++++++++++++++++---------- 1 file changed, 81 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f0406b357..32e836d49 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -85,6 +85,22 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' _INNERTUBE_CLIENTS = { + 'ios': { + 
'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'IOS', + 'clientVersion': '19.45.4', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'userAgent': 'com.google.ios.youtube/19.45.4 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)', + 'osName': 'iPhone', + 'osVersion': '18.1.0.22B83', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, + 'REQUIRE_JS_PLAYER': False, + 'REQUIRE_PO_TOKEN': True, + }, # mweb has 'ultralow' formats # See: https://github.com/yt-dlp/yt-dlp/pull/557 'mweb': { @@ -110,6 +126,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'INNERTUBE_CONTEXT_CLIENT_NAME': 7, 'SUPPORTS_COOKIES': True, }, + 'web': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20241126.01.00', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + 'REQUIRE_PO_TOKEN': True, + 'SUPPORTS_COOKIES': True, + }, } def _login(self): @@ -1995,6 +2022,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response = self._extract_yt_initial_variable( webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') + is_live = traverse_obj(player_response, ('videoDetails', 'isLive')) + if False and not player_response: player_response = self._call_api( 'player', {'videoId': video_id}, video_id) @@ -2008,50 +2037,65 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if sts: pb_context['signatureTimestamp'] = sts - client = traverse_obj(self._INNERTUBE_CLIENTS, ( - lambda _, v: not v.get('REQUIRE_PO_TOKEN')), - get_all=False) + client_names = traverse_obj(self._INNERTUBE_CLIENTS, ( + T(dict.items), lambda _, k_v: not k_v[1].get('REQUIRE_PO_TOKEN'), + 0))[:1] - query = { - 'playbackContext': { - 'contentPlaybackContext': pb_context, - 'contentCheckOk': True, - 'racyCheckOk': True, - }, - 'context': { - 'client': merge_dicts( - traverse_obj(client, ('INNERTUBE_CONTEXT', 'client')), { - 'hl': 'en', - 'timeZone': 'UTC', - 'utcOffsetMinutes': 0, - }), - }, - 'videoId': video_id, - } + if is_live and 'ios' not in client_names: + 
client_names.append('ios') - headers = merge_dicts({ + headers = { 'Sec-Fetch-Mode': 'navigate', 'Origin': origin, # 'X-Goog-Visitor-Id': self._extract_visitor_data(ytcfg) or '', - }, traverse_obj(client, { - 'X-YouTube-Client-Name': 'INNERTUBE_CONTEXT_CLIENT_NAME', - 'X-YouTube-Client-Version': ( - 'INNERTUBE_CONTEXT', 'client', 'clientVersion'), - 'User-Agent': ( - 'INNERTUBE_CONTEXT', 'client', 'userAgent'), - })) - + } auth = self._generate_sapisidhash_header(origin) if auth is not None: headers['Authorization'] = auth headers['X-Origin'] = origin - player_response = self._call_api( - 'player', query, video_id, fatal=False, headers=headers, - note=join_nonempty( - 'Downloading', traverse_obj(query, ( - 'context', 'client', 'clientName')), - 'API JSON', delim=' ')) + for client in traverse_obj(self._INNERTUBE_CLIENTS, (client_names, T(dict))): + + query = { + 'playbackContext': { + 'contentPlaybackContext': pb_context, + 'contentCheckOk': True, + 'racyCheckOk': True, + }, + 'context': { + 'client': merge_dicts( + traverse_obj(client, ('INNERTUBE_CONTEXT', 'client')), { + 'hl': 'en', + 'timeZone': 'UTC', + 'utcOffsetMinutes': 0, + }), + }, + 'videoId': video_id, + } + + api_headers = merge_dicts(headers, traverse_obj(client, { + 'X-YouTube-Client-Name': 'INNERTUBE_CONTEXT_CLIENT_NAME', + 'X-YouTube-Client-Version': ( + 'INNERTUBE_CONTEXT', 'client', 'clientVersion'), + 'User-Agent': ( + 'INNERTUBE_CONTEXT', 'client', 'userAgent'), + })) + + api_player_response = self._call_api( + 'player', query, video_id, fatal=False, headers=api_headers, + note=join_nonempty( + 'Downloading', traverse_obj(query, ( + 'context', 'client', 'clientName')), + 'API JSON', delim=' ')) + + hls = [ + traverse_obj( + resp, ('streamingData', 'hlsManifestUrl', T(url_or_none))) + for resp in (player_response, api_player_response)] + if not hls[0] and hls[1]: + player_response['streamingData']['hlsManifestUrl'] = hls[1] + else: + player_response.update(api_player_response or {}) def 
is_agegated(playability): if not isinstance(playability, dict): @@ -2194,6 +2238,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): itag_qualities = {} q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) CHUNK_SIZE = 10 << 20 + is_live = video_details.get('isLive') streaming_data = player_response.get('streamingData') or {} streaming_formats = streaming_data.get('formats') or [] @@ -2338,7 +2383,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): hls_manifest_url = streaming_data.get('hlsManifestUrl') if hls_manifest_url: for f in self._extract_m3u8_formats( - hls_manifest_url, video_id, 'mp4', fatal=False): + hls_manifest_url, video_id, 'mp4', + entry_protocol='m3u8_native', live=is_live, fatal=False): if process_manifest_format( f, 'hls', None, self._search_regex( r'/itag/(\d+)', f['url'], 'itag', default=None)): @@ -2444,8 +2490,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Strictly de-prioritize damaged formats f['preference'] = -10 - is_live = video_details.get('isLive') - owner_profile_url = self._yt_urljoin(self._extract_author_var( webpage, 'url', videodetails=video_details, metadata=microformat)) From 63fb0fc4159397618b12fa115f957b9ba70f3f88 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 20 Jan 2025 13:23:54 +0000 Subject: [PATCH 7/8] [YouTube] Retain .videoDetails members from all player responses --- youtube_dl/extractor/youtube.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 32e836d49..edaae5bd3 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2095,7 +2095,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not hls[0] and hls[1]: player_response['streamingData']['hlsManifestUrl'] = hls[1] else: + video_details = merge_dicts(*traverse_obj( + (player_response, api_player_response), + (Ellipsis, 'videoDetails', T(dict)))) player_response.update(api_player_response or {}) + 
player_response['videoDetails'] = video_details def is_agegated(playability): if not isinstance(playability, dict): From 5975d7bb96095fae7c35e7cfcd819255a5b57087 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 22 Jan 2025 06:52:40 +0000 Subject: [PATCH 8/8] [YouTube] Use X-Goog-Visitor-Id * required with tv player client * resolves #33030 --- youtube_dl/extractor/youtube.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index edaae5bd3..c93a2a1f9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -142,6 +142,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _login(self): """ Attempt to log in to YouTube. + True is returned if successful or skipped. False is returned if login failed. @@ -2040,6 +2041,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): client_names = traverse_obj(self._INNERTUBE_CLIENTS, ( T(dict.items), lambda _, k_v: not k_v[1].get('REQUIRE_PO_TOKEN'), 0))[:1] + if 'web' not in client_names: + # webpage links won't download: ignore links and playability + player_response = filter_dict( + player_response or {}, + lambda k, _: k not in ('streamingData', 'playabilityStatus')) if is_live and 'ios' not in client_names: client_names.append('ios') @@ -2047,7 +2053,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): headers = { 'Sec-Fetch-Mode': 'navigate', 'Origin': origin, - # 'X-Goog-Visitor-Id': self._extract_visitor_data(ytcfg) or '', + 'X-Goog-Visitor-Id': self._extract_visitor_data(ytcfg) or '', } auth = self._generate_sapisidhash_header(origin) if auth is not None: @@ -2059,9 +2065,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): query = { 'playbackContext': { 'contentPlaybackContext': pb_context, - 'contentCheckOk': True, - 'racyCheckOk': True, }, + 'contentCheckOk': True, + 'racyCheckOk': True, 'context': { 'client': merge_dicts( traverse_obj(client, ('INNERTUBE_CONTEXT', 'client')), { @@ -2088,11 +2094,10 
@@ class YoutubeIE(YoutubeBaseInfoExtractor): 'context', 'client', 'clientName')), 'API JSON', delim=' ')) - hls = [ - traverse_obj( - resp, ('streamingData', 'hlsManifestUrl', T(url_or_none))) - for resp in (player_response, api_player_response)] - if not hls[0] and hls[1]: + hls = traverse_obj( + (player_response, api_player_response), + (Ellipsis, 'streamingData', 'hlsManifestUrl', T(url_or_none))) + if len(hls) == 2 and not hls[0] and hls[1]: player_response['streamingData']['hlsManifestUrl'] = hls[1] else: video_details = merge_dicts(*traverse_obj( @@ -3467,7 +3472,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not continuation: break if visitor_data: - headers['x-goog-visitor-id'] = visitor_data + headers['X-Goog-Visitor-Id'] = visitor_data data['continuation'] = continuation['continuation'] data['clickTracking'] = { 'clickTrackingParams': continuation['itct'],