Merge eaf29d53e4 into 1036478d13

[YouTube] Ensure subtitle URLs are complete
* WEB URLs are complete, MWEB URLs are not * resolves #33017
2025-01-09 21:10:10 +09:00 · 2025-01-07 18:06:07 +05:30 · 2025-01-06 01:39:04 +00:00 · 2025-01-06 01:24:30 +00:00 · 2025-01-06 01:22:16 +00:00 · 2021-07-20 19:17:48 +01:00
9 changed files with 156 additions and 51 deletions
--- a/README.md
+++ b/README.md
@ -404,6 +404,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
    -2, --twofactor TWOFACTOR            Two-factor authentication code
    -n, --netrc                          Use .netrc authentication data
    --video-password PASSWORD            Video password (vimeo, youku)
+    --client-certificate PATH            Path to a single certificate file in
+                                         PEM format, used to authenticate to the
+                                         site (including private key)

 ## Adobe Pass Options:
    --ap-mso MSO                         Adobe Pass multiple-system operator (TV
--- a/test/test_clientcert.py
+++ b/test/test_clientcert.py
@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# coding: utf-8
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import http_server_port
+from youtube_dl import YoutubeDL
+from youtube_dl.compat import compat_http_server
+import ssl
+import threading
+
+from test.test_http import HTTPTestRequestHandler, FakeLogger
+
+
+# See https://gist.github.com/dergachev/7028596
+# and http://www.piware.de/2011/01/creating-an-https-server-in-python/
+# and https://blog.devolutions.net/2020/07/tutorial-how-to-generate-secure-self-signed-server-and-client-certificates-with-openssl
+
+
+TEST_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class TestClientCert(unittest.TestCase):
+    def setUp(self):
+        certfn = os.path.join(TEST_DIR, 'testcert.pem')
+        cacertfn = os.path.join(TEST_DIR, 'testdata', 'clientcert', 'ca.crt')
+        self.httpd = compat_http_server.HTTPServer(('127.0.0.1', 0), HTTPTestRequestHandler)
+        self.httpd.socket = ssl.wrap_socket(
+            self.httpd.socket, cert_reqs=ssl.CERT_REQUIRED, ca_certs=cacertfn, certfile=certfn, server_side=True)
+        self.port = http_server_port(self.httpd)
+        self.server_thread = threading.Thread(target=self.httpd.serve_forever)
+        self.server_thread.daemon = True
+        self.server_thread.start()
+
+    def test_check_clientcertificate(self):
+        clientcertfn = os.path.join(TEST_DIR, 'testdata', 'clientcert', 'client.crt')
+        ydl = YoutubeDL({'logger': FakeLogger(), 'clientcertificate': clientcertfn,
+            # Disable client-side validation of unacceptable self-signed testcert.pem
+            # The test is of a check on the server side, so unaffected
+            'nocheckcertificate': True,
+            })
+        r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port)
+        self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/test/testdata/clientcert/ca.crt
+++ b/test/testdata/clientcert/ca.crt
@ -0,0 +1,11 @@
+-----BEGIN CERTIFICATE-----
+MIIBnTCCAUOgAwIBAgIUN4jSR5qgSLKJs4lWdBUQPiOyUPEwCgYIKoZIzj0EAwIw
+JDETMBEGA1UECgwKWW91dHViZS1ETDENMAsGA1UEAwwEVGVzdDAeFw0yMTA3MTkx
+NTE1MzJaFw0zODAxMTgxNTE1MzJaMCQxEzARBgNVBAoMCllvdXR1YmUtREwxDTAL
+BgNVBAMMBFRlc3QwWTATBgcqhkjOPQIBBggqhkjOPQMBBwNCAAQ8DNZrIDTIQ4mN
+IofBxPIDF2nZUKKAUPAMyn6ntXh99WhLRO5caGKTPWip+LspF5j2uyd2DAcAKMZ5
+170qIbSCo1MwUTAdBgNVHQ4EFgQUj+rfMTfIDHufM94pYwjvI8pZKlYwHwYDVR0j
+BBgwFoAUj+rfMTfIDHufM94pYwjvI8pZKlYwDwYDVR0TAQH/BAUwAwEB/zAKBggq
+hkjOPQQDAgNIADBFAiAwhl8mpPiZoAXWfPHSZaxiLPjy2m4pZK70O0BHnxmSJQIh
+AOeQgZh0j0SkZW0kXHBPWguCgvVm5tqQPQJCevgNDKWP
+-----END CERTIFICATE-----
--- a/test/testdata/clientcert/client.crt
+++ b/test/testdata/clientcert/client.crt
@ -0,0 +1,14 @@
+-----BEGIN CERTIFICATE-----
+MIIBSTCB8AIUE2DY1KuqtYWIi0KYeSYvta9sV+swCgYIKoZIzj0EAwIwJDETMBEG
+A1UECgwKWW91dHViZS1ETDENMAsGA1UEAwwEVGVzdDAeFw0yMTA3MTkxNTE2MjZa
+Fw0zODAxMTgxNTE2MjZaMCsxEzARBgNVBAoMCllvdXR1YmUtREwxFDASBgNVBAMM
+C1Rlc3QgQ2xpZW50MFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAEc0ldxFETUFCS
+CsMq01OUEYp9zkPbXZ9IkTUu1RQhliuPYCsc4Q+UZ8z+Ttcyqa76jAMcmQWh+n2P
+4i7uCDvZ8zAKBggqhkjOPQQDAgNIADBFAiEAiuQWNv6F7EO+bZGhDDxhUkGdhWOy
+36YbZa+BZ8CYae0CIBVfdEnrG5M9tc6PZjXiXgoUMUrnPnRXs76ihQ55hHPW
+-----END CERTIFICATE-----
+-----BEGIN EC PRIVATE KEY-----
+MHcCAQEEIBVDCR/z/PuVFzGKFCOt9GYGpwQ8vJTXAj59jPwP4OFVoAoGCCqGSM49
+AwEHoUQDQgAEc0ldxFETUFCSCsMq01OUEYp9zkPbXZ9IkTUu1RQhliuPYCsc4Q+U
+Z8z+Ttcyqa76jAMcmQWh+n2P4i7uCDvZ8w==
+-----END EC PRIVATE KEY-----
--- a/test/testdata/clientcert/instructions.txt
+++ b/test/testdata/clientcert/instructions.txt
@ -0,0 +1,11 @@
+#https://blog.devolutions.net/2020/07/tutorial-how-to-generate-secure-self-signed-server-and-client-certificates-with-openssl
+# Adapt the commands below
+# 6027 days is the span from the time of signing to the day before Y2038
+# Recalculate the day count, or use -preserve_dates, when re-signing, until
+# 32-bit time_t is no longer an issue
+#openssl ecparam -name prime256v1 -genkey -noout -out ca.key
+#openssl req -new -x509 -sha256 -days 6027 -key ca.key -out ca.crt
+#openssl ecparam -name prime256v1 -genkey -noout -out client.key
+#openssl req -new -sha256 -key client.key -out client.csr
+#openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out client.crt -days 6027 -sha256
+#cat client.key >> client.crt
--- a/youtube_dl/init.py
+++ b/youtube_dl/init.py
@ -322,6 +322,7 @@ def _real_main(argv=None):
        'password': opts.password,
        'twofactor': opts.twofactor,
        'videopassword': opts.videopassword,
+        'clientcertificate': opts.clientcertificate,
        'ap_mso': opts.ap_mso,
        'ap_username': opts.ap_username,
        'ap_password': opts.ap_password,
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -9,6 +9,7 @@ import json
 import os.path
 import random
 import re
+import string
 import time
 import traceback

@ -67,6 +68,7 @@ from ..utils import (

 class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
+
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

@ -138,7 +140,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
-                1, [None, None, []], None, None, None, True
+                1, [None, None, []], None, None, None, True,
            ],
            username,
        ]
@ -160,7 +162,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
-                1, [None, None, []], None, None, None, True
+                1, [None, None, []], None, None, None, True,
            ]]

        challenge_results = req(
@ -213,7 +215,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
-                        [None, tfa_code, True, 2]
+                        [None, tfa_code, True, 2],
                    ]]

                tfa_results = req(
@ -284,7 +286,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
-            }
+            },
        },
    }

@ -385,7 +387,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
-                }
+                },
            },
            'query': query,
        }
@ -462,7 +464,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
            #       (HTML, videodetails, metadata, renderers)
            'name': ('content', 'author', (('ownerChannelName', None), 'title'), ['text']),
            'url': ('href', 'ownerProfileUrl', 'vanityChannelUrl',
-                    ['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl'])
+                    ['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl']),
        }
        if any((videodetails, metadata, renderers)):
            result = (
@ -671,7 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
-                'title': 'UHDTV TEST 8K VIDEO.mp4'
+                'title': 'UHDTV TEST 8K VIDEO.mp4',
            },
            'params': {
                'youtube_include_dash_manifest': True,
@ -711,7 +713,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader_url': r're:https?://(?:www\.)?youtube\.com/@theamazingatheist',
                'title': 'Burning Everyone\'s Koran',
                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
-            }
+            },
        },
        # Age-gated videos
        {
@ -839,7 +841,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            },
            'expected_warnings': [
                'DASH manifest missing',
-            ]
+            ],
        },
        # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
        {
@ -1820,8 +1822,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):

        # cpn generation algorithm is reverse engineered from base.js.
        # In fact it works even with dummy cpn.
-        CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
-        cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))
+        CPN_ALPHABET = string.ascii_letters + string.digits + '-_'
+        cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(16))

        # more consistent results setting it to right before the end
        qs = parse_qs(playback_url)
@ -1881,8 +1883,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
-        video_id = mobj.group(2)
-        return video_id
+        return mobj.group(2)

    def _extract_chapters_from_json(self, data, video_id, duration):
        chapters_list = try_get(
@ -2035,7 +2036,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            headers = {
                'X-YouTube-Client-Name': '85',
                'X-YouTube-Client-Version': '2.0',
-                'Origin': 'https://www.youtube.com'
+                'Origin': 'https://www.youtube.com',
            }

            video_info = self._call_api('player', query, video_id, fatal=False, headers=headers)
@ -2064,8 +2065,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])

        search_meta = (
-            lambda x: self._html_search_meta(x, webpage, default=None)) \
-            if webpage else lambda x: None
+            (lambda x: self._html_search_meta(x, webpage, default=None))
+            if webpage else lambda _: None)

        video_details = player_response.get('videoDetails') or {}
        microformat = try_get(
@ -2137,7 +2138,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        def build_fragments(f):
            return LazyList({
                'url': update_url_query(f['url'], {
-                    'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize']))
+                    'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize'])),
                })
            } for range_start in range(0, f['filesize'], CHUNK_SIZE))

@ -2236,7 +2237,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    'protocol': 'http_dash_segments',
                    'fragments': build_fragments(dct),
                } if dct['filesize'] else {
-                    'downloader_options': {'http_chunk_size': CHUNK_SIZE}  # No longer useful?
+                    'downloader_options': {'http_chunk_size': CHUNK_SIZE},  # No longer useful?
                })

            formats.append(dct)
@ -2414,9 +2415,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'is_live': is_live,
        }

-        pctr = try_get(
+        pctr = traverse_obj(
            player_response,
-            lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
+            ('captions', 'playerCaptionsTracklistRenderer', T(dict)))
        if pctr:
            def process_language(container, base_url, lang_code, query):
                lang_subs = []
@ -2430,31 +2431,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    })
                container[lang_code] = lang_subs

-            subtitles = {}
-            for caption_track in (pctr.get('captionTracks') or []):
-                base_url = caption_track.get('baseUrl')
-                if not base_url:
-                    continue
-                if caption_track.get('kind') != 'asr':
-                    lang_code = caption_track.get('languageCode')
-                    if not lang_code:
+            def process_subtitles():
+                subtitles = {}
+                for caption_track in traverse_obj(pctr, (
+                        'captionTracks', lambda _, v: v.get('baseUrl'))):
+                    base_url = self._yt_urljoin(caption_track['baseUrl'])
+                    if not base_url:
                        continue
-                    process_language(
-                        subtitles, base_url, lang_code, {})
-                    continue
-                automatic_captions = {}
-                for translation_language in (pctr.get('translationLanguages') or []):
-                    translation_language_code = translation_language.get('languageCode')
-                    if not translation_language_code:
+                    if caption_track.get('kind') != 'asr':
+                        lang_code = caption_track.get('languageCode')
+                        if not lang_code:
+                            continue
+                        process_language(
+                            subtitles, base_url, lang_code, {})
                        continue
-                    process_language(
-                        automatic_captions, base_url, translation_language_code,
-                        {'tlang': translation_language_code})
-                info['automatic_captions'] = automatic_captions
-            info['subtitles'] = subtitles
+                    automatic_captions = {}
+                    for translation_language in traverse_obj(pctr, (
+                            'translationLanguages', lambda _, v: v.get('languageCode'))):
+                        translation_language_code = translation_language['languageCode']
+                        process_language(
+                            automatic_captions, base_url, translation_language_code,
+                            {'tlang': translation_language_code})
+                    info['automatic_captions'] = automatic_captions
+                info['subtitles'] = subtitles
+
+            process_subtitles()

        parsed_url = compat_urllib_parse_urlparse(url)
-        for component in [parsed_url.fragment, parsed_url.query]:
+        for component in (parsed_url.fragment, parsed_url.query):
            query = compat_parse_qs(component)
            for k, v in query.items():
                for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
@ -2684,7 +2688,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
            'title': 'Super Cooper Shorts - Shorts',
            'uploader': 'Super Cooper Shorts',
            'uploader_id': '@SuperCooperShorts',
-        }
+        },
    }, {
        # Channel that does not have a Shorts tab. Test should just download videos on Home tab instead
        'url': 'https://www.youtube.com/@emergencyawesome/shorts',
@ -2738,7 +2742,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
            'description': 'md5:609399d937ea957b0f53cbffb747a14c',
            'uploader': 'ThirstForScience',
            'uploader_id': '@ThirstForScience',
-        }
+        },
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
@ -3037,7 +3041,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
            'uploader': '3Blue1Brown',
            'uploader_id': '@3blue1brown',
            'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
-        }
+        },
    }]

    @classmethod
@ -3335,7 +3339,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
            'client': {
                'clientName': 'WEB',
                'clientVersion': client_version,
-            }
+            },
        }
        visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)

@ -3354,7 +3358,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
                headers['x-goog-visitor-id'] = visitor_data
            data['continuation'] = continuation['continuation']
            data['clickTracking'] = {
-                'clickTrackingParams': continuation['itct']
+                'clickTrackingParams': continuation['itct'],
            }
            count = 0
            retries = 3
@ -3613,7 +3617,7 @@ class YoutubePlaylistIE(InfoExtractor):
            'uploader': 'milan',
            'uploader_id': '@milan5503',
            'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
-        }
+        },
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 455,
@ -3623,7 +3627,7 @@ class YoutubePlaylistIE(InfoExtractor):
            'uploader': 'LBK',
            'uploader_id': '@music_king',
            'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA',
-        }
+        },
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
@ -3734,7 +3738,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
        'info_dict': {
            'id': 'youtube-dl test video',
            'title': 'youtube-dl test video',
-        }
+        },
    }]

    def _get_n_results(self, query, n):
@ -3754,7 +3758,7 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
        'info_dict': {
            'id': 'youtube-dl test video',
            'title': 'youtube-dl test video',
-        }
+        },
    }]


@ -3769,7 +3773,7 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
            'id': 'youtube-dl test video',
            'title': 'youtube-dl test video',
        },
-        'params': {'playlistend': 5}
+        'params': {'playlistend': 5},
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
@ -3785,6 +3789,7 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
 class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """
    Base class for feed extractors
+
    Subclasses must define the _FEED_NAME property.
    """
    _LOGIN_REQUIRED = True
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@ -368,6 +368,10 @@ def parseOpts(overrideArguments=None):
        '--video-password',
        dest='videopassword', metavar='PASSWORD',
        help='Video password (vimeo, youku)')
+    authentication.add_option(
+        '--client-certificate',
+        dest='clientcertificate', metavar='PATH',
+        help='Path to a single certificate file in PEM format, used to authenticate to the site (including private key)')

    adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options')
    adobe_pass.add_option(
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -2529,6 +2529,10 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
+    if is_https:
+        client_cert_path = ydl_handler._params.get('clientcertificate')
+        if client_cert_path:
+            kwargs['cert_file'] = client_cert_path
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')
Author	SHA1	Message	Date
dirkf	c2a8513f9f	Merge `eaf29d53e4` into `1036478d13`	2025-01-07 18:06:07 +05:30
dirkf	1036478d13	[YouTube] Ensure subtitle URLs are complete * WEB URLs are, MWEB not * resolves #33017	2025-01-06 01:39:04 +00:00
dirkf	00ad2b8ca1	[YouTube] Refactor subtitle processing * move to internal function * use `traverse-obj()`	2025-01-06 01:24:30 +00:00
dirkf	ab7c61ca29	[YouTube] Apply code style changes, trailing commas, etc	2025-01-06 01:22:16 +00:00
df	eaf29d53e4	Disable hopeless server certificate validation by test client	2021-07-20 19:17:48 +01:00
df	f2c3bef77b	Update --client-certificate option help	2021-07-19 17:04:23 +01:00
df	8f2341c531	Added test routine for clientcertificate option	2021-07-19 17:04:23 +01:00
df	a67dafe3aa	Add and implement --client-certificate option Use a PEM certificate to authenticate HTTPS access to site	2021-07-14 16:51:24 +01:00