Compare commits

...

5 Commits

Author SHA1 Message Date
dirkf
e102b9993a [workflows/ci.yml] Move pinned Ubuntu runner images from withdrawn 20.04 to 22.04
* fix consequent missing `python-is-python2` package
2025-05-03 18:33:39 +01:00
dirkf
680069a149 [YouTube] Improve n-sig function extraction for player aa3fc80b
Resolves #33123
2025-05-03 00:09:21 +01:00
dirkf
4a31290ae1 [YouTube] Delete cached problem nsig cache data on descrambling error
* inspired by yt-dlp/yt-dlp#12750
2025-05-03 00:09:21 +01:00
dirkf
3a42f6ad37 [YouTube] Cache signature timestamp from player JS
* if the YT webpage can't be loaded, getting the `sts` requires loading the
player JS: this caches it
* based on yt-dlp/yt-dlp#13047, thx bashonly
2025-05-03 00:09:21 +01:00
dirkf
ec75141bf0 [Cache] Add clear function 2025-05-03 00:09:20 +01:00
5 changed files with 190 additions and 67 deletions

View File

@ -116,7 +116,7 @@ jobs:
strategy: strategy:
fail-fast: true fail-fast: true
matrix: matrix:
os: [ubuntu-20.04] os: [ubuntu-22.04]
python-version: ${{ fromJSON(needs.select.outputs.cpython-versions) }} python-version: ${{ fromJSON(needs.select.outputs.cpython-versions) }}
python-impl: [cpython] python-impl: [cpython]
ytdl-test-set: ${{ fromJSON(needs.select.outputs.test-set) }} ytdl-test-set: ${{ fromJSON(needs.select.outputs.test-set) }}
@ -133,12 +133,12 @@ jobs:
ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }} ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }}
run-tests-ext: bat run-tests-ext: bat
# jython # jython
- os: ubuntu-20.04 - os: ubuntu-22.04
python-version: 2.7 python-version: 2.7
python-impl: jython python-impl: jython
ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }} ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'core') && 'core' || 'nocore' }}
run-tests-ext: sh run-tests-ext: sh
- os: ubuntu-20.04 - os: ubuntu-22.04
python-version: 2.7 python-version: 2.7
python-impl: jython python-impl: jython
ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }} ytdl-test-set: ${{ contains(needs.select.outputs.test-set, 'download') && 'download' || 'nodownload' }}
@ -160,7 +160,7 @@ jobs:
# NB may run apt-get install in Linux # NB may run apt-get install in Linux
uses: ytdl-org/setup-python@v1 uses: ytdl-org/setup-python@v1
env: env:
# Temporary workaround for Python 3.5 failures - May 2024 # Temporary (?) workaround for Python 3.5 failures - May 2024
PIP_TRUSTED_HOST: "pypi.python.org pypi.org files.pythonhosted.org" PIP_TRUSTED_HOST: "pypi.python.org pypi.org files.pythonhosted.org"
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
@ -240,7 +240,10 @@ jobs:
# install 2.7 # install 2.7
shell: bash shell: bash
run: | run: |
sudo apt-get install -y python2 python-is-python2 # Ubuntu 22.04 no longer has python-is-python2: fetch it
curl -L "http://launchpadlibrarian.net/474693132/python-is-python2_2.7.17-4_all.deb" -o python-is-python2.deb
sudo apt-get install -y python2
sudo dpkg --force-breaks -i python-is-python2.deb
echo "PYTHONHOME=/usr" >> "$GITHUB_ENV" echo "PYTHONHOME=/usr" >> "$GITHUB_ENV"
#-------- Python 2.6 -- #-------- Python 2.6 --
- name: Set up Python 2.6 environment - name: Set up Python 2.6 environment

View File

@ -66,6 +66,18 @@ class TestCache(unittest.TestCase):
new_version = '.'.join(('%0.2d' % ((v + 1) if i == 0 else v, )) for i, v in enumerate(version_tuple(__version__))) new_version = '.'.join(('%0.2d' % ((v + 1) if i == 0 else v, )) for i, v in enumerate(version_tuple(__version__)))
self.assertIs(c.load('test_cache', 'k.', min_ver=new_version), None) self.assertIs(c.load('test_cache', 'k.', min_ver=new_version), None)
def test_cache_clear(self):
    """Check that clearing one cache entry leaves its sibling in place."""
    cache = Cache(FakeYDL({'cachedir': self.test_dir}))
    section = 'test_cache'
    for key, value in (('k.', 'kay'), ('l.', 'ell')):
        cache.store(section, key, value)
    # the entry can be loaded before it is cleared ...
    self.assertEqual(cache.load(section, 'k.'), 'kay')
    cache.clear(section, 'k.')
    # ... but not afterwards, while the other entry is untouched
    self.assertEqual(cache.load(section, 'k.'), None)
    self.assertEqual(cache.load(section, 'l.'), 'ell')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -346,6 +346,10 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js',
'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE',
), ),
(
'https://www.youtube.com/s/player/aa3fc80b/player_ias.vflset/en_US/base.js',
'0qY9dal2uzOnOGwa-48hha', 'VSh1KDfQMk-eag',
),
] ]

View File

@ -1,6 +1,7 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import errno
import json import json
import os import os
import re import re
@ -8,7 +9,6 @@ import shutil
import traceback import traceback
from .compat import ( from .compat import (
compat_contextlib_suppress,
compat_getenv, compat_getenv,
compat_open as open, compat_open as open,
compat_os_makedirs, compat_os_makedirs,
@ -78,6 +78,22 @@ class Cache(object):
tb = traceback.format_exc() tb = traceback.format_exc()
self._report_warning('Writing cache to {fn!r} failed: {tb}'.format(fn=fn, tb=tb)) self._report_warning('Writing cache to {fn!r} failed: {tb}'.format(fn=fn, tb=tb))
def clear(self, section, key, dtype='json'):
    """Remove a single entry from the filesystem cache.

    Does nothing if the cache is disabled or if there is no file for the
    entry. Any other failure to remove the file is reported as a warning,
    including the traceback, and is not raised to the caller.

    section, key, dtype are passed to `_get_cache_fn()` to find the file
    that holds the entry.
    """
    if not self.enabled:
        return

    fn = self._get_cache_fn(section, key, dtype)
    self._write_debug('Clearing {section}.{key} from cache'.format(section=section, key=key))
    try:
        os.remove(fn)
    except Exception as e:
        # not every exception has an `errno` (eg, TypeError for a bad path):
        # default to None so that this handler can't raise AttributeError
        if getattr(e, 'errno', None) == errno.ENOENT:
            # file not found: nothing to clear
            return
        tb = traceback.format_exc()
        self._report_warning('Clearing cache from {fn!r} failed: {tb}'.format(fn=fn, tb=tb))
def _validate(self, data, min_ver): def _validate(self, data, min_ver):
version = traverse_obj(data, self._VERSION_KEY) version = traverse_obj(data, self._VERSION_KEY)
if not version: # Backward compatibility if not version: # Backward compatibility
@ -94,17 +110,21 @@ class Cache(object):
return default return default
cache_fn = self._get_cache_fn(section, key, dtype) cache_fn = self._get_cache_fn(section, key, dtype)
with compat_contextlib_suppress(IOError): # If no cache available try:
with open(cache_fn, encoding='utf-8') as cachef:
self._write_debug('Loading {section}.{key} from cache'.format(section=section, key=key), only_once=True)
return self._validate(json.load(cachef), min_ver)
except (ValueError, KeyError):
try: try:
with open(cache_fn, encoding='utf-8') as cachef: file_size = 'size: %d' % os.path.getsize(cache_fn)
self._write_debug('Loading {section}.{key} from cache'.format(section=section, key=key), only_once=True) except (OSError, IOError) as oe:
return self._validate(json.load(cachef), min_ver) file_size = error_to_compat_str(oe)
except (ValueError, KeyError): self._report_warning('Cache retrieval from %s failed (%s)' % (cache_fn, file_size))
try: except Exception as e:
file_size = os.path.getsize(cache_fn) if getattr(e, 'errno') == errno.ENOENT:
except (OSError, IOError) as oe: # no cache available
file_size = error_to_compat_str(oe) return
self._report_warning('Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) self._report_warning('Cache retrieval from %s failed' % (cache_fn,))
return default return default

View File

@ -49,6 +49,7 @@ from ..utils import (
parse_duration, parse_duration,
parse_qs, parse_qs,
qualities, qualities,
remove_end,
remove_start, remove_start,
smuggle_url, smuggle_url,
str_or_none, str_or_none,
@ -1584,6 +1585,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
} }
_PLAYER_JS_VARIANT_MAP = (
('main', 'player_ias.vflset/en_US/base.js'),
('tce', 'player_ias_tce.vflset/en_US/base.js'),
('tv', 'tv-player-ias.vflset/tv-player-ias.js'),
('tv_es6', 'tv-player-es6.vflset/tv-player-es6.js'),
('phone', 'player-plasma-ias-phone-en_US.vflset/base.js'),
('tablet', 'player-plasma-ias-tablet-en_US.vflset/base.js'),
)
@classmethod @classmethod
def suitable(cls, url): def suitable(cls, url):
if parse_qs(url).get('list', [None])[0]: if parse_qs(url).get('list', [None])[0]:
@ -1631,36 +1641,90 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError( raise ExtractorError(
'Cannot identify player %r' % (player_url,), cause=e) 'Cannot identify player %r' % (player_url,), cause=e)
def _load_player(self, video_id, player_url, fatal=True, player_id=None): def _player_js_cache_key(self, player_url, extra_id=None, _cache={}):
if not player_id: if player_url not in _cache:
player_id = self._extract_player_info(player_url) player_id = self._extract_player_info(player_url)
if player_id not in self._code_cache: player_path = remove_start(
compat_urllib_parse.urlparse(player_url).path,
'/s/player/{0}/'.format(player_id))
variant = next((k for k, v in self._PLAYER_JS_VARIANT_MAP
if v == player_path), None)
if not variant:
variant = next(
(k for k, v in self._PLAYER_JS_VARIANT_MAP
if re.match(re.escape(v).replace('en_US', r'\w+') + '$', player_path)),
None)
if not variant:
self.write_debug(
'Unable to determine player JS variant\n'
' player = {0}'.format(player_url), only_once=True)
variant = re.sub(r'[^a-zA-Z0-9]', '_', remove_end(player_path, '.js'))
_cache[player_url] = join_nonempty(player_id, variant)
if extra_id:
extra_id = '-'.join((_cache[player_url], extra_id))
assert os.path.basename(extra_id) == extra_id
return extra_id
return _cache[player_url]
def _load_player(self, video_id, player_url, fatal=True):
player_js_key = self._player_js_cache_key(player_url)
if player_js_key not in self._code_cache:
code = self._download_webpage( code = self._download_webpage(
player_url, video_id, fatal=fatal, player_url, video_id, fatal=fatal,
note='Downloading player ' + player_id, note='Downloading player {0}'.format(player_js_key),
errnote='Download of %s failed' % player_url) errnote='Download of {0} failed'.format(player_url))
if code: if code:
self._code_cache[player_id] = code self._code_cache[player_js_key] = code
return self._code_cache[player_id] if fatal else self._code_cache.get(player_id) return self._code_cache.get(player_js_key)
def _load_player_data_from_cache(self, name, player_url, extra_id=None, min_ver='2025.04.07'):
    """Return cached data derived from the player JS, or a false value.

    The in-memory `_player_cache` is consulted first, then `self.cache`;
    data found by `self.cache.load()` is also saved in `_player_cache`
    so that later calls don't need to load it again.

    name        kind of data: the cache section is 'youtube-<name>'
    player_url  URL of the player JS, used to derive the cache key
    extra_id    optional extra component for the cache key
    min_ver     passed to `self.cache.load()` as `min_ver`; accepted here
                because the signature function lookup passes it explicitly
    """
    cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id))
    data = self._player_cache.get(cache_id)
    if data:
        return data

    data = self.cache.load(*cache_id, min_ver=min_ver)
    if data:
        # remember it for the rest of this run
        self._player_cache[cache_id] = data

    return data
def _store_player_data_to_cache(self, name, player_url, data, extra_id=None):
cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id))
if cache_id not in self._player_cache:
self.cache.store(cache_id[0], cache_id[1], data)
self._player_cache[cache_id] = data
def _remove_player_data_from_cache(self, name, player_url, extra_id=None):
cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id))
if cache_id in self._player_cache:
self.cache.clear(*cache_id)
self._player_cache.pop(cache_id, None)
def _extract_signature_function(self, video_id, player_url, example_sig): def _extract_signature_function(self, video_id, player_url, example_sig):
player_id = self._extract_player_info(player_url) # player_id = self._extract_player_info(player_url)
# Read from filesystem cache # Read from filesystem cache
func_id = 'js_{0}_{1}'.format( extra_id = self._signature_cache_id(example_sig)
player_id, self._signature_cache_id(example_sig)) self.write_debug('Extracting signature function {0}-{1}'.format(player_url, extra_id))
assert os.path.basename(func_id) == func_id cache_spec, code = self._load_player_data_from_cache(
'sigfuncs', player_url, extra_id=extra_id, min_ver='2025.04.07'
self.write_debug('Extracting signature function {0}'.format(func_id)) ), None
cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.04.07'), None
if not cache_spec: if not cache_spec:
code = self._load_player(video_id, player_url, player_id) code = self._load_player(video_id, player_url)
if code: if code:
res = self._parse_sig_js(code) res = self._parse_sig_js(code)
test_string = ''.join(map(compat_chr, range(len(example_sig)))) test_string = ''.join(map(compat_chr, range(len(example_sig))))
cache_spec = [ord(c) for c in res(test_string)] cache_spec = [ord(c) for c in res(test_string)]
self.cache.store('youtube-sigfuncs', func_id, cache_spec) self._store_player_data_to_cache(
'sigfuncs', player_url, cache_spec, extra_id=extra_id)
else:
self.report_warning(
'Failed to compute signature function {0}-{1}'.format(
player_url, extra_id))
return lambda s: ''.join(s[i] for i in cache_spec) return lambda s: ''.join(s[i] for i in cache_spec)
@ -1814,6 +1878,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_n_function_name(self, jscode): def _extract_n_function_name(self, jscode):
func_name, idx = None, None func_name, idx = None, None
def generic_n_function_search(func_name=None):
return self._search_regex(
r'''(?xs)
(?:(?<=[^\w$])|^) # instead of \b, which ignores $
(?P<name>%s)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\)
\s*\{(?:(?!};).)+?(?:
["']enhanced_except_ |
return\s*(?P<q>"|')[a-zA-Z\d-]+_w8_(?P=q)\s*\+\s*[\w$]+
)
''' % (func_name or r'(?!\d)[a-zA-Z\d_$]+',), jscode,
'Initial JS player n function name', group='name',
default=None if func_name else NO_DEFAULT)
# these special cases are redundant and probably obsolete (2025-04): # these special cases are redundant and probably obsolete (2025-04):
# they make the tests run ~10% faster without fallback warnings # they make the tests run ~10% faster without fallback warnings
r""" r"""
@ -1854,26 +1932,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?(idx)|\[\s*)(?P<nfunc>(?!\d)[\w$]+)(?(idx)|\s*\]) (?(idx)|\[\s*)(?P<nfunc>(?!\d)[\w$]+)(?(idx)|\s*\])
\s*?[;\n] \s*?[;\n]
''', jscode): ''', jscode):
func_name = self._search_regex( fn = self._search_regex(
r'[;,]\s*(function\s+)?({0})(?(1)|\s*=\s*function)\s*\((?!\d)[\w$]+\)\s*\{1}(?!\s*return\s)'.format( r'[;,]\s*(function\s+)?({0})(?(1)|\s*=\s*function)\s*\((?!\d)[\w$]+\)\s*\{1}(?!\s*return\s)'.format(
re.escape(m.group('nfunc')), '{'), re.escape(m.group('nfunc')), '{'),
jscode, 'Initial JS player n function name (2)', group=2, default=None) jscode, 'Initial JS player n function name (2)', group=2, default=None)
if func_name: if fn:
func_name = fn
idx = m.group('idx') idx = m.group('idx')
break if generic_n_function_search(func_name):
# don't look any further
break
# thx bashonly: yt-dlp/yt-dlp/pull/10611 # thx bashonly: yt-dlp/yt-dlp/pull/10611
if not func_name: if not func_name:
self.report_warning('Falling back to generic n function search', only_once=True) self.report_warning('Falling back to generic n function search', only_once=True)
return self._search_regex( return generic_n_function_search()
r'''(?xs)
(?:(?<=[^\w$])|^) # instead of \b, which ignores $
(?P<name>(?!\d)[a-zA-Z\d_$]+)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\)
\s*\{(?:(?!};).)+?(?:
["']enhanced_except_ |
return\s*(?P<q>"|')[a-zA-Z\d-]+_w8_(?P=q)\s*\+\s*[\w$]+
)
''', jscode, 'Initial JS player n function name', group='name')
if not idx: if not idx:
return func_name return func_name
@ -1885,22 +1959,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_n_function_code(self, video_id, player_url): def _extract_n_function_code(self, video_id, player_url):
player_id = self._extract_player_info(player_url) player_id = self._extract_player_info(player_url)
func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.04.07') func_code = self._load_player_data_from_cache('nsig', player_url)
jscode = func_code or self._load_player(video_id, player_url) jscode = func_code or self._load_player(video_id, player_url)
jsi = JSInterpreter(jscode) jsi = JSInterpreter(jscode)
if func_code: if func_code:
return jsi, player_id, func_code return jsi, player_id, func_code
return self._extract_n_function_code_jsi(video_id, jsi, player_id)
def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None): return self._extract_n_function_code_jsi(video_id, jsi, player_id, player_url)
def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None, player_url=None):
func_name = self._extract_n_function_name(jsi.code) func_name = self._extract_n_function_name(jsi.code)
func_code = self._extract_sig_fn(jsi, func_name) func_code = self._extract_sig_fn(jsi, func_name)
if player_url:
if player_id: self._store_player_data_to_cache('nsig', player_url, func_code)
self.cache.store('youtube-nsig', player_id, func_code)
return jsi, player_id, func_code return jsi, player_id, func_code
def _extract_n_function_from_code(self, jsi, func_code): def _extract_n_function_from_code(self, jsi, func_code):
@ -1933,7 +2006,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
n_param = n_param[-1] n_param = n_param[-1]
n_response = decrypt_nsig(n_param)(n_param, video_id, player_url) n_response = decrypt_nsig(n_param)(n_param, video_id, player_url)
if n_response is None: if n_response is None:
# give up if descrambling failed # give up and forget cached data if descrambling failed
self._remove_player_data_from_cache('nsig', player_url)
break break
fmt['url'] = update_url_query(fmt['url'], {'n': n_response}) fmt['url'] = update_url_query(fmt['url'], {'n': n_response})
@ -1944,18 +2018,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
Required to tell API what sig/player version is in use. Required to tell API what sig/player version is in use.
""" """
sts = traverse_obj(ytcfg, 'STS', expected_type=int) sts = traverse_obj(ytcfg, 'STS', expected_type=int)
if not sts: if sts:
# Attempt to extract from player return sts
if player_url is None:
error_msg = 'Cannot extract signature timestamp without player_url.' if not player_url:
if fatal: error_msg = 'Cannot extract signature timestamp without player url'
raise ExtractorError(error_msg) if fatal:
self.report_warning(error_msg) raise ExtractorError(error_msg)
return self.report_warning(error_msg)
code = self._load_player(video_id, player_url, fatal=fatal) return None
sts = int_or_none(self._search_regex(
r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code or '', sts = self._load_player_data_from_cache('sts', player_url)
'JS player signature timestamp', group='sts', fatal=fatal)) if sts:
return sts
# Attempt to extract from player
code = self._load_player(video_id, player_url, fatal=fatal)
sts = int_or_none(self._search_regex(
r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code or '',
'JS player signature timestamp', group='sts', fatal=fatal))
if sts:
self._store_player_data_to_cache('sts', player_url, sts)
return sts return sts
def _mark_watched(self, video_id, player_response): def _mark_watched(self, video_id, player_response):