Compare commits

...

18 Commits

Author SHA1 Message Date
Zenon Mousmoulas
31376aea4a
Merge 96800222076e707e1f98364024aa4fcedb980810 into 2b4fbfce25902d557b86b003cf48f738129efce4 2025-03-26 07:44:33 +00:00
dirkf
2b4fbfce25 [YouTube] Support player 4fcd6e4a
thx seproDev, bashonly: yt-dlp/yt-dlp#12748
2025-03-26 02:27:25 +00:00
dirkf
1bc45b8b6c [JSInterp] Use , for join() with null/undefined argument
Eg: [1,2,3].join(null) -> '1,2,3'
2025-03-25 22:35:06 +00:00
dirkf
b982d77d0b [YouTube] Align signature tests with yt-dlp
thx bashonly, yt-dlp/yt-dlp#12725
2025-03-25 22:35:06 +00:00
dirkf
c55dbf4838 [YouTube] Update signature extraction for players 643afba4, 363db69b 2025-03-25 22:35:06 +00:00
dirkf
087d865230 [YouTube] Support new player URL patterns 2025-03-25 22:35:06 +00:00
dirkf
a4fc1151f1 [JSInterp] Improve indexing
* catch invalid list index with `ValueError` (eg [1, 2]['ab'] -> undefined)
* allow assignment outside existing list (eg var l = [1,2]; l[9] = 0;)
2025-03-25 22:35:05 +00:00
dirkf
a464c159e6 [YouTube] Make _extract_player_info() use _search_regex() 2025-03-25 22:35:05 +00:00
dirkf
7dca08eff0 [YouTube] Also get original of translated automatic captions 2025-03-25 22:35:05 +00:00
dirkf
2239ee7965 [YouTube] Get subtitles/automatic captions from both web and API responses 2025-03-25 22:35:05 +00:00
Zenon Mousmoulas
9680022207 Remove unused method 2021-12-13 00:08:02 +02:00
Zenon Mousmoulas
ae8fb74131 Fix typo
url -> origin_url
2021-11-19 08:23:55 +02:00
Zenon Mousmoulas
699390c40d Remove unnecessary quote escape 2021-11-13 08:47:38 +02:00
Zenon Mousmoulas
d303e1e05f GlomexEmbedIE: Reuse _VALID_URL in _extract_urls
* Let _extract_urls reuse _VALID_URL after making scheme optional and
  simplifying the query string part
* Upon an iframe match
  * Add the scheme to the matched URL, if necessary
  * Match the URL against the full _VALID_URL
2021-11-11 11:16:29 +02:00
Zenon Mousmoulas
4225c46d3b Revert to _VALID_URL to match video_id and integration
* Retrieve the last instance of said parameters that appears in the
  query string, rather than the first as was done previously
* Resolve the respective comment in #30212
2021-11-11 11:07:56 +02:00
Zenon Mousmoulas
abfc16a123 Regex fixup 2021-11-11 08:30:56 +02:00
Zenon Mousmoulas
6880bf4334 Force evaluation 2021-11-10 07:34:16 +02:00
Zenon Mousmoulas
f561e0d817 Add Glomex IEs
* Add new IEs
  * GlomexBaseIE: Base IE class
  * GlomexIE: Extract videos from video.glomex.com (by deferring to
    glomex:embed)
  * GlomexEmbedIE: Extract Glomex videos by matching the player URL
* Query the API to extract metadata, detect video formats and get the
  respective (JWT protected) stream/source URLs
* The API query may return one or more videos: the latter case is
  treated as a playlist
  * As this is otherwise identically handled, a separate IE was not
    deemed necessary
  * However title and description fields are not set for playlist
    results
    * They do not exist in the parent object; obtaining them from the
      first entry would not be indicative of the playlist content
    * As the playlist order is not always stable (this is true at least
      for related videos playlists), it makes writing test cases
      impossible
* Let GenericIE detect embeds by matching all three integration methods:
  * HTML: glomex-player tag or data attributes
  * Javascript: naive parsing of inline scripts for string constants
    assigned to integration parameters
  * Iframe: src attribute GlomexEmbedIE._VALID_URL
* Let GlomexIE and the former embed detection pass the origin URL to
  GlomexEmbedIE by smuggling it in the player URL, as this is an
  expected parameter in API requests
* Add test cases for both single videos and two playlist flavors
2021-11-08 08:17:47 +02:00
6 changed files with 394 additions and 48 deletions

View File

@ -84,6 +84,21 @@ _SIG_TESTS = [
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q',
),
(
'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'AAOAOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7vgpDL0QwbdV06sCIEzpWqMGkFR20CFOS21Tp-7vj_EMu-m37KtXJoOy1',
),
(
'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
),
(
'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0',
),
]
_NSIG_TESTS = [
@ -153,7 +168,7 @@ _NSIG_TESTS = [
),
(
'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js',
'-Txvy6bT5R6LqgnQNx', 'dcklJCnRUHbgSg',
'M92UUMHa8PdvPd3wyM', '3hPqLJsiNZx7yA',
),
(
'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js',
@ -173,7 +188,7 @@ _NSIG_TESTS = [
),
(
'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js',
'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ',
'aCi3iElgd2kq0bxVbQ', 'QX1y8jGb2IbZ0w',
),
(
'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js',
@ -231,10 +246,6 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/f6e09c70/player_ias_tce.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'jHbbkcaxm54',
),
(
'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'larxUlagTRAcSw',
),
(
'https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js',
'Sy4aDGc0VpYRR9ew_', '5UPOT1VhoZxNLQ',
@ -259,6 +270,22 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js',
'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA',
),
(
'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'larxUlagTRAcSw',
),
(
'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js',
'eWYu5d5YeY_4LyEDc', 'XJQqf-N7Xra3gg',
),
(
'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js',
'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A',
),
(
'https://www.youtube.com/s/player/4fcd6e4a/tv-player-ias.vflset/tv-player-ias.js',
'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A',
),
]
@ -271,6 +298,8 @@ class TestPlayerInfo(unittest.TestCase):
('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'),
('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'),
('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'),
('https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', 'e7567ecf'),
('https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', '643afba4'),
# obsolete
('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'),
('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'),
@ -280,8 +309,9 @@ class TestPlayerInfo(unittest.TestCase):
('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'),
('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'),
)
ie = YoutubeIE(FakeYDL({'cachedir': False}))
for player_url, expected_player_id in PLAYER_URLS:
player_id = YoutubeIE._extract_player_info(player_url)
player_id = ie._extract_player_info(player_url)
self.assertEqual(player_id, expected_player_id)
@ -301,8 +331,8 @@ class TestSignature(unittest.TestCase):
def t_factory(name, sig_func, url_pattern):
def make_tfunc(url, sig_input, expected_sig):
m = url_pattern.match(url)
assert m, '%r should follow URL format' % url
test_id = m.group('id')
assert m, '{0!r} should follow URL format'.format(url)
test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id'))
def test_func(self):
basename = 'player-{0}-{1}.js'.format(name, test_id)
@ -335,12 +365,16 @@ def n_sig(jscode, sig_input):
make_sig_test = t_factory(
'signature', signature, re.compile(r'.*(?:-|/player/)(?P<id>[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$'))
'signature', signature,
re.compile(r'''(?x)
.+/(?P<h5>html5)?player(?(h5)(?:-en_US)?-|/)(?P<id>[a-zA-Z0-9/._-]+)
(?(h5)/(?:watch_as3|html5player))?\.js$
'''))
for test_spec in _SIG_TESTS:
make_sig_test(*test_spec)
make_nsig_test = t_factory(
'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$'))
'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_/.-]+)\.js$'))
for test_spec in _NSIG_TESTS:
make_nsig_test(*test_spec)

View File

@ -1078,6 +1078,10 @@ from .rutube import (
RutubePersonIE,
RutubePlaylistIE,
)
from .glomex import (
GlomexIE,
GlomexEmbedIE,
)
from .rutv import RUTVIE
from .ruutu import RuutuIE
from .ruv import RuvIE

View File

@ -102,6 +102,7 @@ from .ustream import UstreamIE
from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .glomex import GlomexEmbedIE
from .limelight import LimelightBaseIE
from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE
@ -3400,6 +3401,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
# Look for Glomex embeds
glomex_urls = list(GlomexEmbedIE._extract_urls(webpage, url))
if glomex_urls:
return self.playlist_from_matches(
glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key())
# Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage)
if wapo_urls:

View File

@ -0,0 +1,279 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_parse_urlencode,
)
from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
try_get,
smuggle_url,
unsmuggle_url,
unescapeHTML,
)
class GlomexBaseIE(InfoExtractor):
# Origin used whenever no embedding page URL is supplied (see
# _unsmuggle_origin_url() and _download_api_data())
_DEFAULT_ORIGIN_URL = 'https://player.glomex.com/'
# Endpoint queried by _download_api_data() for video/playlist JSON
_API_URL = 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/'
@staticmethod
def _smuggle_origin_url(url, origin_url):
    """Return `url` with `origin_url` smuggled into it under the 'origin' key."""
    smuggled_data = {'origin': origin_url}
    return smuggle_url(url, smuggled_data)
@classmethod
def _unsmuggle_origin_url(cls, url, fallback_origin_url=None):
    """Split `url` into the plain URL and the origin stored under 'origin'.

    The default handed to `unsmuggle_url` carries `fallback_origin_url`
    when that is truthy, otherwise the class-level `_DEFAULT_ORIGIN_URL`.
    Returns a `(url, origin_url)` pair.
    """
    if fallback_origin_url:
        default_origin = fallback_origin_url
    else:
        default_origin = cls._DEFAULT_ORIGIN_URL
    plain_url, smuggled = unsmuggle_url(
        url, default={'origin': default_origin})
    return plain_url, smuggled['origin']
def _get_videoid_type(self, video_id):
_VIDEOID_TYPES = {
'v': 'video',
'pl': 'playlist',
'rl': 'related videos playlist',
'cl': 'curated playlist',
}
prefix = video_id.split('-')[0]
return _VIDEOID_TYPES.get(prefix, 'unknown type')
def _download_api_data(self, video_id, integration, current_url=None):
query = {
'integration_id': integration,
'playlist_id': video_id,
'current_url': current_url or self._DEFAULT_ORIGIN_URL,
}
video_id_type = self._get_videoid_type(video_id)
return self._download_json(
self._API_URL,
video_id, 'Downloading %s JSON' % video_id_type,
'Unable to download %s JSON' % video_id_type,
query=query)
def _download_and_extract_api_data(self, video_id, integration, current_url):
api_data = self._download_api_data(video_id, integration, current_url)
videos = api_data['videos']
if not videos:
raise ExtractorError('no videos found for %s' % video_id)
if len(videos) == 1:
return self._extract_api_data(videos[0], video_id)
# assume some kind of playlist
videos = [
self._extract_api_data(video, video_id)
for video in videos
]
return self.playlist_result(videos, video_id)
def _extract_api_data(self, video, video_id):
if video.get('error_code') == 'contentGeoblocked':
self.raise_geo_restricted(countries=video['geo_locations'])
info = self._extract_info(video, video_id)
info['formats'] = self._extract_formats(video, video_id)
return info
@staticmethod
def _extract_info(video, video_id=None, require_title=True):
    """Map one API video entry to info-dict metadata (without formats).

    `video` is the API's dict for a single clip. 'title' is mandatory
    (KeyError if missing) unless `require_title` is false. The 'id' is the
    entry's 'clip_id', falling back to `video_id`. 'thumbnails' is None
    when the entry lists no usable images.
    """
    title = video['title'] if require_title else video.get('title')

    def append_image_url(url, default='profile:player-960x540'):
        # Image URLs get a profile suffix appended; a missing URL stays None
        if url:
            return '%s/%s' % (url, default)

    thumbnail = append_image_url(try_get(video, lambda x: x['image']['url']))

    thumbnails = []
    # 'images' may be missing or null
    for image in video.get('images') or []:
        image_url = append_image_url(
            image.get('url') if isinstance(image, dict) else None)
        if not image_url:
            # a thumbnail entry without a URL is useless downstream
            continue
        thumb = {'url': image_url, 'width': 960, 'height': 540}
        if 'id' in image:
            thumb['id'] = image['id']
        thumbnails.append(thumb)

    return {
        'id': video.get('clip_id') or video_id,
        'title': title,
        'description': video.get('description'),
        'thumbnail': thumbnail,
        'thumbnails': thumbnails or None,
        'duration': int_or_none(video.get('clip_duration')),
        'timestamp': video.get('created_at'),
    }
def _extract_formats(self, options, video_id):
formats = []
for format_id, format_url in options['source'].items():
ext = determine_ext(format_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4', m3u8_id=format_id,
fatal=False))
else:
formats.append({
'url': format_url,
'format_id': format_id,
})
if options.get('language'):
for format in formats:
format['language'] = options.get('language')
self._sort_formats(formats)
return formats
class GlomexIE(GlomexBaseIE):
IE_NAME = 'glomex'
IE_DESC = 'Glomex videos'
# Video pages on video.glomex.com: one path segment, then the ID, which is
# 'v-' plus everything up to the next hyphen
_VALID_URL = r'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)'
# Hard-coded integration ID for video.glomex.com
_INTEGRATION_ID = '19syy24xjn1oqlpc'
_TEST = {
'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel',
'md5': 'cec33a943c4240c9cb33abea8c26242e',
'info_dict': {
'id': 'v-cb24uwg77hgh',
'ext': 'mp4',
'title': 'md5:38a90cedcfadd72982c81acf13556e0c',
'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8',
'duration': 29600,
'timestamp': 1619895017,
'upload_date': '20210501',
'age_limit': None,
},
}
def _real_extract(self, url):
    """Hand a video.glomex.com page over to the glomex:embed extractor."""
    video_id = self._match_id(url)
    # Defer to glomex:embed IE: Build and return a player URL using the
    # matched video ID and the hard-coded integration ID
    player_url = GlomexEmbedIE.build_player_url(
        video_id, self._INTEGRATION_ID, url)
    return self.url_result(player_url, GlomexEmbedIE.ie_key(), video_id)
class GlomexEmbedIE(GlomexBaseIE):
IE_NAME = 'glomex:embed'
IE_DESC = 'Glomex embedded videos'
# Player page that build_player_url() appends its query string to
_BASE_PLAYER_URL = 'https://player.glomex.com/integration/1/iframe-player.html'
# The query string may hold any parameters in any order; integrationId and
# playlistId are captured as `integration` and `id`, and neither is required
# by the pattern itself. A repeated parameter leaves its last value in the group.
_VALID_URL = r'''(?x)https?://player\.glomex\.com/integration/[^/]+/iframe-player\.html
\?(?:(?:integrationId=(?P<integration>[^&#]+)|playlistId=(?P<id>[^&#]+)|[^&=#]+=[^&#]+)&?)+'''
_TESTS = [{
# single video ('v-' ID)
'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
'info_dict': {
'id': 'v-cfa6lye0dkdd-sf',
'ext': 'mp4',
'timestamp': 1635337199,
'duration': 133080,
'upload_date': '20211027',
'description': 'md5:e741185fc309310ff5d0c789b437be66',
'title': 'md5:35647293513a6c92363817a0fb0a7961',
},
'params': {
'skip_download': True,
},
}, {
# related videos playlist ('rl-' ID), with extra query parameters
'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0',
'info_dict': {
'id': 'rl-vcb49w1fb592p',
},
'playlist_count': 100,
'params': {
'skip_download': True,
},
}, {
# curated playlist ('cl-' ID), playlistId given before integrationId
'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc',
'info_dict': {
'id': 'cl-bgqaata6aw8x',
},
'playlist_mincount': 2,
'params': {
'skip_download': True,
},
}]
@classmethod
def build_player_url(cls, video_id, integration, origin_url=None):
    """Compose a player URL for `video_id` and `integration`.

    When `origin_url` is not None it is smuggled into the returned URL.
    """
    params = {
        'playlistId': video_id,
        'integrationId': integration,
    }
    player_url = '%s?%s' % (
        cls._BASE_PLAYER_URL, compat_urllib_parse_urlencode(params))
    if origin_url is None:
        return player_url
    return cls._smuggle_origin_url(player_url, origin_url)
# Generator over the Glomex embeds found in `webpage`. Three kinds of embed
# are recognised by one combined pattern: an <iframe> whose src is a player
# URL, a <glomex-player>/<div> tag carrying data-* attributes, and an inline
# <script> assigning integrationId/playlistId. Each hit is yielded as a
# player URL; `origin_url` is the URL of the page being scanned.
@classmethod
def _extract_urls(cls, webpage, origin_url):
# make the scheme in _VALID_URL optional
_URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1]
# simplify the query string part of _VALID_URL; after extracting iframe
# src, the URL will be matched again
_URL_RE = _URL_RE.split(r'\?', 1)[0] + r'\?(?:(?!(?P=_q1)).)+'
# https://docs.glomex.com/publisher/video-player-integration/javascript-api/
EMBED_RE = r'''(?x)
(?:
<iframe[^>]+?src=(?P<_q1>%(quot_re)s)
(?P<url>%(url_re)s)(?P=_q1)|
<(?P<html_tag>glomex-player|div)(?:
data-integration-id=(?P<_q2>%(quot_re)s)(?P<integration_html>(?:(?!(?P=_q2)).)+)(?P=_q2)|
data-playlist-id=(?P<_q3>%(quot_re)s)(?P<id_html>(?:(?!(?P=_q3)).)+)(?P=_q3)|
data-glomex-player=(?P<_q4>%(quot_re)s)(?P<glomex_player>true)(?P=_q4)|
[^>]*?
)+>|
# naive parsing of inline scripts for hard-coded integration parameters
<(?P<script_tag>script)[^<]*?>(?:
(?P<_stjs1>dataset\.)?integrationId\s*(?(_stjs1)=|:)\s*
(?P<_q5>%(quot_re)s)(?P<integration_js>(?:(?!(?P=_q5)).)+)(?P=_q5)\s*(?(_stjs1);|,)?|
(?P<_stjs2>dataset\.)?playlistId\s*(?(_stjs2)=|:)\s*
(?P<_q6>%(quot_re)s)(?P<id_js>(?:(?!(?P=_q6)).)+)(?P=_q6)\s*(?(_stjs2);|,)?|
(?:\s|.)*?
)+</script>
)
''' % {'quot_re': r'["\']', 'url_re': _URL_RE}
# Every match comes from exactly one of the three alternatives; which one
# is told by whether `url`, `html_tag` or `script_tag` was captured
for mobj in re.finditer(EMBED_RE, webpage):
url, html_tag, video_id_html, integration_html, glomex_player, \
script_tag, video_id_js, integration_js = \
mobj.group('url', 'html_tag', 'id_html',
'integration_html', 'glomex_player', 'script_tag',
'id_js', 'integration_js')
if url:
url = unescapeHTML(url)
# protocol-relative src: borrow the scheme of the embedding page
if url.startswith('//'):
scheme = compat_urllib_parse_urlparse(origin_url).scheme \
if origin_url else 'https'
url = '%s:%s' % (scheme, url)
# the iframe alternative used a simplified query pattern, so check the
# result against the full _VALID_URL
if not cls.suitable(url):
continue
yield cls._smuggle_origin_url(url, origin_url)
elif html_tag:
# a plain <div> only counts when it has data-glomex-player="true"
if html_tag == "div" and not glomex_player:
continue
# both parameters are needed to build a player URL
if not video_id_html or not integration_html:
continue
yield cls.build_player_url(video_id_html, integration_html,
origin_url)
elif script_tag:
# both parameters are needed to build a player URL
if not video_id_js or not integration_js:
continue
yield cls.build_player_url(video_id_js, integration_js,
origin_url)
def _real_extract(self, url):
url, origin_url = self._unsmuggle_origin_url(url)
# must return a valid match since it was already tested when selecting the IE
try:
matches = self._VALID_URL_RE.match(url).groupdict()
except AttributeError:
matches = re.match(self._VALID_URL, url).groupdict()
# id is not enforced in the pattern, so do it now; ditto integration
video_id = matches['id']
integration = matches['integration']
return self._download_and_extract_api_data(video_id, integration,
origin_url)

View File

@ -692,9 +692,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'invidious': '|'.join(_INVIDIOUS_SITES),
}
_PLAYER_INFO_RE = (
r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})//(?:tv-)?player',
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/(?:tv-)?player',
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias(?:_tce)?\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
r'\b(?P<id>vfl[a-zA-Z0-9_-]{6,})\b.*?\.js$',
)
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
@ -1626,15 +1626,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
""" Return a string representation of a signature """
return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
@classmethod
def _extract_player_info(cls, player_url):
for player_re in cls._PLAYER_INFO_RE:
id_m = re.search(player_re, player_url)
if id_m:
break
else:
raise ExtractorError('Cannot identify player %r' % player_url)
return id_m.group('id')
def _extract_player_info(self, player_url):
try:
return self._search_regex(
self._PLAYER_INFO_RE, player_url, 'player info', group='id')
except ExtractorError as e:
raise ExtractorError(
'Cannot identify player %r' % (player_url,), cause=e)
def _load_player(self, video_id, player_url, fatal=True, player_id=None):
if not player_id:
@ -1711,6 +1709,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
' return %s\n') % (signature_id_tuple, expr_code)
self.to_screen('Extracted signature function:\n' + code)
def _extract_sig_fn(self, jsi, funcname):
var_ay = self._search_regex(
r'''(?x)
(?:\*/|\{|\n|^)\s*(?:'[^']+'\s*;\s*)
(var\s*[\w$]+\s*=\s*(?:
('|")(?:\\\2|(?!\2).)+\2\s*\.\s*split\(\s*('|")\W+\3\s*\)|
\[\s*(?:('|")(?:\\\4|(?!\4).)*\4\s*(?:(?=\])|,\s*))+\]
))(?=\s*[,;])
''', jsi.code, 'useful values', default='')
sig_fn = jsi.extract_function_code(funcname)
if var_ay:
sig_fn = (sig_fn[0], ';\n'.join((var_ay, sig_fn[1])))
return sig_fn
def _parse_sig_js(self, jscode):
# Examples where `sig` is funcname:
# sig=function(a){a=a.split(""); ... ;return a.join("")};
@ -1736,8 +1751,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
jscode, 'Initial JS player signature function name', group='sig')
jsi = JSInterpreter(jscode)
initial_function = jsi.extract_function(funcname)
return lambda s: initial_function([s])
initial_function = self._extract_sig_fn(jsi, funcname)
func = jsi.extract_function_from_code(*initial_function)
return lambda s: func([s])
def _cached(self, func, *cache_id):
def inner(*args, **kwargs):
@ -1856,15 +1875,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None):
var_ay = self._search_regex(
r'(?:[;\s]|^)\s*(var\s*[\w$]+\s*=\s*"(?:\\"|[^"])+"\s*\.\s*split\("\W+"\))(?=\s*[,;])',
jsi.code, 'useful values', default='')
func_name = self._extract_n_function_name(jsi.code)
func_code = jsi.extract_function_code(func_name)
if var_ay:
func_code = (func_code[0], ';\n'.join((var_ay, func_code[1])))
func_code = self._extract_sig_fn(jsi, func_name)
if player_id:
self.cache.store('youtube-nsig', player_id, func_code)
@ -2136,7 +2149,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_details = merge_dicts(*traverse_obj(
(player_response, api_player_response),
(Ellipsis, 'videoDetails', T(dict))))
player_response.update(api_player_response or {})
player_response.update(filter_dict(
api_player_response or {}, cndn=lambda k, _: k != 'captions'))
player_response['videoDetails'] = video_details
def is_agegated(playability):
@ -2566,8 +2580,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
pctr = traverse_obj(
player_response,
('captions', 'playerCaptionsTracklistRenderer', T(dict)))
(player_response, api_player_response),
(Ellipsis, 'captions', 'playerCaptionsTracklistRenderer', T(dict)))
if pctr:
def process_language(container, base_url, lang_code, query):
lang_subs = []
@ -2584,20 +2598,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def process_subtitles():
subtitles = {}
for caption_track in traverse_obj(pctr, (
'captionTracks', lambda _, v: v.get('baseUrl'))):
Ellipsis, 'captionTracks', lambda _, v: (
v.get('baseUrl') and v.get('languageCode')))):
base_url = self._yt_urljoin(caption_track['baseUrl'])
if not base_url:
continue
lang_code = caption_track['languageCode']
if caption_track.get('kind') != 'asr':
lang_code = caption_track.get('languageCode')
if not lang_code:
continue
process_language(
subtitles, base_url, lang_code, {})
continue
automatic_captions = {}
process_language(
automatic_captions, base_url, lang_code, {})
for translation_language in traverse_obj(pctr, (
'translationLanguages', lambda _, v: v.get('languageCode'))):
Ellipsis, 'translationLanguages', lambda _, v: v.get('languageCode'))):
translation_language_code = translation_language['languageCode']
process_language(
automatic_captions, base_url, translation_language_code,

View File

@ -678,7 +678,7 @@ class JSInterpreter(object):
return len(obj)
try:
return obj[int(idx)] if isinstance(obj, list) else obj[compat_str(idx)]
except (TypeError, KeyError, IndexError) as e:
except (TypeError, KeyError, IndexError, ValueError) as e:
# allow_undefined is None gives correct behaviour
if allow_undefined or (
allow_undefined is None and not isinstance(e, TypeError)):
@ -1038,6 +1038,10 @@ class JSInterpreter(object):
left_val = self._index(left_val, idx)
if isinstance(idx, float):
idx = int(idx)
if isinstance(left_val, list) and len(left_val) <= int_or_none(idx, default=-1):
# JS Array is a sparsely assignable list
# TODO: handle extreme sparsity without memory bloat, eg using auxiliary dict
left_val.extend((idx - len(left_val) + 1) * [JS_Undefined])
left_val[idx] = self._operator(
m.group('op'), self._index(left_val, idx) if m.group('op') else None,
m.group('expr'), expr, local_vars, allow_recursion)
@ -1204,9 +1208,10 @@ class JSInterpreter(object):
elif member == 'join':
assertion(isinstance(obj, list), 'must be applied on a list')
assertion(len(argvals) <= 1, 'takes at most one argument')
return (',' if len(argvals) == 0 else argvals[0]).join(
('' if x in (None, JS_Undefined) else _js_toString(x))
for x in obj)
return (',' if len(argvals) == 0 or argvals[0] in (None, JS_Undefined)
else argvals[0]).join(
('' if x in (None, JS_Undefined) else _js_toString(x))
for x in obj)
elif member == 'reverse':
assertion(not argvals, 'does not take any arguments')
obj.reverse()
@ -1364,19 +1369,21 @@ class JSInterpreter(object):
code, _ = self._separate_at_paren(func_m.group('code')) # refine the match
return self.build_arglist(func_m.group('args')), code
def extract_function(self, funcname):
def extract_function(self, funcname, *global_stack):
return function_with_repr(
self.extract_function_from_code(*self.extract_function_code(funcname)),
self.extract_function_from_code(*itertools.chain(
self.extract_function_code(funcname), global_stack)),
'F<%s>' % (funcname,))
def extract_function_from_code(self, argnames, code, *global_stack):
local_vars = {}
start = None
while True:
mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code)
mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code[start:])
if mobj is None:
break
start, body_start = mobj.span()
start, body_start = ((start or 0) + x for x in mobj.span())
body, remaining = self._separate_at_paren(code[body_start - 1:])
name = self._named_object(local_vars, self.extract_function_from_code(
[x.strip() for x in mobj.group('args').split(',')],