Compare commits

...

15 Commits

Author SHA1 Message Date
Olivier Trichet
86195979fb
Merge 72db2172897a76665414fc9da3fc79f096df9fab into 2b4fbfce25902d557b86b003cf48f738129efce4 2025-03-26 07:07:56 +00:00
dirkf
2b4fbfce25 [YouTube] Support player 4fcd6e4a
thx seproDev, bashonly: yt-dlp/yt-dlp#12748
2025-03-26 02:27:25 +00:00
dirkf
1bc45b8b6c [JSInterp] Use , for join() with null/undefined argument
Eg: [1,2,3].join(null) -> '1,2,3'
2025-03-25 22:35:06 +00:00
dirkf
b982d77d0b [YouTube] Align signature tests with yt-dlp
thx bashonly, yt-dlp/yt-dlp#12725
2025-03-25 22:35:06 +00:00
dirkf
c55dbf4838 [YouTube] Update signature extraction for players 643afba4, 363db69b 2025-03-25 22:35:06 +00:00
dirkf
087d865230 [YouTube] Support new player URL patterns 2025-03-25 22:35:06 +00:00
dirkf
a4fc1151f1 [JSInterp] Improve indexing
* catch invalid list index with `ValueError` (eg [1, 2]['ab'] -> undefined)
* allow assignment outside existing list (eg var l = [1,2]; l[9] = 0;)
2025-03-25 22:35:05 +00:00
dirkf
a464c159e6 [YouTube] Make _extract_player_info() use _search_regex() 2025-03-25 22:35:05 +00:00
dirkf
7dca08eff0 [YouTube] Also get original of translated automatic captions 2025-03-25 22:35:05 +00:00
dirkf
2239ee7965 [YouTube] Get subtitles/automatic captions from both web and API responses 2025-03-25 22:35:05 +00:00
Olivier Trichet
72db217289 [RadioFrance] Extractor fo thematic webradios 2022-12-22 14:22:19 -05:00
Olivier Trichet
fc933e686b [RadioFrance] Refactoring 2022-12-22 13:01:10 -05:00
Olivier Trichet
ea02c40539 [RadioFrance] Extractor for podcast playlists 2022-12-22 13:00:54 -05:00
Olivier Trichet
7270ecf3d6 [RadioFrance] Extractor for podcast of Radio France stations 2022-12-22 13:00:17 -05:00
Olivier Trichet
dade9111f1 [RadioFrance] Remove old Radio France stations extractors
These are not working anymore after their respectives websites were
merged into www.radiofrance.fr.
2022-12-22 13:00:08 -05:00
7 changed files with 381 additions and 227 deletions

View File

@ -84,6 +84,21 @@ _SIG_TESTS = [
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q', '0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q',
), ),
(
'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'AAOAOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7vgpDL0QwbdV06sCIEzpWqMGkFR20CFOS21Tp-7vj_EMu-m37KtXJoOy1',
),
(
'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
),
(
'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0',
),
] ]
_NSIG_TESTS = [ _NSIG_TESTS = [
@ -153,7 +168,7 @@ _NSIG_TESTS = [
), ),
( (
'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js',
'-Txvy6bT5R6LqgnQNx', 'dcklJCnRUHbgSg', 'M92UUMHa8PdvPd3wyM', '3hPqLJsiNZx7yA',
), ),
( (
'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js',
@ -173,7 +188,7 @@ _NSIG_TESTS = [
), ),
( (
'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js',
'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ', 'aCi3iElgd2kq0bxVbQ', 'QX1y8jGb2IbZ0w',
), ),
( (
'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js',
@ -231,10 +246,6 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/f6e09c70/player_ias_tce.vflset/en_US/base.js', 'https://www.youtube.com/s/player/f6e09c70/player_ias_tce.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'jHbbkcaxm54', 'W9HJZKktxuYoDTqW', 'jHbbkcaxm54',
), ),
(
'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'larxUlagTRAcSw',
),
( (
'https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', 'https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js',
'Sy4aDGc0VpYRR9ew_', '5UPOT1VhoZxNLQ', 'Sy4aDGc0VpYRR9ew_', '5UPOT1VhoZxNLQ',
@ -259,6 +270,22 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', 'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js',
'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA', 'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA',
), ),
(
'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'larxUlagTRAcSw',
),
(
'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js',
'eWYu5d5YeY_4LyEDc', 'XJQqf-N7Xra3gg',
),
(
'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js',
'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A',
),
(
'https://www.youtube.com/s/player/4fcd6e4a/tv-player-ias.vflset/tv-player-ias.js',
'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A',
),
] ]
@ -271,6 +298,8 @@ class TestPlayerInfo(unittest.TestCase):
('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'),
('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'),
('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'),
('https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', 'e7567ecf'),
('https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', '643afba4'),
# obsolete # obsolete
('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'),
('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'),
@ -280,8 +309,9 @@ class TestPlayerInfo(unittest.TestCase):
('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'),
('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'),
) )
ie = YoutubeIE(FakeYDL({'cachedir': False}))
for player_url, expected_player_id in PLAYER_URLS: for player_url, expected_player_id in PLAYER_URLS:
player_id = YoutubeIE._extract_player_info(player_url) player_id = ie._extract_player_info(player_url)
self.assertEqual(player_id, expected_player_id) self.assertEqual(player_id, expected_player_id)
@ -301,8 +331,8 @@ class TestSignature(unittest.TestCase):
def t_factory(name, sig_func, url_pattern): def t_factory(name, sig_func, url_pattern):
def make_tfunc(url, sig_input, expected_sig): def make_tfunc(url, sig_input, expected_sig):
m = url_pattern.match(url) m = url_pattern.match(url)
assert m, '%r should follow URL format' % url assert m, '{0!r} should follow URL format'.format(url)
test_id = m.group('id') test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id'))
def test_func(self): def test_func(self):
basename = 'player-{0}-{1}.js'.format(name, test_id) basename = 'player-{0}-{1}.js'.format(name, test_id)
@ -335,12 +365,16 @@ def n_sig(jscode, sig_input):
make_sig_test = t_factory( make_sig_test = t_factory(
'signature', signature, re.compile(r'.*(?:-|/player/)(?P<id>[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$')) 'signature', signature,
re.compile(r'''(?x)
.+/(?P<h5>html5)?player(?(h5)(?:-en_US)?-|/)(?P<id>[a-zA-Z0-9/._-]+)
(?(h5)/(?:watch_as3|html5player))?\.js$
'''))
for test_spec in _SIG_TESTS: for test_spec in _SIG_TESTS:
make_sig_test(*test_spec) make_sig_test(*test_spec)
make_nsig_test = t_factory( make_nsig_test = t_factory(
'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$')) 'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_/.-]+)\.js$'))
for test_spec in _NSIG_TESTS: for test_spec in _NSIG_TESTS:
make_nsig_test(*test_spec) make_nsig_test(*test_spec)

View File

@ -413,8 +413,6 @@ from .foxnews import (
FoxNewsArticleIE, FoxNewsArticleIE,
) )
from .foxsports import FoxSportsIE from .foxsports import FoxSportsIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import ( from .francetv import (
FranceTVIE, FranceTVIE,
FranceTVSiteIE, FranceTVSiteIE,
@ -1011,7 +1009,11 @@ from .radiocanada import (
from .radiode import RadioDeIE from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE from .radiofrance import (
RadioFrancePodcastEpisodeIE,
RadioFrancePodcastPlaylistIE,
RadioFranceWebradioIE,
)
from .rai import ( from .rai import (
RaiPlayIE, RaiPlayIE,
RaiPlayLiveIE, RaiPlayLiveIE,

View File

@ -1,73 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
extract_attributes,
int_or_none,
)
class FranceCultureIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
'info_dict': {
'id': 'rendez-vous-au-pays-des-geeks',
'display_id': 'rendez-vous-au-pays-des-geeks',
'ext': 'mp3',
'title': 'Rendez-vous au pays des geeks',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140301',
'timestamp': 1393700400,
'vcodec': 'none',
}
}, {
# no thumbnail
'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_data = extract_attributes(self._search_regex(
r'''(?sx)
(?:
</h1>|
<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>
).*?
(<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>)
''',
webpage, 'video data'))
video_url = video_data.get('data-url') or video_data['data-asset-source']
title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage)
description = self._html_search_regex(
r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>',
webpage, 'description', default=None)
thumbnail = self._search_regex(
r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',
webpage, 'thumbnail', default=None)
uploader = self._html_search_regex(
r'(?s)<span class="author">(.*?)</span>',
webpage, 'uploader', default=None)
ext = determine_ext(video_url.lower())
return {
'id': display_id,
'display_id': display_id,
'url': video_url,
'title': title,
'description': description,
'thumbnail': thumbnail,
'ext': ext,
'vcodec': 'none' if ext == 'mp3' else None,
'uploader': uploader,
'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')),
'duration': int_or_none(video_data.get('data-duration')),
}

View File

@ -1,59 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import month_by_name
class FranceInterIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P<id>[^?#]+)'
_TEST = {
'url': 'https://www.franceinter.fr/emissions/affaires-sensibles/affaires-sensibles-07-septembre-2016',
'md5': '9e54d7bdb6fdc02a841007f8a975c094',
'info_dict': {
'id': 'affaires-sensibles/affaires-sensibles-07-septembre-2016',
'ext': 'mp3',
'title': 'Affaire Cahuzac : le contentieux du compte en Suisse',
'description': 'md5:401969c5d318c061f86bda1fa359292b',
'thumbnail': r're:^https?://.*\.jpg',
'upload_date': '20160907',
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r'(?s)<div[^>]+class=["\']page-diffusion["\'][^>]*>.*?<button[^>]+data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
webpage, 'video url', group='url')
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
upload_date_str = self._search_regex(
r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<',
webpage, 'upload date', fatal=False)
if upload_date_str:
upload_date_list = upload_date_str.split()
upload_date_list.reverse()
upload_date_list[1] = '%02d' % (month_by_name(upload_date_list[1], lang='fr') or 0)
upload_date_list[2] = '%02d' % int(upload_date_list[2])
upload_date = ''.join(upload_date_list)
else:
upload_date = None
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
'formats': [{
'url': video_url,
'vcodec': 'none',
}],
}

View File

@ -4,56 +4,284 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
ExtractorError,
get_element_by_attribute,
int_or_none,
parse_iso8601,
strip_or_none,
url_or_none
)
class RadioFranceIE(InfoExtractor): class RadioFranceBaseIE(InfoExtractor):
_VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' _BASE_URL = r'https://www.radiofrance.fr/'
IE_NAME = 'radiofrance'
_TEST = { def extract_api_data(self, api_path, id, html):
'url': 'http://maison.radiofrance.fr/radiovisions/one-one', pattern = r'<script [^>]*sveltekit:data-url="https://www\.radiofrance\.fr/api/v[\d.]+/%s[^>]*>(?P<json>.*)</script>' % api_path
'md5': 'bdbb28ace95ed0e04faab32ba3160daf', json = self._search_regex(pattern, html, 'API data', flags=re.DOTALL, group='json')
if not json:
raise ExtractorError('%s: JSON data not found' % id)
try:
json = self._parse_json(json, id)
json = self._parse_json(json['body'], id)
if api_path == 'path':
return json['content']
elif api_path == 'stations':
return json
else:
raise ExtractorError('Coding error')
except KeyError:
raise ExtractorError('%s: Invalid JSON' % id)
def get_title(self, api_data, webpage=None):
title = strip_or_none(api_data.get('title'))
if not title and webpage:
title = strip_or_none(get_element_by_attribute('h1', None, webpage, False)) or strip_or_none(self._og_search_title(webpage))
return title
def get_description(self, api_data, webpage=None):
description = strip_or_none(api_data.get('standFirst'))
if not description and webpage:
description = strip_or_none(self._og_search_description(webpage))
return description
def get_thumbnail(self, api_data, webpage=None):
thumbnail = None
visual = api_data.get('visual')
if visual:
thumbnail = url_or_none(visual.get('src'))
if not thumbnail and webpage:
thumbnail = self._og_search_thumbnail(webpage)
return thumbnail
def get_timestamp(self, api_data, webpage=None):
timestamp = api_data.get('publishedDate')
if not timestamp and webpage:
timestamp = parse_iso8601(self._html_search_meta('article:published_time', webpage, 'publication time', ))
return timestamp
def get_brand(self, api_data, webpage=None):
brand = strip_or_none(api_data.get('brand'))
if not brand and webpage:
brand = self._og_search_property('site_name', webpage, 'Station name', fatal=False)
return brand
def extract_episode(self, episode_id, api_data):
manifestations = api_data.get('manifestations')
if manifestations is None or len(manifestations) == 0:
return None, None
url = url_or_none(manifestations[0]['url'])
duration = int_or_none(manifestations[0].get('duration'))
return url, duration
def get_playlist_entries(self, playlist_url, playlist_id, api_data, direction):
playlist_data = api_data['expressions']
entries = []
items = playlist_data.get('items')
for item in items:
episode_path = item.get('path')
if episode_path is None:
self.report_warning('No path found for episode "%s"', item.get('title'))
continue
episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + episode_path)
if episode_id is None:
self.report_warning('Could not parse id of episode from path: "%s"' % episode_path)
continue
episode_url, duration = self.extract_episode(episode_id, item)
if episode_url is None:
self.to_screen('Episode "%s" is not available' % episode_path)
continue
entry = {
'id': episode_id,
'url': episode_url,
'title': self.get_title(item),
'description': self.get_description(item),
'timestamp': self.get_timestamp(item),
'thumbnail': self.get_thumbnail(item),
'duration': duration,
}
entries.append(entry)
page_number = int_or_none(playlist_data.get('pageNumber'))
if page_number:
if direction in ['both', 'prev'] and playlist_data.get('prev') is not None:
webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number - 1)
entries = self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='prev') + entries
if direction in ['both', 'next'] and playlist_data.get('next') is not None:
webpage, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number + 1)
entries = entries + self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='next')
return entries
def get_data(self, url, api_path, id, page=None):
query = {}
note = None
if page:
query['p'] = page
note = "Downloading page %i" % page
webpage = self._download_webpage(url, id, query=query, note=note)
api_data = self.extract_api_data(api_path, id, webpage)
return webpage, api_data
class RadioFrancePodcastEpisodeIE(RadioFranceBaseIE):
_VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/.+/.+-(?P<id>\d+)$'
_TESTS = [{
'note': 'Podcast episode with audio from France Info',
'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-brief-eco/le-brief-eco-du-lundi-05-septembre-2022-8310713',
'info_dict': { 'info_dict': {
'id': 'one-one', 'id': '8310713',
'ext': 'ogg', 'ext': 'mp3',
'title': 'One to one', 'url': r're:^https?://.*\.mp3$',
'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", 'title': 'Pour la première fois en vingt ans, leuro passe sous les 0,99\u00a0dollar',
'uploader': 'Thomas Hercouët', 'description': str,
}, 'thumbnail': r're:^https?://.*\.jpg$',
} 'timestamp': int,
'duration': int,
'upload_date': str
}
}, {
'note': 'Podcast episode from France Musique',
'url': 'https://www.radiofrance.fr/francemusique/podcasts/allegretto/lever-du-jour-9233228',
'only_matching': True
}, {
'note': 'Podcast episode from FranceInter',
'url': 'https://www.radiofrance.fr/franceinter/podcasts/rendez-vous-avec-x/un-mysterieux-echange-digne-de-la-guerre-froide-9343281',
'only_matching': True
}, {
'note': 'Podcast episode from France Culture',
'url': 'https://www.radiofrance.fr/franceculture/podcasts/la-science-cqfd/teotihuacan-la-plus-mysterieuse-des-cites-d-or-9224610',
'only_matching': True
}, {
'note': 'Podcast episode from Le Mouv',
'url': 'https://www.radiofrance.fr/mouv/podcasts/mouv-dj-la-caution/ncr2a-ne-cherche-rien-d-autre-ailleurs-1197950',
'only_matching': True
}, {
'note': 'Podcast episode from FIP',
'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip/hommage-au-cinema-de-vangelis-4734742',
'only_matching': True
}]
def _real_extract(self, url): def _real_extract(self, url):
m = re.match(self._VALID_URL, url) id = self._match_id(url)
video_id = m.group('id') webpage, api_data = self.get_data(url, 'path', id)
url, duration = self.extract_episode(id, api_data)
webpage = self._download_webpage(url, video_id) if url is None:
title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.'
description = self._html_search_regex( raise ExtractorError(msg, expected=True, video_id=id)
r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
webpage, 'description', fatal=False)
uploader = self._html_search_regex(
r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
webpage, 'uploader', fatal=False)
formats_str = self._html_search_regex(
r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
webpage, 'audio URLs')
formats = [
{
'format_id': fm[0],
'url': fm[1],
'vcodec': 'none',
'preference': i,
}
for i, fm in
enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
]
self._sort_formats(formats)
return { return {
'id': video_id, 'id': id,
'title': title, 'url': url,
'formats': formats, 'title': self.get_title(api_data, webpage),
'description': description, 'description': self.get_description(api_data, webpage),
'uploader': uploader, 'timestamp': self.get_timestamp(api_data, webpage),
'thumbnail': self.get_thumbnail(api_data, webpage),
'channel_id': self.get_brand(api_data, webpage),
'duration': duration,
}
class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE):
_VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/(?P<id>[^/]+?)(?:[?#].*)?$'
_TESTS = [{
'note': 'Podcast show with multiple pages of episodes and some of them are missing',
'url': 'https://www.radiofrance.fr/franceculture/podcasts/une-semaine-dans-le-monde-10-11?p=2',
'info_dict': {
'id': 'une-semaine-dans-le-monde-10-11',
'title': 'Une semaine dans le monde | 10-11',
'description': str,
'timestamp': int
},
'playlist_count': 23,
}]
def _real_extract(self, url):
id = self._match_id(url)
webpage, api_data = self.get_data(url, 'path', id)
entries = self.get_playlist_entries(url, id, api_data, direction='both')
entries.reverse()
return {
'id': id,
'_type': 'playlist',
'entries': entries,
'title': self.get_title(api_data, webpage),
'description': self.get_description(api_data, webpage),
'timestamp': self.get_timestamp(api_data, webpage),
'thumbnail': self.get_thumbnail(api_data, webpage),
'channel_id': self.get_brand(api_data, webpage),
}
class RadioFranceWebradioIE(RadioFranceBaseIE):
_VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/(?P<id>radio-[^/]+)$'
_TESTS = [{
'note': 'Full list of webradios available at https://www.radiofrance.fr/ecouter-musique',
'url': 'https://www.radiofrance.fr/fip/radio-metal',
'info_dict': {
'id': 'radio-metal',
'ext': 'aac',
'title': str,
},
'params': {
'format': 'aac',
'skip_download': True,
}
}]
def get_livestream_formats(self, id, api_data):
sources = api_data['media']['sources']
formats = []
for source in sources:
url = source.get('url')
if not url:
continue
format_id = source.get('format')
format = {
'url': url,
'format_id': format_id,
'asr': 48000,
'vcodec': 'none'
}
if format_id == 'mp3':
format['preference'] = 1
format['acodec'] = 'mp3'
format['abr'] = source.get('bitrate')
elif format_id == 'aac':
format['preference'] = 2
format['acodec'] = 'aac'
format['abr'] = source.get('bitrate')
elif format_id == 'hls':
format['preference'] = 0
format['manifest_url'] = url
formats.append(format)
if len(formats) == 0:
raise ExtractorError('No live streaming URL found')
return formats
def _real_extract(self, url):
id = self._match_id(url)
webpage, api_data = self.get_data(url, 'stations', id)
return {
'id': id,
'title': self.get_title(api_data, webpage),
'formats': self.get_livestream_formats(id, api_data),
'thumbnail': self.get_thumbnail(api_data, webpage),
'channel_id': self.get_brand(api_data, webpage),
'is_live': True
} }

View File

@ -692,9 +692,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'invidious': '|'.join(_INVIDIOUS_SITES), 'invidious': '|'.join(_INVIDIOUS_SITES),
} }
_PLAYER_INFO_RE = ( _PLAYER_INFO_RE = (
r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})//(?:tv-)?player', r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/(?:tv-)?player',
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias(?:_tce)?\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]{6,})\b.*?\.js$',
) )
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
@ -1626,15 +1626,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
""" Return a string representation of a signature """ """ Return a string representation of a signature """
return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
@classmethod def _extract_player_info(self, player_url):
def _extract_player_info(cls, player_url): try:
for player_re in cls._PLAYER_INFO_RE: return self._search_regex(
id_m = re.search(player_re, player_url) self._PLAYER_INFO_RE, player_url, 'player info', group='id')
if id_m: except ExtractorError as e:
break raise ExtractorError(
else: 'Cannot identify player %r' % (player_url,), cause=e)
raise ExtractorError('Cannot identify player %r' % player_url)
return id_m.group('id')
def _load_player(self, video_id, player_url, fatal=True, player_id=None): def _load_player(self, video_id, player_url, fatal=True, player_id=None):
if not player_id: if not player_id:
@ -1711,6 +1709,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
' return %s\n') % (signature_id_tuple, expr_code) ' return %s\n') % (signature_id_tuple, expr_code)
self.to_screen('Extracted signature function:\n' + code) self.to_screen('Extracted signature function:\n' + code)
def _extract_sig_fn(self, jsi, funcname):
var_ay = self._search_regex(
r'''(?x)
(?:\*/|\{|\n|^)\s*(?:'[^']+'\s*;\s*)
(var\s*[\w$]+\s*=\s*(?:
('|")(?:\\\2|(?!\2).)+\2\s*\.\s*split\(\s*('|")\W+\3\s*\)|
\[\s*(?:('|")(?:\\\4|(?!\4).)*\4\s*(?:(?=\])|,\s*))+\]
))(?=\s*[,;])
''', jsi.code, 'useful values', default='')
sig_fn = jsi.extract_function_code(funcname)
if var_ay:
sig_fn = (sig_fn[0], ';\n'.join((var_ay, sig_fn[1])))
return sig_fn
def _parse_sig_js(self, jscode): def _parse_sig_js(self, jscode):
# Examples where `sig` is funcname: # Examples where `sig` is funcname:
# sig=function(a){a=a.split(""); ... ;return a.join("")}; # sig=function(a){a=a.split(""); ... ;return a.join("")};
@ -1736,8 +1751,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
jscode, 'Initial JS player signature function name', group='sig') jscode, 'Initial JS player signature function name', group='sig')
jsi = JSInterpreter(jscode) jsi = JSInterpreter(jscode)
initial_function = jsi.extract_function(funcname)
return lambda s: initial_function([s]) initial_function = self._extract_sig_fn(jsi, funcname)
func = jsi.extract_function_from_code(*initial_function)
return lambda s: func([s])
def _cached(self, func, *cache_id): def _cached(self, func, *cache_id):
def inner(*args, **kwargs): def inner(*args, **kwargs):
@ -1856,15 +1875,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None): def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None):
var_ay = self._search_regex(
r'(?:[;\s]|^)\s*(var\s*[\w$]+\s*=\s*"(?:\\"|[^"])+"\s*\.\s*split\("\W+"\))(?=\s*[,;])',
jsi.code, 'useful values', default='')
func_name = self._extract_n_function_name(jsi.code) func_name = self._extract_n_function_name(jsi.code)
func_code = jsi.extract_function_code(func_name) func_code = self._extract_sig_fn(jsi, func_name)
if var_ay:
func_code = (func_code[0], ';\n'.join((var_ay, func_code[1])))
if player_id: if player_id:
self.cache.store('youtube-nsig', player_id, func_code) self.cache.store('youtube-nsig', player_id, func_code)
@ -2136,7 +2149,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_details = merge_dicts(*traverse_obj( video_details = merge_dicts(*traverse_obj(
(player_response, api_player_response), (player_response, api_player_response),
(Ellipsis, 'videoDetails', T(dict)))) (Ellipsis, 'videoDetails', T(dict))))
player_response.update(api_player_response or {}) player_response.update(filter_dict(
api_player_response or {}, cndn=lambda k, _: k != 'captions'))
player_response['videoDetails'] = video_details player_response['videoDetails'] = video_details
def is_agegated(playability): def is_agegated(playability):
@ -2566,8 +2580,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
} }
pctr = traverse_obj( pctr = traverse_obj(
player_response, (player_response, api_player_response),
('captions', 'playerCaptionsTracklistRenderer', T(dict))) (Ellipsis, 'captions', 'playerCaptionsTracklistRenderer', T(dict)))
if pctr: if pctr:
def process_language(container, base_url, lang_code, query): def process_language(container, base_url, lang_code, query):
lang_subs = [] lang_subs = []
@ -2584,20 +2598,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def process_subtitles(): def process_subtitles():
subtitles = {} subtitles = {}
for caption_track in traverse_obj(pctr, ( for caption_track in traverse_obj(pctr, (
'captionTracks', lambda _, v: v.get('baseUrl'))): Ellipsis, 'captionTracks', lambda _, v: (
v.get('baseUrl') and v.get('languageCode')))):
base_url = self._yt_urljoin(caption_track['baseUrl']) base_url = self._yt_urljoin(caption_track['baseUrl'])
if not base_url: if not base_url:
continue continue
lang_code = caption_track['languageCode']
if caption_track.get('kind') != 'asr': if caption_track.get('kind') != 'asr':
lang_code = caption_track.get('languageCode')
if not lang_code:
continue
process_language( process_language(
subtitles, base_url, lang_code, {}) subtitles, base_url, lang_code, {})
continue continue
automatic_captions = {} automatic_captions = {}
process_language(
automatic_captions, base_url, lang_code, {})
for translation_language in traverse_obj(pctr, ( for translation_language in traverse_obj(pctr, (
'translationLanguages', lambda _, v: v.get('languageCode'))): Ellipsis, 'translationLanguages', lambda _, v: v.get('languageCode'))):
translation_language_code = translation_language['languageCode'] translation_language_code = translation_language['languageCode']
process_language( process_language(
automatic_captions, base_url, translation_language_code, automatic_captions, base_url, translation_language_code,

View File

@ -678,7 +678,7 @@ class JSInterpreter(object):
return len(obj) return len(obj)
try: try:
return obj[int(idx)] if isinstance(obj, list) else obj[compat_str(idx)] return obj[int(idx)] if isinstance(obj, list) else obj[compat_str(idx)]
except (TypeError, KeyError, IndexError) as e: except (TypeError, KeyError, IndexError, ValueError) as e:
# allow_undefined is None gives correct behaviour # allow_undefined is None gives correct behaviour
if allow_undefined or ( if allow_undefined or (
allow_undefined is None and not isinstance(e, TypeError)): allow_undefined is None and not isinstance(e, TypeError)):
@ -1038,6 +1038,10 @@ class JSInterpreter(object):
left_val = self._index(left_val, idx) left_val = self._index(left_val, idx)
if isinstance(idx, float): if isinstance(idx, float):
idx = int(idx) idx = int(idx)
if isinstance(left_val, list) and len(left_val) <= int_or_none(idx, default=-1):
# JS Array is a sparsely assignable list
# TODO: handle extreme sparsity without memory bloat, eg using auxiliary dict
left_val.extend((idx - len(left_val) + 1) * [JS_Undefined])
left_val[idx] = self._operator( left_val[idx] = self._operator(
m.group('op'), self._index(left_val, idx) if m.group('op') else None, m.group('op'), self._index(left_val, idx) if m.group('op') else None,
m.group('expr'), expr, local_vars, allow_recursion) m.group('expr'), expr, local_vars, allow_recursion)
@ -1204,9 +1208,10 @@ class JSInterpreter(object):
elif member == 'join': elif member == 'join':
assertion(isinstance(obj, list), 'must be applied on a list') assertion(isinstance(obj, list), 'must be applied on a list')
assertion(len(argvals) <= 1, 'takes at most one argument') assertion(len(argvals) <= 1, 'takes at most one argument')
return (',' if len(argvals) == 0 else argvals[0]).join( return (',' if len(argvals) == 0 or argvals[0] in (None, JS_Undefined)
('' if x in (None, JS_Undefined) else _js_toString(x)) else argvals[0]).join(
for x in obj) ('' if x in (None, JS_Undefined) else _js_toString(x))
for x in obj)
elif member == 'reverse': elif member == 'reverse':
assertion(not argvals, 'does not take any arguments') assertion(not argvals, 'does not take any arguments')
obj.reverse() obj.reverse()
@ -1364,19 +1369,21 @@ class JSInterpreter(object):
code, _ = self._separate_at_paren(func_m.group('code')) # refine the match code, _ = self._separate_at_paren(func_m.group('code')) # refine the match
return self.build_arglist(func_m.group('args')), code return self.build_arglist(func_m.group('args')), code
def extract_function(self, funcname): def extract_function(self, funcname, *global_stack):
return function_with_repr( return function_with_repr(
self.extract_function_from_code(*self.extract_function_code(funcname)), self.extract_function_from_code(*itertools.chain(
self.extract_function_code(funcname), global_stack)),
'F<%s>' % (funcname,)) 'F<%s>' % (funcname,))
def extract_function_from_code(self, argnames, code, *global_stack): def extract_function_from_code(self, argnames, code, *global_stack):
local_vars = {} local_vars = {}
start = None
while True: while True:
mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code) mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code[start:])
if mobj is None: if mobj is None:
break break
start, body_start = mobj.span() start, body_start = ((start or 0) + x for x in mobj.span())
body, remaining = self._separate_at_paren(code[body_start - 1:]) body, remaining = self._separate_at_paren(code[body_start - 1:])
name = self._named_object(local_vars, self.extract_function_from_code( name = self._named_object(local_vars, self.extract_function_from_code(
[x.strip() for x in mobj.group('args').split(',')], [x.strip() for x in mobj.group('args').split(',')],