Merge 72db217289 into c5098961b0

[Youtube] Rework n function extraction pattern
Now also succeeds with player b12cc44b
2024-12-23 04:30:10 +09:00 · 2024-08-21 22:32:49 -04:00 · 2024-08-06 20:59:09 +01:00 · 2024-08-06 20:51:38 +01:00 · 2024-08-01 19:18:34 +01:00 · 2022-12-22 14:22:19 -05:00
8 changed files with 362 additions and 192 deletions
--- a/test/test_jsinterp.py
+++ b/test/test_jsinterp.py
@ -425,6 +425,34 @@ class TestJSInterpreter(unittest.TestCase):
            self._test(jsi, [''], args=['', '-'])
            self._test(jsi, [], args=['', ''])
    def test_slice(self):
        # Each case pairs the argument text placed inside `.slice(...)` with
        # the value the interpreter is expected to return. The same argument
        # combinations are exercised first on an array, then on a string.
        array_cases = (
            ('', [0, 1, 2, 3, 4, 5, 6, 7, 8]),
            ('0', [0, 1, 2, 3, 4, 5, 6, 7, 8]),
            ('5', [5, 6, 7, 8]),
            ('99', []),
            ('-2', [7, 8]),
            ('-99', [0, 1, 2, 3, 4, 5, 6, 7, 8]),
            ('0, 0', []),
            ('1, 0', []),
            ('0, 1', [0]),
            ('3, 6', [3, 4, 5]),
            ('1, -1', [1, 2, 3, 4, 5, 6, 7]),
            ('-1, 1', []),
            ('-3, -1', [6, 7]),
        )
        for js_args, expected in array_cases:
            self._test(
                'function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(%s)}' % js_args,
                expected)

        string_cases = (
            ('', '012345678'),
            ('0', '012345678'),
            ('5', '5678'),
            ('99', ''),
            ('-2', '78'),
            ('-99', '012345678'),
            ('0, 0', ''),
            ('1, 0', ''),
            ('0, 1', '0'),
            ('3, 6', '345'),
            ('1, -1', '1234567'),
            ('-1, 1', ''),
            ('-3, -1', '67'),
        )
        for js_args, expected in string_cases:
            self._test(
                'function f(){return "012345678".slice(%s)}' % js_args,
                expected)
 # Run the test suite when this file is executed directly as a script.
 if __name__ == '__main__':
    unittest.main()
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@ -174,6 +174,14 @@ _NSIG_TESTS = [
        'https://www.youtube.com/s/player/5604538d/player_ias.vflset/en_US/base.js',
        '7X-he4jjvMx7BCX', 'sViSydX8IHtdWA',
    ),
    (
        'https://www.youtube.com/s/player/20dfca59/player_ias.vflset/en_US/base.js',
        '-fLCxedkAk4LUTK2', 'O8kfRq1y1eyHGw',
    ),
    (
        'https://www.youtube.com/s/player/b12cc44b/player_ias.vflset/en_US/base.js',
        'keLa5R2U00sR9SQK', 'N1OGyujjEwMnLw',
    ),
 ]
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -413,8 +413,6 @@ from .foxnews import (
    FoxNewsArticleIE,
 )
 from .foxsports import FoxSportsIE
 from .franceculture import FranceCultureIE
 from .franceinter import FranceInterIE
 from .francetv import (
    FranceTVIE,
    FranceTVSiteIE,
@ -1011,7 +1009,11 @@ from .radiocanada import (
 from .radiode import RadioDeIE
 from .radiojavan import RadioJavanIE
 from .radiobremen import RadioBremenIE
-from .radiofrance import RadioFranceIE
+from .radiofrance import (
    RadioFrancePodcastEpisodeIE,
    RadioFrancePodcastPlaylistIE,
    RadioFranceWebradioIE,
 )
 from .rai import (
    RaiPlayIE,
    RaiPlayLiveIE,
--- a/youtube_dl/extractor/franceculture.py
+++ b/youtube_dl/extractor/franceculture.py
@ -1,73 +0,0 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
    determine_ext,
    extract_attributes,
    int_or_none,
 )
 class FranceCultureIE(InfoExtractor):
    # Extractor for programme pages under franceculture.fr/emissions/.
    _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
        'info_dict': {
            'id': 'rendez-vous-au-pays-des-geeks',
            'display_id': 'rendez-vous-au-pays-des-geeks',
            'ext': 'mp3',
            'title': 'Rendez-vous au pays des geeks',
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20140301',
            'timestamp': 1393700400,
            'vcodec': 'none',
        }
    }, {
        # no thumbnail
        'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        # The last path segment of the URL is used as both id and display id.
        slug = self._match_id(url)
        page = self._download_webpage(url, slug)

        # Grab the first <button> carrying a data-url / data-asset-source
        # attribute that follows the heading or title zone, then read the
        # media details from that tag's attributes.
        button_html = self._search_regex(
            r'''(?sx)
                (?:
                    </h1>|
                    <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>
                ).*?
                (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>)
            ''',
            page, 'video data')
        attrs = extract_attributes(button_html)

        media_url = attrs.get('data-url') or attrs['data-asset-source']
        title = (
            attrs.get('data-asset-title')
            or attrs.get('data-diffusion-title')
            or self._og_search_title(page))

        # The remaining fields are optional: a missing match yields None.
        description = self._html_search_regex(
            r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>',
            page, 'description', default=None)
        thumbnail = self._search_regex(
            r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',
            page, 'thumbnail', default=None)
        uploader = self._html_search_regex(
            r'(?s)<span class="author">(.*?)</span>',
            page, 'uploader', default=None)

        ext = determine_ext(media_url.lower())
        # Only an mp3 extension is declared as having no video codec.
        vcodec = 'none' if ext == 'mp3' else None
        timestamp = (
            int_or_none(attrs.get('data-start-time'))
            or int_or_none(attrs.get('data-asset-created-date')))

        return {
            'id': slug,
            'display_id': slug,
            'url': media_url,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'ext': ext,
            'vcodec': vcodec,
            'uploader': uploader,
            'timestamp': timestamp,
            'duration': int_or_none(attrs.get('data-duration')),
        }
--- a/youtube_dl/extractor/franceinter.py
+++ b/youtube_dl/extractor/franceinter.py
@ -1,59 +0,0 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import month_by_name
 class FranceInterIE(InfoExtractor):
    # Extractor for programme pages under franceinter.fr/emissions/.
    _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P<id>[^?#]+)'
    _TEST = {
        'url': 'https://www.franceinter.fr/emissions/affaires-sensibles/affaires-sensibles-07-septembre-2016',
        'md5': '9e54d7bdb6fdc02a841007f8a975c094',
        'info_dict': {
            'id': 'affaires-sensibles/affaires-sensibles-07-septembre-2016',
            'ext': 'mp3',
            'title': 'Affaire Cahuzac : le contentieux du compte en Suisse',
            'description': 'md5:401969c5d318c061f86bda1fa359292b',
            'thumbnail': r're:^https?://.*\.jpg',
            'upload_date': '20160907',
        },
    }
    def _real_extract(self, url):
        episode_id = self._match_id(url)
        page = self._download_webpage(url, episode_id)

        # The audio URL is the data-url attribute of the first <button>
        # inside the "page-diffusion" block.
        audio_url = self._search_regex(
            r'(?s)<div[^>]+class=["\']page-diffusion["\'][^>]*>.*?<button[^>]+data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
            page, 'video url', group='url')
        title = self._og_search_title(page)
        description = self._og_search_description(page)
        thumbnail = self._html_search_meta(['og:image', 'twitter:image'], page)

        # The page shows the date as "<day> <French month name> <year>";
        # rebuild it as YYYYMMDD. An unrecognised month name becomes "00".
        date_text = self._search_regex(
            r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<',
            page, 'upload date', fatal=False)
        upload_date = None
        if date_text:
            # Reverse to [year, month name, day] before normalising.
            parts = date_text.split()[::-1]
            parts[1] = '%02d' % (month_by_name(parts[1], lang='fr') or 0)
            parts[2] = '%02d' % int(parts[2])
            upload_date = ''.join(parts)

        audio_format = {
            'url': audio_url,
            'vcodec': 'none',
        }
        return {
            'id': episode_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'formats': [audio_format],
        }
--- a/youtube_dl/extractor/radiofrance.py
+++ b/youtube_dl/extractor/radiofrance.py
@ -4,56 +4,284 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    get_element_by_attribute,
    int_or_none,
    parse_iso8601,
    strip_or_none,
    url_or_none
 )
-class RadioFranceIE(InfoExtractor):
+class RadioFranceBaseIE(InfoExtractor):
-    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
+    _BASE_URL = r'https://www.radiofrance.fr/'
    IE_NAME = 'radiofrance'
-    _TEST = {
+    def extract_api_data(self, api_path, id, html):
-        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
+        pattern = r'<script [^>]*sveltekit:data-url="https://www\.radiofrance\.fr/api/v[\d.]+/%s[^>]*>(?P<json>.*)</script>' % api_path
-        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
+        json = self._search_regex(pattern, html, 'API data', flags=re.DOTALL, group='json')
-        'info_dict': {
+
-            'id': 'one-one',
+        if not json:
-            'ext': 'ogg',
+            raise ExtractorError('%s: JSON data not found' % id)
-            'title': 'One to one',
+
-            'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
+        try:
-            'uploader': 'Thomas Hercouët',
+            json = self._parse_json(json, id)
-        },
+            json = self._parse_json(json['body'], id)
            if api_path == 'path':
                return json['content']
            elif api_path == 'stations':
                return json
            else:
                raise ExtractorError('Coding error')
        except KeyError:
            raise ExtractorError('%s: Invalid JSON' % id)
    def get_title(self, api_data, webpage=None):
        """Return the stripped API ``title``, or a page-derived fallback.

        The web page is consulted only when the API value is empty and a
        ``webpage`` was supplied.
        """
        api_title = strip_or_none(api_data.get('title'))
        if api_title or not webpage:
            return api_title
        # NOTE(review): 'h1' is passed as the attribute-name argument here;
        # confirm this selects the intended element.
        element_text = strip_or_none(get_element_by_attribute('h1', None, webpage, False))
        return element_text or strip_or_none(self._og_search_title(webpage))
    def get_description(self, api_data, webpage=None):
        """Return the stripped API ``standFirst``, or the Open Graph description.

        The web page is consulted only when the API value is empty and a
        ``webpage`` was supplied.
        """
        stand_first = strip_or_none(api_data.get('standFirst'))
        if stand_first or not webpage:
            return stand_first
        return strip_or_none(self._og_search_description(webpage))
    def get_thumbnail(self, api_data, webpage=None):
        """Return the thumbnail URL from ``api_data['visual']['src']``.

        Falls back to the page's Open Graph thumbnail when the API gives no
        usable URL and a ``webpage`` was supplied; otherwise returns None.
        """
        visual = api_data.get('visual')
        src = url_or_none(visual.get('src')) if visual else None
        if src or not webpage:
            return src
        return self._og_search_thumbnail(webpage)
    def get_timestamp(self, api_data, webpage=None):
        """Return the API ``publishedDate`` value unchanged when it is set.

        Otherwise, if a ``webpage`` was supplied, parse the page's
        ``article:published_time`` meta value as ISO 8601.
        """
        published = api_data.get('publishedDate')
        if published or not webpage:
            return published
        meta_time = self._html_search_meta('article:published_time', webpage, 'publication time')
        return parse_iso8601(meta_time)
    def get_brand(self, api_data, webpage=None):
        """Return the stripped API ``brand``, or the page's ``og:site_name``.

        The web page is consulted only when the API value is empty and a
        ``webpage`` was supplied; that lookup is non-fatal.
        """
        api_brand = strip_or_none(api_data.get('brand'))
        if api_brand or not webpage:
            return api_brand
        return self._og_search_property('site_name', webpage, 'Station name', fatal=False)
    def extract_episode(self, episode_id, api_data):
        """Return ``(url, duration)`` taken from the first manifestation.

        ``(None, None)`` is returned when ``api_data`` has no manifestations.
        The URL is also None when the first manifestation has no ``url`` key
        or its value is rejected by ``url_or_none``; callers treat a None URL
        as "episode not available".
        """
        manifestations = api_data.get('manifestations')
        if not manifestations:
            return None, None

        first = manifestations[0]
        # Use .get() for 'url', as for 'duration': a manifestation without a
        # URL must not abort the extraction with a KeyError.
        url = url_or_none(first.get('url'))
        duration = int_or_none(first.get('duration'))
        return url, duration
    def get_playlist_entries(self, playlist_url, playlist_id, api_data, direction):
        """Collect episode entries for one playlist page and its neighbours.

        Entries are built from ``api_data['expressions']['items']``. Items
        with no path, or with no downloadable URL, are reported and skipped.

        When the page data carries a ``pageNumber``, adjacent pages are
        fetched recursively through ``get_data``: the previous page when
        ``direction`` is 'both' or 'prev' and ``prev`` is set, the next page
        when ``direction`` is 'both' or 'next' and ``next`` is set. Earlier
        pages are prepended and later pages appended to this page's entries.

        Raises KeyError if ``api_data`` has no ``expressions`` key.
        """
        playlist_data = api_data['expressions']

        entries = []
        # A page without an 'items' key simply contributes no entries.
        for item in playlist_data.get('items') or []:
            episode_path = item.get('path')
            if episode_path is None:
                # Interpolate the title into the message; passing it as a
                # second positional argument would leave "%s" unformatted.
                self.report_warning('No path found for episode "%s"' % item.get('title'))
                continue
            episode_id = RadioFrancePodcastEpisodeIE._match_id(self._BASE_URL + episode_path)
            if episode_id is None:
                self.report_warning('Could not parse id of episode from path: "%s"' % episode_path)
                continue
            episode_url, duration = self.extract_episode(episode_id, item)
            if episode_url is None:
                self.to_screen('Episode "%s" is not available' % episode_path)
                continue
            entries.append({
                'id': episode_id,
                'url': episode_url,
                'title': self.get_title(item),
                'description': self.get_description(item),
                'timestamp': self.get_timestamp(item),
                'thumbnail': self.get_thumbnail(item),
                'duration': duration,
            })

        page_number = int_or_none(playlist_data.get('pageNumber'))
        if page_number:
            # Each recursive call keeps walking in one direction only, so a
            # page is never fetched twice.
            if direction in ('both', 'prev') and playlist_data.get('prev') is not None:
                _, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number - 1)
                entries = self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='prev') + entries
            if direction in ('both', 'next') and playlist_data.get('next') is not None:
                _, other_api_data = self.get_data(playlist_url, 'path', playlist_id, page=page_number + 1)
                entries = entries + self.get_playlist_entries(playlist_url, playlist_id, other_api_data, direction='next')
        return entries
    def get_data(self, url, api_path, id, page=None):
        """Download ``url`` and return ``(webpage, api_data)``.

        When ``page`` is truthy it is sent as the ``p`` query parameter and
        named in the progress note. ``api_data`` is whatever
        ``extract_api_data`` returns for the downloaded page.
        """
        if page:
            query = {'p': page}
            note = "Downloading page %i" % page
        else:
            query = {}
            note = None
        webpage = self._download_webpage(url, id, query=query, note=note)
        return webpage, self.extract_api_data(api_path, id, webpage)
 class RadioFrancePodcastEpisodeIE(RadioFranceBaseIE):
    _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/.+/.+-(?P<id>\d+)$'
    _TESTS = [{
        'note': 'Podcast episode with audio from France Info',
        'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-brief-eco/le-brief-eco-du-lundi-05-septembre-2022-8310713',
        'info_dict': {
            'id': '8310713',
            'ext': 'mp3',
            'url': r're:^https?://.*\.mp3$',
            'title': 'Pour la première fois en vingt ans, l’euro passe sous les 0,99\u00a0dollar',
            'description': str,
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': int,
            'duration': int,
            'upload_date': str
        }
    }, {
        'note': 'Podcast episode from France Musique',
        'url': 'https://www.radiofrance.fr/francemusique/podcasts/allegretto/lever-du-jour-9233228',
        'only_matching': True
    }, {
        'note': 'Podcast episode from FranceInter',
        'url': 'https://www.radiofrance.fr/franceinter/podcasts/rendez-vous-avec-x/un-mysterieux-echange-digne-de-la-guerre-froide-9343281',
        'only_matching': True
    }, {
        'note': 'Podcast episode from France Culture',
        'url': 'https://www.radiofrance.fr/franceculture/podcasts/la-science-cqfd/teotihuacan-la-plus-mysterieuse-des-cites-d-or-9224610',
        'only_matching': True
    }, {
        'note': 'Podcast episode from Le Mouv',
        'url': 'https://www.radiofrance.fr/mouv/podcasts/mouv-dj-la-caution/ncr2a-ne-cherche-rien-d-autre-ailleurs-1197950',
        'only_matching': True
    }, {
        'note': 'Podcast episode from FIP',
        'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip/hommage-au-cinema-de-vangelis-4734742',
        'only_matching': True
    }]
    def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url)
+        id = self._match_id(url)
-        video_id = m.group('id')
+        webpage, api_data = self.get_data(url, 'path', id)
-
+        url, duration = self.extract_episode(id, api_data)
-        webpage = self._download_webpage(url, video_id)
+        if url is None:
-        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
+            msg = 'Podcast file is not available. If the show is too recent, the file may not have been uploaded yet: try again later.'
-        description = self._html_search_regex(
+            raise ExtractorError(msg, expected=True, video_id=id)
            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
            webpage, 'description', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
            webpage, 'uploader', fatal=False)
        formats_str = self._html_search_regex(
            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
            webpage, 'audio URLs')
        formats = [
            {
                'format_id': fm[0],
                'url': fm[1],
                'vcodec': 'none',
                'preference': i,
            }
            for i, fm in
            enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
        ]
        self._sort_formats(formats)
        return {
-            'id': video_id,
+            'id': id,
-            'title': title,
+            'url': url,
-            'formats': formats,
+            'title': self.get_title(api_data, webpage),
-            'description': description,
+            'description': self.get_description(api_data, webpage),
-            'uploader': uploader,
+            'timestamp': self.get_timestamp(api_data, webpage),
            'thumbnail': self.get_thumbnail(api_data, webpage),
            'channel_id': self.get_brand(api_data, webpage),
            'duration': duration,
        }
 class RadioFrancePodcastPlaylistIE(RadioFranceBaseIE):
    # Extractor for a podcast show page, returned as a playlist of episodes.
    _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/podcasts/(?P<id>[^/]+?)(?:[?#].*)?$'
    _TESTS = [{
        'note': 'Podcast show with multiple pages of episodes and some of them are missing',
        'url': 'https://www.radiofrance.fr/franceculture/podcasts/une-semaine-dans-le-monde-10-11?p=2',
        'info_dict': {
            'id': 'une-semaine-dans-le-monde-10-11',
            'title': 'Une semaine dans le monde | 10-11',
            'description': str,
            'timestamp': int
        },
        'playlist_count': 23,
    }]
    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage, api_data = self.get_data(url, 'path', playlist_id)

        # Collect the episodes of this page and of every page before and
        # after it, then present them in the reverse of the collected order.
        collected = self.get_playlist_entries(url, playlist_id, api_data, direction='both')
        entries = list(reversed(collected))

        return {
            'id': playlist_id,
            '_type': 'playlist',
            'entries': entries,
            'title': self.get_title(api_data, webpage),
            'description': self.get_description(api_data, webpage),
            'timestamp': self.get_timestamp(api_data, webpage),
            'thumbnail': self.get_thumbnail(api_data, webpage),
            'channel_id': self.get_brand(api_data, webpage),
        }
 class RadioFranceWebradioIE(RadioFranceBaseIE):
    # Extractor for the live "radio-*" webradio pages of the stations.
    _VALID_URL = r'https?://www\.radiofrance\.fr/(?:francemusique|franceinter|franceculture|franceinfo|mouv|fip)/(?P<id>radio-[^/]+)$'
    _TESTS = [{
        'note': 'Full list of webradios available at https://www.radiofrance.fr/ecouter-musique',
        'url': 'https://www.radiofrance.fr/fip/radio-metal',
        'info_dict': {
            'id': 'radio-metal',
            'ext': 'aac',
            'title': str,
        },
        'params': {
            'format': 'aac',
            'skip_download': True,
        }
    }]
    def get_livestream_formats(self, id, api_data):
        """Build the format list from ``api_data['media']['sources']``.

        Sources without a URL are ignored. mp3 and aac sources get a codec,
        a bitrate and a preference of 1 and 2 respectively; hls sources get
        preference 0 and a manifest URL; any other source is kept with the
        common fields only.

        Raises ExtractorError when no source yields a format.
        """
        formats = []
        for source in api_data['media']['sources']:
            stream_url = source.get('url')
            if not stream_url:
                continue
            kind = source.get('format')
            fmt = {
                'url': stream_url,
                'format_id': kind,
                'asr': 48000,
                'vcodec': 'none'
            }
            if kind in ('mp3', 'aac'):
                fmt.update({
                    'preference': 1 if kind == 'mp3' else 2,
                    'acodec': kind,
                    'abr': source.get('bitrate'),
                })
            elif kind == 'hls':
                fmt.update({
                    'preference': 0,
                    'manifest_url': stream_url,
                })
            formats.append(fmt)

        if not formats:
            raise ExtractorError('No live streaming URL found')
        return formats
    def _real_extract(self, url):
        station_id = self._match_id(url)
        # Webradio pages expose their data under the 'stations' API path.
        webpage, api_data = self.get_data(url, 'stations', station_id)

        return {
            'id': station_id,
            'title': self.get_title(api_data, webpage),
            'formats': self.get_livestream_formats(station_id, api_data),
            'thumbnail': self.get_thumbnail(api_data, webpage),
            'channel_id': self.get_brand(api_data, webpage),
            'is_live': True
        }
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -1659,17 +1659,46 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
    def _extract_n_function_name(self, jscode):
        func_name, idx = self._search_regex(
            # new: (b=String.fromCharCode(110),c=a.get(b))&&c=nfunc[idx](c)
-            # or:  (b="nn"[+a.D],c=a.get(b))&&(c=nfunc[idx](c)s
+            # or:  (b="nn"[+a.D],c=a.get(b))&&(c=nfunc[idx](c)
-            # old: .get("n"))&&(b=nfunc[idx](b)
+            # or:  (PL(a),b=a.j.n||null)&&(b=nfunc[idx](b)
-            # older: .get("n"))&&(b=nfunc(b)
+            # or:  (b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("")
            # old: (b=a.get("n"))&&(b=nfunc[idx](b)
            # older: (b=a.get("n"))&&(b=nfunc(b)
            r'''(?x)
-                (?:\(\s*(?P<b>[a-z])\s*=\s*(?:
+                \((?:[\w$()\s]+,)*?\s*      # (
                (?P<b>[a-z])\s*=\s*         # b=
                (?:
                    (?:                     # expect ,c=a.get(b) (etc)
                        String\s*\.\s*fromCharCode\s*\(\s*110\s*\)|
                        "n+"\[\s*\+?s*[\w$.]+\s*]
-                )\s*,(?P<c>[a-z])\s*=\s*[a-z]\s*)?
+                    )\s*(?:,[\w$()\s]+(?=,))*|
-                \.\s*get\s*\(\s*(?(b)(?P=b)|"n{1,2}")(?:\s*\)){2}\s*&&\s*\(\s*(?(c)(?P=c)|b)\s*=\s*
+                       (?P<old>[\w$]+)      # a (old[er])
                   )\s*
                   (?(old)
                                            # b.get("n")
                       (?:\.\s*[\w$]+\s*|\[\s*[\w$]+\s*]\s*)*?
                       (?:\.\s*n|\[\s*"n"\s*]|\.\s*get\s*\(\s*"n"\s*\))
                       |                    # ,c=a.get(b)
                       ,\s*(?P<c>[a-z])\s*=\s*[a-z]\s*
                       (?:\.\s*[\w$]+\s*|\[\s*[\w$]+\s*]\s*)*?
                       (?:\[\s*(?P=b)\s*]|\.\s*get\s*\(\s*(?P=b)\s*\))
                   )
                                            # interstitial junk
                   \s*(?:\|\|\s*null\s*)?(?:\)\s*)?&&\s*(?:\(\s*)?
               (?(c)(?P=c)|(?P=b))\s*=\s*   # [c|b]=
                                            # nfunc|nfunc[idx]
                   (?P<nfunc>[a-zA-Z_$][\w$]*)(?:\s*\[(?P<idx>\d+)\])?\s*\(\s*[\w$]+\s*\)
-            ''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
+            ''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'),
            default=(None, None))
        # thx bashonly: yt-dlp/yt-dlp/pull/10611
        if not func_name:
            self.report_warning('Falling back to generic n function search')
            return self._search_regex(
                r'''(?xs)
                    (?:(?<=[^\w$])|^)       # instead of \b, which ignores $
                    (?P<name>(?!\d)[a-zA-Z\d_$]+)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\)
                    \s*\{(?:(?!};).)+?["']enhanced_except_
                ''', jscode, 'Initial JS player n function name', group='name')
        if not idx:
            return func_name
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@ -925,9 +925,16 @@ class JSInterpreter(object):
                    obj.reverse()
                    return obj
                elif member == 'slice':
-                    assertion(isinstance(obj, list), 'must be applied on a list')
+                    assertion(isinstance(obj, (list, compat_str)), 'must be applied on a list or string')
-                    assertion(len(argvals) == 1, 'takes exactly one argument')
+                    # From [1]:
-                    return obj[argvals[0]:]
+                    # .slice() - like [:]
                    # .slice(n) - like [n:] (not [slice(n)])
                    # .slice(m, n) - like [m:n] or [slice(m, n)]
                    # [1] https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/slice
                    assertion(len(argvals) <= 2, 'takes between 0 and 2 arguments')
                    if len(argvals) < 2:
                        argvals += (None,)
                    return obj[slice(*argvals)]
                elif member == 'splice':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(argvals, 'takes one or more arguments')
Author	SHA1	Message	Date
Olivier Trichet	a941f835c9	Merge `72db217289` into `c5098961b0`	2024-08-21 22:32:49 -04:00
dirkf	c5098961b0	[Youtube] Rework n function extraction pattern Now also succeeds with player b12cc44b	2024-08-06 20:59:09 +01:00
dirkf	dbc08fba83	[jsinterp] Improve slice implementation for player b12cc44b Partly taken from yt-dlp/yt-dlp#10664, thx seproDev Fixes #32896	2024-08-06 20:51:38 +01:00
Aiur Adept	71223bff39	[Youtube] Fix nsig extraction for player 20dfca59 (#32891 ) * dirkf's patch for nsig extraction * add generic search per yt-dlp/yt-dlp/pull/10611 - thx bashonly --------- Co-authored-by: dirkf <fieldhouse@gmx.net>	2024-08-01 19:18:34 +01:00
Olivier Trichet	72db217289	[RadioFrance] Extractor for thematic webradios	2022-12-22 14:22:19 -05:00
Olivier Trichet	fc933e686b	[RadioFrance] Refactoring	2022-12-22 13:01:10 -05:00
Olivier Trichet	ea02c40539	[RadioFrance] Extractor for podcast playlists	2022-12-22 13:00:54 -05:00
Olivier Trichet	7270ecf3d6	[RadioFrance] Extractor for podcast of Radio France stations	2022-12-22 13:00:17 -05:00
Olivier Trichet	dade9111f1	[RadioFrance] Remove old Radio France stations extractors These are not working anymore after their respective websites were merged into www.radiofrance.fr.	2022-12-22 13:00:08 -05:00