2025-08-07 19:24:15 +09:00
2 changed files with 75 additions and 125 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -847,16 +847,7 @@ from .nowness import (
    NownessSeriesIE,
 )
 from .noz import NozIE
-from .npo import (
+from .npo import BNNVaraIE, NPOIE, ONIE, VPROIE
    AndereTijdenIE,
    BNNVaraIE,
    NPOIE,
    ONIE,
    SchoolTVIE,
    HetKlokhuisIE,
    VPROIE,
    WNLIE,
 )
 from .npr import NprIE
 from .nrk import (
    NRKIE,
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@ -11,11 +11,32 @@ from ..utils import ExtractorError
 class NPOIE(InfoExtractor):
    IE_NAME = 'npo'
    IE_DESC = 'npo.nl'
-    _VALID_URL = r'https?://(?:www\.)?npo\.nl/.*'
+    _VALID_URL = r'''(?x)
                    (?:
                        https?://
                            (?:www\.)?
                            (?:
                                npo\.nl/(?:[^/]+/)*
                            )
                        )
                        (?P<id>[^/?#]+)
                '''
    _TESTS = [{
        'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/',
        # TODO fill in other test attributes
    }, {
        'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800',
        'md5': 'da50a5787dbfc1603c4ad80f31c5120b',
        'info_dict': {
            'id': 'VARA_101191800',
            'ext': 'm4v',
            'title': 'De Mega Mike & Mega Thomas show: The best of.',
            'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
            'upload_date': '20090227',
            'duration': 2400,
        },
        'skip': 'Video gone',
    }, {
        'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika',
        'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
@ -45,21 +66,27 @@ class NPOIE(InfoExtractor):
            url = url[:-10]
        url = url.rstrip('/')
        slug = url.split('/')[-1]
-
+        page = self._download_webpage(url, slug, 'Finding productId using slug: %s' % slug)
-        program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail',
+        # TODO find out what proper HTML parsing utilities are available in youtube-dl
-                                               slug,
+        next_data = page.split('<script id="__NEXT_DATA__" type="application/json">')[1].split('</script>')[0]
-                                               query={'slug': slug})
+        # TODO The data in this script tag feels like GraphQL, so there might be an easier way
-        product_id = program_metadata.get('productId')
+        #      to get the product id, maybe using a GraphQL endpoint
-        images = program_metadata.get('images')
+        next_data = self._parse_json(next_data, slug)
-        thumbnail = None
+        product_id, title, description, thumbnail = None, None, None, None
-        for image in images:
+        for query in next_data['props']['pageProps']['dehydratedState']['queries']:
-            thumbnail = image.get('url')
+            if isinstance(query['state']['data'], list):
-            break
+                for entry in query['state']['data']:
-        title = program_metadata.get('title')
+                    if entry['slug'] == slug:
-        descriptions = program_metadata.get('description', {})
+                        product_id = entry.get('productId')
-        description = descriptions.get('long') or descriptions.get('short') or descriptions.get('brief')
+                        title = entry.get('title')
-        duration = program_metadata.get('durationInSeconds')
+                        synopsis = entry.get('synopsis', {})
-
+                        description = (synopsis.get('long')
                                       or synopsis.get('short')
                                       or synopsis.get('brief'))
                        thumbnails = entry.get('images')
                        for thumbnail_entry in thumbnails:
                            if 'url' in thumbnail_entry:
                                thumbnail = thumbnail_entry.get('url')
        if not product_id:
            raise ExtractorError('No productId found for slug: %s' % slug)
@ -69,9 +96,9 @@ class NPOIE(InfoExtractor):
            'id': slug,
            'formats': formats,
            'title': title or slug,
-            'description': description or title or slug,
+            'description': description,
            'thumbnail': thumbnail,
-            'duration': duration,
+            # TODO fill in other metadata that's available
        }
    def _download_by_product_id(self, product_id, slug, url=None):
@ -79,8 +106,7 @@ class NPOIE(InfoExtractor):
        formats = []
        for profile in (
                'dash',
-                # 'hls' is available too, but implementing it doesn't add much
+                # 'hls',  # TODO test what needs to change for 'hls' support
                # As far as I know 'dash' is always available
        ):
            stream_link = self._download_json(
                'https://prod.npoplayer.nl/stream-link', video_id=slug,
@ -105,7 +131,6 @@ class BNNVaraIE(NPOIE):
    _VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*'
    _TESTS = [{
        'url': 'https://www.bnnvara.nl/videos/27455',
        # TODO fill in other test attributes
    }]
    def _real_extract(self, url):
@ -145,7 +170,6 @@ class ONIE(NPOIE):
    _VALID_URL = r'https?://(?:www\.)?ongehoordnederland.tv/.*'
    _TESTS = [{
        'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/',
        # TODO fill in other test attributes
    }]
    def _real_extract(self, url):
@ -157,8 +181,34 @@ class ONIE(NPOIE):
            formats.extend(self._download_by_product_id(result, video_id))
        if not formats:
-            raise ExtractorError('Could not find a POMS product id in the provided URL, '
+            raise ExtractorError('Could not find a POMS product id in the provided URL.')
-                                 'perhaps because all stream URLs are DRM protected.')
+
        return {
            'id': video_id,
            'title': video_id,
            'formats': formats,
        }
 class VPROIE(NPOIE):
    IE_NAME = 'vpro'
    IE_DESC = 'vpro.nl'
    _VALID_URL = r'https?://(?:www\.)?vpro.nl/.*'
    _TESTS = [{
        'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html',
    }]
    def _real_extract(self, url):
        video_id = url.rstrip('/').split('/')[-1]
        page, _ = self._download_webpage_handle(url, video_id)
        results = re.findall('data-media-id="(.+_.+)"\s', page)
        formats = []
        for result in results:
            formats.extend(self._download_by_product_id(result, video_id))
            break  # TODO find a better solution, VPRO pages can have multiple videos embedded
        if not formats:
            raise ExtractorError('Could not find a POMS product id in the provided URL.')
        return {
            'id': video_id,
@ -174,7 +224,6 @@ class ZAPPIE(NPOIE):
    _TESTS = [{
        'url': 'https://www.zapp.nl/programmas/zappsport/gemist/AT_300003973',
        # TODO fill in other test attributes
    }]
    def _real_extract(self, url):
@ -187,93 +236,3 @@ class ZAPPIE(NPOIE):
            'title': video_id,
            'formats': formats,
        }
 class SchoolTVIE(NPOIE):
    """Extractor for schooltv.nl item pages.
    Reads the item's metadata from the site's `_next/data` JSON endpoint and
    passes the `poms_mid` found there to `_download_by_product_id`.
    """
    IE_NAME = 'schooltv'
    IE_DESC = 'schooltv.nl'
    # The dot of the host name is escaped so it only matches a literal '.'.
    _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/item/.*'
    _TESTS = [{
        'url': 'https://schooltv.nl/item/zapp-music-challenge-2015-zapp-music-challenge-2015',
        # TODO fill in other test attributes
    }]
    def _real_extract(self, url):
        """Build the info dict for a schooltv.nl item URL.
        The last path segment of `url` is used as the video id.
        Raises ExtractorError when the metadata carries no `poms_mid` or when
        no formats are returned for it.
        """
        video_id = url.rstrip('/').split('/')[-1]
        # TODO Find out how we could obtain this automatically
        #      Otherwise this extractor might break each time SchoolTV deploys a new release
        build_id = 'b7eHUzAVO7wHXCopYxQhV'
        metadata_url = 'https://schooltv.nl/_next/data/%s/item/%s.json' % (build_id, video_id)
        response = self._download_json(metadata_url, video_id) or {}
        # `or {}` also covers keys that are present but null, which the
        # default argument of dict.get() does not.
        metadata = (response.get('pageProps') or {}).get('data') or {}
        product_id = metadata.get('poms_mid')
        # Do not ask for streams when there is no product id to ask for.
        formats = self._download_by_product_id(product_id, video_id) if product_id else None
        if not formats:
            raise ExtractorError('Could not find a POMS product id in the provided URL, '
                                 'perhaps because all stream URLs are DRM protected.')
        # Join only the parts that exist: no dangling ' - ' and no TypeError
        # when title or subtitle is missing or null.
        title_parts = [part for part in (metadata.get('title'), metadata.get('subtitle')) if part]
        return {
            'id': video_id,
            'title': ' - '.join(title_parts) or video_id,
            'description': metadata.get('description') or metadata.get('short_description'),
            'formats': formats,
        }
 class HetKlokhuisIE(NPOIE):
    ...
    def _real_extract(self, url):
        ...
 class VPROIE(NPOIE):
    """Extractor for vpro.nl pages.
    Finds a POMS media id in a `data-media-id` attribute of the page and
    passes it to `_download_by_product_id`.
    """
    IE_NAME = 'vpro'
    IE_DESC = 'vpro.nl'
    # The dot of the host name is escaped so it only matches a literal '.'.
    _VALID_URL = r'https?://(?:www\.)?vpro\.nl/.*'
    _TESTS = [{
        'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html',
        # TODO fill in other test attributes
    }]
    def _real_extract(self, url):
        """Build the info dict for a vpro.nl URL.
        The last path segment of `url` is used as both id and title.
        Raises ExtractorError when the page has no usable `data-media-id`
        or when no formats are returned for it.
        """
        video_id = url.rstrip('/').split('/')[-1]
        page = self._download_webpage(url, video_id)
        # [^"\s] keeps the capture inside one attribute value; a greedy `.+`
        # could run on to a later quote on the same line.
        media_id = re.search(r'data-media-id="([^"\s]+_[^"\s]+)"', page)
        # TODO find a better solution, VPRO pages can have multiple videos embedded;
        #      only the first media id on the page is used.
        formats = self._download_by_product_id(media_id.group(1), video_id) if media_id else None
        if not formats:
            raise ExtractorError('Could not find a POMS product id in the provided URL, '
                                 'perhaps because all stream URLs are DRM protected.')
        return {
            'id': video_id,
            'title': video_id,
            'formats': formats,
        }
 class WNLIE(NPOIE):
    ...
    def _real_extract(self, url):
        ...
 class AndereTijdenIE(NPOIE):
    ...
    def _real_extract(self, url):
        ...