Compare commits

...

5 Commits

Author SHA1 Message Date
Bart Broere
0cbcd1aec6 Make diff better 2024-03-06 12:55:51 +01:00
Bart Broere
159f825edd Add scaffolding for last few extractors and change order so the PR diff looks nice 2024-03-06 12:53:37 +01:00
Bart Broere
681b39032a Fix flake8 and better error reporting 2024-03-06 12:32:34 +01:00
Bart Broere
4b24e5f00d Re-add SchoolTV 2024-03-06 12:22:27 +01:00
Bart Broere
3b3d73cbe6 Use program-detail endpoint and remove a test 2024-03-06 11:52:08 +01:00
2 changed files with 125 additions and 75 deletions

View File

@ -847,7 +847,16 @@ from .nowness import (
NownessSeriesIE, NownessSeriesIE,
) )
from .noz import NozIE from .noz import NozIE
from .npo import BNNVaraIE, NPOIE, ONIE, VPROIE from .npo import (
AndereTijdenIE,
BNNVaraIE,
NPOIE,
ONIE,
SchoolTVIE,
HetKlokhuisIE,
VPROIE,
WNLIE,
)
from .npr import NprIE from .npr import NprIE
from .nrk import ( from .nrk import (
NRKIE, NRKIE,

View File

@ -11,32 +11,11 @@ from ..utils import ExtractorError
class NPOIE(InfoExtractor): class NPOIE(InfoExtractor):
IE_NAME = 'npo' IE_NAME = 'npo'
IE_DESC = 'npo.nl' IE_DESC = 'npo.nl'
_VALID_URL = r'''(?x) _VALID_URL = r'https?://(?:www\.)?npo\.nl/.*'
(?:
https?://
(?:www\.)?
(?:
npo\.nl/(?:[^/]+/)*
)
)
(?P<id>[^/?#]+)
'''
_TESTS = [{ _TESTS = [{
'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/', 'url': 'https://npo.nl/start/serie/zembla/seizoen-2015/wie-is-de-mol-2/',
# TODO fill in other test attributes # TODO fill in other test attributes
}, {
'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800',
'md5': 'da50a5787dbfc1603c4ad80f31c5120b',
'info_dict': {
'id': 'VARA_101191800',
'ext': 'm4v',
'title': 'De Mega Mike & Mega Thomas show: The best of.',
'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
'upload_date': '20090227',
'duration': 2400,
},
'skip': 'Video gone',
}, { }, {
'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika', 'url': 'https://npo.nl/start/serie/vpro-tegenlicht/seizoen-11/zwart-geld-de-toekomst-komt-uit-afrika',
'md5': 'f8065e4e5a7824068ed3c7e783178f2c', 'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
@ -66,27 +45,21 @@ class NPOIE(InfoExtractor):
url = url[:-10] url = url[:-10]
url = url.rstrip('/') url = url.rstrip('/')
slug = url.split('/')[-1] slug = url.split('/')[-1]
page = self._download_webpage(url, slug, 'Finding productId using slug: %s' % slug)
# TODO find out what proper HTML parsing utilities are available in youtube-dl program_metadata = self._download_json('https://npo.nl/start/api/domain/program-detail',
next_data = page.split('<script id="__NEXT_DATA__" type="application/json">')[1].split('</script>')[0] slug,
# TODO The data in this script tag feels like GraphQL, so there might be an easier way query={'slug': slug})
# to get the product id, maybe using a GraphQL endpoint product_id = program_metadata.get('productId')
next_data = self._parse_json(next_data, slug) images = program_metadata.get('images')
product_id, title, description, thumbnail = None, None, None, None thumbnail = None
for query in next_data['props']['pageProps']['dehydratedState']['queries']: for image in images:
if isinstance(query['state']['data'], list): thumbnail = image.get('url')
for entry in query['state']['data']: break
if entry['slug'] == slug: title = program_metadata.get('title')
product_id = entry.get('productId') descriptions = program_metadata.get('description', {})
title = entry.get('title') description = descriptions.get('long') or descriptions.get('short') or descriptions.get('brief')
synopsis = entry.get('synopsis', {}) duration = program_metadata.get('durationInSeconds')
description = (synopsis.get('long')
or synopsis.get('short')
or synopsis.get('brief'))
thumbnails = entry.get('images')
for thumbnail_entry in thumbnails:
if 'url' in thumbnail_entry:
thumbnail = thumbnail_entry.get('url')
if not product_id: if not product_id:
raise ExtractorError('No productId found for slug: %s' % slug) raise ExtractorError('No productId found for slug: %s' % slug)
@ -96,9 +69,9 @@ class NPOIE(InfoExtractor):
'id': slug, 'id': slug,
'formats': formats, 'formats': formats,
'title': title or slug, 'title': title or slug,
'description': description, 'description': description or title or slug,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
# TODO fill in other metadata that's available 'duration': duration,
} }
def _download_by_product_id(self, product_id, slug, url=None): def _download_by_product_id(self, product_id, slug, url=None):
@ -106,7 +79,8 @@ class NPOIE(InfoExtractor):
formats = [] formats = []
for profile in ( for profile in (
'dash', 'dash',
# 'hls', # TODO test what needs to change for 'hls' support # 'hls' is available too, but implementing it doesn't add much
# As far as I know 'dash' is always available
): ):
stream_link = self._download_json( stream_link = self._download_json(
'https://prod.npoplayer.nl/stream-link', video_id=slug, 'https://prod.npoplayer.nl/stream-link', video_id=slug,
@ -131,6 +105,7 @@ class BNNVaraIE(NPOIE):
_VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*' _VALID_URL = r'https?://(?:www\.)?bnnvara\.nl/videos/[0-9]*'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bnnvara.nl/videos/27455', 'url': 'https://www.bnnvara.nl/videos/27455',
# TODO fill in other test attributes
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -170,6 +145,7 @@ class ONIE(NPOIE):
_VALID_URL = r'https?://(?:www\.)?ongehoordnederland.tv/.*' _VALID_URL = r'https?://(?:www\.)?ongehoordnederland.tv/.*'
_TESTS = [{ _TESTS = [{
'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/', 'url': 'https://ongehoordnederland.tv/2024/03/01/korte-clips/heeft-preppen-zin-betwijfel-dat-je-daar-echt-iets-aan-zult-hebben-bij-oorlog-lydia-daniel/',
# TODO fill in other test attributes
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -181,34 +157,8 @@ class ONIE(NPOIE):
formats.extend(self._download_by_product_id(result, video_id)) formats.extend(self._download_by_product_id(result, video_id))
if not formats: if not formats:
raise ExtractorError('Could not find a POMS product id in the provided URL.') raise ExtractorError('Could not find a POMS product id in the provided URL, '
'perhaps because all stream URLs are DRM protected.')
return {
'id': video_id,
'title': video_id,
'formats': formats,
}
class VPROIE(NPOIE):
IE_NAME = 'vpro'
IE_DESC = 'vpro.nl'
_VALID_URL = r'https?://(?:www\.)?vpro.nl/.*'
_TESTS = [{
'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html',
}]
def _real_extract(self, url):
video_id = url.rstrip('/').split('/')[-1]
page, _ = self._download_webpage_handle(url, video_id)
results = re.findall('data-media-id="(.+_.+)"\s', page)
formats = []
for result in results:
formats.extend(self._download_by_product_id(result, video_id))
break # TODO find a better solution, VPRO pages can have multiple videos embedded
if not formats:
raise ExtractorError('Could not find a POMS product id in the provided URL.')
return { return {
'id': video_id, 'id': video_id,
@ -224,6 +174,7 @@ class ZAPPIE(NPOIE):
_TESTS = [{ _TESTS = [{
'url': 'https://www.zapp.nl/programmas/zappsport/gemist/AT_300003973', 'url': 'https://www.zapp.nl/programmas/zappsport/gemist/AT_300003973',
# TODO fill in other test attributes
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -236,3 +187,93 @@ class ZAPPIE(NPOIE):
'title': video_id, 'title': video_id,
'formats': formats, 'formats': formats,
} }
class SchoolTVIE(NPOIE):
    """Extractor for schooltv.nl item pages.

    The item metadata is read from the site's Next.js data endpoint and the
    streams are resolved through NPOIE._download_by_product_id using the
    'poms_mid' found in that metadata.
    """

    IE_NAME = 'schooltv'
    IE_DESC = 'schooltv.nl'
    # The dot in the host name is escaped so that only schooltv.nl matches.
    _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/item/.*'
    _TESTS = [{
        'url': 'https://schooltv.nl/item/zapp-music-challenge-2015-zapp-music-challenge-2015',
        # TODO fill in other test attributes
    }]
    # Last known build id; only used when it cannot be read from the item page.
    _FALLBACK_BUILD_ID = 'b7eHUzAVO7wHXCopYxQhV'

    def _find_build_id(self, url, video_id):
        """Return the build id advertised by the item page, or the fallback.

        Assumes the page embeds a '"buildId": "..."' entry in its JSON
        bootstrap data, as Next.js pages normally do. The lookup is
        non-fatal: any failure falls back to the last known build id.
        """
        page = self._download_webpage(
            url, video_id, note='Looking up build id', fatal=False)
        if not page:
            return self._FALLBACK_BUILD_ID
        return self._search_regex(
            r'"buildId"\s*:\s*"([^"]+)"', page, 'build id',
            default=self._FALLBACK_BUILD_ID)

    def _real_extract(self, url):
        """Build the info dict for a schooltv.nl item URL.

        Raises ExtractorError when the metadata holds no POMS id or when no
        formats could be obtained for it.
        """
        video_id = url.rstrip('/').split('/')[-1]
        build_id = self._find_build_id(url, video_id)
        metadata_url = 'https://schooltv.nl/_next/data/%s/item/%s.json' % (
            build_id, video_id)
        response = self._download_json(metadata_url, video_id) or {}
        # 'or {}' also covers keys that are present but null.
        metadata = (response.get('pageProps') or {}).get('data') or {}

        product_id = metadata.get('poms_mid')
        if not product_id:
            raise ExtractorError(
                'No POMS product id found in the metadata for: %s' % video_id)

        formats = self._download_by_product_id(product_id, video_id)
        if not formats:
            raise ExtractorError('Could not find a POMS product id in the provided URL, '
                                 'perhaps because all stream URLs are DRM protected.')

        # Join only the parts that exist, so a missing subtitle does not
        # leave a dangling ' - ' and a null value does not raise TypeError.
        title = ' - '.join(
            part for part in (metadata.get('title'), metadata.get('subtitle'))
            if part)

        return {
            'id': video_id,
            'title': title or video_id,
            'description': metadata.get('description') or metadata.get('short_description'),
            'formats': formats,
        }
class HetKlokhuisIE(NPOIE):
    """Placeholder for the HetKlokhuisIE extractor; not implemented yet."""

    # TODO define IE_NAME, IE_DESC, _VALID_URL and _TESTS for this site.
    # Until then, do not inherit NPOIE's npo.nl test cases.
    _TESTS = []

    @classmethod
    def suitable(cls, url):
        """Decline every URL while this extractor is only scaffolding.

        Without a _VALID_URL of its own the class would inherit NPOIE's
        npo.nl pattern and could claim URLs that belong to NPOIE.
        """
        return False

    def _real_extract(self, url):
        """Fail loudly instead of silently returning None."""
        raise ExtractorError(
            '%s is not implemented yet' % type(self).__name__)
class VPROIE(NPOIE):
    """Extractor for vpro.nl pages.

    Looks for 'data-media-id' attributes in the page and resolves the first
    one through NPOIE._download_by_product_id.
    """

    IE_NAME = 'vpro'
    IE_DESC = 'vpro.nl'
    # The dot in the host name is escaped so that only vpro.nl matches.
    _VALID_URL = r'https?://(?:www\.)?vpro\.nl/.*'
    _TESTS = [{
        'url': 'https://www.vpro.nl/programmas/tegenlicht/kijk/afleveringen/2015-2016/offline-als-luxe.html',
        # TODO fill in other test attributes
    }]

    def _real_extract(self, url):
        """Build the info dict for a vpro.nl URL.

        Raises ExtractorError when no media id is found on the page or when
        no formats could be obtained for it.
        """
        video_id = url.rstrip('/').split('/')[-1]
        page = self._download_webpage(url, video_id)
        # Stay inside the attribute's own quotes: a greedy '.+' could run on
        # into later attributes on the same line. No trailing whitespace is
        # required, so an attribute directly followed by '>' is found too.
        media_ids = re.findall(r'data-media-id="([^"]+_[^"]+)"', page)

        formats = []
        if media_ids:
            # TODO find a better solution, VPRO pages can have multiple
            # videos embedded; only the first media id is used for now.
            formats = self._download_by_product_id(media_ids[0], video_id)

        if not formats:
            raise ExtractorError('Could not find a POMS product id in the provided URL, '
                                 'perhaps because all stream URLs are DRM protected.')

        return {
            'id': video_id,
            'title': video_id,
            'formats': formats,
        }
class WNLIE(NPOIE):
    """Placeholder for the WNLIE extractor; not implemented yet."""

    # TODO define IE_NAME, IE_DESC, _VALID_URL and _TESTS for this site.
    # Until then, do not inherit NPOIE's npo.nl test cases.
    _TESTS = []

    @classmethod
    def suitable(cls, url):
        """Decline every URL while this extractor is only scaffolding.

        Without a _VALID_URL of its own the class would inherit NPOIE's
        npo.nl pattern and could claim URLs that belong to NPOIE.
        """
        return False

    def _real_extract(self, url):
        """Fail loudly instead of silently returning None."""
        raise ExtractorError(
            '%s is not implemented yet' % type(self).__name__)
class AndereTijdenIE(NPOIE):
    """Placeholder for the AndereTijdenIE extractor; not implemented yet."""

    # TODO define IE_NAME, IE_DESC, _VALID_URL and _TESTS for this site.
    # Until then, do not inherit NPOIE's npo.nl test cases.
    _TESTS = []

    @classmethod
    def suitable(cls, url):
        """Decline every URL while this extractor is only scaffolding.

        Without a _VALID_URL of its own the class would inherit NPOIE's
        npo.nl pattern and could claim URLs that belong to NPOIE.
        """
        return False

    def _real_extract(self, url):
        """Fail loudly instead of silently returning None."""
        raise ExtractorError(
            '%s is not implemented yet' % type(self).__name__)