# coding: utf-8
# Module: youtube_dl/extractor/gbnews.py
#
# Registration required in youtube_dl/extractor/extractors.py (placed between
# the ``.gazeta`` and ``.gdcvault`` imports):
#
#     from .gbnews import (
#         GBNewsIE,
#         GBNewsLiveIE,
#     )
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import (
    compat_str,
)
from ..utils import (
    extract_attributes,
    ExtractorError,
    try_get,
)


class GBNewsIE(InfoExtractor):
    '''GB News clips and features'''

    # The numeric trailing component is the video id returned by _match_id().
    # NOTE(review): the name of the optional slug group was lost from the
    # reviewed text; 'display_id' is a reconstruction and is not referenced
    # anywhere in this module - confirm against the upstream change.
    _VALID_URL = r'https?://(?:www\.)?gbnews\.uk/(?:shows(?:/(?P<display_id>[^/]+))?|a)/(?P<id>\d+)'
    # Sent as the 'platform' query parameter of the stream API request.
    _PLATFORM = 'safari'
    # Simplestream endpoint queried for the API host name.
    _SSMP_URL = 'https://mm-dev.simplestream.com/ssmp/api.php'
    _TESTS = [{
        'url': 'https://www.gbnews.uk/shows/andrew-neils-message-to-companies-choosing-to-boycott-gb-news/106889',
        'info_dict': {
            'id': '106889',
            'ext': 'mp4',
            'title': "Andrew Neil's message to companies choosing to boycott GB News",
            'description': 'md5:b281f5d22fd6d5eda64a4e3ba771b351',
        },
    },
    ]

    def _real_extract(self, url):
        '''Extract the HLS formats and metadata for a GB News page.

        Steps, in order:
          1. download the web page and locate the <div class="simplestream">
             element, whose data-* attributes carry the player parameters;
          2. ask the Simplestream SSMP endpoint for the API host name;
          3. ask that host for the stream URL;
          4. expand the stream URL as an m3u8 manifest.

        Raises ExtractorError when the Simplestream ID, the API host or the
        stream URL is missing, or when the stream reports DRM.
        '''
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)
        # extraction based on https://github.com/ytdl-org/youtube-dl/issues/29341
        # Attributes read from the simplestream <div> below: data-id,
        # data-env, data-uvid, data-type, data-key, data-token, data-expiry
        # and data-poster.
        # exception if no match
        player_tag = self._search_regex(
            r'<\s*div\s[^>]*class\s*=\s*([\'"])simplestream\1[^>]*>',
            webpage, "video data", group=0)

        video_data = extract_attributes(player_tag)
        ss_id = video_data.get('data-id')
        if not ss_id:
            raise ExtractorError('Simplestream ID not found')

        # exception if no JSON
        json_data = self._download_json(
            self._SSMP_URL, display_id,
            note='Downloading Simplestream JSON metadata',
            errnote='Unable to download Simplestream JSON metadata',
            query={
                'id': ss_id,
                'env': video_data.get('data-env'),
            })

        meta_url = try_get(json_data, lambda x: x['response']['api_hostname'], compat_str)
        if not meta_url:
            raise ExtractorError('No API host found')

        uvid = video_data.get('data-uvid')
        dtype = video_data.get('data-type')
        # The API path segment is 'show' for data-type 'vod'; any other
        # data-type value is used verbatim.
        api_kind = 'show' if dtype == 'vod' else dtype
        # exception if no JSON
        stream_data = self._download_json(
            '%s/api/%s/stream/%s' % (meta_url, api_kind, uvid),
            display_id,
            query={
                'key': video_data.get('data-key'),
                'platform': self._PLATFORM,
            },
            headers={
                'Token': video_data.get('data-token'),
                'Token-Expiry': video_data.get('data-expiry'),
                'Uvid': uvid,
            })

        stream_url = try_get(stream_data, lambda x: x['response']['stream'], compat_str)
        if not stream_url:
            raise ExtractorError('No stream data')

        # the successful lookup above proves that 'response' is a dict
        stream_data = stream_data['response']
        drm = stream_data.get('drm')
        if drm:
            raise ExtractorError(
                'Stream is requesting DRM (%s) playback: unsupported' % drm,
                expected=True)

        formats = self._extract_m3u8_formats(
            stream_url, display_id, ext='mp4', fatal=False)

        # exception if no formats
        self._sort_formats(formats)

        # no 'title' attribute seen, but if it comes ...
        title = stream_data.get('title') or self._og_search_title(webpage)

        return {
            'id': display_id,
            'title': title,
            'description': self._og_search_description(webpage, default=None),
            'thumbnail': video_data.get('data-poster') or None,
            'formats': formats,
            # True only for extractors whose IE_NAME contains 'Live'
            'is_live': 'Live' in self.IE_NAME,
        }


class GBNewsLiveIE(GBNewsIE):
    '''GB News live programme stream'''

    # The literal path component 'watchlive' doubles as the video id.
    _VALID_URL = r'https?://(?:www\.)?gbnews\.uk/(?P<id>watchlive)(?:$|[/?#])'
    _TESTS = [{
        'url': 'https://www.gbnews.uk/watchlive',
        'info_dict': {
            'id': 'watchlive',
            'ext': 'mp4',
            'title': "Watchlive",
            'is_live': True,
        },
    },
    ]

    # Extraction is inherited unchanged from GBNewsIE._real_extract, which
    # looks for the simplestream <div> in the downloaded page.