diff --git a/AUTHORS b/AUTHORS
index b507cb8df..64ac71249 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -246,3 +246,4 @@ Enes Solak
 Nathan Rossi
 Thomas van der Berg
 Luca Cherubin
+Henrik Heimbuerger
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index aa8026a32..a4372912e 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -569,6 +569,7 @@
  - **ndr:embed**
  - **ndr:embed:base**
  - **NDTV**
+ - **Nebula**
  - **NerdCubedFeed**
  - **netease:album**: 网易云音乐 - 专辑
  - **netease:djradio**: 网易云音乐 - 电台
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 29b0e615e..9874441d5 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -731,8 +731,9 @@ from .ndr import (
     NJoyEmbedIE,
 )
 from .ndtv import NDTVIE
-from .netzkino import NetzkinoIE
+from .nebula import NebulaIE
 from .nerdcubed import NerdCubedFeedIE
+from .netzkino import NetzkinoIE
 from .neteasemusic import (
     NetEaseMusicIE,
     NetEaseMusicAlbumIE,
diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
new file mode 100644
index 000000000..e22a4b088
--- /dev/null
+++ b/youtube_dl/extractor/nebula.py
@@ -0,0 +1,133 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import os
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH')  # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
+
+
+class NebulaIE(InfoExtractor):
+    """
+    Nebula (https://watchnebula.com/) is a video platform created by the streamer community Standard. It hosts videos
+    off-YouTube from a small hand-picked group of creators.
+
+    All videos require a subscription to watch. There are no known freely available videos. So the test case is
+    disabled (but should pass when supplying a 'nebula-auth' cookie for an account with a valid subscription).
+
+    Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off
+    video extraction to the Zype extractor.
+
+    This description has been last updated on 2020-04-07.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P<id>[-\w]+)'  # the 'id' group is actually the slug, but we misname it 'id' to be able to use _match_id()
+    _TEST = {
+        'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast',
+        'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
+        'info_dict': {
+            'id': '5c271b40b13fd613090034fd',
+            'ext': 'mp4',
+            'title': 'That Time Disney Remade Beauty and the Beast',
+            'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
+            'upload_date': '20180731',
+            'timestamp': 1533009600,
+            # 'uploader': 'Lindsay Ellis',  # TODO: removed because unreliable/sometimes incorrect
+        }
+    }
+    _WORKING = False  # this is set to False because the test won't pass without an auth cookie for a (paid) subscription
+
+    def _extract_state_object(self, webpage, display_id):
+        """
+        As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script
+        tag. This function extracts the content of that script tag and parses it as JSON.
+        """
+        # NOTE(review): the script tag's id/type attributes below must match the live page markup -- confirm
+        initial_state_object = self._search_regex(r'<script id="initial-app-state" type="application/json">(.+?)</script>', webpage, 'initial_state')
+        metadata = self._parse_json(initial_state_object, video_id=display_id)  # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
+
+        return metadata
+
+    def _extract_video_metadata(self, state_object, display_id):
+        """
+        The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the
+        video ID, we can then extract a dictionary with various meta data about the video itself.
+        """
+        video_id = state_object['videos']['byURL'][display_id]
+        video_meta = state_object['videos']['byID'][video_id]
+
+        return video_id, video_meta
+
+    def _extract_video_url(self, webpage, state_object, video_id):
+        """
+        To get the embed URL of the actual video stream, we could reconstruct it from the video ID, but it seems a
+        bit more stable to extract the iframe source that links to the video.
+        """
+        iframe = self._search_regex(r'<iframe(.+?)</iframe>', webpage, 'iframe', fatal=False)
+        video_url = self._search_regex(r'src="(.+?)"', iframe, 'iframe-src', fatal=False) if iframe else None
+
+        # fallback: reconstruct using video ID and access token from state object
+        if not video_url:
+            access_token = state_object['account']['userInfo']['zypeAuthInfo']['accessToken']
+            video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(video_id=video_id, access_token=access_token)
+
+        return video_url
+
+    def _extract_uploader(self, video_meta):
+        """
+        Nebula doesn't really seem to have the concept of an uploader internally, videos are often organized
+        more like a (TV) series than by uploader. But in the example case, Lindsay Ellis is the creator, so
+        I'll go with this for now.
+        """
+        return video_meta['categories'][0]['value'][0]
+
+    def _real_extract(self, url):
+        # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
+        if COOKIE_NEBULA_AUTH:
+            self._set_cookie('watchnebula.com', 'nebula-auth', COOKIE_NEBULA_AUTH)
+
+        # extract the video's display ID from the URL (we'll retrieve the video ID later)
+        display_id = self._match_id(url)
+
+        # download the page
+        webpage = self._download_webpage(url, video_id=display_id)  # TODO: what video ID do I supply, as I don't know it yet? _download_webpage doesn't accept a display_id instead...
+
+        # extract the state object from the webpage, and then retrieve video meta data from it
+        state_object = self._extract_state_object(webpage, display_id)
+        video_id, video_meta = self._extract_video_metadata(state_object, display_id)
+
+        # extract the video URL from the webpage
+        video_url = self._extract_video_url(webpage, state_object, video_id)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+
+            # we're passing this video URL on to the 'Zype' extractor (that's the video infrastructure that Nebula is
+            # built on top of) and use the 'url_transparent' type to indicate that our meta data should be better than
+            # whatever the Zype extractor is able to identify
+            '_type': 'url_transparent',
+            'ie_key': 'Zype',
+            'url': video_url,
+
+            # the meta data we were able to extract from Nebula; only the title is treated as mandatory, a missing optional field must not abort the extraction
+            'title': video_meta['title'],
+            'description': video_meta.get('description'),
+            'timestamp': parse_iso8601(video_meta.get('published_at')),
+            # 'uploader': self._extract_uploader(video_meta),  # TODO: removed because unreliable/sometimes incorrect
+            'thumbnails': [
+                {
+                    'id': tn.get('name'),  # this appears to be null in all cases I've seen
+                    'url': tn['url'],
+                    'width': tn.get('width'),
+                    'height': tn.get('height'),
+                } for tn in video_meta.get('thumbnails') or []
+            ],
+            'duration': video_meta.get('duration'),
+            # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from!
+            # TODO: channel
+            # TODO: channel_id
+            # TODO: channel_url
+        }