youtube-dl/youtube_dl/extractor/nebula.py

194 lines
10 KiB
Python
Raw Normal View History

# coding: utf-8
from __future__ import unicode_literals
import os
from .common import InfoExtractor
2020-04-18 13:15:03 +09:00
from ..compat import compat_str
from ..utils import parse_iso8601, try_get
# FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the
# unit tests.
# The value is read once, at import time, from the COOKIE_NEBULA_AUTH environment variable (None when the
# variable is not set). When set, the extractor installs it as the 'nebula-auth' cookie for watchnebula.com.
COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH')
class NebulaIE(InfoExtractor):
    """
    Nebula (https://watchnebula.com/) is a video platform created by the streamer community Standard. It hosts videos
    off-YouTube from a small hand-picked group of creators.

    All videos require a subscription to watch. There are no known freely available videos. So the test cases
    only pass when supplying a 'nebula-auth' cookie for an account with a valid subscription.

    Nebula uses the Zype video infrastructure and this extractor is using the 'url_transparent' mode to hand off
    video extraction to the Zype extractor.

    This description has been last updated on 2020-04-07.
    """

    # the 'id' group is actually the display_id, but we misname it 'id' to be able to use _match_id()
    _VALID_URL = r'https?://(?:www\.)?watchnebula\.com/videos/(?P<id>[-\w]+)'
    _TESTS = [
        {
            'url': 'https://watchnebula.com/videos/that-time-disney-remade-beauty-and-the-beast',
            'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
            'info_dict': {
                'id': '5c271b40b13fd613090034fd',
                'ext': 'mp4',
                'title': 'That Time Disney Remade Beauty and the Beast',
                'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We werent able to remove it without reducing video quality, so its presented here in its original context.',
                'upload_date': '20180731',
                'timestamp': 1533009600,
                'channel': 'Lindsay Ellis',
                'uploader': 'Lindsay Ellis',
            }
        },
        {
            'url': 'https://watchnebula.com/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
            'md5': 'b0b171504d67e2822179149ccd6787db',
            'info_dict': {
                'id': '5e7e78171aaf320001fbd6be',
                'ext': 'mp4',
                'title': 'Landing Craft - How The Allies Got Ashore',
                'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
                'upload_date': '20200327',
                'timestamp': 1585348140,
                'channel': 'The Logistics of D-Day',
                'uploader': 'The Logistics of D-Day',
            }
        },
        {
            'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
            'md5': '98e96346caa3b303fec4493c5d49dcb5',
            'info_dict': {
                'id': '5e779ebdd157bc0001d1c75a',
                'ext': 'mp4',
                'title': 'Episode 1: The Draw',
                'description': r're:^Theres free money on offer… if the players can all work together.',
                'upload_date': '20200323',
                'timestamp': 1584980400,
                'channel': 'Tom Scott Presents: Money',
                'uploader': 'Tom Scott Presents: Money',
            }
        },
    ]
    # FIXME: should this be set to False, to hide the tests from CI, given that the unit tests require an auth
    # cookie of a (paid) subscription?
    _WORKING = True

    def _extract_state_object(self, webpage, display_id):
        """
        As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script
        tag. This function is extracting this script tag, parsing it as JSON.

        :param webpage: HTML source of the video page
        :param display_id: display ID from the URL; only used to label log/warning messages, as the real video ID
                           is not known yet at this point
        :return: the parsed state object, or None if no state object could be found or it didn't contain valid JSON
        """
        initial_state_object = self._search_regex(
            r'<script[^>]*id="initial-app-state"[^>]*>(.+?)</script>', webpage,
            'initial_state', fatal=False, default=None)
        if not initial_state_object:
            return None
        # fatal=False: invalid JSON must yield None (as documented) instead of aborting the extraction, so that
        # the caller can still fall back to the iframe for the video URL
        return self._parse_json(initial_state_object, video_id=display_id, fatal=False)

    def _extract_video_metadata(self, state_object, display_id):
        """
        The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the
        video ID, we can then extract a dictionary with various meta data about the video itself.

        :param state_object: parsed state object (may be None)
        :param display_id: display ID from the URL
        :return: tuple (video_id, video_meta); (None, {}) if no state object was given or it didn't contain the
                 expected lookup table or meta data
        """
        video_id = try_get(state_object, lambda x: x['videos']['byURL'][display_id], compat_str)
        video_meta = try_get(state_object, lambda x: x['videos']['byID'][video_id], dict) or {}
        return video_id, video_meta

    def _extract_video_url(self, webpage, state_object, video_id):
        """
        To get the embed URL of the actual video stream, we could reconstruct it from the video ID, but it seems a
        bit more stable to extract the iframe source that links to the video.

        :param webpage: HTML source of the video page
        :param state_object: parsed state object (may be None); source of the access token for the fallback
        :param video_id: video ID (may be None); only needed for the fallback
        :return: the embed URL, or None if neither the iframe nor the fallback yielded a usable URL
        """
        iframe = self._search_regex(r'<iframe(.+?)</iframe>', webpage, 'iframe', fatal=False)
        video_url = self._search_regex(r'src="(.+?)"', iframe, 'iframe-src', fatal=False) if iframe else None
        if video_url:
            return video_url

        # fallback: reconstruct using video ID and access token from state object
        access_token = try_get(
            state_object, lambda x: x['account']['userInfo']['zypeAuthInfo']['accessToken'], compat_str)
        if not video_id or not access_token:
            # never build a URL containing a literal 'None' -- it could not possibly work
            return None
        return 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(
            video_id=video_id, access_token=access_token)

    def _extract_channel(self, video_meta):
        """
        Extract the channel title, by going through the list of categories and finding the first value of the
        first category that has a value.

        I know this look like a terrible approach. But actually, it's just reproducing the behavior of the
        React code the Nebula frontend uses (as of 2020-04-07):

            let channel;
            if (video && video.categories && video.categories.length) {
                const channelTitle = video.categories.map((category) => (category.value[0]))
                                                     .filter((title) => (!!title))[0];
                channel = getChannelByTitle(state, { title: channelTitle });
            }

        Basically, it finds the first (truthy) value in the category list and that's assumed to be the
        channel title. And then the channel details (e.g. the URL) are looked up by title (!) (not by any
        kind of ID) via an additional API call.

        TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then
              figure out the channel URL

        :param video_meta: video meta data dictionary (may be None or empty)
        :return: the channel title, or None if no category list could be found or no category had a label
                 ('value')
        """
        # 'or []' also covers an explicit null in the JSON, which a .get() default would not catch
        categories = (video_meta or {}).get('categories') or []
        for category in categories:
            # try_get swallows missing keys, empty lists and non-dict entries; the truthiness check mirrors the
            # frontend's filter((title) => (!!title))
            title = try_get(category, lambda x: x['value'][0])
            if title:
                return title
        return None

    def _real_extract(self, url):
        """
        Extract the Nebula meta data for the given video page URL and hand the actual video extraction off to the
        Zype extractor.

        :param url: video page URL matching _VALID_URL
        :return: an info dictionary of type 'url_transparent'
        :raises ExtractorError: if no video URL could be determined
        """
        # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running
        # the unit tests
        if COOKIE_NEBULA_AUTH:
            self._set_cookie('watchnebula.com', 'nebula-auth', COOKIE_NEBULA_AUTH)

        # extract the video's display ID from the URL (we'll retrieve the video ID later)
        display_id = self._match_id(url)

        # download the page (the real video ID is not known yet, so the display ID is used for log messages)
        webpage = self._download_webpage(url, video_id=display_id)

        # extract the state object from the webpage, and then retrieve video meta data from it
        state_object = self._extract_state_object(webpage, display_id)
        video_id, video_meta = self._extract_video_metadata(state_object, display_id)
        channel_title = self._extract_channel(video_meta)

        # extract the video URL from the webpage
        video_url = self._extract_video_url(webpage, state_object, video_id)
        if not video_url:
            # fail with a clear message here instead of handing an unusable URL to the Zype extractor
            from ..utils import ExtractorError
            raise ExtractorError('Unable to determine the video URL for %s' % display_id)

        # skip thumbnail entries without a URL instead of failing the whole extraction on them
        thumbnails = [
            {
                'id': tn.get('name'),  # this appears to be null in all cases I've seen
                'url': tn['url'],
                'width': tn.get('width'),
                'height': tn.get('height'),
            }
            for tn in video_meta.get('thumbnails') or []
            if isinstance(tn, dict) and tn.get('url')]

        return {
            'id': video_id,
            'display_id': display_id,

            # we're passing this video URL on to the 'Zype' extractor (that's the video infrastructure that Nebula is
            # built on top of) and use the 'url_transparent' type to indicate that our meta data should be better than
            # whatever the Zype extractor is able to identify
            '_type': 'url_transparent',
            'ie_key': 'Zype',
            'url': video_url,

            # the meta data we were able to extract from Nebula
            'title': video_meta.get('title'),
            'description': video_meta.get('description'),
            'timestamp': parse_iso8601(video_meta.get('published_at')),
            'thumbnails': thumbnails,
            'duration': video_meta.get('duration'),
            'channel': channel_title,
            # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as
            # sometimes it's more of a series
            'uploader': channel_title,
            # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I
            #       cannot figure out where it gets it from!
            # TODO: channel_id
            # TODO: channel_url
        }