[nebula] Add better channel title extraction (refs #21258)

This commit is contained in:
Henrik Heimbuerger 2020-04-16 04:35:05 +02:00
parent 469cae38cd
commit 61cead3235

View File

@ -105,13 +105,32 @@ class NebulaIE(InfoExtractor):
return video_url return video_url
def _extract_uploader(self, video_meta): def _extract_channel(self, video_meta):
""" """
Nebula doesn't really seem to have the concept of an uploader internally, videos are often organized Extract the channel title, by going through the list of categories and finding the first value of the
more like a (TV) series than by uploader. But in the example case, Lindsay Ellis is the creator, so first category that has a value.
I'll go with this for now.
I know this look like a terrible approach. But actually, it's just reproducing the behavior of the
React code the Nebula frontend uses (as of 2020-04-07):
let channel;
if (video && video.categories && video.categories.length) {
const channelTitle = video.categories.map((category) => (category.value[0]))
.filter((title) => (!!title))[0];
channel = getChannelByTitle(state, { title: channelTitle });
}
Basically, it finds the first (truthy) value in the category list and that's assumed to be the
channel title. And then the channel details (e.g. the URL) are looked up by title (!) (not by any
kind of ID) via an additional API call.
TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL
""" """
return video_meta['categories'][0]['value'][0] categories = video_meta['categories']
for category in categories:
if category['value']:
return category['value'][0]
return None
def _real_extract(self, url): def _real_extract(self, url):
# FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
@ -127,6 +146,7 @@ class NebulaIE(InfoExtractor):
# extract the state object from the webpage, and then retrieve video meta data from it # extract the state object from the webpage, and then retrieve video meta data from it
state_object = self._extract_state_object(webpage, display_id) state_object = self._extract_state_object(webpage, display_id)
video_id, video_meta = self._extract_video_metadata(state_object, display_id) video_id, video_meta = self._extract_video_metadata(state_object, display_id)
channel_title = self._extract_channel(video_meta)
# extract the video URL from the webpage # extract the video URL from the webpage
video_url = self._extract_video_url(webpage, state_object, video_id) video_url = self._extract_video_url(webpage, state_object, video_id)
@ -146,7 +166,6 @@ class NebulaIE(InfoExtractor):
'title': video_meta['title'], 'title': video_meta['title'],
'description': video_meta['description'], 'description': video_meta['description'],
'timestamp': parse_iso8601(video_meta['published_at']), 'timestamp': parse_iso8601(video_meta['published_at']),
#'uploader': self._extract_uploader(video_meta), # TODO: removed because unreliable/sometimes incorrect
'thumbnails': [ 'thumbnails': [
{ {
'id': tn['name'], # this appears to be null in all cases I've seen 'id': tn['name'], # this appears to be null in all cases I've seen
@ -156,8 +175,9 @@ class NebulaIE(InfoExtractor):
} for tn in video_meta['thumbnails'] } for tn in video_meta['thumbnails']
], ],
'duration': video_meta['duration'], 'duration': video_meta['duration'],
'channel': channel_title,
'uploader': channel_title, # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series
# TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from! # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from!
# TODO: channel
# TODO: channel_id # TODO: channel_id
# TODO: channel_url # TODO: channel_url
} }