[MicrosoftStream] Add extractor for channels and refactor single video extractor

This commit is contained in:
Nick Lai 2020-04-10 21:34:20 +08:00
parent 292be92987
commit 67db51cdfe
No known key found for this signature in database
GPG Key ID: AF5E3B79EE6B1CC4
2 changed files with 176 additions and 76 deletions

View File

@ -619,7 +619,10 @@ from .metacritic import MetacriticIE
from .mgoon import MgoonIE from .mgoon import MgoonIE
from .mgtv import MGTVIE from .mgtv import MGTVIE
from .miaopai import MiaoPaiIE from .miaopai import MiaoPaiIE
from .microsoftstream import MicrosoftStreamIE from .microsoftstream import (
MicrosoftStreamIE,
MicrosoftStreamChannelIE
)
from .microsoftvirtualacademy import ( from .microsoftvirtualacademy import (
MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyIE,
MicrosoftVirtualAcademyCourseIE, MicrosoftVirtualAcademyCourseIE,

View File

@ -1,11 +1,13 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ExtractorError from ..utils import (ExtractorError, update_url_query, merge_dicts)
class MicrosoftStreamBaseIE(InfoExtractor): class MicrosoftStreamBaseIE(InfoExtractor):
_LOGIN_URL = 'https://web.microsoftstream.com/?noSignUpCheck=1' # expect redirection _LOGIN_URL = 'https://web.microsoftstream.com/?noSignUpCheck=1' # expect redirection
_ACCESS_TOKEN = None # A JWT token
_API_GATEWAY = None
def is_logged_in(self, webpage): def is_logged_in(self, webpage):
""" """
@ -37,43 +39,14 @@ class MicrosoftStreamBaseIE(InfoExtractor):
self._API_GATEWAY = self._html_search_regex(r"\"ApiGatewayUri\":\"(?P<APIGateway>.+?)\"", webpage, 'APIGateway') self._API_GATEWAY = self._html_search_regex(r"\"ApiGatewayUri\":\"(?P<APIGateway>.+?)\"", webpage, 'APIGateway')
return self._API_GATEWAY return self._API_GATEWAY
class MicrosoftStreamIE(MicrosoftStreamBaseIE):
""" """
Extractor for single Microsoft Stream video Common getters
"""
IE_NAME = 'microsoftstream'
_VALID_URL = r'https?://(?:(?:web|www)\.)?microsoftstream\.com/video/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})' # https://regex101.com/r/K1mlgK/1/
_NETRC_MACHINE = 'microsoftstream'
_ACCESS_TOKEN = None # A JWT token
_API_GATEWAY = None
_TEXTTRACKS_RESPONSE = None
_VIDEO_ID = None
_TEST = [{
'url': 'https://web.microsoftstream.com/video/c883c6a5-9895-4900-9a35-62f4b5d506c9',
'info_dict': {
'id': 'c883c6a5-9895-4900-9a35-62f4b5d506c9',
'ext': 'mp4',
'title': 'Webinar for Researchers: Use of GitLab',
'thumbnail': r're:^https?://.*$'
},
'skip': 'Requires Microsoft 365 account credentials',
}, {
'url': 'https://web.microsoftstream.com/video/c883c6a5-9895-4900-9a35-62f4b5d506c9',
'only_matching': True,
}, {
'url': 'https://www.microsoftstream.com/video/1541f3f9-7fed-4901-ae70-0f7cb775679f',
'only_matching': True,
}]
"""
Getters
The following getters include helpful message to prompt developers for potential errors.
""" """
@property @property
def api_gateway(self): def api_gateway(self):
"""
Return the start of an API endoint, like "https://aaea-1.api.microsoftstream.com/api/"
"""
if self._API_GATEWAY is None: if self._API_GATEWAY is None:
raise ExtractorError('API gateway is None. Did you forget to call "_extract_api_gateway"?') raise ExtractorError('API gateway is None. Did you forget to call "_extract_api_gateway"?')
return self._API_GATEWAY return self._API_GATEWAY
@ -82,34 +55,15 @@ class MicrosoftStreamIE(MicrosoftStreamBaseIE):
def access_token(self): def access_token(self):
if self._ACCESS_TOKEN is None: if self._ACCESS_TOKEN is None:
raise ExtractorError('Access token is None. Did you forget to call "_extract_access_token"?') raise ExtractorError('Access token is None. Did you forget to call "_extract_access_token"?')
return self._ACCESS_TOKEN return self._ACCESS_TOKEN
@property
def video_id(self):
if self._VIDEO_ID is None:
raise('Variable "_VIDEO_ID" is not defined. Did you make the main extraction call?')
return self._VIDEO_ID
@property @property
def headers(self): def headers(self):
return {'Authorization': 'Bearer %s' % self.access_token} return {'Authorization': 'Bearer %s' % self.access_token}
@property """
def texttrack_info_endpoint(self): Utils function
return "%s/videos/%s/texttracks?api-version=1.3-private" % (self.api_gateway, self.video_id) """
@property
def media_info_endpoint(self):
return "%s/videos/%s?$expand=creator,tokens,status,liveEvent,extensions&api-version=1.3-private" % (self.api_gateway, self.video_id)
def _request_texttracks(self):
"""
Make an additional request to Microsoft Stream for the subtitle and auto-caption
"""
# Map default variable
self._TEXTTRACKS_RESPONSE = self._download_json(self.texttrack_info_endpoint, self.video_id, headers=self.headers).get('value')
return self._TEXTTRACKS_RESPONSE
def _determine_protocol(self, mime): def _determine_protocol(self, mime):
""" """
@ -122,6 +76,12 @@ class MicrosoftStreamIE(MicrosoftStreamBaseIE):
else: else:
return None return None
"""
Remapper
Remap data into correct field
"""
def _remap_thumbnails(self, thumbnail_dict_list): def _remap_thumbnails(self, thumbnail_dict_list):
output = [] output = []
preference_index = ['extraSmall', 'small', 'medium', 'large'] preference_index = ['extraSmall', 'small', 'medium', 'large']
@ -133,7 +93,7 @@ class MicrosoftStreamIE(MicrosoftStreamBaseIE):
}) })
return output return output
def _remap_playback(self, master_playlist_urls): def _remap_playback(self, master_playlist_urls, video_id=None):
""" """
A parser for the HLS and MPD playlists from the API endpoint. A parser for the HLS and MPD playlists from the API endpoint.
""" """
@ -143,11 +103,11 @@ class MicrosoftStreamIE(MicrosoftStreamBaseIE):
protocol = self._determine_protocol(master_playlist_url['mimeType']) protocol = self._determine_protocol(master_playlist_url['mimeType'])
# Handle HLS Master playlist # Handle HLS Master playlist
if protocol == 'm3u8': if protocol == 'm3u8':
varient_playlists = self._extract_m3u8_formats(master_playlist_url['playbackUrl'], video_id=self.video_id, headers=self.headers) varient_playlists = self._extract_m3u8_formats(master_playlist_url['playbackUrl'], video_id=video_id, headers=self.headers)
# For MPEG-DASH Master playlists # For MPEG-DASH Master playlists
elif protocol == 'http_dash_segments': elif protocol == 'http_dash_segments':
varient_playlists = self._extract_mpd_formats(master_playlist_url['playbackUrl'], video_id=self.video_id, headers=self.headers) varient_playlists = self._extract_mpd_formats(master_playlist_url['playbackUrl'], video_id=video_id, headers=self.headers)
# For other Master playlists (like Microsoft Smooth Streaming) # For other Master playlists (like Microsoft Smooth Streaming)
else: else:
@ -160,7 +120,7 @@ class MicrosoftStreamIE(MicrosoftStreamBaseIE):
output.append(varient_playlist) output.append(varient_playlist)
return output return output
def _extract_subtitle(self, tracks, is_auto_generated): def _remap_subtitle(self, tracks, video_id, is_auto_generated):
""" """
An internal method for filtering and remapping text tracks An internal method for filtering and remapping text tracks
""" """
@ -177,18 +137,83 @@ class MicrosoftStreamIE(MicrosoftStreamBaseIE):
subtitle_subset[track_language] = [] # Scaffold an empty list for the object to insert into subtitle_subset[track_language] = [] # Scaffold an empty list for the object to insert into
# Since the subtitle is token protected, a get request will fire here. # Since the subtitle is token protected, a get request will fire here.
data = self._download_webpage(url_or_request=track.get('url'), video_id=self.video_id, headers=self.headers) data = self._download_webpage(url_or_request=track.get('url'), video_id=video_id, headers=self.headers)
subtitle_subset[track_language].append({'data': data, "ext": "vtt"}) subtitle_subset[track_language].append({'data': data, "ext": "vtt"})
return subtitle_subset return subtitle_subset
def _remap_video(self, video):
return {
'id': video['id'],
'title': video['name'],
'description': video.get('description'),
'uploader': video.get('creator').get('name'),
'thumbnails': self._remap_thumbnails(video.get('posterImage') or []),
'formats': self._remap_playback(video['playbackUrls'], video_id=video['id']),
'is_live': False
}
"""
Formatter
"""
def _format_texttrack_endpoint(self, video_id):
return "%s/videos/%s/texttracks?api-version=1.4-private" % (self.api_gateway, video_id)
def _request_texttracks(self, video_id):
"""
Make an additional request to Microsoft Stream for the subtitle and auto-caption
"""
self._TEXTTRACKS_RESPONSE = self._download_json(self._format_texttrack_endpoint(video_id), video_id, headers=self.headers).get('value')
return self._TEXTTRACKS_RESPONSE
class MicrosoftStreamIE(MicrosoftStreamBaseIE):
"""
Extractor for single Microsoft Stream video
"""
IE_NAME = 'microsoftstream'
_VALID_URL = r'https?://(?:(?:web|www)\.)?microsoftstream\.com/video/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})' # https://regex101.com/r/K1mlgK/1/
_NETRC_MACHINE = 'microsoftstream'
_TEXTTRACKS_RESPONSE = None
_VIDEO_ID = None
_TEST = [{
'url': 'https://web.microsoftstream.com/video/c883c6a5-9895-4900-9a35-62f4b5d506c9',
'info_dict': {
'id': 'c883c6a5-9895-4900-9a35-62f4b5d506c9',
'ext': 'mp4',
'title': 'Webinar for Researchers: Use of GitLab',
'thumbnail': r're:^https?://.*$'
},
'skip': 'Requires Microsoft 365 account credentials',
}, {
'url': 'https://www.microsoftstream.com/video/1541f3f9-7fed-4901-ae70-0f7cb775679f',
'only_matching': True,
}]
"""
Getters
The following getters include helpful message to prompt developers for potential errors.
"""
@property
def video_id(self):
    """
    Return the id stored in `_VIDEO_ID`.

    Raises ExtractorError when `_VIDEO_ID` is still None.
    """
    if self._VIDEO_ID is None:
        # Raising a plain string is itself a TypeError ("exceptions must derive from
        # BaseException"), which hides the message; raise a real exception instead.
        raise ExtractorError('Variable "_VIDEO_ID" is not defined. Did you make the main extraction call?')
    return self._VIDEO_ID
@property
def media_info_endpoint(self):
    """
    Build the video information endpoint URL from `api_gateway` and `video_id`.
    """
    template = "%s/videos/%s?$expand=creator,tokens,status,liveEvent,extensions&api-version=1.4-private"
    return template % (self.api_gateway, self.video_id)
def _get_subtitles(self, tracks=None): # Fulfill abstract method def _get_subtitles(self, tracks=None): # Fulfill abstract method
tracks = self._TEXTTRACKS_RESPONSE if tracks is None else tracks tracks = self._TEXTTRACKS_RESPONSE if tracks is None else tracks
return self._extract_subtitle(tracks, False) return self._remap_subtitle(tracks, is_auto_generated=False, video_id=self.video_id)
def _get_automatic_captions(self, tracks=None): # Fulfill abstract method def _get_automatic_captions(self, tracks=None): # Fulfill abstract method
tracks = self._TEXTTRACKS_RESPONSE if tracks is None else tracks tracks = self._TEXTTRACKS_RESPONSE if tracks is None else tracks
return self._extract_subtitle(tracks, True) return self._remap_subtitle(tracks, is_auto_generated=True, video_id=self.video_id)
def _real_extract(self, url): def _real_extract(self, url):
self._VIDEO_ID = self._match_id(url) self._VIDEO_ID = self._match_id(url)
@ -202,19 +227,91 @@ class MicrosoftStreamIE(MicrosoftStreamBaseIE):
self._extract_api_gateway(webpage) self._extract_api_gateway(webpage)
# "GET" api for video information # "GET" api for video information
apiResponse = self._download_json(self.media_info_endpoint, self.video_id, headers=self.headers) video = self._download_json(self.media_info_endpoint, self.video_id, headers=self.headers)
texttracks = self._request_texttracks(self.video_id)
texttracks = self._request_texttracks() return merge_dicts(self._remap_video(video), {
return {
'id': self.video_id,
'title': apiResponse['name'],
'description': apiResponse.get('description'),
'uploader': apiResponse.get('creator').get('name'),
'thumbnails': self._remap_thumbnails(apiResponse.get('posterImage')),
'formats': self._remap_playback(apiResponse['playbackUrls']),
'subtitles': self._get_subtitles(texttracks), 'subtitles': self._get_subtitles(texttracks),
'automatic_captions': self._get_automatic_captions(texttracks), 'automatic_captions': self._get_automatic_captions(texttracks),
'is_live': False })
# 'duration': apiResponse['media']['duration'],
class MicrosoftStreamChannelIE(MicrosoftStreamBaseIE):
    """
    Extractor for a single Microsoft Stream channel, returned as a playlist of its videos
    """

    IE_NAME = 'microsoftstream:channel'
    _VALID_URL = r'https?://(?:(?:web|www)\.)?microsoftstream\.com/channel/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'  # https://regex101.com/r/K1mlgK/1/
    _NETRC_MACHINE = 'microsoftstream'
    _CHANNEL_ID = None

    # A list of test cases belongs in `_TESTS`; `_TEST` is for a single test dict.
    _TESTS = [{
        'url': 'https://web.microsoftstream.com/channel/c883c6a5-9895-4900-9a35-62f4b5d506c9',
        'only_matching': True,
    }, {
        'url': 'https://www.microsoftstream.com/channel/0ceffe58-07b1-4098-8ed9-a15f4e8231f7',
        'only_matching': True,
    }]

    def _format_channel_video_endpoint(self, skip=0, top=100):
        """
        Build the URL listing the videos of the current channel.

        `skip` and `top` are sent as the `$skip` / `$top` query parameters.
        """
        parameters = {
            '$top': top,
            '$skip': skip,
            '$expand': 'creator,events',
            '$orderby': 'created asc',
            'api-version': '1.4-private'
        }
        # NOTE(review): no '/' is inserted after the gateway here, unlike the
        # "%s/videos/..." endpoints elsewhere in this file - confirm which form
        # matches the extracted gateway value.
        return update_url_query('%schannels/%s/videos' % (self.api_gateway, self._CHANNEL_ID), parameters)

    def _iterate_video(self, video):
        """
        Remap one video of the channel, adding subtitles and automatic captions
        only when the downloader options ask for them.
        """
        params = self._downloader.params
        list_subtitles = params.get('listsubtitles')
        want_subtitles = params.get('writesubtitles', False) or list_subtitles
        want_captions = params.get('writeautomaticsub', False) or list_subtitles

        subtitle_dict = {}
        if want_subtitles or want_captions:
            # One extra request per video, so only made when subtitles are wanted.
            texttracks = self._request_texttracks(video['id'])
            if want_subtitles:
                subtitle_dict['subtitles'] = self._remap_subtitle(
                    texttracks, video_id=video['id'], is_auto_generated=False)
            if want_captions:
                # Automatic captions must select the auto-generated tracks.
                subtitle_dict['automatic_captions'] = self._remap_subtitle(
                    texttracks, video_id=video['id'], is_auto_generated=True)

        return merge_dicts(self._remap_video(video), subtitle_dict)

    def _fetch_video(self):
        """
        Download every page of the channel's video listing and return the remapped videos.
        """
        top = 100
        current_skip = 0
        all_video = []

        while True:
            video_info_endpoint = self._format_channel_video_endpoint(current_skip, top)
            channel_video_subset = self._download_json(
                video_info_endpoint, self._CHANNEL_ID, headers=self.headers)['value']
            all_video.extend(self._iterate_video(video) for video in channel_video_subset)

            # A page shorter than `top` is the last one.
            if len(channel_video_subset) < top:
                break
            current_skip += top  # Adjust starting position

        return all_video

    def _real_extract(self, url):
        self._CHANNEL_ID = self._match_id(url)

        webpage = self._download_webpage(url, self._CHANNEL_ID)
        if not self.is_logged_in(webpage):
            return self.raise_login_required()

        # Extract access token and API gateway from webpage
        self._extract_access_token(webpage)
        self._extract_api_gateway(webpage)

        entries = self._fetch_video()
        return {'_type': 'playlist',
                'id': self._CHANNEL_ID,
                'entries': entries}