[MicrosoftStream] Add new extractor

This commit is contained in:
Nick Lai 2020-04-06 16:45:20 +08:00
parent 00eb865b3c
commit 05698ebf9b
No known key found for this signature in database
GPG Key ID: AF5E3B79EE6B1CC4
2 changed files with 134 additions and 0 deletions

View File

@ -619,6 +619,7 @@ from .metacritic import MetacriticIE
from .mgoon import MgoonIE
from .mgtv import MGTVIE
from .miaopai import MiaoPaiIE
from .microsoftstream import MicrosoftStreamIE
from .microsoftvirtualacademy import (
    MicrosoftVirtualAcademyIE,
    MicrosoftVirtualAcademyCourseIE,

View File

@ -0,0 +1,133 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import ExtractorError
class MicrosoftStreamBaseIE(InfoExtractor):
    """Common log-in handling for the Microsoft Stream extractors.

    Username/password authentication is not implemented; initialization
    aborts with an explanatory error when either credential is supplied.
    """

    # Requesting this URL is expected to end in a redirection.
    _LOGIN_URL = 'https://web.microsoftstream.com/?noSignUpCheck=1'
    # Marker searched for in a downloaded page by is_logged_in().
    _EXPECTED_TITLE = '<title>Microsoft Stream</title>'

    def is_logged_in(self, webpage):
        """Return True when *webpage* contains the expected title tag."""
        return self._EXPECTED_TITLE in webpage

    def _real_initialize(self):
        """Reject username/password log-in, which is not supported.

        Raises:
            ExtractorError: when a username or a password was supplied.
        """
        username, password = self._get_login_info()
        if username is not None or password is not None:
            # expected=True flags this as a user-side condition rather than
            # an extractor bug, so no bug-report request is appended.
            raise ExtractorError(
                'MicrosoftStream Extractor does not support username/password log-in at the moment. '
                'Please use cookies log-in instead. '
                'See https://github.com/ytdl-org/youtube-dl/blob/master/README.md#how-do-i-pass-cookies-to-youtube-dl '
                'for more information',
                expected=True)
class MicrosoftStreamIE(MicrosoftStreamBaseIE):
    """Extractor for a single Microsoft Stream video page."""

    IE_NAME = 'microsoftstream'
    # Pattern playground: https://regex101.com/r/K1mlgK/1/
    _VALID_URL = r'https?://(?:(?:web|www)\.)?microsoftstream\.com/video/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'
    _NETRC_MACHINE = 'microsoftstream'
    _TEST = {
        'url': 'https://web.microsoftstream.com/video/c883c6a5-9895-4900-9a35-62f4b5d506c9',
        'info_dict': {
            'id': 'c883c6a5-9895-4900-9a35-62f4b5d506c9',
            'ext': 'mp4',
            'title': 'Webinar for Researchers: Use of GitLab',
            'thumbnail': r're:^https?://.*$',
        },
    }

    def _remap_thumbnails(self, thumbnail_dict_list):
        """Convert the API thumbnail mapping into a list of thumbnail dicts.

        *thumbnail_dict_list* is a mapping from a size name to a dict that
        carries a 'url' key (despite the parameter name it is indexed by
        key, not by position).  Known size names are ranked by their
        position in ``preference_index``; unknown names get preference -1
        instead of aborting the extraction.  Entries without a URL are
        skipped.  Returns a list of ``{'preference': int, 'url': str}``.
        """
        preference_index = ['extraSmall', 'small', 'medium', 'large']
        output = []
        for size_name, thumbnail in (thumbnail_dict_list or {}).items():
            thumbnail_url = thumbnail.get('url') if isinstance(thumbnail, dict) else None
            if not thumbnail_url:
                continue
            if size_name in preference_index:
                preference = preference_index.index(size_name)
            else:
                preference = -1
            output.append({
                'preference': preference,
                'url': thumbnail_url,
            })
        return output

    def _remap_playback(self, master_playlist_urls, video_id, http_headers=None):
        """
        A parser for the HLS and MPD playlists from the API endpoint.

        Each entry of *master_playlist_urls* is a dict with 'mimeType' and
        'playbackUrl' keys.  Entries with an unrecognised MIME type or
        without a playback URL are reported/skipped.  *http_headers* is
        sent with the manifest requests and attached to every resulting
        format.  Returns the flat list of format dicts.
        """
        # A fresh dict per call; a mutable default would be shared between calls.
        http_headers = http_headers or {}
        output = []
        for master_playlist_url in master_playlist_urls or []:
            mime_type = master_playlist_url.get('mimeType')
            playback_url = master_playlist_url.get('playbackUrl')
            protocol = self._determine_protocol(mime_type)
            if not playback_url:
                continue
            # Handle HLS Master playlist
            if protocol == 'm3u8':
                variant_playlists = self._extract_m3u8_formats(playback_url, video_id, headers=http_headers)
            # For MPEG-DASH Master playlists
            elif protocol == 'http_dash_segments':
                variant_playlists = self._extract_mpd_formats(playback_url, video_id, headers=http_headers)
            else:
                self.to_screen('Found unresolvable stream with format %s' % mime_type)
                continue
            # Patching the "Authorization" header; every format gets its own
            # copy so that later changes to one format do not leak into others.
            for variant_playlist in variant_playlists:
                variant_playlist['http_headers'] = dict(http_headers)
                output.append(variant_playlist)
        return output

    def _determine_protocol(self, mime):
        """
        A switch board for the MIME type provided from the API endpoint.

        Returns 'http_dash_segments' for DASH manifests, 'm3u8' for HLS
        playlists and None for anything else.
        """
        if mime == 'application/dash+xml':
            return 'http_dash_segments'
        if mime == 'application/vnd.apple.mpegurl':
            return 'm3u8'
        return None

    def _remap_texttracks(self, tracks):
        """
        A parser for the texttracks response.

        Each track is a dict read for its 'language', 'url' and
        'autoGenerated' keys.  Tracks whose 'autoGenerated' is True go to
        the automatic captions, all others to the subtitles.  Tracks
        lacking a language or a URL are skipped.  Returns a tuple
        ``(subtitles, automatic_captions)``, both mapping a language to a
        list of ``{'url': ...}`` dicts.
        """
        subtitle = {}
        automatic_captions = {}
        for track in tracks or []:
            language = track.get('language')
            track_url = track.get('url')
            if not language or not track_url:
                continue
            target = automatic_captions if track.get('autoGenerated') is True else subtitle
            target.setdefault(language, []).append({'url': track_url})
        return (subtitle, automatic_captions)

    def _real_extract(self, url):
        """Build the info dict for the video addressed by *url*.

        Downloads the video page, reads the access token and API gateway
        from it, then queries the API for video metadata and text tracks.
        Raises through raise_login_required() when the page does not look
        like a logged-in session.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        if not self.is_logged_in(webpage):
            self.raise_login_required()

        # Extract access token from webpage
        access_token = self._html_search_regex(r"\"AccessToken\":\"(?P<AccessToken>.+?)\"", webpage, 'AccessToken')
        api_gateway = self._html_search_regex(r"\"ApiGatewayUri\":\"(?P<APIGateway>.+?)\"", webpage, 'APIGateway')
        headers = {'Authorization': 'Bearer %s' % access_token}

        # "GET" api for video information
        api_uri = "%s/videos/%s?$expand=creator,tokens,status,liveEvent,extensions&api-version=1.3-private" % (api_gateway, video_id)
        video_info = self._download_json(api_uri, video_id, headers=headers)

        # "GET" api for subtitles and auto-captions; subtitles are optional,
        # so a failure here must not abort the whole extraction.
        texttracks_uri = "%s/videos/%s/texttracks?api-version=1.3-private" % (api_gateway, video_id)
        texttracks = self._download_json(texttracks_uri, video_id, headers=headers, fatal=False) or {}
        subtitles, automatic_captions = self._remap_texttracks(texttracks.get('value'))

        formats = self._remap_playback(video_info.get('playbackUrls'), video_id, http_headers=headers)
        # Formats must be ordered from worst to best before being returned.
        self._sort_formats(formats)

        return {
            'id': video_id,
            # The title is mandatory, hence the hard lookup.
            'title': video_info['name'],
            # Everything below is optional metadata and extracted leniently.
            'description': video_info.get('description'),
            'uploader': (video_info.get('creator') or {}).get('name'),
            'thumbnails': self._remap_thumbnails(video_info.get('posterImage')),
            'formats': formats,
            'subtitles': subtitles,
            'automatic_captions': automatic_captions,
            'is_live': False,
            # TODO: a 'duration' field was left disabled by the author;
            # presumably available under ['media']['duration'] -- confirm
            # the value's format against the API before enabling.
        }