Compare commits

...

2 Commits

Author SHA1 Message Date
0l-l0
170e1c1995
[peertube] Extract files also from streamingPlaylists (#27728)
JSON objects with an empty "files" tag seem to be a valid PeerTube API
response. In those cases the "files" arrays contained in the
"streamingPlaylists" members can be used instead.
closes #26002
closes #27586
2021-01-08 20:09:38 +00:00
Remita Amine
61e669acff [khanacademy] fix extraction(closes #2887)(closes #26803) 2021-01-08 16:13:22 +01:00
3 changed files with 106 additions and 58 deletions

View File

@ -526,7 +526,10 @@ from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE from .keezmovies import KeezMoviesIE
from .ketnet import KetnetIE from .ketnet import KetnetIE
from .khanacademy import KhanAcademyIE from .khanacademy import (
KhanAcademyIE,
KhanAcademyUnitIE,
)
from .kickstarter import KickStarterIE from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE from .kinopoisk import KinoPoiskIE

View File

@ -1,82 +1,107 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
unified_strdate, int_or_none,
parse_iso8601,
try_get,
) )
class KhanAcademyIE(InfoExtractor): class KhanAcademyBaseIE(InfoExtractor):
_VALID_URL = r'^https?://(?:(?:www|api)\.)?khanacademy\.org/(?P<key>[^/]+)/(?:[^/]+/){,2}(?P<id>[^?#/]+)(?:$|[?#])' _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
IE_NAME = 'KhanAcademy'
_TESTS = [{ def _parse_video(self, video):
'url': 'http://www.khanacademy.org/video/one-time-pad', return {
'md5': '7b391cce85e758fb94f763ddc1bbb979', '_type': 'url_transparent',
'url': video['youtubeId'],
'id': video.get('slug'),
'title': video.get('title'),
'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'),
'duration': int_or_none(video.get('duration')),
'description': video.get('description'),
'ie_key': 'Youtube',
}
def _real_extract(self, url):
display_id = self._match_id(url)
component_props = self._parse_json(self._download_json(
'https://www.khanacademy.org/api/internal/graphql',
display_id, query={
'hash': 1604303425,
'variables': json.dumps({
'path': display_id,
'queryParams': '',
}),
})['data']['contentJson'], display_id)['componentProps']
return self._parse_component_props(component_props)
class KhanAcademyIE(KhanAcademyBaseIE):
IE_NAME = 'khanacademy'
_VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
_TEST = {
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
'info_dict': { 'info_dict': {
'id': 'one-time-pad', 'id': 'FlIG3TvQCBQ',
'ext': 'webm', 'ext': 'mp4',
'title': 'The one-time pad', 'title': 'The one-time pad',
'description': 'The perfect cipher', 'description': 'The perfect cipher',
'duration': 176, 'duration': 176,
'uploader': 'Brit Cruise', 'uploader': 'Brit Cruise',
'uploader_id': 'khanacademy', 'uploader_id': 'khanacademy',
'upload_date': '20120411', 'upload_date': '20120411',
'timestamp': 1334170113,
'license': 'cc-by-nc-sa',
}, },
'add_ie': ['Youtube'], 'add_ie': ['Youtube'],
}, { }
'url': 'https://www.khanacademy.org/math/applied-math/cryptography',
def _parse_component_props(self, component_props):
video = component_props['tutorialPageData']['contentModel']
info = self._parse_video(video)
author_names = video.get('authorNames')
info.update({
'uploader': ', '.join(author_names) if author_names else None,
'timestamp': parse_iso8601(video.get('dateAdded')),
'license': video.get('kaUserLicense'),
})
return info
class KhanAcademyUnitIE(KhanAcademyBaseIE):
IE_NAME = 'khanacademy:unit'
_VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
_TEST = {
'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
'info_dict': { 'info_dict': {
'id': 'cryptography', 'id': 'cryptography',
'title': 'Journey into cryptography', 'title': 'Cryptography',
'description': 'How have humans protected their secret messages through history? What has changed today?', 'description': 'How have humans protected their secret messages through history? What has changed today?',
}, },
'playlist_mincount': 3, 'playlist_mincount': 31,
}] }
def _real_extract(self, url): def _parse_component_props(self, component_props):
m = re.match(self._VALID_URL, url) curation = component_props['curation']
video_id = m.group('id')
if m.group('key') == 'video': entries = []
data = self._download_json( tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []
'http://api.khanacademy.org/api/v1/videos/' + video_id, for tutorial_number, tutorial in enumerate(tutorials, 1):
video_id, 'Downloading video info') chapter_info = {
'chapter': tutorial.get('title'),
upload_date = unified_strdate(data['date_added']) 'chapter_number': tutorial_number,
uploader = ', '.join(data['author_names']) 'chapter_id': tutorial.get('id'),
return {
'_type': 'url_transparent',
'url': data['url'],
'id': video_id,
'title': data['title'],
'thumbnail': data['image_url'],
'duration': data['duration'],
'description': data['description'],
'uploader': uploader,
'upload_date': upload_date,
} }
else: for content_item in (tutorial.get('contentItems') or []):
# topic if content_item.get('kind') == 'Video':
data = self._download_json( info = self._parse_video(content_item)
'http://api.khanacademy.org/api/v1/topic/' + video_id, info.update(chapter_info)
video_id, 'Downloading topic info') entries.append(info)
entries = [ return self.playlist_result(
{ entries, curation.get('unit'), curation.get('title'),
'_type': 'url', curation.get('description'))
'url': c['url'],
'id': c['id'],
'title': c['title'],
}
for c in data['children'] if c['kind'] in ('Video', 'Topic')]
return {
'_type': 'playlist',
'id': video_id,
'title': data['title'],
'description': data['description'],
'entries': entries,
}

View File

@ -450,6 +450,18 @@ class PeerTubeIE(InfoExtractor):
'tags': ['framasoft', 'peertube'], 'tags': ['framasoft', 'peertube'],
'categories': ['Science & Technology'], 'categories': ['Science & Technology'],
} }
}, {
# Issue #26002
'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc',
'info_dict': {
'id': 'd8943b2d-8280-497b-85ec-bc282ec2afdc',
'ext': 'mp4',
'title': 'Dot matrix printer shell demo',
'uploader_id': '3',
'timestamp': 1587401293,
'upload_date': '20200420',
'uploader': 'Drew DeVault',
}
}, { }, {
'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44',
'only_matching': True, 'only_matching': True,
@ -526,7 +538,15 @@ class PeerTubeIE(InfoExtractor):
title = video['name'] title = video['name']
formats = [] formats = []
for file_ in video['files']: files = video.get('files') or []
for playlist in (video.get('streamingPlaylists') or []):
if not isinstance(playlist, dict):
continue
playlist_files = playlist.get('files')
if not (playlist_files and isinstance(playlist_files, list)):
continue
files.extend(playlist_files)
for file_ in files:
if not isinstance(file_, dict): if not isinstance(file_, dict):
continue continue
file_url = url_or_none(file_.get('fileUrl')) file_url = url_or_none(file_.get('fileUrl'))