Compare commits

...

7 Commits

Author SHA1 Message Date
schn0sch
4e580ad264
Merge 14bf10c648 into 1036478d13 2025-01-10 11:25:50 +00:00
dirkf
1036478d13 [YouTube] Ensure subtitle URLs are complete
* WEB URLs are, MWEB not
* resolves #33017
2025-01-06 01:39:04 +00:00
dirkf
00ad2b8ca1 [YouTube] Refactor subtitle processing
* move to internal function
* use `traverse_obj()`
2025-01-06 01:24:30 +00:00
dirkf
ab7c61ca29 [YouTube] Apply code style changes, trailing commas, etc 2025-01-06 01:22:16 +00:00
dirkf
176fc2cb00 [YouTube] Avoid early crash if webpage can't be read
* see issue #33013
2024-12-31 14:51:29 +00:00
schn0sch
14bf10c648 [keep2share] share access token across calls to _real_extract 2021-05-17 15:55:14 +02:00
schn0sch
5f3cc3bbea [keep2share] Add new extractor 2021-05-17 15:54:46 +02:00
3 changed files with 152 additions and 52 deletions

View File

@ -562,6 +562,7 @@ from .kaltura import KalturaIE
from .kankan import KankanIE
from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE
from .keep2share import Keep2ShareIE
from .keezmovies import KeezMoviesIE
from .ketnet import KetnetIE
from .khanacademy import (

View File

@ -0,0 +1,94 @@
from __future__ import unicode_literals

import json
import re

from .common import (
    InfoExtractor,
    RegexNotFoundError,
)
from ..utils import (
    clean_html,
    compat_str,
    ExtractorError,
    js_to_json,
    try_get,
    urljoin,
)
class Keep2ShareIE(InfoExtractor):
    """Extract the video preview of a file hosted on Keep2Share (k2s.cc).

    The file metadata API is queried with an ``accessToken`` cookie.  That
    token is requested from the auth endpoint using a client secret that is
    scraped from the site's static scripts.  Both the secret and the token
    are cached on the extractor instance, so several calls to
    ``_real_extract`` share them.
    """

    _VALID_URL = r'https?://(?:www\.)?(?:k2s\.cc|keep2share\.cc|keep2share\.com)/file/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://k2s.cc/file/d6f565bcb9581/Big_Buck%20Bunny%20Trailer.mp4',
        'md5': '0dbce91e7d1efc506d1461439eb8a4c0',
        'info_dict': {
            'id': 'd6f565bcb9581',
            'ext': 'mp4',
            'title': 'Big Buck Bunny Trailer.mp4',
            'thumbnail': r're:^https?://.*\.jpe?g$',
            'filesize': 4447915,
            'duration': 33.019,
        },
    }]

    # A double- or single-quoted JS string literal.  The backslash is
    # excluded from the "plain character" class so that an escaped quote is
    # consumed as an escape sequence instead of ending the literal early.
    _JS_STRING_RE = r'"(?:[^"\\]|\\.)*"' + '|' + r"'(?:[^'\\]|\\.)*'"

    def _get_app_secret(self, video_id):
        """Return REACT_APP_API_CLIENT_SECRET, scraping it on first use.

        Downloads the file page for ``video_id`` and then each ``/static/``
        script it references until one contains the secret.  The value is
        cached in ``self._app_secret``.

        Raises RegexNotFoundError if no script contains the secret.
        """
        cached = getattr(self, '_app_secret', None)
        if cached is not None:
            return cached

        page_url = 'https://k2s.cc/file/' + video_id
        webpage = self._download_webpage(page_url, video_id)

        for mobj in re.finditer(
                r'<script\s+src="(?P<src>/static/[^"]*)"', webpage):
            script_url = urljoin(page_url, clean_html(mobj.group('src')))
            # A single unreadable script should not abort the search: the
            # secret may be in one of the remaining scripts.
            script = self._download_webpage(script_url, video_id, fatal=False)
            if not script:
                continue
            secret = self._search_regex(
                r'REACT_APP_API_CLIENT_SECRET:\s*(?P<secret>%(string)s)' % {
                    'string': self._JS_STRING_RE,
                }, script, 'app secret', group='secret', default=None)
            if secret is None:
                continue
            # The match is a quoted JS literal; decode it to a plain string
            self._app_secret = self._parse_json(
                secret, video_id, transform_source=js_to_json)
            return self._app_secret

        raise RegexNotFoundError('Unable to extract app secret')

    def _get_access_token(self, video_id):
        """Return an API access token, requesting it on first use.

        Posts a ``client_credentials`` grant with the scraped client secret
        to the auth endpoint and caches the returned ``access_token`` in
        ``self._access_token``.
        """
        cached = getattr(self, '_access_token', None)
        if cached is not None:
            return cached

        payload = json.dumps({
            'grant_type': 'client_credentials',
            'client_id': 'k2s_web_app',
            'client_secret': self._get_app_secret(video_id),
        }, separators=(',', ':')).encode('utf-8')
        tokens = self._download_json(
            'https://api.k2s.cc/v1/auth/token', video_id,
            data=payload, headers={'Content-Type': 'application/json'})
        self._access_token = tokens['access_token']
        return self._access_token

    def _real_extract(self, url):
        """Build the info dict for the file referenced by ``url``.

        Raises ExtractorError (expected) if the file metadata carries no
        video preview URL.
        """
        video_id = self._match_id(url)

        # Pass the file ID, not the full URL: _get_app_secret() appends this
        # value to 'https://k2s.cc/file/' to build the page address.
        headers = {
            'Cookie': 'accessToken=' + self._get_access_token(video_id),
        }
        info = self._download_json(
            'https://api.k2s.cc/v1/files/' + video_id,
            video_id, headers=headers)

        video_url = try_get(
            info, lambda x: x['videoPreview']['video'], compat_str)
        if not video_url:
            raise ExtractorError(
                'No video preview available for this file', expected=True)

        return {
            'id': video_id,
            # "or" also covers a name that is present but null/empty
            'title': info.get('name') or 'keep2share-file',
            'thumbnail': try_get(info, lambda x: x['videoPreview']['cover'], compat_str),
            'duration': try_get(info, lambda x: x['videoInfo']['duration'], (int, float)),
            'formats': [{
                'url': video_url,
                'ext': 'mp4',
                'filesize': try_get(info, lambda x: x['size'], int),
                'width': try_get(info, lambda x: x['videoInfo']['resolution']['width'], int),
                'height': try_get(info, lambda x: x['videoInfo']['resolution']['height'], int),
            }],
        }

View File

@ -9,6 +9,7 @@ import json
import os.path
import random
import re
import string
import time
import traceback
@ -67,6 +68,7 @@ from ..utils import (
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
@ -138,7 +140,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
[2, 1, None, 1,
'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
None, [], 4],
1, [None, None, []], None, None, None, True
1, [None, None, []], None, None, None, True,
],
username,
]
@ -160,7 +162,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
None, 1, None, [1, None, None, None, [password, None, True]],
[
None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
1, [None, None, []], None, None, None, True
1, [None, None, []], None, None, None, True,
]]
challenge_results = req(
@ -213,7 +215,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
user_hash, None, 2, None,
[
9, None, None, None, None, None, None, None,
[None, tfa_code, True, 2]
[None, tfa_code, True, 2],
]]
tfa_results = req(
@ -284,7 +286,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'client': {
'clientName': 'WEB',
'clientVersion': '2.20201021.03.00',
}
},
},
}
@ -385,7 +387,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'client': {
'clientName': 'WEB',
'clientVersion': '2.20201021.03.00',
}
},
},
'query': query,
}
@ -462,7 +464,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# (HTML, videodetails, metadata, renderers)
'name': ('content', 'author', (('ownerChannelName', None), 'title'), ['text']),
'url': ('href', 'ownerProfileUrl', 'vanityChannelUrl',
['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl'])
['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl']),
}
if any((videodetails, metadata, renderers)):
result = (
@ -671,7 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
'description': '',
'uploader': '8KVIDEO',
'title': 'UHDTV TEST 8K VIDEO.mp4'
'title': 'UHDTV TEST 8K VIDEO.mp4',
},
'params': {
'youtube_include_dash_manifest': True,
@ -711,7 +713,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/@theamazingatheist',
'title': 'Burning Everyone\'s Koran',
'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
}
},
},
# Age-gated videos
{
@ -839,7 +841,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
'expected_warnings': [
'DASH manifest missing',
]
],
},
# Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
{
@ -1820,8 +1822,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# cpn generation algorithm is reverse engineered from base.js.
# In fact it works even with dummy cpn.
CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))
CPN_ALPHABET = string.ascii_letters + string.digits + '-_'
cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(16))
# more consistent results setting it to right before the end
qs = parse_qs(playback_url)
@ -1881,8 +1883,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
if mobj is None:
raise ExtractorError('Invalid URL: %s' % url)
video_id = mobj.group(2)
return video_id
return mobj.group(2)
def _extract_chapters_from_json(self, data, video_id, duration):
chapters_list = try_get(
@ -1951,7 +1952,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
pb_context = {'html5Preference': 'HTML5_PREF_WANTS'}
player_url = self._extract_player_url(webpage)
ytcfg = self._extract_ytcfg(video_id, webpage)
ytcfg = self._extract_ytcfg(video_id, webpage or '')
sts = self._extract_signature_timestamp(video_id, player_url, ytcfg)
if sts:
pb_context['signatureTimestamp'] = sts
@ -2035,7 +2036,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
headers = {
'X-YouTube-Client-Name': '85',
'X-YouTube-Client-Version': '2.0',
'Origin': 'https://www.youtube.com'
'Origin': 'https://www.youtube.com',
}
video_info = self._call_api('player', query, video_id, fatal=False, headers=headers)
@ -2064,8 +2065,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])
search_meta = (
lambda x: self._html_search_meta(x, webpage, default=None)) \
if webpage else lambda x: None
(lambda x: self._html_search_meta(x, webpage, default=None))
if webpage else lambda _: None)
video_details = player_response.get('videoDetails') or {}
microformat = try_get(
@ -2137,7 +2138,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def build_fragments(f):
return LazyList({
'url': update_url_query(f['url'], {
'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize']))
'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize'])),
})
} for range_start in range(0, f['filesize'], CHUNK_SIZE))
@ -2236,7 +2237,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'protocol': 'http_dash_segments',
'fragments': build_fragments(dct),
} if dct['filesize'] else {
'downloader_options': {'http_chunk_size': CHUNK_SIZE} # No longer useful?
'downloader_options': {'http_chunk_size': CHUNK_SIZE}, # No longer useful?
})
formats.append(dct)
@ -2414,9 +2415,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'is_live': is_live,
}
pctr = try_get(
pctr = traverse_obj(
player_response,
lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
('captions', 'playerCaptionsTracklistRenderer', T(dict)))
if pctr:
def process_language(container, base_url, lang_code, query):
lang_subs = []
@ -2430,31 +2431,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
})
container[lang_code] = lang_subs
subtitles = {}
for caption_track in (pctr.get('captionTracks') or []):
base_url = caption_track.get('baseUrl')
if not base_url:
continue
if caption_track.get('kind') != 'asr':
lang_code = caption_track.get('languageCode')
if not lang_code:
def process_subtitles():
subtitles = {}
for caption_track in traverse_obj(pctr, (
'captionTracks', lambda _, v: v.get('baseUrl'))):
base_url = self._yt_urljoin(caption_track['baseUrl'])
if not base_url:
continue
process_language(
subtitles, base_url, lang_code, {})
continue
automatic_captions = {}
for translation_language in (pctr.get('translationLanguages') or []):
translation_language_code = translation_language.get('languageCode')
if not translation_language_code:
if caption_track.get('kind') != 'asr':
lang_code = caption_track.get('languageCode')
if not lang_code:
continue
process_language(
subtitles, base_url, lang_code, {})
continue
process_language(
automatic_captions, base_url, translation_language_code,
{'tlang': translation_language_code})
info['automatic_captions'] = automatic_captions
info['subtitles'] = subtitles
automatic_captions = {}
for translation_language in traverse_obj(pctr, (
'translationLanguages', lambda _, v: v.get('languageCode'))):
translation_language_code = translation_language['languageCode']
process_language(
automatic_captions, base_url, translation_language_code,
{'tlang': translation_language_code})
info['automatic_captions'] = automatic_captions
info['subtitles'] = subtitles
process_subtitles()
parsed_url = compat_urllib_parse_urlparse(url)
for component in [parsed_url.fragment, parsed_url.query]:
for component in (parsed_url.fragment, parsed_url.query):
query = compat_parse_qs(component)
for k, v in query.items():
for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
@ -2684,7 +2688,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'title': 'Super Cooper Shorts - Shorts',
'uploader': 'Super Cooper Shorts',
'uploader_id': '@SuperCooperShorts',
}
},
}, {
# Channel that does not have a Shorts tab. Test should just download videos on Home tab instead
'url': 'https://www.youtube.com/@emergencyawesome/shorts',
@ -2738,7 +2742,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'description': 'md5:609399d937ea957b0f53cbffb747a14c',
'uploader': 'ThirstForScience',
'uploader_id': '@ThirstForScience',
}
},
}, {
'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
'only_matching': True,
@ -3037,7 +3041,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'uploader': '3Blue1Brown',
'uploader_id': '@3blue1brown',
'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
}
},
}]
@classmethod
@ -3335,7 +3339,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'client': {
'clientName': 'WEB',
'clientVersion': client_version,
}
},
}
visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
@ -3354,7 +3358,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
headers['x-goog-visitor-id'] = visitor_data
data['continuation'] = continuation['continuation']
data['clickTracking'] = {
'clickTrackingParams': continuation['itct']
'clickTrackingParams': continuation['itct'],
}
count = 0
retries = 3
@ -3613,7 +3617,7 @@ class YoutubePlaylistIE(InfoExtractor):
'uploader': 'milan',
'uploader_id': '@milan5503',
'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
}
},
}, {
'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
'playlist_mincount': 455,
@ -3623,7 +3627,7 @@ class YoutubePlaylistIE(InfoExtractor):
'uploader': 'LBK',
'uploader_id': '@music_king',
'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA',
}
},
}, {
'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
'only_matching': True,
@ -3734,7 +3738,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
'info_dict': {
'id': 'youtube-dl test video',
'title': 'youtube-dl test video',
}
},
}]
def _get_n_results(self, query, n):
@ -3754,7 +3758,7 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
'info_dict': {
'id': 'youtube-dl test video',
'title': 'youtube-dl test video',
}
},
}]
@ -3769,7 +3773,7 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
'id': 'youtube-dl test video',
'title': 'youtube-dl test video',
},
'params': {'playlistend': 5}
'params': {'playlistend': 5},
}, {
'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
'only_matching': True,
@ -3785,6 +3789,7 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
"""
Base class for feed extractors
Subclasses must define the _FEED_NAME property.
"""
_LOGIN_REQUIRED = True