Compare commits

...

12 Commits

Author    SHA1        Message (Date)

01001110  026b3815c6  Merge b8f7389a71 into 1036478d13 (2025-01-07 16:15:22 +00:00)
dirkf     1036478d13  [YouTube] Ensure subtitle URLs are complete (2025-01-06 01:39:04 +00:00)
                      * WEB URLs are, MWEB not
                      * resolves #33017
dirkf     00ad2b8ca1  [YouTube] Refactor subtitle processing (2025-01-06 01:24:30 +00:00)
                      * move to internal function
                      * use `traverse_obj()`
dirkf     ab7c61ca29  [YouTube] Apply code style changes, trailing commas, etc (2025-01-06 01:22:16 +00:00)
01001110  b8f7389a71  Update test checksums (2023-10-03 00:29:54 +08:00)
01001110  1250269c74  Merge branch 'ytdl-org:master' into chelseafc-extractor (2023-10-03 00:04:15 +08:00)
dirkf     7059253820  Update extractors.py (2023-04-06 01:45:06 +01:00)
                      * sort extractor modules
01001110  24e84f0000  Change regex for better matching (2023-04-02 19:31:05 +08:00)
01001110  6a9848f94a  Modify extractor according to suggestions (2023-04-02 18:32:21 +08:00)
01001110  a9a4d68364  Edit tests (2023-04-02 18:28:44 +08:00)
01001110  ceaa2c78fa  [chelseafc] improve regex (2023-03-24 16:22:13 +08:00)
01001110  b478a0ac3f  [chelseafc] Add new extractor (2023-03-23 20:56:38 +08:00)
3 changed files with 144 additions and 51 deletions

File: youtube_dl/extractor/chelseafc.py (new file)

@@ -0,0 +1,83 @@
# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import (
    parse_duration,
    traverse_obj,
    unified_timestamp,
    url_or_none,
)


class ChelseafcIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?chelseafc\.com(?:/[a-z]+)?/video/(?P<id>[a-z0-9]+(?:-[a-z0-9]+)*)'
    _TESTS = [{
        'url': 'https://www.chelseafc.com/en/video/full-match-chelsea-2-2-everton',
        'md5': '2fda617911b7148a2a19bec55b75d30a',
        'info_dict': {
            'id': 'full-match-chelsea-2-2-everton',
            'ext': 'mp4',
            'title': 'Full Match: Chelsea 2-2 Everton',
            'description': 'Full match highlights from Chelsea\'s 2-2 Premier League draw with Everton at Stamford Bridge.',
            'duration': 2842.0,
            'timestamp': 1679184000,
            'tags': ['Premier League', 'Everton', 'Video and article choice'],
            'upload_date': '20230319',
            'thumbnail': r're:https?://.*\.png',
        },
    }, {
        'url': 'https://www.chelseafc.com/en/video/manchester-city-vs-chelsea-2-0-or-highlights-or-efl-cup',
        'md5': '2905365c3c9cf4612f303fbb99c2f4ca',
        'info_dict': {
            'id': 'manchester-city-vs-chelsea-2-0-or-highlights-or-efl-cup',
            'ext': 'mp4',
            'title': 'Manchester City 2-0 Chelsea | Highlights | EFL Cup',
            'description': 'Highlights from our EFL Cup match against Man City.',
            'duration': 120.0,
            'timestamp': 1668042000,
            'upload_date': '20221110',
            'tags': ['Highlights', 'League Cup', 'Manchester City'],
            'thumbnail': r're:https?://.*\.jpg',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        video_details_div = self._search_regex(
            r'(<div\s[^>]*\bdata-component\s*=\s*(?:"|\')\s*VideoDetails\s*(?:"|\')[^>]*>)',
            webpage, 'div')
        raw_data = self._html_search_regex(
            r'<div[^>]*\sdata-props\s*=\s*(?:"|\')\s*([^"\']*)\s*(?:"|\')[^>]*>',
            video_details_div, 'data')
        data = self._parse_json(raw_data, video_id)
        manifest_url = data['videoDetail']['signedUrl']
        data = data['videoDetail']
        title = data['title']

        formats = self._extract_m3u8_formats(manifest_url, video_id, 'mp4')
        self._sort_formats(formats)

        txt_or_none = lambda x: x.strip() or None

        return {
            'id': video_id,
            'title': title,
            'description': txt_or_none(data.get('description')),
            'formats': formats,
            'duration': parse_duration(data.get('duration')),
            'timestamp': unified_timestamp(data.get('releaseDate')),
            'tags': traverse_obj(data, ('tags', Ellipsis, 'title'), expected_type=txt_or_none),
            'thumbnail': traverse_obj(data, ('image', 'file', 'url'), expected_type=url_or_none),
        }
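The extractor above can be smoke-tested outside the test harness. A minimal sketch (not part of the diff; the URL is taken from the first test case above), using youtube-dl's public API:

from youtube_dl import YoutubeDL

# Metadata-only run: the extractor is chosen from the URL, nothing is downloaded.
with YoutubeDL({'skip_download': True}) as ydl:
    info = ydl.extract_info(
        'https://www.chelseafc.com/en/video/full-match-chelsea-2-2-everton',
        download=False)
    print(info['id'], info.get('title'), info.get('duration'))

The usual project-level check for a new extractor would be the download tests, e.g. python test/test_download.py TestDownload.test_Chelseafc.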

File: youtube_dl/extractor/extractors.py

@@ -215,6 +215,7 @@ from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
 from .charlierose import CharlieRoseIE
 from .chaturbate import ChaturbateIE
+from .chelseafc import ChelseafcIE
 from .chilloutzone import ChilloutzoneIE
 from .chirbit import (
     ChirbitIE,
@@ -989,6 +990,10 @@ from .puhutv import (
     PuhuTVIE,
     PuhuTVSerieIE,
 )
+from .pr0gramm import (
+    Pr0grammIE,
+    Pr0grammStaticIE,
+)
 from .presstv import PressTVIE
 from .prosiebensat1 import ProSiebenSat1IE
 from .puls4 import Puls4IE
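With the import in place, the new IE is picked up by the extractor registry. A quick interactive check (illustrative, not part of the diff):

from youtube_dl.extractor import gen_extractor_classes
from youtube_dl.extractor.chelseafc import ChelseafcIE

# The class is registered and claims matching URLs.
print(any(ie.ie_key() == 'Chelseafc' for ie in gen_extractor_classes()))
print(ChelseafcIE.suitable('https://www.chelseafc.com/en/video/full-match-chelsea-2-2-everton'))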

File: youtube_dl/extractor/youtube.py

@@ -9,6 +9,7 @@ import json
 import os.path
 import random
 import re
+import string
 import time
 import traceback
@@ -67,6 +68,7 @@ from ..utils import (
 class YoutubeBaseInfoExtractor(InfoExtractor):
     """Provide base functions for Youtube extractors"""
     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
     _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
@@ -138,7 +140,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                 [2, 1, None, 1,
                  'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                  None, [], 4],
-                1, [None, None, []], None, None, None, True
+                1, [None, None, []], None, None, None, True,
             ],
             username,
         ]
@@ -160,7 +162,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             None, 1, None, [1, None, None, None, [password, None, True]],
             [
                 None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
-                1, [None, None, []], None, None, None, True
+                1, [None, None, []], None, None, None, True,
             ]]

         challenge_results = req(
@@ -213,7 +215,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                 user_hash, None, 2, None,
                 [
                     9, None, None, None, None, None, None, None,
-                    [None, tfa_code, True, 2]
+                    [None, tfa_code, True, 2],
                 ]]

             tfa_results = req(
@@ -284,7 +286,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             'client': {
                 'clientName': 'WEB',
                 'clientVersion': '2.20201021.03.00',
-            }
+            },
         },
     }
@@ -385,7 +387,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                 'client': {
                     'clientName': 'WEB',
                     'clientVersion': '2.20201021.03.00',
-                }
+                },
             },
             'query': query,
         }
@@ -462,7 +464,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             # (HTML, videodetails, metadata, renderers)
             'name': ('content', 'author', (('ownerChannelName', None), 'title'), ['text']),
             'url': ('href', 'ownerProfileUrl', 'vanityChannelUrl',
-                    ['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl'])
+                    ['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl']),
         }
         if any((videodetails, metadata, renderers)):
             result = (
@@ -671,7 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
                 'description': '',
                 'uploader': '8KVIDEO',
-                'title': 'UHDTV TEST 8K VIDEO.mp4'
+                'title': 'UHDTV TEST 8K VIDEO.mp4',
             },
             'params': {
                 'youtube_include_dash_manifest': True,
@@ -711,7 +713,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@theamazingatheist',
                 'title': 'Burning Everyone\'s Koran',
                 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
-            }
+            },
         },
         # Age-gated videos
         {
@@ -839,7 +841,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             },
             'expected_warnings': [
                 'DASH manifest missing',
-            ]
+            ],
         },
         # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
         {
@@ -1820,8 +1822,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         # cpn generation algorithm is reverse engineered from base.js.
         # In fact it works even with dummy cpn.
-        CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
-        cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))
+        CPN_ALPHABET = string.ascii_letters + string.digits + '-_'
+        cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(16))

         # more consistent results setting it to right before the end
         qs = parse_qs(playback_url)
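(Aside, not part of the diff: a quick check that the rewritten constant is identical to the old literal, so the generated cpn format is unchanged.)

import string

OLD = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
assert string.ascii_letters + string.digits + '-_' == OLD
assert len(OLD) == 64  # so `& 63` always maps to a valid index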
@@ -1881,8 +1883,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
         if mobj is None:
             raise ExtractorError('Invalid URL: %s' % url)
-        video_id = mobj.group(2)
-        return video_id
+        return mobj.group(2)

     def _extract_chapters_from_json(self, data, video_id, duration):
         chapters_list = try_get(
@@ -2035,7 +2036,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             headers = {
                 'X-YouTube-Client-Name': '85',
                 'X-YouTube-Client-Version': '2.0',
-                'Origin': 'https://www.youtube.com'
+                'Origin': 'https://www.youtube.com',
             }

             video_info = self._call_api('player', query, video_id, fatal=False, headers=headers)
@@ -2064,8 +2065,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])

         search_meta = (
-            lambda x: self._html_search_meta(x, webpage, default=None)) \
-            if webpage else lambda x: None
+            (lambda x: self._html_search_meta(x, webpage, default=None))
+            if webpage else lambda _: None)

         video_details = player_response.get('videoDetails') or {}
         microformat = try_get(
@@ -2137,7 +2138,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         def build_fragments(f):
             return LazyList({
                 'url': update_url_query(f['url'], {
-                    'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize']))
+                    'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize'])),
                 })
             } for range_start in range(0, f['filesize'], CHUNK_SIZE))
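(Aside, not part of the diff: how the 'range' values above are laid out. The chunk size and file size below are illustrative, standalone values, not the extractor's own.)

CHUNK_SIZE = 10 << 20   # assumed 10 MiB chunk size, for the sketch only
filesize = 25 << 20     # hypothetical 25 MiB stream
ranges = ['{0}-{1}'.format(start, min(start + CHUNK_SIZE - 1, filesize))
          for start in range(0, filesize, CHUNK_SIZE)]
print(ranges)  # three byte ranges covering the whole file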
@@ -2236,7 +2237,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     'protocol': 'http_dash_segments',
                     'fragments': build_fragments(dct),
                 } if dct['filesize'] else {
-                    'downloader_options': {'http_chunk_size': CHUNK_SIZE}  # No longer useful?
+                    'downloader_options': {'http_chunk_size': CHUNK_SIZE},  # No longer useful?
                 })

                 formats.append(dct)
@@ -2414,9 +2415,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'is_live': is_live,
         }

-        pctr = try_get(
+        pctr = traverse_obj(
             player_response,
-            lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
+            ('captions', 'playerCaptionsTracklistRenderer', T(dict)))
         if pctr:
             def process_language(container, base_url, lang_code, query):
                 lang_subs = []
@@ -2430,9 +2431,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     })
                 container[lang_code] = lang_subs

+            def process_subtitles():
                 subtitles = {}
-            for caption_track in (pctr.get('captionTracks') or []):
-                base_url = caption_track.get('baseUrl')
+                for caption_track in traverse_obj(pctr, (
+                        'captionTracks', lambda _, v: v.get('baseUrl'))):
+                    base_url = self._yt_urljoin(caption_track['baseUrl'])
                     if not base_url:
                         continue
                     if caption_track.get('kind') != 'asr':
@@ -2443,18 +2446,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                             subtitles, base_url, lang_code, {})
                         continue
                     automatic_captions = {}
-            for translation_language in (pctr.get('translationLanguages') or []):
-                translation_language_code = translation_language.get('languageCode')
-                if not translation_language_code:
-                    continue
+                    for translation_language in traverse_obj(pctr, (
+                            'translationLanguages', lambda _, v: v.get('languageCode'))):
+                        translation_language_code = translation_language['languageCode']
                         process_language(
                             automatic_captions, base_url, translation_language_code,
                             {'tlang': translation_language_code})
                     info['automatic_captions'] = automatic_captions
                 info['subtitles'] = subtitles

+            process_subtitles()
+
         parsed_url = compat_urllib_parse_urlparse(url)
-        for component in [parsed_url.fragment, parsed_url.query]:
+        for component in (parsed_url.fragment, parsed_url.query):
             query = compat_parse_qs(component)
             for k, v in query.items():
                 for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
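(Aside, not part of the diff: a standalone sketch of the traverse_obj filter used above, with sample data rather than a real player response. The lambda keeps only entries that carry the required key, which is what lets the loop drop the old explicit None checks; the surviving relative baseUrl is then completed against the YouTube origin, per the "Ensure subtitle URLs are complete" commit.)

from youtube_dl.utils import traverse_obj

pctr = {
    'captionTracks': [
        {'baseUrl': '/api/timedtext?lang=en', 'languageCode': 'en'},
        {'languageCode': 'de'},  # no baseUrl -> filtered out before the loop body runs
    ],
}
for track in traverse_obj(pctr, ('captionTracks', lambda _, v: v.get('baseUrl'))):
    print(track['languageCode'])  # only 'en'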
@@ -2684,7 +2688,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             'title': 'Super Cooper Shorts - Shorts',
             'uploader': 'Super Cooper Shorts',
             'uploader_id': '@SuperCooperShorts',
-        }
+        },
     }, {
         # Channel that does not have a Shorts tab. Test should just download videos on Home tab instead
         'url': 'https://www.youtube.com/@emergencyawesome/shorts',
@@ -2738,7 +2742,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             'description': 'md5:609399d937ea957b0f53cbffb747a14c',
             'uploader': 'ThirstForScience',
             'uploader_id': '@ThirstForScience',
-        }
+        },
     }, {
         'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
         'only_matching': True,
@@ -3037,7 +3041,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             'uploader': '3Blue1Brown',
             'uploader_id': '@3blue1brown',
             'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
-        }
+        },
     }]

     @classmethod
@@ -3335,7 +3339,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             'client': {
                 'clientName': 'WEB',
                 'clientVersion': client_version,
-            }
+            },
         }

         visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
@@ -3354,7 +3358,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
             headers['x-goog-visitor-id'] = visitor_data
         data['continuation'] = continuation['continuation']
         data['clickTracking'] = {
-            'clickTrackingParams': continuation['itct']
+            'clickTrackingParams': continuation['itct'],
         }
         count = 0
         retries = 3
@@ -3613,7 +3617,7 @@ class YoutubePlaylistIE(InfoExtractor):
             'uploader': 'milan',
             'uploader_id': '@milan5503',
             'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
-        }
+        },
     }, {
         'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
         'playlist_mincount': 455,
@@ -3623,7 +3627,7 @@ class YoutubePlaylistIE(InfoExtractor):
             'uploader': 'LBK',
             'uploader_id': '@music_king',
             'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA',
-        }
+        },
     }, {
         'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
         'only_matching': True,
@@ -3734,7 +3738,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
         'info_dict': {
             'id': 'youtube-dl test video',
             'title': 'youtube-dl test video',
-        }
+        },
     }]

     def _get_n_results(self, query, n):
@@ -3754,7 +3758,7 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
         'info_dict': {
             'id': 'youtube-dl test video',
             'title': 'youtube-dl test video',
-        }
+        },
     }]
@@ -3769,7 +3773,7 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
             'id': 'youtube-dl test video',
             'title': 'youtube-dl test video',
         },
-        'params': {'playlistend': 5}
+        'params': {'playlistend': 5},
     }, {
         'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
         'only_matching': True,
@@ -3785,6 +3789,7 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
 class YoutubeFeedsInfoExtractor(YoutubeTabIE):
     """
     Base class for feed extractors
+
     Subclasses must define the _FEED_NAME property.
     """
     _LOGIN_REQUIRED = True
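(Aside, not part of the diff: the contract stated in the docstring amounts to subclasses looking roughly like the sketch below; the class name, description and feed name here are made up for illustration.)

class YoutubeExampleFeedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Hypothetical feed, ":ytexample" for short (requires authentication)'
    _VALID_URL = r':ytexample'
    _FEED_NAME = 'example'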