From ab9001dab50db90f6470fcaf2189bcd2cfc0c370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Jan 2021 12:38:15 +0700 Subject: [PATCH 01/79] [twitter] Add support for unified cards (closes #27826) --- youtube_dl/extractor/twitter.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1190d721e..ec99dfccd 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -373,6 +373,24 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': '1eVjYOLGkGrQL', }, 'add_ie': ['TwitterBroadcast'], + }, { + # unified card + 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', + 'info_dict': { + 'id': '1349794411333394432', + 'ext': 'mp4', + 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:71ead15ec44cee55071547d6447c6a3e', + 'uploader': 'Brooklyn Nets', + 'uploader_id': 'BrooklynNets', + 'duration': 324.484, + 'timestamp': 1610651040, + 'upload_date': '20210114', + }, + 'params': { + 'skip_download': True, + }, }, { # Twitch Clip Embed 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', @@ -433,8 +451,7 @@ class TwitterIE(TwitterBaseIE): 'tags': tags, } - media = try_get(status, lambda x: x['extended_entities']['media'][0]) - if media and media.get('type') != 'photo': + def extract_from_video_info(media): video_info = media.get('video_info') or {} formats = [] @@ -461,6 +478,10 @@ class TwitterIE(TwitterBaseIE): 'thumbnails': thumbnails, 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) + + media = try_get(status, lambda x: x['extended_entities']['media'][0]) + if media and media.get('type') != 'photo': + extract_from_video_info(media) else: card = status.get('card') if card: @@ -493,6 +514,9 @@ class TwitterIE(TwitterBaseIE): '_type': 'url', 'url': get_binding_value('card_url'), }) + elif card_name == 
'unified_card': + media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] + extract_from_video_info(next(iter(media_entities.values()))) # amplify, promo_video_website, promo_video_convo, appplayer, ... else: is_amplify = card_name == 'amplify' From b484097b01e4f864c7a4369fc6fd071f756802af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Jan 2021 14:43:52 +0700 Subject: [PATCH 02/79] [youporn] Fix extraction (closes #27822) --- youtube_dl/extractor/youporn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 7b9feafeb..a1f0cce2c 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -88,7 +88,7 @@ class YouPornIE(InfoExtractor): # Main source definitions = self._parse_json( self._search_regex( - r'mediaDefinition\s*=\s*(\[.+?\]);', webpage, + r'mediaDefinition\s*[=:]\s*(\[.+?\])\s*[;,]', webpage, 'media definitions', default='[]'), video_id, fatal=False) if definitions: From aa860b80161152e7205232529e00f3fe636d000e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Jan 2021 14:54:23 +0700 Subject: [PATCH 03/79] [youporn] Improve height and tbr extraction (refs #23659, refs #20425) --- youtube_dl/extractor/youporn.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index a1f0cce2c..4ca75454e 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -60,6 +60,9 @@ class YouPornIE(InfoExtractor): }, { 'url': 'http://www.youporn.com/watch/505835', 'only_matching': True, + }, { + 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/', + 'only_matching': True, }] @staticmethod @@ -128,8 +131,9 @@ class YouPornIE(InfoExtractor): # Video URL's path looks like this: # 
/201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 # We will benefit from it by extracting some metadata - mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+/', video_url) + mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+', video_url) if mobj: height = int(mobj.group('height')) bitrate = int(mobj.group('bitrate')) From 7e92f9015efe97352e824bb593bd810721f4a604 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Jan 2021 15:12:04 +0700 Subject: [PATCH 04/79] [youporn] Restrict fallback download URL (refs #27822) --- youtube_dl/extractor/youporn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 4ca75454e..534270bac 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -103,7 +103,7 @@ class YouPornIE(InfoExtractor): links.append(video_url) # Fallback #1, this also contains extra low quality 180p format - for _, link in re.findall(r']+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + for _, link in re.findall(r']+href=(["\'])(http(?:(?!\1).)+\.mp4(?:(?!\1).)*)\1[^>]+title=["\']Download [Vv]ideo', webpage): links.append(link) # Fallback #2 (unavailable as at 22.06.2017) From 9d50f862323ed3d7c1ccd014c16d5d25b05de925 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 15 Jan 2021 10:32:01 +0100 Subject: [PATCH 05/79] [twitter] Add tests for more cards --- youtube_dl/extractor/twitter.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index ec99dfccd..ed495f297 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -407,6 +407,22 @@ class 
TwitterIE(TwitterBaseIE): # appplayer card 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', 'only_matching': True, + }, { + # video_direct_message card + 'url': 'https://twitter.com/qarev001/status/1348948114569269251', + 'only_matching': True, + }, { + # poll2choice_video card + 'url': 'https://twitter.com/CAF_Online/status/1349365911120195585', + 'only_matching': True, + }, { + # poll3choice_video card + 'url': 'https://twitter.com/SamsungMobileSA/status/1348609186725289984', + 'only_matching': True, + }, { + # poll4choice_video card + 'url': 'https://twitter.com/SouthamptonFC/status/1347577658079641604', + 'only_matching': True, }] def _real_extract(self, url): @@ -517,7 +533,9 @@ class TwitterIE(TwitterBaseIE): elif card_name == 'unified_card': media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] extract_from_video_info(next(iter(media_entities.values()))) - # amplify, promo_video_website, promo_video_convo, appplayer, ... + # amplify, promo_video_website, promo_video_convo, appplayer, + # video_direct_message, poll2choice_video, poll3choice_video, + # poll4choice_video, ... 
else: is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') From 9c9b45814582cc763d45cda2742f9574f254c6a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Jan 2021 01:46:41 +0700 Subject: [PATCH 06/79] [YoutubeDL] Protect from infinite recursion due to recursively nested playlists (closes #27833) --- youtube_dl/YoutubeDL.py | 236 ++++++++++++++++++++++------------------ 1 file changed, 129 insertions(+), 107 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 0ed4bc6ba..efd42fa63 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -338,6 +338,8 @@ class YoutubeDL(object): _pps = [] _download_retcode = None _num_downloads = None + _playlist_level = 0 + _playlist_urls = set() _screen_file = None def __init__(self, params=None, auto_init=True): @@ -906,115 +908,23 @@ class YoutubeDL(object): return self.process_ie_result( new_result, download=download, extra_info=extra_info) elif result_type in ('playlist', 'multi_video'): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - self.to_screen('[download] Downloading playlist: %s' % playlist) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - 1 - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) - - ie_entries = ie_result['entries'] - - def 
make_playlistitems_entries(list_ie_entries): - num_entries = len(list_ie_entries) - return [ - list_ie_entries[i - 1] for i in playlistitems - if -num_entries <= i - 1 < num_entries] - - def report_download(num_entries): + # Protect from infinite recursion due to recursively nested playlists + # (see https://github.com/ytdl-org/youtube-dl/issues/27833) + webpage_url = ie_result['webpage_url'] + if webpage_url in self._playlist_urls: self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, num_entries)) + '[download] Skipping already downloaded playlist: %s' + % ie_result.get('title') or ie_result.get('id')) + return - if isinstance(ie_entries, list): - n_all_entries = len(ie_entries) - if playlistitems: - entries = make_playlistitems_entries(ie_entries) - else: - entries = ie_entries[playliststart:playlistend] - n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) - elif isinstance(ie_entries, PagedList): - if playlistitems: - entries = [] - for item in playlistitems: - entries.extend(ie_entries.getslice( - item - 1, item - )) - else: - entries = ie_entries.getslice( - playliststart, playlistend) - n_entries = len(entries) - report_download(n_entries) - else: # iterable - if playlistitems: - entries = make_playlistitems_entries(list(itertools.islice( - ie_entries, 0, max(playlistitems)))) - else: - entries = list(itertools.islice( - ie_entries, playliststart, playlistend)) - n_entries = len(entries) - report_download(n_entries) - - if self.params.get('playlistreverse', False): - entries = entries[::-1] - - if self.params.get('playlistrandom', False): - random.shuffle(entries) - - x_forwarded_for = ie_result.get('__x_forwarded_for_ip') - - for i, entry in enumerate(entries, 1): - self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) - # This __x_forwarded_for_ip thing is a bit ugly 
but requires - # minimal changes - if x_forwarded_for: - entry['__x_forwarded_for_ip'] = x_forwarded_for - extra = { - 'n_entries': n_entries, - 'playlist': playlist, - 'playlist_id': ie_result.get('id'), - 'playlist_title': ie_result.get('title'), - 'playlist_uploader': ie_result.get('uploader'), - 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, - 'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': url_basename(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } - - reason = self._match_entry(entry, incomplete=True) - if reason is not None: - self.to_screen('[download] ' + reason) - continue - - entry_result = self.__process_iterable_entry(entry, download, extra) - # TODO: skip failed (empty) entries? - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results - self.to_screen('[download] Finished downloading playlist: %s' % playlist) - return ie_result + self._playlist_level += 1 + self._playlist_urls.add(webpage_url) + try: + return self.__process_playlist(ie_result, download) + finally: + self._playlist_level -= 1 + if not self._playlist_level: + self._playlist_urls.clear() elif result_type == 'compat_list': self.report_warning( 'Extractor %s returned a compat_list result. 
' @@ -1039,6 +949,118 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) + def __process_playlist(self, ie_result, download): + # We process each entry in the playlist + playlist = ie_result.get('title') or ie_result.get('id') + + self.to_screen('[download] Downloading playlist: %s' % playlist) + + playlist_results = [] + + playliststart = self.params.get('playliststart', 1) - 1 + playlistend = self.params.get('playlistend') + # For backwards compatibility, interpret -1 as whole list + if playlistend == -1: + playlistend = None + + playlistitems_str = self.params.get('playlist_items') + playlistitems = None + if playlistitems_str is not None: + def iter_playlistitems(format): + for string_segment in format.split(','): + if '-' in string_segment: + start, end = string_segment.split('-') + for item in range(int(start), int(end) + 1): + yield int(item) + else: + yield int(string_segment) + playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + + ie_entries = ie_result['entries'] + + def make_playlistitems_entries(list_ie_entries): + num_entries = len(list_ie_entries) + return [ + list_ie_entries[i - 1] for i in playlistitems + if -num_entries <= i - 1 < num_entries] + + def report_download(num_entries): + self.to_screen( + '[%s] playlist %s: Downloading %d videos' % + (ie_result['extractor'], playlist, num_entries)) + + if isinstance(ie_entries, list): + n_all_entries = len(ie_entries) + if playlistitems: + entries = make_playlistitems_entries(ie_entries) + else: + entries = ie_entries[playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + elif isinstance(ie_entries, PagedList): + if playlistitems: + entries = [] + for item in playlistitems: + entries.extend(ie_entries.getslice( + item - 1, item + )) + else: + entries = ie_entries.getslice( + playliststart, 
playlistend) + n_entries = len(entries) + report_download(n_entries) + else: # iterable + if playlistitems: + entries = make_playlistitems_entries(list(itertools.islice( + ie_entries, 0, max(playlistitems)))) + else: + entries = list(itertools.islice( + ie_entries, playliststart, playlistend)) + n_entries = len(entries) + report_download(n_entries) + + if self.params.get('playlistreverse', False): + entries = entries[::-1] + + if self.params.get('playlistrandom', False): + random.shuffle(entries) + + x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + + for i, entry in enumerate(entries, 1): + self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for + extra = { + 'n_entries': n_entries, + 'playlist': playlist, + 'playlist_id': ie_result.get('id'), + 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), + 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + + reason = self._match_entry(entry, incomplete=True) + if reason is not None: + self.to_screen('[download] ' + reason) + continue + + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? 
+ playlist_results.append(entry_result) + ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) + return ie_result + @__handle_extraction_exceptions def __process_iterable_entry(self, entry, download, extra_info): return self.process_ie_result( From 0cd4c402f0530d357bb11b660e14a303290f6caf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Jan 2021 09:22:33 +0700 Subject: [PATCH 07/79] [animeondemand] Add support for lazy playlist extraction (closes #27829) --- youtube_dl/extractor/animeondemand.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 00ce684d1..54e097d2f 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -116,8 +116,6 @@ class AnimeOnDemandIE(InfoExtractor): r'(?s)]+itemprop="description"[^>]*>(.+?)', webpage, 'anime description', default=None) - entries = [] - def extract_info(html, video_id, num=None): title, description = [None] * 2 formats = [] @@ -233,7 +231,7 @@ class AnimeOnDemandIE(InfoExtractor): self._sort_formats(info['formats']) f = common_info.copy() f.update(info) - entries.append(f) + yield f # Extract teaser/trailer only when full episode is not available if not info['formats']: @@ -247,7 +245,7 @@ class AnimeOnDemandIE(InfoExtractor): 'title': m.group('title'), 'url': urljoin(url, m.group('href')), }) - entries.append(f) + yield f def extract_episodes(html): for num, episode_html in enumerate(re.findall( @@ -275,7 +273,8 @@ class AnimeOnDemandIE(InfoExtractor): 'episode_number': episode_number, } - extract_entries(episode_html, video_id, common_info) + for e in extract_entries(episode_html, video_id, common_info): + yield e def extract_film(html, video_id): common_info = { @@ -283,11 +282,18 @@ class AnimeOnDemandIE(InfoExtractor): 'title': anime_title, 'description': 
anime_description, } - extract_entries(html, video_id, common_info) + for e in extract_entries(html, video_id, common_info): + yield e - extract_episodes(webpage) + def entries(): + has_episodes = False + for e in extract_episodes(webpage): + has_episodes = True + yield e - if not entries: - extract_film(webpage, anime_id) + if not has_episodes: + for e in extract_film(webpage, anime_id): + yield e - return self.playlist_result(entries, anime_id, anime_title, anime_description) + return self.playlist_result( + entries(), anime_id, anime_title, anime_description) From f1487d4fca40fd37d735753e24a7bae53a1b1513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Jan 2021 09:42:11 +0700 Subject: [PATCH 08/79] [mixcloud:playlist:base] Fix video id extraction in flat playlist mode (refs #27787) --- youtube_dl/extractor/mixcloud.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 37f16a791..69319857d 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -251,11 +251,9 @@ class MixcloudPlaylistBaseIE(MixcloudBaseIE): cloudcast_url = cloudcast.get('url') if not cloudcast_url: continue - video_id = cloudcast.get('slug') - if video_id: - owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) - if owner_username: - video_id = '%s_%s' % (owner_username, video_id) + slug = try_get(cloudcast, lambda x: x['slug'], compat_str) + owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) + video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None entries.append(self.url_result( cloudcast_url, MixcloudIE.ie_key(), video_id)) From 8673f4344c40bf771af5344113b184f5cef08030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Jan 2021 09:48:00 +0700 Subject: [PATCH 09/79] [ChangeLog] Actualize [ci skip] --- ChangeLog | 31 
+++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/ChangeLog b/ChangeLog index 3629c4fb8..ab057fae1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,34 @@ +version + +Core +* [YoutubeDL] Protect from infinite recursion due to recursively nested + playlists (#27833) +* [YoutubeDL] Ignore failure to create existing directory (#27811) +* [YoutubeDL] Raise syntax error for format selection expressions with multiple + + operators (#27803) + +Extractors ++ [animeondemand] Add support for lazy playlist extraction (#27829) +* [youporn] Restrict fallback download URL (#27822) +* [youporn] Improve height and tbr extraction (#20425, #23659) +* [youporn] Fix extraction (#27822) ++ [twitter] Add support for unified cards (#27826) ++ [twitch] Add Authorization header with OAuth token for GraphQL requests + (#27790) +* [mixcloud:playlist:base] Extract video id in flat playlist mode (#27787) +* [cspan] Improve info extraction (#27791) +* [adn] Improve info extraction +* [adn] Fix extraction (#26963, #27732) +* [youtube:search] Extract from all sections (#27604) +* [youtube:search] fix viewcount and try to extract all video sections (#27604) +* [twitch] Improve login error extraction +* [twitch] Fix authentication (#27743) +* [3qsdn] Improve extraction (#21058) +* [peertube] Extract formats from streamingPlaylists (#26002, #27586, #27728) +* [khanacademy] Fix extraction (#2887, #26803) +* [spike] Update Paramount Network feed URL (#27715) + + version 2021.01.08 Core From ef50cb3fda7c5455b036df648319c2829d899d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Jan 2021 09:51:30 +0700 Subject: [PATCH 10/79] release 2021.01.16 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- 
docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 768d45fc1..aedcfa6b3 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.01.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.08 + [debug] youtube-dl version 2021.01.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 2bd90da57..5c0dfea4e 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.01.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the 
bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 272895b47..772147a75 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.01.16** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 608fcfba4..2fcaa3a23 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.01.16** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.08 + [debug] youtube-dl version 2021.01.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index d085ab1ef..f1adfce8f 100644 --- 
a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.01.16** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index ab057fae1..27f01c438 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.01.16 Core * [YoutubeDL] Protect from infinite recursion due to recursively nested diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3a49043fa..aa8026a32 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -418,7 +418,8 @@ - **Katsomo** - **KeezMovies** - **Ketnet** - - **KhanAcademy** + - **khanacademy** + - **khanacademy:unit** - **KickStarter** - **KinjaEmbed** - **KinoPoisk** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0d9659b2b..ac7242abb 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.01.08' +__version__ = '2021.01.16' From 55a3ca16d3724376385801873c918e450a863f4a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 16 Jan 2021 18:12:05 +0100 Subject: [PATCH 11/79] [spotify] Add new extractor for Spotify Podcasts(closes #27443) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/spotify.py | 156 +++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 youtube_dl/extractor/spotify.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 57d4d319c..c50e1419f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1116,6 +1116,10 @@ from .stitcher import ( from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from 
.spotify import ( + SpotifyIE, + SpotifyShowIE, +) from .spreaker import ( SpreakerIE, SpreakerPageIE, diff --git a/youtube_dl/extractor/spotify.py b/youtube_dl/extractor/spotify.py new file mode 100644 index 000000000..826f98cff --- /dev/null +++ b/youtube_dl/extractor/spotify.py @@ -0,0 +1,156 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + float_or_none, + int_or_none, + strip_or_none, + try_get, + unified_strdate, +) + + +class SpotifyBaseIE(InfoExtractor): + _ACCESS_TOKEN = None + _OPERATION_HASHES = { + 'Episode': '8276d4423d709ae9b68ec1b74cc047ba0f7479059a37820be730f125189ac2bf', + 'MinimalShow': '13ee079672fad3f858ea45a55eb109553b4fb0969ed793185b2e34cbb6ee7cc0', + 'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d', + } + _VALID_URL_TEMPL = r'https?://open\.spotify\.com/%s/(?P[^/?&#]+)' + + def _real_initialize(self): + self._ACCESS_TOKEN = self._download_json( + 'https://open.spotify.com/get_access_token', None)['accessToken'] + + def _call_api(self, operation, video_id, variables): + return self._download_json( + 'https://api-partner.spotify.com/pathfinder/v1/query', video_id, query={ + 'operationName': 'query' + operation, + 'variables': json.dumps(variables), + 'extensions': json.dumps({ + 'persistedQuery': { + 'sha256Hash': self._OPERATION_HASHES[operation], + }, + }) + }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN})['data'] + + def _extract_episode(self, episode, series): + episode_id = episode['id'] + title = episode['name'].strip() + + formats = [] + audio_preview = episode.get('audioPreview') or {} + audio_preview_url = audio_preview.get('url') + if audio_preview_url: + f = { + 'url': audio_preview_url.replace('://p.scdn.co/mp3-preview/', '://anon-podcast.scdn.co/'), + 'vcodec': 'none', + } + audio_preview_format = audio_preview.get('format') + if audio_preview_format: + 
f['format_id'] = audio_preview_format + mobj = re.match(r'([0-9A-Z]{3})_(?:[A-Z]+_)?(\d+)', audio_preview_format) + if mobj: + f.update({ + 'abr': int(mobj.group(2)), + 'ext': mobj.group(1).lower(), + }) + formats.append(f) + + for item in (try_get(episode, lambda x: x['audio']['items']) or []): + item_url = item.get('url') + if not (item_url and item.get('externallyHosted')): + continue + formats.append({ + 'url': clean_podcast_url(item_url), + 'vcodec': 'none', + }) + + thumbnails = [] + for source in (try_get(episode, lambda x: x['coverArt']['sources']) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + + return { + 'id': episode_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': strip_or_none(episode.get('description')), + 'duration': float_or_none(try_get( + episode, lambda x: x['duration']['totalMilliseconds']), 1000), + 'release_date': unified_strdate(try_get( + episode, lambda x: x['releaseDate']['isoString'])), + 'series': series, + } + + +class SpotifyIE(SpotifyBaseIE): + IE_NAME = 'spotify' + _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'episode' + _TEST = { + 'url': 'https://open.spotify.com/episode/4Z7GAJ50bgctf6uclHlWKo', + 'md5': '74010a1e3fa4d9e1ab3aa7ad14e42d3b', + 'info_dict': { + 'id': '4Z7GAJ50bgctf6uclHlWKo', + 'ext': 'mp3', + 'title': 'From the archive: Why time management is ruining our lives', + 'description': 'md5:b120d9c4ff4135b42aa9b6d9cde86935', + 'duration': 2083.605, + 'release_date': '20201217', + 'series': "The Guardian's Audio Long Reads", + } + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api('Episode', episode_id, { + 'uri': 'spotify:episode:' + episode_id + })['episode'] + return self._extract_episode( + episode, try_get(episode, lambda x: x['podcast']['name'])) + + +class 
SpotifyShowIE(SpotifyBaseIE): + IE_NAME = 'spotify:show' + _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'show' + _TEST = { + 'url': 'https://open.spotify.com/show/4PM9Ke6l66IRNpottHKV9M', + 'info_dict': { + 'id': '4PM9Ke6l66IRNpottHKV9M', + 'title': 'The Story from the Guardian', + 'description': 'The Story podcast is dedicated to our finest audio documentaries, investigations and long form stories', + }, + 'playlist_mincount': 36, + } + + def _real_extract(self, url): + show_id = self._match_id(url) + podcast = self._call_api('ShowEpisodes', show_id, { + 'limit': 1000000000, + 'offset': 0, + 'uri': 'spotify:show:' + show_id, + })['podcast'] + podcast_name = podcast.get('name') + + entries = [] + for item in (try_get(podcast, lambda x: x['episodes']['items']) or []): + episode = item.get('episode') + if not episode: + continue + entries.append(self._extract_episode(episode, podcast_name)) + + return self.playlist_result( + entries, show_id, podcast_name, podcast.get('description')) From 360a5e0f60f273a0fd2fd664fed1439430cf35e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= Date: Sat, 16 Jan 2021 21:40:08 +0100 Subject: [PATCH 12/79] [aenetworks] Fix test (#27847) --- youtube_dl/extractor/aenetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 8e4963131..a5d88ebbe 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -256,7 +256,7 @@ class AENetworksShowIE(AENetworksListBaseIE): 'title': 'Ancient Aliens', 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', }, - 'playlist_mincount': 168, + 'playlist_mincount': 150, }] _RESOURCE = 'series' _ITEMS_KEY = 'episodes' From d4564afc7074a0c12e62649a50a05a2cdaef4650 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= Date: Sat, 16 Jan 2021 23:41:52 +0100 Subject: [PATCH 13/79] [ard] Fix title and description extraction and update 
tests (#27761) --- youtube_dl/extractor/ard.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 5b7b2dd6d..6bf5b3f13 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -187,13 +187,13 @@ class ARDMediathekIE(ARDMediathekBaseIE): if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) - title = self._html_search_regex( + title = self._og_search_title(webpage, default=None) or self._html_search_regex( [r'(.*?)', r'', r'

(.*?)

', r']*>(.*?)'], webpage, 'title') - description = self._html_search_meta( + description = self._og_search_description(webpage, default=None) or self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( @@ -249,18 +249,18 @@ class ARDMediathekIE(ARDMediathekBaseIE): class ARDIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?:video-?)?(?P[0-9]+))\.html' _TESTS = [{ - # available till 14.02.2019 - 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', - 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', + # available till 7.01.2022 + 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html', + 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1', 'info_dict': { - 'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video', - 'id': '102', + 'display_id': 'maischberger-die-woche', + 'id': '100', 'ext': 'mp4', - 'duration': 4435.0, - 'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?', - 'upload_date': '20180214', + 'duration': 3687.0, + 'title': 'maischberger. die woche vom 7. 
Januar 2021', + 'upload_date': '20210107', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -315,17 +315,17 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', + 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', + 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', 'info_dict': { 'display_id': 'die-robuste-roswita', - 'id': '70153354', + 'id': '78566716', 'title': 'Die robuste Roswita', - 'description': r're:^Der Mord.*trüber ist als die Ilm.', + 'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', - 'timestamp': 1577047500, - 'upload_date': '20191222', + 'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard', + 'timestamp': 1596658200, + 'upload_date': '20200805', 'ext': 'mp4', }, }, { From de026a6acd3cbc2e62d0988b21c97e71e3730cb0 Mon Sep 17 00:00:00 2001 From: Tatsh Date: Sun, 17 Jan 2021 08:05:39 -0500 Subject: [PATCH 14/79] [Minds] Add new extractor (#17934) --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/minds.py | 164 +++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 youtube_dl/extractor/minds.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c50e1419f..90012fc4f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -651,6 +651,11 @@ from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, 
MicrosoftVirtualAcademyCourseIE, ) +from .minds import ( + MindsIE, + MindsActivityIE, + MindsChannelIE, +) from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .miomio import MioMioIE diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py new file mode 100644 index 000000000..4523d0938 --- /dev/null +++ b/youtube_dl/extractor/minds.py @@ -0,0 +1,164 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import (int_or_none, sanitized_Request, str_or_none, + unified_strdate) + + +class MindsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?minds\.com/media/(?P[0-9]+)' + _TEST = { + 'url': 'https://www.minds.com/media/100000000000086822', + 'md5': '215a658184a419764852239d4970b045', + 'info_dict': { + 'id': '100000000000086822', + 'ext': 'mp4', + 'title': 'Minds intro sequence', + 'thumbnail': 'https://cdn-cinemr.minds.com/cinemr_com/334128440657580032/thumbnail-00001.png', + 'uploader_id': '100000000000000341', + 'description': '', + 'upload_date': '20130524', + 'timestamp': 1369404826, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_api_url = 'https://www.minds.com/api/v1/media/%s' % video_id + token = self._get_cookies(url).get('XSRF-TOKEN') + headers = { + 'authority': 'www.minds.com', + 'referer': url, + 'x-xsrf-token': token.value if token else '', + } + data = self._download_json(video_api_url, video_id, headers=headers, + query={'children': 'false'}) + formats = [] + owner = data.get('ownerObj', {}) + + transcodes = data.get('transcodes', {}) + # These keys are the width so keep the highest width last + keys = sorted(transcodes.keys()) + + for format_id in keys: + is_numeric = re.match('^[0-9]+\.mp4', format_id) + video_url = transcodes[format_id] + info = { + 'url': video_url, + 'format_id': format_id, + 'http_headers': headers, + } 
+ if is_numeric: + info['width'] = int(format_id.split('.')[0]) + formats.append(info) + + uploader_id = str_or_none(owner.get('guid') or + data.get('owner_guid') or + owner.get('legacy_guid') or + owner.get('owner_guid')) + description = str_or_none(data.get('description')) + if description: + description = description.strip() + uploader_url = age_limit = thumbnail = None + + if owner.get('username'): + uploader_url = 'https://www.minds.com/%s' % owner.get('username') + if data.get('mature') is True: + age_limit = 18 + + thumbnail_api_url = data.get('thumbnail_src') + if thumbnail_api_url: + req = sanitized_Request(thumbnail_api_url) + req.get_method = lambda: 'HEAD' + res = self._request_webpage(req, video_id) + if res.headers.get('content-type', '').startswith('image/'): + thumbnail = getattr(res, 'url', None) + tags = data.get('tags', '').strip() + if isinstance(tags, compat_str) and tags: + tags = [x.strip() for x in tags.split(',')] + else: + tags = None + category = data.get('category') + if isinstance(category, compat_str) and category: + category = [category] + else: + category = None + + return { + 'id': video_id, + 'title': data['title'], + 'formats': formats, + 'description': description, + 'license': str_or_none(data.get('license')), + 'creator': str_or_none(owner.get('name') or owner.get('username')), + 'release_date': unified_strdate(data.get('time_created')), + 'timestamp': int_or_none(data.get('time_created')), + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'view_count': int_or_none(data.get('play:count')), + 'like_count': int_or_none(data.get('thumbs:up:count')), + 'dislike_count': int_or_none(data.get('thumbs:down:count')), + 'average_rating': int_or_none(data.get('rating')), + 'age_limit': age_limit, + 'categories': [str_or_none(data.get('category'))], + 'tags': tags, + # As of 20181020 the API is returning `false` for this value both + # at top level and within the entity.comments:count path. 
The only + # other way to get this is to fetch all comments and count. + 'comment_count': int_or_none(data.get('comments:count')), + 'thumbnail': thumbnail, + } + + +class MindsActivityIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?minds\.com/newsfeed/(?P[0-9]+)' + + def _real_extract(self, url): + guid = self._match_id(url) + api_url = 'https://www.minds.com/api/v1/newsfeed/single/%s' % guid + token = self._get_cookies(url).get('XSRF-TOKEN') + headers = { + 'authority': 'www.minds.com', + 'referer': url, + 'x-xsrf-token': token.value if token else '', + } + data = self._download_json(api_url, guid, headers=headers) + return self.url_result('https://www.minds.com/media/%s' % data['activity']['entity_guid']) + + +class MindsChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?minds\.com/(?!newsfeed|media|api)(?P[^/]+)' + + def _real_extract(self, url): + channel_name = self._match_id(url) + api_url = 'https://www.minds.com/api/v1/channel/%s' % channel_name + token = self._get_cookies(url).get('XSRF-TOKEN') + headers = { + 'authority': 'www.minds.com', + 'referer': url, + 'x-xsrf-token': token.value if token else '', + } + data = self._download_json(api_url, channel_name, headers=headers) + channel = data.get('channel', {}) + params = {'limit': 12, 'offset': ''} + api_url = 'https://www.minds.com/api/v1/newsfeed/personal/%s' % channel['guid'] + entries = [] + while True: + data = self._download_json(api_url, channel['guid'], + headers=headers, query=params) + activity = data.get('activity', []) + if len(activity) == 0 or not data.get('load-next'): + break + for info in activity: + if info.get('custom_type') != 'video': + continue + entries.append(self.url_result('https://www.minds.com/media/%s' % info['entity_guid'])) + params['offset'] = data['load-next'] + return self.playlist_result(entries, + playlist_title='%s activity' % channel_name) From 58f6c2112d55cdd77ad76b323760bb934d7e7576 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Jan 
2021 14:07:56 +0100 Subject: [PATCH 15/79] [minds] improve extraction --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/minds.py | 288 ++++++++++++++++------------- 2 files changed, 161 insertions(+), 129 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 90012fc4f..29b0e615e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -653,8 +653,8 @@ from .microsoftvirtualacademy import ( ) from .minds import ( MindsIE, - MindsActivityIE, MindsChannelIE, + MindsGroupIE, ) from .ministrygrid import MinistryGridIE from .minoto import MinotoIE diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py index 4523d0938..8e9f0f825 100644 --- a/youtube_dl/extractor/minds.py +++ b/youtube_dl/extractor/minds.py @@ -1,164 +1,196 @@ # coding: utf-8 from __future__ import unicode_literals -import re from .common import InfoExtractor from ..compat import compat_str -from ..utils import (int_or_none, sanitized_Request, str_or_none, - unified_strdate) +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + strip_or_none, +) -class MindsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/media/(?P[0-9]+)' - _TEST = { +class MindsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?minds\.com/' + + def _call_api(self, path, video_id, resource, query=None): + api_url = 'https://www.minds.com/api/' + path + token = self._get_cookies(api_url).get('XSRF-TOKEN') + return self._download_json( + api_url, video_id, 'Downloading %s JSON metadata' % resource, headers={ + 'Referer': 'https://www.minds.com/', + 'X-XSRF-TOKEN': token.value if token else '', + }, query=query) + + +class MindsIE(MindsBaseIE): + IE_NAME = 'minds' + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?:media|newsfeed|archive/view)/(?P[0-9]+)' + _TESTS = [{ 'url': 'https://www.minds.com/media/100000000000086822', 'md5': '215a658184a419764852239d4970b045', 'info_dict': 
{ 'id': '100000000000086822', 'ext': 'mp4', 'title': 'Minds intro sequence', - 'thumbnail': 'https://cdn-cinemr.minds.com/cinemr_com/334128440657580032/thumbnail-00001.png', - 'uploader_id': '100000000000000341', - 'description': '', + 'thumbnail': r're:https?://.+\.png', + 'uploader_id': 'ottman', 'upload_date': '20130524', 'timestamp': 1369404826, + 'uploader': 'Bill Ottman', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': ['animation'], + 'comment_count': int, + 'license': 'attribution-cc', }, - 'params': { - 'skip_download': True, + }, { + # entity.type == 'activity' and empty title + 'url': 'https://www.minds.com/newsfeed/798025111988506624', + 'md5': 'b2733a74af78d7fd3f541c4cbbaa5950', + 'info_dict': { + 'id': '798022190320226304', + 'ext': 'mp4', + 'title': '798022190320226304', + 'uploader': 'ColinFlaherty', + 'upload_date': '20180111', + 'timestamp': 1515639316, + 'uploader_id': 'ColinFlaherty', }, - } + }, { + 'url': 'https://www.minds.com/archive/view/715172106794442752', + 'only_matching': True, + }, { + # youtube perma_url + 'url': 'https://www.minds.com/newsfeed/1197131838022602752', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - video_api_url = 'https://www.minds.com/api/v1/media/%s' % video_id - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(video_api_url, video_id, headers=headers, - query={'children': 'false'}) + entity_id = self._match_id(url) + entity = self._call_api( + 'v1/entities/entity/' + entity_id, entity_id, 'entity')['entity'] + if entity.get('type') == 'activity': + if entity.get('custom_type') == 'video': + video_id = entity['entity_guid'] + else: + return self.url_result(entity['perma_url']) + else: + assert(entity['subtype'] == 'video') + video_id = entity_id + # 1080p and webm formats available only on the 
sources array + video = self._call_api( + 'v2/media/video/' + video_id, video_id, 'video') + formats = [] - owner = data.get('ownerObj', {}) + for source in (video.get('sources') or []): + src = source.get('src') + if not src: + continue + formats.append({ + 'format_id': source.get('label'), + 'height': int_or_none(source.get('size')), + 'url': src, + }) + self._sort_formats(formats) - transcodes = data.get('transcodes', {}) - # These keys are the width so keep the highest width last - keys = sorted(transcodes.keys()) + entity = video.get('entity') or entity + owner = entity.get('ownerObj') or {} + uploader_id = owner.get('username') - for format_id in keys: - is_numeric = re.match('^[0-9]+\.mp4', format_id) - video_url = transcodes[format_id] - info = { - 'url': video_url, - 'format_id': format_id, - 'http_headers': headers, - } - if is_numeric: - info['width'] = int(format_id.split('.')[0]) - formats.append(info) + tags = entity.get('tags') + if tags and isinstance(tags, compat_str): + tags = [tags] - uploader_id = str_or_none(owner.get('guid') or - data.get('owner_guid') or - owner.get('legacy_guid') or - owner.get('owner_guid')) - description = str_or_none(data.get('description')) - if description: - description = description.strip() - uploader_url = age_limit = thumbnail = None - - if owner.get('username'): - uploader_url = 'https://www.minds.com/%s' % owner.get('username') - if data.get('mature') is True: - age_limit = 18 - - thumbnail_api_url = data.get('thumbnail_src') - if thumbnail_api_url: - req = sanitized_Request(thumbnail_api_url) - req.get_method = lambda: 'HEAD' - res = self._request_webpage(req, video_id) - if res.headers.get('content-type', '').startswith('image/'): - thumbnail = getattr(res, 'url', None) - tags = data.get('tags', '').strip() - if isinstance(tags, compat_str) and tags: - tags = [x.strip() for x in tags.split(',')] - else: - tags = None - category = data.get('category') - if isinstance(category, compat_str) and category: - category 
= [category] - else: - category = None + thumbnail = None + poster = video.get('poster') or entity.get('thumbnail_src') + if poster: + urlh = self._request_webpage(poster, video_id, fatal=False) + if urlh: + thumbnail = urlh.geturl() return { 'id': video_id, - 'title': data['title'], + 'title': entity.get('title') or video_id, 'formats': formats, - 'description': description, - 'license': str_or_none(data.get('license')), - 'creator': str_or_none(owner.get('name') or owner.get('username')), - 'release_date': unified_strdate(data.get('time_created')), - 'timestamp': int_or_none(data.get('time_created')), + 'description': clean_html(entity.get('description')) or None, + 'license': str_or_none(entity.get('license')), + 'timestamp': int_or_none(entity.get('time_created')), + 'uploader': strip_or_none(owner.get('name')), 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - 'view_count': int_or_none(data.get('play:count')), - 'like_count': int_or_none(data.get('thumbs:up:count')), - 'dislike_count': int_or_none(data.get('thumbs:down:count')), - 'average_rating': int_or_none(data.get('rating')), - 'age_limit': age_limit, - 'categories': [str_or_none(data.get('category'))], + 'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None, + 'view_count': int_or_none(entity.get('play:count')), + 'like_count': int_or_none(entity.get('thumbs:up:count')), + 'dislike_count': int_or_none(entity.get('thumbs:down:count')), 'tags': tags, - # As of 20181020 the API is returning `false` for this value both - # at top level and within the entity.comments:count path. The only - # other way to get this is to fetch all comments and count. 
- 'comment_count': int_or_none(data.get('comments:count')), + 'comment_count': int_or_none(entity.get('comments:count')), 'thumbnail': thumbnail, } -class MindsActivityIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/newsfeed/(?P[0-9]+)' +class MindsFeedBaseIE(MindsBaseIE): + _PAGE_SIZE = 150 - def _real_extract(self, url): - guid = self._match_id(url) - api_url = 'https://www.minds.com/api/v1/newsfeed/single/%s' % guid - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(api_url, guid, headers=headers) - return self.url_result('https://www.minds.com/media/%s' % data['activity']['entity_guid']) - - -class MindsChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/(?!newsfeed|media|api)(?P[^/]+)' - - def _real_extract(self, url): - channel_name = self._match_id(url) - api_url = 'https://www.minds.com/api/v1/channel/%s' % channel_name - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(api_url, channel_name, headers=headers) - channel = data.get('channel', {}) - params = {'limit': 12, 'offset': ''} - api_url = 'https://www.minds.com/api/v1/newsfeed/personal/%s' % channel['guid'] - entries = [] + def _entries(self, feed_id): + query = {'limit': self._PAGE_SIZE, 'sync': 1} + i = 1 while True: - data = self._download_json(api_url, channel['guid'], - headers=headers, query=params) - activity = data.get('activity', []) - if len(activity) == 0 or not data.get('load-next'): - break - for info in activity: - if info.get('custom_type') != 'video': + data = self._call_api( + 'v2/feeds/container/%s/videos' % feed_id, + feed_id, 'page %s' % i, query) + entities = data.get('entities') or [] + for entity in entities: + guid = entity.get('guid') + if not guid: 
continue - entries.append(self.url_result('https://www.minds.com/media/%s' % info['entity_guid'])) - params['offset'] = data['load-next'] - return self.playlist_result(entries, - playlist_title='%s activity' % channel_name) + yield self.url_result( + 'https://www.minds.com/newsfeed/' + guid, + MindsIE.ie_key(), guid) + query['from_timestamp'] = data['load-next'] + if not (query['from_timestamp'] and len(entities) == self._PAGE_SIZE): + break + i += 1 + + def _real_extract(self, url): + feed_id = self._match_id(url) + feed = self._call_api( + 'v1/%s/%s' % (self._FEED_PATH, feed_id), + feed_id, self._FEED_TYPE)[self._FEED_TYPE] + + return self.playlist_result( + self._entries(feed['guid']), feed_id, + strip_or_none(feed.get('name')), + feed.get('briefdescription')) + + +class MindsChannelIE(MindsFeedBaseIE): + _FEED_TYPE = 'channel' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?!(?:newsfeed|media|api|archive|groups)/)(?P[^/?&#]+)' + _FEED_PATH = 'channel' + _TEST = { + 'url': 'https://www.minds.com/ottman', + 'info_dict': { + 'id': 'ottman', + 'title': 'Bill Ottman', + 'description': 'Co-creator & CEO @minds', + }, + 'playlist_mincount': 54, + } + + +class MindsGroupIE(MindsFeedBaseIE): + _FEED_TYPE = 'group' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'groups/profile/(?P[0-9]+)' + _FEED_PATH = 'groups/group' + _TEST = { + 'url': 'https://www.minds.com/groups/profile/785582576369672204/feed/videos', + 'info_dict': { + 'id': '785582576369672204', + 'title': 'Cooking Videos', + }, + 'playlist_mincount': 1, + } From 26499ba823ecba99e18cf5cc76e001f4bbbe4293 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 17 Jan 2021 17:35:10 +0100 Subject: [PATCH 16/79] [aljazeera] fix extraction(closes #20911)(closes #27779) --- youtube_dl/extractor/aljazeera.py | 41 ++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/aljazeera.py 
b/youtube_dl/extractor/aljazeera.py index c68be3134..c4f915a3c 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -1,13 +1,16 @@ from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?Pprogram/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P[^/?&#]+)' _TESTS = [{ - 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', + 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', 'info_dict': { 'id': '3792260579001', 'ext': 'mp4', @@ -20,14 +23,34 @@ class AlJazeeraIE(InfoExtractor): 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', }, { - 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', + 'only_matching': True, + }, { + 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _real_extract(self, url): - program_name = self._match_id(url) - webpage = self._download_webpage(url, program_name) - brightcove_id = self._search_regex( - r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + post_type, name = re.match(self._VALID_URL, url).groups() + post_type = { + 'features': 'post', + 'program': 'episode', + 'videos': 'video', + }[post_type.split('/')[0]] + video = 
self._download_json( + 'https://www.aljazeera.com/graphql', name, query={ + 'operationName': 'SingleArticleQuery', + 'variables': json.dumps({ + 'name': name, + 'postType': post_type, + }), + }, headers={ + 'wp-site': 'aje', + })['data']['article']['video'] + video_id = video['id'] + account_id = video.get('accountId') or '665003303001' + player_id = video.get('playerId') or 'BkeSH5BDb' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) From f9201cef58c2ef6bd99315ea06b57d0c69a83f59 Mon Sep 17 00:00:00 2001 From: Brian Marks Date: Mon, 18 Jan 2021 09:47:06 -0500 Subject: [PATCH 17/79] [americastestkitchen] Improve metadata extraction for ATK episodes (#27860) --- youtube_dl/extractor/americastestkitchen.py | 30 +++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index e20f00fc3..7d2c375c4 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -6,8 +6,10 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, try_get, unified_strdate, + unified_timestamp, ) @@ -22,8 +24,8 @@ class AmericasTestKitchenIE(InfoExtractor): 'ext': 'mp4', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', - 'timestamp': 1523664000, - 'upload_date': '20180414', + 'timestamp': 1523318400, + 'upload_date': '20180410', 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, @@ -33,6 +35,27 @@ class AmericasTestKitchenIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above) + 'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner', + 'md5': '06451608c57651e985a498e69cec17e5', + 'info_dict': { + 'id': 
'5fbe8c61bda2010001c6763b', + 'title': 'Simple Chicken Dinner', + 'ext': 'mp4', + 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', + 'thumbnail': r're:^https?://', + 'timestamp': 1610755200, + 'upload_date': '20210116', + 'release_date': '20210116', + 'series': "America's Test Kitchen", + 'season_number': 21, + 'episode': 'Simple Chicken Dinner', + 'episode_number': 3, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, @@ -60,7 +83,10 @@ class AmericasTestKitchenIE(InfoExtractor): 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', 'description': clean_html(video.get('description')), + 'timestamp': unified_timestamp(video.get('publishDate')), 'release_date': unified_strdate(video.get('publishDate')), + 'episode_number': int_or_none(episode.get('number')), + 'season_number': int_or_none(episode.get('season')), 'series': try_get(episode, lambda x: x['show']['title']), 'episode': episode.get('title'), } From 1dd12708c2042e5d78887e4c5026d6196cc02bb2 Mon Sep 17 00:00:00 2001 From: DrWursterich <31037782+DrWursterich@users.noreply.github.com> Date: Tue, 19 Jan 2021 10:21:37 +0100 Subject: [PATCH 18/79] [9gag] Fix Extraction (#23022) --- youtube_dl/extractor/ninegag.py | 200 +++++++++++++++++++------------- 1 file changed, 122 insertions(+), 78 deletions(-) diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index dc6a27d36..3753bc0a2 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -3,102 +3,146 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + determine_ext, + url_or_none, + int_or_none, + float_or_none, + ExtractorError +) class NineGagIE(InfoExtractor): IE_NAME = '9gag' - _VALID_URL = 
r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P[a-zA-Z0-9]+)(?:/(?P[^?#/]+))?' + _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome', + 'url': 'https://9gag.com/gag/an5Qz5b', 'info_dict': { - 'id': 'kXzwOKyGlSA', - 'ext': 'mp4', - 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)', - 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome', - 'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA', - 'uploader': 'CompilationChannel', - 'upload_date': '20131110', - 'view_count': int, - }, - 'add_ie': ['Youtube'], + 'id': 'an5Qz5b', + 'ext': 'webm', + 'title': 'Dogs playing tetherball', + 'upload_date': '20191108', + 'timestamp': 1573243994, + 'age_limit': 0, + 'categories': [ + 'Wholesome' + ], + 'tags': [ + 'Dog' + ] + } }, { - 'url': 'http://9gag.com/tv/p/aKolP3', + 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { - 'id': 'aKolP3', - 'ext': 'mp4', - 'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video', - 'description': "I just saw more in 1 minute than I've seen in 1 year. 
This guy's video is epic!!", - 'uploader_id': 'rickmereki', - 'uploader': 'Rick Mereki', - 'upload_date': '20110803', - 'view_count': int, - }, - 'add_ie': ['Vimeo'], - }, { - 'url': 'http://9gag.com/tv/p/KklwM', - 'only_matching': True, - }, { - 'url': 'http://9gag.tv/p/Kk2X5', - 'only_matching': True, - }, { - 'url': 'http://9gag.com/tv/embed/a5Dmvl', - 'only_matching': True, + 'id': 'ae5Ag7B', + 'ext': 'webm', + 'title': 'Capybara Agility Training', + 'upload_date': '20191108', + 'timestamp': 1573237208, + 'age_limit': 0, + 'categories': [ + 'Awesome' + ], + 'tags': [ + 'Weimaraner', + 'American Pit Bull Terrier' + ] + } }] - _EXTERNAL_VIDEO_PROVIDER = { - '1': { - 'url': '%s', - 'ie_key': 'Youtube', - }, - '2': { - 'url': 'http://player.vimeo.com/video/%s', - 'ie_key': 'Vimeo', - }, - '3': { - 'url': 'http://instagram.com/p/%s', - 'ie_key': 'Instagram', - }, - '4': { - 'url': 'http://vine.co/v/%s', - 'ie_key': 'Vine', - }, + _EXTERNAL_VIDEO_PROVIDERS = { + 'Youtube': 'https://youtube.com/watch?v=%s' } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + rawJsonData = self._search_regex( + r'window._config\s*=\s*JSON.parse\(["\']({.+?})["\']\);', + webpage, + 'data') + rawJsonData = rawJsonData.replace('\\"', '"').replace('\\\\/', '/') + data = self._parse_json(rawJsonData, video_id)['data']['post'] - webpage = self._download_webpage(url, display_id) + if data['type'] == 'Video': + vid = data['video']['id'] + ie_key = data['video']['source'].capitalize() + return { + '_type': 'url_transparent', + 'url': self._EXTERNAL_VIDEO_PROVIDERS[ie_key] % vid, + 'ie_key': ie_key, + 'id': vid, + 'duration': data['video'].get('duration'), + 'start_time': data['video'].get('startTs') + } - post_view = self._parse_json( - self._search_regex( - 
r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost', - webpage, 'post view'), - display_id) + if data['type'] == 'EmbedVideo': + vid = data['video']['id'] + ie_key = data['video']['source'].capitalize() + return { + '_type': 'url_transparent', + 'url': data['video']['embedUrl'], + #'ie_key': vid, + 'start_time': data['video'].get('startTs') + } - ie_key = None - source_url = post_view.get('sourceUrl') - if not source_url: - external_video_id = post_view['videoExternalId'] - external_video_provider = post_view['videoExternalProvider'] - source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id - ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key'] - title = post_view['title'] - description = post_view.get('description') - view_count = str_to_int(post_view.get('externalView')) - thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w') + if data['type'] != 'Animated': + raise ExtractorError( + 'The given url does not contain a video', + expected=True) + + duration = None + formats = [] + thumbnails = [] + for key in data['images']: + image = data['images'][key] + if 'duration' in image and duration is None: + duration = int_or_none(image['duration']) + url = url_or_none(image.get('url')) + if url == None: + continue + ext = determine_ext(url) + if ext == 'jpg' or ext == 'png': + thumbnail = { + 'url': url, + 'width': float_or_none(image.get('width')), + 'height': float_or_none(image.get('height')) + } + thumbnails.append(thumbnail) + elif ext == 'webm' or ext == 'mp4': + formats.append({ + 'format_id': re.sub(r'.*_([^\.]+).(.*)', r'\1_\2', url), + 'ext': ext, + 'url': url, + 'width': float_or_none(image.get('width')), + 'height': float_or_none(image.get('height')) + }) + section = None + postSection = data.get('postSection') + if postSection != None and 'name' in postSection: + section = re.sub(r'\\[^\\]{5}', 
'', postSection['name']) + age_limit = int_or_none(data.get('nsfw')) + if age_limit != None: + age_limit = age_limit * 18 + tags = None + if 'tags' in data: + tags = [] + for tag in data.get('tags') or []: + tags.append(tag.get('key')) return { - '_type': 'url_transparent', - 'url': source_url, - 'ie_key': ie_key, 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'view_count': view_count, - 'thumbnail': thumbnail, + 'title': data['title'], + 'timestamp': int_or_none(data.get('creationTs')), + 'duration': duration, + 'formats': formats, + 'thumbnails': thumbnails, + 'like_count': int_or_none(data.get('upVoteCount')), + 'dislike_count': int_or_none(data.get('downVoteCount')), + 'comment_count': int_or_none(data.get('commentsCount')), + 'age_limit': age_limit, + 'categories': [section], + 'tags': tags, + 'is_live': False } From 54856480d7bac670c9d571d4191f5f35aadc5270 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jan 2021 10:23:02 +0100 Subject: [PATCH 19/79] [ninegag] improve extraction --- youtube_dl/extractor/ninegag.py | 189 ++++++++++++++------------------ 1 file changed, 83 insertions(+), 106 deletions(-) diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index 3753bc0a2..440f865bc 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -1,148 +1,125 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( determine_ext, - url_or_none, + ExtractorError, int_or_none, - float_or_none, - ExtractorError + try_get, + url_or_none, ) class NineGagIE(InfoExtractor): IE_NAME = '9gag' - _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[^/?&#]+)' - _TESTS = [{ - 'url': 'https://9gag.com/gag/an5Qz5b', - 'info_dict': { - 'id': 'an5Qz5b', - 'ext': 'webm', - 'title': 'Dogs playing tetherball', - 'upload_date': '20191108', - 'timestamp': 
1573243994, - 'age_limit': 0, - 'categories': [ - 'Wholesome' - ], - 'tags': [ - 'Dog' - ] - } - }, { + _TEST = { 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { 'id': 'ae5Ag7B', - 'ext': 'webm', + 'ext': 'mp4', 'title': 'Capybara Agility Training', 'upload_date': '20191108', 'timestamp': 1573237208, - 'age_limit': 0, - 'categories': [ - 'Awesome' - ], - 'tags': [ - 'Weimaraner', - 'American Pit Bull Terrier' - ] + 'categories': ['Awesome'], + 'tags': ['Weimaraner', 'American Pit Bull Terrier'], + 'duration': 44, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, } - }] - - _EXTERNAL_VIDEO_PROVIDERS = { - 'Youtube': 'https://youtube.com/watch?v=%s' } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - rawJsonData = self._search_regex( - r'window._config\s*=\s*JSON.parse\(["\']({.+?})["\']\);', - webpage, - 'data') - rawJsonData = rawJsonData.replace('\\"', '"').replace('\\\\/', '/') - data = self._parse_json(rawJsonData, video_id)['data']['post'] + post_id = self._match_id(url) + post = self._download_json( + 'https://9gag.com/v1/post', post_id, query={ + 'id': post_id + })['data']['post'] - if data['type'] == 'Video': - vid = data['video']['id'] - ie_key = data['video']['source'].capitalize() - return { - '_type': 'url_transparent', - 'url': self._EXTERNAL_VIDEO_PROVIDERS[ie_key] % vid, - 'ie_key': ie_key, - 'id': vid, - 'duration': data['video'].get('duration'), - 'start_time': data['video'].get('startTs') - } - - if data['type'] == 'EmbedVideo': - vid = data['video']['id'] - ie_key = data['video']['source'].capitalize() - return { - '_type': 'url_transparent', - 'url': data['video']['embedUrl'], - #'ie_key': vid, - 'start_time': data['video'].get('startTs') - } - - if data['type'] != 'Animated': + if post.get('type') != 'Animated': raise ExtractorError( 'The given url does not contain a video', expected=True) + title = post['title'] + duration = None formats = [] thumbnails = 
[] - for key in data['images']: - image = data['images'][key] - if 'duration' in image and duration is None: - duration = int_or_none(image['duration']) - url = url_or_none(image.get('url')) - if url == None: + for key, image in (post.get('images') or {}).items(): + image_url = url_or_none(image.get('url')) + if not image_url: continue - ext = determine_ext(url) - if ext == 'jpg' or ext == 'png': - thumbnail = { - 'url': url, - 'width': float_or_none(image.get('width')), - 'height': float_or_none(image.get('height')) - } - thumbnails.append(thumbnail) - elif ext == 'webm' or ext == 'mp4': - formats.append({ - 'format_id': re.sub(r'.*_([^\.]+).(.*)', r'\1_\2', url), + ext = determine_ext(image_url) + image_id = key.strip('image') + common = { + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } + if ext in ('jpg', 'png'): + webp_url = image.get('webpUrl') + if webp_url: + t = common.copy() + t.update({ + 'id': image_id + '-webp', + 'url': webp_url, + }) + thumbnails.append(t) + common.update({ + 'id': image_id, 'ext': ext, - 'url': url, - 'width': float_or_none(image.get('width')), - 'height': float_or_none(image.get('height')) }) - section = None - postSection = data.get('postSection') - if postSection != None and 'name' in postSection: - section = re.sub(r'\\[^\\]{5}', '', postSection['name']) - age_limit = int_or_none(data.get('nsfw')) - if age_limit != None: - age_limit = age_limit * 18 + thumbnails.append(common) + elif ext in ('webm', 'mp4'): + if not duration: + duration = int_or_none(image.get('duration')) + common['acodec'] = 'none' if image.get('hasAudio') == 0 else None + for vcodec in ('vp8', 'vp9', 'h265'): + c_url = image.get(vcodec + 'Url') + if not c_url: + continue + c_f = common.copy() + c_f.update({ + 'format_id': image_id + '-' + vcodec, + 'url': c_url, + 'vcodec': vcodec, + }) + formats.append(c_f) + common.update({ + 'ext': ext, + 'format_id': image_id, + }) + formats.append(common) + 
self._sort_formats(formats) + + section = try_get(post, lambda x: x['postSection']['name']) + tags = None - if 'tags' in data: + post_tags = post.get('tags') + if post_tags: tags = [] - for tag in data.get('tags') or []: - tags.append(tag.get('key')) + for tag in post_tags: + tag_key = tag.get('key') + if not tag_key: + continue + tags.append(tag_key) + + get_count = lambda x: int_or_none(post.get(x + 'Count')) return { - 'id': video_id, - 'title': data['title'], - 'timestamp': int_or_none(data.get('creationTs')), + 'id': post_id, + 'title': title, + 'timestamp': int_or_none(post.get('creationTs')), 'duration': duration, 'formats': formats, 'thumbnails': thumbnails, - 'like_count': int_or_none(data.get('upVoteCount')), - 'dislike_count': int_or_none(data.get('downVoteCount')), - 'comment_count': int_or_none(data.get('commentsCount')), - 'age_limit': age_limit, - 'categories': [section], + 'like_count': get_count('upVote'), + 'dislike_count': get_count('downVote'), + 'comment_count': get_count('comments'), + 'age_limit': 18 if post.get('nsfw') == 1 else None, + 'categories': [section] if section else None, 'tags': tags, - 'is_live': False } From 015e19b350a4bb7868008fb9df9092f4a60a5f00 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jan 2021 11:54:39 +0100 Subject: [PATCH 20/79] [lbry] unescape lbry uri(closes #27872) --- youtube_dl/extractor/lbry.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 41cc245eb..413215a99 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -5,7 +5,10 @@ import functools import json from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( determine_ext, ExtractorError, @@ -131,6 +134,9 @@ class LBRYIE(LBRYBaseIE): }, { 'url': 'https://lbry.tv/$/download/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', 
'only_matching': True, + }, { + 'url': 'https://lbry.tv/@lacajadepandora:a/TRUMP-EST%C3%81-BIEN-PUESTO-con-Pilar-Baselga,-Carlos-Senra,-Luis-Palacios-(720p_30fps_H264-192kbit_AAC):1', + 'only_matching': True, }] def _real_extract(self, url): @@ -139,6 +145,7 @@ class LBRYIE(LBRYBaseIE): display_id = display_id.split('/', 2)[-1].replace('/', ':') else: display_id = display_id.replace(':', '#') + display_id = compat_urllib_parse_unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') result_value = result['value'] From bc7c8f3d4ea6cb98fa62a5fe457046dd4e9b5379 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jan 2021 14:47:39 +0100 Subject: [PATCH 21/79] [yahoo] fix single video extraction --- youtube_dl/extractor/yahoo.py | 80 +++++++++++++++++------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index e4615376c..a17b10d6e 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -177,46 +177,9 @@ class YahooIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - url, country, display_id = re.match(self._VALID_URL, url).groups() - if not country: - country = 'us' - else: - country = country.split('-')[0] - api_base = 'https://%s.yahoo.com/_td/api/resource/' % country - - for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]): - content = self._download_json( - api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid, - display_id, 'Downloading content JSON metadata', fatal=i == 1) - if content: - item = content['items'][0] - break - - if item.get('type') != 'video': - entries = [] - - cover = item.get('cover') or {} - if cover.get('type') == 'yvideo': - cover_url = cover.get('url') - if cover_url: - entries.append(self.url_result( - cover_url, 'Yahoo', cover.get('uuid'))) - - for e in item.get('body', []): - if e.get('type') == 'videoIframe': - iframe_url = 
e.get('url') - if not iframe_url: - continue - entries.append(self.url_result(iframe_url)) - - return self.playlist_result( - entries, item.get('uuid'), - item.get('title'), item.get('summary')) - - video_id = item['uuid'] + def _extract_yahoo_video(self, video_id, country): video = self._download_json( - api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id, + 'https://%s.yahoo.com/_td/api/resource/VideoService.videos;view=full;video_ids=["%s"]' % (country, video_id), video_id, 'Downloading video JSON metadata')[0] title = video['title'] @@ -298,7 +261,6 @@ class YahooIE(InfoExtractor): 'id': video_id, 'title': self._live_title(title) if is_live else title, 'formats': formats, - 'display_id': display_id, 'thumbnails': thumbnails, 'description': clean_html(video.get('description')), 'timestamp': parse_iso8601(video.get('publish_time')), @@ -311,6 +273,44 @@ class YahooIE(InfoExtractor): 'episode_number': int_or_none(series_info.get('episode_number')), } + def _real_extract(self, url): + url, country, display_id = re.match(self._VALID_URL, url).groups() + if not country: + country = 'us' + else: + country = country.split('-')[0] + + item = self._download_json( + 'https://%s.yahoo.com/caas/content/article' % country, display_id, + 'Downloading content JSON metadata', query={ + 'url': url + })['items'][0]['data']['partnerData'] + + if item.get('type') != 'video': + entries = [] + + cover = item.get('cover') or {} + if cover.get('type') == 'yvideo': + cover_url = cover.get('url') + if cover_url: + entries.append(self.url_result( + cover_url, 'Yahoo', cover.get('uuid'))) + + for e in (item.get('body') or []): + if e.get('type') == 'videoIframe': + iframe_url = e.get('url') + if not iframe_url: + continue + entries.append(self.url_result(iframe_url)) + + return self.playlist_result( + entries, item.get('uuid'), + item.get('title'), item.get('summary')) + + info = self._extract_yahoo_video(item['uuid'], country) + info['display_id'] = display_id + return 
info + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' From b79977fb6b1b2cea0231e5e25e201db05c0d1dba Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jan 2021 14:49:45 +0100 Subject: [PATCH 22/79] [aol] add support for yahoo videos(closes #26650) --- youtube_dl/extractor/aol.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index e87994a6a..f6ecb8438 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .yahoo import YahooIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -15,9 +15,9 @@ from ..utils import ( ) -class AolIE(InfoExtractor): +class AolIE(YahooIE): IE_NAME = 'aol.com' - _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P[0-9a-f]+)' + _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' _TESTS = [{ # video with 5min ID @@ -76,10 +76,16 @@ class AolIE(InfoExtractor): }, { 'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/', 'only_matching': True, + }, { + # Yahoo video + 'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + if '-' in video_id: + return self._extract_yahoo_video(video_id, 'us') response = self._download_json( 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, From 62acf5fa2c23d2eb52fb1dd07804352116bba12f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 19 Jan 2021 22:43:59 +0100 Subject: [PATCH 23/79] [trovo] Add new extractor(closes #26125) --- youtube_dl/extractor/extractors.py | 4 + 
youtube_dl/extractor/trovo.py | 193 +++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+) create mode 100644 youtube_dl/extractor/trovo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 29b0e615e..536b184bc 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1238,6 +1238,10 @@ from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trovo import ( + TrovoIE, + TrovoVodIE, +) from .trunews import TruNewsIE from .trutv import TruTVIE from .tube8 import Tube8IE diff --git a/youtube_dl/extractor/trovo.py b/youtube_dl/extractor/trovo.py new file mode 100644 index 000000000..43745213d --- /dev/null +++ b/youtube_dl/extractor/trovo.py @@ -0,0 +1,193 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + try_get, +) + + +class TrovoBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/' + + def _extract_streamer_info(self, data): + streamer_info = data.get('streamerInfo') or {} + username = streamer_info.get('userName') + return { + 'uploader': streamer_info.get('nickName'), + 'uploader_id': str_or_none(streamer_info.get('uid')), + 'uploader_url': 'https://trovo.live/' + username if username else None, + } + + +class TrovoIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?!(?:clip|video)/)(?P[^/?&#]+)' + + def _real_extract(self, url): + username = self._match_id(url) + live_info = self._download_json( + 'https://gql.trovo.live/', username, query={ + 'query': '''{ + getLiveInfo(params: {userName: "%s"}) { + isLive + programInfo { + coverUrl + id + streamInfo { + desc + playUrl + } + title + } + streamerInfo { + nickName + uid + userName + } + } +}''' % username, + })['data']['getLiveInfo'] + if 
live_info.get('isLive') == 0: + raise ExtractorError('%s is offline' % username, expected=True) + program_info = live_info['programInfo'] + program_id = program_info['id'] + title = self._live_title(program_info['title']) + + formats = [] + for stream_info in (program_info.get('streamInfo') or []): + play_url = stream_info.get('playUrl') + if not play_url: + continue + format_id = stream_info.get('desc') + formats.append({ + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'url': play_url, + }) + self._sort_formats(formats) + + info = { + 'id': program_id, + 'title': title, + 'formats': formats, + 'thumbnail': program_info.get('coverUrl'), + 'is_live': True, + } + info.update(self._extract_streamer_info(live_info)) + return info + + +class TrovoVodIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', + 'info_dict': { + 'id': 'ltv-100095501_100095501_1609596043', + 'ext': 'mp4', + 'title': 'Spontaner 12 Stunden Stream! 
- Ok Boomer!', + 'uploader': 'Exsl', + 'timestamp': 1609640305, + 'upload_date': '20210103', + 'uploader_id': '100095501', + 'duration': 43977, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'comments': 'mincount:8', + 'categories': ['Grand Theft Auto V'], + }, + }, { + 'url': 'https://trovo.live/clip/lc-5285890810184026005', + 'only_matching': True, + }] + + def _real_extract(self, url): + vid = self._match_id(url) + resp = self._download_json( + 'https://gql.trovo.live/', vid, data=json.dumps([{ + 'query': '''{ + batchGetVodDetailInfo(params: {vids: ["%s"]}) { + VodDetailInfos + } +}''' % vid, + }, { + 'query': '''{ + getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) { + commentList { + author { + nickName + uid + } + commentID + content + createdAt + parentID + } + } +}''' % vid, + }]).encode(), headers={ + 'Content-Type': 'application/json', + }) + vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid] + vod_info = vod_detail_info['vodInfo'] + title = vod_info['title'] + + language = vod_info.get('languageName') + formats = [] + for play_info in (vod_info.get('playInfos') or []): + play_url = play_info.get('playUrl') + if not play_url: + continue + format_id = play_info.get('desc') + formats.append({ + 'ext': 'mp4', + 'filesize': int_or_none(play_info.get('fileSize')), + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'language': language, + 'protocol': 'm3u8_native', + 'tbr': int_or_none(play_info.get('bitrate')), + 'url': play_url, + }) + self._sort_formats(formats) + + category = vod_info.get('categoryName') + get_count = lambda x: int_or_none(vod_info.get(x + 'Num')) + + comment_list = try_get(resp, lambda x: x[1]['data']['getCommentList']['commentList'], list) or [] + comments = [] + for comment in comment_list: + content = comment.get('content') + if not content: + continue + author = comment.get('author') or {} + parent = 
comment.get('parentID') + comments.append({ + 'author': author.get('nickName'), + 'author_id': str_or_none(author.get('uid')), + 'id': str_or_none(comment.get('commentID')), + 'text': content, + 'timestamp': int_or_none(comment.get('createdAt')), + 'parent': 'root' if parent == 0 else str_or_none(parent), + }) + + info = { + 'id': vid, + 'title': title, + 'formats': formats, + 'thumbnail': vod_info.get('coverUrl'), + 'timestamp': int_or_none(vod_info.get('publishTs')), + 'duration': int_or_none(vod_info.get('duration')), + 'view_count': get_count('watch'), + 'like_count': get_count('like'), + 'comment_count': get_count('comment'), + 'comments': comments, + 'categories': [category] if category else None, + } + info.update(self._extract_streamer_info(vod_detail_info)) + return info From 657221c81d966115cf6ac263805168e49a48abce Mon Sep 17 00:00:00 2001 From: Brian Marks Date: Thu, 21 Jan 2021 10:46:29 -0500 Subject: [PATCH 24/79] [americastestkitchen] Add support for downloading entire seasons (#27861) --- youtube_dl/extractor/americastestkitchen.py | 67 +++++++++++++++++++++ youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 7d2c375c4..35d3220c1 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -90,3 +91,69 @@ class AmericasTestKitchenIE(InfoExtractor): 'series': try_get(episode, lambda x: x['show']['title']), 'episode': episode.get('title'), } + + +class AmericasTestKitchenSeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|cookscountry)\.com/episodes/browse/season_(?P\d+)' + _TESTS = [{ + # ATK Season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', + 'info_dict': { + 'id': 
'season-1', + 'title': 'Season 1', + }, + 'playlist_count': 13, + }, { + # Cooks Country Season + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'info_dict': { + 'id': 'season-12', + 'title': 'Season 12', + }, + 'playlist_count': 13, + }, { + # Multi-digit season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_20', + 'only_matching': True, + }] + + def _real_extract(self, url): + show_name, season = re.match(self._VALID_URL, url).groups() + + slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + + filters = [ + 'search_season_list:Season %s' % season, + 'search_document_klass:episode', + 'search_show_slug:%s' % slug, + ] + + season_search = self._download_json( + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_atk_season_desc_production', + season, headers={ + 'Origin': 'https://www.%s.com' % show_name, + 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', + 'X-Algolia-Application-Id': 'Y1FNZXUI30', + }, query={ + 'facetFilters': json.dumps(filters), + 'attributesToRetrieve': 'search_url', + 'attributesToHighlight': '', + # ATK and CCO generally have less than 26 episodes per season + 'hitsPerPage': '100', + }) + + entries = [ + self.url_result( + 'https://www.%s.com%s' % (show_name, episode['search_url']), + 'AmericasTestKitchen', + try_get(episode, lambda e: e['objectID'].split('_')[-1])) + for episode in season_search['hits'] + if 'search_url' in episode and episode['search_url'] + ] + + return { + '_type': 'playlist', + 'id': 'season-%s' % season, + 'title': 'Season %s' % season, + 'entries': sorted(entries, key=lambda e: e.get('id')), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 536b184bc..52b8db0f9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -42,7 +42,10 @@ from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .amara import AmaraIE from .amcnetworks import AMCNetworksIE 
-from .americastestkitchen import AmericasTestKitchenIE +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) from .animeondemand import AnimeOnDemandIE from .anvato import AnvatoIE from .aol import AolIE From cff72b4cc0330f6d635353083eea7e570036b1ea Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 21 Jan 2021 16:47:55 +0100 Subject: [PATCH 25/79] [americastestkitchen] improve season extraction --- youtube_dl/extractor/americastestkitchen.py | 62 ++++++++++----------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 35d3220c1..be960c0f9 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -99,7 +99,7 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', 'info_dict': { - 'id': 'season-1', + 'id': 'season_1', 'title': 'Season 1', }, 'playlist_count': 13, @@ -107,53 +107,53 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): # Cooks Country Season 'url': 'https://www.cookscountry.com/episodes/browse/season_12', 'info_dict': { - 'id': 'season-12', + 'id': 'season_12', 'title': 'Season 12', }, 'playlist_count': 13, - }, { - # Multi-digit season - 'url': 'https://www.americastestkitchen.com/episodes/browse/season_20', - 'only_matching': True, }] def _real_extract(self, url): - show_name, season = re.match(self._VALID_URL, url).groups() + show_name, season_number = re.match(self._VALID_URL, url).groups() + season_number = int(season_number) slug = 'atk' if show_name == 'americastestkitchen' else 'cco' - filters = [ - 'search_season_list:Season %s' % season, - 'search_document_klass:episode', - 'search_show_slug:%s' % slug, - ] + season = 'Season %d' % season_number season_search = self._download_json( - 
'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_atk_season_desc_production', + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, season, headers={ 'Origin': 'https://www.%s.com' % show_name, 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ - 'facetFilters': json.dumps(filters), - 'attributesToRetrieve': 'search_url', + 'facetFilters': json.dumps([ + 'search_season_list:' + season, + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ]), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, 'attributesToHighlight': '', - # ATK and CCO generally have less than 26 episodes per season - 'hitsPerPage': '100', + 'hitsPerPage': 1000, }) - entries = [ - self.url_result( - 'https://www.%s.com%s' % (show_name, episode['search_url']), - 'AmericasTestKitchen', - try_get(episode, lambda e: e['objectID'].split('_')[-1])) - for episode in season_search['hits'] - if 'search_url' in episode and episode['search_url'] - ] + def entries(): + for episode in (season_search.get('hits') or []): + search_url = episode.get('search_url') + if not search_url: + continue + yield { + '_type': 'url', + 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'timestamp': unified_timestamp(episode.get('search_document_date')), + 'season_number': season_number, + 'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)), + 'ie_key': AmericasTestKitchenIE.ie_key(), + } - return { - '_type': 'playlist', - 'id': 'season-%s' % season, - 'title': 'Season %s' % season, - 'entries': sorted(entries, key=lambda e: e.get('id')), - } + return self.playlist_result( + entries(), 'season_%d' % season_number, season) From 8d286bd5b67004d9c5420f6e3f6b7f75d2ba6395 
Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 21 Jan 2021 17:20:32 +0100 Subject: [PATCH 26/79] [wat] fix format extraction(closes #27901) --- youtube_dl/extractor/wat.py | 55 +++++++++---------------------------- 1 file changed, 13 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 8ef3e0906..7214bfebf 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -1,12 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - ExtractorError, unified_strdate, HEADRequest, int_or_none, @@ -97,46 +94,20 @@ class WatIE(InfoExtractor): return red_url return None - def remove_bitrate_limit(manifest_url): - return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url) - formats = [] - try: - alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id) - m3u8_url = manifest_urls.get('hls') - if m3u8_url: - m3u8_url = remove_bitrate_limit(m3u8_url) - for m3u8_alt_url in alt_urls(m3u8_url): - formats.extend(self._extract_m3u8_formats( - m3u8_alt_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - mpd_url = manifest_urls.get('mpd') - if mpd_url: - mpd_url = remove_bitrate_limit(mpd_url) - for mpd_alt_url in alt_urls(mpd_url): - formats.extend(self._extract_mpd_formats( - mpd_alt_url, video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - except ExtractorError: - abr = 64 - for vbr, width, height in self._FORMATS: - tbr = vbr + abr - format_id = 'http-%s' % tbr - fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], 
video_id[-2:], video_id, video_id, vbr) - if self._is_valid_url(fmt_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': fmt_url, - 'vbr': vbr, - 'abr': abr, - 'width': width, - 'height': height, - }) + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id) + m3u8_url = manifest_urls.get('hls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + mpd_url = manifest_urls.get('mpd') + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') upload_date = unified_strdate(date_diffusion) if date_diffusion else None From 3bb7769c405e02fc1078252cafbbd982913fe50c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 21 Jan 2021 17:22:30 +0100 Subject: [PATCH 27/79] [wat] remove unused variable --- youtube_dl/extractor/wat.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 7214bfebf..f6940b371 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -43,15 +43,6 @@ class WatIE(InfoExtractor): }, ] - _FORMATS = ( - (200, 416, 234), - (400, 480, 270), - (600, 640, 360), - (1200, 640, 360), - (1800, 960, 540), - (2500, 1280, 720), - ) - def _real_extract(self, url): video_id = self._match_id(url) video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) From fa8f6d858064cf07b9167b73647545b3007c6b21 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 21 Jan 2021 23:53:09 +0100 Subject: [PATCH 28/79] [comedycentral] fix extraction(closes #27905) --- youtube_dl/extractor/comedycentral.py | 141 +++++--------------------- youtube_dl/extractor/extractors.py | 3 - 
youtube_dl/extractor/mtv.py | 23 ++--- youtube_dl/extractor/spike.py | 15 +-- 4 files changed, 37 insertions(+), 145 deletions(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index d08b909a6..1bfa912be 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,142 +1,51 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from .common import InfoExtractor class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes))) - /(?P.*)''' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ - 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', - 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', + 'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike', + 'md5': 'b8acb347177c680ff18a292aa2166f80', 'info_dict': { - 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', + 'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025', 'ext': 'mp4', - 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', - 'description': 'After a certain point, breastfeeding becomes c**kblocking.', - 'timestamp': 1376798400, - 'upload_date': '20130818', + 'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike', + 'description': 'md5:5334307c433892b85f4f5e5ac9ef7498', + 'timestamp': 1598670000, + 'upload_date': '20200829', }, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', + 'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314', 
'only_matching': True, - }] - - -class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (?:full-episodes|shows(?=/[^/]+/full-episodes)) - /(?P<id>[^?]+)''' - _FEED_URL = 'http://comedycentral.com/feeds/mrss/' - - _TESTS = [{ - 'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028', - 'info_dict': { - 'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."', - 'title': 'November 28, 2016 - Ryan Speedo Green', - }, - 'playlist_count': 4, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') - videos_info = self._get_videos_info(mgid) - return videos_info - - -class ToshIE(MTVServicesInfoExtractor): - IE_DESC = 'Tosh.0' - _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' - _FEED_URL = 'http://tosh.cc.com/feeds/mrss' - - _TESTS = [{ - 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', - 'info_dict': { - 'description': 'Tosh asked fans to share their summer plans.', - 'title': 'Twitter Users Share Summer Plans', - }, - 'playlist': [{ - 'md5': 'f269e88114c1805bb6d7653fecea9e06', - 'info_dict': { - 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', - 'ext': 'mp4', - 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', - 'description': 'Tosh asked fans to share their summer plans.', - 'thumbnail': r're:^https?://.*\.jpg', - # It's really reported to be published on year 2077 - 'upload_date': '20770610', - 'timestamp': 3390510600, - 'subtitles': { - 'en': 
'mincount:3', - }, - }, - }] - }, { - 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', + 'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate', 'only_matching': True, }] class ComedyCentralTVIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})' _TESTS = [{ - 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1', 'info_dict': { - 'id': 'local_playlist-f99b626bdfe13568579a', - 'ext': 'flv', - 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + 'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Josh Investigates', + 'description': 'Steht uns das Ende der Welt bevor?', }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', - 'only_matching': True, - }, { - 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', - 'only_matching': True, }] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + _GEO_COUNTRIES = ['DE'] - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - mrss_url = self._search_regex( - r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'mrss url', group='url') - - return self._get_videos_info_from_url(mrss_url, video_id) - - -class ComedyCentralShortnameIE(InfoExtractor): - _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$' - _TESTS = [{ - 'url': ':tds', - 'only_matching': True, - }, { - 'url': ':thedailyshow', - 'only_matching': True, - }, { - 'url': ':theopposition', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = 
self._match_id(url) - shortcut_map = { - 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes', + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'web.cc.tv', + 'ep': 'b9032c3a', + 'imageEp': 'web.cc.tv', + 'mgid': uri, } - return self.url_result(shortcut_map[video_id]) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 52b8db0f9..ef57f5556 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -235,11 +235,8 @@ from .cnn import ( ) from .coub import CoubIE from .comedycentral import ( - ComedyCentralFullEpisodesIE, ComedyCentralIE, - ComedyCentralShortnameIE, ComedyCentralTVIE, - ToshIE, ) from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import ( diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index df1034fc5..f5e30d22d 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -253,6 +253,10 @@ class MTVServicesInfoExtractor(InfoExtractor): return try_get(feed, lambda x: x['result']['data']['id'], compat_str) + @staticmethod + def _extract_child_with_type(parent, t): + return next(c for c in parent['children'] if c.get('type') == t) + def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf @@ -278,6 +282,13 @@ class MTVServicesInfoExtractor(InfoExtractor): if not mgid: mgid = self._extract_triforce_mgid(webpage) + if not mgid: + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self._extract_child_with_type(data, 'MainContainer') + video_player = self._extract_child_with_type(main_container, 'VideoPlayer') + mgid = 
video_player['props']['media']['video']['config']['uri'] + return mgid def _real_extract(self, url): @@ -349,18 +360,6 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] - @staticmethod - def extract_child_with_type(parent, t): - children = parent['children'] - return next(c for c in children if c.get('type') == t) - - def _extract_mgid(self, webpage): - data = self._parse_json(self._search_regex( - r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) - main_container = self.extract_child_with_type(data, 'MainContainer') - video_player = self.extract_child_with_type(main_container, 'VideoPlayer') - return video_player['props']['media']['video']['config']['uri'] - class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 4c5e3f7c2..5805f3d44 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -20,9 +20,6 @@ class BellatorIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): - return self._extract_triforce_mgid(webpage) - class ParamountNetworkIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' @@ -46,16 +43,6 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): def _get_feed_query(self, uri): return { 'arcEp': 'paramountnetwork.com', + 'imageEp': 'paramountnetwork.com', 'mgid': uri, } - - def _extract_mgid(self, webpage): - root_data = self._parse_json(self._search_regex( - r'window\.__DATA__\s*=\s*({.+})', - webpage, 'data'), None) - - def find_sub_data(data, data_type): - return next(c for c in data['children'] if c.get('type') == data_type) - - c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer') - return c['props']['media']['video']['config']['uri'] From 4542e3e55538a8e7115bde777e7ddbd781c2e446 Mon Sep 17 00:00:00 2001 From: aarubui 
<aarubui@users.noreply.github.com> Date: Fri, 22 Jan 2021 19:56:51 +1100 Subject: [PATCH 29/79] [njpwworld] fix extraction (#27890) --- youtube_dl/extractor/njpwworld.py | 54 ++++++++++++++++--------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index 025c5d249..3639d142f 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -6,30 +6,40 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - extract_attributes, get_element_by_class, urlencode_postdata, ) class NJPWWorldIE(InfoExtractor): - _VALID_URL = r'https?://njpwworld\.com/p/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://(front\.)?njpwworld\.com/p/(?P<id>[a-z0-9_]+)' IE_DESC = '新日本プロレスワールド' _NETRC_MACHINE = 'njpwworld' - _TEST = { + _TESTS = [{ 'url': 'http://njpwworld.com/p/s_series_00155_1_9/', 'info_dict': { 'id': 's_series_00155_1_9', 'ext': 'mp4', - 'title': '第9試合 ランディ・サベージ vs リック・スタイナー', + 'title': '闘強導夢2000 2000年1月4日 東京ドーム 第9試合 ランディ・サベージ VS リック・スタイナー', 'tags': list, }, 'params': { 'skip_download': True, # AES-encrypted m3u8 }, 'skip': 'Requires login', - } + }, { + 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', + 'info_dict': { + 'id': 's_series_00563_16_bs', + 'ext': 'mp4', + 'title': 'WORLD TAG LEAGUE 2020 & BEST OF THE SUPER Jr.27 2020年12月6日 福岡・福岡国際センター バックステージコメント(字幕あり)', + 'tags': ["福岡・福岡国際センター", "バックステージコメント", "2020", "20年代"], + }, + 'params': { + 'skip_download': True, + }, + }] _LOGIN_URL = 'https://front.njpwworld.com/auth/login' @@ -64,35 +74,27 @@ class NJPWWorldIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = [] - for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage): - player = extract_attributes(mobj.group(0)) - player_path = player.get('href') - if not player_path: - continue - kind = self._search_regex( - r'(low|high)$', player.get('class') or '', 
'kind', - default='low') + for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): + player_path = '/intent?id=%s&type=url' % vid player_url = compat_urlparse.urljoin(url, player_path) - player_page = self._download_webpage( - player_url, video_id, note='Downloading player page') - entries = self._parse_html5_media_entries( - player_url, player_page, video_id, m3u8_id='hls-%s' % kind, - m3u8_entry_protocol='m3u8_native') - kind_formats = entries[0]['formats'] - for f in kind_formats: - f['quality'] = 2 if kind == 'high' else 1 - formats.extend(kind_formats) + formats.append({ + 'url': player_url, + 'format_id': kind, + 'ext': 'mp4', + 'protocol': 'm3u8', + 'quality': 2 if kind == 'high' else 1, + }) self._sort_formats(formats) - post_content = get_element_by_class('post-content', webpage) + tag_block = get_element_by_class('tag-block', webpage) tags = re.findall( - r'<li[^>]+class="tag-[^"]+"><a[^>]*>([^<]+)</a></li>', post_content - ) if post_content else None + r'<a[^>]+class="tag-[^"]+"[^>]*>([^<]+)</a>', tag_block + ) if tag_block else None return { 'id': video_id, - 'title': self._og_search_title(webpage), + 'title': get_element_by_class('article-title', webpage) or self._og_search_title(webpage), 'formats': formats, 'tags': tags, } From 142c584063ec02406e636522fe11d0d2be22b299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 22 Jan 2021 23:51:22 +0700 Subject: [PATCH 30/79] Introduce --output-na-placeholder (closes #27896) --- README.md | 2 +- test/test_YoutubeDL.py | 15 +++++++++++---- youtube_dl/YoutubeDL.py | 7 ++++--- youtube_dl/__init__.py | 1 + youtube_dl/options.py | 4 ++++ 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 85fed6d3a..8c2569aaa 100644 --- a/README.md +++ b/README.md @@ -583,7 +583,7 @@ Available for the media that is a track or a part of a music album: - `disc_number` (numeric): Number 
of the disc or other physical medium the track belongs to - `release_year` (numeric): Year (YYYY) when the album was released -Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with `NA`. +Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default). For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `youtube-dl test video` and id `BaW_jenozKcj`, this will result in a `youtube-dl test video-BaW_jenozKcj.mp4` file created in the current directory. 
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 4d62ba145..a35effe0e 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -633,13 +633,20 @@ class TestYoutubeDL(unittest.TestCase): 'title2': '%PATH%', } - def fname(templ): - ydl = YoutubeDL({'outtmpl': templ}) + def fname(templ, na_placeholder='NA'): + params = {'outtmpl': templ} + if na_placeholder != 'NA': + params['outtmpl_na_placeholder'] = na_placeholder + ydl = YoutubeDL(params) return ydl.prepare_filename(info) self.assertEqual(fname('%(id)s.%(ext)s'), '1234.mp4') self.assertEqual(fname('%(id)s-%(width)s.%(ext)s'), '1234-NA.mp4') - # Replace missing fields with 'NA' - self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4') + NA_TEST_OUTTMPL = '%(uploader_date)s-%(width)d-%(id)s.%(ext)s' + # Replace missing fields with 'NA' by default + self.assertEqual(fname(NA_TEST_OUTTMPL), 'NA-NA-1234.mp4') + # Or by provided placeholder + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder='none'), 'none-none-1234.mp4') + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder=''), '--1234.mp4') self.assertEqual(fname('%(height)d.%(ext)s'), '1080.mp4') self.assertEqual(fname('%(height)6d.%(ext)s'), ' 1080.mp4') self.assertEqual(fname('%(height)-6d.%(ext)s'), '1080 .mp4') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index efd42fa63..ecac31f7a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -163,6 +163,7 @@ class YoutubeDL(object): simulate: Do not download the video files. format: Video format code. See options.py for more information. outtmpl: Template for output names. + outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. 
force_generic_extractor: Force downloader to use the generic extractor @@ -658,7 +659,7 @@ class YoutubeDL(object): template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) for k, v in template_dict.items() if v is not None and not isinstance(v, (list, tuple, dict))) - template_dict = collections.defaultdict(lambda: 'NA', template_dict) + template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) @@ -678,8 +679,8 @@ class YoutubeDL(object): # Missing numeric fields used together with integer presentation types # in format specification will break the argument substitution since - # string 'NA' is returned for missing fields. We will patch output - # template for missing fields to meet string presentation type. + # string NA placeholder is returned for missing fields. We will patch + # output template for missing fields to meet string presentation type. 
for numeric_field in self._NUMERIC_FIELDS: if numeric_field not in template_dict: # As of [1] format syntax is: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9a659fc65..e1bd67919 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -340,6 +340,7 @@ def _real_main(argv=None): 'format': opts.format, 'listformats': opts.listformats, 'outtmpl': outtmpl, + 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder, 'autonumber_size': opts.autonumber_size, 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3000ba41e..6b0c62e19 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -689,6 +689,10 @@ def parseOpts(overrideArguments=None): '-o', '--output', dest='outtmpl', metavar='TEMPLATE', help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) + filesystem.add_option( + '--output-na-placeholder', + dest='outtmpl_na_placeholder', metavar='PLACEHOLDER', default='NA', + help=('Placeholder value for unavailable meta fields in output filename template (default is "%default")')) filesystem.add_option( '--autonumber-size', dest='autonumber_size', metavar='NUMBER', type=int, From 5519bba3e14d05ccc8c7114e2f2909294c65a26e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jan 2021 00:00:53 +0700 Subject: [PATCH 31/79] [options] Clarify --extract-audio help string (closes #27878) --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 6b0c62e19..241cf110f 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -786,7 +786,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '-x', '--extract-audio', action='store_true', dest='extractaudio', default=False, - help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') + 
help='Convert video files to audio-only files (requires ffmpeg/avconv and ffprobe/avprobe)') postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x') From d8dab85419ea7e35bd234479abe751334f3e7116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Grosdidier?= <aurelien.grosdidier@gmail.com> Date: Fri, 22 Jan 2021 19:03:45 +0100 Subject: [PATCH 32/79] [franceculture] Fix extraction (closes #27891) (#27903) Co-authored-by: Sergey M. <dstftw@gmail.com> --- youtube_dl/extractor/franceculture.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index 306b45fc9..7e9ceabbc 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -20,7 +20,7 @@ class FranceCultureIE(InfoExtractor): 'title': 'Rendez-vous au pays des geeks', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', - 'timestamp': 1393642916, + 'timestamp': 1393700400, 'vcodec': 'none', } } @@ -36,12 +36,12 @@ class FranceCultureIE(InfoExtractor): </h1>| <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> ).*? 
- (<button[^>]+data-asset-source="[^"]+"[^>]+>) + (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) ''', webpage, 'video data')) - video_url = video_data['data-asset-source'] - title = video_data.get('data-asset-title') or self._og_search_title(webpage) + video_url = video_data.get('data-url') or video_data['data-asset-source'] + title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage) description = self._html_search_regex( r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', @@ -64,6 +64,6 @@ class FranceCultureIE(InfoExtractor): 'ext': ext, 'vcodec': 'none' if ext == 'mp3' else None, 'uploader': uploader, - 'timestamp': int_or_none(video_data.get('data-asset-created-date')), + 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')), 'duration': int_or_none(video_data.get('data-duration')), } From f08c31cf33beb0687c4df58b83e319ec8dfe03ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 23 Jan 2021 01:10:35 +0700 Subject: [PATCH 33/79] [franceculture] Make thumbnail optional (closes #18807) --- youtube_dl/extractor/franceculture.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index 7e9ceabbc..14f4cb489 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -11,7 +11,7 @@ from ..utils import ( class FranceCultureIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { 'id': 'rendez-vous-au-pays-des-geeks', @@ -23,7 +23,11 @@ class FranceCultureIE(InfoExtractor): 'timestamp': 1393700400, 'vcodec': 'none', } - } + }, { + # no thumbnail + 'url': 
'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -48,7 +52,7 @@ class FranceCultureIE(InfoExtractor): webpage, 'description', default=None) thumbnail = self._search_regex( r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"', - webpage, 'thumbnail', fatal=False) + webpage, 'thumbnail', default=None) uploader = self._html_search_regex( r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None) From dbf3fa8af67dfdab42d56fdc5f35610658bc2746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jan 2021 17:36:40 +0700 Subject: [PATCH 34/79] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 27f01c438..f09049fea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version <unreleased> + +Core +* Introduce --output-na-placeholder (#27896) + +Extractors +* [franceculture] Make thumbnail optional (#18807) +* [franceculture] Fix extraction (#27891, #27903) +* [njpwworld] Fix extraction (#27890) +* [comedycentral] Fix extraction (#27905) +* [wat] Fix format extraction (#27901) ++ [americastestkitchen:season] Add support for seasons (#27861) ++ [trovo] Add support for trovo.live (#26125) ++ [aol] Add support for yahoo videos (#26650) +* [yahoo] Fix single video extraction +* [lbry] Unescape lbry URI (#27872) +* [9gag] Fix and improve extraction (#23022) +* [americastestkitchen] Improve metadata extraction for ATK episodes (#27860) +* [aljazeera] Fix extraction (#20911, #27779) ++ [minds] Add support for minds.com (#17934) +* [ard] Fix title and description extraction (#27761) ++ [spotify] Add support for Spotify Podcasts (#27443) + + version 2021.01.16 Core From 
186cbaffb954f00c4ff1f58f9fe378c65b4ef87b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jan 2021 18:00:58 +0700 Subject: [PATCH 35/79] release 2021.01.24 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +- .../ISSUE_TEMPLATE/2_site_support_request.md | 4 +- .../ISSUE_TEMPLATE/3_site_feature_request.md | 4 +- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 +- ChangeLog | 2 +- README.md | 761 +++++++++--------- docs/supportedsites.md | 13 +- youtube_dl/version.py | 2 +- 9 files changed, 422 insertions(+), 380 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index aedcfa6b3..935806b5e 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.16** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.16 + [debug] youtube-dl version 2021.01.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5c0dfea4e..453a5e147 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.16** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 772147a75..07094c10d 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.16** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 2fcaa3a23..b234f8ccd 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.16** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.16 + [debug] youtube-dl version 2021.01.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index f1adfce8f..557e59ca5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.16. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.16** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index f09049fea..9b2f38b25 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2021.01.24 Core * Introduce --output-na-placeholder (#27896) diff --git a/README.md b/README.md index 8c2569aaa..94c34d89a 100644 --- a/README.md +++ b/README.md @@ -52,394 +52,431 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo youtube-dl [OPTIONS] URL [URL...] # OPTIONS - -h, --help Print this help text and exit - --version Print program version and exit - -U, --update Update this program to latest version. Make - sure that you have sufficient permissions - (run with sudo if needed) - -i, --ignore-errors Continue on download errors, for example to - skip unavailable videos in a playlist - --abort-on-error Abort downloading of further videos (in the - playlist or the command line) if an error - occurs - --dump-user-agent Display the current browser identification - --list-extractors List all supported extractors - --extractor-descriptions Output descriptions of all supported - extractors - --force-generic-extractor Force extraction to use the generic - extractor - --default-search PREFIX Use this prefix for unqualified URLs. For - example "gvsearch2:" downloads two videos - from google videos for youtube-dl "large - apple". Use the value "auto" to let - youtube-dl guess ("auto_warning" to emit a - warning when guessing). "error" just throws - an error. 
The default value "fixup_error" - repairs broken URLs, but emits an error if - this is not possible instead of searching. - --ignore-config Do not read configuration files. When given - in the global configuration file - /etc/youtube-dl.conf: Do not read the user - configuration in ~/.config/youtube- - dl/config (%APPDATA%/youtube-dl/config.txt - on Windows) - --config-location PATH Location of the configuration file; either - the path to the config or its containing - directory. - --flat-playlist Do not extract the videos of a playlist, - only list them. - --mark-watched Mark videos watched (YouTube only) - --no-mark-watched Do not mark videos watched (YouTube only) - --no-color Do not emit color codes in output + -h, --help Print this help text and exit + --version Print program version and exit + -U, --update Update this program to latest version. + Make sure that you have sufficient + permissions (run with sudo if needed) + -i, --ignore-errors Continue on download errors, for + example to skip unavailable videos in a + playlist + --abort-on-error Abort downloading of further videos (in + the playlist or the command line) if an + error occurs + --dump-user-agent Display the current browser + identification + --list-extractors List all supported extractors + --extractor-descriptions Output descriptions of all supported + extractors + --force-generic-extractor Force extraction to use the generic + extractor + --default-search PREFIX Use this prefix for unqualified URLs. + For example "gvsearch2:" downloads two + videos from google videos for youtube- + dl "large apple". Use the value "auto" + to let youtube-dl guess ("auto_warning" + to emit a warning when guessing). + "error" just throws an error. The + default value "fixup_error" repairs + broken URLs, but emits an error if this + is not possible instead of searching. + --ignore-config Do not read configuration files. 
When + given in the global configuration file + /etc/youtube-dl.conf: Do not read the + user configuration in + ~/.config/youtube-dl/config + (%APPDATA%/youtube-dl/config.txt on + Windows) + --config-location PATH Location of the configuration file; + either the path to the config or its + containing directory. + --flat-playlist Do not extract the videos of a + playlist, only list them. + --mark-watched Mark videos watched (YouTube only) + --no-mark-watched Do not mark videos watched (YouTube + only) + --no-color Do not emit color codes in output ## Network Options: - --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. - To enable SOCKS proxy, specify a proper - scheme. For example - socks5://127.0.0.1:1080/. Pass in an empty - string (--proxy "") for direct connection - --socket-timeout SECONDS Time to wait before giving up, in seconds - --source-address IP Client-side IP address to bind to - -4, --force-ipv4 Make all connections via IPv4 - -6, --force-ipv6 Make all connections via IPv6 + --proxy URL Use the specified HTTP/HTTPS/SOCKS + proxy. To enable SOCKS proxy, specify a + proper scheme. For example + socks5://127.0.0.1:1080/. Pass in an + empty string (--proxy "") for direct + connection + --socket-timeout SECONDS Time to wait before giving up, in + seconds + --source-address IP Client-side IP address to bind to + -4, --force-ipv4 Make all connections via IPv4 + -6, --force-ipv6 Make all connections via IPv6 ## Geo Restriction: - --geo-verification-proxy URL Use this proxy to verify the IP address for - some geo-restricted sites. The default - proxy specified by --proxy (or none, if the - option is not present) is used for the - actual downloading. 
- --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header - --no-geo-bypass Do not bypass geographic restriction via - faking X-Forwarded-For HTTP header - --geo-bypass-country CODE Force bypass geographic restriction with - explicitly provided two-letter ISO 3166-2 - country code - --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with - explicitly provided IP block in CIDR - notation + --geo-verification-proxy URL Use this proxy to verify the IP address + for some geo-restricted sites. The + default proxy specified by --proxy (or + none, if the option is not present) is + used for the actual downloading. + --geo-bypass Bypass geographic restriction via + faking X-Forwarded-For HTTP header + --no-geo-bypass Do not bypass geographic restriction + via faking X-Forwarded-For HTTP header + --geo-bypass-country CODE Force bypass geographic restriction + with explicitly provided two-letter ISO + 3166-2 country code + --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction + with explicitly provided IP block in + CIDR notation ## Video Selection: - --playlist-start NUMBER Playlist video to start at (default is 1) - --playlist-end NUMBER Playlist video to end at (default is last) - --playlist-items ITEM_SPEC Playlist video items to download. Specify - indices of the videos in the playlist - separated by commas like: "--playlist-items - 1,2,5,8" if you want to download videos - indexed 1, 2, 5, 8 in the playlist. You can - specify range: "--playlist-items - 1-3,7,10-13", it will download the videos - at index 1, 2, 3, 7, 10, 11, 12 and 13. - --match-title REGEX Download only matching titles (regex or - caseless sub-string) - --reject-title REGEX Skip download for matching titles (regex or - caseless sub-string) - --max-downloads NUMBER Abort after downloading NUMBER files - --min-filesize SIZE Do not download any videos smaller than - SIZE (e.g. 
50k or 44.6m) - --max-filesize SIZE Do not download any videos larger than SIZE - (e.g. 50k or 44.6m) - --date DATE Download only videos uploaded in this date - --datebefore DATE Download only videos uploaded on or before - this date (i.e. inclusive) - --dateafter DATE Download only videos uploaded on or after - this date (i.e. inclusive) - --min-views COUNT Do not download any videos with less than - COUNT views - --max-views COUNT Do not download any videos with more than - COUNT views - --match-filter FILTER Generic video filter. Specify any key (see - the "OUTPUT TEMPLATE" for a list of - available keys) to match if the key is - present, !key to check if the key is not - present, key > NUMBER (like "comment_count - > 12", also works with >=, <, <=, !=, =) to - compare against a number, key = 'LITERAL' - (like "uploader = 'Mike Smith'", also works - with !=) to match against a string literal - and & to require multiple matches. Values - which are not known are excluded unless you - put a question mark (?) after the operator. - For example, to only match videos that have - been liked more than 100 times and disliked - less than 50 times (or the dislike - functionality is not available at the given - service), but who also have a description, - use --match-filter "like_count > 100 & - dislike_count <? 50 & description" . - --no-playlist Download only the video, if the URL refers - to a video and a playlist. - --yes-playlist Download the playlist, if the URL refers to - a video and a playlist. - --age-limit YEARS Download only videos suitable for the given - age - --download-archive FILE Download only videos not listed in the - archive file. Record the IDs of all - downloaded videos in it. - --include-ads Download advertisements as well - (experimental) + --playlist-start NUMBER Playlist video to start at (default is + 1) + --playlist-end NUMBER Playlist video to end at (default is + last) + --playlist-items ITEM_SPEC Playlist video items to download. 
+ Specify indices of the videos in the + playlist separated by commas like: "-- + playlist-items 1,2,5,8" if you want to + download videos indexed 1, 2, 5, 8 in + the playlist. You can specify range: " + --playlist-items 1-3,7,10-13", it will + download the videos at index 1, 2, 3, + 7, 10, 11, 12 and 13. + --match-title REGEX Download only matching titles (regex or + caseless sub-string) + --reject-title REGEX Skip download for matching titles + (regex or caseless sub-string) + --max-downloads NUMBER Abort after downloading NUMBER files + --min-filesize SIZE Do not download any videos smaller than + SIZE (e.g. 50k or 44.6m) + --max-filesize SIZE Do not download any videos larger than + SIZE (e.g. 50k or 44.6m) + --date DATE Download only videos uploaded in this + date + --datebefore DATE Download only videos uploaded on or + before this date (i.e. inclusive) + --dateafter DATE Download only videos uploaded on or + after this date (i.e. inclusive) + --min-views COUNT Do not download any videos with less + than COUNT views + --max-views COUNT Do not download any videos with more + than COUNT views + --match-filter FILTER Generic video filter. Specify any key + (see the "OUTPUT TEMPLATE" for a list + of available keys) to match if the key + is present, !key to check if the key is + not present, key > NUMBER (like + "comment_count > 12", also works with + >=, <, <=, !=, =) to compare against a + number, key = 'LITERAL' (like "uploader + = 'Mike Smith'", also works with !=) to + match against a string literal and & to + require multiple matches. Values which + are not known are excluded unless you + put a question mark (?) after the + operator. For example, to only match + videos that have been liked more than + 100 times and disliked less than 50 + times (or the dislike functionality is + not available at the given service), + but who also have a description, use + --match-filter "like_count > 100 & + dislike_count <? 50 & description" . 
+ --no-playlist Download only the video, if the URL + refers to a video and a playlist. + --yes-playlist Download the playlist, if the URL + refers to a video and a playlist. + --age-limit YEARS Download only videos suitable for the + given age + --download-archive FILE Download only videos not listed in the + archive file. Record the IDs of all + downloaded videos in it. + --include-ads Download advertisements as well + (experimental) ## Download Options: - -r, --limit-rate RATE Maximum download rate in bytes per second - (e.g. 50K or 4.2M) - -R, --retries RETRIES Number of retries (default is 10), or - "infinite". - --fragment-retries RETRIES Number of retries for a fragment (default - is 10), or "infinite" (DASH, hlsnative and - ISM) - --skip-unavailable-fragments Skip unavailable fragments (DASH, hlsnative - and ISM) - --abort-on-unavailable-fragment Abort downloading when some fragment is not - available - --keep-fragments Keep downloaded fragments on disk after - downloading is finished; fragments are - erased by default - --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) - (default is 1024) - --no-resize-buffer Do not automatically adjust the buffer - size. By default, the buffer size is - automatically resized from an initial value - of SIZE. - --http-chunk-size SIZE Size of a chunk for chunk-based HTTP - downloading (e.g. 10485760 or 10M) (default - is disabled). 
May be useful for bypassing - bandwidth throttling imposed by a webserver - (experimental) - --playlist-reverse Download playlist videos in reverse order - --playlist-random Download playlist videos in random order - --xattr-set-filesize Set file xattribute ytdl.filesize with - expected file size - --hls-prefer-native Use the native HLS downloader instead of - ffmpeg - --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS - downloader - --hls-use-mpegts Use the mpegts container for HLS videos, - allowing to play the video while - downloading (some players may not be able - to play it) - --external-downloader COMMAND Use the specified external downloader. - Currently supports - aria2c,avconv,axel,curl,ffmpeg,httpie,wget - --external-downloader-args ARGS Give these arguments to the external - downloader + -r, --limit-rate RATE Maximum download rate in bytes per + second (e.g. 50K or 4.2M) + -R, --retries RETRIES Number of retries (default is 10), or + "infinite". + --fragment-retries RETRIES Number of retries for a fragment + (default is 10), or "infinite" (DASH, + hlsnative and ISM) + --skip-unavailable-fragments Skip unavailable fragments (DASH, + hlsnative and ISM) + --abort-on-unavailable-fragment Abort downloading when some fragment is + not available + --keep-fragments Keep downloaded fragments on disk after + downloading is finished; fragments are + erased by default + --buffer-size SIZE Size of download buffer (e.g. 1024 or + 16K) (default is 1024) + --no-resize-buffer Do not automatically adjust the buffer + size. By default, the buffer size is + automatically resized from an initial + value of SIZE. + --http-chunk-size SIZE Size of a chunk for chunk-based HTTP + downloading (e.g. 10485760 or 10M) + (default is disabled). 
May be useful + for bypassing bandwidth throttling + imposed by a webserver (experimental) + --playlist-reverse Download playlist videos in reverse + order + --playlist-random Download playlist videos in random + order + --xattr-set-filesize Set file xattribute ytdl.filesize with + expected file size + --hls-prefer-native Use the native HLS downloader instead + of ffmpeg + --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS + downloader + --hls-use-mpegts Use the mpegts container for HLS + videos, allowing to play the video + while downloading (some players may not + be able to play it) + --external-downloader COMMAND Use the specified external downloader. + Currently supports aria2c,avconv,axel,c + url,ffmpeg,httpie,wget + --external-downloader-args ARGS Give these arguments to the external + downloader ## Filesystem Options: - -a, --batch-file FILE File containing URLs to download ('-' for - stdin), one URL per line. Lines starting - with '#', ';' or ']' are considered as - comments and ignored. - --id Use only video ID in file name - -o, --output TEMPLATE Output filename template, see the "OUTPUT - TEMPLATE" for all the info - --autonumber-start NUMBER Specify the start value for %(autonumber)s - (default is 1) - --restrict-filenames Restrict filenames to only ASCII - characters, and avoid "&" and spaces in - filenames - -w, --no-overwrites Do not overwrite files - -c, --continue Force resume of partially downloaded files. - By default, youtube-dl will resume - downloads if possible. 
- --no-continue Do not resume partially downloaded files - (restart from beginning) - --no-part Do not use .part files - write directly - into output file - --no-mtime Do not use the Last-modified header to set - the file modification time - --write-description Write video description to a .description - file - --write-info-json Write video metadata to a .info.json file - --write-annotations Write video annotations to a - .annotations.xml file - --load-info-json FILE JSON file containing the video information - (created with the "--write-info-json" - option) - --cookies FILE File to read cookies from and dump cookie - jar in - --cache-dir DIR Location in the filesystem where youtube-dl - can store some downloaded information - permanently. By default - $XDG_CACHE_HOME/youtube-dl or - ~/.cache/youtube-dl . At the moment, only - YouTube player files (for videos with - obfuscated signatures) are cached, but that - may change. - --no-cache-dir Disable filesystem caching - --rm-cache-dir Delete all filesystem cache files + -a, --batch-file FILE File containing URLs to download ('-' + for stdin), one URL per line. Lines + starting with '#', ';' or ']' are + considered as comments and ignored. + --id Use only video ID in file name + -o, --output TEMPLATE Output filename template, see the + "OUTPUT TEMPLATE" for all the info + --output-na-placeholder PLACEHOLDER Placeholder value for unavailable meta + fields in output filename template + (default is "NA") + --autonumber-start NUMBER Specify the start value for + %(autonumber)s (default is 1) + --restrict-filenames Restrict filenames to only ASCII + characters, and avoid "&" and spaces in + filenames + -w, --no-overwrites Do not overwrite files + -c, --continue Force resume of partially downloaded + files. By default, youtube-dl will + resume downloads if possible. 
+ --no-continue Do not resume partially downloaded + files (restart from beginning) + --no-part Do not use .part files - write directly + into output file + --no-mtime Do not use the Last-modified header to + set the file modification time + --write-description Write video description to a + .description file + --write-info-json Write video metadata to a .info.json + file + --write-annotations Write video annotations to a + .annotations.xml file + --load-info-json FILE JSON file containing the video + information (created with the "--write- + info-json" option) + --cookies FILE File to read cookies from and dump + cookie jar in + --cache-dir DIR Location in the filesystem where + youtube-dl can store some downloaded + information permanently. By default + $XDG_CACHE_HOME/youtube-dl or + ~/.cache/youtube-dl . At the moment, + only YouTube player files (for videos + with obfuscated signatures) are cached, + but that may change. + --no-cache-dir Disable filesystem caching + --rm-cache-dir Delete all filesystem cache files ## Thumbnail images: - --write-thumbnail Write thumbnail image to disk - --write-all-thumbnails Write all thumbnail image formats to disk - --list-thumbnails Simulate and list all available thumbnail - formats + --write-thumbnail Write thumbnail image to disk + --write-all-thumbnails Write all thumbnail image formats to + disk + --list-thumbnails Simulate and list all available + thumbnail formats ## Verbosity / Simulation Options: - -q, --quiet Activate quiet mode - --no-warnings Ignore warnings - -s, --simulate Do not download the video and do not write - anything to disk - --skip-download Do not download the video - -g, --get-url Simulate, quiet but print URL - -e, --get-title Simulate, quiet but print title - --get-id Simulate, quiet but print id - --get-thumbnail Simulate, quiet but print thumbnail URL - --get-description Simulate, quiet but print video description - --get-duration Simulate, quiet but print video length - --get-filename 
Simulate, quiet but print output filename - --get-format Simulate, quiet but print output format - -j, --dump-json Simulate, quiet but print JSON information. - See the "OUTPUT TEMPLATE" for a description - of available keys. - -J, --dump-single-json Simulate, quiet but print JSON information - for each command-line argument. If the URL - refers to a playlist, dump the whole - playlist information in a single line. - --print-json Be quiet and print the video information as - JSON (video is still being downloaded). - --newline Output progress bar as new lines - --no-progress Do not print progress bar - --console-title Display progress in console titlebar - -v, --verbose Print various debugging information - --dump-pages Print downloaded pages encoded using base64 - to debug problems (very verbose) - --write-pages Write downloaded intermediary pages to - files in the current directory to debug - problems - --print-traffic Display sent and read HTTP traffic - -C, --call-home Contact the youtube-dl server for debugging - --no-call-home Do NOT contact the youtube-dl server for - debugging + -q, --quiet Activate quiet mode + --no-warnings Ignore warnings + -s, --simulate Do not download the video and do not + write anything to disk + --skip-download Do not download the video + -g, --get-url Simulate, quiet but print URL + -e, --get-title Simulate, quiet but print title + --get-id Simulate, quiet but print id + --get-thumbnail Simulate, quiet but print thumbnail URL + --get-description Simulate, quiet but print video + description + --get-duration Simulate, quiet but print video length + --get-filename Simulate, quiet but print output + filename + --get-format Simulate, quiet but print output format + -j, --dump-json Simulate, quiet but print JSON + information. See the "OUTPUT TEMPLATE" + for a description of available keys. + -J, --dump-single-json Simulate, quiet but print JSON + information for each command-line + argument. 
If the URL refers to a + playlist, dump the whole playlist + information in a single line. + --print-json Be quiet and print the video + information as JSON (video is still + being downloaded). + --newline Output progress bar as new lines + --no-progress Do not print progress bar + --console-title Display progress in console titlebar + -v, --verbose Print various debugging information + --dump-pages Print downloaded pages encoded using + base64 to debug problems (very verbose) + --write-pages Write downloaded intermediary pages to + files in the current directory to debug + problems + --print-traffic Display sent and read HTTP traffic + -C, --call-home Contact the youtube-dl server for + debugging + --no-call-home Do NOT contact the youtube-dl server + for debugging ## Workarounds: - --encoding ENCODING Force the specified encoding (experimental) - --no-check-certificate Suppress HTTPS certificate validation - --prefer-insecure Use an unencrypted connection to retrieve - information about the video. (Currently - supported only for YouTube) - --user-agent UA Specify a custom user agent - --referer URL Specify a custom referer, use if the video - access is restricted to one domain - --add-header FIELD:VALUE Specify a custom HTTP header and its value, - separated by a colon ':'. You can use this - option multiple times - --bidi-workaround Work around terminals that lack - bidirectional text support. Requires bidiv - or fribidi executable in PATH - --sleep-interval SECONDS Number of seconds to sleep before each - download when used alone or a lower bound - of a range for randomized sleep before each - download (minimum possible number of - seconds to sleep) when used along with - --max-sleep-interval. - --max-sleep-interval SECONDS Upper bound of a range for randomized sleep - before each download (maximum possible - number of seconds to sleep). Must only be - used along with --min-sleep-interval. 
+ --encoding ENCODING Force the specified encoding + (experimental) + --no-check-certificate Suppress HTTPS certificate validation + --prefer-insecure Use an unencrypted connection to + retrieve information about the video. + (Currently supported only for YouTube) + --user-agent UA Specify a custom user agent + --referer URL Specify a custom referer, use if the + video access is restricted to one + domain + --add-header FIELD:VALUE Specify a custom HTTP header and its + value, separated by a colon ':'. You + can use this option multiple times + --bidi-workaround Work around terminals that lack + bidirectional text support. Requires + bidiv or fribidi executable in PATH + --sleep-interval SECONDS Number of seconds to sleep before each + download when used alone or a lower + bound of a range for randomized sleep + before each download (minimum possible + number of seconds to sleep) when used + along with --max-sleep-interval. + --max-sleep-interval SECONDS Upper bound of a range for randomized + sleep before each download (maximum + possible number of seconds to sleep). + Must only be used along with --min- + sleep-interval. ## Video Format Options: - -f, --format FORMAT Video format code, see the "FORMAT - SELECTION" for all the info - --all-formats Download all available video formats - --prefer-free-formats Prefer free video formats unless a specific - one is requested - -F, --list-formats List all available formats of requested - videos - --youtube-skip-dash-manifest Do not download the DASH manifests and - related data on YouTube videos - --merge-output-format FORMAT If a merge is required (e.g. - bestvideo+bestaudio), output to given - container format. One of mkv, mp4, ogg, - webm, flv. 
Ignored if no merge is required + -f, --format FORMAT Video format code, see the "FORMAT + SELECTION" for all the info + --all-formats Download all available video formats + --prefer-free-formats Prefer free video formats unless a + specific one is requested + -F, --list-formats List all available formats of requested + videos + --youtube-skip-dash-manifest Do not download the DASH manifests and + related data on YouTube videos + --merge-output-format FORMAT If a merge is required (e.g. + bestvideo+bestaudio), output to given + container format. One of mkv, mp4, ogg, + webm, flv. Ignored if no merge is + required ## Subtitle Options: - --write-sub Write subtitle file - --write-auto-sub Write automatically generated subtitle file - (YouTube only) - --all-subs Download all the available subtitles of the - video - --list-subs List all available subtitles for the video - --sub-format FORMAT Subtitle format, accepts formats - preference, for example: "srt" or - "ass/srt/best" - --sub-lang LANGS Languages of the subtitles to download - (optional) separated by commas, use --list- - subs for available language tags + --write-sub Write subtitle file + --write-auto-sub Write automatically generated subtitle + file (YouTube only) + --all-subs Download all the available subtitles of + the video + --list-subs List all available subtitles for the + video + --sub-format FORMAT Subtitle format, accepts formats + preference, for example: "srt" or + "ass/srt/best" + --sub-lang LANGS Languages of the subtitles to download + (optional) separated by commas, use + --list-subs for available language tags ## Authentication Options: - -u, --username USERNAME Login with this account ID - -p, --password PASSWORD Account password. If this option is left - out, youtube-dl will ask interactively. 
- -2, --twofactor TWOFACTOR Two-factor authentication code - -n, --netrc Use .netrc authentication data - --video-password PASSWORD Video password (vimeo, youku) + -u, --username USERNAME Login with this account ID + -p, --password PASSWORD Account password. If this option is + left out, youtube-dl will ask + interactively. + -2, --twofactor TWOFACTOR Two-factor authentication code + -n, --netrc Use .netrc authentication data + --video-password PASSWORD Video password (vimeo, youku) ## Adobe Pass Options: - --ap-mso MSO Adobe Pass multiple-system operator (TV - provider) identifier, use --ap-list-mso for - a list of available MSOs - --ap-username USERNAME Multiple-system operator account login - --ap-password PASSWORD Multiple-system operator account password. - If this option is left out, youtube-dl will - ask interactively. - --ap-list-mso List all supported multiple-system - operators + --ap-mso MSO Adobe Pass multiple-system operator (TV + provider) identifier, use --ap-list-mso + for a list of available MSOs + --ap-username USERNAME Multiple-system operator account login + --ap-password PASSWORD Multiple-system operator account + password. If this option is left out, + youtube-dl will ask interactively. 
+ --ap-list-mso List all supported multiple-system + operators ## Post-processing Options: - -x, --extract-audio Convert video files to audio-only files - (requires ffmpeg or avconv and ffprobe or - avprobe) - --audio-format FORMAT Specify audio format: "best", "aac", - "flac", "mp3", "m4a", "opus", "vorbis", or - "wav"; "best" by default; No effect without - -x - --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert - a value between 0 (better) and 9 (worse) - for VBR or a specific bitrate like 128K - (default 5) - --recode-video FORMAT Encode the video to another format if - necessary (currently supported: - mp4|flv|ogg|webm|mkv|avi) - --postprocessor-args ARGS Give these arguments to the postprocessor - -k, --keep-video Keep the video file on disk after the post- - processing; the video is erased by default - --no-post-overwrites Do not overwrite post-processed files; the - post-processed files are overwritten by - default - --embed-subs Embed subtitles in the video (only for mp4, - webm and mkv videos) - --embed-thumbnail Embed thumbnail in the audio as cover art - --add-metadata Write metadata to the video file - --metadata-from-title FORMAT Parse additional metadata like song title / - artist from the video title. The format - syntax is the same as --output. Regular - expression with named capture groups may - also be used. The parsed parameters replace - existing values. Example: --metadata-from- - title "%(artist)s - %(title)s" matches a - title like "Coldplay - Paradise". Example - (regex): --metadata-from-title - "(?P<artist>.+?) - (?P<title>.+)" - --xattrs Write metadata to the video file's xattrs - (using dublin core and xdg standards) - --fixup POLICY Automatically correct known faults of the - file. 
One of never (do nothing), warn (only - emit a warning), detect_or_warn (the - default; fix file if we can, warn - otherwise) - --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors - --prefer-ffmpeg Prefer ffmpeg over avconv for running the - postprocessors (default) - --ffmpeg-location PATH Location of the ffmpeg/avconv binary; - either the path to the binary or its - containing directory. - --exec CMD Execute a command on the file after - downloading and post-processing, similar to - find's -exec syntax. Example: --exec 'adb - push {} /sdcard/Music/ && rm {}' - --convert-subs FORMAT Convert the subtitles to other format - (currently supported: srt|ass|vtt|lrc) + -x, --extract-audio Convert video files to audio-only files + (requires ffmpeg/avconv and + ffprobe/avprobe) + --audio-format FORMAT Specify audio format: "best", "aac", + "flac", "mp3", "m4a", "opus", "vorbis", + or "wav"; "best" by default; No effect + without -x + --audio-quality QUALITY Specify ffmpeg/avconv audio quality, + insert a value between 0 (better) and 9 + (worse) for VBR or a specific bitrate + like 128K (default 5) + --recode-video FORMAT Encode the video to another format if + necessary (currently supported: + mp4|flv|ogg|webm|mkv|avi) + --postprocessor-args ARGS Give these arguments to the + postprocessor + -k, --keep-video Keep the video file on disk after the + post-processing; the video is erased by + default + --no-post-overwrites Do not overwrite post-processed files; + the post-processed files are + overwritten by default + --embed-subs Embed subtitles in the video (only for + mp4, webm and mkv videos) + --embed-thumbnail Embed thumbnail in the audio as cover + art + --add-metadata Write metadata to the video file + --metadata-from-title FORMAT Parse additional metadata like song + title / artist from the video title. + The format syntax is the same as + --output. Regular expression with named + capture groups may also be used. 
The + parsed parameters replace existing + values. Example: --metadata-from-title + "%(artist)s - %(title)s" matches a + title like "Coldplay - Paradise". + Example (regex): --metadata-from-title + "(?P<artist>.+?) - (?P<title>.+)" + --xattrs Write metadata to the video file's + xattrs (using dublin core and xdg + standards) + --fixup POLICY Automatically correct known faults of + the file. One of never (do nothing), + warn (only emit a warning), + detect_or_warn (the default; fix file + if we can, warn otherwise) + --prefer-avconv Prefer avconv over ffmpeg for running + the postprocessors + --prefer-ffmpeg Prefer ffmpeg over avconv for running + the postprocessors (default) + --ffmpeg-location PATH Location of the ffmpeg/avconv binary; + either the path to the binary or its + containing directory. + --exec CMD Execute a command on the file after + downloading and post-processing, + similar to find's -exec syntax. + Example: --exec 'adb push {} + /sdcard/Music/ && rm {}' + --convert-subs FORMAT Convert the subtitles to other format + (currently supported: srt|ass|vtt|lrc) # CONFIGURATION diff --git a/docs/supportedsites.md b/docs/supportedsites.md index aa8026a32..13bac6e27 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -46,10 +46,11 @@ - **Amara** - **AMCNetworks** - **AmericasTestKitchen** + - **AmericasTestKitchenSeason** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **AnimeOnDemand** - **Anvato** - - **aol.com** + - **aol.com**: Yahoo screen and movies - **APA** - **Aparat** - **AppleConnect** @@ -192,8 +193,6 @@ - **CNNArticle** - **CNNBlogs** - **ComedyCentral** - - **ComedyCentralFullEpisodes** - - **ComedyCentralShortname** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **CONtv** 
@@ -506,6 +505,9 @@ - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** + - **minds** + - **minds:channel** + - **minds:group** - **MinistryGrid** - **Minoto** - **miomio.tv** @@ -859,6 +861,8 @@ - **Sport5** - **SportBox** - **SportDeutschland** + - **spotify** + - **spotify:show** - **Spreaker** - **SpreakerPage** - **SpreakerShow** @@ -940,12 +944,13 @@ - **TNAFlixNetworkEmbed** - **toggle** - **ToonGoggles** - - **Tosh**: Tosh.0 - **tou.tv** - **Toypics**: Toypics video - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** + - **Trovo** + - **TrovoVod** - **TruNews** - **TruTV** - **Tube8** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ac7242abb..080460d50 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.01.16' +__version__ = '2021.01.24' From b63981e85095542e056d7180496de4a2d85ddf9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jan 2021 18:11:20 +0700 Subject: [PATCH 36/79] release 2021.01.24.1 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- youtube_dl/version.py | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 935806b5e..2dde97a2c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. 
If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.24** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24 + [debug] youtube-dl version 2021.01.24.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 453a5e147..c520d1ee0 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ 
b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 07094c10d..4aacd3bdc 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. 
- Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index b234f8ccd..91bbed506 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.24** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24 + [debug] youtube-dl version 2021.01.24.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 557e59ca5..a0a2c989a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24** +- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 080460d50..c52f1d9ca 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.01.24' +__version__ = '2021.01.24.1' From 811a183eb6a7dabcc8600f6bf9dc4aac11e72b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 Jan 2021 18:15:32 +0700 Subject: [PATCH 37/79] [ChangeLog] Actualize [ci skip] --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 9b2f38b25..7f2e0aad1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version 2021.01.24 +version 2021.01.24.1 Core * Introduce --output-na-placeholder (#27896) From ea399a53eb9744c0b1530d72bf9d5e1e21ed3489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= <mail@adrianheine.de> Date: Mon, 25 Jan 2021 09:25:57 +0100 Subject: [PATCH 38/79] [ADN] Implement login (#27937) closes #17091 closes #27841 --- youtube_dl/extractor/adn.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index d611ee237..40111586d 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -26,6 +26,7 @@ from ..utils import ( strip_or_none, try_get, unified_strdate, + urlencode_postdata, ) @@ -51,9 +52,11 @@ class ADNIE(InfoExtractor): } } + _NETRC_MACHINE = 'animedigitalnetwork' _BASE_URL = 'http://animedigitalnetwork.fr' 
_API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' + _HEADERS = {} _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, @@ -129,19 +132,32 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' }]) return subtitles + def _real_initialize(self): + username, password = self._get_login_info() + if username: + access_token = (self._download_json( + self._API_BASE_URL + 'authentication/login', None, + 'Logging in', errnote='Unable to log in', fatal=False, + data=urlencode_postdata({ + 'password': password, + 'rememberMe': False, + 'source': 'Web', + 'username': username, + })) or {}).get('accessToken') + if access_token: + self._HEADERS = {'authorization': 'Bearer ' + access_token} + def _real_extract(self, url): video_id = self._match_id(url) video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id player = self._download_json( video_base_url + 'configuration', video_id, - 'Downloading player config JSON metadata')['player'] + 'Downloading player config JSON metadata', headers=self._HEADERS)['player'] options = player['options'] user = options['user'] if not user.get('hasAccess'): - raise ExtractorError( - 'This video is only available for paying users', expected=True) - # self.raise_login_required() # FIXME: Login is not implemented + self.raise_login_required() token = self._download_json( user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'), From 57f2488bbe4345eee545488e937e0f8426664f6f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 25 Jan 2021 09:20:48 +0100 Subject: [PATCH 39/79] [zype] fix uplynk id extraction(closes #27956) --- youtube_dl/extractor/zype.py | 15 ++++++++++----- 1 
file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py index 5288f40d8..f20f953cb 100644 --- a/youtube_dl/extractor/zype.py +++ b/youtube_dl/extractor/zype.py @@ -87,11 +87,16 @@ class ZypeIE(InfoExtractor): r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', body, 'm3u8 url', group='url', default=None) if not m3u8_url: - source = self._parse_json(self._search_regex( - r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, - 'source'), video_id, js_to_json) - if source.get('integration') == 'verizon-media': - m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id'] + source = self._search_regex( + r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, 'source') + + def get_attr(key): + return self._search_regex( + r'\b%s\s*:\s*([\'"])(?P<val>(?:(?!\1).)+)\1' % key, + source, key, group='val') + + if get_attr('integration') == 'verizon-media': + m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id') formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') text_tracks = self._search_regex( From 55bb3556c8c5ec088ef88edbdef925860e0926dc Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 25 Jan 2021 09:28:24 +0100 Subject: [PATCH 40/79] [adn] improve login warning reporting --- youtube_dl/extractor/adn.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 40111586d..a55ebbcbd 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -57,6 +57,7 @@ class ADNIE(InfoExtractor): _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' _HEADERS = {} + _LOGIN_ERR_MESSAGE = 'Unable to log in' _RSA_KEY = 
(0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, @@ -134,10 +135,12 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' def _real_initialize(self): username, password = self._get_login_info() - if username: + if not username: + return + try: access_token = (self._download_json( self._API_BASE_URL + 'authentication/login', None, - 'Logging in', errnote='Unable to log in', fatal=False, + 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, data=urlencode_postdata({ 'password': password, 'rememberMe': False, @@ -146,13 +149,21 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' })) or {}).get('accessToken') if access_token: self._HEADERS = {'authorization': 'Bearer ' + access_token} + except ExtractorError as e: + message = None + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json( + e.cause.read().decode(), None, fatal=False) or {} + message = resp.get('message') or resp.get('code') + self.report_warning(message or self._LOGIN_ERR_MESSAGE) def _real_extract(self, url): video_id = self._match_id(url) video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id player = self._download_json( video_base_url + 'configuration', video_id, - 'Downloading player config JSON metadata', headers=self._HEADERS)['player'] + 'Downloading player config JSON metadata', + headers=self._HEADERS)['player'] options = player['options'] user = options['user'] @@ -204,8 +215,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' message = error.get('message') if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) - else: - raise ExtractorError(message) + raise 
ExtractorError(message) else: raise ExtractorError('Giving up retrying') From 395981288ba0b2e1afabd4e595cb9c959ef62356 Mon Sep 17 00:00:00 2001 From: tpikonen <tpikonen@gmail.com> Date: Mon, 25 Jan 2021 15:43:41 +0200 Subject: [PATCH 41/79] [tv2] Add support for mtvuutiset.fi (#27744) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tv2.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ef57f5556..2331b0e15 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1260,6 +1260,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, KatsomoIE, + MTVuutisetIE, ) from .tv2dk import ( TV2DKIE, diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 4a19b9be6..42a9af126 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -190,3 +190,32 @@ class KatsomoIE(TV2IE): _API_DOMAIN = 'api.katsomo.fi' _PROTOCOLS = ('HLS', 'MPD') _GEO_COUNTRIES = ['FI'] + + +class MTVuutisetIE(KatsomoIE): + _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/(?:artikkeli/[0-9a-z-]+/|video/prog)(?P<id>\d+)' + _TEST = { + 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', + 'info_dict': { + 'id': '1311159', + 'ext': 'mp4', + 'title': 'MTV Uutiset Live', + 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'timestamp': 1600608966, + 'upload_date': '20200920', + 'duration': 153.7886666, + 'view_count': int, + 'categories': list, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + art_id = self._match_id(url) + webpage = self._download_webpage(url, art_id) + video_id = self._html_search_regex( + r'<div class=\'player-container\' .*data-katsomoid="(.+?)"', webpage, 'video_id') + return self.url_result("http://mtv.fi/a/0/a/%s" % 
video_id, video_id=video_id, ie="Katsomo") From 286e5d6724f86ace38f6ccf0e33c511eeb1ab65d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 25 Jan 2021 14:46:04 +0100 Subject: [PATCH 42/79] [tv2] improve MTV Uutiset Article extraction --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/tv2.py | 73 ++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2331b0e15..c554a8504 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1260,7 +1260,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, KatsomoIE, - MTVuutisetIE, + MTVUutisetArticleIE, ) from .tv2dk import ( TV2DKIE, diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 42a9af126..334b7d540 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -20,7 +20,7 @@ from ..utils import ( class TV2IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { 'id': '916509', @@ -33,7 +33,7 @@ class TV2IE(InfoExtractor): 'view_count': int, 'categories': list, }, - } + }] _API_DOMAIN = 'sumo.tv2.no' _PROTOCOLS = ('HDS', 'HLS', 'DASH') _GEO_COUNTRIES = ['NO'] @@ -42,6 +42,12 @@ class TV2IE(InfoExtractor): video_id = self._match_id(url) api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id) + asset = self._download_json( + api_base + '.json', video_id, + 'Downloading metadata JSON')['asset'] + title = asset.get('subtitle') or asset['title'] + is_live = asset.get('live') is True + formats = [] format_urls = [] for protocol in self._PROTOCOLS: @@ -81,7 +87,8 @@ class TV2IE(InfoExtractor): elif ext == 'm3u8': if not data.get('drmProtected'): formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', + video_url, video_id, 'mp4', + 'm3u8' if 
is_live else 'm3u8_native', m3u8_id=format_id, fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -99,11 +106,6 @@ class TV2IE(InfoExtractor): raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) - asset = self._download_json( - api_base + '.json', video_id, - 'Downloading metadata JSON')['asset'] - title = asset['title'] - thumbnails = [{ 'id': thumbnail.get('@type'), 'url': thumbnail.get('url'), @@ -112,7 +114,7 @@ class TV2IE(InfoExtractor): return { 'id': video_id, 'url': video_url, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': strip_or_none(asset.get('description')), 'thumbnails': thumbnails, 'timestamp': parse_iso8601(asset.get('createTime')), @@ -120,6 +122,7 @@ class TV2IE(InfoExtractor): 'view_count': int_or_none(asset.get('views')), 'categories': asset.get('keywords', '').split(','), 'formats': formats, + 'is_live': is_live, } @@ -168,13 +171,13 @@ class TV2ArticleIE(InfoExtractor): class KatsomoIE(TV2IE): - _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321', 'info_dict': { 'id': '1181321', 'ext': 'mp4', - 'title': 'MTV Uutiset Live', + 'title': 'Lahden Pelicans teki kovan ratkaisun – Ville Nieminen pihalle', 'description': 'Päätöksen teki Pelicansin hallitus.', 'timestamp': 1575116484, 'upload_date': '20191130', @@ -186,20 +189,29 @@ class KatsomoIE(TV2IE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://www.katsomo.fi/#!/jakso/33001005/studio55-fi/658521/jukka-kuoppamaki-tekee-yha-lauluja-vaikka-lentokoneessa', + 'only_matching': True, 
+ }, { + 'url': 'https://www.mtvuutiset.fi/video/prog1311159', + 'only_matching': True, + }, { + 'url': 'https://www.katsomo.fi/#!/jakso/1311159', + 'only_matching': True, + }] _API_DOMAIN = 'api.katsomo.fi' _PROTOCOLS = ('HLS', 'MPD') _GEO_COUNTRIES = ['FI'] -class MTVuutisetIE(KatsomoIE): - _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/(?:artikkeli/[0-9a-z-]+/|video/prog)(?P<id>\d+)' - _TEST = { +class MTVUutisetArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', 'info_dict': { 'id': '1311159', 'ext': 'mp4', - 'title': 'MTV Uutiset Live', + 'title': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', 'timestamp': 1600608966, 'upload_date': '20200920', @@ -211,11 +223,26 @@ class MTVuutisetIE(KatsomoIE): # m3u8 download 'skip_download': True, }, - } + }, { + # multiple Youtube embeds + 'url': 'https://www.mtvuutiset.fi/artikkeli/50-vuotta-subarun-vastaiskua/6070962', + 'only_matching': True, + }] def _real_extract(self, url): - art_id = self._match_id(url) - webpage = self._download_webpage(url, art_id) - video_id = self._html_search_regex( - r'<div class=\'player-container\' .*data-katsomoid="(.+?)"', webpage, 'video_id') - return self.url_result("http://mtv.fi/a/0/a/%s" % video_id, video_id=video_id, ie="Katsomo") + article_id = self._match_id(url) + article = self._download_json( + 'http://api.mtvuutiset.fi/mtvuutiset/api/json/' + article_id, + article_id) + + def entries(): + for video in (article.get('videos') or []): + video_type = video.get('videotype') + video_url = video.get('url') + if not (video_url and video_type in ('katsomo', 'youtube')): + continue + yield self.url_result( + video_url, video_type.capitalize(), 
video.get('video_id')) + + return self.playlist_result( + entries(), article_id, article.get('title'), article.get('description')) From 0f7d413d5b2637e2fb091745ab4f70811a6cc600 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 25 Jan 2021 15:15:45 +0100 Subject: [PATCH 43/79] [tv4] relax _VALID_URL(closes #27964) --- youtube_dl/extractor/tv4.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index c498b0191..b73bab9a8 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -17,7 +17,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| + (?:program|barn)/(?:(?:[^/]+/){1,2}|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -65,6 +65,10 @@ class TV4IE(InfoExtractor): { 'url': 'http://www.tv4play.se/program/farang/3922081', 'only_matching': True, + }, + { + 'url': 'https://www.tv4play.se/program/nyheterna/avsnitt/13315940', + 'only_matching': True, } ] From d18f4419a72a01abc2cb45ef23f2400cd3eb5f43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= <mail@adrianheine.de> Date: Tue, 26 Jan 2021 22:43:11 +0100 Subject: [PATCH 44/79] [AMP] Fix upload_date and timestamp extraction (#27970) --- youtube_dl/extractor/abcnews.py | 2 ++ youtube_dl/extractor/amp.py | 3 ++- youtube_dl/extractor/bleacherreport.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 8b407bf9c..64ea6e6ed 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -36,6 +36,8 @@ class AbcNewsVideoIE(AMPIE): 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. 
Javad Zarif.', 'duration': 180, 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1380454200, + 'upload_date': '20130929', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 7ff098cfa..24c684cad 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, mimetype2ext, parse_iso8601, + unified_timestamp, url_or_none, ) @@ -88,7 +89,7 @@ class AMPIE(InfoExtractor): self._sort_formats(formats) - timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) return { 'id': video_id, diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index dc60224d0..d1bf8e829 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -90,13 +90,19 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', - 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', + 'md5': '670b2d73f48549da032861130488c681', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cena vs. 
Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + 'upload_date': '20150723', + 'timestamp': 1437679032, + }, + 'expected_warnings': [ + 'Unable to download f4m manifest' + ] }] def _real_extract(self, url): From 11b68df7a4980f7f6175cdf2d7334fde11ff76b1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 27 Jan 2021 12:28:22 +0100 Subject: [PATCH 45/79] [abcnews] fix extraction(closes #12394)(closes #27920) --- youtube_dl/extractor/abcnews.py | 126 +++++++++++++++++--------------- 1 file changed, 67 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 64ea6e6ed..908c83377 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -1,14 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar import re -import time from .amp import AMPIE from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import compat_urlparse +from ..utils import ( + parse_duration, + parse_iso8601, + try_get, +) class AbcNewsVideoIE(AMPIE): @@ -18,8 +19,8 @@ class AbcNewsVideoIE(AMPIE): (?: abcnews\.go\.com/ (?: - [^/]+/video/(?P<display_id>[0-9a-z-]+)-| - video/embed\?.*?\bid= + (?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-| + video/(?:embed|itemfeed)\?.*?\bid= )| fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ ) @@ -49,6 +50,12 @@ class AbcNewsVideoIE(AMPIE): }, { 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', 'only_matching': True, + }, { + 'url': 'http://abcnews.go.com/video/itemfeed?id=46979033', + 'only_matching': True, + }, { + 'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761', + 'only_matching': True, }] def _real_extract(self, url): @@ -69,28 +76,23 @@ class AbcNewsIE(InfoExtractor): _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' 
_TESTS = [{ - 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', + # Youtube Embeds + 'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501', 'info_dict': { - 'id': '10505354', - 'ext': 'flv', - 'display_id': 'dramatic-video-rare-death-job-america', - 'title': 'Occupational Hazards', - 'description': 'Nightline investigates the dangers that lurk at various jobs.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20100428', - 'timestamp': 1272412800, + 'id': '51286501', + 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player", + 'description': 'Billingsley went from a child actor to Hollywood power player.', }, - 'add_ie': ['AbcNewsVideo'], + 'playlist_count': 5, }, { 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', 'info_dict': { 'id': '38897857', 'ext': 'mp4', - 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', 'title': 'Justin Timberlake Drops Hints For Secret Single', 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', - 'upload_date': '20160515', - 'timestamp': 1463329500, + 'upload_date': '20160505', + 'timestamp': 1462442280, }, 'params': { # m3u8 download @@ -102,49 +104,55 @@ class AbcNewsIE(InfoExtractor): }, { 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', 'only_matching': True, + }, { + # inline.type == 'video' + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - video_id = mobj.group('id') + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + 
story = self._parse_json(self._search_regex( + r"window\['__abcnews__'\]\s*=\s*({.+?});", + webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0] + article_contents = story.get('articleContents') or {} - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') - full_video_url = compat_urlparse.urljoin(url, video_url) + def entries(): + featured_video = story.get('featuredVideo') or {} + feed = try_get(featured_video, lambda x: x['video']['feed']) + if feed: + yield { + '_type': 'url', + 'id': featured_video.get('id'), + 'title': featured_video.get('name'), + 'url': feed, + 'thumbnail': featured_video.get('images'), + 'description': featured_video.get('description'), + 'timestamp': parse_iso8601(featured_video.get('uploadDate')), + 'duration': parse_duration(featured_video.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - youtube_url = YoutubeIE._extract_url(webpage) + for inline in (article_contents.get('inlines') or []): + inline_type = inline.get('type') + if inline_type == 'iframe': + iframe_url = try_get(inline, lambda x: x['attrs']['src']) + if iframe_url: + yield self.url_result(iframe_url) + elif inline_type == 'video': + video_id = inline.get('id') + if video_id: + yield { + '_type': 'url', + 'id': video_id, + 'url': 'http://abcnews.go.com/video/embed?id=' + video_id, + 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'), + 'description': inline.get('description'), + 'duration': parse_duration(inline.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - timestamp = None - date_str = self._html_search_regex( - r'<span[^>]+class="timestamp">([^<]+)</span>', - webpage, 'timestamp', fatal=False) - if date_str: - tz_offset = 0 - if date_str.endswith(' ET'): # Eastern Time - tz_offset = -5 - date_str = date_str[:-3] - date_formats = ['%b. 
%d, %Y', '%b %d, %Y, %I:%M %p'] - for date_format in date_formats: - try: - timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) - except ValueError: - continue - if timestamp is not None: - timestamp -= tz_offset * 3600 - - entry = { - '_type': 'url_transparent', - 'ie_key': AbcNewsVideoIE.ie_key(), - 'url': full_video_url, - 'id': video_id, - 'display_id': display_id, - 'timestamp': timestamp, - } - - if youtube_url: - entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] - return self.playlist_result(entries) - - return entry + return self.playlist_result( + entries(), story_id, article_contents.get('headline'), + article_contents.get('subHead')) From c669554ef5491302eb20fc2bcb52339ea1a4ac1a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 27 Jan 2021 14:51:30 +0100 Subject: [PATCH 46/79] [medialaan] add support DPG Media MyChannels based websites closes #14871 closes #15597 closes #16106 closes #16489 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 23 +++ youtube_dl/extractor/medialaan.py | 303 +++++++---------------------- youtube_dl/extractor/vtm.py | 62 ++++++ 4 files changed, 160 insertions(+), 229 deletions(-) create mode 100644 youtube_dl/extractor/vtm.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c554a8504..ab8d6a5a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1459,6 +1459,7 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE +from .vtm import VTMIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 780971a92..09e680c96 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -128,6 +128,7 @@ from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE from 
.arcpublishing import ArcPublishingIE +from .medialaan import MedialaanIE class GenericIE(InfoExtractor): @@ -2223,6 +2224,20 @@ class GenericIE(InfoExtractor): 'duration': 1581, }, }, + { + # MyChannels SDK embed + # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen + 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/', + 'md5': '90c0699c37006ef18e198c032d81739c', + 'info_dict': { + 'id': '194165', + 'ext': 'mp4', + 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe', + 'timestamp': 1611740340, + 'upload_date': '20210127', + 'duration': 159, + }, + }, ] def report_following_redirect(self, new_url): @@ -2462,6 +2477,9 @@ class GenericIE(InfoExtractor): webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) + if '<title>DPG Media Privacy Gate' in webpage: + webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? 
@@ -2593,6 +2611,11 @@ class GenericIE(InfoExtractor): if arc_urls: return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) + mychannels_urls = MedialaanIE._extract_urls(webpage) + if mychannels_urls: + return self.playlist_from_matches( + mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key()) + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py index 50d5db802..788acf7fb 100644 --- a/youtube_dl/extractor/medialaan.py +++ b/youtube_dl/extractor/medialaan.py @@ -2,268 +2,113 @@ from __future__ import unicode_literals import re -from .gigya import GigyaBaseIE - -from ..compat import compat_str +from .common import InfoExtractor from ..utils import ( + extract_attributes, int_or_none, - parse_duration, - try_get, - unified_timestamp, + mimetype2ext, + parse_iso8601, ) -class MedialaanIE(GigyaBaseIE): +class MedialaanIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - (?:www\.|nieuws\.)? 
(?: - (?Pvtm|q2|vtmkzoom)\.be/ - (?: - video(?:/[^/]+/id/|/?\?.*?\baid=)| - (?:[^/]+/)* - ) + (?:embed\.)?mychannels.video/embed/| + embed\.mychannels\.video/(?:s(?:dk|cript)/)?production/| + (?:www\.)?(?: + (?: + 7sur7| + demorgen| + hln| + joe| + qmusic + )\.be| + (?: + [abe]d| + bndestem| + destentor| + gelderlander| + pzc| + tubantia| + volkskrant + )\.nl + )/video/(?:[^/]+/)*[^/?&#]+~p ) - (?P[^/?#&]+) + (?P\d+) ''' - _NETRC_MACHINE = 'medialaan' - _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-' - _SITE_TO_APP_ID = { - 'vtm': 'vtm_watch', - 'q2': 'q2', - 'vtmkzoom': 'vtmkzoom', - } _TESTS = [{ - # vod - 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch', + 'url': 'https://www.bndestem.nl/video/de-terugkeer-van-ally-de-aap-en-wie-vertrekt-er-nog-bij-nac~p193993', 'info_dict': { - 'id': 'vtm_20170219_VM0678361_vtmwatch', + 'id': '193993', 'ext': 'mp4', - 'title': 'Allemaal Chris afl. 6', - 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2', - 'timestamp': 1487533280, - 'upload_date': '20170219', - 'duration': 2562, - 'series': 'Allemaal Chris', - 'season': 'Allemaal Chris', - 'season_number': 1, - 'season_id': '256936078124527', - 'episode': 'Allemaal Chris afl. 
6', - 'episode_number': 6, - 'episode_id': '256936078591527', + 'title': 'De terugkeer van Ally de Aap en wie vertrekt er nog bij NAC?', + 'timestamp': 1611663540, + 'upload_date': '20210126', + 'duration': 238, }, 'params': { 'skip_download': True, }, - 'skip': 'Requires account credentials', }, { - # clip - 'url': 'http://vtm.be/video?aid=168332', - 'info_dict': { - 'id': '168332', - 'ext': 'mp4', - 'title': '"Veronique liegt!"', - 'description': 'md5:1385e2b743923afe54ba4adc38476155', - 'timestamp': 1489002029, - 'upload_date': '20170308', - 'duration': 96, - }, - }, { - # vod - 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000', + 'url': 'https://www.gelderlander.nl/video/kanalen/degelderlander~c320/series/snel-nieuws~s984/noodbevel-in-doetinchem-politie-stuurt-mensen-centrum-uit~p194093', 'only_matching': True, }, { - # vod - 'url': 'http://vtm.be/video?aid=163157', + 'url': 'https://embed.mychannels.video/sdk/production/193993?options=TFTFF_default', 'only_matching': True, }, { - # vod - 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', + 'url': 'https://embed.mychannels.video/script/production/193993', 'only_matching': True, }, { - # clip - 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio', + 'url': 'https://embed.mychannels.video/production/193993', 'only_matching': True, }, { - # http/s redirect - 'url': 'https://vtmkzoom.be/video?aid=45724', - 'info_dict': { - 'id': '257136373657000', - 'ext': 'mp4', - 'title': 'K3 Dansstudio Ushuaia afl.6', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires account credentials', + 'url': 'https://mychannels.video/embed/193993', + 'only_matching': True, }, { - # nieuws.vtm.be - 'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma', + 'url': 'https://embed.mychannels.video/embed/193993', 'only_matching': True, }] - def _real_initialize(self): - self._logged_in = False - - def _login(self): - username, 
password = self._get_login_info() - if username is None: - self.raise_login_required() - - auth_data = { - 'APIKey': self._APIKEY, - 'sdk': 'js_6.1', - 'format': 'json', - 'loginID': username, - 'password': password, - } - - auth_info = self._gigya_login(auth_data) - - self._uid = auth_info['UID'] - self._uid_signature = auth_info['UIDSignature'] - self._signature_timestamp = auth_info['signatureTimestamp'] - - self._logged_in = True + @staticmethod + def _extract_urls(webpage): + entries = [] + for element in re.findall(r'(]+data-mychannels-type="video"[^>]*>)', webpage): + mychannels_id = extract_attributes(element).get('data-mychannels-id') + if mychannels_id: + entries.append('https://mychannels.video/embed/' + mychannels_id) + return entries def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, site_id = mobj.group('id', 'site_id') + production_id = self._match_id(url) + production = self._download_json( + 'https://embed.mychannels.video/sdk/production/' + production_id, + production_id, query={'options': 'UUUU_default'})['productions'][0] + title = production['title'] - webpage = self._download_webpage(url, video_id) - - config = self._parse_json( - self._search_regex( - r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);', - webpage, 'config', default='{}'), video_id, - transform_source=lambda s: s.replace( - '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'")) - - vod_id = config.get('vodId') or self._search_regex( - (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', - r'"vodId"\s*:\s*"(.+?)"', - r'<[^>]+id=["\']vod-(\d+)'), - webpage, 'video_id', default=None) - - # clip, no authentication required - if not vod_id: - player = self._parse_json( - self._search_regex( - r'vmmaplayer\(({.+?})\);', webpage, 'vmma player', - default=''), - video_id, transform_source=lambda s: '[%s]' % s, fatal=False) - if player: - video = player[-1] - if video['videoUrl'] in ('http', 'https'): - return self.url_result(video['url'], MedialaanIE.ie_key()) - info = { - 
'id': video_id, - 'url': video['videoUrl'], - 'title': video['title'], - 'thumbnail': video.get('imageUrl'), - 'timestamp': int_or_none(video.get('createdDate')), - 'duration': int_or_none(video.get('duration')), - } + formats = [] + for source in (production.get('sources') or []): + src = source.get('src') + if not src: + continue + ext = mimetype2ext(source.get('type')) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, production_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: - info = self._parse_html5_media_entries( - url, webpage, video_id, m3u8_id='hls')[0] - info.update({ - 'id': video_id, - 'title': self._html_search_meta('description', webpage), - 'duration': parse_duration(self._html_search_meta('duration', webpage)), + formats.append({ + 'ext': ext, + 'url': src, }) - # vod, authentication required - else: - if not self._logged_in: - self._login() + self._sort_formats(formats) - settings = self._parse_json( - self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings', default='{}'), - video_id) - - def get(container, item): - return try_get( - settings, lambda x: x[container][item], - compat_str) or self._search_regex( - r'"%s"\s*:\s*"([^"]+)' % item, webpage, item, - default=None) - - app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch') - sso = get('vod', 'gigyaDatabase') or 'vtm-sso' - - data = self._download_json( - 'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id, - video_id, query={ - 'app_id': app_id, - 'user_network': sso, - 'UID': self._uid, - 'UIDSignature': self._uid_signature, - 'signatureTimestamp': self._signature_timestamp, - }) - - formats = self._extract_m3u8_formats( - data['response']['uri'], video_id, entry_protocol='m3u8_native', - ext='mp4', m3u8_id='hls') - - self._sort_formats(formats) - - info = { - 'id': vod_id, - 'formats': formats, - } - - api_key = get('vod', 'apiKey') - channel = get('medialaanGigya', 
'channel') - - if api_key: - videos = self._download_json( - 'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False, - query={ - 'channels': channel, - 'ids': vod_id, - 'limit': 1, - 'apikey': api_key, - }) - if videos: - video = try_get( - videos, lambda x: x['response']['videos'][0], dict) - if video: - def get(container, item, expected_type=None): - return try_get( - video, lambda x: x[container][item], expected_type) - - def get_string(container, item): - return get(container, item, compat_str) - - info.update({ - 'series': get_string('program', 'title'), - 'season': get_string('season', 'title'), - 'season_number': int_or_none(get('season', 'number')), - 'season_id': get_string('season', 'id'), - 'episode': get_string('episode', 'title'), - 'episode_number': int_or_none(get('episode', 'number')), - 'episode_id': get_string('episode', 'id'), - 'duration': int_or_none( - video.get('duration')) or int_or_none( - video.get('durationMillis'), scale=1000), - 'title': get_string('episode', 'title'), - 'description': get_string('episode', 'text'), - 'timestamp': unified_timestamp(get_string( - 'publication', 'begin')), - }) - - if not info.get('title'): - info['title'] = try_get( - config, lambda x: x['videoConfig']['title'], - compat_str) or self._html_search_regex( - r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title', - default=None) or self._og_search_title(webpage) - - if not info.get('description'): - info['description'] = self._html_search_regex( - r']+class="field-item\s+even">\s*

(.+?)

', - webpage, 'description', default=None) - - return info + return { + 'id': production_id, + 'title': title, + 'formats': formats, + 'thumbnail': production.get('posterUrl'), + 'timestamp': parse_iso8601(production.get('publicationDate'), ' '), + 'duration': int_or_none(production.get('duration')) or None, + } diff --git a/youtube_dl/extractor/vtm.py b/youtube_dl/extractor/vtm.py new file mode 100644 index 000000000..093f1aa69 --- /dev/null +++ b/youtube_dl/extractor/vtm.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) + + +class VTMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vtm\.be/([^/?&#]+)~v(?P[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12})' + _TEST = { + 'url': 'https://vtm.be/gast-vernielt-genkse-hotelkamer~ve7534523-279f-4b4d-a5c9-a33ffdbe23e1', + 'md5': '37dca85fbc3a33f2de28ceb834b071f8', + 'info_dict': { + 'id': '192445', + 'ext': 'mp4', + 'title': 'Gast vernielt Genkse hotelkamer', + 'timestamp': 1611060180, + 'upload_date': '20210119', + 'duration': 74, + # TODO: fix url _type result processing + # 'series': 'Op Interventie', + } + } + + def _real_extract(self, url): + uuid = self._match_id(url) + video = self._download_json( + 'https://omc4vm23offuhaxx6hekxtzspi.appsync-api.eu-west-1.amazonaws.com/graphql', + uuid, query={ + 'query': '''{ + getComponent(type: Video, uuid: "%s") { + ... 
on Video { + description + duration + myChannelsVideo + program { + title + } + publishedAt + title + } + } +}''' % uuid, + }, headers={ + 'x-api-key': 'da2-lz2cab4tfnah3mve6wiye4n77e', + })['data']['getComponent'] + + return { + '_type': 'url', + 'id': uuid, + 'title': video.get('title'), + 'url': 'http://mychannels.video/embed/%d' % video['myChannelsVideo'], + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publishedAt')), + 'duration': int_or_none(video.get('duration')), + 'series': try_get(video, lambda x: x['program']['title']), + 'ie_key': 'Medialaan', + } From fd95fc33b13d732002d53c35521f17184d14cc21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= Date: Wed, 27 Jan 2021 20:06:12 +0100 Subject: [PATCH 47/79] [awaan] Extract uploader id (#27963) --- youtube_dl/extractor/awaan.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/awaan.py b/youtube_dl/extractor/awaan.py index a2603bbff..3a7700cd4 100644 --- a/youtube_dl/extractor/awaan.py +++ b/youtube_dl/extractor/awaan.py @@ -48,6 +48,7 @@ class AWAANBaseIE(InfoExtractor): 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, + 'uploader_id': video_data.get('user_id'), } @@ -107,6 +108,7 @@ class AWAANLiveIE(AWAANBaseIE): 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'upload_date': '20150107', 'timestamp': 1420588800, + 'uploader_id': '71', }, 'params': { # m3u8 download From 7b8fa658f88d53066f7a2ad00df19697552cf286 Mon Sep 17 00:00:00 2001 From: knapior Date: Wed, 27 Jan 2021 20:43:20 +0100 Subject: [PATCH 48/79] [cda] Improve birth validation detection (closes #14022) (#27929) Co-authored-by: Sergey M --- youtube_dl/extractor/cda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index d67900e62..6429454fb 100644 --- 
a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -96,7 +96,7 @@ class CDAIE(InfoExtractor): raise ExtractorError('This video is only available for premium users.', expected=True) need_confirm_age = False - if self._html_search_regex(r'(]+action="/a/validatebirth")', + if self._html_search_regex(r'(]+action="[^"]*/a/validatebirth[^"]*")', webpage, 'birthday validate form', default=None): webpage = self._download_age_confirm_page( url, video_id, note='Confirming age') From 0b4f03a56394dee070f6e1723af8ce3a9ce44bfb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 28 Jan 2021 16:22:24 +0100 Subject: [PATCH 49/79] [youtube] improve DASH formats file size extraction --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f57099f8c..b254ceced 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2370,7 +2370,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): mpd_url, video_id, fatal=dash_mpd_fatal, formats_dict=self._formats): if not df.get('filesize'): - df['filesize'] = _extract_filesize(df['url']) + df['filesize'] = _extract_filesize(df.get('fragment_base_url') or df['url']) # Do not overwrite DASH format found in some previous DASH manifest if df['format_id'] not in dash_formats: dash_formats[df['format_id']] = df From 56a7ee903360fe89cb5372805473b508153fb856 Mon Sep 17 00:00:00 2001 From: ping Date: Fri, 29 Jan 2021 16:02:18 +0800 Subject: [PATCH 50/79] [vlive] Fix error message decoding for python 2 (#28004) --- youtube_dl/extractor/vlive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 6224e6200..e2f5d81b8 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -116,7 +116,7 @@ class VLiveIE(VLiveBaseIE): headers={'Referer': 'https://www.vlive.tv/'}, query=query) except ExtractorError 
as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self.raise_login_required(json.loads(e.cause.read().decode())['message']) + self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message']) raise def _real_extract(self, url): From ba15b2fee673f76c2cff2f193d5b4a19029b501f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 29 Jan 2021 11:52:48 +0100 Subject: [PATCH 51/79] [googledrive] report download page errors(closes #28005) --- youtube_dl/extractor/googledrive.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index de8c80e36..3f2de00f1 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -7,6 +7,7 @@ from ..compat import compat_parse_qs from ..utils import ( determine_ext, ExtractorError, + get_element_by_class, int_or_none, lowercase_escape, try_get, @@ -237,7 +238,7 @@ class GoogleDriveIE(InfoExtractor): if confirmation_webpage: confirm = self._search_regex( r'confirm=([^&"\']+)', confirmation_webpage, - 'confirmation code', fatal=False) + 'confirmation code', default=None) if confirm: confirmed_source_url = update_url_query(source_url, { 'confirm': confirm, @@ -245,6 +246,11 @@ class GoogleDriveIE(InfoExtractor): urlh = request_source_file(confirmed_source_url, 'confirmed source') if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) + else: + self.report_warning( + get_element_by_class('uc-error-subcaption', confirmation_webpage) + or get_element_by_class('uc-error-caption', confirmation_webpage) + or 'unable to extract confirmation code') if not formats and reason: raise ExtractorError(reason, expected=True) From a800838f5afad43a76a53bdb5f4c1b20c80ff202 Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 29 Jan 2021 14:24:28 +0100 Subject: [PATCH 52/79] [vvvvid] add support for youtube embeds (#27825) --- youtube_dl/extractor/vvvvid.py | 35 
++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index f4cae7fe9..778ce8b76 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( ExtractorError, int_or_none, @@ -47,6 +48,22 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # video_type == 'video/youtube' + 'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': 'RzmFKUDOUgw', + 'ext': 'mp4', + 'title': 'Trailer', + 'upload_date': '20150906', + 'description': 'md5:a5e802558d35247fee285875328c0b80', + 'uploader_id': 'BandaiVisual', + 'uploader': 'BANDAI NAMCO Arts Channel', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', 'only_matching': True @@ -154,12 +171,13 @@ class VVVVIDIE(InfoExtractor): if season_number: info['season_number'] = int(season_number) - for quality in ('_sd', ''): + video_type = video_data.get('video_type') + is_youtube = False + for quality in ('', '_sd'): embed_code = video_data.get('embed_info' + quality) if not embed_code: continue embed_code = ds(embed_code) - video_type = video_data.get('video_type') if video_type in ('video/rcs', 'video/kenc'): if video_type == 'video/kenc': kenc = self._download_json( @@ -172,19 +190,28 @@ class VVVVIDIE(InfoExtractor): if kenc_message: embed_code += '?' 
+ ds(kenc_message) formats.extend(self._extract_akamai_formats(embed_code, video_id)) + elif video_type == 'video/youtube': + info.update({ + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'url': embed_code, + }) + is_youtube = True + break else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) metadata_from_url(embed_code) - self._sort_formats(formats) + if not is_youtube: + self._sort_formats(formats) + info['formats'] = formats metadata_from_url(video_data.get('thumbnail')) info.update(self._extract_common_video_info(video_data)) info.update({ 'id': video_id, 'title': title, - 'formats': formats, 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, From 8bf9591a70757c624a8ea5bf686040ed752246e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= Date: Sat, 30 Jan 2021 13:44:01 +0100 Subject: [PATCH 53/79] [AENetworks] update AENetworksShowIE test playlist id (#27851) --- youtube_dl/extractor/aenetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index a5d88ebbe..e55c03fd7 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -252,7 +252,7 @@ class AENetworksShowIE(AENetworksListBaseIE): _TESTS = [{ 'url': 'http://www.history.com/shows/ancient-aliens', 'info_dict': { - 'id': 'SH012427480000', + 'id': 'SERIES1574', 'title': 'Ancient Aliens', 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', }, From 67299f23d8b1894120e875edf97440de87e22308 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 1 Feb 2021 14:30:59 +0100 Subject: [PATCH 54/79] [youtube] Rewrite Extractor - improve format sorting - remove unused code(swf parsing, ...) 
- fix series metadata extraction - fix trailer video extraction - improve error reporting - extract video location --- youtube_dl/extractor/common.py | 17 +- youtube_dl/extractor/youtube.py | 1717 +++++++++---------------------- 2 files changed, 483 insertions(+), 1251 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d5faa0eb7..8eb110f4e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2064,7 +2064,7 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', @@ -2078,10 +2078,9 @@ class InfoExtractor(object): mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( - mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, - formats_dict=formats_dict, mpd_url=mpd_url) + mpd_doc, mpd_id, mpd_base_url, mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): + def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): """ Parse formats from MPD manifest. References: @@ -2359,15 +2358,7 @@ class InfoExtractor(object): else: # Assuming direct URL to unfragmented media. f['url'] = base_url - - # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation - # is not necessarily unique within a Period thus formats with - # the same `format_id` are quite possible. 
There are numerous examples - # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111, - # https://github.com/ytdl-org/youtube-dl/issues/13919) - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) + formats.append(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b254ceced..5f6769878 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2,41 +2,34 @@ from __future__ import unicode_literals - import itertools import json import os.path import random import re -import time import traceback from .common import InfoExtractor, SearchInfoExtractor -from ..jsinterp import JSInterpreter -from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, compat_HTTPError, compat_parse_qs, - compat_urllib_parse_unquote, + compat_str, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, - compat_str, ) +from ..jsinterp import JSInterpreter from ..utils import ( - bool_or_none, - clean_html, - error_to_compat_str, ExtractorError, + clean_html, float_or_none, - get_element_by_id, int_or_none, mimetype2ext, parse_codecs, parse_duration, - remove_quotes, + qualities, remove_start, smuggle_url, str_or_none, @@ -46,7 +39,6 @@ from ..utils import ( unified_strdate, unsmuggle_url, update_url_query, - uppercase_escape, url_or_none, urlencode_postdata, urljoin, @@ -68,12 +60,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' - def _set_language(self): - self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en', - # YouTube sets the expire time to about two months - expire_time=time.time() + 2 * 30 * 24 * 3600) - def _ids_to_results(self, ids): return [ self.url_result(vid_id, 
'Youtube', video_id=vid_id) @@ -265,7 +251,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _real_initialize(self): if self._downloader is None: return - self._set_language() if not self._login(): return @@ -282,19 +267,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P[a-z]+)$', - r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.(?P[a-z]+)$', + r'/(?P[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.js$', + r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) - _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 
128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - - - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - - # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 
'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, - - # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, - - # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '258': 
{'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, - - # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - # itag 272 videos are either 3840x2160 (e.g. 
RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - - # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, - - # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, - - # RTMP (unnamed) - '_rtmp': {'protocol': 'rtmp'}, - - # av01 video only formats sometimes served with "unknown" codecs - '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - } _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') _GEO_BYPASS = False @@ -566,7 +443,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'setindia', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', 'age_limit': 18, - } + }, + 'skip': 'Private video', }, { 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', @@ -640,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'TheAmazingAtheist', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', 
'title': 'Burning Everyone\'s Koran', - 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', + 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } }, # Normal age-gate video (No vevo, embed allowed), available via embed page @@ -676,11 +554,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'Dada Life, deadmau5', - 'description': 'md5:12c56784b8032162bb936a5f76d55360', + 'creator': 'deadmau5', + 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'This Machine Kills Some Chords', + 'alt_title': 'Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', @@ -775,69 +653,64 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, { # Multifeed videos (multiple cameras), URL is for Main Camera - 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs', + 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg', 'info_dict': { - 'id': 'jqWvoWXjCVs', - 'title': 'teamPGP: Rocket League Noob Stream', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'id': 'jvGDaLqkpTg', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever', + 'description': 'md5:e03b909557865076822aa169218d6a5d', }, 'playlist': [{ 'info_dict': { - 'id': 'jqWvoWXjCVs', + 'id': 'jvGDaLqkpTg', 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7335, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10643, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', }, }, { 'info_dict': { - 'id': '6h8e8xoXJzg', + 'id': '3AKt1R1aDnw', 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10991, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', }, }, { 'info_dict': { - 'id': 'PUOgX5z9xZw', + 'id': 'RtAMM00gpVc', 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10995, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', }, }, { 'info_dict': { - 'id': 
'teuwxikvS5k', + 'id': '6N2fdlP3C5U', 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (zim)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7334, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', + 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)', + 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'duration': 10990, + 'upload_date': '20161111', + 'uploader': 'Team PGP', + 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', }, }], 'params': { 'skip_download': True, }, - 'skip': 'This video is not available.', }, { # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) @@ -931,7 +804,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'eQcmzGIKrzg', 'ext': 'mp4', 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', - 'description': 'md5:dda0d780d5a6e120758d1711d062a867', + 'description': 'md5:13a2503d7b5904ef4b223aa101628f39', 'duration': 4060, 'upload_date': '20151119', 'uploader': 'Bernie Sanders', @@ -978,7 +851,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', + 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd', 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', @@ -1013,6 +886,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.', }, { # itag 212 @@ -1098,6 +972,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Video unavailable', }, { # empty description results in an empty string @@ -1147,24 
+1022,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) + self._code_cache = {} self._player_cache = {} - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen('%s: Downloading video info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen('%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen('%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen('RTMP download detected') - def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) @@ -1177,40 +1037,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break else: raise ExtractorError('Cannot identify player %r' % player_url) - return id_m.group('ext'), id_m.group('id') + return id_m.group('id') def _extract_signature_function(self, video_id, player_url, example_sig): - player_type, player_id = self._extract_player_info(player_url) + player_id = self._extract_player_info(player_url) # Read from filesystem cache - func_id = '%s_%s_%s' % ( - player_type, player_id, self._signature_cache_id(example_sig)) + func_id = 'js_%s_%s' % ( + player_id, self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) - download_note = ( - 'Downloading player %s' % player_url - if self._downloader.params.get('verbose') else - 'Downloading %s player %s' % (player_type, player_id) - ) - if player_type == 
'js': - code = self._download_webpage( + if player_id not in self._code_cache: + self._code_cache[player_id] = self._download_webpage( player_url, video_id, - note=download_note, + note='Downloading player ' + player_id, errnote='Download of %s failed' % player_url) - res = self._parse_sig_js(code) - elif player_type == 'swf': - urlh = self._request_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - code = urlh.read() - res = self._parse_sig_swf(code) - else: - assert False, 'Invalid player type %r' % player_type + code = self._code_cache[player_id] + res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) cache_res = res(test_string) @@ -1279,14 +1126,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) - def _parse_sig_swf(self, file_contents): - swfi = SWFInterpreter(file_contents) - TARGET_CLASSNAME = 'SignatureDecipher' - searched_class = swfi.extract_class(TARGET_CLASSNAME) - initial_function = swfi.extract_function(searched_class, 'decipher') - return lambda s: initial_function([s]) - - def _decrypt_signature(self, s, video_id, player_url, age_gate=False): + def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" if player_url is None: @@ -1313,158 +1153,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_subtitles(self, video_id, webpage): - try: - subs_doc = self._download_xml( - 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) - return {} - - sub_lang_list = {} - for track in subs_doc.findall('track'): - lang = track.attrib['lang_code'] - if lang in sub_lang_list: - 
continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': ext, - 'name': track.attrib['name'].encode('utf-8'), - }) - sub_formats.append({ - 'url': 'https://www.youtube.com/api/timedtext?' + params, - 'ext': ext, - }) - sub_lang_list[lang] = sub_formats - if not sub_lang_list: - self._downloader.report_warning('video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _get_ytplayer_config(self, video_id, webpage): - patterns = ( - # User data may contain arbitrary character sequences that may affect - # JSON extraction with regex, e.g. when '};' is contained the second - # regex won't capture the whole JSON. Yet working around by trying more - # concrete regex first keeping in mind proper quoted string handling - # to be implemented in future that will replace this workaround (see - # https://github.com/ytdl-org/youtube-dl/issues/7468, - # https://github.com/ytdl-org/youtube-dl/pull/7599) - r';ytplayer\.config\s*=\s*({.+?});ytplayer', - r';ytplayer\.config\s*=\s*({.+?});', - ) - config = self._search_regex( - patterns, webpage, 'ytplayer.config', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - - def _get_automatic_captions(self, video_id, player_response, player_config): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - self.to_screen('%s: Looking for automatic captions' % video_id) - err_msg = 'Couldn\'t find automatic captions for %s' % video_id - if not (player_response or player_config): - self._downloader.report_warning(err_msg) - return {} - try: - args = player_config.get('args') if player_config else {} - caption_url = args.get('ttsurl') - if caption_url: - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse_urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = 
caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') - - sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_formats.append({ - 'url': caption_url + '&' + params, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list - - def make_captions(sub_url, sub_langs): - parsed_sub_url = compat_urllib_parse_urlparse(sub_url) - caption_qs = compat_parse_qs(parsed_sub_url.query) - captions = {} - for sub_lang in sub_langs: - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - captions[sub_lang] = sub_formats - return captions - - # New captions format as of 22.06.2017 - if player_response: - renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - base_url = renderer['captionTracks'][0]['baseUrl'] - sub_lang_list = [] - for lang in renderer['translationLanguages']: - lang_code = lang.get('languageCode') - if lang_code: - sub_lang_list.append(lang_code) - return make_captions(base_url, sub_lang_list) - - # Some videos don't provide ttsurl but rather caption_tracks and - # caption_translation_languages (e.g. 
20LmZk1hakA) - # Does not used anymore as of 22.06.2017 - caption_tracks = args['caption_tracks'] - caption_translation_languages = args['caption_translation_languages'] - caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - sub_lang_list = [] - for lang in caption_translation_languages.split(','): - lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) - sub_lang = lang_qs.get('lc', [None])[0] - if sub_lang: - sub_lang_list.append(sub_lang) - return make_captions(caption_url, sub_lang_list) - # An extractor error can be raise by the download process if there are - # no automatic captions but there are subtitles - except (KeyError, IndexError, ExtractorError): - self._downloader.report_warning(err_msg) - return {} - - def _mark_watched(self, video_id, video_info, player_response): + def _mark_watched(self, video_id, player_response): playback_url = url_or_none(try_get( player_response, - lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( - video_info, lambda x: x['videostats_playback_base_url'][0])) + lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) if not playback_url: return parsed_playback_url = compat_urlparse.urlparse(playback_url) @@ -1531,289 +1223,74 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id - def _extract_chapters_from_json(self, webpage, video_id, duration): - if not webpage: - return - data = self._extract_yt_initial_data(video_id, webpage) - if not data or not isinstance(data, dict): - return - chapters_list = try_get( - data, - lambda x: x['playerOverlays'] - ['playerOverlayRenderer'] - ['decoratedPlayerBarRenderer'] - ['decoratedPlayerBarRenderer'] - ['playerBar'] - ['chapteredPlayerBarRenderer'] - ['chapters'], - list) - if not chapters_list: - return - - def chapter_time(chapter): - return float_or_none( - try_get( - chapter, - lambda x: x['chapterRenderer']['timeRangeStartMillis'], - int), - scale=1000) - chapters = [] - for 
next_num, chapter in enumerate(chapters_list, start=1): - start_time = chapter_time(chapter) - if start_time is None: - continue - end_time = (chapter_time(chapters_list[next_num]) - if next_num < len(chapters_list) else duration) - if end_time is None: - continue - title = try_get( - chapter, lambda x: x['chapterRenderer']['title']['simpleText'], - compat_str) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': title, - }) - return chapters - - @staticmethod - def _extract_chapters_from_description(description, duration): - if not description: - return None - chapter_lines = re.findall( - r'(?:^|)([^<]*]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)[^>]*)(?=$|)', - description) - if not chapter_lines: - return None - chapters = [] - for next_num, (chapter_line, time_point) in enumerate( - chapter_lines, start=1): - start_time = parse_duration(time_point) - if start_time is None: - continue - if start_time > duration: - break - end_time = (duration if next_num == len(chapter_lines) - else parse_duration(chapter_lines[next_num][1])) - if end_time is None: - continue - if end_time > duration: - end_time = duration - if start_time > end_time: - break - chapter_title = re.sub( - r']+>[^<]+', '', chapter_line).strip(' \t-') - chapter_title = re.sub(r'\s+', ' ', chapter_title) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': chapter_title, - }) - return chapters - - def _extract_chapters(self, webpage, description, video_id, duration): - return (self._extract_chapters_from_json(webpage, video_id, duration) - or self._extract_chapters_from_description(description, duration)) + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): + return self._parse_json(self._search_regex( + (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), + regex), webpage, name, default='{}'), video_id, fatal=False) def _real_extract(self, url): url, smuggled_data = 
unsmuggle_url(url, {}) + video_id = self._match_id(url) + base_url = self.http_scheme() + '//www.youtube.com/' + webpage_url = base_url + 'watch?v=' + video_id + webpage = self._download_webpage(webpage_url, video_id, fatal=False) - proto = ( - 'http' if self._downloader.params.get('prefer_insecure', False) - else 'https') + player_response = None + if webpage: + player_response = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, + video_id, 'initial player response') + if not player_response: + player_response = self._call_api( + 'player', {'videoId': video_id}, video_id) - start_time = None - end_time = None - parsed_url = compat_urllib_parse_urlparse(url) - for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) - if start_time is None and 't' in query: - start_time = parse_duration(query['t'][0]) - if start_time is None and 'start' in query: - start_time = parse_duration(query['start'][0]) - if end_time is None and 'end' in query: - end_time = parse_duration(query['end'][0]) + playability_status = player_response.get('playabilityStatus') or {} + if playability_status.get('reason') == 'Sign in to confirm your age': + pr = self._parse_json(try_get(compat_parse_qs( + self._download_webpage( + base_url + 'get_video_info', video_id, + 'Refetching age-gated info webpage', + 'unable to download video info webpage', query={ + 'video_id': video_id, + }, fatal=False)), + lambda x: x['player_response'][0], + compat_str) or '{}', video_id) + if pr: + player_response = pr - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') - video_id = self.extract_id(url) + trailer_video_id = try_get( + playability_status, + lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'], + 
compat_str) + if trailer_video_id: + return self.url_result( + trailer_video_id, self.ie_key(), trailer_video_id) - # Get video webpage - url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage, urlh = self._download_webpage_handle(url, video_id) - - qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query) - video_id = qs.get('v', [None])[0] or video_id - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - dash_mpds = [] - - def add_dash_mpd(video_info): - dash_mpd = video_info.get('dashmpd') - if dash_mpd and dash_mpd[0] not in dash_mpds: - dash_mpds.append(dash_mpd[0]) - - def add_dash_mpd_pr(pl_response): - dash_mpd = url_or_none(try_get( - pl_response, lambda x: x['streamingData']['dashManifestUrl'], - compat_str)) - if dash_mpd and dash_mpd not in dash_mpds: - dash_mpds.append(dash_mpd) - - is_live = None - view_count = None - - def extract_view_count(v_info): - return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - - def extract_player_response(player_response, video_id): - pl_response = str_or_none(player_response) - if not pl_response: + def get_text(x): + if not x: return - pl_response = self._parse_json(pl_response, video_id, fatal=False) - if isinstance(pl_response, dict): - add_dash_mpd_pr(pl_response) - return pl_response + return x.get('simpleText') or ''.join([r['text'] for r in x['runs']]) - player_response = {} - - # Get video info - video_info = {} - embed_webpage = None - ytplayer_config = None - - if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None: - age_gate = True - # We simulate the access to the video from www.youtube.com/v/{video_id} - # this can be viewed without login into Youtube - url = proto + '://www.youtube.com/embed/%s' % video_id 
- embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse_urlencode({ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), - }) - video_info_url = proto + '://www.youtube.com/get_video_info?' + data - try: - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - except ExtractorError: - video_info_webpage = None - if video_info_webpage: - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) - else: - age_gate = False - # Try looking directly into the video webpage - ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. 
- # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) - if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - add_dash_mpd_pr(player_response) - - if not video_info and not player_response: - player_response = extract_player_response( - self._search_regex( - (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE), - self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage, - 'initial player response', default='{}'), - video_id) - - def extract_unavailable_message(): - messages = [] - for tag, kind in (('h1', 'message'), ('div', 'submessage')): - msg = self._html_search_regex( - r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)'.format(tag=tag, kind=kind), - video_webpage, 'unavailable %s' % kind, default=None) - if msg: - messages.append(msg) - if messages: - return '\n'.join(messages) - - if not video_info and not player_response: - unavailable_message = extract_unavailable_message() - if not unavailable_message: - unavailable_message = 'Unable to extract video data' - raise ExtractorError( - 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - - if not isinstance(video_info, dict): - video_info = {} - - video_details = try_get( - player_response, lambda x: x['videoDetails'], dict) or {} + search_meta = ( + lambda x: self._html_search_meta(x, webpage, default=None)) \ + if webpage else lambda x: None + video_details = player_response.get('videoDetails') or {} microformat = try_get( - player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} - - video_title = 
video_info.get('title', [None])[0] or video_details.get('title') - if not video_title: - self._downloader.report_warning('Unable to extract video title') - video_title = '_' - - description_original = video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - - def replace_url(m): - redir_url = compat_urlparse.urljoin(url, m.group(1)) - parsed_redir_url = compat_urllib_parse_urlparse(redir_url) - if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': - qs = compat_parse_qs(parsed_redir_url.query) - q = qs.get('q') - if q and q[0]: - return q[0] - return redir_url - - description_original = video_description = re.sub(r'''(?x) - ]*> - [^<]+\.{3}\s* - - ''', replace_url, video_description) - video_description = clean_html(video_description) - else: - video_description = video_details.get('shortDescription') - if video_description is None: - video_description = self._html_search_meta('description', video_webpage) + player_response, + lambda x: x['microformat']['playerMicroformatRenderer'], + dict) or {} + video_title = video_details.get('title') \ + or get_text(microformat.get('title')) \ + or search_meta(['og:title', 'twitter:title', 'title']) + video_description = video_details.get('shortDescription') if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): multifeed_metadata_list = try_get( player_response, lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], - compat_str) or try_get( - video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) + compat_str) if multifeed_metadata_list: entries = [] feed_ids = [] @@ -1821,10 +1298,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Unquote should take place before split on comma (,) since textual # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = 
compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + feed_data = compat_parse_qs( + compat_urllib_parse_unquote_plus(feed)) def feed_entry(name): - return try_get(feed_data, lambda x: x[name][0], compat_str) + return try_get( + feed_data, lambda x: x[name][0], compat_str) feed_id = feed_entry('id') if not feed_id: @@ -1837,7 +1316,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + base_url + 'watch?v=' + feed_data['id'][0], {'force_singlefeed': True}), 'title': title, }) @@ -1845,631 +1324,393 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) + return self.playlist_result( + entries, video_id, video_title, video_description) else: self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if view_count is None: - view_count = extract_view_count(video_info) - if view_count is None and video_details: - view_count = int_or_none(video_details.get('viewCount')) - if view_count is None and microformat: - view_count = int_or_none(microformat.get('viewCount')) + formats = [] + itags = [] + player_url = None + q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) + streaming_data = player_response.get('streamingData') or {} + streaming_formats = streaming_data.get('formats') or [] + streaming_formats.extend(streaming_data.get('adaptiveFormats') or []) + for fmt in streaming_formats: + if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): + continue - if is_live is None: - is_live = bool_or_none(video_details.get('isLive')) - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise 
ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - - def _extract_filesize(media_url): - return int_or_none(self._search_regex( - r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) - - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or [] - streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or []) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - formats = [{ - 'format_id': '_rtmp', - 'protocol': 'rtmp', - 'url': video_info['conn'][0], - 'player_url': player_url, - }] - elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] - if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) - formats = [] - formats_spec = {} - fmt_list = video_info.get('fmt_list', [''])[0] - if fmt_list: - for fmt in fmt_list.split(','): - spec = fmt.split('/') - if len(spec) > 1: - width_height = spec[1].split('x') - if len(width_height) == 2: - formats_spec[spec[0]] = { - 'resolution': spec[1], - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - } - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: + fmt_url = fmt.get('url') + if not fmt_url: + sc = compat_parse_qs(fmt.get('signatureCipher')) + fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) + encrypted_sig = try_get(sc, lambda x: x['s'][0]) + if not (sc and fmt_url and encrypted_sig): continue - quality = fmt.get('quality') - quality_label = 
fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - - for fmt in streaming_formats: - if fmt.get('drmFamilies') or fmt.get('drm_families'): - continue - url = url_or_none(fmt.get('url')) - - if not url: - cipher = fmt.get('cipher') or fmt.get('signatureCipher') - if not cipher: + if not player_url: + if not webpage: continue - url_data = compat_parse_qs(cipher) - url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) - if not url: + player_url = self._search_regex( + r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', + webpage, 'player URL', fatal=False) + if not player_url: + continue + signature = self._decrypt_signature(sc['s'][0], video_id, player_url) + sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' + fmt_url += '&' + sp + '=' + signature + + itag = str_or_none(fmt.get('itag')) + if itag: + itags.append(itag) + quality = fmt.get('quality') + dct = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_id': itag, + 'format_note': fmt.get('qualityLabel') or quality, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + 'tbr': float_or_none(fmt.get( + 'averageBitrate') or fmt.get('bitrate'), 1000), + 'url': fmt_url, + 'width': fmt.get('width'), + } + mimetype = fmt.get('mimeType') + if mimetype: + mobj = re.match( + r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype) + if mobj: + dct['ext'] = mimetype2ext(mobj.group(1)) + dct.update(parse_codecs(mobj.group(2))) + if dct.get('acodec') == 'none' or 
dct.get('vcodec') == 'none': + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } + formats.append(dct) + + hls_manifest_url = streaming_data.get('hlsManifestUrl') + if hls_manifest_url: + for f in self._extract_m3u8_formats( + hls_manifest_url, video_id, 'mp4', fatal=False): + itag = self._search_regex( + r'/itag/(\d+)', f['url'], 'itag', default=None) + if itag: + f['format_id'] = itag + formats.append(f) + + if self._downloader.params.get('youtube_include_dash_manifest'): + dash_manifest_url = streaming_data.get('dashManifestUrl') + if dash_manifest_url: + for f in self._extract_mpd_formats( + dash_manifest_url, video_id, fatal=False): + if f['format_id'] in itags: continue - else: - cipher = None - url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + filesize = int_or_none(self._search_regex( + r'/clen/(\d+)', f.get('fragment_base_url') + or f['url'], 'file size', default=None)) + if filesize: + f['filesize'] = filesize + formats.append(f) - stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) - # Unsupported FORMAT_STREAM_TYPE_OTF - if stream_type == 3: - continue + if not formats: + if streaming_data.get('licenseInfos'): + raise ExtractorError( + 'This video is DRM protected.', expected=True) + pemr = try_get( + playability_status, + lambda x: x['errorScreen']['playerErrorMessageRenderer'], + dict) or {} + reason = get_text(pemr.get('reason')) or playability_status.get('reason') + subreason = pemr.get('subreason') + if subreason: + subreason = clean_html(get_text(subreason)) + if subreason == 'The uploader has not made this video available in your country.': + countries = microformat.get('availableCountries') + if not countries: + regions_allowed = search_meta('regionsAllowed') + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + subreason, countries) + reason += '\n' + subreason + if reason: + raise ExtractorError(reason, 
expected=True) - format_id = fmt.get('itag') or url_data['itag'][0] - if not format_id: - continue - format_id = compat_str(format_id) + self._sort_formats(formats) - if cipher: - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = ( - r']+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base', - r'"jsUrl"\s*:\s*("[^"]+")', - r'"assets":.+?"js":\s*("[^"]+")') - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): - if player_url is None: - player_desc = 'unknown' - else: - player_type, player_version = self._extract_player_info(player_url) - player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) - if 'ratebypass' not in url: - url += '&ratebypass=yes' - - dct = { - 'format_id': format_id, - 'url': 
url, - 'player_url': player_url, - } - if format_id in self._formats: - dct.update(self._formats[format_id]) - if format_id in formats_spec: - dct.update(formats_spec[format_id]) - - # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). - # Trying to extract metadata from url_encoded_fmt_stream_map entry. - mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) - width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - - if width is None: - width = int_or_none(fmt.get('width')) - if height is None: - height = int_or_none(fmt.get('height')) - - filesize = int_or_none(url_data.get( - 'clen', [None])[0]) or _extract_filesize(url) - - quality = url_data.get('quality', [None])[0] or fmt.get('quality') - quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') - - tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) - or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None - fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) - - more_fields = { - 'filesize': filesize, - 'tbr': tbr, - 'width': width, - 'height': height, - 'fps': fps, - 'format_note': quality_label or quality, - } - for key, value in more_fields.items(): - if value: - dct[key] = value - type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') - if type_: - type_split = type_.split(';') - kind_ext = type_split[0].split('/') - if len(kind_ext) == 2: - kind, _ = kind_ext - dct['ext'] = mimetype2ext(type_split[0]) - if kind in ('audio', 'video'): - codecs = None - for mobj in re.finditer( - r'(?P[a-zA-Z_-]+)=(?P["\']?)(?P.+?)(?P=quote)(?:;|$)', type_): - if mobj.group('key') == 'codecs': - codecs = mobj.group('val') - break - if codecs: - dct.update(parse_codecs(codecs)) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': - dct['downloader_options'] = 
{ - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - formats.append(dct) - else: - manifest_url = ( - url_or_none(try_get( - player_response, - lambda x: x['streamingData']['hlsManifestUrl'], - compat_str)) - or url_or_none(try_get( - video_info, lambda x: x['hlsvp'][0], compat_str))) - if manifest_url: - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) - else: - error_message = extract_unavailable_message() - if not error_message: - reason_list = try_get( - player_response, - lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'], - list) or [] - for reason in reason_list: - if not isinstance(reason, dict): - continue - reason_text = try_get(reason, lambda x: x['text'], compat_str) - if reason_text: - if not error_message: - error_message = '' - error_message += reason_text - if error_message: - error_message = clean_html(error_message) - if not error_message: - error_message = clean_html(try_get( - player_response, lambda x: x['playabilityStatus']['reason'], - compat_str)) - if not error_message: - error_message = clean_html( - try_get(video_info, lambda x: x['reason'][0], compat_str)) - if error_message: - raise ExtractorError(error_message, expected=True) - raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') - - # uploader - video_uploader = try_get( - video_info, lambda x: x['author'][0], - 
compat_str) or str_or_none(video_details.get('author')) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) - if owner_profile_url: - video_uploader_id = self._search_regex( - r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id', - default=None) - video_uploader_url = owner_profile_url - - channel_id = ( - str_or_none(video_details.get('channelId')) - or self._html_search_meta( - 'channelId', video_webpage, 'channel id', default=None) - or self._search_regex( - r'data-channel-external-id=(["\'])(?P(?:(?!\1).)+)\1', - video_webpage, 'channel id', default=None, group='id')) - channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + keywords = video_details.get('keywords') or [] + if not keywords and webpage: + keywords = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] + for keyword in keywords: + if keyword.startswith('yt:stretch='): + w, h = keyword.split('=')[1].split(':') + w, h = int(w), int(h) + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio thumbnails = [] - thumbnails_list = try_get( - video_details, lambda x: x['thumbnail']['thumbnails'], list) or [] - for t in thumbnails_list: - if not isinstance(t, dict): - continue - thumbnail_url = url_or_none(t.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(t.get('width')), - 'height': int_or_none(t.get('height')), - }) - - if not thumbnails: - video_thumbnail = 
None - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str) - if thumbnail_url: - video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url) - if video_thumbnail: - thumbnails.append({'url': video_thumbnail}) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = microformat.get('publishDate') or microformat.get('uploadDate') - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r']+class="title"[^>]*>\s*License\s*\s*]*>\s*
  • (.+?)]+class="title"[^>]*>\s*Music\s*\s* - ]*>\s* -
  • (?P.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) + for container in (video_details, microformat): + for thumbnail in (try_get( + container, + lambda x: x['thumbnail']['thumbnails'], list) or []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'height': int_or_none(thumbnail.get('height')), + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + }) + if thumbnails: + break else: - video_alt_title = video_creator = None + thumbnail = search_meta(['og:image', 'twitter:image']) + if thumbnail: + thumbnails = [{'url': thumbnail}] - def extract_meta(field): - return self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, - video_webpage, field, default=None) + category = microformat.get('category') or search_meta('genre') + channel_id = video_details.get('channelId') \ + or microformat.get('externalChannelId') \ + or search_meta('channelId') + duration = int_or_none( + video_details.get('lengthSeconds') + or microformat.get('lengthSeconds')) \ + or parse_duration(search_meta('duration')) + is_live = video_details.get('isLive') + owner_profile_url = microformat.get('ownerProfileUrl') - track = extract_meta('Song') - artist = extract_meta('Artist') - album = extract_meta('Album') + info = { + 'id': video_id, + 'title': self._live_title(video_title) if is_live else video_title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video_description, + 'upload_date': unified_strdate( + microformat.get('uploadDate') + or search_meta('uploadDate')), + 'uploader': video_details['author'], + 'uploader_id': 
self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, + 'uploader_url': owner_profile_url, + 'channel_id': channel_id, + 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None, + 'duration': duration, + 'view_count': int_or_none( + video_details.get('viewCount') + or microformat.get('viewCount') + or search_meta('interactionCount')), + 'average_rating': float_or_none(video_details.get('averageRating')), + 'age_limit': 18 if ( + microformat.get('isFamilySafe') is False + or search_meta('isFamilyFriendly') == 'false' + or search_meta('og:restrictions:age') == '18+') else 0, + 'webpage_url': webpage_url, + 'categories': [category] if category else None, + 'tags': keywords, + 'is_live': is_live, + } + + pctr = try_get( + player_response, + lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) + if pctr: + def process_language(container, base_url, caption, query): + lang_subs = [] + for fmt in self._SUBTITLE_FORMATS: + query.update({ + 'fmt': fmt, + }) + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, query), + }) + subtitles[caption['languageCode']] = lang_subs + + subtitles = {} + for caption_track in pctr['captionTracks']: + base_url = caption_track['baseUrl'] + if caption_track.get('kind') != 'asr': + lang_subs = [] + for fmt in self._SUBTITLE_FORMATS: + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, { + 'fmt': fmt, + }), + }) + subtitles[caption_track['languageCode']] = lang_subs + continue + automatic_captions = {} + for translation_language in pctr['translationLanguages']: + translation_language_code = translation_language['languageCode'] + lang_subs = [] + for fmt in self._SUBTITLE_FORMATS: + lang_subs.append({ + 'ext': fmt, + 'url': update_url_query(base_url, { + 'fmt': fmt, + 'tlang': translation_language_code, + }), + }) + automatic_captions[translation_language_code] = lang_subs + info['automatic_captions'] = 
automatic_captions + info['subtitles'] = subtitles + + parsed_url = compat_urllib_parse_urlparse(url) + for component in [parsed_url.fragment, parsed_url.query]: + query = compat_parse_qs(component) + for k, v in query.items(): + for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]: + d_k += '_time' + if d_k not in info and k in s_ks: + info[d_k] = parse_duration(query[k][0]) - # Youtube Music Auto-generated description - release_date = release_year = None if video_description: mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: - if not track: - track = mobj.group('track').strip() - if not artist: - artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) - if not album: - album = mobj.group('album'.strip()) release_year = mobj.group('release_year') release_date = mobj.group('release_date') if release_date: release_date = release_date.replace('-', '') if not release_year: - release_year = int(release_date[:4]) - if release_year: - release_year = int(release_year) + release_year = release_date[:4] + info.update({ + 'album': mobj.group('album'.strip()), + 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), + 'track': mobj.group('track').strip(), + 'release_date': release_date, + 'release_year': int(release_year), + }) - yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage) - contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] - for content in contents: - rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or [] - multiple_songs = False - for 
row in rows: - if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: - multiple_songs = True - break - for row in rows: - mrr = row.get('metadataRowRenderer') or {} - mrr_title = try_get( - mrr, lambda x: x['title']['simpleText'], compat_str) - mrr_contents = try_get( - mrr, lambda x: x['contents'][0], dict) or {} - mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str) - if not (mrr_title and mrr_contents_text): + initial_data = None + if webpage: + initial_data = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_DATA_RE, video_id, + 'yt initial data') + if not initial_data: + initial_data = self._call_api( + 'next', {'videoId': video_id}, video_id, fatal=False) + + if initial_data: + for engagment_pannel in (initial_data.get('engagementPanels') or []): + contents = try_get( + engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], + list) + if not contents: continue - if mrr_title == 'License': - video_license = mrr_contents_text - elif not multiple_songs: - if mrr_title == 'Album': - album = mrr_contents_text - elif mrr_title == 'Artist': - artist = mrr_contents_text - elif mrr_title == 'Song': - track = mrr_contents_text - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = unescapeHTML(m_episode.group('series')) - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None + def chapter_time(mmlir): + return parse_duration(mmlir.get( + get_text(mmlir.get('timeDescription')))) - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - category = None - if m_cat_container: - 
category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - if not category: - category = try_get( - microformat, lambda x: x['category'], compat_str) - video_categories = None if category is None else [category] + chapters = [] + for next_num, content in enumerate(contents, start=1): + mmlir = content.get('macroMarkersListItemRenderer') or {} + start_time = chapter_time(mmlir) + end_time = chapter_time(try_get( + contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ + if next_num < len(contents) else duration + if not (start_time and end_time): + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': get_text(mmlir.get('title')), + }) + info['chapters'] = chapters - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - if not video_tags: - video_tags = try_get(video_details, lambda x: x['keywords'], list) + contents = try_get( + initial_data, + lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], + list) or [] + for content in contents: + vpir = content.get('videoPrimaryInfoRenderer') + if vpir: + stl = vpir.get('superTitleLink') + if stl: + stl = get_text(stl) + if try_get( + vpir, + lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': + info['location'] = stl + else: + mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) + if mobj: + info.update({ + 'series': mobj.group(1), + 'season_number': int(mobj.group(2)), + 'episode_number': int(mobj.group(3)), + }) + for tlb in (try_get( + vpir, + lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], + list) or []): + tbr = tlb.get('toggleButtonRenderer') or {} + for getter, regex in [( + lambda x: x['defaultText']['accessibility']['accessibilityData'], + r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ + lambda x: x['accessibility'], + lambda x: x['accessibilityData']['accessibilityData'], + ], 
r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: + label = (try_get(tbr, getter, dict) or {}).get('label') + if label: + mobj = re.match(regex, label) + if mobj: + info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) + break + sbr_tooltip = try_get( + vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) + if sbr_tooltip: + like_count, dislike_count = sbr_tooltip.split(' / ') + info.update({ + 'like_count': str_to_int(like_count), + 'dislike_count': str_to_int(dislike_count), + }) + vsir = content.get('videoSecondaryInfoRenderer') + if vsir: + info['channel'] = get_text(try_get( + vsir, + lambda x: x['owner']['videoOwnerRenderer']['title'], + compat_str)) + rows = try_get( + vsir, + lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], + list) or [] + multiple_songs = False + for row in rows: + if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: + multiple_songs = True + break + for row in rows: + mrr = row.get('metadataRowRenderer') or {} + mrr_title = mrr.get('title') + if not mrr_title: + continue + mrr_title = get_text(mrr['title']) + mrr_contents_text = get_text(mrr['contents'][0]) + if mrr_title == 'License': + info['license'] = mrr_contents_text + elif not multiple_songs: + if mrr_title == 'Album': + info['album'] = mrr_contents_text + elif mrr_title == 'Artist': + info['artist'] = mrr_contents_text + elif mrr_title == 'Song': + info['track'] = mrr_contents_text - def _extract_count(count_name): - return str_to_int(self._search_regex( - (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name), - r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)), - video_webpage, count_name, default=None)) + for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: + v = info.get(s_k) + if v: + info[d_k] = v - like_count = _extract_count('like') - dislike_count = 
_extract_count('dislike') + self.mark_watched(video_id, player_response) - if view_count is None: - view_count = str_to_int(self._search_regex( - r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, - 'view count', default=None)) - - average_rating = ( - float_or_none(video_details.get('averageRating')) - or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = int_or_none(video_details.get('lengthSeconds')) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - xsrf_token = None - ytcfg = self._extract_ytcfg(video_id, video_webpage) - if ytcfg: - xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str) - if not xsrf_token: - xsrf_token = self._search_regex( - r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2', - video_webpage, 'xsrf token', group='xsrf_token', fatal=False) - invideo_url = try_get( - player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) - if xsrf_token and invideo_url: - xsrf_field_name = None - if ytcfg: - xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str) - if not xsrf_field_name: - xsrf_field_name = self._search_regex( - r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2', - video_webpage, 'xsrf field name', - group='xsrf_field_name', default='session_token') - video_annotations = self._download_webpage( - self._proto_relative_url(invideo_url), - video_id, note='Downloading annotations', - 
errnote='Unable to download video annotations', fatal=False, - data=urlencode_postdata({xsrf_field_name: xsrf_token})) - - chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration) - - # Look for the DASH manifest - if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd_fatal = True - for mpd_url in dash_mpds: - dash_formats = {} - try: - def decrypt_sig(mobj): - s = mobj.group(1) - dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) - return '/signature/%s' % dec_s - - mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) - - for df in self._extract_mpd_formats( - mpd_url, video_id, fatal=dash_mpd_fatal, - formats_dict=self._formats): - if not df.get('filesize'): - df['filesize'] = _extract_filesize(df.get('fragment_base_url') or df['url']) - # Do not overwrite DASH format found in some previous DASH manifest - if df['format_id'] not in dash_formats: - dash_formats[df['format_id']] = df - # Additional DASH manifests may end up in HTTP Error 403 therefore - # allow them to fail without bug report message if we already have - # some DASH manifest succeeded. This is temporary workaround to reduce - # burst of bug reports until we figure out the reason and whether it - # can be fixed at all. - dash_mpd_fatal = False - except (ExtractorError, KeyError) as e: - self.report_warning( - 'Skipping DASH manifest: %r' % e, video_id) - if dash_formats: - # Remove the formats we found through non-DASH, they - # contain less info and it can be wrong, because we use - # fixed values (for example the resolution). See - # https://github.com/ytdl-org/youtube-dl/issues/5774 for an - # example. 
- formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] - formats.extend(dash_formats.values()) - - # Check for malformed aspect ratio - stretched_m = re.search( - r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', - video_webpage) - if stretched_m: - w = float(stretched_m.group('w')) - h = float(stretched_m.group('h')) - # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). - # We will only process correct ratios. - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio - - if not formats: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise ExtractorError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): - raise ExtractorError('This video is DRM protected.', expected=True) - - self._sort_formats(formats) - - self.mark_watched(video_id, video_info, player_response) - - return { - 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'uploader_url': video_uploader_url, - 'channel_id': channel_id, - 'channel_url': channel_url, - 'upload_date': upload_date, - 'license': video_license, - 'creator': video_creator or artist, - 'title': video_title, - 'alt_title': video_alt_title or track, - 'thumbnails': thumbnails, - 'description': video_description, - 'categories': 
video_categories, - 'tags': video_tags, - 'subtitles': video_subtitles, - 'automatic_captions': automatic_captions, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, - 'chapters': chapters, - 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'average_rating': average_rating, - 'formats': formats, - 'is_live': is_live, - 'start_time': start_time, - 'end_time': end_time, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'track': track, - 'artist': artist, - 'album': album, - 'release_date': release_date, - 'release_year': release_year, - } + return info class YoutubeTabIE(YoutubeBaseInfoExtractor): From 9c724601ba234085dc5071ec9c1c3d98e6834817 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:08:50 +0100 Subject: [PATCH 55/79] [youtube] remove description chapters tests video description no longer contain yt.www.watch.player.seekTo function --- test/test_youtube_chapters.py | 275 -------------------------------- youtube_dl/extractor/youtube.py | 91 ++++++++--- 2 files changed, 67 insertions(+), 299 deletions(-) delete mode 100644 test/test_youtube_chapters.py diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py deleted file mode 100644 index e69c57377..000000000 --- a/test/test_youtube_chapters.py +++ /dev/null @@ -1,275 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -from __future__ import unicode_literals - -# Allow direct execution -import os -import sys -import unittest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from test.helper import expect_value -from youtube_dl.extractor import YoutubeIE - - -class TestYoutubeChapters(unittest.TestCase): - - _TEST_CASES = [ - ( - # https://www.youtube.com/watch?v=A22oy8dFjqc - # pattern: 00:00 - <title> - '''This is the 
absolute ULTIMATE experience of Queen's set at LIVE AID, this is the best video mixed to the absolutely superior stereo radio broadcast. This vastly superior audio mix takes a huge dump on all of the official mixes. Best viewed in 1080p. ENJOY! ***MAKE SURE TO READ THE DESCRIPTION***<br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+36);return false;">00:36</a> - Bohemian Rhapsody<br /><a href="#" onclick="yt.www.watch.player.seekTo(02*60+42);return false;">02:42</a> - Radio Ga Ga<br /><a href="#" onclick="yt.www.watch.player.seekTo(06*60+53);return false;">06:53</a> - Ay Oh!<br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+34);return false;">07:34</a> - Hammer To Fall<br /><a href="#" onclick="yt.www.watch.player.seekTo(12*60+08);return false;">12:08</a> - Crazy Little Thing Called Love<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+03);return false;">16:03</a> - We Will Rock You<br /><a href="#" onclick="yt.www.watch.player.seekTo(17*60+18);return false;">17:18</a> - We Are The Champions<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+12);return false;">21:12</a> - Is This The World We Created...?<br /><br />Short song analysis:<br /><br />- "Bohemian Rhapsody": Although it's a short medley version, it's one of the best performances of the ballad section, with Freddie nailing the Bb4s with the correct studio phrasing (for the first time ever!).<br /><br />- "Radio Ga Ga": Although it's missing one chorus, this is one of - if not the best - the best versions ever, Freddie nails all the Bb4s and sounds very clean! Spike Edney's Roland Jupiter 8 also really shines through on this mix, compared to the DVD releases!<br /><br />- "Audience Improv": A great improv, Freddie sounds strong and confident. You gotta love when he sustains that A4 for 4 seconds!<br /><br />- "Hammer To Fall": Despite missing a verse and a chorus, it's a strong version (possibly the best ever). 
Freddie sings the song amazingly, and even ad-libs a C#5 and a C5! Also notice how heavy Brian's guitar sounds compared to the thin DVD mixes - it roars!<br /><br />- "Crazy Little Thing Called Love": A great version, the crowd loves the song, the jam is great as well! Only downside to this is the slight feedback issues.<br /><br />- "We Will Rock You": Although cut down to the 1st verse and chorus, Freddie sounds strong. He nails the A4, and the solo from Dr. May is brilliant!<br /><br />- "We Are the Champions": Perhaps the high-light of the performance - Freddie is very daring on this version, he sustains the pre-chorus Bb4s, nails the 1st C5, belts great A4s, but most importantly: He nails the chorus Bb4s, in all 3 choruses! This is the only time he has ever done so! It has to be said though, the last one sounds a bit rough, but that's a side effect of belting high notes for the past 18 minutes, with nodules AND laryngitis!<br /><br />- "Is This The World We Created... ?": Freddie and Brian perform a beautiful version of this, and it is one of the best versions ever. It's both sad and hilarious that a couple of BBC engineers are talking over the song, one of them being completely oblivious of the fact that he is interrupting the performance, on live television... Which was being televised to almost 2 billion homes.<br /><br /><br />All rights go to their respective owners!<br />-----Copyright Disclaimer Under Section 107 of the Copyright Act 1976, allowance is made for fair use for purposes such as criticism, comment, news reporting, teaching, scholarship, and research. Fair use is a use permitted by copyright statute that might otherwise be infringing. 
Non-profit, educational or personal use tips the balance in favor of fair use''', - 1477, - [{ - 'start_time': 36, - 'end_time': 162, - 'title': 'Bohemian Rhapsody', - }, { - 'start_time': 162, - 'end_time': 413, - 'title': 'Radio Ga Ga', - }, { - 'start_time': 413, - 'end_time': 454, - 'title': 'Ay Oh!', - }, { - 'start_time': 454, - 'end_time': 728, - 'title': 'Hammer To Fall', - }, { - 'start_time': 728, - 'end_time': 963, - 'title': 'Crazy Little Thing Called Love', - }, { - 'start_time': 963, - 'end_time': 1038, - 'title': 'We Will Rock You', - }, { - 'start_time': 1038, - 'end_time': 1272, - 'title': 'We Are The Champions', - }, { - 'start_time': 1272, - 'end_time': 1477, - 'title': 'Is This The World We Created...?', - }] - ), - ( - # https://www.youtube.com/watch?v=ekYlRhALiRQ - # pattern: <num>. <title> 0:00 - '1. Those Beaten Paths of Confusion <a href="#" onclick="yt.www.watch.player.seekTo(0*60+00);return false;">0:00</a><br />2. Beyond the Shadows of Emptiness & Nothingness <a href="#" onclick="yt.www.watch.player.seekTo(11*60+47);return false;">11:47</a><br />3. Poison Yourself...With Thought <a href="#" onclick="yt.www.watch.player.seekTo(26*60+30);return false;">26:30</a><br />4. The Agents of Transformation <a href="#" onclick="yt.www.watch.player.seekTo(35*60+57);return false;">35:57</a><br />5. Drowning in the Pain of Consciousness <a href="#" onclick="yt.www.watch.player.seekTo(44*60+32);return false;">44:32</a><br />6. Deny the Disease of Life <a href="#" onclick="yt.www.watch.player.seekTo(53*60+07);return false;">53:07</a><br /><br />More info/Buy: http://crepusculonegro.storenvy.com/products/257645-cn-03-arizmenda-within-the-vacuum-of-infinity<br /><br />No copyright is intended. The rights to this video are assumed by the owner and its affiliates.', - 4009, - [{ - 'start_time': 0, - 'end_time': 707, - 'title': '1. Those Beaten Paths of Confusion', - }, { - 'start_time': 707, - 'end_time': 1590, - 'title': '2. 
Beyond the Shadows of Emptiness & Nothingness', - }, { - 'start_time': 1590, - 'end_time': 2157, - 'title': '3. Poison Yourself...With Thought', - }, { - 'start_time': 2157, - 'end_time': 2672, - 'title': '4. The Agents of Transformation', - }, { - 'start_time': 2672, - 'end_time': 3187, - 'title': '5. Drowning in the Pain of Consciousness', - }, { - 'start_time': 3187, - 'end_time': 4009, - 'title': '6. Deny the Disease of Life', - }] - ), - ( - # https://www.youtube.com/watch?v=WjL4pSzog9w - # pattern: 00:00 <title> - '<a href="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" class="yt-uix-servicelink " data-target-new-window="True" data-servicelink="CDAQ6TgYACITCNf1raqT2dMCFdRjGAod_o0CBSj4HQ" data-url="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" rel="nofollow noopener" target="_blank">https://arizmenda.bandcamp.com/merch/...</a><br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> Christening Unborn Deformities <br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+08);return false;">07:08</a> Taste of Purity<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+16);return false;">16:16</a> Sculpting Sins of a Universal Tongue<br /><a href="#" onclick="yt.www.watch.player.seekTo(24*60+45);return false;">24:45</a> Birth<br /><a href="#" onclick="yt.www.watch.player.seekTo(31*60+24);return false;">31:24</a> Neves<br /><a href="#" onclick="yt.www.watch.player.seekTo(37*60+55);return false;">37:55</a> Libations in Limbo', - 2705, - [{ - 'start_time': 0, - 'end_time': 428, - 'title': 'Christening Unborn Deformities', - }, { - 'start_time': 428, - 'end_time': 976, - 'title': 'Taste of Purity', - }, { - 'start_time': 976, - 'end_time': 1485, - 'title': 'Sculpting Sins of a Universal Tongue', - }, { - 'start_time': 1485, - 'end_time': 1884, - 'title': 'Birth', - }, { - 'start_time': 1884, - 'end_time': 2275, - 'title': 'Neves', - }, { - 'start_time': 2275, - 'end_time': 2705, - 
'title': 'Libations in Limbo', - }] - ), - ( - # https://www.youtube.com/watch?v=o3r1sn-t3is - # pattern: <title> 00:00 <note> - 'Download this show in MP3: <a href="http://sh.st/njZKK" class="yt-uix-servicelink " data-url="http://sh.st/njZKK" data-target-new-window="True" data-servicelink="CDAQ6TgYACITCK3j8_6o2dMCFVDCGAoduVAKKij4HQ" rel="nofollow noopener" target="_blank">http://sh.st/njZKK</a><br /><br />Setlist:<br />I-E-A-I-A-I-O <a href="#" onclick="yt.www.watch.player.seekTo(00*60+45);return false;">00:45</a><br />Suite-Pee <a href="#" onclick="yt.www.watch.player.seekTo(4*60+26);return false;">4:26</a> (Incomplete)<br />Attack <a href="#" onclick="yt.www.watch.player.seekTo(5*60+31);return false;">5:31</a> (First live performance since 2011)<br />Prison Song <a href="#" onclick="yt.www.watch.player.seekTo(8*60+42);return false;">8:42</a><br />Know <a href="#" onclick="yt.www.watch.player.seekTo(12*60+32);return false;">12:32</a> (First live performance since 2011)<br />Aerials <a href="#" onclick="yt.www.watch.player.seekTo(15*60+32);return false;">15:32</a><br />Soldier Side - Intro <a href="#" onclick="yt.www.watch.player.seekTo(19*60+13);return false;">19:13</a><br />B.Y.O.B. 
<a href="#" onclick="yt.www.watch.player.seekTo(20*60+09);return false;">20:09</a><br />Soil <a href="#" onclick="yt.www.watch.player.seekTo(24*60+32);return false;">24:32</a><br />Darts <a href="#" onclick="yt.www.watch.player.seekTo(27*60+48);return false;">27:48</a><br />Radio/Video <a href="#" onclick="yt.www.watch.player.seekTo(30*60+38);return false;">30:38</a><br />Hypnotize <a href="#" onclick="yt.www.watch.player.seekTo(35*60+05);return false;">35:05</a><br />Temper <a href="#" onclick="yt.www.watch.player.seekTo(38*60+08);return false;">38:08</a> (First live performance since 1999)<br />CUBErt <a href="#" onclick="yt.www.watch.player.seekTo(41*60+00);return false;">41:00</a><br />Needles <a href="#" onclick="yt.www.watch.player.seekTo(42*60+57);return false;">42:57</a><br />Deer Dance <a href="#" onclick="yt.www.watch.player.seekTo(46*60+27);return false;">46:27</a><br />Bounce <a href="#" onclick="yt.www.watch.player.seekTo(49*60+38);return false;">49:38</a><br />Suggestions <a href="#" onclick="yt.www.watch.player.seekTo(51*60+25);return false;">51:25</a><br />Psycho <a href="#" onclick="yt.www.watch.player.seekTo(53*60+52);return false;">53:52</a><br />Chop Suey! <a href="#" onclick="yt.www.watch.player.seekTo(58*60+13);return false;">58:13</a><br />Lonely Day <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+01*60+15);return false;">1:01:15</a><br />Question! 
<a href="#" onclick="yt.www.watch.player.seekTo(1*3600+04*60+14);return false;">1:04:14</a><br />Lost in Hollywood <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+08*60+10);return false;">1:08:10</a><br />Vicinity of Obscenity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+13*60+40);return false;">1:13:40</a>(First live performance since 2012)<br />Forest <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+16*60+17);return false;">1:16:17</a><br />Cigaro <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+20*60+02);return false;">1:20:02</a><br />Toxicity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+23*60+57);return false;">1:23:57</a>(with Chino Moreno)<br />Sugar <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+27*60+53);return false;">1:27:53</a>', - 5640, - [{ - 'start_time': 45, - 'end_time': 266, - 'title': 'I-E-A-I-A-I-O', - }, { - 'start_time': 266, - 'end_time': 331, - 'title': 'Suite-Pee (Incomplete)', - }, { - 'start_time': 331, - 'end_time': 522, - 'title': 'Attack (First live performance since 2011)', - }, { - 'start_time': 522, - 'end_time': 752, - 'title': 'Prison Song', - }, { - 'start_time': 752, - 'end_time': 932, - 'title': 'Know (First live performance since 2011)', - }, { - 'start_time': 932, - 'end_time': 1153, - 'title': 'Aerials', - }, { - 'start_time': 1153, - 'end_time': 1209, - 'title': 'Soldier Side - Intro', - }, { - 'start_time': 1209, - 'end_time': 1472, - 'title': 'B.Y.O.B.', - }, { - 'start_time': 1472, - 'end_time': 1668, - 'title': 'Soil', - }, { - 'start_time': 1668, - 'end_time': 1838, - 'title': 'Darts', - }, { - 'start_time': 1838, - 'end_time': 2105, - 'title': 'Radio/Video', - }, { - 'start_time': 2105, - 'end_time': 2288, - 'title': 'Hypnotize', - }, { - 'start_time': 2288, - 'end_time': 2460, - 'title': 'Temper (First live performance since 1999)', - }, { - 'start_time': 2460, - 'end_time': 2577, - 'title': 'CUBErt', - }, { - 'start_time': 2577, - 'end_time': 2787, - 'title': 'Needles', 
- }, { - 'start_time': 2787, - 'end_time': 2978, - 'title': 'Deer Dance', - }, { - 'start_time': 2978, - 'end_time': 3085, - 'title': 'Bounce', - }, { - 'start_time': 3085, - 'end_time': 3232, - 'title': 'Suggestions', - }, { - 'start_time': 3232, - 'end_time': 3493, - 'title': 'Psycho', - }, { - 'start_time': 3493, - 'end_time': 3675, - 'title': 'Chop Suey!', - }, { - 'start_time': 3675, - 'end_time': 3854, - 'title': 'Lonely Day', - }, { - 'start_time': 3854, - 'end_time': 4090, - 'title': 'Question!', - }, { - 'start_time': 4090, - 'end_time': 4420, - 'title': 'Lost in Hollywood', - }, { - 'start_time': 4420, - 'end_time': 4577, - 'title': 'Vicinity of Obscenity (First live performance since 2012)', - }, { - 'start_time': 4577, - 'end_time': 4802, - 'title': 'Forest', - }, { - 'start_time': 4802, - 'end_time': 5037, - 'title': 'Cigaro', - }, { - 'start_time': 5037, - 'end_time': 5273, - 'title': 'Toxicity (with Chino Moreno)', - }, { - 'start_time': 5273, - 'end_time': 5640, - 'title': 'Sugar', - }] - ), - ( - # https://www.youtube.com/watch?v=PkYLQbsqCE8 - # pattern: <num> - <title> [<latinized title>] 0:00:00 - '''Затемно (Zatemno) is an Obscure Black Metal Band from Russia.<br /><br />"Во прах (Vo prakh)'' Into The Ashes", Debut mini-album released may 6, 2016, by Death Knell Productions<br />Released on 6 panel digipak CD, limited to 100 copies only<br />And digital format on Bandcamp<br /><br />Tracklist<br /><br />1 - Во прах [Vo prakh] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;">0:00:00</a><br />2 - Искупление [Iskupleniye] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+08*60+10);return false;">0:08:10</a><br />3 - Из серпов луны...[Iz serpov luny] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+14*60+30);return false;">0:14:30</a><br /><br />Links:<br /><a href="https://deathknellprod.bandcamp.com/album/--2" class="yt-uix-servicelink " data-target-new-window="True" 
data-url="https://deathknellprod.bandcamp.com/album/--2" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://deathknellprod.bandcamp.com/a...</a><br /><a href="https://www.facebook.com/DeathKnellProd/" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://www.facebook.com/DeathKnellProd/" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://www.facebook.com/DeathKnellProd/</a><br /><br /><br />I don't have any right about this artifact, my only intention is to spread the music of the band, all rights are reserved to the Затемно (Zatemno) and his producers, Death Knell Productions.<br /><br />------------------------------------------------------------------<br /><br />Subscribe for more videos like this.<br />My link: <a href="https://web.facebook.com/AttackOfTheDragons" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://web.facebook.com/AttackOfTheDragons" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://web.facebook.com/AttackOfTheD...</a>''', - 1138, - [{ - 'start_time': 0, - 'end_time': 490, - 'title': '1 - Во прах [Vo prakh]', - }, { - 'start_time': 490, - 'end_time': 870, - 'title': '2 - Искупление [Iskupleniye]', - }, { - 'start_time': 870, - 'end_time': 1138, - 'title': '3 - Из серпов луны...[Iz serpov luny]', - }] - ), - ( - # https://www.youtube.com/watch?v=xZW70zEasOk - # time point more than duration - '''● LCS Spring finals: Saturday and Sunday from <a href="#" onclick="yt.www.watch.player.seekTo(13*60+30);return false;">13:30</a> outside the venue! 
<br />● PAX East: Fri, Sat & Sun - more info in tomorrows video on the main channel!''', - 283, - [] - ), - ] - - def test_youtube_chapters(self): - for description, duration, expected_chapters in self._TEST_CASES: - ie = YoutubeIE() - expect_value( - self, ie._extract_chapters_from_description(description, duration), - expected_chapters, None) - - -if __name__ == '__main__': - unittest.main() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5f6769878..edaca0658 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1223,6 +1223,46 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id + def _extract_chapters_from_json(self, data, video_id, duration): + chapters_list = try_get( + data, + lambda x: x['playerOverlays'] + ['playerOverlayRenderer'] + ['decoratedPlayerBarRenderer'] + ['decoratedPlayerBarRenderer'] + ['playerBar'] + ['chapteredPlayerBarRenderer'] + ['chapters'], + list) + if not chapters_list: + return + + def chapter_time(chapter): + return float_or_none( + try_get( + chapter, + lambda x: x['chapterRenderer']['timeRangeStartMillis'], + int), + scale=1000) + chapters = [] + for next_num, chapter in enumerate(chapters_list, start=1): + start_time = chapter_time(chapter) + if start_time is None: + continue + end_time = (chapter_time(chapters_list[next_num]) + if next_num < len(chapters_list) else duration) + if end_time is None: + continue + title = try_get( + chapter, lambda x: x['chapterRenderer']['title']['simpleText'], + compat_str) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': title, + }) + return chapters + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): return self._parse_json(self._search_regex( (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), @@ -1597,31 +1637,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'next', {'videoId': video_id}, video_id, fatal=False) if initial_data: - for 
engagment_pannel in (initial_data.get('engagementPanels') or []): - contents = try_get( - engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], - list) - if not contents: - continue - - def chapter_time(mmlir): - return parse_duration(mmlir.get( - get_text(mmlir.get('timeDescription')))) - - chapters = [] - for next_num, content in enumerate(contents, start=1): - mmlir = content.get('macroMarkersListItemRenderer') or {} - start_time = chapter_time(mmlir) - end_time = chapter_time(try_get( - contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ - if next_num < len(contents) else duration - if not (start_time and end_time): + chapters = self._extract_chapters_from_json( + initial_data, video_id, duration) + if not chapters: + for engagment_pannel in (initial_data.get('engagementPanels') or []): + contents = try_get( + engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'], + list) + if not contents: continue - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': get_text(mmlir.get('title')), - }) + + def chapter_time(mmlir): + return parse_duration(mmlir.get( + get_text(mmlir.get('timeDescription')))) + + for next_num, content in enumerate(contents, start=1): + mmlir = content.get('macroMarkersListItemRenderer') or {} + start_time = chapter_time(mmlir) + end_time = chapter_time(try_get( + contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ + if next_num < len(contents) else duration + if not (start_time and end_time): + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': get_text(mmlir.get('title')), + }) + if chapters: info['chapters'] = chapters contents = try_get( From b46483a6ec6a42889fc16d53afd76d147748785f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:35:07 +0100 Subject: [PATCH 56/79] 
[youtube/test_youtube_signature] fix test --- test/test_youtube_signature.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 69df30eda..b5a4d0d5f 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -86,13 +86,9 @@ class TestPlayerInfo(unittest.TestCase): ('https://www.youtube.com/yts/jsbin/player-en_US-vflaxXRn1/base.js', 'vflaxXRn1'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), - ('http://s.ytimg.com/yt/swfbin/watch_as3-vflrEm9Nq.swf', 'vflrEm9Nq'), - ('https://s.ytimg.com/yts/swfbin/player-vflenCdZL/watch_as3.swf', 'vflenCdZL'), ) for player_url, expected_player_id in PLAYER_URLS: - expected_player_type = player_url.split('.')[-1] - player_type, player_id = YoutubeIE._extract_player_info(player_url) - self.assertEqual(player_type, expected_player_type) + player_id = YoutubeIE._extract_player_info(player_url) self.assertEqual(player_id, expected_player_id) From 159a3d48dfb2b4ed77dc691433e420506c9340c3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:36:19 +0100 Subject: [PATCH 57/79] [youtube] keep _formats array for format sorting tests --- youtube_dl/extractor/youtube.py | 105 ++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index edaca0658..ed844e2a3 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1019,6 +1019,111 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'only_matching': True, }, ] + _formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 
'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 
'vcodec': 'h264', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, + + # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 
'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, + + # Dash mp4 audio + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': 
{'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + + # Dash webm audio + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, + + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 
'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + } def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) From efef4ddf51c375c3a9eb12355a61a21d69aec33f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 16:49:52 +0100 Subject: [PATCH 58/79] [youtube] fix chapter extraction fallback --- youtube_dl/extractor/youtube.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ed844e2a3..65fa777e4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1753,22 +1753,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue def chapter_time(mmlir): - return parse_duration(mmlir.get( - get_text(mmlir.get('timeDescription')))) + return parse_duration( + get_text(mmlir.get('timeDescription'))) + chapters = [] for next_num, content in enumerate(contents, start=1): mmlir = content.get('macroMarkersListItemRenderer') or {} start_time = chapter_time(mmlir) end_time = chapter_time(try_get( contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \ if next_num < len(contents) else duration - if not (start_time and end_time): + if start_time is None or end_time is None: continue chapters.append({ 'start_time': start_time, 'end_time': end_time, 'title': get_text(mmlir.get('title')), }) + if chapters: + break if chapters: info['chapters'] = chapters From 65eee5a745f705a7904709accdba47efb852cc6a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 18:12:35 +0100 Subject: [PATCH 59/79] 
[youtube] improve subtitle extraction --- youtube_dl/extractor/youtube.py | 43 ++++++++++++++------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 65fa777e4..75a007353 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1664,7 +1664,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response, lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict) if pctr: - def process_language(container, base_url, caption, query): + def process_language(container, base_url, lang_code, query): lang_subs = [] for fmt in self._SUBTITLE_FORMATS: query.update({ @@ -1674,35 +1674,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': fmt, 'url': update_url_query(base_url, query), }) - subtitles[caption['languageCode']] = lang_subs + container[lang_code] = lang_subs subtitles = {} - for caption_track in pctr['captionTracks']: - base_url = caption_track['baseUrl'] + for caption_track in (pctr.get('captionTracks') or []): + base_url = caption_track.get('baseUrl') + if not base_url: + continue if caption_track.get('kind') != 'asr': - lang_subs = [] - for fmt in self._SUBTITLE_FORMATS: - lang_subs.append({ - 'ext': fmt, - 'url': update_url_query(base_url, { - 'fmt': fmt, - }), - }) - subtitles[caption_track['languageCode']] = lang_subs + lang_code = caption_track.get('languageCode') + if not lang_code: + continue + process_language( + subtitles, base_url, lang_code, {}) continue automatic_captions = {} - for translation_language in pctr['translationLanguages']: - translation_language_code = translation_language['languageCode'] - lang_subs = [] - for fmt in self._SUBTITLE_FORMATS: - lang_subs.append({ - 'ext': fmt, - 'url': update_url_query(base_url, { - 'fmt': fmt, - 'tlang': translation_language_code, - }), - }) - automatic_captions[translation_language_code] = lang_subs + for translation_language in (pctr.get('translationLanguages') or []): 
+ translation_language_code = translation_language.get('languageCode') + if not translation_language_code: + continue + process_language( + automatic_captions, base_url, translation_language_code, + {'tlang': translation_language_code}) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles From 8fa7cc387d699899114f7430bcf61837d58557a8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 21:35:18 +0100 Subject: [PATCH 60/79] [vidio] improve metadata extraction --- youtube_dl/extractor/vidio.py | 86 ++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index b48baf00b..b1243e847 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_iso8601, + str_or_none, + strip_or_none, + try_get, +) class VidioIE(InfoExtractor): @@ -21,57 +27,63 @@ class VidioIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 149, 'like_count': int, + 'uploader': 'TWELVE Pic', + 'timestamp': 1444902800, + 'upload_date': '20151015', + 'uploader_id': 'twelvepictures', + 'channel': 'Cover Music Video', + 'channel_id': '280236', + 'view_count': int, + 'dislike_count': int, + 'comment_count': int, + 'tags': 'count:4', }, }, { 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', 'only_matching': True, }] + def _real_initialize(self): + self._api_key = self._download_json( + 'https://www.vidio.com/auth', None, data=b'')['api_key'] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') + video_id, display_id = re.match(self._VALID_URL, url).groups() + data = self._download_json( + 
'https://api.vidio.com/videos/' + video_id, display_id, headers={ + 'Content-Type': 'application/vnd.api+json', + 'X-API-KEY': self._api_key, + }) + video = data['videos'][0] + title = video['title'].strip() - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - - m3u8_url, duration, thumbnail = [None] * 3 - - clips = self._parse_json( - self._html_search_regex( - r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1', - webpage, 'video data', default='[]', group='data'), - display_id, fatal=False) - if clips: - clip = clips[0] - m3u8_url = clip.get('sources', [{}])[0].get('file') - duration = clip.get('clip_duration') - thumbnail = clip.get('image') - - m3u8_url = m3u8_url or self._search_regex( - r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'hls url', group='url') formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + data['clips'][0]['hls_url'], display_id, 'mp4', 'm3u8_native') self._sort_formats(formats) - duration = int_or_none(duration or self._search_regex( - r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage, - 'duration', fatal=False, group='duration')) - thumbnail = thumbnail or self._og_search_thumbnail(webpage) - - like_count = int_or_none(self._search_regex( - (r'<span[^>]+data-comment-vote-count=["\'](\d+)', - r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'), - webpage, 'like count', fatal=False)) + get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {} + channel = get_first('channel') + user = get_first('user') + username = user.get('username') + get_count = lambda x: int_or_none(video.get('total_' + x)) return { 'id': video_id, 'display_id': display_id, 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': thumbnail, - 'duration': duration, - 'like_count': like_count, + 'description': strip_or_none(video.get('description')), + 'thumbnail': 
video.get('image_url_medium'), + 'duration': int_or_none(video.get('duration')), + 'like_count': get_count('likes'), 'formats': formats, + 'uploader': user.get('name'), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader_id': username, + 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + 'channel': channel.get('name'), + 'channel_id': str_or_none(channel.get('id')), + 'view_count': get_count('view_count'), + 'dislike_count': get_count('dislikes'), + 'comment_count': get_count('comments'), + 'tags': video.get('tag_list'), } From c11f7cf9bd6ef239f25e7fb9c54e092ae1490e2d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 1 Feb 2021 22:35:28 +0100 Subject: [PATCH 61/79] [vidzi] remove extractor(closes #12629) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/vidzi.py | 68 ------------------------------ 2 files changed, 69 deletions(-) delete mode 100644 youtube_dl/extractor/vidzi.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ab8d6a5a5..97b0b4034 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1399,7 +1399,6 @@ from .vidme import ( VidmeUserIE, VidmeUserLikesIE, ) -from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewlift import ( ViewLiftIE, diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py deleted file mode 100644 index 42ea4952c..000000000 --- a/youtube_dl/extractor/vidzi.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - decode_packed_codes, - js_to_json, - NO_DEFAULT, - PACKED_CODES_RE, -) - - -class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' - _TESTS = [{ - 'url': 'http://vidzi.tv/cghql9yq6emu.html', - 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', - 
'info_dict': { - 'id': 'cghql9yq6emu', - 'ext': 'mp4', - 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.cc/cghql9yq6emu.html', - 'only_matching': True, - }, { - 'url': 'https://vidzi.si/rph9gztxj1et.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.nu/cghql9yq6emu.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://vidzi.tv/%s' % video_id, video_id) - title = self._html_search_regex( - r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - - codes = [webpage] - codes.extend([ - decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') - for mobj in re.finditer(PACKED_CODES_RE, webpage)]) - for num, code in enumerate(codes, 1): - jwplayer_data = self._parse_json( - self._search_regex( - r'setup\(([^)]+)\)', code, 'jwplayer data', - default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=lambda s: js_to_json( - re.sub(r'\s*\+\s*window\[.+?\]', '', s))) - if jwplayer_data: - break - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict['title'] = title - - return info_dict From 0e3a9684795c6c53546dace9e917ed11c4ae72a5 Mon Sep 17 00:00:00 2001 From: Viren Rajput <virendra.rajput567@gmail.com> Date: Mon, 1 Feb 2021 04:56:33 +0000 Subject: [PATCH 62/79] [egghead] update API domain(closes #28038) --- youtube_dl/extractor/egghead.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index df11dc206..94dd75b9b 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -12,7 +12,14 @@ from ..utils import ( ) -class EggheadCourseIE(InfoExtractor): +class 
EggheadBaseIE(InfoExtractor): + def _call_api(self, path, video_id, resource, fatal=True): + return self._download_json( + 'https://app.egghead.io/api/v1/' + path, + video_id, 'Downloading %s JSON' % resource) + + +class EggheadCourseIE(EggheadBaseIE): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)' @@ -28,10 +35,9 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - - lessons = self._download_json( - 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, - playlist_id, 'Downloading course lessons JSON') + series_path = 'series/' + playlist_id + lessons = self._call_api( + series_path + '/lessons', playlist_id, 'course lessons') entries = [] for lesson in lessons: @@ -44,9 +50,8 @@ class EggheadCourseIE(InfoExtractor): entries.append(self.url_result( lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) - course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, - playlist_id, 'Downloading course JSON', fatal=False) or {} + course = self._call_api( + series_path, playlist_id, 'course', False) or {} playlist_id = course.get('id') if playlist_id: @@ -57,7 +62,7 @@ class EggheadCourseIE(InfoExtractor): course.get('description')) -class EggheadLessonIE(InfoExtractor): +class EggheadLessonIE(EggheadBaseIE): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' @@ -74,7 +79,7 @@ class EggheadLessonIE(InfoExtractor): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['javascript', 'free'], + 'tags': ['free', 'javascript'], }, 'params': { 'skip_download': True, @@ -88,8 +93,8 @@ class EggheadLessonIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + lesson = self._call_api( + 'lessons/' + 
display_id, display_id, 'lesson') lesson_id = compat_str(lesson['id']) title = lesson['title'] From b111a64135244b73b86a1720e9a5212e726afcbf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 2 Feb 2021 19:05:37 +0100 Subject: [PATCH 63/79] [egghead] fix typo --- youtube_dl/extractor/egghead.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index 94dd75b9b..aff9b88c0 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -16,7 +16,7 @@ class EggheadBaseIE(InfoExtractor): def _call_api(self, path, video_id, resource, fatal=True): return self._download_json( 'https://app.egghead.io/api/v1/' + path, - video_id, 'Downloading %s JSON' % resource) + video_id, 'Downloading %s JSON' % resource, fatal=fatal) class EggheadCourseIE(EggheadBaseIE): @@ -79,7 +79,7 @@ class EggheadLessonIE(EggheadBaseIE): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['free', 'javascript'], + 'tags': 'count:2', }, 'params': { 'skip_download': True, From 1e2575df8714ce9056e559058a187ec0ffd2d739 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 3 Feb 2021 00:21:46 +0100 Subject: [PATCH 64/79] Credit @adrianheine for #27732 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index b507cb8df..4a6d7dacd 100644 --- a/AUTHORS +++ b/AUTHORS @@ -246,3 +246,4 @@ Enes Solak Nathan Rossi Thomas van der Berg Luca Cherubin +Adrian Heine \ No newline at end of file From 07f7aad81c47a11483a357e53380fae1ffbadea9 Mon Sep 17 00:00:00 2001 From: Guillem Vela <guillemglez@gmail.com> Date: Thu, 27 Feb 2020 22:18:47 +0100 Subject: [PATCH 65/79] [ccma] improve metadata extraction(closes #27994) - extract age_limit, alt_title, categories, series and episode_number - fix timestamp multiple subtitles extraction --- youtube_dl/extractor/ccma.py | 65 ++++++++++++++++++++++++++++++------ 1 file 
changed, 54 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 544647f92..4db51e650 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import datetime import re from .common import InfoExtractor @@ -8,8 +9,8 @@ from ..utils import ( clean_html, int_or_none, parse_duration, - parse_iso8601, parse_resolution, + try_get, url_or_none, ) @@ -24,8 +25,9 @@ class CCMAIE(InfoExtractor): 'ext': 'mp4', 'title': 'L\'espot de La Marató de TV3', 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', - 'timestamp': 1470918540, - 'upload_date': '20160811', + 'timestamp': 1478608140, + 'upload_date': '20161108', + 'age_limit': 0, } }, { 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', @@ -35,8 +37,24 @@ class CCMAIE(InfoExtractor): 'ext': 'mp3', 'title': 'El Consell de Savis analitza el derbi', 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', - 'upload_date': '20171205', - 'timestamp': 1512507300, + 'upload_date': '20170512', + 'timestamp': 1494622500, + 'vcodec': 'none', + 'categories': ['Esports'], + } + }, { + 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', + 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', + 'info_dict': { + 'id': '6031387', + 'ext': 'mp4', + 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', + 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', + 'timestamp': 1582577700, + 'upload_date': '20200224', + 'subtitles': 'mincount:4', + 'age_limit': 16, + 'series': 'Crims', } }] @@ -72,17 +90,27 @@ class CCMAIE(InfoExtractor): informacio = media['informacio'] title = informacio['titol'] - durada = informacio.get('durada', {}) + durada = informacio.get('durada') or {} duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) - timestamp = 
parse_iso8601(informacio.get('data_emissio', {}).get('utc')) + tematica = try_get(informacio, lambda x: x['tematica']['text']) + + timestamp = None + data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) + try: + timestamp = datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp() + except TypeError: + pass subtitles = {} - subtitols = media.get('subtitols', {}) - if subtitols: - sub_url = subtitols.get('url') + subtitols = media.get('subtitols') or [] + if isinstance(subtitols, dict): + subtitols = [subtitols] + for st in subtitols: + sub_url = st.get('url') if sub_url: subtitles.setdefault( - subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({ + st.get('iso') or st.get('text') or 'ca', []).append({ 'url': sub_url, }) @@ -97,6 +125,16 @@ class CCMAIE(InfoExtractor): 'height': int_or_none(imatges.get('alcada')), }] + age_limit = None + codi_etic = try_get(informacio, lambda x: x['codi_etic']['id']) + if codi_etic: + codi_etic_s = codi_etic.split('_') + if len(codi_etic_s) == 2: + if codi_etic_s[1] == 'TP': + age_limit = 0 + else: + age_limit = int_or_none(codi_etic_s[1]) + return { 'id': media_id, 'title': title, @@ -106,4 +144,9 @@ class CCMAIE(InfoExtractor): 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, + 'age_limit': age_limit, + 'alt_title': informacio.get('titol_complet'), + 'episode_number': int_or_none(informacio.get('capitol')), + 'categories': [tematica] if tematica else None, + 'series': informacio.get('programa'), } From ab25f3f43196ca56964ba34ba4674fcb2d08f69a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 3 Feb 2021 17:15:31 +0100 Subject: [PATCH 66/79] [youtube] pass embed URL to get_video_info request --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 75a007353..42b0f452c 100644 --- a/youtube_dl/extractor/youtube.py +++ 
b/youtube_dl/extractor/youtube.py @@ -1397,6 +1397,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Refetching age-gated info webpage', 'unable to download video info webpage', query={ 'video_id': video_id, + 'eurl': 'https://www.youtube.com/embed/' + video_id, }, fatal=False)), lambda x: x['player_response'][0], compat_str) or '{}', video_id) From 1b731ebcaa3ef2a1e52cf6968cf93e08d50fe0d4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 3 Feb 2021 18:13:17 +0100 Subject: [PATCH 67/79] [bravotv] add support for oxygen.com(closes #13357)(closes #22500) --- youtube_dl/extractor/bravotv.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index b9715df00..bae2aedce 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -12,7 +12,7 @@ from ..utils import ( class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', @@ -28,10 +28,13 @@ class BravoTVIE(AdobePassIE): }, { 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) settings = self._parse_json(self._search_regex( r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), @@ -53,11 +56,14 @@ class BravoTVIE(AdobePassIE): tp_path = 
release_pid = tve['release_pid'] if tve.get('entitlement') == 'auth': adobe_pass = settings.get('tve_adobe_auth', {}) + if site == 'bravotv': + site = 'bravo' resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'bravo'), + adobe_pass.get('adobePassResourceId') or site, tve['title'], release_pid, tve.get('rating')) query['auth'] = self._extract_mvpd_auth( - url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + url, release_pid, + adobe_pass.get('adobePassRequestorId') or site, resource) else: shared_playlist = settings['ls_playlist'] account_pid = shared_playlist['account_pid'] From 83031d749b11f062b9ba97023c228329e771cbd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 00:25:26 +0700 Subject: [PATCH 68/79] [pornhub:user] Add support for URLs unavailable via /videos page and improve paging (closes #27853) --- youtube_dl/extractor/pornhub.py | 56 +++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 2fcbd186f..67e3731c8 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -22,6 +22,7 @@ from ..utils import ( orderedSet, remove_quotes, str_to_int, + update_url_query, url_or_none, ) @@ -405,6 +406,10 @@ class PornHubIE(PornHubBaseIE): class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_page(self, url): + return int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see @@ -463,14 +468,27 @@ class PornHubUserIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', 'only_matching': True, + }, { + # Unavailable via /videos page, but available with direct pagination + # on pornstar page (see [1]), requires 
premium + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', + 'only_matching': True, + }, { + # Same as before, multi page + 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('id') + videos_url = '%s/videos' % mobj.group('url') + page = self._extract_page(url) + if page: + videos_url = update_url_query(videos_url, {'page': page}) return self.url_result( - '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), - video_id=user_id) + videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): @@ -488,17 +506,37 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = mobj.group('host') item_id = mobj.group('id') - page = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) + page = self._extract_page(url) + + VIDEOS = '/videos' + + def download_page(base_url, num): + note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num) + return self._download_webpage( + base_url, item_id, note, query={'page': num}) + + def is_404(e): + return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 entries = [] - for page_num in (page, ) if page is not None else itertools.count(1): + base_url = url + has_page = page is not None + first_page = page if has_page else 1 + for page_num in (first_page, ) if has_page else itertools.count(first_page): try: - webpage = self._download_webpage( - url, item_id, 'Downloading page %d' % page_num, - query={'page': page_num}) + try: + webpage = download_page(base_url, page_num) + except ExtractorError as e: + # Some sources may not be available via /videos page, + # trying to fallback to main page pagination (see [1]) + # 1. 
https://github.com/ytdl-org/youtube-dl/issues/27853 + if is_404(e) and page_num == first_page and VIDEOS in base_url: + base_url = base_url.replace(VIDEOS, '') + webpage = download_page(base_url, page_num) + else: + raise except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if is_404(e) and page_num != first_page: break raise page_entries = self._extract_entries(webpage, host) From e22ff4e35681a600ed61918beab8ed316728ec39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:09:11 +0700 Subject: [PATCH 69/79] [pornhub] Add support for authentication (closes #18797, closes #21416, closes #24294) --- youtube_dl/extractor/pornhub.py | 106 +++++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 67e3731c8..83307a233 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -23,6 +23,7 @@ from ..utils import ( remove_quotes, str_to_int, update_url_query, + urlencode_postdata, url_or_none, ) @@ -53,6 +54,66 @@ class PornHubBaseIE(InfoExtractor): return webpage, urlh + def _real_initialize(self): + self._logged_in = False + + def _login(self, host): + if self._logged_in: + return + + site = host.split('.')[0] + + # Both sites pornhub and pornhubpremium have separate accounts + # so there should be an option to provide credentials for both. + # At the same time some videos are available under the same video id + # on both sites so that we have to identify them as the same video. + # For that purpose we have to keep both in the same extractor + # but under different netrc machines. 
+ username, password = self._get_login_info(netrc_machine=site) + if username is None: + return + + login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_page = self._download_webpage( + login_url, None, 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']signOut', + r'>Sign\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + response = self._download_json( + 'https://www.%s/front/authenticate' % host, None, + 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': login_url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + if response.get('success') == '1': + self._logged_in = True + return + + message = response.get('message') + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % message, expected=True) + + raise ExtractorError('Unable to log in') + class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' @@ -164,12 +225,20 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, + }, { + # Some videos are available with the same id on both premium + # and non-premium sites (e.g. 
this and the following test) + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)', + r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -181,12 +250,7 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') - if 'premium' in host: - if not self._downloader.params.get('cookiefile'): - raise ExtractorError( - 'PornHub Premium requires authentication.' - ' You may want to use --cookies.', - expected=True) + self._login(host) self._set_cookie(host, 'age_verified', '1') @@ -427,26 +491,6 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): container)) ] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - - entries = self._extract_entries(webpage, host) - - playlist = self._parse_json( - self._search_regex( - r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, - 'playlist', default='{}'), - playlist_id, fatal=False) - title = playlist.get('title') or self._search_regex( - r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) - - return self.playlist_result( - entries, playlist_id, title, playlist.get('description')) - class PornHubUserIE(PornHubPlaylistBaseIE): _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' @@ -506,12 +550,14 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = 
mobj.group('host') item_id = mobj.group('id') + self._login(host) + page = self._extract_page(url) VIDEOS = '/videos' - def download_page(base_url, num): - note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num) + def download_page(base_url, num, fallback=False): + note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') return self._download_webpage( base_url, item_id, note, query={'page': num}) @@ -532,7 +578,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 if is_404(e) and page_num == first_page and VIDEOS in base_url: base_url = base_url.replace(VIDEOS, '') - webpage = download_page(base_url, page_num) + webpage = download_page(base_url, page_num, fallback=True) else: raise except ExtractorError as e: From 1f0910bc2742b16be8425841d5ed6a0fd96f82a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:17:45 +0700 Subject: [PATCH 70/79] [svtplay] Fix video id extraction (closes #28058) --- youtube_dl/extractor/svt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index a0b6ef4db..4acc29fce 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -255,8 +255,10 @@ class SVTPlayIE(SVTPlayBaseIE): svt_id = self._search_regex( (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), + r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), webpage, 'video id') info_dict = self._extract_by_video_id(svt_id, webpage) From 2adc0c51cdf38e039fba0ede11f65bbd9c71bde8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 
Feb 2021 04:20:09 +0700 Subject: [PATCH 71/79] [pornhub] Add placeholder netrc machine --- youtube_dl/extractor/pornhub.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 83307a233..83773aebb 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -29,6 +29,8 @@ from ..utils import ( class PornHubBaseIE(InfoExtractor): + _NETRC_MACHINE = 'pornhub' + def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) From 89c5a7d5aabd138a14c76453d79d5d66ef573bde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:36:57 +0700 Subject: [PATCH 72/79] [pornhub] Implement lazy playlist extraction --- youtube_dl/extractor/pornhub.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 83773aebb..b7631e4e1 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -547,13 +547,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): <button[^>]+\bid=["\']moreDataBtn ''', webpage) is not None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - item_id = mobj.group('id') - - self._login(host) - + def _entries(self, url, host, item_id): page = self._extract_page(url) VIDEOS = '/videos' @@ -566,7 +560,6 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): def is_404(e): return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 - entries = [] base_url = url has_page = page is not None first_page = page if has_page else 1 @@ -590,11 +583,19 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): page_entries = self._extract_entries(webpage, host) if not page_entries: break - entries.extend(page_entries) + for e in page_entries: 
+ yield e if not self._has_more(webpage): break - return self.playlist_result(orderedSet(entries), item_id) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + + return self.playlist_result(self._entries(url, host, item_id), item_id) class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): From 3c07d007ca5376719a0cfe6b9c6627b38cbd3e1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:47:30 +0700 Subject: [PATCH 73/79] [ChangeLog] Actualize [ci skip] --- ChangeLog | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/ChangeLog b/ChangeLog index 7f2e0aad1..bd753d524 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,45 @@ +version <unreleased> + +Extractors +* [pornhub] Implement lazy playlist extraction +* [svtplay] Fix video id extraction (#28058) ++ [pornhub] Add support for authentication (#18797, #21416, #24294) +* [pornhub:user] Improve paging ++ [pornhub:user] Add support for URLs unavailable via /videos page (#27853) ++ [bravotv] Add support for oxygen.com (#13357, #22500) ++ [youtube] Pass embed URL to get_video_info request +* [ccma] Improve metadata extraction (#27994) + + Extract age limit, alt title, categories, series and episode number + * Fix timestamp multiple subtitles extraction +* [egghead] Update API domain (#28038) +- [vidzi] Remove extractor (#12629) +* [vidio] Improve metadata extraction +* [youtube] Improve subtitles extraction +* [youtube] Fix chapter extraction fallback +* [youtube] Rewrite extractor + * Improve format sorting + * Remove unused code + * Fix series metadata extraction + * Fix trailer video extraction + * Improve error reporting + + Extract video location ++ [vvvvid] Add support for youtube embeds (#27825) +* [googledrive] Report download page errors (#28005) +* [vlive] Fix error message decoding for python 2 (#28004) +* [youtube] 
Improve DASH formats file size extraction +* [cda] Improve birth validation detection (#14022, #27929) ++ [awaan] Extract uploader id (#27963) ++ [medialaan] Add support DPG Media MyChannels based websites (#14871, #15597, + #16106, #16489) +* [abcnews] Fix extraction (#12394, #27920) +* [AMP] Fix upload date and timestamp extraction (#27970) +* [tv4] Relax URL regular expression (#27964) ++ [tv2] Add support for mtvuutiset.fi (#27744) +* [adn] Improve login warning reporting +* [zype] Fix uplynk id extraction (#27956) ++ [adn] Add support for authentication (#17091, #27841, #27937) + + version 2021.01.24.1 Core From cfefb7d854f87e02c971170fcfa08f3ff2cb1bfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 04:49:25 +0700 Subject: [PATCH 74/79] release 2021.02.04 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 2dde97a2c..86e48bc4e 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. 
If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24.1 + [debug] youtube-dl version 2021.02.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index c520d1ee0..fa369b744 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest 
version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 4aacd3bdc..806c7c58d 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. 
Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 91bbed506..1d1a36dda 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. 
- Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24.1 + [debug] youtube-dl version 2021.02.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index a0a2c989a..c19052a7a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.01.24.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. 
If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index bd753d524..d5d9c00a2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2021.02.04 Extractors * [pornhub] Implement lazy playlist extraction diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 13bac6e27..e1b85b1d1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -537,6 +537,7 @@ - **mtv:video** - **mtvjapan** - **mtvservices:embedded** + - **MTVUutisetArticle** - **MuenchenTV**: münchen.tv - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses @@ -1058,7 +1059,6 @@ - **vidme** - **vidme:user** - **vidme:user:likes** - - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - **viewlift** @@ -1103,6 +1103,7 @@ - **vrv** - **vrv:series** - **VShare** + - **VTM** - **VTXTV** - **vube**: Vube.com - **VuClip** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c52f1d9ca..d898525c9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.01.24.1' +__version__ = '2021.02.04' From fc88e8f0e3e66f17f787cbc1ea45c87fdc70781e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= <mail@adrianheine.de> Date: Thu, 4 Feb 2021 00:57:56 +0100 Subject: [PATCH 75/79] [azmedien] Fix extraction (#28064) --- youtube_dl/extractor/azmedien.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index b1e20def5..930266990 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,7 +47,7 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' _PARTNER_ID = '1719221' def _real_extract(self, url): From 7215691ab7cabc858b17c16928c372da3e35ec59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 13:07:43 +0700 Subject: [PATCH 76/79] [youtube] Prefer DASH formats (closes #28070) --- youtube_dl/extractor/youtube.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 42b0f452c..a3b10c094 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1549,16 +1549,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('youtube_include_dash_manifest'): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: + dash_formats = [] for f in self._extract_mpd_formats( dash_manifest_url, video_id, fatal=False): - if f['format_id'] in itags: - continue filesize = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if filesize: f['filesize'] = filesize - formats.append(f) + dash_formats.append(f) + # Until further investigation prefer DASH formats as non-DASH + # may not be available (see [1]) + # 1. 
https://github.com/ytdl-org/youtube-dl/issues/28070 + if dash_formats: + dash_formats_keys = [f['format_id'] for f in dash_formats] + formats = [f for f in formats if f['format_id'] not in dash_formats_keys] + formats.extend(dash_formats) if not formats: if streaming_data.get('licenseInfos'): From c7d407bca205d8eb248b94b611435187265b79da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 13:09:28 +0700 Subject: [PATCH 77/79] [ChangeLog] Actualize [ci skip] --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index d5d9c00a2..4392a4e6f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +version <unreleased> + +Extractors +* [youtube] Prefer DASH formats (#28070) +* [azmedien] Fix extraction (#28064) + + version 2021.02.04 Extractors From a4bdc3112bf0e925afc2e512d5f23f9097f6bc7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Feb 2021 13:11:33 +0700 Subject: [PATCH 78/79] release 2021.02.04.1 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 86e48bc4e..19b750f86 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. 
Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04 + [debug] youtube-dl version 2021.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index fa369b744..8acb80b60 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 
'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 806c7c58d..66edcf752 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. 
- Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 1d1a36dda..18203fb34 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04 + [debug] youtube-dl version 2021.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index c19052a7a..20df40cc5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2021.02.04.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. 
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 4392a4e6f..784b73d8d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2021.02.04.1 Extractors * [youtube] Prefer DASH formats (#28070) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d898525c9..425f15589 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.04' +__version__ = '2021.02.04.1' From 1641b132323b544b9ae0dad06707425eba1f926b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 4 Feb 2021 13:05:35 +0100 Subject: [PATCH 79/79] [youtube] skip OTF formats(#28070) --- youtube_dl/extractor/youtube.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3b10c094..eb5f70763 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1477,6 +1477,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): formats = [] itags = [] + itag_qualities = {} player_url = None q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) streaming_data = player_response.get('streamingData') or {} @@ -1486,6 +1487,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): continue + itag = str_or_none(fmt.get('itag')) + quality = fmt.get('quality') + if itag and quality: + itag_qualities[itag] = quality + # 
FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment + # (adding `&sq=0` to the URL) and parsing emsg box to determine the + # number of fragments that would subsequently be requested with (`&sq=N`) + if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF': + continue + fmt_url = fmt.get('url') if not fmt_url: sc = compat_parse_qs(fmt.get('signatureCipher')) @@ -1505,10 +1516,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' fmt_url += '&' + sp + '=' + signature - itag = str_or_none(fmt.get('itag')) if itag: itags.append(itag) - quality = fmt.get('quality') dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -1549,22 +1558,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('youtube_include_dash_manifest'): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: - dash_formats = [] for f in self._extract_mpd_formats( dash_manifest_url, video_id, fatal=False): + itag = f['format_id'] + if itag in itags: + continue + if itag in itag_qualities: + f['quality'] = q(itag_qualities[itag]) filesize = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if filesize: f['filesize'] = filesize - dash_formats.append(f) - # Until further investigation prefer DASH formats as non-DASH - # may not be available (see [1]) - # 1. https://github.com/ytdl-org/youtube-dl/issues/28070 - if dash_formats: - dash_formats_keys = [f['format_id'] for f in dash_formats] - formats = [f for f in formats if f['format_id'] not in dash_formats_keys] - formats.extend(dash_formats) + formats.append(f) if not formats: if streaming_data.get('licenseInfos'):