From ecbd4635522762adf93c1d1f62953bfc0dd2d714 Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Sun, 19 Sep 2021 03:03:31 +0000 Subject: [PATCH 01/13] more complete patch with subtitles --- youtube_dl/YoutubeDL.py | 2 + youtube_dl/extractor/common.py | 19 +++-- youtube_dl/extractor/francetv.py | 123 +++++++++++++++++-------------- 3 files changed, 82 insertions(+), 62 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fe30758ef..02c36fb69 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1879,6 +1879,8 @@ class YoutubeDL(object): except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return + elif sub_info.get('downloader') is not None: + sub_info.get('downloader')(self, encodeFilename(sub_filename)) else: try: sub_data = ie._request_webpage( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 797c35fd5..e63b7537c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1635,7 +1635,7 @@ class InfoExtractor(object): entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, fatal=True, live=False, data=None, headers={}, - query={}): + query={}, include_subtitles=False): res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', @@ -1650,11 +1650,11 @@ class InfoExtractor(object): return self._parse_m3u8_formats( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, - preference=preference, m3u8_id=m3u8_id, live=live) + preference=preference, m3u8_id=m3u8_id, live=live, include_subtitles=include_subtitles) def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8', preference=None, - m3u8_id=None, live=False): + m3u8_id=None, live=False, include_subtitles=False): if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access return [] @@ -1662,6 +1662,7 @@ class InfoExtractor(object): return [] formats = [] + subtitles = {} format_url = 
lambda u: ( u @@ -1696,13 +1697,19 @@ class InfoExtractor(object): groups = {} last_stream_inf = {} - def extract_media(x_media_line): + def extract_media(x_media_line, include_subtitles=False): media = parse_m3u8_attributes(x_media_line) # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') if not (media_type and group_id and name): return groups.setdefault(group_id, []).append(media) + if include_subtitles and (media_type == 'SUBTITLES'): + subtitles[media['LANGUAGE']] = [{ + 'url': format_url(media['URI']), + 'ext': media['SUBFORMAT'], + }] + return if media_type not in ('VIDEO', 'AUDIO'): return media_url = media.get('URI') @@ -1748,7 +1755,7 @@ class InfoExtractor(object): # precede EXT-X-MEDIA tags in HLS manifest such as [3]. for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-MEDIA:'): - extract_media(line) + extract_media(line, include_subtitles=include_subtitles) for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): @@ -1828,6 +1835,8 @@ class InfoExtractor(object): formats.append(http_f) last_stream_inf = {} + if include_subtitles: + return formats, subtitles return formats @staticmethod diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index e4ec2e200..41e96021a 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -20,6 +20,7 @@ from ..utils import ( urljoin, ) from .dailymotion import DailymotionIE +from ..downloader import PROTOCOL_MAP class FranceTVBaseInfoExtractor(InfoExtractor): @@ -90,17 +91,47 @@ class FranceTVIE(InfoExtractor): # Videos are identified by idDiffusion so catalogue part is optional. # However when provided, some extra formats may be returned so we pass # it if available. 
- info = self._download_json( - 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', - video_id, 'Downloading video JSON', query={ - 'idDiffusion': video_id, - 'catalogue': catalogue or '', - }) - if info.get('status') == 'NOK': + info = { + 'title': None, + 'subtitle': None, + 'image': None, + 'subtitles': {}, + 'duration': None, + 'videos': [], + 'formats': [], + } + + def update_info(name, value): + if (info[name] is None) and value: + info[name] = value + + for device_type in ['desktop', 'mobile']: + linfo = self._download_json( + 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, + video_id, 'Downloading %s video JSON' % device_type, query={ + 'device_type': device_type, + 'browser': 'chrome', + }, fatal=False) + + if linfo and linfo.get('video'): + if linfo.get('meta'): + update_info('title', linfo['meta'].get('title')) + update_info('subtitle', linfo['meta'].get('additional_title')) + update_info('image', linfo['meta'].get('image_url')) + if linfo['video'].get('url'): + if linfo['video'].get('drm'): + self._downloader.to_screen('This video source is DRM protected. 
Skipping') + else: + info['videos'].append(linfo['video']) + update_info('duration', linfo['video'].get('duration')) + + if len(info['videos']) == 0: raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, info['message']), - expected=True) + 'No video source has been found', + expected=True, + video_id=video_id) + allowed_countries = info['videos'][0].get('geoblocage') if allowed_countries: georestricted = True @@ -129,29 +160,7 @@ class FranceTVIE(InfoExtractor): is_live = None - videos = [] - - for video in (info.get('videos') or []): - if video.get('statut') != 'ONLINE': - continue - if not video.get('url'): - continue - videos.append(video) - - if not videos: - for device_type in ['desktop', 'mobile']: - fallback_info = self._download_json( - 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, - video_id, 'Downloading fallback %s video JSON' % device_type, query={ - 'device_type': device_type, - 'browser': 'chrome', - }, fatal=False) - - if fallback_info and fallback_info.get('video'): - videos.append(fallback_info['video']) - - formats = [] - for video in videos: + for video in info['videos']: video_url = video.get('url') if not video_url: continue @@ -167,56 +176,56 @@ class FranceTVIE(InfoExtractor): # See https://github.com/ytdl-org/youtube-dl/issues/3963 # m3u8 urls work fine continue - formats.extend(self._extract_f4m_formats( + info['formats'].extend(self._extract_f4m_formats( sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + format, subtitle = self._extract_m3u8_formats( sign(video_url, format_id), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False)) + fatal=False, include_subtitles=True) + info['formats'].extend(format) + for lang in subtitle: + if lang in info['subtitles']: + info['subtitles'][lang].extend(subtitle[lang]) + else: + info['subtitles'][lang] = 
subtitle[lang] elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( + info['formats'].extend(self._extract_mpd_formats( sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): - formats.append({ + info['formats'].append({ 'url': video_url, 'format_id': 'rtmp-%s' % format_id, 'ext': 'flv', }) else: if self._is_valid_url(video_url, video_id, format_id): - formats.append({ + info['formats'].append({ 'url': video_url, 'format_id': format_id, }) - self._sort_formats(formats) + self._sort_formats(info['formats']) - title = info['titre'] - subtitle = info.get('sous_titre') - if subtitle: - title += ' - %s' % subtitle - title = title.strip() - - subtitles = {} - subtitles_list = [{ - 'url': subformat['url'], - 'ext': subformat.get('format'), - } for subformat in info.get('subtitles', []) if subformat.get('url')] - if subtitles_list: - subtitles['fr'] = subtitles_list + if info['subtitle']: + info['title'] += ' - %s' % info['subtitle'] + info['title'] = info['title'].strip() + for lang, sts in info['subtitles'].items(): + for st in sts: + st['downloader'] = lambda ydl, filename: PROTOCOL_MAP['m3u8_native'](ydl, ydl.params).download(filename, st) + return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': self._live_title(info['title']) if is_live else info['title'], 'description': clean_html(info.get('synopsis')), - 'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')), - 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), + 'thumbnail': info.get('image'), + 'duration': int_or_none(info.get('duration')), 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, - 'formats': formats, - 'subtitles': subtitles, + 'formats': info['formats'], + 'subtitles': info['subtitles'], } def _real_extract(self, url): From f96eff43f40503e69ebdcbaf55d11907b0b74aee Mon Sep 17 
00:00:00 2001 From: Sacha Arnoud Date: Mon, 20 Sep 2021 01:12:41 +0000 Subject: [PATCH 02/13] Fixing test --- test/helper.py | 2 +- youtube_dl/extractor/common.py | 2 +- youtube_dl/extractor/extractors.py | 5 - youtube_dl/extractor/francetv.py | 187 ++--------------------------- youtube_dl/utils.py | 2 +- 5 files changed, 13 insertions(+), 185 deletions(-) diff --git a/test/helper.py b/test/helper.py index e62aab11e..f9623bc6b 100644 --- a/test/helper.py +++ b/test/helper.py @@ -190,7 +190,7 @@ def expect_info_dict(self, got_dict, expected_dict): expect_dict(self, got_dict, expected_dict) # Check for the presence of mandatory fields if got_dict.get('_type') not in ('playlist', 'multi_video'): - for key in ('id', 'url', 'title', 'ext'): + for key in ('id', 'webpage_url', 'title', 'ext'): self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key) # Check for mandatory fields that are automatically set by YoutubeDL for key in ['webpage_url', 'extractor', 'extractor_key']: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e63b7537c..058e224d4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1707,7 +1707,7 @@ class InfoExtractor(object): if include_subtitles and (media_type == 'SUBTITLES'): subtitles[media['LANGUAGE']] = [{ 'url': format_url(media['URI']), - 'ext': media['SUBFORMAT'], + 'ext': media.get('SUBFORMAT', 'webtt'), }] return if media_type not in ('VIDEO', 'AUDIO'): diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6e8fc3961..3e33f496b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -401,12 +401,7 @@ from .franceinter import FranceInterIE from .francetv import ( FranceTVIE, FranceTVSiteIE, - FranceTVEmbedIE, FranceTVInfoIE, - FranceTVInfoSportIE, - FranceTVJeunesseIE, - GenerationWhatIE, - CultureboxIE, ) from .freesound import FreesoundIE from .freespeech import FreespeechIE diff --git 
a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 41e96021a..079c49190 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -50,14 +50,11 @@ class FranceTVIE(InfoExtractor): _TESTS = [{ # without catalog 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0', - 'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f', + 'md5': '283491d723a14db7c4e10b887c4b475a', 'info_dict': { 'id': '162311093', 'ext': 'mp4', 'title': '13h15, le dimanche... - Les mystères de Jésus', - 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', - 'timestamp': 1502623500, - 'upload_date': '20170813', }, }, { # with catalog @@ -252,9 +249,6 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', 'ext': 'mp4', 'title': '13h15, le dimanche... - Les mystères de Jésus', - 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', - 'timestamp': 1502623500, - 'upload_date': '20170813', }, 'params': { 'skip_download': True, @@ -316,55 +310,26 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): return self._make_url_result(video_id, catalogue) -class FranceTVEmbedIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P[^&]+)' - - _TESTS = [{ - 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961', - 'info_dict': { - 'id': 'NI_983319', - 'ext': 'mp4', - 'title': 'Le Pen Reims', - 'upload_date': '20170505', - 'timestamp': 1493981780, - 'duration': 16, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [FranceTVIE.ie_key()], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - video = self._download_json( - 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id, - video_id) - - return self._make_url_result(video['video_id'], video.get('catalog')) - - class FranceTVInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' 
_VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P[^/?#&.]+)' _TESTS = [{ - 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', + 'url': 'https://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2019_3569073.html', 'info_dict': { - 'id': '84981923', + 'id': 'e49f9ff0-2177-458e-830f-a28eccf19dd1', 'ext': 'mp4', 'title': 'Soir 3', - 'upload_date': '20130826', - 'timestamp': 1377548400, 'subtitles': { - 'fr': 'mincount:2', + 'fr': 'mincount:1', }, }, 'params': { 'skip_download': True, + 'format': 'dash-video=118000+dash-audio_fre=192000', }, 'add_ie': [FranceTVIE.ie_key()], + 'expected_warnings': 'Unknown MIME type application/mp4 in DASH manifest', }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', 'only_matching': True, @@ -389,6 +354,10 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): 'uploader_id': 'x2q2ez', }, 'add_ie': ['Dailymotion'], + 'params': { + # TODO: the download currently fails (FORBIDDEN) - fix and complete the test + 'skip_download': True, + }, }, { 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', 'only_matching': True, @@ -417,139 +386,3 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): webpage, 'video id') return self._make_url_result(video_id) - - -class FranceTVInfoSportIE(FranceTVBaseInfoExtractor): - IE_NAME = 'sport.francetvinfo.fr' - _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018', - 'info_dict': { - 'id': '6e49080e-3f45-11e8-b459-000d3a2439ea', - 'ext': 'mp4', - 'title': 'Retour sur les meilleurs moments de Pyeongchang 2018', - 'timestamp': 1523639962, - 'upload_date': '20180413', - }, - 'params': { - 'skip_download': 
True, - }, - 'add_ie': [FranceTVIE.ie_key()], - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id') - return self._make_url_result(video_id, 'Sport-web') - - -class GenerationWhatIE(InfoExtractor): - IE_NAME = 'france2.fr:generation-what' - _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://generation-what.francetv.fr/portrait/video/present-arms', - 'info_dict': { - 'id': 'wtvKYUG45iw', - 'ext': 'mp4', - 'title': 'Generation What - Garde à vous - FRA', - 'uploader': 'Generation What', - 'uploader_id': 'UCHH9p1eetWCgt4kXBYCb3_w', - 'upload_date': '20160411', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Youtube'], - }, { - 'url': 'http://generation-what.francetv.fr/europe/video/present-arms', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - youtube_id = self._search_regex( - r"window\.videoURL\s*=\s*'([0-9A-Za-z_-]{11})';", - webpage, 'youtube id') - - return self.url_result(youtube_id, ie='Youtube', video_id=youtube_id) - - -class CultureboxIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?:[^/]+/)*(?P[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://culturebox.francetvinfo.fr/opera-classique/musique-classique/c-est-baroque/concerts/cantates-bwv-4-106-et-131-de-bach-par-raphael-pichon-57-268689', - 'info_dict': { - 'id': 'EV_134885', - 'ext': 'mp4', - 'title': 'Cantates BWV 4, 106 et 131 de Bach par Raphaël Pichon 5/7', - 'description': 'md5:19c44af004b88219f4daa50fa9a351d4', - 'upload_date': '20180206', - 'timestamp': 1517945220, - 'duration': 5981, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [FranceTVIE.ie_key()], - }] - - def _real_extract(self, url): - display_id = 
self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - if ">Ce live n'est plus disponible en replay<" in webpage: - raise ExtractorError( - 'Video %s is not available' % display_id, expected=True) - - video_id, catalogue = self._search_regex( - r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]', - webpage, 'video id').split('@') - - return self._make_url_result(video_id, catalogue) - - -class FranceTVJeunesseIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?(?:zouzous|ludo)\.fr/heros/(?P[^/?#&]+))' - - _TESTS = [{ - 'url': 'https://www.zouzous.fr/heros/simon', - 'info_dict': { - 'id': 'simon', - }, - 'playlist_count': 9, - }, { - 'url': 'https://www.ludo.fr/heros/ninjago', - 'info_dict': { - 'id': 'ninjago', - }, - 'playlist_count': 10, - }, { - 'url': 'https://www.zouzous.fr/heros/simon?abc', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - - playlist = self._download_json( - '%s/%s' % (mobj.group('url'), 'playlist'), playlist_id) - - if not playlist.get('count'): - raise ExtractorError( - '%s is not available' % playlist_id, expected=True) - - entries = [] - for item in playlist['items']: - identity = item.get('identity') - if identity and isinstance(identity, compat_str): - entries.append(self._make_url_result(identity)) - - return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e722eed58..ecf744041 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1830,7 +1830,7 @@ def write_json_file(obj, fn): try: with tf: - json.dump(obj, tf) + json.dump(obj, tf, default=lambda _:'') if sys.platform == 'win32': # Need to remove existing file on Windows, else os.rename raises # WindowsError or FileExistsError. 
From bfa16e8a1d7054dfd64ae61a9f76af8bff3f1de4 Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Mon, 20 Sep 2021 01:15:56 +0000 Subject: [PATCH 03/13] style guide --- youtube_dl/extractor/francetv.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 079c49190..2d12e6dfb 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_str, compat_urlparse, ) from ..utils import ( @@ -14,10 +13,8 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - parse_duration, try_get, url_or_none, - urljoin, ) from .dailymotion import DailymotionIE from ..downloader import PROTOCOL_MAP @@ -212,7 +209,7 @@ class FranceTVIE(InfoExtractor): for lang, sts in info['subtitles'].items(): for st in sts: st['downloader'] = lambda ydl, filename: PROTOCOL_MAP['m3u8_native'](ydl, ydl.params).download(filename, st) - + return { 'id': video_id, 'title': self._live_title(info['title']) if is_live else info['title'], From 633ab0c56ffcda9bf2c5d5bb598023e1f7aa86eb Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Mon, 20 Sep 2021 14:33:30 +0000 Subject: [PATCH 04/13] json serialization bugs --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 02c36fb69..c6d906f13 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1771,7 +1771,7 @@ class YoutubeDL(object): self.to_stdout(formatSeconds(info_dict['duration'])) print_mandatory('format') if self.params.get('forcejson', False): - self.to_stdout(json.dumps(info_dict)) + self.to_stdout(json.dumps(info_dict, default=lambda _:'')) def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -2076,7 +2076,7 @@ class YoutubeDL(object): raise else: if self.params.get('dump_single_json', False): - 
self.to_stdout(json.dumps(res)) + self.to_stdout(json.dumps(res, default=lambda _:'')) return self._download_retcode From 7f23f02da7b3d07243d83497f4ecbee3c2346af8 Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Mon, 20 Sep 2021 14:36:49 +0000 Subject: [PATCH 05/13] review comments --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c6d906f13..dc4005a18 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1879,7 +1879,7 @@ class YoutubeDL(object): except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return - elif sub_info.get('downloader') is not None: + elif callable(sub_info.get('downloader')): sub_info.get('downloader')(self, encodeFilename(sub_filename)) else: try: From d0774569c1eca21344c29f1694dae854a7b2cb2c Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Tue, 21 Sep 2021 15:53:53 +0000 Subject: [PATCH 06/13] update supported sites --- docs/supportedsites.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ed0d5e9d9..9ec55f01f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -210,7 +210,6 @@ - **CTV** - **CTVNews** - **cu.ntv.co.jp**: Nippon Television Network - - **Culturebox** - **CultureUnplugged** - **curiositystream** - **curiositystream:collection** @@ -307,13 +306,10 @@ - **foxnews**: Fox News and Fox Business Video - **foxnews:article** - **FoxSports** - - **france2.fr:generation-what** - **FranceCulture** - **FranceInter** - **FranceTV** - - **FranceTVEmbed** - **francetvinfo.fr** - - **FranceTVJeunesse** - **FranceTVSite** - **Freesound** - **freespeech.org** @@ -472,8 +468,6 @@ - **LinuxAcademy** - **LiTV** - **LiveJournal** - - **LiveLeak** - - **LiveLeakEmbed** - **livestream** - **livestream:original** - **LnkGo** @@ -877,7 +871,6 @@ - **SpankBangPlaylist** - **Spankwire** - **Spiegel** - 
**sport.francetvinfo.fr** - **Sport5** - **SportBox** - **SportDeutschland** From 5fb593d50a8134d23c9060979a0440f3198725f1 Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Tue, 21 Sep 2021 16:31:43 +0000 Subject: [PATCH 07/13] Moving protocol to download subtitles back to the subtitle_info and keep the download logic in YoutubeDl --- youtube_dl/YoutubeDL.py | 19 ++++++++++--------- youtube_dl/downloader/__init__.py | 3 ++- youtube_dl/extractor/common.py | 1 + youtube_dl/extractor/francetv.py | 4 ---- youtube_dl/utils.py | 2 +- 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index dc4005a18..018d585ee 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1771,7 +1771,7 @@ class YoutubeDL(object): self.to_stdout(formatSeconds(info_dict['duration'])) print_mandatory('format') if self.params.get('forcejson', False): - self.to_stdout(json.dumps(info_dict, default=lambda _:'')) + self.to_stdout(json.dumps(info_dict)) def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -1879,15 +1879,16 @@ class YoutubeDL(object): except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return - elif callable(sub_info.get('downloader')): - sub_info.get('downloader')(self, encodeFilename(sub_filename)) else: + fd = get_suitable_downloader(sub_info, self.params)(self, self.params) try: - sub_data = ie._request_webpage( - sub_info['url'], info_dict['id'], note=False).read() - with io.open(encodeFilename(sub_filename), 'wb') as subfile: - subfile.write(sub_data) - except (ExtractorError, IOError, OSError, ValueError) as err: + if self.params.get('verbose'): + self.to_screen('[debug] Invoking subtitle downloader on %r' % sub_info.get('url')) + # The FD is supposed to encodeFilename() + if not fd.download(sub_filename, sub_info): + # depending on the FD, it may catch errors and return False, or not + raise DownloadError('Subtitle download failed') + 
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error, OSError, IOError, YoutubeDLError) as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err))) continue @@ -2076,7 +2077,7 @@ class YoutubeDL(object): raise else: if self.params.get('dump_single_json', False): - self.to_stdout(json.dumps(res, default=lambda _:'')) + self.to_stdout(json.dumps(res)) return self._download_retcode diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 2e485df9d..f3200566e 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -33,7 +33,8 @@ def get_suitable_downloader(info_dict, params={}): """Get the downloader class that can handle the info dict.""" protocol = determine_protocol(info_dict) info_dict['protocol'] = protocol - + print('SACHA> ', protocol) + # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): # return FFmpegFD diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 058e224d4..e45c67f94 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1708,6 +1708,7 @@ class InfoExtractor(object): subtitles[media['LANGUAGE']] = [{ 'url': format_url(media['URI']), 'ext': media.get('SUBFORMAT', 'webtt'), + 'protocol': 'm3u8_native', }] return if media_type not in ('VIDEO', 'AUDIO'): diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 2d12e6dfb..95ef0bf6c 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -206,10 +206,6 @@ class FranceTVIE(InfoExtractor): info['title'] += ' - %s' % info['subtitle'] info['title'] = info['title'].strip() - for lang, sts in info['subtitles'].items(): - for st in sts: - st['downloader'] = lambda ydl, filename: PROTOCOL_MAP['m3u8_native'](ydl, ydl.params).download(filename, st) - return { 
'id': video_id, 'title': self._live_title(info['title']) if is_live else info['title'], diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ecf744041..ef0afd686 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1830,7 +1830,7 @@ def write_json_file(obj, fn): try: with tf: - json.dump(obj, tf, default=lambda _:'') + json.dump(obj) if sys.platform == 'win32': # Need to remove existing file on Windows, else os.rename raises # WindowsError or FileExistsError. From f02f87db8e2186ed4df4721c1e57269a3fdf44c5 Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Tue, 21 Sep 2021 16:33:20 +0000 Subject: [PATCH 08/13] json.dump bug fix --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ef0afd686..e722eed58 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1830,7 +1830,7 @@ def write_json_file(obj, fn): try: with tf: - json.dump(obj) + json.dump(obj, tf) if sys.platform == 'win32': # Need to remove existing file on Windows, else os.rename raises # WindowsError or FileExistsError. 
From 3b9dad99427a011b25565541b5b03c25dfb19c67 Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Tue, 21 Sep 2021 16:40:08 +0000 Subject: [PATCH 09/13] flake8 style fixes --- youtube_dl/YoutubeDL.py | 4 ++-- youtube_dl/downloader/__init__.py | 3 +-- youtube_dl/extractor/francetv.py | 1 - 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 018d585ee..956b869f0 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -91,6 +91,7 @@ from .utils import ( write_string, YoutubeDLCookieJar, YoutubeDLCookieProcessor, + YoutubeDLError, YoutubeDLHandler, YoutubeDLRedirectHandler, ) @@ -1862,7 +1863,6 @@ class YoutubeDL(object): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['requested_subtitles'] - ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) @@ -1887,7 +1887,7 @@ class YoutubeDL(object): # The FD is supposed to encodeFilename() if not fd.download(sub_filename, sub_info): # depending on the FD, it may catch errors and return False, or not - raise DownloadError('Subtitle download failed') + raise YoutubeDLError('Subtitle download failed') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error, OSError, IOError, YoutubeDLError) as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err))) diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index f3200566e..2e485df9d 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -33,8 +33,7 @@ def get_suitable_downloader(info_dict, params={}): """Get the downloader class that can handle the info dict.""" protocol = 
determine_protocol(info_dict) info_dict['protocol'] = protocol - print('SACHA> ', protocol) - + # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): # return FFmpegFD diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 95ef0bf6c..003a3a0dc 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -17,7 +17,6 @@ from ..utils import ( url_or_none, ) from .dailymotion import DailymotionIE -from ..downloader import PROTOCOL_MAP class FranceTVBaseInfoExtractor(InfoExtractor): From 8e8e95a4903864bfc643b5857492c531bebad671 Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Sat, 25 Sep 2021 22:24:26 +0000 Subject: [PATCH 10/13] document "protocol" field --- youtube_dl/extractor/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e45c67f94..fd1fcda67 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -250,7 +250,10 @@ class InfoExtractor(object): preference, each element is a dictionary with the "ext" entry and one of: * "data": The subtitles file contents - * "url": A URL pointing to the subtitles file + * "url": A URL pointing to the subtitles resource + With "url", a "protocol" entry (as for "formats" above) + may be provided to indicate how the URL should be + processed; by default it is a file downloaded by HTTP(S) "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions From 1b0746bab5aa73fbfcbc055fcae290b266749302 Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Sat, 25 Sep 2021 22:36:22 +0000 Subject: [PATCH 11/13] Bug fix when _extract_m3u8_formats returns False --- youtube_dl/extractor/francetv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py 
b/youtube_dl/extractor/francetv.py index 003a3a0dc..f4ba97d54 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -173,10 +173,13 @@ class FranceTVIE(InfoExtractor): sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - format, subtitle = self._extract_m3u8_formats( + res = self._extract_m3u8_formats( sign(video_url, format_id), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False, include_subtitles=True) + if not res: + continue + format, subtitle = res info['formats'].extend(format) for lang in subtitle: if lang in info['subtitles']: From a2c46586abc71594cba72ad9f539b4c41ddb86a7 Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Sat, 25 Sep 2021 22:53:41 +0000 Subject: [PATCH 12/13] set a preference for formats --- youtube_dl/extractor/francetv.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index f4ba97d54..86b4d49f7 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -202,6 +202,18 @@ class FranceTVIE(InfoExtractor): 'format_id': format_id, }) + for f in info['formats']: + preference = 50 + if f['format_id'].startswith('dash-audio_qtz=96000') or (f['format_id'].find('Description') >= 0): + preference = -1 + elif f['format_id'].startswith('hls-audio'): + preference = 10 + elif f['format_id'].startswith('dash-audio'): + preference = 20 + elif f['format_id'].startswith('dash-video'): + preference = 100 + f['preference'] = preference + self._sort_formats(info['formats']) if info['subtitle']: From 19b3af2b3b06c72ffef3af52440554bde30b1d1f Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Sat, 25 Sep 2021 23:20:03 +0000 Subject: [PATCH 13/13] fix tests --- youtube_dl/extractor/francetv.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/francetv.py 
b/youtube_dl/extractor/francetv.py index 86b4d49f7..edd2da2c8 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -46,12 +46,16 @@ class FranceTVIE(InfoExtractor): _TESTS = [{ # without catalog 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0', - 'md5': '283491d723a14db7c4e10b887c4b475a', + 'md5': '944fe929c5ed2c05f864085ec5714f98', 'info_dict': { 'id': '162311093', 'ext': 'mp4', 'title': '13h15, le dimanche... - Les mystères de Jésus', }, + 'params': { + 'format': 'bestvideo', + }, + 'expected_warnings': 'Unknown MIME type application/mp4 in DASH manifest', }, { # with catalog 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4', @@ -203,15 +207,15 @@ class FranceTVIE(InfoExtractor): }) for f in info['formats']: - preference = 50 + preference = 100 if f['format_id'].startswith('dash-audio_qtz=96000') or (f['format_id'].find('Description') >= 0): preference = -1 - elif f['format_id'].startswith('hls-audio'): - preference = 10 elif f['format_id'].startswith('dash-audio'): - preference = 20 + preference = 10 + elif f['format_id'].startswith('hls-audio'): + preference = 200 elif f['format_id'].startswith('dash-video'): - preference = 100 + preference = 50 f['preference'] = preference self._sort_formats(info['formats']) @@ -259,8 +263,10 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): }, 'params': { 'skip_download': True, + 'format': 'bestvideo', }, 'add_ie': [FranceTVIE.ie_key()], + 'expected_warnings': 'Unknown MIME type application/mp4 in DASH manifest', }, { # france3 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',