From ecbd4635522762adf93c1d1f62953bfc0dd2d714 Mon Sep 17 00:00:00 2001 From: Sacha Arnoud Date: Sun, 19 Sep 2021 03:03:31 +0000 Subject: [PATCH] more complete patch with subtitles --- youtube_dl/YoutubeDL.py | 2 + youtube_dl/extractor/common.py | 19 +++-- youtube_dl/extractor/francetv.py | 123 +++++++++++++++++-------------- 3 files changed, 82 insertions(+), 62 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fe30758ef..02c36fb69 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1879,6 +1879,8 @@ class YoutubeDL(object): except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return + elif sub_info.get('downloader') is not None: + sub_info.get('downloader')(self, encodeFilename(sub_filename)) else: try: sub_data = ie._request_webpage( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 797c35fd5..e63b7537c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1635,7 +1635,7 @@ class InfoExtractor(object): entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, fatal=True, live=False, data=None, headers={}, - query={}): + query={}, include_subtitles=False): res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', @@ -1650,11 +1650,11 @@ class InfoExtractor(object): return self._parse_m3u8_formats( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, - preference=preference, m3u8_id=m3u8_id, live=live) + preference=preference, m3u8_id=m3u8_id, live=live, include_subtitles=include_subtitles) def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8', preference=None, - m3u8_id=None, live=False): + m3u8_id=None, live=False, include_subtitles=False): if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access return [] @@ -1662,6 +1662,7 @@ class InfoExtractor(object): return [] formats = [] + subtitles = {} format_url = lambda u: ( u @@ -1696,13 +1697,19 @@ class InfoExtractor(object): groups = {} last_stream_inf = {} - def extract_media(x_media_line): + def extract_media(x_media_line, include_subtitles=False): media = parse_m3u8_attributes(x_media_line) # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') if not (media_type and group_id and name): return groups.setdefault(group_id, []).append(media) + if include_subtitles and (media_type == 'SUBTITLES'): + subtitles[media['LANGUAGE']] = [{ + 'url': format_url(media['URI']), + 'ext': media['SUBFORMAT'], + }] + return if media_type not in ('VIDEO', 'AUDIO'): return media_url = media.get('URI') @@ -1748,7 +1755,7 @@ class InfoExtractor(object): # precede EXT-X-MEDIA tags in HLS manifest such as [3]. for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-MEDIA:'): - extract_media(line) + extract_media(line, include_subtitles=include_subtitles) for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): @@ -1828,6 +1835,8 @@ class InfoExtractor(object): formats.append(http_f) last_stream_inf = {} + if include_subtitles: + return formats, subtitles return formats @staticmethod diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index e4ec2e200..41e96021a 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -20,6 +20,7 @@ from ..utils import ( urljoin, ) from .dailymotion import DailymotionIE +from ..downloader import PROTOCOL_MAP class FranceTVBaseInfoExtractor(InfoExtractor): @@ -90,17 +91,47 @@ class FranceTVIE(InfoExtractor): # Videos are identified by idDiffusion so catalogue part is optional. # However when provided, some extra formats may be returned so we pass # it if available. - info = self._download_json( - 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', - video_id, 'Downloading video JSON', query={ - 'idDiffusion': video_id, - 'catalogue': catalogue or '', - }) - if info.get('status') == 'NOK': + info = { + 'title': None, + 'subtitle': None, + 'image': None, + 'subtitles': {}, + 'duration': None, + 'videos': [], + 'formats': [], + } + + def update_info(name, value): + if (info[name] is None) and value: + info[name] = value + + for device_type in ['desktop', 'mobile']: + linfo = self._download_json( + 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, + video_id, 'Downloading %s video JSON' % device_type, query={ + 'device_type': device_type, + 'browser': 'chrome', + }, fatal=False) + + if linfo and linfo.get('video'): + if linfo.get('meta'): + update_info('title', linfo['meta'].get('title')) + update_info('subtitle', linfo['meta'].get('additional_title')) + update_info('image', linfo['meta'].get('image_url')) + if linfo['video'].get('url'): + if linfo['video'].get('drm'): + self._downloader.to_screen('This video source is DRM protected. Skipping') + else: + info['videos'].append(linfo['video']) + update_info('duration', linfo['video'].get('duration')) + + if len(info['videos']) == 0: raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, info['message']), - expected=True) + 'No video source has been found', + expected=True, + video_id=video_id) + allowed_countries = info['videos'][0].get('geoblocage') if allowed_countries: georestricted = True @@ -129,29 +160,7 @@ class FranceTVIE(InfoExtractor): is_live = None - videos = [] - - for video in (info.get('videos') or []): - if video.get('statut') != 'ONLINE': - continue - if not video.get('url'): - continue - videos.append(video) - - if not videos: - for device_type in ['desktop', 'mobile']: - fallback_info = self._download_json( - 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, - video_id, 'Downloading fallback %s video JSON' % device_type, query={ - 'device_type': device_type, - 'browser': 'chrome', - }, fatal=False) - - if fallback_info and fallback_info.get('video'): - videos.append(fallback_info['video']) - - formats = [] - for video in videos: + for video in info['videos']: video_url = video.get('url') if not video_url: continue @@ -167,56 +176,56 @@ class FranceTVIE(InfoExtractor): # See https://github.com/ytdl-org/youtube-dl/issues/3963 # m3u8 urls work fine continue - formats.extend(self._extract_f4m_formats( + info['formats'].extend(self._extract_f4m_formats( sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + format, subtitle = self._extract_m3u8_formats( sign(video_url, format_id), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False)) + fatal=False, include_subtitles=True) + info['formats'].extend(format) + for lang in subtitle: + if lang in info['subtitles']: + info['subtitles'][lang].extend(subtitle[lang]) + else: + info['subtitles'][lang] = subtitle[lang] elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( + info['formats'].extend(self._extract_mpd_formats( sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): - formats.append({ + info['formats'].append({ 'url': video_url, 'format_id': 'rtmp-%s' % format_id, 'ext': 'flv', }) else: if self._is_valid_url(video_url, video_id, format_id): - formats.append({ + info['formats'].append({ 'url': video_url, 'format_id': format_id, }) - self._sort_formats(formats) + self._sort_formats(info['formats']) - title = info['titre'] - subtitle = info.get('sous_titre') - if subtitle: - title += ' - %s' % subtitle - title = title.strip() - - subtitles = {} - subtitles_list = [{ - 'url': subformat['url'], - 'ext': subformat.get('format'), - } for subformat in info.get('subtitles', []) if subformat.get('url')] - if subtitles_list: - subtitles['fr'] = subtitles_list + if info['subtitle']: + info['title'] += ' - %s' % info['subtitle'] + info['title'] = info['title'].strip() + for lang, sts in info['subtitles'].items(): + for st in sts: + st['downloader'] = lambda ydl, filename: PROTOCOL_MAP['m3u8_native'](ydl, ydl.params).download(filename, st) + return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': self._live_title(info['title']) if is_live else info['title'], 'description': clean_html(info.get('synopsis')), - 'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')), - 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), + 'thumbnail': info.get('image'), + 'duration': int_or_none(info.get('duration')), 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, - 'formats': formats, - 'subtitles': subtitles, + 'formats': info['formats'], + 'subtitles': info['subtitles'], } def _real_extract(self, url):