From 8522bcd97c4173407261a3fa0283dd7800c39601 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Sun, 3 Jan 2021 12:12:06 +0100
Subject: [PATCH] [stitcher] Add support for shows and show metadata extraction(closes #20510)

---
 youtube_dl/extractor/extractors.py |   5 +-
 youtube_dl/extractor/stitcher.py   | 120 +++++++++++++++++++++--------
 2 files changed, 92 insertions(+), 33 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 74743a449..d1e1e9a60 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1092,7 +1092,10 @@ from .spike import (
     BellatorIE,
     ParamountNetworkIE,
 )
-from .stitcher import StitcherIE
+from .stitcher import (
+    StitcherIE,
+    StitcherShowIE,
+)
 from .sport5 import Sport5IE
 from .sportbox import SportBoxIE
 from .sportdeutschland import SportDeutschlandIE
diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py
index b8b5711b1..3dd0d3b5f 100644
--- a/youtube_dl/extractor/stitcher.py
+++ b/youtube_dl/extractor/stitcher.py
@@ -1,19 +1,60 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     clean_html,
     ExtractorError,
     int_or_none,
     str_or_none,
     try_get,
+    url_or_none,
 )
 
 
-class StitcherIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
+class StitcherBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/'
+
+    def _call_api(self, path, video_id, query):
+        resp = self._download_json(
+            'https://api.prod.stitcher.com/' + path,
+            video_id, query=query)
+        error_massage = try_get(resp, lambda x: x['errors'][0]['message'])
+        if error_massage:
+            raise ExtractorError(error_massage, expected=True)
+        return resp['data']
+
+    def _extract_description(self, data):
+        return clean_html(data.get('html_description') or data.get('description'))
+
+    def _extract_audio_url(self, episode):
+        return url_or_none(episode.get('audio_url') or episode.get('guid'))
+
+    def _extract_show_info(self, show):
+        return {
+            'thumbnail': show.get('image_base_url'),
+            'series': show.get('title'),
+        }
+
+    def _extract_episode(self, episode, audio_url, show_info):
+        info = {
+            'id': compat_str(episode['id']),
+            'display_id': episode.get('slug'),
+            'title': episode['title'].strip(),
+            'description': self._extract_description(episode),
+            'duration': int_or_none(episode.get('duration')),
+            'url': audio_url,
+            'vcodec': 'none',
+            'timestamp': int_or_none(episode.get('date_published')),
+            'season_number': int_or_none(episode.get('season')),
+            'season_id': str_or_none(episode.get('season_id')),
+        }
+        info.update(show_info)
+        return info
+
+
+class StitcherIE(StitcherBaseIE):
+    _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
         'md5': 'e9635098e0da10b21a0e2b85585530f6',
@@ -24,8 +65,9 @@ class StitcherIE(InfoExtractor):
             'description': 'md5:547adb4081864be114ae3831b4c2b42f',
             'duration': 1604,
             'thumbnail': r're:^https?://.*\.jpg',
-            'upload_date': '20180126',
-            'timestamp': 1516989316,
+            'upload_date': '20151008',
+            'timestamp': 1444285800,
+            'series': 'Talking Machines',
         },
     }, {
         'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
@@ -55,33 +97,47 @@ class StitcherIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        display_id, audio_id = re.match(self._VALID_URL, url).groups()
+        audio_id = self._match_id(url)
+        data = self._call_api(
+            'shows/episodes', audio_id, {'episode_ids': audio_id})
+        episode = data['episodes'][0]
+        audio_url = self._extract_audio_url(episode)
+        if not audio_url:
+            self.raise_login_required()
+        show = try_get(data, lambda x: x['shows'][0], dict) or {}
+        return self._extract_episode(
+            episode, audio_url, self._extract_show_info(show))
 
-        resp = self._download_json(
-            'https://api.prod.stitcher.com/episode/' + audio_id,
-            display_id or audio_id)
-        episode = try_get(resp, lambda x: x['data']['episodes'][0], dict)
-        if not episode:
-            raise ExtractorError(resp['errors'][0]['message'], expected=True)
 
-        title = episode['title'].strip()
-        audio_url = episode['audio_url']
+class StitcherShowIE(StitcherBaseIE):
+    _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P<id>[^/#?&]+)/?(?:[?#&]|$)'
+    _TESTS = [{
+        'url': 'http://www.stitcher.com/podcast/the-talking-machines',
+        'info_dict': {
+            'id': 'the-talking-machines',
+            'title': 'Talking Machines',
+            'description': 'md5:831f0995e40f26c10231af39cf1ebf0b',
+        },
+        'playlist_mincount': 106,
+    }, {
+        'url': 'https://www.stitcher.com/show/the-talking-machines',
+        'only_matching': True,
+    }]
 
-        thumbnail = None
-        show_id = episode.get('show_id')
-        if show_id and episode.get('classic_id') != -1:
-            thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id
+    def _real_extract(self, url):
+        show_slug = self._match_id(url)
+        data = self._call_api(
+            'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000})
+        show = try_get(data, lambda x: x['shows'][0], dict) or {}
+        show_info = self._extract_show_info(show)
 
-        return {
-            'id': audio_id,
-            'display_id': display_id,
-            'title': title,
-            'description': clean_html(episode.get('html_description') or episode.get('description')),
-            'duration': int_or_none(episode.get('duration')),
-            'thumbnail': thumbnail,
-            'url': audio_url,
-            'vcodec': 'none',
-            'timestamp': int_or_none(episode.get('date_created')),
-            'season_number': int_or_none(episode.get('season')),
-            'season_id': str_or_none(episode.get('season_id')),
-        }
+        entries = []
+        for episode in (data.get('episodes') or []):
+            audio_url = self._extract_audio_url(episode)
+            if not audio_url:
+                continue
+            entries.append(self._extract_episode(episode, audio_url, show_info))
+
+        return self.playlist_result(
+            entries, show_slug, show.get('title'),
+            self._extract_description(show))