From 700fcf273ba15b0d8f960828a25524b30fd968dc Mon Sep 17 00:00:00 2001
From: kikuyan
Date: Thu, 10 Jun 2021 23:38:48 +0900
Subject: [PATCH] [cnn] add podcasts support

---
 youtube_dl/extractor/cnn.py        | 89 +++++++++++++++++++++++++++++-
 youtube_dl/extractor/extractors.py |  1 +
 2 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 2d950fa05..0f76bbbac 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -3,8 +3,20 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_str,
+    compat_urllib_parse_urlparse,
+)
 from .turner import TurnerBaseIE
-from ..utils import url_basename
+from ..utils import (
+    clean_podcast_url,
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    try_get,
+    url_basename,
+)
 
 
 class CNNIE(TurnerBaseIE):
@@ -126,7 +138,7 @@ class CNNBlogsIE(InfoExtractor):
 
 
 class CNNArticleIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
+    _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!(?:videos?|audio/podcasts)/)'
     _TEST = {
         'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
         'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
@@ -145,3 +157,76 @@ class CNNArticleIE(InfoExtractor):
         webpage = self._download_webpage(url, url_basename(url))
         cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
         return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key())
+
+
+class CNNPodcastsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:edition|us|www)\.)?cnn\.com/audio/podcasts/'
+    _TESTS = [{
+        'url': 'https://edition.cnn.com/audio/podcasts/lincoln?episodeguid=4780950f-e269-407b-9ea1-accc01762945',
+        'info_dict': {
+            'id': '4780950f-e269-407b-9ea1-accc01762945',
+            'ext': 'mp3',
+            'title': 'Rising Star',
+            'description': 'md5:cc953c3786761333e0829608d2437aba',
+            'timestamp': 1613361600,
+            'upload_date': '20210215',
+        },
+    }, {
+        # playlist
+        'url': 'https://edition.cnn.com/audio/podcasts/lincoln',
+        'info_dict': {
+            'id': 'lincoln',
+            'title': 'Lincoln: Divided We Stand',
+            'description': 'md5:9e122d8d05d58464fc2d5346d84671df',
+        },
+        'playlist_count': 7,
+    }]
+
+    def _real_extract(self, url):
+        # An "episodeguid" query parameter selects a single episode;
+        # without it the whole show is extracted as a playlist.
+        episode_id = None
+        query = compat_urllib_parse_urlparse(url).query
+        if query:
+            episode_id = compat_parse_qs(query).get('episodeguid', [None])[0]
+        playlist_id = url_basename(url)
+        video_id = episode_id or playlist_id
+        webpage = self._download_webpage(url, video_id)
+        # Episode metadata is read from the EPISODE_DATA assignment in the page
+        episode_data = self._parse_json(self._search_regex(
+            r'EPISODE_DATA\s*=\s*(\{.+?\});', webpage, 'episode data'), video_id)
+        episodes = episode_data.get('episodes') or []
+
+        def entry_info_dict(episode):
+            # Returns None for episodes that carry no media URL
+            media_url = try_get(episode, lambda x: x['enclosure']['url'], compat_str)
+            if not media_url:
+                return None
+            description = episode.get('summary')
+            if description:
+                # remove extra note (everything from the first CRLF onwards)
+                description = description.split('\r\n', 1)[0].strip()
+            return {
+                'id': episode.get('guid'),
+                'title': episode.get('title'),
+                'url': clean_podcast_url(media_url),
+                'description': description,
+                'duration': int_or_none(episode.get('duration')),
+                'timestamp': parse_iso8601(episode.get('publishedDate')),
+            }
+
+        if episode_id:
+            info = next((
+                entry_info_dict(episode) for episode in episodes
+                if episode.get('guid') == episode_id), None)
+            if not info:
+                # Do not fall through and hand None back to the caller
+                raise ExtractorError(
+                    'Unable to extract episode %s' % episode_id)
+            return info
+
+        entries = [
+            info for info in map(entry_info_dict, episodes) if info]
+        return self.playlist_result(
+            entries, playlist_id,
+            episode_data.get('name'), episode_data.get('description'))
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 402e542ae..c5a2e9ce8 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -239,6 +239,7 @@ from .cnn import (
     CNNIE,
     CNNBlogsIE,
     CNNArticleIE,
+    CNNPodcastsIE,
 )
 from .coub import CoubIE
 from .comedycentral import (