[googleplus] Remove Extractor (closes #4955) (closes #7400)

[applepodcasts] Add new extractor (#25918)
[googlepodcasts] Add new extractor
2025-07-13 15:04:14 +09:00 · 2021-01-04 01:14:26 +01:00 · 2021-01-04 01:14:26 +01:00 · 2021-01-04 01:14:26 +01:00 · 2021-01-04 01:14:26 +01:00 · 2021-01-04 01:14:26 +01:00
9 changed files with 281 additions and 76 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -21,6 +21,7 @@ from youtube_dl.utils import (
    encode_base_n,
    caesar,
    clean_html,
    clean_podcast_url,
    date_from_str,
    DateRange,
    detect_exe_version,
@ -1470,6 +1471,10 @@ Line 1
        self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
    def test_clean_podcast_url(self):
        # (URL wrapped in tracking/redirect prefixes, expected direct media URL)
        cases = (
            (
                'https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3',
                'https://traffic.megaphone.fm/HSW7835899191.mp3',
            ),
            (
                'https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3',
                'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3',
            ),
        )
        for tracked_url, direct_url in cases:
            self.assertEqual(clean_podcast_url(tracked_url), direct_url)
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/extractor/acast.py
+++ b/youtube_dl/extractor/acast.py
@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
    clean_html,
    clean_podcast_url,
    int_or_none,
    parse_iso8601,
 )
@ -17,7 +18,7 @@ class ACastBaseIE(InfoExtractor):
        info = {
            'id': episode['id'],
            'display_id': episode.get('episodeUrl'),
-            'url': episode['url'],
+            'url': clean_podcast_url(episode['url']),
            'title': title,
            'description': clean_html(episode.get('description') or episode.get('summary')),
            'thumbnail': episode.get('image'),
--- a/youtube_dl/extractor/applepodcasts.py
+++ b/youtube_dl/extractor/applepodcasts.py
@ -0,0 +1,61 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
    clean_podcast_url,
    int_or_none,
    parse_iso8601,
    try_get,
 )
 class ApplePodcastsIE(InfoExtractor):
    # Single-episode pages on podcasts.apple.com. The episode id is the
    # numeric value of the "i" query parameter; the country segment and the
    # slug/podcast-id path segments are optional (see the only_matching tests).
    _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
        'md5': 'df02e6acb11c10e844946a39e7222b08',
        'info_dict': {
            'id': '1000482637777',
            'ext': 'mp3',
            'title': '207 - Whitney Webb Returns',
            'description': 'md5:13a73bade02d2e43737751e3987e1399',
            'upload_date': '20200705',
            'timestamp': 1593921600,
            'duration': 6425,
            'series': 'The Tim Dillon Show',
        },
    }, {
        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
        'only_matching': True,
    }, {
        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
        'only_matching': True,
    }, {
        'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        episode_id = self._match_id(url)
        webpage = self._download_webpage(url, episode_id)

        # The episode data is embedded as JSON in the element whose id is
        # "shoebox-ember-data-store".
        raw_store = self._search_regex(
            r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
            webpage, 'ember data')
        store = self._parse_json(raw_store, episode_id)
        attrs = store['data']['attributes']
        descriptions = attrs.get('description') or {}

        # Podcast (series) name taken from the "included" entries of type
        # "media/podcast"; if there are several, the last one wins.
        podcast_names = [
            try_get(item, lambda x: x['attributes']['name'])
            for item in (store.get('included') or [])
            if item.get('type') == 'media/podcast']
        series = podcast_names[-1] if podcast_names else None

        return {
            'id': episode_id,
            'title': attrs['name'],
            'url': clean_podcast_url(attrs['assetUrl']),
            'description': descriptions.get('standard') or descriptions.get('short'),
            'timestamp': parse_iso8601(attrs.get('releaseDateTime')),
            'duration': int_or_none(attrs.get('durationInMilliseconds'), 1000),
            'series': series,
        }
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -55,6 +55,7 @@ from .appletrailers import (
    AppleTrailersIE,
    AppleTrailersSectionIE,
 )
 from .applepodcasts import ApplePodcastsIE
 from .archiveorg import ArchiveOrgIE
 from .arcpublishing import ArcPublishingIE
 from .arkena import ArkenaIE
@ -422,7 +423,10 @@ from .go import GoIE
 from .godtube import GodTubeIE
 from .golem import GolemIE
 from .googledrive import GoogleDriveIE
-from .googleplus import GooglePlusIE
+from .googlepodcasts import (
    GooglePodcastsIE,
    GooglePodcastsFeedIE,
 )
 from .googlesearch import GoogleSearchIE
 from .goshgay import GoshgayIE
 from .gputechconf import GPUTechConfIE
@ -463,6 +467,10 @@ from .ign import (
    OneUPIE,
    PCMagIE,
 )
 from .iheart import (
    IHeartRadioIE,
    IHeartRadioPodcastIE,
 )
 from .imdb import (
    ImdbIE,
    ImdbListIE
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@ -1,73 +0,0 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 import codecs
 from .common import InfoExtractor
 from ..utils import unified_strdate
 # Extractor for videos attached to Google Plus posts (this file is deleted by
 # the commit shown here). Extraction is a two-step scrape: the post page
 # yields the metadata and a link to a photos page, and that second page
 # yields the actual media URLs.
 class GooglePlusIE(InfoExtractor):
    IE_DESC = 'Google Plus'
    # The post id is the path segment that follows "posts/".
    _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)'
    IE_NAME = 'plus.google'
    _TEST = {
        'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH',
        'info_dict': {
            'id': 'ZButuJc6CtH',
            'ext': 'flv',
            'title': '嘆きの天使 降臨',
            'upload_date': '20120613',
            'uploader': '井上ヨシマサ',
        }
    }
    def _real_extract(self, url):
        video_id = self._match_id(url)
        # Step 1: retrieve the post webpage to extract the metadata
        webpage = self._download_webpage(url, video_id, 'Downloading entry webpage')
        # The title is the first line of the description returned by
        # _og_search_description.
        title = self._og_search_description(webpage).splitlines()[0]
        # Upload date (YYYY-MM-DD inside a hidden <a>); optional, since the
        # search is non-fatal. NOTE(review): the pattern already starts with
        # (?x), so flags=re.VERBOSE looks redundant.
        upload_date = unified_strdate(self._html_search_regex(
            r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>
                    ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''',
            webpage, 'upload date', fatal=False, flags=re.VERBOSE))
        # Uploader is the text of the rel="author" link; also optional.
        uploader = self._html_search_regex(
            r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False)
        # Step 2: simulate clicking the image box to launch the video
        DOMAIN = 'https://plus.google.com/'
        # The photos link may be relative or absolute; this search is fatal
        # by default, so a post without such a link aborts extraction.
        video_page = self._search_regex(
            r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
            webpage, 'video page URL')
        if not video_page.startswith(DOMAIN):
            video_page = DOMAIN + video_page
        # From here on, ``webpage`` holds the video page, not the post page.
        webpage = self._download_webpage(video_page, video_id, 'Downloading video page')
        # Decode literal \uXXXX escape sequences found in the scraped URLs.
        def unicode_escape(s):
            decoder = codecs.getdecoder('unicode_escape')
            return re.sub(
                r'\\u[0-9a-fA-F]{4,}',
                lambda m: decoder(m.group(0))[0],
                s)
        # Extract video links for all sizes: each match is a
        # (width, height, URL) triple; the extension is hard-coded to flv.
        formats = [{
            'url': unicode_escape(video_url),
            'ext': 'flv',
            'width': int(width),
            'height': int(height),
        } for width, height, video_url in re.findall(
            r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent\.com.*?)"', webpage)]
        self._sort_formats(formats)
        return {
            'id': video_id,
            'title': title,
            'uploader': uploader,
            'upload_date': upload_date,
            'formats': formats,
        }
--- a/youtube_dl/extractor/googlepodcasts.py
+++ b/youtube_dl/extractor/googlepodcasts.py
@ -0,0 +1,88 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import json
 import re
 from .common import InfoExtractor
 from ..utils import (
    clean_podcast_url,
    int_or_none,
    try_get,
    urlencode_postdata,
 )
 class GooglePodcastsBaseIE(InfoExtractor):
    # Shared helpers for the Google Podcasts extractors: the common URL
    # prefix, the "batchexecute" RPC call and the episode array parser.
    _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/'

    def _batch_execute(self, func_id, video_id, params):
        # The request is a nested JSON array sent in the "f.req" form field;
        # the parameters themselves are JSON-encoded a second time.
        rpc_request = [[[func_id, json.dumps(params), None, '1']]]
        post_data = urlencode_postdata({'f.req': json.dumps(rpc_request)})

        def extract_json_array(raw_response):
            # Keep only the outermost [...] part of the response body.
            return self._search_regex(r'(?s)(\[.+\])', raw_response, 'data')

        envelope = self._download_json(
            'https://podcasts.google.com/_/PodcastsUi/data/batchexecute',
            video_id, data=post_data, transform_source=extract_json_array)
        # The payload is itself a JSON string stored at [0][2].
        return json.loads(envelope[0][2])

    def _extract_episode(self, episode):
        # ``episode`` is a positional array; every field is read by index.
        # Only the creator (index 14) is treated as optional.
        episode_id = episode[4][3]
        title = episode[8]
        media_url = clean_podcast_url(episode[13])
        return {
            'id': episode_id,
            'title': title,
            'url': media_url,
            'thumbnail': episode[2],
            'description': episode[9],
            'creator': try_get(episode, lambda x: x[14]),
            'timestamp': int_or_none(episode[11]),
            'duration': int_or_none(episode[12]),
            'series': episode[1],
        }
 class GooglePodcastsIE(GooglePodcastsBaseIE):
    # Single episode: /feed/<feed_url>/episode/<id>
    IE_NAME = 'google:podcasts'
    _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)'
    _TEST = {
        'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh',
        'md5': 'fa56b2ee8bd0703e27e42d4b104c4766',
        'info_dict': {
            'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a',
            'ext': 'mp3',
            'title': 'WWDTM New Year 2021',
            'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.',
            'upload_date': '20210102',
            'timestamp': 1609606800,
            'duration': 2901,
            'series': "Wait Wait... Don't Tell Me!",
        },
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        b64_feed_url = mobj.group('feed_url')
        b64_guid = mobj.group('id')
        # The episode array sits at index 1 of the 'oNjqVe' RPC result.
        response = self._batch_execute(
            'oNjqVe', b64_guid, [b64_feed_url, b64_guid])
        return self._extract_episode(response[1])
 class GooglePodcastsFeedIE(GooglePodcastsBaseIE):
    # Whole feed: /feed/<id> with nothing but an optional "/", query or
    # fragment after it; returned as a playlist of its episodes.
    IE_NAME = 'google:podcasts:feed'
    _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)'
    _TEST = {
        'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA',
        'info_dict': {
            'title': "Wait Wait... Don't Tell Me!",
            'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.",
        },
        'playlist_mincount': 20,
    }

    def _real_extract(self, url):
        b64_feed_url = self._match_id(url)
        data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url])

        # Episodes are read from data[1][0]; a missing list yields no entries.
        episodes = try_get(data, lambda x: x[1][0]) or []
        entries = [self._extract_episode(episode) for episode in episodes]

        # Feed metadata is read from data[3]: title at index 0, description at 2.
        feed = try_get(data, lambda x: x[3]) or []
        playlist_title = try_get(feed, lambda x: x[0])
        playlist_description = try_get(feed, lambda x: x[2])
        return self.playlist_result(
            entries, playlist_title=playlist_title,
            playlist_description=playlist_description)
--- a/youtube_dl/extractor/iheart.py
+++ b/youtube_dl/extractor/iheart.py
@ -0,0 +1,97 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
    clean_podcast_url,
    int_or_none,
    str_or_none,
 )
 class IHeartRadioBaseIE(InfoExtractor):
    # Shared helpers for the iHeartRadio extractors.

    def _call_api(self, path, video_id, fatal=True, query=None):
        # Every request goes to the v3 podcast API; ``path`` selects the resource.
        api_url = 'https://api.iheart.com/api/v3/podcast/' + path
        return self._download_json(
            api_url, video_id, fatal=fatal, query=query)

    def _extract_episode(self, episode):
        # Metadata fields common to a single episode and a playlist entry;
        # callers add 'id', 'title' and 'url' themselves.
        thumbnail = episode.get('imageUrl')
        description = episode.get('description')
        # startDate is passed with a scale of 1000 (presumably milliseconds
        # converted to seconds; confirm against the API).
        timestamp = int_or_none(episode.get('startDate'), 1000)
        duration = int_or_none(episode.get('duration'))
        return {
            'thumbnail': thumbnail,
            'description': description,
            'timestamp': timestamp,
            'duration': duration,
        }
 class IHeartRadioIE(IHeartRadioBaseIE):
    # Single iHeartRadio podcast episode. Besides regular episode pages, the
    # URL pattern accepts the internal "iheartradio:<id>" form that
    # IHeartRadioPodcastIE emits for its playlist entries.
    #
    # The attribute must be spelled IE_NAME (as in IHeartRadioPodcastIE);
    # the previous "IENAME" spelling was never read, so the extractor was
    # reported under its class-derived default name.
    IE_NAME = 'iheartradio'
    _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P<display_id>[^/?&#]+)-|iheartradio:)(?P<id>\d+)'
    _TEST = {
        'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true',
        'md5': 'c8609c92c8688dcb69d8541042b8abca',
        'info_dict': {
            'id': '70346499',
            'ext': 'mp3',
            'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus',
            'description': 'md5:66480b2d25ec93a5f60c0faa3275ce5c',
            'timestamp': 1597741200,
            'upload_date': '20200818',
        }
    }

    def _real_extract(self, url):
        episode_id = self._match_id(url)
        # Fatal API call: a missing 'episode' key raises rather than yielding
        # an incomplete result.
        episode = self._call_api(
            'episodes/' + episode_id, episode_id)['episode']
        info = self._extract_episode(episode)
        # No debug output here: writing the media URL to stdout would corrupt
        # machine-readable output modes.
        info.update({
            'id': episode_id,
            'title': episode['title'],
            # Strip tracking/redirect prefixes to get the direct media URL.
            'url': clean_podcast_url(episode['mediaUrl']),
        })
        return info
 class IHeartRadioPodcastIE(IHeartRadioBaseIE):
    # Whole podcast on iheart.com / iheartpodcastnetwork.com, returned as a
    # playlist whose entries are delegated to IHeartRadioIE.
    IE_NAME = 'iheartradio:podcast'
    _VALID_URL = r'https?://(?:www\.)?iheart(?:podcastnetwork)?\.com/podcast/[^/?&#]+-(?P<id>\d+)/?(?:[?#&]|$)'
    _TESTS = [{
        'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/',
        'info_dict': {
            'id': '30717896',
            'title': 'It Could Happen Here',
            'description': 'md5:5842117412a967eb0b01f8088eb663e2',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://www.iheartpodcastnetwork.com/podcast/105-stuff-you-should-know-26940277',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        podcast_id = self._match_id(url)
        podcast_path = 'podcasts/' + podcast_id

        # A very large limit is requested so that all episodes come back in
        # a single response.
        episode_list = self._call_api(
            podcast_path + '/episodes', podcast_id,
            query={'limit': 1000000000})['data']

        entries = []
        for episode in episode_list:
            episode_id = str_or_none(episode.get('id'))
            if episode_id:
                # Entries without an id are dropped; the rest become
                # url-type results pointing at the internal
                # "iheartradio:<id>" URL.
                entry = self._extract_episode(episode)
                entry.update({
                    '_type': 'url',
                    'id': episode_id,
                    'title': episode.get('title'),
                    'url': 'iheartradio:' + episode_id,
                    'ie_key': IHeartRadioIE.ie_key(),
                })
                entries.append(entry)

        # Podcast metadata is optional: the call is non-fatal and falls back
        # to an empty dict.
        podcast = self._call_api(podcast_path, podcast_id, False) or {}
        playlist_title = podcast.get('title')
        playlist_description = podcast.get('description')
        return self.playlist_result(
            entries, podcast_id, playlist_title, playlist_description)
--- a/youtube_dl/extractor/stitcher.py
+++ b/youtube_dl/extractor/stitcher.py
@ -4,6 +4,7 @@ from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
    clean_html,
    clean_podcast_url,
    ExtractorError,
    int_or_none,
    str_or_none,
@ -43,7 +44,7 @@ class StitcherBaseIE(InfoExtractor):
            'title': episode['title'].strip(),
            'description': self._extract_description(episode),
            'duration': int_or_none(episode.get('duration')),
-            'url': audio_url,
+            'url': clean_podcast_url(audio_url),
            'vcodec': 'none',
            'timestamp': int_or_none(episode.get('date_published')),
            'season_number': int_or_none(episode.get('season')),
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -5706,3 +5706,20 @@ def random_birthday(year_field, month_field, day_field):
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }
 def clean_podcast_url(url):
    """Return url with known podcast tracking/redirect prefixes removed.

    Every occurrence of a recognised analytics prefix (host plus its path
    component and the trailing slash) is deleted, so chained prefixes are
    all stripped and the direct media URL remains. URLs without such a
    prefix are returned unchanged.
    """
    tracking_prefix_re = re.compile(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/''')
    return tracking_prefix_re.sub('', url)
Author	SHA1	Message	Date
Remita Amine	964a8eb754	[googleplus] Remove Extractor(closes #4955 )(closes #7400 )	2021-01-04 01:14:26 +01:00
Remita Amine	ac61f2e058	[applepodcasts] Add new extractor(#25918 )	2021-01-04 01:14:26 +01:00
Remita Amine	8487e8b98a	[googlepodcasts] Add new extractor	2021-01-04 01:14:26 +01:00
Remita Amine	9c484c0019	[iheart] Add new extractor for iHeartRadio(#27037 )	2021-01-04 01:14:26 +01:00
Remita Amine	0e96b4b5ce	[acast] clean podcast URLs	2021-01-04 01:14:26 +01:00
Remita Amine	a563c97c5c	[stitcher] clean podcast URLs	2021-01-04 01:14:25 +01:00
Remita Amine	e88c9ef62a	[utils] add a function to clean podcast URLs	2021-01-04 01:14:25 +01:00