[googleplus] Remove Extractor(closes #4955)(closes #7400)

[applepodcasts] Add new extractor(#25918)
[googlepodcasts] Add new extractor
2025-07-12 22:44:14 +09:00 · 2021-01-04 01:14:26 +01:00 · 2021-01-04 01:14:26 +01:00 · 2021-01-04 01:14:26 +01:00 · 2021-01-04 01:14:26 +01:00 · 2021-01-04 01:14:26 +01:00
9 changed files with 281 additions and 76 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -21,6 +21,7 @@ from youtube_dl.utils import (
    encode_base_n,
    caesar,
    clean_html,
+    clean_podcast_url,
    date_from_str,
    DateRange,
    detect_exe_version,
@ -1470,6 +1471,10 @@ Line 1
        self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])

+    def test_clean_podcast_url(self):
+        self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3')
+        self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3')
+

 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/extractor/acast.py
+++ b/youtube_dl/extractor/acast.py
@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
    clean_html,
+    clean_podcast_url,
    int_or_none,
    parse_iso8601,
 )
@ -17,7 +18,7 @@ class ACastBaseIE(InfoExtractor):
        info = {
            'id': episode['id'],
            'display_id': episode.get('episodeUrl'),
-            'url': episode['url'],
+            'url': clean_podcast_url(episode['url']),
            'title': title,
            'description': clean_html(episode.get('description') or episode.get('summary')),
            'thumbnail': episode.get('image'),
--- a/youtube_dl/extractor/applepodcasts.py
+++ b/youtube_dl/extractor/applepodcasts.py
@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_podcast_url,
+    int_or_none,
+    parse_iso8601,
+    try_get,
+)
+
+
+class ApplePodcastsIE(InfoExtractor):
+    _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
+        'md5': 'df02e6acb11c10e844946a39e7222b08',
+        'info_dict': {
+            'id': '1000482637777',
+            'ext': 'mp3',
+            'title': '207 - Whitney Webb Returns',
+            'description': 'md5:13a73bade02d2e43737751e3987e1399',
+            'upload_date': '20200705',
+            'timestamp': 1593921600,
+            'duration': 6425,
+            'series': 'The Tim Dillon Show',
+        }
+    }, {
+        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
+        'only_matching': True,
+    }, {
+        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
+        'only_matching': True,
+    }, {
+        'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        episode_id = self._match_id(url)
+        webpage = self._download_webpage(url, episode_id)
+        ember_data = self._parse_json(self._search_regex(
+            r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
+            webpage, 'ember data'), episode_id)
+        episode = ember_data['data']['attributes']
+        description = episode.get('description') or {}
+
+        series = None
+        for inc in (ember_data.get('included') or []):
+            if inc.get('type') == 'media/podcast':
+                series = try_get(inc, lambda x: x['attributes']['name'])
+
+        return {
+            'id': episode_id,
+            'title': episode['name'],
+            'url': clean_podcast_url(episode['assetUrl']),
+            'description': description.get('standard') or description.get('short'),
+            'timestamp': parse_iso8601(episode.get('releaseDateTime')),
+            'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
+            'series': series,
+        }
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -55,6 +55,7 @@ from .appletrailers import (
    AppleTrailersIE,
    AppleTrailersSectionIE,
 )
+from .applepodcasts import ApplePodcastsIE
 from .archiveorg import ArchiveOrgIE
 from .arcpublishing import ArcPublishingIE
 from .arkena import ArkenaIE
@ -422,7 +423,10 @@ from .go import GoIE
 from .godtube import GodTubeIE
 from .golem import GolemIE
 from .googledrive import GoogleDriveIE
-from .googleplus import GooglePlusIE
+from .googlepodcasts import (
+    GooglePodcastsIE,
+    GooglePodcastsFeedIE,
+)
 from .googlesearch import GoogleSearchIE
 from .goshgay import GoshgayIE
 from .gputechconf import GPUTechConfIE
@ -463,6 +467,10 @@ from .ign import (
    OneUPIE,
    PCMagIE,
 )
+from .iheart import (
+    IHeartRadioIE,
+    IHeartRadioPodcastIE,
+)
 from .imdb import (
    ImdbIE,
    ImdbListIE
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@ -1,73 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-import codecs
-
-from .common import InfoExtractor
-from ..utils import unified_strdate
-
-
-class GooglePlusIE(InfoExtractor):
-    IE_DESC = 'Google Plus'
-    _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)'
-    IE_NAME = 'plus.google'
-    _TEST = {
-        'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH',
-        'info_dict': {
-            'id': 'ZButuJc6CtH',
-            'ext': 'flv',
-            'title': '嘆きの天使 降臨',
-            'upload_date': '20120613',
-            'uploader': '井上ヨシマサ',
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        # Step 1, Retrieve post webpage to extract further information
-        webpage = self._download_webpage(url, video_id, 'Downloading entry webpage')
-
-        title = self._og_search_description(webpage).splitlines()[0]
-        upload_date = unified_strdate(self._html_search_regex(
-            r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>
-                    ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''',
-            webpage, 'upload date', fatal=False, flags=re.VERBOSE))
-        uploader = self._html_search_regex(
-            r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False)
-
-        # Step 2, Simulate clicking the image box to launch video
-        DOMAIN = 'https://plus.google.com/'
-        video_page = self._search_regex(
-            r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
-            webpage, 'video page URL')
-        if not video_page.startswith(DOMAIN):
-            video_page = DOMAIN + video_page
-
-        webpage = self._download_webpage(video_page, video_id, 'Downloading video page')
-
-        def unicode_escape(s):
-            decoder = codecs.getdecoder('unicode_escape')
-            return re.sub(
-                r'\\u[0-9a-fA-F]{4,}',
-                lambda m: decoder(m.group(0))[0],
-                s)
-
-        # Extract video links all sizes
-        formats = [{
-            'url': unicode_escape(video_url),
-            'ext': 'flv',
-            'width': int(width),
-            'height': int(height),
-        } for width, height, video_url in re.findall(
-            r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent\.com.*?)"', webpage)]
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'uploader': uploader,
-            'upload_date': upload_date,
-            'formats': formats,
-        }
--- a/youtube_dl/extractor/googlepodcasts.py
+++ b/youtube_dl/extractor/googlepodcasts.py
@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_podcast_url,
+    int_or_none,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class GooglePodcastsBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/'
+
+    def _batch_execute(self, func_id, video_id, params):
+        return json.loads(self._download_json(
+            'https://podcasts.google.com/_/PodcastsUi/data/batchexecute',
+            video_id, data=urlencode_postdata({
+                'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]),
+            }), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2])
+
+    def _extract_episode(self, episode):
+        return {
+            'id': episode[4][3],
+            'title': episode[8],
+            'url': clean_podcast_url(episode[13]),
+            'thumbnail': episode[2],
+            'description': episode[9],
+            'creator': try_get(episode, lambda x: x[14]),
+            'timestamp': int_or_none(episode[11]),
+            'duration': int_or_none(episode[12]),
+            'series': episode[1],
+        }
+
+
+class GooglePodcastsIE(GooglePodcastsBaseIE):
+    IE_NAME = 'google:podcasts'
+    _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)'
+    _TEST = {
+        'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh',
+        'md5': 'fa56b2ee8bd0703e27e42d4b104c4766',
+        'info_dict': {
+            'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a',
+            'ext': 'mp3',
+            'title': 'WWDTM New Year 2021',
+            'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.',
+            'upload_date': '20210102',
+            'timestamp': 1609606800,
+            'duration': 2901,
+            'series': "Wait Wait... Don't Tell Me!",
+        }
+    }
+
+    def _real_extract(self, url):
+        b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups()
+        episode = self._batch_execute(
+            'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1]
+        return self._extract_episode(episode)
+
+
+class GooglePodcastsFeedIE(GooglePodcastsBaseIE):
+    IE_NAME = 'google:podcasts:feed'
+    _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)'
+    _TEST = {
+        'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA',
+        'info_dict': {
+            'title': "Wait Wait... Don't Tell Me!",
+            'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.",
+        },
+        'playlist_mincount': 20,
+    }
+
+    def _real_extract(self, url):
+        b64_feed_url = self._match_id(url)
+        data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url])
+
+        entries = []
+        for episode in (try_get(data, lambda x: x[1][0]) or []):
+            entries.append(self._extract_episode(episode))
+
+        feed = try_get(data, lambda x: x[3]) or []
+        return self.playlist_result(
+            entries, playlist_title=try_get(feed, lambda x: x[0]),
+            playlist_description=try_get(feed, lambda x: x[2]))
--- a/youtube_dl/extractor/iheart.py
+++ b/youtube_dl/extractor/iheart.py
@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_podcast_url,
+    int_or_none,
+    str_or_none,
+)
+
+
+class IHeartRadioBaseIE(InfoExtractor):
+    def _call_api(self, path, video_id, fatal=True, query=None):
+        return self._download_json(
+            'https://api.iheart.com/api/v3/podcast/' + path,
+            video_id, fatal=fatal, query=query)
+
+    def _extract_episode(self, episode):
+        return {
+            'thumbnail': episode.get('imageUrl'),
+            'description': episode.get('description'),
+            'timestamp': int_or_none(episode.get('startDate'), 1000),
+            'duration': int_or_none(episode.get('duration')),
+        }
+
+
+class IHeartRadioIE(IHeartRadioBaseIE):
+    IENAME = 'iheartradio'
+    _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P<display_id>[^/?&#]+)-|iheartradio:)(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true',
+        'md5': 'c8609c92c8688dcb69d8541042b8abca',
+        'info_dict': {
+            'id': '70346499',
+            'ext': 'mp3',
+            'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus',
+            'description': 'md5:66480b2d25ec93a5f60c0faa3275ce5c',
+            'timestamp': 1597741200,
+            'upload_date': '20200818',
+        }
+    }
+
+    def _real_extract(self, url):
+        episode_id = self._match_id(url)
+        episode = self._call_api(
+            'episodes/' + episode_id, episode_id)['episode']
+        info = self._extract_episode(episode)
+        print(episode['mediaUrl'])
+        info.update({
+            'id': episode_id,
+            'title': episode['title'],
+            'url': clean_podcast_url(episode['mediaUrl']),
+        })
+        return info
+
+
+class IHeartRadioPodcastIE(IHeartRadioBaseIE):
+    IE_NAME = 'iheartradio:podcast'
+    _VALID_URL = r'https?://(?:www\.)?iheart(?:podcastnetwork)?\.com/podcast/[^/?&#]+-(?P<id>\d+)/?(?:[?#&]|$)'
+    _TESTS = [{
+        'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/',
+        'info_dict': {
+            'id': '30717896',
+            'title': 'It Could Happen Here',
+            'description': 'md5:5842117412a967eb0b01f8088eb663e2',
+        },
+        'playlist_mincount': 11,
+    }, {
+        'url': 'https://www.iheartpodcastnetwork.com/podcast/105-stuff-you-should-know-26940277',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        podcast_id = self._match_id(url)
+        path = 'podcasts/' + podcast_id
+        episodes = self._call_api(
+            path + '/episodes', podcast_id, query={'limit': 1000000000})['data']
+
+        entries = []
+        for episode in episodes:
+            episode_id = str_or_none(episode.get('id'))
+            if not episode_id:
+                continue
+            info = self._extract_episode(episode)
+            info.update({
+                '_type': 'url',
+                'id': episode_id,
+                'title': episode.get('title'),
+                'url': 'iheartradio:' + episode_id,
+                'ie_key': IHeartRadioIE.ie_key(),
+            })
+            entries.append(info)
+
+        podcast = self._call_api(path, podcast_id, False) or {}
+
+        return self.playlist_result(
+            entries, podcast_id, podcast.get('title'), podcast.get('description'))
--- a/youtube_dl/extractor/stitcher.py
+++ b/youtube_dl/extractor/stitcher.py
@ -4,6 +4,7 @@ from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
    clean_html,
+    clean_podcast_url,
    ExtractorError,
    int_or_none,
    str_or_none,
@ -43,7 +44,7 @@ class StitcherBaseIE(InfoExtractor):
            'title': episode['title'].strip(),
            'description': self._extract_description(episode),
            'duration': int_or_none(episode.get('duration')),
-            'url': audio_url,
+            'url': clean_podcast_url(audio_url),
            'vcodec': 'none',
            'timestamp': int_or_none(episode.get('date_published')),
            'season_number': int_or_none(episode.get('season')),
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -5706,3 +5706,20 @@ def random_birthday(year_field, month_field, day_field):
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }
+
+
+def clean_podcast_url(url):
+    """Return url with known podcast tracking/redirect prefixes removed.
+
+    Every occurrence of a recognised prefix (the host, its path segment and
+    the trailing slash) is deleted, so chained prefixes are all stripped
+    while the URL scheme stays in place. For example
+    'https://play.podtrac.com/<id>/edge1.pod.npr.org/x.mp3' becomes
+    'https://edge1.pod.npr.org/x.mp3'. URLs without such a prefix are
+    returned unchanged.
+    """
+    return re.sub(r'''(?x)
+        (?:
+            (?:
+                chtbl\.com/track|
+                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
+                play\.podtrac\.com
+            )/[^/]+|
+            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
+            flex\.acast\.com|
+            pd(?:
+                cn\.co| # https://podcorn.com/analytics-prefix/
+                st\.fm # https://podsights.com/docs/
+            )/e
+        )/''', '', url)
Author	SHA1	Message	Date
Remita Amine	964a8eb754	[googleplus] Remove Extractor(closes #4955)(closes #7400)	2021-01-04 01:14:26 +01:00
Remita Amine	ac61f2e058	[applepodcasts] Add new extractor(#25918)	2021-01-04 01:14:26 +01:00
Remita Amine	8487e8b98a	[googlepodcasts] Add new extractor	2021-01-04 01:14:26 +01:00
Remita Amine	9c484c0019	[iheart] Add new extractor for iHeartRadio(#27037)	2021-01-04 01:14:26 +01:00
Remita Amine	0e96b4b5ce	[acast] clean podcast URLs	2021-01-04 01:14:26 +01:00
Remita Amine	a563c97c5c	[stitcher] clean podcast URLs	2021-01-04 01:14:25 +01:00
Remita Amine	e88c9ef62a	[utils] add a function to clean podcast URLs	2021-01-04 01:14:25 +01:00