From 7bb8d9418465f7d66a3977c37908a946ba93351d Mon Sep 17 00:00:00 2001 From: Glenn Pavlovic Date: Tue, 17 Jan 2023 18:40:37 -0800 Subject: [PATCH 1/3] Rumble + UsaWatchdog - improves Rumble support and adds UsaWatchdog support --- youtube_dl/extractor/extractors.py | 10 ++- youtube_dl/extractor/rumble.py | 105 +++++++++++++++++++++++----- youtube_dl/extractor/usawatchdog.py | 46 ++++++++++++ 3 files changed, 141 insertions(+), 20 deletions(-) create mode 100644 youtube_dl/extractor/usawatchdog.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 947cbe8fd..6c3990189 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1049,7 +1049,11 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE -from .rumble import RumbleEmbedIE +from .rumble import ( + RumbleEmbedIE, + RumblePageIE, + RumblePlaylistIE, +) from .rutube import ( RutubeIE, RutubeChannelIE, @@ -1414,6 +1418,10 @@ from .urort import UrortIE from .urplay import URPlayIE from .usanetwork import USANetworkIE from .usatoday import USATodayIE +from .usawatchdog import ( + UsaWatchdogStoryIE, + UsaWatchdogIE, +) from .ustream import UstreamIE, UstreamChannelIE from .ustudio import ( UstudioIE, diff --git a/youtube_dl/extractor/rumble.py b/youtube_dl/extractor/rumble.py index 4a0225109..7431a3b7e 100644 --- a/youtube_dl/extractor/rumble.py +++ b/youtube_dl/extractor/rumble.py @@ -1,5 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..compat import compat_str @@ -11,28 +12,14 @@ from ..utils import ( ) -class RumbleEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P[0-9a-z]+)' - _TESTS = [{ - 'url': 'https://rumble.com/embed/v5pv5f', - 'md5': '36a18a049856720189f30977ccbb2c34', - 'info_dict': { - 'id': 'v5pv5f', - 'ext': 'mp4', - 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', - 'timestamp': 1571611968, - 'upload_date': '20191020', - } - }, { - 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) +class rumbleBase(InfoExtractor): + def rumble_video_info(self, video_id): video = self._download_json( 'https://rumble.com/embedJS/', video_id, query={'request': 'video', 'v': video_id}) + if not video: + return None + title = video['title'] formats = [] @@ -65,3 +52,83 @@ class RumbleEmbedIE(InfoExtractor): 'channel_url': author.get('url'), 'duration': int_or_none(video.get('duration')), } + + +class RumbleEmbedIE(rumbleBase): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P[0-9a-z]+)' + _TESTS = [{ + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.rumble_video_info(video_id) + + +class RumblePageIE(rumbleBase): + _VALID_URL = r'https?://rumble\.com/[a-zA-Z0-9-_.]*\.html' + _TEST = { + 'url': 'https://rumble.com/v8c1bt-wmar-2-news-latest-headlines-october-20-6pm.html', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + }} + + _RUMBLE_JS_RE = r'Rumble *\( *["\']play["\'], *\{[^}]*["\']video["\'] *: *["\'](?P[^"\']+)' + + def _real_extract(self, url): + page = self._download_webpage(url, 'Rumble Page') + video_id = self._search_regex(self._RUMBLE_JS_RE, page, "id") + return self.rumble_video_info(video_id) + + +class RumblePlaylistIE(rumbleBase): + _VALID_URL = r'https?://rumble.com/(?:c|user)/(?P[^/]+)' + _TEST = { + 'url': 'https://rumble.com/c/PeakProsperity', + 'playlist_mincount': 25, + 'info_dict': { + 'id': 'PeakProsperity', + }} + + def _real_extract(self, url): + urls = [] + id = self._match_id(url) + page = self._download_webpage(url, id) + for mobj in re.finditer(r'[a-zA-Z0-9\-.]+)>', page): + urls.append('https://rumble.com/' + mobj.group('href')) + + return self.playlist_from_matches(urls, id) + + +def rumble_embedded_id(page_data): + '''For use by extractors of sites which use emedded Rumble videos. Given + a webpage as a string returns a list of url result dicts for each embedded + rumble video found. None is returned if no embeds were found. Duplicates + are not removed''' + + embeds = [] + # The JS embeds + for mobj in re.finditer(RumblePageIE._RUMBLE_JS_RE, page_data): + embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'),'RumbleEmbed',mobj.group('id'))) + + # The iframes embeds + for mobj in re.finditer(RumbleEmbedIE._VALID_URL, page_data): + embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'),'RumbleEmbed',mobj.group('id'))) + + return embeds if embeds else None diff --git a/youtube_dl/extractor/usawatchdog.py b/youtube_dl/extractor/usawatchdog.py new file mode 100644 index 000000000..4bb576a9b --- /dev/null +++ b/youtube_dl/extractor/usawatchdog.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals +import re + +from .common import InfoExtractor + +from .rumble import rumble_embedded_id + + +class UsaWatchdogStoryIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/(?P[^/]+)' + _TEST = { + 'url': 'https://usawatchdog.com/cv-19-vaccine-warning-cv-19-cure-must-watch-videos/', + 'md5': 'bf40e20aebca9016ca195534028cbb6f', + 'info_dict': { + 'id': 'vcl8gx', + 'ext': 'mp4', + 'timestamp': 1617141926, + 'upload_date': '20210330', + 'title': u'Vaccine Warning \u2013 CV-19 Cure Must Watch Videos', + }} + + def _real_extract(self, url): + title = self._match_id(url) + embeds = rumble_embedded_id(self._download_webpage(url, title)) + return embeds[0] if embeds is not None else None + + +class UsaWatchdogIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/$' + _TEST = { + 'url': 'https://usawatchdog.com/', + 'playlist_mincount': 15, + 'info_dict': { + 'id': 'USA Watchdog', + }} + + def _real_extract(self, url): + matches = [] + for mobj in re.finditer(r'front-view-title[^<]+https?:(?:www\.)?//usawatchdog.com/[^/]+\/?)[^>]+>(?P[^<]+)', + self._download_webpage(url, 'Site Root')): + matches.append(self.url_result(mobj.group('href'), + 'UsaWatchdogStory', None, + mobj.group('title').encode('utf8'))) + + return self.playlist_result(matches, 'USA Watchdog') From a777aeeda0598d18c2aba01a90d7fd4fcb621aaf Mon Sep 17 00:00:00 2001 From: Glenn Pavlovic <glimrick@epilitimus.com> Date: Sun, 29 Jan 2023 18:20:46 -0800 Subject: [PATCH 2/3] combine all the rumble extractors, add rumble to generic.py, remove UsaWatchdogStory --- youtube_dl/extractor/extractors.py | 11 +- youtube_dl/extractor/generic.py | 5 + youtube_dl/extractor/rumble.py | 177 ++++++++++++++++------------ youtube_dl/extractor/usawatchdog.py | 26 +--- 4 files changed, 109 insertions(+), 110 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6c3990189..6f2c96991 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1049,11 +1049,7 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE -from .rumble import ( - RumbleEmbedIE, - RumblePageIE, - RumblePlaylistIE, -) +from .rumble import RumbleIE from .rutube import ( RutubeIE, RutubeChannelIE, @@ -1418,10 +1414,7 @@ from .urort import UrortIE from .urplay import URPlayIE from .usanetwork import USANetworkIE from .usatoday import USATodayIE -from .usawatchdog import ( - UsaWatchdogStoryIE, - UsaWatchdogIE, -) +from .usawatchdog import UsaWatchdogIE from .ustream import UstreamIE, UstreamChannelIE from .ustudio import ( UstudioIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0e473e952..3d2f7d33d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -132,6 +132,7 @@ from .kinja import KinjaEmbedIE from .arcpublishing import ArcPublishingIE from .medialaan import MedialaanIE from .simplecast import SimplecastIE +from .rumble import RumbleIE class GenericIE(InfoExtractor): @@ -3499,6 +3500,10 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + rumble_urls = RumbleIE.rumble_embedded_id(webpage) + if rumble_urls is not None: + return self.playlist_result(rumble_urls) if len(rumble_urls) > 1 else rumble_urls[0] + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/youtube_dl/extractor/rumble.py b/youtube_dl/extractor/rumble.py index 7431a3b7e..e163c63e5 100644 --- a/youtube_dl/extractor/rumble.py +++ b/youtube_dl/extractor/rumble.py @@ -9,16 +9,99 @@ from ..utils import ( int_or_none, parse_iso8601, try_get, + ExtractorError, ) -class rumbleBase(InfoExtractor): +class RumbleIE(InfoExtractor): + + RE_DICT = { + 'iframe_url': { + 're': r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)', + 'compiled': None}, + 'jscript_url': { + 're': r'https?://rumble\.com/[a-zA-Z0-9-_.]*\.html', + 'compiled': None}, + 'list_url': { + 're': r'https?://rumble.com/(?:c|user)/(?P<id>[^/]+)', + 'compiled': None}, + 'jscript_id': { + 're': r'Rumble *\( *["\']play["\'], *\{[^}]*["\']video["\'] *: *["\'](?P<id>[^"\']+)', + 'compiled': None} + } + + _TESTS = [ + { + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, + { + 'url': 'https://rumble.com/v8c1bt-wmar-2-news-latest-headlines-october-20-6pm.html', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, + { + 'url': 'https://rumble.com/c/PeakProsperity', + 'playlist_mincount': 25, + 'info_dict': { + 'id': 'PeakProsperity', + } + }, + { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + } + ] + + @classmethod + def get_re(cls, tag): + if cls.RE_DICT[tag]['compiled'] is None: + cls.RE_DICT[tag]['compiled'] = re.compile(cls.RE_DICT[tag]['re']) + return cls.RE_DICT[tag]['compiled'] + + @classmethod + def suitable(cls, url): + return (cls.get_re('jscript_url').match(url) is not None or + cls.get_re('list_url').match(url) is not None or + cls.get_re('iframe_url').match(url) is not None) + + @staticmethod + def rumble_embedded_id(page_data): + '''For use by extractors of sites which use emedded Rumble videos. Given + a webpage as a string returns a list of url result dicts for each embedded + rumble video found. None is returned if no embeds were found. Duplicates + are not removed''' + + embeds = [] + # The JS embeds + for mobj in RumbleIE.get_re('jscript_id').finditer(page_data): + embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'), 'Rumble', mobj.group('id'))) + + # The iframes embeds + for mobj in RumbleIE.get_re('iframe_url').finditer(page_data): + embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'), 'Rumble', mobj.group('id'))) + + return embeds if embeds else None + def rumble_video_info(self, video_id): video = self._download_json( 'https://rumble.com/embedJS/', video_id, query={'request': 'video', 'v': video_id}) if not video: - return None + raise ExtractorError('Unable to locate video information.', expected=True) title = video['title'] @@ -53,82 +136,22 @@ class rumbleBase(InfoExtractor): 'duration': int_or_none(video.get('duration')), } - -class RumbleEmbedIE(rumbleBase): - _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' - _TESTS = [{ - 'url': 'https://rumble.com/embed/v5pv5f', - 'md5': '36a18a049856720189f30977ccbb2c34', - 'info_dict': { - 'id': 'v5pv5f', - 'ext': 'mp4', - 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', - 'timestamp': 1571611968, - 'upload_date': '20191020', - } - }, { - 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', - 'only_matching': True, - }] - def _real_extract(self, url): - video_id = self._match_id(url) - return self.rumble_video_info(video_id) + if self.get_re('jscript_url').match(url) is not None: + page = self._download_webpage(url, 'Rumble Page') + video_id = self._search_regex(self.get_re('jscript_id'), page, "id") + return self.rumble_video_info(video_id) + mobj = self.get_re('list_url').match(url) + if mobj is not None: + urls = [] + id = mobj.group('id') + page = self._download_webpage(url, id) + for mobj in re.finditer(r'<a class=video-item--a href=\/(?P<href>[a-zA-Z0-9\-.]+)>', page): + urls.append('https://rumble.com/' + mobj.group('href')) -class RumblePageIE(rumbleBase): - _VALID_URL = r'https?://rumble\.com/[a-zA-Z0-9-_.]*\.html' - _TEST = { - 'url': 'https://rumble.com/v8c1bt-wmar-2-news-latest-headlines-october-20-6pm.html', - 'md5': '36a18a049856720189f30977ccbb2c34', - 'info_dict': { - 'id': 'v5pv5f', - 'ext': 'mp4', - 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', - 'timestamp': 1571611968, - 'upload_date': '20191020', - }} + return self.playlist_from_matches(urls, id) - _RUMBLE_JS_RE = r'Rumble *\( *["\']play["\'], *\{[^}]*["\']video["\'] *: *["\'](?P<id>[^"\']+)' - - def _real_extract(self, url): - page = self._download_webpage(url, 'Rumble Page') - video_id = self._search_regex(self._RUMBLE_JS_RE, page, "id") - return self.rumble_video_info(video_id) - - -class RumblePlaylistIE(rumbleBase): - _VALID_URL = r'https?://rumble.com/(?:c|user)/(?P<id>[^/]+)' - _TEST = { - 'url': 'https://rumble.com/c/PeakProsperity', - 'playlist_mincount': 25, - 'info_dict': { - 'id': 'PeakProsperity', - }} - - def _real_extract(self, url): - urls = [] - id = self._match_id(url) - page = self._download_webpage(url, id) - for mobj in re.finditer(r'<a class=video-item--a href=\/(?P<href>[a-zA-Z0-9\-.]+)>', page): - urls.append('https://rumble.com/' + mobj.group('href')) - - return self.playlist_from_matches(urls, id) - - -def rumble_embedded_id(page_data): - '''For use by extractors of sites which use emedded Rumble videos. Given - a webpage as a string returns a list of url result dicts for each embedded - rumble video found. None is returned if no embeds were found. Duplicates - are not removed''' - - embeds = [] - # The JS embeds - for mobj in re.finditer(RumblePageIE._RUMBLE_JS_RE, page_data): - embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'),'RumbleEmbed',mobj.group('id'))) - - # The iframes embeds - for mobj in re.finditer(RumbleEmbedIE._VALID_URL, page_data): - embeds.append(InfoExtractor.url_result('https://rumble.com/embed/' + mobj.group('id'),'RumbleEmbed',mobj.group('id'))) - - return embeds if embeds else None + mobj = self.get_re('iframe_url').match(url) + if mobj is not None: + return self.rumble_video_info(mobj.group('id')) diff --git a/youtube_dl/extractor/usawatchdog.py b/youtube_dl/extractor/usawatchdog.py index 4bb576a9b..c46849b83 100644 --- a/youtube_dl/extractor/usawatchdog.py +++ b/youtube_dl/extractor/usawatchdog.py @@ -4,30 +4,9 @@ import re from .common import InfoExtractor -from .rumble import rumble_embedded_id - - -class UsaWatchdogStoryIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/(?P<id>[^/]+)' - _TEST = { - 'url': 'https://usawatchdog.com/cv-19-vaccine-warning-cv-19-cure-must-watch-videos/', - 'md5': 'bf40e20aebca9016ca195534028cbb6f', - 'info_dict': { - 'id': 'vcl8gx', - 'ext': 'mp4', - 'timestamp': 1617141926, - 'upload_date': '20210330', - 'title': u'Vaccine Warning \u2013 CV-19 Cure Must Watch Videos', - }} - - def _real_extract(self, url): - title = self._match_id(url) - embeds = rumble_embedded_id(self._download_webpage(url, title)) - return embeds[0] if embeds is not None else None - class UsaWatchdogIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/$' + _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/?$' _TEST = { 'url': 'https://usawatchdog.com/', 'playlist_mincount': 15, @@ -40,7 +19,6 @@ class UsaWatchdogIE(InfoExtractor): for mobj in re.finditer(r'front-view-title[^<]+<a.+href=["\'](?P<href>https?:(?:www\.)?//usawatchdog.com/[^/]+\/?)[^>]+>(?P<title>[^<]+)', self._download_webpage(url, 'Site Root')): matches.append(self.url_result(mobj.group('href'), - 'UsaWatchdogStory', None, - mobj.group('title').encode('utf8'))) + video_title=mobj.group('title').encode('utf8'))) return self.playlist_result(matches, 'USA Watchdog') From 4b1c8309573a4c6e3d5e883cc64338a8e19d6778 Mon Sep 17 00:00:00 2001 From: Glenn Pavlovic <glimrick@epilitimus.com> Date: Thu, 2 Feb 2023 16:37:30 -0800 Subject: [PATCH 3/3] fix formatting issue --- youtube_dl/extractor/rumble.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rumble.py b/youtube_dl/extractor/rumble.py index e163c63e5..8a18d12b7 100644 --- a/youtube_dl/extractor/rumble.py +++ b/youtube_dl/extractor/rumble.py @@ -74,9 +74,9 @@ class RumbleIE(InfoExtractor): @classmethod def suitable(cls, url): - return (cls.get_re('jscript_url').match(url) is not None or - cls.get_re('list_url').match(url) is not None or - cls.get_re('iframe_url').match(url) is not None) + return (cls.get_re('jscript_url').match(url) is not None + or cls.get_re('list_url').match(url) is not None + or cls.get_re('iframe_url').match(url) is not None) @staticmethod def rumble_embedded_id(page_data):