Rumble + UsaWatchdog - improves Rumble support and adds UsaWatchdog support

This commit is contained in:
Glenn Pavlovic 2023-01-17 18:40:37 -08:00
parent 195f22f679
commit 7bb8d94184
3 changed files with 141 additions and 20 deletions

View File

@ -1049,7 +1049,11 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe
from .rtvnh import RTVNHIE from .rtvnh import RTVNHIE
from .rtvs import RTVSIE from .rtvs import RTVSIE
from .ruhd import RUHDIE from .ruhd import RUHDIE
from .rumble import RumbleEmbedIE from .rumble import (
RumbleEmbedIE,
RumblePageIE,
RumblePlaylistIE,
)
from .rutube import ( from .rutube import (
RutubeIE, RutubeIE,
RutubeChannelIE, RutubeChannelIE,
@ -1414,6 +1418,10 @@ from .urort import UrortIE
from .urplay import URPlayIE from .urplay import URPlayIE
from .usanetwork import USANetworkIE from .usanetwork import USANetworkIE
from .usatoday import USATodayIE from .usatoday import USATodayIE
from .usawatchdog import (
UsaWatchdogStoryIE,
UsaWatchdogIE,
)
from .ustream import UstreamIE, UstreamChannelIE from .ustream import UstreamIE, UstreamChannelIE
from .ustudio import ( from .ustudio import (
UstudioIE, UstudioIE,

View File

@ -1,5 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str from ..compat import compat_str
@ -11,28 +12,14 @@ from ..utils import (
) )
class RumbleEmbedIE(InfoExtractor): class rumbleBase(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' def rumble_video_info(self, video_id):
_TESTS = [{
'url': 'https://rumble.com/embed/v5pv5f',
'md5': '36a18a049856720189f30977ccbb2c34',
'info_dict': {
'id': 'v5pv5f',
'ext': 'mp4',
'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
'timestamp': 1571611968,
'upload_date': '20191020',
}
}, {
'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json( video = self._download_json(
'https://rumble.com/embedJS/', video_id, 'https://rumble.com/embedJS/', video_id,
query={'request': 'video', 'v': video_id}) query={'request': 'video', 'v': video_id})
if not video:
return None
title = video['title'] title = video['title']
formats = [] formats = []
@ -65,3 +52,83 @@ class RumbleEmbedIE(InfoExtractor):
'channel_url': author.get('url'), 'channel_url': author.get('url'),
'duration': int_or_none(video.get('duration')), 'duration': int_or_none(video.get('duration')),
} }
class RumbleEmbedIE(rumbleBase):
    """Extractor for Rumble embed URLs of the form ``rumble.com/embed/<id>``.

    The id may be preceded by a dotted prefix (``ufe9n.v5pv5f``); only the
    part after the dot is captured as the video id.
    """

    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
    _TESTS = [
        {
            'url': 'https://rumble.com/embed/v5pv5f',
            'md5': '36a18a049856720189f30977ccbb2c34',
            'info_dict': {
                'id': 'v5pv5f',
                'ext': 'mp4',
                'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
                'timestamp': 1571611968,
                'upload_date': '20191020',
            },
        },
        {
            'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Return the info produced by ``rumble_video_info`` for the id in *url*."""
        # The embed URL already carries the video id, so no page download
        # is needed before the shared lookup.
        return self.rumble_video_info(self._match_id(url))
class RumblePageIE(rumbleBase):
    """Extractor for regular Rumble video pages (``rumble.com/<slug>.html``).

    The page is downloaded and searched for the JavaScript
    ``Rumble("play", {... "video": "<id>" ...})`` call; the id found there
    is handed to ``rumble_video_info``.
    """

    # The page slug is captured as ``id`` so it can label the page download.
    # The hyphen is placed last in the character class so it can never be
    # read as a range, and ``+`` rejects an empty slug.
    _VALID_URL = r'https?://(?:www\.)?rumble\.com/(?P<id>[a-zA-Z0-9_.-]+)\.html'
    _TEST = {
        'url': 'https://rumble.com/v8c1bt-wmar-2-news-latest-headlines-october-20-6pm.html',
        'md5': '36a18a049856720189f30977ccbb2c34',
        'info_dict': {
            'id': 'v5pv5f',
            'ext': 'mp4',
            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
            'timestamp': 1571611968,
            'upload_date': '20191020',
        },
    }

    # Matches the JS player call and captures the video id.  Also read by
    # rumble_embedded_id() to find JS embeds on other sites, so the pattern
    # and its ``id`` group name must stay as they are.
    _RUMBLE_JS_RE = r'Rumble *\( *["\']play["\'], *\{[^}]*["\']video["\'] *: *["\'](?P<id>[^"\']+)'

    def _real_extract(self, url):
        """Download the page at *url* and extract the video it plays.

        Returns whatever ``rumble_video_info`` returns for the video id
        found in the page.
        """
        # Use the URL slug (not a fixed label) so log and error messages
        # identify which page was being processed.
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)
        video_id = self._search_regex(
            self._RUMBLE_JS_RE, page, 'video id', group='id')
        return self.rumble_video_info(video_id)
class RumblePlaylistIE(rumbleBase):
    """Extractor for Rumble channel (``/c/<name>``) and user (``/user/<name>``) pages.

    Every video link found on the page becomes one playlist entry.
    """

    # The dot in the host name is escaped so it only matches a literal '.',
    # and the id stops at '?' or '#' so query strings and fragments do not
    # end up in the playlist id.
    _VALID_URL = r'https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^/?#]+)'
    _TEST = {
        'url': 'https://rumble.com/c/PeakProsperity',
        'playlist_mincount': 25,
        'info_dict': {
            'id': 'PeakProsperity',
        },
    }

    # Anchor tag of one video entry in the listing; ``href`` is the path
    # relative to the site root.
    _VIDEO_LINK_RE = r'<a class=video-item--a href=\/(?P<href>[a-zA-Z0-9\-.]+)>'

    def _real_extract(self, url):
        """Download the listing page at *url* and return it as a playlist.

        The playlist id is the channel/user name taken from the URL.
        """
        playlist_id = self._match_id(url)
        page = self._download_webpage(url, playlist_id)
        video_urls = [
            'https://rumble.com/' + mobj.group('href')
            for mobj in re.finditer(self._VIDEO_LINK_RE, page)
        ]
        return self.playlist_from_matches(video_urls, playlist_id)
def rumble_embedded_id(page_data):
    """Collect url results for the Rumble videos embedded in a webpage.

    Intended for extractors of sites that embed Rumble players.  Two kinds
    of embed are looked for, in this order: the JavaScript
    ``Rumble("play", ...)`` call and iframe ``rumble.com/embed/...`` URLs.

    page_data is the webpage as a string.  Returns a list of url result
    dicts, one per embed found (duplicates are kept), or None when the page
    contains no embed at all.
    """
    found = []
    # JS embeds first, then iframe embeds; both patterns expose the video
    # id through a group named ``id``.
    for pattern in (RumblePageIE._RUMBLE_JS_RE, RumbleEmbedIE._VALID_URL):
        for match in re.finditer(pattern, page_data):
            embed_id = match.group('id')
            found.append(InfoExtractor.url_result(
                'https://rumble.com/embed/' + embed_id, 'RumbleEmbed', embed_id))
    return found or None

View File

@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .rumble import rumble_embedded_id
class UsaWatchdogStoryIE(InfoExtractor):
    """Extractor for a single usawatchdog.com story page.

    The story's video is an embedded Rumble player; extraction is delegated
    to the Rumble extractor through a url result.
    """

    _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/(?P<id>[^/]+)'
    _TEST = {
        'url': 'https://usawatchdog.com/cv-19-vaccine-warning-cv-19-cure-must-watch-videos/',
        'md5': 'bf40e20aebca9016ca195534028cbb6f',
        'info_dict': {
            'id': 'vcl8gx',
            'ext': 'mp4',
            'timestamp': 1617141926,
            'upload_date': '20210330',
            'title': u'Vaccine Warning \u2013 CV-19 Cure Must Watch Videos',
        },
    }

    def _real_extract(self, url):
        """Return the url result of the first Rumble embed on the story page.

        Returns None when the page contains no Rumble embed.
        """
        # The first path component of the URL labels the page download.
        slug = self._match_id(url)
        page = self._download_webpage(url, slug)
        embeds = rumble_embedded_id(page)
        if embeds is None:
            return None
        # Only the first embed is used.
        return embeds[0]
class UsaWatchdogIE(InfoExtractor):
    """Extractor for the usawatchdog.com front page.

    Every story linked from a ``front-view-title`` element becomes one
    playlist entry handled by the ``UsaWatchdogStory`` extractor.
    """

    _VALID_URL = r'^https?://(?:www\.)?usawatchdog\.com/$'
    _TEST = {
        'url': 'https://usawatchdog.com/',
        'playlist_mincount': 15,
        'info_dict': {
            'id': 'USA Watchdog',
        },
    }

    # Story link inside a front-page title element.
    #  * the optional ``www.`` belongs after ``//`` (it used to sit before
    #    it, so ``www.`` links could never match);
    #  * the host dot is escaped;
    #  * ``<a[^>]+href`` keeps the href inside the same anchor tag;
    #  * the href stops at a quote or ``>`` so it cannot run past the
    #    attribute value when the link has no trailing slash.
    _STORY_LINK_RE = (
        r'front-view-title[^<]+<a[^>]+href=["\']'
        r'(?P<href>https?://(?:www\.)?usawatchdog\.com/[^/"\'>]+/?)'
        r'[^>]+>(?P<title>[^<]+)')

    def _real_extract(self, url):
        """Download the front page and return its stories as a playlist.

        The playlist id is the fixed string ``'USA Watchdog'``.
        """
        page = self._download_webpage(url, 'Site Root')
        # Titles are passed on as text; encoding them to UTF-8 would hand
        # byte strings to code that expects text.
        entries = [
            self.url_result(
                mobj.group('href'), 'UsaWatchdogStory', None,
                mobj.group('title').strip())
            for mobj in re.finditer(self._STORY_LINK_RE, page)
        ]
        return self.playlist_result(entries, 'USA Watchdog')