Compare commits

...

7 Commits

Author SHA1 Message Date
Zenon Mousmoulas
76d2b0a8c6
Merge 96a0ad4778da7f30ed5be627f2c10df6d0af3ca8 into 4e714f9df1ed2cccd51df60d45ff5504abe827b7 2025-03-27 10:14:43 +02:00
dirkf
4e714f9df1 [Misc] Correct [_]IE_DESC/NAME in a few IEs
* thx seproDev, yt-dlp/yt-dlp/pull/12694/commits/ae69e3c
* also add documenting comment in `InfoExtractor`
2025-03-26 12:47:19 +00:00
dirkf
c1ea7f5a24 [ITV] Mark ITVX not working
* update old shim
* correct [_]IE_DESC
2025-03-26 12:17:49 +00:00
Zenon Mousmoulas
96a0ad4778 MegaTVComEmbedIE: Make canonical URL extraction more robust 2021-11-13 11:50:22 +02:00
Zenon Mousmoulas
28fddc1758 Fix copy/paste typo 2021-11-13 11:39:29 +02:00
Zenon Mousmoulas
a5ec30e106 Address PR comments about escapes 2021-11-13 08:42:15 +02:00
Zenon Mousmoulas
34c3b06402 Add MegaTVCom IEs
* Add new IEs
  * MegaTVComBaseIE: Base IE class
  * MegaTVComIE: Extract from TV VOD pages and news articles, i.e. all
    sorts of pages showing videos on megatv.com
  * MegaTVComEmbedIE: Extract iframe-embeddable megatv.com videos
* When video_id is not matched in the URL, namely for news articles,
  extract it (article_id) from a particular element on the web page
* Derive metadata and sources directly from the web page, from data
  attributes of the player placeholder element and other commonly used
  elements
* Let MegaTVComEmbedIE defer to MegaTVComIE for extraction, as the
  metadata on the embeddable page are some times slightly different, for
  the same video
2021-11-11 15:40:14 +02:00
8 changed files with 224 additions and 15 deletions

View File

@ -32,7 +32,7 @@ class BokeCCBaseIE(InfoExtractor):
class BokeCCIE(BokeCCBaseIE):
_IE_DESC = 'CC视频'
IE_DESC = 'CC视频'
_VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
_TESTS = [{

View File

@ -9,7 +9,7 @@ from ..utils import (
class CloudyIE(InfoExtractor):
_IE_DESC = 'cloudy.ec'
IE_DESC = 'cloudy.ec'
_VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'https://www.cloudy.ec/v/af511e2527aac',

View File

@ -422,6 +422,8 @@ class InfoExtractor(object):
_GEO_COUNTRIES = None
_GEO_IP_BLOCKS = None
_WORKING = True
# supply this in public subclasses: used in supported sites list, etc
# IE_DESC = 'short description of IE'
def __init__(self, downloader=None):
"""Constructor. Receives an optional downloader."""

View File

@ -1078,6 +1078,10 @@ from .rutube import (
RutubePersonIE,
RutubePlaylistIE,
)
from .megatvcom import (
MegaTVComIE,
MegaTVComEmbedIE,
)
from .rutv import RUTVIE
from .ruutu import RuutuIE
from .ruv import RuvIE

View File

@ -102,6 +102,7 @@ from .ustream import UstreamIE
from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .megatvcom import MegaTVComEmbedIE
from .limelight import LimelightBaseIE
from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE
@ -3400,6 +3401,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
# Look for megatv.com embeds
megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage, url))
if megatvcom_urls:
return self.playlist_from_matches(
megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
# Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage)
if wapo_urls:

View File

@ -35,15 +35,6 @@ from ..utils import (
class ITVBaseIE(InfoExtractor):
def _search_nextjs_data(self, webpage, video_id, **kw):
transform_source = kw.pop('transform_source', None)
fatal = kw.pop('fatal', True)
return self._parse_json(
self._search_regex(
r'''<script\b[^>]+\bid=('|")__NEXT_DATA__\1[^>]*>(?P<js>[^<]+)</script>''',
webpage, 'next.js data', group='js', fatal=fatal, **kw),
video_id, transform_source=transform_source, fatal=fatal)
def __handle_request_webpage_error(self, err, video_id=None, errnote=None, fatal=True):
if errnote is False:
return False
@ -109,7 +100,9 @@ class ITVBaseIE(InfoExtractor):
class ITVIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?:(?P<w>watch)|hub)/[^/]+/(?(w)[\w-]+/)(?P<id>\w+)'
_IE_DESC = 'ITVX'
IE_DESC = 'ITVX'
_WORKING = False
_TESTS = [{
'note': 'Hub URLs redirect to ITVX',
'url': 'https://www.itv.com/hub/liar/2a4547a0012',
@ -270,7 +263,7 @@ class ITVIE(ITVBaseIE):
'ext': determine_ext(href, 'vtt'),
})
next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default='{}')
next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default={})
video_data.update(traverse_obj(next_data, ('props', 'pageProps', ('title', 'episode')), expected_type=dict)[0] or {})
title = traverse_obj(video_data, 'headerTitle', 'episodeTitle')
info = self._og_extract(webpage, require_title=not title)
@ -323,7 +316,7 @@ class ITVIE(ITVBaseIE):
class ITVBTCCIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?!(?:watch|hub)/)(?:[^/]+/)+(?P<id>[^/?#&]+)'
_IE_DESC = 'ITV articles: News, British Touring Car Championship'
IE_DESC = 'ITV articles: News, British Touring Car Championship'
_TESTS = [{
'note': 'British Touring Car Championship',
'url': 'https://www.itv.com/btcc/articles/btcc-2018-all-the-action-from-brands-hatch',

View File

@ -0,0 +1,203 @@
# coding: utf-8
from __future__ import unicode_literals
import hashlib
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import (
HEADRequest,
ExtractorError,
determine_ext,
get_element_by_class,
unified_timestamp,
extract_attributes,
clean_html,
unescapeHTML,
)
class MegaTVComBaseIE(InfoExtractor):
_PLAYER_DIV_ID = 'player_div_id'
def _extract_player_attrs(self, webpage):
PLAYER_DIV_RE = r'''(?x)
<div(?:
id=(?P<_q1>["'])(?P<%(pdi)s>%(pdi)s)(?P=_q1)|
[^>]*?
)+>
''' % {'pdi': self._PLAYER_DIV_ID}
for mobj in re.finditer(PLAYER_DIV_RE, webpage):
if mobj.group(self._PLAYER_DIV_ID):
player_el = mobj.group(0)
break
else:
raise ExtractorError('no <div id="%s"> element found in webpage' %
self._PLAYER_DIV_ID)
return {
re.sub(r'^data-(?:kwik_)?', '', k): v
for k, v in extract_attributes(player_el).items()
if k not in ('id',)
}
class MegaTVComIE(MegaTVComBaseIE):
IE_NAME = 'megatvcom'
IE_DESC = 'megatv.com videos'
_VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:(?!\d{4})[^/]+/(?P<id>\d+)/[^/]+|\d{4}/\d{2}/\d{2}/.+)'
_TESTS = [{
'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
'md5': '2ebe96661cb81854889053cebb661068',
'info_dict': {
'id': '520979',
'ext': 'mp4',
'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
'description': 'md5:0209fa8d318128569c0d256a5c404db1',
'timestamp': 1634975747,
'upload_date': '20211023',
},
}, {
'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
'md5': '8ab0c9d664cea11678670202b87bb2b1',
'info_dict': {
'id': '527800',
'ext': 'mp4',
'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
'timestamp': 1636048859,
'upload_date': '20211104',
},
}]
def _match_article_id(self, webpage):
ART_RE = r'''(?x)
<article(?:
id=(?P<_q2>["'])Article_(?P<article>\d+)(?P=_q2)|
[^>]*?
)+>
'''
return compat_str(self._search_regex(ART_RE, webpage, 'article_id',
group='article'))
def _real_extract(self, url):
video_id = self._match_id(url)
_is_article = video_id == 'None'
webpage = self._download_webpage(url,
'N/A' if _is_article else
video_id)
if _is_article:
video_id = self._match_article_id(webpage)
player_attrs = self._extract_player_attrs(webpage)
title = player_attrs.get('label') or self._og_search_title(webpage)
description = clean_html(get_element_by_class(
'article-wrapper' if _is_article else 'story_content',
webpage))
if not description:
description = self._og_search_description(webpage)
thumbnail = player_attrs.get('image') or \
self._og_search_thumbnail(webpage)
timestamp = unified_timestamp(self._html_search_meta(
'article:published_time', webpage))
try:
source = player_attrs['source']
except KeyError:
raise ExtractorError('no source found for %s' % video_id)
formats = self._extract_m3u8_formats(source, video_id, 'mp4') \
if determine_ext(source) == 'm3u8' else [source]
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'timestamp': timestamp,
'formats': formats,
}
class MegaTVComEmbedIE(MegaTVComBaseIE):
IE_NAME = 'megatvcom:embed'
IE_DESC = 'megatv.com embedded videos'
_VALID_URL = r'https?://(?:www\.)?megatv\.com/embed/?\?p=\d+'
_TESTS = [{
'url': 'https://www.megatv.com/embed/?p=2020520979',
'md5': '2ebe96661cb81854889053cebb661068',
'info_dict': {
'id': '520979',
'ext': 'mp4',
'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
'description': 'md5:0209fa8d318128569c0d256a5c404db1',
'timestamp': 1634975747,
'upload_date': '20211023',
},
}, {
'url': 'https://www.megatv.com/embed/?p=2020534081',
'md5': 'f9a15e315acbf01b128e8efa3f75aab3',
'info_dict': {
'id': '534081',
'ext': 'mp4',
'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
'timestamp': 1636376351,
'upload_date': '20211108',
},
}]
@classmethod
def _extract_urls(cls, webpage, origin_url=None):
# make the scheme in _VALID_URL optional
_URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1]
EMBED_RE = r'''(?x)
<iframe[^>]+?src=(?P<_q1>%(quot_re)s)
(?P<url>%(url_re)s)(?P=_q1)
''' % {'quot_re': r'["\']', 'url_re': _URL_RE}
for mobj in re.finditer(EMBED_RE, webpage):
url = unescapeHTML(mobj.group('url'))
if url.startswith('//'):
scheme = compat_urllib_parse_urlparse(origin_url).scheme \
if origin_url else 'https'
url = '%s:%s' % (scheme, url)
yield url
def _match_canonical_url(self, webpage):
LINK_RE = r'''(?x)
<link(?:
rel=(?P<_q1>%(quot_re)s)(?P<canonical>canonical)(?P=_q1)|
href=(?P<_q2>%(quot_re)s)(?P<href>(?:(?!(?P=_q2)).)+)(?P=_q2)|
[^>]*?
)+>
''' % {'quot_re': r'["\']'}
for mobj in re.finditer(LINK_RE, webpage):
canonical, href = mobj.group('canonical', 'href')
if canonical and href:
return unescapeHTML(href)
def _real_extract(self, url):
webpage = self._download_webpage(url, 'N/A')
player_attrs = self._extract_player_attrs(webpage)
canonical_url = player_attrs.get('share_url') or \
self._match_canonical_url(webpage)
if not canonical_url:
raise ExtractorError('canonical URL not found')
video_id = compat_parse_qs(compat_urllib_parse_urlparse(
canonical_url).query)['p'][0]
# Resolve the canonical URL, following redirects, and defer to
# megatvcom, as the metadata extracted from the embeddable page some
# times are slightly different, for the same video
canonical_url = self._request_webpage(
HEADRequest(canonical_url), video_id,
note='Resolve canonical URL',
errnote='Could not resolve canonical URL').geturl()
return self.url_result(
canonical_url,
MegaTVComIE.ie_key(),
video_id
)

View File

@ -47,7 +47,7 @@ class SenateISVPIE(InfoExtractor):
['vetaff', '76462', 'http://vetaff-f.akamaihd.net'],
['arch', '', 'http://ussenate-f.akamaihd.net/']
]
_IE_NAME = 'senate.gov'
IE_NAME = 'senate.gov'
_VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
_TESTS = [{
'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',