Compare commits

...

6 Commits

Author SHA1 Message Date
Zenon Mousmoulas
bdc9a26914
Merge 3872619ed5210e8a60f68cff7dbb64164bdc7b5c into 4e714f9df1ed2cccd51df60d45ff5504abe827b7 2025-03-27 10:14:43 +02:00
dirkf
4e714f9df1 [Misc] Correct [_]IE_DESC/NAME in a few IEs
* thx seproDev, yt-dlp/yt-dlp/pull/12694/commits/ae69e3c
* also add documenting comment in `InfoExtractor`
2025-03-26 12:47:19 +00:00
dirkf
c1ea7f5a24 [ITV] Mark ITVX not working
* update old shim
* correct [_]IE_DESC
2025-03-26 12:17:49 +00:00
Zenon Mousmoulas
3872619ed5 Ant1NewsGrEmbedIE._extract_urls: Simplify redundant statement 2021-11-13 11:55:12 +02:00
Zenon Mousmoulas
264544f90e Remove unnecessary quote escape 2021-11-13 08:45:46 +02:00
Zenon Mousmoulas
9af0f299bf Add Ant1NewsGr IEs
* Add new IEs
  * Ant1NewsGrBaseIE: Base IE class
  * Ant1NewsGrWatchIE: Extract videos from TV VOD pages
  * Ant1NewsGrArticleIE: Extract videos from news articles
  * Ant1NewsGrEmbedIE: Extract iframe-embeddable ant1news.gr videos
* NB: There is a different platform at vod.antenna.gr, which is not
  covered here
* The Generic extractor can also be used to extract videos from news
  article pages (through Ant1NewsGrEmbed._extract_urls), however a
  specific IE is used to extract JSON-LD @type='NewsArticle' metadata
  * However that does not work currently, as _json_ld does not support
    @graph nesting
* Ant1NewsGrArticleIE defers to Ant1NewsGrEmbedIE, either as a playlist
  or a single video
* Ant1NewsGrWatchIE and Ant1NewsGrEmbedIE query an API endpoint to
  extract metadata, get the respective stream/source URLs and detect
  video formats
  * The endpoint HTTP path varies per IE
  * Ant1NewsGrEmbedIE first resolves any redirects for its own URL, to
    derive the proper base URL for the API query
2021-11-11 15:47:41 +02:00
8 changed files with 211 additions and 15 deletions

View File

@ -0,0 +1,188 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
compat_urlparse,
)
from ..utils import (
HEADRequest,
ExtractorError,
determine_ext,
smuggle_url,
unsmuggle_url,
unescapeHTML,
)
class Ant1NewsGrBaseIE(InfoExtractor):
    """Shared helpers for the ant1news.gr extractors.

    Subclasses are expected to define ``_API_PATH``, the HTTP path of the
    JSON endpoint that ``_download_api_data`` queries.
    """

    @staticmethod
    def _smuggle_parent_info(url, **info_dict):
        """Return url with info_dict smuggled into it under 'parent_info'."""
        return smuggle_url(url, {'parent_info': info_dict})

    @staticmethod
    def _unsmuggle_parent_info(url):
        """Return a (plain_url, parent_info) tuple for a possibly smuggled url.

        parent_info is an empty dict when nothing was smuggled, or when the
        smuggled data carries no 'parent_info' entry.
        """
        unsmuggled_url, data = unsmuggle_url(url, default={'parent_info': {}})
        # the URL may carry smuggled data that did not originate from
        # _smuggle_parent_info, so do not insist on the key being present
        return unsmuggled_url, data.get('parent_info') or {}

    def _download_api_data(self, netloc, cid, scheme='https'):
        """Download and return the JSON the API serves for content id cid.

        The request URL is assembled from scheme, netloc and the subclass'
        _API_PATH, with cid passed as the 'cid' query parameter.
        """
        url_parts = (scheme, netloc, self._API_PATH, None, None, None)
        url = compat_urlparse.urlunparse(url_parts)
        query = {'cid': cid}
        return self._download_json(
            url, cid,
            'Downloading JSON',
            'Unable to download JSON',
            query=query)

    def _download_and_extract_api_data(self, video_id, *args, **kwargs):
        """Query the API and turn its response into an info dict.

        Any arguments after video_id are handed on to _download_api_data.
        Raises ExtractorError when the response names no source URL.
        """
        info = self._download_api_data(*args, **kwargs)
        source = info.get('url')
        if not source:
            raise ExtractorError('no source found for %s' % video_id)
        if determine_ext(source) == 'm3u8':
            formats = self._extract_m3u8_formats(source, video_id, 'mp4')
        else:
            # each format has to be a dict; a bare URL string cannot be
            # processed by _sort_formats
            formats = [{'url': source}]
        self._sort_formats(formats)
        return {
            'id': video_id,
            'title': info['title'],
            # the thumbnail is optional metadata, so tolerate its absence
            'thumbnail': info.get('thumb'),
            'formats': formats,
        }
class Ant1NewsGrWatchIE(Ant1NewsGrBaseIE):
    # Extractor for ant1news.gr "watch" pages
    IE_NAME = 'ant1newsgr:watch'
    IE_DESC = 'ant1news.gr videos'
    _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/watch/(?P<id>\d+)/'
    _API_PATH = '/templates/data/player'

    _TEST = {
        'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45',
        'md5': '60a984da5ffc98c9924e6d9dd46c6f04',
        'info_dict': {
            'id': '1506168',
            'ext': 'mp4',
            'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a',
            'description': 'md5:18665af715a6dcfeac1d6153a44f16b0',
        },
    }

    def _real_extract(self, url):
        """Build the info dict from the API data plus the page's og:description."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # the API is queried on the same scheme and host as the page itself,
        # using the numeric page id as the content id
        page_location = compat_urllib_parse_urlparse(url)
        result = self._download_and_extract_api_data(
            video_id, page_location.netloc, video_id,
            scheme=page_location.scheme)

        result['description'] = self._og_search_description(webpage)
        return result
class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE):
    # Extractor for ant1news.gr article pages; the actual videos are
    # delegated to Ant1NewsGrEmbedIE
    IE_NAME = 'ant1newsgr:article'
    IE_DESC = 'ant1news.gr articles'
    _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/'
    _TESTS = [{
        'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron',
        'md5': 'eb635a194c15272c2611a751766b0200',
        'info_dict': {
            'id': '_xvg/m_cmbatw=',
            'ext': 'mp4',
            'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411',
        },
        'expected_warnings': [r'^[Uu]nable to extract JSON-LD'],
    }, {
        'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn',
        'info_dict': {
            'id': '620286',
            'title': 'md5:91fe569e952e4d146485740ae927662b',
        },
        'expected_warnings': [r'^[Uu]nable to extract JSON-LD'],
        'playlist_mincount': 2,
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Return a single embed result, or a playlist when there are several.

        Raises ExtractorError when the article page contains no embeds.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        info = self._search_json_ld(
            webpage, video_id, expected_type='NewsArticle', fatal=False) or {}
        # workaround as _json_ld does not recognize @graph nesting; the same
        # fallback is applied when metadata was found but carries no title,
        # which would otherwise surface as a KeyError below
        if not info.get('title'):
            info['title'] = self._og_search_title(webpage)
        title = info['title']

        # the article metadata travels with each embed URL as parent info
        embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage, url, **info))
        if not embed_urls:
            raise ExtractorError('no videos found for %s' % video_id)
        if len(embed_urls) == 1:
            return self.url_result(
                embed_urls[0], ie=Ant1NewsGrEmbedIE.ie_key(),
                video_title=title)
        return self.playlist_from_matches(
            embed_urls, video_id, title, ie=Ant1NewsGrEmbedIE.ie_key())
class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE):
    # Extractor for the iframe player URLs served by antenna.gr/ant1news.gr
    IE_NAME = 'ant1newsgr:embed'
    IE_DESC = 'ant1news.gr embedded videos'
    _VALID_URL = r'''(?x)https?://(?:[a-zA-Z0-9\-]+\.)?
        (?:antenna|ant1news)\.gr/templates/pages/player
        \?(?:(?:cid=(?P<id>[^&#]+)|[^&=#]+=[^&#]+)&?)+'''
    _API_PATH = '/news/templates/data/jsonPlayer'
    _TEST = {
        'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377',
        'md5': '12872b12af18b5dbf76528786728de8c',
        'info_dict': {
            'id': '3f_li_c_az_jw_y_u=',
            'ext': 'mp4',
            'title': 'md5:a30c93332455f53e1e84ae0724f0adf7',
        },
    }

    @classmethod
    def _extract_urls(cls, webpage, origin_url=None, **parent_info):
        """Yield the player URLs embedded as iframes in webpage.

        Protocol-relative URLs take their scheme from origin_url, or https
        when that is not available. parent_info is smuggled into every
        yielded URL.
        """
        # make the scheme in _VALID_URL optional
        _URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1]
        # simplify the query string part of _VALID_URL; after extracting iframe
        # src, the URL will be matched again
        _URL_RE = _URL_RE.split(r'\?', 1)[0] + r'\?(?:(?!(?P=_q1)).)+'
        EMBED_RE = r'''(?x)
            <iframe[^>]+?src=(?P<_q1>%(quot_re)s)(?P<url>%(url_re)s)(?P=_q1)
        ''' % {'quot_re': r'["\']', 'url_re': _URL_RE}
        for mobj in re.finditer(EMBED_RE, webpage):
            url = unescapeHTML(mobj.group('url'))
            if url.startswith('//'):
                scheme = None
                if origin_url:
                    scheme = compat_urllib_parse_urlparse(origin_url).scheme
                # an origin without a scheme would otherwise yield '://host...'
                url = '%s:%s' % (scheme or 'https', url)
            if not cls.suitable(url):
                continue
            yield cls._smuggle_parent_info(url, **parent_info)

    def _real_extract(self, url):
        """Resolve the player URL, query the API and return the info dict.

        A 'timestamp' smuggled in as parent info is used when the API data
        does not provide one.
        """
        url, parent_info = self._unsmuggle_parent_info(url)
        video_id = self._match_id(url)

        # resolve any redirects, to derive the proper base URL for the API query
        canonical_url = self._request_webpage(
            HEADRequest(url), video_id,
            note='Resolve canonical player URL',
            errnote='Could not resolve canonical player URL').geturl()
        scheme, netloc, _, _, query, _ = compat_urllib_parse_urlparse(
            canonical_url)
        # keep the id matched from the original URL if the redirect target
        # carries no cid of its own
        cid = compat_parse_qs(query).get('cid', [video_id])[0]

        info = self._download_and_extract_api_data(
            video_id, netloc, cid, scheme=scheme)
        if 'timestamp' not in info and 'timestamp' in parent_info:
            info['timestamp'] = parent_info['timestamp']
        return info

View File

@ -32,7 +32,7 @@ class BokeCCBaseIE(InfoExtractor):
class BokeCCIE(BokeCCBaseIE):
_IE_DESC = 'CC视频'
IE_DESC = 'CC视频'
_VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
_TESTS = [{

View File

@ -9,7 +9,7 @@ from ..utils import (
class CloudyIE(InfoExtractor):
_IE_DESC = 'cloudy.ec'
IE_DESC = 'cloudy.ec'
_VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'https://www.cloudy.ec/v/af511e2527aac',

View File

@ -422,6 +422,8 @@ class InfoExtractor(object):
_GEO_COUNTRIES = None
_GEO_IP_BLOCKS = None
_WORKING = True
# supply this in public subclasses: used in supported sites list, etc
# IE_DESC = 'short description of IE'
def __init__(self, downloader=None):
"""Constructor. Receives an optional downloader."""

View File

@ -1078,6 +1078,11 @@ from .rutube import (
RutubePersonIE,
RutubePlaylistIE,
)
from .ant1newsgr import (
Ant1NewsGrWatchIE,
Ant1NewsGrArticleIE,
Ant1NewsGrEmbedIE,
)
from .rutv import RUTVIE
from .ruutu import RuutuIE
from .ruv import RuvIE

View File

@ -102,6 +102,7 @@ from .ustream import UstreamIE
from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .ant1newsgr import Ant1NewsGrEmbedIE
from .limelight import LimelightBaseIE
from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE
@ -3400,6 +3401,13 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
# Look for ant1news.gr embeds
ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage, url,
title=video_title))
if ant1newsgr_urls:
return self.playlist_from_matches(
ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key())
# Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage)
if wapo_urls:

View File

@ -35,15 +35,6 @@ from ..utils import (
class ITVBaseIE(InfoExtractor):
def _search_nextjs_data(self, webpage, video_id, **kw):
transform_source = kw.pop('transform_source', None)
fatal = kw.pop('fatal', True)
return self._parse_json(
self._search_regex(
r'''<script\b[^>]+\bid=('|")__NEXT_DATA__\1[^>]*>(?P<js>[^<]+)</script>''',
webpage, 'next.js data', group='js', fatal=fatal, **kw),
video_id, transform_source=transform_source, fatal=fatal)
def __handle_request_webpage_error(self, err, video_id=None, errnote=None, fatal=True):
if errnote is False:
return False
@ -109,7 +100,9 @@ class ITVBaseIE(InfoExtractor):
class ITVIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?:(?P<w>watch)|hub)/[^/]+/(?(w)[\w-]+/)(?P<id>\w+)'
_IE_DESC = 'ITVX'
IE_DESC = 'ITVX'
_WORKING = False
_TESTS = [{
'note': 'Hub URLs redirect to ITVX',
'url': 'https://www.itv.com/hub/liar/2a4547a0012',
@ -270,7 +263,7 @@ class ITVIE(ITVBaseIE):
'ext': determine_ext(href, 'vtt'),
})
next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default='{}')
next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default={})
video_data.update(traverse_obj(next_data, ('props', 'pageProps', ('title', 'episode')), expected_type=dict)[0] or {})
title = traverse_obj(video_data, 'headerTitle', 'episodeTitle')
info = self._og_extract(webpage, require_title=not title)
@ -323,7 +316,7 @@ class ITVIE(ITVBaseIE):
class ITVBTCCIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?!(?:watch|hub)/)(?:[^/]+/)+(?P<id>[^/?#&]+)'
_IE_DESC = 'ITV articles: News, British Touring Car Championship'
IE_DESC = 'ITV articles: News, British Touring Car Championship'
_TESTS = [{
'note': 'British Touring Car Championship',
'url': 'https://www.itv.com/btcc/articles/btcc-2018-all-the-action-from-brands-hatch',

View File

@ -47,7 +47,7 @@ class SenateISVPIE(InfoExtractor):
['vetaff', '76462', 'http://vetaff-f.akamaihd.net'],
['arch', '', 'http://ussenate-f.akamaihd.net/']
]
_IE_NAME = 'senate.gov'
IE_NAME = 'senate.gov'
_VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
_TESTS = [{
'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',