Merge 96a0ad4778 into c5098961b0

MegaTVComEmbedIE: Make canonical URL extraction more robust
Fix copy/paste typo
2024-12-23 12:40:11 +09:00 · 2024-08-07 08:48:30 +03:00 · 2021-11-13 11:50:22 +02:00 · 2021-11-13 11:39:29 +02:00 · 2021-11-13 08:42:15 +02:00 · 2021-11-11 15:40:14 +02:00
3 changed files with 214 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1078,6 +1078,10 @@ from .rutube import (
    RutubePersonIE,
    RutubePlaylistIE,
 )
 from .megatvcom import (
    MegaTVComIE,
    MegaTVComEmbedIE,
 )
 from .rutv import RUTVIE
 from .ruutu import RuutuIE
 from .ruv import RuvIE
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -102,6 +102,7 @@ from .ustream import UstreamIE
 from .arte import ArteTVEmbedIE
 from .videopress import VideoPressIE
 from .rutube import RutubeIE
 from .megatvcom import MegaTVComEmbedIE
 from .limelight import LimelightBaseIE
 from .anvato import AnvatoIE
 from .washingtonpost import WashingtonPostIE
@ -3400,6 +3401,12 @@ class GenericIE(InfoExtractor):
            return self.playlist_from_matches(
                rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
        # Look for megatv.com embeds
        megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage, url))
        if megatvcom_urls:
            return self.playlist_from_matches(
                megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
        # Look for WashingtonPost embeds
        wapo_urls = WashingtonPostIE._extract_urls(webpage)
        if wapo_urls:
--- a/youtube_dl/extractor/megatvcom.py
+++ b/youtube_dl/extractor/megatvcom.py
@ -0,0 +1,203 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import hashlib
 import re
 from .common import InfoExtractor
 from ..compat import (
    compat_str,
    compat_parse_qs,
    compat_urllib_parse_urlparse,
 )
 from ..utils import (
    HEADRequest,
    ExtractorError,
    determine_ext,
    get_element_by_class,
    unified_timestamp,
    extract_attributes,
    clean_html,
    unescapeHTML,
 )
 class MegaTVComBaseIE(InfoExtractor):
    _PLAYER_DIV_ID = 'player_div_id'
    def _extract_player_attrs(self, webpage):
        PLAYER_DIV_RE = r'''(?x)
        <div(?:
            id=(?P<_q1>["'])(?P<%(pdi)s>%(pdi)s)(?P=_q1)|
            [^>]*?
        )+>
        ''' % {'pdi': self._PLAYER_DIV_ID}
        for mobj in re.finditer(PLAYER_DIV_RE, webpage):
            if mobj.group(self._PLAYER_DIV_ID):
                player_el = mobj.group(0)
                break
        else:
            raise ExtractorError('no <div id="%s"> element found in webpage' %
                                 self._PLAYER_DIV_ID)
        return {
            re.sub(r'^data-(?:kwik_)?', '', k): v
            for k, v in extract_attributes(player_el).items()
            if k not in ('id',)
        }
 class MegaTVComIE(MegaTVComBaseIE):
    IE_NAME = 'megatvcom'
    IE_DESC = 'megatv.com videos'
    _VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:(?!\d{4})[^/]+/(?P<id>\d+)/[^/]+|\d{4}/\d{2}/\d{2}/.+)'
    _TESTS = [{
        'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
        'md5': '2ebe96661cb81854889053cebb661068',
        'info_dict': {
            'id': '520979',
            'ext': 'mp4',
            'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
            'description': 'md5:0209fa8d318128569c0d256a5c404db1',
            'timestamp': 1634975747,
            'upload_date': '20211023',
        },
    }, {
        'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
        'md5': '8ab0c9d664cea11678670202b87bb2b1',
        'info_dict': {
            'id': '527800',
            'ext': 'mp4',
            'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
            'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
            'timestamp': 1636048859,
            'upload_date': '20211104',
        },
    }]
    def _match_article_id(self, webpage):
        ART_RE = r'''(?x)
        <article(?:
            id=(?P<_q2>["'])Article_(?P<article>\d+)(?P=_q2)|
            [^>]*?
        )+>
        '''
        return compat_str(self._search_regex(ART_RE, webpage, 'article_id',
                                             group='article'))
    def _real_extract(self, url):
        video_id = self._match_id(url)
        _is_article = video_id == 'None'
        webpage = self._download_webpage(url,
                                               'N/A' if _is_article else
                                               video_id)
        if _is_article:
            video_id = self._match_article_id(webpage)
        player_attrs = self._extract_player_attrs(webpage)
        title = player_attrs.get('label') or self._og_search_title(webpage)
        description = clean_html(get_element_by_class(
            'article-wrapper' if _is_article else 'story_content',
            webpage))
        if not description:
            description = self._og_search_description(webpage)
        thumbnail = player_attrs.get('image') or \
            self._og_search_thumbnail(webpage)
        timestamp = unified_timestamp(self._html_search_meta(
            'article:published_time', webpage))
        try:
            source = player_attrs['source']
        except KeyError:
            raise ExtractorError('no source found for %s' % video_id)
        formats = self._extract_m3u8_formats(source, video_id, 'mp4') \
            if determine_ext(source) == 'm3u8' else [source]
        self._sort_formats(formats)
        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'formats': formats,
        }
 class MegaTVComEmbedIE(MegaTVComBaseIE):
    IE_NAME = 'megatvcom:embed'
    IE_DESC = 'megatv.com embedded videos'
    _VALID_URL = r'https?://(?:www\.)?megatv\.com/embed/?\?p=\d+'
    _TESTS = [{
        'url': 'https://www.megatv.com/embed/?p=2020520979',
        'md5': '2ebe96661cb81854889053cebb661068',
        'info_dict': {
            'id': '520979',
            'ext': 'mp4',
            'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
            'description': 'md5:0209fa8d318128569c0d256a5c404db1',
            'timestamp': 1634975747,
            'upload_date': '20211023',
        },
    }, {
        'url': 'https://www.megatv.com/embed/?p=2020534081',
        'md5': 'f9a15e315acbf01b128e8efa3f75aab3',
        'info_dict': {
            'id': '534081',
            'ext': 'mp4',
            'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
            'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
            'timestamp': 1636376351,
            'upload_date': '20211108',
        },
    }]
    @classmethod
    def _extract_urls(cls, webpage, origin_url=None):
        # make the scheme in _VALID_URL optional
        _URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1]
        EMBED_RE = r'''(?x)
            <iframe[^>]+?src=(?P<_q1>%(quot_re)s)
                (?P<url>%(url_re)s)(?P=_q1)
        ''' % {'quot_re': r'["\']', 'url_re': _URL_RE}
        for mobj in re.finditer(EMBED_RE, webpage):
            url = unescapeHTML(mobj.group('url'))
            if url.startswith('//'):
                scheme = compat_urllib_parse_urlparse(origin_url).scheme \
                    if origin_url else 'https'
                url = '%s:%s' % (scheme, url)
            yield url
    def _match_canonical_url(self, webpage):
        LINK_RE = r'''(?x)
        <link(?:
            rel=(?P<_q1>%(quot_re)s)(?P<canonical>canonical)(?P=_q1)|
            href=(?P<_q2>%(quot_re)s)(?P<href>(?:(?!(?P=_q2)).)+)(?P=_q2)|
            [^>]*?
        )+>
        ''' % {'quot_re': r'["\']'}
        for mobj in re.finditer(LINK_RE, webpage):
            canonical, href = mobj.group('canonical', 'href')
            if canonical and href:
                return unescapeHTML(href)
    def _real_extract(self, url):
        webpage = self._download_webpage(url, 'N/A')
        player_attrs = self._extract_player_attrs(webpage)
        canonical_url = player_attrs.get('share_url') or \
            self._match_canonical_url(webpage)
        if not canonical_url:
            raise ExtractorError('canonical URL not found')
        video_id = compat_parse_qs(compat_urllib_parse_urlparse(
            canonical_url).query)['p'][0]
        # Resolve the canonical URL, following redirects, and defer to
        # megatvcom, as the metadata extracted from the embeddable page some
        # times are slightly different, for the same video
        canonical_url = self._request_webpage(
            HEADRequest(canonical_url), video_id,
            note='Resolve canonical URL',
            errnote='Could not resolve canonical URL').geturl()
        return self.url_result(
            canonical_url,
            MegaTVComIE.ie_key(),
            video_id
        )
Author	SHA1	Message	Date
Zenon Mousmoulas	3b5a3407b2	Merge `96a0ad4778` into `c5098961b0`	2024-08-07 08:48:30 +03:00
Zenon Mousmoulas	96a0ad4778	MegaTVComEmbedIE: Make canonical URL extraction more robust	2021-11-13 11:50:22 +02:00
Zenon Mousmoulas	28fddc1758	Fix copy/paste typo	2021-11-13 11:39:29 +02:00
Zenon Mousmoulas	a5ec30e106	Address PR comments about escapes	2021-11-13 08:42:15 +02:00
Zenon Mousmoulas	34c3b06402	Add MegaTVCom IEs * Add new IEs * MegaTVComBaseIE: Base IE class * MegaTVComIE: Extract from TV VOD pages and news articles, i.e. all sorts of pages showing videos on megatv.com * MegaTVComEmbedIE: Extract iframe-embeddable megatv.com videos * When video_id is not matched in the URL, namely for news articles, extract it (article_id) from a particular element on the web page * Derive metadata and sources directly from the web page, from data attributes of the player placeholder element and other commonly used elements * Let MegaTVComEmbedIE defer to MegaTVComIE for extraction, as the metadata on the embeddable page are some times slightly different, for the same video	2021-11-11 15:40:14 +02:00