From d303e1e05ff6a397a7f762c910e1485c6f2bd1d2 Mon Sep 17 00:00:00 2001 From: Zenon Mousmoulas Date: Thu, 11 Nov 2021 11:16:29 +0200 Subject: [PATCH] GlomexEmbedIE: Reuse _VALID_URL in _extract_urls * Let _extract_urls reuse _VALID_URL after making scheme optional and simplifying the query string part * Upon an iframe match * Add the scheme to the matched URL, if necessary * Match the URL against the full _VALID_URL --- youtube_dl/extractor/glomex.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/glomex.py b/youtube_dl/extractor/glomex.py index 316e27770..98482fc87 100644 --- a/youtube_dl/extractor/glomex.py +++ b/youtube_dl/extractor/glomex.py @@ -162,7 +162,8 @@ class GlomexEmbedIE(GlomexBaseIE): IE_NAME = 'glomex:embed' IE_DESC = 'Glomex embedded videos' _BASE_PLAYER_URL = 'https://player.glomex.com/integration/1/iframe-player.html' - _VALID_URL = r'(?:https?:)?//player\.glomex\.com/integration/[^/]+/iframe-player\.html\?(?:(?:integrationId=(?P[^&#]+)|playlistId=(?P[^&#]+)|[^&=#]+=[^&#]+)&?)+' + _VALID_URL = r'''(?x)https?://player\.glomex\.com/integration/[^/]+/iframe-player\.html + \?(?:(?:integrationId=(?P[^&#]+)|playlistId=(?P[^&#]+)|[^&=#]+=[^&#]+)&?)+''' _TESTS = [{ 'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf', @@ -219,12 +220,16 @@ class GlomexEmbedIE(GlomexBaseIE): @classmethod def _extract_urls(cls, webpage, origin_url): + # make the scheme in _VALID_URL optional + _URL_RE = r'(?:https?:)?//' + cls._VALID_URL.split('://', 1)[1] + # simplify the query string part of _VALID_URL; after extracting iframe + # src, the URL will be matched again + _URL_RE = _URL_RE.split(r'\?', 1)[0] + r'\?(?:(?!(?P=_q1)).)+' # https://docs.glomex.com/publisher/video-player-integration/javascript-api/ EMBED_RE = r'''(?x) (?: ]+?src=(?P<_q1>%(quot_re)s) - (?P(?:https?:)?//player\.glomex\.com/integration/[^/]+/iframe-player\.html\? - (?:(?!(?P=_q1)).)+)(?P=_q1)| + (?P%(url_re)s)(?P=_q1)| <(?Pglomex-player|div)(?: data-integration-id=(?P<_q2>%(quot_re)s)(?P(?:(?!(?P=_q2)).)+)(?P=_q2)| data-playlist-id=(?P<_q3>%(quot_re)s)(?P(?:(?!(?P=_q3)).)+)(?P=_q3)| @@ -240,7 +245,7 @@ class GlomexEmbedIE(GlomexBaseIE): (?:\s|.)*? )+ ) - ''' % {'quot_re': r'[\"\']'} + ''' % {'quot_re': r'[\"\']', 'url_re': _URL_RE} for mobj in re.finditer(EMBED_RE, webpage): url, html_tag, video_id_html, integration_html, glomex_player, \ script_tag, video_id_js, integration_js = \ @@ -248,7 +253,14 @@ class GlomexEmbedIE(GlomexBaseIE): 'integration_html', 'glomex_player', 'script_tag', 'id_js', 'integration_js') if url: - yield cls._smuggle_origin_url(unescapeHTML(url), origin_url) + url = unescapeHTML(url) + if url.startswith('//'): + scheme = compat_urllib_parse_urlparse(origin_url).scheme \ + if origin_url else 'https' + url = '%s:%s' % (scheme, url) + if not cls.suitable(url): + continue + yield cls._smuggle_origin_url(url, origin_url) elif html_tag: if html_tag == "div" and not glomex_player: continue