[nytimes] Add new cooking.nytimes.com extractor (#27143 )

* [nytimes] support cooking.nytimes.com, resolves #27112 Co-authored-by: remitamine <remitamine@gmail.com>
[lbry] relax _VALID_URL regex(closes #27144 )
2025-09-18 15:46:28 +09:00 · 2020-11-22 14:12:47 +01:00 · 2020-11-22 13:16:03 +01:00 · 2020-11-22 13:07:04 +01:00 · 2020-11-22 13:01:56 +01:00 · 2020-11-22 12:54:55 +01:00
6 changed files with 387 additions and 1 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -2596,6 +2596,7 @@ class InfoExtractor(object):
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        formats = []
        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
@ -2608,6 +2609,7 @@ class InfoExtractor(object):
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
@ -2615,6 +2617,31 @@ class InfoExtractor(object):
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
        http_host = hosts.get('http')
        if http_host and 'hdnea=' not in manifest_url:
            REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            if len(formats) in (qualities_length + 1, qualities_length * 2 + 1):
                i = 0
                http_formats = []
                for f in formats:
                    if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            http_formats.append(http_f)
                        i += 1
                formats.extend(http_formats)
        return formats
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -782,6 +782,7 @@ from .ntvru import NTVRuIE
 from .nytimes import (
    NYTimesIE,
    NYTimesArticleIE,
    NYTimesCookingIE,
 )
 from .nuvid import NuvidIE
 from .nzz import NZZIE
@ -963,6 +964,7 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe
 from .rtvnh import RTVNHIE
 from .rtvs import RTVSIE
 from .ruhd import RUHDIE
 from .rumble import RumbleEmbedIE
 from .rutube import (
    RutubeIE,
    RutubeChannelIE,
@ -1013,6 +1015,16 @@ from .shared import (
 from .showroomlive import ShowRoomLiveIE
 from .sina import SinaIE
 from .sixplay import SixPlayIE
 from .skyit import (
    SkyItPlayerIE,
    SkyItVideoIE,
    SkyItVideoLiveIE,
    SkyItIE,
    SkyItAcademyIE,
    SkyItArteIE,
    CieloTVItIE,
    TV8ItIE,
 )
 from .skylinewebcams import SkylineWebcamsIE
 from .skynewsarabia import (
    SkyNewsArabiaIE,
--- a/youtube_dl/extractor/lbry.py
+++ b/youtube_dl/extractor/lbry.py
@ -16,7 +16,7 @@ from ..utils import (
 class LBRYIE(InfoExtractor):
    IE_NAME = 'lbry.tv'
-    _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])'
+    _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[^:]+:[0-9a-z]+/[^:]+:[0-9a-z])'
    _TESTS = [{
        # Video
        'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1',
@ -44,6 +44,9 @@ class LBRYIE(InfoExtractor):
    }, {
        'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
        'only_matching': True,
    }, {
        'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b",
        'only_matching': True,
    }]
    def _call_api_proxy(self, method, display_id, params):
--- a/youtube_dl/extractor/nytimes.py
+++ b/youtube_dl/extractor/nytimes.py
@ -221,3 +221,41 @@ class NYTimesArticleIE(NYTimesBaseIE):
             r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'),
            webpage, 'podcast data')
        return self._extract_podcast_from_json(podcast_data, page_id, webpage)
 class NYTimesCookingIE(NYTimesBaseIE):
    _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
        'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3',
        'info_dict': {
            'id': '100000004756089',
            'ext': 'mov',
            'timestamp': 1479383008,
            'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON',
            'title': 'Cranberry Tart',
            'upload_date': '20161117',
            'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.',
        },
    }, {
        'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
        'md5': '4b2e8c70530a89b8d905a2b572316eb8',
        'info_dict': {
            'id': '100000003951728',
            'ext': 'mov',
            'timestamp': 1445509539,
            'description': 'Turkey guide',
            'upload_date': '20151022',
            'title': 'Turkey',
        }
    }]
    def _real_extract(self, url):
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)
        video_id = self._search_regex(
            r'data-video-id=["\'](\d+)', webpage, 'video id')
        return self._extract_video_from_id(video_id)
--- a/youtube_dl/extractor/rumble.py
+++ b/youtube_dl/extractor/rumble.py
@ -0,0 +1,67 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
    determine_ext,
    int_or_none,
    parse_iso8601,
    try_get,
 )
 class RumbleEmbedIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
    _TESTS = [{
        'url': 'https://rumble.com/embed/v5pv5f',
        'md5': '36a18a049856720189f30977ccbb2c34',
        'info_dict': {
            'id': 'v5pv5f',
            'ext': 'mp4',
            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
            'timestamp': 1571611968,
            'upload_date': '20191020',
        }
    }, {
        'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        video_id = self._match_id(url)
        video = self._download_json(
            'https://rumble.com/embedJS/', video_id,
            query={'request': 'video', 'v': video_id})
        title = video['title']
        formats = []
        for height, ua in (video.get('ua') or {}).items():
            for i in range(2):
                f_url = try_get(ua, lambda x: x[i], compat_str)
                if f_url:
                    ext = determine_ext(f_url)
                    f = {
                        'ext': ext,
                        'format_id': '%s-%sp' % (ext, height),
                        'height': int_or_none(height),
                        'url': f_url,
                    }
                    bitrate = try_get(ua, lambda x: x[i + 2]['bitrate'])
                    if bitrate:
                        f['tbr'] = int_or_none(bitrate)
                    formats.append(f)
        self._sort_formats(formats)
        author = video.get('author') or {}
        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnail': video.get('i'),
            'timestamp': parse_iso8601(video.get('pubDate')),
            'channel': author.get('name'),
            'channel_url': author.get('url'),
            'duration': int_or_none(video.get('duration')),
        }
--- a/youtube_dl/extractor/skyit.py
+++ b/youtube_dl/extractor/skyit.py
@ -0,0 +1,239 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..compat import (
    compat_str,
    compat_parse_qs,
    compat_urllib_parse_urlparse,
 )
 from ..utils import (
    dict_get,
    int_or_none,
    parse_duration,
    unified_timestamp,
 )
 class SkyItPlayerIE(InfoExtractor):
    IE_NAME = 'player.sky.it'
    _VALID_URL = r'https?://player\.sky\.it/player/(?:external|social)\.html\?.*?\bid=(?P<id>\d+)'
    _GEO_BYPASS = False
    _DOMAIN = 'sky'
    _PLAYER_TMPL = 'https://player.sky.it/player/external.html?id=%s&domain=%s'
    # http://static.sky.it/static/skyplayer/conf.json
    _TOKEN_MAP = {
        'cielo': 'Hh9O7M8ks5yi6nSROL7bKYz933rdf3GhwZlTLMgvy4Q',
        'hotclub': 'kW020K2jq2lk2eKRJD2vWEg832ncx2EivZlTLQput2C',
        'mtv8': 'A5Nn9GGb326CI7vP5e27d7E4PIaQjota',
        'salesforce': 'C6D585FD1615272C98DE38235F38BD86',
        'sitocommerciale': 'VJwfFuSGnLKnd9Phe9y96WkXgYDCguPMJ2dLhGMb2RE',
        'sky': 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk',
        'skyacademy': 'A6LAn7EkO2Q26FRy0IAMBekX6jzDXYL3',
        'skyarte': 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd',
        'theupfront': 'PRSGmDMsg6QMGc04Obpoy7Vsbn7i2Whp',
    }
    def _player_url_result(self, video_id):
        return self.url_result(
            self._PLAYER_TMPL % (video_id, self._DOMAIN),
            SkyItPlayerIE.ie_key(), video_id)
    def _parse_video(self, video, video_id):
        title = video['title']
        is_live = video.get('type') == 'live'
        hls_url = video.get(('streaming' if is_live else 'hls') + '_url')
        if not hls_url and video.get('geoblock' if is_live else 'geob'):
            self.raise_geo_restricted(countries=['IT'])
        if is_live:
            formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4')
        else:
            formats = self._extract_akamai_formats(
                hls_url, video_id, {'http': 'videoplatform.sky.it'})
        self._sort_formats(formats)
        return {
            'id': video_id,
            'title': self._live_title(title) if is_live else title,
            'formats': formats,
            'thumbnail': dict_get(video, ('video_still', 'video_still_medium', 'thumb')),
            'description': video.get('short_desc') or None,
            'timestamp': unified_timestamp(video.get('create_date')),
            'duration': int_or_none(video.get('duration_sec')) or parse_duration(video.get('duration')),
            'is_live': is_live,
        }
    def _real_extract(self, url):
        video_id = self._match_id(url)
        domain = compat_parse_qs(compat_urllib_parse_urlparse(
            url).query).get('domain', [None])[0]
        token = dict_get(self._TOKEN_MAP, (domain, 'sky'))
        video = self._download_json(
            'https://apid.sky.it/vdp/v1/getVideoData',
            video_id, query={
                'caller': 'sky',
                'id': video_id,
                'token': token
            }, headers=self.geo_verification_headers())
        return self._parse_video(video, video_id)
 class SkyItVideoIE(SkyItPlayerIE):
    IE_NAME = 'video.sky.it'
    _VALID_URL = r'https?://(?:masterchef|video|xfactor)\.sky\.it(?:/[^/]+)*/video/[0-9a-z-]+-(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://video.sky.it/news/mondo/video/uomo-ucciso-da-uno-squalo-in-australia-631227',
        'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd',
        'info_dict': {
            'id': '631227',
            'ext': 'mp4',
            'title': 'Uomo ucciso da uno squalo in Australia',
            'timestamp': 1606036192,
            'upload_date': '20201122',
        }
    }, {
        'url': 'https://xfactor.sky.it/video/x-factor-2020-replay-audizioni-1-615820',
        'only_matching': True,
    }, {
        'url': 'https://masterchef.sky.it/video/masterchef-9-cosa-e-successo-nella-prima-puntata-562831',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        video_id = self._match_id(url)
        return self._player_url_result(video_id)
 class SkyItVideoLiveIE(SkyItPlayerIE):
    IE_NAME = 'video.sky.it:live'
    _VALID_URL = r'https?://video\.sky\.it/diretta/(?P<id>[^/?&#]+)'
    _TEST = {
        'url': 'https://video.sky.it/diretta/tg24',
        'info_dict': {
            'id': '1',
            'ext': 'mp4',
            'title': r're:Diretta TG24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
            'description': 'Guarda la diretta streaming di SkyTg24, segui con Sky tutti gli appuntamenti e gli speciali di Tg24.',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }
    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        asset_id = compat_str(self._parse_json(self._search_regex(
            r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
            webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id'])
        livestream = self._download_json(
            'https://apid.sky.it/vdp/v1/getLivestream',
            asset_id, query={'id': asset_id})
        return self._parse_video(livestream, asset_id)
 class SkyItIE(SkyItPlayerIE):
    IE_NAME = 'sky.it'
    _VALID_URL = r'https?://(?:sport|tg24)\.sky\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)'
    _TESTS = [{
        'url': 'https://sport.sky.it/calcio/serie-a/2020/11/21/juventus-cagliari-risultato-gol',
        'info_dict': {
            'id': '631201',
            'ext': 'mp4',
            'title': 'Un rosso alla violenza: in campo per i diritti delle donne',
            'upload_date': '20201121',
            'timestamp': 1605995753,
        },
        'expected_warnings': ['Unable to download f4m manifest'],
    }, {
        'url': 'https://tg24.sky.it/mondo/2020/11/22/australia-squalo-uccide-uomo',
        'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd',
        'info_dict': {
            'id': '631227',
            'ext': 'mp4',
            'title': 'Uomo ucciso da uno squalo in Australia',
            'timestamp': 1606036192,
            'upload_date': '20201122',
        },
    }]
    _VIDEO_ID_REGEX = r'data-videoid="(\d+)"'
    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        video_id = self._search_regex(
            self._VIDEO_ID_REGEX, webpage, 'video id')
        return self._player_url_result(video_id)
 class SkyItAcademyIE(SkyItIE):
    IE_NAME = 'skyacademy.it'
    _VALID_URL = r'https?://(?:www\.)?skyacademy\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)'
    _TESTS = [{
        'url': 'https://www.skyacademy.it/eventi-speciali/2019/07/05/a-lezione-di-cinema-con-sky-academy-/',
        'md5': 'ced5c26638b7863190cbc44dd6f6ba08',
        'info_dict': {
            'id': '523458',
            'ext': 'mp4',
            'title': 'Sky Academy "The Best CineCamp 2019"',
            'timestamp': 1562843784,
            'upload_date': '20190711',
        }
    }]
    _DOMAIN = 'skyacademy'
    _VIDEO_ID_REGEX = r'id="news-videoId_(\d+)"'
 class SkyItArteIE(SkyItIE):
    IE_NAME = 'arte.sky.it'
    _VALID_URL = r'https?://arte\.sky\.it/video/(?P<id>[^/?&#]+)'
    _TESTS = [{
        'url': 'https://arte.sky.it/video/serie-musei-venezia-collezionismo-12-novembre/',
        'md5': '515aee97b87d7a018b6c80727d3e7e17',
        'info_dict': {
            'id': '627926',
            'ext': 'mp4',
            'title': "Musei Galleria Franchetti alla Ca' d'Oro Palazzo Grimani",
            'upload_date': '20201106',
            'timestamp': 1604664493,
        }
    }]
    _DOMAIN = 'skyarte'
    _VIDEO_ID_REGEX = r'(?s)<iframe[^>]+src="(?:https:)?//player\.sky\.it/player/external\.html\?[^"]*\bid=(\d+)'
 class CieloTVItIE(SkyItIE):
    IE_NAME = 'cielotv.it'
    _VALID_URL = r'https?://(?:www\.)?cielotv\.it/video/(?P<id>[^.]+)\.html'
    _TESTS = [{
        'url': 'https://www.cielotv.it/video/Il-lunedi-e-sempre-un-dramma.html',
        'md5': 'c4deed77552ba901c2a0d9258320304b',
        'info_dict': {
            'id': '499240',
            'ext': 'mp4',
            'title': 'Il lunedì è sempre un dramma',
            'upload_date': '20190329',
            'timestamp': 1553862178,
        }
    }]
    _DOMAIN = 'cielo'
    _VIDEO_ID_REGEX = r'videoId\s*=\s*"(\d+)"'
 class TV8ItIE(SkyItVideoIE):
    IE_NAME = 'tv8.it'
    _VALID_URL = r'https?://tv8\.it/showvideo/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://tv8.it/showvideo/630529/ogni-mattina-ucciso-asino-di-andrea-lo-cicero/18-11-2020/',
        'md5': '9ab906a3f75ea342ed928442f9dabd21',
        'info_dict': {
            'id': '630529',
            'ext': 'mp4',
            'title': 'Ogni mattina - Ucciso asino di Andrea Lo Cicero',
            'timestamp': 1605721374,
            'upload_date': '20201118',
        }
    }]
    _DOMAIN = 'mtv8'
Author	SHA1	Message	Date
Jia Rong Yee	15f2734791	[nytimes] Add new cooking.nytimes.com extractor (#27143 ) * [nytimes] support cooking.nytimes.com, resolves #27112 Co-authored-by: remitamine <remitamine@gmail.com>	2020-11-22 14:12:47 +01:00
Remita Amine	cb6e24f946	[lbry] relax _VALID_URL regex(closes #27144 )	2020-11-22 13:16:03 +01:00
Remita Amine	9d531aa291	[rumble] add support for embed pages(#10785 )	2020-11-22 13:07:04 +01:00
Remita Amine	e9cbb98a0f	[skyit] add support for multiple Sky Italia websites(closes #26629 )	2020-11-22 13:01:56 +01:00
Remita Amine	193422e12a	[extractor/common] add generic support for akamai http format extraction	2020-11-22 12:54:55 +01:00