diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index ea0a59dca..febbd2344 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v <your command line>`), copy the WHOLE output and insert it below. It should look similar to this: [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.10 + [debug] youtube-dl version 2021.04.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index d24855c72..d7296d0a9 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 8b96a2883..92e616a1a 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index e46971047..b55739f6c 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v <your command line>`), copy the WHOLE output and insert it below. It should look similar to this: [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.10 + [debug] youtube-dl version 
2021.04.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index a9ca379ca..dbdb8356a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 384bd19c2..22b4fa67d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,164 @@ +version 2021.04.07 + +Core +* [extractor/common] Use compat_cookies_SimpleCookie for _get_cookies ++ [compat] Introduce compat_cookies_SimpleCookie +* [extractor/common] Improve JSON-LD author extraction +* [extractor/common] Fix _get_cookies on python 2 (#20673, #23256, #20326, + #28640) + +Extractors +* [youtube] Fix extraction of videos with restricted location (#28685) ++ [line] Add support for live.line.me (#17205, #28658) +* [vimeo] Improve extraction (#28591) +* [youku] Update ccode (#17852, #28447, #28460, #28648) +* [youtube] Prefer direct entry metadata over entry metadata from playlist + (#28619, #28636) +* [screencastomatic] Fix extraction (#11976, #24489) ++ [palcomp3] Add support for palcomp3.com (#13120) ++ [arnes] Add support for video.arnes.si (#28483) ++ [youtube:tab] Add support for hashtags (#28308) + + +version 2021.04.01 + +Extractors +* [youtube] Setup CONSENT cookie when needed (#28604) +* [vimeo] Fix password protected review extraction (#27591) +* [youtube] Improve age-restricted video extraction (#28578) + + +version 2021.03.31 + +Extractors +* [vlive] Fix inkey request (#28589) +* [francetvinfo] Improve video id extraction (#28584) ++ [instagram] Extract duration (#28469) +* [instagram] Improve title extraction (#28469) ++ [sbs] Add support for ondemand watch URLs (#28566) +* [youtube] Fix video's channel extraction (#28562) +* [picarto] Fix live stream extraction (#28532) +* [vimeo] Fix unlisted video extraction (#28414) +* [youtube:tab] Fix playlist/community continuation items extraction (#28266) +* [ard] Improve clip id extraction (#22724, #28528) + + +version 2021.03.25 + +Extractors ++ [zoom] Add support for zoom.us (#16597, #27002, #28531) +* [bbc] Fix BBC IPlayer Episodes/Group extraction (#28360) +* [youtube] Fix default value for youtube_include_dash_manifest (#28523) +* [zingmp3] Fix extraction (#11589, #16409, #16968, #27205) ++ [vgtv] Add support for new tv.aftonbladet.se URL schema (#28514) ++ [tiktok] Detect private videos (#28453) +* [vimeo:album] Fix extraction for albums with number of videos multiple + to page size (#28486) +* [vvvvid] Fix kenc format extraction (#28473) +* [mlb] Fix video extraction (#21241) +* [svtplay] Improve extraction (#28448) +* [applepodcasts] Fix extraction (#28445) +* [rtve] Improve extraction + + Extract all formats + * Fix RTVE Infantil extraction (#24851) + + Extract is_live and series + + +version 2021.03.14 + +Core ++ Introduce release_timestamp meta field (#28386) + +Extractors ++ [southpark] Add support for southparkstudios.com (#28413) +* [southpark] Fix extraction (#26763, #28413) +* [sportdeutschland] Fix extraction (#21856, #28425) +* [pinterest] Reduce the number of HLS 
format requests +* [peertube] Improve thumbnail extraction (#28419) +* [tver] Improve title extraction (#28418) +* [fujitv] Fix HLS formats extension (#28416) +* [shahid] Fix format extraction (#28383) ++ [lbry] Add support for channel filters (#28385) ++ [bandcamp] Extract release timestamp ++ [lbry] Extract release timestamp (#28386) +* [pornhub] Detect flagged videos ++ [pornhub] Extract formats from get_media end point (#28395) +* [bilibili] Fix video info extraction (#28341) ++ [cbs] Add support for Paramount+ (#28342) ++ [trovo] Add Origin header to VOD formats (#28346) +* [voxmedia] Fix volume embed extraction (#28338) + + +version 2021.03.03 + +Extractors +* [youtube:tab] Switch continuation to browse API (#28289, #28327) +* [9c9media] Fix extraction for videos with multiple ContentPackages (#28309) ++ [bbc] Add support for BBC Reel videos (#21870, #23660, #28268) + + +version 2021.03.02 + +Extractors +* [zdf] Rework extractors (#11606, #13473, #17354, #21185, #26711, #27068, + #27930, #28198, #28199, #28274) + * Generalize cross-extractor video ids for zdf based extractors + * Improve extraction + * Fix 3sat and phoenix +* [stretchinternet] Fix extraction (#28297) +* [urplay] Fix episode data extraction (#28292) ++ [bandaichannel] Add support for b-ch.com (#21404) +* [srgssr] Improve extraction (#14717, #14725, #27231, #28238) + + Extract subtitle + * Fix extraction for new videos + * Update srf download domains +* [vvvvid] Reduce season request payload size ++ [vvvvid] Extract series sublists playlist title (#27601, #27618) ++ [dplay] Extract Ad-Free uplynk URLs (#28160) ++ [wat] Detect DRM protected videos (#27958) +* [tf1] Improve extraction (#27980, #28040) +* [tmz] Fix and improve extraction (#24603, #24687, #28211) ++ [gedidigital] Add support for Gedi group sites (#7347, #26946) +* [youtube] Fix get_video_info request + + +version 2021.02.22 + +Core ++ [postprocessor/embedthumbnail] Recognize atomicparsley binary in lowercase + (#28112) + +Extractors +* [apa] Fix and improve extraction (#27750) ++ [youporn] Extract duration (#28019) ++ [peertube] Add support for canard.tube (#28190) +* [youtube] Fixup m4a_dash formats (#28165) ++ [samplefocus] Add support for samplefocus.com (#27763) ++ [vimeo] Add support for unlisted video source format extraction +* [viki] Improve extraction (#26522, #28203) + * Extract uploader URL and episode number + * Report login required error + + Extract 480p formats + * Fix API v4 calls +* [ninegag] Unescape title (#28201) +* [youtube] Improve URL regular expression (#28193) ++ [youtube] Add support for redirect.invidious.io (#28193) ++ [dplay] Add support for de.hgtv.com (#28182) ++ [dplay] Add support for discoveryplus.com (#24698) ++ [simplecast] Add support for simplecast.com (#24107) +* [youtube] Fix uploader extraction in flat playlist mode (#28045) +* [yandexmusic:playlist] Request missing tracks in chunks (#27355, #28184) ++ [storyfire] Add support for storyfire.com (#25628, #26349) ++ [zhihu] Add support for zhihu.com (#28177) +* [youtube] Fix controversial videos when authenticated with cookies (#28174) +* [ccma] Fix timestamp parsing in python 2 ++ [videopress] Add support for video.wordpress.com +* [kakao] Improve info extraction and detect geo restriction (#26577) +* [xboxclips] Fix extraction (#27151) +* [ard] Improve formats extraction (#28155) ++ [canvas] Add support for dagelijksekost.een.be (#28119) + + version 2021.02.10 Extractors diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1373cc4f6..ff9177a2c 100644 --- 
a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -82,6 +82,7 @@ - **awaan:video** - **AZMedien**: AZ Medien videos - **BaiduVideo**: 百度视频 + - **bandaichannel** - **Bandcamp** - **Bandcamp:album** - **Bandcamp:weekly** @@ -89,7 +90,8 @@ - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles - - **bbc.co.uk:iplayer:playlist** + - **bbc.co.uk:iplayer:episodes** + - **bbc.co.uk:iplayer:group** - **bbc.co.uk:playlist** - **BBVTV** - **Beatport** @@ -212,6 +214,7 @@ - **curiositystream** - **curiositystream:collection** - **CWTV** + - **DagelijkseKost**: dagelijksekost.een.be - **DailyMail** - **dailymotion** - **dailymotion:playlist** @@ -233,6 +236,7 @@ - **DiscoveryGo** - **DiscoveryGoPlaylist** - **DiscoveryNetworksDe** + - **DiscoveryPlus** - **DiscoveryVR** - **Disney** - **dlive:stream** @@ -328,6 +332,7 @@ - **Gaskrank** - **Gazeta** - **GDCVault** + - **GediDigital** - **generic**: Generic downloader that works on some sites - **Gfycat** - **GiantBomb** @@ -353,6 +358,7 @@ - **HentaiStigma** - **hetklokhuis** - **hgtv.com:show** + - **HGTVDe** - **HiDive** - **HistoricFilms** - **history:player** @@ -457,6 +463,8 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LineLive** + - **LineLiveChannel** - **LineTV** - **linkedin:learning** - **linkedin:learning:course** @@ -517,6 +525,7 @@ - **mixcloud:playlist** - **mixcloud:user** - **MLB** + - **MLBVideo** - **Mnet** - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -672,6 +681,9 @@ - **OutsideTV** - **PacktPub** - **PacktPubCourse** + - **PalcoMP3:artist** + - **PalcoMP3:song** + - **PalcoMP3:video** - **pandora.tv**: 판도라TV - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos @@ -803,6 +815,7 @@ - **safari:course**: safaribooksonline.com online courses - **SAKTV** - **SaltTV** + - **SampleFocus** - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -825,6 +838,9 @@ - **ShahidShow** - **Shared**: shared.sx - **ShowRoomLive** + - **simplecast** + - **simplecast:episode** + - **simplecast:podcast** - **Sina** - **sky.it** - **sky:news** @@ -877,6 +893,9 @@ - **Steam** - **Stitcher** - **StitcherShow** + - **StoryFire** + - **StoryFireSeries** + - **StoryFireUser** - **Streamable** - **streamcloud.eu** - **StreamCZ** @@ -1045,6 +1064,7 @@ - **Vidbit** - **Viddler** - **Videa** + - **video.arnes.si**: Arnes Video - **video.google:search**: Google Video search - **video.sky.it** - **video.sky.it:live** @@ -1198,5 +1218,8 @@ - **ZattooLive** - **ZDF** - **ZDFChannel** + - **Zhihu** - **zingmp3**: mp3.zing.vn + - **zingmp3:album** + - **zoom** - **Zype** diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ecac31f7a..8f65c6499 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1511,14 +1511,18 @@ class YoutubeDL(object): if 'display_id' not in info_dict and 'id' in info_dict: info_dict['display_id'] = info_dict['id'] - if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around out-of-range timestamp values (e.g. 
negative ones on Windows, - # see http://bugs.python.org/issue1646728) - try: - upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass + for ts_key, date_key in ( + ('timestamp', 'upload_date'), + ('release_timestamp', 'release_date'), + ): + if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) + info_dict[date_key] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 6c3d49d45..9e45c454b 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -73,6 +73,15 @@ try: except ImportError: # Python 2 import Cookie as compat_cookies +if sys.version_info[0] == 2: + class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): + def load(self, rawdata): + if isinstance(rawdata, compat_str): + rawdata = str(rawdata) + return super(compat_cookies_SimpleCookie, self).load(rawdata) +else: + compat_cookies_SimpleCookie = compat_cookies.SimpleCookie + try: import html.entities as compat_html_entities except ImportError: # Python 2 @@ -3000,6 +3009,7 @@ __all__ = [ 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', + 'compat_cookies_SimpleCookie', 'compat_ctypes_WINFUNCTYPE', 'compat_etree_Element', 'compat_etree_fromstring', diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py index 98ccdaa4a..cbc1c0ecb 100644 --- a/youtube_dl/extractor/apa.py +++ b/youtube_dl/extractor/apa.py @@ -6,25 +6,21 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, - js_to_json, + int_or_none, url_or_none, ) class APAIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', 'info_dict': { - 'id': 'jjv85FdZ', + 'id': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'ext': 'mp4', - 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'title': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 254, - 'timestamp': 1519211149, - 'upload_date': '20180221', }, }, { 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', @@ -46,9 +42,11 @@ class APAIE(InfoExtractor): webpage)] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id, base_url = mobj.group('id', 'base_url') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + '%s/player/%s' % (base_url, video_id), video_id) jwplatform_id = self._search_regex( r'media[iI]d\s*:\s*["\'](?P<id>[a-zA-Z0-9]{8})', webpage, @@ -59,16 +57,18 @@ class APAIE(InfoExtractor): 'jwplatform:' + jwplatform_id, ie='JWPlatform', video_id=video_id) - sources = self._parse_json( - self._search_regex( 
r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), - video_id, transform_source=js_to_json) + def extract(field, name=None): + return self._search_regex( + r'\b%s["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % field, + webpage, name or field, default=None, group='value') + + title = extract('title') or video_id + description = extract('description') + thumbnail = extract('poster', 'thumbnail') formats = [] - for source in sources: - if not isinstance(source, dict): - continue - source_url = url_or_none(source.get('file')) + for format_id in ('hls', 'progressive'): + source_url = url_or_none(extract(format_id)) if not source_url: continue ext = determine_ext(source_url) @@ -77,18 +77,19 @@ class APAIE(InfoExtractor): source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) else: + height = int_or_none(self._search_regex( + r'(\d+)\.mp4', source_url, 'height', default=None)) formats.append({ 'url': source_url, + 'format_id': format_id, + 'height': height, }) self._sort_formats(formats) - thumbnail = self._search_regex( - r'image\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'thumbnail', fatal=False, group='url') - return { 'id': video_id, - 'title': video_id, + 'title': title, + 'description': description, 'thumbnail': thumbnail, 'formats': formats, } diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index 95758fece..6a74de758 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -42,6 +42,7 @@ class ApplePodcastsIE(InfoExtractor): ember_data = self._parse_json(self._search_regex( r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', webpage, 'ember data'), episode_id) + ember_data = ember_data.get(episode_id) or ember_data episode = ember_data['data']['attributes'] description = episode.get('description') or {} diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 143fc51e9..d57c5ba0f 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -335,7 +335,7 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)' + _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?:[^/]+/)?(?:player|live|video)/(?:[^/]+/)*(?P<id>Y3JpZDovL[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -365,22 +365,22 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): }, { 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id + video_id = self._match_id(url) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', 
- display_id, data=json.dumps({ + video_id, data=json.dumps({ 'query': '''{ - playerPage(client:"%s", clipId: "%s") { + playerPage(client: "ard", clipId: "%s") { blockedByFsk broadcastedOn maturityContentRating @@ -410,7 +410,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): } } } -}''' % (mobj.group('client'), video_id), +}''' % video_id, }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] @@ -435,7 +435,6 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) info.update({ 'age_limit': age_limit, - 'display_id': display_id, 'title': title, 'description': description, 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), diff --git a/youtube_dl/extractor/arnes.py b/youtube_dl/extractor/arnes.py new file mode 100644 index 000000000..c0032fcab --- /dev/null +++ b/youtube_dl/extractor/arnes.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, + remove_start, +) + + +class ArnesIE(InfoExtractor): + IE_NAME = 'video.arnes.si' + IE_DESC = 'Arnes Video' + _VALID_URL = r'https?://video\.arnes\.si/(?:[a-z]{2}/)?(?:watch|embed|api/(?:asset|public/video))/(?P<id>[0-9a-zA-Z]{12})' + _TESTS = [{ + 'url': 'https://video.arnes.si/watch/a1qrWTOQfVoU?t=10', + 'md5': '4d0f4d0a03571b33e1efac25fd4a065d', + 'info_dict': { + 'id': 'a1qrWTOQfVoU', + 'ext': 'mp4', + 'title': 'Linearna neodvisnost, definicija', + 'description': 'Linearna neodvisnost, definicija', + 'license': 'PRIVATE', + 'creator': 'Polona Oblak', + 'timestamp': 1585063725, + 'upload_date': '20200324', + 'channel': 'Polona Oblak', + 'channel_id': 'q6pc04hw24cj', + 'channel_url': 'https://video.arnes.si/?channel=q6pc04hw24cj', + 'duration': 596.75, + 'view_count': int, + 'tags': ['linearna_algebra'], + 'start_time': 10, + } + }, { + 'url': 'https://video.arnes.si/api/asset/s1YjnV7hadlC/play.mp4', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/en/watch/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC?t=123&hideRelated=1', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/api/public/video/s1YjnV7hadlC', + 'only_matching': True, + }] + _BASE_URL = 'https://video.arnes.si' + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + self._BASE_URL + '/api/public/video/' + video_id, video_id)['data'] + title = video['title'] + + formats = [] + for media in (video.get('media') or []): + media_url = media.get('url') + if not media_url: + continue + formats.append({ + 'url': self._BASE_URL + media_url, + 'format_id': remove_start(media.get('format'), 'FORMAT_'), + 'format_note': media.get('formatTranslation'), + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + }) + self._sort_formats(formats) + + channel = video.get('channel') or {} + channel_id = channel.get('url') + thumbnail = video.get('thumbnailUrl') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': self._BASE_URL + thumbnail, + 'description': video.get('description'), + 'license': video.get('license'), + 'creator': video.get('author'), + 'timestamp': parse_iso8601(video.get('creationTime')), + 'channel': 
channel.get('name'), + 'channel_id': channel_id, + 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None, + 'duration': float_or_none(video.get('duration'), 1000), + 'view_count': int_or_none(video.get('views')), + 'tags': video.get('hashtags'), + 'start_time': int_or_none(compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('t', [None])[0]), + } diff --git a/youtube_dl/extractor/bandaichannel.py b/youtube_dl/extractor/bandaichannel.py new file mode 100644 index 000000000..d67285913 --- /dev/null +++ b/youtube_dl/extractor/bandaichannel.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from ..utils import extract_attributes + + +class BandaiChannelIE(BrightcoveNewIE): + IE_NAME = 'bandaichannel' + _VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P<id>\d+/\d+)' + _TESTS = [{ + 'url': 'https://www.b-ch.com/titles/514/001', + 'md5': 'a0f2d787baa5729bed71108257f613a4', + 'info_dict': { + 'id': '6128044564001', + 'ext': 'mp4', + 'title': 'メタルファイターMIKU 第1話', + 'timestamp': 1580354056, + 'uploader_id': '5797077852001', + 'upload_date': '20200130', + 'duration': 1387.733, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + attrs = extract_attributes(self._search_regex( + r'(<video-js[^>]+\bid="bcplayer"[^>]*>)', webpage, 'player')) + bc = self._download_json( + 'https://pbifcd.b-ch.com/v1/playbackinfo/ST/70/' + attrs['data-info'], + video_id, headers={'X-API-KEY': attrs['data-auth'].strip()})['bc'] + return self._parse_brightcove_metadata(bc, bc['id']) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 69e673a26..006aab3b4 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -49,6 +49,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Ben Prunty', 'timestamp': 1396508491, 'upload_date': '20140403', + 'release_timestamp': 1396483200, 'release_date': '20140403', 'duration': 260.877, 'track': 'Lanius (Battle)', @@ -69,6 +70,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Mastodon', 'timestamp': 1322005399, 'upload_date': '20111122', + 'release_timestamp': 1076112000, 'release_date': '20040207', 'duration': 120.79, 'track': 'Hail to Fire', @@ -197,7 +199,7 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': unified_strdate(tralbum.get('album_release_date')), + 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index b4daee54e..e8d000bbb 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,14 +1,24 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import itertools +import json import re from .common import InfoExtractor +from ..compat import ( + compat_etree_Element, + compat_HTTPError, + compat_parse_qs, + compat_urllib_parse_urlparse, + compat_urlparse, +) from ..utils import ( + ExtractorError, + OnDemandPagedList, clean_html, dict_get, - ExtractorError, float_or_none, get_element_by_class, int_or_none, @@ -21,11 +31,6 @@ from ..utils import ( urlencode_postdata, urljoin, ) -from ..compat import ( - compat_etree_Element, - compat_HTTPError, - compat_urlparse, -) class BBCCoUkIE(InfoExtractor): @@ -793,11 
+798,25 @@ class BBCIE(BBCCoUkIE): 'description': 'Learn English words and phrases from this story', }, 'add_ie': [BBCCoUkIE.ie_key()], + }, { + # BBC Reel + 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness', + 'info_dict': { + 'id': 'p07c6sb9', + 'ext': 'mp4', + 'title': 'How positive thinking is harming your happiness', + 'alt_title': 'The downsides of positive thinking', + 'description': 'md5:fad74b31da60d83b8265954ee42d85b4', + 'duration': 235, + 'thumbnail': r're:https?://.+/p07c9dsr.jpg', + 'upload_date': '20190604', + 'categories': ['Psychology'], + }, }] @classmethod def suitable(cls, url): - EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE) return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) else super(BBCIE, cls).suitable(url)) @@ -929,7 +948,7 @@ class BBCIE(BBCCoUkIE): else: entry['title'] = info['title'] entry['formats'].extend(info['formats']) - except Exception as e: + except ExtractorError as e: # Some playlist URL may fail with 500, at the same time # the other one may work fine (e.g. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) @@ -980,6 +999,37 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness) + initial_data = self._parse_json(self._html_search_regex( + r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)', + webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False) + if initial_data: + init_data = try_get( + initial_data, lambda x: x['initData']['items'][0], dict) or {} + smp_data = init_data.get('smpData') or {} + clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {} + version_id = clip_data.get('versionID') + if version_id: + title = smp_data['title'] + formats, subtitles = self._download_media_selector(version_id) + self._sort_formats(formats) + image_url = smp_data.get('holdingImageURL') + display_date = init_data.get('displayDate') + topic_title = init_data.get('topicTitle') + + return { + 'id': version_id, + 'title': title, + 'formats': formats, + 'alt_title': init_data.get('shortTitle'), + 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None, + 'description': smp_data.get('summary') or init_data.get('shortSummary'), + 'upload_date': display_date.replace('-', '') if display_date else None, + 'subtitles': subtitles, + 'duration': int_or_none(clip_data.get('duration')), + 'categories': [topic_title] if topic_title else None, + } + # Morph based embed (e.g. 
http://www.bbc.co.uk/sport/live/olympics/36895975) # Several setPayload calls may be present but the video # seems to be always related to the first one @@ -1041,7 +1091,7 @@ class BBCIE(BBCCoUkIE): thumbnail = None image_url = current_programme.get('image_url') if image_url: - thumbnail = image_url.replace('{recipe}', '1920x1920') + thumbnail = image_url.replace('{recipe}', 'raw') return { 'id': programme_id, 'title': title, @@ -1293,21 +1343,149 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor): playlist_id, title, description) -class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): - IE_NAME = 'bbc.co.uk:iplayer:playlist' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX - _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' - _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' +class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX + + @staticmethod + def _get_default(episode, key, default_key='default'): + return try_get(episode, lambda x: x[key][default_key]) + + def _get_description(self, data): + synopsis = data.get(self._DESCRIPTION_KEY) or {} + return dict_get(synopsis, ('large', 'medium', 'small')) + + def _fetch_page(self, programme_id, per_page, series_id, page): + elements = self._get_elements(self._call_api( + programme_id, per_page, page + 1, series_id)) + for element in elements: + episode = self._get_episode(element) + episode_id = episode.get('id') + if not episode_id: + continue + thumbnail = None + image = self._get_episode_image(episode) + if image: + thumbnail = image.replace('{recipe}', 'raw') + category = self._get_default(episode, 'labels', 'category') + yield { + '_type': 'url', + 'id': episode_id, + 'title': self._get_episode_field(episode, 'subtitle'), + 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id, + 'thumbnail': thumbnail, + 'description': self._get_description(episode), + 'categories': [category] if category else None, + 'series': self._get_episode_field(episode, 'title'), + 'ie_key': BBCCoUkIE.ie_key(), + } + + def _real_extract(self, url): + pid = self._match_id(url) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + series_id = qs.get('seriesId', [None])[0] + page = qs.get('page', [None])[0] + per_page = 36 if page else self._PAGE_SIZE + fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id) + entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE) + playlist_data = self._get_playlist_data(self._call_api(pid, 1)) + return self.playlist_result( + entries, pid, self._get_playlist_title(playlist_data), + self._get_description(playlist_data)) + + +class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:episodes' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes' _TESTS = [{ 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', 'info_dict': { 'id': 'b05rcz9v', 'title': 'The Disappearance', - 'description': 'French thriller serial about a missing teenager.', + 'description': 'md5:58eb101aee3116bad4da05f91179c0cb', }, - 'playlist_mincount': 6, - 'skip': 'This programme is not currently available on BBC iPlayer', + 'playlist_mincount': 8, }, { + # all seasons + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 
'playlist_mincount': 10, + }, { + # explicit season + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 5, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 37, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 1, + }] + _PAGE_SIZE = 100 + _DESCRIPTION_KEY = 'synopsis' + + def _get_episode_image(self, episode): + return self._get_default(episode, 'image') + + def _get_episode_field(self, episode, field): + return self._get_default(episode, field) + + @staticmethod + def _get_elements(data): + return data['entities']['results'] + + @staticmethod + def _get_episode(element): + return element.get('episode') or {} + + def _call_api(self, pid, per_page, page=1, series_id=None): + variables = { + 'id': pid, + 'page': page, + 'perPage': per_page, + } + if series_id: + variables['sliceId'] = series_id + return self._download_json( + 'https://graph.ibl.api.bbc.co.uk/', pid, headers={ + 'Content-Type': 'application/json' + }, data=json.dumps({ + 'id': '5692d93d5aac8d796a0305e895e61551', + 'variables': variables, + }).encode('utf-8'))['data']['programme'] + + @staticmethod + def _get_playlist_data(data): + return data + + def _get_playlist_title(self, data): + return self._get_default(data, 'title') + + +class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:group' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group' + _TESTS = [{ # Available for over a year unlike 30 days for most other programmes 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', 'info_dict': { @@ -1316,14 +1494,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', }, 'playlist_mincount': 10, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 47, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 11, }] + _PAGE_SIZE = 200 + _DESCRIPTION_KEY = 'synopses' - def _extract_title_and_description(self, webpage): - title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) - description = self._search_regex( - r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>', - webpage, 'description', fatal=False, group='value') - return title, description + def _get_episode_image(self, episode): + return self._get_default(episode, 'images', 'standard') + + def _get_episode_field(self, episode, field): + return episode.get(field) + + @staticmethod + def _get_elements(data): + return data['elements'] + + @staticmethod + def _get_episode(element): + return element + + def _call_api(self, pid, per_page, page=1, series_id=None): + return self._download_json( + 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid, + pid, query={ + 'page': page, + 'per_page': per_page, + })['group_episodes'] + + @staticmethod + def _get_playlist_data(data): + return data['group'] + + def _get_playlist_title(self, data): + return data.get('title') class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 4dc597e16..589fdc1ce 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -156,6 +156,7 @@ class BiliBiliIE(InfoExtractor): cid = js['result']['cid'] headers = { + 'Accept': 'application/json', 'Referer': url } headers.update(self.geo_verification_headers()) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 4a19a73d2..c79e55a75 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -27,7 +27,7 @@ class CBSBaseIE(ThePlatformFeedIE): class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)' + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs|paramountplus)\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -52,6 +52,9 @@ class CBSIE(CBSBaseIE): }, { 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, + }, { + 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/', + 'only_matching': True, }] def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 83b764762..a891c9a55 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -1,38 +1,113 @@ from __future__ import unicode_literals -from .cbs import CBSBaseIE +import re + +# from .cbs import CBSBaseIE +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, +) -class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P<id>[^/?#&]+)' - +# class CBSSportsEmbedIE(CBSBaseIE): +class CBSSportsEmbedIE(InfoExtractor): + IE_NAME = 'cbssports:embed' + _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? 
+ (?: + ids%3D(?P<uuid>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})| + pcid%3D(?P<pcid>\d+) + )''' _TESTS = [{ - 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', - 'info_dict': { - 'id': '1214315075735', - 'ext': 'mp4', - 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', - 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', - 'timestamp': 1524111457, - 'upload_date': '20180419', - 'uploader': 'CBSI-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } + 'url': 'https://www.cbssports.com/player/embed/?args=player_id%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26ids%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26resizable%3D1%26autoplay%3Dtrue%26domain%3Dcbssports.com%26comp_ads_enabled%3Dfalse%26watchAndRead%3D0%26startTime%3D0%26env%3Dprod', + 'only_matching': True, }, { - 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'url': 'https://embed.247sports.com/player/embed/?args=%3fplayer_id%3d1827823171591%26channel%3dcollege-football-recruiting%26pcid%3d1827823171591%26width%3d640%26height%3d360%26autoplay%3dTrue%26comp_ads_enabled%3dFalse%26uvpc%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_v4%2526partner%253d247%26uvpc_m%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_m_v4%2526partner_m%253d247_mobile%26utag%3d247sportssite%26resizable%3dTrue', 'only_matching': True, }] - def _extract_video_info(self, filter_query, video_id): - return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + # def _extract_video_info(self, filter_query, video_id): + # return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) def _real_extract(self, url): + uuid, pcid = re.match(self._VALID_URL, url).groups() + query = {'id': uuid} if uuid else {'pcid': pcid} + video = self._download_json( + 'https://www.cbssports.com/api/content/video/', + uuid or pcid, query=query)[0] + video_id = video['id'] + title = video['title'] + metadata = video.get('metaData') or {} + # return self._extract_video_info('byId=%d' % metadata['mpxOutletId'], video_id) + # return self._extract_video_info('byGuid=' + metadata['mpxRefId'], video_id) + + formats = self._extract_m3u8_formats( + metadata['files'][0]['url'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + + image = video.get('image') + thumbnails = None + if image: + image_path = image.get('path') + if image_path: + thumbnails = [{ + 'url': image_path, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + 'filesize': int_or_none(image.get('size')), + }] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video.get('description'), + 'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])), + 'duration': int_or_none(metadata.get('duration')), + } + + +class CBSSportsBaseIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], - webpage, 'video id') - return self._extract_video_info('byId=%s' % video_id, video_id) + iframe_url = self._search_regex( 
r'<iframe[^>]+(?:data-)?src="(https?://[^/]+/player/embed[^"]+)"', + webpage, 'embed url') + return self.url_result(iframe_url, CBSSportsEmbedIE.ie_key()) + + +class CBSSportsIE(CBSSportsBaseIE): + IE_NAME = 'cbssports' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cbssports.com/college-football/video/cover-3-stanford-spring-gleaning/', + 'info_dict': { + 'id': 'b56c03a6-231a-4bbe-9c55-af3c8a8e9636', + 'ext': 'mp4', + 'title': 'Cover 3: Stanford Spring Gleaning', + 'description': 'The Cover 3 crew break down everything you need to know about the Stanford Cardinal this spring.', + 'timestamp': 1617218398, + 'upload_date': '20210331', + 'duration': 502, + }, + }] + + +class TwentyFourSevenSportsIE(CBSSportsBaseIE): + IE_NAME = '247sports' + _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://247sports.com/Video/2021-QB-Jake-Garcia-senior-highlights-through-five-games-10084854/', + 'info_dict': { + 'id': '4f1265cb-c3b5-44a8-bb1d-1914119a0ccc', + 'ext': 'mp4', + 'title': '2021 QB Jake Garcia senior highlights through five games', + 'description': 'md5:8cb67ebed48e2e6adac1701e0ff6e45b', + 'timestamp': 1607114223, + 'upload_date': '20201204', + 'duration': 208, + }, + }] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8eb110f4e..797c35fd5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -17,7 +17,7 @@ import math from ..compat import ( compat_cookiejar_Cookie, - compat_cookies, + compat_cookies_SimpleCookie, compat_etree_Element, compat_etree_fromstring, compat_getpass, @@ -230,8 +230,10 @@ class InfoExtractor(object): uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. + release_timestamp: UNIX timestamp of the moment the video was released. release_date: The date (YYYYMMDD) when the video was released. - timestamp: UNIX timestamp of the moment the video became available. + timestamp: UNIX timestamp of the moment the video became available + (uploaded). upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. @@ -1273,6 +1275,7 @@ class InfoExtractor(object): def extract_video_object(e): assert e['@type'] == 'VideoObject' + author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), - 'uploader': str_or_none(e.get('author')), + # author can be an instance of 'Organization' or 'Person' types. + # both types can have a 'name' property (inherited from 'Thing' type). [1] + # however some websites are using 'Text' type instead. + # 1. 
https://schema.org/VideoObject + 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, 'filesize': float_or_none(e.get('contentSize')), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), @@ -2894,10 +2901,10 @@ class InfoExtractor(object): self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_cookies.SimpleCookie with the cookies for the url """ + """ Return a compat_cookies_SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies.SimpleCookie(req.get_header('Cookie')) + return compat_cookies_SimpleCookie(req.get_header('Cookie')) def _apply_first_set_cookie_header(self, url_handle, cookie): """ diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index e4a7fca6c..ae64a07d7 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -25,12 +25,12 @@ class CuriosityStreamBaseIE(InfoExtractor): raise ExtractorError( '%s said: %s' % (self.IE_NAME, error), expected=True) - def _call_api(self, path, video_id): + def _call_api(self, path, video_id, query=None): headers = {} if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( - self._API_BASE_URL + path, video_id, headers=headers) + self._API_BASE_URL + path, video_id, headers=headers, query=query) self._handle_errors(result) return result['data'] @@ -52,62 +52,75 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' _TEST = { 'url': 'https://app.curiositystream.com/video/2', - 'md5': '262bb2f257ff301115f1973540de8983', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', - } + }, + 'params': { + 'format': 'bestvideo', + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url) - media = self._call_api('media/' + video_id, video_id) - title = media['title'] formats = [] - for encoding in media.get('encodings', []): - m3u8_url = encoding.get('master_playlist_url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - encoding_url = encoding.get('url') - file_url = encoding.get('file_url') - if not encoding_url and not file_url: - continue - f = { - 'width': int_or_none(encoding.get('width')), - 'height': int_or_none(encoding.get('height')), - 'vbr': int_or_none(encoding.get('video_bitrate')), - 'abr': int_or_none(encoding.get('audio_bitrate')), - 'filesize': int_or_none(encoding.get('size_in_bytes')), - 'vcodec': encoding.get('video_codec'), - 'acodec': encoding.get('audio_codec'), - 'container': encoding.get('container_type'), - } - for f_url in (encoding_url, file_url): - if not f_url: + for encoding_format in ('m3u8', 'mpd'): + media = self._call_api('media/' + video_id, video_id, query={ + 'encodingsNew': 'true', + 'encodingsFormat': encoding_format, + }) + for encoding in media.get('encodings', []): + playlist_url = encoding.get('master_playlist_url') + if encoding_format == 'm3u8': + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol + formats.extend(self._extract_m3u8_formats( + playlist_url, video_id, 
'mp4', + m3u8_id='hls', fatal=False)) + elif encoding_format == 'mpd': + formats.extend(self._extract_mpd_formats( + playlist_url, video_id, mpd_id='dash', fatal=False)) + encoding_url = encoding.get('url') + file_url = encoding.get('file_url') + if not encoding_url and not file_url: continue - fmt = f.copy() - rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url) - if rtmp: - fmt.update({ - 'url': rtmp.group('url'), - 'play_path': rtmp.group('playpath'), - 'app': rtmp.group('app'), - 'ext': 'flv', - 'format_id': 'rtmp', - }) - else: - fmt.update({ - 'url': f_url, - 'format_id': 'http', - }) - formats.append(fmt) + f = { + 'width': int_or_none(encoding.get('width')), + 'height': int_or_none(encoding.get('height')), + 'vbr': int_or_none(encoding.get('video_bitrate')), + 'abr': int_or_none(encoding.get('audio_bitrate')), + 'filesize': int_or_none(encoding.get('size_in_bytes')), + 'vcodec': encoding.get('video_codec'), + 'acodec': encoding.get('audio_codec'), + 'container': encoding.get('container_type'), + } + for f_url in (encoding_url, file_url): + if not f_url: + continue + fmt = f.copy() + rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': 'rtmp', + }) + else: + fmt.update({ + 'url': f_url, + 'format_id': 'http', + }) + formats.append(fmt) self._sort_formats(formats) + title = media['title'] + subtitles = {} for closed_caption in media.get('closed_captions', []): sub_url = closed_caption.get('file') @@ -140,7 +153,7 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): 'title': 'Curious Minds: The Internet', 'description': 'How is the internet shaping our lives in the 21st Century?', }, - 'playlist_mincount': 17, + 'playlist_mincount': 16, }, { 'url': 'https://curiositystream.com/series/2', 'only_matching': True, diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 0f0632f26..bbb199094 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -330,6 +330,7 @@ class DiscoveryPlusIE(DPlayIE): 'videoId': video_id, 'wisteriaProperties': { 'platform': 'desktop', + 'product': 'dplus_us', }, }).encode('utf-8'))['data']['attributes']['streaming'] diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 848d387d1..5a07c18f4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,193 +1,43 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - xpath_text, - determine_ext, - float_or_none, - ExtractorError, -) +from .zdf import ZDFIE -class DreiSatIE(InfoExtractor): +class DreiSatIE(ZDFIE): IE_NAME = '3sat' - _GEO_COUNTRIES = ['DE'] - _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', - 'md5': 'be37228896d30a88f315b638900a026e', - 'info_dict': { - 'id': '45918', - 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': 'SCHWEIZWEIT', - 'uploader_id': '100000210', - 'upload_date': '20140913' - }, - 'params': { - 'skip_download': True, # m3u8 downloads - } + _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' + _TESTS = [{ + # Same as 
https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html 'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html', 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', 'info_dict': { 'id': '141007_ab18_10wochensommer_film', 'ext': 'mp4', 'title': 'Ab 18! - 10 Wochen Sommer', 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', 'duration': 2660, 'timestamp': 1608604200, 'upload_date': '20201222', }, - { - 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', - 'only_matching': True, + }, { + 'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html', + 'info_dict': { + 'id': '140913_sendung_schweizweit', + 'ext': 'mp4', + 'title': 'Waidmannsheil', + 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', + 'timestamp': 1410623100, + 'upload_date': '20140913' }, - ] - - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - param_groups = {} - for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.get(self._xpath_ns( - 'id', 'http://www.w3.org/XML/1998/namespace')) - params = {} - for param in param_group: - params[param.get('name')] = param.get('value') - param_groups[group_id] = params - - formats = [] - for video in smil.findall(self._xpath_ns('.//video', namespace)): - src = video.get('src') - if not src: - continue - bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - group_id = video.get('paramGroup') - param_group = param_groups[group_id] - for proto in param_group['protocols'].split(','): - formats.append({ - 'url': '%s://%s' % (proto, param_group['host']), - 'app': param_group['app'], - 'play_path': src, - 'ext': 'flv', - 'format_id': '%s-%d' % (proto, bitrate), - 'tbr': bitrate, - }) - self._sort_formats(formats) - return formats - - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - status_code = xpath_text(doc, './status/statuscode') - if status_code and status_code != 'ok': - if status_code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, status_code) - raise ExtractorError(message, expected=True) - - title = xpath_text(doc, './/information/title', 'title', True) - - urls = [] - formats = [] - for fnode in doc.findall('.//formitaeten/formitaet'): - video_url = xpath_text(fnode, 'url') - if not video_url or video_url in urls: - continue - urls.append(video_url) - - is_available = 'http://www.metafilegenerator' not in video_url - geoloced = 'static_geoloced_online' in video_url - if not is_available or geoloced: - continue - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ - (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - # the certificates are misconfigured (see - # https://github.com/ytdl-org/youtube-dl/issues/8665) - if video_url.startswith('https://'): - continue - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 
'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - quality = xpath_text(fnode, './quality') - if quality: - format_id += '-' + quality - - abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - - tbr = int_or_none(self._search_regex( - r'_(\d+)k', video_url, 'bitrate', None)) - if tbr and vbr and not abr: - abr = tbr - vbr - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(xpath_text(fnode, './width')), - 'height': int_or_none(xpath_text(fnode, './height')), - 'filesize': int_or_none(xpath_text(fnode, './filesize')), - 'protocol': format_m.group('proto').lower(), - }) - - geolocation = xpath_text(doc, './/details/geolocation') - if not formats and geolocation and geolocation != 'none': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - thumbnails = [] - for node in doc.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - - upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) - - return { - 'id': video_id, - 'title': title, - 'description': xpath_text(doc, './/information/detail'), - 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), - 'thumbnails': thumbnails, - 'uploader': xpath_text(doc, './/details/originChannelTitle'), - 'uploader_id': xpath_text(doc, './/details/originChannelId'), - 'upload_date': upload_date, - 'formats': formats, + 'params': { + 'skip_download': True, } - - def _real_extract(self, url): - video_id = self._match_id(url) - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id - return self.extract_from_xml_url(video_id, details_url) + }, { + # Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html + 'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html', + 'only_matching': True, + }, { + # Same as https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids + 'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1a39c25c5..ac33cd996 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -72,6 +72,7 @@ from .arte import ( ArteTVEmbedIE, ArteTVPlaylistIE, ) +from .arnes import ArnesIE from .asiancrush import ( AsianCrushIE, AsianCrushPlaylistIE, @@ -90,11 +91,13 @@ from .awaan import ( ) from .azmedien import AZMedienIE from .baidu import BaiduVideoIE +from .bandaichannel import BandaiChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, BBCCoUkArticleIE, - BBCCoUkIPlayerPlaylistIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, BBCIE, ) @@ -188,7 +191,11 @@ from .cbsnews import ( CBSNewsIE, CBSNewsLiveVideoIE, ) -from .cbssports import CBSSportsIE +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) from .ccc import ( 
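
The dreisat.py rewrite above follows the consolidation pattern used throughout this release: 3sat.de is served by the same player and API as zdf.de, so DreiSatIE now inherits the entire extraction path from ZDFIE and only overrides the URL pattern and tests. A toy sketch of the pattern, using stand-in classes rather than the real InfoExtractor API:

    import re

    class ParentIE(object):
        # Stand-in for ZDFIE; the real class downloads the player JSON here.
        _VALID_URL = r'https?://(?:www\.)?zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'

        def extract(self, url):
            return {'id': re.match(self._VALID_URL, url).group('id')}

    class ChildIE(ParentIE):
        # Stand-in for DreiSatIE: same backend, different front door.
        _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'

    print(ChildIE().extract('https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html'))
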
CCCIE, CCCPlaylistIE, @@ -421,6 +428,7 @@ from .gamestar import GameStarIE from .gaskrank import GaskrankIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE from .generic import GenericIE from .gfycat import GfycatIE from .giantbomb import GiantBombIE @@ -591,7 +599,11 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) -from .line import LineTVIE +from .line import ( + LineTVIE, + LineLiveIE, + LineLiveChannelIE, +) from .linkedin import ( LinkedInLearningIE, LinkedInLearningCourseIE, @@ -628,6 +640,7 @@ from .mangomolo import ( MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE from .markiza import ( MarkizaIE, MarkizaPageIE, @@ -671,7 +684,10 @@ from .mixcloud import ( MixcloudUserIE, MixcloudPlaylistIE, ) -from .mlb import MLBIE +from .mlb import ( + MLBIE, + MLBVideoIE, +) from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import ( @@ -872,6 +888,11 @@ from .packtpub import ( PacktPubIE, PacktPubCourseIE, ) +from .palcomp3 import ( + PalcoMP3IE, + PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE @@ -1619,5 +1640,9 @@ from .zattoo import ( ) from .zdf import ZDFIE, ZDFChannelIE from .zhihu import ZhihuIE -from .zingmp3 import ZingMp3IE +from .zingmp3 import ( + ZingMp3IE, + ZingMp3AlbumIE, +) +from .zoom import ZoomIE from .zype import ZypeIE diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 3ca415077..7cc88bf18 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -399,7 +399,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): video_id = self._search_regex( (r'player\.load[^;]+src:\s*["\']([^"\']+)', r'id-video=([^@]+@[^"]+)', - r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'data-id=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), webpage, 'video id') return self._make_url_result(video_id) diff --git a/youtube_dl/extractor/fujitv.py b/youtube_dl/extractor/fujitv.py index 39685e075..a02a94374 100644 --- a/youtube_dl/extractor/fujitv.py +++ b/youtube_dl/extractor/fujitv.py @@ -17,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) formats = self._extract_m3u8_formats( - self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4') for f in formats: wh = self._BITRATE_MAP.get(f.get('tbr')) if wh: diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py new file mode 100644 index 000000000..6c4153b40 --- /dev/null +++ b/youtube_dl/extractor/gedidigital.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, +) + + +class GediDigitalIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://video\. 
+ (?: + (?: + (?:espresso\.)?repubblica + |lastampa + |ilsecoloxix + )| + (?: + iltirreno + |messaggeroveneto + |ilpiccolo + |gazzettadimantova + |mattinopadova + |laprovinciapavese + |tribunatreviso + |nuovavenezia + |gazzettadimodena + |lanuovaferrara + |corrierealpi + |lasentinella + )\.gelocal + )\.it(?:/[^/]+){2,3}?/(?P\d+)(?:[/?&#]|$)''' + _TESTS = [{ + 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', + 'md5': '84658d7fb9e55a6e57ecc77b73137494', + 'info_dict': { + 'id': '121559', + 'ext': 'mp4', + 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', + 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$', + 'duration': 125, + }, + }, { + 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', + 'only_matching': True, + }, { + 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', + 'only_matching': True, + }, { + 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', + 'only_matching': True, + }, { + 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', + 'only_matching': True, + }, { + 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', + 'only_matching': True, + }, { + 'url': 'https://video.ilpiccolo.gelocal.it/dossier/big-john/dinosauro-big-john-al-via-le-visite-guidate-a-trieste/135226/135751', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimantova.gelocal.it/locale/dal-ponte-visconteo-di-valeggio-l-and-8217sos-dei-ristoratori-aprire-anche-a-cena/137310/137818', + 'only_matching': True, + }, { + 'url': 'https://video.mattinopadova.gelocal.it/dossier/coronavirus-in-veneto/covid-a-vo-un-anno-dopo-un-cuore-tricolore-per-non-dimenticare/138402/138964', + 'only_matching': True, + }, { + 'url': 'https://video.laprovinciapavese.gelocal.it/locale/mede-zona-rossa-via-alle-vaccinazioni-per-gli-over-80/137545/138120', + 'only_matching': True, + }, { + 'url': 'https://video.tribunatreviso.gelocal.it/dossier/coronavirus-in-veneto/ecco-le-prima-vaccinazioni-di-massa-nella-marca/134485/135024', + 'only_matching': True, + }, { + 'url': 'https://video.nuovavenezia.gelocal.it/locale/camion-troppo-alto-per-il-ponte-ferroviario-perde-il-carico/135734/136266', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimodena.gelocal.it/locale/modena-scoperta-la-proteina-che-predice-il-livello-di-gravita-del-covid/139109/139796', + 'only_matching': True, + }, { + 'url': 'https://video.lanuovaferrara.gelocal.it/locale/due-bombole-di-gpl-aperte-e-abbandonate-i-vigili-bruciano-il-gas/134391/134957', + 'only_matching': True, + }, { + 'url': 'https://video.corrierealpi.gelocal.it/dossier/cortina-2021-i-mondiali-di-sci-alpino/mondiali-di-sci-il-timelapse-sulla-splendida-olympia/133760/134331', + 'only_matching': True, + }, { + 'url': 'https://video.lasentinella.gelocal.it/locale/vestigne-centra-un-auto-e-si-ribalta/138931/139466', + 'only_matching': True, + }, { + 'url': 'https://video.espresso.repubblica.it/tutti-i-video/01-ted-villa/14772', + 'only_matching': True, + }] + + def _real_extract(self, url): + 
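
The GediDigital _VALID_URL above leans on (?x) verbose mode to keep a dozen regional mastheads readable in one alternation. A shortened sketch of the same technique; the domain list is truncated, and the id group name (stripped by markup filtering in this copy) is reconstructed, so treat it as an assumption:

    import re

    # (?x) lets the alternation be written one domain per line, with comments.
    VALID_URL = re.compile(r'''(?x)
        https?://video\.
        (?:
            (?:espresso\.)?repubblica
            |lastampa
            |ilsecoloxix
            |(?:iltirreno|ilpiccolo|nuovavenezia)\.gelocal  # regional *.gelocal.it titles
        )\.it
        (?:/[^/]+){2,3}?/(?P<id>\d+)(?:[/?&#]|$)''')

    m = VALID_URL.match(
        'https://video.lastampa.it/politica/il-paradosso-delle-regionali'
        '-la-lega-vince-ma-sembra-aver-perso/121559/121683')
    print(m.group('id'))  # -> 121559, the first numeric path segment
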
video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + ['twitter:title', 'og:title'], webpage, fatal=True) + player_data = re.findall( + r"PlayerFactory\.setParam\('(?Pformat|param)',\s*'(?P[^']+)',\s*'(?P[^']+)'\);", + webpage) + + formats = [] + duration = thumb = None + for t, n, v in player_data: + if t == 'format': + if n in ('video-hds-vod-ec', 'video-hls-vod-ec', 'video-viralize', 'video-youtube-pfp'): + continue + elif n.endswith('-vod-ak'): + formats.extend(self._extract_akamai_formats( + v, video_id, {'http': 'media.gedidigital.it'})) + else: + ext = determine_ext(v) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v, video_id, 'mp4', 'm3u8_native', m3u8_id=n, fatal=False)) + continue + f = { + 'format_id': n, + 'url': v, + } + if ext == 'mp3': + abr = int_or_none(self._search_regex( + r'-mp3-audio-(\d+)', v, 'abr', default=None)) + f.update({ + 'abr': abr, + 'tbr': abr, + 'vcodec': 'none' + }) + else: + mobj = re.match(r'^video-rrtv-(\d+)(?:-(\d+))?$', n) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'vbr': int_or_none(mobj.group(2)), + }) + if not f.get('vbr'): + f['vbr'] = int_or_none(self._search_regex( + r'-video-rrtv-(\d+)', v, 'abr', default=None)) + formats.append(f) + elif t == 'param': + if n in ['image_full', 'image']: + thumb = v + elif n == 'videoDuration': + duration = int_or_none(v) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta( + ['twitter:description', 'og:description', 'description'], webpage), + 'thumbnail': thumb or self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': duration, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c2b1b3bdf..f99d887ca 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2953,7 +2953,7 @@ class GenericIE(InfoExtractor): webpage) if not mobj: mobj = re.search( - r'data-video-link=["\'](?Phttp://m.mlb.com/video/[^"\']+)', + r'data-video-link=["\'](?Phttp://m\.mlb\.com/video/[^"\']+)', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'MLB') diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 1eeddc3b6..12e10143c 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -12,6 +12,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, get_element_by_attribute, int_or_none, lowercase_escape, @@ -32,6 +33,7 @@ class InstagramIE(InfoExtractor): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', @@ -48,6 +50,7 @@ class InstagramIE(InfoExtractor): 'ext': 'mp4', 'title': 'Video by britneyspears', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, 'timestamp': 1453760977, 'upload_date': '20160125', 'uploader_id': 'britneyspears', @@ -86,6 +89,24 @@ class InstagramIE(InfoExtractor): 'title': 'Post by instagram', 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', }, + }, { + # IGTV + 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', + 'info_dict': { + 'id': 'BkfuX9UB-eK', + 'ext': 'mp4', + 'title': 'Fingerboarding Tricks with @cass.fb', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 53.83, + 'timestamp': 1530032919, + 'upload_date': '20180626', + 'uploader_id': 'instagram', + 'uploader': 
'Instagram', + 'like_count': int, + 'comment_count': int, + 'comments': list, + 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', + } }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -159,7 +180,9 @@ class InstagramIE(InfoExtractor): description = try_get( media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], compat_str) or media.get('caption') + title = media.get('title') thumbnail = media.get('display_src') or media.get('display_url') + duration = float_or_none(media.get('video_duration')) timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') @@ -200,9 +223,10 @@ class InstagramIE(InfoExtractor): continue entries.append({ 'id': node.get('shortcode') or node['id'], - 'title': 'Video %d' % edge_num, + 'title': node.get('title') or 'Video %d' % edge_num, 'url': node_video_url, 'thumbnail': node.get('display_url'), + 'duration': float_or_none(node.get('video_duration')), 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), 'view_count': int_or_none(node.get('video_view_count')), @@ -239,8 +263,9 @@ class InstagramIE(InfoExtractor): 'id': video_id, 'formats': formats, 'ext': 'mp4', - 'title': 'Video by %s' % uploader_id, + 'title': title or 'Video by %s' % uploader_id, 'description': description, + 'duration': duration, 'thumbnail': thumbnail, 'timestamp': timestamp, 'uploader_id': uploader_id, diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 490efa8fb..1db7c64af 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -29,34 +29,51 @@ class JamendoIE(InfoExtractor): 'id': '196219', 'display_id': 'stories-from-emona-i', 'ext': 'flac', - 'title': 'Maya Filipič - Stories from Emona I', - 'artist': 'Maya Filipič', + # 'title': 'Maya Filipič - Stories from Emona I', + 'title': 'Stories from Emona I', + # 'artist': 'Maya Filipič', 'track': 'Stories from Emona I', 'duration': 210, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1217438117, 'upload_date': '20080730', + 'license': 'by-nc-nd', + 'view_count': int, + 'like_count': int, + 'average_rating': int, + 'tags': ['piano', 'peaceful', 'newage', 'strings', 'upbeat'], } }, { 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', 'only_matching': True, }] + def _call_api(self, resource, resource_id): + path = '/api/%ss' % resource + rand = compat_str(random.random()) + return self._download_json( + 'https://www.jamendo.com' + path, resource_id, query={ + 'id[]': resource_id, + }, headers={ + 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) + })[0] + def _real_extract(self, url): track_id, display_id = self._VALID_URL_RE.match(url).groups() - webpage = self._download_webpage( - 'https://www.jamendo.com/track/' + track_id, track_id) - models = self._parse_json(self._html_search_regex( - r"data-bundled-models='([^']+)", - webpage, 'bundled models'), track_id) - track = models['track']['models'][0] + # webpage = self._download_webpage( + # 'https://www.jamendo.com/track/' + track_id, track_id) + # models = self._parse_json(self._html_search_regex( + # r"data-bundled-models='([^']+)", + # webpage, 'bundled models'), track_id) + # track = models['track']['models'][0] + track = 
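
The Jamendo change above stops scraping the webpage (the old data-bundled-models parsing is left commented out) and talks to the JSON API instead, which expects a lightweight signature in an X-Jam-Call header: the SHA-1 of the API path concatenated with a random nonce, wrapped as $<hexdigest>*<nonce>~. A standalone sketch of building that header, with the format copied from the hunk; treat the scheme as observed behaviour rather than a stable contract:

    import hashlib
    import random

    def jam_call_headers(path):
        # path is e.g. '/api/tracks'; the nonce is echoed in the header
        # so the server can recompute and verify the digest.
        rand = str(random.random())
        digest = hashlib.sha1((path + rand).encode()).hexdigest()
        return {'X-Jam-Call': '$%s*%s~' % (digest, rand)}

    print(jam_call_headers('/api/tracks'))
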
self._call_api('track', track_id) title = track_name = track['name'] - get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} - artist = get_model('artist') - artist_name = artist.get('name') - if artist_name: - title = '%s - %s' % (artist_name, title) - album = get_model('album') + # get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} + # artist = get_model('artist') + # artist_name = artist.get('name') + # if artist_name: + # title = '%s - %s' % (artist_name, title) + # album = get_model('album') formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -74,7 +91,7 @@ class JamendoIE(InfoExtractor): urls = [] thumbnails = [] - for _, covers in track.get('cover', {}).items(): + for covers in (track.get('cover') or {}).values(): for cover_id, cover_url in covers.items(): if not cover_url or cover_url in urls: continue @@ -88,13 +105,14 @@ class JamendoIE(InfoExtractor): }) tags = [] - for tag in track.get('tags', []): + for tag in (track.get('tags') or []): tag_name = tag.get('name') if not tag_name: continue tags.append(tag_name) stats = track.get('stats') or {} + license = track.get('licenseCC') or [] return { 'id': track_id, @@ -103,11 +121,11 @@ class JamendoIE(InfoExtractor): 'title': title, 'description': track.get('description'), 'duration': int_or_none(track.get('duration')), - 'artist': artist_name, + # 'artist': artist_name, 'track': track_name, - 'album': album.get('name'), + # 'album': album.get('name'), 'formats': formats, - 'license': '-'.join(track.get('licenseCC', [])) or None, + 'license': '-'.join(license) if license else None, 'timestamp': int_or_none(track.get('dateCreated')), 'view_count': int_or_none(stats.get('listenedAll')), 'like_count': int_or_none(stats.get('favorited')), @@ -116,9 +134,9 @@ class JamendoIE(InfoExtractor): } -class JamendoAlbumIE(InfoExtractor): +class JamendoAlbumIE(JamendoIE): _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', 'info_dict': { 'id': '121486', @@ -151,17 +169,7 @@ class JamendoAlbumIE(InfoExtractor): 'params': { 'playlistend': 2 } - } - - def _call_api(self, resource, resource_id): - path = '/api/%ss' % resource - rand = compat_str(random.random()) - return self._download_json( - 'https://www.jamendo.com' + path, resource_id, query={ - 'id[]': resource_id, - }, headers={ - 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) - })[0] + }] def _real_extract(self, url): album_id = self._match_id(url) @@ -169,7 +177,7 @@ class JamendoAlbumIE(InfoExtractor): album_name = album.get('name') entries = [] - for track in album.get('tracks', []): + for track in (album.get('tracks') or []): track_id = track.get('id') if not track_id: continue diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 413215a99..ae43d56ea 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -6,8 +6,10 @@ import json from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_str, compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, @@ -60,6 +62,7 @@ class LBRYBaseIE(InfoExtractor): 'description': stream_value.get('description'), 'license': stream_value.get('license'), 'timestamp': int_or_none(stream.get('timestamp')), + 'release_timestamp': int_or_none(stream_value.get('release_time')), 'tags': stream_value.get('tags'), 'duration': 
int_or_none(media.get('duration')), 'channel': try_get(signing_channel, lambda x: x['value']['title']), @@ -92,6 +95,8 @@ class LBRYIE(LBRYBaseIE): 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', 'timestamp': 1595694354, 'upload_date': '20200725', + 'release_timestamp': 1595340697, + 'release_date': '20200721', 'width': 1280, 'height': 720, } @@ -106,6 +111,8 @@ class LBRYIE(LBRYBaseIE): 'description': 'md5:661ac4f1db09f31728931d7b88807a61', 'timestamp': 1591312601, 'upload_date': '20200604', + 'release_timestamp': 1591312421, + 'release_date': '20200604', 'tags': list, 'duration': 2570, 'channel': 'The LBRY Foundation', @@ -181,17 +188,18 @@ class LBRYChannelIE(LBRYBaseIE): }] _PAGE_SIZE = 50 - def _fetch_page(self, claim_id, url, page): + def _fetch_page(self, claim_id, url, params, page): page += 1 + page_params = { + 'channel_ids': [claim_id], + 'claim_type': 'stream', + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + } + page_params.update(params) result = self._call_api_proxy( - 'claim_search', claim_id, { - 'channel_ids': [claim_id], - 'claim_type': 'stream', - 'no_totals': True, - 'page': page, - 'page_size': self._PAGE_SIZE, - 'stream_types': self._SUPPORTED_STREAM_TYPES, - }, 'page %d' % page) + 'claim_search', claim_id, page_params, 'page %d' % page) for item in (result.get('items') or []): stream_claim_name = item.get('name') stream_claim_id = item.get('claim_id') @@ -212,8 +220,31 @@ class LBRYChannelIE(LBRYBaseIE): result = self._resolve_url( 'lbry://' + display_id, display_id, 'channel') claim_id = result['claim_id'] + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + content = qs.get('content', [None])[0] + params = { + 'fee_amount': qs.get('fee_amount', ['>=0'])[0], + 'order_by': { + 'new': ['release_time'], + 'top': ['effective_amount'], + 'trending': ['trending_group', 'trending_mixed'], + }[qs.get('order', ['new'])[0]], + 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, + } + duration = qs.get('duration', [None])[0] + if duration: + params['duration'] = { + 'long': '>=1200', + 'short': '<=240', + }[duration] + language = qs.get('language', ['all'])[0] + if language != 'all': + languages = [language] + if language == 'en': + languages.append('none') + params['any_languages'] = languages entries = OnDemandPagedList( - functools.partial(self._fetch_page, claim_id, url), + functools.partial(self._fetch_page, claim_id, url, params), self._PAGE_SIZE) result_value = result.get('value') or {} return self.playlist_result( diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py index 7f5fa446e..2526daa77 100644 --- a/youtube_dl/extractor/line.py +++ b/youtube_dl/extractor/line.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import js_to_json +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + str_or_none, +) class LineTVIE(InfoExtractor): @@ -88,3 +94,137 @@ class LineTVIE(InfoExtractor): for thumbnail in video_info.get('thumbnails', {}).get('list', [])], 'view_count': video_info.get('meta', {}).get('count'), } + + +class LineLiveBaseIE(InfoExtractor): + _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/' + + def _parse_broadcast_item(self, item): + broadcast_id = compat_str(item['id']) + title = item['title'] + is_live = item.get('isBroadcastingNow') + + thumbnails = [] + for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or 
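
The LBRYChannelIE change above exists to honour the channel page's own filter widgets: order=, duration=, language= and content= from the page URL are translated into claim_search parameters before pagination starts. A condensed sketch of that translation; the mapping tables are copied from the hunk, and 'none' is added alongside English presumably because untagged claims are mostly English:

    try:
        from urllib.parse import parse_qs, urlparse  # Python 3
    except ImportError:
        from urlparse import parse_qs, urlparse      # Python 2

    SUPPORTED_STREAM_TYPES = ['video', 'audio']

    def claim_search_params(page_url):
        qs = parse_qs(urlparse(page_url).query)
        content = qs.get('content', [None])[0]
        params = {
            'fee_amount': qs.get('fee_amount', ['>=0'])[0],
            # Unknown order values raise KeyError, as in the original.
            'order_by': {
                'new': ['release_time'],
                'top': ['effective_amount'],
                'trending': ['trending_group', 'trending_mixed'],
            }[qs.get('order', ['new'])[0]],
            'stream_types': [content] if content in SUPPORTED_STREAM_TYPES
                            else SUPPORTED_STREAM_TYPES,
        }
        duration = qs.get('duration', [None])[0]
        if duration:
            params['duration'] = {'long': '>=1200', 'short': '<=240'}[duration]
        language = qs.get('language', ['all'])[0]
        if language != 'all':
            params['any_languages'] = [language] + (['none'] if language == 'en' else [])
        return params

    # Example page URL is illustrative.
    print(claim_search_params('https://odysee.com/@channel?order=top&duration=short&language=en'))
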
{}).items(): + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + }) + + channel = item.get('channel') or {} + channel_id = str_or_none(channel.get('id')) + + return { + 'id': broadcast_id, + 'title': self._live_title(title) if is_live else title, + 'thumbnails': thumbnails, + 'timestamp': int_or_none(item.get('createdAt')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None, + 'duration': int_or_none(item.get('archiveDuration')), + 'view_count': int_or_none(item.get('viewerCount')), + 'comment_count': int_or_none(item.get('chatCount')), + 'is_live': is_live, + } + + +class LineLiveIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)/broadcast/(?P\d+)' + _TESTS = [{ + 'url': 'https://live.line.me/channels/4867368/broadcast/16331360', + 'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3', + 'info_dict': { + 'id': '16331360', + 'title': '振りコピ講座😙😙😙', + 'ext': 'mp4', + 'timestamp': 1617095132, + 'upload_date': '20210330', + 'channel': '白川ゆめか', + 'channel_id': '4867368', + 'view_count': int, + 'comment_count': int, + 'is_live': False, + } + }, { + # archiveStatus == 'DELETED' + 'url': 'https://live.line.me/channels/4778159/broadcast/16378488', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id, broadcast_id = re.match(self._VALID_URL, url).groups() + broadcast = self._download_json( + self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id), + broadcast_id) + item = broadcast['item'] + info = self._parse_broadcast_item(item) + protocol = 'm3u8' if info['is_live'] else 'm3u8_native' + formats = [] + for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items(): + if not v: + continue + if k == 'abr': + formats.extend(self._extract_m3u8_formats( + v, broadcast_id, 'mp4', protocol, + m3u8_id='hls', fatal=False)) + continue + f = { + 'ext': 'mp4', + 'format_id': 'hls-' + k, + 'protocol': protocol, + 'url': v, + } + if not k.isdigit(): + f['vcodec'] = 'none' + formats.append(f) + if not formats: + archive_status = item.get('archiveStatus') + if archive_status != 'ARCHIVED': + raise ExtractorError('this video has been ' + archive_status.lower(), expected=True) + self._sort_formats(formats) + info['formats'] = formats + return info + + +class LineLiveChannelIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)(?!/broadcast/\d+)(?:[/?&#]|$)' + _TEST = { + 'url': 'https://live.line.me/channels/5893542', + 'info_dict': { + 'id': '5893542', + 'title': 'いくらちゃん', + 'description': 'md5:c3a4af801f43b2fac0b02294976580be', + }, + 'playlist_mincount': 29 + } + + def _archived_broadcasts_entries(self, archived_broadcasts, channel_id): + while True: + for row in (archived_broadcasts.get('rows') or []): + share_url = str_or_none(row.get('shareURL')) + if not share_url: + continue + info = self._parse_broadcast_item(row) + info.update({ + '_type': 'url', + 'url': share_url, + 'ie_key': LineLiveIE.ie_key(), + }) + yield info + if not archived_broadcasts.get('hasNextPage'): + return + archived_broadcasts = self._download_json( + self._API_BASE_URL + channel_id + '/archived_broadcasts', + channel_id, query={ + 'lastId': info['id'], + }) + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel = self._download_json(self._API_BASE_URL + channel_id, channel_id) + return self.playlist_result( + 
self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id), + channel_id, channel.get('title'), channel.get('information')) diff --git a/youtube_dl/extractor/maoritv.py b/youtube_dl/extractor/maoritv.py new file mode 100644 index 000000000..0d23fec75 --- /dev/null +++ b/youtube_dl/extractor/maoritv.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MaoriTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?maoritelevision\.com/shows/(?:[^/]+/)+(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.maoritelevision.com/shows/korero-mai/S01E054/korero-mai-series-1-episode-54', + 'md5': '5ade8ef53851b6a132c051b1cd858899', + 'info_dict': { + 'id': '4774724855001', + 'ext': 'mp4', + 'title': 'Kōrero Mai, Series 1 Episode 54', + 'upload_date': '20160226', + 'timestamp': 1456455018, + 'description': 'md5:59bde32fd066d637a1a55794c56d8dcb', + 'uploader_id': '1614493167001', + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1614493167001/HJlhIQhQf_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_id = self._search_regex( + r'data-main-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index b907f6b49..b69301d97 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -1,15 +1,91 @@ from __future__ import unicode_literals -from .nhl import NHLBaseIE +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + try_get, +) -class MLBIE(NHLBaseIE): +class MLBBaseIE(InfoExtractor): + def _real_extract(self, url): + display_id = self._match_id(url) + video = self._download_video_data(display_id) + video_id = video['id'] + title = video['title'] + feed = self._get_feed(video) + + formats = [] + for playback in (feed.get('playbacks') or []): + playback_url = playback.get('url') + if not playback_url: + continue + name = playback.get('name') + ext = determine_ext(playback_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + 'm3u8_native', m3u8_id=name, fatal=False)) + else: + f = { + 'format_id': name, + 'url': playback_url, + } + mobj = re.search(r'_(\d+)K_(\d+)X(\d+)', name) + if mobj: + f.update({ + 'height': int(mobj.group(3)), + 'tbr': int(mobj.group(1)), + 'width': int(mobj.group(2)), + }) + mobj = re.search(r'_(\d+)x(\d+)_(\d+)_(\d+)K\.mp4', playback_url) + if mobj: + f.update({ + 'fps': int(mobj.group(3)), + 'height': int(mobj.group(2)), + 'tbr': int(mobj.group(4)), + 'width': int(mobj.group(1)), + }) + formats.append(f) + self._sort_formats(formats) + + thumbnails = [] + for cut in (try_get(feed, lambda x: x['image']['cuts'], list) or []): + src = cut.get('src') + if not src: + continue + thumbnails.append({ + 'height': int_or_none(cut.get('height')), + 'url': src, + 'width': int_or_none(cut.get('width')), + }) + + language = (video.get('language') or 'EN').lower() + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': video.get('description'), + 'duration': parse_duration(feed.get('duration')), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(video.get(self._TIMESTAMP_KEY)), + 'subtitles': 
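
The new MLBBaseIE above recovers bitrate, dimensions and frame rate from the feed's playback names and URLs with two regexes. A sketch of just those two patterns in isolation; the sample name and URL below are invented to show the expected shapes, roughly NAME_1200K_640X360 and ..._640x360_29_1200K.mp4:

    import re

    def parse_playback(name, url):
        f = {'format_id': name, 'url': url}
        mobj = re.search(r'_(\d+)K_(\d+)X(\d+)', name)  # _<tbr>K_<width>X<height>
        if mobj:
            f.update({'tbr': int(mobj.group(1)),
                      'width': int(mobj.group(2)),
                      'height': int(mobj.group(3))})
        # _<width>x<height>_<fps>_<tbr>K.mp4
        mobj = re.search(r'_(\d+)x(\d+)_(\d+)_(\d+)K\.mp4', url)
        if mobj:
            f.update({'width': int(mobj.group(1)),
                      'height': int(mobj.group(2)),
                      'fps': int(mobj.group(3)),
                      'tbr': int(mobj.group(4))})
        return f

    print(parse_playback(
        'FLASH_1200K_640X360',
        'https://cuts.example.com/some/path/clip_640x360_29_1200K.mp4'))
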
self._extract_mlb_subtitles(feed, language), + } + + +class MLBIE(MLBBaseIE): _VALID_URL = r'''(?x) https?:// - (?:[\da-z_-]+\.)*(?Pmlb)\.com/ + (?:[\da-z_-]+\.)*mlb\.com/ (?: (?: - (?:[^/]+/)*c-| + (?:[^/]+/)*video/[^/]+/c-| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| @@ -18,7 +94,6 @@ class MLBIE(NHLBaseIE): (?P\d+) ) ''' - _CONTENT_DOMAIN = 'content.mlb.com' _TESTS = [ { 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', @@ -76,18 +151,6 @@ class MLBIE(NHLBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', }, }, - { - 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098', - 'md5': 'e09e37b552351fddbf4d9e699c924d68', - 'info_dict': { - 'id': '75609783', - 'ext': 'mp4', - 'title': 'Must C: Pillar climbs for catch', - 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429139220, - 'upload_date': '20150415', - } - }, { 'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694', 'only_matching': True, @@ -113,8 +176,92 @@ class MLBIE(NHLBaseIE): 'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb', 'only_matching': True, }, - { - 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842', - 'only_matching': True, - } ] + _TIMESTAMP_KEY = 'date' + + @staticmethod + def _get_feed(video): + return video + + @staticmethod + def _extract_mlb_subtitles(feed, language): + subtitles = {} + for keyword in (feed.get('keywordsAll') or []): + keyword_type = keyword.get('type') + if keyword_type and keyword_type.startswith('closed_captions_location_'): + cc_location = keyword.get('value') + if cc_location: + subtitles.setdefault(language, []).append({ + 'url': cc_location, + }) + return subtitles + + def _download_video_data(self, display_id): + return self._download_json( + 'http://content.mlb.com/mlb/item/id/v1/%s/details/web-v1.json' % display_id, + display_id) + + +class MLBVideoIE(MLBBaseIE): + _VALID_URL = r'https?://(?:www\.)?mlb\.com/(?:[^/]+/)*video/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.mlb.com/mariners/video/ackley-s-spectacular-catch-c34698933', + 'md5': '632358dacfceec06bad823b83d21df2d', + 'info_dict': { + 'id': 'c04a8863-f569-42e6-9f87-992393657614', + 'ext': 'mp4', + 'title': "Ackley's spectacular catch", + 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', + 'duration': 66, + 'timestamp': 1405995000, + 'upload_date': '20140722', + 'thumbnail': r're:^https?://.+', + }, + } + _TIMESTAMP_KEY = 'timestamp' + + @classmethod + def suitable(cls, url): + return False if MLBIE.suitable(url) else super(MLBVideoIE, cls).suitable(url) + + @staticmethod + def _get_feed(video): + return video['feeds'][0] + + @staticmethod + def _extract_mlb_subtitles(feed, language): + subtitles = {} + for cc_location in (feed.get('closedCaptions') or []): + subtitles.setdefault(language, []).append({ + 'url': cc_location, + }) + + def _download_video_data(self, display_id): + # https://www.mlb.com/data-service/en/videos/[SLUG] + return self._download_json( + 'https://fastball-gateway.mlb.com/graphql', + display_id, query={ + 'query': '''{ + mediaPlayback(ids: "%s") { + description + feeds(types: CMS) { + closedCaptions + duration + image 
{ + cuts { + width + height + src + } + } + playbacks { + name + url + } + } + id + timestamp + title + } +}''' % display_id, + })['data']['mediaPlayback'][0] diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f5e30d22d..5a5205c0e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -255,7 +255,9 @@ class MTVServicesInfoExtractor(InfoExtractor): @staticmethod def _extract_child_with_type(parent, t): - return next(c for c in parent['children'] if c.get('type') == t) + for c in parent['children']: + if c.get('type') == t: + return c def _extract_mgid(self, webpage): try: @@ -286,7 +288,8 @@ class MTVServicesInfoExtractor(InfoExtractor): data = self._parse_json(self._search_regex( r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) main_container = self._extract_child_with_type(data, 'MainContainer') - video_player = self._extract_child_with_type(main_container, 'VideoPlayer') + ab_testing = self._extract_child_with_type(main_container, 'ABTesting') + video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') mgid = video_player['props']['media']['video']['config']['uri'] return mgid @@ -320,7 +323,7 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage) + r']+?src=(["\'])(?P(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage) if mobj: return mobj.group('url') diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index a569c889e..cfc220314 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -23,11 +23,9 @@ class NineCNineMediaIE(InfoExtractor): destination_code, content_id = re.match(self._VALID_URL, url).groups() api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id) content = self._download_json(api_base_url, content_id, query={ - '$include': '[Media,Season,ContentPackages]', + '$include': '[Media.Name,Season,ContentPackages.Duration,ContentPackages.Id]', }) title = content['Name'] - if len(content['ContentPackages']) > 1: - raise ExtractorError('multiple content packages') content_package = content['ContentPackages'][0] package_id = content_package['Id'] content_package_url = api_base_url + 'contentpackages/%s/' % package_id diff --git a/youtube_dl/extractor/palcomp3.py b/youtube_dl/extractor/palcomp3.py new file mode 100644 index 000000000..fb29d83f9 --- /dev/null +++ b/youtube_dl/extractor/palcomp3.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, +) + + +class PalcoMP3BaseIE(InfoExtractor): + _GQL_QUERY_TMPL = '''{ + artist(slug: "%s") { + %s + } +}''' + _ARTIST_FIELDS_TMPL = '''music(slug: "%%s") { + %s + }''' + _MUSIC_FIELDS = '''duration + hls + mp3File + musicID + plays + title''' + + def _call_api(self, artist_slug, artist_fields): + return self._download_json( + 'https://www.palcomp3.com.br/graphql/', artist_slug, query={ + 'query': self._GQL_QUERY_TMPL % (artist_slug, artist_fields), + })['data'] + + def _parse_music(self, music): + music_id = compat_str(music['musicID']) + title = music['title'] + + formats = [] + hls_url = music.get('hls') + if hls_url: + formats.append({ + 'url': hls_url, + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + mp3_file = music.get('mp3File') 
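
Both the MLB and PalcoMP3 rewrites in this diff move to GraphQL endpoints that are queried with a single template string and unwrapped via ['data']. A self-contained sketch of that request shape using only the stdlib; the endpoint and field names are taken from the PalcoMP3 hunk, and since the endpoint's availability is an assumption the network call is left commented out:

    import json
    try:
        from urllib.request import urlopen   # Python 3
        from urllib.parse import urlencode
    except ImportError:                       # Python 2
        from urllib2 import urlopen
        from urllib import urlencode

    GQL_QUERY = '''{
      artist(slug: "%s") {
        music(slug: "%s") {
          musicID
          title
          duration
          mp3File
        }
      }
    }'''

    def music_query_url(artist_slug, music_slug):
        # The whole query travels in the query string, as in the hunk above.
        return 'https://www.palcomp3.com.br/graphql/?' + urlencode(
            {'query': GQL_QUERY % (artist_slug, music_slug)})

    url = music_query_url('maiaraemaraisaoficial', 'nossas-composicoes-cuida-bem-dela')
    print(url)
    # data = json.loads(urlopen(url).read().decode('utf-8'))
    # music = data['data']['artist']['music']
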
+ if mp3_file: + formats.append({ + 'url': mp3_file, + }) + + return { + 'id': music_id, + 'title': title, + 'formats': formats, + 'duration': int_or_none(music.get('duration')), + 'view_count': int_or_none(music.get('plays')), + } + + def _real_initialize(self): + self._ARTIST_FIELDS_TMPL = self._ARTIST_FIELDS_TMPL % self._MUSIC_FIELDS + + def _real_extract(self, url): + artist_slug, music_slug = re.match(self._VALID_URL, url).groups() + artist_fields = self._ARTIST_FIELDS_TMPL % music_slug + music = self._call_api(artist_slug, artist_fields)['artist']['music'] + return self._parse_music(music) + + +class PalcoMP3IE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:song' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/]+)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/nossas-composicoes-cuida-bem-dela/', + 'md5': '99fd6405b2d8fd589670f6db1ba3b358', + 'info_dict': { + 'id': '3162927', + 'ext': 'mp3', + 'title': 'Nossas Composições - CUIDA BEM DELA', + 'duration': 210, + 'view_count': int, + } + }] + + @classmethod + def suitable(cls, url): + return False if PalcoMP3VideoIE.suitable(url) else super(PalcoMP3IE, cls).suitable(url) + + +class PalcoMP3ArtistIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:artist' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com.br/condedoforro/', + 'info_dict': { + 'id': '358396', + 'title': 'Conde do Forró', + }, + 'playlist_mincount': 188, + }] + _ARTIST_FIELDS_TMPL = '''artistID + musics { + nodes { + %s + } + } + name''' + + @ classmethod + def suitable(cls, url): + return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url) + + def _real_extract(self, url): + artist_slug = self._match_id(url) + artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist'] + + def entries(): + for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []): + yield self._parse_music(music) + + return self.playlist_result( + entries(), str_or_none(artist.get('artistID')), artist.get('name')) + + +class PalcoMP3VideoIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:video' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/]+)/(?P[^/?&#]+)/?#clipe' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/maiara-e-maraisa-voce-faz-falta-aqui-ao-vivo-em-vicosa-mg/#clipe', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': '_pD1nR2qqPg', + 'ext': 'mp4', + 'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande', + 'description': 'md5:7043342c09a224598e93546e98e49282', + 'upload_date': '20161107', + 'uploader_id': 'maiaramaraisaoficial', + 'uploader': 'Maiara e Maraisa', + } + }] + _MUSIC_FIELDS = 'youtubeID' + + def _parse_music(self, music): + youtube_id = music['youtubeID'] + return self.url_result(youtube_id, 'Youtube', youtube_id) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index c2ca71c71..d9b13adc2 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -413,7 +413,8 @@ class PeerTubeIE(InfoExtractor): peertube3\.cpy\.re| peertube2\.cpy\.re| videos\.tcit\.fr| - peertube\.cpy\.re + peertube\.cpy\.re| + canard\.tube )''' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _API_BASE = 'https://%s/api/v1/videos/%s/%s' @@ -598,11 +599,13 @@ class PeerTubeIE(InfoExtractor): else: age_limit = None + webpage_url = 'https://%s/videos/watch/%s' % (host, video_id) + return 
{ 'id': video_id, 'title': title, 'description': description, - 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), 'uploader': account_data('displayName', compat_str), 'uploader_id': str_or_none(account_data('id', int)), @@ -620,5 +623,6 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, - 'subtitles': subtitles + 'subtitles': subtitles, + 'webpage_url': webpage_url, } diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index e435c28e1..dbbfce983 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,45 +1,128 @@ +# coding: utf-8 from __future__ import unicode_literals -from .dreisat import DreiSatIE +import re + +from .youtube import YoutubeIE +from .zdf import ZDFBaseIE +from ..compat import compat_str +from ..utils import ( + int_or_none, + merge_dicts, + unified_timestamp, + xpath_text, +) -class PhoenixIE(DreiSatIE): +class PhoenixIE(ZDFBaseIE): IE_NAME = 'phoenix.de' - _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ - (?: - phoenix/die_sendungen/(?:[^/]+/)? - )? - (?P[0-9]+)''' - _TESTS = [ - { - 'url': 'http://www.phoenix.de/content/884301', - 'md5': 'ed249f045256150c92e72dbb70eadec6', - 'info_dict': { - 'id': '884301', - 'ext': 'mp4', - 'title': 'Michael Krons mit Hans-Werner Sinn', - 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', - 'upload_date': '20141025', - 'uploader': 'Im Dialog', - } + _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P\d+)\.html' + _TESTS = [{ + # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html + 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613906100, + 'upload_date': '20210221', + 'uploader': 'Phoenix', + 'channel': 'corona nachgehakt', }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', - 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', + 'info_dict': { + 'id': 'hMQtqFYjomk', + 'ext': 'mp4', + 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', + 'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', + 'duration': 3509, + 'upload_date': '20201219', + 'uploader': 'phoenix', + 'uploader_id': 'phoenix', }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', - 'only_matching': True, + 'params': { + 'skip_download': True, }, - ] + }, { + 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', + 'only_matching': True, + }, { + # no media + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/mit-dem-jumbo-durch-die-nacht-a-89625.html', + 'only_matching': True, + }, { + # Same as https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche', + 'only_matching': True, + }] def _real_extract(self, url): - 
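
The PeerTube tweak above matters for embedded players: thumbnailPath comes back host-relative, so it is now resolved against the freshly built canonical watch URL rather than whatever URL triggered extraction, and that watch URL is also reported as webpage_url. A tiny sketch; the instance host and thumbnail path are illustrative:

    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin      # Python 2

    host, video_id = 'peertube2.cpy.re', '9c9de5e8-0a1e-484a-b099-e80766180a6d'
    webpage_url = 'https://%s/videos/watch/%s' % (host, video_id)

    # Joining against the canonical watch URL keeps the thumbnail on the
    # instance that owns the video, whatever page extraction started from.
    print(urljoin(webpage_url, '/static/thumbnails/%s.jpg' % video_id))
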
video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + article_id = self._match_id(url) - internal_id = self._search_regex( - r'
[a-zA-Z0-9]+)(?:/(?P[a-zA-Z0-9]+))?' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -34,65 +27,46 @@ class PicartoIE(InfoExtractor): return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') + channel_id = self._match_id(url) - metadata = self._download_json( - 'https://api.picarto.tv/v1/channel/name/' + channel_id, - channel_id) + data = self._download_json( + 'https://ptvintern.picarto.tv/ptvapi', channel_id, query={ + 'query': '''{ + channel(name: "%s") { + adult + id + online + stream_name + title + } + getLoadBalancerUrl(channel_name: "%s") { + url + } +}''' % (channel_id, channel_id), + })['data'] + metadata = data['channel'] - if metadata.get('online') is False: + if metadata.get('online') == 0: raise ExtractorError('Stream is offline', expected=True) + title = metadata['title'] cdn_data = self._download_json( - 'https://picarto.tv/process/channel', channel_id, - data=urlencode_postdata({'loadbalancinginfo': channel_id}), - note='Downloading load balancing info') + data['getLoadBalancerUrl']['url'] + '/stream/json_' + metadata['stream_name'] + '.js', + channel_id, 'Downloading load balancing info') - token = mobj.group('token') or 'public' - params = { - 'con': int(time.time() * 1000), - 'token': token, - } - - prefered_edge = cdn_data.get('preferedEdge') formats = [] - - for edge in cdn_data['edges']: - edge_ep = edge.get('ep') - if not edge_ep or not isinstance(edge_ep, compat_str): + for source in (cdn_data.get('source') or []): + source_url = source.get('url') + if not source_url: continue - edge_id = edge.get('id') - for tech in cdn_data['techs']: - tech_label = tech.get('label') - tech_type = tech.get('type') - preference = 0 - if edge_id == prefered_edge: - preference += 1 - format_id = [] - if edge_id: - format_id.append(edge_id) - if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': - format_id.append('hls') - formats.extend(self._extract_m3u8_formats( - update_url_query( - 'https://%s/hls/%s/index.m3u8' - % (edge_ep, channel_id), params), - channel_id, 'mp4', preference=preference, - m3u8_id='-'.join(format_id), fatal=False)) - continue - elif tech_type == 'video/mp4' or tech_label == 'MP4': - format_id.append('mp4') - formats.append({ - 'url': update_url_query( - 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), - params), - 'format_id': '-'.join(format_id), - 'preference': preference, - }) - else: - # rtmp format does not seem to work - continue + source_type = source.get('type') + if source_type == 'html5/application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + source_url, channel_id, 'mp4', m3u8_id='hls', fatal=False)) + elif source_type == 'html5/video/mp4': + formats.append({ + 'url': source_url, + }) self._sort_formats(formats) mature = metadata.get('adult') @@ -103,10 +77,10 @@ class PicartoIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(metadata.get('title') or channel_id), + 'title': self._live_title(title.strip()), 'is_live': True, - 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']), 'channel': channel_id, + 'channel_id': metadata.get('id'), 'channel_url': 'https://picarto.tv/%s' % channel_id, 'age_limit': age_limit, 'formats': formats, diff --git a/youtube_dl/extractor/pinterest.py b/youtube_dl/extractor/pinterest.py index b249c9eda..42528d746 100644 --- 
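
The Picarto rewrite above swaps the retired v1 REST API for the ptvintern GraphQL endpoint, fetching the channel record and a load-balancer URL in one round trip; the stream manifest is then looked up as /stream/json_<stream_name>.js under the returned balancer URL. A sketch of just the request construction, with field names from the hunk; endpoint availability is an assumption, so no request is actually sent:

    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode        # Python 2

    def picarto_channel_query_url(channel_id):
        query = '''{
      channel(name: "%s") { adult id online stream_name title }
      getLoadBalancerUrl(channel_name: "%s") { url }
    }''' % (channel_id, channel_id)
        return 'https://ptvintern.picarto.tv/ptvapi?' + urlencode({'query': query})

    print(picarto_channel_query_url('Setz'))
    # manifest then lives at: <getLoadBalancerUrl.url>/stream/json_<stream_name>.js
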
a/youtube_dl/extractor/pinterest.py +++ b/youtube_dl/extractor/pinterest.py @@ -31,6 +31,7 @@ class PinterestBaseIE(InfoExtractor): title = (data.get('title') or data.get('grid_title') or video_id).strip() + urls = [] formats = [] duration = None if extract_formats: @@ -38,8 +39,9 @@ class PinterestBaseIE(InfoExtractor): if not isinstance(format_dict, dict): continue format_url = url_or_none(format_dict.get('url')) - if not format_url: + if not format_url or format_url in urls: continue + urls.append(format_url) duration = float_or_none(format_dict.get('duration'), scale=1000) ext = determine_ext(format_url) if 'hls' in format_id.lower() or ext == 'm3u8': diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b7631e4e1..031454600 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -167,6 +167,7 @@ class PornHubIE(PornHubBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy', }, { # subtitles 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', @@ -265,7 +266,8 @@ class PornHubIE(PornHubBaseIE): webpage = dl_webpage('pc') error_msg = self._html_search_regex( - r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)
', + (r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + r'(?s)]+class=["\']noVideo["\'][^>]*>(?P.+?)'), webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) @@ -394,34 +396,50 @@ class PornHubIE(PornHubBaseIE): upload_date = None formats = [] + + def add_format(format_url, height=None): + ext = determine_ext(format_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + return + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + return + tbr = None + mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', format_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + for video_url, height in video_urls: if not upload_date: upload_date = self._search_regex( r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) if upload_date: upload_date = upload_date.replace('/', '') - ext = determine_ext(video_url) - if ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id='dash', fatal=False)) + if '/video/get_media' in video_url: + medias = self._download_json(video_url, video_id, fatal=False) + if isinstance(medias, list): + for media in medias: + if not isinstance(media, dict): + continue + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + height = int_or_none(media.get('quality')) + add_format(video_url, height) continue - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - tbr = None - mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', video_url) - if mobj: - if not height: - height = int(mobj.group('height')) - tbr = int(mobj.group('tbr')) - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height if height else None, - 'height': height, - 'tbr': tbr, - }) + add_format(video_url) self._sort_formats(formats) video_uploader = self._html_search_regex( diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index 8c016a77d..0c497856e 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -15,17 +15,17 @@ class RDSIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P[^/]+)-\d+\.\d+' _TESTS = [{ - 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', + # has two 9c9media ContentPackages, the web player selects the first ContentPackage + 'url': 'https://www.rds.ca/videos/Hockey/NationalHockeyLeague/teams/9/forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande-3.1377606', 'info_dict': { - 'id': '604333', - 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', + 'id': '2083309', + 'display_id': 'forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande', 'ext': 'flv', - 'title': 'Fowler Jr. prend la direction de Jacksonville', - 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. 
', - 'timestamp': 1430397346, - 'upload_date': '20150430', - 'duration': 154.354, - 'age_limit': 0, + 'title': 'Forum du 5 à 7 : Kotkaniemi de retour de Finlande', + 'description': 'md5:83fa38ecc4a79b19e433433254077f25', + 'timestamp': 1606129030, + 'upload_date': '20201123', + 'duration': 773.039, } }, { 'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934', diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index 48f17b828..aed35f8a9 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -6,11 +6,12 @@ import re from .srgssr import SRGSSRIE from ..compat import compat_str from ..utils import ( + determine_ext, int_or_none, parse_duration, parse_iso8601, unescapeHTML, - determine_ext, + urljoin, ) @@ -21,7 +22,7 @@ class RTSIE(SRGSSRIE): _TESTS = [ { 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', - 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', + 'md5': '753b877968ad8afaeddccc374d4256a5', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', @@ -35,6 +36,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', @@ -63,11 +65,12 @@ class RTSIE(SRGSSRIE): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', - 'md5': '1bae984fe7b1f78e94abc74e802ed99f', + 'md5': '9bb06503773c07ce83d3cbd793cebb91', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', @@ -81,6 +84,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', @@ -160,7 +164,7 @@ class RTSIE(SRGSSRIE): media_type = 'video' if 'video' in all_info else 'audio' # check for errors - self.get_media_data('rts', media_type, media_id) + self._get_media_data('rts', media_type, media_id) info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] @@ -194,6 +198,7 @@ class RTSIE(SRGSSRIE): 'tbr': extract_bitrate(format_url), }) + download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '') for media in info.get('media', []): media_url = media.get('url') if not media_url or re.match(r'https?://', media_url): @@ -205,7 +210,7 @@ class RTSIE(SRGSSRIE): format_id += '-%dk' % rate formats.append({ 'format_id': format_id, - 'url': 'http://download-video.rts.ch/' + media_url, + 'url': urljoin(download_base, media_url), 'tbr': rate or extract_bitrate(media_url), }) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index ce9db0629..d2fb754cf 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -2,8 +2,9 @@ from __future__ import unicode_literals import base64 +import io import re -import time +import sys from .common import InfoExtractor from ..compat import ( @@ -14,56 +15,13 @@ from ..utils import ( determine_ext, ExtractorError, float_or_none, + qualities, remove_end, remove_start, - 
sanitized_Request, std_headers, ) - -def _decrypt_url(png): - encrypted_data = compat_b64decode(png) - text_index = encrypted_data.find(b'tEXt') - text_chunk = encrypted_data[text_index - 4:] - length = compat_struct_unpack('!I', text_chunk[:4])[0] - # Use bytearray to get integers when iterating in both python 2.x and 3.x - data = bytearray(text_chunk[8:8 + length]) - data = [chr(b) for b in data if b != 0] - hash_index = data.index('#') - alphabet_data = data[:hash_index] - url_data = data[hash_index + 1:] - if url_data[0] == 'H' and url_data[3] == '%': - # remove useless HQ%% at the start - url_data = url_data[4:] - - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data: - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data: - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - return url +_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) class RTVEALaCartaIE(InfoExtractor): @@ -79,28 +37,31 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', 'duration': 5024.566, + 'series': 'Balonmano', }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', 'info_dict': { 'id': '1694255', - 'ext': 'flv', - 'title': 'TODO', + 'ext': 'mp4', + 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': 'live stream', }, - 'skip': 'The f4m manifest can\'t be used yet', }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'e55e162379ad587e9640eda4f7353c0f', + 'md5': 'd850f3c8731ea53952ebab489cf81cbf', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104 ', + 'title': 'Servir y proteger - Capítulo 104', 'duration': 3222.0, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, @@ -111,58 +72,102 @@ class RTVEALaCartaIE(InfoExtractor): def _real_initialize(self): user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') - manager_info = self._download_json( + self._manager = self._download_json( 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info') - self._manager = manager_info['manager'] + None, 'Fetching manager info')['manager'] + + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) + while True: + length = compat_struct_unpack('!I', encrypted_data.read(4))[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + alphabet_data, text = data.split(b'\0') + quality, url_data = text.split(b'%%') + alphabet = [] + e = 0 + d = 0 + for l in _bytes_to_chr(alphabet_data): + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in _bytes_to_chr(url_data): + if f == 0: + l = 
int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + yield quality.decode(), url + encrypted_data.read(4) # CRC + + def _extract_png_formats(self, video_id): + png = self._download_webpage( + 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), + video_id, 'Downloading url information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + self._sort_formats(formats) + return formats def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) info = self._download_json( 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, video_id)['page']['items'][0] if info['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'] - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) - png_request = sanitized_Request(png_url) - png_request.add_header('Referer', url) - png = self._download_webpage(png_request, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - ext = determine_ext(video_url) - - formats = [] - if not video_url.endswith('.f4m') and ext != 'm3u8': - if '?' 
not in video_url:
-                video_url = video_url.replace('resources/', 'auth/resources/')
-            video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve')
-
-        if ext == 'm3u8':
-            formats.extend(self._extract_m3u8_formats(
-                video_url, video_id, ext='mp4', entry_protocol='m3u8_native',
-                m3u8_id='hls', fatal=False))
-        elif ext == 'f4m':
-            formats.extend(self._extract_f4m_formats(
-                video_url, video_id, f4m_id='hds', fatal=False))
-        else:
-            formats.append({
-                'url': video_url,
-            })
-        self._sort_formats(formats)
+        title = info['title'].strip()
+        formats = self._extract_png_formats(video_id)

         subtitles = None
-        if info.get('sbtFile') is not None:
-            subtitles = self.extract_subtitles(video_id, info['sbtFile'])
+        sbt_file = info.get('sbtFile')
+        if sbt_file:
+            subtitles = self.extract_subtitles(video_id, sbt_file)
+
+        is_live = info.get('live') is True

         return {
             'id': video_id,
-            'title': title,
+            'title': self._live_title(title) if is_live else title,
             'formats': formats,
             'thumbnail': info.get('image'),
-            'page_url': url,
             'subtitles': subtitles,
-            'duration': float_or_none(info.get('duration'), scale=1000),
+            'duration': float_or_none(info.get('duration'), 1000),
+            'is_live': is_live,
+            'series': info.get('programTitle'),
         }

     def _get_subtitles(self, video_id, sub_file):
@@ -174,48 +179,26 @@ class RTVEALaCartaIE(InfoExtractor):
             for s in subs)


-class RTVEInfantilIE(InfoExtractor):
+class RTVEInfantilIE(RTVEALaCartaIE):
     IE_NAME = 'rtve.es:infantil'
     IE_DESC = 'RTVE infantil'
-    _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P<show>[^/]*)/video/(?P<short_title>[^/]*)/(?P<id>[0-9]+)/'
+    _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/'

     _TESTS = [{
         'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/',
-        'md5': '915319587b33720b8e0357caaa6617e6',
+        'md5': '5747454717aedf9f9fdf212d1bcfc48d',
         'info_dict': {
             'id': '3040283',
             'ext': 'mp4',
             'title': 'Maneras de vivir',
-            'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG',
+            'thumbnail': r're:https?://.+/1426182947956\.JPG',
             'duration': 357.958,
         },
+        'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
     }]

-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        info = self._download_json(
-            'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
-            video_id)['page']['items'][0]
-        webpage = self._download_webpage(url, video_id)
-        vidplayer_id = self._search_regex(
-            r' id="vidplayer([0-9]+)"', webpage, 'internal video ID')
-
-        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id
-        png = self._download_webpage(png_url, video_id, 'Downloading url information')
-        video_url = _decrypt_url(png)
-
-        return {
-            'id': video_id,
-            'ext': 'mp4',
-            'title': info['title'],
-            'url': video_url,
-            'thumbnail': info.get('image'),
-            'duration': float_or_none(info.get('duration'), scale=1000),
-        }
-
-
-class RTVELiveIE(InfoExtractor):
+class RTVELiveIE(RTVEALaCartaIE):
     IE_NAME = 'rtve.es:live'
     IE_DESC = 'RTVE.es live streams'
     _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
@@ -225,7 +208,7 @@
         'info_dict': {
             'id': 'la-1',
             'ext': 'mp4',
-            'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
+            'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
         },
         'params': {
             'skip_download': 'live stream',
         },
     }
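
# A minimal, self-contained sketch (stdlib only) of the PNG chunk walk the new
# _decrypt_url() above is built on: after the 8-byte PNG signature, every chunk
# is a big-endian length, a 4-byte type, the payload, then a 4-byte CRC, and
# the scrambled URLs live in the tEXt chunks ('keyword\0text', the keyword
# carrying the scrambling alphabet). `png_bytes` is a hypothetical input used
# purely for illustration.
import io
import struct

def iter_png_chunks(png_bytes):
    stream = io.BytesIO(png_bytes[8:])  # skip the PNG signature
    while True:
        length = struct.unpack('!I', stream.read(4))[0]
        chunk_type = stream.read(4)
        data = stream.read(length)
        stream.read(4)  # CRC, not verified here
        yield chunk_type, data
        if chunk_type == b'IEND':
            break
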
@@ -234,29 +217,22 @@
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        start_time = time.gmtime()
         video_id = mobj.group('id')

         webpage = self._download_webpage(url, video_id)
         title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
         title = remove_start(title, 'Estoy viendo ')
-        title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time)

         vidplayer_id = self._search_regex(
             (r'playerId=player([0-9]+)',
              r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)',
              r'data-id=["\'](\d+)'), webpage, 'internal video ID')
-        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id
-        png = self._download_webpage(png_url, video_id, 'Downloading url information')
-        m3u8_url = _decrypt_url(png)
-        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
-        self._sort_formats(formats)

         return {
             'id': video_id,
-            'title': title,
-            'formats': formats,
+            'title': self._live_title(title),
+            'formats': self._extract_png_formats(vidplayer_id),
             'is_live': True,
         }
diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py
index f722528cd..0a806ee4e 100644
--- a/youtube_dl/extractor/sbs.py
+++ b/youtube_dl/extractor/sbs.py
@@ -10,7 +10,7 @@ from ..utils import (

 class SBSIE(InfoExtractor):
     IE_DESC = 'sbs.com.au'
-    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P<id>[0-9]+)'

     _TESTS = [{
         # Original URL is handled by the generic IE which finds the iframe:
@@ -43,6 +43,9 @@
     }, {
         'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866',
         'only_matching': True,
+    }, {
+        'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py
index b5e76c9af..0afdc1715 100644
--- a/youtube_dl/extractor/screencastomatic.py
+++ b/youtube_dl/extractor/screencastomatic.py
@@ -2,12 +2,18 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+    get_element_by_class,
+    int_or_none,
+    remove_start,
+    strip_or_none,
+    unified_strdate,
+)


 class ScreencastOMaticIE(InfoExtractor):
-    _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)'
-    _TEST = {
+    _VALID_URL = r'https?://screencast-o-matic\.com/(?:(?:watch|player)/|embed\?.*?\bsc=)(?P<id>[0-9a-zA-Z]+)'
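
# A quick self-contained check of the widened _VALID_URL above -- it now
# accepts watch/, player/ and embed?...sc= URLs; the sample URLs are the ones
# from the tests in this patch.
import re

_VALID_URL = r'https?://screencast-o-matic\.com/(?:(?:watch|player)/|embed\?.*?\bsc=)(?P<id>[0-9a-zA-Z]+)'
for sample_url in (
        'http://screencast-o-matic.com/watch/c2lD3BeOPl',
        'http://screencast-o-matic.com/player/c2lD3BeOPl',
        'http://screencast-o-matic.com/embed?ff=true&sc=cbV2r4Q5TL&fromPH=true&a=1'):
    print(re.match(_VALID_URL, sample_url).group('id'))
# -> c2lD3BeOPl, c2lD3BeOPl, cbV2r4Q5TL
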
+    _TESTS = [{
         'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl',
         'md5': '483583cb80d92588f15ccbedd90f0c18',
         'info_dict': {
             'id': 'c2lD3BeOPl',
             'ext': 'mp4',
             'title': 'Welcome to 3-4 Philosophy @ DECV!',
             'thumbnail': r're:^https?://.*\.jpg$',
             'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.',
-            'duration': 369.163,
+            'duration': 369,
+            'upload_date': '20141216',
         }
-    }
+    }, {
+        'url': 'http://screencast-o-matic.com/player/c2lD3BeOPl',
+        'only_matching': True,
+    }, {
+        'url': 'http://screencast-o-matic.com/embed?ff=true&sc=cbV2r4Q5TL&fromPH=true&a=1',
+        'only_matching': True,
+    }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        jwplayer_data = self._parse_json(
-            self._search_regex(
-                r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'),
-            video_id, transform_source=js_to_json)
-
-        info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
-        info_dict.update({
-            'title': self._og_search_title(webpage),
-            'description': self._og_search_description(webpage),
+        webpage = self._download_webpage(
+            'https://screencast-o-matic.com/player/' + video_id, video_id)
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+        info.update({
+            'id': video_id,
+            'title': get_element_by_class('overlayTitle', webpage),
+            'description': strip_or_none(get_element_by_class('overlayDescription', webpage)) or None,
+            'duration': int_or_none(self._search_regex(
+                r'player\.duration\s*=\s*function\(\)\s*{\s*return\s+(\d+);\s*};',
+                webpage, 'duration', default=None)),
+            'upload_date': unified_strdate(remove_start(
+                get_element_by_class('overlayPublished', webpage), 'Published: ')),
         })
-        return info_dict
+        return info
diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
index 5c2a6206b..b5e093bd2 100644
--- a/youtube_dl/extractor/shahid.py
+++ b/youtube_dl/extractor/shahid.py
@@ -51,13 +51,16 @@ class ShahidIE(ShahidBaseIE):
     _NETRC_MACHINE = 'shahid'
     _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'
     _TESTS = [{
-        'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286',
+        'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924',
         'info_dict': {
-            'id': '275286',
+            'id': '816924',
             'ext': 'mp4',
-            'title': 'مجلس الشباب الموسم 1 كليب 1',
-            'timestamp': 1506988800,
-            'upload_date': '20171003',
+            'title': 'متحف الدحيح الموسم 1 كليب 1',
+            'timestamp': 1602806400,
+            'upload_date': '20201016',
+            'description': 'برومو',
+            'duration': 22,
+            'categories': ['كوميديا'],
         },
         'params': {
             # m3u8 download
@@ -109,12 +112,15 @@ class ShahidIE(ShahidBaseIE):
             page_type = 'episode'

         playout = self._call_api(
-            'playout/url/' + video_id, video_id)['playout']
+            'playout/new/url/' + video_id, video_id)['playout']

         if playout.get('drm'):
             raise ExtractorError('This video is DRM protected.', expected=True)

-        formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4')
+        formats = self._extract_m3u8_formats(re.sub(
+            # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html
+            r'aws\.manifestfilter=[\w:;,-]+&?',
+            '', playout['url']), video_id, 'mp4')
         self._sort_formats(formats)

         # video = self._call_api(
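
# What the re.sub() above does to a playout URL before the m3u8 manifest is
# fetched -- the aws.manifestfilter parameter (see the AWS link in the patch)
# would otherwise hide renditions. The URL below is a hypothetical example.
import re

playout_url = ('https://shahid-example.mbc.net/out/v1/abc/index.m3u8'
               '?aws.manifestfilter=video_bitrate:0-1500000&foo=bar')
print(re.sub(r'aws\.manifestfilter=[\w:;,-]+&?', '', playout_url))
# -> https://shahid-example.mbc.net/out/v1/abc/index.m3u8?foo=bar
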
diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py
index da75a43a7..0774da06e 100644
--- a/youtube_dl/extractor/southpark.py
+++ b/youtube_dl/extractor/southpark.py
@@ -6,9 +6,9 @@ from .mtv import MTVServicesInfoExtractor

 class SouthParkIE(MTVServicesInfoExtractor):
     IE_NAME = 'southpark.cc.com'
-    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P<title>.+?)(\?|#|$))'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P<title>.+?)(\?|#|$))'

-    _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
+    _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'

     _TESTS = [{
         'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured',
@@ -23,8 +23,20 @@ class SouthParkIE(MTVServicesInfoExtractor):
     }, {
         'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1',
         'only_matching': True,
+    }, {
+        'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1',
+        'only_matching': True,
     }]

+    def _get_feed_query(self, uri):
+        return {
+            'accountOverride': 'intl.mtvi.com',
+            'arcEp': 'shared.southpark.global',
+            'ep': '90877963',
+            'imageEp': 'shared.southpark.global',
+            'mgid': uri,
+        }
+

 class SouthParkEsIE(SouthParkIE):
     IE_NAME = 'southpark.cc.com:español'
diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py
index 378fc7568..3e497a939 100644
--- a/youtube_dl/extractor/sportdeutschland.py
+++ b/youtube_dl/extractor/sportdeutschland.py
@@ -1,82 +1,105 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import re
-
 from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
 from ..utils import (
+    clean_html,
+    float_or_none,
+    int_or_none,
     parse_iso8601,
-    sanitized_Request,
+    strip_or_none,
+    try_get,
 )


 class SportDeutschlandIE(InfoExtractor):
-    _VALID_URL = r'https?://sportdeutschland\.tv/(?P<sport>[^/?#]+)/(?P<id>[^?#/]+)(?:$|[?#])'
+    _VALID_URL = r'https?://sportdeutschland\.tv/(?P<id>(?:[^/]+/)?[^?#/&]+)'
     _TESTS = [{
         'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',
         'info_dict': {
-            'id': 're-live-deutsche-meisterschaften-2020-halbfinals',
+            'id': '5318cac0275701382770543d7edaf0a0',
             'ext': 'mp4',
-            'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals',
-            'categories': ['Badminton-Deutschland'],
-            'view_count': int,
-            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
-            'timestamp': int,
-            'upload_date': '20200201',
-            'description': 're:.*',  # meaningless description for THIS video
+            'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1',
+            'duration': 16106.36,
         },
+        'params': {
+            'noplaylist': True,
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',
+        'info_dict': {
+            'id': 'c6e2fdd01f63013854c47054d2ab776f',
+            'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals',
+            'description': 'md5:5263ff4c31c04bb780c9f91130b48530',
+            'duration': 31397,
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        sport_id = mobj.group('sport')
-
-        api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % (
-            sport_id, video_id)
-        req = sanitized_Request(api_url, headers={
-            'Accept': 'application/vnd.vidibus.v2.html+json',
-            'Referer': url,
-        })
-        data = self._download_json(req, video_id)
-
+        display_id = 
self._match_id(url) + data = self._download_json( + 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, + display_id, query={'access_token': 'true'}) asset = data['asset'] - categories = [data['section']['title']] - - formats = [] - smil_url = asset['video'] - if '.smil' in smil_url: - m3u8_url = smil_url.replace('.smil', '.m3u8') - formats.extend( - self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')) - - smil_doc = self._download_xml( - smil_url, video_id, note='Downloading SMIL metadata') - base_url_el = smil_doc.find('./head/meta') - if base_url_el: - base_url = base_url_el.attrib['base'] - formats.extend([{ - 'format_id': 'rmtp', - 'url': base_url if base_url_el else n.attrib['src'], - 'play_path': n.attrib['src'], - 'ext': 'flv', - 'preference': -100, - 'format_note': 'Seems to fail at example stream', - } for n in smil_doc.findall('./body/video')]) - else: - formats.append({'url': smil_url}) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': asset['title'], - 'thumbnail': asset.get('image'), - 'description': asset.get('teaser'), - 'duration': asset.get('duration'), - 'categories': categories, - 'view_count': asset.get('views'), - 'rtmp_live': asset.get('live'), - 'timestamp': parse_iso8601(asset.get('date')), + title = (asset.get('title') or asset['label']).strip() + asset_id = asset.get('id') or asset.get('uuid') + info = { + 'id': asset_id, + 'title': title, + 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), + 'duration': int_or_none(asset.get('seconds')), } + videos = asset.get('videos') or [] + if len(videos) > 1: + playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0] + if playlist_id: + if self._downloader.params.get('noplaylist'): + videos = [videos[int(playlist_id)]] + self.to_screen('Downloading just a single video because of --no-playlist') + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id) + + def entries(): + for i, video in enumerate(videos, 1): + video_id = video.get('uuid') + video_url = video.get('url') + if not (video_id and video_url): + continue + formats = self._extract_m3u8_formats( + video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) + if not formats: + continue + yield { + 'id': video_id, + 'formats': formats, + 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), + 'duration': float_or_none(video.get('duration')), + } + info.update({ + '_type': 'multi_video', + 'entries': entries(), + }) + else: + formats = self._extract_m3u8_formats( + videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') + section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) + info.update({ + 'formats': formats, + 'display_id': asset.get('permalink'), + 'thumbnail': try_get(asset, lambda x: x['images'][0]), + 'categories': [section_title] if section_title else None, + 'view_count': int_or_none(asset.get('views')), + 'is_live': asset.get('is_live') is True, + 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), + }) + return info diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index f63a1359a..ac018e740 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -4,16 +4,32 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, 
+    float_or_none,
+    int_or_none,
     parse_iso8601,
     qualities,
+    try_get,
 )


 class SRGSSRIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|
+                        srgssr
+                    ):
+                    (?P<bu>
+                        srf|rts|rsi|rtr|swi
+                    ):(?:[^:]+:)?
+                    (?P<type>
+                        video|audio
+                    ):
+                    (?P<id>
+                        [0-9a-f\-]{36}|\d+
+                    )
+                    '''
     _GEO_BYPASS = False
     _GEO_COUNTRIES = ['CH']
@@ -25,25 +41,39 @@
         'LEGAL': 'The video cannot be transmitted for legal reasons.',
         'STARTDATE': 'This video is not yet available. Please try again later.',
     }
+    _DEFAULT_LANGUAGE_CODES = {
+        'srf': 'de',
+        'rts': 'fr',
+        'rsi': 'it',
+        'rtr': 'rm',
+        'swi': 'en',
+    }

     def _get_tokenized_src(self, url, video_id, format_id):
-        sp = compat_urllib_parse_urlparse(url).path.split('/')
         token = self._download_json(
-            'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]),
+            'http://tp.srgssr.ch/akahd/token?acl=*',
             video_id, 'Downloading %s token' % format_id, fatal=False) or {}
-        auth_params = token.get('token', {}).get('authparams')
+        auth_params = try_get(token, lambda x: x['token']['authparams'])
         if auth_params:
-            url += '?' + auth_params
+            url += ('?' if '?' not in url else '&') + auth_params
         return url

-    def get_media_data(self, bu, media_type, media_id):
-        media_data = self._download_json(
-            'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id),
-            media_id)[media_type.capitalize()]
+    def _get_media_data(self, bu, media_type, media_id):
+        query = {'onlyChapters': True} if media_type == 'video' else {}
+        full_media_data = self._download_json(
+            'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json'
+            % (bu, media_type, media_id),
+            media_id, query=query)['chapterList']
+        try:
+            media_data = next(
+                x for x in full_media_data if x.get('id') == media_id)
+        except StopIteration:
+            raise ExtractorError('No media information found')

-        if media_data.get('block') and media_data['block'] in self._ERRORS:
-            message = self._ERRORS[media_data['block']]
-            if media_data['block'] == 'GEOBLOCK':
+        block_reason = media_data.get('blockReason')
+        if block_reason and block_reason in self._ERRORS:
+            message = self._ERRORS[block_reason]
+            if block_reason == 'GEOBLOCK':
                 self.raise_geo_restricted(
                     msg=message, countries=self._GEO_COUNTRIES)
         raise ExtractorError(
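
# A self-contained check of the verbose _VALID_URL above: it accepts both
# tp.srgssr.ch player URLs and bare srgssr: URNs (the URN below corresponds to
# the rts test URL further down in this patch).
import re

_VALID_URL = r'''(?x)
    (?:
        https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|
        srgssr
    ):
    (?P<bu>
        srf|rts|rsi|rtr|swi
    ):(?:[^:]+:)?
    (?P<type>
        video|audio
    ):
    (?P<id>
        [0-9a-f\-]{36}|\d+
    )
    '''
mobj = re.match(_VALID_URL, 'srgssr:rts:video:6348260')
print(mobj.group('bu'), mobj.group('type'), mobj.group('id'))
# -> rts video 6348260
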
@@ -53,53 +83,75 @@
     def _real_extract(self, url):
         bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
+        media_data = self._get_media_data(bu, media_type, media_id)
+        title = media_data['title']

-        media_data = self.get_media_data(bu, media_type, media_id)
-
-        metadata = media_data['AssetMetadatas']['AssetMetadata'][0]
-        title = metadata['title']
-        description = metadata.get('description')
-        created_date = media_data.get('createdDate') or metadata.get('createdDate')
-        timestamp = parse_iso8601(created_date)
-
-        thumbnails = [{
-            'id': image.get('id'),
-            'url': image['url'],
-        } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])]
-
-        preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD'])
         formats = []
-        for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []):
-            protocol = source.get('@protocol')
-            for asset in source['url']:
-                asset_url = asset['text']
-                quality = asset['@quality']
-                format_id = '%s-%s' % (protocol, quality)
-                if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'):
-                    asset_url = self._get_tokenized_src(asset_url, media_id, format_id)
-                    if protocol.startswith('HTTP-HDS'):
-                        formats.extend(self._extract_f4m_formats(
-                            asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0',
-                            media_id, f4m_id=format_id, fatal=False))
-                    elif protocol.startswith('HTTP-HLS'):
-                        formats.extend(self._extract_m3u8_formats(
-                            asset_url, media_id, 'mp4', 'm3u8_native',
-                            m3u8_id=format_id, fatal=False))
-                else:
-                    formats.append({
-                        'format_id': format_id,
-                        'url': asset_url,
-                        'preference': preference(quality),
-                        'ext': 'flv' if protocol == 'RTMP' else None,
-                    })
+        q = qualities(['SD', 'HD'])
+        for source in (media_data.get('resourceList') or []):
+            format_url = source.get('url')
+            if not format_url:
+                continue
+            protocol = source.get('protocol')
+            quality = source.get('quality')
+            format_id = []
+            for e in (protocol, source.get('encoding'), quality):
+                if e:
+                    format_id.append(e)
+            format_id = '-'.join(format_id)
+
+            if protocol in ('HDS', 'HLS'):
+                if source.get('tokenType') == 'AKAMAI':
+                    format_url = self._get_tokenized_src(
+                        format_url, media_id, format_id)
+                    formats.extend(self._extract_akamai_formats(
+                        format_url, media_id))
+                elif protocol == 'HLS':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, media_id, 'mp4', 'm3u8_native',
+                        m3u8_id=format_id, fatal=False))
+            elif protocol in ('HTTP', 'HTTPS'):
+                formats.append({
+                    'format_id': format_id,
+                    'url': format_url,
+                    'quality': q(quality),
+                })
+
+        # This is needed because for audio medias the podcast url is usually
+        # always included, even if is only an audio segment and not the
+        # whole episode.
+        if int_or_none(media_data.get('position')) == 0:
+            for p in ('S', 'H'):
+                podcast_url = media_data.get('podcast%sdUrl' % p)
+                if not podcast_url:
+                    continue
+                quality = p + 'D'
+                formats.append({
+                    'format_id': 'PODCAST-' + quality,
+                    'url': podcast_url,
+                    'quality': q(quality),
+                })
         self._sort_formats(formats)

+        subtitles = {}
+        if media_type == 'video':
+            for sub in (media_data.get('subtitleList') or []):
+                sub_url = sub.get('url')
+                if not sub_url:
+                    continue
+                lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu]
+                subtitles.setdefault(lang, []).append({
+                    'url': sub_url,
+                })
+
         return {
             'id': media_id,
             'title': title,
-            'description': description,
-            'timestamp': timestamp,
-            'thumbnails': thumbnails,
+            'description': media_data.get('description'),
+            'timestamp': parse_iso8601(media_data.get('date')),
+            'thumbnail': media_data.get('imageUrl'),
+            'duration': float_or_none(media_data.get('duration'), 1000),
+            'subtitles': subtitles,
             'formats': formats,
         }
@@ -119,26 +171,17 @@ class SRGSSRPlayIE(InfoExtractor):

     _TESTS = [{
         'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
-        'md5': 'da6b5b3ac9fa4761a942331cef20fcb3',
+        'md5': '6db2226ba97f62ad42ce09783680046c',
         'info_dict': {
             'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
             'ext': 'mp4',
             'upload_date': '20130701',
             'title': 'Snowden beantragt Asyl in Russland',
-            'timestamp': 1372713995,
-        }
-    }, {
-        # No Speichern (Save) button
-        'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa',
-        'md5': '0a274ce38fda48c53c01890651985bc6',
-        'info_dict': {
-            'id': '677f5829-e473-4823-ac83-a1087fe97faa',
-            'ext': 'flv',
-            'upload_date': '20130710',
-            'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive',
-            
'description': 'md5:88604432b60d5a38787f152dec89cd56', - 'timestamp': 1373493600, + 'timestamp': 1372708215, + 'duration': 113.827, + 'thumbnail': r're:^https?://.*1383719781\.png$', }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', 'info_dict': { @@ -146,7 +189,8 @@ class SRGSSRPlayIE(InfoExtractor): 'ext': 'mp3', 'upload_date': '20151013', 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', - 'timestamp': 1444750398, + 'timestamp': 1444709160, + 'duration': 336.816, }, 'params': { # rtmp download @@ -159,19 +203,32 @@ class SRGSSRPlayIE(InfoExtractor): 'id': '6348260', 'display_id': '6348260', 'ext': 'mp4', - 'duration': 1796, + 'duration': 1796.76, 'title': 'Le 19h30', - 'description': '', - 'uploader': '19h30', 'upload_date': '20141201', 'timestamp': 1417458600, 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', + 'info_dict': { + 'id': '42960270', + 'ext': 'mp4', + 'title': 'Why people were against tax reforms', + 'description': 'md5:7ac442c558e9630e947427469c4b824d', + 'duration': 94.0, + 'upload_date': '20170215', + 'timestamp': 1487173560, + 'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', + 'subtitles': 'count:9', + }, + 'params': { + 'skip_download': True, + } }, { 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', 'only_matching': True, @@ -181,6 +238,10 @@ class SRGSSRPlayIE(InfoExtractor): }, { 'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', 'only_matching': True, + }, { + # audio segment, has podcastSdUrl of the full episode + 'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb', + 'only_matching': True, }] def _real_extract(self, url): @@ -188,5 +249,4 @@ class SRGSSRPlayIE(InfoExtractor): bu = mobj.group('bu') media_type = mobj.group('type') or mobj.group('type_2') media_id = mobj.group('id') - # other info can be extracted from url + '&layout=json' return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py index 4dbead2ba..ec08eae55 100644 --- a/youtube_dl/extractor/stretchinternet.py +++ b/youtube_dl/extractor/stretchinternet.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none class StretchInternetIE(InfoExtractor): @@ -11,22 +10,28 @@ class StretchInternetIE(InfoExtractor): 'info_dict': { 'id': '573272', 'ext': 'mp4', - 'title': 'University of Mary Wrestling vs. 
Upper Iowa',
-            'timestamp': 1575668361,
-            'upload_date': '20191206',
+            'title': 'UNIVERSITY OF MARY WRESTLING VS UPPER IOWA',
+            # 'timestamp': 1575668361,
+            # 'upload_date': '20191206',
+            'uploader_id': '99997',
         }
     }

     def _real_extract(self, url):
         video_id = self._match_id(url)
+        media_url = self._download_json(
+            'https://core.stretchlive.com/trinity/event/tcg/' + video_id,
+            video_id)[0]['media'][0]['url']
         event = self._download_json(
-            'https://api.stretchinternet.com/trinity/event/tcg/' + video_id,
-            video_id)[0]
+            'https://neo-client.stretchinternet.com/portal-ws/getEvent.json',
+            video_id, query={'eventID': video_id, 'token': 'asdf'})['event']

         return {
             'id': video_id,
             'title': event['title'],
-            'timestamp': int_or_none(event.get('dateCreated'), 1000),
-            'url': 'https://' + event['media'][0]['url'],
+            # TODO: parse US timezone abbreviations
+            # 'timestamp': event.get('dateTimeString'),
+            'url': 'https://' + media_url,
+            'uploader_id': event.get('ownerID'),
         }
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
index 4acc29fce..aba9bb447 100644
--- a/youtube_dl/extractor/svt.py
+++ b/youtube_dl/extractor/svt.py
@@ -146,18 +146,19 @@ class SVTPlayIE(SVTPlayBaseIE):
                         )
                         (?P<svt_id>[^/?#&]+)|
                         https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)
+                        (?:.*?modalId=(?P<modal_id>[\da-zA-Z-]+))?
                     )
                     '''
     _TESTS = [{
-        'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen',
+        'url': 'https://www.svtplay.se/video/30479064',
         'md5': '2382036fd6f8c994856c323fe51c426e',
         'info_dict': {
-            'id': 'jNwpV9P',
+            'id': '8zVbDPA',
             'ext': 'mp4',
-            'title': 'Det här är himlen',
-            'timestamp': 1586044800,
-            'upload_date': '20200405',
-            'duration': 3515,
+            'title': 'Designdrömmar i Stenungsund',
+            'timestamp': 1615770000,
+            'upload_date': '20210315',
+            'duration': 3519,
             'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
             'age_limit': 0,
             'subtitles': {
@@ -173,6 +174,9 @@ class SVTPlayIE(SVTPlayBaseIE):
             # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B
             'skip_download': True,
         },
+    }, {
+        'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA',
+        'only_matching': True,
     }, {
         # geo restricted to Sweden
         'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
@@ -219,7 +223,8 @@ class SVTPlayIE(SVTPlayBaseIE):

     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id, svt_id = mobj.group('id', 'svt_id')
+        video_id = mobj.group('id')
+        svt_id = mobj.group('svt_id') or mobj.group('modal_id')

         if svt_id:
             return self._extract_by_video_id(svt_id)
@@ -254,6 +259,7 @@ class SVTPlayIE(SVTPlayBaseIE):
         if not svt_id:
             svt_id = self._search_regex(
                 (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
+                 r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\bmodalId=([\da-zA-Z-]+)' % re.escape(video_id),
                  r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)',
                  r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)',
                  r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"',
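
# How the new modal_id group above behaves: the optional (?:.*?modalId=...)?
# tail still locates the modalId query parameter when one is present. The
# sample URL is the only_matching test added in this patch.
import re

_VALID_URL = (r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)'
              r'(?:.*?modalId=(?P<modal_id>[\da-zA-Z-]+))?')
mobj = re.match(
    _VALID_URL,
    'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA')
print(mobj.group('id'), mobj.group('modal_id'))
# -> 30479064 8zVbDPA
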
player.""" - _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?tf1\.fr/[^/]+/(?P[^/]+)/videos/(?P[^/?&#]+)\.html' _TESTS = [{ - 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', - 'info_dict': { - 'id': '10635995', - 'ext': 'mp4', - 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle', - 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'expected_warnings': ['HTTP Error 404'], - }, { - 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', - 'info_dict': { - 'id': 'le-grand-mysterioso-chuggington-7085291-739', - 'ext': 'mp4', - 'title': 'Le grand Mystérioso - Chuggington', - 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', - 'upload_date': '20150103', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'skip': 'HTTP Error 410: Gone', - }, { - 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', - 'only_matching': True, - }, { - 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html', - 'only_matching': True, - }, { - 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', - 'only_matching': True, - }, { 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html', 'info_dict': { 'id': '13641379', 'ext': 'mp4', 'title': 'md5:f392bc52245dc5ad43771650c96fb620', - 'description': 'md5:44bc54f0a21322f5b91d68e76a544eae', + 'description': 'md5:a02cdb217141fb2d469d6216339b052f', 'upload_date': '20190611', + 'timestamp': 1560273989, + 'duration': 1738, + 'series': 'Quotidien avec Yann Barthès', + 'tags': ['intégrale', 'quotidien', 'Replay'], }, 'params': { # Sometimes wat serves the whole file with the --test option 'skip_download': True, + 'format': 'bestvideo', }, + }, { + 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', + 'only_matching': True, + }, { + 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + program_slug, slug = re.match(self._VALID_URL, url).groups() + video = self._download_json( + 'https://www.tf1.fr/graphql/web', slug, query={ + 'id': '9b80783950b85247541dd1d851f9cc7fa36574af015621f853ab111a679ce26f', + 'variables': json.dumps({ + 'programSlug': program_slug, + 'slug': slug, + }) + })['data']['videoBySlug'] + wat_id = video['streamId'] - webpage = self._download_webpage(url, video_id) + tags = [] + for tag in (video.get('tags') or []): + label = tag.get('label') + if not label: + continue + tags.append(label) - wat_id = None + decoration = video.get('decoration') or {} - data = self._parse_json( - self._search_regex( - r'__APOLLO_STATE__\s*=\s*({.+?})\s*(?:;|)', webpage, - 'data', default='{}'), video_id, fatal=False) + thumbnails = [] + for source in (try_get(decoration, lambda x: x['image']['sources'], list) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + }) - if data: - try: - 
-        if data:
-            try:
-                wat_id = next(
-                    video.get('streamId')
-                    for key, video in data.items()
-                    if isinstance(video, dict)
-                    and video.get('slug') == video_id)
-                if not isinstance(wat_id, compat_str) or not wat_id.isdigit():
-                    wat_id = None
-            except StopIteration:
-                pass
-
-        if not wat_id:
-            wat_id = self._html_search_regex(
-                (r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',
-                 r'(["\']?)streamId\1\s*:\s*(["\']?)(?P<id>\d+)\2'),
-                webpage, 'wat id', group='id')
-
-        return self.url_result('wat:%s' % wat_id, 'Wat')
+        return {
+            '_type': 'url_transparent',
+            'id': wat_id,
+            'url': 'wat:' + wat_id,
+            'title': video.get('title'),
+            'thumbnails': thumbnails,
+            'description': decoration.get('description'),
+            'timestamp': parse_iso8601(video.get('date')),
+            'duration': int_or_none(try_get(video, lambda x: x['publicPlayingInfos']['duration'])),
+            'tags': tags,
+            'series': decoration.get('programLabel'),
+            'season_number': int_or_none(video.get('season')),
+            'episode_number': int_or_none(video.get('episode')),
+        }
diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py
index ea1beb8af..4faa6de54 100644
--- a/youtube_dl/extractor/tiktok.py
+++ b/youtube_dl/extractor/tiktok.py
@@ -107,9 +107,12 @@ class TikTokIE(TikTokBaseIE):
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        data = self._parse_json(self._search_regex(
+        page_props = self._parse_json(self._search_regex(
             r'<script[^>]+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s*</script',
[...]
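
# The __NEXT_DATA__ pattern above is the usual way to read a Next.js page's
# embedded JSON state; a self-contained demonstration with a hypothetical
# HTML snippet:
import json
import re

webpage = ('<html><script id="__NEXT_DATA__" type="application/json">'
           '{"props": {"pageProps": {"itemInfo": {}}}}</script></html>')
page_props = json.loads(re.search(
    r'<script[^>]+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s*</script',
    webpage).group(1))['props']['pageProps']
print(page_props)
# -> {'itemInfo': {}}
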
diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py
[...]
 class TMZIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#&]+)'
     _TESTS = [{
-        'url': 'http://www.tmz.com/videos/0_okj015ty/',
-        'md5': '4d22a51ef205b6c06395d8394f72d560',
-        'info_dict': {
-            'id': '0_okj015ty',
-            'ext': 'mp4',
-            'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!',
-            'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? Or is she just showing off her amazing boobs?',
-            'timestamp': 1394747163,
-            'uploader_id': 'batchUser',
-            'upload_date': '20140313',
-        }
-    }, {
         'url': 'http://www.tmz.com/videos/0-cegprt2p/',
+        'md5': '31f9223e20eef55954973359afa61a20',
+        'info_dict': {
+            'id': 'P6YjLBLk',
+            'ext': 'mp4',
+            'title': "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet",
+            'description': 'md5:b714359fc18607715ebccbd2da8ff488',
+            'timestamp': 1467831837,
+            'upload_date': '20160706',
+        },
+        'add_ie': [JWPlatformIE.ie_key()],
+    }, {
+        'url': 'http://www.tmz.com/videos/0_okj015ty/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tmz.com/videos/2021-02-19-021921-floyd-mayweather-1043872/',
         'only_matching': True,
     }]

     def _real_extract(self, url):
         video_id = self._match_id(url).replace('-', '_')
-        return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id)
+
+        webpage = self._download_webpage(url, video_id, fatal=False)
+        if webpage:
+            tmz_video_id = self._search_regex(
+                r'nodeRef\s*:\s*["\']tmz:video:([\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})',
+                webpage, 'video id', default=None)
+            video = self._download_json(
+                'https://www.tmz.com/_/video/%s' % tmz_video_id, video_id,
+                fatal=False)
+            if video:
+                message = video['message']
+                info = {
+                    '_type': 'url_transparent',
+                    'title': message.get('title'),
+                    'description': message.get('description'),
+                    'timestamp': unified_timestamp(message.get('published_at')),
+                    'duration': int_or_none(message.get('duration')),
+                }
+                jwplatform_id = message.get('jwplayer_media_id')
+                if jwplatform_id:
+                    info.update({
+                        'url': 'jwplatform:%s' % jwplatform_id,
+                        'ie_key': JWPlatformIE.ie_key(),
+                    })
+                else:
+                    kaltura_entry_id = message.get('kaltura_entry_id') or video_id
+                    kaltura_partner_id = message.get('kaltura_partner_id') or '591531'
+                    info.update({
+                        'url': 'kaltura:%s:%s' % (kaltura_partner_id, kaltura_entry_id),
+                        'ie_key': KalturaIE.ie_key(),
+                    })
+                return info
+
+        return self.url_result(
+            'kaltura:591531:%s' % video_id, KalturaIE.ie_key(), video_id)
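
# A small self-contained check of the nodeRef lookup the new TMZIE code above
# performs before calling https://www.tmz.com/_/video/<guid>; `webpage` is a
# hypothetical snippet shaped like the inline player config.
import re

webpage = 'player.init({ nodeRef : "tmz:video:aabbccdd-1122-3344-5566-778899aabbcc" });'
mobj = re.search(
    r'nodeRef\s*:\s*["\']tmz:video:([\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})',
    webpage)
print('https://www.tmz.com/_/video/' + mobj.group(1))
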


 class TMZArticleIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/]+)/?'
+    _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/?#&]+)'
     _TEST = {
         'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert',
-        'md5': '3316ff838ae5bb7f642537825e1e90d2',
         'info_dict': {
-            'id': '0_6snoelag',
-            'ext': 'mov',
+            'id': 'PAKZa97W',
+            'ext': 'mp4',
             'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake',
             'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."',
-            'timestamp': 1429467813,
+            'timestamp': 1429466400,
             'upload_date': '20150419',
-            'uploader_id': 'batchUser',
-        }
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [JWPlatformIE.ie_key()],
     }

     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
+
+        tmz_url = self._search_regex(
+            r'clickLink\s*\(\s*["\'](?P<url>%s)' % TMZIE._VALID_URL, webpage,
+            'video id', default=None, group='url')
+        if tmz_url:
+            return self.url_result(tmz_url, ie=TMZIE.ie_key())
+
         embedded_video_info = self._parse_json(self._html_search_regex(
             r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'),
             video_id)
-
         return self.url_result(
-            'http://www.tmz.com/videos/%s/' % embedded_video_info['id'])
+            'http://www.tmz.com/videos/%s/' % embedded_video_info['id'],
+            ie=TMZIE.ie_key())
diff --git a/youtube_dl/extractor/trovo.py b/youtube_dl/extractor/trovo.py
index 43745213d..de0107aa9 100644
--- a/youtube_dl/extractor/trovo.py
+++ b/youtube_dl/extractor/trovo.py
@@ -153,6 +153,7 @@ class TrovoVodIE(TrovoBaseIE):
                 'protocol': 'm3u8_native',
                 'tbr': int_or_none(play_info.get('bitrate')),
                 'url': play_url,
+                'http_headers': {'Origin': 'https://trovo.live'},
             })
         self._sort_formats(formats)
diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py
index 931d4d650..a54f49319 100644
--- a/youtube_dl/extractor/tver.py
+++ b/youtube_dl/extractor/tver.py
@@ -9,6 +9,7 @@ from ..utils import (
     int_or_none,
     remove_start,
     smuggle_url,
+    strip_or_none,
     try_get,
 )
@@ -25,6 +26,10 @@
     }, {
         'url': 'https://tver.jp/episode/79622438',
         'only_matching': True,
+    }, {
+        # subtitle = ' '
+        'url': 'https://tver.jp/corner/f0068870',
+        'only_matching': True,
     }]
     _TOKEN = None
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
@@ -47,8 +52,12 @@
         }

         if service == 'cx':
+            title = main['title']
+            subtitle = strip_or_none(main.get('subtitle'))
+            if subtitle:
+                title += ' - ' + subtitle
             info.update({
-                'title': main.get('subtitle') or main['title'],
+                'title': title,
                 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id),
                 'ie_key': 'FujiTVFODPlus7',
             })
diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py
index 5452c7ca1..d6c79147e 100644
--- a/youtube_dl/extractor/urplay.py
+++ b/youtube_dl/extractor/urplay.py
@@ -21,6 +21,11 @@ class URPlayIE(InfoExtractor):
             'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
             'timestamp': 1513292400,
             'upload_date': '20171214',
+            'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik',
+            'duration': 2269,
+            'categories': ['Kultur & historia'],
+            'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'],
+            'episode': 'Om vetenskap, kritiskt tänkande och motstånd',
         },
     }, {
         'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
@@ -31,6 +36,10 @@
             'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
             'timestamp': 1440086400,
             'upload_date': '20150820',
+            'series': 'Tripp, Trapp, Träd',
+            'duration': 865,
+            'tags': ['Sova'],
+            'episode': 'Sovkudde',
         },
     }, {
         'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden',
@@ -41,9 +50,11 @@
         video_id = self._match_id(url)
         url = url.replace('skola.se/Produkter', 'play.se/program')
         webpage = self._download_webpage(url, video_id)
-        urplayer_data = self._parse_json(self._html_search_regex(
+        vid = int(video_id)
+        
accessible_episodes = self._parse_json(self._html_search_regex( r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['accessibleEpisodes'][0] + webpage, 'urplayer data'), video_id)['accessibleEpisodes'] + urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index fe7a26b62..22e99e8f0 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -23,6 +23,8 @@ class VGTVIE(XstreamIE): 'fvn.no/fvntv': 'fvntv', 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', + 'tv.aftonbladet.se': 'abtv', + # obsolete URL schemas, kept in order to save one HTTP redirect 'tv.aftonbladet.se/abtv': 'abtv', 'www.aftonbladet.se/tv': 'abtv', } @@ -140,6 +142,10 @@ class VGTVIE(XstreamIE): 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', 'only_matching': True, }, + { + 'url': 'https://tv.aftonbladet.se/video/36015/vulkanutbrott-i-rymden-nu-slapper-nasa-bilderna', + 'only_matching': True, + }, { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bd2663fe0..102687b82 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import base64 import functools -import json import re import itertools @@ -17,14 +16,14 @@ from ..compat import ( from ..utils import ( clean_html, determine_ext, - dict_get, ExtractorError, + get_element_by_class, js_to_json, int_or_none, merge_dicts, OnDemandPagedList, parse_filesize, - RegexNotFoundError, + parse_iso8601, sanitized_Request, smuggle_url, std_headers, @@ -74,25 +73,28 @@ class VimeoBaseInfoExtractor(InfoExtractor): expected=True) raise ExtractorError('Unable to log in') - def _verify_video_password(self, url, video_id, webpage): + def _get_video_password(self): password = self._downloader.params.get('videopassword') if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ - 'password': password, - 'token': token, - }) + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', + expected=True) + return password + + def _verify_video_password(self, url, video_id, password, token, vuid): if url.startswith('http://'): # vimeo only supports https now, but the user can give an http url url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) self._set_vimeo_cookie('vuid', vuid) return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') + url + '/password', video_id, 'Verifying the password', + 'Wrong password', data=urlencode_postdata({ + 'password': password, + 'token': token, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }) def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( @@ -123,10 +125,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_title = 
video_data['title']
         live_event = video_data.get('live_event') or {}
         is_live = live_event.get('status') == 'started'
+        request = config.get('request') or {}

         formats = []
-        config_files = video_data.get('files') or config['request'].get('files', {})
-        for f in config_files.get('progressive', []):
+        config_files = video_data.get('files') or request.get('files') or {}
+        for f in (config_files.get('progressive') or []):
             video_url = f.get('url')
             if not video_url:
                 continue
@@ -142,7 +145,7 @@
         # TODO: fix handling of 308 status code returned for live archive manifest requests
         sep_pattern = r'/sep/video/'
         for files_type in ('hls', 'dash'):
-            for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items():
+            for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items():
                 manifest_url = cdn_data.get('url')
                 if not manifest_url:
                     continue
@@ -188,17 +191,15 @@
                     f['preference'] = -40

         subtitles = {}
-        text_tracks = config['request'].get('text_tracks')
-        if text_tracks:
-            for tt in text_tracks:
-                subtitles[tt['lang']] = [{
-                    'ext': 'vtt',
-                    'url': urljoin('https://vimeo.com', tt['url']),
-                }]
+        for tt in (request.get('text_tracks') or []):
+            subtitles[tt['lang']] = [{
+                'ext': 'vtt',
+                'url': urljoin('https://vimeo.com', tt['url']),
+            }]

         thumbnails = []
         if not is_live:
-            for key, thumb in video_data.get('thumbs', {}).items():
+            for key, thumb in (video_data.get('thumbs') or {}).items():
                 thumbnails.append({
                     'id': key,
                     'width': int_or_none(key),
@@ -278,7 +279,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                         )?
                         (?:videos?/)?
                         (?P<id>[0-9]+)
-                        (?:/[\da-f]+)?
+                        (?:/(?P<unlisted_hash>[\da-f]{10}))?
                         /?(?:[?&].*)?(?:[#].*)?$
                     '''
     IE_NAME = 'vimeo'
@@ -318,6 +319,7 @@
                 'duration': 1595,
                 'upload_date': '20130610',
                 'timestamp': 1370893156,
+                'license': 'by',
             },
             'params': {
                 'format': 'best[protocol=https]',
@@ -331,9 +333,9 @@
                 'id': '54469442',
                 'ext': 'mp4',
                 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012',
-                'uploader': 'The BLN & Business of Software',
-                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware',
-                'uploader_id': 'theblnbusinessofsoftware',
+                'uploader': 'Business of Software',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/businessofsoftware',
+                'uploader_id': 'businessofsoftware',
                 'duration': 3610,
                 'description': None,
             },
@@ -396,6 +398,12 @@
                 'uploader_id': 'staff',
                 'uploader': 'Vimeo Staff',
                 'duration': 62,
+                'subtitles': {
+                    'de': [{'ext': 'vtt'}],
+                    'en': [{'ext': 'vtt'}],
+                    'es': [{'ext': 'vtt'}],
+                    'fr': [{'ext': 'vtt'}],
+                },
             }
         },
         {
@@ -468,6 +476,7 @@
                 'skip_download': True,
             },
             'expected_warnings': ['Unable to download JSON metadata'],
+            'skip': 'this page is no longer available.',
         },
         {
             'url': 'http://player.vimeo.com/video/68375962',
@@ -550,9 +559,7 @@
         return urls[0] if urls else None

     def _verify_player_video_password(self, url, video_id, headers):
-        password = self._downloader.params.get('videopassword')
-        if password is None:
-            raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
+        password = self._get_video_password()
         data = urlencode_postdata({
             'password': base64.b64encode(password.encode()),
         })
@@ -569,6 +576,37 @@
     def 
_real_initialize(self): self._login() + def _extract_from_api(self, video_id, unlisted_hash=None): + token = self._download_json( + 'https://vimeo.com/_rv/jwt', video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest' + })['token'] + api_url = 'https://api.vimeo.com/videos/' + video_id + if unlisted_hash: + api_url += ':' + unlisted_hash + video = self._download_json( + api_url, video_id, headers={ + 'Authorization': 'jwt ' + token, + }, query={ + 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', + }) + info = self._parse_config(self._download_json( + video['config_url'], video_id), video_id) + self._vimeo_sort_formats(info['formats']) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) + info.update({ + 'description': video.get('description'), + 'license': video.get('license'), + 'release_timestamp': get_timestamp('release'), + 'timestamp': get_timestamp('created'), + 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), + }) + connections = try_get( + video, lambda x: x['metadata']['connections'], dict) or {} + for k in ('comment', 'like'): + info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) + return info + def _real_extract(self, url): url, data = unsmuggle_url(url, {}) headers = std_headers.copy() @@ -577,22 +615,19 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url - channel_id = self._search_regex( - r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + mobj = re.match(self._VALID_URL, url).groupdict() + video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash') + if unlisted_hash: + return self._extract_from_api(video_id, unlisted_hash) - # Extract ID from URL - video_id = self._match_id(url) orig_url = url is_pro = 'vimeopro.com/' in url - is_player = '://player.vimeo.com/video/' in url if is_pro: # some videos require portfolio_id to be present in player url # https://github.com/ytdl-org/youtube-dl/issues/20070 url = self._extract_url(url, self._download_webpage(url, video_id)) if not url: url = 'https://vimeo.com/' + video_id - elif is_player: - url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id @@ -612,14 +647,25 @@ class VimeoIE(VimeoBaseInfoExtractor): expected=True) raise - # Now we begin extracting as much information as we can from what we - # retrieved. First we extract the information common to all extractors, - # and latter we extract those that are Vimeo specific. 
- self.report_extraction(video_id) + if '://player.vimeo.com/video/' in url: + config = self._parse_json(self._search_regex( + r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) + if config.get('view') == 4: + config = self._verify_player_video_password( + redirect_url, video_id, headers) + info = self._parse_config(config, video_id) + self._vimeo_sort_formats(info['formats']) + return info + + if re.search(r'<form[^>]+?id="pw_form"', webpage): + video_password = self._get_video_password() + token, vuid = self._extract_xsrft_and_vuid(webpage) + webpage = self._verify_video_password( + redirect_url, video_id, video_password, token, vuid) vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: - seed_status = vimeo_config.get('seed_status', {}) + seed_status = vimeo_config.get('seed_status') or {} if seed_status.get('state') == 'failed': raise ExtractorError( '%s said: %s' % (self.IE_NAME, seed_status['title']), @@ -628,67 +674,40 @@ class VimeoIE(VimeoBaseInfoExtractor): cc_license = None timestamp = None video_description = None + info_dict = {} - # Extract the config JSON - try: - try: - config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, - 'config URL', default=None) - if not config_url: - # Sometimes new react-based page is served instead of old one that require - # different config URL extraction approach (see - # https://github.com/ytdl-org/youtube-dl/pull/7209) - page_config = self._parse_json(self._search_regex( - r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', - webpage, 'page config'), video_id) - config_url = page_config['player']['config_url'] - cc_license = page_config.get('cc_license') - timestamp = try_get( - page_config, lambda x: x['clip']['uploaded_on'], - compat_str) - video_description = clean_html(dict_get( - page_config, ('description', 'description_html_escaped'))) - config = self._download_json(config_url, video_id) - except RegexNotFoundError: - # For pro videos or player.vimeo.com urls - # We try to find out to which variable is assigned the config dic - m_variable_name = re.search(r'(\w)\.video\.id', webpage) - if m_variable_name is not None: - config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] - else: - config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] - config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') - config_re.append(r'\bconfig\s*=\s*({.+?})\s*;') - config = self._search_regex(config_re, webpage, 'info section', - flags=re.DOTALL) - config = json.loads(config) - except Exception as e: - if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): - raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') - - if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None: - if '_video_password_verified' in data: - raise ExtractorError('video password verification failed!') - self._verify_video_password(redirect_url, video_id, webpage) - return self._real_extract( - smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) - else: - raise ExtractorError('Unable to extract info section', - cause=e) + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + if channel_id: + config_url = self._html_search_regex( + r'\bdata-config-url="([^"]+)"', webpage, 'config URL') + video_description = clean_html(get_element_by_class('description', webpage)) + info_dict.update({ + 'channel_id': channel_id, +
'channel_url': 'https://vimeo.com/channels/' + channel_id, + }) else: - if config.get('view') == 4: - config = self._verify_player_video_password(redirect_url, video_id, headers) - + page_config = self._parse_json(self._search_regex( + r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', + webpage, 'page config', default='{}'), video_id, fatal=False) + if not page_config: + return self._extract_from_api(video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') + clip = page_config.get('clip') or {} + timestamp = clip.get('uploaded_on') + video_description = clean_html( + clip.get('description') or page_config.get('description_html_escaped')) + config = self._download_json(config_url, video_id) video = config.get('video') or {} vod = video.get('vod') or {} def is_rented(): if '>You rented this title.<' in webpage: return True - if config.get('user', {}).get('purchased'): + if try_get(config, lambda x: x['user']['purchased']): return True - for purchase_option in vod.get('purchase_options', []): + for purchase_option in (vod.get('purchase_options') or []): if purchase_option.get('purchased'): return True label = purchase_option.get('label_string') @@ -703,14 +722,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'https://player.vimeo.com/player/%s' % feature_id, {'force_feature_id': True}), 'Vimeo') - # Extract video description - if not video_description: - video_description = self._html_search_regex( - r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>', - webpage, 'description', default=None) if not video_description: video_description = self._html_search_meta( - 'description', webpage, default=None) + ['description', 'og:description', 'twitter:description'], + webpage, default=None) if not video_description and is_pro: orig_webpage = self._download_webpage( orig_url, video_id, @@ -719,25 +734,14 @@ class VimeoIE(VimeoBaseInfoExtractor): if orig_webpage: video_description = self._html_search_meta( 'description', orig_webpage, default=None) - if not video_description and not is_player: + if not video_description: self._downloader.report_warning('Cannot find video description') - # Extract upload date if not timestamp: timestamp = self._search_regex( r'<time[^>]+datetime="([^"]+)"', webpage, 'timestamp', default=None) - try: - view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) - like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count')) - comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count')) - except RegexNotFoundError: - # This info is only available in vimeo.com/{id} urls - view_count = None - like_count = None - comment_count = None - formats = [] source_format = self._extract_original_format( @@ -756,29 +760,20 @@ class VimeoIE(VimeoBaseInfoExtractor): r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') - channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None - - info_dict = { + info_dict.update({ 'formats': formats, 'timestamp': unified_timestamp(timestamp), 'description': video_description, 'webpage_url': url, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, 'license': cc_license, - 'channel_id': channel_id, - 'channel_url': channel_url, - } + }) - info_dict = merge_dicts(info_dict, info_dict_config, json_ld) - - return info_dict class VimeoOndemandIE(VimeoIE): IE_NAME = 'vimeo:ondemand' - _VALID_URL = 
r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P<id>[^/?#&]+)' _TESTS = [{ # ondemand video not available via https://vimeo.com/id 'url': 'https://vimeo.com/ondemand/20704', @@ -939,11 +934,15 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): } if hashed_pass: query['_hashed_pass'] = hashed_pass - videos = self._download_json( - 'https://api.vimeo.com/albums/%s/videos' % album_id, - album_id, 'Downloading page %d' % api_page, query=query, headers={ - 'Authorization': 'jwt ' + authorization, - })['data'] + try: + videos = self._download_json( + 'https://api.vimeo.com/albums/%s/videos' % album_id, + album_id, 'Downloading page %d' % api_page, query=query, headers={ + 'Authorization': 'jwt ' + authorization, + })['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + return for video in videos: link = video.get('link') if not link: @@ -1058,10 +1057,23 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): def _real_extract(self, url): page_url, video_id = re.match(self._VALID_URL, url).groups() - clip_data = self._download_json( - page_url.replace('/review/', '/review/data/'), - video_id)['clipData'] - config_url = clip_data['configUrl'] + data = self._download_json( + page_url.replace('/review/', '/review/data/'), video_id) + if data.get('isLocked') is True: + video_password = self._get_video_password() + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', video_id) + webpage = self._verify_video_password( + 'https://vimeo.com/' + video_id, video_id, + video_password, viewer['xsrft'], viewer['vuid']) + clip_page_config = self._parse_json(self._search_regex( + r'window\.vimeo\.clip_page_config\s*=\s*({.+?});', + webpage, 'clip page config'), video_id) + config_url = clip_page_config['player']['config_url'] + clip_data = clip_page_config.get('clip') or {} + else: + clip_data = data['clipData'] + config_url = clip_data['configUrl'] config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index e2f5d81b8..42da34d44 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -106,7 +106,7 @@ class VLiveIE(VLiveBaseIE): raise ExtractorError('Unable to log in', expected=True) def _call_api(self, path_template, video_id, fields=None): - query = {'appId': self._APP_ID, 'gcc': 'KR'} + query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'} if fields: query['fields'] = fields try: diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py index b318e15d4..661208125 100644 --- a/youtube_dl/extractor/voxmedia.py +++ b/youtube_dl/extractor/voxmedia.py @@ -7,6 +7,8 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, int_or_none, + try_get, + unified_timestamp, ) @@ -19,14 +21,17 @@ class VoxMediaVolumeIE(OnceIE): setup = self._parse_json(self._search_regex( r'setup\s*=\s*({.+});', webpage, 'setup'), video_id) - video_data = setup.get('video') or {} + player_setup = setup.get('player_setup') or setup + video_data = player_setup.get('video') or {} + formatted_metadata = video_data.get('formatted_metadata') or {} info = { 'id': video_id, - 'title': video_data.get('title_short'), + 'title': player_setup.get('title') or video_data.get('title_short'), 'description': video_data.get('description_long') or
video_data.get('description_short'), - 'thumbnail': video_data.get('brightcove_thumbnail') + 'thumbnail': formatted_metadata.get('thumbnail') or video_data.get('brightcove_thumbnail'), + 'timestamp': unified_timestamp(formatted_metadata.get('video_publish_date')), } - asset = setup.get('asset') or setup.get('params') or {} + asset = try_get(setup, lambda x: x['embed_assets']['chorus'], dict) or {} formats = [] hls_url = asset.get('hls_url') @@ -47,6 +52,7 @@ class VoxMediaVolumeIE(OnceIE): if formats: self._sort_formats(formats) info['formats'] = formats + info['duration'] = int_or_none(asset.get('duration')) return info for provider_video_type in ('ooyala', 'youtube', 'brightcove'): @@ -84,7 +90,7 @@ class VoxMediaIE(InfoExtractor): }, { # Volume embed, Youtube 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', - 'md5': '4c8f4a0937752b437c3ebc0ed24802b5', + 'md5': 'fd19aa0cf3a0eea515d4fd5c8c0e9d68', 'info_dict': { 'id': 'Gy8Md3Eky38', 'ext': 'mp4', @@ -93,6 +99,7 @@ class VoxMediaIE(InfoExtractor): 'uploader_id': 'TheVerge', 'upload_date': '20141021', 'uploader': 'The Verge', + 'timestamp': 1413907200, }, 'add_ie': ['Youtube'], 'skip': 'similar to the previous test', @@ -100,13 +107,13 @@ class VoxMediaIE(InfoExtractor): # Volume embed, Youtube 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', 'info_dict': { - 'id': 'YCjDnX-Xzhg', + 'id': '22986359b', 'ext': 'mp4', 'title': "Mississippi's laws are so bad that its anti-LGBTQ law isn't needed to allow discrimination", 'description': 'md5:fc1317922057de31cd74bce91eb1c66c', - 'uploader_id': 'voxdotcom', 'upload_date': '20150915', - 'uploader': 'Vox', + 'timestamp': 1442332800, + 'duration': 285, }, 'add_ie': ['Youtube'], 'skip': 'similar to the previous test', @@ -160,6 +167,9 @@ class VoxMediaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella', 'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.', + 'timestamp': 1402938000, + 'upload_date': '20140616', + 'duration': 4114, }, 'add_ie': ['VoxMediaVolume'], }] diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 778ce8b76..bc196f8a0 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -75,12 +75,15 @@ class VVVVIDIE(InfoExtractor): 'https://www.vvvvid.it/user/login', None, headers=self.geo_verification_headers())['data']['conn_id'] - def _download_info(self, show_id, path, video_id, fatal=True): + def _download_info(self, show_id, path, video_id, fatal=True, query=None): + q = { + 'conn_id': self._conn_id, + } + if query: + q.update(query) response = self._download_json( 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), - video_id, headers=self.geo_verification_headers(), query={ - 'conn_id': self._conn_id, - }, fatal=fatal) + video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal) if not (response or fatal): return if response.get('result') == 'error': @@ -98,7 +101,8 @@ class VVVVIDIE(InfoExtractor): show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() response = self._download_info( - show_id, 'season/%s' % season_id, video_id) + show_id, 'season/%s' % season_id, + video_id, query={'video_id': video_id}) vid = int(video_id) video_data = list(filter( @@ -178,17 +182,20 @@ class VVVVIDIE(InfoExtractor): if not embed_code: continue embed_code = 
ds(embed_code) - if video_type in ('video/rcs', 'video/kenc'): - if video_type == 'video/kenc': - kenc = self._download_json( - 'https://www.vvvvid.it/kenc', video_id, query={ - 'action': 'kt', - 'conn_id': self._conn_id, - 'url': embed_code, - }, fatal=False) or {} - kenc_message = kenc.get('message') - if kenc_message: - embed_code += '?' + ds(kenc_message) + if video_type == 'video/kenc': + embed_code = re.sub(r'https?(://[^/]+)/z/', r'https\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') + kenc = self._download_json( + 'https://www.vvvvid.it/kenc', video_id, query={ + 'action': 'kt', + 'conn_id': self._conn_id, + 'url': embed_code, + }, fatal=False) or {} + kenc_message = kenc.get('message') + if kenc_message: + embed_code += '?' + ds(kenc_message) + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif video_type == 'video/rcs': formats.extend(self._extract_akamai_formats(embed_code, video_id)) elif video_type == 'video/youtube': info.update({ @@ -247,9 +254,13 @@ class VVVVIDShowIE(VVVVIDIE): show_info = self._download_info( show_id, 'info/', show_title, fatal=False) + if not show_title: + base_url += "/title" + entries = [] for season in (seasons or []): episodes = season.get('episodes') or [] + playlist_title = season.get('name') or show_info.get('title') for episode in episodes: if episode.get('playable') is False: continue @@ -259,12 +270,13 @@ class VVVVIDShowIE(VVVVIDIE): continue info = self._extract_common_video_info(episode) info.update({ - '_type': 'url', + '_type': 'url_transparent', 'ie_key': VVVVIDIE.ie_key(), 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), 'description': episode.get('description'), 'season_id': season_id, + 'playlist_title': playlist_title, }) entries.append(info) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index f6940b371..f1bccc2d6 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -4,9 +4,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - unified_strdate, - HEADRequest, + ExtractorError, int_or_none, + try_get, + unified_strdate, ) @@ -29,6 +30,7 @@ class WatIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['HTTP Error 404'], + 'skip': 'This content is no longer available', }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', @@ -40,8 +42,10 @@ class WatIE(InfoExtractor): 'upload_date': '20140816', }, 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], + 'skip': 'This content is no longer available', }, ] + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) @@ -49,71 +53,54 @@ class WatIE(InfoExtractor): # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them + # video_data = self._download_json( + # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( - 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, + video_id, query={'context': 'MYTF1'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') if error_desc: - self.report_warning( - '%s returned error: %s' % (self.IE_NAME, error_desc)) + if video_info.get('error_code') == 'GEOBLOCKED': + self.raise_geo_restricted(error_desc, video_info.get('geoList')) + raise ExtractorError(error_desc, 
expected=True) - chapters = video_info['chapters'] - if chapters: - first_chapter = chapters[0] - - def video_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] - - if video_id_for_chapter(first_chapter) != video_id: - self.to_screen('Multipart video detected') - entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] - return self.playlist_result(entries, video_id, video_info['title']) - # Otherwise we can continue and extract just one part, we have to use - # the video id for getting the video url - else: - first_chapter = video_info - - title = first_chapter['title'] - - def extract_url(path_template, url_type): - req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) - head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False) - if head: - red_url = head.geturl() - if req_url != red_url: - return red_url - return None + title = video_info['title'] formats = [] - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id) - m3u8_url = manifest_urls.get('hls') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - mpd_url = manifest_urls.get('mpd') - if mpd_url: - formats.extend(self._extract_mpd_formats( - mpd_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), - video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') - upload_date = unified_strdate(date_diffusion) if date_diffusion else None - duration = None - files = video_info['files'] - if files: - duration = int_or_none(files[0].get('duration')) + def extract_formats(manifest_urls): + for f, f_url in manifest_urls.items(): + if not f_url: + continue + if f in ('dash', 'mpd'): + formats.extend(self._extract_mpd_formats( + f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + video_id, mpd_id='dash', fatal=False)) + elif f == 'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + delivery = video_data.get('delivery') or {} + extract_formats({delivery.get('format'): delivery.get('url')}) + if not formats: + if delivery.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) + if manifest_urls: + extract_formats(manifest_urls) + + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'thumbnail': first_chapter.get('preview'), - 'description': first_chapter.get('description'), - 'view_count': int_or_none(video_info.get('views')), - 'upload_date': upload_date, - 'duration': duration, + 'thumbnail': video_info.get('preview'), + 'upload_date': unified_strdate(try_get( + video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])), + 'duration': int_or_none(video_info.get('duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 61d1ab209..880c89687 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0590', + 'ccode': '0532', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, diff --git a/youtube_dl/extractor/youporn.py 
b/youtube_dl/extractor/youporn.py index 534270bac..33114363d 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -25,6 +25,7 @@ class YouPornIE(InfoExtractor): 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 210, 'uploader': 'Ask Dan And Jennifer', 'upload_date': '20101217', 'average_rating': int, @@ -54,6 +55,7 @@ class YouPornIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404', }, { 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'only_matching': True, @@ -153,6 +155,8 @@ class YouPornIE(InfoExtractor): thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration', fatal=False)) uploader = self._html_search_regex( r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', @@ -194,6 +198,7 @@ class YouPornIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, 'uploader': uploader, 'upload_date': upload_date, 'average_rating': average_rating, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 72d9fbbc6..79e47c919 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -24,6 +24,7 @@ from ..jsinterp import JSInterpreter from ..utils import ( ExtractorError, clean_html, + dict_get, float_or_none, int_or_none, mimetype2ext, @@ -248,7 +249,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True + def _initialize_consent(self): + cookies = self._get_cookies('https://www.youtube.com/') + if cookies.get('__Secure-3PSID'): + return + consent_id = None + consent = cookies.get('CONSENT') + if consent: + if 'YES' in consent.value: + return + consent_id = self._search_regex( + r'PENDING\+(\d+)', consent.value, 'consent', default=None) + if not consent_id: + consent_id = random.randint(100, 999) + self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _real_initialize(self): + self._initialize_consent() if self._downloader is None: return if not self._login(): @@ -289,7 +306,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._parse_json( self._search_regex( r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', - default='{}'), video_id, fatal=False) + default='{}'), video_id, fatal=False) or {} def _extract_video(self, renderer): video_id = renderer['videoId'] @@ -312,7 +329,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): (lambda x: x['ownerText']['runs'][0]['text'], lambda x: x['shortBylineText']['runs'][0]['text']), compat_str) return { - '_type': 'url_transparent', + '_type': 'url', 'ie_key': YoutubeIE.ie_key(), 'id': video_id, 'url': video_id, @@ -1067,6 +1084,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', 'only_matching': True, }, + { + # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685 + 'url': 'cBvYw8_A0vQ', + 'info_dict': { + 'id': 'cBvYw8_A0vQ', + 'ext': 'mp4', + 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き', + 'description': 'md5:ea770e474b7cd6722b4c95b833c03630', + 'upload_date': '20201120', + 'uploader': 'Walk around Japan', + 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw', + 'uploader_url':
r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw', + }, + 'params': { + 'skip_download': True, + }, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1431,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): base_url = self.http_scheme() + '//www.youtube.com/' webpage_url = base_url + 'watch?v=' + video_id webpage = self._download_webpage( - webpage_url + '&bpctr=9999999999', video_id, fatal=False) + webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) player_response = None if webpage: @@ -1450,7 +1484,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Refetching age-gated info webpage', 'unable to download video info webpage', query={ 'video_id': video_id, - 'eurl': 'https://www.youtube.com/embed/' + video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, }, fatal=False)), lambda x: x['player_response'][0], compat_str) or '{}', video_id) @@ -1468,7 +1502,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def get_text(x): if not x: return - return x.get('simpleText') or ''.join([r['text'] for r in x['runs']]) + text = x.get('simpleText') + if text and isinstance(text, compat_str): + return text + runs = x.get('runs') + if not isinstance(runs, list): + return + return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)]) search_meta = ( lambda x: self._html_search_meta(x, webpage, default=None)) \ @@ -1617,7 +1657,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['format_id'] = itag formats.append(f) - if self._downloader.params.get('youtube_include_dash_manifest'): + if self._downloader.params.get('youtube_include_dash_manifest', True): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: for f in self._extract_mpd_formats( @@ -1895,7 +1935,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info['channel'] = get_text(try_get( vsir, lambda x: x['owner']['videoOwnerRenderer']['title'], - compat_str)) + dict)) rows = try_get( vsir, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], @@ -1942,7 +1982,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): invidio\.us )/ (?: - (?:channel|c|user|feed)/| + (?:channel|c|user|feed|hashtag)/| (?:playlist|watch)\?.*?\blist=| (?!(?:watch|embed|v|e)\b) ) @@ -2228,6 +2268,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/TheYoungTurks/live', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/hashtag/cctv9', + 'info_dict': { + 'id': 'cctv9', + 'title': '#cctv9', + }, + 'playlist_mincount': 350, }] @classmethod @@ -2375,6 +2422,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): for entry in self._post_thread_entries(renderer): yield entry + def _rich_grid_entries(self, contents): + for content in contents: + video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict) + if video_renderer: + entry = self._video_entry(video_renderer) + if entry: + yield entry + @staticmethod def _build_continuation_query(continuation, ctp=None): query = { @@ -2420,82 +2475,111 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): ctp = continuation_ep.get('clickTrackingParams') return YoutubeTabIE._build_continuation_query(continuation, ctp) - def _entries(self, tab, identity_token): + def _entries(self, tab, item_id, webpage): tab_content = try_get(tab, lambda x: x['content'], dict) if not tab_content: return slr_renderer = try_get(tab_content, lambda x: x['sectionListRenderer'], dict) - if not 
slr_renderer: - return - is_channels_tab = tab.get('title') == 'Channels' - continuation = None - slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] - for slr_content in slr_contents: - if not isinstance(slr_content, dict): - continue - is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): + if slr_renderer: + is_channels_tab = tab.get('title') == 'Channels' + continuation = None + slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] + for slr_content in slr_contents: + if not isinstance(slr_content, dict): continue - renderer = isr_content.get('playlistVideoListRenderer') - if renderer: - for entry in self._playlist_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) + is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: continue - renderer = isr_content.get('gridRenderer') - if renderer: - for entry in self._grid_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('shelfRenderer') - if renderer: - for entry in self._shelf_entries(renderer, not is_channels_tab): - yield entry - continue - renderer = isr_content.get('backstagePostThreadRenderer') - if renderer: - for entry in self._post_thread_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('videoRenderer') - if renderer: - entry = self._video_entry(renderer) - if entry: - yield entry + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + renderer = isr_content.get('playlistVideoListRenderer') + if renderer: + for entry in self._playlist_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('gridRenderer') + if renderer: + for entry in self._grid_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('shelfRenderer') + if renderer: + for entry in self._shelf_entries(renderer, not is_channels_tab): + yield entry + continue + renderer = isr_content.get('backstagePostThreadRenderer') + if renderer: + for entry in self._post_thread_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('videoRenderer') + if renderer: + entry = self._video_entry(renderer) + if entry: + yield entry + if not continuation: + continuation = self._extract_continuation(is_renderer) if not continuation: - continuation = self._extract_continuation(is_renderer) + continuation = self._extract_continuation(slr_renderer) + else: + rich_grid_renderer = tab_content.get('richGridRenderer') + if not rich_grid_renderer: + return + for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []): + yield entry + continuation = self._extract_continuation(rich_grid_renderer) - if not continuation: - continuation = self._extract_continuation(slr_renderer) + ytcfg = self._extract_ytcfg(item_id, webpage) + client_version = try_get( + ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or '2.20210407.08.00' headers = { 'x-youtube-client-name': '1', - 
'x-youtube-client-version': '2.20201112.04.01', + 'x-youtube-client-version': client_version, + 'content-type': 'application/json', } + + context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict) or { + 'client': { + 'clientName': 'WEB', + 'clientVersion': client_version, + } + } + visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str) + + identity_token = self._extract_identity_token(ytcfg, webpage) if identity_token: headers['x-youtube-identity-token'] = identity_token + data = { + 'context': context, + } + for page_num in itertools.count(1): if not continuation: break + if visitor_data: + headers['x-goog-visitor-id'] = visitor_data + data['continuation'] = continuation['continuation'] + data['clickTracking'] = { + 'clickTrackingParams': continuation['itct'] + } count = 0 retries = 3 while count <= retries: try: # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry - browse = self._download_json( - 'https://www.youtube.com/browse_ajax', None, - 'Downloading page %d%s' - % (page_num, ' (retry #%d)' % count if count else ''), - headers=headers, query=continuation) + response = self._download_json( + 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''), + headers=headers, data=json.dumps(data).encode('utf8')) break except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): @@ -2503,12 +2587,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if count <= retries: continue raise - if not browse: - break - response = try_get(browse, lambda x: x[1]['response'], dict) if not response: break + visitor_data = try_get( + response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data + continuation_contents = try_get( response, lambda x: x['continuationContents'], dict) if continuation_contents: @@ -2531,13 +2615,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continuation = self._extract_continuation(continuation_renderer) continue + on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints')) continuation_items = try_get( - response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) + on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list) if continuation_items: continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue - renderer = continuation_item.get('gridVideoRenderer') + renderer = self._extract_grid_item_renderer(continuation_item) if renderer: grid_renderer = {'items': continuation_items} for entry in self._grid_entries(grid_renderer): @@ -2551,6 +2636,19 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(video_list_renderer) continue + renderer = continuation_item.get('backstagePostThreadRenderer') + if renderer: + continuation_renderer = {'contents': continuation_items} + for entry in self._post_thread_continuation_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue + renderer = continuation_item.get('richItemRenderer') + if renderer: + for entry in self._rich_grid_entries(continuation_items): + yield entry + continuation = self._extract_continuation({'contents': continuation_items}) + continue break @@ -2603,11 +2701,12 
@@ class YoutubeTabIE(YoutubeBaseInfoExtractor): alerts.append(text) return '\n'.join(alerts) - def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + def _extract_from_tabs(self, item_id, webpage, data, tabs): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) - playlist_id = title = description = None + playlist_id = item_id + title = description = None if renderer: channel_title = renderer.get('title') or item_id tab_title = selected_tab.get('title') @@ -2616,14 +2715,18 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): title += ' - %s' % tab_title description = renderer.get('description') playlist_id = renderer.get('externalId') - renderer = try_get( - data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) - if renderer: - title = renderer.get('title') - description = None - playlist_id = item_id + else: + renderer = try_get( + data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + if renderer: + title = renderer.get('title') + else: + renderer = try_get( + data, lambda x: x['header']['hashtagHeaderRenderer'], dict) + if renderer: + title = try_get(renderer, lambda x: x['hashtag']['simpleText']) playlist = self.playlist_result( - self._entries(selected_tab, identity_token), + self._entries(selected_tab, item_id, webpage), playlist_id=playlist_id, playlist_title=title, playlist_description=description) playlist.update(self._extract_uploader(data)) @@ -2647,8 +2750,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): self._playlist_entries(playlist), playlist_id=playlist_id, playlist_title=title) - def _extract_identity_token(self, webpage, item_id): - ytcfg = self._extract_ytcfg(item_id, webpage) + def _extract_identity_token(self, ytcfg, webpage): if ytcfg: token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) if token: @@ -2671,12 +2773,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) webpage = self._download_webpage(url, item_id) - identity_token = self._extract_identity_token(webpage, item_id) data = self._extract_yt_initial_data(item_id, webpage) tabs = try_get( data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) if tabs: - return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + return self._extract_from_tabs(item_id, webpage, data, tabs) playlist = try_get( data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) if playlist: diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 5ed2946c2..4dd56f66d 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -7,7 +7,9 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( determine_ext, + float_or_none, int_or_none, + merge_dicts, NO_DEFAULT, orderedSet, parse_codecs, @@ -21,49 +23,17 @@ from ..utils import ( class ZDFBaseIE(InfoExtractor): - def _call_api(self, url, player, referrer, video_id, item): - return self._download_json( - url, video_id, 'Downloading JSON %s' % item, - headers={ - 'Referer': referrer, - 'Api-Auth': 'Bearer %s' % player['apiToken'], - }) - - def _extract_player(self, webpage, video_id, fatal=True): - return self._parse_json( - self._search_regex( - r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage, - 'player JSON', default='{}' 
if not fatal else NO_DEFAULT, - group='json'), - video_id) - - -class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') _GEO_COUNTRIES = ['DE'] + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') - _TESTS = [{ - 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', - 'info_dict': { - 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', - 'ext': 'mp4', - 'title': 'Die Magie der Farben (2/2)', - 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', - 'duration': 2615, - 'timestamp': 1465021200, - 'upload_date': '20160604', - }, - }, { - 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', - 'only_matching': True, - }] + def _call_api(self, url, video_id, item, api_token=None, referrer=None): + headers = {} + if api_token: + headers['Api-Auth'] = 'Bearer %s' % api_token + if referrer: + headers['Referer'] = referrer + return self._download_json( + url, video_id, 'Downloading JSON %s' % item, headers=headers) @staticmethod def _extract_subtitles(src): @@ -109,20 +79,11 @@ class ZDFIE(ZDFBaseIE): }) formats.append(f) - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] - - t = content['mainVideoContent']['http://zdf.de/rels/target'] - - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') - - if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'ngplayer_2_4') - + def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): ptmd = self._call_api( - urljoin(url, ptmd_path), player, url, video_id, 'metadata') + ptmd_url, video_id, 'metadata', api_token, referrer) + + content_id = ptmd.get('basename') or ptmd_url.split('/')[-1] formats = [] track_uris = set() @@ -140,7 +101,7 @@ class ZDFIE(ZDFBaseIE): continue for track in tracks: self._extract_format( - video_id, formats, track_uris, { + content_id, formats, track_uris, { 'url': track.get('uri'), 'type': f.get('type'), 'mimeType': f.get('mimeType'), @@ -149,6 +110,103 @@ class ZDFIE(ZDFBaseIE): }) self._sort_formats(formats) + duration = float_or_none(try_get( + ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) + + return { + 'extractor_key': ZDFIE.ie_key(), + 'id': content_id, + 'duration': duration, + 'formats': formats, + 'subtitles': self._extract_subtitles(ptmd), + } + + def _extract_player(self, webpage, video_id, fatal=True): + return self._parse_json( + self._search_regex( + r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage, + 'player JSON', default='{}' if not fatal else NO_DEFAULT, + group='json'), + video_id) + + +class ZDFIE(ZDFBaseIE): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' + _TESTS = [{ + # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext':
'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613948400, + 'upload_date': '20210221', + }, + }, { + # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html + 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', + 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + 'info_dict': { + 'id': '141007_ab18_10wochensommer_film', + 'ext': 'mp4', + 'title': 'Ab 18! - 10 Wochen Sommer', + 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', + 'duration': 2660, + 'timestamp': 1608604200, + 'upload_date': '20201222', + }, + }, { + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + 'info_dict': { + 'id': '151025_magie_farben2_tex', + 'ext': 'mp4', + 'title': 'Die Magie der Farben (2/2)', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615, + 'timestamp': 1465021200, + 'upload_date': '20160604', + }, + }, { + # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html + 'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids + 'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', + 'only_matching': True, + }] + + def _extract_entry(self, url, player, content, video_id): + title = content.get('title') or content['teaserHeadline'] + + t = content['mainVideoContent']['http://zdf.de/rels/target'] + + ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + + if not ptmd_path: + ptmd_path = t[ + 'http://zdf.de/rels/streams/ptmd-template'].replace( + '{playerId}', 'ngplayer_2_4') + + info = self._extract_ptmd( + urljoin(url, ptmd_path), video_id, player['apiToken'], url) + thumbnails = [] layouts = try_get( content, lambda x: x['teaserImageRef']['layouts'], dict) @@ -169,33 +227,33 @@ class ZDFIE(ZDFBaseIE): }) thumbnails.append(thumbnail) - return { - 'id': video_id, + return merge_dicts(info, { 'title': title, 'description': content.get('leadParagraph') or content.get('teasertext'), 'duration': int_or_none(t.get('duration')), 'timestamp': unified_timestamp(content.get('editorialDate')), 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(ptmd), - 'formats': formats, - } + }) def _extract_regular(self, url, player, video_id): content = self._call_api( - player['content'], player, url, video_id, 'content') + player['content'], video_id, 'content', player['apiToken'], url) return self._extract_entry(player['content'], player, content, video_id) def _extract_mobile(self, video_id): - document = self._download_json( + video = self._download_json( 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, - video_id)['document'] + video_id) + + document = 
video['document'] title = document['titel'] + content_id = document['basename'] formats = [] format_urls = set() for f in document['formitaeten']: - self._extract_format(video_id, formats, format_urls, f) + self._extract_format(content_id, formats, format_urls, f) self._sort_formats(formats) thumbnails = [] @@ -213,12 +271,12 @@ class ZDFIE(ZDFBaseIE): }) return { - 'id': video_id, + 'id': content_id, 'title': title, 'description': document.get('beschreibung'), 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(try_get( - document, lambda x: x['meta']['editorialDate'], compat_str)), + 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( + try_get(video, lambda x: x['meta']['editorialDate'], compat_str)), 'thumbnails': thumbnails, 'subtitles': self._extract_subtitles(document), 'formats': formats, diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index adfdcaabf..207c04f5e 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -1,93 +1,94 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - update_url_query, ) -class ZingMp3BaseInfoExtractor(InfoExtractor): +class ZingMp3BaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P<id>\w+)\.html' + _GEO_COUNTRIES = ['VN'] - def _extract_item(self, item, page_type, fatal=True): - error_message = item.get('msg') - if error_message: - if not fatal: - return - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), - expected=True) + def _extract_item(self, item, fatal): + item_id = item['id'] + title = item.get('name') or item['title'] formats = [] - for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])): - if not source_url or source_url == 'require vip': + for k, v in (item.get('source') or {}).items(): + if not v: continue - if not re.match(r'https?://', source_url): - source_url = '//' + source_url - source_url = self._proto_relative_url(source_url, 'http:') - quality_num = int_or_none(quality) - f = { - 'format_id': quality, - 'url': source_url, - } - if page_type == 'video': - f.update({ - 'height': quality_num, - 'ext': 'mp4', - }) + if k in ('mp4', 'hls'): + for res, video_url in v.items(): + if not video_url: + continue + if k == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, item_id, 'mp4', + 'm3u8_native', m3u8_id=k, fatal=False)) + elif k == 'mp4': + formats.append({ + 'format_id': 'mp4-' + res, + 'url': video_url, + 'height': int_or_none(self._search_regex( + r'^(\d+)p', res, 'resolution', default=None)), + }) else: - f.update({ - 'abr': quality_num, + formats.append({ 'ext': 'mp3', + 'format_id': k, + 'tbr': int_or_none(k), + 'url': self._proto_relative_url(v), + 'vcodec': 'none', }) - formats.append(f) + if not formats: + if not fatal: + return + msg = item['msg'] + if msg == 'Sorry, this content is not available in your country.': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(msg, expected=True) + self._sort_formats(formats) - cover = item.get('cover') + subtitles = None + lyric = item.get('lyric') + if lyric: + subtitles = { + 'origin': [{ + 'url': lyric, + }], + } + + album = item.get('album') or {} return { - 'title': (item.get('name') or item.get('title')).strip(), + 'id': item_id, + 'title': title, 'formats': formats, -
'thumbnail': 'http:/' + cover if cover else None, - 'artist': item.get('artist'), + 'thumbnail': item.get('thumbnail'), + 'subtitles': subtitles, + 'duration': int_or_none(item.get('duration')), + 'track': title, + 'artist': item.get('artists_names'), + 'album': album.get('name') or album.get('title'), + 'album_artist': album.get('artists_names'), } - def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None): - player_json = self._download_json(player_json_url, id, 'Downloading Player JSON') - items = player_json['data'] - if 'item' in items: - items = items['item'] - - if len(items) == 1: - # one single song - data = self._extract_item(items[0], page_type) - data['id'] = id - - return data - else: - # playlist of songs - entries = [] - - for i, item in enumerate(items, 1): - entry = self._extract_item(item, page_type, fatal=False) - if not entry: - continue - entry['id'] = '%s-%d' % (id, i) - entries.append(entry) - - return { - '_type': 'playlist', - 'id': id, - 'title': playlist_title, - 'entries': entries, - } + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage( + url.replace('://zingmp3.vn/', '://mp3.zing.vn/'), + page_id, query={'play_song': 1}) + data_path = self._search_regex( + r'data-xml="([^"]+)', webpage, 'data path') + return self._process_data(self._download_json( + 'https://mp3.zing.vn/xhr' + data_path, page_id)['data']) -class ZingMp3IE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P<id>\w+)\.html' +class ZingMp3IE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip' _TESTS = [{ 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'md5': 'ead7ae13693b3205cbc89536a077daed', @@ -95,49 +96,66 @@ class ZingMp3IE(ZingMp3BaseInfoExtractor): 'id': 'ZWZB9WAB', 'title': 'Xa Mãi Xa', 'ext': 'mp3', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.+\.jpg', + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }] + }, + 'duration': 255, + 'track': 'Xa Mãi Xa', + 'artist': 'Bảo Thy', + 'album': 'Special Album', + 'album_artist': 'Bảo Thy', }, }, { - 'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html', - 'md5': '870295a9cd8045c0e15663565902618d', + 'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html', + 'md5': 'e9c972b693aa88301ef981c8151c4343', 'info_dict': { - 'id': 'ZW6BAEA0', - 'title': 'Let It Go (Frozen OST)', + 'id': 'ZO8ZF7C7', + 'title': 'Sương Hoa Đưa Lối', 'ext': 'mp4', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 207, + 'track': 'Sương Hoa Đưa Lối', + 'artist': 'K-ICM, RYO', }, }, { - 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', - 'info_dict': { - '_type': 'playlist', - 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless', - }, - 'playlist_count': 10, - 'skip': 'removed at the request of the owner', - }, { - 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', + 'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'only_matching': True, }] IE_NAME = 'zingmp3' IE_DESC = 'mp3.zing.vn' - def _real_extract(self, url): - page_id = self._match_id(url) + def _process_data(self, data): + return self._extract_item(data, True) - webpage = self._download_webpage(url, page_id) - player_json_url = self._search_regex([ - r'data-xml="([^"]+)', - r'&xmlURL=([^&]+)&' - ], webpage, 'player xml url') +class 
ZingMp3AlbumIE(ZingMp3BaseIE): _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'album|playlist' _TESTS = [{ 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', 'info_dict': { '_type': 'playlist', 'id': 'ZWZBWDAF', 'title': 'Lâu Đài Tình Ái', }, 'playlist_count': 10, }, { 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', 'only_matching': True, }, { 'url': 'https://zingmp3.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', 'only_matching': True, }] IE_NAME = 'zingmp3:album' - playlist_title = None - page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type') - if page_type == 'video': - player_json_url = update_url_query(player_json_url, {'format': 'json'}) - else: - player_json_url = player_json_url.replace('/xml/', '/html5xml/') - if page_type == 'album': - playlist_title = self._og_search_title(webpage) - - return self._extract_player_json(player_json_url, page_id, page_type, playlist_title) + def _process_data(self, data): + def entries(): + for item in (data.get('items') or []): + entry = self._extract_item(item, False) + if entry: + yield entry + info = data.get('info') or {} + return self.playlist_result( + entries(), info.get('id'), info.get('name') or info.get('title')) diff --git a/youtube_dl/extractor/zoom.py b/youtube_dl/extractor/zoom.py new file mode 100644 index 000000000..db073d91d --- /dev/null +++ b/youtube_dl/extractor/zoom.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + parse_filesize, + urlencode_postdata, +) + + +class ZoomIE(InfoExtractor): + IE_NAME = 'zoom' + _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9_.-]+)' + _TEST = { + 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'md5': 'ab445e8c911fddc4f9adc842c2c5d434', + 'info_dict': { + 'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'ext': 'mp4', + 'title': 'China\'s "two sessions" and the new five-year plan', + } + } + + def _real_extract(self, url): + base_url, play_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, play_id) + + try: + form = self._form_hidden_inputs('password_form', webpage) + except ExtractorError: + form = None + if form: + password = self._downloader.params.get('videopassword') + if not password: + raise ExtractorError( + 'This video is protected by a passcode, use the --video-password option', expected=True) + is_meeting = form.get('useWhichPasswd') == 'meeting' + validation = self._download_json( + base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''), + play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({ + 'id': form[('meet' if is_meeting else 'file') + 'Id'], + 'passwd': password, + 'action': form.get('action'), + })) + if not validation.get('status'): + raise ExtractorError(validation['errorMessage'], expected=True) + webpage = self._download_webpage(url, play_id) + + data = self._parse_json(self._search_regex( + r'(?s)window\.__data__\s*=\s*({.+?});', + webpage, 'data'), play_id, js_to_json) + + return { + 'id': play_id, + 'title': data['topic'], + 'url': data['viewMp4Url'], + 'width': int_or_none(data.get('viewResolvtionsWidth')), +
'height': int_or_none(data.get('viewResolvtionsHeight')), + 'http_headers': { + 'Referer': base_url, + }, + 'filesize_approx': parse_filesize(data.get('fileSize')), + } diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 79d2be625..a6b1b8dce 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.10' +__version__ = '2021.04.07'
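Illustrative sketch (not part of the patch above): the _initialize_consent hook added to youtube.py pre-sets YouTube's CONSENT cookie so that requests from EU IPs are not bounced to the cookie-consent interstitial. A minimal standalone version of the same idea follows, assuming the third-party requests library purely for brevity (youtube-dl itself drives this through its own urllib-based cookie jar); the cookie name, the PENDING+<id> format and the 'YES+cb.20210328-17-p0.en+FX+%s' value are taken from the hunk, everything else here is an assumption for illustration.

    import random
    import re
    import requests

    session = requests.Session()
    session.get('https://www.youtube.com/')

    consent = session.cookies.get('CONSENT') or ''
    if 'YES' not in consent:
        # Reuse the server-issued pending consent id when present,
        # mirroring the PENDING+(\d+) regex in the patch; otherwise
        # fall back to a random three-digit id, as the patch does.
        mobj = re.search(r'PENDING\+(\d+)', consent)
        consent_id = mobj.group(1) if mobj else random.randint(100, 999)
        session.cookies.set(
            'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id,
            domain='.youtube.com')
    # Later requests made through this session should bypass the
    # consent interstitial.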