diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 768d45fc1..19b750f86 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.08 + [debug] youtube-dl version 2021.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 2bd90da57..8acb80b60 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 272895b47..66edcf752 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 608fcfba4..18203fb34 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.08 + [debug] 
youtube-dl version 2021.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index d085ab1ef..20df40cc5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.08** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/AUTHORS b/AUTHORS index b507cb8df..4a6d7dacd 100644 --- a/AUTHORS +++ b/AUTHORS @@ -246,3 +246,4 @@ Enes Solak Nathan Rossi Thomas van der Berg Luca Cherubin +Adrian Heine \ No newline at end of file diff --git a/ChangeLog b/ChangeLog index 3629c4fb8..784b73d8d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,107 @@ +version 2021.02.04.1 + +Extractors +* [youtube] Prefer DASH formats (#28070) +* [azmedien] Fix extraction (#28064) + + +version 2021.02.04 + +Extractors +* [pornhub] Implement lazy playlist extraction +* [svtplay] Fix video id extraction (#28058) ++ [pornhub] Add support for authentication (#18797, #21416, #24294) +* [pornhub:user] Improve paging ++ [pornhub:user] Add support for URLs unavailable via /videos page (#27853) ++ [bravotv] Add support for oxygen.com (#13357, #22500) ++ [youtube] Pass embed URL to get_video_info request +* [ccma] Improve metadata extraction (#27994) + + Extract age limit, alt title, categories, series and episode number + * Fix timestamp multiple subtitles extraction +* [egghead] Update API domain (#28038) +- [vidzi] Remove extractor (#12629) +* [vidio] Improve metadata extraction +* [youtube] Improve subtitles extraction +* [youtube] Fix chapter extraction fallback +* [youtube] Rewrite extractor + * Improve format sorting + * Remove unused code + * Fix series metadata extraction + * Fix trailer video extraction + * Improve error reporting + + Extract video location ++ [vvvvid] Add support for youtube embeds (#27825) +* [googledrive] Report download page errors (#28005) +* [vlive] Fix error message decoding for python 2 (#28004) +* [youtube] Improve DASH formats file size extraction +* [cda] Improve birth validation detection (#14022, #27929) ++ [awaan] Extract uploader id (#27963) ++ [medialaan] Add support DPG Media MyChannels based websites (#14871, #15597, + #16106, #16489) +* [abcnews] Fix extraction (#12394, #27920) +* [AMP] Fix upload date and timestamp extraction (#27970) +* [tv4] Relax URL regular expression (#27964) ++ [tv2] Add support for mtvuutiset.fi (#27744) +* [adn] Improve login warning reporting +* [zype] Fix uplynk id extraction (#27956) ++ [adn] Add support for authentication (#17091, #27841, #27937) + + +version 2021.01.24.1 + +Core +* Introduce --output-na-placeholder (#27896) + +Extractors +* [franceculture] Make thumbnail optional (#18807) +* [franceculture] Fix extraction (#27891, #27903) +* [njpwworld] Fix extraction (#27890) +* [comedycentral] Fix extraction (#27905) +* [wat] Fix format extraction (#27901) ++ [americastestkitchen:season] Add support for seasons (#27861) ++ [trovo] Add support for trovo.live (#26125) ++ [aol] Add support for yahoo videos (#26650) +* [yahoo] Fix single video extraction +* [lbry] Unescape lbry URI (#27872) +* [9gag] Fix and improve extraction 
(#23022) +* [americastestkitchen] Improve metadata extraction for ATK episodes (#27860) +* [aljazeera] Fix extraction (#20911, #27779) ++ [minds] Add support for minds.com (#17934) +* [ard] Fix title and description extraction (#27761) ++ [spotify] Add support for Spotify Podcasts (#27443) + + +version 2021.01.16 + +Core +* [YoutubeDL] Protect from infinite recursion due to recursively nested + playlists (#27833) +* [YoutubeDL] Ignore failure to create existing directory (#27811) +* [YoutubeDL] Raise syntax error for format selection expressions with multiple + + operators (#27803) + +Extractors ++ [animeondemand] Add support for lazy playlist extraction (#27829) +* [youporn] Restrict fallback download URL (#27822) +* [youporn] Improve height and tbr extraction (#20425, #23659) +* [youporn] Fix extraction (#27822) ++ [twitter] Add support for unified cards (#27826) ++ [twitch] Add Authorization header with OAuth token for GraphQL requests + (#27790) +* [mixcloud:playlist:base] Extract video id in flat playlist mode (#27787) +* [cspan] Improve info extraction (#27791) +* [adn] Improve info extraction +* [adn] Fix extraction (#26963, #27732) +* [youtube:search] Extract from all sections (#27604) +* [youtube:search] fix viewcount and try to extract all video sections (#27604) +* [twitch] Improve login error extraction +* [twitch] Fix authentication (#27743) +* [3qsdn] Improve extraction (#21058) +* [peertube] Extract formats from streamingPlaylists (#26002, #27586, #27728) +* [khanacademy] Fix extraction (#2887, #26803) +* [spike] Update Paramount Network feed URL (#27715) + + version 2021.01.08 Core diff --git a/README.md b/README.md index 85fed6d3a..94c34d89a 100644 --- a/README.md +++ b/README.md @@ -52,394 +52,431 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo youtube-dl [OPTIONS] URL [URL...] # OPTIONS - -h, --help Print this help text and exit - --version Print program version and exit - -U, --update Update this program to latest version. Make - sure that you have sufficient permissions - (run with sudo if needed) - -i, --ignore-errors Continue on download errors, for example to - skip unavailable videos in a playlist - --abort-on-error Abort downloading of further videos (in the - playlist or the command line) if an error - occurs - --dump-user-agent Display the current browser identification - --list-extractors List all supported extractors - --extractor-descriptions Output descriptions of all supported - extractors - --force-generic-extractor Force extraction to use the generic - extractor - --default-search PREFIX Use this prefix for unqualified URLs. For - example "gvsearch2:" downloads two videos - from google videos for youtube-dl "large - apple". Use the value "auto" to let - youtube-dl guess ("auto_warning" to emit a - warning when guessing). "error" just throws - an error. The default value "fixup_error" - repairs broken URLs, but emits an error if - this is not possible instead of searching. - --ignore-config Do not read configuration files. When given - in the global configuration file - /etc/youtube-dl.conf: Do not read the user - configuration in ~/.config/youtube- - dl/config (%APPDATA%/youtube-dl/config.txt - on Windows) - --config-location PATH Location of the configuration file; either - the path to the config or its containing - directory. - --flat-playlist Do not extract the videos of a playlist, - only list them. 
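(An aside on the help text being re-wrapped here: these switches are the CLI surface, but when youtube-dl is embedded as a library the same knobs are plain `YoutubeDL` params. A minimal sketch, assuming only the long-standing `default_search`, `ignoreerrors` and `extract_flat` parameters documented in `YoutubeDL`'s docstring:)

```python
from youtube_dl import YoutubeDL

# Rough CLI-to-params mapping (names from YoutubeDL's docstring):
#   --default-search PREFIX -> default_search
#   -i, --ignore-errors     -> ignoreerrors
#   --flat-playlist         -> extract_flat='in_playlist'
ydl = YoutubeDL({
    'default_search': 'gvsearch2:',  # unqualified input becomes a search
    'ignoreerrors': True,            # skip broken entries instead of aborting
    'extract_flat': 'in_playlist',   # list playlist entries without resolving them
})
result = ydl.extract_info('large apple', download=False)
print(result.get('_type'), [e.get('title') for e in result.get('entries', [])])
```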
- --mark-watched Mark videos watched (YouTube only) - --no-mark-watched Do not mark videos watched (YouTube only) - --no-color Do not emit color codes in output + -h, --help Print this help text and exit + --version Print program version and exit + -U, --update Update this program to latest version. + Make sure that you have sufficient + permissions (run with sudo if needed) + -i, --ignore-errors Continue on download errors, for + example to skip unavailable videos in a + playlist + --abort-on-error Abort downloading of further videos (in + the playlist or the command line) if an + error occurs + --dump-user-agent Display the current browser + identification + --list-extractors List all supported extractors + --extractor-descriptions Output descriptions of all supported + extractors + --force-generic-extractor Force extraction to use the generic + extractor + --default-search PREFIX Use this prefix for unqualified URLs. + For example "gvsearch2:" downloads two + videos from google videos for youtube- + dl "large apple". Use the value "auto" + to let youtube-dl guess ("auto_warning" + to emit a warning when guessing). + "error" just throws an error. The + default value "fixup_error" repairs + broken URLs, but emits an error if this + is not possible instead of searching. + --ignore-config Do not read configuration files. When + given in the global configuration file + /etc/youtube-dl.conf: Do not read the + user configuration in + ~/.config/youtube-dl/config + (%APPDATA%/youtube-dl/config.txt on + Windows) + --config-location PATH Location of the configuration file; + either the path to the config or its + containing directory. + --flat-playlist Do not extract the videos of a + playlist, only list them. + --mark-watched Mark videos watched (YouTube only) + --no-mark-watched Do not mark videos watched (YouTube + only) + --no-color Do not emit color codes in output ## Network Options: - --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. - To enable SOCKS proxy, specify a proper - scheme. For example - socks5://127.0.0.1:1080/. Pass in an empty - string (--proxy "") for direct connection - --socket-timeout SECONDS Time to wait before giving up, in seconds - --source-address IP Client-side IP address to bind to - -4, --force-ipv4 Make all connections via IPv4 - -6, --force-ipv6 Make all connections via IPv6 + --proxy URL Use the specified HTTP/HTTPS/SOCKS + proxy. To enable SOCKS proxy, specify a + proper scheme. For example + socks5://127.0.0.1:1080/. Pass in an + empty string (--proxy "") for direct + connection + --socket-timeout SECONDS Time to wait before giving up, in + seconds + --source-address IP Client-side IP address to bind to + -4, --force-ipv4 Make all connections via IPv4 + -6, --force-ipv6 Make all connections via IPv6 ## Geo Restriction: - --geo-verification-proxy URL Use this proxy to verify the IP address for - some geo-restricted sites. The default - proxy specified by --proxy (or none, if the - option is not present) is used for the - actual downloading. 
- --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header - --no-geo-bypass Do not bypass geographic restriction via - faking X-Forwarded-For HTTP header - --geo-bypass-country CODE Force bypass geographic restriction with - explicitly provided two-letter ISO 3166-2 - country code - --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with - explicitly provided IP block in CIDR - notation + --geo-verification-proxy URL Use this proxy to verify the IP address + for some geo-restricted sites. The + default proxy specified by --proxy (or + none, if the option is not present) is + used for the actual downloading. + --geo-bypass Bypass geographic restriction via + faking X-Forwarded-For HTTP header + --no-geo-bypass Do not bypass geographic restriction + via faking X-Forwarded-For HTTP header + --geo-bypass-country CODE Force bypass geographic restriction + with explicitly provided two-letter ISO + 3166-2 country code + --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction + with explicitly provided IP block in + CIDR notation ## Video Selection: - --playlist-start NUMBER Playlist video to start at (default is 1) - --playlist-end NUMBER Playlist video to end at (default is last) - --playlist-items ITEM_SPEC Playlist video items to download. Specify - indices of the videos in the playlist - separated by commas like: "--playlist-items - 1,2,5,8" if you want to download videos - indexed 1, 2, 5, 8 in the playlist. You can - specify range: "--playlist-items - 1-3,7,10-13", it will download the videos - at index 1, 2, 3, 7, 10, 11, 12 and 13. - --match-title REGEX Download only matching titles (regex or - caseless sub-string) - --reject-title REGEX Skip download for matching titles (regex or - caseless sub-string) - --max-downloads NUMBER Abort after downloading NUMBER files - --min-filesize SIZE Do not download any videos smaller than - SIZE (e.g. 50k or 44.6m) - --max-filesize SIZE Do not download any videos larger than SIZE - (e.g. 50k or 44.6m) - --date DATE Download only videos uploaded in this date - --datebefore DATE Download only videos uploaded on or before - this date (i.e. inclusive) - --dateafter DATE Download only videos uploaded on or after - this date (i.e. inclusive) - --min-views COUNT Do not download any videos with less than - COUNT views - --max-views COUNT Do not download any videos with more than - COUNT views - --match-filter FILTER Generic video filter. Specify any key (see - the "OUTPUT TEMPLATE" for a list of - available keys) to match if the key is - present, !key to check if the key is not - present, key > NUMBER (like "comment_count - > 12", also works with >=, <, <=, !=, =) to - compare against a number, key = 'LITERAL' - (like "uploader = 'Mike Smith'", also works - with !=) to match against a string literal - and & to require multiple matches. Values - which are not known are excluded unless you - put a question mark (?) after the operator. - For example, to only match videos that have - been liked more than 100 times and disliked - less than 50 times (or the dislike - functionality is not available at the given - service), but who also have a description, - use --match-filter "like_count > 100 & - dislike_count NUMBER (like + "comment_count > 12", also works with + >=, <, <=, !=, =) to compare against a + number, key = 'LITERAL' (like "uploader + = 'Mike Smith'", also works with !=) to + match against a string literal and & to + require multiple matches. 
Values which + are not known are excluded unless you + put a question mark (?) after the + operator. For example, to only match + videos that have been liked more than + 100 times and disliked less than 50 + times (or the dislike functionality is + not available at the given service), + but who also have a description, use + --match-filter "like_count > 100 & + dislike_count .+?) - (?P.+)" - --xattrs Write metadata to the video file's xattrs - (using dublin core and xdg standards) - --fixup POLICY Automatically correct known faults of the - file. One of never (do nothing), warn (only - emit a warning), detect_or_warn (the - default; fix file if we can, warn - otherwise) - --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors - --prefer-ffmpeg Prefer ffmpeg over avconv for running the - postprocessors (default) - --ffmpeg-location PATH Location of the ffmpeg/avconv binary; - either the path to the binary or its - containing directory. - --exec CMD Execute a command on the file after - downloading and post-processing, similar to - find's -exec syntax. Example: --exec 'adb - push {} /sdcard/Music/ && rm {}' - --convert-subs FORMAT Convert the subtitles to other format - (currently supported: srt|ass|vtt|lrc) + -x, --extract-audio Convert video files to audio-only files + (requires ffmpeg/avconv and + ffprobe/avprobe) + --audio-format FORMAT Specify audio format: "best", "aac", + "flac", "mp3", "m4a", "opus", "vorbis", + or "wav"; "best" by default; No effect + without -x + --audio-quality QUALITY Specify ffmpeg/avconv audio quality, + insert a value between 0 (better) and 9 + (worse) for VBR or a specific bitrate + like 128K (default 5) + --recode-video FORMAT Encode the video to another format if + necessary (currently supported: + mp4|flv|ogg|webm|mkv|avi) + --postprocessor-args ARGS Give these arguments to the + postprocessor + -k, --keep-video Keep the video file on disk after the + post-processing; the video is erased by + default + --no-post-overwrites Do not overwrite post-processed files; + the post-processed files are + overwritten by default + --embed-subs Embed subtitles in the video (only for + mp4, webm and mkv videos) + --embed-thumbnail Embed thumbnail in the audio as cover + art + --add-metadata Write metadata to the video file + --metadata-from-title FORMAT Parse additional metadata like song + title / artist from the video title. + The format syntax is the same as + --output. Regular expression with named + capture groups may also be used. The + parsed parameters replace existing + values. Example: --metadata-from-title + "%(artist)s - %(title)s" matches a + title like "Coldplay - Paradise". + Example (regex): --metadata-from-title + "(?P<artist>.+?) - (?P<title>.+)" + --xattrs Write metadata to the video file's + xattrs (using dublin core and xdg + standards) + --fixup POLICY Automatically correct known faults of + the file. One of never (do nothing), + warn (only emit a warning), + detect_or_warn (the default; fix file + if we can, warn otherwise) + --prefer-avconv Prefer avconv over ffmpeg for running + the postprocessors + --prefer-ffmpeg Prefer ffmpeg over avconv for running + the postprocessors (default) + --ffmpeg-location PATH Location of the ffmpeg/avconv binary; + either the path to the binary or its + containing directory. + --exec CMD Execute a command on the file after + downloading and post-processing, + similar to find's -exec syntax. 
+ Example: --exec 'adb push {} + /sdcard/Music/ && rm {}' + --convert-subs FORMAT Convert the subtitles to other format + (currently supported: srt|ass|vtt|lrc) # CONFIGURATION @@ -583,7 +620,7 @@ Available for the media that is a track or a part of a music album: - `disc_number` (numeric): Number of the disc or other physical medium the track belongs to - `release_year` (numeric): Year (YYYY) when the album was released -Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with `NA`. +Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default). For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `youtube-dl test video` and id `BaW_jenozKcj`, this will result in a `youtube-dl test video-BaW_jenozKcj.mp4` file created in the current directory. diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3a49043fa..e1b85b1d1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -46,10 +46,11 @@ - **Amara** - **AMCNetworks** - **AmericasTestKitchen** + - **AmericasTestKitchenSeason** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **AnimeOnDemand** - **Anvato** - - **aol.com** + - **aol.com**: Yahoo screen and movies - **APA** - **Aparat** - **AppleConnect** @@ -192,8 +193,6 @@ - **CNNArticle** - **CNNBlogs** - **ComedyCentral** - - **ComedyCentralFullEpisodes** - - **ComedyCentralShortname** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **CONtv** @@ -418,7 +417,8 @@ - **Katsomo** - **KeezMovies** - **Ketnet** - - **KhanAcademy** + - **khanacademy** + - **khanacademy:unit** - **KickStarter** - **KinjaEmbed** - **KinoPoisk** @@ -505,6 +505,9 @@ - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** + - **minds** + - **minds:channel** + - **minds:group** - **MinistryGrid** - **Minoto** - **miomio.tv** @@ -534,6 +537,7 @@ - **mtv:video** - **mtvjapan** - **mtvservices:embedded** + - **MTVUutisetArticle** - **MuenchenTV**: münchen.tv - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses @@ -858,6 +862,8 @@ - **Sport5** - **SportBox** - **SportDeutschland** + - **spotify** + - **spotify:show** - **Spreaker** - **SpreakerPage** - **SpreakerShow** @@ -939,12 +945,13 @@ - **TNAFlixNetworkEmbed** - **toggle** - **ToonGoggles** - - **Tosh**: Tosh.0 - **tou.tv** - **Toypics**: Toypics video - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** + - **Trovo** + - **TrovoVod** - **TruNews** - **TruTV** - **Tube8** @@ -1052,7 +1059,6 @@ - **vidme** - **vidme:user** - **vidme:user:likes** - - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - **viewlift** @@ -1097,6 +1103,7 @@ - **vrv** - **vrv:series** - **VShare** + - **VTM** - **VTXTV** - **vube**: Vube.com 
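(The output-template section above gains `--output-na-placeholder` in this release; its embedding-API twin is the `outtmpl_na_placeholder` param exercised by the `test_YoutubeDL.py` changes just below. A minimal sketch of the user-visible effect — the info dict is hypothetical, the param names are real:)

```python
from youtube_dl import YoutubeDL

# Hypothetical video whose 'upload_date' was never extracted.
info = {'id': 'BaW_jenozKcj', 'ext': 'mp4', 'title': 'youtube-dl test video'}

ydl = YoutubeDL({
    'outtmpl': '%(upload_date)s-%(title)s-%(id)s.%(ext)s',
    'outtmpl_na_placeholder': 'unknown',  # CLI: --output-na-placeholder "unknown"
})
print(ydl.prepare_filename(info))
# -> 'unknown-youtube-dl test video-BaW_jenozKcj.mp4' ('NA-...' by default)
```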
- **VuClip** diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 4d62ba145..a35effe0e 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -633,13 +633,20 @@ class TestYoutubeDL(unittest.TestCase): 'title2': '%PATH%', } - def fname(templ): - ydl = YoutubeDL({'outtmpl': templ}) + def fname(templ, na_placeholder='NA'): + params = {'outtmpl': templ} + if na_placeholder != 'NA': + params['outtmpl_na_placeholder'] = na_placeholder + ydl = YoutubeDL(params) return ydl.prepare_filename(info) self.assertEqual(fname('%(id)s.%(ext)s'), '1234.mp4') self.assertEqual(fname('%(id)s-%(width)s.%(ext)s'), '1234-NA.mp4') - # Replace missing fields with 'NA' - self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4') + NA_TEST_OUTTMPL = '%(uploader_date)s-%(width)d-%(id)s.%(ext)s' + # Replace missing fields with 'NA' by default + self.assertEqual(fname(NA_TEST_OUTTMPL), 'NA-NA-1234.mp4') + # Or by provided placeholder + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder='none'), 'none-none-1234.mp4') + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder=''), '--1234.mp4') self.assertEqual(fname('%(height)d.%(ext)s'), '1080.mp4') self.assertEqual(fname('%(height)6d.%(ext)s'), ' 1080.mp4') self.assertEqual(fname('%(height)-6d.%(ext)s'), '1080 .mp4') diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py deleted file mode 100644 index e69c57377..000000000 --- a/test/test_youtube_chapters.py +++ /dev/null @@ -1,275 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -from __future__ import unicode_literals - -# Allow direct execution -import os -import sys -import unittest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from test.helper import expect_value -from youtube_dl.extractor import YoutubeIE - - -class TestYoutubeChapters(unittest.TestCase): - - _TEST_CASES = [ - ( - # https://www.youtube.com/watch?v=A22oy8dFjqc - # pattern: 00:00 - <title> - '''This is the absolute ULTIMATE experience of Queen's set at LIVE AID, this is the best video mixed to the absolutely superior stereo radio broadcast. This vastly superior audio mix takes a huge dump on all of the official mixes. Best viewed in 1080p. ENJOY! 
***MAKE SURE TO READ THE DESCRIPTION***<br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+36);return false;">00:36</a> - Bohemian Rhapsody<br /><a href="#" onclick="yt.www.watch.player.seekTo(02*60+42);return false;">02:42</a> - Radio Ga Ga<br /><a href="#" onclick="yt.www.watch.player.seekTo(06*60+53);return false;">06:53</a> - Ay Oh!<br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+34);return false;">07:34</a> - Hammer To Fall<br /><a href="#" onclick="yt.www.watch.player.seekTo(12*60+08);return false;">12:08</a> - Crazy Little Thing Called Love<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+03);return false;">16:03</a> - We Will Rock You<br /><a href="#" onclick="yt.www.watch.player.seekTo(17*60+18);return false;">17:18</a> - We Are The Champions<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+12);return false;">21:12</a> - Is This The World We Created...?<br /><br />Short song analysis:<br /><br />- "Bohemian Rhapsody": Although it's a short medley version, it's one of the best performances of the ballad section, with Freddie nailing the Bb4s with the correct studio phrasing (for the first time ever!).<br /><br />- "Radio Ga Ga": Although it's missing one chorus, this is one of - if not the best - the best versions ever, Freddie nails all the Bb4s and sounds very clean! Spike Edney's Roland Jupiter 8 also really shines through on this mix, compared to the DVD releases!<br /><br />- "Audience Improv": A great improv, Freddie sounds strong and confident. You gotta love when he sustains that A4 for 4 seconds!<br /><br />- "Hammer To Fall": Despite missing a verse and a chorus, it's a strong version (possibly the best ever). Freddie sings the song amazingly, and even ad-libs a C#5 and a C5! Also notice how heavy Brian's guitar sounds compared to the thin DVD mixes - it roars!<br /><br />- "Crazy Little Thing Called Love": A great version, the crowd loves the song, the jam is great as well! Only downside to this is the slight feedback issues.<br /><br />- "We Will Rock You": Although cut down to the 1st verse and chorus, Freddie sounds strong. He nails the A4, and the solo from Dr. May is brilliant!<br /><br />- "We Are the Champions": Perhaps the high-light of the performance - Freddie is very daring on this version, he sustains the pre-chorus Bb4s, nails the 1st C5, belts great A4s, but most importantly: He nails the chorus Bb4s, in all 3 choruses! This is the only time he has ever done so! It has to be said though, the last one sounds a bit rough, but that's a side effect of belting high notes for the past 18 minutes, with nodules AND laryngitis!<br /><br />- "Is This The World We Created... ?": Freddie and Brian perform a beautiful version of this, and it is one of the best versions ever. It's both sad and hilarious that a couple of BBC engineers are talking over the song, one of them being completely oblivious of the fact that he is interrupting the performance, on live television... Which was being televised to almost 2 billion homes.<br /><br /><br />All rights go to their respective owners!<br />-----Copyright Disclaimer Under Section 107 of the Copyright Act 1976, allowance is made for fair use for purposes such as criticism, comment, news reporting, teaching, scholarship, and research. Fair use is a use permitted by copyright statute that might otherwise be infringing. 
Non-profit, educational or personal use tips the balance in favor of fair use''', - 1477, - [{ - 'start_time': 36, - 'end_time': 162, - 'title': 'Bohemian Rhapsody', - }, { - 'start_time': 162, - 'end_time': 413, - 'title': 'Radio Ga Ga', - }, { - 'start_time': 413, - 'end_time': 454, - 'title': 'Ay Oh!', - }, { - 'start_time': 454, - 'end_time': 728, - 'title': 'Hammer To Fall', - }, { - 'start_time': 728, - 'end_time': 963, - 'title': 'Crazy Little Thing Called Love', - }, { - 'start_time': 963, - 'end_time': 1038, - 'title': 'We Will Rock You', - }, { - 'start_time': 1038, - 'end_time': 1272, - 'title': 'We Are The Champions', - }, { - 'start_time': 1272, - 'end_time': 1477, - 'title': 'Is This The World We Created...?', - }] - ), - ( - # https://www.youtube.com/watch?v=ekYlRhALiRQ - # pattern: <num>. <title> 0:00 - '1. Those Beaten Paths of Confusion <a href="#" onclick="yt.www.watch.player.seekTo(0*60+00);return false;">0:00</a><br />2. Beyond the Shadows of Emptiness & Nothingness <a href="#" onclick="yt.www.watch.player.seekTo(11*60+47);return false;">11:47</a><br />3. Poison Yourself...With Thought <a href="#" onclick="yt.www.watch.player.seekTo(26*60+30);return false;">26:30</a><br />4. The Agents of Transformation <a href="#" onclick="yt.www.watch.player.seekTo(35*60+57);return false;">35:57</a><br />5. Drowning in the Pain of Consciousness <a href="#" onclick="yt.www.watch.player.seekTo(44*60+32);return false;">44:32</a><br />6. Deny the Disease of Life <a href="#" onclick="yt.www.watch.player.seekTo(53*60+07);return false;">53:07</a><br /><br />More info/Buy: http://crepusculonegro.storenvy.com/products/257645-cn-03-arizmenda-within-the-vacuum-of-infinity<br /><br />No copyright is intended. The rights to this video are assumed by the owner and its affiliates.', - 4009, - [{ - 'start_time': 0, - 'end_time': 707, - 'title': '1. Those Beaten Paths of Confusion', - }, { - 'start_time': 707, - 'end_time': 1590, - 'title': '2. Beyond the Shadows of Emptiness & Nothingness', - }, { - 'start_time': 1590, - 'end_time': 2157, - 'title': '3. Poison Yourself...With Thought', - }, { - 'start_time': 2157, - 'end_time': 2672, - 'title': '4. The Agents of Transformation', - }, { - 'start_time': 2672, - 'end_time': 3187, - 'title': '5. Drowning in the Pain of Consciousness', - }, { - 'start_time': 3187, - 'end_time': 4009, - 'title': '6. 
Deny the Disease of Life', - }] - ), - ( - # https://www.youtube.com/watch?v=WjL4pSzog9w - # pattern: 00:00 <title> - '<a href="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" class="yt-uix-servicelink " data-target-new-window="True" data-servicelink="CDAQ6TgYACITCNf1raqT2dMCFdRjGAod_o0CBSj4HQ" data-url="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" rel="nofollow noopener" target="_blank">https://arizmenda.bandcamp.com/merch/...</a><br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> Christening Unborn Deformities <br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+08);return false;">07:08</a> Taste of Purity<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+16);return false;">16:16</a> Sculpting Sins of a Universal Tongue<br /><a href="#" onclick="yt.www.watch.player.seekTo(24*60+45);return false;">24:45</a> Birth<br /><a href="#" onclick="yt.www.watch.player.seekTo(31*60+24);return false;">31:24</a> Neves<br /><a href="#" onclick="yt.www.watch.player.seekTo(37*60+55);return false;">37:55</a> Libations in Limbo', - 2705, - [{ - 'start_time': 0, - 'end_time': 428, - 'title': 'Christening Unborn Deformities', - }, { - 'start_time': 428, - 'end_time': 976, - 'title': 'Taste of Purity', - }, { - 'start_time': 976, - 'end_time': 1485, - 'title': 'Sculpting Sins of a Universal Tongue', - }, { - 'start_time': 1485, - 'end_time': 1884, - 'title': 'Birth', - }, { - 'start_time': 1884, - 'end_time': 2275, - 'title': 'Neves', - }, { - 'start_time': 2275, - 'end_time': 2705, - 'title': 'Libations in Limbo', - }] - ), - ( - # https://www.youtube.com/watch?v=o3r1sn-t3is - # pattern: <title> 00:00 <note> - 'Download this show in MP3: <a href="http://sh.st/njZKK" class="yt-uix-servicelink " data-url="http://sh.st/njZKK" data-target-new-window="True" data-servicelink="CDAQ6TgYACITCK3j8_6o2dMCFVDCGAoduVAKKij4HQ" rel="nofollow noopener" target="_blank">http://sh.st/njZKK</a><br /><br />Setlist:<br />I-E-A-I-A-I-O <a href="#" onclick="yt.www.watch.player.seekTo(00*60+45);return false;">00:45</a><br />Suite-Pee <a href="#" onclick="yt.www.watch.player.seekTo(4*60+26);return false;">4:26</a> (Incomplete)<br />Attack <a href="#" onclick="yt.www.watch.player.seekTo(5*60+31);return false;">5:31</a> (First live performance since 2011)<br />Prison Song <a href="#" onclick="yt.www.watch.player.seekTo(8*60+42);return false;">8:42</a><br />Know <a href="#" onclick="yt.www.watch.player.seekTo(12*60+32);return false;">12:32</a> (First live performance since 2011)<br />Aerials <a href="#" onclick="yt.www.watch.player.seekTo(15*60+32);return false;">15:32</a><br />Soldier Side - Intro <a href="#" onclick="yt.www.watch.player.seekTo(19*60+13);return false;">19:13</a><br />B.Y.O.B. 
<a href="#" onclick="yt.www.watch.player.seekTo(20*60+09);return false;">20:09</a><br />Soil <a href="#" onclick="yt.www.watch.player.seekTo(24*60+32);return false;">24:32</a><br />Darts <a href="#" onclick="yt.www.watch.player.seekTo(27*60+48);return false;">27:48</a><br />Radio/Video <a href="#" onclick="yt.www.watch.player.seekTo(30*60+38);return false;">30:38</a><br />Hypnotize <a href="#" onclick="yt.www.watch.player.seekTo(35*60+05);return false;">35:05</a><br />Temper <a href="#" onclick="yt.www.watch.player.seekTo(38*60+08);return false;">38:08</a> (First live performance since 1999)<br />CUBErt <a href="#" onclick="yt.www.watch.player.seekTo(41*60+00);return false;">41:00</a><br />Needles <a href="#" onclick="yt.www.watch.player.seekTo(42*60+57);return false;">42:57</a><br />Deer Dance <a href="#" onclick="yt.www.watch.player.seekTo(46*60+27);return false;">46:27</a><br />Bounce <a href="#" onclick="yt.www.watch.player.seekTo(49*60+38);return false;">49:38</a><br />Suggestions <a href="#" onclick="yt.www.watch.player.seekTo(51*60+25);return false;">51:25</a><br />Psycho <a href="#" onclick="yt.www.watch.player.seekTo(53*60+52);return false;">53:52</a><br />Chop Suey! <a href="#" onclick="yt.www.watch.player.seekTo(58*60+13);return false;">58:13</a><br />Lonely Day <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+01*60+15);return false;">1:01:15</a><br />Question! <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+04*60+14);return false;">1:04:14</a><br />Lost in Hollywood <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+08*60+10);return false;">1:08:10</a><br />Vicinity of Obscenity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+13*60+40);return false;">1:13:40</a>(First live performance since 2012)<br />Forest <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+16*60+17);return false;">1:16:17</a><br />Cigaro <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+20*60+02);return false;">1:20:02</a><br />Toxicity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+23*60+57);return false;">1:23:57</a>(with Chino Moreno)<br />Sugar <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+27*60+53);return false;">1:27:53</a>', - 5640, - [{ - 'start_time': 45, - 'end_time': 266, - 'title': 'I-E-A-I-A-I-O', - }, { - 'start_time': 266, - 'end_time': 331, - 'title': 'Suite-Pee (Incomplete)', - }, { - 'start_time': 331, - 'end_time': 522, - 'title': 'Attack (First live performance since 2011)', - }, { - 'start_time': 522, - 'end_time': 752, - 'title': 'Prison Song', - }, { - 'start_time': 752, - 'end_time': 932, - 'title': 'Know (First live performance since 2011)', - }, { - 'start_time': 932, - 'end_time': 1153, - 'title': 'Aerials', - }, { - 'start_time': 1153, - 'end_time': 1209, - 'title': 'Soldier Side - Intro', - }, { - 'start_time': 1209, - 'end_time': 1472, - 'title': 'B.Y.O.B.', - }, { - 'start_time': 1472, - 'end_time': 1668, - 'title': 'Soil', - }, { - 'start_time': 1668, - 'end_time': 1838, - 'title': 'Darts', - }, { - 'start_time': 1838, - 'end_time': 2105, - 'title': 'Radio/Video', - }, { - 'start_time': 2105, - 'end_time': 2288, - 'title': 'Hypnotize', - }, { - 'start_time': 2288, - 'end_time': 2460, - 'title': 'Temper (First live performance since 1999)', - }, { - 'start_time': 2460, - 'end_time': 2577, - 'title': 'CUBErt', - }, { - 'start_time': 2577, - 'end_time': 2787, - 'title': 'Needles', - }, { - 'start_time': 2787, - 'end_time': 2978, - 'title': 'Deer Dance', - }, { - 'start_time': 2978, - 'end_time': 3085, - 'title': 'Bounce', - }, { - 
'start_time': 3085, - 'end_time': 3232, - 'title': 'Suggestions', - }, { - 'start_time': 3232, - 'end_time': 3493, - 'title': 'Psycho', - }, { - 'start_time': 3493, - 'end_time': 3675, - 'title': 'Chop Suey!', - }, { - 'start_time': 3675, - 'end_time': 3854, - 'title': 'Lonely Day', - }, { - 'start_time': 3854, - 'end_time': 4090, - 'title': 'Question!', - }, { - 'start_time': 4090, - 'end_time': 4420, - 'title': 'Lost in Hollywood', - }, { - 'start_time': 4420, - 'end_time': 4577, - 'title': 'Vicinity of Obscenity (First live performance since 2012)', - }, { - 'start_time': 4577, - 'end_time': 4802, - 'title': 'Forest', - }, { - 'start_time': 4802, - 'end_time': 5037, - 'title': 'Cigaro', - }, { - 'start_time': 5037, - 'end_time': 5273, - 'title': 'Toxicity (with Chino Moreno)', - }, { - 'start_time': 5273, - 'end_time': 5640, - 'title': 'Sugar', - }] - ), - ( - # https://www.youtube.com/watch?v=PkYLQbsqCE8 - # pattern: <num> - <title> [<latinized title>] 0:00:00 - '''Затемно (Zatemno) is an Obscure Black Metal Band from Russia.<br /><br />"Во прах (Vo prakh)'' Into The Ashes", Debut mini-album released may 6, 2016, by Death Knell Productions<br />Released on 6 panel digipak CD, limited to 100 copies only<br />And digital format on Bandcamp<br /><br />Tracklist<br /><br />1 - Во прах [Vo prakh] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;">0:00:00</a><br />2 - Искупление [Iskupleniye] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+08*60+10);return false;">0:08:10</a><br />3 - Из серпов луны...[Iz serpov luny] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+14*60+30);return false;">0:14:30</a><br /><br />Links:<br /><a href="https://deathknellprod.bandcamp.com/album/--2" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://deathknellprod.bandcamp.com/album/--2" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://deathknellprod.bandcamp.com/a...</a><br /><a href="https://www.facebook.com/DeathKnellProd/" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://www.facebook.com/DeathKnellProd/" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://www.facebook.com/DeathKnellProd/</a><br /><br /><br />I don't have any right about this artifact, my only intention is to spread the music of the band, all rights are reserved to the Затемно (Zatemno) and his producers, Death Knell Productions.<br /><br />------------------------------------------------------------------<br /><br />Subscribe for more videos like this.<br />My link: <a href="https://web.facebook.com/AttackOfTheDragons" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://web.facebook.com/AttackOfTheDragons" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://web.facebook.com/AttackOfTheD...</a>''', - 1138, - [{ - 'start_time': 0, - 'end_time': 490, - 'title': '1 - Во прах [Vo prakh]', - }, { - 'start_time': 490, - 'end_time': 870, - 'title': '2 - Искупление [Iskupleniye]', - }, { - 'start_time': 870, - 'end_time': 1138, - 'title': '3 - Из серпов луны...[Iz serpov luny]', - }] - ), - ( - # https://www.youtube.com/watch?v=xZW70zEasOk - # time point more than duration - '''● LCS Spring finals: Saturday and Sunday from <a href="#" onclick="yt.www.watch.player.seekTo(13*60+30);return false;">13:30</a> outside the venue! 
<br />● PAX East: Fri, Sat & Sun - more info in tomorrows video on the main channel!''', - 283, - [] - ), - ] - - def test_youtube_chapters(self): - for description, duration, expected_chapters in self._TEST_CASES: - ie = YoutubeIE() - expect_value( - self, ie._extract_chapters_from_description(description, duration), - expected_chapters, None) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 69df30eda..b5a4d0d5f 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -86,13 +86,9 @@ class TestPlayerInfo(unittest.TestCase): ('https://www.youtube.com/yts/jsbin/player-en_US-vflaxXRn1/base.js', 'vflaxXRn1'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), - ('http://s.ytimg.com/yt/swfbin/watch_as3-vflrEm9Nq.swf', 'vflrEm9Nq'), - ('https://s.ytimg.com/yts/swfbin/player-vflenCdZL/watch_as3.swf', 'vflenCdZL'), ) for player_url, expected_player_id in PLAYER_URLS: - expected_player_type = player_url.split('.')[-1] - player_type, player_id = YoutubeIE._extract_player_info(player_url) - self.assertEqual(player_type, expected_player_type) + player_id = YoutubeIE._extract_player_info(player_url) self.assertEqual(player_id, expected_player_id) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 0ed4bc6ba..ecac31f7a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -163,6 +163,7 @@ class YoutubeDL(object): simulate: Do not download the video files. format: Video format code. See options.py for more information. outtmpl: Template for output names. + outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. force_generic_extractor: Force downloader to use the generic extractor @@ -338,6 +339,8 @@ class YoutubeDL(object): _pps = [] _download_retcode = None _num_downloads = None + _playlist_level = 0 + _playlist_urls = set() _screen_file = None def __init__(self, params=None, auto_init=True): @@ -656,7 +659,7 @@ class YoutubeDL(object): template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) for k, v in template_dict.items() if v is not None and not isinstance(v, (list, tuple, dict))) - template_dict = collections.defaultdict(lambda: 'NA', template_dict) + template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) @@ -676,8 +679,8 @@ class YoutubeDL(object): # Missing numeric fields used together with integer presentation types # in format specification will break the argument substitution since - # string 'NA' is returned for missing fields. We will patch output - # template for missing fields to meet string presentation type. + # string NA placeholder is returned for missing fields. We will patch + # output template for missing fields to meet string presentation type. 
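(The reworded comment above is the heart of the placeholder handling; a standalone sketch of the failure mode it guards against and the template rewrite it announces:)

```python
import collections

info = {'id': '1234', 'ext': 'mp4'}  # 'width' was never extracted
template_dict = collections.defaultdict(lambda: 'NA', info)

# A numeric presentation type rejects the string placeholder...
try:
    '%(width)d-%(id)s.%(ext)s' % template_dict
except TypeError as e:
    print('broken:', e)

# ...so missing numeric fields are patched to the 's' presentation type:
print('%(width)s-%(id)s.%(ext)s' % template_dict)  # -> 'NA-1234.mp4'
```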
for numeric_field in self._NUMERIC_FIELDS: if numeric_field not in template_dict: # As of [1] format syntax is: @@ -906,115 +909,23 @@ class YoutubeDL(object): return self.process_ie_result( new_result, download=download, extra_info=extra_info) elif result_type in ('playlist', 'multi_video'): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - self.to_screen('[download] Downloading playlist: %s' % playlist) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - 1 - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) - - ie_entries = ie_result['entries'] - - def make_playlistitems_entries(list_ie_entries): - num_entries = len(list_ie_entries) - return [ - list_ie_entries[i - 1] for i in playlistitems - if -num_entries <= i - 1 < num_entries] - - def report_download(num_entries): + # Protect from infinite recursion due to recursively nested playlists + # (see https://github.com/ytdl-org/youtube-dl/issues/27833) + webpage_url = ie_result['webpage_url'] + if webpage_url in self._playlist_urls: self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, num_entries)) + '[download] Skipping already downloaded playlist: %s' + % ie_result.get('title') or ie_result.get('id')) + return - if isinstance(ie_entries, list): - n_all_entries = len(ie_entries) - if playlistitems: - entries = make_playlistitems_entries(ie_entries) - else: - entries = ie_entries[playliststart:playlistend] - n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) - elif isinstance(ie_entries, PagedList): - if playlistitems: - entries = [] - for item in playlistitems: - entries.extend(ie_entries.getslice( - item - 1, item - )) - else: - entries = ie_entries.getslice( - playliststart, playlistend) - n_entries = len(entries) - report_download(n_entries) - else: # iterable - if playlistitems: - entries = make_playlistitems_entries(list(itertools.islice( - ie_entries, 0, max(playlistitems)))) - else: - entries = list(itertools.islice( - ie_entries, playliststart, playlistend)) - n_entries = len(entries) - report_download(n_entries) - - if self.params.get('playlistreverse', False): - entries = entries[::-1] - - if self.params.get('playlistrandom', False): - random.shuffle(entries) - - x_forwarded_for = ie_result.get('__x_forwarded_for_ip') - - for i, entry in enumerate(entries, 1): - self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) - # This __x_forwarded_for_ip thing is a bit ugly but requires - # minimal changes - if x_forwarded_for: - entry['__x_forwarded_for_ip'] = x_forwarded_for - extra = { - 'n_entries': n_entries, - 'playlist': playlist, - 'playlist_id': ie_result.get('id'), - 'playlist_title': ie_result.get('title'), - 'playlist_uploader': ie_result.get('uploader'), - 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': 
playlistitems[i - 1] if playlistitems else i + playliststart, - 'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': url_basename(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } - - reason = self._match_entry(entry, incomplete=True) - if reason is not None: - self.to_screen('[download] ' + reason) - continue - - entry_result = self.__process_iterable_entry(entry, download, extra) - # TODO: skip failed (empty) entries? - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results - self.to_screen('[download] Finished downloading playlist: %s' % playlist) - return ie_result + self._playlist_level += 1 + self._playlist_urls.add(webpage_url) + try: + return self.__process_playlist(ie_result, download) + finally: + self._playlist_level -= 1 + if not self._playlist_level: + self._playlist_urls.clear() elif result_type == 'compat_list': self.report_warning( 'Extractor %s returned a compat_list result. ' @@ -1039,6 +950,118 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) + def __process_playlist(self, ie_result, download): + # We process each entry in the playlist + playlist = ie_result.get('title') or ie_result.get('id') + + self.to_screen('[download] Downloading playlist: %s' % playlist) + + playlist_results = [] + + playliststart = self.params.get('playliststart', 1) - 1 + playlistend = self.params.get('playlistend') + # For backwards compatibility, interpret -1 as whole list + if playlistend == -1: + playlistend = None + + playlistitems_str = self.params.get('playlist_items') + playlistitems = None + if playlistitems_str is not None: + def iter_playlistitems(format): + for string_segment in format.split(','): + if '-' in string_segment: + start, end = string_segment.split('-') + for item in range(int(start), int(end) + 1): + yield int(item) + else: + yield int(string_segment) + playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + + ie_entries = ie_result['entries'] + + def make_playlistitems_entries(list_ie_entries): + num_entries = len(list_ie_entries) + return [ + list_ie_entries[i - 1] for i in playlistitems + if -num_entries <= i - 1 < num_entries] + + def report_download(num_entries): + self.to_screen( + '[%s] playlist %s: Downloading %d videos' % + (ie_result['extractor'], playlist, num_entries)) + + if isinstance(ie_entries, list): + n_all_entries = len(ie_entries) + if playlistitems: + entries = make_playlistitems_entries(ie_entries) + else: + entries = ie_entries[playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + elif isinstance(ie_entries, PagedList): + if playlistitems: + entries = [] + for item in playlistitems: + entries.extend(ie_entries.getslice( + item - 1, item + )) + else: + entries = ie_entries.getslice( + playliststart, playlistend) + n_entries = len(entries) + report_download(n_entries) + else: # iterable + if playlistitems: + entries = make_playlistitems_entries(list(itertools.islice( + ie_entries, 0, max(playlistitems)))) + else: + entries = list(itertools.islice( + ie_entries, playliststart, playlistend)) + n_entries = len(entries) + report_download(n_entries) + + if self.params.get('playlistreverse', False): + entries = entries[::-1] + + if self.params.get('playlistrandom', False): + random.shuffle(entries) + + x_forwarded_for = 
ie_result.get('__x_forwarded_for_ip') + + for i, entry in enumerate(entries, 1): + self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for + extra = { + 'n_entries': n_entries, + 'playlist': playlist, + 'playlist_id': ie_result.get('id'), + 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), + 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + + reason = self._match_entry(entry, incomplete=True) + if reason is not None: + self.to_screen('[download] ' + reason) + continue + + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? + playlist_results.append(entry_result) + ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) + return ie_result + @__handle_extraction_exceptions def __process_iterable_entry(self, entry, download, extra_info): return self.process_ie_result( diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9a659fc65..e1bd67919 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -340,6 +340,7 @@ def _real_main(argv=None): 'format': opts.format, 'listformats': opts.listformats, 'outtmpl': outtmpl, + 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder, 'autonumber_size': opts.autonumber_size, 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 8b407bf9c..908c83377 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -1,14 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar import re -import time from .amp import AMPIE from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import compat_urlparse +from ..utils import ( + parse_duration, + parse_iso8601, + try_get, +) class AbcNewsVideoIE(AMPIE): @@ -18,8 +19,8 @@ class AbcNewsVideoIE(AMPIE): (?: abcnews\.go\.com/ (?: - [^/]+/video/(?P<display_id>[0-9a-z-]+)-| - video/embed\?.*?\bid= + (?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-| + video/(?:embed|itemfeed)\?.*?\bid= )| fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ ) @@ -36,6 +37,8 @@ class AbcNewsVideoIE(AMPIE): 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. 
Javad Zarif.', 'duration': 180, 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1380454200, + 'upload_date': '20130929', }, 'params': { # m3u8 download @@ -47,6 +50,12 @@ class AbcNewsVideoIE(AMPIE): }, { 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', 'only_matching': True, + }, { + 'url': 'http://abcnews.go.com/video/itemfeed?id=46979033', + 'only_matching': True, + }, { + 'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761', + 'only_matching': True, }] def _real_extract(self, url): @@ -67,28 +76,23 @@ class AbcNewsIE(InfoExtractor): _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' _TESTS = [{ - 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', + # Youtube Embeds + 'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501', 'info_dict': { - 'id': '10505354', - 'ext': 'flv', - 'display_id': 'dramatic-video-rare-death-job-america', - 'title': 'Occupational Hazards', - 'description': 'Nightline investigates the dangers that lurk at various jobs.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20100428', - 'timestamp': 1272412800, + 'id': '51286501', + 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player", + 'description': 'Billingsley went from a child actor to Hollywood power player.', }, - 'add_ie': ['AbcNewsVideo'], + 'playlist_count': 5, }, { 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', 'info_dict': { 'id': '38897857', 'ext': 'mp4', - 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', 'title': 'Justin Timberlake Drops Hints For Secret Single', 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', - 'upload_date': '20160515', - 'timestamp': 1463329500, + 'upload_date': '20160505', + 'timestamp': 1462442280, }, 'params': { # m3u8 download @@ -100,49 +104,55 @@ class AbcNewsIE(InfoExtractor): }, { 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', 'only_matching': True, + }, { + # inline.type == 'video' + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - video_id = mobj.group('id') + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + story = self._parse_json(self._search_regex( + r"window\['__abcnews__'\]\s*=\s*({.+?});", + webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0] + article_contents = story.get('articleContents') or {} - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') - full_video_url = compat_urlparse.urljoin(url, video_url) + def entries(): + featured_video = story.get('featuredVideo') or {} + feed = try_get(featured_video, lambda x: x['video']['feed']) + if feed: + yield { + '_type': 'url', + 'id': featured_video.get('id'), + 'title': featured_video.get('name'), + 'url': feed, + 'thumbnail': featured_video.get('images'), + 'description': featured_video.get('description'), + 'timestamp': 
parse_iso8601(featured_video.get('uploadDate')), + 'duration': parse_duration(featured_video.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - youtube_url = YoutubeIE._extract_url(webpage) + for inline in (article_contents.get('inlines') or []): + inline_type = inline.get('type') + if inline_type == 'iframe': + iframe_url = try_get(inline, lambda x: x['attrs']['src']) + if iframe_url: + yield self.url_result(iframe_url) + elif inline_type == 'video': + video_id = inline.get('id') + if video_id: + yield { + '_type': 'url', + 'id': video_id, + 'url': 'http://abcnews.go.com/video/embed?id=' + video_id, + 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'), + 'description': inline.get('description'), + 'duration': parse_duration(inline.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - timestamp = None - date_str = self._html_search_regex( - r'<span[^>]+class="timestamp">([^<]+)</span>', - webpage, 'timestamp', fatal=False) - if date_str: - tz_offset = 0 - if date_str.endswith(' ET'): # Eastern Time - tz_offset = -5 - date_str = date_str[:-3] - date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p'] - for date_format in date_formats: - try: - timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) - except ValueError: - continue - if timestamp is not None: - timestamp -= tz_offset * 3600 - - entry = { - '_type': 'url_transparent', - 'ie_key': AbcNewsVideoIE.ie_key(), - 'url': full_video_url, - 'id': video_id, - 'display_id': display_id, - 'timestamp': timestamp, - } - - if youtube_url: - entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] - return self.playlist_result(entries) - - return entry + return self.playlist_result( + entries(), story_id, article_contents.get('headline'), + article_contents.get('subHead')) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index d611ee237..a55ebbcbd 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -26,6 +26,7 @@ from ..utils import ( strip_or_none, try_get, unified_strdate, + urlencode_postdata, ) @@ -51,9 +52,12 @@ class ADNIE(InfoExtractor): } } + _NETRC_MACHINE = 'animedigitalnetwork' _BASE_URL = 'http://animedigitalnetwork.fr' _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' + _HEADERS = {} + _LOGIN_ERR_MESSAGE = 'Unable to log in' _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, @@ -129,19 +133,42 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' }]) return subtitles + def _real_initialize(self): + username, password = self._get_login_info() + if not username: + return + try: + access_token = (self._download_json( + self._API_BASE_URL + 'authentication/login', None, + 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, + data=urlencode_postdata({ + 'password': password, + 'rememberMe': False, + 'source': 'Web', + 'username': username, + })) or {}).get('accessToken') + if access_token: + self._HEADERS = {'authorization': 'Bearer ' + access_token} + except ExtractorError as e: + message = None + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json( + e.cause.read().decode(), None, fatal=False) or {} + message = resp.get('message') or resp.get('code') + 
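+            # A failed login is reported as a warning only (the request above is fatal=False), so extraction can still be attempted without an account.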
self.report_warning(message or self._LOGIN_ERR_MESSAGE) + def _real_extract(self, url): video_id = self._match_id(url) video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id player = self._download_json( video_base_url + 'configuration', video_id, - 'Downloading player config JSON metadata')['player'] + 'Downloading player config JSON metadata', + headers=self._HEADERS)['player'] options = player['options'] user = options['user'] if not user.get('hasAccess'): - raise ExtractorError( - 'This video is only available for paying users', expected=True) - # self.raise_login_required() # FIXME: Login is not implemented + self.raise_login_required() token = self._download_json( user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'), @@ -188,8 +215,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' message = error.get('message') if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) - else: - raise ExtractorError(message) + raise ExtractorError(message) else: raise ExtractorError('Giving up retrying') diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 8e4963131..e55c03fd7 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -252,11 +252,11 @@ class AENetworksShowIE(AENetworksListBaseIE): _TESTS = [{ 'url': 'http://www.history.com/shows/ancient-aliens', 'info_dict': { - 'id': 'SH012427480000', + 'id': 'SERIES1574', 'title': 'Ancient Aliens', 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', }, - 'playlist_mincount': 168, + 'playlist_mincount': 150, }] _RESOURCE = 'series' _ITEMS_KEY = 'episodes' diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index c68be3134..c4f915a3c 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -1,13 +1,16 @@ from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' _TESTS = [{ - 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', + 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', 'info_dict': { 'id': '3792260579001', 'ext': 'mp4', @@ -20,14 +23,34 @@ class AlJazeeraIE(InfoExtractor): 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', }, { - 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', + 'only_matching': True, + }, { + 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _real_extract(self, url): - program_name = self._match_id(url) - webpage = self._download_webpage(url, program_name) - brightcove_id = self._search_regex( - r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 
'BrightcoveNew', brightcove_id) + post_type, name = re.match(self._VALID_URL, url).groups() + post_type = { + 'features': 'post', + 'program': 'episode', + 'videos': 'video', + }[post_type.split('/')[0]] + video = self._download_json( + 'https://www.aljazeera.com/graphql', name, query={ + 'operationName': 'SingleArticleQuery', + 'variables': json.dumps({ + 'name': name, + 'postType': post_type, + }), + }, headers={ + 'wp-site': 'aje', + })['data']['article']['video'] + video_id = video['id'] + account_id = video.get('accountId') or '665003303001' + player_id = video.get('playerId') or 'BkeSH5BDb' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index e20f00fc3..be960c0f9 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -1,13 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, try_get, unified_strdate, + unified_timestamp, ) @@ -22,8 +25,8 @@ class AmericasTestKitchenIE(InfoExtractor): 'ext': 'mp4', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', - 'timestamp': 1523664000, - 'upload_date': '20180414', + 'timestamp': 1523318400, + 'upload_date': '20180410', 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, @@ -33,6 +36,27 @@ class AmericasTestKitchenIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above) + 'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner', + 'md5': '06451608c57651e985a498e69cec17e5', + 'info_dict': { + 'id': '5fbe8c61bda2010001c6763b', + 'title': 'Simple Chicken Dinner', + 'ext': 'mp4', + 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', + 'thumbnail': r're:^https?://', + 'timestamp': 1610755200, + 'upload_date': '20210116', + 'release_date': '20210116', + 'series': "America's Test Kitchen", + 'season_number': 21, + 'episode': 'Simple Chicken Dinner', + 'episode_number': 3, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, @@ -60,7 +84,76 @@ class AmericasTestKitchenIE(InfoExtractor): 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', 'description': clean_html(video.get('description')), + 'timestamp': unified_timestamp(video.get('publishDate')), 'release_date': unified_strdate(video.get('publishDate')), + 'episode_number': int_or_none(episode.get('number')), + 'season_number': int_or_none(episode.get('season')), 'series': try_get(episode, lambda x: x['show']['title']), 'episode': episode.get('title'), } + + +class AmericasTestKitchenSeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)' + _TESTS = [{ + # ATK Season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', + 'info_dict': { + 'id': 'season_1', + 'title': 'Season 1', + }, + 'playlist_count': 13, + }, { + # Cooks Country Season + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'info_dict': { + 'id': 'season_12', + 
'title': 'Season 12', + }, + 'playlist_count': 13, + }] + + def _real_extract(self, url): + show_name, season_number = re.match(self._VALID_URL, url).groups() + season_number = int(season_number) + + slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + + season = 'Season %d' % season_number + + season_search = self._download_json( + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, + season, headers={ + 'Origin': 'https://www.%s.com' % show_name, + 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', + 'X-Algolia-Application-Id': 'Y1FNZXUI30', + }, query={ + 'facetFilters': json.dumps([ + 'search_season_list:' + season, + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ]), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'attributesToHighlight': '', + 'hitsPerPage': 1000, + }) + + def entries(): + for episode in (season_search.get('hits') or []): + search_url = episode.get('search_url') + if not search_url: + continue + yield { + '_type': 'url', + 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'timestamp': unified_timestamp(episode.get('search_document_date')), + 'season_number': season_number, + 'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)), + 'ie_key': AmericasTestKitchenIE.ie_key(), + } + + return self.playlist_result( + entries(), 'season_%d' % season_number, season) diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 7ff098cfa..24c684cad 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, mimetype2ext, parse_iso8601, + unified_timestamp, url_or_none, ) @@ -88,7 +89,7 @@ class AMPIE(InfoExtractor): self._sort_formats(formats) - timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) return { 'id': video_id, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 00ce684d1..54e097d2f 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -116,8 +116,6 @@ class AnimeOnDemandIE(InfoExtractor): r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>', webpage, 'anime description', default=None) - entries = [] - def extract_info(html, video_id, num=None): title, description = [None] * 2 formats = [] @@ -233,7 +231,7 @@ class AnimeOnDemandIE(InfoExtractor): self._sort_formats(info['formats']) f = common_info.copy() f.update(info) - entries.append(f) + yield f # Extract teaser/trailer only when full episode is not available if not info['formats']: @@ -247,7 +245,7 @@ class AnimeOnDemandIE(InfoExtractor): 'title': m.group('title'), 'url': urljoin(url, m.group('href')), }) - entries.append(f) + yield f def extract_episodes(html): for num, episode_html in enumerate(re.findall( @@ -275,7 +273,8 @@ class AnimeOnDemandIE(InfoExtractor): 'episode_number': episode_number, } - extract_entries(episode_html, video_id, common_info) + for e in extract_entries(episode_html, video_id, common_info): + yield e def extract_film(html, video_id): common_info = { @@ -283,11 +282,18 @@ class AnimeOnDemandIE(InfoExtractor): 'title': anime_title, 'description': anime_description, } - 
extract_entries(html, video_id, common_info)
+            for e in extract_entries(html, video_id, common_info):
+                yield e

-        extract_episodes(webpage)
+        def entries():
+            has_episodes = False
+            for e in extract_episodes(webpage):
+                has_episodes = True
+                yield e

-        if not entries:
-            extract_film(webpage, anime_id)
+            if not has_episodes:
+                for e in extract_film(webpage, anime_id):
+                    yield e

-        return self.playlist_result(entries, anime_id, anime_title, anime_description)
+        return self.playlist_result(
+            entries(), anime_id, anime_title, anime_description)
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
index e87994a6a..f6ecb8438 100644
--- a/youtube_dl/extractor/aol.py
+++ b/youtube_dl/extractor/aol.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals

 import re

-from .common import InfoExtractor
+from .yahoo import YahooIE
 from ..compat import (
     compat_parse_qs,
     compat_urllib_parse_urlparse,
@@ -15,9 +15,9 @@ from ..utils import (
 )


-class AolIE(InfoExtractor):
+class AolIE(YahooIE):
     IE_NAME = 'aol.com'
-    _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>[0-9a-f]+)'
+    _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})'

     _TESTS = [{
         # video with 5min ID
@@ -76,10 +76,16 @@
     }, {
         'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/',
         'only_matching': True,
+    }, {
+        # Yahoo video
+        'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
+        if '-' in video_id:
+            return self._extract_yahoo_video(video_id, 'us')
         response = self._download_json(
             'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id,
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 5b7b2dd6d..6bf5b3f13 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -187,13 +187,13 @@ class ARDMediathekIE(ARDMediathekBaseIE):
         if doc.tag == 'rss':
             return GenericIE()._extract_rss(url, video_id, doc)

-        title = self._html_search_regex(
+        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
             [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
              r'<meta name="dcterms\.title" content="(.*?)"/>',
              r'<h4 class="headline">(.*?)</h4>',
              r'<title[^>]*>(.*?)</title>'],
             webpage, 'title')
-        description = self._html_search_meta(
+        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
             'dcterms.abstract', webpage, 'description', default=None)
         if description is None:
             description = self._html_search_meta(
@@ -249,18 +249,18 @@
 class ARDIE(InfoExtractor):
-    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
+    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?:video-?)?(?P<id>[0-9]+))\.html'
     _TESTS = [{
-        # available till 14.02.2019
-        'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
-        'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
+        # available till 7.01.2022
+        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
+        'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
         'info_dict': {
-            'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
-            'id': '102',
+            'display_id': 'maischberger-die-woche',
+            'id': '100',
             'ext': 'mp4',
-            'duration': 4435.0,
-            'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
-            'upload_date': '20180214',
+            'duration': 3687.0,
+            'title': 'maischberger. die woche vom 7. Januar 2021',
+            'upload_date': '20210107',
             'thumbnail': r're:^https?://.*\.jpg$',
         },
     }, {
@@ -315,17 +315,17 @@
 class ARDBetaMediathekIE(ARDMediathekBaseIE):
     _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
     _TESTS = [{
-        'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
-        'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
+        'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
+        'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
         'info_dict': {
             'display_id': 'die-robuste-roswita',
-            'id': '70153354',
+            'id': '78566716',
             'title': 'Die robuste Roswita',
-            'description': r're:^Der Mord.*trüber ist als die Ilm.',
+            'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita',
             'duration': 5316,
-            'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard',
-            'timestamp': 1577047500,
-            'upload_date': '20191222',
+            'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard',
+            'timestamp': 1596658200,
+            'upload_date': '20200805',
             'ext': 'mp4',
         },
     }, {
diff --git a/youtube_dl/extractor/awaan.py b/youtube_dl/extractor/awaan.py
index a2603bbff..3a7700cd4 100644
--- a/youtube_dl/extractor/awaan.py
+++ b/youtube_dl/extractor/awaan.py
@@ -48,6 +48,7 @@ class AWAANBaseIE(InfoExtractor):
             'duration': int_or_none(video_data.get('duration')),
             'timestamp': parse_iso8601(video_data.get('create_time'), ' '),
             'is_live': is_live,
+            'uploader_id': video_data.get('user_id'),
         }


@@ -107,6 +108,7 @@ class AWAANLiveIE(AWAANBaseIE):
             'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
             'upload_date': '20150107',
             'timestamp': 1420588800,
+            'uploader_id': '71',
         },
         'params': {
             # m3u8 download
diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py
index b1e20def5..930266990 100644
--- a/youtube_dl/extractor/azmedien.py
+++ b/youtube_dl/extractor/azmedien.py
@@ -47,7 +47,7 @@ class AZMedienIE(InfoExtractor):
         'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1',
         'only_matching': True
     }]
-    _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d'
+    _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be'
     _PARTNER_ID = '1719221'

     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
index dc60224d0..d1bf8e829 100644
--- a/youtube_dl/extractor/bleacherreport.py
+++ b/youtube_dl/extractor/bleacherreport.py
@@ -90,13 +90,19 @@ class BleacherReportCMSIE(AMPIE):
     _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})'
     _TESTS = [{
         'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms',
-        'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',
+        'md5': '670b2d73f48549da032861130488c681',
         'info_dict': {
             'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
             'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
+            'upload_date': '20150723',
+            'timestamp': 1437679032,
         },
+        'expected_warnings': [
+            'Unable to download f4m manifest'
+        ]
     }]

     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py
index b9715df00..bae2aedce 100644
--- a/youtube_dl/extractor/bravotv.py
+++ b/youtube_dl/extractor/bravotv.py
@@ -12,7 +12,7 @@ from ..utils import (


 class BravoTVIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
     _TESTS = [{
         'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is',
         'md5': 'e34684cfea2a96cd2ee1ef3a60909de9',
@@ -28,10 +28,13 @@ class BravoTVIE(AdobePassIE):
     }, {
         'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
         'only_matching': True,
+    }, {
+        'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
-        display_id = self._match_id(url)
+        site, display_id = re.match(self._VALID_URL, url).groups()
         webpage = self._download_webpage(url, display_id)
         settings = self._parse_json(self._search_regex(
             r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'),
@@ -53,11 +56,14 @@ class BravoTVIE(AdobePassIE):
         tp_path = release_pid = tve['release_pid']
         if tve.get('entitlement') == 'auth':
             adobe_pass = settings.get('tve_adobe_auth', {})
+            if site == 'bravotv':
+                site = 'bravo'
             resource = self._get_mvpd_resource(
-                adobe_pass.get('adobePassResourceId', 'bravo'),
+                adobe_pass.get('adobePassResourceId') or site,
                 tve['title'], release_pid, tve.get('rating'))
             query['auth'] = self._extract_mvpd_auth(
-                url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource)
+                url, release_pid,
+                adobe_pass.get('adobePassRequestorId') or site, resource)
         else:
             shared_playlist = settings['ls_playlist']
             account_pid = shared_playlist['account_pid']
diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py
index 544647f92..4db51e650 100644
--- a/youtube_dl/extractor/ccma.py
+++ b/youtube_dl/extractor/ccma.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import datetime
 import re

 from .common import InfoExtractor
@@ -8,8 +9,8 @@ from ..utils import (
     clean_html,
     int_or_none,
     parse_duration,
-    parse_iso8601,
     parse_resolution,
+    try_get,
     url_or_none,
 )

@@ -24,8 +25,9 @@ class CCMAIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'L\'espot de La Marató de TV3',
             'description': 'md5:f12987f320e2f6e988e9908e4fe97765',
-            'timestamp': 1470918540,
-            'upload_date': '20160811',
+            'timestamp': 1478608140,
+            'upload_date': '20161108',
+            'age_limit': 0,
         }
     }, {
         'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/',
@@ -35,8 +37,24 @@ class CCMAIE(InfoExtractor):
             'ext': 'mp3',
             'title': 'El Consell de Savis analitza el derbi',
             'description': 'md5:e2a3648145f3241cb9c6b4b624033e53',
-            'upload_date': '20171205',
-            'timestamp': 1512507300,
+            'upload_date': '20170512',
+            'timestamp': 1494622500,
+            'vcodec': 'none',
+            'categories': ['Esports'],
+        }
+    }, {
+        'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/',
+        'md5': 'b43c3d3486f430f3032b5b160d80cbc3',
+        'info_dict': {
+            'id': '6031387',
+            'ext': 'mp4',
+            'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)',
+            'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60',
+            'timestamp': 1582577700,
+            'upload_date': '20200224',
+            'subtitles': 'mincount:4',
+            'age_limit': 16,
+            'series': 'Crims',
         }
     }]

@@ -72,17 +90,27 @@ class CCMAIE(InfoExtractor):

         informacio = media['informacio']
         title = informacio['titol']
-        durada = informacio.get('durada', {})
+        durada = informacio.get('durada') or {}
         duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text'))
-        timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc'))
+        tematica = try_get(informacio, lambda x: x['tematica']['text'])
+
+        timestamp = None
+        data_utc = try_get(informacio, lambda x: x['data_emissio']['utc'])
+        try:
+            timestamp = datetime.datetime.strptime(
+                data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp()
+        except TypeError:
+            pass

         subtitles = {}
-        subtitols = media.get('subtitols', {})
-        if subtitols:
-            sub_url = subtitols.get('url')
+        subtitols = media.get('subtitols') or []
+        if isinstance(subtitols, dict):
+            subtitols = [subtitols]
+        for st in subtitols:
+            sub_url = st.get('url')
             if sub_url:
                 subtitles.setdefault(
-                    subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({
+                    st.get('iso') or st.get('text') or 'ca', []).append({
                         'url': sub_url,
                     })

@@ -97,6 +125,16 @@ class CCMAIE(InfoExtractor):
             'height': int_or_none(imatges.get('alcada')),
         }]

+        age_limit = None
+        codi_etic = try_get(informacio, lambda x: x['codi_etic']['id'])
+        if codi_etic:
+            codi_etic_s = codi_etic.split('_')
+            if len(codi_etic_s) == 2:
+                if codi_etic_s[1] == 'TP':
+                    age_limit = 0
+                else:
+                    age_limit = int_or_none(codi_etic_s[1])
+
         return {
             'id': media_id,
             'title': title,
@@ -106,4 +144,9 @@ class CCMAIE(InfoExtractor):
             'thumbnails': thumbnails,
             'subtitles': subtitles,
             'formats': formats,
+            'age_limit': age_limit,
+            'alt_title': informacio.get('titol_complet'),
+            'episode_number': int_or_none(informacio.get('capitol')),
+            'categories': [tematica] if tematica else None,
+            'series': informacio.get('programa'),
         }
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
index d67900e62..6429454fb 100644
--- a/youtube_dl/extractor/cda.py
+++ b/youtube_dl/extractor/cda.py
@@ -96,7 +96,7 @@ class CDAIE(InfoExtractor):
             raise ExtractorError('This video is only available for premium users.', expected=True)

         need_confirm_age = False
-        if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
+        if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")',
                                    webpage, 'birthday validate form', default=None):
             webpage = self._download_age_confirm_page(
                 url, video_id, note='Confirming age')
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index d08b909a6..1bfa912be 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -1,142 +1,51 @@
 from __future__ import unicode_literals

 from .mtv import MTVServicesInfoExtractor
-from .common import InfoExtractor


 class ComedyCentralIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
-        (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes)))
-        /(?P<title>.*)'''
+    _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})'
     _FEED_URL =
'http://comedycentral.com/feeds/mrss/' _TESTS = [{ - 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', - 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', + 'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike', + 'md5': 'b8acb347177c680ff18a292aa2166f80', 'info_dict': { - 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', + 'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025', 'ext': 'mp4', - 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', - 'description': 'After a certain point, breastfeeding becomes c**kblocking.', - 'timestamp': 1376798400, - 'upload_date': '20130818', + 'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike', + 'description': 'md5:5334307c433892b85f4f5e5ac9ef7498', + 'timestamp': 1598670000, + 'upload_date': '20200829', }, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', + 'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314', 'only_matching': True, - }] - - -class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (?:full-episodes|shows(?=/[^/]+/full-episodes)) - /(?P<id>[^?]+)''' - _FEED_URL = 'http://comedycentral.com/feeds/mrss/' - - _TESTS = [{ - 'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028', - 'info_dict': { - 'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."', - 'title': 'November 28, 2016 - Ryan Speedo Green', - }, - 'playlist_count': 4, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') - videos_info = self._get_videos_info(mgid) - return videos_info - - -class ToshIE(MTVServicesInfoExtractor): - IE_DESC = 'Tosh.0' - _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' - _FEED_URL = 'http://tosh.cc.com/feeds/mrss' - - _TESTS = [{ - 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', - 'info_dict': { - 'description': 'Tosh asked fans to share their summer plans.', - 'title': 'Twitter Users Share Summer Plans', - }, - 'playlist': [{ - 'md5': 'f269e88114c1805bb6d7653fecea9e06', - 'info_dict': { - 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', - 'ext': 'mp4', - 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', - 'description': 'Tosh asked fans to share their summer plans.', - 'thumbnail': r're:^https?://.*\.jpg', - # It's really reported to be published on year 2077 - 'upload_date': '20770610', - 'timestamp': 3390510600, - 'subtitles': { - 'en': 'mincount:3', - }, - }, - }] - }, { - 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', + 'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate', 'only_matching': True, }] class ComedyCentralTVIE(MTVServicesInfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})' _TESTS = [{ - 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1', 'info_dict': { - 'id': 'local_playlist-f99b626bdfe13568579a', - 'ext': 'flv', - 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + 'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Josh Investigates', + 'description': 'Steht uns das Ende der Welt bevor?', }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', - 'only_matching': True, - }, { - 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', - 'only_matching': True, }] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + _GEO_COUNTRIES = ['DE'] - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - mrss_url = self._search_regex( - r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'mrss url', group='url') - - return self._get_videos_info_from_url(mrss_url, video_id) - - -class ComedyCentralShortnameIE(InfoExtractor): - _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$' - _TESTS = [{ - 'url': ':tds', - 'only_matching': True, - }, { - 'url': ':thedailyshow', - 'only_matching': True, - }, { - 'url': ':theopposition', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - shortcut_map = { - 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes', + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'web.cc.tv', + 'ep': 'b9032c3a', + 'imageEp': 'web.cc.tv', + 'mgid': uri, } - return self.url_result(shortcut_map[video_id]) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d5faa0eb7..8eb110f4e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2064,7 +2064,7 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', @@ -2078,10 +2078,9 @@ class InfoExtractor(object): mpd_base_url = base_url(urlh.geturl()) return self._parse_mpd_formats( - mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, - formats_dict=formats_dict, mpd_url=mpd_url) + mpd_doc, mpd_id, mpd_base_url, mpd_url) - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): + def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): """ Parse formats from MPD manifest. References: @@ -2359,15 +2358,7 @@ class InfoExtractor(object): else: # Assuming direct URL to unfragmented media. 
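             # The representation is then appended as one progressive format entry below.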
f['url'] = base_url - - # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation - # is not necessarily unique within a Period thus formats with - # the same `format_id` are quite possible. There are numerous examples - # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111, - # https://github.com/ytdl-org/youtube-dl/issues/13919) - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) + formats.append(f) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index df11dc206..aff9b88c0 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -12,7 +12,14 @@ from ..utils import ( ) -class EggheadCourseIE(InfoExtractor): +class EggheadBaseIE(InfoExtractor): + def _call_api(self, path, video_id, resource, fatal=True): + return self._download_json( + 'https://app.egghead.io/api/v1/' + path, + video_id, 'Downloading %s JSON' % resource, fatal=fatal) + + +class EggheadCourseIE(EggheadBaseIE): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)' @@ -28,10 +35,9 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - - lessons = self._download_json( - 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, - playlist_id, 'Downloading course lessons JSON') + series_path = 'series/' + playlist_id + lessons = self._call_api( + series_path + '/lessons', playlist_id, 'course lessons') entries = [] for lesson in lessons: @@ -44,9 +50,8 @@ class EggheadCourseIE(InfoExtractor): entries.append(self.url_result( lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) - course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, - playlist_id, 'Downloading course JSON', fatal=False) or {} + course = self._call_api( + series_path, playlist_id, 'course', False) or {} playlist_id = course.get('id') if playlist_id: @@ -57,7 +62,7 @@ class EggheadCourseIE(InfoExtractor): course.get('description')) -class EggheadLessonIE(InfoExtractor): +class EggheadLessonIE(EggheadBaseIE): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' @@ -74,7 +79,7 @@ class EggheadLessonIE(InfoExtractor): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['javascript', 'free'], + 'tags': 'count:2', }, 'params': { 'skip_download': True, @@ -88,8 +93,8 @@ class EggheadLessonIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + lesson = self._call_api( + 'lessons/' + display_id, display_id, 'lesson') lesson_id = compat_str(lesson['id']) title = lesson['title'] diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f2505768c..a22ca8477 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -42,7 +42,10 @@ from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .amara import AmaraIE from .amcnetworks import AMCNetworksIE -from .americastestkitchen import AmericasTestKitchenIE +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) from .animeondemand import AnimeOnDemandIE from .anvato import AnvatoIE from .aol 
import AolIE @@ -232,11 +235,8 @@ from .cnn import ( ) from .coub import CoubIE from .comedycentral import ( - ComedyCentralFullEpisodesIE, ComedyCentralIE, - ComedyCentralShortnameIE, ComedyCentralTVIE, - ToshIE, ) from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import ( @@ -651,6 +651,11 @@ from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, ) +from .minds import ( + MindsIE, + MindsChannelIE, + MindsGroupIE, +) from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .miomio import MioMioIE @@ -1116,6 +1121,10 @@ from .stitcher import ( from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from .spotify import ( + SpotifyIE, + SpotifyShowIE, +) from .spreaker import ( SpreakerIE, SpreakerPageIE, @@ -1229,6 +1238,10 @@ from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trovo import ( + TrovoIE, + TrovoVodIE, +) from .trunews import TruNewsIE from .trutv import TruTVIE from .tube8 import Tube8IE @@ -1247,6 +1260,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, KatsomoIE, + MTVUutisetArticleIE, ) from .tv2dk import ( TV2DKIE, @@ -1385,7 +1399,6 @@ from .vidme import ( VidmeUserIE, VidmeUserLikesIE, ) -from .vidzi import VidziIE from .vier import VierIE, VierVideosIE, VierVijfKijkOnlineIE from .viewlift import ( ViewLiftIE, @@ -1445,6 +1458,7 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE +from .vtm import VTMIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index 306b45fc9..14f4cb489 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -11,7 +11,7 @@ from ..utils import ( class FranceCultureIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { 'id': 'rendez-vous-au-pays-des-geeks', @@ -20,10 +20,14 @@ class FranceCultureIE(InfoExtractor): 'title': 'Rendez-vous au pays des geeks', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', - 'timestamp': 1393642916, + 'timestamp': 1393700400, 'vcodec': 'none', } - } + }, { + # no thumbnail + 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -36,19 +40,19 @@ class FranceCultureIE(InfoExtractor): </h1>| <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> ).*? 
- (<button[^>]+data-asset-source="[^"]+"[^>]+>) + (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) ''', webpage, 'video data')) - video_url = video_data['data-asset-source'] - title = video_data.get('data-asset-title') or self._og_search_title(webpage) + video_url = video_data.get('data-url') or video_data['data-asset-source'] + title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage) description = self._html_search_regex( r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', webpage, 'description', default=None) thumbnail = self._search_regex( r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"', - webpage, 'thumbnail', fatal=False) + webpage, 'thumbnail', default=None) uploader = self._html_search_regex( r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None) @@ -64,6 +68,6 @@ class FranceCultureIE(InfoExtractor): 'ext': ext, 'vcodec': 'none' if ext == 'mp3' else None, 'uploader': uploader, - 'timestamp': int_or_none(video_data.get('data-asset-created-date')), + 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')), 'duration': int_or_none(video_data.get('data-duration')), } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 780971a92..09e680c96 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -128,6 +128,7 @@ from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE from .arcpublishing import ArcPublishingIE +from .medialaan import MedialaanIE class GenericIE(InfoExtractor): @@ -2223,6 +2224,20 @@ class GenericIE(InfoExtractor): 'duration': 1581, }, }, + { + # MyChannels SDK embed + # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen + 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/', + 'md5': '90c0699c37006ef18e198c032d81739c', + 'info_dict': { + 'id': '194165', + 'ext': 'mp4', + 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe', + 'timestamp': 1611740340, + 'upload_date': '20210127', + 'duration': 159, + }, + }, ] def report_following_redirect(self, new_url): @@ -2462,6 +2477,9 @@ class GenericIE(InfoExtractor): webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) + if '<title>DPG Media Privacy Gate' in webpage: + webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? 
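+        # (Re the DPG Media check above: the plain re-request presumably succeeds because the first response set the site's consent cookies; the privacy-gate page itself carries no video data.)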
@@ -2593,6 +2611,11 @@
         if arc_urls:
             return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key())

+        mychannels_urls = MedialaanIE._extract_urls(webpage)
+        if mychannels_urls:
+            return self.playlist_from_matches(
+                mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key())
+
         # Look for embedded rtl.nl player
         matches = re.findall(
             r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py
index de8c80e36..3f2de00f1 100644
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@@ -7,6 +7,7 @@ from ..compat import compat_parse_qs
 from ..utils import (
     determine_ext,
     ExtractorError,
+    get_element_by_class,
     int_or_none,
     lowercase_escape,
     try_get,
@@ -237,7 +238,7 @@ class GoogleDriveIE(InfoExtractor):
             if confirmation_webpage:
                 confirm = self._search_regex(
                     r'confirm=([^&"\']+)', confirmation_webpage,
-                    'confirmation code', fatal=False)
+                    'confirmation code', default=None)
                 if confirm:
                     confirmed_source_url = update_url_query(source_url, {
                         'confirm': confirm,
@@ -245,6 +246,11 @@ class GoogleDriveIE(InfoExtractor):
                     urlh = request_source_file(confirmed_source_url, 'confirmed source')
                     if urlh and urlh.headers.get('Content-Disposition'):
                         add_source_format(urlh)
+                else:
+                    self.report_warning(
+                        get_element_by_class('uc-error-subcaption', confirmation_webpage)
+                        or get_element_by_class('uc-error-caption', confirmation_webpage)
+                        or 'unable to extract confirmation code')

         if not formats and reason:
             raise ExtractorError(reason, expected=True)
diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py
index 41cc245eb..413215a99 100644
--- a/youtube_dl/extractor/lbry.py
+++ b/youtube_dl/extractor/lbry.py
@@ -5,7 +5,10 @@ import functools
 import json

 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -131,6 +134,9 @@ class LBRYIE(LBRYBaseIE):
     }, {
         'url': 'https://lbry.tv/$/download/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
         'only_matching': True,
+    }, {
+        'url': 'https://lbry.tv/@lacajadepandora:a/TRUMP-EST%C3%81-BIEN-PUESTO-con-Pilar-Baselga,-Carlos-Senra,-Luis-Palacios-(720p_30fps_H264-192kbit_AAC):1',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
@@ -139,6 +145,7 @@ class LBRYIE(LBRYBaseIE):
             display_id = display_id.split('/', 2)[-1].replace('/', ':')
         else:
             display_id = display_id.replace(':', '#')
+        display_id = compat_urllib_parse_unquote(display_id)
         uri = 'lbry://' + display_id
         result = self._resolve_url(uri, display_id, 'stream')
         result_value = result['value']
diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py
index 50d5db802..788acf7fb 100644
--- a/youtube_dl/extractor/medialaan.py
+++ b/youtube_dl/extractor/medialaan.py
@@ -2,268 +2,113 @@ from __future__ import unicode_literals

 import re

-from .gigya import GigyaBaseIE
-
-from ..compat import compat_str
+from .common import InfoExtractor
 from ..utils import (
+    extract_attributes,
     int_or_none,
-    parse_duration,
-    try_get,
-    unified_timestamp,
+    mimetype2ext,
+    parse_iso8601,
 )


-class MedialaanIE(GigyaBaseIE):
+class MedialaanIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
-                        (?:www\.|nieuws\.)?
                         (?:
-                            (?P<site_id>vtm|q2|vtmkzoom)\.be/
-                            (?:
-                                video(?:/[^/]+/id/|/?\?.*?\baid=)|
-                                (?:[^/]+/)*
-                            )
+                            (?:embed\.)?mychannels.video/embed/|
+                            embed\.mychannels\.video/(?:s(?:dk|cript)/)?production/|
+                            (?:www\.)?(?:
+                                (?:
+                                    7sur7|
+                                    demorgen|
+                                    hln|
+                                    joe|
+                                    qmusic
+                                )\.be|
+                                (?:
+                                    [abe]d|
+                                    bndestem|
+                                    destentor|
+                                    gelderlander|
+                                    pzc|
+                                    tubantia|
+                                    volkskrant
+                                )\.nl
+                            )/video/(?:[^/]+/)*[^/?&#]+~p
                         )
-                        (?P<id>[^/?#&]+)
+                        (?P<id>\d+)
                     '''
-    _NETRC_MACHINE = 'medialaan'
-    _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-'
-    _SITE_TO_APP_ID = {
-        'vtm': 'vtm_watch',
-        'q2': 'q2',
-        'vtmkzoom': 'vtmkzoom',
-    }
     _TESTS = [{
-        # vod
-        'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch',
+        'url': 'https://www.bndestem.nl/video/de-terugkeer-van-ally-de-aap-en-wie-vertrekt-er-nog-bij-nac~p193993',
         'info_dict': {
-            'id': 'vtm_20170219_VM0678361_vtmwatch',
+            'id': '193993',
             'ext': 'mp4',
-            'title': 'Allemaal Chris afl. 6',
-            'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2',
-            'timestamp': 1487533280,
-            'upload_date': '20170219',
-            'duration': 2562,
-            'series': 'Allemaal Chris',
-            'season': 'Allemaal Chris',
-            'season_number': 1,
-            'season_id': '256936078124527',
-            'episode': 'Allemaal Chris afl. 6',
-            'episode_number': 6,
-            'episode_id': '256936078591527',
+            'title': 'De terugkeer van Ally de Aap en wie vertrekt er nog bij NAC?',
+            'timestamp': 1611663540,
+            'upload_date': '20210126',
+            'duration': 238,
         },
         'params': {
             'skip_download': True,
         },
-        'skip': 'Requires account credentials',
     }, {
-        # clip
-        'url': 'http://vtm.be/video?aid=168332',
-        'info_dict': {
-            'id': '168332',
-            'ext': 'mp4',
-            'title': '"Veronique liegt!"',
-            'description': 'md5:1385e2b743923afe54ba4adc38476155',
-            'timestamp': 1489002029,
-            'upload_date': '20170308',
-            'duration': 96,
-        },
-    }, {
-        # vod
-        'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000',
+        'url': 'https://www.gelderlander.nl/video/kanalen/degelderlander~c320/series/snel-nieuws~s984/noodbevel-in-doetinchem-politie-stuurt-mensen-centrum-uit~p194093',
         'only_matching': True,
     }, {
-        # vod
-        'url': 'http://vtm.be/video?aid=163157',
+        'url': 'https://embed.mychannels.video/sdk/production/193993?options=TFTFF_default',
         'only_matching': True,
     }, {
-        # vod
-        'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2',
+        'url': 'https://embed.mychannels.video/script/production/193993',
         'only_matching': True,
     }, {
-        # clip
-        'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio',
+        'url': 'https://embed.mychannels.video/production/193993',
         'only_matching': True,
     }, {
-        # http/s redirect
-        'url': 'https://vtmkzoom.be/video?aid=45724',
-        'info_dict': {
-            'id': '257136373657000',
-            'ext': 'mp4',
-            'title': 'K3 Dansstudio Ushuaia afl.6',
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'skip': 'Requires account credentials',
+        'url': 'https://mychannels.video/embed/193993',
+        'only_matching': True,
     }, {
-        # nieuws.vtm.be
-        'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma',
+        'url': 'https://embed.mychannels.video/embed/193993',
         'only_matching': True,
     }]

-    def _real_initialize(self):
-        self._logged_in = False
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            self.raise_login_required()
-
-        auth_data = {
-            'APIKey': self._APIKEY,
-            'sdk': 'js_6.1',
-            'format': 'json',
-            'loginID': username,
-            'password': password,
-        }
-
-        auth_info = self._gigya_login(auth_data)
-
-        self._uid = auth_info['UID']
-        self._uid_signature = auth_info['UIDSignature']
-        self._signature_timestamp = auth_info['signatureTimestamp']
-
-        self._logged_in = True
+    @staticmethod
+    def _extract_urls(webpage):
+        entries = []
+        for element in re.findall(r'(<div[^>]+data-mychannels-type="video"[^>]*>)', webpage):
+            mychannels_id = extract_attributes(element).get('data-mychannels-id')
+            if mychannels_id:
+                entries.append('https://mychannels.video/embed/' + mychannels_id)
+        return entries

     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id, site_id = mobj.group('id', 'site_id')
+        production_id = self._match_id(url)
+        production = self._download_json(
+            'https://embed.mychannels.video/sdk/production/' + production_id,
+            production_id, query={'options': 'UUUU_default'})['productions'][0]
+        title = production['title']

-        webpage = self._download_webpage(url, video_id)
-
-        config = self._parse_json(
-            self._search_regex(
-                r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);',
-                webpage, 'config', default='{}'), video_id,
-            transform_source=lambda s: s.replace(
-                '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'"))
-
-        vod_id = config.get('vodId') or self._search_regex(
-            (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"',
-             r'"vodId"\s*:\s*"(.+?)"',
-             r'<[^>]+id=["\']vod-(\d+)'),
-            webpage, 'video_id', default=None)
-
-        # clip, no authentication required
-        if not vod_id:
-            player = self._parse_json(
-                self._search_regex(
-                    r'vmmaplayer\(({.+?})\);', webpage, 'vmma player',
-                    default=''),
-                video_id, transform_source=lambda s: '[%s]' % s, fatal=False)
-            if player:
-                video = player[-1]
-                if video['videoUrl'] in ('http', 'https'):
-                    return self.url_result(video['url'], MedialaanIE.ie_key())
-                info = {
-                    'id': video_id,
-                    'url': video['videoUrl'],
-                    'title': video['title'],
-                    'thumbnail': video.get('imageUrl'),
-                    'timestamp': int_or_none(video.get('createdDate')),
-                    'duration': int_or_none(video.get('duration')),
-                }
+        formats = []
+        for source in (production.get('sources') or []):
+            src = source.get('src')
+            if not src:
+                continue
+            ext = mimetype2ext(source.get('type'))
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, production_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
             else:
-                info = self._parse_html5_media_entries(
-                    url, webpage, video_id, m3u8_id='hls')[0]
-                info.update({
-                    'id': video_id,
-                    'title': self._html_search_meta('description', webpage),
-                    'duration': parse_duration(self._html_search_meta('duration', webpage)),
+                formats.append({
+                    'ext': ext,
+                    'url': src,
                 })
-        # vod, authentication required
-        else:
-            if not self._logged_in:
-                self._login()
+        self._sort_formats(formats)

-            settings = self._parse_json(
-                self._search_regex(
-                    r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
-                    webpage, 'drupal settings', default='{}'),
-                video_id)
-
-            def get(container, item):
-                return try_get(
-                    settings, lambda x: x[container][item],
-                    compat_str) or self._search_regex(
-                    r'"%s"\s*:\s*"([^"]+)' % item, webpage, item,
-                    default=None)
-
-            app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch')
-            sso = get('vod', 'gigyaDatabase') or 'vtm-sso'
-
-            data = self._download_json(
-                'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id,
-                video_id, query={
-                    'app_id': app_id,
-                    'user_network': sso,
-                    'UID': self._uid,
-                    'UIDSignature': self._uid_signature,
-                    'signatureTimestamp': self._signature_timestamp,
-                })
-
-            formats = self._extract_m3u8_formats(
-                data['response']['uri'], video_id, entry_protocol='m3u8_native',
-                ext='mp4', m3u8_id='hls')
-
-            self._sort_formats(formats)
-
-            info = {
-                'id': vod_id,
-                'formats': formats,
-            }
-
-            api_key = get('vod', 'apiKey')
-            channel = get('medialaanGigya', 'channel')
-
-            if api_key:
-                videos = self._download_json(
-                    'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False,
-                    query={
-                        'channels': channel,
-                        'ids': vod_id,
-                        'limit': 1,
-                        'apikey': api_key,
-                    })
-                if videos:
-                    video = try_get(
-                        videos, lambda x: x['response']['videos'][0], dict)
-                    if video:
-                        def get(container, item, expected_type=None):
-                            return try_get(
-                                video, lambda x: x[container][item], expected_type)
-
-                        def get_string(container, item):
-                            return get(container, item, compat_str)
-
-                        info.update({
-                            'series': get_string('program', 'title'),
-                            'season': get_string('season', 'title'),
-                            'season_number': int_or_none(get('season', 'number')),
-                            'season_id': get_string('season', 'id'),
-                            'episode': get_string('episode', 'title'),
-                            'episode_number': int_or_none(get('episode', 'number')),
-                            'episode_id': get_string('episode', 'id'),
-                            'duration': int_or_none(
-                                video.get('duration')) or int_or_none(
-                                video.get('durationMillis'), scale=1000),
-                            'title': get_string('episode', 'title'),
-                            'description': get_string('episode', 'text'),
-                            'timestamp': unified_timestamp(get_string(
-                                'publication', 'begin')),
-                        })
-
-            if not info.get('title'):
-                info['title'] = try_get(
-                    config, lambda x: x['videoConfig']['title'],
-                    compat_str) or self._html_search_regex(
-                    r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title',
-                    default=None) or self._og_search_title(webpage)
-
-            if not info.get('description'):
-                info['description'] = self._html_search_regex(
-                    r'<div[^>]+class="field-item\s+even">\s*<div>(.+?)</div>
', - webpage, 'description', default=None) - - return info + return { + 'id': production_id, + 'title': title, + 'formats': formats, + 'thumbnail': production.get('posterUrl'), + 'timestamp': parse_iso8601(production.get('publicationDate'), ' '), + 'duration': int_or_none(production.get('duration')) or None, + } diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py new file mode 100644 index 000000000..8e9f0f825 --- /dev/null +++ b/youtube_dl/extractor/minds.py @@ -0,0 +1,196 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + strip_or_none, +) + + +class MindsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?minds\.com/' + + def _call_api(self, path, video_id, resource, query=None): + api_url = 'https://www.minds.com/api/' + path + token = self._get_cookies(api_url).get('XSRF-TOKEN') + return self._download_json( + api_url, video_id, 'Downloading %s JSON metadata' % resource, headers={ + 'Referer': 'https://www.minds.com/', + 'X-XSRF-TOKEN': token.value if token else '', + }, query=query) + + +class MindsIE(MindsBaseIE): + IE_NAME = 'minds' + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?:media|newsfeed|archive/view)/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.minds.com/media/100000000000086822', + 'md5': '215a658184a419764852239d4970b045', + 'info_dict': { + 'id': '100000000000086822', + 'ext': 'mp4', + 'title': 'Minds intro sequence', + 'thumbnail': r're:https?://.+\.png', + 'uploader_id': 'ottman', + 'upload_date': '20130524', + 'timestamp': 1369404826, + 'uploader': 'Bill Ottman', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': ['animation'], + 'comment_count': int, + 'license': 'attribution-cc', + }, + }, { + # entity.type == 'activity' and empty title + 'url': 'https://www.minds.com/newsfeed/798025111988506624', + 'md5': 'b2733a74af78d7fd3f541c4cbbaa5950', + 'info_dict': { + 'id': '798022190320226304', + 'ext': 'mp4', + 'title': '798022190320226304', + 'uploader': 'ColinFlaherty', + 'upload_date': '20180111', + 'timestamp': 1515639316, + 'uploader_id': 'ColinFlaherty', + }, + }, { + 'url': 'https://www.minds.com/archive/view/715172106794442752', + 'only_matching': True, + }, { + # youtube perma_url + 'url': 'https://www.minds.com/newsfeed/1197131838022602752', + 'only_matching': True, + }] + + def _real_extract(self, url): + entity_id = self._match_id(url) + entity = self._call_api( + 'v1/entities/entity/' + entity_id, entity_id, 'entity')['entity'] + if entity.get('type') == 'activity': + if entity.get('custom_type') == 'video': + video_id = entity['entity_guid'] + else: + return self.url_result(entity['perma_url']) + else: + assert(entity['subtype'] == 'video') + video_id = entity_id + # 1080p and webm formats available only on the sources array + video = self._call_api( + 'v2/media/video/' + video_id, video_id, 'video') + + formats = [] + for source in (video.get('sources') or []): + src = source.get('src') + if not src: + continue + formats.append({ + 'format_id': source.get('label'), + 'height': int_or_none(source.get('size')), + 'url': src, + }) + self._sort_formats(formats) + + entity = video.get('entity') or entity + owner = entity.get('ownerObj') or {} + uploader_id = owner.get('username') + + tags = entity.get('tags') + if tags and isinstance(tags, compat_str): + tags = [tags] + + thumbnail = None + poster = video.get('poster') or entity.get('thumbnail_src') + 
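+        # _request_webpage follows redirects, so geturl() stores the final poster location as the thumbnail.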
if poster: + urlh = self._request_webpage(poster, video_id, fatal=False) + if urlh: + thumbnail = urlh.geturl() + + return { + 'id': video_id, + 'title': entity.get('title') or video_id, + 'formats': formats, + 'description': clean_html(entity.get('description')) or None, + 'license': str_or_none(entity.get('license')), + 'timestamp': int_or_none(entity.get('time_created')), + 'uploader': strip_or_none(owner.get('name')), + 'uploader_id': uploader_id, + 'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None, + 'view_count': int_or_none(entity.get('play:count')), + 'like_count': int_or_none(entity.get('thumbs:up:count')), + 'dislike_count': int_or_none(entity.get('thumbs:down:count')), + 'tags': tags, + 'comment_count': int_or_none(entity.get('comments:count')), + 'thumbnail': thumbnail, + } + + +class MindsFeedBaseIE(MindsBaseIE): + _PAGE_SIZE = 150 + + def _entries(self, feed_id): + query = {'limit': self._PAGE_SIZE, 'sync': 1} + i = 1 + while True: + data = self._call_api( + 'v2/feeds/container/%s/videos' % feed_id, + feed_id, 'page %s' % i, query) + entities = data.get('entities') or [] + for entity in entities: + guid = entity.get('guid') + if not guid: + continue + yield self.url_result( + 'https://www.minds.com/newsfeed/' + guid, + MindsIE.ie_key(), guid) + query['from_timestamp'] = data['load-next'] + if not (query['from_timestamp'] and len(entities) == self._PAGE_SIZE): + break + i += 1 + + def _real_extract(self, url): + feed_id = self._match_id(url) + feed = self._call_api( + 'v1/%s/%s' % (self._FEED_PATH, feed_id), + feed_id, self._FEED_TYPE)[self._FEED_TYPE] + + return self.playlist_result( + self._entries(feed['guid']), feed_id, + strip_or_none(feed.get('name')), + feed.get('briefdescription')) + + +class MindsChannelIE(MindsFeedBaseIE): + _FEED_TYPE = 'channel' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?!(?:newsfeed|media|api|archive|groups)/)(?P[^/?&#]+)' + _FEED_PATH = 'channel' + _TEST = { + 'url': 'https://www.minds.com/ottman', + 'info_dict': { + 'id': 'ottman', + 'title': 'Bill Ottman', + 'description': 'Co-creator & CEO @minds', + }, + 'playlist_mincount': 54, + } + + +class MindsGroupIE(MindsFeedBaseIE): + _FEED_TYPE = 'group' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'groups/profile/(?P[0-9]+)' + _FEED_PATH = 'groups/group' + _TEST = { + 'url': 'https://www.minds.com/groups/profile/785582576369672204/feed/videos', + 'info_dict': { + 'id': '785582576369672204', + 'title': 'Cooking Videos', + }, + 'playlist_mincount': 1, + } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 37f16a791..69319857d 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -251,11 +251,9 @@ class MixcloudPlaylistBaseIE(MixcloudBaseIE): cloudcast_url = cloudcast.get('url') if not cloudcast_url: continue - video_id = cloudcast.get('slug') - if video_id: - owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) - if owner_username: - video_id = '%s_%s' % (owner_username, video_id) + slug = try_get(cloudcast, lambda x: x['slug'], compat_str) + owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) + video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None entries.append(self.url_result( cloudcast_url, MixcloudIE.ie_key(), video_id)) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index df1034fc5..f5e30d22d 100644 --- 
a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -253,6 +253,10 @@ class MTVServicesInfoExtractor(InfoExtractor): return try_get(feed, lambda x: x['result']['data']['id'], compat_str) + @staticmethod + def _extract_child_with_type(parent, t): + return next(c for c in parent['children'] if c.get('type') == t) + def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf @@ -278,6 +282,13 @@ class MTVServicesInfoExtractor(InfoExtractor): if not mgid: mgid = self._extract_triforce_mgid(webpage) + if not mgid: + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self._extract_child_with_type(data, 'MainContainer') + video_player = self._extract_child_with_type(main_container, 'VideoPlayer') + mgid = video_player['props']['media']['video']['config']['uri'] + return mgid def _real_extract(self, url): @@ -349,18 +360,6 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] - @staticmethod - def extract_child_with_type(parent, t): - children = parent['children'] - return next(c for c in children if c.get('type') == t) - - def _extract_mgid(self, webpage): - data = self._parse_json(self._search_regex( - r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) - main_container = self.extract_child_with_type(data, 'MainContainer') - video_player = self.extract_child_with_type(main_container, 'VideoPlayer') - return video_player['props']['media']['video']['config']['uri'] - class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index dc6a27d36..440f865bc 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -1,104 +1,125 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + try_get, + url_or_none, +) class NineGagIE(InfoExtractor): IE_NAME = '9gag' - _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P[a-zA-Z0-9]+)(?:/(?P[^?#/]+))?' + _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[^/?&#]+)' - _TESTS = [{ - 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome', + _TEST = { + 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { - 'id': 'kXzwOKyGlSA', + 'id': 'ae5Ag7B', 'ext': 'mp4', - 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)', - 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome', - 'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA', - 'uploader': 'CompilationChannel', - 'upload_date': '20131110', - 'view_count': int, - }, - 'add_ie': ['Youtube'], - }, { - 'url': 'http://9gag.com/tv/p/aKolP3', - 'info_dict': { - 'id': 'aKolP3', - 'ext': 'mp4', - 'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video', - 'description': "I just saw more in 1 minute than I've seen in 1 year. 
This guy's video is epic!!", - 'uploader_id': 'rickmereki', - 'uploader': 'Rick Mereki', - 'upload_date': '20110803', - 'view_count': int, - }, - 'add_ie': ['Vimeo'], - }, { - 'url': 'http://9gag.com/tv/p/KklwM', - 'only_matching': True, - }, { - 'url': 'http://9gag.tv/p/Kk2X5', - 'only_matching': True, - }, { - 'url': 'http://9gag.com/tv/embed/a5Dmvl', - 'only_matching': True, - }] - - _EXTERNAL_VIDEO_PROVIDER = { - '1': { - 'url': '%s', - 'ie_key': 'Youtube', - }, - '2': { - 'url': 'http://player.vimeo.com/video/%s', - 'ie_key': 'Vimeo', - }, - '3': { - 'url': 'http://instagram.com/p/%s', - 'ie_key': 'Instagram', - }, - '4': { - 'url': 'http://vine.co/v/%s', - 'ie_key': 'Vine', - }, + 'title': 'Capybara Agility Training', + 'upload_date': '20191108', + 'timestamp': 1573237208, + 'categories': ['Awesome'], + 'tags': ['Weimaraner', 'American Pit Bull Terrier'], + 'duration': 44, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + post_id = self._match_id(url) + post = self._download_json( + 'https://9gag.com/v1/post', post_id, query={ + 'id': post_id + })['data']['post'] - webpage = self._download_webpage(url, display_id) + if post.get('type') != 'Animated': + raise ExtractorError( + 'The given url does not contain a video', + expected=True) - post_view = self._parse_json( - self._search_regex( - r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost', - webpage, 'post view'), - display_id) + title = post['title'] - ie_key = None - source_url = post_view.get('sourceUrl') - if not source_url: - external_video_id = post_view['videoExternalId'] - external_video_provider = post_view['videoExternalProvider'] - source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id - ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key'] - title = post_view['title'] - description = post_view.get('description') - view_count = str_to_int(post_view.get('externalView')) - thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w') + duration = None + formats = [] + thumbnails = [] + for key, image in (post.get('images') or {}).items(): + image_url = url_or_none(image.get('url')) + if not image_url: + continue + ext = determine_ext(image_url) + image_id = key.strip('image') + common = { + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } + if ext in ('jpg', 'png'): + webp_url = image.get('webpUrl') + if webp_url: + t = common.copy() + t.update({ + 'id': image_id + '-webp', + 'url': webp_url, + }) + thumbnails.append(t) + common.update({ + 'id': image_id, + 'ext': ext, + }) + thumbnails.append(common) + elif ext in ('webm', 'mp4'): + if not duration: + duration = int_or_none(image.get('duration')) + common['acodec'] = 'none' if image.get('hasAudio') == 0 else None + for vcodec in ('vp8', 'vp9', 'h265'): + c_url = image.get(vcodec + 'Url') + if not c_url: + continue + c_f = common.copy() + c_f.update({ + 'format_id': image_id + '-' + vcodec, + 'url': c_url, + 'vcodec': vcodec, + }) + formats.append(c_f) + common.update({ + 'ext': ext, + 'format_id': image_id, + }) + formats.append(common) + self._sort_formats(formats) + + section = try_get(post, lambda x: x['postSection']['name']) + + tags = None + post_tags = post.get('tags') + if 
post_tags: + tags = [] + for tag in post_tags: + tag_key = tag.get('key') + if not tag_key: + continue + tags.append(tag_key) + + get_count = lambda x: int_or_none(post.get(x + 'Count')) return { - '_type': 'url_transparent', - 'url': source_url, - 'ie_key': ie_key, - 'id': video_id, - 'display_id': display_id, + 'id': post_id, 'title': title, - 'description': description, - 'view_count': view_count, - 'thumbnail': thumbnail, + 'timestamp': int_or_none(post.get('creationTs')), + 'duration': duration, + 'formats': formats, + 'thumbnails': thumbnails, + 'like_count': get_count('upVote'), + 'dislike_count': get_count('downVote'), + 'comment_count': get_count('comments'), + 'age_limit': 18 if post.get('nsfw') == 1 else None, + 'categories': [section] if section else None, + 'tags': tags, } diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index 025c5d249..3639d142f 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -6,30 +6,40 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - extract_attributes, get_element_by_class, urlencode_postdata, ) class NJPWWorldIE(InfoExtractor): - _VALID_URL = r'https?://njpwworld\.com/p/(?P[a-z0-9_]+)' + _VALID_URL = r'https?://(front\.)?njpwworld\.com/p/(?P[a-z0-9_]+)' IE_DESC = '新日本プロレスワールド' _NETRC_MACHINE = 'njpwworld' - _TEST = { + _TESTS = [{ 'url': 'http://njpwworld.com/p/s_series_00155_1_9/', 'info_dict': { 'id': 's_series_00155_1_9', 'ext': 'mp4', - 'title': '第9試合 ランディ・サベージ vs リック・スタイナー', + 'title': '闘強導夢2000 2000年1月4日 東京ドーム 第9試合 ランディ・サベージ VS リック・スタイナー', 'tags': list, }, 'params': { 'skip_download': True, # AES-encrypted m3u8 }, 'skip': 'Requires login', - } + }, { + 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', + 'info_dict': { + 'id': 's_series_00563_16_bs', + 'ext': 'mp4', + 'title': 'WORLD TAG LEAGUE 2020 & BEST OF THE SUPER Jr.27 2020年12月6日 福岡・福岡国際センター バックステージコメント(字幕あり)', + 'tags': ["福岡・福岡国際センター", "バックステージコメント", "2020", "20年代"], + }, + 'params': { + 'skip_download': True, + }, + }] _LOGIN_URL = 'https://front.njpwworld.com/auth/login' @@ -64,35 +74,27 @@ class NJPWWorldIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = [] - for mobj in re.finditer(r']+\bhref=(["\'])/player.+?[^>]*>', webpage): - player = extract_attributes(mobj.group(0)) - player_path = player.get('href') - if not player_path: - continue - kind = self._search_regex( - r'(low|high)$', player.get('class') or '', 'kind', - default='low') + for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): + player_path = '/intent?id=%s&type=url' % vid player_url = compat_urlparse.urljoin(url, player_path) - player_page = self._download_webpage( - player_url, video_id, note='Downloading player page') - entries = self._parse_html5_media_entries( - player_url, player_page, video_id, m3u8_id='hls-%s' % kind, - m3u8_entry_protocol='m3u8_native') - kind_formats = entries[0]['formats'] - for f in kind_formats: - f['quality'] = 2 if kind == 'high' else 1 - formats.extend(kind_formats) + formats.append({ + 'url': player_url, + 'format_id': kind, + 'ext': 'mp4', + 'protocol': 'm3u8', + 'quality': 2 if kind == 'high' else 1, + }) self._sort_formats(formats) - post_content = get_element_by_class('post-content', webpage) + tag_block = get_element_by_class('tag-block', webpage) tags = re.findall( - r']+class="tag-[^"]+">]*>([^<]+)', post_content - ) if post_content else None + 
r']+class="tag-[^"]+"[^>]*>([^<]+)', tag_block + ) if tag_block else None return { 'id': video_id, - 'title': self._og_search_title(webpage), + 'title': get_element_by_class('article-title', webpage) or self._og_search_title(webpage), 'formats': formats, 'tags': tags, } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 2fcbd186f..b7631e4e1 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -22,11 +22,15 @@ from ..utils import ( orderedSet, remove_quotes, str_to_int, + update_url_query, + urlencode_postdata, url_or_none, ) class PornHubBaseIE(InfoExtractor): + _NETRC_MACHINE = 'pornhub' + def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) @@ -52,6 +56,66 @@ class PornHubBaseIE(InfoExtractor): return webpage, urlh + def _real_initialize(self): + self._logged_in = False + + def _login(self, host): + if self._logged_in: + return + + site = host.split('.')[0] + + # Both sites pornhub and pornhubpremium have separate accounts + # so there should be an option to provide credentials for both. + # At the same time some videos are available under the same video id + # on both sites so that we have to identify them as the same video. + # For that purpose we have to keep both in the same extractor + # but under different netrc machines. + username, password = self._get_login_info(netrc_machine=site) + if username is None: + return + + login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_page = self._download_webpage( + login_url, None, 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']signOut', + r'>Sign\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + response = self._download_json( + 'https://www.%s/front/authenticate' % host, None, + 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': login_url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + if response.get('success') == '1': + self._logged_in = True + return + + message = response.get('message') + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % message, expected=True) + + raise ExtractorError('Unable to log in') + class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' @@ -163,12 +227,20 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, + }, { + # Some videos are available with the same id on both premium + # and non-premium sites (e.g. 
this and the following test) + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)', + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -180,12 +252,7 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') - if 'premium' in host: - if not self._downloader.params.get('cookiefile'): - raise ExtractorError( - 'PornHub Premium requires authentication.' - ' You may want to use --cookies.', - expected=True) + self._login(host) self._set_cookie(host, 'age_verified', '1') @@ -405,6 +472,10 @@ class PornHubIE(PornHubBaseIE): class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_page(self, url): + return int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see @@ -422,26 +493,6 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): container)) ] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - - entries = self._extract_entries(webpage, host) - - playlist = self._parse_json( - self._search_regex( - r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, - 'playlist', default='{}'), - playlist_id, fatal=False) - title = playlist.get('title') or self._search_regex( - r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) - - return self.playlist_result( - entries, playlist_id, title, playlist.get('description')) - class PornHubUserIE(PornHubPlaylistBaseIE): _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' @@ -463,14 +514,27 @@ class PornHubUserIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', 'only_matching': True, + }, { + # Unavailable via /videos page, but available with direct pagination + # on pornstar page (see [1]), requires premium + # 1. 
https://github.com/ytdl-org/youtube-dl/issues/27853 + 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', + 'only_matching': True, + }, { + # Same as before, multi page + 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('id') + videos_url = '%s/videos' % mobj.group('url') + page = self._extract_page(url) + if page: + videos_url = update_url_query(videos_url, {'page': page}) return self.url_result( - '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), - video_id=user_id) + videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): @@ -483,32 +547,55 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): ]+\bid=["\']moreDataBtn ''', webpage) is not None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - item_id = mobj.group('id') + def _entries(self, url, host, item_id): + page = self._extract_page(url) - page = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) + VIDEOS = '/videos' - entries = [] - for page_num in (page, ) if page is not None else itertools.count(1): + def download_page(base_url, num, fallback=False): + note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') + return self._download_webpage( + base_url, item_id, note, query={'page': num}) + + def is_404(e): + return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 + + base_url = url + has_page = page is not None + first_page = page if has_page else 1 + for page_num in (first_page, ) if has_page else itertools.count(first_page): try: - webpage = self._download_webpage( - url, item_id, 'Downloading page %d' % page_num, - query={'page': page_num}) + try: + webpage = download_page(base_url, page_num) + except ExtractorError as e: + # Some sources may not be available via /videos page, + # trying to fallback to main page pagination (see [1]) + # 1. 
https://github.com/ytdl-org/youtube-dl/issues/27853 + if is_404(e) and page_num == first_page and VIDEOS in base_url: + base_url = base_url.replace(VIDEOS, '') + webpage = download_page(base_url, page_num, fallback=True) + else: + raise except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if is_404(e) and page_num != first_page: break raise page_entries = self._extract_entries(webpage, host) if not page_entries: break - entries.extend(page_entries) + for e in page_entries: + yield e if not self._has_more(webpage): break - return self.playlist_result(orderedSet(entries), item_id) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + + return self.playlist_result(self._entries(url, host, item_id), item_id) class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 4c5e3f7c2..5805f3d44 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -20,9 +20,6 @@ class BellatorIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): - return self._extract_triforce_mgid(webpage) - class ParamountNetworkIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' @@ -46,16 +43,6 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): def _get_feed_query(self, uri): return { 'arcEp': 'paramountnetwork.com', + 'imageEp': 'paramountnetwork.com', 'mgid': uri, } - - def _extract_mgid(self, webpage): - root_data = self._parse_json(self._search_regex( - r'window\.__DATA__\s*=\s*({.+})', - webpage, 'data'), None) - - def find_sub_data(data, data_type): - return next(c for c in data['children'] if c.get('type') == data_type) - - c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer') - return c['props']['media']['video']['config']['uri'] diff --git a/youtube_dl/extractor/spotify.py b/youtube_dl/extractor/spotify.py new file mode 100644 index 000000000..826f98cff --- /dev/null +++ b/youtube_dl/extractor/spotify.py @@ -0,0 +1,156 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + float_or_none, + int_or_none, + strip_or_none, + try_get, + unified_strdate, +) + + +class SpotifyBaseIE(InfoExtractor): + _ACCESS_TOKEN = None + _OPERATION_HASHES = { + 'Episode': '8276d4423d709ae9b68ec1b74cc047ba0f7479059a37820be730f125189ac2bf', + 'MinimalShow': '13ee079672fad3f858ea45a55eb109553b4fb0969ed793185b2e34cbb6ee7cc0', + 'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d', + } + _VALID_URL_TEMPL = r'https?://open\.spotify\.com/%s/(?P[^/?&#]+)' + + def _real_initialize(self): + self._ACCESS_TOKEN = self._download_json( + 'https://open.spotify.com/get_access_token', None)['accessToken'] + + def _call_api(self, operation, video_id, variables): + return self._download_json( + 'https://api-partner.spotify.com/pathfinder/v1/query', video_id, query={ + 'operationName': 'query' + operation, + 'variables': json.dumps(variables), + 'extensions': json.dumps({ + 'persistedQuery': { + 'sha256Hash': self._OPERATION_HASHES[operation], + }, + }) + }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN})['data'] + + def _extract_episode(self, episode, series): + episode_id = episode['id'] 
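+ # episode data comes from Spotify's GraphQL API; the short audio preview and any externally hosted audio items are exposed as formats below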
+ title = episode['name'].strip() + + formats = [] + audio_preview = episode.get('audioPreview') or {} + audio_preview_url = audio_preview.get('url') + if audio_preview_url: + f = { + 'url': audio_preview_url.replace('://p.scdn.co/mp3-preview/', '://anon-podcast.scdn.co/'), + 'vcodec': 'none', + } + audio_preview_format = audio_preview.get('format') + if audio_preview_format: + f['format_id'] = audio_preview_format + mobj = re.match(r'([0-9A-Z]{3})_(?:[A-Z]+_)?(\d+)', audio_preview_format) + if mobj: + f.update({ + 'abr': int(mobj.group(2)), + 'ext': mobj.group(1).lower(), + }) + formats.append(f) + + for item in (try_get(episode, lambda x: x['audio']['items']) or []): + item_url = item.get('url') + if not (item_url and item.get('externallyHosted')): + continue + formats.append({ + 'url': clean_podcast_url(item_url), + 'vcodec': 'none', + }) + + thumbnails = [] + for source in (try_get(episode, lambda x: x['coverArt']['sources']) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + + return { + 'id': episode_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': strip_or_none(episode.get('description')), + 'duration': float_or_none(try_get( + episode, lambda x: x['duration']['totalMilliseconds']), 1000), + 'release_date': unified_strdate(try_get( + episode, lambda x: x['releaseDate']['isoString'])), + 'series': series, + } + + +class SpotifyIE(SpotifyBaseIE): + IE_NAME = 'spotify' + _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'episode' + _TEST = { + 'url': 'https://open.spotify.com/episode/4Z7GAJ50bgctf6uclHlWKo', + 'md5': '74010a1e3fa4d9e1ab3aa7ad14e42d3b', + 'info_dict': { + 'id': '4Z7GAJ50bgctf6uclHlWKo', + 'ext': 'mp3', + 'title': 'From the archive: Why time management is ruining our lives', + 'description': 'md5:b120d9c4ff4135b42aa9b6d9cde86935', + 'duration': 2083.605, + 'release_date': '20201217', + 'series': "The Guardian's Audio Long Reads", + } + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api('Episode', episode_id, { + 'uri': 'spotify:episode:' + episode_id + })['episode'] + return self._extract_episode( + episode, try_get(episode, lambda x: x['podcast']['name'])) + + +class SpotifyShowIE(SpotifyBaseIE): + IE_NAME = 'spotify:show' + _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'show' + _TEST = { + 'url': 'https://open.spotify.com/show/4PM9Ke6l66IRNpottHKV9M', + 'info_dict': { + 'id': '4PM9Ke6l66IRNpottHKV9M', + 'title': 'The Story from the Guardian', + 'description': 'The Story podcast is dedicated to our finest audio documentaries, investigations and long form stories', + }, + 'playlist_mincount': 36, + } + + def _real_extract(self, url): + show_id = self._match_id(url) + podcast = self._call_api('ShowEpisodes', show_id, { + 'limit': 1000000000, + 'offset': 0, + 'uri': 'spotify:show:' + show_id, + })['podcast'] + podcast_name = podcast.get('name') + + entries = [] + for item in (try_get(podcast, lambda x: x['episodes']['items']) or []): + episode = item.get('episode') + if not episode: + continue + entries.append(self._extract_episode(episode, podcast_name)) + + return self.playlist_result( + entries, show_id, podcast_name, podcast.get('description')) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index a0b6ef4db..4acc29fce 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -255,8 
+255,10 @@ class SVTPlayIE(SVTPlayBaseIE): svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), + r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), webpage, 'video id') info_dict = self._extract_by_video_id(svt_id, webpage) diff --git a/youtube_dl/extractor/trovo.py b/youtube_dl/extractor/trovo.py new file mode 100644 index 000000000..43745213d --- /dev/null +++ b/youtube_dl/extractor/trovo.py @@ -0,0 +1,193 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + try_get, +) + + +class TrovoBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/' + + def _extract_streamer_info(self, data): + streamer_info = data.get('streamerInfo') or {} + username = streamer_info.get('userName') + return { + 'uploader': streamer_info.get('nickName'), + 'uploader_id': str_or_none(streamer_info.get('uid')), + 'uploader_url': 'https://trovo.live/' + username if username else None, + } + + +class TrovoIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?!(?:clip|video)/)(?P[^/?&#]+)' + + def _real_extract(self, url): + username = self._match_id(url) + live_info = self._download_json( + 'https://gql.trovo.live/', username, query={ + 'query': '''{ + getLiveInfo(params: {userName: "%s"}) { + isLive + programInfo { + coverUrl + id + streamInfo { + desc + playUrl + } + title + } + streamerInfo { + nickName + uid + userName + } + } +}''' % username, + })['data']['getLiveInfo'] + if live_info.get('isLive') == 0: + raise ExtractorError('%s is offline' % username, expected=True) + program_info = live_info['programInfo'] + program_id = program_info['id'] + title = self._live_title(program_info['title']) + + formats = [] + for stream_info in (program_info.get('streamInfo') or []): + play_url = stream_info.get('playUrl') + if not play_url: + continue + format_id = stream_info.get('desc') + formats.append({ + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'url': play_url, + }) + self._sort_formats(formats) + + info = { + 'id': program_id, + 'title': title, + 'formats': formats, + 'thumbnail': program_info.get('coverUrl'), + 'is_live': True, + } + info.update(self._extract_streamer_info(live_info)) + return info + + +class TrovoVodIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', + 'info_dict': { + 'id': 'ltv-100095501_100095501_1609596043', + 'ext': 'mp4', + 'title': 'Spontaner 12 Stunden Stream! 
- Ok Boomer!', + 'uploader': 'Exsl', + 'timestamp': 1609640305, + 'upload_date': '20210103', + 'uploader_id': '100095501', + 'duration': 43977, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'comments': 'mincount:8', + 'categories': ['Grand Theft Auto V'], + }, + }, { + 'url': 'https://trovo.live/clip/lc-5285890810184026005', + 'only_matching': True, + }] + + def _real_extract(self, url): + vid = self._match_id(url) + resp = self._download_json( + 'https://gql.trovo.live/', vid, data=json.dumps([{ + 'query': '''{ + batchGetVodDetailInfo(params: {vids: ["%s"]}) { + VodDetailInfos + } +}''' % vid, + }, { + 'query': '''{ + getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) { + commentList { + author { + nickName + uid + } + commentID + content + createdAt + parentID + } + } +}''' % vid, + }]).encode(), headers={ + 'Content-Type': 'application/json', + }) + vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid] + vod_info = vod_detail_info['vodInfo'] + title = vod_info['title'] + + language = vod_info.get('languageName') + formats = [] + for play_info in (vod_info.get('playInfos') or []): + play_url = play_info.get('playUrl') + if not play_url: + continue + format_id = play_info.get('desc') + formats.append({ + 'ext': 'mp4', + 'filesize': int_or_none(play_info.get('fileSize')), + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'language': language, + 'protocol': 'm3u8_native', + 'tbr': int_or_none(play_info.get('bitrate')), + 'url': play_url, + }) + self._sort_formats(formats) + + category = vod_info.get('categoryName') + get_count = lambda x: int_or_none(vod_info.get(x + 'Num')) + + comment_list = try_get(resp, lambda x: x[1]['data']['getCommentList']['commentList'], list) or [] + comments = [] + for comment in comment_list: + content = comment.get('content') + if not content: + continue + author = comment.get('author') or {} + parent = comment.get('parentID') + comments.append({ + 'author': author.get('nickName'), + 'author_id': str_or_none(author.get('uid')), + 'id': str_or_none(comment.get('commentID')), + 'text': content, + 'timestamp': int_or_none(comment.get('createdAt')), + 'parent': 'root' if parent == 0 else str_or_none(parent), + }) + + info = { + 'id': vid, + 'title': title, + 'formats': formats, + 'thumbnail': vod_info.get('coverUrl'), + 'timestamp': int_or_none(vod_info.get('publishTs')), + 'duration': int_or_none(vod_info.get('duration')), + 'view_count': get_count('watch'), + 'like_count': get_count('like'), + 'comment_count': get_count('comment'), + 'comments': comments, + 'categories': [category] if category else None, + } + info.update(self._extract_streamer_info(vod_detail_info)) + return info diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 4a19b9be6..334b7d540 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -20,7 +20,7 @@ from ..utils import ( class TV2IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { 'id': '916509', @@ -33,7 +33,7 @@ class TV2IE(InfoExtractor): 'view_count': int, 'categories': list, }, - } + }] _API_DOMAIN = 'sumo.tv2.no' _PROTOCOLS = ('HDS', 'HLS', 'DASH') _GEO_COUNTRIES = ['NO'] @@ -42,6 +42,12 @@ class TV2IE(InfoExtractor): video_id = self._match_id(url) api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id) + asset = self._download_json( + api_base + 
'.json', video_id, + 'Downloading metadata JSON')['asset'] + title = asset.get('subtitle') or asset['title'] + is_live = asset.get('live') is True + formats = [] format_urls = [] for protocol in self._PROTOCOLS: @@ -81,7 +87,8 @@ class TV2IE(InfoExtractor): elif ext == 'm3u8': if not data.get('drmProtected'): formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', + video_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', m3u8_id=format_id, fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -99,11 +106,6 @@ class TV2IE(InfoExtractor): raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) - asset = self._download_json( - api_base + '.json', video_id, - 'Downloading metadata JSON')['asset'] - title = asset['title'] - thumbnails = [{ 'id': thumbnail.get('@type'), 'url': thumbnail.get('url'), @@ -112,7 +114,7 @@ class TV2IE(InfoExtractor): return { 'id': video_id, 'url': video_url, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': strip_or_none(asset.get('description')), 'thumbnails': thumbnails, 'timestamp': parse_iso8601(asset.get('createTime')), @@ -120,6 +122,7 @@ class TV2IE(InfoExtractor): 'view_count': int_or_none(asset.get('views')), 'categories': asset.get('keywords', '').split(','), 'formats': formats, + 'is_live': is_live, } @@ -168,13 +171,13 @@ class TV2ArticleIE(InfoExtractor): class KatsomoIE(TV2IE): - _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P\d+)' + _TESTS = [{ 'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321', 'info_dict': { 'id': '1181321', 'ext': 'mp4', - 'title': 'MTV Uutiset Live', + 'title': 'Lahden Pelicans teki kovan ratkaisun – Ville Nieminen pihalle', 'description': 'Päätöksen teki Pelicansin hallitus.', 'timestamp': 1575116484, 'upload_date': '20191130', @@ -186,7 +189,60 @@ class KatsomoIE(TV2IE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://www.katsomo.fi/#!/jakso/33001005/studio55-fi/658521/jukka-kuoppamaki-tekee-yha-lauluja-vaikka-lentokoneessa', + 'only_matching': True, + }, { + 'url': 'https://www.mtvuutiset.fi/video/prog1311159', + 'only_matching': True, + }, { + 'url': 'https://www.katsomo.fi/#!/jakso/1311159', + 'only_matching': True, + }] _API_DOMAIN = 'api.katsomo.fi' _PROTOCOLS = ('HLS', 'MPD') _GEO_COUNTRIES = ['FI'] + + +class MTVUutisetArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', + 'info_dict': { + 'id': '1311159', + 'ext': 'mp4', + 'title': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'timestamp': 1600608966, + 'upload_date': '20200920', + 'duration': 153.7886666, + 'view_count': int, + 'categories': list, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # multiple Youtube embeds + 'url': 'https://www.mtvuutiset.fi/artikkeli/50-vuotta-subarun-vastaiskua/6070962', + 
'only_matching': True, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + article = self._download_json( + 'http://api.mtvuutiset.fi/mtvuutiset/api/json/' + article_id, + article_id) + + def entries(): + for video in (article.get('videos') or []): + video_type = video.get('videotype') + video_url = video.get('url') + if not (video_url and video_type in ('katsomo', 'youtube')): + continue + yield self.url_result( + video_url, video_type.capitalize(), video.get('video_id')) + + return self.playlist_result( + entries(), article_id, article.get('title'), article.get('description')) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index c498b0191..b73bab9a8 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -17,7 +17,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| + (?:program|barn)/(?:(?:[^/]+/){1,2}|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -65,6 +65,10 @@ class TV4IE(InfoExtractor): { 'url': 'http://www.tv4play.se/program/farang/3922081', 'only_matching': True, + }, + { + 'url': 'https://www.tv4play.se/program/nyheterna/avsnitt/13315940', + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1190d721e..ed495f297 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -373,6 +373,24 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': '1eVjYOLGkGrQL', }, 'add_ie': ['TwitterBroadcast'], + }, { + # unified card + 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', + 'info_dict': { + 'id': '1349794411333394432', + 'ext': 'mp4', + 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:71ead15ec44cee55071547d6447c6a3e', + 'uploader': 'Brooklyn Nets', + 'uploader_id': 'BrooklynNets', + 'duration': 324.484, + 'timestamp': 1610651040, + 'upload_date': '20210114', + }, + 'params': { + 'skip_download': True, + }, }, { # Twitch Clip Embed 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', @@ -389,6 +407,22 @@ class TwitterIE(TwitterBaseIE): # appplayer card 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', 'only_matching': True, + }, { + # video_direct_message card + 'url': 'https://twitter.com/qarev001/status/1348948114569269251', + 'only_matching': True, + }, { + # poll2choice_video card + 'url': 'https://twitter.com/CAF_Online/status/1349365911120195585', + 'only_matching': True, + }, { + # poll3choice_video card + 'url': 'https://twitter.com/SamsungMobileSA/status/1348609186725289984', + 'only_matching': True, + }, { + # poll4choice_video card + 'url': 'https://twitter.com/SouthamptonFC/status/1347577658079641604', + 'only_matching': True, }] def _real_extract(self, url): @@ -433,8 +467,7 @@ class TwitterIE(TwitterBaseIE): 'tags': tags, } - media = try_get(status, lambda x: x['extended_entities']['media'][0]) - if media and media.get('type') != 'photo': + def extract_from_video_info(media): video_info = media.get('video_info') or {} formats = [] @@ -461,6 +494,10 @@ class TwitterIE(TwitterBaseIE): 'thumbnails': thumbnails, 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) + + media = try_get(status, lambda x: x['extended_entities']['media'][0]) + if media and media.get('type') != 'photo': + extract_from_video_info(media) else: card = status.get('card') if card: @@ -493,7 +530,12 @@ class 
TwitterIE(TwitterBaseIE): '_type': 'url', 'url': get_binding_value('card_url'), }) - # amplify, promo_video_website, promo_video_convo, appplayer, ... + elif card_name == 'unified_card': + media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] + extract_from_video_info(next(iter(media_entities.values()))) + # amplify, promo_video_website, promo_video_convo, appplayer, + # video_direct_message, poll2choice_video, poll3choice_video, + # poll4choice_video, ... else: is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index b48baf00b..b1243e847 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_iso8601, + str_or_none, + strip_or_none, + try_get, +) class VidioIE(InfoExtractor): @@ -21,57 +27,63 @@ class VidioIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 149, 'like_count': int, + 'uploader': 'TWELVE Pic', + 'timestamp': 1444902800, + 'upload_date': '20151015', + 'uploader_id': 'twelvepictures', + 'channel': 'Cover Music Video', + 'channel_id': '280236', + 'view_count': int, + 'dislike_count': int, + 'comment_count': int, + 'tags': 'count:4', }, }, { 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', 'only_matching': True, }] + def _real_initialize(self): + self._api_key = self._download_json( + 'https://www.vidio.com/auth', None, data=b'')['api_key'] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') + video_id, display_id = re.match(self._VALID_URL, url).groups() + data = self._download_json( + 'https://api.vidio.com/videos/' + video_id, display_id, headers={ + 'Content-Type': 'application/vnd.api+json', + 'X-API-KEY': self._api_key, + }) + video = data['videos'][0] + title = video['title'].strip() - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - - m3u8_url, duration, thumbnail = [None] * 3 - - clips = self._parse_json( - self._html_search_regex( - r'data-json-clips\s*=\s*(["\'])(?P\[.+?\])\1', - webpage, 'video data', default='[]', group='data'), - display_id, fatal=False) - if clips: - clip = clips[0] - m3u8_url = clip.get('sources', [{}])[0].get('file') - duration = clip.get('clip_duration') - thumbnail = clip.get('image') - - m3u8_url = m3u8_url or self._search_regex( - r'data(?:-vjs)?-clip-hls-url=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'hls url', group='url') formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + data['clips'][0]['hls_url'], display_id, 'mp4', 'm3u8_native') self._sort_formats(formats) - duration = int_or_none(duration or self._search_regex( - r'data-video-duration=(["\'])(?P\d+)\1', webpage, - 'duration', fatal=False, group='duration')) - thumbnail = thumbnail or self._og_search_thumbnail(webpage) - - like_count = int_or_none(self._search_regex( - (r']+data-comment-vote-count=["\'](\d+)', - r']+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'), - webpage, 'like count', fatal=False)) + get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {} + channel = get_first('channel') + user = 
get_first('user') + username = user.get('username') + get_count = lambda x: int_or_none(video.get('total_' + x)) return { 'id': video_id, 'display_id': display_id, 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': thumbnail, - 'duration': duration, - 'like_count': like_count, + 'description': strip_or_none(video.get('description')), + 'thumbnail': video.get('image_url_medium'), + 'duration': int_or_none(video.get('duration')), + 'like_count': get_count('likes'), 'formats': formats, + 'uploader': user.get('name'), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader_id': username, + 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + 'channel': channel.get('name'), + 'channel_id': str_or_none(channel.get('id')), + 'view_count': get_count('view_count'), + 'dislike_count': get_count('dislikes'), + 'comment_count': get_count('comments'), + 'tags': video.get('tag_list'), } diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py deleted file mode 100644 index 42ea4952c..000000000 --- a/youtube_dl/extractor/vidzi.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - decode_packed_codes, - js_to_json, - NO_DEFAULT, - PACKED_CODES_RE, -) - - -class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P[0-9a-zA-Z]+)' - _TESTS = [{ - 'url': 'http://vidzi.tv/cghql9yq6emu.html', - 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', - 'info_dict': { - 'id': 'cghql9yq6emu', - 'ext': 'mp4', - 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.cc/cghql9yq6emu.html', - 'only_matching': True, - }, { - 'url': 'https://vidzi.si/rph9gztxj1et.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.nu/cghql9yq6emu.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://vidzi.tv/%s' % video_id, video_id) - title = self._html_search_regex( - r'(?s)

<h2 class="video-title">(.*?)</h2>
', webpage, 'title') - - codes = [webpage] - codes.extend([ - decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') - for mobj in re.finditer(PACKED_CODES_RE, webpage)]) - for num, code in enumerate(codes, 1): - jwplayer_data = self._parse_json( - self._search_regex( - r'setup\(([^)]+)\)', code, 'jwplayer data', - default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=lambda s: js_to_json( - re.sub(r'\s*\+\s*window\[.+?\]', '', s))) - if jwplayer_data: - break - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict['title'] = title - - return info_dict diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 6224e6200..e2f5d81b8 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -116,7 +116,7 @@ class VLiveIE(VLiveBaseIE): headers={'Referer': 'https://www.vlive.tv/'}, query=query) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self.raise_login_required(json.loads(e.cause.read().decode())['message']) + self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message']) raise def _real_extract(self, url): diff --git a/youtube_dl/extractor/vtm.py b/youtube_dl/extractor/vtm.py new file mode 100644 index 000000000..093f1aa69 --- /dev/null +++ b/youtube_dl/extractor/vtm.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) + + +class VTMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vtm\.be/([^/?&#]+)~v(?P[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12})' + _TEST = { + 'url': 'https://vtm.be/gast-vernielt-genkse-hotelkamer~ve7534523-279f-4b4d-a5c9-a33ffdbe23e1', + 'md5': '37dca85fbc3a33f2de28ceb834b071f8', + 'info_dict': { + 'id': '192445', + 'ext': 'mp4', + 'title': 'Gast vernielt Genkse hotelkamer', + 'timestamp': 1611060180, + 'upload_date': '20210119', + 'duration': 74, + # TODO: fix url _type result processing + # 'series': 'Op Interventie', + } + } + + def _real_extract(self, url): + uuid = self._match_id(url) + video = self._download_json( + 'https://omc4vm23offuhaxx6hekxtzspi.appsync-api.eu-west-1.amazonaws.com/graphql', + uuid, query={ + 'query': '''{ + getComponent(type: Video, uuid: "%s") { + ... 
on Video { + description + duration + myChannelsVideo + program { + title + } + publishedAt + title + } + } +}''' % uuid, + }, headers={ + 'x-api-key': 'da2-lz2cab4tfnah3mve6wiye4n77e', + })['data']['getComponent'] + + return { + '_type': 'url', + 'id': uuid, + 'title': video.get('title'), + 'url': 'http://mychannels.video/embed/%d' % video['myChannelsVideo'], + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publishedAt')), + 'duration': int_or_none(video.get('duration')), + 'series': try_get(video, lambda x: x['program']['title']), + 'ie_key': 'Medialaan', + } diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index f4cae7fe9..778ce8b76 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( ExtractorError, int_or_none, @@ -47,6 +48,22 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # video_type == 'video/youtube' + 'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': 'RzmFKUDOUgw', + 'ext': 'mp4', + 'title': 'Trailer', + 'upload_date': '20150906', + 'description': 'md5:a5e802558d35247fee285875328c0b80', + 'uploader_id': 'BandaiVisual', + 'uploader': 'BANDAI NAMCO Arts Channel', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', 'only_matching': True @@ -154,12 +171,13 @@ class VVVVIDIE(InfoExtractor): if season_number: info['season_number'] = int(season_number) - for quality in ('_sd', ''): + video_type = video_data.get('video_type') + is_youtube = False + for quality in ('', '_sd'): embed_code = video_data.get('embed_info' + quality) if not embed_code: continue embed_code = ds(embed_code) - video_type = video_data.get('video_type') if video_type in ('video/rcs', 'video/kenc'): if video_type == 'video/kenc': kenc = self._download_json( @@ -172,19 +190,28 @@ class VVVVIDIE(InfoExtractor): if kenc_message: embed_code += '?' 
+ ds(kenc_message) formats.extend(self._extract_akamai_formats(embed_code, video_id)) + elif video_type == 'video/youtube': + info.update({ + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'url': embed_code, + }) + is_youtube = True + break else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) metadata_from_url(embed_code) - self._sort_formats(formats) + if not is_youtube: + self._sort_formats(formats) + info['formats'] = formats metadata_from_url(video_data.get('thumbnail')) info.update(self._extract_common_video_info(video_data)) info.update({ 'id': video_id, 'title': title, - 'formats': formats, 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 8ef3e0906..f6940b371 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -1,12 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - ExtractorError, unified_strdate, HEADRequest, int_or_none, @@ -46,15 +43,6 @@ class WatIE(InfoExtractor): }, ] - _FORMATS = ( - (200, 416, 234), - (400, 480, 270), - (600, 640, 360), - (1200, 640, 360), - (1800, 960, 540), - (2500, 1280, 720), - ) - def _real_extract(self, url): video_id = self._match_id(url) video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) @@ -97,46 +85,20 @@ class WatIE(InfoExtractor): return red_url return None - def remove_bitrate_limit(manifest_url): - return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url) - formats = [] - try: - alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id) - m3u8_url = manifest_urls.get('hls') - if m3u8_url: - m3u8_url = remove_bitrate_limit(m3u8_url) - for m3u8_alt_url in alt_urls(m3u8_url): - formats.extend(self._extract_m3u8_formats( - m3u8_alt_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - mpd_url = manifest_urls.get('mpd') - if mpd_url: - mpd_url = remove_bitrate_limit(mpd_url) - for mpd_alt_url in alt_urls(mpd_url): - formats.extend(self._extract_mpd_formats( - mpd_alt_url, video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - except ExtractorError: - abr = 64 - for vbr, width, height in self._FORMATS: - tbr = vbr + abr - format_id = 'http-%s' % tbr - fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr) - if self._is_valid_url(fmt_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': fmt_url, - 'vbr': vbr, - 'abr': abr, - 'width': width, - 'height': height, - }) + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id) + m3u8_url = manifest_urls.get('hls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + mpd_url = manifest_urls.get('mpd') + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + 
video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') upload_date = unified_strdate(date_diffusion) if date_diffusion else None diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index e4615376c..a17b10d6e 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -177,46 +177,9 @@ class YahooIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - url, country, display_id = re.match(self._VALID_URL, url).groups() - if not country: - country = 'us' - else: - country = country.split('-')[0] - api_base = 'https://%s.yahoo.com/_td/api/resource/' % country - - for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]): - content = self._download_json( - api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid, - display_id, 'Downloading content JSON metadata', fatal=i == 1) - if content: - item = content['items'][0] - break - - if item.get('type') != 'video': - entries = [] - - cover = item.get('cover') or {} - if cover.get('type') == 'yvideo': - cover_url = cover.get('url') - if cover_url: - entries.append(self.url_result( - cover_url, 'Yahoo', cover.get('uuid'))) - - for e in item.get('body', []): - if e.get('type') == 'videoIframe': - iframe_url = e.get('url') - if not iframe_url: - continue - entries.append(self.url_result(iframe_url)) - - return self.playlist_result( - entries, item.get('uuid'), - item.get('title'), item.get('summary')) - - video_id = item['uuid'] + def _extract_yahoo_video(self, video_id, country): video = self._download_json( - api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id, + 'https://%s.yahoo.com/_td/api/resource/VideoService.videos;view=full;video_ids=["%s"]' % (country, video_id), video_id, 'Downloading video JSON metadata')[0] title = video['title'] @@ -298,7 +261,6 @@ class YahooIE(InfoExtractor): 'id': video_id, 'title': self._live_title(title) if is_live else title, 'formats': formats, - 'display_id': display_id, 'thumbnails': thumbnails, 'description': clean_html(video.get('description')), 'timestamp': parse_iso8601(video.get('publish_time')), @@ -311,6 +273,44 @@ class YahooIE(InfoExtractor): 'episode_number': int_or_none(series_info.get('episode_number')), } + def _real_extract(self, url): + url, country, display_id = re.match(self._VALID_URL, url).groups() + if not country: + country = 'us' + else: + country = country.split('-')[0] + + item = self._download_json( + 'https://%s.yahoo.com/caas/content/article' % country, display_id, + 'Downloading content JSON metadata', query={ + 'url': url + })['items'][0]['data']['partnerData'] + + if item.get('type') != 'video': + entries = [] + + cover = item.get('cover') or {} + if cover.get('type') == 'yvideo': + cover_url = cover.get('url') + if cover_url: + entries.append(self.url_result( + cover_url, 'Yahoo', cover.get('uuid'))) + + for e in (item.get('body') or []): + if e.get('type') == 'videoIframe': + iframe_url = e.get('url') + if not iframe_url: + continue + entries.append(self.url_result(iframe_url)) + + return self.playlist_result( + entries, item.get('uuid'), + item.get('title'), item.get('summary')) + + info = self._extract_yahoo_video(item['uuid'], country) + info['display_id'] = display_id + return info + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 7b9feafeb..534270bac 
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index 7b9feafeb..534270bac 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -60,6 +60,9 @@ class YouPornIE(InfoExtractor):
     }, {
         'url': 'http://www.youporn.com/watch/505835',
         'only_matching': True,
+    }, {
+        'url': 'https://www.youporn.com/watch/13922959/femdom-principal/',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -88,7 +91,7 @@ class YouPornIE(InfoExtractor):
         # Main source
         definitions = self._parse_json(
             self._search_regex(
-                r'mediaDefinition\s*=\s*(\[.+?\]);', webpage,
+                r'mediaDefinition\s*[=:]\s*(\[.+?\])\s*[;,]', webpage,
                 'media definitions', default='[]'),
             video_id, fatal=False)
         if definitions:
@@ -100,7 +103,7 @@ class YouPornIE(InfoExtractor):
                 links.append(video_url)
 
         # Fallback #1, this also contains extra low quality 180p format
-        for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
+        for _, link in re.findall(r'<a[^>]+href=(["\'])(http(?:(?!\1).)+\.mp4(?:(?!\1).)*)\1[^>]+title=["\']Download [Vv]ideo', webpage):
            links.append(link)
 
         # Fallback #2 (unavailable as at 22.06.2017)
@@ -128,8 +131,9 @@ class YouPornIE(InfoExtractor):
         # Video URL's path looks like this:
         #  /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
         #  /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+        #  /videos/201703/11/109285532/1080P_4000K_109285532.mp4
         # We will benefit from it by extracting some metadata
-        mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
+        mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
         if mobj:
             height = int(mobj.group('height'))
             bitrate = int(mobj.group('bitrate'))
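The relaxed mediaDefinition pattern above now tolerates both a plain assignment and an object-property form, and accepts a comma as well as a semicolon after the array. A quick self-contained check of that behaviour (the sample page snippets are invented):

    import re

    PATTERN = r'mediaDefinition\s*[=:]\s*(\[.+?\])\s*[;,]'

    assert re.search(PATTERN, 'var mediaDefinition = [{"q": 720}];').group(1) == '[{"q": 720}]'
    # The old pattern r'mediaDefinition\s*=\s*(\[.+?\]);' missed this form:
    assert re.search(PATTERN, 'page = {mediaDefinition: [{"q": 720}], next: 1}').group(1) == '[{"q": 720}]'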
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index f57099f8c..eb5f70763 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -2,41 +2,34 @@
 from __future__ import unicode_literals
 
-
 import itertools
 import json
 import os.path
 import random
 import re
-import time
 import traceback
 
 from .common import InfoExtractor, SearchInfoExtractor
-from ..jsinterp import JSInterpreter
-from ..swfinterp import SWFInterpreter
 from ..compat import (
     compat_chr,
     compat_HTTPError,
     compat_parse_qs,
-    compat_urllib_parse_unquote,
+    compat_str,
     compat_urllib_parse_unquote_plus,
     compat_urllib_parse_urlencode,
     compat_urllib_parse_urlparse,
     compat_urlparse,
-    compat_str,
 )
+from ..jsinterp import JSInterpreter
 from ..utils import (
-    bool_or_none,
-    clean_html,
-    error_to_compat_str,
     ExtractorError,
+    clean_html,
     float_or_none,
-    get_element_by_id,
     int_or_none,
     mimetype2ext,
     parse_codecs,
     parse_duration,
-    remove_quotes,
+    qualities,
     remove_start,
     smuggle_url,
     str_or_none,
@@ -46,7 +39,6 @@ from ..utils import (
     unified_strdate,
     unsmuggle_url,
     update_url_query,
-    uppercase_escape,
     url_or_none,
     urlencode_postdata,
     urljoin,
@@ -68,12 +60,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 
     _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'
 
-    def _set_language(self):
-        self._set_cookie(
-            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
-            # YouTube sets the expire time to about two months
-            expire_time=time.time() + 2 * 30 * 24 * 3600)
-
     def _ids_to_results(self, ids):
         return [
             self.url_result(vid_id, 'Youtube', video_id=vid_id)
@@ -265,7 +251,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     def _real_initialize(self):
         if self._downloader is None:
             return
-        self._set_language()
         if not self._login():
             return
@@ -282,19 +267,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
     _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
 
     _PLAYER_INFO_RE = (
-        r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
-        r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
+        r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.js$',
+        r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
     )
+
+    _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
+
+    _GEO_BYPASS = False
+
+    IE_NAME = 'youtube'
+    _TESTS = [
+        {
+            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
+            'info_dict': {
+                'id': 'BaW_jenozKc',
+                'ext': 'mp4',
+                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+                'uploader': 'Philipp Hagemeister',
+                'uploader_id': 'phihag',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+                'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
+                'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
+                'upload_date': '20121002',
+                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+                'categories': ['Science & Technology'],
+                'tags': ['youtube-dl'],
+                'duration': 10,
+                'view_count': int,
+                'like_count': int,
+                'dislike_count': int,
+                'start_time': 1,
+                'end_time': 9,
+            }
+        },
+        {
+            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
+            'note': 'Embed-only video (#1746)',
+            'info_dict': {
+                'id': 'yZIXLfi8CZQ',
+                'ext': 'mp4',
+                'upload_date': '20120608',
+                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
+                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
+                'uploader': 'SET India',
+                'uploader_id': 'setindia',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
+                'age_limit': 18,
+            },
+            'skip': 'Private video',
+        },
+        {
+            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
+            'note': 'Use the first video ID in the URL',
+            'info_dict': {
+                'id': 'BaW_jenozKc',
+                'ext': 'mp4',
+                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+                'uploader': 'Philipp Hagemeister',
+                'uploader_id': 'phihag',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+                'upload_date': '20121002',
+                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+                'categories': ['Science & Technology'],
+                'tags': ['youtube-dl'],
+                'duration': 10,
+                'view_count': int,
+                'like_count': int,
+                'dislike_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
+            'note': '256k DASH audio (format 141) via DASH manifest',
+            'info_dict': {
+                'id': 'a9LDPn-MO4I',
+                'ext': 'm4a',
+                'upload_date': '20121002',
+                'uploader_id': '8KVIDEO',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
+                'description': '',
+                'uploader': '8KVIDEO',
+                'title': 'UHDTV TEST 8K VIDEO.mp4'
+            },
+            'params': {
+                'youtube_include_dash_manifest': True,
+                'format': '141',
+            },
+            'skip': 'format 141 not served anymore',
+        },
+        # DASH manifest with encrypted signature
+        {
+            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
+            'info_dict': {
+                'id': 'IB3lcPjvWLA',
+                'ext': 'm4a',
+                'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft.
Spree Wilson', + 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', + 'duration': 244, + 'uploader': 'AfrojackVEVO', + 'uploader_id': 'AfrojackVEVO', + 'upload_date': '20131011', + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141/bestaudio[ext=m4a]', + }, + }, + # Controversy video + { + 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', + 'info_dict': { + 'id': 'T4XJQO3qol8', + 'ext': 'mp4', + 'duration': 219, + 'upload_date': '20100909', + 'uploader': 'Amazing Atheist', + 'uploader_id': 'TheAmazingAtheist', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', + 'title': 'Burning Everyone\'s Koran', + 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', + } + }, + # Normal age-gate video (No vevo, embed allowed), available via embed page + { + 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', + 'info_dict': { + 'id': 'HtVdAasjOgU', + 'ext': 'mp4', + 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', + 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', + 'duration': 142, + 'uploader': 'The Witcher', + 'uploader_id': 'WitcherGame', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', + 'upload_date': '20140605', + 'age_limit': 18, + }, + }, + { + # Age-gated video only available with authentication (unavailable + # via embed page workaround) + 'url': 'XgnwCQzjau8', + 'only_matching': True, + }, + # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) + # YouTube Red ad is not captured for creator + { + 'url': '__2ABJjxzNo', + 'info_dict': { + 'id': '__2ABJjxzNo', + 'ext': 'mp4', + 'duration': 266, + 'upload_date': '20100430', + 'uploader_id': 'deadmau5', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', + 'creator': 'deadmau5', + 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336', + 'uploader': 'deadmau5', + 'title': 'Deadmau5 - Some Chords (HD)', + 'alt_title': 'Some Chords', + }, + 'expected_warnings': [ + 'DASH manifest missing', + ] + }, + # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) + { + 'url': 'lqQg6PlCWgI', + 'info_dict': { + 'id': 'lqQg6PlCWgI', + 'ext': 'mp4', + 'duration': 6085, + 'upload_date': '20150827', + 'uploader_id': 'olympic', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', + 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', + 'uploader': 'Olympic', + 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', + }, + 'params': { + 'skip_download': 'requires avconv', + } + }, + # Non-square pixels + { + 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', + 'info_dict': { + 'id': '_b-2C3KPAM0', + 'ext': 'mp4', + 'stretched_ratio': 16 / 9., + 'duration': 85, + 'upload_date': '20110310', + 'uploader_id': 'AllenMeow', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', + 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', + 'uploader': '孫ᄋᄅ', + 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', + }, + }, + # url_encoded_fmt_stream_map is empty string + { + 'url': 'qEJwOuvDf7I', + 'info_dict': { + 'id': 'qEJwOuvDf7I', + 'ext': 'webm', + 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', + 'description': '', + 'upload_date': '20150404', + 'uploader_id': 
'spbelect', + 'uploader': 'Наблюдатели Петербурга', + }, + 'params': { + 'skip_download': 'requires avconv', + }, + 'skip': 'This live event has ended.', + }, + # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) + { + 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', + 'info_dict': { + 'id': 'FIl7x6_3R5Y', + 'ext': 'webm', + 'title': 'md5:7b81415841e02ecd4313668cde88737a', + 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'duration': 220, + 'upload_date': '20150625', + 'uploader_id': 'dorappi2000', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', + 'uploader': 'dorappi2000', + 'formats': 'mincount:31', + }, + 'skip': 'not actual anymore', + }, + # DASH manifest with segment_list + { + 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', + 'md5': '8ce563a1d667b599d21064e982ab9e31', + 'info_dict': { + 'id': 'CsmdDsKjzN8', + 'ext': 'mp4', + 'upload_date': '20150501', # According to ' valid video id redirection + 'url': 'DJztXj2GPfl', + 'info_dict': { + 'id': 'DJztXj2GPfk', + 'ext': 'mp4', + 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', + 'description': 'md5:bf577a41da97918e94fa9798d9228825', + 'upload_date': '20090125', + 'uploader': 'Prochorowka', + 'uploader_id': 'Prochorowka', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', + 'artist': 'Panjabi MC', + 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', + 'album': 'Beware of the Boys (Mundian To Bach Ke)', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Video unavailable', + }, + { + # empty description results in an empty string + 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k', + 'info_dict': { + 'id': 'x41yOUIvK2k', + 'ext': 'mp4', + 'title': 'IMG 3456', + 'description': '', + 'upload_date': '20170613', + 'uploader_id': 'ElevageOrVert', + 'uploader': 'ElevageOrVert', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # with '};' inside yt initial data (see [1]) + # see [2] for an example with '};' inside ytInitialPlayerResponse + # 1. https://github.com/ytdl-org/youtube-dl/issues/27093 + # 2. 
https://github.com/ytdl-org/youtube-dl/issues/27216 + 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', + 'info_dict': { + 'id': 'CHqg6qOn4no', + 'ext': 'mp4', + 'title': 'Part 77 Sort a list of simple types in c#', + 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', + 'upload_date': '20130831', + 'uploader_id': 'kudvenkat', + 'uploader': 'kudvenkat', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # another example of '};' in ytInitialData + 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY', + 'only_matching': True, + }, + { + 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', + 'only_matching': True, + }, + ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -524,647 +1124,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } - _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') - - _GEO_BYPASS = False - - IE_NAME = 'youtube' - _TESTS = [ - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', - 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'start_time': 1, - 'end_time': 9, - } - }, - { - 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', - 'note': 'Embed-only video (#1746)', - 'info_dict': { - 'id': 'yZIXLfi8CZQ', - 'ext': 'mp4', - 'upload_date': '20120608', - 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', - 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', - 'uploader': 'SET India', - 'uploader_id': 'setindia', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', - 'age_limit': 18, - } - }, - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ', - 'note': 'Use the first video ID in the URL', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', - 'note': '256k DASH audio (format 141) via DASH manifest', - 'info_dict': { - 'id': 'a9LDPn-MO4I', - 'ext': 'm4a', - 'upload_date': '20121002', - 'uploader_id': '8KVIDEO', - 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', - 'description': '', - 'uploader': '8KVIDEO', - 'title': 'UHDTV TEST 8K VIDEO.mp4' - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141', - }, - 'skip': 'format 141 not served anymore', - }, - # DASH manifest with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', - 'info_dict': { - 'id': 'IB3lcPjvWLA', - 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', - 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', - 'duration': 244, - 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', - 'upload_date': '20131011', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # Controversy video - { - 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', - 'info_dict': { - 'id': 'T4XJQO3qol8', - 'ext': 'mp4', - 'duration': 219, - 'upload_date': '20100909', - 'uploader': 'Amazing Atheist', - 'uploader_id': 'TheAmazingAtheist', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', - 'title': 'Burning Everyone\'s Koran', - 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', - } - }, - # Normal age-gate video (No vevo, embed allowed), available via embed page - { - 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', - 'info_dict': { - 'id': 'HtVdAasjOgU', - 'ext': 'mp4', - 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', - 'duration': 142, - 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', - 'upload_date': '20140605', - 'age_limit': 18, - }, - }, - { - # Age-gated video only available with authentication (unavailable - # via embed page workaround) - 'url': 'XgnwCQzjau8', - 'only_matching': True, - }, - # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) - # YouTube Red ad is not captured for creator - { - 'url': '__2ABJjxzNo', - 'info_dict': { - 'id': '__2ABJjxzNo', - 'ext': 'mp4', - 'duration': 266, - 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'Dada Life, deadmau5', - 'description': 'md5:12c56784b8032162bb936a5f76d55360', - 'uploader': 'deadmau5', - 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'This Machine Kills Some Chords', - }, - 'expected_warnings': [ - 'DASH manifest missing', - ] - }, - # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) - { - 'url': 'lqQg6PlCWgI', - 'info_dict': { - 'id': 'lqQg6PlCWgI', - 'ext': 'mp4', - 'duration': 6085, - 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympic', - 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', - }, - 'params': { - 'skip_download': 'requires avconv', - } - }, - # Non-square pixels - { - 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', - 'info_dict': { - 'id': '_b-2C3KPAM0', - 'ext': 'mp4', - 'stretched_ratio': 16 / 9., - 'duration': 85, - 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', - 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫ᄋᄅ', - 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', - }, - }, - # url_encoded_fmt_stream_map is empty string - { - 'url': 'qEJwOuvDf7I', - 'info_dict': { - 'id': 'qEJwOuvDf7I', - 'ext': 'webm', - 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', - 'description': '', - 'upload_date': '20150404', - 'uploader_id': 'spbelect', - 'uploader': 'Наблюдатели Петербурга', - }, - 'params': { - 'skip_download': 'requires avconv', - }, - 'skip': 'This live event has ended.', - }, - # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) - { - 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', - 'info_dict': { - 'id': 'FIl7x6_3R5Y', - 'ext': 'webm', - 'title': 'md5:7b81415841e02ecd4313668cde88737a', - 'description': 'md5:116377fd2963b81ec4ce64b542173306', - 'duration': 220, - 'upload_date': '20150625', - 'uploader_id': 'dorappi2000', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', - 'uploader': 'dorappi2000', - 'formats': 'mincount:31', - }, - 'skip': 'not actual anymore', - }, - # DASH manifest with segment_list - { - 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', - 'md5': '8ce563a1d667b599d21064e982ab9e31', - 'info_dict': { - 'id': 'CsmdDsKjzN8', - 'ext': 'mp4', - 'upload_date': '20150501', # According to ' valid video id redirection - 'url': 'DJztXj2GPfl', - 'info_dict': { - 'id': 'DJztXj2GPfk', - 'ext': 'mp4', - 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', - 'description': 'md5:bf577a41da97918e94fa9798d9228825', - 'upload_date': '20090125', - 'uploader': 'Prochorowka', - 'uploader_id': 'Prochorowka', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', - 'artist': 'Panjabi MC', - 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', - 'album': 'Beware of the Boys (Mundian To Bach Ke)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # empty description results in an empty string - 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k', - 'info_dict': { - 'id': 'x41yOUIvK2k', - 'ext': 'mp4', - 'title': 'IMG 3456', - 'description': '', - 'upload_date': '20170613', - 'uploader_id': 'ElevageOrVert', - 'uploader': 'ElevageOrVert', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # with '};' inside yt initial data (see [1]) - # see [2] for an example with '};' inside ytInitialPlayerResponse - # 1. https://github.com/ytdl-org/youtube-dl/issues/27093 - # 2. 
https://github.com/ytdl-org/youtube-dl/issues/27216 - 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', - 'info_dict': { - 'id': 'CHqg6qOn4no', - 'ext': 'mp4', - 'title': 'Part 77 Sort a list of simple types in c#', - 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', - 'upload_date': '20130831', - 'uploader_id': 'kudvenkat', - 'uploader': 'kudvenkat', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # another example of '};' in ytInitialData - 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY', - 'only_matching': True, - }, - { - 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', - 'only_matching': True, - }, - ] def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) + self._code_cache = {} self._player_cache = {} - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen('%s: Downloading video info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen('%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen('%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen('RTMP download detected') - def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) @@ -1177,40 +1142,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break else: raise ExtractorError('Cannot identify player %r' % player_url) - return id_m.group('ext'), id_m.group('id') + return id_m.group('id') def _extract_signature_function(self, video_id, player_url, example_sig): - player_type, player_id = self._extract_player_info(player_url) + player_id = self._extract_player_info(player_url) # Read from filesystem cache - func_id = '%s_%s_%s' % ( - player_type, player_id, self._signature_cache_id(example_sig)) + func_id = 'js_%s_%s' % ( + player_id, self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) - download_note = ( - 'Downloading player %s' % player_url - if self._downloader.params.get('verbose') else - 'Downloading %s player %s' % (player_type, player_id) - ) - if player_type == 'js': - code = self._download_webpage( + if player_id not in self._code_cache: + self._code_cache[player_id] = self._download_webpage( player_url, video_id, - note=download_note, + note='Downloading player ' + player_id, errnote='Download of %s failed' % player_url) - res = self._parse_sig_js(code) - elif player_type == 'swf': - urlh = self._request_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - code = urlh.read() - res = self._parse_sig_swf(code) - else: - assert False, 'Invalid player type %r' % player_type + code = self._code_cache[player_id] + res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) cache_res = res(test_string) @@ -1279,14 +1231,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) - def _parse_sig_swf(self, file_contents): - swfi = 
SWFInterpreter(file_contents) - TARGET_CLASSNAME = 'SignatureDecipher' - searched_class = swfi.extract_class(TARGET_CLASSNAME) - initial_function = swfi.extract_function(searched_class, 'decipher') - return lambda s: initial_function([s]) - - def _decrypt_signature(self, s, video_id, player_url, age_gate=False): + def _decrypt_signature(self, s, video_id, player_url): """Turn the encrypted s field into a working signature""" if player_url is None: @@ -1313,158 +1258,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_subtitles(self, video_id, webpage): - try: - subs_doc = self._download_xml( - 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) - return {} - - sub_lang_list = {} - for track in subs_doc.findall('track'): - lang = track.attrib['lang_code'] - if lang in sub_lang_list: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': ext, - 'name': track.attrib['name'].encode('utf-8'), - }) - sub_formats.append({ - 'url': 'https://www.youtube.com/api/timedtext?' + params, - 'ext': ext, - }) - sub_lang_list[lang] = sub_formats - if not sub_lang_list: - self._downloader.report_warning('video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _get_ytplayer_config(self, video_id, webpage): - patterns = ( - # User data may contain arbitrary character sequences that may affect - # JSON extraction with regex, e.g. when '};' is contained the second - # regex won't capture the whole JSON. Yet working around by trying more - # concrete regex first keeping in mind proper quoted string handling - # to be implemented in future that will replace this workaround (see - # https://github.com/ytdl-org/youtube-dl/issues/7468, - # https://github.com/ytdl-org/youtube-dl/pull/7599) - r';ytplayer\.config\s*=\s*({.+?});ytplayer', - r';ytplayer\.config\s*=\s*({.+?});', - ) - config = self._search_regex( - patterns, webpage, 'ytplayer.config', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - - def _get_automatic_captions(self, video_id, player_response, player_config): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - self.to_screen('%s: Looking for automatic captions' % video_id) - err_msg = 'Couldn\'t find automatic captions for %s' % video_id - if not (player_response or player_config): - self._downloader.report_warning(err_msg) - return {} - try: - args = player_config.get('args') if player_config else {} - caption_url = args.get('ttsurl') - if caption_url: - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse_urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') - - sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] - sub_formats = 
[]
-                for ext in self._SUBTITLE_FORMATS:
-                    params = compat_urllib_parse_urlencode({
-                        'lang': original_lang,
-                        'tlang': sub_lang,
-                        'fmt': ext,
-                        'ts': timestamp,
-                        'kind': caption_kind,
-                    })
-                    sub_formats.append({
-                        'url': caption_url + '&' + params,
-                        'ext': ext,
-                    })
-                sub_lang_list[sub_lang] = sub_formats
-                return sub_lang_list
-
-            def make_captions(sub_url, sub_langs):
-                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
-                caption_qs = compat_parse_qs(parsed_sub_url.query)
-                captions = {}
-                for sub_lang in sub_langs:
-                    sub_formats = []
-                    for ext in self._SUBTITLE_FORMATS:
-                        caption_qs.update({
-                            'tlang': [sub_lang],
-                            'fmt': [ext],
-                        })
-                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
-                            query=compat_urllib_parse_urlencode(caption_qs, True)))
-                        sub_formats.append({
-                            'url': sub_url,
-                            'ext': ext,
-                        })
-                    captions[sub_lang] = sub_formats
-                return captions
-
-            # New captions format as of 22.06.2017
-            if player_response:
-                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
-                base_url = renderer['captionTracks'][0]['baseUrl']
-                sub_lang_list = []
-                for lang in renderer['translationLanguages']:
-                    lang_code = lang.get('languageCode')
-                    if lang_code:
-                        sub_lang_list.append(lang_code)
-                return make_captions(base_url, sub_lang_list)
-
-            # Some videos don't provide ttsurl but rather caption_tracks and
-            # caption_translation_languages (e.g. 20LmZk1hakA)
-            # Does not used anymore as of 22.06.2017
-            caption_tracks = args['caption_tracks']
-            caption_translation_languages = args['caption_translation_languages']
-            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
-            sub_lang_list = []
-            for lang in caption_translation_languages.split(','):
-                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
-                sub_lang = lang_qs.get('lc', [None])[0]
-                if sub_lang:
-                    sub_lang_list.append(sub_lang)
-            return make_captions(caption_url, sub_lang_list)
-        # An extractor error can be raise by the download process if there are
-        # no automatic captions but there are subtitles
-        except (KeyError, IndexError, ExtractorError):
-            self._downloader.report_warning(err_msg)
-            return {}
-
-    def _mark_watched(self, video_id, video_info, player_response):
+    def _mark_watched(self, video_id, player_response):
         playback_url = url_or_none(try_get(
             player_response,
-            lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
-            video_info, lambda x: x['videostats_playback_base_url'][0]))
+            lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
         if not playback_url:
             return
         parsed_playback_url = compat_urlparse.urlparse(playback_url)
@@ -1531,12 +1328,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             video_id = mobj.group(2)
         return video_id
 
-    def _extract_chapters_from_json(self, webpage, video_id, duration):
-        if not webpage:
-            return
-        data = self._extract_yt_initial_data(video_id, webpage)
-        if not data or not isinstance(data, dict):
-            return
+    def _extract_chapters_from_json(self, data, video_id, duration):
         chapters_list = try_get(
             data,
             lambda x: x['playerOverlays']
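_extract_chapters_from_json now receives the already-parsed ytInitialData dict instead of re-extracting it from the webpage. The chapter markers sit deep inside playerOverlays; a hedged sketch of the walk the extractor performs (the renderer path and the chapterRenderer field names are assumptions based on this hunk, not a guaranteed API):

    def extract_chapters(data):
        # data is the parsed ytInitialData dict the new signature takes
        path = ('playerOverlays', 'playerOverlayRenderer',
                'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer',
                'playerBar', 'chapteredPlayerBarRenderer', 'chapters')
        node = data
        for key in path:
            node = node.get(key) if isinstance(node, dict) else None
            if node is None:
                return None
        chapters = []
        for chapter in node:
            renderer = chapter.get('chapterRenderer') or {}
            start = renderer.get('timeRangeStartMillis')
            title = (renderer.get('title') or {}).get('simpleText')
            if start is None:
                continue
            chapters.append({'start_time': start / 1000.0, 'title': title})
        return chapters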
@@ -1576,244 +1368,75 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 })
         return chapters
 
-    @staticmethod
-    def _extract_chapters_from_description(description, duration):
-        if not description:
-            return None
-        chapter_lines = re.findall(
-            r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
-            description)
-        if not chapter_lines:
-            return None
-        chapters = []
-        for next_num, (chapter_line, time_point) in enumerate(
-                chapter_lines, start=1):
-            start_time = parse_duration(time_point)
-            if start_time is None:
-                continue
-            if start_time > duration:
-                break
-            end_time = (duration if next_num == len(chapter_lines)
-                        else parse_duration(chapter_lines[next_num][1]))
-            if end_time is None:
-                continue
-            if end_time > duration:
-                end_time = duration
-            if start_time > end_time:
-                break
-            chapter_title = re.sub(
-                r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
-            chapter_title = re.sub(r'\s+', ' ', chapter_title)
-            chapters.append({
-                'start_time': start_time,
-                'end_time': end_time,
-                'title': chapter_title,
-            })
-        return chapters
-
-    def _extract_chapters(self, webpage, description, video_id, duration):
-        return (self._extract_chapters_from_json(webpage, video_id, duration)
-                or self._extract_chapters_from_description(description, duration))
+    def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
+        return self._parse_json(self._search_regex(
+            (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
+             regex), webpage, name, default='{}'), video_id, fatal=False)
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
+        video_id = self._match_id(url)
+        base_url = self.http_scheme() + '//www.youtube.com/'
+        webpage_url = base_url + 'watch?v=' + video_id
+        webpage = self._download_webpage(webpage_url, video_id, fatal=False)
 
-        proto = (
-            'http' if self._downloader.params.get('prefer_insecure', False)
-            else 'https')
+        player_response = None
+        if webpage:
+            player_response = self._extract_yt_initial_variable(
+                webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
+                video_id, 'initial player response')
+        if not player_response:
+            player_response = self._call_api(
+                'player', {'videoId': video_id}, video_id)
 
-        start_time = None
-        end_time = None
-        parsed_url = compat_urllib_parse_urlparse(url)
-        for component in [parsed_url.fragment, parsed_url.query]:
-            query = compat_parse_qs(component)
-            if start_time is None and 't' in query:
-                start_time = parse_duration(query['t'][0])
-            if start_time is None and 'start' in query:
-                start_time = parse_duration(query['start'][0])
-            if end_time is None and 'end' in query:
-                end_time = parse_duration(query['end'][0])
+        playability_status = player_response.get('playabilityStatus') or {}
+        if playability_status.get('reason') == 'Sign in to confirm your age':
+            pr = self._parse_json(try_get(compat_parse_qs(
+                self._download_webpage(
+                    base_url + 'get_video_info', video_id,
+                    'Refetching age-gated info webpage',
+                    'unable to download video info webpage', query={
+                        'video_id': video_id,
+                        'eurl': 'https://www.youtube.com/embed/' + video_id,
+                    }, fatal=False)),
+                lambda x: x['player_response'][0],
+                compat_str) or '{}', video_id)
+            if pr:
+                player_response = pr
 
-        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
-        mobj = re.search(self._NEXT_URL_RE, url)
-        if mobj:
-            url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
-            video_id = self.extract_id(url)
+        trailer_video_id = try_get(
+            playability_status,
+            lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
+            compat_str)
+        if trailer_video_id:
+            return self.url_result(
+                trailer_video_id, self.ie_key(), trailer_video_id)
 
-        # Get video webpage
-        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
-        video_webpage, urlh = self._download_webpage_handle(url, video_id)
-
-        qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
-        video_id =
video_id = qs.get('v', [None])[0] or video_id - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - dash_mpds = [] - - def add_dash_mpd(video_info): - dash_mpd = video_info.get('dashmpd') - if dash_mpd and dash_mpd[0] not in dash_mpds: - dash_mpds.append(dash_mpd[0]) - - def add_dash_mpd_pr(pl_response): - dash_mpd = url_or_none(try_get( - pl_response, lambda x: x['streamingData']['dashManifestUrl'], - compat_str)) - if dash_mpd and dash_mpd not in dash_mpds: - dash_mpds.append(dash_mpd) - - is_live = None - view_count = None - - def extract_view_count(v_info): - return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - - def extract_player_response(player_response, video_id): - pl_response = str_or_none(player_response) - if not pl_response: + def get_text(x): + if not x: return - pl_response = self._parse_json(pl_response, video_id, fatal=False) - if isinstance(pl_response, dict): - add_dash_mpd_pr(pl_response) - return pl_response + return x.get('simpleText') or ''.join([r['text'] for r in x['runs']]) - player_response = {} - - # Get video info - video_info = {} - embed_webpage = None - ytplayer_config = None - - if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None: - age_gate = True - # We simulate the access to the video from www.youtube.com/v/{video_id} - # this can be viewed without login into Youtube - url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse_urlencode({ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), - }) - video_info_url = proto + '://www.youtube.com/get_video_info?' + data - try: - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - except ExtractorError: - video_info_webpage = None - if video_info_webpage: - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) - else: - age_gate = False - # Try looking directly into the video webpage - ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. 
- # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) - if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - add_dash_mpd_pr(player_response) - - if not video_info and not player_response: - player_response = extract_player_response( - self._search_regex( - (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE), - self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage, - 'initial player response', default='{}'), - video_id) - - def extract_unavailable_message(): - messages = [] - for tag, kind in (('h1', 'message'), ('div', 'submessage')): - msg = self._html_search_regex( - r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)'.format(tag=tag, kind=kind), - video_webpage, 'unavailable %s' % kind, default=None) - if msg: - messages.append(msg) - if messages: - return '\n'.join(messages) - - if not video_info and not player_response: - unavailable_message = extract_unavailable_message() - if not unavailable_message: - unavailable_message = 'Unable to extract video data' - raise ExtractorError( - 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - - if not isinstance(video_info, dict): - video_info = {} - - video_details = try_get( - player_response, lambda x: x['videoDetails'], dict) or {} + search_meta = ( + lambda x: self._html_search_meta(x, webpage, default=None)) \ + if webpage else lambda x: None + video_details = player_response.get('videoDetails') or {} microformat = try_get( - player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} - - video_title = video_info.get('title', [None])[0] or video_details.get('title') - if not video_title: - self._downloader.report_warning('Unable to extract video title') - video_title = '_' - - description_original = video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - - def replace_url(m): - redir_url = compat_urlparse.urljoin(url, m.group(1)) - parsed_redir_url = compat_urllib_parse_urlparse(redir_url) - if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': - qs = compat_parse_qs(parsed_redir_url.query) - q = qs.get('q') - if q and q[0]: - return q[0] - return redir_url - - description_original = video_description = re.sub(r'''(?x) - ]*> - [^<]+\.{3}\s* - - ''', replace_url, video_description) - video_description = clean_html(video_description) - else: - video_description = video_details.get('shortDescription') - if video_description is None: - video_description = self._html_search_meta('description', video_webpage) + player_response, + lambda x: x['microformat']['playerMicroformatRenderer'], + dict) or {} + video_title = video_details.get('title') \ + or get_text(microformat.get('title')) \ + or search_meta(['og:title', 'twitter:title', 'title']) + video_description = video_details.get('shortDescription') if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): multifeed_metadata_list = try_get( player_response, lambda x: 
x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], - compat_str) or try_get( - video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) + compat_str) if multifeed_metadata_list: entries = [] feed_ids = [] @@ -1821,10 +1444,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Unquote should take place before split on comma (,) since textual # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + feed_data = compat_parse_qs( + compat_urllib_parse_unquote_plus(feed)) def feed_entry(name): - return try_get(feed_data, lambda x: x[name][0], compat_str) + return try_get( + feed_data, lambda x: x[name][0], compat_str) feed_id = feed_entry('id') if not feed_id: @@ -1837,7 +1462,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + base_url + 'watch?v=' + feed_data['id'][0], {'force_singlefeed': True}), 'title': title, }) @@ -1845,631 +1470,404 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) + return self.playlist_result( + entries, video_id, video_title, video_description) else: self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if view_count is None: - view_count = extract_view_count(video_info) - if view_count is None and video_details: - view_count = int_or_none(video_details.get('viewCount')) - if view_count is None and microformat: - view_count = int_or_none(microformat.get('viewCount')) + formats = [] + itags = [] + itag_qualities = {} + player_url = None + q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) + streaming_data = player_response.get('streamingData') or {} + streaming_formats = streaming_data.get('formats') or [] + streaming_formats.extend(streaming_data.get('adaptiveFormats') or []) + for fmt in streaming_formats: + if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): + continue - if is_live is None: - is_live = bool_or_none(video_details.get('isLive')) + itag = str_or_none(fmt.get('itag')) + quality = fmt.get('quality') + if itag and quality: + itag_qualities[itag] = quality + # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment + # (adding `&sq=0` to the URL) and parsing emsg box to determine the + # number of fragment that would subsequently requested with (`&sq=N`) + if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF': + continue - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported. 
See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - - def _extract_filesize(media_url): - return int_or_none(self._search_regex( - r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) - - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or [] - streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or []) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - formats = [{ - 'format_id': '_rtmp', - 'protocol': 'rtmp', - 'url': video_info['conn'][0], - 'player_url': player_url, - }] - elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] - if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) - formats = [] - formats_spec = {} - fmt_list = video_info.get('fmt_list', [''])[0] - if fmt_list: - for fmt in fmt_list.split(','): - spec = fmt.split('/') - if len(spec) > 1: - width_height = spec[1].split('x') - if len(width_height) == 2: - formats_spec[spec[0]] = { - 'resolution': spec[1], - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - } - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: + fmt_url = fmt.get('url') + if not fmt_url: + sc = compat_parse_qs(fmt.get('signatureCipher')) + fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) + encrypted_sig = try_get(sc, lambda x: x['s'][0]) + if not (sc and fmt_url and encrypted_sig): continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - - for fmt in streaming_formats: - if fmt.get('drmFamilies') or fmt.get('drm_families'): - continue - url = url_or_none(fmt.get('url')) - - if not url: - cipher = fmt.get('cipher') or fmt.get('signatureCipher') - if not cipher: + if not player_url: + if not webpage: continue - url_data = compat_parse_qs(cipher) - url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) - if not url: + player_url = self._search_regex( + r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', + webpage, 'player URL', fatal=False) + if not player_url: + continue + signature = self._decrypt_signature(sc['s'][0], video_id, player_url) + sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' + fmt_url += '&' + sp + '=' + signature + + if itag: + itags.append(itag) + dct = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_id': itag, + 'format_note': fmt.get('qualityLabel') or quality, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + 'tbr': float_or_none(fmt.get( + 'averageBitrate') or fmt.get('bitrate'), 1000), + 'url': 
fmt_url, + 'width': fmt.get('width'), + } + mimetype = fmt.get('mimeType') + if mimetype: + mobj = re.match( + r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype) + if mobj: + dct['ext'] = mimetype2ext(mobj.group(1)) + dct.update(parse_codecs(mobj.group(2))) + if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } + formats.append(dct) + + hls_manifest_url = streaming_data.get('hlsManifestUrl') + if hls_manifest_url: + for f in self._extract_m3u8_formats( + hls_manifest_url, video_id, 'mp4', fatal=False): + itag = self._search_regex( + r'/itag/(\d+)', f['url'], 'itag', default=None) + if itag: + f['format_id'] = itag + formats.append(f) + + if self._downloader.params.get('youtube_include_dash_manifest'): + dash_manifest_url = streaming_data.get('dashManifestUrl') + if dash_manifest_url: + for f in self._extract_mpd_formats( + dash_manifest_url, video_id, fatal=False): + itag = f['format_id'] + if itag in itags: continue - else: - cipher = None - url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + if itag in itag_qualities: + f['quality'] = q(itag_qualities[itag]) + filesize = int_or_none(self._search_regex( + r'/clen/(\d+)', f.get('fragment_base_url') + or f['url'], 'file size', default=None)) + if filesize: + f['filesize'] = filesize + formats.append(f) - stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) - # Unsupported FORMAT_STREAM_TYPE_OTF - if stream_type == 3: - continue + if not formats: + if streaming_data.get('licenseInfos'): + raise ExtractorError( + 'This video is DRM protected.', expected=True) + pemr = try_get( + playability_status, + lambda x: x['errorScreen']['playerErrorMessageRenderer'], + dict) or {} + reason = get_text(pemr.get('reason')) or playability_status.get('reason') + subreason = pemr.get('subreason') + if subreason: + subreason = clean_html(get_text(subreason)) + if subreason == 'The uploader has not made this video available in your country.': + countries = microformat.get('availableCountries') + if not countries: + regions_allowed = search_meta('regionsAllowed') + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + subreason, countries) + reason += '\n' + subreason + if reason: + raise ExtractorError(reason, expected=True) - format_id = fmt.get('itag') or url_data['itag'][0] - if not format_id: - continue - format_id = compat_str(format_id) + self._sort_formats(formats) - if cipher: - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = ( - r']+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base', - r'"jsUrl"\s*:\s*("[^"]+")', - r'"assets":.+?"js":\s*("[^"]+")') - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in 
url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): - if player_url is None: - player_desc = 'unknown' - else: - player_type, player_version = self._extract_player_info(player_url) - player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) - if 'ratebypass' not in url: - url += '&ratebypass=yes' - - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, - } - if format_id in self._formats: - dct.update(self._formats[format_id]) - if format_id in formats_spec: - dct.update(formats_spec[format_id]) - - # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). - # Trying to extract metadata from url_encoded_fmt_stream_map entry. - mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) - width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - - if width is None: - width = int_or_none(fmt.get('width')) - if height is None: - height = int_or_none(fmt.get('height')) - - filesize = int_or_none(url_data.get( - 'clen', [None])[0]) or _extract_filesize(url) - - quality = url_data.get('quality', [None])[0] or fmt.get('quality') - quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') - - tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) - or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None - fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) - - more_fields = { - 'filesize': filesize, - 'tbr': tbr, - 'width': width, - 'height': height, - 'fps': fps, - 'format_note': quality_label or quality, - } - for key, value in more_fields.items(): - if value: - dct[key] = value - type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') - if type_: - type_split = type_.split(';') - kind_ext = type_split[0].split('/') - if len(kind_ext) == 2: - kind, _ = kind_ext - dct['ext'] = mimetype2ext(type_split[0]) - if kind in ('audio', 'video'): - codecs = None - for mobj in re.finditer( - r'(?P[a-zA-Z_-]+)=(?P["\']?)(?P.+?)(?P=quote)(?:;|$)', type_): - if mobj.group('key') == 'codecs': - codecs = mobj.group('val') - break - if codecs: - dct.update(parse_codecs(codecs)) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - formats.append(dct) - else: - manifest_url = ( - url_or_none(try_get( - player_response, - lambda x: x['streamingData']['hlsManifestUrl'], - compat_str)) - or url_or_none(try_get( - video_info, lambda x: x['hlsvp'][0], compat_str))) - if manifest_url: - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - 
a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) - else: - error_message = extract_unavailable_message() - if not error_message: - reason_list = try_get( - player_response, - lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'], - list) or [] - for reason in reason_list: - if not isinstance(reason, dict): - continue - reason_text = try_get(reason, lambda x: x['text'], compat_str) - if reason_text: - if not error_message: - error_message = '' - error_message += reason_text - if error_message: - error_message = clean_html(error_message) - if not error_message: - error_message = clean_html(try_get( - player_response, lambda x: x['playabilityStatus']['reason'], - compat_str)) - if not error_message: - error_message = clean_html( - try_get(video_info, lambda x: x['reason'][0], compat_str)) - if error_message: - raise ExtractorError(error_message, expected=True) - raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') - - # uploader - video_uploader = try_get( - video_info, lambda x: x['author'][0], - compat_str) or str_or_none(video_details.get('author')) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) - if owner_profile_url: - video_uploader_id = self._search_regex( - r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id', - default=None) - video_uploader_url = owner_profile_url - - channel_id = ( - str_or_none(video_details.get('channelId')) - or self._html_search_meta( - 'channelId', video_webpage, 'channel id', default=None) - or self._search_regex( - r'data-channel-external-id=(["\'])(?P(?:(?!\1).)+)\1', - video_webpage, 'channel id', default=None, group='id')) - channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + keywords = video_details.get('keywords') or [] + if not keywords and webpage: + keywords = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] + for keyword in keywords: + if keyword.startswith('yt:stretch='): + w, h = keyword.split('=')[1].split(':') + w, h = int(w), int(h) + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio thumbnails = [] - thumbnails_list = try_get( - video_details, lambda x: x['thumbnail']['thumbnails'], list) or [] - for t in thumbnails_list: - if not isinstance(t, dict): - continue - thumbnail_url = url_or_none(t.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(t.get('width')), - 'height': int_or_none(t.get('height')), - }) - - if not thumbnails: - video_thumbnail = None - # We try first to get a high quality image: - m_thumb = re.search(r'', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str) - if 
         thumbnails = []
-        thumbnails_list = try_get(
-            video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
-        for t in thumbnails_list:
-            if not isinstance(t, dict):
-                continue
-            thumbnail_url = url_or_none(t.get('url'))
-            if not thumbnail_url:
-                continue
-            thumbnails.append({
-                'url': thumbnail_url,
-                'width': int_or_none(t.get('width')),
-                'height': int_or_none(t.get('height')),
-            })
-
-        if not thumbnails:
-            video_thumbnail = None
-            # We try first to get a high quality image:
-            m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
-                                video_webpage, re.DOTALL)
-            if m_thumb is not None:
-                video_thumbnail = m_thumb.group(1)
-            thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
-            if thumbnail_url:
-                video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
-            if video_thumbnail:
-                thumbnails.append({'url': video_thumbnail})
-
-        # upload date
-        upload_date = self._html_search_meta(
-            'datePublished', video_webpage, 'upload date', default=None)
-        if not upload_date:
-            upload_date = self._search_regex(
-                [r'(?s)id="eow-date.*?>(.*?)</span>',
-                 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
-                video_webpage, 'upload date', default=None)
-        if not upload_date:
-            upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
-        upload_date = unified_strdate(upload_date)
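All of the removed upload-date fallbacks above funnel through `unified_strdate`, which normalizes the various date spellings into `YYYYMMDD`. A quick illustration (the inputs are made up; behaviour as implemented in youtube_dl/utils.py):

    from youtube_dl.utils import unified_strdate

    # unified_strdate copes with several spellings and returns YYYYMMDD or None
    print(unified_strdate('2021-02-04'))   # 20210204
    print(unified_strdate('Feb 4, 2021'))  # 20210204
    print(unified_strdate(None))           # None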
-        video_license = self._html_search_regex(
-            r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
-            video_webpage, 'license', default=None)
-
-        m_music = re.search(
-            r'''(?x)
-                <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
-                <ul[^>]*>\s*
-                <li>(?P<title>.+?)
-                by (?P<creator>.+?)
-                (?:
-                    \(.+?\)|
-                    <a[^>]*
-                        (?:
-                            \bhref=["\']/red[^>]*>|             # drop possible
-                            >\s*Listen ad-free with YouTube Red # YouTube Red ad
-                        )
-                    .*?
-                )?</li
-            ''',
-            video_webpage)
-        if m_music:
-            video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
-            video_creator = clean_html(m_music.group('creator'))
+        for container in (video_details, microformat):
+            for thumbnail in (try_get(
+                    container,
+                    lambda x: x['thumbnail']['thumbnails'], list) or []):
+                thumbnail_url = thumbnail.get('url')
+                if not thumbnail_url:
+                    continue
+                thumbnails.append({
+                    'height': int_or_none(thumbnail.get('height')),
+                    'url': thumbnail_url,
+                    'width': int_or_none(thumbnail.get('width')),
+                })
+            if thumbnails:
+                break
         else:
-            video_alt_title = video_creator = None
+            thumbnail = search_meta(['og:image', 'twitter:image'])
+            if thumbnail:
+                thumbnails = [{'url': thumbnail}]

-        def extract_meta(field):
-            return self._html_search_regex(
-                r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
-                video_webpage, field, default=None)
+        category = microformat.get('category') or search_meta('genre')
+        channel_id = video_details.get('channelId') \
+            or microformat.get('externalChannelId') \
+            or search_meta('channelId')
+        duration = int_or_none(
+            video_details.get('lengthSeconds')
+            or microformat.get('lengthSeconds')) \
+            or parse_duration(search_meta('duration'))
+        is_live = video_details.get('isLive')
+        owner_profile_url = microformat.get('ownerProfileUrl')

-        track = extract_meta('Song')
-        artist = extract_meta('Artist')
-        album = extract_meta('Album')
+        info = {
+            'id': video_id,
+            'title': self._live_title(video_title) if is_live else video_title,
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'description': video_description,
+            'upload_date': unified_strdate(
+                microformat.get('uploadDate')
+                or search_meta('uploadDate')),
+            'uploader': video_details['author'],
+            'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
+            'uploader_url': owner_profile_url,
+            'channel_id': channel_id,
+            'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
+            'duration': duration,
+            'view_count': int_or_none(
+                video_details.get('viewCount')
+                or microformat.get('viewCount')
+                or search_meta('interactionCount')),
+            'average_rating': float_or_none(video_details.get('averageRating')),
+            'age_limit': 18 if (
+                microformat.get('isFamilySafe') is False
+                or search_meta('isFamilyFriendly') == 'false'
+                or search_meta('og:restrictions:age') == '18+') else 0,
+            'webpage_url': webpage_url,
+            'categories': [category] if category else None,
+            'tags': keywords,
+            'is_live': is_live,
+        }
+
+        pctr = try_get(
+            player_response,
+            lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
+        if pctr:
+            def process_language(container, base_url, lang_code, query):
+                lang_subs = []
+                for fmt in self._SUBTITLE_FORMATS:
+                    query.update({
+                        'fmt': fmt,
+                    })
+                    lang_subs.append({
+                        'ext': fmt,
+                        'url': update_url_query(base_url, query),
+                    })
+                container[lang_code] = lang_subs
+
+            subtitles = {}
+            for caption_track in (pctr.get('captionTracks') or []):
+                base_url = caption_track.get('baseUrl')
+                if not base_url:
+                    continue
+                if caption_track.get('kind') != 'asr':
+                    lang_code = caption_track.get('languageCode')
+                    if not lang_code:
+                        continue
+                    process_language(
+                        subtitles, base_url, lang_code, {})
+                    continue
+                automatic_captions = {}
+                for translation_language in (pctr.get('translationLanguages') or []):
+                    translation_language_code = translation_language.get('languageCode')
+                    if not translation_language_code:
+                        continue
+                    process_language(
+                        automatic_captions, base_url, translation_language_code,
+                        {'tlang': translation_language_code})
+                info['automatic_captions'] = automatic_captions
+            info['subtitles'] = subtitles
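To make the new caption flow concrete, here is a minimal sketch of what `process_language` produces for one track (`SUBTITLE_FORMATS` stands in for the extractor's `_SUBTITLE_FORMATS`; the timedtext URL is invented):

    from youtube_dl.utils import update_url_query

    # stand-in for YoutubeIE._SUBTITLE_FORMATS
    SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')

    def process_language(container, base_url, lang_code, query):
        # one entry per serialization, all pointing at the same timedtext
        # endpoint with a different fmt= parameter
        lang_subs = []
        for fmt in SUBTITLE_FORMATS:
            query.update({'fmt': fmt})
            lang_subs.append({
                'ext': fmt,
                'url': update_url_query(base_url, query),
            })
        container[lang_code] = lang_subs

    subtitles = {}
    process_language(
        subtitles, 'https://www.youtube.com/api/timedtext?v=example', 'en', {})
    # subtitles['en'] now lists five URLs, ending in &fmt=srv1 ... &fmt=vtt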
+
+        parsed_url = compat_urllib_parse_urlparse(url)
+        for component in [parsed_url.fragment, parsed_url.query]:
+            query = compat_parse_qs(component)
+            for k, v in query.items():
+                for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
+                    d_k += '_time'
+                    if d_k not in info and k in s_ks:
+                        info[d_k] = parse_duration(query[k][0])
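The loop just above lets `#t=1m30s`-style fragments and `?start=`/`?end=` parameters seed `start_time`/`end_time`. A rough standalone equivalent (the URL is an example):

    from youtube_dl.compat import compat_parse_qs, compat_urllib_parse_urlparse
    from youtube_dl.utils import parse_duration

    url = 'https://www.youtube.com/watch?v=BaW_jenozKc#t=1m30s'  # example URL
    info = {}
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        for k in query:
            for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
                d_k += '_time'
                if d_k not in info and k in s_ks:
                    info[d_k] = parse_duration(query[k][0])
    print(info)  # {'start_time': 90.0}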
-        # Youtube Music Auto-generated description
-        release_date = release_year = None
         if video_description:
             mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
             if mobj:
-                if not track:
-                    track = mobj.group('track').strip()
-                if not artist:
-                    artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
-                if not album:
-                    album = mobj.group('album'.strip())
                 release_year = mobj.group('release_year')
                 release_date = mobj.group('release_date')
                 if release_date:
                     release_date = release_date.replace('-', '')
                     if not release_year:
-                        release_year = int(release_date[:4])
-                if release_year:
-                    release_year = int(release_year)
+                        release_year = release_date[:4]
+                info.update({
+                    'album': mobj.group('album').strip(),
+                    'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
+                    'track': mobj.group('track').strip(),
+                    'release_date': release_date,
+                    'release_year': int_or_none(release_year),
+                })

-        yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
-        contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
-        for content in contents:
-            rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or []
-            multiple_songs = False
-            for row in rows:
-                if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
-                    multiple_songs = True
-                    break
-            for row in rows:
-                mrr = row.get('metadataRowRenderer') or {}
-                mrr_title = try_get(
-                    mrr, lambda x: x['title']['simpleText'], compat_str)
-                mrr_contents = try_get(
-                    mrr, lambda x: x['contents'][0], dict) or {}
-                mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str)
-                if not (mrr_title and mrr_contents_text):
-                    continue
-                if mrr_title == 'License':
-                    video_license = mrr_contents_text
-                elif not multiple_songs:
-                    if mrr_title == 'Album':
-                        album = mrr_contents_text
-                    elif mrr_title == 'Artist':
-                        artist = mrr_contents_text
-                    elif mrr_title == 'Song':
-                        track = mrr_contents_text
+        initial_data = None
+        if webpage:
+            initial_data = self._extract_yt_initial_variable(
+                webpage, self._YT_INITIAL_DATA_RE, video_id,
+                'yt initial data')
+        if not initial_data:
+            initial_data = self._call_api(
+                'next', {'videoId': video_id}, video_id, fatal=False)
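The description regex kept as context above recognizes Youtube Music's auto-generated descriptions. A worked example against a fabricated description:

    import re
    from youtube_dl.utils import int_or_none

    description = (  # fabricated Youtube Music auto-generated description
        'Example Track · Example Artist\n\n'
        'Example Album\n\n'
        '℗ 2020 Example Records\n\n'
        'Released on: 2020-05-01\n\n'
        'Auto-generated by YouTube.')

    mobj = re.search(
        r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)'
        r'(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?'
        r'(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?'
        r'(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?'
        r'.+\nAuto-generated by YouTube\.\s*$', description)
    if mobj:
        print(mobj.group('track').strip())              # Example Track
        print(mobj.group('release_date'))               # 2020-05-01
        print(int_or_none(mobj.group('release_year')))  # 2020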
-        m_episode = re.search(
-            r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
-            video_webpage)
-        if m_episode:
-            series = unescapeHTML(m_episode.group('series'))
-            season_number = int(m_episode.group('season'))
-            episode_number = int(m_episode.group('episode'))
-        else:
-            series = season_number = episode_number = None
+        if initial_data:
+            chapters = self._extract_chapters_from_json(
+                initial_data, video_id, duration)
+            if not chapters:
+                for engagement_panel in (initial_data.get('engagementPanels') or []):
+                    contents = try_get(
+                        engagement_panel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
+                        list)
+                    if not contents:
+                        continue

-        m_cat_container = self._search_regex(
-            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
-            video_webpage, 'categories', default=None)
-        category = None
-        if m_cat_container:
-            category = self._html_search_regex(
-                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
-                default=None)
-        if not category:
-            category = try_get(
-                microformat, lambda x: x['category'], compat_str)
-        video_categories = None if category is None else [category]
+                    def chapter_time(mmlir):
+                        return parse_duration(
+                            get_text(mmlir.get('timeDescription')))

-        video_tags = [
-            unescapeHTML(m.group('content'))
-            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
-        if not video_tags:
-            video_tags = try_get(video_details, lambda x: x['keywords'], list)
+                    chapters = []
+                    for next_num, content in enumerate(contents, start=1):
+                        mmlir = content.get('macroMarkersListItemRenderer') or {}
+                        start_time = chapter_time(mmlir)
+                        end_time = chapter_time(try_get(
+                            contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \
+                            if next_num < len(contents) else duration
+                        if start_time is None or end_time is None:
+                            continue
+                        chapters.append({
+                            'start_time': start_time,
+                            'end_time': end_time,
+                            'title': get_text(mmlir.get('title')),
+                        })
+                    if chapters:
+                        break
+            if chapters:
+                info['chapters'] = chapters
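The engagement-panel chapter extraction above reduces to the following sketch (the renderer payloads are fabricated; `get_text` is a simplified stand-in for the extractor's local helper):

    from youtube_dl.utils import parse_duration

    def get_text(x):
        # simplified stand-in for the extractor's get_text() helper
        if not x:
            return None
        return x.get('simpleText') or ''.join(r['text'] for r in x.get('runs', []))

    def chapter_time(mmlir):
        return parse_duration(get_text(mmlir.get('timeDescription')))

    duration = 300  # total video duration seeds the last chapter's end_time
    contents = [  # fabricated macroMarkersListItemRenderer payloads
        {'macroMarkersListItemRenderer': {
            'title': {'simpleText': 'Intro'},
            'timeDescription': {'simpleText': '0:00'}}},
        {'macroMarkersListItemRenderer': {
            'title': {'simpleText': 'Main part'},
            'timeDescription': {'simpleText': '1:30'}}},
    ]
    chapters = []
    for next_num, content in enumerate(contents, start=1):
        mmlir = content.get('macroMarkersListItemRenderer') or {}
        start_time = chapter_time(mmlir)
        end_time = (chapter_time(contents[next_num]['macroMarkersListItemRenderer'])
                    if next_num < len(contents) else duration)
        if start_time is not None and end_time is not None:
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': get_text(mmlir.get('title')),
            })
    # chapters: Intro 0.0-90.0, Main part 90.0-300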
-        def _extract_count(count_name):
-            return str_to_int(self._search_regex(
-                (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
-                 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
-                video_webpage, count_name, default=None))
+        contents = try_get(
+            initial_data,
+            lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
+            list) or []
+        for content in contents:
+            vpir = content.get('videoPrimaryInfoRenderer')
+            if vpir:
+                stl = vpir.get('superTitleLink')
+                if stl:
+                    stl = get_text(stl)
+                    if try_get(
+                            vpir,
+                            lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
+                        info['location'] = stl
+                    else:
+                        mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
+                        if mobj:
+                            info.update({
+                                'series': mobj.group(1),
+                                'season_number': int(mobj.group(2)),
+                                'episode_number': int(mobj.group(3)),
+                            })
+                for tlb in (try_get(
+                        vpir,
+                        lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
+                        list) or []):
+                    tbr = tlb.get('toggleButtonRenderer') or {}
+                    for getter, regex in [(
+                            lambda x: x['defaultText']['accessibility']['accessibilityData'],
+                            r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
+                                lambda x: x['accessibility'],
+                                lambda x: x['accessibilityData']['accessibilityData'],
+                            ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
+                        label = (try_get(tbr, getter, dict) or {}).get('label')
+                        if label:
+                            mobj = re.match(regex, label)
+                            if mobj:
+                                info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
+                                break
+                sbr_tooltip = try_get(
+                    vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
+                if sbr_tooltip:
+                    like_count, dislike_count = sbr_tooltip.split(' / ')
+                    info.update({
+                        'like_count': str_to_int(like_count),
+                        'dislike_count': str_to_int(dislike_count),
+                    })
+            vsir = content.get('videoSecondaryInfoRenderer')
+            if vsir:
+                info['channel'] = get_text(try_get(
+                    vsir,
+                    lambda x: x['owner']['videoOwnerRenderer']['title'],
+                    compat_str))
+                rows = try_get(
+                    vsir,
+                    lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
+                    list) or []
+                multiple_songs = False
+                for row in rows:
+                    if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
+                        multiple_songs = True
+                        break
+                for row in rows:
+                    mrr = row.get('metadataRowRenderer') or {}
+                    mrr_title = mrr.get('title')
+                    if not mrr_title:
+                        continue
+                    mrr_title = get_text(mrr['title'])
+                    mrr_contents_text = get_text(mrr['contents'][0])
+                    if mrr_title == 'License':
+                        info['license'] = mrr_contents_text
+                    elif not multiple_songs:
+                        if mrr_title == 'Album':
+                            info['album'] = mrr_contents_text
+                        elif mrr_title == 'Artist':
+                            info['artist'] = mrr_contents_text
+                        elif mrr_title == 'Song':
+                            info['track'] = mrr_contents_text

-        like_count = _extract_count('like')
-        dislike_count = _extract_count('dislike')
+        for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
+            v = info.get(s_k)
+            if v:
+                info[d_k] = v

-        if view_count is None:
-            view_count = str_to_int(self._search_regex(
-                r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
-                'view count', default=None))
+        self.mark_watched(video_id, player_response)

-        average_rating = (
-            float_or_none(video_details.get('averageRating'))
-            or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
-
-        # subtitles
-        video_subtitles = self.extract_subtitles(video_id, video_webpage)
-        automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
-
-        video_duration = try_get(
-            video_info, lambda x: int_or_none(x['length_seconds'][0]))
-        if not video_duration:
-            video_duration = int_or_none(video_details.get('lengthSeconds'))
-        if not video_duration:
-            video_duration = parse_duration(self._html_search_meta(
-                'duration', video_webpage, 'video duration'))
-
-        # annotations
-        video_annotations = None
-        if self._downloader.params.get('writeannotations', False):
-            xsrf_token = None
-            ytcfg = self._extract_ytcfg(video_id, video_webpage)
-            if ytcfg:
-                xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
-            if not xsrf_token:
-                xsrf_token = self._search_regex(
-                    r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
-                    video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
-            invideo_url = try_get(
-                player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
-            if xsrf_token and invideo_url:
-                xsrf_field_name = None
-                if ytcfg:
-                    xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
-                if not xsrf_field_name:
-                    xsrf_field_name = self._search_regex(
-                        r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
-                        video_webpage, 'xsrf field name',
-                        group='xsrf_field_name', default='session_token')
-                video_annotations = self._download_webpage(
-                    self._proto_relative_url(invideo_url),
-                    video_id, note='Downloading annotations',
-                    errnote='Unable to download video annotations', fatal=False,
-                    data=urlencode_postdata({xsrf_field_name: xsrf_token}))
-
-        chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
-
-        # Look for the DASH manifest
-        if self._downloader.params.get('youtube_include_dash_manifest', True):
-            dash_mpd_fatal = True
-            for mpd_url in dash_mpds:
-                dash_formats = {}
-                try:
-                    def decrypt_sig(mobj):
-                        s = mobj.group(1)
-                        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
-                        return '/signature/%s' % dec_s
-
-                    mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
-
-                    for df in self._extract_mpd_formats(
-                            mpd_url, video_id, fatal=dash_mpd_fatal,
-                            formats_dict=self._formats):
-                        if not df.get('filesize'):
-                            df['filesize'] = _extract_filesize(df['url'])
-                        # Do not overwrite DASH format found in some previous DASH manifest
-                        if df['format_id'] not in dash_formats:
-                            dash_formats[df['format_id']] = df
-                    # Additional DASH manifests may end up in HTTP Error 403 therefore
-                    # allow them to fail without bug report message if we already have
-                    # some DASH manifest succeeded. This is temporary workaround to reduce
-                    # burst of bug reports until we figure out the reason and whether it
-                    # can be fixed at all.
-                    dash_mpd_fatal = False
-                except (ExtractorError, KeyError) as e:
-                    self.report_warning(
-                        'Skipping DASH manifest: %r' % e, video_id)
-                if dash_formats:
-                    # Remove the formats we found through non-DASH, they
-                    # contain less info and it can be wrong, because we use
-                    # fixed values (for example the resolution). See
-                    # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
-                    # example.
-                    formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
-                    formats.extend(dash_formats.values())
-
-        # Check for malformed aspect ratio
-        stretched_m = re.search(
-            r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
-            video_webpage)
-        if stretched_m:
-            w = float(stretched_m.group('w'))
-            h = float(stretched_m.group('h'))
-            # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
-            # We will only process correct ratios.
-            if w > 0 and h > 0:
-                ratio = w / h
-                for f in formats:
-                    if f.get('vcodec') != 'none':
-                        f['stretched_ratio'] = ratio
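The removed DASH branch above merged manifests by `format_id` and let DASH entries win over their non-DASH counterparts. Stripped of the extractor plumbing, the pattern was roughly this (dummy format dicts):

    # Sketch of the removed merge strategy, with fabricated formats.
    formats = [{'format_id': '22', 'height': 720, 'source': 'non-dash'}]

    dash_formats = {}
    for df in [{'format_id': '22', 'height': 720, 'source': 'dash'},
               {'format_id': '137', 'height': 1080, 'source': 'dash'}]:
        # do not overwrite a format found in an earlier DASH manifest
        dash_formats.setdefault(df['format_id'], df)

    # DASH entries replace their non-DASH counterparts, whose fixed values
    # (e.g. the resolution) could be wrong (see ytdl-org/youtube-dl#5774)
    formats = [f for f in formats if f['format_id'] not in dash_formats]
    formats.extend(dash_formats.values())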
-        if not formats:
-            if 'reason' in video_info:
-                if 'The uploader has not made this video available in your country.' in video_info['reason']:
-                    regions_allowed = self._html_search_meta(
-                        'regionsAllowed', video_webpage, default=None)
-                    countries = regions_allowed.split(',') if regions_allowed else None
-                    self.raise_geo_restricted(
-                        msg=video_info['reason'][0], countries=countries)
-                reason = video_info['reason'][0]
-                if 'Invalid parameters' in reason:
-                    unavailable_message = extract_unavailable_message()
-                    if unavailable_message:
-                        reason = unavailable_message
-                raise ExtractorError(
-                    'YouTube said: %s' % reason,
-                    expected=True, video_id=video_id)
-            if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
-                raise ExtractorError('This video is DRM protected.', expected=True)
-
-        self._sort_formats(formats)
-
-        self.mark_watched(video_id, video_info, player_response)
-
-        return {
-            'id': video_id,
-            'uploader': video_uploader,
-            'uploader_id': video_uploader_id,
-            'uploader_url': video_uploader_url,
-            'channel_id': channel_id,
-            'channel_url': channel_url,
-            'upload_date': upload_date,
-            'license': video_license,
-            'creator': video_creator or artist,
-            'title': video_title,
-            'alt_title': video_alt_title or track,
-            'thumbnails': thumbnails,
-            'description': video_description,
-            'categories': video_categories,
-            'tags': video_tags,
-            'subtitles': video_subtitles,
-            'automatic_captions': automatic_captions,
-            'duration': video_duration,
-            'age_limit': 18 if age_gate else 0,
-            'annotations': video_annotations,
-            'chapters': chapters,
-            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
-            'view_count': view_count,
-            'like_count': like_count,
-            'dislike_count': dislike_count,
-            'average_rating': average_rating,
-            'formats': formats,
-            'is_live': is_live,
-            'start_time': start_time,
-            'end_time': end_time,
-            'series': series,
-            'season_number': season_number,
-            'episode_number': episode_number,
-            'track': track,
-            'artist': artist,
-            'album': album,
-            'release_date': release_date,
-            'release_year': release_year,
-        }
+        return info
 
 
 class YoutubeTabIE(YoutubeBaseInfoExtractor):
diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py
index 5288f40d8..f20f953cb 100644
--- a/youtube_dl/extractor/zype.py
+++ b/youtube_dl/extractor/zype.py
@@ -87,11 +87,16 @@ class ZypeIE(InfoExtractor):
                 r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1',
                 body, 'm3u8 url', group='url', default=None)
             if not m3u8_url:
-                source = self._parse_json(self._search_regex(
-                    r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body,
-                    'source'), video_id, js_to_json)
-                if source.get('integration') == 'verizon-media':
-                    m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id']
+                source = self._search_regex(
+                    r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, 'source')
+
+                def get_attr(key):
+                    return self._search_regex(
+                        r'\b%s\s*:\s*([\'"])(?P<val>(?:(?!\1).)+)\1' % key,
+                        source, key, group='val')
+
+                if get_attr('integration') == 'verizon-media':
+                    m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id')
             formats = self._extract_m3u8_formats(
                 m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
             text_tracks = self._search_regex(
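The zype.py change above drops `_parse_json`/`js_to_json` in favour of per-key regex pulls, which tolerates source objects that are not quite valid JSON. A rough standalone equivalent (the `source` literal is invented):

    import re

    source = "{ integration: 'verizon-media', id: 'abc123' }"  # invented JS object

    def get_attr(key):
        # pull the quoted value for `key` out of the raw JS object source
        m = re.search(
            r'\b%s\s*:\s*([\'"])(?P<val>(?:(?!\1).)+)\1' % key, source)
        return m.group('val') if m else None

    if get_attr('integration') == 'verizon-media':
        m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id')
    # m3u8_url == 'https://content.uplynk.com/abc123.m3u8'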
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 3000ba41e..241cf110f 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -689,6 +689,10 @@ def parseOpts(overrideArguments=None):
         '-o', '--output',
         dest='outtmpl', metavar='TEMPLATE',
         help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info'))
+    filesystem.add_option(
+        '--output-na-placeholder',
+        dest='outtmpl_na_placeholder', metavar='PLACEHOLDER', default='NA',
+        help=('Placeholder value for unavailable meta fields in output filename template (default is "%default")'))
     filesystem.add_option(
         '--autonumber-size',
         dest='autonumber_size', metavar='NUMBER', type=int,
@@ -782,7 +786,7 @@ def parseOpts(overrideArguments=None):
     postproc.add_option(
         '-x', '--extract-audio',
         action='store_true', dest='extractaudio', default=False,
-        help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)')
+        help='Convert video files to audio-only files (requires ffmpeg/avconv and ffprobe/avprobe)')
     postproc.add_option(
         '--audio-format', metavar='FORMAT', dest='audioformat', default='best',
         help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x')
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 0d9659b2b..425f15589 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2021.01.08'
+__version__ = '2021.02.04.1'
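For illustration, the new `--output-na-placeholder` option added in the options.py hunk above can be exercised like this (the URL and template are examples):

    youtube-dl -o '%(title)s-%(upload_date)s.%(ext)s' --output-na-placeholder 'unknown' 'https://www.youtube.com/watch?v=BaW_jenozKc'

A video whose `upload_date` cannot be extracted is then saved with `unknown` in the filename instead of the previously hard-coded `NA`.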