diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 40a869113..5aff47e75 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.03.24** +- [ ] I've verified that I'm running youtube-dl version **2020.12.09** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.03.24 + [debug] youtube-dl version 2020.12.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 7b10df3d4..91c44d9b3 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.03.24** +- [ ] I've verified that I'm running youtube-dl version **2020.12.09** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 04bbcfa68..fec9aa55c 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.03.24** +- [ ] I've verified that I'm running youtube-dl version **2020.12.09** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index a9e231817..da77bdd7e 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.03.24** +- [ ] I've verified that I'm running youtube-dl version **2020.12.09** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.03.24 + [debug] youtube-dl version 2020.12.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 4a3d32d51..34dad8079 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.03.24** +- [ ] I've verified that I'm running youtube-dl version **2020.12.09** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/.travis.yml b/.travis.yml index 51afd469a..d828d027d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,29 +12,29 @@ python: dist: trusty env: - YTDL_TEST_SET=core - - YTDL_TEST_SET=download +# - YTDL_TEST_SET=download jobs: include: - python: 3.7 dist: xenial env: YTDL_TEST_SET=core - - python: 3.7 - dist: xenial - env: YTDL_TEST_SET=download +# - python: 3.7 +# dist: xenial +# env: YTDL_TEST_SET=download - python: 3.8 dist: xenial env: YTDL_TEST_SET=core - - python: 3.8 - dist: xenial - env: YTDL_TEST_SET=download +# - python: 3.8 +# dist: xenial +# env: YTDL_TEST_SET=download - python: 3.8-dev dist: xenial env: YTDL_TEST_SET=core - - python: 3.8-dev - dist: xenial - env: YTDL_TEST_SET=download +# - python: 3.8-dev +# dist: xenial +# env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - - env: JYTHON=true; YTDL_TEST_SET=download +# - env: JYTHON=true; YTDL_TEST_SET=download - name: flake8 python: 3.8 dist: xenial @@ -42,9 +42,9 @@ jobs: script: flake8 . fast_finish: true allow_failures: - - env: YTDL_TEST_SET=download +# - env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - - env: JYTHON=true; YTDL_TEST_SET=download +# - env: JYTHON=true; YTDL_TEST_SET=download before_install: - if [ "$JYTHON" == "true" ]; then ./devscripts/install_jython.sh; export PATH="$HOME/jython/bin:$PATH"; fi script: ./devscripts/run_tests.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ac759ddc4..58ab3a4b8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -153,7 +153,7 @@ After you have ensured this site is distributing its content legally, you can fo 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart): +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 youtube_dl/extractor/yourextractor.py diff --git a/ChangeLog b/ChangeLog index f753972c4..8a57965e0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,447 @@ +version 2020.12.09 + +Core +* [extractor/common] Fix inline HTML5 media tags processing (#27345) + +Extractors +* [youtube:tab] Improve identity token extraction (#27197) +* [youtube:tab] Make click tracking params on continuation optional +* [youtube:tab] Delegate inline playlists to tab-based playlists (27298) ++ [tubitv] Extract release year (#27317) +* [amcnetworks] Fix free content extraction (#20354) ++ [lbry:channel] Add support for channels (#25584) ++ [lbry] Add support for short and embed URLs +* [lbry] Fix channel metadata extraction ++ [telequebec] Add support for video.telequebec.tv (#27339) +* [telequebec] Fix extraction (#25733, #26883) ++ [youtube:tab] Capture and output alerts (#27340) +* [tvplay:home] Fix extraction (#21153) +* [americastestkitchen] Fix Extraction and add support + for Cook's Country and Cook's Illustrated (#17234, #27322) ++ [slideslive] Add support for yoda service videos and extract subtitles + (#27323) + + +version 2020.12.07 + +Core +* [extractor/common] Extract timestamp from Last-Modified header ++ [extractor/common] Add support for dl8-* media tags (#27283) +* [extractor/common] Fix media type extraction for HTML5 media tags + in start/end form + +Extractors +* [aenetworks] Fix extraction (#23363, #23390, #26795, #26985) + * Fix Fastly format extraction + + Add support for play and watch subdomains + + Extract series metadata +* [youtube] Improve youtu.be extraction in non-existing playlists (#27324) ++ [generic] Extract RSS video description, timestamp and itunes metadata + (#27177) +* [nrk] Reduce the number of instalments and episodes requests +* [nrk] Improve extraction + * Improve format extraction for old akamai formats + + Add is_live value to entry info dict + * Request instalments only when available + * Fix skole extraction ++ [peertube] Extract fps ++ [peertube] Recognize audio-only formats (#27295) + + +version 2020.12.05 + +Core +* [extractor/common] Improve Akamai HTTP format extraction + * Allow m3u8 manifest without an additional audio format + * Fix extraction for qualities starting with a number + +Extractors +* [teachable:course] Improve extraction (#24507, #27286) +* [nrk] Improve error extraction +* [nrktv:series] Improve extraction (#21926) +* [nrktv:season] Improve extraction +* [nrk] Improve format extraction and geo-restriction detection (#24221) +* [pornhub] Handle HTTP errors gracefully (#26414) +* [nrktv] Relax URL regular expression (#27299, #26185) ++ [zdf] Extract webm formats (#26659) ++ [gamespot] Extract DASH and HTTP formats ++ [tver] Add support for tver.jp (#26662, #27284) ++ [pornhub] Add support for pornhub.org (#27276) + + +version 2020.12.02 + +Extractors ++ [tva] Add support for qub.ca (#27235) ++ [toggle] Detect DRM protected videos (closes #16479)(closes #20805) ++ [toggle] Add support for new MeWatch URLs (#27256) +* [youtube:tab] Extract channels only from channels tab (#27266) ++ [cspan] Extract info from jwplayer data (#3672, #3734, #10638, #13030, + #18806, #23148, #24461, #26171, #26800, #27263) +* [cspan] Pass Referer header with format's video URL (#26032, #25729) +* [youtube] Improve age-gated videos extraction (#27259) ++ [mediaset] Add support for movie URLs (#27240) +* [yandexmusic] Refactor ++ [yandexmusic] Add support for artist's tracks and albums (#11887, #22284) +* [yandexmusic:track] Fix extraction (#26449, #26669, #26747, #26748, #26762) + + +version 2020.11.29 + +Core +* [YoutubeDL] Write static debug to stderr and respect quiet for dynamic debug + (#14579, #22593) + +Extractors +* [drtv] Extend URL regular expression (#27243) +* [tiktok] Fix extraction (#20809, #22838, #22850, #25987, #26281, #26411, + #26639, #26776, #27237) ++ [ina] Add support for mobile URLs (#27229) +* [pornhub] Fix like and dislike count extraction (#27227, #27234) +* [youtube] Improve yt initial player response extraction (#27216) +* [videa] Fix extraction (#25650, #25973, #26301) + + +version 2020.11.26 + +Core +* [downloader/fragment] Set final file's mtime according to last fragment's + Last-Modified header (#11718, #18384, #27138) + +Extractors ++ [spreaker] Add support for spreaker.com (#13480, #13877) +* [vlive] Improve extraction for geo-restricted videos ++ [vlive] Add support for post URLs (#27122, #27123) +* [viki] Fix video API request (#27184) +* [bbc] Fix BBC Three clip extraction +* [bbc] Fix BBC News videos extraction ++ [medaltv] Add support for medal.tv (#27149) +* [youtube] Improve music metadata and license extraction (#26013) +* [nrk] Fix extraction +* [cda] Fix extraction (#17803, #24458, #24518, #26381) + + +version 2020.11.24 + +Core ++ [extractor/common] Add generic support for akamai HTTP format extraction + +Extractors +* [youtube:tab] Fix feeds extraction (#25695, #26452) +* [youtube:favorites] Restore extractor +* [youtube:tab] Fix some weird typo (#27157) ++ [pinterest] Add support for large collections (more than 25 pins) ++ [franceinter] Extract thumbnail (#27153) ++ [box] Add support for box.com (#5949) ++ [nytimes] Add support for cooking.nytimes.com (#27112, #27143) +* [lbry] Relax URL regular expression (#27144) ++ [rumble] Add support for embed pages (#10785) ++ [skyit] Add support for multiple Sky Italia websites (#26629) ++ [pinterest] Add support for pinterest.com (#25747) + + +version 2020.11.21.1 + +Core +* [downloader/http] Fix crash during urlopen caused by missing reason + of URLError +* [YoutubeDL] Fix --ignore-errors for playlists with generator-based entries + of url_transparent (#27064) + +Extractors ++ [svtplay] Add support for svt.se/barnkanalen (#24817) ++ [svt] Extract timestamp (#27130) +* [svtplay] Improve thumbnail extraction (#27130) +* [youtube] Fix error reason extraction (#27081) +* [youtube] Fix like and dislike count extraction (#25977) ++ [youtube:tab] Add support for current video and fix lives extraction (#27126) +* [infoq] Fix format extraction (#25984) +* [francetv] Update to fix thumbnail URL issue (#27120) +* [youtube] Improve yt initial data extraction (#27093) ++ [discoverynetworks] Add support new TLC/DMAX URLs (#27100) +* [rai] Fix protocol relative relinker URLs (#22766) +* [rai] Fix unavailable video format detection +* [rai] Improve extraction +* [rai] Fix extraction (#27077) +* [viki] Improve format extraction +* [viki] Fix stream extraction from MPD (#27092) +* [googledrive] Fix format extraction (#26979) ++ [amara] Add support for amara.org (#20618) +* [vimeo:album] Fix extraction (#27079) +* [mtv] Fix mgid extraction (#26841) + + +version 2020.11.19 + +Core +* [extractor/common] Output error for invalid URLs in _is_valid_url (#21400, + #24151, #25617, #25618, #25586, #26068, #27072) + +Extractors +* [youporn] Fix upload date extraction +* [youporn] Make comment count optional (#26986) +* [arte] Rework extractors + * Reimplement embed and playlist extractors to delegate to the single + entrypoint artetv extractor + * Improve embeds detection (#27057) ++ [arte] Extract m3u8 formats (#27061) +* [mgtv] Fix format extraction (#26415) ++ [lbry] Add support for odysee.com (#26806) +* [francetv] Improve info extraction ++ [francetv] Add fallback video URL extraction (#27047) + + +version 2020.11.18 + +Extractors +* [spiegel] Fix extraction (#24206, #24767) +* [youtube] Improve extraction + + Add support for --no-playlist (#27009) + * Improve playlist and mix extraction (#26390, #26509, #26534, #27011) + + Extract playlist uploader data +* [youtube:tab] Fix view count extraction (#27051) +* [malltv] Fix extraction (#27035) ++ [bandcamp] Extract playlist description (#22684) +* [urplay] Fix extraction (#26828) +* [youtube:tab] Fix playlist title extraction (#27015) +* [youtube] Fix chapters extraction (#26005) + + +version 2020.11.17 + +Core +* [utils] Skip ! prefixed code in js_to_json + +Extractors +* [youtube:tab] Fix extraction with cookies provided (#27005) +* [lrt] Fix extraction with empty tags (#20264) ++ [ndr:embed:base] Extract subtitles (#25447, #26106) ++ [servus] Add support for pm-wissen.com (#25869) +* [servus] Fix extraction (#26872, #26967, #26983, #27000) +* [xtube] Fix extraction (#26996) +* [lrt] Fix extraction ++ [lbry] Add support for lbry.tv ++ [condenast] Extract subtitles +* [condenast] Fix extraction +* [bandcamp] Fix extraction (#26681, #26684) +* [rai] Fix RaiPlay extraction (#26064, #26096) +* [vlive] Fix extraction +* [usanetwork] Fix extraction +* [nbc] Fix NBCNews/Today/MSNBC extraction +* [cnbc] Fix extraction + + +version 2020.11.12 + +Extractors +* [youtube] Rework extractors + + +version 2020.11.01 + +Core +* [utils] Don't attempt to coerce JS strings to numbers in js_to_json (#26851) +* [downloader/http] Properly handle missing message in SSLError (#26646) +* [downloader/http] Fix access to not yet opened stream in retry + +Extractors +* [youtube] Fix JS player URL extraction +* [ytsearch] Fix extraction (#26920) +* [afreecatv] Fix typo (#26970) +* [23video] Relax URL regular expression (#26870) ++ [ustream] Add support for video.ibm.com (#26894) +* [iqiyi] Fix typo (#26884) ++ [expressen] Add support for di.se (#26670) +* [iprima] Improve video id extraction (#26507, #26494) + + +version 2020.09.20 + +Core +* [extractor/common] Relax interaction count extraction in _json_ld ++ [extractor/common] Extract author as uploader for VideoObject in _json_ld +* [downloader/hls] Fix incorrect end byte in Range HTTP header for + media segments with EXT-X-BYTERANGE (#14748, #24512) +* [extractor/common] Handle ssl.CertificateError in _request_webpage (#26601) +* [downloader/http] Improve timeout detection when reading block of data + (#10935) +* [downloader/http] Retry download when urlopen times out (#10935, #26603) + +Extractors +* [redtube] Extend URL regular expression (#26506) +* [twitch] Refactor +* [twitch:stream] Switch to GraphQL and fix reruns (#26535) ++ [telequebec] Add support for brightcove videos (#25833) +* [pornhub] Extract metadata from JSON-LD (#26614) +* [pornhub] Fix view count extraction (#26621, #26614) + + +version 2020.09.14 + +Core ++ [postprocessor/embedthumbnail] Add support for non jpg/png thumbnails + (#25687, #25717) + +Extractors +* [rtlnl] Extend URL regular expression (#26549, #25821) +* [youtube] Fix empty description extraction (#26575, #26006) +* [srgssr] Extend URL regular expression (#26555, #26556, #26578) +* [googledrive] Use redirect URLs for source format (#18877, #23919, #24689, + #26565) +* [svtplay] Fix id extraction (#26576) +* [redbulltv] Improve support for rebull.com TV localized URLs (#22063) ++ [redbulltv] Add support for new redbull.com TV URLs (#22037, #22063) +* [soundcloud:pagedplaylist] Reduce pagination limit (#26557) + + +version 2020.09.06 + +Core ++ [utils] Recognize wav mimetype (#26463) + +Extractors +* [nrktv:episode] Improve video id extraction (#25594, #26369, #26409) +* [youtube] Fix age gate content detection (#26100, #26152, #26311, #26384) +* [youtube:user] Extend URL regular expression (#26443) +* [xhamster] Improve initials regular expression (#26526, #26353) +* [svtplay] Fix video id extraction (#26425, #26428, #26438) +* [twitch] Rework extractors (#12297, #20414, #20604, #21811, #21812, #22979, + #24263, #25010, #25553, #25606) + * Switch to GraphQL + + Add support for collections + + Add support for clips and collections playlists +* [biqle] Improve video ext extraction +* [xhamster] Fix extraction (#26157, #26254) +* [xhamster] Extend URL regular expression (#25789, #25804, #25927)) + + +version 2020.07.28 + +Extractors +* [youtube] Fix sigfunc name extraction (#26134, #26135, #26136, #26137) +* [youtube] Improve description extraction (#25937, #25980) +* [wistia] Restrict embed regular expression (#25969) +* [youtube] Prevent excess HTTP 301 (#25786) ++ [youtube:playlists] Extend URL regular expression (#25810) ++ [bellmedia] Add support for cp24.com clip URLs (#25764) +* [brightcove] Improve embed detection (#25674) + + +version 2020.06.16.1 + +Extractors +* [youtube] Force old layout (#25682, #25683, #25680, #25686) +* [youtube] Fix categories and improve tags extraction + + +version 2020.06.16 + +Extractors +* [youtube] Fix uploader id and uploader URL extraction +* [youtube] Improve view count extraction +* [youtube] Fix upload date extraction (#25677) +* [youtube] Fix thumbnails extraction (#25676) +* [youtube] Fix playlist and feed extraction (#25675) ++ [facebook] Add support for single-video ID links ++ [youtube] Extract chapters from JSON (#24819) ++ [kaltura] Add support for multiple embeds on a webpage (#25523) + + +version 2020.06.06 + +Extractors +* [tele5] Bypass geo restriction ++ [jwplatform] Add support for bypass geo restriction +* [tele5] Prefer jwplatform over nexx (#25533) +* [twitch:stream] Expect 400 and 410 HTTP errors from API +* [twitch:stream] Fix extraction (#25528) +* [twitch] Fix thumbnails extraction (#25531) ++ [twitch] Pass v5 Accept HTTP header (#25531) +* [brightcove] Fix subtitles extraction (#25540) ++ [malltv] Add support for sk.mall.tv (#25445) +* [periscope] Fix untitled broadcasts (#25482) +* [jwplatform] Improve embeds extraction (#25467) + + +version 2020.05.29 + +Core +* [postprocessor/ffmpeg] Embed series metadata with --add-metadata +* [utils] Fix file permissions in write_json_file (#12471, #25122) + +Extractors +* [ard:beta] Extend URL regular expression (#25405) ++ [youtube] Add support for more invidious instances (#25417) +* [giantbomb] Extend URL regular expression (#25222) +* [ard] Improve URL regular expression (#25134, #25198) +* [redtube] Improve formats extraction and extract m3u8 formats (#25311, + #25321) +* [indavideo] Switch to HTTPS for API request (#25191) +* [redtube] Improve title extraction (#25208) +* [vimeo] Improve format extraction and sorting (#25285) +* [soundcloud] Reduce API playlist page limit (#25274) ++ [youtube] Add support for yewtu.be (#25226) +* [mailru] Fix extraction (#24530, #25239) +* [bellator] Fix mgid extraction (#25195) + + +version 2020.05.08 + +Core +* [downloader/http] Request last data block of exact remaining size +* [downloader/http] Finish downloading once received data length matches + expected +* [extractor/common] Use compat_cookiejar_Cookie for _set_cookie to always + ensure cookie name and value are bytestrings on python 2 (#23256, #24776) ++ [compat] Introduce compat_cookiejar_Cookie +* [utils] Improve cookie files support + + Add support for UTF-8 in cookie files + * Skip malformed cookie file entries instead of crashing (invalid entry + length, invalid expires at) + +Extractors +* [youtube] Improve signature cipher extraction (#25187, #25188) +* [iprima] Improve extraction (#25138) +* [uol] Fix extraction (#22007) ++ [orf] Add support for more radio stations (#24938, #24968) +* [dailymotion] Fix typo +- [puhutv] Remove no longer available HTTP formats (#25124) + + +version 2020.05.03 + +Core ++ [extractor/common] Extract multiple JSON-LD entries +* [options] Clarify doc on --exec command (#19087, #24883) +* [extractor/common] Skip malformed ISM manifest XMLs while extracting + ISM formats (#24667) + +Extractors +* [crunchyroll] Fix and improve extraction (#25096, #25060) +* [youtube] Improve player id extraction +* [youtube] Use redirected video id if any (#25063) +* [yahoo] Fix GYAO Player extraction and relax URL regular expression + (#24178, #24778) +* [tvplay] Fix Viafree extraction (#15189, #24473, #24789) +* [tenplay] Relax URL regular expression (#25001) ++ [prosiebensat1] Extract series metadata +* [prosiebensat1] Improve extraction and remove 7tv.de support (#24948) +- [prosiebensat1] Remove 7tv.de support (#24948) +* [youtube] Fix DRM videos detection (#24736) +* [thisoldhouse] Fix video id extraction (#24548, #24549) ++ [soundcloud] Extract AAC format (#19173, #24708) +* [youtube] Skip broken multifeed videos (#24711) +* [nova:embed] Fix extraction (#24700) +* [motherless] Fix extraction (#24699) +* [twitch:clips] Extend URL regular expression (#24290, #24642) +* [tv4] Fix ISM formats extraction (#24667) +* [tele5] Fix extraction (#24553) ++ [mofosex] Add support for generic embeds (#24633) ++ [youporn] Add support for generic embeds ++ [spankwire] Add support for generic embeds (#24633) +* [spankwire] Fix extraction (#18924, #20648) + + version 2020.03.24 Core @@ -96,7 +540,7 @@ Extractors + Add support for more domains * [svt] Fix series extraction (#22297) * [svt] Fix article extraction (#22897, #22919) -* [soundcloud] Imporve private playlist/set tracks extraction (#3707) +* [soundcloud] Improve private playlist/set tracks extraction (#3707) version 2020.01.24 @@ -222,7 +666,7 @@ Extractors * [abcotvs] Relax URL regular expression and improve metadata extraction (#18014) * [channel9] Reduce response size -* [adobetv] Improve extaction +* [adobetv] Improve extraction * Use OnDemandPagedList for list extractors * Reduce show extraction requests * Extract original video format and subtitles @@ -247,7 +691,7 @@ Extractors * [dailymotion] Improve extraction * Extract http formats included in m3u8 manifest * Fix user extraction (#3553, #21415) - + Add suport for User Authentication (#11491) + + Add support for User Authentication (#11491) * Fix password protected videos extraction (#23176) * Respect age limit option and family filter cookie value (#18437) * Handle video url playlist query param @@ -332,7 +776,7 @@ Extractors - [go90] Remove extractor * [kakao] Remove raw request + [kakao] Extract format total bitrate -* [daum] Fix VOD and Clip extracton (#15015) +* [daum] Fix VOD and Clip extraction (#15015) * [kakao] Improve extraction + Add support for embed URLs + Add support for Kakao Legacy vid based embed URLs @@ -376,7 +820,7 @@ Extractors * Improve format extraction (#22123) + Extract uploader_id and uploader_url (#21916) + Extract all known thumbnails (#19071, #20659) - * Fix extration for private playlists (#20976) + * Fix extraction for private playlists (#20976) + Add support for playlist embeds (#20976) * Skip preview formats (#22806) * [dplay] Improve extraction @@ -851,7 +1295,7 @@ Extractors * [hbo] Fix extraction and extract subtitles (#14629, #13709) * [youtube] Extract srv[1-3] subtitle formats (#20566) * [adultswim] Fix extraction (#18025) -* [teamcoco] Fix extraction and add suport for subdomains (#17099, #20339) +* [teamcoco] Fix extraction and add support for subdomains (#17099, #20339) * [adn] Fix subtitle compatibility with ffmpeg * [adn] Fix extraction and add support for positioning styles (#20549) * [vk] Use unique video id (#17848) @@ -1263,7 +1707,7 @@ version 2018.11.18 Extractors + [wwe] Extract subtitles -+ [wwe] Add support for playlistst (#14781) ++ [wwe] Add support for playlists (#14781) + [wwe] Add support for wwe.com (#14781, #17450) * [vk] Detect geo restriction (#17767) * [openload] Use original host during extraction (#18211) @@ -2296,7 +2740,7 @@ Extractors * [youku] Update ccode (#14872) * [mnet] Fix format extraction (#14883) + [xiami] Add Referer header to API request -* [mtv] Correct scc extention in extracted subtitles (#13730) +* [mtv] Correct scc extension in extracted subtitles (#13730) * [vvvvid] Fix extraction for kenc videos (#13406) + [br] Add support for BR Mediathek videos (#14560, #14788) + [daisuki] Add support for motto.daisuki.com (#14681) @@ -2317,7 +2761,7 @@ Extractors * [nexx] Extract more formats + [openload] Add support for openload.link (#14763) * [empflix] Relax URL regular expression -* [empflix] Fix extractrion +* [empflix] Fix extraction * [tnaflix] Don't modify download URLs (#14811) - [gamersyde] Remove extractor * [francetv:generationwhat] Fix extraction @@ -2512,7 +2956,7 @@ Extractors * [yahoo] Bypass geo restriction for brightcove (#14210) * [yahoo] Use extracted brightcove account id (#14210) * [rtve:alacarta] Fix extraction (#14290) -+ [yahoo] Add support for custom brigthcove embeds (#14210) ++ [yahoo] Add support for custom brightcove embeds (#14210) + [generic] Add support for Video.js embeds + [gfycat] Add support for /gifs/detail URLs (#14322) * [generic] Fix infinite recursion for twitter:player URLs (#14339) @@ -2757,7 +3201,7 @@ Extractors * [amcnetworks] Make rating optional (#12453) * [cloudy] Fix extraction (#13737) + [nickru] Add support for nickelodeon.ru -* [mtv] Improve thumbnal extraction +* [mtv] Improve thumbnail extraction * [nick] Automate geo-restriction bypass (#13711) * [niconico] Improve error reporting (#13696) @@ -3121,7 +3565,7 @@ Extractors + [cda] Support birthday verification (#12789) * [leeco] Fix extraction (#12974) + [pbs] Extract chapters -* [amp] Imporove thumbnail and subtitles extraction +* [amp] Improve thumbnail and subtitles extraction * [foxsports] Fix extraction (#12945) - [coub] Remove comment count extraction (#12941) @@ -3291,7 +3735,7 @@ Extractors + [rbmaradio] Add support for redbullradio.com URLs (#12687) + [npo:live] Add support for default URL (#12555) * [mixcloud:playlist] Fix title, description and view count extraction (#12582) -+ [thesun] Add suport for thesun.co.uk (#11298, #12674) ++ [thesun] Add support for thesun.co.uk (#11298, #12674) + [ceskateleveize:porady] Add support for porady (#7411, #12645) * [ceskateleveize] Improve extraction and remove URL replacement hacks + [kaltura] Add support for iframe embeds (#12679) @@ -3330,7 +3774,7 @@ Extractors * [funimation] Fix extraction (#10696, #11773) + [xfileshare] Add support for vidabc.com (#12589) + [xfileshare] Improve extraction and extract hls formats -+ [crunchyroll] Pass geo verifcation proxy ++ [crunchyroll] Pass geo verification proxy + [cwtv] Extract ISM formats + [tvplay] Bypass geo restriction + [vrv] Add support for vrv.co @@ -3394,7 +3838,7 @@ Extractors + [bostonglobe] Add extractor for bostonglobe.com (#12099) + [toongoggles] Add support for toongoggles.com (#12171) + [medialaan] Add support for Medialaan sites (#9974, #11912) -+ [discoverynetworks] Add support for more domains and bypass geo restiction ++ [discoverynetworks] Add support for more domains and bypass geo restriction * [openload] Fix extraction (#10408) @@ -4984,7 +5428,7 @@ version 2016.07.09.1 Fixed/improved extractors - youtube - ard -- srmediatek (#9373) +- srmediathek (#9373) version 2016.07.09 @@ -5048,7 +5492,7 @@ Fixed/improved extractors - kaltura (#5557) - la7 - Changed features -- Rename --cn-verfication-proxy to --geo-verification-proxy +- Rename --cn-verification-proxy to --geo-verification-proxy Miscellaneous - Add script for displaying downloads statistics diff --git a/README.md b/README.md index 4f54a5240..fb1aee7dc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Build Status](https://travis-ci.org/ytdl-org/youtube-dl.svg?branch=master)](https://travis-ci.org/ytdl-org/youtube-dl) +[![Build Status](https://travis-ci.com/ytdl-org/youtube-dl.svg?branch=master)](https://travis-ci.com/ytdl-org/youtube-dl) youtube-dl - download videos from youtube.com or other video platforms @@ -434,9 +434,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo either the path to the binary or its containing directory. --exec CMD Execute a command on the file after - downloading, similar to find's -exec - syntax. Example: --exec 'adb push {} - /sdcard/Music/ && rm {}' + downloading and post-processing, similar to + find's -exec syntax. Example: --exec 'adb + push {} /sdcard/Music/ && rm {}' --convert-subs FORMAT Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc) @@ -545,7 +545,7 @@ The basic usage is not to set any template arguments when downloading a single f - `extractor` (string): Name of the extractor - `extractor_key` (string): Key name of the extractor - `epoch` (numeric): Unix epoch when creating the file - - `autonumber` (numeric): Five-digit number that will be increased with each download, starting at zero + - `autonumber` (numeric): Number that will be increased with each download, starting at `--autonumber-start` - `playlist` (string): Name or id of the playlist that contains the video - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the total length of the playlist - `playlist_id` (string): Playlist identifier @@ -1032,7 +1032,7 @@ After you have ensured this site is distributing its content legally, you can fo 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart): +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 youtube_dl/extractor/yourextractor.py diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 0a1762dbc..878ae72b1 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -61,7 +61,7 @@ def build_lazy_ie(ie, name): return s -# find the correct sorting and add the required base classes so that sublcasses +# find the correct sorting and add the required base classes so that subclasses # can be correctly created classes = _ALL_CLASSES[:-1] ordered_cls = [] diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 174b83bf3..b551aa1eb 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -35,12 +35,15 @@ - **adobetv:video** - **AdultSwim** - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault + - **aenetworks:collection** + - **aenetworks:show** - **afreecatv**: afreecatv.com - **AirMozilla** - **AliExpressLive** - **AlJazeera** - **Allocine** - **AlphaPorno** + - **Amara** - **AMCNetworks** - **AmericasTestKitchen** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl @@ -58,9 +61,10 @@ - **ARD:mediathek** - **ARDBetaMediathek** - **Arkena** - - **arte.tv:+7** - - **arte.tv:embed** - - **arte.tv:playlist** + - **arte.sky.it** + - **ArteTV** + - **ArteTVEmbed** + - **ArteTVPlaylist** - **AsianCrush** - **AsianCrushPlaylist** - **AtresPlayer** @@ -109,6 +113,7 @@ - **Bloomberg** - **BokeCC** - **BostonGlobe** + - **Box** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk - **BravoTV** @@ -156,6 +161,7 @@ - **Chilloutzone** - **chirbit** - **chirbit:profile** + - **cielotv.it** - **Cinchcast** - **Cinemax** - **CiscoLiveSearch** @@ -304,6 +310,7 @@ - **FrontendMasters** - **FrontendMastersCourse** - **FrontendMastersLesson** + - **FujiTVFODPlus7** - **Funimation** - **Funk** - **Fusion** @@ -417,6 +424,8 @@ - **la7.it** - **laola1tv** - **laola1tv:embed** + - **lbry** + - **lbry:channel** - **LCI** - **Lcp** - **LcpPlay** @@ -466,6 +475,7 @@ - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA + - **MedalTV** - **media.ccc.de** - **media.ccc.de:lists** - **Medialaan** @@ -480,6 +490,7 @@ - **META** - **metacafe** - **Metacritic** + - **mewatch** - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** @@ -497,6 +508,7 @@ - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net - **Mofosex** + - **MofosexEmbed** - **Mojvideo** - **Morningstar**: morningstar.com - **Motherless** @@ -605,6 +617,7 @@ - **Nuvid** - **NYTimes** - **NYTimesArticle** + - **NYTimesCooking** - **NZZ** - **ocw.mit.edu** - **OdaTV** @@ -619,11 +632,21 @@ - **Ooyala** - **OoyalaExternal** - **OraTV** + - **orf:burgenland**: Radio Burgenland - **orf:fm4**: radio FM4 - **orf:fm4:story**: fm4.orf.at stories - **orf:iptv**: iptv.ORF.at + - **orf:kaernten**: Radio Kärnten + - **orf:noe**: Radio Niederösterreich + - **orf:oberoesterreich**: Radio Oberösterreich - **orf:oe1**: Radio Österreich 1 + - **orf:oe3**: Radio Österreich 3 + - **orf:salzburg**: Radio Salzburg + - **orf:steiermark**: Radio Steiermark + - **orf:tirol**: Radio Tirol - **orf:tvthek**: ORF TVthek + - **orf:vorarlberg**: Radio Vorarlberg + - **orf:wien**: Radio Wien - **OsnatelTV** - **OutsideTV** - **PacktPub** @@ -647,10 +670,13 @@ - **PicartoVod** - **Piksel** - **Pinkbike** + - **Pinterest** + - **PinterestCollection** - **Pladform** - **Platzi** - **PlatziCourse** - **play.fm** + - **player.sky.it** - **PlayPlusTV** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz @@ -688,6 +714,7 @@ - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 - **QuantumTV** + - **Qub** - **Quickline** - **QuicklineLive** - **R7** @@ -706,6 +733,8 @@ - **RayWenderlichCourse** - **RBMARadio** - **RDS**: RDS.ca + - **RedBull** + - **RedBullEmbed** - **RedBullTV** - **RedBullTVRrnContent** - **Reddit** @@ -740,6 +769,7 @@ - **RTVNH** - **RTVS** - **RUHD** + - **RumbleEmbed** - **rutube**: Rutube videos - **rutube:channel**: Rutube channels - **rutube:embed**: Rutube embedded videos @@ -777,6 +807,8 @@ - **Shared**: shared.sx - **ShowRoomLive** - **Sina** + - **sky.it** + - **skyacademy.it** - **SkylineWebcams** - **SkyNews** - **skynewsarabia:article** @@ -810,12 +842,14 @@ - **SpankBangPlaylist** - **Spankwire** - **Spiegel** - - **Spiegel:Article**: Articles on spiegel.de - - **Spiegeltv** - **sport.francetvinfo.fr** - **Sport5** - **SportBox** - **SportDeutschland** + - **Spreaker** + - **SpreakerPage** + - **SpreakerShow** + - **SpreakerShowPage** - **SpringboardPlatform** - **Sprout** - **sr:mediathek**: Saarländischer Rundfunk @@ -867,6 +901,7 @@ - **TeleQuebecEmission** - **TeleQuebecLive** - **TeleQuebecSquat** + - **TeleQuebecVideo** - **TeleTask** - **Telewebion** - **TennisTV** @@ -884,7 +919,7 @@ - **ThisAV** - **ThisOldHouse** - **TikTok** - - **TikTokUser** + - **TikTokUser** (Currently broken) - **tinypic**: tinypic.com videos - **TMZ** - **TMZArticle** @@ -917,11 +952,13 @@ - **TV2DKBornholmPlay** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ + - **tv8.it** - **TVA** - **TVANouvelles** - **TVANouvellesArticle** - **TVC** - **TVCArticle** + - **TVer** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** @@ -939,16 +976,13 @@ - **TVPlayHome** - **Tweakers** - **TwitCasting** - - **twitch:chapter** - **twitch:clips** - - **twitch:profile** - **twitch:stream** - - **twitch:video** - - **twitch:videos:all** - - **twitch:videos:highlights** - - **twitch:videos:past-broadcasts** - - **twitch:videos:uploads** - **twitch:vod** + - **TwitchCollection** + - **TwitchVideos** + - **TwitchVideosClips** + - **TwitchVideosCollections** - **twitter** - **twitter:amplify** - **twitter:broadcast** @@ -991,6 +1025,8 @@ - **Viddler** - **Videa** - **video.google:search**: Google Video search + - **video.sky.it** + - **video.sky.it:live** - **VideoDetective** - **videofy.me** - **videomore** @@ -1032,7 +1068,7 @@ - **vk:wallpost** - **vlive** - **vlive:channel** - - **vlive:playlist** + - **vlive:post** - **Vodlocker** - **VODPl** - **VODPlatform** @@ -1104,6 +1140,8 @@ - **yahoo:japannews**: Yahoo! Japan News - **YandexDisk** - **yandexmusic:album**: Яндекс.Музыка - Альбом + - **yandexmusic:artist:albums**: Яндекс.Музыка - Артист - Альбомы + - **yandexmusic:artist:tracks**: Яндекс.Музыка - Артист - Треки - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек - **YandexVideo** @@ -1121,20 +1159,17 @@ - **YourPorn** - **YourUpload** - **youtube**: YouTube.com - - **youtube:channel**: YouTube.com channels - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - - **youtube:live**: YouTube.com live streams - **youtube:playlist**: YouTube.com playlists - - **youtube:playlists**: YouTube.com user/channel playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) - **youtube:search**: YouTube.com searches - **youtube:search:date**: YouTube.com searches, newest videos first - - **youtube:search_url**: YouTube.com search URLs - - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) + - **youtube:tab**: YouTube.com tab - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **YoutubeYtBe** + - **YoutubeYtUser** - **Zapiks** - **Zaq1** - **Zattoo** diff --git a/test/parameters.json b/test/parameters.json index 7bf59c25f..65fd54428 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -37,7 +37,7 @@ "writeinfojson": true, "writesubtitles": false, "allsubtitles": false, - "listssubtitles": false, + "listsubtitles": false, "socket_timeout": 20, "fixup": "never" } diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 71f6608fe..644b3759c 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -108,6 +108,18 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) def test_parse_html5_media_entries(self): + # inline video tag + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://127.0.0.1/video.html', + r'', None)[0], + { + 'formats': [{ + 'url': 'https://127.0.0.1/vid.mp4', + }], + }) + # from https://www.r18.com/ # with kpbs in label expect_dict( diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 1e204e551..62f916d11 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -919,6 +919,76 @@ class TestYoutubeDL(unittest.TestCase): self.assertEqual(downloaded['extractor'], 'testex') self.assertEqual(downloaded['extractor_key'], 'TestEx') + # Test case for https://github.com/ytdl-org/youtube-dl/issues/27064 + def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self): + + class _YDL(YDL): + def __init__(self, *args, **kwargs): + super(_YDL, self).__init__(*args, **kwargs) + + def trouble(self, s, tb=None): + pass + + ydl = _YDL({ + 'format': 'extra', + 'ignoreerrors': True, + }) + + class VideoIE(InfoExtractor): + _VALID_URL = r'video:(?P\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [{ + 'format_id': 'default', + 'url': 'url:', + }] + if video_id == '0': + raise ExtractorError('foo') + if video_id == '2': + formats.append({ + 'format_id': 'extra', + 'url': TEST_URL, + }) + return { + 'id': video_id, + 'title': 'Video %s' % video_id, + 'formats': formats, + } + + class PlaylistIE(InfoExtractor): + _VALID_URL = r'playlist:' + + def _entries(self): + for n in range(3): + video_id = compat_str(n) + yield { + '_type': 'url_transparent', + 'ie_key': VideoIE.ie_key(), + 'id': video_id, + 'url': 'video:%s' % video_id, + 'title': 'Video Transparent %s' % video_id, + } + + def _real_extract(self, url): + return self.playlist_result(self._entries()) + + ydl.add_info_extractor(VideoIE(ydl)) + ydl.add_info_extractor(PlaylistIE(ydl)) + info = ydl.extract_info('playlist:') + entries = info['entries'] + self.assertEqual(len(entries), 3) + self.assertTrue(entries[0] is None) + self.assertTrue(entries[1] is None) + self.assertEqual(len(ydl.downloaded_info_dicts), 1) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(entries[2], downloaded) + self.assertEqual(downloaded['url'], TEST_URL) + self.assertEqual(downloaded['title'], 'Video Transparent 2') + self.assertEqual(downloaded['id'], '2') + self.assertEqual(downloaded['extractor'], 'Video') + self.assertEqual(downloaded['extractor_key'], 'Video') + if __name__ == '__main__': unittest.main() diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py index f959798de..05f48bd74 100644 --- a/test/test_YoutubeDLCookieJar.py +++ b/test/test_YoutubeDLCookieJar.py @@ -39,6 +39,13 @@ class TestYoutubeDLCookieJar(unittest.TestCase): assert_cookie_has_value('HTTPONLY_COOKIE') assert_cookie_has_value('JS_ACCESSIBLE_COOKIE') + def test_malformed_cookies(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/malformed_cookies.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + # Cookies should be empty since all malformed cookie file entries + # will be ignored + self.assertFalse(cookiejar._cookies) + if __name__ == '__main__': unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 81056a999..50c3466fa 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -31,16 +31,17 @@ class TestAllURLsMatching(unittest.TestCase): def test_youtube_playlist_matching(self): assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist']) + assertTab = lambda url: self.assertMatch(url, ['youtube:tab']) assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585 assertPlaylist('PL63F0C78739B09958') - assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') + assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') - assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') - assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 + assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') + assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M')) # Top tracks - assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101') + assertTab('https://www.youtube.com/playlist?list=MCUS.20142101') def test_youtube_matching(self): self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M')) @@ -51,26 +52,23 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube']) def test_youtube_channel_matching(self): - assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) + assertChannel = lambda url: self.assertMatch(url, ['youtube:tab']) assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') - def test_youtube_user_matching(self): - self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user']) + # def test_youtube_user_matching(self): + # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) def test_youtube_feeds(self): - self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) - self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) - self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) - self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) + self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) - def test_youtube_show_matching(self): - self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) - - def test_youtube_search_matching(self): - self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) - self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) + # def test_youtube_search_matching(self): + # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) + # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) diff --git a/test/test_utils.py b/test/test_utils.py index 0896f4150..925a21d34 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -803,6 +803,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(mimetype2ext('text/vtt'), 'vtt') self.assertEqual(mimetype2ext('text/vtt;charset=utf-8'), 'vtt') self.assertEqual(mimetype2ext('text/html; charset=utf-8'), 'html') + self.assertEqual(mimetype2ext('audio/x-wav'), 'wav') + self.assertEqual(mimetype2ext('audio/x-wav;codec=pcm'), 'wav') def test_month_by_name(self): self.assertEqual(month_by_name(None), None) @@ -935,6 +937,28 @@ class TestUtil(unittest.TestCase): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + # Just drop ! prefix for now though this results in a wrong value + on = js_to_json('''{ + a: !0, + b: !1, + c: !!0, + d: !!42.42, + e: !!![], + f: !"abc", + g: !"", + !42: 42 + }''') + self.assertEqual(json.loads(on), { + 'a': 0, + 'b': 1, + 'c': 0, + 'd': 42.42, + 'e': [], + 'f': "abc", + 'g': "", + '42': 42 + }) + on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) @@ -992,6 +1016,12 @@ class TestUtil(unittest.TestCase): on = js_to_json('{42:4.2e1}') self.assertEqual(json.loads(on), {'42': 42.0}) + on = js_to_json('{ "0x40": "0x40" }') + self.assertEqual(json.loads(on), {'0x40': '0x40'}) + + on = js_to_json('{ "040": "040" }') + self.assertEqual(json.loads(on), {'040': '040'}) + def test_js_to_json_malformed(self): self.assertEqual(js_to_json('42a1'), '42"a1"') self.assertEqual(js_to_json('42a-1'), '42"a"-1') diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py index 324ca8525..e69c57377 100644 --- a/test/test_youtube_chapters.py +++ b/test/test_youtube_chapters.py @@ -267,7 +267,7 @@ class TestYoutubeChapters(unittest.TestCase): for description, duration, expected_chapters in self._TEST_CASES: ie = YoutubeIE() expect_value( - self, ie._extract_chapters(description, duration), + self, ie._extract_chapters_from_description(description, duration), expected_chapters, None) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index f0c370eee..69df30eda 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -74,6 +74,28 @@ _TESTS = [ ] +class TestPlayerInfo(unittest.TestCase): + def test_youtube_extract_player_info(self): + PLAYER_URLS = ( + ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), + # obsolete + ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), + ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), + ('https://www.youtube.com/yts/jsbin/player_ias-vflCPQUIL/en_US/base.js', 'vflCPQUIL'), + ('https://www.youtube.com/yts/jsbin/player-vflzQZbt7/en_US/base.js', 'vflzQZbt7'), + ('https://www.youtube.com/yts/jsbin/player-en_US-vflaxXRn1/base.js', 'vflaxXRn1'), + ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), + ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), + ('http://s.ytimg.com/yt/swfbin/watch_as3-vflrEm9Nq.swf', 'vflrEm9Nq'), + ('https://s.ytimg.com/yts/swfbin/player-vflenCdZL/watch_as3.swf', 'vflenCdZL'), + ) + for player_url, expected_player_id in PLAYER_URLS: + expected_player_type = player_url.split('.')[-1] + player_type, player_id = YoutubeIE._extract_player_info(player_url) + self.assertEqual(player_type, expected_player_type) + self.assertEqual(player_id, expected_player_id) + + class TestSignature(unittest.TestCase): def setUp(self): TEST_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/test/testdata/cookies/malformed_cookies.txt b/test/testdata/cookies/malformed_cookies.txt new file mode 100644 index 000000000..17bc40354 --- /dev/null +++ b/test/testdata/cookies/malformed_cookies.txt @@ -0,0 +1,9 @@ +# Netscape HTTP Cookie File +# http://curl.haxx.se/rfc/cookie_spec.html +# This is a generated file! Do not edit. + +# Cookie file entry with invalid number of fields - 6 instead of 7 +www.foobar.foobar FALSE / FALSE 0 COOKIE + +# Cookie file entry with invalid expires at +www.foobar.foobar FALSE / FALSE 1.7976931348623157e+308 COOKIE VALUE diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 19370f62b..fdd68f616 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -793,21 +793,14 @@ class YoutubeDL(object): self.report_warning('The program functionality for this site has been marked as broken, ' 'and will probably not work.') + return self.__extract_info(url, ie, download, extra_info, process) + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def __handle_extraction_exceptions(func): + def wrapper(self, *args, **kwargs): try: - ie_result = ie.extract(url) - if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) - break - if isinstance(ie_result, list): - # Backwards compatibility: old IE result format - ie_result = { - '_type': 'compat_list', - 'entries': ie_result, - } - self.add_default_extra_info(ie_result, ie, url) - if process: - return self.process_ie_result(ie_result, download, extra_info) - else: - return ie_result + return func(self, *args, **kwargs) except GeoRestrictedError as e: msg = e.msg if e.countries: @@ -815,20 +808,33 @@ class YoutubeDL(object): map(ISO3166Utils.short2full, e.countries)) msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' self.report_error(msg) - break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) - break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - break else: raise + return wrapper + + @__handle_extraction_exceptions + def __extract_info(self, url, ie, download, extra_info, process): + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + return + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + return ie_result def add_default_extra_info(self, ie_result, ie, url): self.add_extra_info(ie_result, { @@ -1003,9 +1009,8 @@ class YoutubeDL(object): self.to_screen('[download] ' + reason) continue - entry_result = self.process_ie_result(entry, - download=download, - extra_info=extra) + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? playlist_results.append(entry_result) ie_result['entries'] = playlist_results self.to_screen('[download] Finished downloading playlist: %s' % playlist) @@ -1034,6 +1039,11 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) + @__handle_extraction_exceptions + def __process_iterable_entry(self, entry, download, extra_info): + return self.process_ie_result( + entry, download=download, extra_info=extra_info) + def _build_format_filter(self, filter_spec): " Returns a function to filter the formats according to the filter_spec " @@ -1600,7 +1610,7 @@ class YoutubeDL(object): if req_format is None: req_format = self._default_format_spec(info_dict, download=download) if self.params.get('verbose'): - self.to_stdout('[debug] Default format spec: %s' % req_format) + self._write_string('[debug] Default format spec: %s\n' % req_format) format_selector = self.build_format_selector(req_format) @@ -1861,7 +1871,7 @@ class YoutubeDL(object): for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): - self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) + self.to_screen('[debug] Invoking downloader on %r' % info.get('url')) return fd.download(name, info) if info_dict.get('requested_formats') is not None: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index d1b86bd13..6c3d49d45 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -57,6 +57,17 @@ try: except ImportError: # Python 2 import cookielib as compat_cookiejar +if sys.version_info[0] == 2: + class compat_cookiejar_Cookie(compat_cookiejar.Cookie): + def __init__(self, version, name, value, *args, **kwargs): + if isinstance(name, compat_str): + name = name.encode() + if isinstance(value, compat_str): + value = value.encode() + compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs) +else: + compat_cookiejar_Cookie = compat_cookiejar.Cookie + try: import http.cookies as compat_cookies except ImportError: # Python 2 @@ -2334,7 +2345,7 @@ except ImportError: # Python <3.4 # HTMLParseError has been deprecated in Python 3.3 and removed in # Python 3.5. Introducing dummy exception for Python >3.5 for compatible - # and uniform cross-version exceptiong handling + # and uniform cross-version exception handling class compat_HTMLParseError(Exception): pass @@ -2987,6 +2998,7 @@ __all__ = [ 'compat_basestring', 'compat_chr', 'compat_cookiejar', + 'compat_cookiejar_Cookie', 'compat_cookies', 'compat_ctypes_WINFUNCTYPE', 'compat_etree_Element', diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 02f35459e..35c76feba 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -97,12 +97,15 @@ class FragmentFD(FileDownloader): def _download_fragment(self, ctx, frag_url, info_dict, headers=None): fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) - success = ctx['dl'].download(fragment_filename, { + fragment_info_dict = { 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), - }) + } + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None + if fragment_info_dict.get('filetime'): + ctx['fragment_filetime'] = fragment_info_dict.get('filetime') down, frag_sanitized = sanitize_open(fragment_filename, 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() @@ -258,6 +261,13 @@ class FragmentFD(FileDownloader): downloaded_bytes = ctx['complete_frags_downloaded_bytes'] else: self.try_rename(ctx['tmpfilename'], ctx['filename']) + if self.params.get('updatetime', True): + filetime = ctx.get('fragment_filetime') + if filetime: + try: + os.utime(ctx['filename'], (time.time(), filetime)) + except Exception: + pass downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) self._hook_progress({ diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 84bc34928..0f2c06f40 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -141,7 +141,7 @@ class HlsFD(FragmentFD): count = 0 headers = info_dict.get('http_headers', {}) if byte_range: - headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end']) + headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) while count <= fragment_retries: try: success, frag_content = self._download_fragment( diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 3c72ea18b..d8ac41dcc 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -106,7 +106,14 @@ class HttpFD(FileDownloader): set_range(request, range_start, range_end) # Establish connection try: - ctx.data = self.ydl.urlopen(request) + try: + ctx.data = self.ydl.urlopen(request) + except (compat_urllib_error.URLError, ) as err: + # reason may not be available, e.g. for urllib2.HTTPError on python 2.6 + reason = getattr(err, 'reason', None) + if isinstance(reason, socket.timeout): + raise RetryDownload(err) + raise err # When trying to resume, Content-Range HTTP header of response has to be checked # to match the value of requested Range HTTP header. This is due to a webservers # that don't support resuming and serve a whole file with no Content-Range @@ -218,24 +225,27 @@ class HttpFD(FileDownloader): def retry(e): to_stdout = ctx.tmpfilename == '-' - if not to_stdout: - ctx.stream.close() - ctx.stream = None + if ctx.stream is not None: + if not to_stdout: + ctx.stream.close() + ctx.stream = None ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) raise RetryDownload(e) while True: try: # Download and write - data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) + data_block = ctx.data.read(block_size if data_len is None else min(block_size, data_len - byte_counter)) # socket.timeout is a subclass of socket.error but may not have # errno set except socket.timeout as e: retry(e) except socket.error as e: - if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT): - raise - retry(e) + # SSLError on python 2 (inherits socket.error) may have + # no errno set but this error message + if e.errno in (errno.ECONNRESET, errno.ETIMEDOUT) or getattr(e, 'message', None) == 'The read operation timed out': + retry(e) + raise byte_counter += len(data_block) @@ -299,7 +309,7 @@ class HttpFD(FileDownloader): 'elapsed': now - ctx.start_time, }) - if is_test and byte_counter == data_len: + if data_len is not None and byte_counter == data_len: break if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 611b948f5..3d0cf1208 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -5,20 +5,30 @@ import re from .theplatform import ThePlatformIE from ..utils import ( - extract_attributes, ExtractorError, int_or_none, - smuggle_url, update_url_query, -) -from ..compat import ( - compat_urlparse, + urlencode_postdata, ) class AENetworksBaseIE(ThePlatformIE): + _BASE_URL_REGEX = r'''(?x)https?:// + (?:(?:www|play|watch)\.)? + (?P + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| + fyi\.tv + )/''' _THEPLATFORM_KEY = 'crazyjava' _THEPLATFORM_SECRET = 's3cr3t' + _DOMAIN_MAP = { + 'history.com': ('HISTORY', 'history'), + 'aetv.com': ('AETV', 'aetv'), + 'mylifetime.com': ('LIFETIME', 'lifetime'), + 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), + 'fyi.tv': ('FYI', 'fyi'), + 'historyvault.com': (None, 'historyvault'), + } def _extract_aen_smil(self, smil_url, video_id, auth=None): query = {'mbr': 'true'} @@ -31,7 +41,7 @@ class AENetworksBaseIE(ThePlatformIE): 'assetTypes': 'high_video_s3' }, { 'assetTypes': 'high_video_s3', - 'switch': 'hls_ingest_fastly' + 'switch': 'hls_high_fastly', }] formats = [] subtitles = {} @@ -61,20 +71,13 @@ class AENetworksBaseIE(ThePlatformIE): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?P - (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| - fyi\.tv - )/ - (?: - shows/(?P[^/]+(?:/[^/]+){0,2})| - movies/(?P[^/]+)(?:/full-movie)?| - specials/(?P[^/]+)/(?:full-special|preview-)| - collections/[^/]+/(?P[^/]+) - ) - ''' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P + shows/[^/]+/season-\d+/episode-\d+| + (?: + (?:movie|special)s/[^/]+| + (?:shows/[^/]+/)?videos + )/[^/?#&]+ + )''' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'info_dict': { @@ -91,22 +94,23 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.history.com/shows/ancient-aliens/season-1', - 'info_dict': { - 'id': '71889446852', - }, - 'playlist_mincount': 5, - }, { - 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', - 'info_dict': { - 'id': 'SERIES4317', - 'title': 'Atlanta Plastic', - }, - 'playlist_mincount': 2, + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', - 'only_matching': True + 'info_dict': { + 'id': '600587331957', + 'ext': 'mp4', + 'title': 'Inlawful Entry', + 'description': 'md5:57c12115a2b384d883fe64ca50529e08', + 'timestamp': 1452634428, + 'upload_date': '20160112', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True @@ -117,80 +121,152 @@ class AENetworksIE(AENetworksBaseIE): 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True }, { - 'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us', + 'url': 'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie', 'only_matching': True }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True - }, { - 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', - 'only_matching': True }, { 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', 'only_matching': True + }, { + 'url': 'http://www.history.com/videos/history-of-valentines-day', + 'only_matching': True + }, { + 'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape', + 'only_matching': True }] - _DOMAIN_TO_REQUESTOR_ID = { - 'history.com': 'HISTORY', - 'aetv.com': 'AETV', - 'mylifetime.com': 'LIFETIME', - 'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB', - 'fyi.tv': 'FYI', - } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id or collection_display_id - webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) - if show_path: - url_parts = show_path.split('/') - url_parts_len = len(url_parts) - if url_parts_len == 1: - entries = [] - for season_url_path in re.findall(r'(?s)]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): - entries.append(self.url_result( - compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - if entries: - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - else: - # single season - url_parts_len = 2 - if url_parts_len == 2: - entries = [] - for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): - episode_attributes = extract_attributes(episode_item) - episode_url = compat_urlparse.urljoin( - url, episode_attributes['data-canonical']) - entries.append(self.url_result( - episode_url, 'AENetworks', - episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeasonId', webpage)) - - video_id = self._html_search_meta('aetn:VideoID', webpage) - media_url = self._search_regex( - [r"media_url\s*=\s*'(?P[^']+)'", - r'data-media-url=(?P(?:https?:)?//[^\s>]+)', - r'data-media-url=(["\'])(?P(?:(?!\1).)+?)\1'], - webpage, 'video url', group='url') + domain, canonical = re.match(self._VALID_URL, url).groups() + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + canonical, query={'filter[canonical]': '/' + canonical})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] theplatform_metadata = self._download_theplatform_metadata(self._search_regex( r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) info = self._parse_theplatform_metadata(theplatform_metadata) auth = None if theplatform_metadata.get('AETN$isBehindWall'): - requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] resource = self._get_mvpd_resource( requestor_id, theplatform_metadata['title'], theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), theplatform_metadata['ratings'][0]['rating']) auth = self._extract_mvpd_auth( url, video_id, requestor_id, resource) - info.update(self._search_json_ld(webpage, video_id, fatal=False)) info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) return info +class AENetworksListBaseIE(AENetworksBaseIE): + def _call_api(self, resource, slug, brand, fields): + return self._download_json( + 'https://yoga.appsvcs.aetnd.com/graphql', + slug, query={'brand': brand}, data=urlencode_postdata({ + 'query': '''{ + %s(slug: "%s") { + %s + } +}''' % (resource, slug, fields), + }))['data'][resource] + + def _real_extract(self, url): + domain, slug = re.match(self._VALID_URL, url).groups() + _, brand = self._DOMAIN_MAP[domain] + playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) + base_url = 'http://watch.%s' % domain + + entries = [] + for item in (playlist.get(self._ITEMS_KEY) or []): + doc = self._get_doc(item) + canonical = doc.get('canonical') + if not canonical: + continue + entries.append(self.url_result( + base_url + canonical, AENetworksIE.ie_key(), doc.get('id'))) + + description = None + if self._PLAYLIST_DESCRIPTION_KEY: + description = playlist.get(self._PLAYLIST_DESCRIPTION_KEY) + + return self.playlist_result( + entries, playlist.get('id'), + playlist.get(self._PLAYLIST_TITLE_KEY), description) + + +class AENetworksCollectionIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:collection' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://watch.historyvault.com/list/america-the-story-of-us', + 'info_dict': { + 'id': '282', + 'title': 'America The Story of Us', + }, + 'playlist_mincount': 12, + }, { + 'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us', + 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/mysteryquest', + 'only_matching': True + }] + _RESOURCE = 'list' + _ITEMS_KEY = 'items' + _PLAYLIST_TITLE_KEY = 'display_title' + _PLAYLIST_DESCRIPTION_KEY = None + _FIELDS = '''id + display_title + items { + ... on ListVideoItem { + doc { + canonical + id + } + } + }''' + + def _get_doc(self, item): + return item.get('doc') or {} + + +class AENetworksShowIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:show' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.history.com/shows/ancient-aliens', + 'info_dict': { + 'id': 'SH012427480000', + 'title': 'Ancient Aliens', + 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', + }, + 'playlist_mincount': 168, + }] + _RESOURCE = 'series' + _ITEMS_KEY = 'episodes' + _PLAYLIST_TITLE_KEY = 'title' + _PLAYLIST_DESCRIPTION_KEY = 'description' + _FIELDS = '''description + id + title + episodes { + canonical + id + }''' + + def _get_doc(self, item): + return item + + class HistoryTopicIE(AENetworksBaseIE): IE_NAME = 'history:topic' IE_DESC = 'History.com Topic' @@ -204,6 +280,7 @@ class HistoryTopicIE(AENetworksBaseIE): 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', 'timestamp': 1375819729, 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -212,36 +289,8 @@ class HistoryTopicIE(AENetworksBaseIE): 'add_ie': ['ThePlatform'], }] - def theplatform_url_result(self, theplatform_url, video_id, query): - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url( - update_url_query(theplatform_url, query), - { - 'sig': { - 'key': self._THEPLATFORM_KEY, - 'secret': self._THEPLATFORM_SECRET, - }, - 'force_smil_url': True - }), - 'ie_key': 'ThePlatform', - } - def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r']+src="[^"]+\btpid=(\d+)', webpage, 'tpid') - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/history/videos', - video_id, query={'filter[id]': video_id})['results'][0] - title = result['title'] - info = self._extract_aen_smil(result['publicUrl'], video_id) - info.update({ - 'title': title, - 'description': result.get('description'), - 'duration': int_or_none(result.get('duration')), - 'timestamp': int_or_none(result.get('added'), 1000), - }) - return info + return self.url_result( + 'http://www.history.com/videos/' + display_id, + AENetworksIE.ie_key()) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 6275e5209..b56abb1e6 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -275,7 +275,7 @@ class AfreecaTVIE(InfoExtractor): video_element = video_xml.findall(compat_xpath('./track/video'))[-1] if video_element is None or video_element.text is None: raise ExtractorError( - 'Video %s video does not exist' % video_id, expected=True) + 'Video %s does not exist' % video_id, expected=True) video_url = video_element.text.strip() diff --git a/youtube_dl/extractor/amara.py b/youtube_dl/extractor/amara.py new file mode 100644 index 000000000..61d469574 --- /dev/null +++ b/youtube_dl/extractor/amara.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE +from .vimeo import VimeoIE +from ..utils import ( + int_or_none, + parse_iso8601, + update_url_query, +) + + +class AmaraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P\w+)' + _TESTS = [{ + # Youtube + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour', + 'timestamp': 1549639570, + } + }, { + # Vimeo + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294763658, + 'upload_date': '20110111', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, { + # Direct Link + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 's8KL7I3jLmh6', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20091007', + 'timestamp': 1254942511, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json( + 'https://amara.org/api/videos/%s/' % video_id, + video_id, query={'format': 'json'}) + title = meta['title'] + video_url = meta['all_urls'][0] + + subtitles = {} + for language in (meta.get('languages') or []): + subtitles_uri = language.get('subtitles_uri') + if not (subtitles_uri and language.get('published')): + continue + subtitle = subtitles.setdefault(language.get('code') or 'en', []) + for f in ('json', 'srt', 'vtt'): + subtitle.append({ + 'ext': f, + 'url': update_url_query(subtitles_uri, {'format': f}), + }) + + info = { + 'url': video_url, + 'id': video_id, + 'subtitles': subtitles, + 'title': title, + 'description': meta.get('description'), + 'thumbnail': meta.get('thumbnail'), + 'duration': int_or_none(meta.get('duration')), + 'timestamp': parse_iso8601(meta.get('created')), + } + + for ie in (YoutubeIE, VimeoIE): + if ie.suitable(video_url): + info.update({ + '_type': 'url_transparent', + 'ie_key': ie.ie_key(), + }) + break + + return info diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index 6fb3d6c53..12b6de0bf 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import ( int_or_none, @@ -11,25 +13,22 @@ from ..utils import ( class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pamc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', - 'md5': '', + 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', 'info_dict': { - 'id': 's3MX01Nl4vPH', + 'id': '4Lq1dzOnZGt0', 'ext': 'mp4', - 'title': 'Maron - Season 4 - Step 1', - 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.', - 'age_limit': 17, - 'upload_date': '20160505', - 'timestamp': 1462468831, + 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", + 'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", + 'upload_date': '20201120', + 'timestamp': 1605904350, 'uploader': 'AMCN', }, 'params': { # m3u8 download 'skip_download': True, }, - 'skip': 'Requires TV provider accounts', }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, @@ -55,32 +54,33 @@ class AMCNetworksIE(ThePlatformIE): 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', 'only_matching': True, }] + _REQUESTOR_ID_MAP = { + 'amc': 'AMC', + 'bbcamerica': 'BBCA', + 'ifc': 'IFC', + 'sundancetv': 'SUNDANCE', + 'wetv': 'WETV', + } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + site, display_id = re.match(self._VALID_URL, url).groups() + requestor_id = self._REQUESTOR_ID_MAP[site] + properties = self._download_json( + 'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s' % (requestor_id.lower(), display_id), + display_id)['data']['properties'] query = { 'mbr': 'true', 'manifest': 'm3u', } - media_url = self._search_regex( - r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', - webpage, 'media url') - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'link\.theplatform\.com/s/([^?]+)', - media_url, 'theplatform_path'), display_id) + tp_path = 'M_UwQC/media/' + properties['videoPid'] + media_url = 'https://link.theplatform.com/s/' + tp_path + theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) info = self._parse_theplatform_metadata(theplatform_metadata) video_id = theplatform_metadata['pid'] title = theplatform_metadata['title'] rating = try_get( theplatform_metadata, lambda x: x['ratings'][0]['rating']) - auth_required = self._search_regex( - r'window\.authRequired\s*=\s*(true|false);', - webpage, 'auth required') - if auth_required == 'true': - requestor_id = self._search_regex( - r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', - webpage, 'requestor id') + if properties.get('videoCategory') == 'TVE-Auth': resource = self._get_mvpd_resource( requestor_id, title, video_id, rating) query['auth'] = self._extract_mvpd_auth( diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 9c9d77ae1..e20f00fc3 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -1,33 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( clean_html, - int_or_none, - js_to_json, try_get, unified_strdate, ) class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { 'id': '5b400b9ee338f922cb06450c', - 'title': 'Weeknight Japanese Suppers', + 'title': 'Japanese Suppers', 'ext': 'mp4', - 'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8', + 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', 'timestamp': 1523664000, 'upload_date': '20180414', - 'release_date': '20180414', + 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, - 'episode': 'Weeknight Japanese Suppers', + 'episode': 'Japanese Suppers', 'episode_number': 15, }, 'params': { @@ -36,47 +36,31 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + resource_type, video_id = re.match(self._VALID_URL, url).groups() + is_episode = resource_type == 'episode' + if is_episode: + resource_type = 'episodes' - webpage = self._download_webpage(url, video_id) - - video_data = self._parse_json( - self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', - webpage, 'initial context'), - video_id, js_to_json) - - ep_data = try_get( - video_data, - (lambda x: x['episodeDetail']['content']['data'], - lambda x: x['videoDetail']['content']['data']), dict) - ep_meta = ep_data.get('full_video', {}) - - zype_id = ep_data.get('zype_id') or ep_meta['zype_id'] - - title = ep_data.get('title') or ep_meta.get('title') - description = clean_html(ep_meta.get('episode_description') or ep_data.get( - 'description') or ep_meta.get('description')) - thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) - release_date = unified_strdate(ep_data.get('aired_at')) - - season_number = int_or_none(ep_meta.get('season_number')) - episode = ep_meta.get('title') - episode_number = int_or_none(ep_meta.get('episode_number')) + resource = self._download_json( + 'https://www.americastestkitchen.com/api/v6/%s/%s' % (resource_type, video_id), video_id) + video = resource['video'] if is_episode else resource + episode = resource if is_episode else resource.get('episode') or {} return { '_type': 'url_transparent', - 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id, + 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'release_date': release_date, - 'series': "America's Test Kitchen", - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, + 'description': clean_html(video.get('description')), + 'release_date': unified_strdate(video.get('publishDate')), + 'series': try_get(episode, lambda x: x['show']['title']), + 'episode': episode.get('title'), } diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 2f47e21c3..5b7b2dd6d 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -249,7 +249,7 @@ class ARDMediathekIE(ARDMediathekBaseIE): class ARDIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?P[0-9]+))\.html' _TESTS = [{ # available till 14.02.2019 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', @@ -263,6 +263,9 @@ class ARDIE(InfoExtractor): 'upload_date': '20180214', 'thumbnail': r're:^https?://.*\.jpg$', }, + }, { + 'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html', + 'only_matching': True, }, { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 'only_matching': True, @@ -310,9 +313,9 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/(?P[^/]+)/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' + _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', + 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', 'info_dict': { 'display_id': 'die-robuste-roswita', @@ -325,6 +328,15 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'upload_date': '20191222', 'ext': 'mp4', }, + }, { + 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', + 'only_matching': True, + }, { + 'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/', + 'only_matching': True, }, { 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', 'only_matching': True, @@ -336,7 +348,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') - display_id = mobj.group('display_id') or video_id + display_id = mobj.group('display_id') + if display_id: + display_id = display_id.rstrip('/') + if not display_id: + display_id = video_id player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 2bd3bfe8a..03abdbfaf 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -4,23 +4,57 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, int_or_none, qualities, try_get, unified_strdate, + url_or_none, ) -# There are different sources of video in arte.tv, the extraction process -# is different for each one. The videos usually expire in 7 days, so we can't -# add tests. - class ArteTVBaseIE(InfoExtractor): - def _extract_from_json_url(self, json_url, video_id, lang, title=None): - info = self._download_json(json_url, video_id) + _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' + _API_BASE = 'https://api.arte.tv/api/player/v1' + + +class ArteTVIE(ArteTVBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?arte\.tv/(?P%(langs)s)/videos| + api\.arte\.tv/api/player/v\d+/config/(?P%(langs)s) + ) + /(?P\d{6}-\d{3}-[AF]) + ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'info_dict': { + 'id': '088501-000-A', + 'ext': 'mp4', + 'title': 'Mexico: Stealing Petrol to Survive', + 'upload_date': '20190628', + }, + }, { + 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', + 'only_matching': True, + }, { + 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + lang = mobj.group('lang') or mobj.group('lang_2') + + info = self._download_json( + '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id) player_info = info['videoJsonPlayer'] vsr = try_get(player_info, lambda x: x['VSR'], dict) @@ -37,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor): if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - title = (player_info.get('VTI') or title or player_info['VID']).strip() + title = (player_info.get('VTI') or player_info['VID']).strip() subtitle = player_info.get('VSU', '').strip() if subtitle: title += ' - %s' % subtitle - info_dict = { - 'id': player_info['VID'], - 'title': title, - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(upload_date_str), - 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), - } qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) LANGS = { @@ -65,6 +92,10 @@ class ArteTVBaseIE(InfoExtractor): formats = [] for format_id, format_dict in vsr.items(): f = dict(format_dict) + format_url = url_or_none(f.get('url')) + streamer = f.get('streamer') + if not format_url and not streamer: + continue versionCode = f.get('versionCode') l = re.escape(langcode) @@ -107,6 +138,16 @@ class ArteTVBaseIE(InfoExtractor): else: lang_pref = -1 + media_type = f.get('mediaType') + if media_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + for m3u8_format in m3u8_formats: + m3u8_format['language_preference'] = lang_pref + formats.extend(m3u8_formats) + continue + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -118,7 +159,7 @@ class ArteTVBaseIE(InfoExtractor): 'quality': qfunc(f.get('quality')), } - if f.get('mediaType') == 'rtmp': + if media_type == 'rtmp': format['url'] = f['streamer'] format['play_path'] = 'mp4:' + f['url'] format['ext'] = 'flv' @@ -127,56 +168,50 @@ class ArteTVBaseIE(InfoExtractor): formats.append(format) - self._check_formats(formats, video_id) self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict + return { + 'id': player_info.get('VID') or video_id, + 'title': title, + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(upload_date_str), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + 'formats': formats, + } -class ArteTVPlus7IE(ArteTVBaseIE): - IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?Pfr|de|en|es|it|pl)/videos/(?P\d{6}-\d{3}-[AF])' - +class ArteTVEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' _TESTS = [{ - 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'info_dict': { - 'id': '088501-000-A', + 'id': '100605-013-A', 'ext': 'mp4', - 'title': 'Mexico: Stealing Petrol to Survive', - 'upload_date': '20190628', + 'title': 'United we Stream November Lockdown Edition #13', + 'description': 'md5:be40b667f45189632b78c1425c7c2ce1', + 'upload_date': '20201116', }, + }, { + 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, }] - def _real_extract(self, url): - lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url( - 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), - video_id, lang) - - -class ArteTVEmbedIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:embed' - _VALID_URL = r'''(?x) - https://www\.arte\.tv - /player/v3/index\.php\?json_url= - (?P - https?://api\.arte\.tv/api/player/v1/config/ - (?P[^/]+)/(?P\d{6}-\d{3}-[AF]) - ) - ''' - - _TESTS = [] + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<(?:iframe|script)[^>]+src=(["\'])(?P(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', + webpage)] def _real_extract(self, url): - json_url, lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url(json_url, video_id, lang) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + json_url = qs['json_url'][0] + video_id = ArteTVIE._match_id(json_url) + return self.url_result( + json_url, ie=ArteTVIE.ie_key(), video_id=video_id) class ArteTVPlaylistIE(ArteTVBaseIE): - IE_NAME = 'arte.tv:playlist' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?Pfr|de|en|es|it|pl)/videos/(?PRC-\d{6})' - + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P%s)/videos/(?PRC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', 'info_dict': { @@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', }, 'playlist_mincount': 6, + }, { + 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', + 'only_matching': True, }] def _real_extract(self, url): lang, playlist_id = re.match(self._VALID_URL, url).groups() collection = self._download_json( - 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' - % (lang, playlist_id), playlist_id) + '%s/collectionData/%s/%s?source=videos' + % (self._API_BASE, lang, playlist_id), playlist_id) + entries = [] + for video in collection['videos']: + if not isinstance(video, dict): + continue + video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) + if not video_url: + continue + video_id = video.get('programId') + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'id': video_id, + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), + 'duration': int_or_none(video.get('durationSeconds')), + 'view_count': int_or_none(video.get('views')), + 'ie_key': ArteTVIE.ie_key(), + }) title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') - entries = [ - self._extract_from_json_url( - video['jsonUrl'], video.get('programId') or playlist_id, lang) - for video in collection['videos'] if video.get('jsonUrl')] return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f14b407dc..69e673a26 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import random @@ -5,10 +6,7 @@ import re import time from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -17,30 +15,32 @@ from ..utils import ( parse_filesize, str_or_none, try_get, - unescapeHTML, update_url_query, unified_strdate, unified_timestamp, url_or_none, + urljoin, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P[^/?#&]+)' + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭", 'duration': 9.8485, + 'uploader': 'youtube-dl "\'/\\ä↭', + 'upload_date': '20121129', + 'timestamp': 1354224127, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '853e35bf34aa1d6fe2615ae612564b36', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -79,11 +79,16 @@ class BandcampIE(InfoExtractor): }, }] + def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): + return self._parse_json(self._html_search_regex( + r'data-%s=(["\'])({.+?})\1' % attr, webpage, + attr + ' data', group=2), video_id, fatal=fatal) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') + title = self._match_id(url) webpage = self._download_webpage(url, title) - thumbnail = self._html_search_meta('og:image', webpage, default=None) + tralbum = self._extract_data_attr(webpage, title) + thumbnail = self._og_search_thumbnail(webpage) track_id = None track = None @@ -91,10 +96,7 @@ class BandcampIE(InfoExtractor): duration = None formats = [] - track_info = self._parse_json( - self._search_regex( - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', - webpage, 'track info', default='{}'), title) + track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -111,37 +113,25 @@ class BandcampIE(InfoExtractor): 'abr': int_or_none(abr_str), }) track = track_info.get('title') - track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_id = str_or_none( + track_info.get('track_id') or track_info.get('id')) track_number = int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) - def extract(key): - return self._search_regex( - r'\b%s\s*["\']?\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' % key, - webpage, key, default=None, group='value') - - artist = extract('artist') - album = extract('album_title') + embed = self._extract_data_attr(webpage, title, 'embed', False) + current = tralbum.get('current') or {} + artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') timestamp = unified_timestamp( - extract('publish_date') or extract('album_publish_date')) - release_date = unified_strdate(extract('album_release_date')) + current.get('publish_date') or tralbum.get('album_publish_date')) - download_link = self._search_regex( - r'freeDownloadPage\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'download link', default=None, group='url') + download_link = tralbum.get('freeDownloadPage') if download_link: - track_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P\d+),?$', - webpage, 'track id') + track_id = compat_str(tralbum['id']) download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P{.+?})\1', download_webpage, - 'blob', group='blob'), - track_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(download_webpage, track_id, 'blob') info = try_get( blob, (lambda x: x['digital_items'][0], @@ -207,20 +197,20 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': release_date, + 'release_date': unified_strdate(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, 'track_id': track_id, 'artist': artist, - 'album': album, + 'album': embed.get('album_title'), 'formats': formats, } -class BandcampAlbumIE(InfoExtractor): +class BandcampAlbumIE(BandcampIE): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P[^.]+)\.)?bandcamp\.com(?:/album/(?P[^/?#&]+))?' + _VALID_URL = r'https?://(?:(?P[^.]+)\.)?bandcamp\.com(?:/album/(?P[^/?#&]+))?' _TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -230,7 +220,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '1353101989', 'ext': 'mp3', - 'title': 'Intro', + 'title': 'Blazo - Intro', + 'timestamp': 1311756226, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, { @@ -238,7 +231,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '38097443', 'ext': 'mp3', - 'title': 'Kero One - Keep It Alive (Blazo remix)', + 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', + 'timestamp': 1311757238, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, ], @@ -274,6 +270,7 @@ class BandcampAlbumIE(InfoExtractor): 'title': '"Entropy" EP', 'uploader_id': 'jstrecords', 'id': 'entropy-ep', + 'description': 'md5:0ff22959c943622972596062f2f366a5', }, 'playlist_mincount': 3, }, { @@ -283,6 +280,7 @@ class BandcampAlbumIE(InfoExtractor): 'id': 'we-are-the-plague', 'title': 'WE ARE THE PLAGUE', 'uploader_id': 'insulters', + 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f', }, 'playlist_count': 2, }] @@ -294,41 +292,34 @@ class BandcampAlbumIE(InfoExtractor): else super(BandcampAlbumIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('subdomain') - album_id = mobj.group('album_id') + uploader_id, album_id = re.match(self._VALID_URL, url).groups() playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - track_elements = re.findall( - r'(?s)]*>(.*?]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)', webpage) - if not track_elements: + tralbum = self._extract_data_attr(webpage, playlist_id) + track_info = tralbum.get('trackinfo') + if not track_info: raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ self.url_result( - compat_urlparse.urljoin(url, t_path), - ie=BandcampIE.ie_key(), - video_title=self._search_regex( - r']+\bitemprop=["\']name["\'][^>]*>([^<]+)', - elem_content, 'track title', fatal=False)) - for elem_content, t_path in track_elements - if self._html_search_meta('duration', elem_content, default=None)] + urljoin(url, t['title_link']), BandcampIE.ie_key(), + str_or_none(t.get('track_id') or t.get('id')), t.get('title')) + for t in track_info + if t.get('duration')] + + current = tralbum.get('current') or {} - title = self._html_search_regex( - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', - webpage, 'title', fatal=False) - if title: - title = title.replace(r'\"', '"') return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': title, + 'title': current.get('title'), + 'description': current.get('about'), 'entries': entries, } -class BandcampWeeklyIE(InfoExtractor): +class BandcampWeeklyIE(BandcampIE): IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P\d+)' _TESTS = [{ @@ -343,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': '20170404', 'series': 'Bandcamp Weekly', 'episode': 'Magic Moments', - 'episode_number': 208, 'episode_id': '224', - } + }, + 'params': { + 'format': 'opus-lo', + }, }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', 'only_matching': True }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P{.+?})\1', webpage, - 'blob', group='blob'), - video_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(webpage, show_id, 'blob') - show = blob['bcw_show'] - - # This is desired because any invalid show id redirects to `bandcamp.com` - # which happens to expose the latest Bandcamp Weekly episode. - show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + show = blob['bcw_data'][show_id] formats = [] for format_id, format_url in show['audio_stream'].items(): @@ -390,20 +375,8 @@ class BandcampWeeklyIE(InfoExtractor): if subtitle: title += ' - %s' % subtitle - episode_number = None - seq = blob.get('bcw_seq') - - if seq and isinstance(seq, list): - try: - episode_number = next( - int_or_none(e.get('episode_number')) - for e in seq - if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) - except StopIteration: - pass - return { - 'id': video_id, + 'id': show_id, 'title': title, 'description': show.get('desc') or show.get('short_desc'), 'duration': float_or_none(show.get('audio_duration')), @@ -411,7 +384,6 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': unified_strdate(show.get('published_date')), 'series': 'Bandcamp Weekly', 'episode': show.get('subtitle'), - 'episode_number': episode_number, - 'episode_id': compat_str(video_id), + 'episode_id': show_id, 'formats': formats } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 901c5a54f..54cbcdc8e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -528,7 +528,7 @@ class BBCCoUkIE(InfoExtractor): def get_programme_id(item): def get_from_attributes(item): - for p in('identifier', 'group'): + for p in ('identifier', 'group'): value = item.get(p) if value and re.match(r'^[pb][\da-z]{7}$', value): return value @@ -981,7 +981,7 @@ class BBCIE(BBCCoUkIE): group_id = self._search_regex( r']+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, webpage, 'group id', default=None) - if playlist_id: + if group_id: return self.url_result( 'https://www.bbc.co.uk/programmes/%s' % group_id, ie=BBCCoUkIE.ie_key()) @@ -1092,10 +1092,26 @@ class BBCIE(BBCCoUkIE): self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, 'bbcthree config', default='{}'), - playlist_id, transform_source=js_to_json, fatal=False) - if bbc3_config: + playlist_id, transform_source=js_to_json, fatal=False) or {} + payload = bbc3_config.get('payload') or {} + if payload: + clip = payload.get('currentClip') or {} + clip_vpid = clip.get('vpid') + clip_title = clip.get('title') + if clip_vpid and clip_title: + formats, subtitles = self._download_media_selector(clip_vpid) + self._sort_formats(formats) + return { + 'id': clip_vpid, + 'title': clip_title, + 'thumbnail': dict_get(clip, ('poster', 'imageUrl')), + 'description': clip.get('description'), + 'duration': parse_duration(clip.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } bbc3_playlist = try_get( - bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + payload, lambda x: x['content']['bbcMedia']['playlist'], dict) if bbc3_playlist: playlist_title = bbc3_playlist.get('title') or playlist_title @@ -1118,6 +1134,39 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if initial_data: + def parse_media(media): + if not media: + return + for item in (try_get(media, lambda x: x['media']['items'], list) or []): + item_id = item.get('id') + item_title = item.get('title') + if not (item_id and item_title): + continue + formats, subtitles = self._download_media_selector(item_id) + self._sort_formats(formats) + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item.get('holdingImageUrl'), + 'formats': formats, + 'subtitles': subtitles, + }) + for resp in (initial_data.get('data') or {}).values(): + name = resp.get('name') + if name == 'media-experience': + parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) + elif name == 'article': + for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + if block.get('type') != 'media': + continue + parse_media(block.get('model')) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/youtube_dl/extractor/beampro.py b/youtube_dl/extractor/beampro.py deleted file mode 100644 index 86abdae00..000000000 --- a/youtube_dl/extractor/beampro.py +++ /dev/null @@ -1,194 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - compat_str, - float_or_none, - int_or_none, - parse_iso8601, - try_get, - urljoin, -) - - -class BeamProBaseIE(InfoExtractor): - _API_BASE = 'https://mixer.com/api/v1' - _RATINGS = {'family': 0, 'teen': 13, '18+': 18} - - def _extract_channel_info(self, chan): - user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) - return { - 'uploader': chan.get('token') or try_get( - chan, lambda x: x['user']['username'], compat_str), - 'uploader_id': compat_str(user_id) if user_id else None, - 'age_limit': self._RATINGS.get(chan.get('audience')), - } - - -class BeamProLiveIE(BeamProBaseIE): - IE_NAME = 'Mixer:live' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P[^/?#&]+)' - _TEST = { - 'url': 'http://mixer.com/niterhayven', - 'info_dict': { - 'id': '261562', - 'ext': 'mp4', - 'title': 'Introducing The Witcher 3 // The Grind Starts Now!', - 'description': 'md5:0b161ac080f15fe05d18a07adb44a74d', - 'thumbnail': r're:https://.*\.jpg$', - 'timestamp': 1483477281, - 'upload_date': '20170103', - 'uploader': 'niterhayven', - 'uploader_id': '373396', - 'age_limit': 18, - 'is_live': True, - 'view_count': int, - }, - 'skip': 'niterhayven is offline', - 'params': { - 'skip_download': True, - }, - } - - _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE - - @classmethod - def suitable(cls, url): - return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url) - - def _real_extract(self, url): - channel_name = self._match_id(url) - - chan = self._download_json( - '%s/channels/%s' % (self._API_BASE, channel_name), channel_name) - - if chan.get('online') is False: - raise ExtractorError( - '{0} is offline'.format(channel_name), expected=True) - - channel_id = chan['id'] - - def manifest_url(kind): - return self._MANIFEST_URL_TEMPLATE % (channel_id, kind) - - formats = self._extract_m3u8_formats( - manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls', - fatal=False) - formats.extend(self._extract_smil_formats( - manifest_url('smil'), channel_name, fatal=False)) - self._sort_formats(formats) - - info = { - 'id': compat_str(chan.get('id') or channel_name), - 'title': self._live_title(chan.get('name') or channel_name), - 'description': clean_html(chan.get('description')), - 'thumbnail': try_get( - chan, lambda x: x['thumbnail']['url'], compat_str), - 'timestamp': parse_iso8601(chan.get('updatedAt')), - 'is_live': True, - 'view_count': int_or_none(chan.get('viewersTotal')), - 'formats': formats, - } - info.update(self._extract_channel_info(chan)) - - return info - - -class BeamProVodIE(BeamProBaseIE): - IE_NAME = 'Mixer:vod' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P[^?#&]+)' - _TESTS = [{ - 'url': 'https://mixer.com/willow8714?vod=2259830', - 'md5': 'b2431e6e8347dc92ebafb565d368b76b', - 'info_dict': { - 'id': '2259830', - 'ext': 'mp4', - 'title': 'willow8714\'s Channel', - 'duration': 6828.15, - 'thumbnail': r're:https://.*source\.png$', - 'timestamp': 1494046474, - 'upload_date': '20170506', - 'uploader': 'willow8714', - 'uploader_id': '6085379', - 'age_limit': 13, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://mixer.com/streamer?vod=IxFno1rqC0S_XJ1a2yGgNw', - 'only_matching': True, - }, { - 'url': 'https://mixer.com/streamer?vod=Rh3LY0VAqkGpEQUe2pN-ig', - 'only_matching': True, - }] - - @staticmethod - def _extract_format(vod, vod_type): - if not vod.get('baseUrl'): - return [] - - if vod_type == 'hls': - filename, protocol = 'manifest.m3u8', 'm3u8_native' - elif vod_type == 'raw': - filename, protocol = 'source.mp4', 'https' - else: - assert False - - data = vod.get('data') if isinstance(vod.get('data'), dict) else {} - - format_id = [vod_type] - if isinstance(data.get('Height'), compat_str): - format_id.append('%sp' % data['Height']) - - return [{ - 'url': urljoin(vod['baseUrl'], filename), - 'format_id': '-'.join(format_id), - 'ext': 'mp4', - 'protocol': protocol, - 'width': int_or_none(data.get('Width')), - 'height': int_or_none(data.get('Height')), - 'fps': int_or_none(data.get('Fps')), - 'tbr': int_or_none(data.get('Bitrate'), 1000), - }] - - def _real_extract(self, url): - vod_id = self._match_id(url) - - vod_info = self._download_json( - '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id) - - state = vod_info.get('state') - if state != 'AVAILABLE': - raise ExtractorError( - 'VOD %s is not available (state: %s)' % (vod_id, state), - expected=True) - - formats = [] - thumbnail_url = None - - for vod in vod_info['vods']: - vod_type = vod.get('format') - if vod_type in ('hls', 'raw'): - formats.extend(self._extract_format(vod, vod_type)) - elif vod_type == 'thumbnail': - thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png') - - self._sort_formats(formats) - - info = { - 'id': vod_id, - 'title': vod_info.get('name') or vod_id, - 'duration': float_or_none(vod_info.get('duration')), - 'thumbnail': thumbnail_url, - 'timestamp': parse_iso8601(vod_info.get('createdAt')), - 'view_count': int_or_none(vod_info.get('viewsTotal')), - 'formats': formats, - } - info.update(self._extract_channel_info(vod_info.get('channel') or {})) - - return info diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 485173774..9f9de96c6 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -25,8 +25,8 @@ class BellMediaIE(InfoExtractor): etalk| marilyn )\.ca| - much\.com - )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' + (?:much|cp24)\.com + )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' _TESTS = [{ 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', 'md5': '36d3ef559cfe8af8efe15922cd3ce950', @@ -62,6 +62,9 @@ class BellMediaIE(InfoExtractor): }, { 'url': 'http://www.etalk.ca/video?videoid=663455', 'only_matching': True, + }, { + 'url': 'https://www.cp24.com/video?clipId=1982548', + 'only_matching': True, }] _DOMAINS = { 'thecomedynetwork': 'comedy', diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py index af21e3ee5..17ebbb257 100644 --- a/youtube_dl/extractor/biqle.py +++ b/youtube_dl/extractor/biqle.py @@ -3,10 +3,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from .vk import VKIE -from ..utils import ( - HEADRequest, - int_or_none, +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, ) +from ..utils import int_or_none class BIQLEIE(InfoExtractor): @@ -47,9 +48,16 @@ class BIQLEIE(InfoExtractor): if VKIE.suitable(embed_url): return self.url_result(embed_url, VKIE.ie_key(), video_id) - self._request_webpage( - HEADRequest(embed_url), video_id, headers={'Referer': url}) - video_id, sig, _, access_token = self._get_cookies(embed_url)['video_ext'].value.split('%3A') + embed_page = self._download_webpage( + embed_url, video_id, headers={'Referer': url}) + video_ext = self._get_cookies(embed_url).get('video_ext') + if video_ext: + video_ext = compat_urllib_parse_unquote(video_ext.value) + if not video_ext: + video_ext = compat_b64decode(self._search_regex( + r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)', + embed_page, 'video_ext')).decode() + video_id, sig, _, access_token = video_ext.split(':') item = self._download_json( 'https://api.vk.com/method/video.get', video_id, headers={'User-Agent': 'okhttp/3.4.1'}, query={ diff --git a/youtube_dl/extractor/box.py b/youtube_dl/extractor/box.py new file mode 100644 index 000000000..aae82d1af --- /dev/null +++ b/youtube_dl/extractor/box.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_iso8601, + # try_get, + update_url_query, +) + + +class BoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P[^/]+)/file/(?P\d+)' + _TEST = { + 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', + 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', + 'info_dict': { + 'id': '510727257538', + 'ext': 'mp4', + 'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4', + 'uploader': 'MLS Video', + 'timestamp': 1566320259, + 'upload_date': '20190820', + 'uploader_id': '235196876', + } + } + + def _real_extract(self, url): + shared_name, file_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, file_id) + request_token = self._parse_json(self._search_regex( + r'Box\.config\s*=\s*({.+?});', webpage, + 'Box config'), file_id)['requestToken'] + access_token = self._download_json( + 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, + 'Downloading token JSON metadata', + data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ + 'Content-Type': 'application/json', + 'X-Request-Token': request_token, + 'X-Box-EndUser-API': 'sharedName=' + shared_name, + })[file_id]['read'] + shared_link = 'https://app.box.com/s/' + shared_name + f = self._download_json( + 'https://api.box.com/2.0/files/' + file_id, file_id, + 'Downloading file JSON metadata', headers={ + 'Authorization': 'Bearer ' + access_token, + 'BoxApi': 'shared_link=' + shared_link, + 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats + }, query={ + 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' + }) + title = f['name'] + + query = { + 'access_token': access_token, + 'shared_link': shared_link + } + + formats = [] + + # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): + # entry_url_template = try_get( + # entry, lambda x: x['content']['url_template']) + # if not entry_url_template: + # continue + # representation = entry.get('representation') + # if representation == 'dash': + # TODO: append query to every fragment URL + # formats.extend(self._extract_mpd_formats( + # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), + # file_id, query=query)) + + authenticated_download_url = f.get('authenticated_download_url') + if authenticated_download_url and f.get('is_download_available'): + formats.append({ + 'ext': f.get('extension') or determine_ext(title), + 'filesize': f.get('size'), + 'format_id': 'download', + 'url': update_url_query(authenticated_download_url, query), + }) + + self._sort_formats(formats) + + creator = f.get('created_by') or {} + + return { + 'id': file_id, + 'title': title, + 'formats': formats, + 'description': f.get('description') or None, + 'uploader': creator.get('name'), + 'timestamp': parse_iso8601(f.get('created_at')), + 'uploader_id': creator.get('id'), + } diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 85001b3ad..000eac71c 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -5,32 +5,34 @@ import base64 import re import struct -from .common import InfoExtractor from .adobepass import AdobePassIE +from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, + compat_HTTPError, compat_parse_qs, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, - compat_HTTPError, ) from ..utils import ( - ExtractorError, + clean_html, extract_attributes, + ExtractorError, find_xpath_attr, fix_xml_ampersands, float_or_none, - js_to_json, int_or_none, + js_to_json, + mimetype2ext, parse_iso8601, smuggle_url, + str_or_none, unescapeHTML, unsmuggle_url, - update_url_query, - clean_html, - mimetype2ext, UnsupportedError, + update_url_query, + url_or_none, ) @@ -145,7 +147,7 @@ class BrightcoveLegacyIE(InfoExtractor): ] @classmethod - def _build_brighcove_url(cls, object_str): + def _build_brightcove_url(cls, object_str): """ Build a Brightcove url from a xml string containing {params} @@ -215,7 +217,7 @@ class BrightcoveLegacyIE(InfoExtractor): return cls._make_brightcove_url(params) @classmethod - def _build_brighcove_url_from_js(cls, object_js): + def _build_brightcove_url_from_js(cls, object_js): # The layout of JS is as follows: # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { # // build Brightcove XML @@ -270,12 +272,12 @@ class BrightcoveLegacyIE(InfoExtractor): ).+?>\s*''', webpage) if matches: - return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + return list(filter(None, [cls._build_brightcove_url(m) for m in matches])) matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) if matches: return list(filter(None, [ - cls._build_brighcove_url_from_js(custom_bc) + cls._build_brightcove_url_from_js(custom_bc) for custom_bc in matches])) return [src for _, src in re.findall( r']+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] @@ -424,7 +426,7 @@ class BrightcoveNewIE(AdobePassIE): # [2] looks like: for video, script_tag, account_id, player_id, embed in re.findall( r'''(?isx) - (]*\bdata-video-id\s*=\s*['"]?[^>]+>) + (]*\bdata-video-id\s*=\s*['"]?[^>]+>) (?:.*? (', + webpage, 'hydration data', default='{}'), video_id) + + clip = try_get( + hydration_data, lambda x: x['clips'][video_id], dict) or {} + if not clip: + raise ExtractorError( + 'Could not find video information.', video_id=video_id) + + title = clip['contentTitle'] + + source_width = int_or_none(clip.get('sourceWidth')) + source_height = int_or_none(clip.get('sourceHeight')) + + aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9 + + def add_item(container, item_url, height, id_key='format_id', item_id=None): + item_id = item_id or '%dp' % height + if item_id not in item_url: + return + width = int(round(aspect_ratio * height)) + container.append({ + 'url': item_url, + id_key: item_id, + 'width': width, + 'height': height + }) + + formats = [] + thumbnails = [] + for k, v in clip.items(): + if not (v and isinstance(v, compat_str)): + continue + mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k) + if not mobj: + continue + prefix = mobj.group(1) + height = int_or_none(mobj.group(2)) + if prefix == 'contentUrl': + add_item( + formats, v, height or source_height, + item_id=None if height else 'source') + elif prefix == 'thumbnail': + add_item(thumbnails, v, height, 'id') + + error = clip.get('error') + if not formats and error: + if error == 404: + raise ExtractorError( + 'That clip does not exist.', + expected=True, video_id=video_id) + else: + raise ExtractorError( + 'An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + self._sort_formats(formats) + + # Necessary because the id of the author is not known in advance. + # Won't raise an issue if no profile can be found as this is optional. + author = try_get( + hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} + author_id = str_or_none(author.get('id')) + author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clip.get('contentDescription'), + 'uploader': author.get('displayName'), + 'timestamp': float_or_none(clip.get('created'), 1000), + 'uploader_id': author_id, + 'uploader_url': author_url, + 'duration': int_or_none(clip.get('videoLengthSeconds')), + 'view_count': int_or_none(clip.get('views')), + 'like_count': int_or_none(clip.get('likes')), + 'comment_count': int_or_none(clip.get('comments')), + } diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 933df1495..2c16fc9e2 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -23,7 +23,7 @@ class MediasetIE(ThePlatformBaseIE): https?:// (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: - (?:video|on-demand)/(?:[^/]+/)+[^/]+_| + (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| player/index\.html\?.*?\bprogramGuid= ) )(?P[0-9A-Z]{16,}) @@ -88,6 +88,9 @@ class MediasetIE(ThePlatformBaseIE): }, { 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135', 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index 71fc3ec56..cab3aa045 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -17,9 +17,8 @@ from ..utils import ( class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P\d+)\.html' + _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P\d+)\.html' IE_DESC = '芒果TV' - _GEO_COUNTRIES = ['CN'] _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', @@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor): }, { 'url': 'http://www.mgtv.com/b/301817/3826653.html', 'only_matching': True, + }, { + 'url': 'https://w.mgtv.com/b/301817/3826653.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ - 'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: @@ -56,6 +59,7 @@ class MGTVIE(InfoExtractor): stream_data = self._download_json( 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ 'pm2': api_data['atc']['pm2'], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] stream_domain = stream_data['stream_domain'][0] diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index 1c652813a..5234cac02 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -1,5 +1,8 @@ from __future__ import unicode_literals +import re + +from .common import InfoExtractor from ..utils import ( int_or_none, str_to_int, @@ -54,3 +57,23 @@ class MofosexIE(KeezMoviesIE): }) return info + + +class MofosexEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P\d+)' + _TESTS = [{ + 'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'http://www.mofosex.com/videos/{0}/{0}.html'.format(video_id), + ie=MofosexIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 43fd70f11..b1615b4d8 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -26,7 +26,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], 'upload_date': '20100913', 'uploader_id': 'famouslyfuckedup', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, } }, { @@ -40,7 +40,7 @@ class MotherlessIE(InfoExtractor): 'game', 'hairy'], 'upload_date': '20140622', 'uploader_id': 'Sulivana7x', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, }, 'skip': '404', @@ -54,7 +54,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['superheroine heroine superher'], 'upload_date': '20140827', 'uploader_id': 'shade0230', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, } }, { @@ -76,7 +76,8 @@ class MotherlessIE(InfoExtractor): raise ExtractorError('Video %s is for friends only' % video_id, expected=True) title = self._html_search_regex( - r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') + (r'(?s)]+\bclass=["\']media-meta-title[^>]+>(.+?)', + r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title') video_url = (self._html_search_regex( (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', r'fileurl\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), @@ -84,14 +85,15 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - r'Views\s+([^<]+)<', + (r'>(\d+)\s+Views<', r'Views\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - r'Favorited\s+([^<]+)<', + (r'>(\d+)\s+Favorites<', r'Favorited\s+([^<]+)<'), webpage, 'like count', fatal=False)) upload_date = self._html_search_regex( - r'Uploaded\s+([^<]+)<', webpage, 'upload date') + (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', + r'Uploaded\s+([^<]+)<'), webpage, 'upload date') if 'Ago' in upload_date: days = int(re.search(r'([0-9]+)', upload_date).group(1)) upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index fedd5f46b..df1034fc5 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -349,6 +349,18 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] + @staticmethod + def extract_child_with_type(parent, t): + children = parent['children'] + return next(c for c in children if c.get('type') == t) + + def _extract_mgid(self, webpage): + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self.extract_child_with_type(data, 'MainContainer') + video_player = self.extract_child_with_type(main_container, 'VideoPlayer') + return video_player['props']['media']['video']['config']['uri'] + class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 6f3cb3003..ea5f5a315 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -10,7 +10,6 @@ from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, - js_to_json, parse_duration, smuggle_url, try_get, @@ -394,8 +393,8 @@ class NBCNewsIE(ThePlatformIE): webpage = self._download_webpage(url, video_id) data = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({.+});', webpage, - 'bootstrap json'), video_id, js_to_json) + r'', + webpage, 'bootstrap json'), video_id)['props']['initialState'] video_data = try_get(data, lambda x: x['video']['current'], dict) if not video_data: video_data = data['article']['content'][0]['primaryMedia']['video'] diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 2447c812e..ddd828d92 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -81,6 +81,29 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + # with subtitles + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'extra18674', + 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'uploader': 'ndrtv', + 'upload_date': '20201113', + 'duration': 1749, + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': r're:^https://www\.ndr\.de.+', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', 'only_matching': True, @@ -239,6 +262,20 @@ class NDREmbedBaseIE(InfoExtractor): 'preference': quality_key(thumbnail.get('quality')), }) + subtitles = {} + tracks = config.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + track_url = urljoin(url, track.get('src')) + if not track_url: + continue + subtitles.setdefault(track.get('srclang') or 'de', []).append({ + 'url': track_url, + 'ext': 'ttml', + }) + return { 'id': video_id, 'title': title, @@ -248,6 +285,7 @@ class NDREmbedBaseIE(InfoExtractor): 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index de6a707c4..6a61a47d2 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -10,7 +10,7 @@ class NhkVodIE(InfoExtractor): # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ - # clip + # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '256a1be14f48d960a7e61e2532d95ec3', 'info_dict': { @@ -21,6 +21,19 @@ class NhkVodIE(InfoExtractor): 'timestamp': 1565965194, 'upload_date': '20190816', }, + }, { + # audio clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + 'info_dict': { + 'id': 'r_inventions-20201104-1-en', + 'ext': 'm4a', + 'title': "Japan's Top Inventions - Miniature Video Cameras", + 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 2850af5db..47b9748f0 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + determine_ext, int_or_none, js_to_json, qualities, @@ -33,42 +34,76 @@ class NovaEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - bitrates = self._parse_json( - self._search_regex( - r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), - video_id, transform_source=js_to_json) - - QUALITIES = ('lq', 'mq', 'hq', 'hd') - quality_key = qualities(QUALITIES) - + duration = None formats = [] - for format_id, format_list in bitrates.items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_url in format_list: - format_url = url_or_none(format_url) - if not format_url: - continue - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue - f = { - 'url': format_url, - } - f_id = format_id - for quality in QUALITIES: - if '%s.mp4' % quality in format_url: - f_id += '-%s' % quality - f.update({ - 'quality': quality_key(quality), - 'format_note': quality.upper(), + + player = self._parse_json( + self._search_regex( + r'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;', + webpage, 'player', default='{}'), video_id, fatal=False) + if player: + for format_id, format_list in player['tracks'].items(): + if not isinstance(format_list, list): + format_list = [format_list] + for format_dict in format_list: + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('src')) + format_type = format_dict.get('type') + ext = determine_ext(format_url) + if (format_type == 'application/x-mpegURL' + or format_id == 'HLS' or ext == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + elif (format_type == 'application/dash+xml' + or format_id == 'DASH' or ext == 'mpd'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, }) - break - f['format_id'] = f_id - formats.append(f) + duration = int_or_none(player.get('duration')) + else: + # Old path, not actual as of 08.04.2020 + bitrates = self._parse_json( + self._search_regex( + r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), + video_id, transform_source=js_to_json) + + QUALITIES = ('lq', 'mq', 'hq', 'hd') + quality_key = qualities(QUALITIES) + + for format_id, format_list in bitrates.items(): + if not isinstance(format_list, list): + format_list = [format_list] + for format_url in format_list: + format_url = url_or_none(format_url) + if not format_url: + continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + continue + f = { + 'url': format_url, + } + f_id = format_id + for quality in QUALITIES: + if '%s.mp4' % quality in format_url: + f_id += '-%s' % quality + f.update({ + 'quality': quality_key(quality), + 'format_note': quality.upper(), + }) + break + f['format_id'] = f_id + formats.append(f) + self._sort_formats(formats) title = self._og_search_title( @@ -81,7 +116,8 @@ class NovaEmbedIE(InfoExtractor): r'poster\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'thumbnail', fatal=False, group='value') duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', + default=duration)) return { 'id': video_id, diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index 53acc6e57..9d1122f0c 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -33,7 +33,7 @@ class NprIE(InfoExtractor): }, }], }, { - # mutlimedia, not media title + # multimedia, not media title 'url': 'https://www.npr.org/2017/06/19/533198237/tigers-jaw-tiny-desk-concert', 'info_dict': { 'id': '533198237', diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 94115534b..fdf2d7407 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools +import random import re from .common import InfoExtractor @@ -9,25 +11,289 @@ from ..compat import ( compat_urllib_parse_unquote, ) from ..utils import ( + determine_ext, ExtractorError, int_or_none, - JSON_LD_RE, - js_to_json, - NO_DEFAULT, parse_age_limit, parse_duration, try_get, + urljoin, + url_or_none, ) class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + _CDN_REPL_REGEX = r'''(?x):// + (?: + nrkod\d{1,2}-httpcache0-47115-cacheod0\.dna\.ip-only\.net/47115-cacheod0| + nrk-od-no\.telenorcdn\.net| + minicdn-od\.nrk\.no/od/nrkhd-osl-rr\.netwerk\.no/no + )/''' - _api_host = None + def _extract_nrk_formats(self, asset_url, video_id): + if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): + return self._extract_akamai_formats( + re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id) + asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) + formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) + if not formats and re.search(self._CDN_REPL_REGEX, asset_url): + formats = self._extract_m3u8_formats( + re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + return formats + + def _raise_error(self, data): + MESSAGES = { + 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', + 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'NoProgramRights': 'Ikke tilgjengelig', + 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + } + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: + self.raise_geo_restricted( + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) + message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + + def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): + return self._download_json( + urljoin('http://psapi.nrk.no/', path), + video_id, note or 'Downloading %s JSON' % item, + fatal=fatal, query=query) + + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'mp4', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }] + + def _extract_from_playback(self, video_id): + path_templ = 'playback/%s/' + video_id + + def call_playback_api(item, query=None): + return self._call_api(path_templ % item, video_id, item, query=query) + # known values for preferredCdn: akamai, iponly, minicdn and telenor + manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + + if manifest.get('playability') == 'nonPlayable': + self._raise_error(manifest['nonPlayable']) + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_nrk_formats(format_url, video_id)) + self._sort_formats(formats) + + data = call_playback_api('metadata') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('pixelWidth')), + 'height': int_or_none(image.get('pixelHeight')), + }) + + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } def _real_extract(self, url): video_id = self._match_id(url) + return self._extract_from_playback(video_id) + +class NRKTVIE(NRKBaseIE): + IE_DESC = 'NRK TV and NRK Radio' + _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE + _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') + _TESTS = [{ + 'url': 'https://tv.nrk.no/program/MDDP12000117', + 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', + 'info_dict': { + 'id': 'MDDP12000117AA', + 'ext': 'mp4', + 'title': 'Alarm Trolltunga', + 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', + 'duration': 2223.44, + 'age_limit': 6, + }, + }, { + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'md5': '8d40dab61cea8ab0114e090b029a0565', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + }, { + 'url': 'https://tv.nrk.no/program/mdfp15000514', + 'info_dict': { + 'id': 'MDFP15000514CA', + 'ext': 'mp4', + 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', + 'duration': 4605.08, + 'series': 'Kunnskapskanalen', + 'episode': '24.05.2014', + }, + 'params': { + 'skip_download': True, + }, + }, { + # single playlist video + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'info_dict': { + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + 'skip': 'particular part is not supported currently', + }, { + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'info_dict': { + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', + 'info_dict': { + 'id': 'KMTE50001317AA', + 'ext': 'mp4', + 'title': 'Anno 13:30', + 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', + 'duration': 2340, + 'series': 'Anno', + 'episode': '13:30', + 'season_number': 3, + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', + 'info_dict': { + 'id': 'MUHH46000317AA', + 'ext': 'mp4', + 'title': 'Nytt på Nytt 27.01.2017', + 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', + 'duration': 1796, + 'series': 'Nytt på nytt', + 'episode': '27.01.2017', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'ProgramRightsHasExpired', + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201507/NPUB21019315', + 'only_matching': True, + }] + + _api_host = None + + def _extract_from_mediaelement(self, video_id): api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS for api_host in api_hosts: @@ -43,6 +309,7 @@ class NRKBaseIE(InfoExtractor): title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id + urls = [] entries = [] conviva = data.get('convivaStatistics') or {} @@ -59,19 +326,14 @@ class NRKBaseIE(InfoExtractor): else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) for num, asset in enumerate(media_assets, 1): asset_url = asset.get('url') - if not asset_url: + if not asset_url or asset_url in urls: continue - formats = self._extract_akamai_formats(asset_url, video_id) + urls.append(asset_url) + formats = self._extract_nrk_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) - # Some f4m streams may not work with hdcore in fragments' URLs - for f in formats: - extra_param = f.get('extra_param_to_segment_url') - if extra_param and 'hdcore' in extra_param: - del f['extra_param_to_segment_url'] - entry_id, entry_title = video_id_and_title(num) duration = parse_duration(asset.get('duration')) subtitles = {} @@ -87,38 +349,26 @@ class NRKBaseIE(InfoExtractor): 'duration': duration, 'subtitles': subtitles, 'formats': formats, + 'is_live': live, }) if not entries: media_url = data.get('mediaUrl') - if media_url: - formats = self._extract_akamai_formats(media_url, video_id) - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - }] + if media_url and media_url not in urls: + formats = self._extract_nrk_formats(media_url, video_id) + if formats: + self._sort_formats(formats) + duration = parse_duration(data.get('duration')) + entries = [{ + 'id': video_id, + 'title': make_title(title), + 'duration': duration, + 'formats': formats, + 'is_live': live, + }] if not entries: - MESSAGES = { - 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', - 'ProgramRightsHasExpired': 'Programmet har gått ut', - 'NoProgramRights': 'Ikke tilgjengelig', - 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - } - message_type = data.get('messageType', '') - # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type: - self.raise_geo_restricted( - msg=MESSAGES.get('ProgramIsGeoBlocked'), - countries=self._GEO_COUNTRIES) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, MESSAGES.get( - message_type, message_type)), - expected=True) + self._raise_error(data) series = conviva.get('seriesName') or data.get('seriesTitle') episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') @@ -196,190 +446,9 @@ class NRKBaseIE(InfoExtractor): return self.playlist_result(entries, video_id, title, description) - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/PS\*| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P[^?#&]+) - ''' - _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') - _TESTS = [{ - # video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '706f34cdf1322577589e369e522b50ef', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'flv', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }] - - -class NRKTVIE(NRKBaseIE): - IE_DESC = 'NRK TV and NRK Radio' - _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' - _VALID_URL = r'''(?x) - https?:// - (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,2}|program)/ - (?![Ee]pisodes)%s - (?:/\d{2}-\d{2}-\d{4})? - (?:\#del=(?P\d+))? - ''' % _EPISODE_RE - _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') - _TESTS = [{ - 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', - 'info_dict': { - 'id': 'MDDP12000117AA', - 'ext': 'mp4', - 'title': 'Alarm Trolltunga', - 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, - 'age_limit': 6, - }, - }, { - 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '9a167e54d04671eb6317a37b7bc8a280', - 'info_dict': { - 'id': 'MUHH48000314AA', - 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', - 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', - 'duration': 1741, - 'series': '20 spørsmål', - 'episode': '23.05.2014', - }, - 'skip': 'NoProgramRights', - }, { - 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'info_dict': { - 'id': 'MDFP15000514CA', - 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', - 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 'duration': 4605, - 'series': 'Kunnskapskanalen', - 'episode': '24.05.2014', - }, - 'params': { - 'skip_download': True, - }, - }, { - # single playlist video - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Video is geo restricted'], - 'skip': 'particular part is not supported currently', - }, { - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [{ - 'info_dict': { - 'id': 'MSPO40010515AH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 772, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': 'MSPO40010515BH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 6175, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }], - 'info_dict': { - 'id': 'MSPO40010515', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - }, - 'expected_warnings': ['Video is geo restricted'], - }, { - 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', - 'info_dict': { - 'id': 'KMTE50001317AA', - 'ext': 'mp4', - 'title': 'Anno 13:30', - 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', - 'duration': 2340, - 'series': 'Anno', - 'episode': '13:30', - 'season_number': 3, - 'episode_number': 13, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', - 'info_dict': { - 'id': 'MUHH46000317AA', - 'ext': 'mp4', - 'title': 'Nytt på Nytt 27.01.2017', - 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', - 'duration': 1796, - 'series': 'Nytt på nytt', - 'episode': '27.01.2017', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', - 'only_matching': True, - }] + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_from_mediaelement(video_id) class NRKTVEpisodeIE(InfoExtractor): @@ -425,66 +494,114 @@ class NRKTVEpisodeIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - nrk_id = self._parse_json( - self._search_regex(JSON_LD_RE, webpage, 'JSON-LD', group='json_ld'), - display_id)['@id'] - + info = self._search_json_ld(webpage, display_id, default={}) + nrk_id = info.get('@id') or self._html_search_meta( + 'nrk:program-id', webpage, default=None) or self._search_regex( + r'data-program-id=["\'](%s)' % NRKTVIE._EPISODE_RE, webpage, + 'nrk id') assert re.match(NRKTVIE._EPISODE_RE, nrk_id) - return self.url_result( - 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id) + + info.update({ + '_type': 'url_transparent', + 'id': nrk_id, + 'url': 'nrk:%s' % nrk_id, + 'ie_key': NRKIE.ie_key(), + }) + return info -class NRKTVSerieBaseIE(InfoExtractor): - def _extract_series(self, webpage, display_id, fatal=True): - config = self._parse_json( - self._search_regex( - (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*'), - webpage, 'config', default='{}' if not fatal else NO_DEFAULT), - display_id, fatal=False, transform_source=js_to_json) - if not config: - return - return try_get( - config, - (lambda x: x['initialState']['series'], lambda x: x['series']), - dict) - - def _extract_seasons(self, seasons): - if not isinstance(seasons, list): - return [] - entries = [] - for season in seasons: - entries.extend(self._extract_episodes(season)) - return entries - - def _extract_episodes(self, season): - if not isinstance(season, dict): - return [] - return self._extract_entries(season.get('episodes')) - +class NRKTVSerieBaseIE(NRKBaseIE): def _extract_entries(self, entry_list): if not isinstance(entry_list, list): return [] entries = [] for episode in entry_list: - nrk_id = episode.get('prfId') + nrk_id = episode.get('prfId') or episode.get('episodeId') if not nrk_id or not isinstance(nrk_id, compat_str): continue + if not re.match(NRKTVIE._EPISODE_RE, nrk_id): + continue entries.append(self.url_result( 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries + _ASSETS_KEYS = ('episodes', 'instalments',) + + def _extract_assets_key(self, embedded): + for asset_key in self._ASSETS_KEYS: + if embedded.get(asset_key): + return asset_key + + def _entries(self, data, display_id): + for page_num in itertools.count(1): + embedded = data.get('_embedded') or data + if not isinstance(embedded, dict): + break + assets_key = self._extract_assets_key(embedded) + if not assets_key: + break + # Extract entries + entries = try_get( + embedded, + (lambda x: x[assets_key]['_embedded'][assets_key], + lambda x: x[assets_key]), + list) + for e in self._extract_entries(entries): + yield e + # Find next URL + next_url_path = try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][assets_key]['_links']['next']['href']), + compat_str) + if not next_url_path: + break + data = self._call_api( + next_url_path, display_id, + note='Downloading %s JSON page %d' % (assets_key, page_num), + fatal=False) + if not data: + break + class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P\d+)' - _TEST = { + _VALID_URL = r'https?://(?Ptv|radio)\.nrk\.no/serie/(?P[^/]+)/(?:sesong/)?(?P\d+)' + _TESTS = [{ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', 'info_dict': { - 'id': '1', + 'id': 'backstage/1', 'title': 'Sesong 1', }, 'playlist_mincount': 30, - } + }, { + # no /sesong/ in path + 'url': 'https://tv.nrk.no/serie/lindmo/2016', + 'info_dict': { + 'id': 'lindmo/2016', + 'title': '2016', + }, + 'playlist_mincount': 29, + }, { + # weird nested _embedded in catalog JSON response + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens/sesong/1', + 'info_dict': { + 'id': 'dickie-dick-dickens/1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 11, + }, { + # 841 entries, multi page + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201509', + 'info_dict': { + 'id': 'dagsnytt/201509', + 'title': 'September 2015', + }, + 'playlist_mincount': 841, + }, { + # 180 entries, single page + 'url': 'https://tv.nrk.no/serie/spangas/sesong/1', + 'only_matching': True, + }] @classmethod def suitable(cls, url): @@ -492,25 +609,35 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): else super(NRKTVSeasonIE, cls).suitable(url)) def _real_extract(self, url): - display_id = self._match_id(url) + domain, serie, season_id = re.match(self._VALID_URL, url).groups() + display_id = '%s/%s' % (serie, season_id) - webpage = self._download_webpage(url, display_id) + data = self._call_api( + '%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), + display_id, 'season', query={'pageSize': 50}) - series = self._extract_series(webpage, display_id) - - season = next( - s for s in series['seasons'] - if int(display_id) == s.get('seasonNumber')) - - title = try_get(season, lambda x: x['titles']['title'], compat_str) + title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( - self._extract_episodes(season), display_id, title) + self._entries(data, display_id), + display_id, title) class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' - _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' + _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/serie/(?P[^/]+)' _TESTS = [{ + # new layout, instalments + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 90, + }, { + # new layout, instalments, more entries + 'url': 'https://tv.nrk.no/serie/lindmo', + 'only_matching': True, + }, { 'url': 'https://tv.nrk.no/serie/blank', 'info_dict': { 'id': 'blank', @@ -524,25 +651,16 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'info_dict': { 'id': 'backstage', 'title': 'Backstage', - 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3', + 'description': 'md5:63692ceb96813d9a207e9910483d948b', }, 'playlist_mincount': 60, - }, { - # new layout, instalments - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'info_dict': { - 'id': 'groenn-glede', - 'title': 'Grønn glede', - 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', - }, - 'playlist_mincount': 10, }, { # old layout 'url': 'https://tv.nrksuper.no/serie/labyrint', 'info_dict': { 'id': 'labyrint', 'title': 'Labyrint', - 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', + 'description': 'I Daidalos sin undersjøiske Labyrint venter spennende oppgaver, skumle robotskapninger og slim.', }, 'playlist_mincount': 3, }, { @@ -554,6 +672,17 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, { 'url': 'https://tv.nrk.no/serie/postmann-pat', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', + 'info_dict': { + 'id': 'dickie-dick-dickens', + 'title': 'Dickie Dick Dickens', + 'description': 'md5:19e67411ffe57f7dce08a943d7a0b91f', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://nrksuper.no/serie/labyrint', + 'only_matching': True, }] @classmethod @@ -564,43 +693,42 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - series_id = self._match_id(url) + site, series_id = re.match(self._VALID_URL, url).groups() + is_radio = site == 'radio.nrk' + domain = 'radio' if is_radio else 'tv' - webpage = self._download_webpage(url, series_id) + size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' + series = self._call_api( + '%s/catalog/series/%s' % (domain, series_id), + series_id, 'serie', query={size_prefix + 'ageSize': 50}) + titles = try_get(series, [ + lambda x: x['titles'], + lambda x: x[x['type']]['titles'], + lambda x: x[x['seriesType']]['titles'], + ]) or {} - # New layout (e.g. https://tv.nrk.no/serie/backstage) - series = self._extract_series(webpage, series_id, fatal=False) - if series: - title = try_get(series, lambda x: x['titles']['title'], compat_str) - description = try_get( - series, lambda x: x['titles']['subtitle'], compat_str) - entries = [] - entries.extend(self._extract_seasons(series.get('seasons'))) - entries.extend(self._extract_entries(series.get('instalments'))) - entries.extend(self._extract_episodes(series.get('extraMaterial'))) - return self.playlist_result(entries, series_id, title, description) + entries = [] + entries.extend(self._entries(series, series_id)) + embedded = series.get('_embedded') or {} + linked_seasons = try_get(series, lambda x: x['_links']['seasons']) or [] + embedded_seasons = embedded.get('seasons') or [] + if len(linked_seasons) > len(embedded_seasons): + for season in linked_seasons: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + entries.append(self.url_result( + 'https://%s.nrk.no/serie/%s/sesong/%s' + % (domain, series_id, season_name), + ie=NRKTVSeasonIE.ie_key(), + video_title=season.get('title'))) + else: + for season in embedded_seasons: + entries.extend(self._entries(season, series_id)) + entries.extend(self._entries( + embedded.get('extraMaterial') or {}, series_id)) - # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) - entries = [ - self.url_result( - 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, season=season_id)) - for season_id in re.findall(self._ITEM_RE, webpage) - ] - - title = self._html_search_meta( - 'seriestitle', webpage, - 'title', default=None) or self._og_search_title( - webpage, fatal=False) - if title: - title = self._search_regex( - r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) - - description = self._html_search_meta( - 'series_description', webpage, - 'description', default=None) or self._og_search_description(webpage) - - return self.playlist_result(entries, series_id, title, description) + return self.playlist_result( + entries, series_id, titles.get('title'), titles.get('subtitle')) class NRKTVDirekteIE(NRKTVIE): @@ -704,14 +832,8 @@ class NRKSkoleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://mimir.nrk.no/plugin/1.0/static?mediaId=%s' % video_id, - video_id) - - nrk_id = self._parse_json( - self._search_regex( - r'', - webpage, 'application json'), - video_id)['activeMedia']['psId'] + nrk_id = self._download_json( + 'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/%s' % video_id, + video_id)['psId'] return self.url_result('nrk:%s' % nrk_id) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index fc78ca56c..976b1c694 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -221,3 +221,41 @@ class NYTimesArticleIE(NYTimesBaseIE): r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), webpage, 'podcast data') return self._extract_podcast_from_json(podcast_data, page_id, webpage) + + +class NYTimesCookingIE(NYTimesBaseIE): + _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P\d+)' + _TESTS = [{ + 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', + 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', + 'info_dict': { + 'id': '100000004756089', + 'ext': 'mov', + 'timestamp': 1479383008, + 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', + 'title': 'Cranberry Tart', + 'upload_date': '20161117', + 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', + }, + }, { + 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', + 'md5': '4b2e8c70530a89b8d905a2b572316eb8', + 'info_dict': { + 'id': '100000003951728', + 'ext': 'mov', + 'timestamp': 1445509539, + 'description': 'Turkey guide', + 'upload_date': '20151022', + 'title': 'Turkey', + } + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + video_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'video id') + + return self._extract_video_from_id(video_id) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index d54b8ace6..700ce448c 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -162,13 +162,12 @@ class ORFTVthekIE(InfoExtractor): class ORFRadioIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - station = mobj.group('station') show_date = mobj.group('date') show_id = mobj.group('show') data = self._download_json( 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' - % (station, show_id, show_date), show_id) + % (self._API_STATION, show_id, show_date), show_id) entries = [] for info in data['streams']: @@ -183,7 +182,7 @@ class ORFRadioIE(InfoExtractor): duration = end - start if end and start else None entries.append({ 'id': loop_stream_id.replace('.mp3', ''), - 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, loop_stream_id), + 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id), 'title': title, 'description': clean_html(data.get('subtitle')), 'duration': duration, @@ -205,6 +204,8 @@ class ORFFM4IE(ORFRadioIE): IE_NAME = 'orf:fm4' IE_DESC = 'radio FM4' _VALID_URL = r'https?://(?Pfm4)\.orf\.at/player/(?P[0-9]+)/(?P4\w+)' + _API_STATION = 'fm4' + _LOOP_STATION = 'fm4' _TEST = { 'url': 'http://fm4.orf.at/player/20170107/4CC', @@ -223,10 +224,142 @@ class ORFFM4IE(ORFRadioIE): } +class ORFNOEIE(ORFRadioIE): + IE_NAME = 'orf:noe' + IE_DESC = 'Radio Niederösterreich' + _VALID_URL = r'https?://(?Pnoe)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'noe' + _LOOP_STATION = 'oe2n' + + _TEST = { + 'url': 'https://noe.orf.at/player/20200423/NGM', + 'only_matching': True, + } + + +class ORFWIEIE(ORFRadioIE): + IE_NAME = 'orf:wien' + IE_DESC = 'Radio Wien' + _VALID_URL = r'https?://(?Pwien)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'wie' + _LOOP_STATION = 'oe2w' + + _TEST = { + 'url': 'https://wien.orf.at/player/20200423/WGUM', + 'only_matching': True, + } + + +class ORFBGLIE(ORFRadioIE): + IE_NAME = 'orf:burgenland' + IE_DESC = 'Radio Burgenland' + _VALID_URL = r'https?://(?Pburgenland)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'bgl' + _LOOP_STATION = 'oe2b' + + _TEST = { + 'url': 'https://burgenland.orf.at/player/20200423/BGM', + 'only_matching': True, + } + + +class ORFOOEIE(ORFRadioIE): + IE_NAME = 'orf:oberoesterreich' + IE_DESC = 'Radio Oberösterreich' + _VALID_URL = r'https?://(?Pooe)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'ooe' + _LOOP_STATION = 'oe2o' + + _TEST = { + 'url': 'https://ooe.orf.at/player/20200423/OGMO', + 'only_matching': True, + } + + +class ORFSTMIE(ORFRadioIE): + IE_NAME = 'orf:steiermark' + IE_DESC = 'Radio Steiermark' + _VALID_URL = r'https?://(?Psteiermark)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'stm' + _LOOP_STATION = 'oe2st' + + _TEST = { + 'url': 'https://steiermark.orf.at/player/20200423/STGMS', + 'only_matching': True, + } + + +class ORFKTNIE(ORFRadioIE): + IE_NAME = 'orf:kaernten' + IE_DESC = 'Radio Kärnten' + _VALID_URL = r'https?://(?Pkaernten)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'ktn' + _LOOP_STATION = 'oe2k' + + _TEST = { + 'url': 'https://kaernten.orf.at/player/20200423/KGUMO', + 'only_matching': True, + } + + +class ORFSBGIE(ORFRadioIE): + IE_NAME = 'orf:salzburg' + IE_DESC = 'Radio Salzburg' + _VALID_URL = r'https?://(?Psalzburg)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'sbg' + _LOOP_STATION = 'oe2s' + + _TEST = { + 'url': 'https://salzburg.orf.at/player/20200423/SGUM', + 'only_matching': True, + } + + +class ORFTIRIE(ORFRadioIE): + IE_NAME = 'orf:tirol' + IE_DESC = 'Radio Tirol' + _VALID_URL = r'https?://(?Ptirol)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'tir' + _LOOP_STATION = 'oe2t' + + _TEST = { + 'url': 'https://tirol.orf.at/player/20200423/TGUMO', + 'only_matching': True, + } + + +class ORFVBGIE(ORFRadioIE): + IE_NAME = 'orf:vorarlberg' + IE_DESC = 'Radio Vorarlberg' + _VALID_URL = r'https?://(?Pvorarlberg)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'vbg' + _LOOP_STATION = 'oe2v' + + _TEST = { + 'url': 'https://vorarlberg.orf.at/player/20200423/VGUM', + 'only_matching': True, + } + + +class ORFOE3IE(ORFRadioIE): + IE_NAME = 'orf:oe3' + IE_DESC = 'Radio Österreich 3' + _VALID_URL = r'https?://(?Poe3)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'oe3' + _LOOP_STATION = 'oe3' + + _TEST = { + 'url': 'https://oe3.orf.at/player/20200424/3WEK', + 'only_matching': True, + } + + class ORFOE1IE(ORFRadioIE): IE_NAME = 'orf:oe1' IE_DESC = 'Radio Österreich 1' _VALID_URL = r'https?://(?Poe1)\.orf\.at/player/(?P[0-9]+)/(?P\w+)' + _API_STATION = 'oe1' + _LOOP_STATION = 'oe1' _TEST = { 'url': 'http://oe1.orf.at/player/20170108/456544', diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 4dbe661be..d4baa16ee 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -477,7 +477,7 @@ class PBSIE(InfoExtractor): if media_id: return media_id, presumptive_id, upload_date, description - # Fronline video embedded via flp + # Frontline video embedded via flp video_id = self._search_regex( r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) if video_id: diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 48fb95416..c39d12728 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -541,6 +541,10 @@ class PeerTubeIE(InfoExtractor): 'format_id': format_id, 'filesize': file_size, }) + if format_id == '0p': + f['vcodec'] = 'none' + else: + f['fps'] = int_or_none(file_.get('fps')) formats.append(f) self._sort_formats(formats) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index c02e34aba..b15906390 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -18,7 +18,7 @@ class PeriscopeBaseIE(InfoExtractor): item_id, query=query) def _parse_broadcast_data(self, broadcast, video_id): - title = broadcast['status'] + title = broadcast.get('status') or 'Periscope Broadcast' uploader = broadcast.get('user_display_name') or broadcast.get('username') title = '%s - %s' % (uploader, title) if uploader else title is_live = broadcast.get('state').lower() == 'running' diff --git a/youtube_dl/extractor/pinterest.py b/youtube_dl/extractor/pinterest.py new file mode 100644 index 000000000..b249c9eda --- /dev/null +++ b/youtube_dl/extractor/pinterest.py @@ -0,0 +1,201 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class PinterestBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + + def _call_api(self, resource, video_id, options): + return self._download_json( + 'https://www.pinterest.com/resource/%sResource/get/' % resource, + video_id, 'Download %s JSON metadata' % resource, query={ + 'data': json.dumps({'options': options}) + })['resource_response'] + + def _extract_video(self, data, extract_formats=True): + video_id = data['id'] + + title = (data.get('title') or data.get('grid_title') or video_id).strip() + + formats = [] + duration = None + if extract_formats: + for format_id, format_dict in data['videos']['video_list'].items(): + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('url')) + if not format_url: + continue + duration = float_or_none(format_dict.get('duration'), scale=1000) + ext = determine_ext(format_url) + if 'hls' in format_id.lower() or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'width': int_or_none(format_dict.get('width')), + 'height': int_or_none(format_dict.get('height')), + 'duration': duration, + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + description = data.get('description') or data.get('description_html') or data.get('seo_description') + timestamp = unified_timestamp(data.get('created_at')) + + def _u(field): + return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) + + uploader = _u('full_name') + uploader_id = _u('id') + + repost_count = int_or_none(data.get('repin_count')) + comment_count = int_or_none(data.get('comment_count')) + categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) + tags = data.get('hashtags') + + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'categories': categories, + 'tags': tags, + 'formats': formats, + 'extractor_key': PinterestIE.ie_key(), + } + + +class PinterestIE(PinterestBaseIE): + _VALID_URL = r'%s/pin/(?P\d+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.com/pin/664281013778109217/', + 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', + 'info_dict': { + 'id': '664281013778109217', + 'ext': 'mp4', + 'title': 'Origami', + 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', + 'duration': 57.7, + 'timestamp': 1593073622, + 'upload_date': '20200625', + 'uploader': 'Love origami -I am Dafei', + 'uploader_id': '586523688879454212', + 'repost_count': 50, + 'comment_count': 0, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://co.pinterest.com/pin/824721750502199491/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api( + 'Pin', video_id, { + 'field_set_key': 'unauth_react_main_pin', + 'id': video_id, + })['data'] + return self._extract_video(data) + + +class PinterestCollectionIE(PinterestBaseIE): + _VALID_URL = r'%s/(?P[^/]+)/(?P[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', + 'info_dict': { + 'id': '585890301462791043', + 'title': 'cool diys', + }, + 'playlist_count': 8, + }, { + 'url': 'https://www.pinterest.ca/fudohub/videos/', + 'info_dict': { + 'id': '682858430939307450', + 'title': 'VIDEOS', + }, + 'playlist_mincount': 365, + 'skip': 'Test with extract_formats=False', + }] + + @classmethod + def suitable(cls, url): + return False if PinterestIE.suitable(url) else super( + PinterestCollectionIE, cls).suitable(url) + + def _real_extract(self, url): + username, slug = re.match(self._VALID_URL, url).groups() + board = self._call_api( + 'Board', slug, { + 'slug': slug, + 'username': username + })['data'] + board_id = board['id'] + options = { + 'board_id': board_id, + 'page_size': 250, + } + bookmark = None + entries = [] + while True: + if bookmark: + options['bookmarks'] = [bookmark] + board_feed = self._call_api('BoardFeed', board_id, options) + for item in (board_feed.get('data') or []): + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + bookmark = board_feed.get('bookmark') + if not bookmark: + break + return self.playlist_result( + entries, playlist_id=board_id, playlist_title=board.get('name')) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3567a3283..9ad92a8ec 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -17,6 +17,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + merge_dicts, NO_DEFAULT, orderedSet, remove_quotes, @@ -30,7 +31,12 @@ class PornHubBaseIE(InfoExtractor): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) - webpage, urlh = dl(*args, **kwargs) + ret = dl(*args, **kwargs) + + if not ret: + return ret + + webpage, urlh = ret if any(re.search(p, webpage) for p in ( r']+\bonload=["\']go\(\)', @@ -52,20 +58,21 @@ class PornHubIE(PornHubBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| + (?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P[\da-z]+) ''' _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', - 'md5': '1e19b41231a02eba417839222ac9d58e', + 'md5': 'a6391306d050e4547f62b3f485dd9ba9', 'info_dict': { 'id': '648719015', 'ext': 'mp4', 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', 'uploader': 'Babes', 'upload_date': '20130628', + 'timestamp': 1372447216, 'duration': 361, 'view_count': int, 'like_count': int, @@ -82,8 +89,8 @@ class PornHubIE(PornHubBaseIE): 'id': '1331683002', 'ext': 'mp4', 'title': '重庆婷婷女王足交', - 'uploader': 'Unknown', 'upload_date': '20150213', + 'timestamp': 1423804862, 'duration': 1753, 'view_count': int, 'like_count': int, @@ -121,6 +128,7 @@ class PornHubIE(PornHubBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'This video has been disabled', }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, @@ -149,6 +157,9 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933', 'only_matching': True, + }, { + 'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933', + 'only_matching': True, }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, @@ -157,7 +168,7 @@ class PornHubIE(PornHubBaseIE): @staticmethod def _extract_urls(webpage): return re.findall( - r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -338,14 +349,14 @@ class PornHubIE(PornHubBaseIE): video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', - webpage, 'uploader', fatal=False) + webpage, 'uploader', default=None) view_count = self._extract_count( - r'([\d,\.]+) views', webpage, 'view') + r'([\d,\.]+) [Vv]iews', webpage, 'view') like_count = self._extract_count( - r'([\d,\.]+)', webpage, 'like') + r']+class="votesUp"[^>]*>([\d,\.]+)', webpage, 'like') dislike_count = self._extract_count( - r'([\d,\.]+)', webpage, 'dislike') + r']+class="votesDown"[^>]*>([\d,\.]+)', webpage, 'dislike') comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') @@ -356,7 +367,11 @@ class PornHubIE(PornHubBaseIE): if div: return re.findall(r']+\bhref=[^>]+>([^<]+)', div) - return { + info = self._search_json_ld(webpage, video_id, default={}) + # description provided in JSON-LD is irrelevant + info['description'] = None + + return merge_dicts({ 'id': video_id, 'uploader': video_uploader, 'upload_date': upload_date, @@ -372,7 +387,7 @@ class PornHubIE(PornHubBaseIE): 'tags': extract_list('tags'), 'categories': extract_list('categories'), 'subtitles': subtitles, - } + }, info) class PornHubPlaylistBaseIE(PornHubBaseIE): @@ -415,7 +430,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): class PornHubUserIE(PornHubPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph', 'playlist_mincount': 118, @@ -483,7 +498,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?P(?:[^/]+/)*[^/?#&]+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?P(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'https://www.pornhub.com/model/zoe_ph/videos', 'only_matching': True, @@ -598,7 +613,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE): - _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' + _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/]+)/videos/upload)' _TESTS = [{ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload', 'info_dict': { diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 1bc4f9b6b..e47088292 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -11,6 +11,7 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + merge_dicts, unified_strdate, ) @@ -175,7 +176,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): (?: (?:beta\.)? (?: - prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|7tv|advopedia + prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|advopedia )\.(?:de|at|ch)| ran\.de|fem\.com|advopedia\.de|galileo\.tv/video ) @@ -193,10 +194,14 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'info_dict': { 'id': '2104602', 'ext': 'mp4', - 'title': 'Episode 18 - Staffel 2', + 'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2', 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', 'upload_date': '20131231', 'duration': 5845.04, + 'series': 'CIRCUS HALLIGALLI', + 'season_number': 2, + 'episode': 'Episode 18 - Staffel 2', + 'episode_number': 18, }, }, { @@ -300,8 +305,9 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'info_dict': { 'id': '2572814', 'ext': 'mp4', - 'title': 'Andreas Kümmert: Rocket Man', + 'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man', 'description': 'md5:6ddb02b0781c6adf778afea606652e38', + 'timestamp': 1382041620, 'upload_date': '20131017', 'duration': 469.88, }, @@ -310,7 +316,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): }, }, { - 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html', + 'url': 'http://www.fem.com/videos/beauty-lifestyle/kurztrips-zum-valentinstag', 'info_dict': { 'id': '2156342', 'ext': 'mp4', @@ -332,19 +338,6 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'playlist_count': 2, 'skip': 'This video is unavailable', }, - { - 'url': 'http://www.7tv.de/circus-halligalli/615-best-of-circus-halligalli-ganze-folge', - 'info_dict': { - 'id': '4187506', - 'ext': 'mp4', - 'title': 'Best of Circus HalliGalli', - 'description': 'md5:8849752efd90b9772c9db6fdf87fb9e9', - 'upload_date': '20151229', - }, - 'params': { - 'skip_download': True, - }, - }, { # title in 'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip', @@ -421,7 +414,6 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): r']+id="veeseoDescription"[^>]*>(.+?)', ] _UPLOAD_DATE_REGEXES = [ - r'', r'\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr', r'(\d{2}\.\d{2}\.\d{4})', @@ -451,17 +443,21 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): if description is None: description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate(self._html_search_regex( - self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) + upload_date = unified_strdate( + self._html_search_meta('og:published_time', webpage, + 'upload date', default=None) + or self._html_search_regex(self._UPLOAD_DATE_REGEXES, + webpage, 'upload date', default=None)) - info.update({ + json_ld = self._search_json_ld(webpage, clip_id, default={}) + + return merge_dicts(info, { 'id': clip_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, - }) - return info + }, json_ld) def _extract_playlist(self, url, webpage): playlist_id = self._html_search_regex( diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py index fb704a3c4..ca71665e0 100644 --- a/youtube_dl/extractor/puhutv.py +++ b/youtube_dl/extractor/puhutv.py @@ -82,17 +82,6 @@ class PuhuTVIE(InfoExtractor): urls = [] formats = [] - def add_http_from_hls(m3u8_f): - http_url = m3u8_f['url'].replace('/hls/', '/mp4/').replace('/chunklist.m3u8', '.mp4') - if http_url != m3u8_f['url']: - f = m3u8_f.copy() - f.update({ - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - 'url': http_url, - }) - formats.append(f) - for video in videos['data']['videos']: media_url = url_or_none(video.get('url')) if not media_url or media_url in urls: @@ -101,12 +90,9 @@ class PuhuTVIE(InfoExtractor): playlist = video.get('is_playlist') if (video.get('stream_type') == 'hls' and playlist is True) or 'playlist.m3u8' in media_url: - m3u8_formats = self._extract_m3u8_formats( + formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - for m3u8_f in m3u8_formats: - formats.append(m3u8_f) - add_http_from_hls(m3u8_f) + m3u8_id='hls', fatal=False)) continue quality = int_or_none(video.get('quality')) @@ -128,8 +114,6 @@ class PuhuTVIE(InfoExtractor): format_id += '-%sp' % quality f['format_id'] = format_id formats.append(f) - if is_hls: - add_http_from_hls(f) self._sort_formats(formats) creator = try_get( diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 207a6c247..ecb628f14 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -15,9 +16,9 @@ from ..utils import ( GeoRestrictedError, int_or_none, parse_duration, + remove_start, strip_or_none, try_get, - unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -67,7 +68,7 @@ class RaiBaseIE(InfoExtractor): # This does not imply geo restriction (e.g. # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if media_url == 'http://download.rai.it/video_no_available.mp4': + if '/video_no_available.mp4' in media_url: continue ext = determine_ext(media_url) @@ -122,40 +123,20 @@ class RaiBaseIE(InfoExtractor): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/.+?-(?P%s)\.html)' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/.+?-(?P%s))\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', - 'md5': '340aa3b7afb54bfd14a8c11786450d76', - 'info_dict': { - 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', - 'ext': 'mp4', - 'title': 'La Casa Bianca', - 'alt_title': 'S2016 - Puntata del 23/10/2016', - 'description': 'md5:a09d45890850458077d1f68bb036e0a5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 3', - 'creator': 'Rai 3', - 'duration': 3278, - 'timestamp': 1477764300, - 'upload_date': '20161029', - 'series': 'La Casa Bianca', - 'season': '2016', - }, - }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', - 'alt_title': 'S2013/14 - Puntata del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', + 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 5', - 'creator': 'Rai 5', + 'uploader': 'Rai Gulp', 'duration': 6160, 'series': 'Report', - 'season_number': 5, 'season': '2013/14', }, 'params': { @@ -167,48 +148,52 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url, video_id = mobj.group('url', 'id') + base, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s?json' % url, video_id, 'Downloading video JSON') + base + '.json', video_id, 'Downloading video JSON') title = media['name'] video = media['video'] - relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + relinker_info = self._extract_relinker_info(video['content_url'], video_id) self._sort_formats(relinker_info['formats']) thumbnails = [] - if 'images' in media: - for _, value in media.get('images').items(): - if value: - thumbnails.append({ - 'url': value.replace('[RESOLUTION]', '600x400') - }) + for _, value in media.get('images', {}).items(): + if value: + thumbnails.append({ + 'url': urljoin(url, value), + }) - timestamp = unified_timestamp(try_get( - media, lambda x: x['availabilities'][0]['start'], compat_str)) + date_published = media.get('date_published') + time_published = media.get('time_published') + if date_published and time_published: + date_published += ' ' + time_published subtitles = self._extract_subtitles(url, video.get('subtitles')) + program_info = media.get('program_info') or {} + season = media.get('season') + info = { - 'id': video_id, + 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, + 'display_id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, - 'alt_title': media.get('subtitle'), + 'alt_title': strip_or_none(media.get('subtitle')), 'description': media.get('description'), 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), + 'creator': strip_or_none(media.get('editor') or None), 'duration': parse_duration(video.get('duration')), - 'timestamp': timestamp, + 'timestamp': unified_timestamp(date_published), 'thumbnails': thumbnails, - 'series': try_get( - media, lambda x: x['isPartOf']['name'], compat_str), - 'season_number': int_or_none(try_get( - media, lambda x: x['isPartOf']['numeroStagioni'])), - 'season': media.get('stagione') or None, + 'series': program_info.get('name'), + 'season_number': int_or_none(season), + 'season': season if (season and not season.isdigit()) else None, + 'episode': media.get('episode_title'), + 'episode_number': int_or_none(media.get('episode')), 'subtitles': subtitles, } @@ -216,16 +201,16 @@ class RaiPlayIE(RaiBaseIE): return info -class RaiPlayLiveIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P[^/?#&]+)' - _TEST = { +class RaiPlayLiveIE(RaiPlayIE): + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/dirette/(?P[^/?#&]+))' + _TESTS = [{ 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', 'display_id': 'rainews24', 'ext': 'mp4', 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:6eca31500550f9376819f174e5644754', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, @@ -233,58 +218,50 @@ class RaiPlayLiveIE(RaiBaseIE): 'params': { 'skip_download': True, }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, - webpage, 'content id') - - return { - '_type': 'url_transparent', - 'ie_key': RaiPlayIE.ie_key(), - 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, - 'id': video_id, - 'display_id': display_id, - } + }] class RaiPlayPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+)' + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', - 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, 'playlist_mincount': 12, }] def _real_extract(self, url): - playlist_id = self._match_id(url) + base, playlist_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, playlist_id) - - title = self._html_search_meta( - ('programma', 'nomeProgramma'), webpage, 'title') - description = unescapeHTML(self._html_search_meta( - ('description', 'og:description'), webpage, 'description')) + program = self._download_json( + base + '.json', playlist_id, 'Downloading program JSON') entries = [] - for mobj in re.finditer( - r']+\bhref=(["\'])(?P/raiplay/video/.+?)\1', - webpage): - video_url = urljoin(url, mobj.group('path')) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) + for b in (program.get('blocks') or []): + for s in (b.get('sets') or []): + s_id = s.get('id') + if not s_id: + continue + medias = self._download_json( + '%s/%s.json' % (base, s_id), s_id, + 'Downloading content set JSON', fatal=False) + if not medias: + continue + for m in (medias.get('items') or []): + path_id = m.get('path_id') + if not path_id: + continue + video_url = urljoin(url, path_id) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + entries, playlist_id, program.get('name'), + try_get(program, lambda x: x['program_info']['description'])) class RaiIE(RaiBaseIE): @@ -300,7 +277,8 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, 'upload_date': '20140612', - } + }, + 'skip': 'This content is available only in Italy', }, { # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -316,7 +294,7 @@ class RaiIE(RaiBaseIE): }, { # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '11959b4e44fa74de47011b5799490adf', + 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', @@ -326,18 +304,6 @@ class RaiIE(RaiBaseIE): 'duration': 2214, 'upload_date': '20161103', } - }, { - # drawMediaRaiTV(...) - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20141221', - }, }, { # initEdizione('ContentItem-...' 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -349,17 +315,6 @@ class RaiIE(RaiBaseIE): 'upload_date': '20170401', }, 'skip': 'Changes daily', - }, { - # HDS live stream with only relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'params': { - 'skip_download': True, - }, }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', @@ -469,7 +424,7 @@ class RaiIE(RaiBaseIE): except ExtractorError: pass - relinker_url = self._search_regex( + relinker_url = self._proto_relative_url(self._search_regex( r'''(?x) (?: var\s+videoURL| @@ -481,7 +436,7 @@ class RaiIE(RaiBaseIE): //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 ''', - webpage, 'relinker URL', group='url') + webpage, 'relinker URL', group='url')) relinker_info = self._extract_relinker_info( urljoin(url, relinker_url), video_id) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index dbe1aaded..3aae79f5d 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -10,7 +12,7 @@ from ..utils import ( class RedBullTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)(?:/events/[^/]+)?/(?:videos?|live)/(?PAP-\w+)' + _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)(?:/events/[^/]+)?/(?:videos?|live|(?:film|episode)s)/(?PAP-\w+)' _TESTS = [{ # film 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', @@ -29,8 +31,8 @@ class RedBullTVIE(InfoExtractor): 'id': 'AP-1PMHKJFCW1W11', 'ext': 'mp4', 'title': 'Grime - Hashtags S2E4', - 'description': 'md5:b5f522b89b72e1e23216e5018810bb25', - 'duration': 904.6, + 'description': 'md5:5546aa612958c08a98faaad4abce484d', + 'duration': 904, }, 'params': { 'skip_download': True, @@ -44,11 +46,15 @@ class RedBullTVIE(InfoExtractor): }, { 'url': 'https://www.redbull.com/us-en/events/AP-1XV2K61Q51W11/live/AP-1XUJ86FDH1W11', 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/films/AP-1ZSMAW8FH2111', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/episodes/AP-1TQWK7XE11W11', + 'only_matching': True, }] - def _real_extract(self, url): - video_id = self._match_id(url) - + def extract_info(self, video_id): session = self._download_json( 'https://api.redbull.tv/v3/session', video_id, note='Downloading access token', query={ @@ -105,24 +111,119 @@ class RedBullTVIE(InfoExtractor): 'subtitles': subtitles, } + def _real_extract(self, url): + video_id = self._match_id(url) + return self.extract_info(video_id) + + +class RedBullEmbedIE(RedBullTVIE): + _VALID_URL = r'https?://(?:www\.)?redbull\.com/embed/(?Prrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}:[a-z]{2}-[A-Z]{2,3})' + _TESTS = [{ + # HLS manifest accessible only using assetId + 'url': 'https://www.redbull.com/embed/rrn:content:episode-videos:f3021f4f-3ed4-51ac-915a-11987126e405:en-INT', + 'only_matching': True, + }] + _VIDEO_ESSENSE_TMPL = '''... on %s { + videoEssence { + attributes + } + }''' + + def _real_extract(self, url): + rrn_id = self._match_id(url) + asset_id = self._download_json( + 'https://edge-graphql.crepo-production.redbullaws.com/v1/graphql', + rrn_id, headers={'API-KEY': 'e90a1ff11335423998b100c929ecc866'}, + query={ + 'query': '''{ + resource(id: "%s", enforceGeoBlocking: false) { + %s + %s + } +}''' % (rrn_id, self._VIDEO_ESSENSE_TMPL % 'LiveVideo', self._VIDEO_ESSENSE_TMPL % 'VideoResource'), + })['data']['resource']['videoEssence']['attributes']['assetId'] + return self.extract_info(asset_id) + class RedBullTVRrnContentIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)/(?:video|live)/rrn:content:[^:]+:(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P[a-z]{2,3})-(?P[a-z]{2})/tv/(?:video|live|film)/(?Prrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:live-videos:e3e6feb4-e95f-50b7-962a-c70f8fd13c73/mens-dh-finals-fort-william', 'only_matching': True, }, { 'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:videos:a36a0f36-ff1b-5db8-a69d-ee11a14bf48b/tn-ts-style?playlist=rrn:content:event-profiles:83f05926-5de8-5389-b5e4-9bb312d715e8:extras', 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/tv/film/rrn:content:films:d1f4d00e-4c04-5d19-b510-a805ffa2ab83/follow-me', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + region, lang, rrn_id = re.search(self._VALID_URL, url).groups() + rrn_id += ':%s-%s' % (lang, region.upper()) + return self.url_result( + 'https://www.redbull.com/embed/' + rrn_id, + RedBullEmbedIE.ie_key(), rrn_id) - webpage = self._download_webpage(url, display_id) - video_url = self._og_search_url(webpage) +class RedBullIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull\.com/(?P[a-z]{2,3})-(?P[a-z]{2})/(?P(?:episode|film|(?:(?:recap|trailer)-)?video)s|live)/(?!AP-|rrn:content:)(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.redbull.com/int-en/episodes/grime-hashtags-s02-e04', + 'md5': 'db8271a7200d40053a1809ed0dd574ff', + 'info_dict': { + 'id': 'AA-1MT8DQWA91W14', + 'ext': 'mp4', + 'title': 'Grime - Hashtags S2E4', + 'description': 'md5:5546aa612958c08a98faaad4abce484d', + }, + }, { + 'url': 'https://www.redbull.com/int-en/films/kilimanjaro-mountain-of-greatness', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/recap-videos/uci-mountain-bike-world-cup-2017-mens-xco-finals-from-vallnord', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/trailer-videos/kings-of-content', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/videos/tnts-style-red-bull-dance-your-style-s1-e12', + 'only_matching': True, + }, { + 'url': 'https://www.redbull.com/int-en/live/mens-dh-finals-fort-william', + 'only_matching': True, + }, { + # only available on the int-en website so a fallback is need for the API + # https://www.redbull.com/v3/api/graphql/v1/v3/query/en-GB>en-INT?filter[uriSlug]=fia-wrc-saturday-recap-estonia&rb3Schema=v1:hero + 'url': 'https://www.redbull.com/gb-en/live/fia-wrc-saturday-recap-estonia', + 'only_matching': True, + }] + _INT_FALLBACK_LIST = ['de', 'en', 'es', 'fr'] + _LAT_FALLBACK_MAP = ['ar', 'bo', 'car', 'cl', 'co', 'mx', 'pe'] + + def _real_extract(self, url): + region, lang, filter_type, display_id = re.search(self._VALID_URL, url).groups() + if filter_type == 'episodes': + filter_type = 'episode-videos' + elif filter_type == 'live': + filter_type = 'live-videos' + + regions = [region.upper()] + if region != 'int': + if region in self._LAT_FALLBACK_MAP: + regions.append('LAT') + if lang in self._INT_FALLBACK_LIST: + regions.append('INT') + locale = '>'.join(['%s-%s' % (lang, reg) for reg in regions]) + + rrn_id = self._download_json( + 'https://www.redbull.com/v3/api/graphql/v1/v3/query/' + locale, + display_id, query={ + 'filter[type]': filter_type, + 'filter[uriSlug]': display_id, + 'rb3Schema': 'v1:hero', + })['data']['id'] return self.url_result( - video_url, ie=RedBullTVIE.ie_key(), - video_id=RedBullTVIE._match_id(video_url)) + 'https://www.redbull.com/embed/' + rrn_id, + RedBullEmbedIE.ie_key(), rrn_id) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index b1bde1e81..a1ca791ca 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, merge_dicts, @@ -14,7 +15,7 @@ from ..utils import ( class RedTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P[0-9]+)' + _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.redtube.com/66418', 'md5': 'fc08071233725f26b8f014dba9590005', @@ -30,6 +31,9 @@ class RedTubeIE(InfoExtractor): }, { 'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286', 'only_matching': True, + }, { + 'url': 'http://it.redtube.com/66418', + 'only_matching': True, }] @staticmethod @@ -57,7 +61,7 @@ class RedTubeIE(InfoExtractor): if not info.get('title'): info['title'] = self._html_search_regex( - (r']+class="(?:video_title_text|videoTitle)[^"]*">(?P(?:(?!\1).)+)', + (r']+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P(?:(?!\1).)+)', r'(?:videoTitle|title)\s*:\s*(["\'])(?P(?:(?!\1).)+)\1',), webpage, 'title', group='title', default=None) or self._og_search_title(webpage) @@ -77,7 +81,7 @@ class RedTubeIE(InfoExtractor): }) medias = self._parse_json( self._search_regex( - r'mediaDefinition\s*:\s*(\[.+?\])', webpage, + r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage, 'media definitions', default='{}'), video_id, fatal=False) if medias and isinstance(medias, list): @@ -85,6 +89,12 @@ class RedTubeIE(InfoExtractor): format_url = url_or_none(media.get('videoUrl')) if not format_url: continue + if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + continue format_id = media.get('quality') formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index fadca8c17..9eaa06f25 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -14,12 +14,27 @@ class RtlNlIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?:(?:www|static)\.)? (?: - rtlxl\.nl/[^\#]*\#!/[^/]+/| - rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/) + rtlxl\.nl/(?:[^\#]*\#!|programma)/[^/]+/| + rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/)| + embed\.rtl\.nl/\#uuid= ) (?P[0-9a-f-]+)''' _TESTS = [{ + # new URL schema + 'url': 'https://www.rtlxl.nl/programma/rtl-nieuws/0bd1384d-d970-3086-98bb-5c104e10c26f', + 'md5': '490428f1187b60d714f34e1f2e3af0b6', + 'info_dict': { + 'id': '0bd1384d-d970-3086-98bb-5c104e10c26f', + 'ext': 'mp4', + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1593293400, + 'upload_date': '20200627', + 'duration': 661.08, + }, + }, { + # old URL schema 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416', 'md5': '473d1946c1fdd050b2c0161a4b13c373', 'info_dict': { @@ -31,6 +46,7 @@ class RtlNlIE(InfoExtractor): 'upload_date': '20160429', 'duration': 1167.96, }, + 'skip': '404', }, { # best format available a3t 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', @@ -76,6 +92,10 @@ class RtlNlIE(InfoExtractor): }, { 'url': 'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl', 'only_matching': True, + }, { + # new embed URL schema + 'url': 'https://embed.rtl.nl/#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/rumble.py b/youtube_dl/extractor/rumble.py new file mode 100644 index 000000000..4a0225109 --- /dev/null +++ b/youtube_dl/extractor/rumble.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, + try_get, +) + + +class RumbleEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P[0-9a-z]+)' + _TESTS = [{ + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + 'https://rumble.com/embedJS/', video_id, + query={'request': 'video', 'v': video_id}) + title = video['title'] + + formats = [] + for height, ua in (video.get('ua') or {}).items(): + for i in range(2): + f_url = try_get(ua, lambda x: x[i], compat_str) + if f_url: + ext = determine_ext(f_url) + f = { + 'ext': ext, + 'format_id': '%s-%sp' % (ext, height), + 'height': int_or_none(height), + 'url': f_url, + } + bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) + if bitrate: + f['tbr'] = int_or_none(bitrate) + formats.append(f) + self._sort_formats(formats) + + author = video.get('author') or {} + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('i'), + 'timestamp': parse_iso8601(video.get('pubDate')), + 'channel': author.get('name'), + 'channel_url': author.get('url'), + 'duration': int_or_none(video.get('duration')), + } diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py index 9401bf2cf..1610ddc2c 100644 --- a/youtube_dl/extractor/servus.py +++ b/youtube_dl/extractor/servus.py @@ -1,9 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + unified_timestamp, + urlencode_postdata, + url_or_none, +) class ServusIE(InfoExtractor): @@ -12,20 +18,29 @@ class ServusIE(InfoExtractor): (?:www\.)? (?: servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| - servustv\.com/videos + (?:servustv|pm-wissen)\.com/videos ) /(?P[aA]{2}-\w+|\d+-\d+) ''' _TESTS = [{ # new URL schema 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', - 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'md5': '60474d4c21f3eb148838f215c37f02b9', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', 'ext': 'mp4', 'title': 'Die Grünen aus Sicht des Volkes', + 'alt_title': 'Talk im Hangar-7 Voxpops Gruene', 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 62.442, + 'timestamp': 1605193976, + 'upload_date': '20201112', + 'series': 'Talk im Hangar-7', + 'season': 'Season 9', + 'season_number': 9, + 'episode': 'Episode 31 - September 14', + 'episode_number': 31, } }, { # old URL schema @@ -40,30 +55,94 @@ class ServusIE(InfoExtractor): }, { 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', 'only_matching': True, + }, { + 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url).upper() - webpage = self._download_webpage(url, video_id) - title = self._search_regex( - (r'videoLabel\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - r']+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P[^<]+)'), - webpage, 'title', default=None, - group='title') or self._og_search_title(webpage) - title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + token = self._download_json( + 'https://auth.redbullmediahouse.com/token', video_id, + 'Downloading token', data=urlencode_postdata({ + 'grant_type': 'client_credentials', + }), headers={ + 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==', + }) + access_token = token['access_token'] + token_type = token.get('token_type', 'Bearer') - formats = self._extract_m3u8_formats( - 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, - video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + video = self._download_json( + 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id, + video_id, 'Downloading video JSON', headers={ + 'Authorization': '%s %s' % (token_type, access_token), + }) + + formats = [] + thumbnail = None + for resource in video['resources']: + if not isinstance(resource, dict): + continue + format_url = url_or_none(resource.get('url')) + if not format_url: + continue + extension = resource.get('extension') + type_ = resource.get('type') + if extension == 'jpg' or type_ == 'reference_keyframe': + thumbnail = format_url + continue + ext = determine_ext(format_url) + if type_ == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif type_ == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif extension == 'mp4' or ext == 'mp4': + formats.append({ + 'url': format_url, + 'format_id': type_, + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + }) self._sort_formats(formats) + attrs = {} + for attribute in video['attributes']: + if not isinstance(attribute, dict): + continue + key = attribute.get('fieldKey') + value = attribute.get('fieldValue') + if not key or not value: + continue + attrs[key] = value + + title = attrs.get('title_stv') or video_id + alt_title = attrs.get('title') + description = attrs.get('long_description') or attrs.get('short_description') + series = attrs.get('label') + season = attrs.get('season') + episode = attrs.get('chapter') + duration = float_or_none(attrs.get('duration'), scale=1000) + season_number = int_or_none(self._search_regex( + r'Season (\d+)', season or '', 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'Episode (\d+)', episode or '', 'episode number', default=None)) + return { 'id': video_id, 'title': title, + 'alt_title': alt_title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': unified_timestamp(video.get('lastPublished')), + 'series': series, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'formats': formats, } diff --git a/youtube_dl/extractor/skyit.py b/youtube_dl/extractor/skyit.py new file mode 100644 index 000000000..14a4d8d4c --- /dev/null +++ b/youtube_dl/extractor/skyit.py @@ -0,0 +1,239 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + dict_get, + int_or_none, + parse_duration, + unified_timestamp, +) + + +class SkyItPlayerIE(InfoExtractor): + IE_NAME = 'player.sky.it' + _VALID_URL = r'https?://player\.sky\.it/player/(?:external|social)\.html\?.*?\bid=(?P\d+)' + _GEO_BYPASS = False + _DOMAIN = 'sky' + _PLAYER_TMPL = 'https://player.sky.it/player/external.html?id=%s&domain=%s' + # http://static.sky.it/static/skyplayer/conf.json + _TOKEN_MAP = { + 'cielo': 'Hh9O7M8ks5yi6nSROL7bKYz933rdf3GhwZlTLMgvy4Q', + 'hotclub': 'kW020K2jq2lk2eKRJD2vWEg832ncx2EivZlTLQput2C', + 'mtv8': 'A5Nn9GGb326CI7vP5e27d7E4PIaQjota', + 'salesforce': 'C6D585FD1615272C98DE38235F38BD86', + 'sitocommerciale': 'VJwfFuSGnLKnd9Phe9y96WkXgYDCguPMJ2dLhGMb2RE', + 'sky': 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk', + 'skyacademy': 'A6LAn7EkO2Q26FRy0IAMBekX6jzDXYL3', + 'skyarte': 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd', + 'theupfront': 'PRSGmDMsg6QMGc04Obpoy7Vsbn7i2Whp', + } + + def _player_url_result(self, video_id): + return self.url_result( + self._PLAYER_TMPL % (video_id, self._DOMAIN), + SkyItPlayerIE.ie_key(), video_id) + + def _parse_video(self, video, video_id): + title = video['title'] + is_live = video.get('type') == 'live' + hls_url = video.get(('streaming' if is_live else 'hls') + '_url') + if not hls_url and video.get('geoblock' if is_live else 'geob'): + self.raise_geo_restricted(countries=['IT']) + + if is_live: + formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4') + else: + formats = self._extract_akamai_formats( + hls_url, video_id, {'http': 'videoplatform.sky.it'}) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'thumbnail': dict_get(video, ('video_still', 'video_still_medium', 'thumb')), + 'description': video.get('short_desc') or None, + 'timestamp': unified_timestamp(video.get('create_date')), + 'duration': int_or_none(video.get('duration_sec')) or parse_duration(video.get('duration')), + 'is_live': is_live, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + domain = compat_parse_qs(compat_urllib_parse_urlparse( + url).query).get('domain', [None])[0] + token = dict_get(self._TOKEN_MAP, (domain, 'sky')) + video = self._download_json( + 'https://apid.sky.it/vdp/v1/getVideoData', + video_id, query={ + 'caller': 'sky', + 'id': video_id, + 'token': token + }, headers=self.geo_verification_headers()) + return self._parse_video(video, video_id) + + +class SkyItVideoIE(SkyItPlayerIE): + IE_NAME = 'video.sky.it' + _VALID_URL = r'https?://(?:masterchef|video|xfactor)\.sky\.it(?:/[^/]+)*/video/[0-9a-z-]+-(?P\d+)' + _TESTS = [{ + 'url': 'https://video.sky.it/news/mondo/video/uomo-ucciso-da-uno-squalo-in-australia-631227', + 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'info_dict': { + 'id': '631227', + 'ext': 'mp4', + 'title': 'Uomo ucciso da uno squalo in Australia', + 'timestamp': 1606036192, + 'upload_date': '20201122', + } + }, { + 'url': 'https://xfactor.sky.it/video/x-factor-2020-replay-audizioni-1-615820', + 'only_matching': True, + }, { + 'url': 'https://masterchef.sky.it/video/masterchef-9-cosa-e-successo-nella-prima-puntata-562831', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._player_url_result(video_id) + + +class SkyItVideoLiveIE(SkyItPlayerIE): + IE_NAME = 'video.sky.it:live' + _VALID_URL = r'https?://video\.sky\.it/diretta/(?P[^/?]+)' + _TEST = { + 'url': 'https://video.sky.it/diretta/tg24', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': r're:Diretta TG24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'description': 'Guarda la diretta streaming di SkyTg24, segui con Sky tutti gli appuntamenti e gli speciali di Tg24.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + asset_id = compat_str(self._parse_json(self._search_regex( + r'', + webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id']) + livestream = self._download_json( + 'https://apid.sky.it/vdp/v1/getLivestream', + asset_id, query={'id': asset_id}) + return self._parse_video(livestream, asset_id) + + +class SkyItIE(SkyItPlayerIE): + IE_NAME = 'sky.it' + _VALID_URL = r'https?://(?:sport|tg24)\.sky\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P[^/?]+)' + _TESTS = [{ + 'url': 'https://sport.sky.it/calcio/serie-a/2020/11/21/juventus-cagliari-risultato-gol', + 'info_dict': { + 'id': '631201', + 'ext': 'mp4', + 'title': 'Un rosso alla violenza: in campo per i diritti delle donne', + 'upload_date': '20201121', + 'timestamp': 1605995753, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://tg24.sky.it/mondo/2020/11/22/australia-squalo-uccide-uomo', + 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'info_dict': { + 'id': '631227', + 'ext': 'mp4', + 'title': 'Uomo ucciso da uno squalo in Australia', + 'timestamp': 1606036192, + 'upload_date': '20201122', + }, + }] + _VIDEO_ID_REGEX = r'data-videoid="(\d+)"' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + self._VIDEO_ID_REGEX, webpage, 'video id') + return self._player_url_result(video_id) + + +class SkyItAcademyIE(SkyItIE): + IE_NAME = 'skyacademy.it' + _VALID_URL = r'https?://(?:www\.)?skyacademy\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P[^/?]+)' + _TESTS = [{ + 'url': 'https://www.skyacademy.it/eventi-speciali/2019/07/05/a-lezione-di-cinema-con-sky-academy-/', + 'md5': 'ced5c26638b7863190cbc44dd6f6ba08', + 'info_dict': { + 'id': '523458', + 'ext': 'mp4', + 'title': 'Sky Academy "The Best CineCamp 2019"', + 'timestamp': 1562843784, + 'upload_date': '20190711', + } + }] + _DOMAIN = 'skyacademy' + _VIDEO_ID_REGEX = r'id="news-videoId_(\d+)"' + + +class SkyItArteIE(SkyItIE): + IE_NAME = 'arte.sky.it' + _VALID_URL = r'https?://arte\.sky\.it/video/(?P[^/?]+)' + _TESTS = [{ + 'url': 'https://arte.sky.it/video/serie-musei-venezia-collezionismo-12-novembre/', + 'md5': '515aee97b87d7a018b6c80727d3e7e17', + 'info_dict': { + 'id': '627926', + 'ext': 'mp4', + 'title': "Musei Galleria Franchetti alla Ca' d'Oro Palazzo Grimani", + 'upload_date': '20201106', + 'timestamp': 1604664493, + } + }] + _DOMAIN = 'skyarte' + _VIDEO_ID_REGEX = r'(?s)]+src="(?:https:)?//player\.sky\.it/player/external\.html\?[^"]*\bid=(\d+)' + + +class CieloTVItIE(SkyItIE): + IE_NAME = 'cielotv.it' + _VALID_URL = r'https?://(?:www\.)?cielotv\.it/video/(?P[^.]+)\.html' + _TESTS = [{ + 'url': 'https://www.cielotv.it/video/Il-lunedi-e-sempre-un-dramma.html', + 'md5': 'c4deed77552ba901c2a0d9258320304b', + 'info_dict': { + 'id': '499240', + 'ext': 'mp4', + 'title': 'Il lunedì è sempre un dramma', + 'upload_date': '20190329', + 'timestamp': 1553862178, + } + }] + _DOMAIN = 'cielo' + _VIDEO_ID_REGEX = r'videoId\s*=\s*"(\d+)"' + + +class TV8ItIE(SkyItVideoIE): + IE_NAME = 'tv8.it' + _VALID_URL = r'https?://tv8\.it/showvideo/(?P\d+)' + _TESTS = [{ + 'url': 'https://tv8.it/showvideo/630529/ogni-mattina-ucciso-asino-di-andrea-lo-cicero/18-11-2020/', + 'md5': '9ab906a3f75ea342ed928442f9dabd21', + 'info_dict': { + 'id': '630529', + 'ext': 'mp4', + 'title': 'Ogni mattina - Ucciso asino di Andrea Lo Cicero', + 'timestamp': 1605721374, + 'upload_date': '20201118', + } + }] + _DOMAIN = 'mtv8' diff --git a/youtube_dl/extractor/slideslive.py b/youtube_dl/extractor/slideslive.py index d9ea76831..cd70841a9 100644 --- a/youtube_dl/extractor/slideslive.py +++ b/youtube_dl/extractor/slideslive.py @@ -2,7 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + bool_or_none, + smuggle_url, + try_get, + url_or_none, +) class SlidesLiveIE(InfoExtractor): @@ -18,8 +23,21 @@ class SlidesLiveIE(InfoExtractor): 'description': 'Watch full version of this video at https://slideslive.com/38902413.', 'uploader': 'SlidesLive Videos - A', 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', + 'timestamp': 1597615266, 'upload_date': '20170925', } + }, { + # video_service_name = yoda + 'url': 'https://slideslive.com/38935785', + 'md5': '575cd7a6c0acc6e28422fe76dd4bcb1a', + 'info_dict': { + 'id': 'RMraDYN5ozA_', + 'ext': 'mp4', + 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', + }, + 'params': { + 'format': 'bestvideo', + }, }, { # video_service_name = youtube 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', @@ -39,18 +57,47 @@ class SlidesLiveIE(InfoExtractor): video_data = self._download_json( 'https://ben.slideslive.com/player/' + video_id, video_id) service_name = video_data['video_service_name'].lower() - assert service_name in ('url', 'vimeo', 'youtube') + assert service_name in ('url', 'yoda', 'vimeo', 'youtube') service_id = video_data['video_service_id'] + subtitles = {} + for sub in try_get(video_data, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + webvtt_url = url_or_none(sub.get('webvtt_url')) + if not webvtt_url: + continue + lang = sub.get('language') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': webvtt_url, + }) info = { 'id': video_id, 'thumbnail': video_data.get('thumbnail'), - 'url': service_id, + 'is_live': bool_or_none(video_data.get('is_live')), + 'subtitles': subtitles, } - if service_name == 'url': + if service_name in ('url', 'yoda'): info['title'] = video_data['title'] + if service_name == 'url': + info['url'] = service_id + else: + formats = [] + _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s' + formats.extend(self._extract_m3u8_formats( + _MANIFEST_PATTERN % (service_id, 'm3u8'), service_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_mpd_formats( + _MANIFEST_PATTERN % (service_id, 'mpd'), service_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) + info.update({ + 'id': service_id, + 'formats': formats, + }) else: info.update({ '_type': 'url_transparent', + 'url': service_id, 'ie_key': service_name.capitalize(), 'title': video_data.get('title'), }) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py deleted file mode 100644 index 45995f30f..000000000 --- a/youtube_dl/extractor/smotri.py +++ /dev/null @@ -1,416 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import json -import hashlib -import uuid - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - sanitized_Request, - unified_strdate, - urlencode_postdata, - xpath_text, -) - - -class SmotriIE(InfoExtractor): - IE_DESC = 'Smotri.com' - IE_NAME = 'smotri' - _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?Pv(?P[0-9]+)[a-z0-9]{4})' - _NETRC_MACHINE = 'smotri' - - _TESTS = [ - # real video id 2610366 - { - 'url': 'http://smotri.com/video/view/?id=v261036632ab', - 'md5': '02c0dfab2102984e9c5bb585cc7cc321', - 'info_dict': { - 'id': 'v261036632ab', - 'ext': 'mp4', - 'title': 'катастрофа с камер видеонаблюдения', - 'uploader': 'rbc2008', - 'uploader_id': 'rbc08', - 'upload_date': '20131118', - 'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', - }, - }, - # real video id 57591 - { - 'url': 'http://smotri.com/video/view/?id=v57591cb20', - 'md5': '830266dfc21f077eac5afd1883091bcd', - 'info_dict': { - 'id': 'v57591cb20', - 'ext': 'flv', - 'title': 'test', - 'uploader': 'Support Photofile@photofile', - 'uploader_id': 'support-photofile', - 'upload_date': '20070704', - 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', - }, - }, - # video-password, not approved by moderator - { - 'url': 'http://smotri.com/video/view/?id=v1390466a13c', - 'md5': 'f6331cef33cad65a0815ee482a54440b', - 'info_dict': { - 'id': 'v1390466a13c', - 'ext': 'mp4', - 'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', - 'uploader': 'timoxa40', - 'uploader_id': 'timoxa40', - 'upload_date': '20100404', - 'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', - }, - 'params': { - 'videopassword': 'qwerty', - }, - 'skip': 'Video is not approved by moderator', - }, - # video-password - { - 'url': 'http://smotri.com/video/view/?id=v6984858774#', - 'md5': 'f11e01d13ac676370fc3b95b9bda11b0', - 'info_dict': { - 'id': 'v6984858774', - 'ext': 'mp4', - 'title': 'Дача Солженицина ПАРОЛЬ 223322', - 'uploader': 'psavari1', - 'uploader_id': 'psavari1', - 'upload_date': '20081103', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'videopassword': '223322', - }, - }, - # age limit + video-password, not approved by moderator - { - 'url': 'http://smotri.com/video/view/?id=v15408898bcf', - 'md5': '91e909c9f0521adf5ee86fbe073aad70', - 'info_dict': { - 'id': 'v15408898bcf', - 'ext': 'flv', - 'title': 'этот ролик не покажут по ТВ', - 'uploader': 'zzxxx', - 'uploader_id': 'ueggb', - 'upload_date': '20101001', - 'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', - 'age_limit': 18, - }, - 'params': { - 'videopassword': '333' - }, - 'skip': 'Video is not approved by moderator', - }, - # age limit + video-password - { - 'url': 'http://smotri.com/video/view/?id=v7780025814', - 'md5': 'b4599b068422559374a59300c5337d72', - 'info_dict': { - 'id': 'v7780025814', - 'ext': 'mp4', - 'title': 'Sexy Beach (пароль 123)', - 'uploader': 'вАся', - 'uploader_id': 'asya_prosto', - 'upload_date': '20081218', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - }, - 'params': { - 'videopassword': '123' - }, - }, - # swf player - { - 'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500', - 'md5': '31099eeb4bc906712c5f40092045108d', - 'info_dict': { - 'id': 'v9188090500', - 'ext': 'mp4', - 'title': 'Shakira - Don\'t Bother', - 'uploader': 'HannahL', - 'uploader_id': 'lisaha95', - 'upload_date': '20090331', - 'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg', - }, - }, - ] - - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r']src=(["\'])(?Phttp://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)', - webpage) - if mobj is not None: - return mobj.group('url') - - mobj = re.search( - r'''(?x)http://smotri\.com/video/download/file/[^<]+\s* - [^<]+\s* - (?P[^<]+)''', webpage) - if mobj is not None: - return 'http://smotri.com/video/view/?id=%s' % mobj.group('id') - - def _search_meta(self, name, html, display_name=None): - if display_name is None: - display_name = name - return self._html_search_meta(name, html, display_name) - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_form = { - 'ticket': video_id, - 'video_url': '1', - 'frame_url': '1', - 'devid': 'LoadupFlashPlayer', - 'getvideoinfo': '1', - } - - video_password = self._downloader.params.get('videopassword') - if video_password: - video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() - - video = self._download_json( - 'http://smotri.com/video/view/url/bot/', - video_id, 'Downloading video JSON', - data=urlencode_postdata(video_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - video_url = video.get('_vidURL') or video.get('_vidURL_mp4') - - if not video_url: - if video.get('_moderate_no'): - raise ExtractorError( - 'Video %s has not been approved by moderator' % video_id, expected=True) - - if video.get('error'): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - if video.get('_pass_protected') == 1: - msg = ('Invalid video password' if video_password - else 'This video is protected by a password, use the --video-password option') - raise ExtractorError(msg, expected=True) - - title = video['title'] - thumbnail = video.get('_imgURL') - upload_date = unified_strdate(video.get('added')) - uploader = video.get('userNick') - uploader_id = video.get('userLogin') - duration = int_or_none(video.get('duration')) - - # Video JSON does not provide enough meta data - # We will extract some from the video web page instead - webpage_url = 'http://smotri.com/video/view/?id=%s' % video_id - webpage = self._download_webpage(webpage_url, video_id, 'Downloading video page') - - # Warning if video is unavailable - warning = self._html_search_regex( - r']+class="videoUnModer"[^>]*>(.+?)', webpage, - 'warning message', default=None) - if warning is not None: - self._downloader.report_warning( - 'Video %s may not be available; smotri said: %s ' % - (video_id, warning)) - - # Adult content - if 'EroConfirmText">' in webpage: - self.report_age_confirmation() - confirm_string = self._html_search_regex( - r']+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id, - webpage, 'confirm string') - confirm_url = webpage_url + '&confirm=%s' % confirm_string - webpage = self._download_webpage( - confirm_url, video_id, - 'Downloading video page (age confirmed)') - adult_content = True - else: - adult_content = False - - view_count = self._html_search_regex( - r'(?s)Общее количество просмотров.*?(\d+)', - webpage, 'view count', fatal=False) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'uploader_id': uploader_id, - 'duration': duration, - 'view_count': int_or_none(view_count), - 'age_limit': 18 if adult_content else 0, - } - - -class SmotriCommunityIE(InfoExtractor): - IE_DESC = 'Smotri.com community videos' - IE_NAME = 'smotri:community' - _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P[0-9A-Za-z_\'-]+)' - _TEST = { - 'url': 'http://smotri.com/community/video/kommuna', - 'info_dict': { - 'id': 'kommuna', - }, - 'playlist_mincount': 4, - } - - def _real_extract(self, url): - community_id = self._match_id(url) - - rss = self._download_xml( - 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id, - community_id, 'Downloading community RSS') - - entries = [ - self.url_result(video_url.text, SmotriIE.ie_key()) - for video_url in rss.findall('./channel/item/link')] - - return self.playlist_result(entries, community_id) - - -class SmotriUserIE(InfoExtractor): - IE_DESC = 'Smotri.com user videos' - IE_NAME = 'smotri:user' - _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P[0-9A-Za-z_\'-]+)' - _TESTS = [{ - 'url': 'http://smotri.com/user/inspector', - 'info_dict': { - 'id': 'inspector', - 'title': 'Inspector', - }, - 'playlist_mincount': 9, - }] - - def _real_extract(self, url): - user_id = self._match_id(url) - - rss = self._download_xml( - 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id, - user_id, 'Downloading user RSS') - - entries = [self.url_result(video_url.text, 'Smotri') - for video_url in rss.findall('./channel/item/link')] - - description_text = xpath_text(rss, './channel/description') or '' - user_nickname = self._search_regex( - '^Видео режиссера (.+)$', description_text, - 'user nickname', fatal=False) - - return self.playlist_result(entries, user_id, user_nickname) - - -class SmotriBroadcastIE(InfoExtractor): - IE_DESC = 'Smotri.com broadcasts' - IE_NAME = 'smotri:broadcast' - _VALID_URL = r'https?://(?:www\.)?(?Psmotri\.com/live/(?P[^/]+))/?.*' - _NETRC_MACHINE = 'smotri' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - broadcast_id = mobj.group('id') - - broadcast_url = 'http://' + mobj.group('url') - broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') - - if re.search('>Режиссер с логином "%s" не существует<' % broadcast_id, broadcast_page) is not None: - raise ExtractorError( - 'Broadcast %s does not exist' % broadcast_id, expected=True) - - # Adult content - if re.search('EroConfirmText">', broadcast_page) is not None: - - (username, password) = self._get_login_info() - if username is None: - self.raise_login_required( - 'Erotic broadcasts allowed only for registered users') - - login_form = { - 'login-hint53': '1', - 'confirm_erotic': '1', - 'login': username, - 'password': password, - } - - request = sanitized_Request( - broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - broadcast_page = self._download_webpage( - request, broadcast_id, 'Logging in and confirming age') - - if '>Неверный логин или пароль<' in broadcast_page: - raise ExtractorError( - 'Unable to log in: bad username or password', expected=True) - - adult_content = True - else: - adult_content = False - - ticket = self._html_search_regex( - (r'data-user-file=(["\'])(?P(?!\1).+)\1', - r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P[^']+)'\)"), - broadcast_page, 'broadcast ticket', group='ticket') - - broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket - - broadcast_password = self._downloader.params.get('videopassword') - if broadcast_password: - broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() - - broadcast_json_page = self._download_webpage( - broadcast_url, broadcast_id, 'Downloading broadcast JSON') - - try: - broadcast_json = json.loads(broadcast_json_page) - - protected_broadcast = broadcast_json['_pass_protected'] == 1 - if protected_broadcast and not broadcast_password: - raise ExtractorError( - 'This broadcast is protected by a password, use the --video-password option', - expected=True) - - broadcast_offline = broadcast_json['is_play'] == 0 - if broadcast_offline: - raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True) - - rtmp_url = broadcast_json['_server'] - mobj = re.search(r'^rtmp://[^/]+/(?P.+)/?$', rtmp_url) - if not mobj: - raise ExtractorError('Unexpected broadcast rtmp URL') - - broadcast_playpath = broadcast_json['_streamName'] - broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) - broadcast_thumbnail = broadcast_json.get('_imgURL') - broadcast_title = self._live_title(broadcast_json['title']) - broadcast_description = broadcast_json.get('description') - broadcaster_nick = broadcast_json.get('nick') - broadcaster_login = broadcast_json.get('login') - rtmp_conn = 'S:%s' % uuid.uuid4().hex - except KeyError: - if protected_broadcast: - raise ExtractorError('Bad broadcast password', expected=True) - raise ExtractorError('Unexpected broadcast JSON') - - return { - 'id': broadcast_id, - 'url': rtmp_url, - 'title': broadcast_title, - 'thumbnail': broadcast_thumbnail, - 'description': broadcast_description, - 'uploader': broadcaster_nick, - 'uploader_id': broadcaster_login, - 'age_limit': 18 if adult_content else 0, - 'ext': 'flv', - 'play_path': broadcast_playpath, - 'player_url': 'http://pics.smotri.com/broadcast_play.swf', - 'app': broadcast_app, - 'rtmp_live': True, - 'rtmp_conn': rtmp_conn, - 'is_live': True, - } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index ff6be0b54..abb85e1e5 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -246,7 +246,12 @@ class SoundcloudIE(InfoExtractor): 'comment_count': int, 'repost_count': int, }, - } + }, + { + # with AAC HQ format available via OAuth token + 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', + 'only_matching': True, + }, ] _API_V2_BASE = 'https://api-v2.soundcloud.com/' @@ -350,6 +355,9 @@ class SoundcloudIE(InfoExtractor): format_id_list = [] if protocol: format_id_list.append(protocol) + ext = f.get('ext') + if ext == 'aac': + f['abr'] = '256' for k in ('ext', 'abr'): v = f.get(k) if v: @@ -360,9 +368,13 @@ class SoundcloudIE(InfoExtractor): abr = f.get('abr') if abr: f['abr'] = int(abr) + if protocol == 'hls': + protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' + else: + protocol = 'http' f.update({ 'format_id': '_'.join(format_id_list), - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'protocol': protocol, 'preference': -10 if preview else None, }) formats.append(f) @@ -546,8 +558,10 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): def _extract_playlist(self, base_url, playlist_id, playlist_title): + # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200. + # https://developers.soundcloud.com/blog/offset-pagination-deprecated COMMON_QUERY = { - 'limit': 2000000000, + 'limit': 200, 'linked_partitioning': '1', } diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 44d8fa52f..35ab9ec37 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -3,34 +3,47 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) from ..utils import ( - sanitized_Request, + float_or_none, + int_or_none, + merge_dicts, + str_or_none, str_to_int, - unified_strdate, + url_or_none, ) -from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pspankwire\.com/[^/]*/video(?P[0-9]+)/?)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?spankwire\.com/ + (?: + [^/]+/video| + EmbedPlayer\.aspx/?\?.*?\bArticleId= + ) + (?P\d+) + ''' _TESTS = [{ # download URL pattern: */P_K_.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'md5': '8bbfde12b101204b39e4b9fe7eb67095', + 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', 'info_dict': { 'id': '103545', 'ext': 'mp4', 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', 'description': 'Crazy Bitch X rated music video.', + 'duration': 222, 'uploader': 'oreusz', 'uploader_id': '124697', - 'upload_date': '20070507', + 'timestamp': 1178587885, + 'upload_date': '20070508', + 'average_rating': float, + 'view_count': int, + 'comment_count': int, 'age_limit': 18, - } + 'categories': list, + 'tags': list, + }, }, { # download URL pattern: */mp4__.mp4 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', @@ -45,83 +58,125 @@ class SpankwireIE(InfoExtractor): 'upload_date': '20150822', 'age_limit': 18, }, + 'params': { + 'proxy': '127.0.0.1:8118' + }, + 'skip': 'removed', + }, { + 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', + 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', + webpage) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - req = sanitized_Request('http://www.' + mobj.group('url')) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + video = self._download_json( + 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) - title = self._html_search_regex( - r'([^<]+)', webpage, 'title') - description = self._html_search_regex( - r'(?s)(.+?)', - webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', - webpage, 'thumbnail', fatal=False) - - uploader = self._html_search_regex( - r'by:\s*]*>(.+?)', - webpage, 'uploader', fatal=False) - uploader_id = self._html_search_regex( - r'by:\s* on (.+?) at \d+:\d+', - webpage, 'upload date', fatal=False)) - - view_count = str_to_int(self._html_search_regex( - r'([\d,\.]+) views', - webpage, 'view count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r']*>([\d,\.]+)', - webpage, 'comment count', fatal=False)) - - videos = re.findall( - r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) - heights = [int(video[0]) for video in videos] - video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos])) - if webpage.find(r'flashvars\.encrypted = "true"') != -1: - password = self._search_regex( - r'flashvars\.video_title = "([^"]+)', - webpage, 'password').replace('+', ' ') - video_urls = list(map( - lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), - video_urls)) + title = video['title'] formats = [] - for height, video_url in zip(heights, video_urls): - path = compat_urllib_parse_urlparse(video_url).path - m = re.search(r'/(?P\d+)[pP]_(?P\d+)[kK]', path) - if m: - tbr = int(m.group('tbr')) - height = int(m.group('height')) - else: - tbr = None - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height, - 'height': height, - 'tbr': tbr, + videos = video.get('videos') + if isinstance(videos, dict): + for format_id, format_url in videos.items(): + video_url = url_or_none(format_url) + if not format_url: + continue + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + m = re.search( + r'/(?P\d+)[pP]_(?P\d+)[kK]', video_url) + if m: + tbr = int(m.group('tbr')) + height = height or int(m.group('height')) + else: + tbr = None + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else format_id, + 'height': height, + 'tbr': tbr, + }) + m3u8_url = url_or_none(video.get('HLS')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id')) + + view_count = str_to_int(video.get('viewed')) + + thumbnails = [] + for preference, t in enumerate(('', '2x'), start=0): + thumbnail_url = url_or_none(video.get('poster%s' % t)) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'preference': preference, }) - self._sort_formats(formats) - age_limit = self._rta_search(webpage) + def extract_names(key): + entries_list = video.get(key) + if not isinstance(entries_list, list): + return + entries = [] + for entry in entries_list: + name = str_or_none(entry.get('name')) + if name: + entries.append(name) + return entries - return { + categories = extract_names('categories') + tags = extract_names('tags') + + uploader = None + info = {} + + webpage = self._download_webpage( + 'https://www.spankwire.com/_/video%s/' % video_id, video_id, + fatal=False) + if webpage: + info = self._search_json_ld(webpage, video_id, default={}) + thumbnail_url = None + if 'thumbnail' in info: + thumbnail_url = url_or_none(info['thumbnail']) + del info['thumbnail'] + if not thumbnail_url: + thumbnail_url = self._og_search_thumbnail(webpage) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'preference': 10, + }) + uploader = self._html_search_regex( + r'(?s)by\s*]+\bclass=["\']uploaded__by[^>]*>(.+?)', + webpage, 'uploader', fatal=False) + if not view_count: + view_count = str_to_int(self._search_regex( + r'data-views=["\']([\d,.]+)', webpage, 'view count', + fatal=False)) + + return merge_dicts({ 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': video.get('description'), + 'duration': int_or_none(video.get('duration')), + 'thumbnails': thumbnails, 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'uploader_id': str_or_none(video.get('userId')), + 'timestamp': int_or_none(video.get('time_approved_on')), + 'average_rating': float_or_none(video.get('rating')), 'view_count': view_count, - 'comment_count': comment_count, + 'comment_count': int_or_none(video.get('comments')), + 'age_limit': 18, + 'categories': categories, + 'tags': tags, 'formats': formats, - 'age_limit': age_limit, - } + }, info) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 4df7f4ddc..2da32b9b2 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -1,159 +1,54 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .spiegeltv import SpiegeltvIE -from ..compat import compat_urlparse -from ..utils import ( - parse_duration, - strip_or_none, - unified_timestamp, -) +from .jwplatform import JWPlatformIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': 'b57399839d055fccfeb9a0455c439868', + 'md5': '50c7948883ec85a3e431a0a44b7ad1d6', 'info_dict': { - 'id': '563747', + 'id': 'II0BUyxY', + 'display_id': '1259285', 'ext': 'mp4', - 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', + 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', - 'duration': 49, + 'duration': 48.0, 'upload_date': '20130311', - 'timestamp': 1362994320, + 'timestamp': 1362997920, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', - 'info_dict': { - 'id': '580988', - 'ext': 'mp4', - 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', - 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', - 'duration': 983, - 'upload_date': '20131115', - 'timestamp': 1384546642, - }, - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': '97b91083a672d72976faa8433430afb9', - 'info_dict': { - 'id': '601883', - 'ext': 'mp4', - 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', - 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', - 'upload_date': '20140904', - 'timestamp': 1409834160, - } - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, }, { - # nexx video + 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html', + 'only_matching': True, + }, { + 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7', + 'only_matching': True, + }, { 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id - handle = self._request_webpage(metadata_url, video_id) - - # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html - if SpiegeltvIE.suitable(handle.geturl()): - return self.url_result(handle.geturl(), 'Spiegeltv') - - video_data = self._parse_json(self._webpage_read_content( - handle, metadata_url, video_id), video_id) - title = video_data['title'] - nexx_id = video_data['nexxOmniaId'] - domain_id = video_data.get('nexxOmniaDomain') or '748' - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'nexx:%s:%s' % (domain_id, nexx_id), - 'title': title, - 'description': strip_or_none(video_data.get('teaser')), - 'duration': parse_duration(video_data.get('duration')), - 'timestamp': unified_timestamp(video_data.get('datum')), - 'ie_key': NexxIE.ie_key(), - } - - -class SpiegelArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P[0-9]+)\.html' - IE_NAME = 'Spiegel:Article' - IE_DESC = 'Articles on spiegel.de' - _TESTS = [{ + }, { 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', - 'info_dict': { - 'id': '1516455', - 'ext': 'mp4', - 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', - 'description': 're:^Patrick Kämnitz gehört.{100,}', - 'upload_date': '20140825', - }, - }, { - 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', - 'info_dict': { - - }, - 'playlist_count': 6, - }, { - # Nexx iFrame embed - 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', - 'info_dict': { - 'id': '161464', - 'ext': 'mp4', - 'title': 'Nervenkitzel Achterbahn', - 'alt_title': 'Karussellbauer in Deutschland', - 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, - 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2761, - 'timestamp': 1394021479, - 'upload_date': '20140305', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - - # Single video on top of the page - video_link = self._search_regex( - r'\s*.*?url\s*=\s*"([^"]+)"', - webpage) - entries = [ - self.url_result(compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', embed_path)) - for embed_path in embeds] - if embeds: - return self.playlist_result(entries) - - return self.playlist_from_matches( - NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) + media_id = self._html_search_regex( + r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P(?:(?!\2).)+)\2', + webpage, 'media id', group='id') + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': video_id, + 'url': 'jwplatform:%s' % media_id, + 'title': self._og_search_title(webpage, default=None), + 'ie_key': JWPlatformIE.ie_key(), + } diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py deleted file mode 100644 index 6ccf4c342..000000000 --- a/youtube_dl/extractor/spiegeltv.py +++ /dev/null @@ -1,17 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .nexx import NexxIE - - -class SpiegeltvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P\d+)' - _TEST = { - 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', - 'only_matching': True, - } - - def _real_extract(self, url): - return self.url_result( - 'https://api.nexx.cloud/v3/748/videos/byid/%s' - % self._match_id(url), ie=NexxIE.ie_key()) diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 7c11ea7aa..aabff7a3c 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -8,15 +8,10 @@ class BellatorIE(MTVServicesInfoExtractor): _TESTS = [{ 'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', 'info_dict': { - 'id': 'b55e434e-fde1-4a98-b7cc-92003a034de4', - 'ext': 'mp4', - 'title': 'Douglas Lima vs. Paul Daley - Round 1', - 'description': 'md5:805a8dd29310fd611d32baba2f767885', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'title': 'Michael Page vs. Evangelista Cyborg', + 'description': 'md5:0d917fc00ffd72dd92814963fc6cbb05', }, + 'playlist_count': 3, }, { 'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', 'only_matching': True, @@ -25,6 +20,9 @@ class BellatorIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] + def _extract_mgid(self, webpage): + return self._extract_triforce_mgid(webpage) + class ParamountNetworkIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' diff --git a/youtube_dl/extractor/spreaker.py b/youtube_dl/extractor/spreaker.py new file mode 100644 index 000000000..6c7e40ae4 --- /dev/null +++ b/youtube_dl/extractor/spreaker.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + str_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +def _extract_episode(data, episode_id=None): + title = data['title'] + download_url = data['download_url'] + + series = try_get(data, lambda x: x['show']['title'], compat_str) + uploader = try_get(data, lambda x: x['author']['fullname'], compat_str) + + thumbnails = [] + for image in ('image_original', 'image_medium', 'image'): + image_url = url_or_none(data.get('%s_url' % image)) + if image_url: + thumbnails.append({'url': image_url}) + + def stats(key): + return int_or_none(try_get( + data, + (lambda x: x['%ss_count' % key], + lambda x: x['stats']['%ss' % key]))) + + def duration(key): + return float_or_none(data.get(key), scale=1000) + + return { + 'id': compat_str(episode_id or data['episode_id']), + 'url': download_url, + 'display_id': data.get('permalink'), + 'title': title, + 'description': data.get('description'), + 'timestamp': unified_timestamp(data.get('published_at')), + 'uploader': uploader, + 'uploader_id': str_or_none(data.get('author_id')), + 'creator': uploader, + 'duration': duration('duration') or duration('length'), + 'view_count': stats('play'), + 'like_count': stats('like'), + 'comment_count': stats('message'), + 'format': 'MPEG Layer 3', + 'format_id': 'mp3', + 'container': 'mp3', + 'ext': 'mp3', + 'thumbnails': thumbnails, + 'series': series, + 'extractor_key': SpreakerIE.ie_key(), + } + + +class SpreakerIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + api\.spreaker\.com/ + (?: + (?:download/)?episode| + v2/episodes + )/ + (?P\d+) + ''' + _TESTS = [{ + 'url': 'https://api.spreaker.com/episode/12534508', + 'info_dict': { + 'id': '12534508', + 'display_id': 'swm-ep15-how-to-market-your-music-part-2', + 'ext': 'mp3', + 'title': 'EP:15 | Music Marketing (Likes) - Part 2', + 'description': 'md5:0588c43e27be46423e183076fa071177', + 'timestamp': 1502250336, + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': '9780658', + 'duration': 1063.42, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'series': 'Success With Music (SWM)', + }, + }, { + 'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3', + 'only_matching': True, + }, { + 'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + data = self._download_json( + 'https://api.spreaker.com/v2/episodes/%s' % episode_id, + episode_id)['response']['episode'] + return _extract_episode(data, episode_id) + + +class SpreakerPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + episode_id = self._search_regex( + (r'data-episode_id=["\'](?P\d+)', + r'episode_id\s*:\s*(?P\d+)'), webpage, 'episode id') + return self.url_result( + 'https://api.spreaker.com/episode/%s' % episode_id, + ie=SpreakerIE.ie_key(), video_id=episode_id) + + +class SpreakerShowIE(InfoExtractor): + _VALID_URL = r'https?://api\.spreaker\.com/show/(?P\d+)' + _TESTS = [{ + 'url': 'https://api.spreaker.com/show/4652058', + 'info_dict': { + 'id': '4652058', + }, + 'playlist_mincount': 118, + }] + + def _entries(self, show_id): + for page_num in itertools.count(1): + episodes = self._download_json( + 'https://api.spreaker.com/show/%s/episodes' % show_id, + show_id, note='Downloading JSON page %d' % page_num, query={ + 'page': page_num, + 'max_per_page': 100, + }) + pager = try_get(episodes, lambda x: x['response']['pager'], dict) + if not pager: + break + results = pager.get('results') + if not results or not isinstance(results, list): + break + for result in results: + if not isinstance(result, dict): + continue + yield _extract_episode(result) + if page_num == pager.get('last_page'): + break + + def _real_extract(self, url): + show_id = self._match_id(url) + return self.playlist_result(self._entries(show_id), playlist_id=show_id) + + +class SpreakerShowPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/success-with-music', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show_id = self._search_regex( + r'show_id\s*:\s*(?P\d+)', webpage, 'show id') + return self.url_result( + 'https://api.spreaker.com/show/%s' % show_id, + ie=SpreakerShowIE.ie_key(), video_id=show_id) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index 170dce87f..f63a1359a 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -114,7 +114,7 @@ class SRGSSRPlayIE(InfoExtractor): [^/]+/(?Pvideo|audio)/[^?]+| popup(?Pvideo|audio)player ) - \?id=(?P[0-9a-f\-]{36}|\d+) + \?.*?\b(?:id=|urn=urn:[^:]+:video:)(?P[0-9a-f\-]{36}|\d+) ''' _TESTS = [{ @@ -175,6 +175,12 @@ class SRGSSRPlayIE(InfoExtractor): }, { 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', 'only_matching': True, + }, { + 'url': 'https://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?urn=urn:srf:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'only_matching': True, + }, { + 'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index e12389cad..a0b6ef4db 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, dict_get, int_or_none, + unified_timestamp, str_or_none, strip_or_none, try_get, @@ -44,7 +45,8 @@ class SVTBaseIE(InfoExtractor): 'format_id': player_type, 'url': vurl, }) - if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + rights = try_get(video_info, lambda x: x['rights'], dict) or {} + if not formats and rights.get('geoBlockedSweden'): self.raise_geo_restricted( 'This video is only available in Sweden', countries=self._GEO_COUNTRIES) @@ -70,6 +72,7 @@ class SVTBaseIE(InfoExtractor): episode = video_info.get('episodeTitle') episode_number = int_or_none(video_info.get('episodeNumber')) + timestamp = unified_timestamp(rights.get('validFrom')) duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) age_limit = None adult = dict_get( @@ -84,6 +87,7 @@ class SVTBaseIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'duration': duration, + 'timestamp': timestamp, 'age_limit': age_limit, 'series': series, 'season_number': season_number, @@ -136,26 +140,39 @@ class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'''(?x) (?: - svt:(?P[^/?#&]+)| + (?: + svt:| + https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ + ) + (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) ) ''' _TESTS = [{ - 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', - 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', + 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': '5996901', + 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Flygplan till Haile Selassie', - 'duration': 3527, - 'thumbnail': r're:^https?://.*[\.-]jpg$', + 'title': 'Det här är himlen', + 'timestamp': 1586044800, + 'upload_date': '20200405', + 'duration': 3515, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { 'sv': [{ - 'ext': 'wsrt', + 'ext': 'vtt', }] }, }, + 'params': { + 'format': 'bestvideo', + # skip for now due to download test asserts that segment is > 10000 bytes and svt uses + # init segments that are smaller + # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B + 'skip_download': True, + }, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -172,6 +189,12 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'svt:14278044', 'only_matching': True, + }, { + 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', + 'only_matching': True, + }, { + 'url': 'svt:eWv5MLX', + 'only_matching': True, }] def _adjust_title(self, info): @@ -224,11 +247,22 @@ class SVTPlayIE(SVTPlayBaseIE): self._adjust_title(info_dict) return info_dict - svt_id = self._search_regex( - r']+data-video-id=["\']([\da-zA-Z-]+)', - webpage, 'video id') + svt_id = try_get( + data, lambda x: x['statistics']['dataLake']['content']['id'], + compat_str) - return self._extract_by_video_id(svt_id, webpage) + if not svt_id: + svt_id = self._search_regex( + (r']+data-video-id=["\']([\da-zA-Z-]+)', + r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', + r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), + webpage, 'video id') + + info_dict = self._extract_by_video_id(svt_id, webpage) + info_dict['thumbnail'] = thumbnail + + return info_dict class SVTSeriesIE(SVTPlayBaseIE): @@ -352,7 +386,7 @@ class SVTPageIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): path, display_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index c351b7545..8ceab7e35 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -86,7 +86,7 @@ class TagesschauPlayerIE(InfoExtractor): # return self._extract_via_api(kind, video_id) # JSON api does not provide some audio formats (e.g. ogg) thus - # extractiong audio via webpage + # extracting audio via webpage webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py index a75369dbe..6f264bddc 100644 --- a/youtube_dl/extractor/teachable.py +++ b/youtube_dl/extractor/teachable.py @@ -269,7 +269,7 @@ class TeachableCourseIE(TeachableBaseIE): r'(?s)(?P]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?)', webpage): li = mobj.group('li') - if 'fa-youtube-play' not in li: + if 'fa-youtube-play' not in li and not re.search(r'\d{1,2}:\d{2}', li): continue lecture_url = self._search_regex( r']+href=(["\'])(?P(?:(?!\1).)+)\1', li, diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 33a72083b..3e1a7a9e6 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -1,13 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .jwplatform import JWPlatformIE from .nexx import NexxIE from ..compat import compat_urlparse +from ..utils import ( + NO_DEFAULT, + smuggle_url, +) class Tele5IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P[^/?#&]+)' + _GEO_COUNTRIES = ['DE'] _TESTS = [{ 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416', 'info_dict': { @@ -20,6 +28,21 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # jwplatform, nexx unavailable + 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/', + 'info_dict': { + 'id': 'WJuiOlUp', + 'ext': 'mp4', + 'upload_date': '20200603', + 'timestamp': 1591214400, + 'title': 'Ghoul - Das Geheimnis des Friedhofmonsters', + 'description': 'md5:42002af1d887ff3d5b2b3ca1f8137d97', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [JWPlatformIE.ie_key()], }, { 'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', 'only_matching': True, @@ -44,14 +67,42 @@ class Tele5IE(InfoExtractor): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - if not video_id: + NEXX_ID_RE = r'\d{6,}' + JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' + + def nexx_result(nexx_id): + return self.url_result( + 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, + ie=NexxIE.ie_key(), video_id=nexx_id) + + nexx_id = jwplatform_id = None + + if video_id: + if re.match(NEXX_ID_RE, video_id): + return nexx_result(video_id) + elif re.match(JWPLATFORM_ID_RE, video_id): + jwplatform_id = video_id + + if not nexx_id: display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)', - r'\s+id\s*=\s*["\']player_(\d{6,})', - r'\bdata-id\s*=\s*["\'](\d{6,})'), webpage, 'video id') + + def extract_id(pattern, name, default=NO_DEFAULT): + return self._html_search_regex( + (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, + r'\s+id\s*=\s*["\']player_(%s)' % pattern, + r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, + default=default) + + nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) + if nexx_id: + return nexx_result(nexx_id) + + if not jwplatform_id: + jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id, - ie=NexxIE.ie_key(), video_id=video_id) + smuggle_url( + 'jwplatform:%s' % jwplatform_id, + {'geo_countries': self._GEO_COUNTRIES}), + ie=JWPlatformIE.ie_key(), video_id=jwplatform_id) diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index c82c94b3a..800d87b70 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -12,13 +12,14 @@ from ..utils import ( class TeleQuebecBaseIE(InfoExtractor): + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + @staticmethod - def _limelight_result(media_id): + def _brightcove_result(brightcove_id, player_id, account_id='6150020952001'): return { '_type': 'url_transparent', - 'url': smuggle_url( - 'limelight:media:' + media_id, {'geo_countries': ['CA']}), - 'ie_key': 'LimelightMedia', + 'url': smuggle_url(TeleQuebecBaseIE.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, brightcove_id), {'geo_countries': ['CA']}), + 'ie_key': 'BrightcoveNew', } @@ -34,14 +35,33 @@ class TeleQuebecIE(TeleQuebecBaseIE): # available till 01.01.2023 'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane', 'info_dict': { - 'id': '577116881b4b439084e6b1cf4ef8b1b3', + 'id': '6155972771001', 'ext': 'mp4', 'title': 'Un petit choc et puis repart!', 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', + 'timestamp': 1589262469, + 'uploader_id': '6150020952001', + 'upload_date': '20200512', }, 'params': { - 'skip_download': True, + 'format': 'bestvideo', }, + 'add_ie': ['BrightcoveNew'], + }, { + 'url': 'https://zonevideo.telequebec.tv/media/55267/le-soleil/passe-partout', + 'info_dict': { + 'id': '6167180337001', + 'ext': 'mp4', + 'title': 'Le soleil', + 'description': 'md5:64289c922a8de2abbe99c354daffde02', + 'uploader_id': '6150020952001', + 'upload_date': '20200625', + 'timestamp': 1593090307, + }, + 'params': { + 'format': 'bestvideo', + }, + 'add_ie': ['BrightcoveNew'], }, { # no description 'url': 'http://zonevideo.telequebec.tv/media/30261', @@ -53,18 +73,20 @@ class TeleQuebecIE(TeleQuebecBaseIE): def _real_extract(self, url): media_id = self._match_id(url) - - media_data = self._download_json( - 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, + media = self._download_json( + 'https://mnmedias.api.telequebec.tv/api/v3/media/' + media_id, media_id)['media'] - - info = self._limelight_result(media_data['streamInfo']['sourceId']) + source_id = next(source_info['sourceId'] for source_info in media['streamInfos'] if source_info.get('source') == 'Brightcove') + info = self._brightcove_result(source_id, '22gPKdt7f') + product = media.get('product') or {} + season = product.get('season') or {} info.update({ - 'title': media_data.get('title'), - 'description': try_get( - media_data, lambda x: x['descriptions'][0]['text'], compat_str), - 'duration': int_or_none( - media_data.get('durationInMilliseconds'), 1000), + 'description': try_get(media, lambda x: x['descriptions'][-1]['text'], compat_str), + 'series': try_get(season, lambda x: x['serie']['titre']), + 'season': season.get('name'), + 'season_number': int_or_none(season.get('seasonNo')), + 'episode': product.get('titre'), + 'episode_number': int_or_none(product.get('episodeNo')), }) return info @@ -115,7 +137,7 @@ class TeleQuebecSquatIE(InfoExtractor): } -class TeleQuebecEmissionIE(TeleQuebecBaseIE): +class TeleQuebecEmissionIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: @@ -127,15 +149,16 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE): _TESTS = [{ 'url': 'http://lindicemcsween.telequebec.tv/emissions/100430013/des-soins-esthetiques-a-377-d-interets-annuels-ca-vous-tente', 'info_dict': { - 'id': '66648a6aef914fe3badda25e81a4d50a', + 'id': '6154476028001', 'ext': 'mp4', - 'title': "Des soins esthétiques à 377 % d'intérêts annuels, ça vous tente?", - 'description': 'md5:369e0d55d0083f1fc9b71ffb640ea014', - 'upload_date': '20171024', - 'timestamp': 1508862118, + 'title': 'Des soins esthétiques à 377 % d’intérêts annuels, ça vous tente?', + 'description': 'md5:cb4d378e073fae6cce1f87c00f84ae9f', + 'upload_date': '20200505', + 'timestamp': 1588713424, + 'uploader_id': '6150020952001', }, 'params': { - 'skip_download': True, + 'format': 'bestvideo', }, }, { 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', @@ -154,26 +177,26 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE): webpage = self._download_webpage(url, display_id) media_id = self._search_regex( - r'mediaUID\s*:\s*["\'][Ll]imelight_(?P[a-z0-9]{32})', webpage, - 'limelight id') + r'mediaId\s*:\s*(?P\d+)', webpage, 'media id') - info = self._limelight_result(media_id) - info.update({ - 'title': self._og_search_title(webpage, default=None), - 'description': self._og_search_description(webpage, default=None), - }) - return info + return self.url_result( + 'http://zonevideo.telequebec.tv/media/' + media_id, + TeleQuebecIE.ie_key()) -class TeleQuebecLiveIE(InfoExtractor): +class TeleQuebecLiveIE(TeleQuebecBaseIE): _VALID_URL = r'https?://zonevideo\.telequebec\.tv/(?Pendirect)' _TEST = { 'url': 'http://zonevideo.telequebec.tv/endirect/', 'info_dict': { - 'id': 'endirect', + 'id': '6159095684001', 'ext': 'mp4', - 'title': 're:^Télé-Québec - En direct [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^Télé-Québec [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'description': 'Canal principal de Télé-Québec', + 'uploader_id': '6150020952001', + 'timestamp': 1590439901, + 'upload_date': '20200525', }, 'params': { 'skip_download': True, @@ -181,25 +204,49 @@ class TeleQuebecLiveIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + return self._brightcove_result('6159095684001', 'skCsmi2Uw') - m3u8_url = None - webpage = self._download_webpage( - 'https://player.telequebec.tv/Tq_VideoPlayer.js', video_id, - fatal=False) - if webpage: - m3u8_url = self._search_regex( - r'm3U8Url\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'm3u8 url', default=None, group='url') - if not m3u8_url: - m3u8_url = 'https://teleqmmd.mmdlive.lldns.net/teleqmmd/f386e3b206814e1f8c8c1c71c0f8e748/manifest.m3u8' - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id='hls') - self._sort_formats(formats) - return { - 'id': video_id, - 'title': self._live_title('Télé-Québec - En direct'), - 'is_live': True, - 'formats': formats, - } +class TeleQuebecVideoIE(TeleQuebecBaseIE): + _VALID_URL = r'https?://video\.telequebec\.tv/player(?:-live)?/(?P