diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index e2cbaee1c..ea0a59dca 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.21.1 + [debug] youtube-dl version 2021.02.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index e531e89c0..d24855c72 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 686878c56..8b96a2883 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 17b853716..e46971047 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.11.21.1 + [debug] youtube-dl version 2021.02.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 109c47925..a9ca379ca 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.11.21.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e69b907d8..892cea0a3 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -7,8 +7,10 @@ --- ### Before submitting a *pull request* make sure you have: -- [ ] At least skimmed through [adding new extractor tutorial](https://github.com/ytdl-org/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/ytdl-org/youtube-dl#youtube-dl-coding-conventions) sections - [ ] [Searched](https://github.com/ytdl-org/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests +- [ ] Read [adding new extractor tutorial](https://github.com/ytdl-org/youtube-dl#adding-support-for-a-new-site) +- [ ] Read [youtube-dl coding conventions](https://github.com/ytdl-org/youtube-dl#youtube-dl-coding-conventions) and adjusted the code to meet them +- [ ] Covered the code with tests (note that PRs without tests will be REJECTED) - [ ] Checked the code with [flake8](https://pypi.python.org/pypi/flake8) ### In order to be accepted and merged into youtube-dl each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). Check one of the following options: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..a9dc47a71 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,74 @@ +name: CI +on: [push, pull_request] +jobs: + tests: + name: Tests + runs-on: ${{ matrix.os }} + strategy: + fail-fast: true + matrix: + os: [ubuntu-18.04] + # TODO: python 2.6 + python-version: [2.7, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, pypy-2.7, pypy-3.6, pypy-3.7] + python-impl: [cpython] + ytdl-test-set: [core, download] + run-tests-ext: [sh] + include: + # python 3.2 is only available on windows via setup-python + - os: windows-latest + python-version: 3.2 + python-impl: cpython + ytdl-test-set: core + run-tests-ext: bat + - os: windows-latest + python-version: 3.2 + python-impl: cpython + ytdl-test-set: download + run-tests-ext: bat + # jython + - os: ubuntu-18.04 + python-impl: jython + ytdl-test-set: core + run-tests-ext: sh + - os: ubuntu-18.04 + python-impl: jython + ytdl-test-set: download + run-tests-ext: sh + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + if: ${{ matrix.python-impl == 'cpython' }} + with: + python-version: ${{ matrix.python-version }} + - name: Set up Java 8 + if: ${{ matrix.python-impl == 'jython' }} + uses: actions/setup-java@v1 + with: + java-version: 8 + - name: Install Jython + if: ${{ matrix.python-impl == 'jython' }} + run: | + wget http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar + java -jar jython-installer.jar -s -d "$HOME/jython" + echo "$HOME/jython/bin" >> $GITHUB_PATH + - name: Install nose + run: pip install nose + - name: Run tests + continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} + env: + YTDL_TEST_SET: ${{ matrix.ytdl-test-set }} + run: ./devscripts/run_tests.${{ matrix.run-tests-ext }} + flake8: + name: Linter + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: Install flake8 + run: pip install flake8 + - name: Run flake8 + run: flake8 . diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 51afd469a..000000000 --- a/.travis.yml +++ /dev/null @@ -1,50 +0,0 @@ -language: python -python: - - "2.6" - - "2.7" - - "3.2" - - "3.3" - - "3.4" - - "3.5" - - "3.6" - - "pypy" - - "pypy3" -dist: trusty -env: - - YTDL_TEST_SET=core - - YTDL_TEST_SET=download -jobs: - include: - - python: 3.7 - dist: xenial - env: YTDL_TEST_SET=core - - python: 3.7 - dist: xenial - env: YTDL_TEST_SET=download - - python: 3.8 - dist: xenial - env: YTDL_TEST_SET=core - - python: 3.8 - dist: xenial - env: YTDL_TEST_SET=download - - python: 3.8-dev - dist: xenial - env: YTDL_TEST_SET=core - - python: 3.8-dev - dist: xenial - env: YTDL_TEST_SET=download - - env: JYTHON=true; YTDL_TEST_SET=core - - env: JYTHON=true; YTDL_TEST_SET=download - - name: flake8 - python: 3.8 - dist: xenial - install: pip install flake8 - script: flake8 . - fast_finish: true - allow_failures: - - env: YTDL_TEST_SET=download - - env: JYTHON=true; YTDL_TEST_SET=core - - env: JYTHON=true; YTDL_TEST_SET=download -before_install: - - if [ "$JYTHON" == "true" ]; then ./devscripts/install_jython.sh; export PATH="$HOME/jython/bin:$PATH"; fi -script: ./devscripts/run_tests.sh diff --git a/AUTHORS b/AUTHORS index b507cb8df..4a6d7dacd 100644 --- a/AUTHORS +++ b/AUTHORS @@ -246,3 +246,4 @@ Enes Solak Nathan Rossi Thomas van der Berg Luca Cherubin +Adrian Heine \ No newline at end of file diff --git a/ChangeLog b/ChangeLog index 5a08b3a66..384bd19c2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,488 @@ +version 2021.02.10 + +Extractors +* [youtube:tab] Improve grid continuation extraction (#28130) +* [ign] Fix extraction (#24771) ++ [xhamster] Extract format filesize ++ [xhamster] Extract formats from xplayer settings (#28114) ++ [youtube] Add support phone/tablet JS player (#26424) +* [archiveorg] Fix and improve extraction (#21330, #23586, #25277, #26780, + #27109, #27236, #28063) ++ [cda] Detect geo restricted videos (#28106) +* [urplay] Fix extraction (#28073, #28074) +* [youtube] Fix release date extraction (#28094) ++ [youtube] Extract abr and vbr (#28100) +* [youtube] Skip OTF formats (#28070) + + +version 2021.02.04.1 + +Extractors +* [youtube] Prefer DASH formats (#28070) +* [azmedien] Fix extraction (#28064) + + +version 2021.02.04 + +Extractors +* [pornhub] Implement lazy playlist extraction +* [svtplay] Fix video id extraction (#28058) ++ [pornhub] Add support for authentication (#18797, #21416, #24294) +* [pornhub:user] Improve paging ++ [pornhub:user] Add support for URLs unavailable via /videos page (#27853) ++ [bravotv] Add support for oxygen.com (#13357, #22500) ++ [youtube] Pass embed URL to get_video_info request +* [ccma] Improve metadata extraction (#27994) + + Extract age limit, alt title, categories, series and episode number + * Fix timestamp multiple subtitles extraction +* [egghead] Update API domain (#28038) +- [vidzi] Remove extractor (#12629) +* [vidio] Improve metadata extraction +* [youtube] Improve subtitles extraction +* [youtube] Fix chapter extraction fallback +* [youtube] Rewrite extractor + * Improve format sorting + * Remove unused code + * Fix series metadata extraction + * Fix trailer video extraction + * Improve error reporting + + Extract video location ++ [vvvvid] Add support for youtube embeds (#27825) +* [googledrive] Report download page errors (#28005) +* [vlive] Fix error message decoding for python 2 (#28004) +* [youtube] Improve DASH formats file size extraction +* [cda] Improve birth validation detection (#14022, #27929) ++ [awaan] Extract uploader id (#27963) ++ [medialaan] Add support DPG Media MyChannels based websites (#14871, #15597, + #16106, #16489) +* [abcnews] Fix extraction (#12394, #27920) +* [AMP] Fix upload date and timestamp extraction (#27970) +* [tv4] Relax URL regular expression (#27964) ++ [tv2] Add support for mtvuutiset.fi (#27744) +* [adn] Improve login warning reporting +* [zype] Fix uplynk id extraction (#27956) ++ [adn] Add support for authentication (#17091, #27841, #27937) + + +version 2021.01.24.1 + +Core +* Introduce --output-na-placeholder (#27896) + +Extractors +* [franceculture] Make thumbnail optional (#18807) +* [franceculture] Fix extraction (#27891, #27903) +* [njpwworld] Fix extraction (#27890) +* [comedycentral] Fix extraction (#27905) +* [wat] Fix format extraction (#27901) ++ [americastestkitchen:season] Add support for seasons (#27861) ++ [trovo] Add support for trovo.live (#26125) ++ [aol] Add support for yahoo videos (#26650) +* [yahoo] Fix single video extraction +* [lbry] Unescape lbry URI (#27872) +* [9gag] Fix and improve extraction (#23022) +* [americastestkitchen] Improve metadata extraction for ATK episodes (#27860) +* [aljazeera] Fix extraction (#20911, #27779) ++ [minds] Add support for minds.com (#17934) +* [ard] Fix title and description extraction (#27761) ++ [spotify] Add support for Spotify Podcasts (#27443) + + +version 2021.01.16 + +Core +* [YoutubeDL] Protect from infinite recursion due to recursively nested + playlists (#27833) +* [YoutubeDL] Ignore failure to create existing directory (#27811) +* [YoutubeDL] Raise syntax error for format selection expressions with multiple + + operators (#27803) + +Extractors ++ [animeondemand] Add support for lazy playlist extraction (#27829) +* [youporn] Restrict fallback download URL (#27822) +* [youporn] Improve height and tbr extraction (#20425, #23659) +* [youporn] Fix extraction (#27822) ++ [twitter] Add support for unified cards (#27826) ++ [twitch] Add Authorization header with OAuth token for GraphQL requests + (#27790) +* [mixcloud:playlist:base] Extract video id in flat playlist mode (#27787) +* [cspan] Improve info extraction (#27791) +* [adn] Improve info extraction +* [adn] Fix extraction (#26963, #27732) +* [youtube:search] Extract from all sections (#27604) +* [youtube:search] fix viewcount and try to extract all video sections (#27604) +* [twitch] Improve login error extraction +* [twitch] Fix authentication (#27743) +* [3qsdn] Improve extraction (#21058) +* [peertube] Extract formats from streamingPlaylists (#26002, #27586, #27728) +* [khanacademy] Fix extraction (#2887, #26803) +* [spike] Update Paramount Network feed URL (#27715) + + +version 2021.01.08 + +Core +* [downloader/hls] Disable decryption in tests (#27660) ++ [utils] Add a function to clean podcast URLs + +Extractors +* [rai] Improve subtitles extraction (#27698, #27705) +* [canvas] Match only supported VRT NU URLs (#27707) ++ [bibeltv] Add support for bibeltv.de (#14361) ++ [bfmtv] Add support for bfmtv.com (#16053, #26615) ++ [sbs] Add support for ondemand play and news embed URLs (#17650, #27629) +* [twitch] Drop legacy kraken API v5 code altogether and refactor +* [twitch:vod] Switch to GraphQL for video metadata +* [canvas] Fix VRT NU extraction (#26957, #27053) +* [twitch] Switch access token to GraphQL and refactor (#27646) ++ [rai] Detect ContentItem in iframe (#12652, #27673) +* [ketnet] Fix extraction (#27662) ++ [dplay] Add suport Discovery+ domains (#27680) +* [motherless] Improve extraction (#26495, #27450) +* [motherless] Fix recent videos upload date extraction (#27661) +* [nrk] Fix extraction for videos without a legalAge rating +- [googleplus] Remove extractor (#4955, #7400) ++ [applepodcasts] Add support for podcasts.apple.com (#25918) ++ [googlepodcasts] Add support for podcasts.google.com ++ [iheart] Add support for iheart.com (#27037) +* [acast] Clean podcast URLs +* [stitcher] Clean podcast URLs ++ [xfileshare] Add support for aparat.cam (#27651) ++ [twitter] Add support for summary card (#25121) +* [twitter] Try to use a Generic fallback for unknown twitter cards (#25982) ++ [stitcher] Add support for shows and show metadata extraction (#20510) +* [stv] Improve episode id extraction (#23083) + + +version 2021.01.03 + +Extractors +* [nrk] Improve series metadata extraction (#27473) ++ [nrk] Extract subtitles +* [nrk] Fix age limit extraction +* [nrk] Improve video id extraction ++ [nrk] Add support for podcasts (#27634, #27635) +* [nrk] Generalize and delegate all item extractors to nrk ++ [nrk] Add support for mp3 formats +* [nrktv] Switch to playback endpoint +* [vvvvid] Fix season metadata extraction (#18130) +* [stitcher] Fix extraction (#20811, #27606) +* [acast] Fix extraction (#21444, #27612, #27613) ++ [arcpublishing] Add support for arcpublishing.com (#2298, #9340, #17200) ++ [sky] Add support for Sports News articles and Brighcove videos (#13054) ++ [vvvvid] Extract akamai formats +* [vvvvid] Skip unplayable episodes (#27599) +* [yandexvideo] Fix extraction for Python 3.4 + + +version 2020.12.31 + +Core +* [utils] Accept only supported protocols in url_or_none +* [YoutubeDL] Allow format filtering using audio language (#16209) + +Extractors ++ [redditr] Extract all thumbnails (#27503) +* [vvvvid] Improve info extraction ++ [vvvvid] Add support for playlists (#18130, #27574) ++ [yandexdisk] Extract info from webpage +* [yandexdisk] Fix extraction (#17861, #27131) +* [yandexvideo] Use old API call as fallback +* [yandexvideo] Fix extraction (#25000) +- [nbc] Remove CSNNE extractor +* [nbc] Fix NBCSport VPlayer URL extraction (#16640) ++ [aenetworks] Add support for biography.com (#3863) +* [uktvplay] Match new video URLs (#17909) +* [sevenplay] Detect API errors +* [tenplay] Fix format extraction (#26653) +* [brightcove] Raise error for DRM protected videos (#23467, #27568) + + +version 2020.12.29 + +Extractors +* [youtube] Improve yt initial data extraction (#27524) +* [youtube:tab] Improve URL matching #27559) +* [youtube:tab] Restore retry on browse requests (#27313, #27564) +* [aparat] Fix extraction (#22285, #22611, #23348, #24354, #24591, #24904, + #25418, #26070, #26350, #26738, #27563) +- [brightcove] Remove sonyliv specific code +* [piksel] Improve format extraction ++ [zype] Add support for uplynk videos ++ [toggle] Add support for live.mewatch.sg (#27555) ++ [go] Add support for fxnow.fxnetworks.com (#13972, #22467, #23754, #26826) +* [teachable] Improve embed detection (#26923) +* [mitele] Fix free video extraction (#24624, #25827, #26757) +* [telecinco] Fix extraction +* [youtube] Update invidious.snopyta.org (#22667) +* [amcnetworks] Improve auth only video detection (#27548) ++ [generic] Add support for VHX Embeds (#27546) + + +version 2020.12.26 + +Extractors +* [instagram] Fix comment count extraction ++ [instagram] Add support for reel URLs (#26234, #26250) +* [bbc] Switch to media selector v6 (#23232, #23933, #26303, #26432, #26821, + #27538) +* [instagram] Improve thumbnail extraction +* [instagram] Fix extraction when authenticated (#22880, #26377, #26981, + #27422) +* [spankbang:playlist] Fix extraction (#24087) ++ [spankbang] Add support for playlist videos +* [pornhub] Improve like and dislike count extraction (#27356) +* [pornhub] Fix lq formats extraction (#27386, #27393) ++ [bongacams] Add support for bongacams.com (#27440) +* [youtube:tab] Extend URL regular expression (#27501) +* [theweatherchannel] Fix extraction (#25930, #26051) ++ [sprout] Add support for Universal Kids (#22518) +* [theplatform] Allow passing geo bypass countries from other extractors ++ [wistia] Add support for playlists (#27533) ++ [ctv] Add support for ctv.ca (#27525) +* [9c9media] Improve info extraction +* [youtube] Fix automatic captions extraction (#27162, #27388) +* [sonyliv] Fix title for movies +* [sonyliv] Fix extraction (#25667) +* [streetvoice] Fix extraction (#27455, #27492) ++ [facebook] Add support for watchparty pages (#27507) +* [cbslocal] Fix video extraction ++ [brightcove] Add another method to extract policyKey +* [mewatch] Relax URL regular expression (#27506) + + +version 2020.12.22 + +Core +* [common] Remove unwanted query params from unsigned akamai manifest URLs + +Extractors +- [tastytrade] Remove extractor (#25716) +* [niconico] Fix playlist extraction (#27428) +- [everyonesmixtape] Remove extractor +- [kanalplay] Remove extractor +* [arkena] Fix extraction +* [nba] Rewrite extractor +* [turner] Improve info extraction +* [youtube] Improve xsrf token extraction (#27442) +* [generic] Improve RSS age limit extraction +* [generic] Fix RSS itunes thumbnail extraction (#27405) ++ [redditr] Extract duration (#27426) +- [zaq1] Remove extractor ++ [asiancrush] Add support for retrocrush.tv +* [asiancrush] Fix extraction +- [noco] Remove extractor (#10864) +* [nfl] Fix extraction (#22245) +* [skysports] Relax URL regular expression (#27435) ++ [tv5unis] Add support for tv5unis.ca (#22399, #24890) ++ [videomore] Add support for more.tv (#27088) ++ [yandexmusic] Add support for music.yandex.com (#27425) ++ [nhk:program] Add support for audio programs and program clips ++ [nhk] Add support for NHK video programs (#27230) + + +version 2020.12.14 + +Core +* [extractor/common] Improve JSON-LD interaction statistic extraction (#23306) +* [downloader/hls] Delegate manifests with media initialization to ffmpeg ++ [extractor/common] Document duration meta field for playlists + +Extractors +* [mdr] Bypass geo restriction +* [mdr] Improve extraction (#24346, #26873) +* [yandexmusic:album] Improve album title extraction (#27418) +* [eporner] Fix view count extraction and make optional (#23306) ++ [eporner] Extend URL regular expression +* [eporner] Fix hash extraction and extend _VALID_URL (#27396) +* [slideslive] Use m3u8 entry protocol for m3u8 formats (#27400) +* [twitcasting] Fix format extraction and improve info extraction (#24868) +* [linuxacademy] Fix authentication and extraction (#21129, #26223, #27402) +* [itv] Clean description from HTML tags (#27399) +* [vlive] Sort live formats (#27404) +* [hotstart] Fix and improve extraction + * Fix format extraction (#26690) + + Extract thumbnail URL (#16079, #20412) + + Add support for country specific playlist URLs (#23496) + * Select the last id in video URL (#26412) ++ [youtube] Add some invidious instances (#27373) + + +version 2020.12.12 + +Core +* [YoutubeDL] Improve thumbnail filename deducing (#26010, #27244) + +Extractors ++ [ruutu] Extract more metadata ++ [ruutu] Detect non-free videos (#21154) +* [ruutu] Authenticate format URLs (#21031, #26782) ++ [ruutu] Add support for static.nelonenmedia.fi (#25412) ++ [ruutu] Extend URL regular expression (#24839) ++ [facebook] Add support archived live video URLs (#15859) +* [wdr] Improve overall extraction ++ [wdr] Extend subtitles extraction (#22672, #22723) ++ [facebook] Add support for videos attached to Relay based story pages + (#10795) ++ [wdr:page] Add support for kinder.wdr.de (#27350) ++ [facebook] Add another regular expression for handleServerJS +* [facebook] Fix embed page extraction ++ [facebook] Add support for Relay post pages (#26935) ++ [facebook] Add support for watch videos (#22795, #27062) ++ [facebook] Add support for group posts with multiple videos (#19131) +* [itv] Fix series metadata extraction (#26897) +- [itv] Remove old extraction method (#23177) +* [facebook] Redirect mobile URLs to desktop URLs (#24831, #25624) ++ [facebook] Add support for Relay based pages (#26823) +* [facebook] Try to reduce unnecessary tahoe requests +- [facebook] Remove hardcoded Chrome User-Agent (#18974, #25411, #26958, + #27329) +- [smotri] Remove extractor (#27358) +- [beampro] Remove extractor (#17290, #22871, #23020, #23061, #26099) + + +version 2020.12.09 + +Core +* [extractor/common] Fix inline HTML5 media tags processing (#27345) + +Extractors +* [youtube:tab] Improve identity token extraction (#27197) +* [youtube:tab] Make click tracking params on continuation optional +* [youtube:tab] Delegate inline playlists to tab-based playlists (27298) ++ [tubitv] Extract release year (#27317) +* [amcnetworks] Fix free content extraction (#20354) ++ [lbry:channel] Add support for channels (#25584) ++ [lbry] Add support for short and embed URLs +* [lbry] Fix channel metadata extraction ++ [telequebec] Add support for video.telequebec.tv (#27339) +* [telequebec] Fix extraction (#25733, #26883) ++ [youtube:tab] Capture and output alerts (#27340) +* [tvplay:home] Fix extraction (#21153) +* [americastestkitchen] Fix Extraction and add support + for Cook's Country and Cook's Illustrated (#17234, #27322) ++ [slideslive] Add support for yoda service videos and extract subtitles + (#27323) + + +version 2020.12.07 + +Core +* [extractor/common] Extract timestamp from Last-Modified header ++ [extractor/common] Add support for dl8-* media tags (#27283) +* [extractor/common] Fix media type extraction for HTML5 media tags + in start/end form + +Extractors +* [aenetworks] Fix extraction (#23363, #23390, #26795, #26985) + * Fix Fastly format extraction + + Add support for play and watch subdomains + + Extract series metadata +* [youtube] Improve youtu.be extraction in non-existing playlists (#27324) ++ [generic] Extract RSS video description, timestamp and itunes metadata + (#27177) +* [nrk] Reduce the number of instalments and episodes requests +* [nrk] Improve extraction + * Improve format extraction for old akamai formats + + Add is_live value to entry info dict + * Request instalments only when available + * Fix skole extraction ++ [peertube] Extract fps ++ [peertube] Recognize audio-only formats (#27295) + + +version 2020.12.05 + +Core +* [extractor/common] Improve Akamai HTTP format extraction + * Allow m3u8 manifest without an additional audio format + * Fix extraction for qualities starting with a number + +Extractors +* [teachable:course] Improve extraction (#24507, #27286) +* [nrk] Improve error extraction +* [nrktv:series] Improve extraction (#21926) +* [nrktv:season] Improve extraction +* [nrk] Improve format extraction and geo-restriction detection (#24221) +* [pornhub] Handle HTTP errors gracefully (#26414) +* [nrktv] Relax URL regular expression (#27299, #26185) ++ [zdf] Extract webm formats (#26659) ++ [gamespot] Extract DASH and HTTP formats ++ [tver] Add support for tver.jp (#26662, #27284) ++ [pornhub] Add support for pornhub.org (#27276) + + +version 2020.12.02 + +Extractors ++ [tva] Add support for qub.ca (#27235) ++ [toggle] Detect DRM protected videos (#16479, #20805) ++ [toggle] Add support for new MeWatch URLs (#27256) +* [youtube:tab] Extract channels only from channels tab (#27266) ++ [cspan] Extract info from jwplayer data (#3672, #3734, #10638, #13030, + #18806, #23148, #24461, #26171, #26800, #27263) +* [cspan] Pass Referer header with format's video URL (#26032, #25729) +* [youtube] Improve age-gated videos extraction (#27259) ++ [mediaset] Add support for movie URLs (#27240) +* [yandexmusic] Refactor ++ [yandexmusic] Add support for artist's tracks and albums (#11887, #22284) +* [yandexmusic:track] Fix extraction (#26449, #26669, #26747, #26748, #26762) + + +version 2020.11.29 + +Core +* [YoutubeDL] Write static debug to stderr and respect quiet for dynamic debug + (#14579, #22593) + +Extractors +* [drtv] Extend URL regular expression (#27243) +* [tiktok] Fix extraction (#20809, #22838, #22850, #25987, #26281, #26411, + #26639, #26776, #27237) ++ [ina] Add support for mobile URLs (#27229) +* [pornhub] Fix like and dislike count extraction (#27227, #27234) +* [youtube] Improve yt initial player response extraction (#27216) +* [videa] Fix extraction (#25650, #25973, #26301) + + +version 2020.11.26 + +Core +* [downloader/fragment] Set final file's mtime according to last fragment's + Last-Modified header (#11718, #18384, #27138) + +Extractors ++ [spreaker] Add support for spreaker.com (#13480, #13877) +* [vlive] Improve extraction for geo-restricted videos ++ [vlive] Add support for post URLs (#27122, #27123) +* [viki] Fix video API request (#27184) +* [bbc] Fix BBC Three clip extraction +* [bbc] Fix BBC News videos extraction ++ [medaltv] Add support for medal.tv (#27149) +* [youtube] Improve music metadata and license extraction (#26013) +* [nrk] Fix extraction +* [cda] Fix extraction (#17803, #24458, #24518, #26381) + + +version 2020.11.24 + +Core ++ [extractor/common] Add generic support for akamai HTTP format extraction + +Extractors +* [youtube:tab] Fix feeds extraction (#25695, #26452) +* [youtube:favorites] Restore extractor +* [youtube:tab] Fix some weird typo (#27157) ++ [pinterest] Add support for large collections (more than 25 pins) ++ [franceinter] Extract thumbnail (#27153) ++ [box] Add support for box.com (#5949) ++ [nytimes] Add support for cooking.nytimes.com (#27112, #27143) +* [lbry] Relax URL regular expression (#27144) ++ [rumble] Add support for embed pages (#10785) ++ [skyit] Add support for multiple Sky Italia websites (#26629) ++ [pinterest] Add support for pinterest.com (#25747) + + version 2020.11.21.1 Core diff --git a/README.md b/README.md index cd8856828..94c34d89a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -[![Build Status](https://travis-ci.org/ytdl-org/youtube-dl.svg?branch=master)](https://travis-ci.org/ytdl-org/youtube-dl) +[![Build Status](https://github.com/ytdl-org/youtube-dl/workflows/CI/badge.svg)](https://github.com/ytdl-org/youtube-dl/actions?query=workflow%3ACI) + youtube-dl - download videos from youtube.com or other video platforms @@ -51,394 +52,431 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo youtube-dl [OPTIONS] URL [URL...] # OPTIONS - -h, --help Print this help text and exit - --version Print program version and exit - -U, --update Update this program to latest version. Make - sure that you have sufficient permissions - (run with sudo if needed) - -i, --ignore-errors Continue on download errors, for example to - skip unavailable videos in a playlist - --abort-on-error Abort downloading of further videos (in the - playlist or the command line) if an error - occurs - --dump-user-agent Display the current browser identification - --list-extractors List all supported extractors - --extractor-descriptions Output descriptions of all supported - extractors - --force-generic-extractor Force extraction to use the generic - extractor - --default-search PREFIX Use this prefix for unqualified URLs. For - example "gvsearch2:" downloads two videos - from google videos for youtube-dl "large - apple". Use the value "auto" to let - youtube-dl guess ("auto_warning" to emit a - warning when guessing). "error" just throws - an error. The default value "fixup_error" - repairs broken URLs, but emits an error if - this is not possible instead of searching. - --ignore-config Do not read configuration files. When given - in the global configuration file - /etc/youtube-dl.conf: Do not read the user - configuration in ~/.config/youtube- - dl/config (%APPDATA%/youtube-dl/config.txt - on Windows) - --config-location PATH Location of the configuration file; either - the path to the config or its containing - directory. - --flat-playlist Do not extract the videos of a playlist, - only list them. - --mark-watched Mark videos watched (YouTube only) - --no-mark-watched Do not mark videos watched (YouTube only) - --no-color Do not emit color codes in output + -h, --help Print this help text and exit + --version Print program version and exit + -U, --update Update this program to latest version. + Make sure that you have sufficient + permissions (run with sudo if needed) + -i, --ignore-errors Continue on download errors, for + example to skip unavailable videos in a + playlist + --abort-on-error Abort downloading of further videos (in + the playlist or the command line) if an + error occurs + --dump-user-agent Display the current browser + identification + --list-extractors List all supported extractors + --extractor-descriptions Output descriptions of all supported + extractors + --force-generic-extractor Force extraction to use the generic + extractor + --default-search PREFIX Use this prefix for unqualified URLs. + For example "gvsearch2:" downloads two + videos from google videos for youtube- + dl "large apple". Use the value "auto" + to let youtube-dl guess ("auto_warning" + to emit a warning when guessing). + "error" just throws an error. The + default value "fixup_error" repairs + broken URLs, but emits an error if this + is not possible instead of searching. + --ignore-config Do not read configuration files. When + given in the global configuration file + /etc/youtube-dl.conf: Do not read the + user configuration in + ~/.config/youtube-dl/config + (%APPDATA%/youtube-dl/config.txt on + Windows) + --config-location PATH Location of the configuration file; + either the path to the config or its + containing directory. + --flat-playlist Do not extract the videos of a + playlist, only list them. + --mark-watched Mark videos watched (YouTube only) + --no-mark-watched Do not mark videos watched (YouTube + only) + --no-color Do not emit color codes in output ## Network Options: - --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. - To enable SOCKS proxy, specify a proper - scheme. For example - socks5://127.0.0.1:1080/. Pass in an empty - string (--proxy "") for direct connection - --socket-timeout SECONDS Time to wait before giving up, in seconds - --source-address IP Client-side IP address to bind to - -4, --force-ipv4 Make all connections via IPv4 - -6, --force-ipv6 Make all connections via IPv6 + --proxy URL Use the specified HTTP/HTTPS/SOCKS + proxy. To enable SOCKS proxy, specify a + proper scheme. For example + socks5://127.0.0.1:1080/. Pass in an + empty string (--proxy "") for direct + connection + --socket-timeout SECONDS Time to wait before giving up, in + seconds + --source-address IP Client-side IP address to bind to + -4, --force-ipv4 Make all connections via IPv4 + -6, --force-ipv6 Make all connections via IPv6 ## Geo Restriction: - --geo-verification-proxy URL Use this proxy to verify the IP address for - some geo-restricted sites. The default - proxy specified by --proxy (or none, if the - option is not present) is used for the - actual downloading. - --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header - --no-geo-bypass Do not bypass geographic restriction via - faking X-Forwarded-For HTTP header - --geo-bypass-country CODE Force bypass geographic restriction with - explicitly provided two-letter ISO 3166-2 - country code - --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with - explicitly provided IP block in CIDR - notation + --geo-verification-proxy URL Use this proxy to verify the IP address + for some geo-restricted sites. The + default proxy specified by --proxy (or + none, if the option is not present) is + used for the actual downloading. + --geo-bypass Bypass geographic restriction via + faking X-Forwarded-For HTTP header + --no-geo-bypass Do not bypass geographic restriction + via faking X-Forwarded-For HTTP header + --geo-bypass-country CODE Force bypass geographic restriction + with explicitly provided two-letter ISO + 3166-2 country code + --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction + with explicitly provided IP block in + CIDR notation ## Video Selection: - --playlist-start NUMBER Playlist video to start at (default is 1) - --playlist-end NUMBER Playlist video to end at (default is last) - --playlist-items ITEM_SPEC Playlist video items to download. Specify - indices of the videos in the playlist - separated by commas like: "--playlist-items - 1,2,5,8" if you want to download videos - indexed 1, 2, 5, 8 in the playlist. You can - specify range: "--playlist-items - 1-3,7,10-13", it will download the videos - at index 1, 2, 3, 7, 10, 11, 12 and 13. - --match-title REGEX Download only matching titles (regex or - caseless sub-string) - --reject-title REGEX Skip download for matching titles (regex or - caseless sub-string) - --max-downloads NUMBER Abort after downloading NUMBER files - --min-filesize SIZE Do not download any videos smaller than - SIZE (e.g. 50k or 44.6m) - --max-filesize SIZE Do not download any videos larger than SIZE - (e.g. 50k or 44.6m) - --date DATE Download only videos uploaded in this date - --datebefore DATE Download only videos uploaded on or before - this date (i.e. inclusive) - --dateafter DATE Download only videos uploaded on or after - this date (i.e. inclusive) - --min-views COUNT Do not download any videos with less than - COUNT views - --max-views COUNT Do not download any videos with more than - COUNT views - --match-filter FILTER Generic video filter. Specify any key (see - the "OUTPUT TEMPLATE" for a list of - available keys) to match if the key is - present, !key to check if the key is not - present, key > NUMBER (like "comment_count - > 12", also works with >=, <, <=, !=, =) to - compare against a number, key = 'LITERAL' - (like "uploader = 'Mike Smith'", also works - with !=) to match against a string literal - and & to require multiple matches. Values - which are not known are excluded unless you - put a question mark (?) after the operator. - For example, to only match videos that have - been liked more than 100 times and disliked - less than 50 times (or the dislike - functionality is not available at the given - service), but who also have a description, - use --match-filter "like_count > 100 & - dislike_count NUMBER (like + "comment_count > 12", also works with + >=, <, <=, !=, =) to compare against a + number, key = 'LITERAL' (like "uploader + = 'Mike Smith'", also works with !=) to + match against a string literal and & to + require multiple matches. Values which + are not known are excluded unless you + put a question mark (?) after the + operator. For example, to only match + videos that have been liked more than + 100 times and disliked less than 50 + times (or the dislike functionality is + not available at the given service), + but who also have a description, use + --match-filter "like_count > 100 & + dislike_count .+?) - (?P.+)" - --xattrs Write metadata to the video file's xattrs - (using dublin core and xdg standards) - --fixup POLICY Automatically correct known faults of the - file. One of never (do nothing), warn (only - emit a warning), detect_or_warn (the - default; fix file if we can, warn - otherwise) - --prefer-avconv Prefer avconv over ffmpeg for running the - postprocessors - --prefer-ffmpeg Prefer ffmpeg over avconv for running the - postprocessors (default) - --ffmpeg-location PATH Location of the ffmpeg/avconv binary; - either the path to the binary or its - containing directory. - --exec CMD Execute a command on the file after - downloading and post-processing, similar to - find's -exec syntax. Example: --exec 'adb - push {} /sdcard/Music/ && rm {}' - --convert-subs FORMAT Convert the subtitles to other format - (currently supported: srt|ass|vtt|lrc) + -x, --extract-audio Convert video files to audio-only files + (requires ffmpeg/avconv and + ffprobe/avprobe) + --audio-format FORMAT Specify audio format: "best", "aac", + "flac", "mp3", "m4a", "opus", "vorbis", + or "wav"; "best" by default; No effect + without -x + --audio-quality QUALITY Specify ffmpeg/avconv audio quality, + insert a value between 0 (better) and 9 + (worse) for VBR or a specific bitrate + like 128K (default 5) + --recode-video FORMAT Encode the video to another format if + necessary (currently supported: + mp4|flv|ogg|webm|mkv|avi) + --postprocessor-args ARGS Give these arguments to the + postprocessor + -k, --keep-video Keep the video file on disk after the + post-processing; the video is erased by + default + --no-post-overwrites Do not overwrite post-processed files; + the post-processed files are + overwritten by default + --embed-subs Embed subtitles in the video (only for + mp4, webm and mkv videos) + --embed-thumbnail Embed thumbnail in the audio as cover + art + --add-metadata Write metadata to the video file + --metadata-from-title FORMAT Parse additional metadata like song + title / artist from the video title. + The format syntax is the same as + --output. Regular expression with named + capture groups may also be used. The + parsed parameters replace existing + values. Example: --metadata-from-title + "%(artist)s - %(title)s" matches a + title like "Coldplay - Paradise". + Example (regex): --metadata-from-title + "(?P<artist>.+?) - (?P<title>.+)" + --xattrs Write metadata to the video file's + xattrs (using dublin core and xdg + standards) + --fixup POLICY Automatically correct known faults of + the file. One of never (do nothing), + warn (only emit a warning), + detect_or_warn (the default; fix file + if we can, warn otherwise) + --prefer-avconv Prefer avconv over ffmpeg for running + the postprocessors + --prefer-ffmpeg Prefer ffmpeg over avconv for running + the postprocessors (default) + --ffmpeg-location PATH Location of the ffmpeg/avconv binary; + either the path to the binary or its + containing directory. + --exec CMD Execute a command on the file after + downloading and post-processing, + similar to find's -exec syntax. + Example: --exec 'adb push {} + /sdcard/Music/ && rm {}' + --convert-subs FORMAT Convert the subtitles to other format + (currently supported: srt|ass|vtt|lrc) # CONFIGURATION @@ -582,7 +620,7 @@ Available for the media that is a track or a part of a music album: - `disc_number` (numeric): Number of the disc or other physical medium the track belongs to - `release_year` (numeric): Year (YYYY) when the album was released -Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with `NA`. +Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default). For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `youtube-dl test video` and id `BaW_jenozKcj`, this will result in a `youtube-dl test video-BaW_jenozKcj.mp4` file created in the current directory. @@ -677,6 +715,7 @@ Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format + - `language`: Language code Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). @@ -879,7 +918,7 @@ Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the op Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. -In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox). +In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [Get cookies.txt](https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid/) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox). Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. diff --git a/devscripts/install_jython.sh b/devscripts/install_jython.sh deleted file mode 100755 index bafca4da4..000000000 --- a/devscripts/install_jython.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -wget http://central.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -java -jar jython-installer-2.7.1.jar -s -d "$HOME/jython" -$HOME/jython/bin/jython -m pip install nose diff --git a/devscripts/run_tests.bat b/devscripts/run_tests.bat new file mode 100644 index 000000000..79359b5a7 --- /dev/null +++ b/devscripts/run_tests.bat @@ -0,0 +1,17 @@ +@echo off + +rem Keep this list in sync with the `offlinetest` target in Makefile +set DOWNLOAD_TESTS="age_restriction^|download^|iqiyi_sdk_interpreter^|socks^|subtitles^|write_annotations^|youtube_lists^|youtube_signature" + +if "%YTDL_TEST_SET%" == "core" ( + set test_set="-I test_("%DOWNLOAD_TESTS%")\.py" + set multiprocess_args="" +) else if "%YTDL_TEST_SET%" == "download" ( + set test_set="-I test_(?!"%DOWNLOAD_TESTS%").+\.py" + set multiprocess_args="--processes=4 --process-timeout=540" +) else ( + echo YTDL_TEST_SET is not set or invalid + exit /b 1 +) + +nosetests test --verbose %test_set:"=% %multiprocess_args:"=% diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 53dbc28e5..1373cc4f6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1,6 +1,5 @@ # Supported sites - **1tv**: Первый канал - - **1up.com** - **20min** - **220.ro** - **23video** @@ -35,6 +34,8 @@ - **adobetv:video** - **AdultSwim** - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault + - **aenetworks:collection** + - **aenetworks:show** - **afreecatv**: afreecatv.com - **AirMozilla** - **AliExpressLive** @@ -44,21 +45,25 @@ - **Amara** - **AMCNetworks** - **AmericasTestKitchen** + - **AmericasTestKitchenSeason** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **AnimeOnDemand** - **Anvato** - - **aol.com** + - **aol.com**: Yahoo screen and movies - **APA** - **Aparat** - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 + - **ApplePodcasts** - **appletrailers** - **appletrailers:section** - **archive.org**: archive.org videos + - **ArcPublishing** - **ARD** - **ARD:mediathek** - **ARDBetaMediathek** - **Arkena** + - **arte.sky.it** - **ArteTV** - **ArteTVEmbed** - **ArteTVPlaylist** @@ -94,6 +99,10 @@ - **BellMedia** - **Bet** - **bfi:player** + - **bfmtv** + - **bfmtv:article** + - **bfmtv:live** + - **BibelTV** - **Bigflix** - **Bild**: Bild.de - **BiliBili** @@ -101,6 +110,7 @@ - **BilibiliAudioAlbum** - **BiliBiliPlayer** - **BioBioChileTV** + - **Biography** - **BIQLE** - **BitChute** - **BitChuteChannel** @@ -109,7 +119,9 @@ - **blinkx** - **Bloomberg** - **BokeCC** + - **BongaCams** - **BostonGlobe** + - **Box** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk - **BravoTV** @@ -142,6 +154,7 @@ - **CBS** - **CBSInteractive** - **CBSLocal** + - **CBSLocalArticle** - **cbsnews**: CBS News - **cbsnews:embed** - **cbsnews:livevideo**: CBS News Live Videos @@ -157,6 +170,7 @@ - **Chilloutzone** - **chirbit** - **chirbit:profile** + - **cielotv.it** - **Cinchcast** - **Cinemax** - **CiscoLiveSearch** @@ -178,8 +192,6 @@ - **CNNArticle** - **CNNBlogs** - **ComedyCentral** - - **ComedyCentralFullEpisodes** - - **ComedyCentralShortname** - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **CONtv** @@ -190,9 +202,9 @@ - **CrooksAndLiars** - **crunchyroll** - **crunchyroll:playlist** - - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 + - **CTV** - **CTVNews** - **cu.ntv.co.jp**: Nippon Television Network - **Culturebox** @@ -263,7 +275,6 @@ - **ESPNArticle** - **EsriVideo** - **Europa** - - **EveryonesMixtape** - **EWETV** - **ExpoTV** - **Expressen** @@ -305,11 +316,11 @@ - **FrontendMasters** - **FrontendMastersCourse** - **FrontendMastersLesson** + - **FujiTVFODPlus7** - **Funimation** - **Funk** - **Fusion** - **Fux** - - **FXNetworks** - **Gaia** - **GameInformer** - **GameSpot** @@ -328,6 +339,8 @@ - **Go** - **GodTube** - **Golem** + - **google:podcasts** + - **google:podcasts:feed** - **GoogleDrive** - **Goshgay** - **GPUTechConf** @@ -342,6 +355,7 @@ - **hgtv.com:show** - **HiDive** - **HistoricFilms** + - **history:player** - **history:topic**: History.com Topic - **hitbox** - **hitbox:live** @@ -361,6 +375,10 @@ - **HungamaSong** - **Hypem** - **ign.com** + - **IGNArticle** + - **IGNVideo** + - **IHeartRadio** + - **iheartradio:podcast** - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists - **Imgur** @@ -394,14 +412,14 @@ - **JWPlatform** - **Kakao** - **Kaltura** - - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** - **KarriereVideos** - **Katsomo** - **KeezMovies** - **Ketnet** - - **KhanAcademy** + - **khanacademy** + - **khanacademy:unit** - **KickStarter** - **KinjaEmbed** - **KinoPoisk** @@ -418,7 +436,8 @@ - **la7.it** - **laola1tv** - **laola1tv:embed** - - **lbry.tv** + - **lbry** + - **lbry:channel** - **LCI** - **Lcp** - **LcpPlay** @@ -468,6 +487,7 @@ - **massengeschmack.tv** - **MatchTV** - **MDR**: MDR.DE and KiKA + - **MedalTV** - **media.ccc.de** - **media.ccc.de:lists** - **Medialaan** @@ -482,9 +502,13 @@ - **META** - **metacafe** - **Metacritic** + - **mewatch** - **Mgoon** - **MGTV**: 芒果TV - **MiaoPai** + - **minds** + - **minds:channel** + - **minds:group** - **MinistryGrid** - **Minoto** - **miomio.tv** @@ -492,8 +516,6 @@ - **mixcloud** - **mixcloud:playlist** - **mixcloud:user** - - **Mixer:live** - - **Mixer:vod** - **MLB** - **Mnet** - **MNetTV** @@ -516,6 +538,7 @@ - **mtv:video** - **mtvjapan** - **mtvservices:embedded** + - **MTVUutisetArticle** - **MuenchenTV**: münchen.tv - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses @@ -534,6 +557,11 @@ - **NationalGeographicTV** - **Naver** - **NBA** + - **nba:watch** + - **nba:watch:collection** + - **NBAChannel** + - **NBAEmbed** + - **NBAWatchEmbed** - **NBC** - **NBCNews** - **nbcolympics** @@ -563,8 +591,10 @@ - **NextTV**: 壹電視 - **Nexx** - **NexxEmbed** - - **nfl.com** + - **nfl.com** (Currently broken) + - **nfl.com:article** (Currently broken) - **NhkVod** + - **NhkVodProgram** - **nhl.com** - **nick.com** - **nick.de** @@ -578,7 +608,6 @@ - **njoy:embed** - **NJPWWorld**: 新日本プロレスワールド - **NobelPrize** - - **Noco** - **NonkTube** - **Noovo** - **Normalboots** @@ -596,6 +625,7 @@ - **Npr** - **NRK** - **NRKPlaylist** + - **NRKRadioPodkast** - **NRKSkole**: NRK Skole - **NRKTV**: NRK TV and NRK Radio - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte @@ -608,6 +638,7 @@ - **Nuvid** - **NYTimes** - **NYTimesArticle** + - **NYTimesCooking** - **NZZ** - **ocw.mit.edu** - **OdaTV** @@ -646,7 +677,6 @@ - **parliamentlive.tv**: UK parliament videos - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - - **pcmag** - **PearVideo** - **PeerTube** - **People** @@ -660,10 +690,13 @@ - **PicartoVod** - **Piksel** - **Pinkbike** + - **Pinterest** + - **PinterestCollection** - **Pladform** - **Platzi** - **PlatziCourse** - **play.fm** + - **player.sky.it** - **PlayPlusTV** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz @@ -671,7 +704,6 @@ - **Playwire** - **pluralsight** - **pluralsight:course** - - **plus.google**: Google Plus - **podomatic** - **Pokemon** - **PolskieRadio** @@ -701,6 +733,7 @@ - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 - **QuantumTV** + - **Qub** - **Quickline** - **QuicklineLive** - **R7** @@ -755,6 +788,7 @@ - **RTVNH** - **RTVS** - **RUHD** + - **RumbleEmbed** - **rutube**: Rutube videos - **rutube:channel**: Rutube channels - **rutube:embed**: Rutube embedded videos @@ -792,18 +826,17 @@ - **Shared**: shared.sx - **ShowRoomLive** - **Sina** + - **sky.it** + - **sky:news** + - **sky:sports** + - **sky:sports:news** + - **skyacademy.it** - **SkylineWebcams** - - **SkyNews** - **skynewsarabia:article** - **skynewsarabia:video** - - **SkySports** - **Slideshare** - **SlidesLive** - **Slutload** - - **smotri**: Smotri.com - - **smotri:broadcast**: Smotri.com broadcasts - - **smotri:community**: Smotri.com community videos - - **smotri:user**: Smotri.com user videos - **Snotr** - **Sohu** - **SonyLIV** @@ -829,6 +862,12 @@ - **Sport5** - **SportBox** - **SportDeutschland** + - **spotify** + - **spotify:show** + - **Spreaker** + - **SpreakerPage** + - **SpreakerShow** + - **SpreakerShowPage** - **SpringboardPlatform** - **Sprout** - **sr:mediathek**: Saarländischer Rundfunk @@ -837,6 +876,7 @@ - **stanfordoc**: Stanford Open ClassRoom - **Steam** - **Stitcher** + - **StitcherShow** - **Streamable** - **streamcloud.eu** - **StreamCZ** @@ -857,7 +897,6 @@ - **Tagesschau** - **tagesschau:player** - **Tass** - - **TastyTrade** - **TBS** - **TDSLifeway** - **Teachable** @@ -880,6 +919,7 @@ - **TeleQuebecEmission** - **TeleQuebecLive** - **TeleQuebecSquat** + - **TeleQuebecVideo** - **TeleTask** - **Telewebion** - **TennisTV** @@ -897,7 +937,7 @@ - **ThisAV** - **ThisOldHouse** - **TikTok** - - **TikTokUser** + - **TikTokUser** (Currently broken) - **tinypic**: tinypic.com videos - **TMZ** - **TMZArticle** @@ -905,12 +945,13 @@ - **TNAFlixNetworkEmbed** - **toggle** - **ToonGoggles** - - **Tosh**: Tosh.0 - **tou.tv** - **Toypics**: Toypics video - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** + - **Trovo** + - **TrovoVod** - **TruNews** - **TruTV** - **Tube8** @@ -930,11 +971,15 @@ - **TV2DKBornholmPlay** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ + - **tv5unis** + - **tv5unis:video** + - **tv8.it** - **TVA** - **TVANouvelles** - **TVANouvellesArticle** - **TVC** - **TVCArticle** + - **TVer** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **TVN24** @@ -1001,6 +1046,8 @@ - **Viddler** - **Videa** - **video.google:search**: Google Video search + - **video.sky.it** + - **video.sky.it:live** - **VideoDetective** - **videofy.me** - **videomore** @@ -1012,7 +1059,6 @@ - **vidme** - **vidme:user** - **vidme:user:likes** - - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - **viewlift** @@ -1042,6 +1088,7 @@ - **vk:wallpost** - **vlive** - **vlive:channel** + - **vlive:post** - **Vodlocker** - **VODPl** - **VODPlatform** @@ -1056,10 +1103,12 @@ - **vrv** - **vrv:series** - **VShare** + - **VTM** - **VTXTV** - **vube**: Vube.com - **VuClip** - **VVVVID** + - **VVVVIDShow** - **VyboryMos** - **Vzaar** - **Wakanim** @@ -1082,6 +1131,7 @@ - **WeiboMobile** - **WeiqiTV**: WQTV - **Wistia** + - **WistiaPlaylist** - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **WorldStarHipHop** - **WSJ**: Wall Street Journal @@ -1089,7 +1139,7 @@ - **WWE** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing + - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing - **XHamster** - **XHamsterEmbed** - **XHamsterUser** @@ -1113,6 +1163,8 @@ - **yahoo:japannews**: Yahoo! Japan News - **YandexDisk** - **yandexmusic:album**: Яндекс.Музыка - Альбом + - **yandexmusic:artist:albums**: Яндекс.Музыка - Артист - Альбомы + - **yandexmusic:artist:tracks**: Яндекс.Музыка - Артист - Треки - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек - **YandexVideo** @@ -1130,6 +1182,7 @@ - **YourPorn** - **YourUpload** - **youtube**: YouTube.com + - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - **youtube:playlist**: YouTube.com playlists - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) @@ -1138,9 +1191,9 @@ - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:tab**: YouTube.com tab - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **YoutubeYtBe** - **YoutubeYtUser** - **Zapiks** - - **Zaq1** - **Zattoo** - **ZattooLive** - **ZDF** diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 71f6608fe..dd69a681b 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -98,6 +98,55 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) + def test_search_json_ld_realworld(self): + # https://github.com/ytdl-org/youtube-dl/issues/23306 + expect_dict( + self, + self.ie._search_json_ld(r'''<script type="application/ld+json"> +{ +"@context": "http://schema.org/", +"@type": "VideoObject", +"name": "1 On 1 With Kleio", +"url": "https://www.eporner.com/hd-porn/xN49A1cT3eB/1-On-1-With-Kleio/", +"duration": "PT0H12M23S", +"thumbnailUrl": ["https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", "https://imggen.eporner.com/780814/1920/1080/9.jpg"], +"contentUrl": "https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4", +"embedUrl": "https://www.eporner.com/embed/xN49A1cT3eB/1-On-1-With-Kleio/", +"image": "https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", +"width": "1920", +"height": "1080", +"encodingFormat": "mp4", +"bitrate": "6617kbps", +"isFamilyFriendly": "False", +"description": "Kleio Valentien", +"uploadDate": "2015-12-05T21:24:35+01:00", +"interactionStatistic": { +"@type": "InteractionCounter", +"interactionType": { "@type": "http://schema.org/WatchAction" }, +"userInteractionCount": 1120958 +}, "aggregateRating": { +"@type": "AggregateRating", +"ratingValue": "88", +"ratingCount": "630", +"bestRating": "100", +"worstRating": "0" +}, "actor": [{ +"@type": "Person", +"name": "Kleio Valentien", +"url": "https://www.eporner.com/pornstar/kleio-valentien/" +}]} +</script>''', None), + { + 'title': '1 On 1 With Kleio', + 'description': 'Kleio Valentien', + 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', + 'timestamp': 1449347075, + 'duration': 743.0, + 'view_count': 1120958, + 'width': 1920, + 'height': 1080, + }) + def test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'}) @@ -108,6 +157,18 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) def test_parse_html5_media_entries(self): + # inline video tag + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://127.0.0.1/video.html', + r'<html><video src="/vid.mp4" /></html>', None)[0], + { + 'formats': [{ + 'url': 'https://127.0.0.1/vid.mp4', + }], + }) + # from https://www.r18.com/ # with kpbs in label expect_dict( diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 62f916d11..a35effe0e 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -464,6 +464,7 @@ class TestFormatSelection(unittest.TestCase): assert_syntax_error('+bestaudio') assert_syntax_error('bestvideo+') assert_syntax_error('/') + assert_syntax_error('bestvideo+bestvideo+bestaudio') def test_format_filtering(self): formats = [ @@ -632,13 +633,20 @@ class TestYoutubeDL(unittest.TestCase): 'title2': '%PATH%', } - def fname(templ): - ydl = YoutubeDL({'outtmpl': templ}) + def fname(templ, na_placeholder='NA'): + params = {'outtmpl': templ} + if na_placeholder != 'NA': + params['outtmpl_na_placeholder'] = na_placeholder + ydl = YoutubeDL(params) return ydl.prepare_filename(info) self.assertEqual(fname('%(id)s.%(ext)s'), '1234.mp4') self.assertEqual(fname('%(id)s-%(width)s.%(ext)s'), '1234-NA.mp4') - # Replace missing fields with 'NA' - self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4') + NA_TEST_OUTTMPL = '%(uploader_date)s-%(width)d-%(id)s.%(ext)s' + # Replace missing fields with 'NA' by default + self.assertEqual(fname(NA_TEST_OUTTMPL), 'NA-NA-1234.mp4') + # Or by provided placeholder + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder='none'), 'none-none-1234.mp4') + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder=''), '--1234.mp4') self.assertEqual(fname('%(height)d.%(ext)s'), '1080.mp4') self.assertEqual(fname('%(height)6d.%(ext)s'), ' 1080.mp4') self.assertEqual(fname('%(height)-6d.%(ext)s'), '1080 .mp4') diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 56a08bed8..df6d81b5d 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -36,7 +36,7 @@ class TestAllURLsMatching(unittest.TestCase): assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585 assertPlaylist('PL63F0C78739B09958') assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') - assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') + assertTab('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M')) @@ -57,13 +57,14 @@ class TestAllURLsMatching(unittest.TestCase): assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec') assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') - # def test_youtube_user_matching(self): - # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) + def test_youtube_user_matching(self): + self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) def test_youtube_feeds(self): - self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) - self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) - self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) + self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) + self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) # def test_youtube_search_matching(self): # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 17aaaf20d..550e0ca00 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -258,16 +258,24 @@ class TestNRKSubtitles(BaseTestSubtitles): class TestRaiPlaySubtitles(BaseTestSubtitles): - url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html' IE = RaiPlayIE - def test_allsubtitles(self): + def test_subtitles_key(self): + self.url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['it'])) self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') + def test_subtitles_array_key(self): + self.url = 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['it'])) + self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd') + class TestVikiSubtitles(BaseTestSubtitles): url = 'http://www.viki.com/videos/1060846v-punch-episode-18' diff --git a/test/test_utils.py b/test/test_utils.py index 925a21d34..259c4763e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,6 +21,7 @@ from youtube_dl.utils import ( encode_base_n, caesar, clean_html, + clean_podcast_url, date_from_str, DateRange, detect_exe_version, @@ -554,6 +555,11 @@ class TestUtil(unittest.TestCase): self.assertEqual(url_or_none('http$://foo.de'), None) self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') self.assertEqual(url_or_none('//foo.de'), '//foo.de') + self.assertEqual(url_or_none('s3://foo.de'), None) + self.assertEqual(url_or_none('rtmpte://foo.de'), 'rtmpte://foo.de') + self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de') + self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de') + self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) @@ -1465,6 +1471,10 @@ Line 1 self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + def test_clean_podcast_url(self): + self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') + self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py deleted file mode 100644 index e69c57377..000000000 --- a/test/test_youtube_chapters.py +++ /dev/null @@ -1,275 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -from __future__ import unicode_literals - -# Allow direct execution -import os -import sys -import unittest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from test.helper import expect_value -from youtube_dl.extractor import YoutubeIE - - -class TestYoutubeChapters(unittest.TestCase): - - _TEST_CASES = [ - ( - # https://www.youtube.com/watch?v=A22oy8dFjqc - # pattern: 00:00 - <title> - '''This is the absolute ULTIMATE experience of Queen's set at LIVE AID, this is the best video mixed to the absolutely superior stereo radio broadcast. This vastly superior audio mix takes a huge dump on all of the official mixes. Best viewed in 1080p. ENJOY! ***MAKE SURE TO READ THE DESCRIPTION***<br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+36);return false;">00:36</a> - Bohemian Rhapsody<br /><a href="#" onclick="yt.www.watch.player.seekTo(02*60+42);return false;">02:42</a> - Radio Ga Ga<br /><a href="#" onclick="yt.www.watch.player.seekTo(06*60+53);return false;">06:53</a> - Ay Oh!<br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+34);return false;">07:34</a> - Hammer To Fall<br /><a href="#" onclick="yt.www.watch.player.seekTo(12*60+08);return false;">12:08</a> - Crazy Little Thing Called Love<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+03);return false;">16:03</a> - We Will Rock You<br /><a href="#" onclick="yt.www.watch.player.seekTo(17*60+18);return false;">17:18</a> - We Are The Champions<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+12);return false;">21:12</a> - Is This The World We Created...?<br /><br />Short song analysis:<br /><br />- "Bohemian Rhapsody": Although it's a short medley version, it's one of the best performances of the ballad section, with Freddie nailing the Bb4s with the correct studio phrasing (for the first time ever!).<br /><br />- "Radio Ga Ga": Although it's missing one chorus, this is one of - if not the best - the best versions ever, Freddie nails all the Bb4s and sounds very clean! Spike Edney's Roland Jupiter 8 also really shines through on this mix, compared to the DVD releases!<br /><br />- "Audience Improv": A great improv, Freddie sounds strong and confident. You gotta love when he sustains that A4 for 4 seconds!<br /><br />- "Hammer To Fall": Despite missing a verse and a chorus, it's a strong version (possibly the best ever). Freddie sings the song amazingly, and even ad-libs a C#5 and a C5! Also notice how heavy Brian's guitar sounds compared to the thin DVD mixes - it roars!<br /><br />- "Crazy Little Thing Called Love": A great version, the crowd loves the song, the jam is great as well! Only downside to this is the slight feedback issues.<br /><br />- "We Will Rock You": Although cut down to the 1st verse and chorus, Freddie sounds strong. He nails the A4, and the solo from Dr. May is brilliant!<br /><br />- "We Are the Champions": Perhaps the high-light of the performance - Freddie is very daring on this version, he sustains the pre-chorus Bb4s, nails the 1st C5, belts great A4s, but most importantly: He nails the chorus Bb4s, in all 3 choruses! This is the only time he has ever done so! It has to be said though, the last one sounds a bit rough, but that's a side effect of belting high notes for the past 18 minutes, with nodules AND laryngitis!<br /><br />- "Is This The World We Created... ?": Freddie and Brian perform a beautiful version of this, and it is one of the best versions ever. It's both sad and hilarious that a couple of BBC engineers are talking over the song, one of them being completely oblivious of the fact that he is interrupting the performance, on live television... Which was being televised to almost 2 billion homes.<br /><br /><br />All rights go to their respective owners!<br />-----Copyright Disclaimer Under Section 107 of the Copyright Act 1976, allowance is made for fair use for purposes such as criticism, comment, news reporting, teaching, scholarship, and research. Fair use is a use permitted by copyright statute that might otherwise be infringing. Non-profit, educational or personal use tips the balance in favor of fair use''', - 1477, - [{ - 'start_time': 36, - 'end_time': 162, - 'title': 'Bohemian Rhapsody', - }, { - 'start_time': 162, - 'end_time': 413, - 'title': 'Radio Ga Ga', - }, { - 'start_time': 413, - 'end_time': 454, - 'title': 'Ay Oh!', - }, { - 'start_time': 454, - 'end_time': 728, - 'title': 'Hammer To Fall', - }, { - 'start_time': 728, - 'end_time': 963, - 'title': 'Crazy Little Thing Called Love', - }, { - 'start_time': 963, - 'end_time': 1038, - 'title': 'We Will Rock You', - }, { - 'start_time': 1038, - 'end_time': 1272, - 'title': 'We Are The Champions', - }, { - 'start_time': 1272, - 'end_time': 1477, - 'title': 'Is This The World We Created...?', - }] - ), - ( - # https://www.youtube.com/watch?v=ekYlRhALiRQ - # pattern: <num>. <title> 0:00 - '1. Those Beaten Paths of Confusion <a href="#" onclick="yt.www.watch.player.seekTo(0*60+00);return false;">0:00</a><br />2. Beyond the Shadows of Emptiness & Nothingness <a href="#" onclick="yt.www.watch.player.seekTo(11*60+47);return false;">11:47</a><br />3. Poison Yourself...With Thought <a href="#" onclick="yt.www.watch.player.seekTo(26*60+30);return false;">26:30</a><br />4. The Agents of Transformation <a href="#" onclick="yt.www.watch.player.seekTo(35*60+57);return false;">35:57</a><br />5. Drowning in the Pain of Consciousness <a href="#" onclick="yt.www.watch.player.seekTo(44*60+32);return false;">44:32</a><br />6. Deny the Disease of Life <a href="#" onclick="yt.www.watch.player.seekTo(53*60+07);return false;">53:07</a><br /><br />More info/Buy: http://crepusculonegro.storenvy.com/products/257645-cn-03-arizmenda-within-the-vacuum-of-infinity<br /><br />No copyright is intended. The rights to this video are assumed by the owner and its affiliates.', - 4009, - [{ - 'start_time': 0, - 'end_time': 707, - 'title': '1. Those Beaten Paths of Confusion', - }, { - 'start_time': 707, - 'end_time': 1590, - 'title': '2. Beyond the Shadows of Emptiness & Nothingness', - }, { - 'start_time': 1590, - 'end_time': 2157, - 'title': '3. Poison Yourself...With Thought', - }, { - 'start_time': 2157, - 'end_time': 2672, - 'title': '4. The Agents of Transformation', - }, { - 'start_time': 2672, - 'end_time': 3187, - 'title': '5. Drowning in the Pain of Consciousness', - }, { - 'start_time': 3187, - 'end_time': 4009, - 'title': '6. Deny the Disease of Life', - }] - ), - ( - # https://www.youtube.com/watch?v=WjL4pSzog9w - # pattern: 00:00 <title> - '<a href="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" class="yt-uix-servicelink " data-target-new-window="True" data-servicelink="CDAQ6TgYACITCNf1raqT2dMCFdRjGAod_o0CBSj4HQ" data-url="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" rel="nofollow noopener" target="_blank">https://arizmenda.bandcamp.com/merch/...</a><br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> Christening Unborn Deformities <br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+08);return false;">07:08</a> Taste of Purity<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+16);return false;">16:16</a> Sculpting Sins of a Universal Tongue<br /><a href="#" onclick="yt.www.watch.player.seekTo(24*60+45);return false;">24:45</a> Birth<br /><a href="#" onclick="yt.www.watch.player.seekTo(31*60+24);return false;">31:24</a> Neves<br /><a href="#" onclick="yt.www.watch.player.seekTo(37*60+55);return false;">37:55</a> Libations in Limbo', - 2705, - [{ - 'start_time': 0, - 'end_time': 428, - 'title': 'Christening Unborn Deformities', - }, { - 'start_time': 428, - 'end_time': 976, - 'title': 'Taste of Purity', - }, { - 'start_time': 976, - 'end_time': 1485, - 'title': 'Sculpting Sins of a Universal Tongue', - }, { - 'start_time': 1485, - 'end_time': 1884, - 'title': 'Birth', - }, { - 'start_time': 1884, - 'end_time': 2275, - 'title': 'Neves', - }, { - 'start_time': 2275, - 'end_time': 2705, - 'title': 'Libations in Limbo', - }] - ), - ( - # https://www.youtube.com/watch?v=o3r1sn-t3is - # pattern: <title> 00:00 <note> - 'Download this show in MP3: <a href="http://sh.st/njZKK" class="yt-uix-servicelink " data-url="http://sh.st/njZKK" data-target-new-window="True" data-servicelink="CDAQ6TgYACITCK3j8_6o2dMCFVDCGAoduVAKKij4HQ" rel="nofollow noopener" target="_blank">http://sh.st/njZKK</a><br /><br />Setlist:<br />I-E-A-I-A-I-O <a href="#" onclick="yt.www.watch.player.seekTo(00*60+45);return false;">00:45</a><br />Suite-Pee <a href="#" onclick="yt.www.watch.player.seekTo(4*60+26);return false;">4:26</a> (Incomplete)<br />Attack <a href="#" onclick="yt.www.watch.player.seekTo(5*60+31);return false;">5:31</a> (First live performance since 2011)<br />Prison Song <a href="#" onclick="yt.www.watch.player.seekTo(8*60+42);return false;">8:42</a><br />Know <a href="#" onclick="yt.www.watch.player.seekTo(12*60+32);return false;">12:32</a> (First live performance since 2011)<br />Aerials <a href="#" onclick="yt.www.watch.player.seekTo(15*60+32);return false;">15:32</a><br />Soldier Side - Intro <a href="#" onclick="yt.www.watch.player.seekTo(19*60+13);return false;">19:13</a><br />B.Y.O.B. <a href="#" onclick="yt.www.watch.player.seekTo(20*60+09);return false;">20:09</a><br />Soil <a href="#" onclick="yt.www.watch.player.seekTo(24*60+32);return false;">24:32</a><br />Darts <a href="#" onclick="yt.www.watch.player.seekTo(27*60+48);return false;">27:48</a><br />Radio/Video <a href="#" onclick="yt.www.watch.player.seekTo(30*60+38);return false;">30:38</a><br />Hypnotize <a href="#" onclick="yt.www.watch.player.seekTo(35*60+05);return false;">35:05</a><br />Temper <a href="#" onclick="yt.www.watch.player.seekTo(38*60+08);return false;">38:08</a> (First live performance since 1999)<br />CUBErt <a href="#" onclick="yt.www.watch.player.seekTo(41*60+00);return false;">41:00</a><br />Needles <a href="#" onclick="yt.www.watch.player.seekTo(42*60+57);return false;">42:57</a><br />Deer Dance <a href="#" onclick="yt.www.watch.player.seekTo(46*60+27);return false;">46:27</a><br />Bounce <a href="#" onclick="yt.www.watch.player.seekTo(49*60+38);return false;">49:38</a><br />Suggestions <a href="#" onclick="yt.www.watch.player.seekTo(51*60+25);return false;">51:25</a><br />Psycho <a href="#" onclick="yt.www.watch.player.seekTo(53*60+52);return false;">53:52</a><br />Chop Suey! <a href="#" onclick="yt.www.watch.player.seekTo(58*60+13);return false;">58:13</a><br />Lonely Day <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+01*60+15);return false;">1:01:15</a><br />Question! <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+04*60+14);return false;">1:04:14</a><br />Lost in Hollywood <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+08*60+10);return false;">1:08:10</a><br />Vicinity of Obscenity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+13*60+40);return false;">1:13:40</a>(First live performance since 2012)<br />Forest <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+16*60+17);return false;">1:16:17</a><br />Cigaro <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+20*60+02);return false;">1:20:02</a><br />Toxicity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+23*60+57);return false;">1:23:57</a>(with Chino Moreno)<br />Sugar <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+27*60+53);return false;">1:27:53</a>', - 5640, - [{ - 'start_time': 45, - 'end_time': 266, - 'title': 'I-E-A-I-A-I-O', - }, { - 'start_time': 266, - 'end_time': 331, - 'title': 'Suite-Pee (Incomplete)', - }, { - 'start_time': 331, - 'end_time': 522, - 'title': 'Attack (First live performance since 2011)', - }, { - 'start_time': 522, - 'end_time': 752, - 'title': 'Prison Song', - }, { - 'start_time': 752, - 'end_time': 932, - 'title': 'Know (First live performance since 2011)', - }, { - 'start_time': 932, - 'end_time': 1153, - 'title': 'Aerials', - }, { - 'start_time': 1153, - 'end_time': 1209, - 'title': 'Soldier Side - Intro', - }, { - 'start_time': 1209, - 'end_time': 1472, - 'title': 'B.Y.O.B.', - }, { - 'start_time': 1472, - 'end_time': 1668, - 'title': 'Soil', - }, { - 'start_time': 1668, - 'end_time': 1838, - 'title': 'Darts', - }, { - 'start_time': 1838, - 'end_time': 2105, - 'title': 'Radio/Video', - }, { - 'start_time': 2105, - 'end_time': 2288, - 'title': 'Hypnotize', - }, { - 'start_time': 2288, - 'end_time': 2460, - 'title': 'Temper (First live performance since 1999)', - }, { - 'start_time': 2460, - 'end_time': 2577, - 'title': 'CUBErt', - }, { - 'start_time': 2577, - 'end_time': 2787, - 'title': 'Needles', - }, { - 'start_time': 2787, - 'end_time': 2978, - 'title': 'Deer Dance', - }, { - 'start_time': 2978, - 'end_time': 3085, - 'title': 'Bounce', - }, { - 'start_time': 3085, - 'end_time': 3232, - 'title': 'Suggestions', - }, { - 'start_time': 3232, - 'end_time': 3493, - 'title': 'Psycho', - }, { - 'start_time': 3493, - 'end_time': 3675, - 'title': 'Chop Suey!', - }, { - 'start_time': 3675, - 'end_time': 3854, - 'title': 'Lonely Day', - }, { - 'start_time': 3854, - 'end_time': 4090, - 'title': 'Question!', - }, { - 'start_time': 4090, - 'end_time': 4420, - 'title': 'Lost in Hollywood', - }, { - 'start_time': 4420, - 'end_time': 4577, - 'title': 'Vicinity of Obscenity (First live performance since 2012)', - }, { - 'start_time': 4577, - 'end_time': 4802, - 'title': 'Forest', - }, { - 'start_time': 4802, - 'end_time': 5037, - 'title': 'Cigaro', - }, { - 'start_time': 5037, - 'end_time': 5273, - 'title': 'Toxicity (with Chino Moreno)', - }, { - 'start_time': 5273, - 'end_time': 5640, - 'title': 'Sugar', - }] - ), - ( - # https://www.youtube.com/watch?v=PkYLQbsqCE8 - # pattern: <num> - <title> [<latinized title>] 0:00:00 - '''Затемно (Zatemno) is an Obscure Black Metal Band from Russia.<br /><br />"Во прах (Vo prakh)'' Into The Ashes", Debut mini-album released may 6, 2016, by Death Knell Productions<br />Released on 6 panel digipak CD, limited to 100 copies only<br />And digital format on Bandcamp<br /><br />Tracklist<br /><br />1 - Во прах [Vo prakh] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;">0:00:00</a><br />2 - Искупление [Iskupleniye] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+08*60+10);return false;">0:08:10</a><br />3 - Из серпов луны...[Iz serpov luny] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+14*60+30);return false;">0:14:30</a><br /><br />Links:<br /><a href="https://deathknellprod.bandcamp.com/album/--2" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://deathknellprod.bandcamp.com/album/--2" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://deathknellprod.bandcamp.com/a...</a><br /><a href="https://www.facebook.com/DeathKnellProd/" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://www.facebook.com/DeathKnellProd/" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://www.facebook.com/DeathKnellProd/</a><br /><br /><br />I don't have any right about this artifact, my only intention is to spread the music of the band, all rights are reserved to the Затемно (Zatemno) and his producers, Death Knell Productions.<br /><br />------------------------------------------------------------------<br /><br />Subscribe for more videos like this.<br />My link: <a href="https://web.facebook.com/AttackOfTheDragons" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://web.facebook.com/AttackOfTheDragons" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://web.facebook.com/AttackOfTheD...</a>''', - 1138, - [{ - 'start_time': 0, - 'end_time': 490, - 'title': '1 - Во прах [Vo prakh]', - }, { - 'start_time': 490, - 'end_time': 870, - 'title': '2 - Искупление [Iskupleniye]', - }, { - 'start_time': 870, - 'end_time': 1138, - 'title': '3 - Из серпов луны...[Iz serpov luny]', - }] - ), - ( - # https://www.youtube.com/watch?v=xZW70zEasOk - # time point more than duration - '''● LCS Spring finals: Saturday and Sunday from <a href="#" onclick="yt.www.watch.player.seekTo(13*60+30);return false;">13:30</a> outside the venue! <br />● PAX East: Fri, Sat & Sun - more info in tomorrows video on the main channel!''', - 283, - [] - ), - ] - - def test_youtube_chapters(self): - for description, duration, expected_chapters in self._TEST_CASES: - ie = YoutubeIE() - expect_value( - self, ie._extract_chapters_from_description(description, duration), - expected_chapters, None) - - -if __name__ == '__main__': - unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 69df30eda..627d4cb92 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -19,55 +19,46 @@ from youtube_dl.compat import compat_str, compat_urlretrieve _TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', - 'js', 86, '>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', - 'js', 85, '3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfle-mVwz.js', - 'js', 90, ']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl0Cbn9e.js', - 'js', 84, 'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVW@YZ!"#$%&\'()*+,-./:;<=', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', - 'js', '2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', 'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', - 'js', 84, '123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl9FYC6l.js', - 'js', 83, '123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflCGk6yw/html5player.js', - 'js', '4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288', '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', - 'js', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', ) @@ -78,6 +69,10 @@ class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): PLAYER_URLS = ( ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/fr_FR/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), # obsolete ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), @@ -86,13 +81,9 @@ class TestPlayerInfo(unittest.TestCase): ('https://www.youtube.com/yts/jsbin/player-en_US-vflaxXRn1/base.js', 'vflaxXRn1'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), - ('http://s.ytimg.com/yt/swfbin/watch_as3-vflrEm9Nq.swf', 'vflrEm9Nq'), - ('https://s.ytimg.com/yts/swfbin/player-vflenCdZL/watch_as3.swf', 'vflenCdZL'), ) for player_url, expected_player_id in PLAYER_URLS: - expected_player_type = player_url.split('.')[-1] - player_type, player_id = YoutubeIE._extract_player_info(player_url) - self.assertEqual(player_type, expected_player_type) + player_id = YoutubeIE._extract_player_info(player_url) self.assertEqual(player_id, expected_player_id) @@ -104,13 +95,13 @@ class TestSignature(unittest.TestCase): os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, stype, sig_input, expected_sig): +def make_tfunc(url, sig_input, expected_sig): m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) assert m, '%r should follow URL format' % url test_id = m.group(1) def test_func(self): - basename = 'player-%s.%s' % (test_id, stype) + basename = 'player-%s.js' % test_id fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): @@ -118,22 +109,16 @@ def make_tfunc(url, stype, sig_input, expected_sig): ydl = FakeYDL() ie = YoutubeIE(ydl) - if stype == 'js': - with io.open(fn, encoding='utf-8') as testf: - jscode = testf.read() - func = ie._parse_sig_js(jscode) - else: - assert stype == 'swf' - with open(fn, 'rb') as testf: - swfcode = testf.read() - func = ie._parse_sig_swf(swfcode) + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + func = ie._parse_sig_js(jscode) src_sig = ( compat_str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) got_sig = func(src_sig) self.assertEqual(got_sig, expected_sig) - test_func.__name__ = str('test_signature_' + stype + '_' + test_id) + test_func.__name__ = str('test_signature_js_' + test_id) setattr(TestSignature, test_func.__name__, test_func) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 855a73157..ecac31f7a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -163,6 +163,7 @@ class YoutubeDL(object): simulate: Do not download the video files. format: Video format code. See options.py for more information. outtmpl: Template for output names. + outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. force_generic_extractor: Force downloader to use the generic extractor @@ -338,6 +339,8 @@ class YoutubeDL(object): _pps = [] _download_retcode = None _num_downloads = None + _playlist_level = 0 + _playlist_urls = set() _screen_file = None def __init__(self, params=None, auto_init=True): @@ -656,7 +659,7 @@ class YoutubeDL(object): template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) for k, v in template_dict.items() if v is not None and not isinstance(v, (list, tuple, dict))) - template_dict = collections.defaultdict(lambda: 'NA', template_dict) + template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) @@ -676,8 +679,8 @@ class YoutubeDL(object): # Missing numeric fields used together with integer presentation types # in format specification will break the argument substitution since - # string 'NA' is returned for missing fields. We will patch output - # template for missing fields to meet string presentation type. + # string NA placeholder is returned for missing fields. We will patch + # output template for missing fields to meet string presentation type. for numeric_field in self._NUMERIC_FIELDS: if numeric_field not in template_dict: # As of [1] format syntax is: @@ -906,115 +909,23 @@ class YoutubeDL(object): return self.process_ie_result( new_result, download=download, extra_info=extra_info) elif result_type in ('playlist', 'multi_video'): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - self.to_screen('[download] Downloading playlist: %s' % playlist) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - 1 - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) - - ie_entries = ie_result['entries'] - - def make_playlistitems_entries(list_ie_entries): - num_entries = len(list_ie_entries) - return [ - list_ie_entries[i - 1] for i in playlistitems - if -num_entries <= i - 1 < num_entries] - - def report_download(num_entries): + # Protect from infinite recursion due to recursively nested playlists + # (see https://github.com/ytdl-org/youtube-dl/issues/27833) + webpage_url = ie_result['webpage_url'] + if webpage_url in self._playlist_urls: self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, num_entries)) + '[download] Skipping already downloaded playlist: %s' + % ie_result.get('title') or ie_result.get('id')) + return - if isinstance(ie_entries, list): - n_all_entries = len(ie_entries) - if playlistitems: - entries = make_playlistitems_entries(ie_entries) - else: - entries = ie_entries[playliststart:playlistend] - n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) - elif isinstance(ie_entries, PagedList): - if playlistitems: - entries = [] - for item in playlistitems: - entries.extend(ie_entries.getslice( - item - 1, item - )) - else: - entries = ie_entries.getslice( - playliststart, playlistend) - n_entries = len(entries) - report_download(n_entries) - else: # iterable - if playlistitems: - entries = make_playlistitems_entries(list(itertools.islice( - ie_entries, 0, max(playlistitems)))) - else: - entries = list(itertools.islice( - ie_entries, playliststart, playlistend)) - n_entries = len(entries) - report_download(n_entries) - - if self.params.get('playlistreverse', False): - entries = entries[::-1] - - if self.params.get('playlistrandom', False): - random.shuffle(entries) - - x_forwarded_for = ie_result.get('__x_forwarded_for_ip') - - for i, entry in enumerate(entries, 1): - self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) - # This __x_forwarded_for_ip thing is a bit ugly but requires - # minimal changes - if x_forwarded_for: - entry['__x_forwarded_for_ip'] = x_forwarded_for - extra = { - 'n_entries': n_entries, - 'playlist': playlist, - 'playlist_id': ie_result.get('id'), - 'playlist_title': ie_result.get('title'), - 'playlist_uploader': ie_result.get('uploader'), - 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, - 'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': url_basename(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } - - reason = self._match_entry(entry, incomplete=True) - if reason is not None: - self.to_screen('[download] ' + reason) - continue - - entry_result = self.__process_iterable_entry(entry, download, extra) - # TODO: skip failed (empty) entries? - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results - self.to_screen('[download] Finished downloading playlist: %s' % playlist) - return ie_result + self._playlist_level += 1 + self._playlist_urls.add(webpage_url) + try: + return self.__process_playlist(ie_result, download) + finally: + self._playlist_level -= 1 + if not self._playlist_level: + self._playlist_urls.clear() elif result_type == 'compat_list': self.report_warning( 'Extractor %s returned a compat_list result. ' @@ -1039,6 +950,118 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) + def __process_playlist(self, ie_result, download): + # We process each entry in the playlist + playlist = ie_result.get('title') or ie_result.get('id') + + self.to_screen('[download] Downloading playlist: %s' % playlist) + + playlist_results = [] + + playliststart = self.params.get('playliststart', 1) - 1 + playlistend = self.params.get('playlistend') + # For backwards compatibility, interpret -1 as whole list + if playlistend == -1: + playlistend = None + + playlistitems_str = self.params.get('playlist_items') + playlistitems = None + if playlistitems_str is not None: + def iter_playlistitems(format): + for string_segment in format.split(','): + if '-' in string_segment: + start, end = string_segment.split('-') + for item in range(int(start), int(end) + 1): + yield int(item) + else: + yield int(string_segment) + playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + + ie_entries = ie_result['entries'] + + def make_playlistitems_entries(list_ie_entries): + num_entries = len(list_ie_entries) + return [ + list_ie_entries[i - 1] for i in playlistitems + if -num_entries <= i - 1 < num_entries] + + def report_download(num_entries): + self.to_screen( + '[%s] playlist %s: Downloading %d videos' % + (ie_result['extractor'], playlist, num_entries)) + + if isinstance(ie_entries, list): + n_all_entries = len(ie_entries) + if playlistitems: + entries = make_playlistitems_entries(ie_entries) + else: + entries = ie_entries[playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + elif isinstance(ie_entries, PagedList): + if playlistitems: + entries = [] + for item in playlistitems: + entries.extend(ie_entries.getslice( + item - 1, item + )) + else: + entries = ie_entries.getslice( + playliststart, playlistend) + n_entries = len(entries) + report_download(n_entries) + else: # iterable + if playlistitems: + entries = make_playlistitems_entries(list(itertools.islice( + ie_entries, 0, max(playlistitems)))) + else: + entries = list(itertools.islice( + ie_entries, playliststart, playlistend)) + n_entries = len(entries) + report_download(n_entries) + + if self.params.get('playlistreverse', False): + entries = entries[::-1] + + if self.params.get('playlistrandom', False): + random.shuffle(entries) + + x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + + for i, entry in enumerate(entries, 1): + self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for + extra = { + 'n_entries': n_entries, + 'playlist': playlist, + 'playlist_id': ie_result.get('id'), + 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), + 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + + reason = self._match_entry(entry, incomplete=True) + if reason is not None: + self.to_screen('[download] ' + reason) + continue + + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? + playlist_results.append(entry_result) + ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) + return ie_result + @__handle_extraction_exceptions def __process_iterable_entry(self, entry, download, extra_info): return self.process_ie_result( @@ -1083,7 +1106,7 @@ class YoutubeDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id) + \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id|language) \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9._-]+) \s*$ @@ -1226,6 +1249,8 @@ class YoutubeDL(object): group = _parse_format_selection(tokens, inside_group=True) current_selector = FormatSelector(GROUP, group, []) elif string == '+': + if inside_merge: + raise syntax_error('Unexpected "+"', start) video_selector = current_selector audio_selector = _parse_format_selection(tokens, inside_merge=True) if not video_selector or not audio_selector: @@ -1610,7 +1635,7 @@ class YoutubeDL(object): if req_format is None: req_format = self._default_format_spec(info_dict, download=download) if self.params.get('verbose'): - self.to_stdout('[debug] Default format spec: %s' % req_format) + self._write_string('[debug] Default format spec: %s\n' % req_format) format_selector = self.build_format_selector(req_format) @@ -1777,6 +1802,8 @@ class YoutubeDL(object): os.makedirs(dn) return True except (OSError, IOError) as err: + if isinstance(err, OSError) and err.errno == errno.EEXIST: + return True self.report_error('unable to create directory ' + error_to_compat_str(err)) return False @@ -1871,7 +1898,7 @@ class YoutubeDL(object): for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): - self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) + self.to_screen('[debug] Invoking downloader on %r' % info.get('url')) return fd.download(name, info) if info_dict.get('requested_formats') is not None: @@ -2410,7 +2437,7 @@ class YoutubeDL(object): thumb_ext = determine_ext(t['url'], 'jpg') suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' - t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext + t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): self.to_screen('[%s] %s: Thumbnail %sis already present' % diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9a659fc65..e1bd67919 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -340,6 +340,7 @@ def _real_main(argv=None): 'format': opts.format, 'listformats': opts.listformats, 'outtmpl': outtmpl, + 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder, 'autonumber_size': opts.autonumber_size, 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 02f35459e..35c76feba 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -97,12 +97,15 @@ class FragmentFD(FileDownloader): def _download_fragment(self, ctx, frag_url, info_dict, headers=None): fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) - success = ctx['dl'].download(fragment_filename, { + fragment_info_dict = { 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), - }) + } + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None + if fragment_info_dict.get('filetime'): + ctx['fragment_filetime'] = fragment_info_dict.get('filetime') down, frag_sanitized = sanitize_open(fragment_filename, 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() @@ -258,6 +261,13 @@ class FragmentFD(FileDownloader): downloaded_bytes = ctx['complete_frags_downloaded_bytes'] else: self.try_rename(ctx['tmpfilename'], ctx['filename']) + if self.params.get('updatetime', True): + filetime = ctx.get('fragment_filetime') + if filetime: + try: + os.utime(ctx['filename'], (time.time(), filetime)) + except Exception: + pass downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) self._hook_progress({ diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 0f2c06f40..7aaebc940 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -42,11 +42,13 @@ class HlsFD(FragmentFD): # no segments will definitely be appended to the end of the playlist. # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of # # event media playlists [4] + r'#EXT-X-MAP:', # media initialization [5] # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 + # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5 ) check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest @@ -170,8 +172,12 @@ class HlsFD(FragmentFD): iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - frag_content = AES.new( - decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block + # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, + # not what it decrypts to. + if not test: + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) # We only download the first fragment during the test if test: diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 8b407bf9c..908c83377 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -1,14 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar import re -import time from .amp import AMPIE from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import compat_urlparse +from ..utils import ( + parse_duration, + parse_iso8601, + try_get, +) class AbcNewsVideoIE(AMPIE): @@ -18,8 +19,8 @@ class AbcNewsVideoIE(AMPIE): (?: abcnews\.go\.com/ (?: - [^/]+/video/(?P<display_id>[0-9a-z-]+)-| - video/embed\?.*?\bid= + (?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-| + video/(?:embed|itemfeed)\?.*?\bid= )| fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ ) @@ -36,6 +37,8 @@ class AbcNewsVideoIE(AMPIE): 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.', 'duration': 180, 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1380454200, + 'upload_date': '20130929', }, 'params': { # m3u8 download @@ -47,6 +50,12 @@ class AbcNewsVideoIE(AMPIE): }, { 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', 'only_matching': True, + }, { + 'url': 'http://abcnews.go.com/video/itemfeed?id=46979033', + 'only_matching': True, + }, { + 'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761', + 'only_matching': True, }] def _real_extract(self, url): @@ -67,28 +76,23 @@ class AbcNewsIE(InfoExtractor): _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' _TESTS = [{ - 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', + # Youtube Embeds + 'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501', 'info_dict': { - 'id': '10505354', - 'ext': 'flv', - 'display_id': 'dramatic-video-rare-death-job-america', - 'title': 'Occupational Hazards', - 'description': 'Nightline investigates the dangers that lurk at various jobs.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20100428', - 'timestamp': 1272412800, + 'id': '51286501', + 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player", + 'description': 'Billingsley went from a child actor to Hollywood power player.', }, - 'add_ie': ['AbcNewsVideo'], + 'playlist_count': 5, }, { 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', 'info_dict': { 'id': '38897857', 'ext': 'mp4', - 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', 'title': 'Justin Timberlake Drops Hints For Secret Single', 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', - 'upload_date': '20160515', - 'timestamp': 1463329500, + 'upload_date': '20160505', + 'timestamp': 1462442280, }, 'params': { # m3u8 download @@ -100,49 +104,55 @@ class AbcNewsIE(InfoExtractor): }, { 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', 'only_matching': True, + }, { + # inline.type == 'video' + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - video_id = mobj.group('id') + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + story = self._parse_json(self._search_regex( + r"window\['__abcnews__'\]\s*=\s*({.+?});", + webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0] + article_contents = story.get('articleContents') or {} - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') - full_video_url = compat_urlparse.urljoin(url, video_url) + def entries(): + featured_video = story.get('featuredVideo') or {} + feed = try_get(featured_video, lambda x: x['video']['feed']) + if feed: + yield { + '_type': 'url', + 'id': featured_video.get('id'), + 'title': featured_video.get('name'), + 'url': feed, + 'thumbnail': featured_video.get('images'), + 'description': featured_video.get('description'), + 'timestamp': parse_iso8601(featured_video.get('uploadDate')), + 'duration': parse_duration(featured_video.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - youtube_url = YoutubeIE._extract_url(webpage) + for inline in (article_contents.get('inlines') or []): + inline_type = inline.get('type') + if inline_type == 'iframe': + iframe_url = try_get(inline, lambda x: x['attrs']['src']) + if iframe_url: + yield self.url_result(iframe_url) + elif inline_type == 'video': + video_id = inline.get('id') + if video_id: + yield { + '_type': 'url', + 'id': video_id, + 'url': 'http://abcnews.go.com/video/embed?id=' + video_id, + 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'), + 'description': inline.get('description'), + 'duration': parse_duration(inline.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - timestamp = None - date_str = self._html_search_regex( - r'<span[^>]+class="timestamp">([^<]+)</span>', - webpage, 'timestamp', fatal=False) - if date_str: - tz_offset = 0 - if date_str.endswith(' ET'): # Eastern Time - tz_offset = -5 - date_str = date_str[:-3] - date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p'] - for date_format in date_formats: - try: - timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) - except ValueError: - continue - if timestamp is not None: - timestamp -= tz_offset * 3600 - - entry = { - '_type': 'url_transparent', - 'ie_key': AbcNewsVideoIE.ie_key(), - 'url': full_video_url, - 'id': video_id, - 'display_id': display_id, - 'timestamp': timestamp, - } - - if youtube_url: - entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] - return self.playlist_result(entries) - - return entry + return self.playlist_result( + entries(), story_id, article_contents.get('headline'), + article_contents.get('subHead')) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index b17c792d2..b9355a2c8 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -2,21 +2,48 @@ from __future__ import unicode_literals import re -import functools from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, - float_or_none, + clean_podcast_url, int_or_none, - try_get, - unified_timestamp, - OnDemandPagedList, + parse_iso8601, ) -class ACastIE(InfoExtractor): +class ACastBaseIE(InfoExtractor): + def _extract_episode(self, episode, show_info): + title = episode['title'] + info = { + 'id': episode['id'], + 'display_id': episode.get('episodeUrl'), + 'url': clean_podcast_url(episode['url']), + 'title': title, + 'description': clean_html(episode.get('description') or episode.get('summary')), + 'thumbnail': episode.get('image'), + 'timestamp': parse_iso8601(episode.get('publishDate')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(episode.get('contentLength')), + 'season_number': int_or_none(episode.get('season')), + 'episode': title, + 'episode_number': int_or_none(episode.get('episode')), + } + info.update(show_info) + return info + + def _extract_show_info(self, show): + return { + 'creator': show.get('author'), + 'series': show.get('title'), + } + + def _call_api(self, path, video_id, query=None): + return self._download_json( + 'https://feeder.acast.com/api/v1/shows/' + path, video_id, query=query) + + +class ACastIE(ACastBaseIE): IE_NAME = 'acast' _VALID_URL = r'''(?x) https?:// @@ -28,15 +55,15 @@ class ACastIE(InfoExtractor): ''' _TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': '16d936099ec5ca2d5869e3a813ee8dc4', + 'md5': 'f5598f3ad1e4776fed12ec1407153e4b', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', - 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', + 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', 'timestamp': 1477346700, 'upload_date': '20161024', - 'duration': 2766.602563, + 'duration': 2766, 'creator': 'Anton Berg & Martin Johnson', 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', @@ -45,7 +72,7 @@ class ACastIE(InfoExtractor): 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', 'only_matching': True, }, { - 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22', + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2', 'only_matching': True, }, { 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', @@ -54,40 +81,14 @@ class ACastIE(InfoExtractor): def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() - s = self._download_json( - 'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id), - display_id) - media_url = s['url'] - if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id): - episode_url = s.get('episodeUrl') - if episode_url: - display_id = episode_url - else: - channel, display_id = re.match(self._VALID_URL, s['link']).groups() - cast_data = self._download_json( - 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), - display_id)['result'] - e = cast_data['episode'] - title = e.get('name') or s['title'] - return { - 'id': compat_str(e['id']), - 'display_id': display_id, - 'url': media_url, - 'title': title, - 'description': e.get('summary') or clean_html(e.get('description') or s.get('description')), - 'thumbnail': e.get('image'), - 'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')), - 'duration': float_or_none(e.get('duration') or s.get('duration')), - 'filesize': int_or_none(e.get('contentLength')), - 'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str), - 'series': try_get(cast_data, lambda x: x['show']['name'], compat_str), - 'season_number': int_or_none(e.get('seasonNumber')), - 'episode': title, - 'episode_number': int_or_none(e.get('episodeNumber')), - } + episode = self._call_api( + '%s/episodes/%s' % (channel, display_id), + display_id, {'showInfo': 'true'}) + return self._extract_episode( + episode, self._extract_show_info(episode.get('show') or {})) -class ACastChannelIE(InfoExtractor): +class ACastChannelIE(ACastBaseIE): IE_NAME = 'acast:channel' _VALID_URL = r'''(?x) https?:// @@ -102,34 +103,24 @@ class ACastChannelIE(InfoExtractor): 'info_dict': { 'id': '4efc5294-5385-4847-98bd-519799ce5786', 'title': 'Today in Focus', - 'description': 'md5:9ba5564de5ce897faeb12963f4537a64', + 'description': 'md5:c09ce28c91002ce4ffce71d6504abaae', }, - 'playlist_mincount': 35, + 'playlist_mincount': 200, }, { 'url': 'http://play.acast.com/s/ft-banking-weekly', 'only_matching': True, }] - _API_BASE_URL = 'https://play.acast.com/api/' - _PAGE_SIZE = 10 @classmethod def suitable(cls, url): return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) - def _fetch_page(self, channel_slug, page): - casts = self._download_json( - self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page), - channel_slug, note='Download page %d of channel data' % page) - for cast in casts: - yield self.url_result( - 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']), - 'ACast', cast['id']) - def _real_extract(self, url): - channel_slug = self._match_id(url) - channel_data = self._download_json( - self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug) - entries = OnDemandPagedList(functools.partial( - self._fetch_page, channel_slug), self._PAGE_SIZE) - return self.playlist_result(entries, compat_str( - channel_data['id']), channel_data['name'], channel_data.get('description')) + show_slug = self._match_id(url) + show = self._call_api(show_slug, show_slug) + show_info = self._extract_show_info(show) + entries = [] + for episode in (show.get('episodes') or []): + entries.append(self._extract_episode(episode, show_info)) + return self.playlist_result( + entries, show.get('id'), show.get('title'), show.get('description')) diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index c95ad2173..a55ebbcbd 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -10,6 +10,7 @@ import random from .common import InfoExtractor from ..aes import aes_cbc_decrypt from ..compat import ( + compat_HTTPError, compat_b64decode, compat_ord, ) @@ -18,11 +19,14 @@ from ..utils import ( bytes_to_long, ExtractorError, float_or_none, + int_or_none, intlist_to_bytes, long_to_bytes, pkcs1pad, strip_or_none, - urljoin, + try_get, + unified_strdate, + urlencode_postdata, ) @@ -31,16 +35,30 @@ class ADNIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)' _TEST = { 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'md5': 'e497370d847fd79d9d4c74be55575c7a', + 'md5': '0319c99885ff5547565cacb4f3f9348d', 'info_dict': { 'id': '7778', 'ext': 'mp4', - 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1', + 'title': 'Blue Exorcist - Kyôto Saga - Episode 1', 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', + 'series': 'Blue Exorcist - Kyôto Saga', + 'duration': 1467, + 'release_date': '20170106', + 'comment_count': int, + 'average_rating': float, + 'season_number': 2, + 'episode': 'Début des hostilités', + 'episode_number': 1, } } + + _NETRC_MACHINE = 'animedigitalnetwork' _BASE_URL = 'http://animedigitalnetwork.fr' - _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) + _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' + _PLAYER_BASE_URL = _API_BASE_URL + 'player/' + _HEADERS = {} + _LOGIN_ERR_MESSAGE = 'Unable to log in' + _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, 'end': 3, @@ -54,26 +72,24 @@ class ADNIE(InfoExtractor): def _ass_subtitles_timecode(seconds): return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100) - def _get_subtitles(self, sub_path, video_id): - if not sub_path: + def _get_subtitles(self, sub_url, video_id): + if not sub_url: return None enc_subtitles = self._download_webpage( - urljoin(self._BASE_URL, sub_path), - video_id, 'Downloading subtitles location', fatal=False) or '{}' + sub_url, video_id, 'Downloading subtitles location', fatal=False) or '{}' subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location') if subtitle_location: enc_subtitles = self._download_webpage( - urljoin(self._BASE_URL, subtitle_location), - video_id, 'Downloading subtitles data', fatal=False, - headers={'Origin': 'https://animedigitalnetwork.fr'}) + subtitle_location, video_id, 'Downloading subtitles data', + fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'}) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(binascii.unhexlify(self._K + '4b8ef13ec1872730')), + bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -117,61 +133,100 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' }]) return subtitles + def _real_initialize(self): + username, password = self._get_login_info() + if not username: + return + try: + access_token = (self._download_json( + self._API_BASE_URL + 'authentication/login', None, + 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, + data=urlencode_postdata({ + 'password': password, + 'rememberMe': False, + 'source': 'Web', + 'username': username, + })) or {}).get('accessToken') + if access_token: + self._HEADERS = {'authorization': 'Bearer ' + access_token} + except ExtractorError as e: + message = None + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json( + e.cause.read().decode(), None, fatal=False) or {} + message = resp.get('message') or resp.get('code') + self.report_warning(message or self._LOGIN_ERR_MESSAGE) + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - player_config = self._parse_json(self._search_regex( - r'playerConfig\s*=\s*({.+});', webpage, - 'player config', default='{}'), video_id, fatal=False) - if not player_config: - config_url = urljoin(self._BASE_URL, self._search_regex( - r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"', - webpage, 'config url')) - player_config = self._download_json( - config_url, video_id, - 'Downloading player config JSON metadata')['player'] + video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id + player = self._download_json( + video_base_url + 'configuration', video_id, + 'Downloading player config JSON metadata', + headers=self._HEADERS)['player'] + options = player['options'] - video_info = {} - video_info_str = self._search_regex( - r'videoInfo\s*=\s*({.+});', webpage, - 'video info', fatal=False) - if video_info_str: - video_info = self._parse_json( - video_info_str, video_id, fatal=False) or {} + user = options['user'] + if not user.get('hasAccess'): + self.raise_login_required() - options = player_config.get('options') or {} - metas = options.get('metas') or {} - links = player_config.get('links') or {} - sub_path = player_config.get('subtitles') - error = None - if not links: - links_url = player_config.get('linksurl') or options['videoUrl'] - token = options['token'] - self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) - message = bytes_to_intlist(json.dumps({ - 'k': self._K, - 'e': 60, - 't': token, - })) + token = self._download_json( + user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'), + video_id, 'Downloading access token', headers={ + 'x-player-refresh-token': user['refreshToken'] + }, data=b'')['token'] + + links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link') + self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + message = bytes_to_intlist(json.dumps({ + 'k': self._K, + 't': token, + })) + + # Sometimes authentication fails for no good reason, retry with + # a different random padding + links_data = None + for _ in range(3): padded_message = intlist_to_bytes(pkcs1pad(message, 128)) n, e = self._RSA_KEY encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) authorization = base64.b64encode(encrypted_message).decode() - links_data = self._download_json( - urljoin(self._BASE_URL, links_url), video_id, - 'Downloading links JSON metadata', headers={ - 'Authorization': 'Bearer ' + authorization, - }) - links = links_data.get('links') or {} - metas = metas or links_data.get('meta') or {} - sub_path = sub_path or links_data.get('subtitles') or \ - 'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id - sub_path += '&token=' + token - error = links_data.get('error') - title = metas.get('title') or video_info['title'] + + try: + links_data = self._download_json( + links_url, video_id, 'Downloading links JSON metadata', headers={ + 'X-Player-Token': authorization + }, query={ + 'freeWithAds': 'true', + 'adaptive': 'false', + 'withMetadata': 'true', + 'source': 'Web' + }) + break + except ExtractorError as e: + if not isinstance(e.cause, compat_HTTPError): + raise e + + if e.cause.code == 401: + # This usually goes away with a different random pkcs1pad, so retry + continue + + error = self._parse_json(e.cause.read(), video_id) + message = error.get('message') + if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': + self.raise_geo_restricted(msg=message) + raise ExtractorError(message) + else: + raise ExtractorError('Giving up retrying') + + links = links_data.get('links') or {} + metas = links_data.get('metadata') or {} + sub_url = (links.get('subtitles') or {}).get('all') + video_info = links_data.get('video') or {} + title = metas['title'] formats = [] - for format_id, qualities in links.items(): + for format_id, qualities in (links.get('streaming') or {}).items(): if not isinstance(qualities, dict): continue for quality, load_balancer_url in qualities.items(): @@ -189,19 +244,26 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' for f in m3u8_formats: f['language'] = 'fr' formats.extend(m3u8_formats) - if not error: - error = options.get('error') - if not formats and error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) + video = (self._download_json( + self._API_BASE_URL + 'video/%s' % video_id, video_id, + 'Downloading additional video metadata', fatal=False) or {}).get('video') or {} + show = video.get('show') or {} + return { 'id': video_id, 'title': title, - 'description': strip_or_none(metas.get('summary') or video_info.get('resume')), - 'thumbnail': video_info.get('image'), + 'description': strip_or_none(metas.get('summary') or video.get('summary')), + 'thumbnail': video_info.get('image') or player.get('image'), 'formats': formats, - 'subtitles': self.extract_subtitles(sub_path, video_id), - 'episode': metas.get('subtitle') or video_info.get('videoTitle'), - 'series': video_info.get('playlistTitle'), + 'subtitles': self.extract_subtitles(sub_url, video_id), + 'episode': metas.get('subtitle') or video.get('name'), + 'episode_number': int_or_none(video.get('shortNumber')), + 'series': show.get('title'), + 'season_number': int_or_none(video.get('season')), + 'duration': int_or_none(video_info.get('duration') or video.get('duration')), + 'release_date': unified_strdate(video.get('releaseDate')), + 'average_rating': float_or_none(video.get('rating') or metas.get('rating')), + 'comment_count': int_or_none(video.get('commentsCount')), } diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 611b948f5..e55c03fd7 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -5,20 +5,32 @@ import re from .theplatform import ThePlatformIE from ..utils import ( - extract_attributes, ExtractorError, + GeoRestrictedError, int_or_none, - smuggle_url, update_url_query, -) -from ..compat import ( - compat_urlparse, + urlencode_postdata, ) class AENetworksBaseIE(ThePlatformIE): + _BASE_URL_REGEX = r'''(?x)https?:// + (?:(?:www|play|watch)\.)? + (?P<domain> + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| + fyi\.tv + )/''' _THEPLATFORM_KEY = 'crazyjava' _THEPLATFORM_SECRET = 's3cr3t' + _DOMAIN_MAP = { + 'history.com': ('HISTORY', 'history'), + 'aetv.com': ('AETV', 'aetv'), + 'mylifetime.com': ('LIFETIME', 'lifetime'), + 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), + 'fyi.tv': ('FYI', 'fyi'), + 'historyvault.com': (None, 'historyvault'), + 'biography.com': (None, 'biography'), + } def _extract_aen_smil(self, smil_url, video_id, auth=None): query = {'mbr': 'true'} @@ -31,7 +43,7 @@ class AENetworksBaseIE(ThePlatformIE): 'assetTypes': 'high_video_s3' }, { 'assetTypes': 'high_video_s3', - 'switch': 'hls_ingest_fastly' + 'switch': 'hls_high_fastly', }] formats = [] subtitles = {} @@ -44,6 +56,8 @@ class AENetworksBaseIE(ThePlatformIE): tp_formats, tp_subtitles = self._extract_theplatform_smil( m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) except ExtractorError as e: + if isinstance(e, GeoRestrictedError): + raise last_e = e continue formats.extend(tp_formats) @@ -57,24 +71,45 @@ class AENetworksBaseIE(ThePlatformIE): 'subtitles': subtitles, } + def _extract_aetn_info(self, domain, filter_key, filter_value, url): + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + auth = None + if theplatform_metadata.get('AETN$isBehindWall'): + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) + auth = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) + return info + class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?P<domain> - (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| - fyi\.tv - )/ - (?: - shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})| - movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?| - specials/(?P<special_display_id>[^/]+)/(?:full-special|preview-)| - collections/[^/]+/(?P<collection_display_id>[^/]+) - ) - ''' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P<id> + shows/[^/]+/season-\d+/episode-\d+| + (?: + (?:movie|special)s/[^/]+| + (?:shows/[^/]+/)?videos + )/[^/?#&]+ + )''' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'info_dict': { @@ -91,22 +126,23 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.history.com/shows/ancient-aliens/season-1', - 'info_dict': { - 'id': '71889446852', - }, - 'playlist_mincount': 5, - }, { - 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', - 'info_dict': { - 'id': 'SERIES4317', - 'title': 'Atlanta Plastic', - }, - 'playlist_mincount': 2, + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', - 'only_matching': True + 'info_dict': { + 'id': '600587331957', + 'ext': 'mp4', + 'title': 'Inlawful Entry', + 'description': 'md5:57c12115a2b384d883fe64ca50529e08', + 'timestamp': 1452634428, + 'upload_date': '20160112', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True @@ -117,78 +153,125 @@ class AENetworksIE(AENetworksBaseIE): 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True }, { - 'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us', + 'url': 'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie', 'only_matching': True }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True - }, { - 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', - 'only_matching': True }, { 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', 'only_matching': True + }, { + 'url': 'http://www.history.com/videos/history-of-valentines-day', + 'only_matching': True + }, { + 'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape', + 'only_matching': True }] - _DOMAIN_TO_REQUESTOR_ID = { - 'history.com': 'HISTORY', - 'aetv.com': 'AETV', - 'mylifetime.com': 'LIFETIME', - 'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB', - 'fyi.tv': 'FYI', - } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id or collection_display_id - webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) - if show_path: - url_parts = show_path.split('/') - url_parts_len = len(url_parts) - if url_parts_len == 1: - entries = [] - for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): - entries.append(self.url_result( - compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - if entries: - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - else: - # single season - url_parts_len = 2 - if url_parts_len == 2: - entries = [] - for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): - episode_attributes = extract_attributes(episode_item) - episode_url = compat_urlparse.urljoin( - url, episode_attributes['data-canonical']) - entries.append(self.url_result( - episode_url, 'AENetworks', - episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeasonId', webpage)) + domain, canonical = re.match(self._VALID_URL, url).groups() + return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url) - video_id = self._html_search_meta('aetn:VideoID', webpage) - media_url = self._search_regex( - [r"media_url\s*=\s*'(?P<url>[^']+)'", - r'data-media-url=(?P<url>(?:https?:)?//[^\s>]+)', - r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'], - webpage, 'video url', group='url') - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - auth = None - if theplatform_metadata.get('AETN$isBehindWall'): - requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] - resource = self._get_mvpd_resource( - requestor_id, theplatform_metadata['title'], - theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), - theplatform_metadata['ratings'][0]['rating']) - auth = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - info.update(self._search_json_ld(webpage, video_id, fatal=False)) - info.update(self._extract_aen_smil(media_url, video_id, auth)) - return info + +class AENetworksListBaseIE(AENetworksBaseIE): + def _call_api(self, resource, slug, brand, fields): + return self._download_json( + 'https://yoga.appsvcs.aetnd.com/graphql', + slug, query={'brand': brand}, data=urlencode_postdata({ + 'query': '''{ + %s(slug: "%s") { + %s + } +}''' % (resource, slug, fields), + }))['data'][resource] + + def _real_extract(self, url): + domain, slug = re.match(self._VALID_URL, url).groups() + _, brand = self._DOMAIN_MAP[domain] + playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) + base_url = 'http://watch.%s' % domain + + entries = [] + for item in (playlist.get(self._ITEMS_KEY) or []): + doc = self._get_doc(item) + canonical = doc.get('canonical') + if not canonical: + continue + entries.append(self.url_result( + base_url + canonical, AENetworksIE.ie_key(), doc.get('id'))) + + description = None + if self._PLAYLIST_DESCRIPTION_KEY: + description = playlist.get(self._PLAYLIST_DESCRIPTION_KEY) + + return self.playlist_result( + entries, playlist.get('id'), + playlist.get(self._PLAYLIST_TITLE_KEY), description) + + +class AENetworksCollectionIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:collection' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P<id>[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://watch.historyvault.com/list/america-the-story-of-us', + 'info_dict': { + 'id': '282', + 'title': 'America The Story of Us', + }, + 'playlist_mincount': 12, + }, { + 'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us', + 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/mysteryquest', + 'only_matching': True + }] + _RESOURCE = 'list' + _ITEMS_KEY = 'items' + _PLAYLIST_TITLE_KEY = 'display_title' + _PLAYLIST_DESCRIPTION_KEY = None + _FIELDS = '''id + display_title + items { + ... on ListVideoItem { + doc { + canonical + id + } + } + }''' + + def _get_doc(self, item): + return item.get('doc') or {} + + +class AENetworksShowIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:show' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P<id>[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.history.com/shows/ancient-aliens', + 'info_dict': { + 'id': 'SERIES1574', + 'title': 'Ancient Aliens', + 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', + }, + 'playlist_mincount': 150, + }] + _RESOURCE = 'series' + _ITEMS_KEY = 'episodes' + _PLAYLIST_TITLE_KEY = 'title' + _PLAYLIST_DESCRIPTION_KEY = 'description' + _FIELDS = '''description + id + title + episodes { + canonical + id + }''' + + def _get_doc(self, item): + return item class HistoryTopicIE(AENetworksBaseIE): @@ -204,6 +287,7 @@ class HistoryTopicIE(AENetworksBaseIE): 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', 'timestamp': 1375819729, 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -212,36 +296,47 @@ class HistoryTopicIE(AENetworksBaseIE): 'add_ie': ['ThePlatform'], }] - def theplatform_url_result(self, theplatform_url, video_id, query): - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url( - update_url_query(theplatform_url, query), - { - 'sig': { - 'key': self._THEPLATFORM_KEY, - 'secret': self._THEPLATFORM_SECRET, - }, - 'force_smil_url': True - }), - 'ie_key': 'ThePlatform', - } + def _real_extract(self, url): + display_id = self._match_id(url) + return self.url_result( + 'http://www.history.com/videos/' + display_id, + AENetworksIE.ie_key()) + + +class HistoryPlayerIE(AENetworksBaseIE): + IE_NAME = 'history:player' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)' + _TESTS = [] + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_aetn_info(domain, 'id', video_id, url) + + +class BiographyIE(AENetworksBaseIE): + _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808', + 'info_dict': { + 'id': '30322987', + 'ext': 'mp4', + 'title': 'Vincent Van Gogh - Full Episode', + 'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.', + 'timestamp': 1311970571, + 'upload_date': '20110729', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'<phoenix-iframe[^>]+src="[^"]+\btpid=(\d+)', webpage, 'tpid') - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/history/videos', - video_id, query={'filter[id]': video_id})['results'][0] - title = result['title'] - info = self._extract_aen_smil(result['publicUrl'], video_id) - info.update({ - 'title': title, - 'description': result.get('description'), - 'duration': int_or_none(result.get('duration')), - 'timestamp': int_or_none(result.get('added'), 1000), - }) - return info + player_url = self._search_regex( + r'<phoenix-iframe[^>]+src="(%s)' % HistoryPlayerIE._VALID_URL, + webpage, 'player URL') + return self.url_result(player_url, HistoryPlayerIE.ie_key()) diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index c68be3134..c4f915a3c 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -1,13 +1,16 @@ from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' _TESTS = [{ - 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', + 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', 'info_dict': { 'id': '3792260579001', 'ext': 'mp4', @@ -20,14 +23,34 @@ class AlJazeeraIE(InfoExtractor): 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', }, { - 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', + 'only_matching': True, + }, { + 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _real_extract(self, url): - program_name = self._match_id(url) - webpage = self._download_webpage(url, program_name) - brightcove_id = self._search_regex( - r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + post_type, name = re.match(self._VALID_URL, url).groups() + post_type = { + 'features': 'post', + 'program': 'episode', + 'videos': 'video', + }[post_type.split('/')[0]] + video = self._download_json( + 'https://www.aljazeera.com/graphql', name, query={ + 'operationName': 'SingleArticleQuery', + 'variables': json.dumps({ + 'name': name, + 'postType': post_type, + }), + }, headers={ + 'wp-site': 'aje', + })['data']['article']['video'] + video_id = video['id'] + account_id = video.get('accountId') or '665003303001' + player_id = video.get('playerId') or 'BkeSH5BDb' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index 6fb3d6c53..b8027bbca 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import ( int_or_none, @@ -11,25 +13,22 @@ from ..utils import ( class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', - 'md5': '', + 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', 'info_dict': { - 'id': 's3MX01Nl4vPH', + 'id': '4Lq1dzOnZGt0', 'ext': 'mp4', - 'title': 'Maron - Season 4 - Step 1', - 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.', - 'age_limit': 17, - 'upload_date': '20160505', - 'timestamp': 1462468831, + 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", + 'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", + 'upload_date': '20201120', + 'timestamp': 1605904350, 'uploader': 'AMCN', }, 'params': { # m3u8 download 'skip_download': True, }, - 'skip': 'Requires TV provider accounts', }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, @@ -55,32 +54,34 @@ class AMCNetworksIE(ThePlatformIE): 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', 'only_matching': True, }] + _REQUESTOR_ID_MAP = { + 'amc': 'AMC', + 'bbcamerica': 'BBCA', + 'ifc': 'IFC', + 'sundancetv': 'SUNDANCE', + 'wetv': 'WETV', + } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + site, display_id = re.match(self._VALID_URL, url).groups() + requestor_id = self._REQUESTOR_ID_MAP[site] + properties = self._download_json( + 'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s' % (requestor_id.lower(), display_id), + display_id)['data']['properties'] query = { 'mbr': 'true', 'manifest': 'm3u', } - media_url = self._search_regex( - r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', - webpage, 'media url') - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'link\.theplatform\.com/s/([^?]+)', - media_url, 'theplatform_path'), display_id) + tp_path = 'M_UwQC/media/' + properties['videoPid'] + media_url = 'https://link.theplatform.com/s/' + tp_path + theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) info = self._parse_theplatform_metadata(theplatform_metadata) video_id = theplatform_metadata['pid'] title = theplatform_metadata['title'] rating = try_get( theplatform_metadata, lambda x: x['ratings'][0]['rating']) - auth_required = self._search_regex( - r'window\.authRequired\s*=\s*(true|false);', - webpage, 'auth required') - if auth_required == 'true': - requestor_id = self._search_regex( - r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', - webpage, 'requestor id') + video_category = properties.get('videoCategory') + if video_category and video_category.endswith('-Auth'): resource = self._get_mvpd_resource( requestor_id, title, video_id, rating) query['auth'] = self._extract_mvpd_auth( diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 9c9d77ae1..be960c0f9 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -1,82 +1,159 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor from ..utils import ( clean_html, int_or_none, - js_to_json, try_get, unified_strdate, + unified_timestamp, ) class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { 'id': '5b400b9ee338f922cb06450c', - 'title': 'Weeknight Japanese Suppers', + 'title': 'Japanese Suppers', 'ext': 'mp4', - 'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8', + 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', - 'timestamp': 1523664000, - 'upload_date': '20180414', - 'release_date': '20180414', + 'timestamp': 1523318400, + 'upload_date': '20180410', + 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, - 'episode': 'Weeknight Japanese Suppers', + 'episode': 'Japanese Suppers', 'episode_number': 15, }, 'params': { 'skip_download': True, }, + }, { + # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above) + 'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner', + 'md5': '06451608c57651e985a498e69cec17e5', + 'info_dict': { + 'id': '5fbe8c61bda2010001c6763b', + 'title': 'Simple Chicken Dinner', + 'ext': 'mp4', + 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', + 'thumbnail': r're:^https?://', + 'timestamp': 1610755200, + 'upload_date': '20210116', + 'release_date': '20210116', + 'series': "America's Test Kitchen", + 'season_number': 21, + 'episode': 'Simple Chicken Dinner', + 'episode_number': 3, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + resource_type, video_id = re.match(self._VALID_URL, url).groups() + is_episode = resource_type == 'episode' + if is_episode: + resource_type = 'episodes' - webpage = self._download_webpage(url, video_id) - - video_data = self._parse_json( - self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*</script>', - webpage, 'initial context'), - video_id, js_to_json) - - ep_data = try_get( - video_data, - (lambda x: x['episodeDetail']['content']['data'], - lambda x: x['videoDetail']['content']['data']), dict) - ep_meta = ep_data.get('full_video', {}) - - zype_id = ep_data.get('zype_id') or ep_meta['zype_id'] - - title = ep_data.get('title') or ep_meta.get('title') - description = clean_html(ep_meta.get('episode_description') or ep_data.get( - 'description') or ep_meta.get('description')) - thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) - release_date = unified_strdate(ep_data.get('aired_at')) - - season_number = int_or_none(ep_meta.get('season_number')) - episode = ep_meta.get('title') - episode_number = int_or_none(ep_meta.get('episode_number')) + resource = self._download_json( + 'https://www.americastestkitchen.com/api/v6/%s/%s' % (resource_type, video_id), video_id) + video = resource['video'] if is_episode else resource + episode = resource if is_episode else resource.get('episode') or {} return { '_type': 'url_transparent', - 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id, + 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'release_date': release_date, - 'series': "America's Test Kitchen", - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, + 'description': clean_html(video.get('description')), + 'timestamp': unified_timestamp(video.get('publishDate')), + 'release_date': unified_strdate(video.get('publishDate')), + 'episode_number': int_or_none(episode.get('number')), + 'season_number': int_or_none(episode.get('season')), + 'series': try_get(episode, lambda x: x['show']['title']), + 'episode': episode.get('title'), } + + +class AmericasTestKitchenSeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)' + _TESTS = [{ + # ATK Season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', + 'info_dict': { + 'id': 'season_1', + 'title': 'Season 1', + }, + 'playlist_count': 13, + }, { + # Cooks Country Season + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'info_dict': { + 'id': 'season_12', + 'title': 'Season 12', + }, + 'playlist_count': 13, + }] + + def _real_extract(self, url): + show_name, season_number = re.match(self._VALID_URL, url).groups() + season_number = int(season_number) + + slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + + season = 'Season %d' % season_number + + season_search = self._download_json( + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, + season, headers={ + 'Origin': 'https://www.%s.com' % show_name, + 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', + 'X-Algolia-Application-Id': 'Y1FNZXUI30', + }, query={ + 'facetFilters': json.dumps([ + 'search_season_list:' + season, + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ]), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'attributesToHighlight': '', + 'hitsPerPage': 1000, + }) + + def entries(): + for episode in (season_search.get('hits') or []): + search_url = episode.get('search_url') + if not search_url: + continue + yield { + '_type': 'url', + 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'timestamp': unified_timestamp(episode.get('search_document_date')), + 'season_number': season_number, + 'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)), + 'ie_key': AmericasTestKitchenIE.ie_key(), + } + + return self.playlist_result( + entries(), 'season_%d' % season_number, season) diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 7ff098cfa..24c684cad 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, mimetype2ext, parse_iso8601, + unified_timestamp, url_or_none, ) @@ -88,7 +89,7 @@ class AMPIE(InfoExtractor): self._sort_formats(formats) - timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) return { 'id': video_id, diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 00ce684d1..54e097d2f 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -116,8 +116,6 @@ class AnimeOnDemandIE(InfoExtractor): r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>', webpage, 'anime description', default=None) - entries = [] - def extract_info(html, video_id, num=None): title, description = [None] * 2 formats = [] @@ -233,7 +231,7 @@ class AnimeOnDemandIE(InfoExtractor): self._sort_formats(info['formats']) f = common_info.copy() f.update(info) - entries.append(f) + yield f # Extract teaser/trailer only when full episode is not available if not info['formats']: @@ -247,7 +245,7 @@ class AnimeOnDemandIE(InfoExtractor): 'title': m.group('title'), 'url': urljoin(url, m.group('href')), }) - entries.append(f) + yield f def extract_episodes(html): for num, episode_html in enumerate(re.findall( @@ -275,7 +273,8 @@ class AnimeOnDemandIE(InfoExtractor): 'episode_number': episode_number, } - extract_entries(episode_html, video_id, common_info) + for e in extract_entries(episode_html, video_id, common_info): + yield e def extract_film(html, video_id): common_info = { @@ -283,11 +282,18 @@ class AnimeOnDemandIE(InfoExtractor): 'title': anime_title, 'description': anime_description, } - extract_entries(html, video_id, common_info) + for e in extract_entries(html, video_id, common_info): + yield e - extract_episodes(webpage) + def entries(): + has_episodes = False + for e in extract_episodes(webpage): + has_episodes = True + yield e - if not entries: - extract_film(webpage, anime_id) + if not has_episodes: + for e in extract_film(webpage, anime_id): + yield e - return self.playlist_result(entries, anime_id, anime_title, anime_description) + return self.playlist_result( + entries(), anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index 84e841035..b7398563b 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -116,7 +116,76 @@ class AnvatoIE(InfoExtractor): 'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn', 'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W', 'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ', - 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ' + 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', + 'X8POa4zPPaKVZHqmWjuEzfP31b1QM9VN': 'Dn5vOY9ooDw7VSl9qztjZI5o0g08mA0z', + 'M2v78QkBMpNJlSPp9diX5F2PBmBy6Bog': 'ka6K32kyo7nDZfNkjQCGWf1lpApXMd1B', + 'bvJ0dQpav07l0hG5JgfVLF2dv1vARwpP': 'BzoQW24GrJZoJfmNodiJKSPeB9B8NOxj', + 'lxQMLg2XZKuEZaWgsqubBxV9INZ6bryY': 'Vm2Mx6noKds9jB71h6urazwlTG3m9x8l', + '04EnjvXeoSmkbJ9ckPs7oY0mcxv7PlyN': 'aXERQP9LMfQVlEDsgGs6eEA1SWznAQ8P', + 'mQbO2ge6BFRWVPYCYpU06YvNt80XLvAX': 'E2BV1NGmasN5v7eujECVPJgwflnLPm2A', + 'g43oeBzJrCml7o6fa5fRL1ErCdeD8z4K': 'RX34mZ6zVH4Nr6whbxIGLv9WSbxEKo8V', + 'VQrDJoP7mtdBzkxhXbSPwGB1coeElk4x': 'j2VejQx0VFKQepAF7dI0mJLKtOVJE18z', + 'WxA5NzLRjCrmq0NUgaU5pdMDuZO7RJ4w': 'lyY5ADLKaIOLEgAsGQCveEMAcqnx3rY9', + 'M4lpMXB71ie0PjMCjdFzVXq0SeRVqz49': 'n2zVkOqaLIv3GbLfBjcwW51LcveWOZ2e', + 'dyDZGEqN8u8nkJZcJns0oxYmtP7KbGAn': 'VXOEqQW9BtEVLajfZQSLEqxgS5B7qn2D', + 'E7QNjrVY5u5mGvgu67IoDgV1CjEND8QR': 'rz8AaDmdKIkLmPNhB5ILPJnjS5PnlL8d', + 'a4zrqjoKlfzg0dwHEWtP31VqcLBpjm4g': 'LY9J16gwETdGWa3hjBu5o0RzuoQDjqXQ', + 'dQP5BZroMsMVLO1hbmT5r2Enu86GjxA6': '7XR3oOdbPF6x3PRFLDCq9RkgsRjAo48V', + 'M4lKNBO1NFe0PjMCj1tzVXq0SeRVqzA9': 'n2zoRqGLRUv3GbLfBmTwW51LcveWOZYe', + 'nAZ7MZdpGCGg1pqFEbsoJOz2C60mv143': 'dYJgdqA9aT4yojETqGi7yNgoFADxqmXP', + '3y1MERYgOuE9NzbFgwhV6Wv2F0YKvbyz': '081xpZDQgC4VadLTavhWQxrku56DAgXV', + 'bmQvmEXr5HWklBMCZOcpE2Z3HBYwqGyl': 'zxXPbVNyMiMAZldhr9FkOmA0fl4aKr2v', + 'wA7oDNYldfr6050Hwxi52lPZiVlB86Ap': 'ZYK16aA7ni0d3l3c34uwpxD7CbReMm8Q', + 'g43MbKMWmFml7o7sJoSRkXxZiXRvJ3QK': 'RX3oBJonvs4Nr6rUWBCGn3matRGqJPXV', + 'mA9VdlqpLS0raGaSDvtoqNrBTzb8XY4q': '0XN4OjBD3fnW7r7IbmtJB4AyfOmlrE2r', + 'mAajOwgkGt17oGoFmEuklMP9H0GnW54d': 'lXbBLPGyzikNGeGujAuAJGjZiwLRxyXR', + 'vy8vjJ9kbUwrRqRu59Cj5dWZfzYErlAb': 'K8l7gpwaGcBpnAnCLNCmPZRdin3eaQX0', + 'xQMWBpR8oHEZaWaSMGUb0avOHjLVYn4Y': 'm2MrN4vEaf9jB7BFy5Srb40jTrN67AYl', + 'xyKEmVO3miRr6D6UVkt7oB8jtD6aJEAv': 'g2ddDebqDfqdgKgswyUKwGjbTWwzq923', + '7Qk0wa2D9FjKapacoJF27aLvUDKkLGA0': 'b2kgBEkephJaMkMTL7s1PLe4Ua6WyP2P', + '3QLg6nqmNTJ5VvVTo7f508LPidz1xwyY': 'g2L1GgpraipmAOAUqmIbBnPxHOmw4MYa', + '3y1B7zZjXTE9NZNSzZSVNPZaTNLjo6Qz': '081b5G6wzH4VagaURmcWbN5mT4JGEe2V', + 'lAqnwvkw6SG6D8DSqmUg6DRLUp0w3G4x': 'O2pbP0xPDFNJjpjIEvcdryOJtpkVM4X5', + 'awA7xd1N0Hr6050Hw2c52lPZiVlB864p': 'GZYKpn4aoT0d3l3c3PiwpxD7CbReMmXQ', + 'jQVqPLl9YHL1WGWtR1HDgWBGT63qRNyV': '6X03ne6vrU4oWyWUN7tQVoajikxJR3Ye', + 'GQRMR8mL7uZK797t7xH3eNzPIP5dOny1': 'm2vqPWGd4U31zWzSyasDRAoMT1PKRp8o', + 'zydq9RdmRhXLkNkfNoTJlMzaF0lWekQB': '3X7LnvE7vH5nkEkSqLiey793Un7dLB8e', + 'VQrDzwkB2IdBzjzu9MHPbEYkSB50gR4x': 'j2VebLzoKUKQeEesmVh0gM1eIp9jKz8z', + 'mAa2wMamBs17oGoFmktklMP9H0GnW54d': 'lXbgP74xZTkNGeGujVUAJGjZiwLRxy8R', + '7yjB6ZLG6sW8R6RF2xcan1KGfJ5dNoyd': 'wXQkPorvPHZ45N5t4Jf6qwg5Tp4xvw29', + 'a4zPpNeWGuzg0m0iX3tPeanGSkRKWXQg': 'LY9oa3QAyHdGW9Wu3Ri5JGeEik7l1N8Q', + 'k2rneA2M38k25cXDwwSknTJlxPxQLZ6M': '61lyA2aEVDzklfdwmmh31saPxQx2VRjp', + 'bK9Zk4OvPnvxduLgxvi8VUeojnjA02eV': 'o5jANYjbeMb4nfBaQvcLAt1jzLzYx6ze', + '5VD6EydM3R9orHmNMGInGCJwbxbQvGRw': 'w3zjmX7g4vnxzCxElvUEOiewkokXprkZ', + '70X35QbVYVYNPUmP9YfbzI06YqYQk2R1': 'vG4Aj2BMjMjoztB7zeFOnCVPJpJ8lMOa', + '26qYwQVG9p1Bks2GgBckjfDJOXOAMgG1': 'r4ev9X0mv5zqJc0yk5IBDcQOwZw8mnwQ', + 'rvVKpA56MBXWlSxMw3cobT5pdkd4Dm7q': '1J7ZkY53pZ645c93owcLZuveE7E8B3rL', + 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo': 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo', + 'jdKqRGF16dKsBviMDae7IGDl7oTjEbVV': 'Q09l7vhlNxPFErIOK6BVCe7KnwUW5DVV', + '3QLkogW1OUJ5VvPsrDH56DY2u7lgZWyY': 'g2LRE1V9espmAOPhE4ubj4ZdUA57yDXa', + 'wyJvWbXGBSdbkEzhv0CW8meou82aqRy8': 'M2wolPvyBIpQGkbT4juedD4ruzQGdK2y', + '7QkdZrzEkFjKap6IYDU2PB0oCNZORmA0': 'b2kN1l96qhJaMkPs9dt1lpjBfwqZoA8P', + 'pvA05113MHG1w3JTYxc6DVlRCjErVz4O': 'gQXeAbblBUnDJ7vujbHvbRd1cxlz3AXO', + 'mA9blJDZwT0raG1cvkuoeVjLC7ZWd54q': '0XN9jRPwMHnW7rvumgfJZOD9CJgVkWYr', + '5QwRN5qKJTvGKlDTmnf7xwNZcjRmvEy9': 'R2GP6LWBJU1QlnytwGt0B9pytWwAdDYy', + 'eyn5rPPbkfw2KYxH32fG1q58CbLJzM40': 'p2gyqooZnS56JWeiDgfmOy1VugOQEBXn', + '3BABn3b5RfPJGDwilbHe7l82uBoR05Am': '7OYZG7KMVhbPdKJS3xcWEN3AuDlLNmXj', + 'xA5zNGXD3HrmqMlF6OS5pdMDuZO7RJ4w': 'yY5DAm6r1IOLE3BCVMFveEMAcqnx3r29', + 'g43PgW3JZfml7o6fDEURL1ErCdeD8zyK': 'RX3aQn1zrS4Nr6whDgCGLv9WSbxEKo2V', + 'lAqp8WbGgiG6D8LTKJcg3O72CDdre1Qx': 'O2pnm6473HNJjpKuVosd3vVeh975yrX5', + 'wyJbYEDxKSdbkJ6S6RhW8meou82aqRy8': 'M2wPm7EgRSpQGlAh70CedD4ruzQGdKYy', + 'M4lgW28nLCe0PVdtaXszVXq0SeRVqzA9': 'n2zmJvg4jHv3G0ETNgiwW51LcveWOZ8e', + '5Qw3OVvp9FvGKlDTmOC7xwNZcjRmvEQ9': 'R2GzDdml9F1Qlnytw9s0B9pytWwAdD8y', + 'vy8a98X7zCwrRqbHrLUjYzwDiK2b70Qb': 'K8lVwzyjZiBpnAaSGeUmnAgxuGOBxmY0', + 'g4eGjJLLoiqRD3Pf9oT5O03LuNbLRDQp': '6XqD59zzpfN4EwQuaGt67qNpSyRBlnYy', + 'g43OPp9boIml7o6fDOIRL1ErCdeD8z4K': 'RX33alNB4s4Nr6whDPUGLv9WSbxEKoXV', + 'xA2ng9OkBcGKzDbTkKsJlx7dUK8R3dA5': 'z2aPnJvzBfObkwGC3vFaPxeBhxoMqZ8K', + 'xyKEgBajZuRr6DEC0Kt7XpD1cnNW9gAv': 'g2ddlEBvRsqdgKaI4jUK9PrgfMexGZ23', + 'BAogww51jIMa2JnH1BcYpXM5F658RNAL': 'rYWDmm0KptlkGv4FGJFMdZmjs9RDE6XR', + 'BAokpg62VtMa2JnH1mHYpXM5F658RNAL': 'rYWryDnlNslkGv4FG4HMdZmjs9RDE62R', + 'a4z1Px5e2hzg0m0iMMCPeanGSkRKWXAg': 'LY9eorNQGUdGW9WuKKf5JGeEik7l1NYQ', + 'kAx69R58kF9nY5YcdecJdl2pFXP53WyX': 'gXyRxELpbfPvLeLSaRil0mp6UEzbZJ8L', + 'BAoY13nwViMa2J2uo2cY6BlETgmdwryL': 'rYWwKzJmNFlkGvGtNoUM9bzwIJVzB1YR', } _MCP_TO_ACCESS_KEY_TABLE = { @@ -189,19 +258,17 @@ class AnvatoIE(InfoExtractor): video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') anvrid = md5_text(time.time() * 1000 * random.random())[:30] - payload = { - 'api': { - 'anvrid': anvrid, - 'anvstk': md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))), - 'anvts': server_time, - }, + api = { + 'anvrid': anvrid, + 'anvts': server_time, } + api['anvstk'] = md5_text('%s|%s|%d|%s' % ( + access_key, anvrid, server_time, + self._ANVACK_TABLE.get(access_key, self._API_KEY))) return self._download_json( video_data_url, video_id, transform_source=strip_jsonp, - data=json.dumps(payload).encode('utf-8')) + data=json.dumps({'api': api}).encode('utf-8')) def _get_anvato_videos(self, access_key, video_id): video_data = self._get_video_json(access_key, video_id) @@ -259,7 +326,7 @@ class AnvatoIE(InfoExtractor): 'description': video_data.get('def_description'), 'tags': video_data.get('def_tags', '').split(','), 'categories': video_data.get('categories'), - 'thumbnail': video_data.get('thumbnail'), + 'thumbnail': video_data.get('src_image_url') or video_data.get('thumbnail'), 'timestamp': int_or_none(video_data.get( 'ts_published') or video_data.get('ts_added')), 'uploader': video_data.get('mcp_id'), diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index e87994a6a..f6ecb8438 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .yahoo import YahooIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -15,9 +15,9 @@ from ..utils import ( ) -class AolIE(InfoExtractor): +class AolIE(YahooIE): IE_NAME = 'aol.com' - _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>[0-9a-f]+)' + _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' _TESTS = [{ # video with 5min ID @@ -76,10 +76,16 @@ class AolIE(InfoExtractor): }, { 'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/', 'only_matching': True, + }, { + # Yahoo video + 'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + if '-' in video_id: + return self._extract_yahoo_video(video_id, 'us') response = self._download_json( 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 883dcee7a..a9527e785 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + get_element_by_id, int_or_none, merge_dicts, mimetype2ext, @@ -39,23 +40,15 @@ class AparatIE(InfoExtractor): webpage = self._download_webpage(url, video_id, fatal=False) if not webpage: - # Note: There is an easier-to-parse configuration at - # http://www.aparat.com/video/video/config/videohash/%video_id - # but the URL in there does not work webpage = self._download_webpage( 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, video_id) - options = self._parse_json( - self._search_regex( - r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1\s*\)', - webpage, 'options', group='value'), - video_id) - - player = options['plugins']['sabaPlayerPlugin'] + options = self._parse_json(self._search_regex( + r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id) formats = [] - for sources in player['multiSRC']: + for sources in (options.get('multiSRC') or []): for item in sources: if not isinstance(item, dict): continue @@ -85,11 +78,12 @@ class AparatIE(InfoExtractor): info = self._search_json_ld(webpage, video_id, default={}) if not info.get('title'): - info['title'] = player['title'] + info['title'] = get_element_by_id('videoTitle', webpage) or \ + self._html_search_meta(['og:title', 'twitter:title', 'DC.Title', 'title'], webpage, fatal=True) return merge_dicts(info, { 'id': video_id, 'thumbnail': url_or_none(options.get('poster')), - 'duration': int_or_none(player.get('duration')), + 'duration': int_or_none(options.get('duration')), 'formats': formats, }) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py new file mode 100644 index 000000000..95758fece --- /dev/null +++ b/youtube_dl/extractor/applepodcasts.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + try_get, +) + + +class ApplePodcastsIE(InfoExtractor): + _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'info_dict': { + 'id': '1000482637777', + 'ext': 'mp3', + 'title': '207 - Whitney Webb Returns', + 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'upload_date': '20200705', + 'timestamp': 1593921600, + 'duration': 6425, + 'series': 'The Tim Dillon Show', + } + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) + ember_data = self._parse_json(self._search_regex( + r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) + episode = ember_data['data']['attributes'] + description = episode.get('description') or {} + + series = None + for inc in (ember_data.get('included') or []): + if inc.get('type') == 'media/podcast': + series = try_get(inc, lambda x: x['attributes']['name']) + + return { + 'id': episode_id, + 'title': episode['name'], + 'url': clean_podcast_url(episode['assetUrl']), + 'description': description.get('standard') or description.get('short'), + 'timestamp': parse_iso8601(episode.get('releaseDateTime')), + 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), + 'series': series, + } diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index c79c58e82..e42ed5e79 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -2,15 +2,17 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - unified_strdate, clean_html, + extract_attributes, + unified_strdate, + unified_timestamp, ) class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', @@ -19,8 +21,11 @@ class ArchiveOrgIE(InfoExtractor): 'ext': 'ogg', 'title': '1968 Demo - FJCC Conference Presentation Reel #1', 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', - 'upload_date': '19681210', - 'uploader': 'SRI International' + 'creator': 'SRI International', + 'release_date': '19681210', + 'uploader': 'SRI International', + 'timestamp': 1268695290, + 'upload_date': '20100315', } }, { 'url': 'https://archive.org/details/Cops1922', @@ -29,22 +34,43 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', + 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'timestamp': 1387699629, + 'upload_date': '20131222', } }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'only_matching': True, + }, { + 'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) - jwplayer_playlist = self._parse_json(self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", - webpage, 'jwplayer playlist'), video_id) - info = self._parse_jwplayer_data( - {'playlist': jwplayer_playlist}, video_id, base_url=url) + + playlist = None + play8 = self._search_regex( + r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage, + 'playlist', default=None) + if play8: + attrs = extract_attributes(play8) + playlist = attrs.get('value') + if not playlist: + # Old jwplayer fallback + playlist = self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", + webpage, 'jwplayer playlist', default='[]') + jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False) + if jwplayer_playlist: + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) + else: + # HTML5 media fallback + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info['id'] = video_id def get_optional(metadata, field): return metadata.get(field, [None])[0] @@ -58,8 +84,12 @@ class ArchiveOrgIE(InfoExtractor): 'description': clean_html(get_optional(metadata, 'description')), }) if info.get('_type') != 'playlist': + creator = get_optional(metadata, 'creator') info.update({ - 'uploader': get_optional(metadata, 'creator'), - 'upload_date': unified_strdate(get_optional(metadata, 'date')), + 'creator': creator, + 'release_date': unified_strdate(get_optional(metadata, 'date')), + 'uploader': get_optional(metadata, 'publisher') or creator, + 'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')), + 'language': get_optional(metadata, 'language'), }) return info diff --git a/youtube_dl/extractor/arcpublishing.py b/youtube_dl/extractor/arcpublishing.py new file mode 100644 index 000000000..ca6a6c4d8 --- /dev/null +++ b/youtube_dl/extractor/arcpublishing.py @@ -0,0 +1,174 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_iso8601, + try_get, +) + + +class ArcPublishingIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _VALID_URL = r'arcpublishing:(?P<org>[a-z]+):(?P<id>%s)' % _UUID_REGEX + _TESTS = [{ + # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/ + 'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'only_matching': True, + }, { + # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/ + 'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1', + 'only_matching': True, + }, { + # https://www.actionnewsjax.com/video/live-stream/ + 'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a', + 'only_matching': True, + }, { + # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/ + 'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3', + 'only_matching': True, + }, { + # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/ + 'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe', + 'only_matching': True, + }, { + # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/ + 'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e', + 'only_matching': True, + }, { + # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/ + 'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143', + 'only_matching': True, + }, { + # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/ + 'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055', + 'only_matching': True, + }, { + # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/ + 'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d', + 'only_matching': True, + }, { + # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/ + 'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7', + 'only_matching': True, + }, { + # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/ + 'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b', + 'only_matching': True, + }, { + # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html + 'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685', + 'only_matching': True, + }] + _POWA_DEFAULTS = [ + (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'), + ([ + 'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo', + 'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom', + 'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek', + ], 'video-api-cdn.%s.arcpublishing.com/api'), + ] + + @staticmethod + def _extract_urls(webpage): + entries = [] + # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview + for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): + powa = extract_attributes(powa_el) or {} + org = powa.get('data-org') + uuid = powa.get('data-uuid') + if org and uuid: + entries.append('arcpublishing:%s:%s' % (org, uuid)) + return entries + + def _real_extract(self, url): + org, uuid = re.match(self._VALID_URL, url).groups() + for orgs, tmpl in self._POWA_DEFAULTS: + if org in orgs: + base_api_tmpl = tmpl + break + else: + base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api' + if org == 'wapo': + org = 'washpost' + video = self._download_json( + 'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org), + uuid, query={'uuid': uuid})[0] + title = video['headlines']['basic'] + is_live = video.get('status') == 'live' + + urls = [] + formats = [] + for s in video.get('streams', []): + s_url = s.get('url') + if not s_url or s_url in urls: + continue + urls.append(s_url) + stream_type = s.get('stream_type') + if stream_type == 'smil': + smil_formats = self._extract_smil_formats( + s_url, uuid, fatal=False) + for f in smil_formats: + if f['url'].endswith('/cfx/st'): + f['app'] = 'cfx/st' + if not f['play_path'].startswith('mp4:'): + f['play_path'] = 'mp4:' + f['play_path'] + if isinstance(f['tbr'], float): + f['vbr'] = f['tbr'] * 1000 + del f['tbr'] + f['format_id'] = 'rtmp-%d' % f['vbr'] + formats.extend(smil_formats) + elif stream_type in ('ts', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False) + if all([f.get('acodec') == 'none' for f in m3u8_formats]): + continue + for f in m3u8_formats: + if f.get('acodec') == 'none': + f['preference'] = -40 + elif f.get('vcodec') == 'none': + f['preference'] = -50 + height = f.get('height') + if not height: + continue + vbr = self._search_regex( + r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None) + if vbr: + f['vbr'] = int(vbr) + formats.extend(m3u8_formats) + else: + vbr = int_or_none(s.get('bitrate')) + formats.append({ + 'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type, + 'vbr': vbr, + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'filesize': int_or_none(s.get('filesize')), + 'url': s_url, + 'preference': -1, + }) + self._sort_formats( + formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id')) + + subtitles = {} + for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): + subtitle_url = subtitle.get('url') + if subtitle_url: + subtitles.setdefault('en', []).append({'url': subtitle_url}) + + return { + 'id': uuid, + 'title': self._live_title(title) if is_live else title, + 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), + 'description': try_get(video, lambda x: x['subheadlines']['basic']), + 'formats': formats, + 'duration': int_or_none(video.get('duration'), 100), + 'timestamp': parse_iso8601(video.get('created_date')), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 5b7b2dd6d..143fc51e9 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -187,13 +187,13 @@ class ARDMediathekIE(ARDMediathekBaseIE): if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) - title = self._html_search_regex( + title = self._og_search_title(webpage, default=None) or self._html_search_regex( [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', r'<meta name="dcterms\.title" content="(.*?)"/>', r'<h4 class="headline">(.*?)</h4>', r'<title[^>]*>(.*?)'], webpage, 'title') - description = self._html_search_meta( + description = self._og_search_description(webpage, default=None) or self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( @@ -249,18 +249,18 @@ class ARDMediathekIE(ARDMediathekBaseIE): class ARDIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?:video-?)?(?P[0-9]+))\.html' _TESTS = [{ - # available till 14.02.2019 - 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', - 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', + # available till 7.01.2022 + 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html', + 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1', 'info_dict': { - 'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video', - 'id': '102', + 'display_id': 'maischberger-die-woche', + 'id': '100', 'ext': 'mp4', - 'duration': 4435.0, - 'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?', - 'upload_date': '20180214', + 'duration': 3687.0, + 'title': 'maischberger. die woche vom 7. Januar 2021', + 'upload_date': '20210107', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -284,20 +284,42 @@ class ARDIE(InfoExtractor): formats = [] for a in video_node.findall('.//asset'): + file_name = xpath_text(a, './fileName', default=None) + if not file_name: + continue + format_type = a.attrib.get('type') + format_url = url_or_none(file_name) + if format_url: + ext = determine_ext(file_name) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_type or 'hls', fatal=False)) + continue + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), + display_id, f4m_id=format_type or 'hds', fatal=False)) + continue f = { - 'format_id': a.attrib['type'], - 'width': int_or_none(a.find('./frameWidth').text), - 'height': int_or_none(a.find('./frameHeight').text), - 'vbr': int_or_none(a.find('./bitrateVideo').text), - 'abr': int_or_none(a.find('./bitrateAudio').text), - 'vcodec': a.find('./codecVideo').text, - 'tbr': int_or_none(a.find('./totalBitrate').text), + 'format_id': format_type, + 'width': int_or_none(xpath_text(a, './frameWidth')), + 'height': int_or_none(xpath_text(a, './frameHeight')), + 'vbr': int_or_none(xpath_text(a, './bitrateVideo')), + 'abr': int_or_none(xpath_text(a, './bitrateAudio')), + 'vcodec': xpath_text(a, './codecVideo'), + 'tbr': int_or_none(xpath_text(a, './totalBitrate')), } - if a.find('./serverPrefix').text: - f['url'] = a.find('./serverPrefix').text - f['playpath'] = a.find('./fileName').text + server_prefix = xpath_text(a, './serverPrefix', default=None) + if server_prefix: + f.update({ + 'url': server_prefix, + 'playpath': file_name, + }) else: - f['url'] = a.find('./fileName').text + if not format_url: + continue + f['url'] = format_url formats.append(f) self._sort_formats(formats) @@ -315,17 +337,17 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', + 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', + 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', 'info_dict': { 'display_id': 'die-robuste-roswita', - 'id': '70153354', + 'id': '78566716', 'title': 'Die robuste Roswita', - 'description': r're:^Der Mord.*trüber ist als die Ilm.', + 'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', - 'timestamp': 1577047500, - 'upload_date': '20191222', + 'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard', + 'timestamp': 1596658200, + 'upload_date': '20200805', 'ext': 'mp4', }, }, { diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py index 854f58767..fd46b1c77 100644 --- a/youtube_dl/extractor/arkena.py +++ b/youtube_dl/extractor/arkena.py @@ -6,13 +6,11 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - determine_ext, ExtractorError, float_or_none, int_or_none, - mimetype2ext, parse_iso8601, - strip_jsonp, + try_get, ) @@ -20,22 +18,27 @@ class ArkenaIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - video\.arkena\.com/play2/embed/player\?| + video\.(?:arkena|qbrick)\.com/play2/embed/player\?| play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P[^/]+)/[^/]+/(?P\d+) ) ''' _TESTS = [{ - 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', - 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310', + 'md5': '97f117754e5f3c020f5f26da4a44ebaf', 'info_dict': { - 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'id': 'd8ab4607-00090107-aab86310', 'ext': 'mp4', - 'title': 'Big Buck Bunny', - 'description': 'Royalty free test video', - 'timestamp': 1432816365, - 'upload_date': '20150528', - 'is_live': False, + 'title': 'EM_HT20_117_roslund_v2.mp4', + 'timestamp': 1608285912, + 'upload_date': '20201218', + 'duration': 1429.162667, + 'subtitles': { + 'sv': 'count:3', + }, }, + }, { + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'only_matching': True, }, { 'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893', 'only_matching': True, @@ -72,62 +75,89 @@ class ArkenaIE(InfoExtractor): if not video_id or not account_id: raise ExtractorError('Invalid URL', expected=True) - playlist = self._download_json( - 'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_' - % (video_id, account_id), - video_id, transform_source=strip_jsonp)['Playlist'][0] + media = self._download_json( + 'https://video.qbrick.com/api/v1/public/accounts/%s/medias/%s' % (account_id, video_id), + video_id, query={ + # https://video.qbrick.com/docs/api/examples/library-api.html + 'fields': 'asset/resources/*/renditions/*(height,id,language,links/*(href,mimeType),type,size,videos/*(audios/*(codec,sampleRate),bitrate,codec,duration,height,width),width),created,metadata/*(title,description),tags', + }) + metadata = media.get('metadata') or {} + title = metadata['title'] - media_info = playlist['MediaInfo'] - title = media_info['Title'] - media_files = playlist['MediaFiles'] - - is_live = False + duration = None formats = [] - for kind_case, kind_formats in media_files.items(): - kind = kind_case.lower() - for f in kind_formats: - f_url = f.get('Url') - if not f_url: - continue - is_live = f.get('Live') == 'true' - exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None)) - if kind == 'm3u8' or 'm3u8' in exts: - formats.extend(self._extract_m3u8_formats( - f_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=kind, fatal=False, live=is_live)) - elif kind == 'flash' or 'f4m' in exts: - formats.extend(self._extract_f4m_formats( - f_url, video_id, f4m_id=kind, fatal=False)) - elif kind == 'dash' or 'mpd' in exts: - formats.extend(self._extract_mpd_formats( - f_url, video_id, mpd_id=kind, fatal=False)) - elif kind == 'silverlight': - # TODO: process when ism is supported (see - # https://github.com/ytdl-org/youtube-dl/issues/8118) - continue - else: - tbr = float_or_none(f.get('Bitrate'), 1000) - formats.append({ - 'url': f_url, - 'format_id': '%s-%d' % (kind, tbr) if tbr else kind, - 'tbr': tbr, - }) + thumbnails = [] + subtitles = {} + for resource in media['asset']['resources']: + for rendition in (resource.get('renditions') or []): + rendition_type = rendition.get('type') + for i, link in enumerate(rendition.get('links') or []): + href = link.get('href') + if not href: + continue + if rendition_type == 'image': + thumbnails.append({ + 'filesize': int_or_none(rendition.get('size')), + 'height': int_or_none(rendition.get('height')), + 'id': rendition.get('id'), + 'url': href, + 'width': int_or_none(rendition.get('width')), + }) + elif rendition_type == 'subtitle': + subtitles.setdefault(rendition.get('language') or 'en', []).append({ + 'url': href, + }) + elif rendition_type == 'video': + f = { + 'filesize': int_or_none(rendition.get('size')), + 'format_id': rendition.get('id'), + 'url': href, + } + video = try_get(rendition, lambda x: x['videos'][i], dict) + if video: + if not duration: + duration = float_or_none(video.get('duration')) + f.update({ + 'height': int_or_none(video.get('height')), + 'tbr': int_or_none(video.get('bitrate'), 1000), + 'vcodec': video.get('codec'), + 'width': int_or_none(video.get('width')), + }) + audio = try_get(video, lambda x: x['audios'][0], dict) + if audio: + f.update({ + 'acodec': audio.get('codec'), + 'asr': int_or_none(audio.get('sampleRate')), + }) + formats.append(f) + elif rendition_type == 'index': + mime_type = link.get('mimeType') + if mime_type == 'application/smil+xml': + formats.extend(self._extract_smil_formats( + href, video_id, fatal=False)) + elif mime_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif mime_type == 'application/hds+xml': + formats.extend(self._extract_f4m_formats( + href, video_id, f4m_id='hds', fatal=False)) + elif mime_type == 'application/dash+xml': + formats.extend(self._extract_f4m_formats( + href, video_id, f4m_id='hds', fatal=False)) + elif mime_type == 'application/vnd.ms-sstr+xml': + formats.extend(self._extract_ism_formats( + href, video_id, ism_id='mss', fatal=False)) self._sort_formats(formats) - description = media_info.get('Description') - video_id = media_info.get('VideoId') or video_id - timestamp = parse_iso8601(media_info.get('PublishDate')) - thumbnails = [{ - 'url': thumbnail['Url'], - 'width': int_or_none(thumbnail.get('Size')), - } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')] - return { 'id': video_id, 'title': title, - 'description': description, - 'timestamp': timestamp, - 'is_live': is_live, + 'description': metadata.get('description'), + 'timestamp': parse_iso8601(media.get('created')), 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'duration': duration, + 'tags': media.get('tags'), 'formats': formats, } diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py index 0348e680c..66ce7c686 100644 --- a/youtube_dl/extractor/asiancrush.py +++ b/youtube_dl/extractor/asiancrush.py @@ -1,27 +1,91 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import extract_attributes +from ..utils import ( + extract_attributes, + int_or_none, + OnDemandPagedList, + parse_age_limit, + strip_or_none, + try_get, +) -class AsianCrushIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?(?P(?:(?:asiancrush|yuyutv|midnightpulp)\.com|cocoro\.tv))' - _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P\d+)v\b' % _VALID_URL_BASE +class AsianCrushBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?(?P(?:(?:asiancrush|yuyutv|midnightpulp)\.com|(?:cocoro|retrocrush)\.tv))' + _KALTURA_KEYS = [ + 'video_url', 'progressive_url', 'download_url', 'thumbnail_url', + 'widescreen_thumbnail_url', 'screencap_widescreen', + ] + _API_SUFFIX = {'retrocrush.tv': '-ott'} + + def _call_api(self, host, endpoint, video_id, query, resource): + return self._download_json( + 'https://api%s.%s/%s' % (self._API_SUFFIX.get(host, ''), host, endpoint), video_id, + 'Downloading %s JSON metadata' % resource, query=query, + headers=self.geo_verification_headers())['objects'] + + def _download_object_data(self, host, object_id, resource): + return self._call_api( + host, 'search', object_id, {'id': object_id}, resource)[0] + + def _get_object_description(self, obj): + return strip_or_none(obj.get('long_description') or obj.get('short_description')) + + def _parse_video_data(self, video): + title = video['name'] + + entry_id, partner_id = [None] * 2 + for k in self._KALTURA_KEYS: + k_url = video.get(k) + if k_url: + mobj = re.search(r'/p/(\d+)/.+?/entryId/([^/]+)/', k_url) + if mobj: + partner_id, entry_id = mobj.groups() + break + + meta_categories = try_get(video, lambda x: x['meta']['categories'], list) or [] + categories = list(filter(None, [c.get('name') for c in meta_categories])) + + show_info = video.get('show_info') or {} + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': KalturaIE.ie_key(), + 'id': entry_id, + 'title': title, + 'description': self._get_object_description(video), + 'age_limit': parse_age_limit(video.get('mpaa_rating') or video.get('tv_rating')), + 'categories': categories, + 'series': show_info.get('show_name'), + 'season_number': int_or_none(show_info.get('season_num')), + 'season_id': show_info.get('season_id'), + 'episode_number': int_or_none(show_info.get('episode_num')), + } + + +class AsianCrushIE(AsianCrushBaseIE): + _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P\d+)v\b' % AsianCrushBaseIE._VALID_URL_BASE _TESTS = [{ - 'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/', + 'url': 'https://www.asiancrush.com/video/004289v/women-who-flirt', 'md5': 'c3b740e48d0ba002a42c0b72857beae6', 'info_dict': { 'id': '1_y4tmjm5r', 'ext': 'mp4', 'title': 'Women Who Flirt', - 'description': 'md5:7e986615808bcfb11756eb503a751487', + 'description': 'md5:b65c7e0ae03a85585476a62a186f924c', 'timestamp': 1496936429, 'upload_date': '20170608', 'uploader_id': 'craig@crifkin.com', + 'age_limit': 13, + 'categories': 'count:5', + 'duration': 5812, }, }, { 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', @@ -41,67 +105,35 @@ class AsianCrushIE(InfoExtractor): }, { 'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/', 'only_matching': True, + }, { + 'url': 'https://www.retrocrush.tv/video/true-tears/012328v-i...gave-away-my-tears', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - video_id = mobj.group('id') + host, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, video_id) - - entry_id, partner_id, title = [None] * 3 - - vars = self._parse_json( - self._search_regex( + if host == 'cocoro.tv': + webpage = self._download_webpage(url, video_id) + embed_vars = self._parse_json(self._search_regex( r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', - default='{}'), video_id, fatal=False) - if vars: - entry_id = vars.get('entry_id') - partner_id = vars.get('partner_id') - title = vars.get('vid_label') + default='{}'), video_id, fatal=False) or {} + video_id = embed_vars.get('entry_id') or video_id - if not entry_id: - entry_id = self._search_regex( - r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id') - - player = self._download_webpage( - 'https://api.%s/embeddedVideoPlayer' % host, video_id, - query={'id': entry_id}) - - kaltura_id = self._search_regex( - r'entry_id["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', player, - 'kaltura id', group='id') - - if not partner_id: - partner_id = self._search_regex( - r'/p(?:artner_id)?/(\d+)', player, 'partner id', - default='513551') - - description = self._html_search_regex( - r'(?s)]+\bclass=["\']description["\'][^>]*>(.+?)', - webpage, 'description', fatal=False) - - return { - '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), - 'ie_key': KalturaIE.ie_key(), - 'id': video_id, - 'title': title, - 'description': description, - } + video = self._download_object_data(host, video_id, 'video') + return self._parse_video_data(video) -class AsianCrushPlaylistIE(InfoExtractor): - _VALID_URL = r'%s/series/0+(?P\d+)s\b' % AsianCrushIE._VALID_URL_BASE +class AsianCrushPlaylistIE(AsianCrushBaseIE): + _VALID_URL = r'%s/series/0+(?P\d+)s\b' % AsianCrushBaseIE._VALID_URL_BASE _TESTS = [{ - 'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/', + 'url': 'https://www.asiancrush.com/series/006447s/fruity-samurai', 'info_dict': { - 'id': '12481', - 'title': 'Scholar Who Walks the Night', - 'description': 'md5:7addd7c5132a09fd4741152d96cce886', + 'id': '6447', + 'title': 'Fruity Samurai', + 'description': 'md5:7535174487e4a202d3872a7fc8f2f154', }, - 'playlist_count': 20, + 'playlist_count': 13, }, { 'url': 'https://www.yuyutv.com/series/013920s/peep-show/', 'only_matching': True, @@ -111,35 +143,58 @@ class AsianCrushPlaylistIE(InfoExtractor): }, { 'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/', 'only_matching': True, + }, { + 'url': 'https://www.retrocrush.tv/series/012355s/true-tears', + 'only_matching': True, }] + _PAGE_SIZE = 1000000000 + + def _fetch_page(self, domain, parent_id, page): + videos = self._call_api( + domain, 'getreferencedobjects', parent_id, { + 'max': self._PAGE_SIZE, + 'object_type': 'video', + 'parent_id': parent_id, + 'start': page * self._PAGE_SIZE, + }, 'page %d' % (page + 1)) + for video in videos: + yield self._parse_video_data(video) def _real_extract(self, url): - playlist_id = self._match_id(url) + host, playlist_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, playlist_id) + if host == 'cocoro.tv': + webpage = self._download_webpage(url, playlist_id) - entries = [] + entries = [] - for mobj in re.finditer( - r']+href=(["\'])(?P%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, - webpage): - attrs = extract_attributes(mobj.group(0)) - if attrs.get('class') == 'clearfix': - entries.append(self.url_result( - mobj.group('url'), ie=AsianCrushIE.ie_key())) + for mobj in re.finditer( + r']+href=(["\'])(?P%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, + webpage): + attrs = extract_attributes(mobj.group(0)) + if attrs.get('class') == 'clearfix': + entries.append(self.url_result( + mobj.group('url'), ie=AsianCrushIE.ie_key())) - title = self._html_search_regex( - r'(?s)]\bid=["\']movieTitle[^>]+>(.+?)', webpage, - 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'([^<]+)', webpage, 'title', fatal=False) - if title: - title = re.sub(r'\s*\|\s*.+?$', '', title) + title = self._html_search_regex( + r'(?s)]\bid=["\']movieTitle[^>]+>(.+?)', webpage, + 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or self._search_regex( + r'([^<]+)', webpage, 'title', fatal=False) + if title: + title = re.sub(r'\s*\|\s*.+?$', '', title) - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'twitter:description', webpage, 'description', fatal=False) + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, 'description', fatal=False) + else: + show = self._download_object_data(host, playlist_id, 'show') + title = show.get('name') + description = self._get_object_description(show) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, host, playlist_id), + self._PAGE_SIZE) return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/awaan.py b/youtube_dl/extractor/awaan.py index a2603bbff..3a7700cd4 100644 --- a/youtube_dl/extractor/awaan.py +++ b/youtube_dl/extractor/awaan.py @@ -48,6 +48,7 @@ class AWAANBaseIE(InfoExtractor): 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, + 'uploader_id': video_data.get('user_id'), } @@ -107,6 +108,7 @@ class AWAANLiveIE(AWAANBaseIE): 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'upload_date': '20150107', 'timestamp': 1420588800, + 'uploader_id': '71', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index b1e20def5..930266990 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,7 +47,7 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' _PARTNER_ID = '1719221' def _real_extract(self, url): diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 002c39c39..b4daee54e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -49,22 +49,17 @@ class BBCCoUkIE(InfoExtractor): _LOGIN_URL = 'https://account.bbc.com/signin' _NETRC_MACHINE = 'bbc' - _MEDIASELECTOR_URLS = [ + _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s' + _MEDIA_SETS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + 'iptv-all', + 'pc', ] - _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' - _NAMESPACES = ( - _MEDIASELECTION_NS, - _EMP_PLAYLIST_NS, - ) - _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -261,8 +256,6 @@ class BBCCoUkIE(InfoExtractor): 'only_matching': True, }] - _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' - def _login(self): username, password = self._get_login_info() if username is None: @@ -307,22 +300,14 @@ class BBCCoUkIE(InfoExtractor): def _extract_items(self, playlist): return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) - def _findall_ns(self, element, xpath): - elements = [] - for ns in self._NAMESPACES: - elements.extend(element.findall(xpath % ns)) - return elements - def _extract_medias(self, media_selection): - error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) - if error is None: - media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) - if error is not None: - raise BBCCoUkIE.MediaSelectionError(error.get('id')) - return self._findall_ns(media_selection, './{%s}media') + error = media_selection.get('result') + if error: + raise BBCCoUkIE.MediaSelectionError(error) + return media_selection.get('media') or [] def _extract_connections(self, media): - return self._findall_ns(media, './{%s}connection') + return media.get('connection') or [] def _get_subtitles(self, media, programme_id): subtitles = {} @@ -334,13 +319,13 @@ class BBCCoUkIE(InfoExtractor): cc_url, programme_id, 'Downloading captions', fatal=False) if not isinstance(captions, compat_etree_Element): continue - lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - subtitles[lang] = [ + subtitles['en'] = [ { 'url': connection.get('href'), 'ext': 'ttml', }, ] + break return subtitles def _raise_extractor_error(self, media_selection_error): @@ -350,10 +335,10 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): last_exception = None - for mediaselector_url in self._MEDIASELECTOR_URLS: + for media_set in self._MEDIA_SETS: try: return self._download_media_selector_url( - mediaselector_url % programme_id, programme_id) + self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) except BBCCoUkIE.MediaSelectionError as e: if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e @@ -362,8 +347,8 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML', + media_selection = self._download_json( + url, programme_id, 'Downloading media selection JSON', expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) @@ -377,7 +362,6 @@ class BBCCoUkIE(InfoExtractor): if kind in ('video', 'audio'): bitrate = int_or_none(media.get('bitrate')) encoding = media.get('encoding') - service = media.get('service') width = int_or_none(media.get('width')) height = int_or_none(media.get('height')) file_size = int_or_none(media.get('media_file_size')) @@ -392,8 +376,6 @@ class BBCCoUkIE(InfoExtractor): supplier = connection.get('supplier') transfer_format = connection.get('transferFormat') format_id = supplier or conn_kind or protocol - if service: - format_id = '%s_%s' % (service, format_id) # ASX playlist if supplier == 'asx': for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): @@ -408,20 +390,11 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) - if re.search(self._USP_RE, href): - usp_formats = self._extract_m3u8_formats( - re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), - programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False) - for f in usp_formats: - if f.get('height') and f['height'] > 720: - continue - formats.append(f) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) else: - if not service and not supplier and bitrate: + if not supplier and bitrate: format_id += '-%d' % bitrate fmt = { 'format_id': format_id, @@ -554,7 +527,7 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') error = self._search_regex( - r']+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + r']+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<', webpage, 'error', default=None) if error: raise ExtractorError(error, expected=True) @@ -607,16 +580,9 @@ class BBCIE(BBCCoUkIE): IE_DESC = 'BBC' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - _MEDIASELECTOR_URLS = [ - # Provides HQ HLS streams but fails with geolocation in some cases when it's - # even not geo restricted at all - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - # Provides more formats, namely direct mp4 links, but fails on some videos with - # notukerror for non UK (?) users (e.g. - # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', - # Provides fewer formats, but works everywhere for everybody (hopefully) - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + _MEDIA_SETS = [ + 'mobile-tablet-main', + 'pc', ] _TESTS = [{ @@ -981,7 +947,7 @@ class BBCIE(BBCCoUkIE): group_id = self._search_regex( r']+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, webpage, 'group id', default=None) - if playlist_id: + if group_id: return self.url_result( 'https://www.bbc.co.uk/programmes/%s' % group_id, ie=BBCCoUkIE.ie_key()) @@ -1092,10 +1058,26 @@ class BBCIE(BBCCoUkIE): self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, 'bbcthree config', default='{}'), - playlist_id, transform_source=js_to_json, fatal=False) - if bbc3_config: + playlist_id, transform_source=js_to_json, fatal=False) or {} + payload = bbc3_config.get('payload') or {} + if payload: + clip = payload.get('currentClip') or {} + clip_vpid = clip.get('vpid') + clip_title = clip.get('title') + if clip_vpid and clip_title: + formats, subtitles = self._download_media_selector(clip_vpid) + self._sort_formats(formats) + return { + 'id': clip_vpid, + 'title': clip_title, + 'thumbnail': dict_get(clip, ('poster', 'imageUrl')), + 'description': clip.get('description'), + 'duration': parse_duration(clip.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } bbc3_playlist = try_get( - bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + payload, lambda x: x['content']['bbcMedia']['playlist'], dict) if bbc3_playlist: playlist_title = bbc3_playlist.get('title') or playlist_title @@ -1118,6 +1100,39 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if initial_data: + def parse_media(media): + if not media: + return + for item in (try_get(media, lambda x: x['media']['items'], list) or []): + item_id = item.get('id') + item_title = item.get('title') + if not (item_id and item_title): + continue + formats, subtitles = self._download_media_selector(item_id) + self._sort_formats(formats) + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item.get('holdingImageUrl'), + 'formats': formats, + 'subtitles': subtitles, + }) + for resp in (initial_data.get('data') or {}).values(): + name = resp.get('name') + if name == 'media-experience': + parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) + elif name == 'article': + for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + if block.get('type') != 'media': + continue + parse_media(block.get('model')) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/youtube_dl/extractor/beampro.py b/youtube_dl/extractor/beampro.py deleted file mode 100644 index 86abdae00..000000000 --- a/youtube_dl/extractor/beampro.py +++ /dev/null @@ -1,194 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - compat_str, - float_or_none, - int_or_none, - parse_iso8601, - try_get, - urljoin, -) - - -class BeamProBaseIE(InfoExtractor): - _API_BASE = 'https://mixer.com/api/v1' - _RATINGS = {'family': 0, 'teen': 13, '18+': 18} - - def _extract_channel_info(self, chan): - user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) - return { - 'uploader': chan.get('token') or try_get( - chan, lambda x: x['user']['username'], compat_str), - 'uploader_id': compat_str(user_id) if user_id else None, - 'age_limit': self._RATINGS.get(chan.get('audience')), - } - - -class BeamProLiveIE(BeamProBaseIE): - IE_NAME = 'Mixer:live' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P[^/?#&]+)' - _TEST = { - 'url': 'http://mixer.com/niterhayven', - 'info_dict': { - 'id': '261562', - 'ext': 'mp4', - 'title': 'Introducing The Witcher 3 // The Grind Starts Now!', - 'description': 'md5:0b161ac080f15fe05d18a07adb44a74d', - 'thumbnail': r're:https://.*\.jpg$', - 'timestamp': 1483477281, - 'upload_date': '20170103', - 'uploader': 'niterhayven', - 'uploader_id': '373396', - 'age_limit': 18, - 'is_live': True, - 'view_count': int, - }, - 'skip': 'niterhayven is offline', - 'params': { - 'skip_download': True, - }, - } - - _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE - - @classmethod - def suitable(cls, url): - return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url) - - def _real_extract(self, url): - channel_name = self._match_id(url) - - chan = self._download_json( - '%s/channels/%s' % (self._API_BASE, channel_name), channel_name) - - if chan.get('online') is False: - raise ExtractorError( - '{0} is offline'.format(channel_name), expected=True) - - channel_id = chan['id'] - - def manifest_url(kind): - return self._MANIFEST_URL_TEMPLATE % (channel_id, kind) - - formats = self._extract_m3u8_formats( - manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls', - fatal=False) - formats.extend(self._extract_smil_formats( - manifest_url('smil'), channel_name, fatal=False)) - self._sort_formats(formats) - - info = { - 'id': compat_str(chan.get('id') or channel_name), - 'title': self._live_title(chan.get('name') or channel_name), - 'description': clean_html(chan.get('description')), - 'thumbnail': try_get( - chan, lambda x: x['thumbnail']['url'], compat_str), - 'timestamp': parse_iso8601(chan.get('updatedAt')), - 'is_live': True, - 'view_count': int_or_none(chan.get('viewersTotal')), - 'formats': formats, - } - info.update(self._extract_channel_info(chan)) - - return info - - -class BeamProVodIE(BeamProBaseIE): - IE_NAME = 'Mixer:vod' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P[^?#&]+)' - _TESTS = [{ - 'url': 'https://mixer.com/willow8714?vod=2259830', - 'md5': 'b2431e6e8347dc92ebafb565d368b76b', - 'info_dict': { - 'id': '2259830', - 'ext': 'mp4', - 'title': 'willow8714\'s Channel', - 'duration': 6828.15, - 'thumbnail': r're:https://.*source\.png$', - 'timestamp': 1494046474, - 'upload_date': '20170506', - 'uploader': 'willow8714', - 'uploader_id': '6085379', - 'age_limit': 13, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://mixer.com/streamer?vod=IxFno1rqC0S_XJ1a2yGgNw', - 'only_matching': True, - }, { - 'url': 'https://mixer.com/streamer?vod=Rh3LY0VAqkGpEQUe2pN-ig', - 'only_matching': True, - }] - - @staticmethod - def _extract_format(vod, vod_type): - if not vod.get('baseUrl'): - return [] - - if vod_type == 'hls': - filename, protocol = 'manifest.m3u8', 'm3u8_native' - elif vod_type == 'raw': - filename, protocol = 'source.mp4', 'https' - else: - assert False - - data = vod.get('data') if isinstance(vod.get('data'), dict) else {} - - format_id = [vod_type] - if isinstance(data.get('Height'), compat_str): - format_id.append('%sp' % data['Height']) - - return [{ - 'url': urljoin(vod['baseUrl'], filename), - 'format_id': '-'.join(format_id), - 'ext': 'mp4', - 'protocol': protocol, - 'width': int_or_none(data.get('Width')), - 'height': int_or_none(data.get('Height')), - 'fps': int_or_none(data.get('Fps')), - 'tbr': int_or_none(data.get('Bitrate'), 1000), - }] - - def _real_extract(self, url): - vod_id = self._match_id(url) - - vod_info = self._download_json( - '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id) - - state = vod_info.get('state') - if state != 'AVAILABLE': - raise ExtractorError( - 'VOD %s is not available (state: %s)' % (vod_id, state), - expected=True) - - formats = [] - thumbnail_url = None - - for vod in vod_info['vods']: - vod_type = vod.get('format') - if vod_type in ('hls', 'raw'): - formats.extend(self._extract_format(vod, vod_type)) - elif vod_type == 'thumbnail': - thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png') - - self._sort_formats(formats) - - info = { - 'id': vod_id, - 'title': vod_info.get('name') or vod_id, - 'duration': float_or_none(vod_info.get('duration')), - 'thumbnail': thumbnail_url, - 'timestamp': parse_iso8601(vod_info.get('createdAt')), - 'view_count': int_or_none(vod_info.get('viewsTotal')), - 'formats': formats, - } - info.update(self._extract_channel_info(vod_info.get('channel') or {})) - - return info diff --git a/youtube_dl/extractor/bfmtv.py b/youtube_dl/extractor/bfmtv.py new file mode 100644 index 000000000..501f69d80 --- /dev/null +++ b/youtube_dl/extractor/bfmtv.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import extract_attributes + + +class BFMTVBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?bfmtv\.com/' + _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P\d{12})\.html' + _VIDEO_BLOCK_REGEX = r'(]+class="video_block"[^>]*>)' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + + def _brightcove_url_result(self, video_id, video_block): + account_id = video_block.get('accountid') or '876450612001' + player_id = video_block.get('playerid') or 'I2qBTln4u' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) + + +class BFMTVIE(BFMTVBaseIE): + IE_NAME = 'bfmtv' + _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'V' + _TESTS = [{ + 'url': 'https://www.bfmtv.com/politique/emmanuel-macron-l-islam-est-une-religion-qui-vit-une-crise-aujourd-hui-partout-dans-le-monde_VN-202010020146.html', + 'info_dict': { + 'id': '6196747868001', + 'ext': 'mp4', + 'title': 'Emmanuel Macron: "L\'Islam est une religion qui vit une crise aujourd’hui, partout dans le monde"', + 'description': 'Le Président s\'exprime sur la question du séparatisme depuis les Mureaux, dans les Yvelines.', + 'uploader_id': '876450610001', + 'upload_date': '20201002', + 'timestamp': 1601629620, + }, + }] + + def _real_extract(self, url): + bfmtv_id = self._match_id(url) + webpage = self._download_webpage(url, bfmtv_id) + video_block = extract_attributes(self._search_regex( + self._VIDEO_BLOCK_REGEX, webpage, 'video block')) + return self._brightcove_url_result(video_block['videoid'], video_block) + + +class BFMTVLiveIE(BFMTVIE): + IE_NAME = 'bfmtv:live' + _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P(?:[^/]+/)?en-direct)' + _TESTS = [{ + 'url': 'https://www.bfmtv.com/en-direct/', + 'info_dict': { + 'id': '5615950982001', + 'ext': 'mp4', + 'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'uploader_id': '876450610001', + 'upload_date': '20171018', + 'timestamp': 1508329950, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.bfmtv.com/economie/en-direct/', + 'only_matching': True, + }] + + +class BFMTVArticleIE(BFMTVBaseIE): + IE_NAME = 'bfmtv:article' + _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'A' + _TESTS = [{ + 'url': 'https://www.bfmtv.com/sante/covid-19-un-responsable-de-l-institut-pasteur-se-demande-quand-la-france-va-se-reconfiner_AV-202101060198.html', + 'info_dict': { + 'id': '202101060198', + 'title': 'Covid-19: un responsable de l\'Institut Pasteur se demande "quand la France va se reconfiner"', + 'description': 'md5:947974089c303d3ac6196670ae262843', + }, + 'playlist_count': 2, + }, { + 'url': 'https://www.bfmtv.com/international/pour-bolsonaro-le-bresil-est-en-faillite-mais-il-ne-peut-rien-faire_AD-202101060232.html', + 'only_matching': True, + }, { + 'url': 'https://www.bfmtv.com/sante/covid-19-oui-le-vaccin-de-pfizer-distribue-en-france-a-bien-ete-teste-sur-des-personnes-agees_AN-202101060275.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + bfmtv_id = self._match_id(url) + webpage = self._download_webpage(url, bfmtv_id) + + entries = [] + for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage): + video_block = extract_attributes(video_block_el) + video_id = video_block.get('videoid') + if not video_id: + continue + entries.append(self._brightcove_url_result(video_id, video_block)) + + return self.playlist_result( + entries, bfmtv_id, self._og_search_title(webpage, fatal=False), + self._html_search_meta(['og:description', 'description'], webpage)) diff --git a/youtube_dl/extractor/bibeltv.py b/youtube_dl/extractor/bibeltv.py new file mode 100644 index 000000000..56c2bfee8 --- /dev/null +++ b/youtube_dl/extractor/bibeltv.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BibelTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P\d+)' + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch', + 'md5': '252f908192d611de038b8504b08bf97f', + 'info_dict': { + 'id': 'ref:329703', + 'ext': 'mp4', + 'title': 'Sprachkurs in Malaiisch', + 'description': 'md5:3e9f197d29ee164714e67351cf737dfe', + 'timestamp': 1608316701, + 'uploader_id': '5840105145001', + 'upload_date': '20201218', + } + }, { + 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + crn_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew') diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index dc60224d0..d1bf8e829 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -90,13 +90,19 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P[0-9a-f-]{36}|\d{5})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', - 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', + 'md5': '670b2d73f48549da032861130488c681', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + 'upload_date': '20150723', + 'timestamp': 1437679032, + }, + 'expected_warnings': [ + 'Unable to download f4m manifest' + ] }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/bongacams.py b/youtube_dl/extractor/bongacams.py new file mode 100644 index 000000000..180542fbc --- /dev/null +++ b/youtube_dl/extractor/bongacams.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + try_get, + urlencode_postdata, +) + + +class BongaCamsIE(InfoExtractor): + _VALID_URL = r'https?://(?P(?:[^/]+\.)?bongacams\d*\.com)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://de.bongacams.com/azumi-8', + 'only_matching': True, + }, { + 'url': 'https://cn.bongacams.com/azumi-8', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + channel_id = mobj.group('id') + + amf = self._download_json( + 'https://%s/tools/amf.php' % host, channel_id, + data=urlencode_postdata(( + ('method', 'getRoomData'), + ('args[]', channel_id), + ('args[]', 'false'), + )), headers={'X-Requested-With': 'XMLHttpRequest'}) + + server_url = amf['localData']['videoServerUrl'] + + uploader_id = try_get( + amf, lambda x: x['performerData']['username'], compat_str) or channel_id + uploader = try_get( + amf, lambda x: x['performerData']['displayName'], compat_str) + like_count = int_or_none(try_get( + amf, lambda x: x['performerData']['loversCount'])) + + formats = self._extract_m3u8_formats( + '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id), + channel_id, 'mp4', m3u8_id='hls', live=True) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': self._live_title(uploader or uploader_id), + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': like_count, + 'age_limit': 18, + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index b9715df00..bae2aedce 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -12,7 +12,7 @@ from ..utils import ( class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', @@ -28,10 +28,13 @@ class BravoTVIE(AdobePassIE): }, { 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) settings = self._parse_json(self._search_regex( r']+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})', webpage, 'drupal settings'), @@ -53,11 +56,14 @@ class BravoTVIE(AdobePassIE): tp_path = release_pid = tve['release_pid'] if tve.get('entitlement') == 'auth': adobe_pass = settings.get('tve_adobe_auth', {}) + if site == 'bravotv': + site = 'bravo' resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'bravo'), + adobe_pass.get('adobePassResourceId') or site, tve['title'], release_pid, tve.get('rating')) query['auth'] = self._extract_mvpd_auth( - url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + url, release_pid, + adobe_pass.get('adobePassRequestorId') or site, resource) else: shared_playlist = settings['ls_playlist'] account_pid = shared_playlist['account_pid'] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 000eac71c..6022076ac 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -28,6 +28,7 @@ from ..utils import ( parse_iso8601, smuggle_url, str_or_none, + try_get, unescapeHTML, unsmuggle_url, UnsupportedError, @@ -470,13 +471,18 @@ class BrightcoveNewIE(AdobePassIE): def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() + num_drm_sources = 0 formats = [] - for source in json_data.get('sources', []): + sources = json_data.get('sources') or [] + for source in sources: container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if ext == 'ism' or container == 'WVM' or source.get('key_systems'): + if container == 'WVM' or source.get('key_systems'): + num_drm_sources += 1 + continue + elif ext == 'ism': continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -533,20 +539,15 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) - if not formats: - # for sonyliv.com DRM protected videos - s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') - if s3_source_url: - formats.append({ - 'url': s3_source_url, - 'format_id': 'source', - }) - errors = json_data.get('errors') - if not formats and errors: - error = errors[0] - raise ExtractorError( - error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if not formats: + errors = json_data.get('errors') + if errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if sources and num_drm_sources == len(sources): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) @@ -600,24 +601,27 @@ class BrightcoveNewIE(AdobePassIE): store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) def extract_policy_key(): - webpage = self._download_webpage( - 'http://players.brightcove.net/%s/%s_%s/index.min.js' - % (account_id, player_id, embed), video_id) - - policy_key = None - - catalog = self._search_regex( - r'catalog\(({.+?})\);', webpage, 'catalog', default=None) - if catalog: - catalog = self._parse_json( - js_to_json(catalog), video_id, fatal=False) - if catalog: - policy_key = catalog.get('policyKey') - + base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed) + config = self._download_json( + base_url + 'config.json', video_id, fatal=False) or {} + policy_key = try_get( + config, lambda x: x['video_cloud']['policy_key']) if not policy_key: - policy_key = self._search_regex( - r'policyKey\s*:\s*(["\'])(?P.+?)\1', - webpage, 'policy key', group='pk') + webpage = self._download_webpage( + base_url + 'index.min.js', video_id) + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') store_pk(policy_key) return policy_key diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8667a0d04..eefbab241 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -8,18 +8,20 @@ from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( ExtractorError, - strip_or_none, + clean_html, + extract_attributes, float_or_none, + get_element_by_class, int_or_none, merge_dicts, - parse_iso8601, str_or_none, + strip_or_none, url_or_none, ) class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '68993eda72ef62386a15ea2cf3c93107', @@ -37,6 +39,7 @@ class CanvasIE(InfoExtractor): 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'only_matching': True, }] + _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', 'HLS_AES': 'm3u8', @@ -47,29 +50,34 @@ class CanvasIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site_id'), mobj.group('id') - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) + data = None + if site_id != 'vrtvideo': + # Old API endpoint, serves more formats but may fail for some videos + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id, 'Downloading asset JSON', + 'Unable to download asset JSON', fatal=False) # New API endpoint if not data: + headers = self.geo_verification_headers() + headers.update({'Content-Type': 'application/json'}) token = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', - headers={'Content-Type': 'application/json'})['vrtPlayerToken'] + 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', fatal=False, query={ + video_id, 'Downloading video JSON', query={ 'vrtPlayerToken': token, 'client': '%s@PROD' % site_id, }, expected_status=400) - message = data.get('message') - if message and not data.get('title'): - if data.get('code') == 'AUTHENTICATION_REQUIRED': - self.raise_login_required(message) - raise ExtractorError(message, expected=True) + if not data.get('title'): + code = data.get('code') + if code == 'AUTHENTICATION_REQUIRED': + self.raise_login_required() + elif code == 'INVALID_LOCATION': + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(data.get('message') or code, expected=True) title = data['title'] description = data.get('description') @@ -205,20 +213,24 @@ class CanvasEenIE(InfoExtractor): class VrtNUIE(GigyaBaseIE): IE_DESC = 'VrtNU.be' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?Pvrtnu)/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P[^/?#&]+)' _TESTS = [{ # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', 'info_dict': { - 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', 'ext': 'mp4', - 'title': 'De zwarte weduwe', - 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4', + 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', + 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', 'duration': 1457.04, 'thumbnail': r're:^https?://.*\.jpg$', - 'season': 'Season 1', - 'season_number': 1, + 'series': 'Postbus X', + 'season': 'Seizoen 1989', + 'season_number': 1989, + 'episode': 'De zwarte weduwe', 'episode_number': 1, + 'timestamp': 1595822400, + 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', 'params': { @@ -300,69 +312,73 @@ class VrtNUIE(GigyaBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle(url, display_id) + webpage = self._download_webpage(url, display_id) + + attrs = extract_attributes(self._search_regex( + r'(]+>)', webpage, 'media element')) + video_id = attrs['videoid'] + publication_id = attrs.get('publicationid') + if publication_id: + video_id = publication_id + '$' + video_id + + page = (self._parse_json(self._search_regex( + r'digitalData\s*=\s*({.+?});', webpage, 'digial data', + default='{}'), video_id, fatal=False) or {}).get('page') or {} info = self._search_json_ld(webpage, display_id, default={}) - - # title is optional here since it may be extracted by extractor - # that is delegated from here - title = strip_or_none(self._html_search_regex( - r'(?ms)

(.+?)

', - webpage, 'title', default=None)) - - description = self._html_search_regex( - r'(?ms)
(.+?)
', - webpage, 'description', default=None) - - season = self._html_search_regex( - [r'''(?xms)\s* - seizoen\ (.+?)\s* - ''', - r'