Compare commits

..

89 Commits

Author SHA1 Message Date
Sergey M․
10734553fe release 2019.03.09 2019-03-09 02:53:18 +07:00
Sergey M․
bba35695eb [ChangeLog] Actualize
[ci skip]
2019-03-09 02:52:08 +07:00
Sergey M․
9d74ea6d36 [francetv:site] Relax video id regex and update test (closes #20268) 2019-03-08 23:28:24 +07:00
Remita Amine
7b6e760870 [toutv] detect invalid login error 2019-03-06 09:28:14 +01:00
Remita Amine
829685b88a [toutv] fix authentication(closes #20261) 2019-03-06 09:20:39 +01:00
Sergey M․
fca9baf0da [test] Fix test_compat_etree_Element 2019-03-06 02:46:26 +07:00
0x9fff00
d347b52b63 [urplay] Extract timestamp (#20235) 2019-03-06 02:11:32 +07:00
yonaikerlol
97157c692c [openload] Add support for oload.space 2019-03-06 01:34:34 +07:00
Sergey M․
a551768acf [facebook] Improve uploader extraction (closes #20250) 2019-03-06 01:27:22 +07:00
Sergey M․
ee0ba927aa Use compat_etree_Element 2019-03-06 01:21:57 +07:00
Sergey M․
399f76870d [compat] Introduce compat_etree_Element 2019-03-06 01:18:52 +07:00
Sergey M․
e5ada4f3ad [extractor/common] Fallback url to base URL for DASH formats 2019-03-06 00:33:08 +07:00
Sergey M․
bb6f112d9d [npo] Improve ISM extraction 2019-03-05 23:58:46 +07:00
Sergey M․
c17eb5b4b0 [rai] Improve extraction (closes #20253) 2019-03-05 23:56:42 +07:00
Sergey M․
d9eb580a79 [extractor/common] Do not fail on invalid data while parsing F4M manifest in non fatal mode 2019-03-05 23:56:33 +07:00
Remita Amine
5dcd630dca [paramountnetwork] fix mgid extraction(closes #20241) 2019-03-04 22:26:55 +01:00
Sergey M․
c790e93ab5 [extractor/common] Clarify url and manifest_url meta fields 2019-03-05 00:41:53 +07:00
Sergey M․
39c780fdec [extractor/common] Return MPD manifest as format's url meta field (#20242)
For symmetry with other segmented media
2019-03-05 00:40:57 +07:00
remitamine
e7e62441cd [utils] strip #HttpOnly_ prefix from cookies files (#20219) 2019-03-03 19:23:59 +07:00
Remita Amine
0a5baf9c21 [libsyn] improve extraction(closes #20229) 2019-03-03 06:18:51 +01:00
dimqua
8ae113ca9d [youtube] Add more invidious instances
See [Invidious-Instances](https://github.com/omarroth/invidious/wiki/Invidious-Instances) for the reference.
2019-03-03 08:19:36 +07:00
cclauss
7aeb788e56 [travis] Remove sudo: false
Travis now recommends removing `sudo: false` from configuration: https://blog.travis-ci.com/2018-11-19-required-linux-infrastructure-migration.
2019-03-03 08:16:48 +07:00
Sergey M․
7465e0aee2 [spankbang] Fix extraction (closes #20023) 2019-03-03 06:25:45 +07:00
Remita Amine
a8f83f0c56 [crunchyroll] fix is_logged check 2019-03-02 08:25:47 +01:00
Remita Amine
dca0e0040a Revert "use older login method(closes #11572)"
This reverts commit cc6a960e13.
2019-03-02 08:01:42 +01:00
Remita Amine
398e1e21d6 [espn] extend _VALID_URL regex(closes #20013) 2019-03-01 15:34:05 +01:00
Remita Amine
c5b02efe20 [sixplay] handle videos with empty assets(closes #20016) 2019-03-01 15:08:11 +01:00
Remita Amine
06242d44fe [vimeo] add support for Vimeo Pro portfolio protected videos(closes #20070) 2019-03-01 08:14:34 +01:00
Sergey M․
04c33bdfb3 release 2019.03.01 2019-03-01 01:03:51 +07:00
Sergey M․
333f617b12 [ChangeLog] Actualize
[ci skip]
2019-03-01 01:02:36 +07:00
Sergey M․
ff60ec8f02 [npo] Fix extraction (#20084) 2019-03-01 00:47:18 +07:00
Sergey M․
9d9a8676dc [francetv:site] Extend video id regex (closes #20029, closes #20071) 2019-02-28 23:26:52 +07:00
Sergey M․
db1c3a9d3f [periscope] Extract width and height (closes #20015) 2019-02-27 03:41:15 +07:00
Sergey M․
55b8588f0e [servus] Fix extraction (closes #19297) 2019-02-24 23:20:06 +07:00
Sergey M․
f0228f56fb [bbccouk] Make subtitles non fatal (#19651) 2019-02-24 21:01:25 +07:00
Sergey M․
8c80603f1a [downloader/external] Add support for rate limit and retries for wget 2019-02-23 01:00:03 +07:00
Sergey M․
37b239b3b6 [downloader/external] Fix infinite retries for curl (closes #19303) 2019-02-23 00:43:29 +07:00
Sergey M․
caf48f557a [metacafe] Fix family filter bypass (closes #19287) 2019-02-21 05:59:07 +07:00
Sergey M․
77a842c892 release 2019.02.18 2019-02-18 02:11:11 +07:00
Sergey M․
c76fc5b22a [ChangeLog] Actualize
[ci skip]
2019-02-18 02:10:06 +07:00
Sergey M․
388cfbd3d8 [tvp:website] Improve support 2019-02-17 14:27:00 +07:00
Sergey M․
d93083789b [tvp:series] Fix extraction 2019-02-17 14:09:30 +07:00
Sergey M․
34568dc296 [tvp] Detect unavailable videos 2019-02-17 13:39:00 +07:00
Sergey M․
3c9647372e [tvp] Fix description extraction, make thumbnail optional and fix tests 2019-02-17 13:38:21 +07:00
Sergey M․
659e93fcf5 [linuxacademy] Add extractor (closes #12207) 2019-02-17 07:12:10 +07:00
Sergey M․
c9a0ea6e51 [bilibili] Update keys (closes #19233) 2019-02-17 05:00:16 +07:00
Sergey M․
d7d513891b [udemy] Extend _VALID_URLs (closes #14330, closes #15883) 2019-02-17 01:05:01 +07:00
Sergey M․
ae65c93a26 [udemy] Update User-Agent and detect captcha (closes #14713, closes #15839, closes #18126) 2019-02-17 00:58:13 +07:00
Sergey M․
ba2e3730d1 [noovo] Fix extraction (closes #19230) 2019-02-16 22:45:53 +07:00
Sergey M․
2b2da3ba10 [rai] Relax _VALID_URL (closes #19232) 2019-02-15 23:56:29 +07:00
Sergey M․
794c1b6e02 [vshare] Pass Referer to download request (closes #19205, closes #19221) 2019-02-14 23:43:16 +07:00
yonaikerlol
7bee705d8f [openload] Add support for oload.live 2019-02-14 22:28:16 +07:00
bitraid
6f5c1807f4 [imgur] Use video id as title fallback (closes #18590) 2019-02-13 00:02:29 +07:00
Sergey M․
985637cbbf [twitch] Add new source format detection approach (closes #19193) 2019-02-12 00:13:50 +07:00
Sergey M․
7d8b89163c [tvplayhome] Fix video id extraction (closes #19190) 2019-02-11 04:41:28 +07:00
Sergey M․
d777f3e81c [tvplayhome] Fix episode metadata extraction (closes #19190) 2019-02-11 04:39:23 +07:00
Sergey M․
4c0e0dc9dc [rutube:embed] Fix extraction and add support private videos (closes #19163) 2019-02-11 00:49:51 +07:00
Sergey M․
f516f44094 [soundcloud] Extract more metadata 2019-02-10 23:44:08 +07:00
Sergey M․
e9dee7f1b2 [trunews] Add extractor (closes #19153) 2019-02-09 23:50:27 +07:00
Remita Amine
91effe22a0 [linkedin:learning] extract chapter_number and chapter_id(closes #19162) 2019-02-08 07:21:50 +01:00
Sergey M․
04eacf5453 release 2019.02.08 2019-02-08 01:12:51 +07:00
Sergey M․
f1f5b47255 [ChangeLog] Actualize
[ci skip]
2019-02-08 01:10:12 +07:00
Sergey M․
1211bb6dac [YoutubeDL] Improve _make_archive_id (closes #19149) 2019-02-08 01:08:48 +07:00
Sergey M․
4de3cb883c [malltv] Fix issues and simplify (closes #17856) 2019-02-08 00:43:31 +07:00
Ales Jirasek
22f5f5c6fc [malltv] Add extractor (closes #18058) 2019-02-08 00:43:26 +07:00
Sergey M․
49bd993fd9 [spankbang:playlist] Add extractor (closes #19145) 2019-02-08 00:09:50 +07:00
Sergey M․
f06a1cabe8 [spankbang] Extend _VALID_URL 2019-02-08 00:07:29 +07:00
Remita Amine
241c5d7d38 [trutv] fix extraction(closes #17336) 2019-02-06 19:38:10 +01:00
Remita Amine
8fecc7353d [toutv] fix authentication(closes #16398)(closes #18700) 2019-02-06 13:59:33 +01:00
Sergey M․
5dda1edef9 [pornhub] Improve and simplify (closes #19135) 2019-02-05 23:09:24 +07:00
JChris246
d2d970d07e [pornhub] Fix tags and categories extraction (closes #13720) 2019-02-05 23:08:49 +07:00
Sergey M․
48fb963b2f [pornhd] Fix formats extraction 2019-02-05 00:07:37 +07:00
JChris246
70c3ee1367 [pornhd] Extract like count 2019-02-05 00:06:04 +07:00
Remita Amine
07fbfef1c7 [radiocanada] switch to the new media requests(closes #19115) 2019-02-03 12:10:41 +01:00
Remita Amine
eecf788b90 [teachable] add support for courses.workitdaily.com (closes #18871) 2019-02-03 09:10:35 +01:00
Sergey M․
0efcb5a2fe [vporn] Remove extractor (closes #16276)
Handled by generic extractor
2019-02-03 00:33:45 +07:00
Sergey M․
7c5307f4c4 [soundcloud:pagedplaylist] Improve (closes #19086) 2019-02-02 23:40:30 +07:00
Cory Hall
6cc6e0c34d [soundcloud:pagedplaylist] Add ie and title to entries (#19022)
rel: https://github.com/rg3/youtube-dl/issues/19022
2019-02-02 23:40:22 +07:00
JChris246
b9bc1cff72 [drtuber] Extract duration 2019-02-02 06:04:00 +07:00
Sergey M․
e9fef7ee4e [YoutubeDL] Fallback to ie_key of matching extractor while making download archive id when no explicit ie_key is provided (#19022) 2019-02-02 05:44:31 +07:00
Sergey M․
b6423e6ca2 [soundcloud:user] Update tests 2019-02-02 04:11:32 +07:00
Sergey M․
3ef2da2d21 [soundcloud] Fix paged playlists extraction, add support for albums and update client id 2019-02-02 04:00:29 +07:00
Sergey M․
49fe4175ae [drtv] Improve preference (closes #19079) 2019-02-01 01:49:33 +07:00
Sergey M․
9613e14a92 [openload] Add support for openload.pw and oload.pw (closes #18930) 2019-01-31 00:15:45 +07:00
Batuhan's Unmaintained Account
15e832ff2a [openload] Add support for oload.info 2019-01-30 23:39:02 +07:00
Remita Amine
645c4885cf [crackle] authorize media detail request(closes #16931) 2019-01-30 14:44:23 +01:00
Sergey M․
7b0f9df23d release 2019.01.30.1 2019-01-30 06:19:36 +07:00
Sergey M․
c2a0fe2ea7 [ChangeLog] Actualize
[ci skip]
2019-01-30 06:17:25 +07:00
Sergey M․
ce52c7c111 [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (closes #19067) 2019-01-30 06:15:23 +07:00
58 changed files with 1395 additions and 609 deletions

View File

@@ -6,8 +6,8 @@
---
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.30*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.30**
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.03.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.03.09**
### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2019.01.30
[debug] youtube-dl version 2019.03.09
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}

View File

@@ -9,7 +9,6 @@ python:
- "3.6"
- "pypy"
- "pypy3"
sudo: false
env:
- YTDL_TEST_SET=core
- YTDL_TEST_SET=download

106
ChangeLog
View File

@@ -1,3 +1,109 @@
version 2019.03.09
Core
* [extractor/common] Use compat_etree_Element
+ [compat] Introduce compat_etree_Element
* [extractor/common] Fallback url to base URL for DASH formats
* [extractor/common] Do not fail on invalid data while parsing F4M manifest
in non fatal mode
* [extractor/common] Return MPD manifest as format's url meta field (#20242)
* [utils] Strip #HttpOnly_ prefix from cookies files (#20219)
Extractors
* [francetv:site] Relax video id regular expression (#20268)
* [toutv] Detect invalid login error
* [toutv] Fix authentication (#20261)
+ [urplay] Extract timestamp (#20235)
+ [openload] Add support for oload.space (#20246)
* [facebook] Improve uploader extraction (#20250)
* [bbc] Use compat_etree_Element
* [crunchyroll] Use compat_etree_Element
* [npo] Improve ISM extraction
* [rai] Improve extraction (#20253)
* [paramountnetwork] Fix mgid extraction (#20241)
* [libsyn] Improve extraction (#20229)
+ [youtube] Add more invidious instances to URL regular expression (#20228)
* [spankbang] Fix extraction (#20023)
* [espn] Extend URL regular expression (#20013)
* [sixplay] Handle videos with empty assets (#20016)
+ [vimeo] Add support for Vimeo Pro portfolio protected videos (#20070)
version 2019.03.01
Core
+ [downloader/external] Add support for rate limit and retries for wget
* [downloader/external] Fix infinite retries for curl (#19303)
Extractors
* [npo] Fix extraction (#20084)
* [francetv:site] Extend video id regex (#20029, #20071)
+ [periscope] Extract width and height (#20015)
* [servus] Fix extraction (#19297)
* [bbccouk] Make subtitles non fatal (#19651)
* [metacafe] Fix family filter bypass (#19287)
version 2019.02.18
Extractors
* [tvp:website] Fix and improve extraction
+ [tvp] Detect unavailable videos
* [tvp] Fix description extraction and make thumbnail optional
+ [linuxacademy] Add support for linuxacademy.com (#12207)
* [bilibili] Update keys (#19233)
* [udemy] Extend URL regular expressions (#14330, #15883)
* [udemy] Update User-Agent and detect captcha (#14713, #15839, #18126)
* [noovo] Fix extraction (#19230)
* [rai] Relax URL regular expression (#19232)
+ [vshare] Pass Referer to download request (#19205, #19221)
+ [openload] Add support for oload.live (#19222)
* [imgur] Use video id as title fallback (#18590)
+ [twitch] Add new source format detection approach (#19193)
* [tvplayhome] Fix video id extraction (#19190)
* [tvplayhome] Fix episode metadata extraction (#19190)
* [rutube:embed] Fix extraction (#19163)
+ [rutube:embed] Add support for private videos (#19163)
+ [soundcloud] Extract more metadata
+ [trunews] Add support for trunews.com (#19153)
+ [linkedin:learning] Extract chapter_number and chapter_id (#19162)
version 2019.02.08
Core
* [utils] Improve JSON-LD regular expression (#18058)
* [YoutubeDL] Fallback to ie_key of matching extractor while making
download archive id when no explicit ie_key is provided (#19022)
Extractors
+ [malltv] Add support for mall.tv (#18058, #17856)
+ [spankbang:playlist] Add support for playlists (#19145)
* [spankbang] Extend URL regular expression
* [trutv] Fix extraction (#17336)
* [toutv] Fix authentication (#16398, #18700)
* [pornhub] Fix tags and categories extraction (#13720, #19135)
* [pornhd] Fix formats extraction
+ [pornhd] Extract like count (#19123, #19125)
* [radiocanada] Switch to the new media requests (#19115)
+ [teachable] Add support for courses.workitdaily.com (#18871)
- [vporn] Remove extractor (#16276)
+ [soundcloud:pagedplaylist] Add ie and title to entries (#19022, #19086)
+ [drtuber] Extract duration (#19078)
* [soundcloud] Fix paged playlists extraction, add support for albums and update client id
* [soundcloud] Update client id
* [drtv] Improve preference (#19079)
+ [openload] Add support for openload.pw and oload.pw (#18930)
+ [openload] Add support for oload.info (#19073)
* [crackle] Authorize media detail request (#16931)
version 2019.01.30.1
Core
* [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (#19067)
version 2019.01.30
Core

View File

@@ -458,6 +458,7 @@
- **LineTV**
- **linkedin:learning**
- **linkedin:learning:course**
- **LinuxAcademy**
- **LiTV**
- **LiveLeak**
- **LiveLeakEmbed**
@@ -476,6 +477,7 @@
- **mailru:music**: Музыка@Mail.Ru
- **mailru:music:search**: Музыка@Mail.Ru
- **MakerTV**
- **MallTV**
- **mangomolo:live**
- **mangomolo:video**
- **ManyVids**
@@ -827,6 +829,7 @@
- **southpark.nl**
- **southparkstudios.dk**
- **SpankBang**
- **SpankBangPlaylist**
- **Spankwire**
- **Spiegel**
- **Spiegel:Article**: Articles on spiegel.de
@@ -913,6 +916,7 @@
- **ToypicsUser**: Toypics user profile
- **TrailerAddict** (Currently broken)
- **Trilulilu**
- **TruNews**
- **TruTV**
- **Tube8**
- **TubiTv**
@@ -1057,7 +1061,6 @@
- **Voot**
- **VoxMedia**
- **VoxMediaVolume**
- **Vporn**
- **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **Vrak**
- **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be

View File

@@ -61,6 +61,7 @@ class TestInfoExtractor(unittest.TestCase):
<meta content='Foo' property=og:foobar>
<meta name="og:test1" content='foo > < bar'/>
<meta name="og:test2" content="foo >//< bar"/>
<meta property=og-test3 content='Ill-formatted opengraph'/>
'''
self.assertEqual(ie._og_search_title(html), 'Foo')
self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
@@ -69,6 +70,7 @@ class TestInfoExtractor(unittest.TestCase):
self.assertEqual(ie._og_search_property('foobar', html), 'Foo')
self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar')
self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar')
self.assertEqual(ie._og_search_property('test3', html), 'Ill-formatted opengraph')
self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar')
self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True)
self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True)

View File

@@ -29,6 +29,16 @@ class TestYoutubeDLCookieJar(unittest.TestCase):
tf.close()
os.remove(tf.name)
def test_strip_httponly_prefix(self):
cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt')
cookiejar.load(ignore_discard=True, ignore_expires=True)
def assert_cookie_has_value(key):
self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE')
assert_cookie_has_value('HTTPONLY_COOKIE')
assert_cookie_has_value('JS_ACCESSIBLE_COOKIE')
if __name__ == '__main__':
unittest.main()

View File

@@ -13,6 +13,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.compat import (
compat_getenv,
compat_setenv,
compat_etree_Element,
compat_etree_fromstring,
compat_expanduser,
compat_shlex_split,
@@ -90,6 +91,12 @@ class TestCompat(unittest.TestCase):
self.assertEqual(compat_shlex_split('-option "one\ntwo" \n -flag'), ['-option', 'one\ntwo', '-flag'])
self.assertEqual(compat_shlex_split('-val 中文'), ['-val', '中文'])
def test_compat_etree_Element(self):
try:
compat_etree_Element.items
except AttributeError:
self.fail('compat_etree_Element is not a type')
def test_compat_etree_fromstring(self):
xml = '''
<root foo="bar" spam="中文">

View File

@@ -0,0 +1,6 @@
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
#HttpOnly_www.foobar.foobar FALSE / TRUE 2147483647 HTTPONLY_COOKIE HTTPONLY_COOKIE_VALUE
www.foobar.foobar FALSE / TRUE 2147483647 JS_ACCESSIBLE_COOKIE JS_ACCESSIBLE_COOKIE_VALUE

View File

@@ -82,6 +82,7 @@ from .utils import (
sanitize_url,
sanitized_Request,
std_headers,
str_or_none,
subtitles_filename,
UnavailableVideoError,
url_basename,
@@ -2060,15 +2061,24 @@ class YoutubeDL(object):
self.report_warning('Unable to remove downloaded original file')
def _make_archive_id(self, info_dict):
video_id = info_dict.get('id')
if not video_id:
return
# Future-proof against any change in case
# and backwards compatibility with prior versions
extractor = info_dict.get('extractor_key')
extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
if extractor is None:
if 'id' in info_dict:
extractor = info_dict.get('ie_key') # key in a playlist
if extractor is None:
return None # Incomplete video information
return extractor.lower() + ' ' + info_dict['id']
url = str_or_none(info_dict.get('url'))
if not url:
return
# Try to find matching extractor for the URL and take its ie_key
for ie in self._ies:
if ie.suitable(url):
extractor = ie.ie_key()
break
else:
return
return extractor.lower() + ' ' + video_id
def in_download_archive(self, info_dict):
fn = self.params.get('download_archive')
@@ -2076,7 +2086,7 @@ class YoutubeDL(object):
return False
vid_id = self._make_archive_id(info_dict)
if vid_id is None:
if not vid_id:
return False # Incomplete video information
try:

View File

@@ -2508,6 +2508,15 @@ class _TreeBuilder(etree.TreeBuilder):
pass
try:
# xml.etree.ElementTree.Element is a method in Python <=2.6 and
# the following will crash with:
# TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types
isinstance(None, xml.etree.ElementTree.Element)
from xml.etree.ElementTree import Element as compat_etree_Element
except TypeError: # Python <=2.6
from xml.etree.ElementTree import _ElementInterface as compat_etree_Element
if sys.version_info[0] >= 3:
def compat_etree_fromstring(text):
return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder()))
@@ -2969,6 +2978,7 @@ __all__ = [
'compat_cookiejar',
'compat_cookies',
'compat_ctypes_WINFUNCTYPE',
'compat_etree_Element',
'compat_etree_fromstring',
'compat_etree_register_namespace',
'compat_expanduser',

View File

@@ -121,7 +121,11 @@ class CurlFD(ExternalFD):
cmd += self._valueless_option('--silent', 'noprogress')
cmd += self._valueless_option('--verbose', 'verbose')
cmd += self._option('--limit-rate', 'ratelimit')
cmd += self._option('--retry', 'retries')
retry = self._option('--retry', 'retries')
if len(retry) == 2:
if retry[1] in ('inf', 'infinite'):
retry[1] = '2147483647'
cmd += retry
cmd += self._option('--max-filesize', 'max_filesize')
cmd += self._option('--interface', 'source_address')
cmd += self._option('--proxy', 'proxy')
@@ -160,6 +164,12 @@ class WgetFD(ExternalFD):
cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._option('--limit-rate', 'ratelimit')
retry = self._option('--tries', 'retries')
if len(retry) == 2:
if retry[1] in ('inf', 'infinite'):
retry[1] = '0'
cmd += retry
cmd += self._option('--bind-address', 'source_address')
cmd += self._option('--proxy', 'proxy')
cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate')

View File

@@ -1,8 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import itertools
import re
from .common import InfoExtractor
from ..utils import (
@@ -17,10 +17,12 @@ from ..utils import (
parse_iso8601,
try_get,
unescapeHTML,
url_or_none,
urlencode_postdata,
urljoin,
)
from ..compat import (
compat_etree_Element,
compat_HTTPError,
compat_urlparse,
)
@@ -310,7 +312,13 @@ class BBCCoUkIE(InfoExtractor):
def _get_subtitles(self, media, programme_id):
subtitles = {}
for connection in self._extract_connections(media):
captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
cc_url = url_or_none(connection.get('href'))
if not cc_url:
continue
captions = self._download_xml(
cc_url, programme_id, 'Downloading captions', fatal=False)
if not isinstance(captions, compat_etree_Element):
continue
lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
subtitles[lang] = [
{

View File

@@ -93,8 +93,8 @@ class BiliBiliIE(InfoExtractor):
}]
}]
_APP_KEY = '84956560bc028eb7'
_BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e'
_APP_KEY = 'iVGUTjsxvpLeuDCf'
_BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
def _report_error(self, result):
if 'message' in result:

View File

@@ -17,6 +17,7 @@ import math
from ..compat import (
compat_cookiejar,
compat_cookies,
compat_etree_Element,
compat_etree_fromstring,
compat_getpass,
compat_integer_types,
@@ -102,10 +103,23 @@ class InfoExtractor(object):
from worst to best quality.
Potential fields:
* url Mandatory. The URL of the video file
* url The mandatory URL representing the media:
for plain file media - HTTP URL of this file,
for RTMP - RTMP URL,
for HLS - URL of the M3U8 media playlist,
for HDS - URL of the F4M manifest,
for DASH - URL of the MPD manifest or
base URL representing the media
if MPD manifest is parsed from
a string,
for MSS - URL of the ISM manifest.
* manifest_url
The URL of the manifest file in case of
fragmented media (DASH, hls, hds)
fragmented media:
for HLS - URL of the M3U8 master playlist,
for HDS - URL of the F4M manifest,
for DASH - URL of the MPD manifest,
for MSS - URL of the ISM manifest.
* ext Will be calculated from URL if missing
* format A human-readable description of the format
("mp4 container with h264/opus").
@@ -788,7 +802,7 @@ class InfoExtractor(object):
fatal=True, encoding=None, data=None, headers={}, query={},
expected_status=None):
"""
Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
Return a tuple (xml as a compat_etree_Element, URL handle).
See _download_webpage docstring for arguments specification.
"""
@@ -809,7 +823,7 @@ class InfoExtractor(object):
transform_source=None, fatal=True, encoding=None,
data=None, headers={}, query={}, expected_status=None):
"""
Return the xml as an xml.etree.ElementTree.Element.
Return the xml as a compat_etree_Element.
See _download_webpage docstring for arguments specification.
"""
@@ -1058,7 +1072,7 @@ class InfoExtractor(object):
@staticmethod
def _og_regexes(prop):
content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
% {'prop': re.escape(prop)})
template = r'<meta[^>]+?%s[^>]+?%s'
return [
@@ -1454,6 +1468,9 @@ class InfoExtractor(object):
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
fatal=True, m3u8_id=None):
if not isinstance(manifest, compat_etree_Element) and not fatal:
return []
# currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
if akamai_pv is not None and ';' in akamai_pv.text:
@@ -2120,7 +2137,8 @@ class InfoExtractor(object):
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
'url': base_url,
# NB: mpd_url may be empty when MPD manifest is parsed from a string
'url': mpd_url or base_url,
'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),

View File

@@ -1,7 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals, division
import hashlib
import hmac
import re
import time
from .common import InfoExtractor
from ..compat import compat_HTTPError
@@ -74,13 +77,16 @@ class CrackleIE(InfoExtractor):
for country in countries:
try:
# Authorization generation algorithm is reverse engineered from:
# https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js
media_detail_url = 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country)
timestamp = time.strftime('%Y%m%d%H%M', time.gmtime())
h = hmac.new(b'IGSLUQCBDFHEOIFM', '|'.join([media_detail_url, timestamp]).encode(), hashlib.sha1).hexdigest().upper()
media = self._download_json(
'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s'
% (video_id, country), video_id,
'Downloading media JSON as %s' % country,
'Unable to download media JSON', query={
'disableProtocols': 'true',
'format': 'json'
media_detail_url, video_id, 'Downloading media JSON as %s' % country,
'Unable to download media JSON', headers={
'Accept': 'application/json',
'Authorization': '|'.join([h, timestamp, '117', '1']),
})
except ExtractorError as e:
# 401 means geo restriction, trying next country

View File

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import re
import json
import xml.etree.ElementTree as etree
import zlib
from hashlib import sha1
@@ -12,6 +11,7 @@ from .common import InfoExtractor
from .vrv import VRVIE
from ..compat import (
compat_b64decode,
compat_etree_Element,
compat_etree_fromstring,
compat_urllib_parse_urlencode,
compat_urllib_request,
@@ -56,22 +56,11 @@ class CrunchyrollBaseIE(InfoExtractor):
if username is None:
return
self._download_webpage(
'https://www.crunchyroll.com/?a=formhandler',
None, 'Logging in', 'Wrong login info',
data=urlencode_postdata({
'formname': 'RpcApiUser_Login',
'next_url': 'https://www.crunchyroll.com/acct/membership',
'name': username,
'password': password,
}))
'''
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
def is_logged(webpage):
return '<title>Redirecting' in webpage
return 'href="/logout"' in webpage
# Already logged in
if is_logged(login_page):
@@ -110,7 +99,6 @@ class CrunchyrollBaseIE(InfoExtractor):
raise ExtractorError('Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
'''
def _real_initialize(self):
self._login()
@@ -402,7 +390,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'Downloading subtitles for ' + sub_name, data={
'subtitle_script_id': sub_id,
})
if not isinstance(sub_doc, etree.Element):
if not isinstance(sub_doc, compat_etree_Element):
continue
sid = sub_doc.get('id')
iv = xpath_text(sub_doc, 'iv', 'subtitle iv')
@@ -519,7 +507,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'video_quality': stream_quality,
'current_page': url,
})
if isinstance(streamdata, etree.Element):
if isinstance(streamdata, compat_etree_Element):
stream_info = streamdata.find('./{default}preload/stream_info')
if stream_info is not None:
stream_infos.append(stream_info)
@@ -530,7 +518,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'video_format': stream_format,
'video_encode_quality': stream_quality,
})
if isinstance(stream_info, etree.Element):
if isinstance(stream_info, compat_etree_Element):
stream_infos.append(stream_info)
for stream_info in stream_infos:
video_encode_id = xpath_text(stream_info, './video_encode_id')
@@ -605,7 +593,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
season = episode = episode_number = duration = thumbnail = None
if isinstance(metadata, etree.Element):
if isinstance(metadata, compat_etree_Element):
season = xpath_text(metadata, 'series_title')
episode = xpath_text(metadata, 'episode_title')
episode_number = int_or_none(xpath_text(metadata, 'episode_number'))

View File

@@ -4,7 +4,9 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
NO_DEFAULT,
parse_duration,
str_to_int,
)
@@ -65,6 +67,9 @@ class DrTuberIE(InfoExtractor):
})
self._sort_formats(formats)
duration = int_or_none(video_data.get('duration')) or parse_duration(
video_data.get('duration_format'))
title = self._html_search_regex(
(r'<h1[^>]+class=["\']title[^>]+>([^<]+)',
r'<title>([^<]+)\s*@\s+DrTuber',
@@ -103,4 +108,5 @@ class DrTuberIE(InfoExtractor):
'comment_count': comment_count,
'categories': categories,
'age_limit': self._rta_search(webpage),
'duration': duration,
}

View File

@@ -171,10 +171,13 @@ class DRTVIE(InfoExtractor):
continue
target = link.get('Target')
format_id = target or ''
preference = None
if asset_target in ('SpokenSubtitles', 'SignLanguage'):
if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'):
preference = -1
format_id += '-%s' % asset_target
elif asset_target == 'Default':
preference = 1
else:
preference = None
if target == 'HDS':
f4m_formats = self._extract_f4m_formats(
uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',

View File

@@ -29,7 +29,8 @@ class ESPNIE(OnceIE):
(?:
.*?\?.*?\bid=|
/_/id/
)
)|
[^/]+/video/
)
)|
(?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/
@@ -94,6 +95,9 @@ class ESPNIE(OnceIE):
}, {
'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets',
'only_matching': True,
}, {
'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings',
'only_matching': True,
}]
def _real_extract(self, url):

View File

@@ -593,6 +593,7 @@ from .linkedin import (
LinkedInLearningIE,
LinkedInLearningCourseIE,
)
from .linuxacademy import LinuxAcademyIE
from .litv import LiTVIE
from .liveleak import (
LiveLeakIE,
@@ -619,6 +620,7 @@ from .mailru import (
MailRuMusicSearchIE,
)
from .makertv import MakerTVIE
from .malltv import MallTVIE
from .mangomolo import (
MangomoloVideoIE,
MangomoloLiveIE,
@@ -1058,7 +1060,10 @@ from .southpark import (
SouthParkEsIE,
SouthParkNlIE
)
from .spankbang import SpankBangIE
from .spankbang import (
SpankBangIE,
SpankBangPlaylistIE,
)
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
@@ -1167,6 +1172,7 @@ from .toutv import TouTvIE
from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .trunews import TruNewsIE
from .trutv import TruTVIE
from .tube8 import Tube8IE
from .tubitv import TubiTvIE
@@ -1212,7 +1218,7 @@ from .tvnow import (
from .tvp import (
TVPEmbedIE,
TVPIE,
TVPSeriesIE,
TVPWebsiteIE,
)
from .tvplay import (
TVPlayIE,
@@ -1362,7 +1368,6 @@ from .voxmedia import (
VoxMediaVolumeIE,
VoxMediaIE,
)
from .vporn import VpornIE
from .vrt import VRTIE
from .vrak import VrakIE
from .vrv import (

View File

@@ -424,7 +424,7 @@ class FacebookIE(InfoExtractor):
uploader = clean_html(get_element_by_id(
'fbPhotoPageAuthorName', webpage)) or self._search_regex(
r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
fatal=False) or self._og_search_title(webpage, fatal=False)
default=None) or self._og_search_title(webpage, fatal=False)
timestamp = int_or_none(self._search_regex(
r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
'timestamp', default=None))

View File

@@ -215,7 +215,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
_TESTS = [{
'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
'info_dict': {
'id': '162311093',
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
'ext': 'mp4',
'title': '13h15, le dimanche... - Les mystères de Jésus',
'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
@@ -271,7 +271,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
catalogue = None
video_id = self._search_regex(
r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1',
r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'video id', default=None, group='id')
if not video_id:

View File

@@ -27,6 +27,10 @@ class ImgurIE(InfoExtractor):
}, {
'url': 'https://i.imgur.com/crGpqCV.mp4',
'only_matching': True,
}, {
# no title
'url': 'https://i.imgur.com/jxBXAMC.gifv',
'only_matching': True,
}]
def _real_extract(self, url):
@@ -87,7 +91,7 @@ class ImgurIE(InfoExtractor):
return {
'id': video_id,
'formats': formats,
'title': self._og_search_title(webpage),
'title': self._og_search_title(webpage, default=video_id),
}

View File

@@ -1,12 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
get_element_by_class,
parse_duration,
strip_or_none,
unified_strdate,
)
@@ -21,7 +23,9 @@ class LibsynIE(InfoExtractor):
'id': '6385796',
'ext': 'mp3',
'title': "Champion Minded - Developing a Growth Mindset",
'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
# description fetched using another request:
# http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796
# 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
'upload_date': '20180320',
'thumbnail': 're:^https?://.*',
},
@@ -38,22 +42,36 @@ class LibsynIE(InfoExtractor):
}]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
url = m.group('mainurl')
url, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, video_id)
podcast_title = self._search_regex(
r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None)
if podcast_title:
podcast_title = podcast_title.strip()
episode_title = self._search_regex(
r'(?:<div class="episode-title">|<h4>)([^<]+)</', webpage, 'episode title')
if episode_title:
episode_title = episode_title.strip()
data = self._parse_json(self._search_regex(
r'var\s+playlistItem\s*=\s*({.+?});',
webpage, 'JSON data block'), video_id)
episode_title = data.get('item_title') or get_element_by_class('episode-title', webpage)
if not episode_title:
self._search_regex(
[r'data-title="([^"]+)"', r'<title>(.+?)</title>'],
webpage, 'episode title')
episode_title = episode_title.strip()
podcast_title = strip_or_none(clean_html(self._search_regex(
r'<h3>([^<]+)</h3>', webpage, 'podcast title',
default=None) or get_element_by_class('podcast-title', webpage)))
title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
formats = []
for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')):
f_url = data.get(k)
if not f_url:
continue
formats.append({
'url': f_url,
'format_id': format_id,
})
description = self._html_search_regex(
r'<p\s+id="info_text_body">(.+?)</p>', webpage,
'description', default=None)
@@ -61,27 +79,15 @@ class LibsynIE(InfoExtractor):
# Strip non-breaking and normal spaces
description = description.replace('\u00A0', ' ').strip()
release_date = unified_strdate(self._search_regex(
r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block')
data = json.loads(data_json)
formats = [{
'url': data['media_url'],
'format_id': 'main',
}, {
'url': data['media_url_libsyn'],
'format_id': 'libsyn',
}]
thumbnail = data.get('thumbnail_url')
duration = parse_duration(data.get('duration'))
r'<div class="release_date">Released: ([^<]+)<',
webpage, 'release date', default=None) or data.get('release_date'))
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'thumbnail': data.get('thumbnail_url'),
'upload_date': release_date,
'duration': duration,
'duration': parse_duration(data.get('duration')),
'formats': formats,
}

View File

@@ -34,12 +34,15 @@ class LinkedInLearningBaseIE(InfoExtractor):
'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value,
}, query=query)['elements'][0]
def _get_video_id(self, urn, course_slug, video_slug):
def _get_urn_id(self, video_data):
urn = video_data.get('urn')
if urn:
mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn)
if mobj:
return mobj.group(1)
return '%s/%s' % (course_slug, video_slug)
def _get_video_id(self, video_data, course_slug, video_slug):
return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)
def _real_initialize(self):
email, password = self._get_login_info()
@@ -123,7 +126,7 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr'))
return {
'id': self._get_video_id(video_data.get('urn'), course_slug, video_slug),
'id': self._get_video_id(video_data, course_slug, video_slug),
'title': title,
'formats': formats,
'thumbnail': video_data.get('defaultThumbnail'),
@@ -154,18 +157,21 @@ class LinkedInLearningCourseIE(LinkedInLearningBaseIE):
course_data = self._call_api(course_slug, 'chapters,description,title')
entries = []
for chapter in course_data.get('chapters', []):
for chapter_number, chapter in enumerate(course_data.get('chapters', []), 1):
chapter_title = chapter.get('title')
chapter_id = self._get_urn_id(chapter)
for video in chapter.get('videos', []):
video_slug = video.get('slug')
if not video_slug:
continue
entries.append({
'_type': 'url_transparent',
'id': self._get_video_id(video.get('urn'), course_slug, video_slug),
'id': self._get_video_id(video, course_slug, video_slug),
'title': video.get('title'),
'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug),
'chapter': chapter_title,
'chapter_number': chapter_number,
'chapter_id': chapter_id,
'ie_key': LinkedInLearningIE.ie_key(),
})

View File

@@ -0,0 +1,174 @@
from __future__ import unicode_literals
import json
import random
import re
from .common import InfoExtractor
from ..compat import (
compat_b64decode,
compat_HTTPError,
compat_str,
)
from ..utils import (
ExtractorError,
orderedSet,
unescapeHTML,
urlencode_postdata,
urljoin,
)
class LinuxAcademyIE(InfoExtractor):
    """Extractor for linuxacademy.com lesson pages and course module pages.

    Lesson URLs yield a single video; module URLs yield a playlist whose
    entries are the lesson URLs found on the module page.
    """

    # Verbose regex: exactly one of (chapter_id, lesson_id) or course_id
    # is populated, depending on which URL flavour matched.
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?linuxacademy\.com/cp/
                        (?:
                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
                            modules/view/id/(?P<course_id>\d+)
                        )
                    '''
    _TESTS = [{
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154',
        'info_dict': {
            'id': '1498-2',
            'ext': 'mp4',
            'title': "Introduction to the Practitioner's Brief",
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
        'only_matching': True,
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
        'info_dict': {
            'id': '154',
            'title': 'AWS Certified Cloud Practitioner',
            'description': 'md5:039db7e60e4aac9cf43630e0a75fa834',
        },
        'playlist_count': 41,
        'skip': 'Requires Linux Academy account credentials',
    }]

    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
    _ORIGIN_URL = 'https://linuxacademy.com'
    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
    _NETRC_MACHINE = 'linuxacademy'

    def _real_initialize(self):
        self._login()

    def _login(self):
        """Log in with the configured credentials; do nothing if none are set.

        Raises ExtractorError (expected=True) carrying the site's own
        message when the login endpoint answers with HTTP 401.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        def random_string():
            # 32 random characters used for the 'state' and 'nonce' params.
            # NOTE(review): the alphabet contains no 'W' - presumably copied
            # verbatim from the site's client code; kept as is.
            return ''.join([
                random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
                for _ in range(32)])

        webpage, urlh = self._download_webpage_handle(
            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
                'client_id': self._CLIENT_ID,
                'response_type': 'token id_token',
                'redirect_uri': self._ORIGIN_URL,
                'scope': 'openid email user_impersonation profile',
                'audience': self._ORIGIN_URL,
                'state': random_string(),
                'nonce': random_string(),
            })

        # The authorize page carries a base64-encoded JSON blob passed to
        # atob(); its 'extraParams' member seeds the login payload.
        login_data = self._parse_json(
            self._search_regex(
                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
                'login info', group='value'), None,
            transform_source=lambda x: compat_b64decode(x).decode('utf-8')
        )['extraParams']

        login_data.update({
            'client_id': self._CLIENT_ID,
            'redirect_uri': self._ORIGIN_URL,
            'tenant': 'lacausers',
            'connection': 'Username-Password-Authentication',
            'username': username,
            'password': password,
            'sso': 'true',
        })

        # Final URL of the authorize request, sent as Referer below.
        login_state_url = compat_str(urlh.geturl())

        try:
            login_page = self._download_webpage(
                'https://login.linuxacademy.com/usernamepassword/login', None,
                'Downloading login page', data=json.dumps(login_data).encode(),
                headers={
                    'Content-Type': 'application/json',
                    'Origin': 'https://login.linuxacademy.com',
                    'Referer': login_state_url,
                })
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                # Decode before parsing: json.loads() accepts bytes only
                # on Python 3.6+, and the failure would not be a ValueError.
                error = self._parse_json(
                    e.cause.read().decode('utf-8'), None)
                message = error.get('description') or error['code']
                raise ExtractorError(
                    '%s said: %s' % (self.IE_NAME, message), expected=True)
            raise

        # Re-post the hidden form fields of the login page to the callback.
        callback_page, urlh = self._download_webpage_handle(
            'https://login.linuxacademy.com/login/callback', None,
            'Downloading callback page',
            data=urlencode_postdata(self._hidden_inputs(login_page)),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'https://login.linuxacademy.com',
                'Referer': login_state_url,
            })

        # The access token is read from the URL the callback ended up at.
        access_token = self._search_regex(
            r'access_token=([^=&]+)', compat_str(urlh.geturl()),
            'access token')

        self._download_webpage(
            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
            % access_token, None, 'Downloading token validation page')

    def _real_extract(self, url):
        """Return a playlist for module URLs, or a video info dict for lessons."""
        mobj = re.match(self._VALID_URL, url)
        chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
        item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)

        webpage = self._download_webpage(url, item_id)

        # course path
        if course_id:
            # orderedSet drops duplicate lesson links while keeping page order.
            entries = [
                self.url_result(
                    urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key())
                for lesson_url in orderedSet(re.findall(
                    r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)',
                    webpage))]
            title = unescapeHTML(self._html_search_regex(
                (r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)',
                 r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'),
                webpage, 'title', default=None, group='value'))
            description = unescapeHTML(self._html_search_regex(
                r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
                webpage, 'description', default=None, group='value'))
            return self.playlist_result(entries, course_id, title, description)

        # single video path
        info = self._extract_jwplayer_data(
            webpage, item_id, require_title=False, m3u8_id='hls',)
        title = self._search_regex(
            (r'>Lecture\s*:\s*(?P<value>[^<]+)',
             r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
            'title', group='value')
        info.update({
            'id': item_id,
            'title': title,
        })
        return info

View File

@@ -0,0 +1,53 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import merge_dicts
class MallTVIE(InfoExtractor):
    """Extractor for mall.tv video pages."""

    _VALID_URL = r'https?://(?:www\.)?mall\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
        'md5': '1c4a37f080e1f3023103a7b43458e518',
        'info_dict': {
            'id': 't0zzt0',
            'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
            'ext': 'mp4',
            'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
            'description': 'md5:25fc0ec42a72ba602b602c683fa29deb',
            'duration': 216,
            'timestamp': 1538870400,
            'upload_date': '20181007',
            'view_count': int,
        }
    }, {
        'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict from the page's <source> tag, JSON-LD and OpenGraph data."""
        display_id = self._match_id(url)
        headers = self.geo_verification_headers()
        webpage = self._download_webpage(url, display_id, headers=headers)

        # Matches a <source> tag up to a src ending in '/<id>/index';
        # the 'id' path component doubles as the video id.
        source_re = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b'
        video_id = self._search_regex(
            source_re, webpage, 'video id', group='id')

        # Append '.m3u8' to every matched src before handing the markup
        # to the generic HTML5 media parser; only the first entry is used.
        patched_webpage = re.sub(source_re, r'\1.m3u8', webpage)
        entries = self._parse_html5_media_entries(
            url, patched_webpage, video_id,
            m3u8_id='hls', m3u8_entry_protocol='m3u8_native')
        media = entries[0]

        json_ld_info = self._search_json_ld(webpage, video_id, default={})

        fallback = {
            'id': video_id,
            'display_id': display_id,
            'title': self._og_search_title(webpage, default=None) or display_id,
            'description': self._og_search_description(webpage, default=None),
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
        }
        return merge_dicts(media, json_ld_info, fallback)

View File

@@ -1,12 +1,13 @@
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse,
compat_urllib_parse_unquote,
compat_urllib_parse_urlencode,
)
from ..utils import (
determine_ext,
@@ -144,7 +145,7 @@ class MetacafeIE(InfoExtractor):
headers = {
# Disable family filter
'Cookie': 'user=%s; ' % compat_urllib_parse_urlencode({'ffilter': False})
'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False}))
}
# AnyClip videos require the flashversion cookie so that we get the link

View File

@@ -57,7 +57,8 @@ class NoovoIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
bc_url = BrightcoveNewIE._extract_url(self, webpage)
brightcove_id = self._search_regex(
r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
data = self._parse_json(
self._search_regex(
@@ -89,7 +90,10 @@ class NoovoIE(InfoExtractor):
return {
'_type': 'url_transparent',
'ie_key': BrightcoveNewIE.ie_key(),
'url': smuggle_url(bc_url, {'geo_countries': ['CA']}),
'url': smuggle_url(
self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
{'geo_countries': ['CA']}),
'id': brightcove_id,
'title': title,
'description': description,
'series': series,

View File

@@ -12,11 +12,16 @@ from ..utils import (
ExtractorError,
fix_xml_ampersands,
int_or_none,
merge_dicts,
orderedSet,
parse_duration,
qualities,
str_or_none,
strip_jsonp,
unified_strdate,
unified_timestamp,
url_or_none,
urlencode_postdata,
)
@@ -176,9 +181,118 @@ class NPOIE(NPOBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
return self._get_info(video_id)
try:
return self._get_info(url, video_id)
except ExtractorError:
return self._get_old_info(video_id)
def _get_info(self, video_id):
def _get_info(self, url, video_id):
    """Extract formats and metadata for video_id via the npostart.nl player API.

    Requests an API token, posts it to the player endpoint, then queries
    the streams endpoint once per profile. Streams whose 'protection'
    field is set are skipped. Metadata is read from the embed page when
    one is advertised; otherwise only id, a placeholder title and
    formats are returned.
    """
    api_token = self._download_json(
        'https://www.npostart.nl/api/token', video_id,
        'Downloading token', headers={
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest',
        })['token']

    player = self._download_json(
        'https://www.npostart.nl/player/%s' % video_id, video_id,
        'Downloading player JSON', data=urlencode_postdata({
            'autoplay': 0,
            'share': 1,
            'pageUrl': url,
            'hasAdConsent': 0,
            '_token': api_token,
        }))

    player_token = player['token']

    seen_urls = set()
    formats = []
    for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'):
        streams = self._download_json(
            'https://start-player.npo.nl/video/%s/streams' % video_id,
            video_id, 'Downloading %s profile JSON' % profile, fatal=False,
            query={
                'profile': profile,
                'quality': 'npo',
                'tokenId': player_token,
                'streamType': 'broadcast',
            })
        if not streams:
            continue
        stream = streams.get('stream')
        if not isinstance(stream, dict):
            continue
        src = url_or_none(stream.get('src'))
        if not src or src in seen_urls:
            continue
        # Remember the URL even when the stream is skipped just below.
        seen_urls.add(src)
        if stream.get('protection') is not None:
            continue
        mime_type = stream.get('type')
        ext = determine_ext(src)
        if mime_type == 'application/dash+xml' or ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                src, video_id, mpd_id='dash', fatal=False))
        elif mime_type == 'application/vnd.apple.mpegurl' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                src, video_id, ext='mp4',
                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
        elif re.search(r'\.isml?/Manifest', src):
            formats.extend(self._extract_ism_formats(
                src, video_id, ism_id='mss', fatal=False))
        else:
            formats.append({
                'url': src,
            })

    self._sort_formats(formats)

    info = {
        'id': video_id,
        'title': video_id,
        'formats': formats,
    }

    # Everything below is optional enrichment from the embed page.
    embed_url = url_or_none(player.get('embedUrl'))
    if not embed_url:
        return info

    webpage = self._download_webpage(
        embed_url, video_id, 'Downloading embed page', fatal=False)
    if not webpage:
        return info

    video = self._parse_json(
        self._search_regex(
            r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video',
            default='{}'), video_id)
    if not video:
        return info

    title = video.get('episodeTitle')

    subtitles = {}
    subtitles_list = video.get('subtitles')
    if isinstance(subtitles_list, list):
        for track in subtitles_list:
            track_url = url_or_none(track.get('src'))
            if not track_url:
                continue
            lang = str_or_none(track.get('language')) or 'nl'
            subtitles.setdefault(lang, []).append({
                'url': track_url,
            })

    thumbnail = url_or_none(
        video.get('still_image_url') or video.get('orig_image_url'))

    # Embed-page values come first in merge_dicts; info is the fallback.
    return merge_dicts({
        'title': title,
        'description': video.get('description'),
        'thumbnail': thumbnail,
        'duration': int_or_none(video.get('duration')),
        'timestamp': unified_timestamp(video.get('broadcastDate')),
        'creator': video.get('channel'),
        'series': video.get('title'),
        'episode': title,
        'episode_number': int_or_none(video.get('episodeNumber')),
        'subtitles': subtitles,
    }, info)
def _get_old_info(self, video_id):
metadata = self._download_json(
'http://e.omroep.nl/metadata/%s' % video_id,
video_id,
@@ -280,7 +394,7 @@ class NPOIE(NPOBaseIE):
# JSON
else:
video_url = stream_info.get('url')
if not video_url or video_url in urls:
if not video_url or 'vodnotavailable.' in video_url or video_url in urls:
continue
urls.add(video_url)
if determine_ext(video_url) == 'm3u8':

View File

@@ -248,8 +248,8 @@ class OpenloadIE(InfoExtractor):
(?P<host>
(?:www\.)?
(?:
openload\.(?:co|io|link)|
oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club)
openload\.(?:co|io|link|pw)|
oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw|live|space)
)
)/
(?:f|embed)/
@@ -337,6 +337,21 @@ class OpenloadIE(InfoExtractor):
}, {
'url': 'https://oload.club/f/Nr1L-aZ2dbQ',
'only_matching': True,
}, {
'url': 'https://oload.info/f/5NEAbI2BDSk',
'only_matching': True,
}, {
'url': 'https://openload.pw/f/WyKgK8s94N0',
'only_matching': True,
}, {
'url': 'https://oload.pw/f/WyKgK8s94N0',
'only_matching': True,
}, {
'url': 'https://oload.live/f/-Z58UZ-GR4M',
'only_matching': True,
}, {
'url': 'https://oload.space/f/IY4eZSst3u8/',
'only_matching': True,
}]
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

View File

@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
unescapeHTML,
)
@@ -75,6 +76,14 @@ class PeriscopeIE(PeriscopeBaseIE):
'url': broadcast[image],
} for image in ('image_url', 'image_url_small') if broadcast.get(image)]
width = int_or_none(broadcast.get('width'))
height = int_or_none(broadcast.get('height'))
def add_width_and_height(f):
for key, val in (('width', width), ('height', height)):
if not f.get(key):
f[key] = val
video_urls = set()
formats = []
for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'):
@@ -83,16 +92,21 @@ class PeriscopeIE(PeriscopeBaseIE):
continue
video_urls.add(video_url)
if format_id != 'rtmp':
formats.extend(self._extract_m3u8_formats(
m3u8_formats = self._extract_m3u8_formats(
video_url, token, 'mp4',
entry_protocol='m3u8_native'
if state in ('ended', 'timed_out') else 'm3u8',
m3u8_id=format_id, fatal=False))
m3u8_id=format_id, fatal=False)
if len(m3u8_formats) == 1:
add_width_and_height(m3u8_formats[0])
formats.extend(m3u8_formats)
continue
formats.append({
rtmp_format = {
'url': video_url,
'ext': 'flv' if format_id == 'rtmp' else 'mp4',
})
}
add_width_and_height(rtmp_format)
formats.append(rtmp_format)
self._sort_formats(formats)
return {

View File

@@ -4,9 +4,11 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
js_to_json,
urljoin,
)
@@ -14,7 +16,7 @@ class PornHdIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
_TESTS = [{
'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
'md5': 'c8b964b1f0a4b5f7f28ae3a5c9f86ad5',
'md5': '87f1540746c1d32ec7a2305c12b96b25',
'info_dict': {
'id': '9864',
'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
@@ -23,6 +25,7 @@ class PornHdIE(InfoExtractor):
'description': 'md5:3748420395e03e31ac96857a8f125b2b',
'thumbnail': r're:^https?://.*\.jpg',
'view_count': int,
'like_count': int,
'age_limit': 18,
}
}, {
@@ -37,6 +40,7 @@ class PornHdIE(InfoExtractor):
'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
'thumbnail': r're:^https?://.*\.jpg',
'view_count': int,
'like_count': int,
'age_limit': 18,
},
'skip': 'Not available anymore',
@@ -65,12 +69,14 @@ class PornHdIE(InfoExtractor):
formats = []
for format_id, video_url in sources.items():
video_url = urljoin(url, video_url)
if not video_url:
continue
height = int_or_none(self._search_regex(
r'^(\d+)[pP]', format_id, 'height', default=None))
formats.append({
'url': video_url,
'ext': determine_ext(video_url, 'mp4'),
'format_id': format_id,
'height': height,
})
@@ -85,6 +91,11 @@ class PornHdIE(InfoExtractor):
r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage,
'thumbnail', fatal=False, group='url')
like_count = int_or_none(self._search_regex(
(r'(\d+)\s*</11[^>]+>(?:&nbsp;|\s)*\blikes',
r'class=["\']save-count["\'][^>]*>\s*(\d+)'),
webpage, 'like count', fatal=False))
return {
'id': video_id,
'display_id': display_id,
@@ -92,6 +103,7 @@ class PornHdIE(InfoExtractor):
'description': description,
'thumbnail': thumbnail,
'view_count': view_count,
'like_count': like_count,
'formats': formats,
'age_limit': 18,
}

View File

@@ -16,7 +16,6 @@ from .openload import PhantomJSwrapper
from ..utils import (
ExtractorError,
int_or_none,
js_to_json,
orderedSet,
remove_quotes,
str_to_int,
@@ -303,14 +302,12 @@ class PornHubIE(PornHubBaseIE):
comment_count = self._extract_count(
r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
page_params = self._parse_json(self._search_regex(
r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
webpage, 'page parameters', group='data', default='{}'),
video_id, transform_source=js_to_json, fatal=False)
tags = categories = None
if page_params:
tags = page_params.get('tags', '').split(',')
categories = page_params.get('categories', '').split(',')
def extract_list(meta_key):
div = self._search_regex(
r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
% meta_key, webpage, meta_key, default=None)
if div:
return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div)
return {
'id': video_id,
@@ -325,8 +322,8 @@ class PornHubIE(PornHubBaseIE):
'comment_count': comment_count,
'formats': formats,
'age_limit': 18,
'tags': tags,
'categories': categories,
'tags': extract_list('tags'),
'categories': extract_list('categories'),
'subtitles': subtitles,
}

View File

@@ -4,16 +4,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
xpath_text,
find_xpath_attr,
determine_ext,
ExtractorError,
int_or_none,
unified_strdate,
xpath_element,
ExtractorError,
determine_protocol,
unsmuggle_url,
)
@@ -61,107 +57,67 @@ class RadioCanadaIE(InfoExtractor):
'only_matching': True,
}
]
_GEO_COUNTRIES = ['CA']
_access_token = None
_claims = None
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
app_code, video_id = re.match(self._VALID_URL, url).groups()
metadata = self._download_xml(
'http://api.radio-canada.ca/metaMedia/v1/index.ashx',
video_id, note='Downloading metadata XML', query={
def _call_api(self, path, video_id=None, app_code=None, query=None):
    """Call the services.radio-canada.ca media API and return the parsed JSON.

    path is appended to the API base URL. The client key and JSON output
    flag are always sent; appCode/idMedia are added when video_id is
    given, and the access token when one is set on the instance.

    Raises ExtractorError (expected=True) with the service's own message
    on HTTP 401 or 422; any other failure is re-raised unchanged.
    """
    # Copy first so the dict passed in by the caller is never modified.
    query = dict(query or {})
    query.update({
        'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb',
        'output': 'json',
    })
    if video_id:
        query.update({
            'appCode': app_code,
            'idMedia': video_id,
        })
    if self._access_token:
        query['access_token'] = self._access_token
    try:
        return self._download_json(
            'https://services.radio-canada.ca/media/' + path, video_id, query=query)
    except ExtractorError as e:
        if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422):
            # Explicit UTF-8: a bare .decode() falls back to ASCII on
            # Python 2 and would fail on accented error messages.
            data = self._parse_json(e.cause.read().decode('utf-8'), None)
            error = data.get('error_description') or data['errorMessage']['text']
            raise ExtractorError(error, expected=True)
        raise
def _extract_info(self, app_code, video_id):
metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas']
def get_meta(name):
el = find_xpath_attr(metadata, './/Meta', 'name', name)
return el.text if el is not None else None
for meta in metas:
if meta.get('name') == name:
text = meta.get('text')
if text:
return text
# protectionType does not necessarily mean the video is DRM protected (see
# https://github.com/rg3/youtube-dl/pull/18609).
if get_meta('protectionType'):
self.report_warning('This video is probably DRM protected.')
device_types = ['ipad']
if not smuggled_data:
device_types.append('flash')
device_types.append('android')
formats = []
error = None
# TODO: extract f4m formats
# f4m formats can be extracted using flashhd device_type but they produce unplayable file
for device_type in device_types:
validation_url = 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx'
query = {
'appCode': app_code,
'idMedia': video_id,
'connectionType': 'broadband',
'multibitrate': 'true',
'deviceType': device_type,
}
if smuggled_data:
validation_url = 'https://services.radio-canada.ca/media/validation/v2/'
query.update(smuggled_data)
else:
query.update({
# paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction
'paysJ391wsHjbOJwvCs26toz': 'CA',
'bypasslock': 'NZt5K62gRqfc',
})
v_data = self._download_xml(validation_url, video_id, note='Downloading %s XML' % device_type, query=query, fatal=False)
v_url = xpath_text(v_data, 'url')
if not v_url:
continue
if v_url == 'null':
error = xpath_text(v_data, 'message')
continue
ext = determine_ext(v_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
v_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
v_url, video_id, f4m_id='hds', fatal=False))
else:
ext = determine_ext(v_url)
bitrates = xpath_element(v_data, 'bitrates')
for url_e in bitrates.findall('url'):
tbr = int_or_none(url_e.get('bitrate'))
if not tbr:
continue
f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url)
protocol = determine_protocol({'url': f_url})
f = {
'format_id': '%s-%d' % (protocol, tbr),
'url': f_url,
'ext': 'flv' if protocol == 'rtmp' else ext,
'protocol': protocol,
'width': int_or_none(url_e.get('width')),
'height': int_or_none(url_e.get('height')),
'tbr': tbr,
}
mobj = re.match(r'(?P<url>rtmp://[^/]+/[^/]+)/(?P<playpath>[^?]+)(?P<auth>\?.+)', f_url)
if mobj:
f.update({
'url': mobj.group('url') + mobj.group('auth'),
'play_path': mobj.group('playpath'),
})
formats.append(f)
if protocol == 'rtsp':
base_url = self._search_regex(
r'rtsp://([^?]+)', f_url, 'base url', default=None)
if base_url:
base_url = 'http://' + base_url
formats.extend(self._extract_m3u8_formats(
base_url + '/playlist.m3u8', video_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False))
formats.extend(self._extract_f4m_formats(
base_url + '/manifest.f4m', video_id,
f4m_id='hds', fatal=False))
if not formats and error:
query = {
'connectionType': 'hd',
'deviceType': 'ipad',
'multibitrate': 'true',
}
if self._claims:
query['claims'] = self._claims
v_data = self._call_api('validation/v2/', video_id, app_code, query)
v_url = v_data.get('url')
if not v_url:
error = v_data['message']
if error == "Le contenu sélectionné n'est pas disponible dans votre pays":
raise self.raise_geo_restricted(error, self._GEO_COUNTRIES)
if error == 'Le contenu sélectionné est disponible seulement en premium':
self.raise_login_required(error)
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, error), expected=True)
formats = self._extract_m3u8_formats(v_url, video_id, 'mp4')
self._sort_formats(formats)
subtitles = {}
@@ -186,11 +142,14 @@ class RadioCanadaIE(InfoExtractor):
'formats': formats,
}
def _real_extract(self, url):
    """Split the URL into its _VALID_URL groups and delegate to _extract_info."""
    url_groups = re.match(self._VALID_URL, url).groups()
    return self._extract_info(*url_groups)
class RadioCanadaAudioVideoIE(InfoExtractor):
'radiocanada:audiovideo'
_VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P<id>[0-9]+)'
_TEST = {
_VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
'info_dict': {
'id': '7527184',
@@ -203,7 +162,10 @@ class RadioCanadaAudioVideoIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
}
}, {
'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam',
'only_matching': True,
}]
def _real_extract(self, url):
return self.url_result('radiocanada:medianet:%s' % self._match_id(url))

View File

@@ -74,11 +74,11 @@ class RaiBaseIE(InfoExtractor):
if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
continue
if ext == 'm3u8':
if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon':
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
elif ext == 'f4m':
elif ext == 'f4m' or platform == 'flash':
manifest_url = update_url_query(
media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
{'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
@@ -288,7 +288,7 @@ class RaiPlayPlaylistIE(InfoExtractor):
class RaiIE(RaiBaseIE):
_VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
_VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
_TESTS = [{
# var uniquename = "ContentItem-..."
# data-id="ContentItem-..."
@@ -375,6 +375,9 @@ class RaiIE(RaiBaseIE):
# Direct MMS URL
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
'only_matching': True,
}, {
'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html',
'only_matching': True,
}]
def _extract_from_content_id(self, content_id, url):

View File

@@ -21,7 +21,17 @@ from ..utils import (
class RutubeBaseIE(InfoExtractor):
def _extract_video(self, video, video_id=None, require_title=True):
def _download_api_info(self, video_id, query=None):
    """Download the video JSON for *video_id* from the Rutube API.

    *query* is an optional mapping of extra URL query parameters; a
    ``format=json`` entry is always added to what is sent.

    Returns whatever ``self._download_json`` returns for
    ``http://rutube.ru/api/video/<video_id>/``.
    """
    # Work on a copy: callers may hand the same dict to several API
    # helpers, and it must not be modified behind their back.
    query = dict(query or {})
    query['format'] = 'json'
    return self._download_json(
        'http://rutube.ru/api/video/%s/' % video_id,
        video_id, 'Downloading video JSON',
        'Unable to download video JSON', query=query)
@staticmethod
def _extract_info(video, video_id=None, require_title=True):
title = video['title'] if require_title else video.get('title')
age_limit = video.get('is_adult')
@@ -32,7 +42,7 @@ class RutubeBaseIE(InfoExtractor):
category = try_get(video, lambda x: x['category']['name'])
return {
'id': video.get('id') or video_id,
'id': video.get('id') or video_id if video_id else video['id'],
'title': title,
'description': video.get('description'),
'thumbnail': video.get('thumbnail_url'),
@@ -47,6 +57,42 @@ class RutubeBaseIE(InfoExtractor):
'is_live': bool_or_none(video.get('is_livestream')),
}
def _download_and_extract_info(self, video_id, query=None):
    """Download the video JSON for *video_id* and build the info dict.

    Chains ``_download_api_info`` (passing *query* through) into
    ``_extract_info`` and returns the latter's result.
    """
    video = self._download_api_info(video_id, query=query)
    return self._extract_info(video, video_id)
def _download_api_options(self, video_id, query=None):
    """Download the play options JSON for *video_id* from the Rutube API.

    *query* is an optional mapping of extra URL query parameters; a
    ``format=json`` entry is always added to what is sent. The request
    carries the headers returned by ``self.geo_verification_headers()``.

    Returns whatever ``self._download_json`` returns for
    ``http://rutube.ru/api/play/options/<video_id>/``.
    """
    # Work on a copy: callers may hand the same dict to several API
    # helpers, and it must not be modified behind their back.
    query = dict(query or {})
    query['format'] = 'json'
    return self._download_json(
        'http://rutube.ru/api/play/options/%s/' % video_id,
        video_id, 'Downloading options JSON',
        'Unable to download options JSON',
        headers=self.geo_verification_headers(), query=query)
def _extract_formats(self, options, video_id):
    """Build the formats list from an options JSON object.

    Iterates ``options['video_balancer']`` (format id -> URL). URLs whose
    extension is ``m3u8`` or ``f4m`` are expanded through the matching
    manifest helper in non-fatal mode; any other URL becomes a single
    format entry. The list is passed to ``self._sort_formats`` before
    being returned.
    """
    collected = []
    for balancer_id, balancer_url in options['video_balancer'].items():
        kind = determine_ext(balancer_url)
        if kind == 'm3u8':
            collected.extend(self._extract_m3u8_formats(
                balancer_url, video_id, 'mp4', m3u8_id=balancer_id,
                fatal=False))
            continue
        if kind == 'f4m':
            collected.extend(self._extract_f4m_formats(
                balancer_url, video_id, f4m_id=balancer_id, fatal=False))
            continue
        # Neither manifest type: use the URL as it is.
        collected.append({
            'url': balancer_url,
            'format_id': balancer_id,
        })
    self._sort_formats(collected)
    return collected
def _download_and_extract_formats(self, video_id, query=None):
    """Download the play options for *video_id* and build its formats.

    Chains ``_download_api_options`` (passing *query* through) into
    ``_extract_formats`` and returns the latter's result.
    """
    options = self._download_api_options(video_id, query=query)
    return self._extract_formats(options, video_id)
class RutubeIE(RutubeBaseIE):
IE_NAME = 'rutube'
@@ -55,13 +101,13 @@ class RutubeIE(RutubeBaseIE):
_TESTS = [{
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
'md5': '79938ade01294ef7e27574890d0d3769',
'md5': '1d24f180fac7a02f3900712e5a5764d6',
'info_dict': {
'id': '3eac3b4561676c17df9132a9a1e62e3e',
'ext': 'flv',
'ext': 'mp4',
'title': 'Раненный кенгуру забежал в аптеку',
'description': 'http://www.ntdtv.ru ',
'duration': 80,
'duration': 81,
'uploader': 'NTDRussian',
'uploader_id': '29790',
'timestamp': 1381943602,
@@ -94,39 +140,12 @@ class RutubeIE(RutubeBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
'http://rutube.ru/api/video/%s/?format=json' % video_id,
video_id, 'Downloading video JSON')
info = self._extract_video(video, video_id)
options = self._download_json(
'http://rutube.ru/api/play/options/%s/?format=json' % video_id,
video_id, 'Downloading options JSON',
headers=self.geo_verification_headers())
formats = []
for format_id, format_url in options['video_balancer'].items():
ext = determine_ext(format_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
format_url, video_id, f4m_id=format_id, fatal=False))
else:
formats.append({
'url': format_url,
'format_id': format_id,
})
self._sort_formats(formats)
info['formats'] = formats
info = self._download_and_extract_info(video_id)
info['formats'] = self._download_and_extract_formats(video_id)
return info
class RutubeEmbedIE(InfoExtractor):
class RutubeEmbedIE(RutubeBaseIE):
IE_NAME = 'rutube:embed'
IE_DESC = 'Rutube embedded videos'
_VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)'
@@ -135,7 +154,7 @@ class RutubeEmbedIE(InfoExtractor):
'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
'info_dict': {
'id': 'a10e53b86e8f349080f718582ce4c661',
'ext': 'flv',
'ext': 'mp4',
'timestamp': 1387830582,
'upload_date': '20131223',
'uploader_id': '297833',
@@ -149,16 +168,26 @@ class RutubeEmbedIE(InfoExtractor):
}, {
'url': 'http://rutube.ru/play/embed/8083783',
'only_matching': True,
}, {
# private video
'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ',
'only_matching': True,
}]
def _real_extract(self, url):
embed_id = self._match_id(url)
webpage = self._download_webpage(url, embed_id)
canonical_url = self._html_search_regex(
r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage,
'Canonical URL')
return self.url_result(canonical_url, RutubeIE.ie_key())
# Query may contain private videos token and should be passed to API
# requests (see #19163)
query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
options = self._download_api_options(embed_id, query)
video_id = options['effective_video']
formats = self._extract_formats(options, video_id)
info = self._download_and_extract_info(video_id, query)
info.update({
'extractor_key': 'Rutube',
'formats': formats,
})
return info
class RutubePlaylistBaseIE(RutubeBaseIE):
@@ -181,7 +210,7 @@ class RutubePlaylistBaseIE(RutubeBaseIE):
video_url = url_or_none(result.get('video_url'))
if not video_url:
continue
entry = self._extract_video(result, require_title=False)
entry = self._extract_info(result, require_title=False)
entry.update({
'_type': 'url',
'url': video_url,

View File

@@ -1,31 +1,44 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class ServusIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?servus\.com/(?:at|de)/p/[^/]+/(?P<id>AA-\w+|\d+-\d+)'
_VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)'
_TESTS = [{
'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/',
'md5': '046dee641cda1c4cabe13baef3be2c1c',
'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
'info_dict': {
'id': 'AA-1T6VBU5PW1W12',
'ext': 'mp4',
'title': 'Die Grünen aus Volkssicht',
'description': 'md5:052b5da1cb2cd7d562ef1f19be5a5cba',
'thumbnail': r're:^https?://.*\.jpg$',
'title': 'Die Grünen aus Sicht des Volkes',
'description': 'md5:1247204d85783afe3682644398ff2ec4',
'thumbnail': r're:^https?://.*\.jpg',
}
}, {
'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/',
'only_matching': True,
}, {
'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/',
'only_matching': True,
}, {
'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_id = self._match_id(url).upper()
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
title = self._search_regex(
(r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'),
webpage, 'title', default=None,
group='title') or self._og_search_title(webpage)
title = re.sub(r'\s*-\s*Servus TV\s*$', '', title)
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage)

View File

@@ -61,7 +61,8 @@ class SixPlayIE(InfoExtractor):
quality_key = qualities(['lq', 'sd', 'hq', 'hd'])
formats = []
subtitles = {}
for asset in clip_data['assets']:
assets = clip_data.get('assets') or []
for asset in assets:
asset_url = asset.get('full_physical_path')
protocol = asset.get('protocol')
if not asset_url or protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264' or asset_url in urls:

View File

@@ -16,8 +16,10 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
unified_strdate,
try_get,
unified_timestamp,
update_url_query,
url_or_none,
)
@@ -34,7 +36,7 @@ class SoundcloudIE(InfoExtractor):
(?:(?:(?:www\.|m\.)?soundcloud\.com/
(?!stations/track)
(?P<uploader>[\w\d-]+)/
(?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?P<title>[\w\d-]+)/?
(?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
@@ -50,12 +52,17 @@ class SoundcloudIE(InfoExtractor):
'info_dict': {
'id': '62986583',
'ext': 'mp3',
'upload_date': '20121011',
'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
'uploader': 'E.T. ExTerrestrial Music',
'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
'timestamp': 1349920598,
'upload_date': '20121011',
'duration': 143,
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
}
},
# not streamable song
@@ -67,9 +74,14 @@ class SoundcloudIE(InfoExtractor):
'title': 'Goldrushed',
'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
'uploader': 'The Royal Concept',
'timestamp': 1337635207,
'upload_date': '20120521',
'duration': 227,
'duration': 30,
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
'params': {
# rtmp
@@ -84,11 +96,16 @@ class SoundcloudIE(InfoExtractor):
'id': '123998367',
'ext': 'mp3',
'title': 'Youtube - Dl Test Video \'\' Ä↭',
'uploader': 'jaimeMF',
'description': 'test chars: \"\'/\\ä↭',
'uploader': 'jaimeMF',
'timestamp': 1386604920,
'upload_date': '20131209',
'duration': 9,
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
},
# private link (alt format)
@@ -99,11 +116,16 @@ class SoundcloudIE(InfoExtractor):
'id': '123998367',
'ext': 'mp3',
'title': 'Youtube - Dl Test Video \'\' Ä↭',
'uploader': 'jaimeMF',
'description': 'test chars: \"\'/\\ä↭',
'uploader': 'jaimeMF',
'timestamp': 1386604920,
'upload_date': '20131209',
'duration': 9,
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
},
# downloadable song
@@ -116,9 +138,14 @@ class SoundcloudIE(InfoExtractor):
'title': 'Bus Brakes',
'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
'uploader': 'oddsamples',
'timestamp': 1389232924,
'upload_date': '20140109',
'duration': 17,
'license': 'cc-by-sa',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
},
# private link, downloadable format
@@ -131,9 +158,14 @@ class SoundcloudIE(InfoExtractor):
'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
'uploader': 'Ori Uplift Music',
'timestamp': 1504206263,
'upload_date': '20170831',
'duration': 7449,
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
},
# no album art, use avatar pic for thumbnail
@@ -146,10 +178,15 @@ class SoundcloudIE(InfoExtractor):
'title': 'Sideways (Prod. Mad Real)',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'uploader': 'garyvee',
'timestamp': 1488152409,
'upload_date': '20170226',
'duration': 207,
'thumbnail': r're:https?://.*\.jpg',
'license': 'all-rights-reserved',
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
},
'params': {
'skip_download': True,
@@ -157,7 +194,7 @@ class SoundcloudIE(InfoExtractor):
},
]
_CLIENT_ID = 'LvWovRaJZlWCHql0bISuum8Bd2KX79mb'
_CLIENT_ID = 'NmW1FlPaiL94ueEu7oziOWjYEzZzQDcK'
@staticmethod
def _extract_urls(webpage):
@@ -175,22 +212,33 @@ class SoundcloudIE(InfoExtractor):
def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
track_id = compat_str(info['id'])
title = info['title']
name = full_title or track_id
if quiet:
self.report_extraction(name)
thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url')
if isinstance(thumbnail, compat_str):
thumbnail = thumbnail.replace('-large', '-t500x500')
username = try_get(info, lambda x: x['user']['username'], compat_str)
def extract_count(key):
return int_or_none(info.get('%s_count' % key))
result = {
'id': track_id,
'uploader': info.get('user', {}).get('username'),
'upload_date': unified_strdate(info.get('created_at')),
'title': info['title'],
'uploader': username,
'timestamp': unified_timestamp(info.get('created_at')),
'title': title,
'description': info.get('description'),
'thumbnail': thumbnail,
'duration': int_or_none(info.get('duration'), 1000),
'webpage_url': info.get('permalink_url'),
'license': info.get('license'),
'view_count': extract_count('playback'),
'like_count': extract_count('favoritings'),
'comment_count': extract_count('comment'),
'repost_count': extract_count('reposts'),
'genre': info.get('genre'),
}
formats = []
query = {'client_id': self._CLIENT_ID}
@@ -368,7 +416,6 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
_API_BASE = 'https://api.soundcloud.com'
_API_V2_BASE = 'https://api-v2.soundcloud.com'
def _extract_playlist(self, base_url, playlist_id, playlist_title):
@@ -389,21 +436,30 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
next_href, playlist_id, 'Downloading track page %s' % (i + 1))
collection = response['collection']
if not collection:
break
def resolve_permalink_url(candidates):
if not isinstance(collection, list):
collection = []
# Empty collection may be returned, in this case we proceed
# straight to next_href
def resolve_entry(candidates):
for cand in candidates:
if isinstance(cand, dict):
permalink_url = cand.get('permalink_url')
entry_id = self._extract_id(cand)
if permalink_url and permalink_url.startswith('http'):
return permalink_url, entry_id
if not isinstance(cand, dict):
continue
permalink_url = url_or_none(cand.get('permalink_url'))
if not permalink_url:
continue
return self.url_result(
permalink_url,
ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
video_id=self._extract_id(cand),
video_title=cand.get('title'))
for e in collection:
permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
if permalink_url:
entries.append(self.url_result(permalink_url, video_id=entry_id))
entry = resolve_entry((e, e.get('track'), e.get('playlist')))
if entry:
entries.append(entry)
next_href = response.get('next_href')
if not next_href:
@@ -429,46 +485,53 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
(?:(?:www|m)\.)?soundcloud\.com/
(?P<user>[^/]+)
(?:/
(?P<rsrc>tracks|sets|reposts|likes|spotlight)
(?P<rsrc>tracks|albums|sets|reposts|likes|spotlight)
)?
/?(?:[?#].*)?$
'''
IE_NAME = 'soundcloud:user'
_TESTS = [{
'url': 'https://soundcloud.com/the-akashic-chronicler',
'url': 'https://soundcloud.com/soft-cell-official',
'info_dict': {
'id': '114582580',
'title': 'The Akashic Chronicler (All)',
'id': '207965082',
'title': 'Soft Cell (All)',
},
'playlist_mincount': 74,
'playlist_mincount': 28,
}, {
'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
'url': 'https://soundcloud.com/soft-cell-official/tracks',
'info_dict': {
'id': '114582580',
'title': 'The Akashic Chronicler (Tracks)',
'id': '207965082',
'title': 'Soft Cell (Tracks)',
},
'playlist_mincount': 37,
'playlist_mincount': 27,
}, {
'url': 'https://soundcloud.com/the-akashic-chronicler/sets',
'url': 'https://soundcloud.com/soft-cell-official/albums',
'info_dict': {
'id': '114582580',
'title': 'The Akashic Chronicler (Playlists)',
'id': '207965082',
'title': 'Soft Cell (Albums)',
},
'playlist_mincount': 1,
}, {
'url': 'https://soundcloud.com/jcv246/sets',
'info_dict': {
'id': '12982173',
'title': 'Jordi / cv (Playlists)',
},
'playlist_mincount': 2,
}, {
'url': 'https://soundcloud.com/the-akashic-chronicler/reposts',
'url': 'https://soundcloud.com/jcv246/reposts',
'info_dict': {
'id': '114582580',
'title': 'The Akashic Chronicler (Reposts)',
'id': '12982173',
'title': 'Jordi / cv (Reposts)',
},
'playlist_mincount': 7,
'playlist_mincount': 6,
}, {
'url': 'https://soundcloud.com/the-akashic-chronicler/likes',
'url': 'https://soundcloud.com/clalberg/likes',
'info_dict': {
'id': '114582580',
'title': 'The Akashic Chronicler (Likes)',
'id': '11817582',
'title': 'clalberg (Likes)',
},
'playlist_mincount': 321,
'playlist_mincount': 5,
}, {
'url': 'https://soundcloud.com/grynpyret/spotlight',
'info_dict': {
@@ -479,10 +542,11 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
}]
_BASE_URL_MAP = {
'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE,
'all': '%s/stream/users/%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'albums': '%s/users/%%s/albums' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'reposts': '%s/stream/users/%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
}
@@ -490,6 +554,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
_TITLE_MAP = {
'all': 'All',
'tracks': 'Tracks',
'albums': 'Albums',
'sets': 'Playlists',
'reposts': 'Reposts',
'likes': 'Likes',

View File

@@ -5,14 +5,17 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
orderedSet,
parse_duration,
parse_resolution,
str_to_int,
url_or_none,
urlencode_postdata,
)
class SpankBangIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|m|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video'
_VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/(?:video|play|embed)\b'
_TESTS = [{
'url': 'http://spankbang.com/3vvn/video/fantasy+solo',
'md5': '1cc433e1d6aa14bc376535b8679302f7',
@@ -41,29 +44,71 @@ class SpankBangIE(InfoExtractor):
# 4k
'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k',
'only_matching': True,
}, {
'url': 'https://m.spankbang.com/3vvn/play/fantasy+solo/480p/',
'only_matching': True,
}, {
'url': 'https://m.spankbang.com/3vvn/play',
'only_matching': True,
}, {
'url': 'https://spankbang.com/2y3td/embed/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id, headers={
'Cookie': 'country=US'
})
webpage = self._download_webpage(
url.replace('/%s/embed' % video_id, '/%s/video' % video_id),
video_id, headers={'Cookie': 'country=US'})
if re.search(r'<[^>]+\bid=["\']video_removed', webpage):
raise ExtractorError(
'Video %s is not available' % video_id, expected=True)
formats = []
for mobj in re.finditer(
r'stream_url_(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2',
webpage):
format_id, format_url = mobj.group('id', 'url')
def extract_format(format_id, format_url):
f_url = url_or_none(format_url)
if not f_url:
return
f = parse_resolution(format_id)
f.update({
'url': format_url,
'url': f_url,
'format_id': format_id,
})
formats.append(f)
STREAM_URL_PREFIX = 'stream_url_'
for mobj in re.finditer(
r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2'
% STREAM_URL_PREFIX, webpage):
extract_format(mobj.group('id', 'url'))
if not formats:
stream_key = self._search_regex(
r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
webpage, 'stream key', group='value')
sb_csrf_session = self._get_cookies(
'https://spankbang.com')['sb_csrf_session'].value
stream = self._download_json(
'https://spankbang.com/api/videos/stream', video_id,
'Downloading stream JSON', data=urlencode_postdata({
'id': stream_key,
'data': 0,
'sb_csrf_session': sb_csrf_session,
}), headers={
'Referer': url,
'X-CSRFToken': sb_csrf_session,
})
for format_id, format_url in stream.items():
if format_id.startswith(STREAM_URL_PREFIX):
extract_format(
format_id[len(STREAM_URL_PREFIX):], format_url)
self._sort_formats(formats)
title = self._html_search_regex(
@@ -94,3 +139,33 @@ class SpankBangIE(InfoExtractor):
'formats': formats,
'age_limit': age_limit,
}
class SpankBangPlaylistIE(InfoExtractor):
    # Handles .../<id>/playlist/<slug> pages; each entry is delegated to
    # SpankBangIE via a https://spankbang.com/<video_id>/video URL.
    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/[^/]+'
    _TEST = {
        'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties',
        'info_dict': {
            'id': 'ug0k',
            'title': 'Big Ass Titties',
        },
        'playlist_mincount': 50,
    }

    def _real_extract(self, url):
        """Return a playlist result for the playlist page at *url*."""
        playlist_id = self._match_id(url)

        page = self._download_webpage(
            url, playlist_id, headers={'Cookie': 'country=US; mobile=on'})

        # Collect the ids from the /play/ links; orderedSet drops repeats
        # while keeping first-seen order.
        found_ids = orderedSet(re.findall(
            r'<a[^>]+\bhref=["\']/?([\da-z]+)/play/', page))

        entries = []
        for video_id in found_ids:
            entries.append(self.url_result(
                'https://spankbang.com/%s/video' % video_id,
                ie=SpankBangIE.ie_key(), video_id=video_id))

        # A missing heading is tolerated (fatal=False).
        playlist_title = self._html_search_regex(
            r'<h1>([^<]+)\s+playlist</h1>', page, 'playlist title',
            fatal=False)

        return self.playlist_result(entries, playlist_id, playlist_title)

View File

@@ -46,8 +46,12 @@ class ParamountNetworkIE(MTVServicesInfoExtractor):
_GEO_COUNTRIES = ['US']
def _extract_mgid(self, webpage):
cs = self._parse_json(self._search_regex(
root_data = self._parse_json(self._search_regex(
r'window\.__DATA__\s*=\s*({.+})',
webpage, 'data'), None)['children']
c = next(c for c in cs if c.get('type') == 'VideoPlayer')
webpage, 'data'), None)
def find_sub_data(data, data_type):
return next(c for c in data['children'] if c.get('type') == data_type)
c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer')
return c['props']['media']['video']['config']['uri']

View File

@@ -27,6 +27,7 @@ class TeachableBaseIE(InfoExtractor):
'market.saleshacker.com': 'saleshacker',
'learnability.org': 'learnability',
'edurila.com': 'edurila',
'courses.workitdaily.com': 'workitdaily',
}
_VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys()))

View File

@@ -1,24 +1,21 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from .radiocanada import RadioCanadaIE
from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
int_or_none,
js_to_json,
urlencode_postdata,
extract_attributes,
smuggle_url,
merge_dicts,
)
class TouTvIE(InfoExtractor):
class TouTvIE(RadioCanadaIE):
_NETRC_MACHINE = 'toutv'
IE_NAME = 'tou.tv'
_VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)'
_access_token = None
_claims = None
_TESTS = [{
'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17',
@@ -41,57 +38,31 @@ class TouTvIE(InfoExtractor):
'url': 'https://ici.tou.tv/l-age-adulte/S01C501',
'only_matching': True,
}]
_CLIENT_KEY = '4dd36440-09d5-4468-8923-b6d91174ad36'
def _real_initialize(self):
email, password = self._get_login_info()
if email is None:
return
state = 'http://ici.tou.tv/'
webpage = self._download_webpage(state, None, 'Downloading homepage')
toutvlogin = self._parse_json(self._search_regex(
r'(?s)toutvlogin\s*=\s*({.+?});', webpage, 'toutvlogin'), None, js_to_json)
authorize_url = toutvlogin['host'] + '/auth/oauth/v2/authorize'
login_webpage = self._download_webpage(
authorize_url, None, 'Downloading login page', query={
'client_id': toutvlogin['clientId'],
'redirect_uri': 'https://ici.tou.tv/login/loginCallback',
'response_type': 'token',
'scope': 'media-drmt openid profile email id.write media-validation.read.privileged',
'state': state,
})
def extract_form_url_and_data(wp, default_form_url, form_spec_re=''):
form, form_elem = re.search(
r'(?s)((<form[^>]+?%s[^>]*?>).+?</form>)' % form_spec_re, wp).groups()
form_data = self._hidden_inputs(form)
form_url = extract_attributes(form_elem).get('action') or default_form_url
return form_url, form_data
post_url, form_data = extract_form_url_and_data(
login_webpage,
'https://services.radio-canada.ca/auth/oauth/v2/authorize/login',
r'(?:id|name)="Form-login"')
form_data.update({
'login-email': email,
'login-password': password,
})
consent_webpage = self._download_webpage(
post_url, None, 'Logging in', data=urlencode_postdata(form_data))
post_url, form_data = extract_form_url_and_data(
consent_webpage,
'https://services.radio-canada.ca/auth/oauth/v2/authorize/consent')
_, urlh = self._download_webpage_handle(
post_url, None, 'Following Redirection',
data=urlencode_postdata(form_data))
self._access_token = self._search_regex(
r'access_token=([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
urlh.geturl(), 'access token')
self._claims = self._download_json(
'https://services.radio-canada.ca/media/validation/v2/getClaims',
None, 'Extracting Claims', query={
'token': self._access_token,
'access_token': self._access_token,
})['claims']
try:
self._access_token = self._download_json(
'https://services.radio-canada.ca/toutv/profiling/accounts/login',
None, 'Logging in', data=json.dumps({
'ClientId': self._CLIENT_KEY,
'ClientSecret': '34026772-244b-49b6-8b06-317b30ac9a20',
'Email': email,
'Password': password,
'Scope': 'id.write media-validation.read',
}).encode(), headers={
'Authorization': 'client-key ' + self._CLIENT_KEY,
'Content-Type': 'application/json;charset=utf-8',
})['access_token']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
error = self._parse_json(e.cause.read().decode(), None)['Message']
raise ExtractorError(error, expected=True)
raise
self._claims = self._call_api('validation/v2/getClaims')['claims']
def _real_extract(self, url):
path = self._match_id(url)
@@ -102,19 +73,10 @@ class TouTvIE(InfoExtractor):
self.report_warning('This video is probably DRM protected.', path)
video_id = metadata['IdMedia']
details = metadata['Details']
title = details['OriginalTitle']
video_url = 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id)
if self._access_token and self._claims:
video_url = smuggle_url(video_url, {
'access_token': self._access_token,
'claims': self._claims,
})
return {
'_type': 'url_transparent',
'url': video_url,
return merge_dicts({
'id': video_id,
'title': title,
'title': details.get('OriginalTitle'),
'thumbnail': details.get('ImageUrl'),
'duration': int_or_none(details.get('LengthInSeconds')),
}
}, self._extract_info(metadata.get('AppCode', 'toutv'), video_id))

View File

@@ -0,0 +1,75 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
dict_get,
float_or_none,
int_or_none,
unified_timestamp,
update_url_query,
url_or_none,
)
class TruNewsIE(InfoExtractor):
    # Resolves a trunews.com stream slug to a Zype video through the Zype
    # videos API, then hands playback over to the Zype extractor.
    _VALID_URL = r'https?://(?:www\.)?trunews\.com/stream/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'https://www.trunews.com/stream/will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech',
        'md5': 'a19c024c3906ff954fac9b96ce66bb08',
        'info_dict': {
            'id': '5c5a21e65d3c196e1c0020cc',
            'display_id': 'will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech',
            'ext': 'mp4',
            'title': "Will Democrats Stage a Circus During President Trump's State of the Union Speech?",
            'description': 'md5:c583b72147cc92cf21f56a31aff7a670',
            'duration': 3685,
            'timestamp': 1549411440,
            'upload_date': '20190206',
        },
        'add_ie': ['Zype'],
    }

    def _real_extract(self, url):
        """Return a url_transparent result pointing at the Zype embed."""
        display_id = self._match_id(url)

        # Look the video up by its friendly title (the URL slug) and take
        # the first item of the 'response' list.
        api_reply = self._download_json(
            'https://api.zype.com/videos', display_id, query={
                'app_key': 'PUVKp9WgGUb3-JUw6EqafLx8tFVP6VKZTWbUOR-HOm__g4fNDt1bCsm_LgYf_k9H',
                'per_page': 1,
                'active': 'true',
                'friendly_title': display_id,
            })
        video = api_reply['response'][0]

        zype_id = video['_id']

        # Keep only thumbnail entries that are dicts carrying a valid URL.
        thumbnails = []
        raw_thumbnails = video.get('thumbnails')
        if not isinstance(raw_thumbnails, list):
            raw_thumbnails = []
        for item in raw_thumbnails:
            if not isinstance(item, dict):
                continue
            item_url = url_or_none(item.get('url'))
            if not item_url:
                continue
            thumbnails.append({
                'url': item_url,
                'width': int_or_none(item.get('width')),
                'height': int_or_none(item.get('height')),
            })

        embed_url = update_url_query(
            'https://player.zype.com/embed/%s.js' % zype_id,
            {'api_key': 'X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt'})

        return {
            '_type': 'url_transparent',
            'url': embed_url,
            'ie_key': 'Zype',
            'id': zype_id,
            'display_id': display_id,
            'title': video.get('title'),
            'description': dict_get(video, ('description', 'ott_description', 'short_description')),
            'duration': int_or_none(video.get('duration')),
            'timestamp': unified_timestamp(video.get('published_at')),
            'average_rating': float_or_none(video.get('rating')),
            'view_count': int_or_none(video.get('request_count')),
            'thumbnails': thumbnails,
        }

View File

@@ -4,44 +4,72 @@ from __future__ import unicode_literals
import re
from .turner import TurnerBaseIE
from ..utils import (
int_or_none,
parse_iso8601,
)
class TruTVIE(TurnerBaseIE):
_VALID_URL = r'https?://(?:www\.)?trutv\.com(?:(?P<path>/shows/[^/]+/videos/[^/?#]+?)\.html|/full-episodes/[^/]+/(?P<id>\d+))'
_VALID_URL = r'https?://(?:www\.)?trutv\.com/(?:shows|full-episodes)/(?P<series_slug>[0-9A-Za-z-]+)/(?:videos/(?P<clip_slug>[0-9A-Za-z-]+)|(?P<id>\d+))'
_TEST = {
'url': 'http://www.trutv.com/shows/10-things/videos/you-wont-believe-these-sports-bets.html',
'md5': '2cdc844f317579fed1a7251b087ff417',
'url': 'https://www.trutv.com/shows/the-carbonaro-effect/videos/sunlight-activated-flower.html',
'info_dict': {
'id': '/shows/10-things/videos/you-wont-believe-these-sports-bets',
'id': 'f16c03beec1e84cd7d1a51f11d8fcc29124cc7f1',
'ext': 'mp4',
'title': 'You Won\'t Believe These Sports Bets',
'description': 'Jamie Lee sits down with a bookie to discuss the bizarre world of illegal sports betting.',
'upload_date': '20130305',
}
'title': 'Sunlight-Activated Flower',
'description': "A customer is stunned when he sees Michael's sunlight-activated flower.",
},
'params': {
# m3u8 download
'skip_download': True,
},
}
def _real_extract(self, url):
path, video_id = re.match(self._VALID_URL, url).groups()
auth_required = False
if path:
data_src = 'http://www.trutv.com/video/cvp/v2/xml/content.xml?id=%s.xml' % path
series_slug, clip_slug, video_id = re.match(self._VALID_URL, url).groups()
if video_id:
path = 'episode'
display_id = video_id
else:
webpage = self._download_webpage(url, video_id)
video_id = self._search_regex(
r"TTV\.TVE\.episodeId\s*=\s*'([^']+)';",
webpage, 'video id', default=video_id)
auth_required = self._search_regex(
r'TTV\.TVE\.authRequired\s*=\s*(true|false);',
webpage, 'auth required', default='false') == 'true'
data_src = 'http://www.trutv.com/tveverywhere/services/cvpXML.do?titleId=' + video_id
return self._extract_cvp_info(
data_src, path, {
'secure': {
'media_src': 'http://androidhls-secure.cdn.turner.com/trutv/big',
'tokenizer_src': 'http://www.trutv.com/tveverywhere/processors/services/token_ipadAdobe.do',
},
}, {
path = 'series/clip'
display_id = clip_slug
data = self._download_json(
'https://api.trutv.com/v2/web/%s/%s/%s' % (path, series_slug, display_id),
display_id)
video_data = data['episode'] if video_id else data['info']
media_id = video_data['mediaId']
title = video_data['title'].strip()
info = self._extract_ngtv_info(
media_id, {}, {
'url': url,
'site_name': 'truTV',
'auth_required': auth_required,
'auth_required': video_data.get('isAuthRequired'),
})
thumbnails = []
for image in video_data.get('images', []):
image_url = image.get('srcUrl')
if not image_url:
continue
thumbnails.append({
'url': image_url,
'width': int_or_none(image.get('width')),
'height': int_or_none(image.get('height')),
})
info.update({
'id': media_id,
'display_id': display_id,
'title': title,
'description': video_data.get('description'),
'thumbnails': thumbnails,
'timestamp': parse_iso8601(video_data.get('publicationDate')),
'series': video_data.get('showTitle'),
'season_number': int_or_none(video_data.get('seasonNum')),
'episode_number': int_or_none(video_data.get('episodeNum')),
})
return info

View File

@@ -1,14 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
clean_html,
get_element_by_attribute,
determine_ext,
ExtractorError,
get_element_by_attribute,
orderedSet,
)
@@ -19,12 +21,12 @@ class TVPIE(InfoExtractor):
_TESTS = [{
'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': {
'id': '194536',
'ext': 'mp4',
'title': 'Czas honoru, I seria odc. 13',
'description': 'md5:381afa5bca72655fe94b05cfe82bf53d',
'title': 'Czas honoru, odc. 13 Władek',
'description': 'md5:437f48b93558370b031740546b696e24',
},
}, {
'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
@@ -45,6 +47,7 @@ class TVPIE(InfoExtractor):
'title': 'Wiadomości, 28.09.2017, 19:30',
'description': 'Wydanie główne codziennego serwisu informacyjnego.'
},
'skip': 'HTTP Error 404: Not Found',
}, {
'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
'only_matching': True,
@@ -75,8 +78,10 @@ class TVPIE(InfoExtractor):
return {
'_type': 'url_transparent',
'url': 'tvp:' + video_id,
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(
webpage, default=None) or self._html_search_meta(
'description', webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'ie_key': 'TVPEmbed',
}
@@ -87,6 +92,15 @@ class TVPEmbedIE(InfoExtractor):
_VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)'
_TESTS = [{
'url': 'tvp:194536',
'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': {
'id': '194536',
'ext': 'mp4',
'title': 'Czas honoru, odc. 13 Władek',
},
}, {
# not available
'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268',
'md5': '8c9cd59d16edabf39331f93bf8a766c7',
'info_dict': {
@@ -94,6 +108,7 @@ class TVPEmbedIE(InfoExtractor):
'ext': 'mp4',
'title': 'Panorama, 07.12.2015, 15:40',
},
'skip': 'Transmisja została zakończona lub materiał niedostępny',
}, {
'url': 'tvp:22670268',
'only_matching': True,
@@ -105,10 +120,13 @@ class TVPEmbedIE(InfoExtractor):
webpage = self._download_webpage(
'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
error_massage = get_element_by_attribute('class', 'msg error', webpage)
if error_massage:
error = self._html_search_regex(
r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>',
webpage, 'error', default=None) or clean_html(
get_element_by_attribute('class', 'msg error', webpage))
if error:
raise ExtractorError('%s said: %s' % (
self.IE_NAME, clean_html(error_massage)), expected=True)
self.IE_NAME, clean_html(error)), expected=True)
title = self._search_regex(
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
@@ -180,48 +198,55 @@ class TVPEmbedIE(InfoExtractor):
}
class TVPSeriesIE(InfoExtractor):
class TVPWebsiteIE(InfoExtractor):
IE_NAME = 'tvp:series'
_VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'
_VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)'
_TESTS = [{
'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem',
# series
'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video',
'info_dict': {
'title': 'Ogniem i mieczem',
'id': '4278026',
'id': '38678312',
},
'playlist_count': 4,
'playlist_count': 115,
}, {
'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat',
# film
'url': 'https://vod.tvp.pl/website/gloria,35139666',
'info_dict': {
'title': 'Boso przez świat',
'id': '9329207',
'id': '36637049',
'ext': 'mp4',
'title': 'Gloria, Gloria',
},
'playlist_count': 86,
'params': {
'skip_download': True,
},
'add_ie': ['TVPEmbed'],
}, {
'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312',
'only_matching': True,
}]
def _entries(self, display_id, playlist_id):
    """Yield one TVPEmbed url_result per video linked from the listing pages.

    Requests https://vod.tvp.pl/website/<display_id>,<playlist_id>/video
    with page=1, 2, ... and stops at the first page that contains no
    matching video links.

    display_id -- slug part of the website URL; also expected as the
                  first component of every /video/ link on the page
    playlist_id -- numeric id part of the website URL
    """
    url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id)
    # display_id is taken verbatim from the URL, so escape it before
    # embedding it in a pattern; otherwise a slug containing a regex
    # metacharacter would mis-match or make the pattern invalid.
    video_link_re = (
        r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % re.escape(display_id))
    for page_num in itertools.count(1):
        page = self._download_webpage(
            url, display_id, 'Downloading page %d' % page_num,
            query={'page': page_num})
        # orderedSet presumably drops repeated ids while keeping page
        # order (helper imported from ..utils) -- confirm there.
        video_ids = orderedSet(re.findall(video_link_re, page))
        if not video_ids:
            break
        for video_id in video_ids:
            yield self.url_result(
                'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(),
                video_id=video_id)
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id, tries=5)
title = self._html_search_regex(
r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)</span>', webpage, 'series')
playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id')
playlist = self._download_webpage(
'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend'
'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5,
note='Downloading playlist')
videos_paths = re.findall(
'(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)
entries = [
self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key())
for v_path in videos_paths]
return {
'_type': 'playlist',
'id': playlist_id,
'display_id': display_id,
'title': title,
'entries': entries,
}
mobj = re.match(self._VALID_URL, url)
display_id, playlist_id = mobj.group('display_id', 'id')
return self.playlist_result(
self._entries(display_id, playlist_id), playlist_id)

View File

@@ -493,10 +493,9 @@ class TVPlayHomeIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_id = self._search_regex(
r'data-asset-id\s*=\s*["\'](\d{5,7})\b', webpage, 'video id',
default=None)
r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id')
if video_id:
if len(video_id) < 8:
return self.url_result(
'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id)
@@ -537,8 +536,9 @@ class TVPlayHomeIE(InfoExtractor):
r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number',
default=None))
episode = self._search_regex(
r'(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'episode',
default=None, group='value')
(r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
r'data-subtitle\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
'episode', default=None, group='value')
episode_number = int_or_none(self._search_regex(
r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number',
default=None))

View File

@@ -136,7 +136,12 @@ class TwitchBaseIE(InfoExtractor):
source = next(f for f in formats if f['format_id'] == 'Source')
source['preference'] = 10
except StopIteration:
pass # No Source stream present
for f in formats:
if '/chunked/' in f['url']:
f.update({
'source_preference': 10,
'format_note': 'Source',
})
self._sort_formats(formats)

View File

@@ -29,7 +29,7 @@ class UdemyIE(InfoExtractor):
IE_NAME = 'udemy'
_VALID_URL = r'''(?x)
https?://
www\.udemy\.com/
(?:[^/]+\.)?udemy\.com/
(?:
[^#]+\#/lecture/|
lecture/view/?\?lectureId=|
@@ -64,6 +64,9 @@ class UdemyIE(InfoExtractor):
# only outputs rendition
'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0',
'only_matching': True,
}, {
'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757',
'only_matching': True,
}]
def _extract_course_info(self, webpage, video_id):
@@ -123,10 +126,22 @@ class UdemyIE(InfoExtractor):
def _download_webpage_handle(self, *args, **kwargs):
headers = kwargs.get('headers', {}).copy()
headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4'
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
kwargs['headers'] = headers
return super(UdemyIE, self)._download_webpage_handle(
ret = super(UdemyIE, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs))
if not ret:
return ret
webpage, _ = ret
if any(p in webpage for p in (
'>Please verify you are a human',
'Access to this page has been denied because we believe you are using automation tools to browse the website',
'"_pxCaptcha"')):
raise ExtractorError(
'Udemy asks you to solve a CAPTCHA. Login with browser, '
'solve CAPTCHA, then export cookies and pass cookie file to '
'youtube-dl with --cookies.', expected=True)
return ret
def _download_json(self, url_or_request, *args, **kwargs):
headers = {
@@ -403,8 +418,14 @@ class UdemyIE(InfoExtractor):
class UdemyCourseIE(UdemyIE):
IE_NAME = 'udemy:course'
_VALID_URL = r'https?://(?:www\.)?udemy\.com/(?P<id>[^/?#&]+)'
_TESTS = []
_VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.udemy.com/java-tutorial/',
'only_matching': True,
}, {
'url': 'https://wipro.udemy.com/java-tutorial/',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):

View File

@@ -2,18 +2,31 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import unified_timestamp
class URPlayIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ur(?:play|skola)\.se/(?:program|Produkter)/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://urplay.se/program/190031-tripp-trapp-trad-sovkudde',
'md5': 'ad5f0de86f16ca4c8062cd103959a9eb',
'url': 'https://urplay.se/program/203704-ur-samtiden-livet-universum-och-rymdens-markliga-musik-om-vetenskap-kritiskt-tankande-och-motstand',
'md5': 'ff5b0c89928f8083c74bbd5099c9292d',
'info_dict': {
'id': '203704',
'ext': 'mp4',
'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',
'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
'timestamp': 1513512768,
'upload_date': '20171217',
},
}, {
'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
'info_dict': {
'id': '190031',
'ext': 'mp4',
'title': 'Tripp, Trapp, Träd : Sovkudde',
'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
'timestamp': 1440093600,
'upload_date': '20150820',
},
}, {
'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden',
@@ -51,6 +64,7 @@ class URPlayIE(InfoExtractor):
'title': urplayer_data['title'],
'description': self._og_search_description(webpage),
'thumbnail': urplayer_data.get('image'),
'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')),
'series': urplayer_data.get('series_title'),
'subtitles': subtitles,
'formats': formats,

View File

@@ -502,7 +502,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
orig_url = url
if mobj.group('pro') or mobj.group('player'):
if mobj.group('pro'):
# some videos require portfolio_id to be present in player url
# https://github.com/rg3/youtube-dl/issues/20070
url = self._extract_url(url, self._download_webpage(url, video_id))
elif mobj.group('player'):
url = 'https://player.vimeo.com/video/' + video_id
elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
url = 'https://vimeo.com/' + video_id

View File

@@ -1,123 +0,0 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
parse_duration,
str_to_int,
urljoin,
)
class VpornIE(InfoExtractor):
    """Extractor for single video pages on vporn.com.

    The URL carries both a slug (``display_id``) and a numeric ``id``.
    All metadata and format URLs are scraped with regular expressions
    from the downloaded video page, including its ``flashvars.*``
    assignments.
    """
    _VALID_URL = r'https?://(?:www\.)?vporn\.com/[^/]+/(?P<display_id>[^/]+)/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'http://www.vporn.com/masturbation/violet-on-her-th-birthday/497944/',
            'md5': 'facf37c1b86546fa0208058546842c55',
            'info_dict': {
                'id': '497944',
                'display_id': 'violet-on-her-th-birthday',
                'ext': 'mp4',
                'title': 'Violet on her 19th birthday',
                'description': 'Violet dances in front of the camera which is sure to get you horny.',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'kileyGrope',
                'categories': ['Masturbation', 'Teen'],
                'duration': 393,
                'age_limit': 18,
                'view_count': int,
            },
            'skip': 'video removed',
        },
        {
            'url': 'http://www.vporn.com/female/hana-shower/523564/',
            'md5': 'ced35a4656198a1664cf2cda1575a25f',
            'info_dict': {
                'id': '523564',
                'display_id': 'hana-shower',
                'ext': 'mp4',
                'title': 'Hana Shower',
                'description': 'Hana showers at the bathroom.',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Hmmmmm',
                'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female', '720p'],
                'duration': 588,
                'age_limit': 18,
                'view_count': int,
            }
        },
    ]

    def _real_extract(self, url):
        """Download the video page for *url* and build its info dict.

        Raises ExtractorError (expected=True) when the page carries the
        site's "video has been deleted" notice.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        display_id = mobj.group('display_id')

        webpage = self._download_webpage(url, display_id)

        # Report removed videos as an expected error instead of failing
        # later on the (mandatory) title lookup.
        errmsg = 'This video has been deleted due to Copyright Infringement or by the account owner!'
        if errmsg in webpage:
            raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True)

        title = self._html_search_regex(
            r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip()
        description = self._html_search_regex(
            r'class="(?:descr|description_txt)">(.*?)</div>',
            webpage, 'description', fatal=False)
        # The field label used to read 'description' (copy-paste slip);
        # it is only used for diagnostics, so label it as what it is.
        thumbnail = urljoin('http://www.vporn.com', self._html_search_regex(
            r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'thumbnail',
            default=None))

        uploader = self._html_search_regex(
            r'(?s)Uploaded by:.*?<a href="/user/[^"]+"[^>]*>(.+?)</a>',
            webpage, 'uploader', fatal=False)

        categories = re.findall(r'<a href="/cat/[^"]+"[^>]*>([^<]+)</a>', webpage)

        duration = parse_duration(self._search_regex(
            r'Runtime:\s*</span>\s*(\d+ min \d+ sec)',
            webpage, 'duration', fatal=False))

        view_count = str_to_int(self._search_regex(
            r'class="views">([\d,\.]+) [Vv]iews<',
            webpage, 'view count', fatal=False))
        comment_count = str_to_int(self._html_search_regex(
            r"'Comments \(([\d,\.]+)\)'",
            webpage, 'comment count', default=None))

        formats = []

        # Each match is (suffix after "flashvars.videoUrl", absolute URL);
        # the suffix doubles as the format id.
        for format_id, video_url in re.findall(
                r'flashvars\.videoUrl([^=]+?)\s*=\s*"(https?://[^"]+)"', webpage):
            fmt = {
                'url': video_url,
                'format_id': format_id,
            }
            # File names ending in _<width>x<height>_<vbr>k.mp4 carry the
            # resolution and video bitrate.
            m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)_(?P<vbr>\d+)k\.mp4$', video_url)
            if m:
                fmt.update({
                    'width': int(m.group('width')),
                    'height': int(m.group('height')),
                    'vbr': int(m.group('vbr')),
                })
            formats.append(fmt)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'categories': categories,
            'duration': duration,
            'view_count': view_count,
            'comment_count': comment_count,
            'age_limit': 18,
            'formats': formats,
        }

View File

@@ -48,7 +48,7 @@ class VShareIE(InfoExtractor):
webpage = self._download_webpage(
'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
video_id)
video_id, headers={'Referer': url})
title = self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title')

View File

@@ -352,6 +352,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/|
(?:www\.)?invidio\.us/|
(?:www\.)?invidious\.snopyta\.org/|
(?:www\.)?invidious\.kabi\.tk/|
(?:www\.)?vid\.wxzm\.sx/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:

View File

@@ -217,11 +217,13 @@ class FFmpegPostProcessor(PostProcessor):
encodeArgument('-i'),
encodeFilename(self._ffmpeg_filename_argument(path), True)
])
cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] +
[encodeArgument('-loglevel'), encodeArgument('repeat+info')] +
files_cmd +
[encodeArgument(o) for o in opts] +
[encodeFilename(self._ffmpeg_filename_argument(out_path), True)])
cmd = [encodeFilename(self.executable, True), encodeArgument('-y')]
# avconv does not have repeat option
if self.basename == 'ffmpeg':
cmd += [encodeArgument('-loglevel'), encodeArgument('repeat+info')]
cmd += (files_cmd +
[encodeArgument(o) for o in opts] +
[encodeFilename(self._ffmpeg_filename_argument(out_path), True)])
if self._downloader.params.get('verbose', False):
self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd))

View File

@@ -184,7 +184,7 @@ DATE_FORMATS_MONTH_FIRST.extend([
])
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
def preferredencoding():
@@ -1141,6 +1141,8 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
_HTTPONLY_PREFIX = '#HttpOnly_'
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
# Store session cookies with `expires` set to 0 instead of an empty
# string
@@ -1150,7 +1152,21 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)
def load(self, filename=None, ignore_discard=False, ignore_expires=False):
compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires)
"""Load cookies from a file."""
if filename is None:
if self.filename is not None:
filename = self.filename
else:
raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
cf = io.StringIO()
with open(filename) as f:
for line in f:
if line.startswith(self._HTTPONLY_PREFIX):
line = line[len(self._HTTPONLY_PREFIX):]
cf.write(compat_str(line))
cf.seek(0)
self._really_load(cf, filename, ignore_discard, ignore_expires)
# Session cookies are denoted by either `expires` field set to
# an empty string or 0. MozillaCookieJar only recognizes the former
# (see [1]). So we need force the latter to be recognized as session

View File

@@ -1,3 +1,3 @@
from __future__ import unicode_literals
__version__ = '2019.01.30'
__version__ = '2019.03.09'