diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 4eb505231..e5405c235 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.06.06 + [debug] youtube-dl version 2021.12.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 9fed0b489..33b01ce7f 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 573e8ded0..285610cc7 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index c0031bf7a..af73525fb 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.06.06 + [debug] youtube-dl version 2021.12.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 1138ab2ca..42c878b83 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.06.06** +- [ ] I've verified that I'm running youtube-dl version **2021.12.17** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..3ba13e0ce --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 90bd63c32..a609f3704 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,12 +15,12 @@ jobs: run-tests-ext: [sh] include: # python 3.2 is only available on windows via setup-python - - os: windows-latest + - os: windows-2019 python-version: 3.2 python-impl: cpython ytdl-test-set: core run-tests-ext: bat - - os: windows-latest + - os: windows-2019 python-version: 3.2 python-impl: cpython ytdl-test-set: download diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 58ab3a4b8..ff40cef78 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -150,7 +150,7 @@ After you have ensured this site is distributing its content legally, you can fo # TODO more properties (see youtube_dl/extractor/common.py) } ``` -5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). +5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). This makes the extractor available for use, as long as the class ends with `IE`. 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): diff --git a/ChangeLog b/ChangeLog index 680fffdf8..658864282 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version 2021.12.17 + +Core +* [postprocessor/ffmpeg] Show ffmpeg output on error (#22680, #29336) + +Extractors +* [youtube] Update signature function patterns (#30363, #30366) +* [peertube] Only call description endpoint if necessary (#29383) +* [periscope] Pass referer to HLS requests (#29419) +- [liveleak] Remove extractor (#17625, #24222, #29331) ++ [pornhub] Add support for pornhubthbh7ap3u.onion +* [pornhub] Detect geo restriction +* [pornhub] Dismiss tbr extracted from download URLs (#28927) +* [curiositystream:collection] Extend _VALID_URL (#26326, #29117) +* [youtube] Make get_video_info processing more robust (#29333) +* [youtube] Workaround for get_video_info request (#29333) +* [bilibili] Strip uploader name (#29202) +* [youtube] Update invidious instance list (#29281) +* [umg:de] Update GraphQL API URL (#29304) +* [nrk] Switch psapi URL to https (#29344) ++ [egghead] Add support for app.egghead.io (#28404, #29303) +* [appleconnect] Fix extraction (#29208) ++ [orf:tvthek] Add support for MPD formats (#28672, #29236) + + version 2021.06.06 Extractors diff --git a/README.md b/README.md index 2841ed68f..6e07ddb1c 100644 --- a/README.md +++ b/README.md @@ -632,7 +632,7 @@ To use percent literals in an output template use `%%`. To output to stdout use The current default template is `%(title)s-%(id)s.%(ext)s`. -In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title: +In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title. #### Output template and Windows batch files @@ -1069,9 +1069,11 @@ After you have ensured this site is distributing its content legally, you can fo } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test (actually, test case) then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note: + * the test names use the extractor class name **without the trailing `IE`** + * tests with `only_matching` key in test's dict are not counted. +8. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. +9. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 youtube_dl/extractor/yourextractor.py diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 878ae72b1..edc19183d 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -13,6 +13,11 @@ sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) lazy_extractors_filename = sys.argv[1] if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) +# Py2: may be confused by leftover lazy_extractors.pyc +try: + os.remove(lazy_extractors_filename + 'c') +except OSError: + pass from youtube_dl.extractor import _ALL_CLASSES from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor @@ -22,7 +27,10 @@ with open('devscripts/lazy_load_template.py', 'rt') as f: module_contents = [ module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', - 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] + 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n', + # needed for suitable() methods of Youtube extractor (see #28780) + 'from youtube_dl.utils import parse_qs\n', +] ie_template = ''' class {name}({bases}): diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ed0d5e9d9..ae2a6b8b0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -472,8 +472,6 @@ - **LinuxAcademy** - **LiTV** - **LiveJournal** - - **LiveLeak** - - **LiveLeakEmbed** - **livestream** - **livestream:original** - **LnkGo** diff --git a/test/helper.py b/test/helper.py index e62aab11e..883b2e877 100644 --- a/test/helper.py +++ b/test/helper.py @@ -89,6 +89,17 @@ class FakeYDL(YoutubeDL): self.report_warning = types.MethodType(report_warning, self) +class FakeLogger(object): + def debug(self, msg): + pass + + def warning(self, msg): + pass + + def error(self, msg): + pass + + def gettestcases(include_onlymatching=False): for ie in youtube_dl.extractor.gen_extractors(): for tc in ie.get_testcases(include_onlymatching): @@ -128,6 +139,12 @@ def expect_value(self, got, expected, field): self.assertTrue( contains_str in got, 'field %s (value: %r) should contain %r' % (field, got, contains_str)) + elif isinstance(expected, compat_str) and re.match(r'^lambda \w+:', expected): + fn = eval(expected) + suite = expected.split(':', 1)[1].strip() + self.assertTrue( + fn(got), + 'Expected field %s to meet condition %s, but value %r failed ' % (field, suite, got)) elif isinstance(expected, type): self.assertTrue( isinstance(got, expected), @@ -137,7 +154,7 @@ def expect_value(self, got, expected, field): elif isinstance(expected, list) and isinstance(got, list): self.assertEqual( len(expected), len(got), - 'Expect a list of length %d, but got a list of length %d for field %s' % ( + 'Expected a list of length %d, but got a list of length %d for field %s' % ( len(expected), len(got), field)) for index, (item_got, item_expected) in enumerate(zip(got, expected)): type_got = type(item_got) diff --git a/test/parameters.json b/test/parameters.json index 65fd54428..864c9d130 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -18,7 +18,6 @@ "noprogress": false, "outtmpl": "%(id)s.%(ext)s", "password": null, - "playlistend": -1, "playliststart": 1, "prefer_free_formats": false, "quiet": false, diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index dd69a681b..6d25441db 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -35,13 +35,13 @@ class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler) assert False -class TestIE(InfoExtractor): +class DummyIE(InfoExtractor): pass class TestInfoExtractor(unittest.TestCase): def setUp(self): - self.ie = TestIE(FakeYDL()) + self.ie = DummyIE(FakeYDL()) def test_ie_key(self): self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE) @@ -62,6 +62,7 @@ class TestInfoExtractor(unittest.TestCase): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') @@ -74,6 +75,7 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar') self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True) + self.assertEqual(ie._og_search_property('test4', html), 'unquoted-value') def test_html_search_meta(self): ie = self.ie diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a35effe0e..f8c8e619c 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -997,6 +997,25 @@ class TestYoutubeDL(unittest.TestCase): self.assertEqual(downloaded['extractor'], 'Video') self.assertEqual(downloaded['extractor_key'], 'Video') + def test_default_times(self): + """Test addition of missing upload/release/_date from /release_/timestamp""" + info = { + 'id': '1234', + 'url': TEST_URL, + 'title': 'Title', + 'ext': 'mp4', + 'timestamp': 1631352900, + 'release_timestamp': 1632995931, + } + + params = {'simulate': True, } + ydl = FakeYDL(params) + out_info = ydl.process_ie_result(info) + self.assertTrue(isinstance(out_info['upload_date'], compat_str)) + self.assertEqual(out_info['upload_date'], '20210911') + self.assertTrue(isinstance(out_info['release_date'], compat_str)) + self.assertEqual(out_info['release_date'], '20210930') + if __name__ == '__main__': unittest.main() diff --git a/test/test_aes.py b/test/test_aes.py index cc89fb6ab..0f181466b 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -8,7 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_cbc_encrypt, aes_decrypt_text +from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_cbc_encrypt, aes_decrypt_text, aes_ecb_encrypt from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes import base64 @@ -58,6 +58,13 @@ class TestAES(unittest.TestCase): decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) + def test_ecb_encrypt(self): + data = bytes_to_intlist(self.secret_msg) + encrypted = intlist_to_bytes(aes_ecb_encrypt(data, self.key)) + self.assertEqual( + encrypted, + b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') + if __name__ == '__main__': unittest.main() diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index 6f5513faa..db98494ab 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -11,6 +11,7 @@ from test.helper import try_rm from youtube_dl import YoutubeDL +from youtube_dl.utils import DownloadError def _download_restricted(url, filename, age): @@ -26,7 +27,10 @@ def _download_restricted(url, filename, age): ydl.add_default_info_extractors() json_filename = os.path.splitext(filename)[0] + '.info.json' try_rm(json_filename) - ydl.download([url]) + try: + ydl.download([url]) + except DownloadError: + try_rm(json_filename) res = os.path.exists(json_filename) try_rm(json_filename) return res @@ -38,12 +42,12 @@ class TestAgeRestriction(unittest.TestCase): self.assertFalse(_download_restricted(url, filename, age)) def test_youtube(self): - self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10) + self._assert_restricted('HtVdAasjOgU', 'HtVdAasjOgU.mp4', 10) def test_youporn(self): self._assert_restricted( - 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', - '505835.mp4', 2, old_age=25) + 'https://www.youporn.com/watch/16715086/sex-ed-in-detention-18-asmr/', + '16715086.mp4', 2, old_age=25) if __name__ == '__main__': diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 365b66bad..26df356b4 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -66,9 +66,9 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) - # def test_youtube_search_matching(self): - # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) - # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) + def test_youtube_search_matching(self): + self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) + self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) def test_facebook_matching(self): self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) diff --git a/test/test_cache.py b/test/test_cache.py index a16160142..931074aa1 100644 --- a/test/test_cache.py +++ b/test/test_cache.py @@ -3,17 +3,18 @@ from __future__ import unicode_literals -import shutil - # Allow direct execution import os import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import shutil from test.helper import FakeYDL from youtube_dl.cache import Cache +from youtube_dl.utils import version_tuple +from youtube_dl.version import __version__ def _is_empty(d): @@ -54,6 +55,17 @@ class TestCache(unittest.TestCase): self.assertFalse(os.path.exists(self.test_dir)) self.assertEqual(c.load('test_cache', 'k.'), None) + def test_cache_validation(self): + ydl = FakeYDL({ + 'cachedir': self.test_dir, + }) + c = Cache(ydl) + obj = {'x': 1, 'y': ['ä', '\\a', True]} + c.store('test_cache', 'k.', obj) + self.assertEqual(c.load('test_cache', 'k.', min_ver='1970.01.01'), obj) + new_version = '.'.join(('%d' % ((v + 1) if i == 0 else v, )) for i, v in enumerate(version_tuple(__version__))) + self.assertIs(c.load('test_cache', 'k.', min_ver=new_version), None) + if __name__ == '__main__': unittest.main() diff --git a/test/test_compat.py b/test/test_compat.py index 86ff389fd..e233b1ae1 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.compat import ( + compat_casefold, compat_getenv, compat_setenv, compat_etree_Element, @@ -47,10 +48,11 @@ class TestCompat(unittest.TestCase): def test_all_present(self): import youtube_dl.compat - all_names = youtube_dl.compat.__all__ - present_names = set(filter( + all_names = sorted( + youtube_dl.compat.__all__ + youtube_dl.compat.legacy) + present_names = set(map(compat_str, filter( lambda c: '_' in c and not c.startswith('_'), - dir(youtube_dl.compat))) - set(['unicode_literals']) + dir(youtube_dl.compat)))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) def test_compat_urllib_parse_unquote(self): @@ -118,9 +120,21 @@ class TestCompat(unittest.TestCase): ''' compat_etree_fromstring(xml) - def test_struct_unpack(self): + def test_compat_struct_unpack(self): self.assertEqual(compat_struct_unpack('!B', b'\x00'), (0,)) + def test_compat_casefold(self): + if hasattr(compat_str, 'casefold'): + # don't bother to test str.casefold() (again) + return + # thanks https://bugs.python.org/file24232/casefolding.patch + self.assertEqual(compat_casefold('hello'), 'hello') + self.assertEqual(compat_casefold('hELlo'), 'hello') + self.assertEqual(compat_casefold('ß'), 'ss') + self.assertEqual(compat_casefold('fi'), 'fi') + self.assertEqual(compat_casefold('\u03a3'), '\u03c3') + self.assertEqual(compat_casefold('A\u0345\u03a3'), 'a\u03b9\u03c3') + if __name__ == '__main__': unittest.main() diff --git a/test/test_download.py b/test/test_download.py index ebe820dfc..19936969f 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -33,6 +33,7 @@ from youtube_dl.compat import ( from youtube_dl.utils import ( DownloadError, ExtractorError, + error_to_compat_str, format_bytes, UnavailableVideoError, ) @@ -100,27 +101,28 @@ def generator(test_case, tname): def print_skipping(reason): print('Skipping %s: %s' % (test_case['name'], reason)) + self.skipTest(reason) + if not ie.working(): print_skipping('IE marked as not _WORKING') - return for tc in test_cases: info_dict = tc.get('info_dict', {}) if not (info_dict.get('id') and info_dict.get('ext')): - raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?') + raise Exception('Test definition (%s) requires both \'id\' and \'ext\' keys present to define the output file' % (tname, )) if 'skip' in test_case: print_skipping(test_case['skip']) - return + for other_ie in other_ies: if not other_ie.working(): print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key()) - return params = get_params(test_case.get('params', {})) params['outtmpl'] = tname + '_' + params['outtmpl'] if is_playlist and 'playlist' not in test_case: params.setdefault('extract_flat', 'in_playlist') + params.setdefault('playlistend', test_case.get('playlist_mincount')) params.setdefault('skip_download', True) ydl = YoutubeDL(params, auto_init=False) @@ -160,7 +162,9 @@ def generator(test_case, tname): except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): - raise + msg = getattr(err, 'msg', error_to_compat_str(err)) + err.msg = '%s (%s)' % (msg, tname, ) + raise err if try_num == RETRIES: report_warning('%s failed due to network errors, skipping...' % tname) diff --git a/test/test_downloader_external.py b/test/test_downloader_external.py new file mode 100644 index 000000000..c0239502b --- /dev/null +++ b/test/test_downloader_external.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# coding: utf-8 +from __future__ import unicode_literals + +# Allow direct execution +import os +import re +import sys +import subprocess +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import ( + FakeLogger, + http_server_port, + try_rm, +) +from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_http_server +from youtube_dl.utils import encodeFilename +from youtube_dl.downloader.external import Aria2pFD +import threading + +TEST_DIR = os.path.dirname(os.path.abspath(__file__)) + + +TEST_SIZE = 10 * 1024 + + +class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def send_content_range(self, total=None): + range_header = self.headers.get('Range') + start = end = None + if range_header: + mobj = re.match(r'bytes=(\d+)-(\d+)', range_header) + if mobj: + start, end = (int(mobj.group(i)) for i in (1, 2)) + valid_range = start is not None and end is not None + if valid_range: + content_range = 'bytes %d-%d' % (start, end) + if total: + content_range += '/%d' % total + self.send_header('Content-Range', content_range) + return (end - start + 1) if valid_range else total + + def serve(self, range=True, content_length=True): + self.send_response(200) + self.send_header('Content-Type', 'video/mp4') + size = TEST_SIZE + if range: + size = self.send_content_range(TEST_SIZE) + if content_length: + self.send_header('Content-Length', size) + self.end_headers() + self.wfile.write(b'#' * size) + + def do_GET(self): + if self.path == '/regular': + self.serve() + elif self.path == '/no-content-length': + self.serve(content_length=False) + elif self.path == '/no-range': + self.serve(range=False) + elif self.path == '/no-range-no-content-length': + self.serve(range=False, content_length=False) + else: + assert False, 'unrecognised server path' + + +@unittest.skipUnless(Aria2pFD.available(), 'aria2p module not found') +class TestAria2pFD(unittest.TestCase): + def setUp(self): + self.httpd = compat_http_server.HTTPServer( + ('127.0.0.1', 0), HTTPTestRequestHandler) + self.port = http_server_port(self.httpd) + self.server_thread = threading.Thread(target=self.httpd.serve_forever) + self.server_thread.daemon = True + self.server_thread.start() + + def download(self, params, ep): + with subprocess.Popen( + ['aria2c', '--enable-rpc'], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL + ) as process: + if not process.poll(): + filename = 'testfile.mp4' + params['logger'] = FakeLogger() + params['outtmpl'] = filename + ydl = YoutubeDL(params) + try_rm(encodeFilename(filename)) + self.assertEqual(ydl.download(['http://127.0.0.1:%d/%s' % (self.port, ep)]), 0) + self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE) + try_rm(encodeFilename(filename)) + process.kill() + + def download_all(self, params): + for ep in ('regular', 'no-content-length', 'no-range', 'no-range-no-content-length'): + self.download(params, ep) + + def test_regular(self): + self.download_all({'external_downloader': 'aria2p'}) + + def test_chunked(self): + self.download_all({ + 'external_downloader': 'aria2p', + 'http_chunk_size': 1000, + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index 750472281..4e6d7a2a0 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -9,7 +9,11 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import http_server_port, try_rm +from test.helper import ( + FakeLogger, + http_server_port, + try_rm, +) from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server from youtube_dl.downloader.http import HttpFD @@ -66,17 +70,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): assert False -class FakeLogger(object): - def debug(self, msg): - pass - - def warning(self, msg): - pass - - def error(self, msg): - pass - - class TestHttpFD(unittest.TestCase): def setUp(self): self.httpd = compat_http_server.HTTPServer( diff --git a/test/test_execution.py b/test/test_execution.py index 32948d93e..704e14612 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -40,14 +40,16 @@ class TestExecution(unittest.TestCase): self.assertFalse(stderr) def test_lazy_extractors(self): + lazy_extractors = 'youtube_dl/extractor/lazy_extractors.py' try: - subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', lazy_extractors], cwd=rootDir, stdout=_DEV_NULL) subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) finally: - try: - os.remove('youtube_dl/extractor/lazy_extractors.py') - except (IOError, OSError): - pass + for x in ['', 'c'] if sys.version_info[0] < 3 else ['']: + try: + os.remove(lazy_extractors + x) + except (IOError, OSError): + pass if __name__ == '__main__': diff --git a/test/test_http.py b/test/test_http.py index 3ee0a5dda..487a9bc77 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,7 +8,10 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import http_server_port +from test.helper import ( + FakeLogger, + http_server_port, +) from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl @@ -52,17 +55,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): assert False -class FakeLogger(object): - def debug(self, msg): - pass - - def warning(self, msg): - pass - - def error(self, msg): - pass - - class TestHTTP(unittest.TestCase): def setUp(self): self.httpd = compat_http_server.HTTPServer( diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c24b8ca74..b5962356c 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -8,7 +8,10 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.jsinterp import JSInterpreter +import math +import re + +from youtube_dl.jsinterp import JS_Undefined, JSInterpreter class TestJSInterpreter(unittest.TestCase): @@ -19,6 +22,9 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function x3(){return 42;}') self.assertEqual(jsi.call_function('x3'), 42) + jsi = JSInterpreter('function x3(){42}') + self.assertEqual(jsi.call_function('x3'), None) + jsi = JSInterpreter('var x5 = function(){return 42;}') self.assertEqual(jsi.call_function('x5'), 42) @@ -45,14 +51,32 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function f(){return 1 << 5;}') self.assertEqual(jsi.call_function('f'), 32) + jsi = JSInterpreter('function f(){return 2 ** 5}') + self.assertEqual(jsi.call_function('f'), 32) + jsi = JSInterpreter('function f(){return 19 & 21;}') self.assertEqual(jsi.call_function('f'), 17) jsi = JSInterpreter('function f(){return 11 >> 2;}') self.assertEqual(jsi.call_function('f'), 2) + jsi = JSInterpreter('function f(){return []? 2+3: 4;}') + self.assertEqual(jsi.call_function('f'), 5) + + jsi = JSInterpreter('function f(){return 1 == 2}') + self.assertEqual(jsi.call_function('f'), False) + + jsi = JSInterpreter('function f(){return 0 && 1 || 2;}') + self.assertEqual(jsi.call_function('f'), 2) + + jsi = JSInterpreter('function f(){return 0 ?? 42;}') + self.assertEqual(jsi.call_function('f'), 0) + + jsi = JSInterpreter('function f(){return "life, the universe and everything" < 42;}') + self.assertFalse(jsi.call_function('f')) + def test_array_access(self): - jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2] = 7; return x;}') + jsi = JSInterpreter('function f(){var x = [1,2,3]; x[0] = 4; x[0] = 5; x[2.0] = 7; return x;}') self.assertEqual(jsi.call_function('f'), [5, 2, 7]) def test_parens(self): @@ -62,6 +86,10 @@ class TestJSInterpreter(unittest.TestCase): jsi = JSInterpreter('function f(){return (1 + 2) * 3;}') self.assertEqual(jsi.call_function('f'), 9) + def test_quotes(self): + jsi = JSInterpreter(r'function f(){return "a\"\\("}') + self.assertEqual(jsi.call_function('f'), r'a"\(') + def test_assignments(self): jsi = JSInterpreter('function f(){var x = 20; x = 30 + 1; return x;}') self.assertEqual(jsi.call_function('f'), 31) @@ -104,13 +132,358 @@ class TestJSInterpreter(unittest.TestCase): }''') self.assertEqual(jsi.call_function('x'), [20, 20, 30, 40, 50]) + def test_builtins(self): + jsi = JSInterpreter(''' + function x() { return NaN } + ''') + self.assertTrue(math.isnan(jsi.call_function('x'))) + + def test_Date(self): + jsi = JSInterpreter(''' + function x() { return new Date('Wednesday 31 December 1969 18:01:26 MDT') - 0; } + ''') + self.assertEqual(jsi.call_function('x'), 86000) + + jsi = JSInterpreter(''' + function x(dt) { return new Date(dt) - 0; } + ''') + self.assertEqual(jsi.call_function('x', 'Wednesday 31 December 1969 18:01:26 MDT'), 86000) + + # date format m/d/y + jsi = JSInterpreter(''' + function x() { return new Date('12/31/1969 18:01:26 MDT') - 0; } + ''') + self.assertEqual(jsi.call_function('x'), 86000) + def test_call(self): jsi = JSInterpreter(''' function x() { return 2; } - function y(a) { return x() + a; } + function y(a) { return x() + (a?a:0); } function z() { return y(3); } ''') self.assertEqual(jsi.call_function('z'), 5) + self.assertEqual(jsi.call_function('y'), 2) + + def test_if(self): + jsi = JSInterpreter(''' + function x() { + let a = 9; + if (0==0) {a++} + return a + }''') + self.assertEqual(jsi.call_function('x'), 10) + + jsi = JSInterpreter(''' + function x() { + if (0==0) {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + + jsi = JSInterpreter(''' + function x() { + if (0!=0) {return 1} + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + + """ # Unsupported + jsi = JSInterpreter(''' + function x() { + if (0!=0) return 1; + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + """ + + def test_elseif(self): + jsi = JSInterpreter(''' + function x() { + if (0!=0) {return 1} + else if (1==0) {return 2} + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + + """ # Unsupported + jsi = JSInterpreter(''' + function x() { + if (0!=0) return 1; + else if (1==0) {return 2} + else {return 10} + }''') + self.assertEqual(jsi.call_function('x'), 10) + # etc + """ + + def test_for_loop(self): + # function x() { a=0; for (i=0; i-10; i++) {a++} a } + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) {a++} return a } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_while_loop(self): + # function x() { a=0; while (a<10) {a++} a } + jsi = JSInterpreter(''' + function x() { a=0; while (a<10) {a++} return a } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_switch(self): + jsi = JSInterpreter(''' + function x(f) { switch(f){ + case 1:f+=1; + case 2:f+=2; + case 3:f+=3;break; + case 4:f+=4; + default:f=0; + } return f } + ''') + self.assertEqual(jsi.call_function('x', 1), 7) + self.assertEqual(jsi.call_function('x', 3), 6) + self.assertEqual(jsi.call_function('x', 5), 0) + + def test_switch_default(self): + jsi = JSInterpreter(''' + function x(f) { switch(f){ + case 2: f+=2; + default: f-=1; + case 5: + case 6: f+=6; + case 0: break; + case 1: f+=1; + } return f } + ''') + self.assertEqual(jsi.call_function('x', 1), 2) + self.assertEqual(jsi.call_function('x', 5), 11) + self.assertEqual(jsi.call_function('x', 9), 14) + + def test_try(self): + jsi = JSInterpreter(''' + function x() { try{return 10} catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_catch(self): + jsi = JSInterpreter(''' + function x() { try{throw 10} catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 5) + + def test_finally(self): + jsi = JSInterpreter(''' + function x() { try{throw 10} finally {return 42} } + ''') + self.assertEqual(jsi.call_function('x'), 42) + jsi = JSInterpreter(''' + function x() { try{throw 10} catch(e){return 5} finally {return 42} } + ''') + self.assertEqual(jsi.call_function('x'), 42) + + def test_nested_try(self): + jsi = JSInterpreter(''' + function x() {try { + try{throw 10} finally {throw 42} + } catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 5) + + def test_for_loop_continue(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) { continue; a++ } return a } + ''') + self.assertEqual(jsi.call_function('x'), 0) + + def test_for_loop_break(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) { break; a++ } return a } + ''') + self.assertEqual(jsi.call_function('x'), 0) + + def test_for_loop_try(self): + jsi = JSInterpreter(''' + function x() { + for (i=0; i-10; i++) { try { if (i == 5) throw i} catch {return 10} finally {break} }; + return 42 } + ''') + self.assertEqual(jsi.call_function('x'), 42) + + def test_literal_list(self): + jsi = JSInterpreter(''' + function x() { return [1, 2, "asdf", [5, 6, 7]][3] } + ''') + self.assertEqual(jsi.call_function('x'), [5, 6, 7]) + + def test_comma(self): + jsi = JSInterpreter(''' + function x() { a=5; a -= 1, a+=3; return a } + ''') + self.assertEqual(jsi.call_function('x'), 7) + jsi = JSInterpreter(''' + function x() { a=5; return (a -= 1, a+=3, a); } + ''') + self.assertEqual(jsi.call_function('x'), 7) + + jsi = JSInterpreter(''' + function x() { return (l=[0,1,2,3], function(a, b){return a+b})((l[1], l[2]), l[3]) } + ''') + self.assertEqual(jsi.call_function('x'), 5) + + def test_void(self): + jsi = JSInterpreter(''' + function x() { return void 42; } + ''') + self.assertEqual(jsi.call_function('x'), None) + + def test_return_function(self): + jsi = JSInterpreter(''' + function x() { return [1, function(){return 1}][1] } + ''') + self.assertEqual(jsi.call_function('x')([]), 1) + + def test_null(self): + jsi = JSInterpreter(''' + function x() { return null; } + ''') + self.assertIs(jsi.call_function('x'), None) + + jsi = JSInterpreter(''' + function x() { return [null > 0, null < 0, null == 0, null === 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, False, False]) + + jsi = JSInterpreter(''' + function x() { return [null >= 0, null <= 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [True, True]) + + def test_undefined(self): + jsi = JSInterpreter(''' + function x() { return undefined === undefined; } + ''') + self.assertTrue(jsi.call_function('x')) + + jsi = JSInterpreter(''' + function x() { return undefined; } + ''') + self.assertIs(jsi.call_function('x'), JS_Undefined) + + jsi = JSInterpreter(''' + function x() { let v; return v; } + ''') + self.assertIs(jsi.call_function('x'), JS_Undefined) + + jsi = JSInterpreter(''' + function x() { return [undefined === undefined, undefined == undefined, undefined < undefined, undefined > undefined]; } + ''') + self.assertEqual(jsi.call_function('x'), [True, True, False, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined === 0, undefined == 0, undefined < 0, undefined > 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, False, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined >= 0, undefined <= 0]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined > null, undefined < null, undefined == null, undefined === null]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, True, False]) + + jsi = JSInterpreter(''' + function x() { return [undefined === null, undefined == null, undefined < null, undefined > null]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, True, False, False]) + + jsi = JSInterpreter(''' + function x() { let v; return [42+v, v+42, v**42, 42**v, 0**v]; } + ''') + for y in jsi.call_function('x'): + self.assertTrue(math.isnan(y)) + + jsi = JSInterpreter(''' + function x() { let v; return v**0; } + ''') + self.assertEqual(jsi.call_function('x'), 1) + + jsi = JSInterpreter(''' + function x() { let v; return [v>42, v<=42, v&&42, 42&&v]; } + ''') + self.assertEqual(jsi.call_function('x'), [False, False, JS_Undefined, JS_Undefined]) + + jsi = JSInterpreter('function x(){return undefined ?? 42; }') + self.assertEqual(jsi.call_function('x'), 42) + + def test_object(self): + jsi = JSInterpreter(''' + function x() { return {}; } + ''') + self.assertEqual(jsi.call_function('x'), {}) + + jsi = JSInterpreter(''' + function x() { let a = {m1: 42, m2: 0 }; return [a["m1"], a.m2]; } + ''') + self.assertEqual(jsi.call_function('x'), [42, 0]) + + jsi = JSInterpreter(''' + function x() { let a; return a?.qq; } + ''') + self.assertIs(jsi.call_function('x'), JS_Undefined) + + jsi = JSInterpreter(''' + function x() { let a = {m1: 42, m2: 0 }; return a?.qq; } + ''') + self.assertIs(jsi.call_function('x'), JS_Undefined) + + def test_regex(self): + jsi = JSInterpreter(''' + function x() { let a=/,,[/,913,/](,)}/; } + ''') + self.assertIs(jsi.call_function('x'), None) + + jsi = JSInterpreter(''' + function x() { let a=/,,[/,913,/](,)}/; return a; } + ''') + attrs = set(('findall', 'finditer', 'flags', 'groupindex', + 'groups', 'match', 'pattern', 'scanner', + 'search', 'split', 'sub', 'subn')) + self.assertTrue(set(dir(jsi.call_function('x'))) > attrs) + + jsi = JSInterpreter(''' + function x() { let a=/,,[/,913,/](,)}/i; return a; } + ''') + self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I) + + jsi = JSInterpreter(r''' + function x() { let a=[/[)\\]/]; return a[0]; } + ''') + self.assertEqual(jsi.call_function('x').pattern, r'[)\\]') + + """ # fails + jsi = JSInterpreter(r''' + function x() { let a=100; a/=/[0-9]+/.exec('divide by 20 today')[0]; } + ''') + self.assertEqual(jsi.call_function('x'), 5) + """ + + def test_char_code_at(self): + jsi = JSInterpreter('function x(i){return "test".charCodeAt(i)}') + self.assertEqual(jsi.call_function('x', 0), 116) + self.assertEqual(jsi.call_function('x', 1), 101) + self.assertEqual(jsi.call_function('x', 2), 115) + self.assertEqual(jsi.call_function('x', 3), 116) + self.assertEqual(jsi.call_function('x', 4), None) + self.assertEqual(jsi.call_function('x', 'not_a_number'), 116) + + def test_bitwise_operators_overflow(self): + jsi = JSInterpreter('function x(){return -524999584 << 5}') + self.assertEqual(jsi.call_function('x'), 379882496) + + jsi = JSInterpreter('function x(){return 1236566549 << 5}') + self.assertEqual(jsi.call_function('x'), 915423904) if __name__ == '__main__': diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 550e0ca00..1197721ff 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -38,6 +38,9 @@ class BaseTestSubtitles(unittest.TestCase): self.DL = FakeYDL() self.ie = self.IE() self.DL.add_info_extractor(self.ie) + if not self.IE.working(): + print('Skipping: %s marked as not _WORKING' % self.IE.ie_key()) + self.skipTest('IE marked as not _WORKING') def getInfoDict(self): info_dict = self.DL.extract_info(self.url, download=False) @@ -56,6 +59,21 @@ class BaseTestSubtitles(unittest.TestCase): class TestYoutubeSubtitles(BaseTestSubtitles): + # Available subtitles for QRS8MkLhQmM: + # Language formats + # ru vtt, ttml, srv3, srv2, srv1, json3 + # fr vtt, ttml, srv3, srv2, srv1, json3 + # en vtt, ttml, srv3, srv2, srv1, json3 + # nl vtt, ttml, srv3, srv2, srv1, json3 + # de vtt, ttml, srv3, srv2, srv1, json3 + # ko vtt, ttml, srv3, srv2, srv1, json3 + # it vtt, ttml, srv3, srv2, srv1, json3 + # zh-Hant vtt, ttml, srv3, srv2, srv1, json3 + # hi vtt, ttml, srv3, srv2, srv1, json3 + # pt-BR vtt, ttml, srv3, srv2, srv1, json3 + # es-MX vtt, ttml, srv3, srv2, srv1, json3 + # ja vtt, ttml, srv3, srv2, srv1, json3 + # pl vtt, ttml, srv3, srv2, srv1, json3 url = 'QRS8MkLhQmM' IE = YoutubeIE @@ -64,41 +82,60 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) - self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') - self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5') + self.assertEqual(md5(subtitles['en']), 'ae1bd34126571a77aabd4d276b28044d') + self.assertEqual(md5(subtitles['it']), '0e0b667ba68411d88fd1c5f4f4eab2f9') for lang in ['fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - def test_youtube_subtitles_ttml_format(self): + def _test_subtitles_format(self, fmt, md5_hash, lang='en'): self.DL.params['writesubtitles'] = True - self.DL.params['subtitlesformat'] = 'ttml' + self.DL.params['subtitlesformat'] = fmt subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54') + self.assertEqual(md5(subtitles[lang]), md5_hash) + + def test_youtube_subtitles_ttml_format(self): + self._test_subtitles_format('ttml', 'c97ddf1217390906fa9fbd34901f3da2') def test_youtube_subtitles_vtt_format(self): - self.DL.params['writesubtitles'] = True - self.DL.params['subtitlesformat'] = 'vtt' + self._test_subtitles_format('vtt', 'ae1bd34126571a77aabd4d276b28044d') + + def test_youtube_subtitles_json3_format(self): + self._test_subtitles_format('json3', '688dd1ce0981683867e7fe6fde2a224b') + + def _test_automatic_captions(self, url, lang): + self.url = url + self.DL.params['writeautomaticsub'] = True + self.DL.params['subtitleslangs'] = [lang] subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') + self.assertTrue(subtitles[lang] is not None) def test_youtube_automatic_captions(self): - self.url = '8YoUxe5ncPo' - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslangs'] = ['it'] - subtitles = self.getSubtitles() - self.assertTrue(subtitles['it'] is not None) + # Available automatic captions for 8YoUxe5ncPo: + # Language formats (all in vtt, ttml, srv3, srv2, srv1, json3) + # gu, zh-Hans, zh-Hant, gd, ga, gl, lb, la, lo, tt, tr, + # lv, lt, tk, th, tg, te, fil, haw, yi, ceb, yo, de, da, + # el, eo, en, eu, et, es, ru, rw, ro, bn, be, bg, uk, jv, + # bs, ja, or, xh, co, ca, cy, cs, ps, pt, pa, vi, pl, hy, + # hr, ht, hu, hmn, hi, ha, mg, uz, ml, mn, mi, mk, ur, + # mt, ms, mr, ug, ta, my, af, sw, is, am, + # *it*, iw, sv, ar, + # su, zu, az, id, ig, nl, no, ne, ny, fr, ku, fy, fa, fi, + # ka, kk, sr, sq, ko, kn, km, st, sk, si, so, sn, sm, sl, + # ky, sd + # ... + self._test_automatic_captions('8YoUxe5ncPo', 'it') + @unittest.skip('ASR subs all in all supported langs now') def test_youtube_translated_subtitles(self): - # This video has a subtitles track, which can be translated - self.url = 'Ky9eprVWzlI' - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslangs'] = ['it'] - subtitles = self.getSubtitles() - self.assertTrue(subtitles['it'] is not None) + # This video has a subtitles track, which can be translated (#4555) + self._test_automatic_captions('Ky9eprVWzlI', 'it') def test_youtube_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') - self.url = 'n5BB19UTcdA' + # Available automatic captions for 8YoUxe5ncPo: + # ... + # 8YoUxe5ncPo has no subtitles + self.url = '8YoUxe5ncPo' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() @@ -128,6 +165,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@unittest.skip('IE broken') class TestTedSubtitles(BaseTestSubtitles): url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' IE = TEDIE @@ -152,18 +190,19 @@ class TestVimeoSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) - self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888') - self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8') + self.assertEqual(md5(subtitles['en']), '386cbc9320b94e25cb364b97935e5dd1') + self.assertEqual(md5(subtitles['fr']), 'c9b69eef35bc6641c0d4da8a04f9dfac') def test_nosubtitles(self): self.DL.expect_warning('video doesn\'t have subtitles') - self.url = 'http://vimeo.com/56015672' + self.url = 'http://vimeo.com/68093876' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertFalse(subtitles) +@unittest.skip('IE broken') class TestWallaSubtitles(BaseTestSubtitles): url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' IE = WallaIE @@ -185,6 +224,7 @@ class TestWallaSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@unittest.skip('IE broken') class TestCeskaTelevizeSubtitles(BaseTestSubtitles): url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' IE = CeskaTelevizeIE @@ -206,6 +246,7 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles): self.assertFalse(subtitles) +@unittest.skip('IE broken') class TestLyndaSubtitles(BaseTestSubtitles): url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html' IE = LyndaIE @@ -218,6 +259,7 @@ class TestLyndaSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '09bbe67222259bed60deaa26997d73a7') +@unittest.skip('IE broken') class TestNPOSubtitles(BaseTestSubtitles): url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860' IE = NPOIE @@ -230,6 +272,7 @@ class TestNPOSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['nl']), 'fc6435027572b63fb4ab143abd5ad3f4') +@unittest.skip('IE broken') class TestMTVSubtitles(BaseTestSubtitles): url = 'http://www.cc.com/video-clips/p63lk0/adam-devine-s-house-party-chasing-white-swans' IE = ComedyCentralIE @@ -253,8 +296,8 @@ class TestNRKSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(set(subtitles.keys()), set(['no'])) - self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') + self.assertEqual(set(subtitles.keys()), set(['nb-ttv'])) + self.assertEqual(md5(subtitles['nb-ttv']), '67e06ff02d0deaf975e68f6cb8f6a149') class TestRaiPlaySubtitles(BaseTestSubtitles): @@ -277,6 +320,7 @@ class TestRaiPlaySubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd') +@unittest.skip('IE broken - DRM only') class TestVikiSubtitles(BaseTestSubtitles): url = 'http://www.viki.com/videos/1060846v-punch-episode-18' IE = VikiIE @@ -303,6 +347,7 @@ class TestThePlatformSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') +@unittest.skip('IE broken') class TestThePlatformFeedSubtitles(BaseTestSubtitles): url = 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207' IE = ThePlatformFeedIE @@ -338,7 +383,7 @@ class TestDemocracynowSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') + self.assertEqual(md5(subtitles['en']), 'a3cc4c0b5eadd74d9974f1c1f5101045') def test_subtitles_in_page(self): self.url = 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree' @@ -346,7 +391,7 @@ class TestDemocracynowSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['en'])) - self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') + self.assertEqual(md5(subtitles['en']), 'a3cc4c0b5eadd74d9974f1c1f5101045') if __name__ == '__main__': diff --git a/test/test_utils.py b/test/test_utils.py index 259c4763e..9d364c863 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -12,7 +12,9 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Various small unit tests import io +import itertools import json +import re import xml.etree.ElementTree from youtube_dl.utils import ( @@ -40,11 +42,14 @@ from youtube_dl.utils import ( get_element_by_attribute, get_elements_by_class, get_elements_by_attribute, + get_first, InAdvancePagedList, int_or_none, intlist_to_bytes, is_html, + join_nonempty, js_to_json, + LazyList, limit_length, merge_dicts, mimetype2ext, @@ -79,6 +84,8 @@ from youtube_dl.utils import ( strip_or_none, subtitles_filename, timeconvert, + traverse_obj, + try_call, unescapeHTML, unified_strdate, unified_timestamp, @@ -92,6 +99,7 @@ from youtube_dl.utils import ( urlencode_postdata, urshift, update_url_query, + variadic, version_tuple, xpath_with_ns, xpath_element, @@ -112,12 +120,18 @@ from youtube_dl.compat import ( compat_getenv, compat_os_name, compat_setenv, + compat_str, compat_urlparse, compat_parse_qs, ) class TestUtil(unittest.TestCase): + + # yt-dlp shim + def assertCountEqual(self, expected, got, msg='count should be the same'): + return self.assertEqual(len(tuple(expected)), len(tuple(got)), msg=msg) + def test_timeconvert(self): self.assertTrue(timeconvert('') is None) self.assertTrue(timeconvert('bougrg') is None) @@ -370,6 +384,9 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363) + self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1) + self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86) + self.assertEqual(unified_timestamp('12/31/1969 20:01:18 EDT', False), 78) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') @@ -1475,6 +1492,315 @@ Line 1 self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + def test_LazyList(self): + it = list(range(10)) + + self.assertEqual(list(LazyList(it)), it) + self.assertEqual(LazyList(it).exhaust(), it) + self.assertEqual(LazyList(it)[5], it[5]) + + self.assertEqual(LazyList(it)[5:], it[5:]) + self.assertEqual(LazyList(it)[:5], it[:5]) + self.assertEqual(LazyList(it)[::2], it[::2]) + self.assertEqual(LazyList(it)[1::2], it[1::2]) + self.assertEqual(LazyList(it)[5::-1], it[5::-1]) + self.assertEqual(LazyList(it)[6:2:-2], it[6:2:-2]) + self.assertEqual(LazyList(it)[::-1], it[::-1]) + + self.assertTrue(LazyList(it)) + self.assertFalse(LazyList(range(0))) + self.assertEqual(len(LazyList(it)), len(it)) + self.assertEqual(repr(LazyList(it)), repr(it)) + self.assertEqual(compat_str(LazyList(it)), compat_str(it)) + + self.assertEqual(list(LazyList(it, reverse=True)), it[::-1]) + self.assertEqual(list(reversed(LazyList(it))[::-1]), it) + self.assertEqual(list(reversed(LazyList(it))[1:3:7]), it[::-1][1:3:7]) + + def test_LazyList_laziness(self): + + def test(ll, idx, val, cache): + self.assertEqual(ll[idx], val) + self.assertEqual(ll._cache, list(cache)) + + ll = LazyList(range(10)) + test(ll, 0, 0, range(1)) + test(ll, 5, 5, range(6)) + test(ll, -3, 7, range(10)) + + ll = LazyList(range(10), reverse=True) + test(ll, -1, 0, range(1)) + test(ll, 3, 6, range(10)) + + ll = LazyList(itertools.count()) + test(ll, 10, 10, range(11)) + ll = reversed(ll) + test(ll, -15, 14, range(15)) + + def test_try_call(self): + def total(*x, **kwargs): + return sum(x) + sum(kwargs.values()) + + self.assertEqual(try_call(None), None, + msg='not a fn should give None') + self.assertEqual(try_call(lambda: 1), 1, + msg='int fn with no expected_type should give int') + self.assertEqual(try_call(lambda: 1, expected_type=int), 1, + msg='int fn with expected_type int should give int') + self.assertEqual(try_call(lambda: 1, expected_type=dict), None, + msg='int fn with wrong expected_type should give None') + self.assertEqual(try_call(total, args=(0, 1, 0, ), expected_type=int), 1, + msg='fn should accept arglist') + self.assertEqual(try_call(total, kwargs={'a': 0, 'b': 1, 'c': 0}, expected_type=int), 1, + msg='fn should accept kwargs') + self.assertEqual(try_call(lambda: 1, expected_type=dict), None, + msg='int fn with no expected_type should give None') + self.assertEqual(try_call(lambda x: {}, total, args=(42, ), expected_type=int), 42, + msg='expect first int result with expected_type int') + + def test_variadic(self): + self.assertEqual(variadic(None), (None, )) + self.assertEqual(variadic('spam'), ('spam', )) + self.assertEqual(variadic('spam', allowed_types=dict), 'spam') + + def test_traverse_obj(self): + _TEST_DATA = { + 100: 100, + 1.2: 1.2, + 'str': 'str', + 'None': None, + '...': Ellipsis, + 'urls': [ + {'index': 0, 'url': 'https://www.example.com/0'}, + {'index': 1, 'url': 'https://www.example.com/1'}, + ], + 'data': ( + {'index': 2}, + {'index': 3}, + ), + 'dict': {}, + } + + # Test base functionality + self.assertEqual(traverse_obj(_TEST_DATA, ('str',)), 'str', + msg='allow tuple path') + self.assertEqual(traverse_obj(_TEST_DATA, ['str']), 'str', + msg='allow list path') + self.assertEqual(traverse_obj(_TEST_DATA, (value for value in ("str",))), 'str', + msg='allow iterable path') + self.assertEqual(traverse_obj(_TEST_DATA, 'str'), 'str', + msg='single items should be treated as a path') + self.assertEqual(traverse_obj(_TEST_DATA, None), _TEST_DATA) + self.assertEqual(traverse_obj(_TEST_DATA, 100), 100) + self.assertEqual(traverse_obj(_TEST_DATA, 1.2), 1.2) + + # Test Ellipsis behavior + self.assertCountEqual(traverse_obj(_TEST_DATA, Ellipsis), + (item for item in _TEST_DATA.values() if item is not None), + msg='`...` should give all values except `None`') + self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, Ellipsis)), _TEST_DATA['urls'][0].values(), + msg='`...` selection for dicts should select all values') + self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'url')), + ['https://www.example.com/0', 'https://www.example.com/1'], + msg='nested `...` queries should work') + self.assertCountEqual(traverse_obj(_TEST_DATA, (Ellipsis, Ellipsis, 'index')), range(4), + msg='`...` query result should be flattened') + + # Test function as key + self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)), + [_TEST_DATA['urls']], + msg='function as query key should perform a filter based on (key, value)') + self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], compat_str)), {'str'}, + msg='exceptions in the query function should be caught') + + # Test alternative paths + self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str', + msg='multiple `paths` should be treated as alternative paths') + self.assertEqual(traverse_obj(_TEST_DATA, 'str', 100), 'str', + msg='alternatives should exit early') + self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'fail'), None, + msg='alternatives should return `default` if exhausted') + self.assertEqual(traverse_obj(_TEST_DATA, (Ellipsis, 'fail'), 100), 100, + msg='alternatives should track their own branching return') + self.assertEqual(traverse_obj(_TEST_DATA, ('dict', Ellipsis), ('data', Ellipsis)), list(_TEST_DATA['data']), + msg='alternatives on empty objects should search further') + + # Test branch and path nesting + self.assertEqual(traverse_obj(_TEST_DATA, ('urls', (3, 0), 'url')), ['https://www.example.com/0'], + msg='tuple as key should be treated as branches') + self.assertEqual(traverse_obj(_TEST_DATA, ('urls', [3, 0], 'url')), ['https://www.example.com/0'], + msg='list as key should be treated as branches') + self.assertEqual(traverse_obj(_TEST_DATA, ('urls', ((1, 'fail'), (0, 'url')))), ['https://www.example.com/0'], + msg='double nesting in path should be treated as paths') + self.assertEqual(traverse_obj(['0', [1, 2]], [(0, 1), 0]), [1], + msg='do not fail early on branching') + self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', ((1, ('fail', 'url')), (0, 'url')))), + ['https://www.example.com/0', 'https://www.example.com/1'], + msg='triple nesting in path should be treated as branches') + self.assertEqual(traverse_obj(_TEST_DATA, ('urls', ('fail', (Ellipsis, 'url')))), + ['https://www.example.com/0', 'https://www.example.com/1'], + msg='ellipsis as branch path start gets flattened') + + # Test dictionary as key + self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}), {0: 100, 1: 1.2}, + msg='dict key should result in a dict with the same keys') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', 0, 'url')}), + {0: 'https://www.example.com/0'}, + msg='dict key should allow paths') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', (3, 0), 'url')}), + {0: ['https://www.example.com/0']}, + msg='tuple in dict path should be treated as branches') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, 'fail'), (0, 'url')))}), + {0: ['https://www.example.com/0']}, + msg='double nesting in dict path should be treated as paths') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, ('fail', 'url')), (0, 'url')))}), + {0: ['https://www.example.com/1', 'https://www.example.com/0']}, + msg='triple nesting in dict path should be treated as branches') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}), {}, + msg='remove `None` values when dict key') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}, default=Ellipsis), {0: Ellipsis}, + msg='do not remove `None` values if `default`') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {0: {}}, + msg='do not remove empty values when dict key') + self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=Ellipsis), {0: {}}, + msg='do not remove empty values when dict key and a default') + self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', Ellipsis)}), {0: []}, + msg='if branch in dict key not successful, return `[]`') + + # Testing default parameter behavior + _DEFAULT_DATA = {'None': None, 'int': 0, 'list': []} + self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail'), None, + msg='default value should be `None`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail', 'fail', default=Ellipsis), Ellipsis, + msg='chained fails should result in default') + self.assertEqual(traverse_obj(_DEFAULT_DATA, 'None', 'int'), 0, + msg='should not short cirquit on `None`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail', default=1), 1, + msg='invalid dict key should result in `default`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, 'None', default=1), 1, + msg='`None` is a deliberate sentinel and should become `default`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', 10)), None, + msg='`IndexError` should result in `default`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, (Ellipsis, 'fail'), default=1), 1, + msg='if branched but not successful return `default` if defined, not `[]`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, (Ellipsis, 'fail'), default=None), None, + msg='if branched but not successful return `default` even if `default` is `None`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, (Ellipsis, 'fail')), [], + msg='if branched but not successful return `[]`, not `default`') + self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', Ellipsis)), [], + msg='if branched but object is empty return `[]`, not `default`') + + # Testing expected_type behavior + _EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0} + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=compat_str), 'str', + msg='accept matching `expected_type` type') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int), None, + msg='reject non matching `expected_type` type') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: compat_str(x)), '0', + msg='transform type using type function') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', + expected_type=lambda _: 1 / 0), None, + msg='wrap expected_type function in try_call') + self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, Ellipsis, expected_type=compat_str), ['str'], + msg='eliminate items that expected_type fails on') + + # Test get_all behavior + _GET_ALL_DATA = {'key': [0, 1, 2]} + self.assertEqual(traverse_obj(_GET_ALL_DATA, ('key', Ellipsis), get_all=False), 0, + msg='if not `get_all`, return only first matching value') + self.assertEqual(traverse_obj(_GET_ALL_DATA, Ellipsis, get_all=False), [0, 1, 2], + msg='do not overflatten if not `get_all`') + + # Test casesense behavior + _CASESENSE_DATA = { + 'KeY': 'value0', + 0: { + 'KeY': 'value1', + 0: {'KeY': 'value2'}, + }, + # FULLWIDTH LATIN CAPITAL LETTER K + '\uff2bey': 'value3', + } + self.assertEqual(traverse_obj(_CASESENSE_DATA, 'key'), None, + msg='dict keys should be case sensitive unless `casesense`') + self.assertEqual(traverse_obj(_CASESENSE_DATA, 'keY', + casesense=False), 'value0', + msg='allow non matching key case if `casesense`') + self.assertEqual(traverse_obj(_CASESENSE_DATA, '\uff4bey', # FULLWIDTH LATIN SMALL LETTER K + casesense=False), 'value3', + msg='allow non matching Unicode key case if `casesense`') + self.assertEqual(traverse_obj(_CASESENSE_DATA, (0, ('keY',)), + casesense=False), ['value1'], + msg='allow non matching key case in branch if `casesense`') + self.assertEqual(traverse_obj(_CASESENSE_DATA, (0, ((0, 'keY'),)), + casesense=False), ['value2'], + msg='allow non matching key case in branch path if `casesense`') + + # Test traverse_string behavior + _TRAVERSE_STRING_DATA = {'str': 'str', 1.2: 1.2} + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0)), None, + msg='do not traverse into string if not `traverse_string`') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0), + _traverse_string=True), 's', + msg='traverse into string if `traverse_string`') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, (1.2, 1), + _traverse_string=True), '.', + msg='traverse into converted data if `traverse_string`') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', Ellipsis), + _traverse_string=True), list('str'), + msg='`...` branching into string should result in list') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)), + _traverse_string=True), ['s', 'r'], + msg='branching into string should result in list') + self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda _, x: x), + _traverse_string=True), list('str'), + msg='function branching into string should result in list') + + # Test is_user_input behavior + _IS_USER_INPUT_DATA = {'range8': list(range(8))} + self.assertEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', '3'), + _is_user_input=True), 3, + msg='allow for string indexing if `is_user_input`') + self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', '3:'), + _is_user_input=True), tuple(range(8))[3:], + msg='allow for string slice if `is_user_input`') + self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':4:2'), + _is_user_input=True), tuple(range(8))[:4:2], + msg='allow step in string slice if `is_user_input`') + self.assertCountEqual(traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':'), + _is_user_input=True), range(8), + msg='`:` should be treated as `...` if `is_user_input`') + with self.assertRaises(TypeError, msg='too many params should result in error'): + traverse_obj(_IS_USER_INPUT_DATA, ('range8', ':::'), _is_user_input=True) + + # Test re.Match as input obj + mobj = re.match(r'^0(12)(?P3)(4)?$', '0123') + self.assertEqual(traverse_obj(mobj, Ellipsis), [x for x in mobj.groups() if x is not None], + msg='`...` on a `re.Match` should give its `groups()`') + self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 2)), ['0123', '3'], + msg='function on a `re.Match` should give groupno, value starting at 0') + self.assertEqual(traverse_obj(mobj, 'group'), '3', + msg='str key on a `re.Match` should give group with that name') + self.assertEqual(traverse_obj(mobj, 2), '3', + msg='int key on a `re.Match` should give group with that name') + self.assertEqual(traverse_obj(mobj, 'gRoUp', casesense=False), '3', + msg='str key on a `re.Match` should respect casesense') + self.assertEqual(traverse_obj(mobj, 'fail'), None, + msg='failing str key on a `re.Match` should return `default`') + self.assertEqual(traverse_obj(mobj, 'gRoUpS', casesense=False), None, + msg='failing str key on a `re.Match` should return `default`') + self.assertEqual(traverse_obj(mobj, 8), None, + msg='failing int key on a `re.Match` should return `default`') + + def test_get_first(self): + self.assertEqual(get_first([{'a': None}, {'a': 'spam'}], 'a'), 'spam') + + def test_join_nonempty(self): + self.assertEqual(join_nonempty('a', 'b'), 'a-b') + self.assertEqual(join_nonempty( + 'a', 'b', 'c', 'd', + from_dict={'a': 'c', 'c': [], 'b': 'd', 'd': None}), 'c-d') + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index cf2fdf14f..e0e8891ba 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- from __future__ import unicode_literals # Allow direct execution @@ -9,11 +10,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL - from youtube_dl.extractor import ( + YoutubeIE, YoutubePlaylistIE, YoutubeTabIE, - YoutubeIE, ) @@ -25,38 +25,23 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_playlist_noplaylist(self): dl = FakeYDL() dl.params['noplaylist'] = True + dl.params['format'] = 'best' ie = YoutubePlaylistIE(dl) result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') self.assertEqual(result['_type'], 'url') + result = dl.extract_info(result['url'], download=False, ie_key=result.get('ie_key'), process=False) self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg') - def test_youtube_course(self): - dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - # TODO find a > 100 (paginating?) videos course - result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') - entries = list(result['entries']) - self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs') - self.assertEqual(len(entries), 25) - self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0') - def test_youtube_mix(self): dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w') - entries = result['entries'] - self.assertTrue(len(entries) >= 50) + dl.params['format'] = 'best' + ie = YoutubeTabIE(dl) + result = dl.extract_info('https://www.youtube.com/watch?v=tyITL_exICo&list=RDCLAK5uy_kLWIr9gv1XLlPbaDS965-Db4TrBoUTxQ8', + download=False, ie_key=ie.ie_key(), process=True) + entries = (result or {}).get('entries', [{'id': 'not_found', }]) + self.assertTrue(len(entries) >= 25) original_video = entries[0] - self.assertEqual(original_video['id'], 'OQpdSVF_k_w') - - def test_youtube_toptracks(self): - print('Skipping: The playlist page gives error 500') - return - dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=MCUS') - entries = result['entries'] - self.assertEqual(len(entries), 100) + self.assertEqual(original_video['id'], 'tyITL_exICo') def test_youtube_flat_playlist_extraction(self): dl = FakeYDL() @@ -67,7 +52,7 @@ class TestYoutubeLists(unittest.TestCase): entries = list(result['entries']) self.assertTrue(len(entries) == 1) video = entries[0] - self.assertEqual(video['_type'], 'url_transparent') + self.assertEqual(video['_type'], 'url') self.assertEqual(video['ie_key'], 'Youtube') self.assertEqual(video['id'], 'BaW_jenozKc') self.assertEqual(video['url'], 'BaW_jenozKc') diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 627d4cb92..ac37ffa45 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -12,11 +12,13 @@ import io import re import string -from test.helper import FakeYDL -from youtube_dl.extractor import YoutubeIE from youtube_dl.compat import compat_str, compat_urlretrieve -_TESTS = [ +from test.helper import FakeYDL +from youtube_dl.extractor import YoutubeIE +from youtube_dl.jsinterp import JSInterpreter + +_SIG_TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', 86, @@ -64,10 +66,86 @@ _TESTS = [ ) ] +_NSIG_TESTS = [ + ( + 'https://www.youtube.com/s/player/9216d1f7/player_ias.vflset/en_US/base.js', + 'SLp9F5bwjAdhE9F-', 'gWnb9IK2DJ8Q1w', + ), + ( + 'https://www.youtube.com/s/player/f8cb7a3b/player_ias.vflset/en_US/base.js', + 'oBo2h5euWy6osrUt', 'ivXHpm7qJjJN', + ), + ( + 'https://www.youtube.com/s/player/2dfe380c/player_ias.vflset/en_US/base.js', + 'oBo2h5euWy6osrUt', '3DIBbn3qdQ', + ), + ( + 'https://www.youtube.com/s/player/f1ca6900/player_ias.vflset/en_US/base.js', + 'cu3wyu6LQn2hse', 'jvxetvmlI9AN9Q', + ), + ( + 'https://www.youtube.com/s/player/8040e515/player_ias.vflset/en_US/base.js', + 'wvOFaY-yjgDuIEg5', 'HkfBFDHmgw4rsw', + ), + ( + 'https://www.youtube.com/s/player/e06dea74/player_ias.vflset/en_US/base.js', + 'AiuodmaDDYw8d3y4bf', 'ankd8eza2T6Qmw', + ), + ( + 'https://www.youtube.com/s/player/5dd88d1d/player-plasma-ias-phone-en_US.vflset/base.js', + 'kSxKFLeqzv_ZyHSAt', 'n8gS8oRlHOxPFA', + ), + ( + 'https://www.youtube.com/s/player/324f67b9/player_ias.vflset/en_US/base.js', + 'xdftNy7dh9QGnhW', '22qLGxrmX8F1rA', + ), + ( + 'https://www.youtube.com/s/player/4c3f79c5/player_ias.vflset/en_US/base.js', + 'TDCstCG66tEAO5pR9o', 'dbxNtZ14c-yWyw', + ), + ( + 'https://www.youtube.com/s/player/c81bbb4a/player_ias.vflset/en_US/base.js', + 'gre3EcLurNY2vqp94', 'Z9DfGxWP115WTg', + ), + ( + 'https://www.youtube.com/s/player/1f7d5369/player_ias.vflset/en_US/base.js', + 'batNX7sYqIJdkJ', 'IhOkL_zxbkOZBw', + ), + ( + 'https://www.youtube.com/s/player/009f1d77/player_ias.vflset/en_US/base.js', + '5dwFHw8aFWQUQtffRq', 'audescmLUzI3jw', + ), + ( + 'https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/en_US/base.js', + '5EHDMgYLV6HPGk_Mu-kk', 'n9lUJLHbxUI0GQ', + ), + ( + 'https://www.youtube.com/s/player/c2199353/player_ias.vflset/en_US/base.js', + '5EHDMgYLV6HPGk_Mu-kk', 'AD5rgS85EkrE7', + ), + ( + 'https://www.youtube.com/s/player/113ca41c/player_ias.vflset/en_US/base.js', + 'cgYl-tlYkhjT7A', 'hI7BBr2zUgcmMg', + ), + ( + 'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js', + '-Txvy6bT5R6LqgnQNx', 'dcklJCnRUHbgSg', + ), + ( + 'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', + 'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ', + ), + ( + 'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js', + 'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A', + ), +] + class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): PLAYER_URLS = ( + ('https://www.youtube.com/s/player/4c3f79c5/player_ias.vflset/en_US/base.js', '4c3f79c5'), ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/fr_FR/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), @@ -90,40 +168,61 @@ class TestPlayerInfo(unittest.TestCase): class TestSignature(unittest.TestCase): def setUp(self): TEST_DIR = os.path.dirname(os.path.abspath(__file__)) - self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata') + self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata/sigs') if not os.path.exists(self.TESTDATA_DIR): os.mkdir(self.TESTDATA_DIR) - -def make_tfunc(url, sig_input, expected_sig): - m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) - assert m, '%r should follow URL format' % url - test_id = m.group(1) - - def test_func(self): - basename = 'player-%s.js' % test_id - fn = os.path.join(self.TESTDATA_DIR, basename) - - if not os.path.exists(fn): - compat_urlretrieve(url, fn) - - ydl = FakeYDL() - ie = YoutubeIE(ydl) - with io.open(fn, encoding='utf-8') as testf: - jscode = testf.read() - func = ie._parse_sig_js(jscode) - src_sig = ( - compat_str(string.printable[:sig_input]) - if isinstance(sig_input, int) else sig_input) - got_sig = func(src_sig) - self.assertEqual(got_sig, expected_sig) - - test_func.__name__ = str('test_signature_js_' + test_id) - setattr(TestSignature, test_func.__name__, test_func) + def tearDown(self): + try: + for f in os.listdir(self.TESTDATA_DIR): + os.remove(f) + except OSError: + pass -for test_spec in _TESTS: - make_tfunc(*test_spec) +def t_factory(name, sig_func, url_pattern): + def make_tfunc(url, sig_input, expected_sig): + m = url_pattern.match(url) + assert m, '%r should follow URL format' % url + test_id = m.group('id') + + def test_func(self): + basename = 'player-{0}-{1}.js'.format(name, test_id) + fn = os.path.join(self.TESTDATA_DIR, basename) + + if not os.path.exists(fn): + compat_urlretrieve(url, fn) + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + self.assertEqual(sig_func(jscode, sig_input), expected_sig) + + test_func.__name__ = str('test_{0}_js_{1}'.format(name, test_id)) + setattr(TestSignature, test_func.__name__, test_func) + return make_tfunc + + +def signature(jscode, sig_input): + func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) + src_sig = ( + compat_str(string.printable[:sig_input]) + if isinstance(sig_input, int) else sig_input) + return func(src_sig) + + +def n_sig(jscode, sig_input): + funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) + return JSInterpreter(jscode).call_function(funcname, sig_input) + + +make_sig_test = t_factory( + 'signature', signature, re.compile(r'.*-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$')) +for test_spec in _SIG_TESTS: + make_sig_test(*test_spec) + +make_nsig_test = t_factory( + 'nsig', n_sig, re.compile(r'.+/player/(?P[a-zA-Z0-9_-]+)/.+.js$')) +for test_spec in _NSIG_TESTS: + make_nsig_test(*test_spec) if __name__ == '__main__': diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fe30758ef..8e8546596 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -73,6 +73,7 @@ from .utils import ( PostProcessingError, preferredencoding, prepend_extension, + process_communicate_or_kill, register_socks_protocols, render_table, replace_extension, @@ -720,7 +721,7 @@ class YoutubeDL(object): filename = encodeFilename(filename, True).decode(preferredencoding()) return sanitize_path(filename) except ValueError as err: - self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') + self.report_error('Error in output template: ' + error_to_compat_str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None def _match_entry(self, info_dict, incomplete): @@ -1529,7 +1530,7 @@ class YoutubeDL(object): # see http://bugs.python.org/issue1646728) try: upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) - info_dict[date_key] = upload_date.strftime('%Y%m%d') + info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d')) except (ValueError, OverflowError, OSError): pass @@ -1569,9 +1570,6 @@ class YoutubeDL(object): else: formats = info_dict['formats'] - if not formats: - raise ExtractorError('No video formats found!') - def is_wellformed(f): url = f.get('url') if not url: @@ -1584,7 +1582,10 @@ class YoutubeDL(object): return True # Filter out malformed formats for better extraction robustness - formats = list(filter(is_wellformed, formats)) + formats = list(filter(is_wellformed, formats or [])) + + if not formats: + raise ExtractorError('No video formats found!') formats_dict = {} @@ -1778,10 +1779,9 @@ class YoutubeDL(object): assert info_dict.get('_type', 'video') == 'video' - max_downloads = self.params.get('max_downloads') - if max_downloads is not None: - if self._num_downloads >= int(max_downloads): - raise MaxDownloadsReached() + max_downloads = int_or_none(self.params.get('max_downloads')) or float('inf') + if self._num_downloads >= max_downloads: + raise MaxDownloadsReached() # TODO: backward compatibility, to be removed info_dict['fulltitle'] = info_dict['title'] @@ -1906,8 +1906,17 @@ class YoutubeDL(object): if not self.params.get('skip_download', False): try: + def checked_get_suitable_downloader(info_dict, params): + ed_args = params.get('external_downloader_args') + dler = get_suitable_downloader(info_dict, params) + if ed_args and not params.get('external_downloader_args'): + # external_downloader_args was cleared because external_downloader was rejected + self.report_warning('Requested external downloader cannot be used: ' + 'ignoring --external-downloader-args.') + return dler + def dl(name, info): - fd = get_suitable_downloader(info, self.params)(self, self.params) + fd = checked_get_suitable_downloader(info, self.params)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): @@ -2049,9 +2058,12 @@ class YoutubeDL(object): try: self.post_process(filename, info_dict) except (PostProcessingError) as err: - self.report_error('postprocessing: %s' % str(err)) + self.report_error('postprocessing: %s' % error_to_compat_str(err)) return self.record_download_archive(info_dict) + # avoid possible nugatory search for further items (PR #26638) + if self._num_downloads >= max_downloads: + raise MaxDownloadsReached() def download(self, url_list): """Download a given list of URLs.""" @@ -2314,7 +2326,7 @@ class YoutubeDL(object): ['git', 'rev-parse', '--short', 'HEAD'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate() + out, err = process_communicate_or_kill(sp) out = out.decode().strip() if re.match('[0-9a-f]+', out): self._write_string('[debug] Git HEAD: ' + out + '\n') diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 461bb6d41..a94a41079 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -8,6 +8,18 @@ from .utils import bytes_to_intlist, intlist_to_bytes BLOCK_SIZE_BYTES = 16 +def pkcs7_padding(data): + """ + PKCS#7 padding + + @param {int[]} data cleartext + @returns {int[]} padding data + """ + + remaining_length = BLOCK_SIZE_BYTES - len(data) % BLOCK_SIZE_BYTES + return data + [remaining_length] * remaining_length + + def aes_ctr_decrypt(data, key, counter): """ Decrypt with aes in counter mode @@ -76,8 +88,7 @@ def aes_cbc_encrypt(data, key, iv): previous_cipher_block = iv for i in range(block_count): block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - remaining_length = BLOCK_SIZE_BYTES - len(block) - block += [remaining_length] * remaining_length + block = pkcs7_padding(block) mixed_block = xor(block, previous_cipher_block) encrypted_block = aes_encrypt(mixed_block, expanded_key) @@ -88,6 +99,28 @@ def aes_cbc_encrypt(data, key, iv): return encrypted_data +def aes_ecb_encrypt(data, key): + """ + Encrypt with aes in ECB mode. Using PKCS#7 padding + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block = pkcs7_padding(block) + + encrypted_block = aes_encrypt(block, expanded_key) + encrypted_data += encrypted_block + + return encrypted_data + + def key_expansion(data): """ Generate key schedule @@ -303,7 +336,7 @@ def xor(data1, data2): def rijndael_mul(a, b): - if(a == 0 or b == 0): + if (a == 0 or b == 0): return 0 return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF] diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py index 7bdade1bd..4822439d0 100644 --- a/youtube_dl/cache.py +++ b/youtube_dl/cache.py @@ -10,12 +10,21 @@ import traceback from .compat import compat_getenv from .utils import ( + error_to_compat_str, expand_path, + is_outdated_version, + try_get, write_json_file, ) +from .version import __version__ class Cache(object): + + _YTDL_DIR = 'youtube-dl' + _VERSION_KEY = _YTDL_DIR + '_version' + _DEFAULT_VERSION = '2021.12.17' + def __init__(self, ydl): self._ydl = ydl @@ -23,7 +32,7 @@ class Cache(object): res = self._ydl.params.get('cachedir') if res is None: cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') - res = os.path.join(cache_root, 'youtube-dl') + res = os.path.join(cache_root, self._YTDL_DIR) return expand_path(res) def _get_cache_fn(self, section, key, dtype): @@ -50,13 +59,22 @@ class Cache(object): except OSError as ose: if ose.errno != errno.EEXIST: raise - write_json_file(data, fn) + write_json_file({self._VERSION_KEY: __version__, 'data': data}, fn) except Exception: tb = traceback.format_exc() self._ydl.report_warning( 'Writing cache to %r failed: %s' % (fn, tb)) - def load(self, section, key, dtype='json', default=None): + def _validate(self, data, min_ver): + version = try_get(data, lambda x: x[self._VERSION_KEY]) + if not version: # Backward compatibility + data, version = {'data': data}, self._DEFAULT_VERSION + if not is_outdated_version(version, min_ver or '0', assume_new=False): + return data['data'] + self._ydl.to_screen( + 'Discarding old cache from version {version} (needs {min_ver})'.format(**locals())) + + def load(self, section, key, dtype='json', default=None, min_ver=None): assert dtype in ('json',) if not self.enabled: @@ -66,12 +84,12 @@ class Cache(object): try: try: with io.open(cache_fn, 'r', encoding='utf-8') as cachef: - return json.load(cachef) + return self._validate(json.load(cachef), min_ver) except ValueError: try: file_size = os.path.getsize(cache_fn) except (OSError, IOError) as oe: - file_size = str(oe) + file_size = error_to_compat_str(oe) self._ydl.report_warning( 'Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) except IOError: diff --git a/youtube_dl/casefold.py b/youtube_dl/casefold.py new file mode 100644 index 000000000..748c2d491 --- /dev/null +++ b/youtube_dl/casefold.py @@ -0,0 +1,1667 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .compat import ( + compat_str, + compat_chr, +) + +# Below is included the text of icu/CaseFolding.txt retrieved from +# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/CaseFolding.txt +# In case newly foldable Unicode characters are defined, paste the new version +# of the text inside the ''' marks. +# The text is expected to have only blank lines andlines with 1st character #, +# all ignored, and fold definitions like this: +# `from_hex_code; space_separated_to_hex_code_list; comment` + +_map_str = ''' +# CaseFolding-15.0.0.txt +# Date: 2022-02-02, 23:35:35 GMT +# © 2022 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. +# For terms of use, see https://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see https://www.unicode.org/reports/tr44/ +# +# Case Folding Properties +# +# This file is a supplement to the UnicodeData file. +# It provides a case folding mapping generated from the Unicode Character Database. +# If all characters are mapped according to the full mapping below, then +# case differences (according to UnicodeData.txt and SpecialCasing.txt) +# are eliminated. +# +# The data supports both implementations that require simple case foldings +# (where string lengths don't change), and implementations that allow full case folding +# (where string lengths may grow). Note that where they can be supported, the +# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match. +# +# All code points not listed in this file map to themselves. +# +# NOTE: case folding does not preserve normalization formats! +# +# For information on case folding, including how to have case folding +# preserve normalization formats, see Section 3.13 Default Case Algorithms in +# The Unicode Standard. +# +# ================================================================================ +# Format +# ================================================================================ +# The entries in this file are in the following machine-readable format: +# +# ; ; ; # +# +# The status field is: +# C: common case folding, common mappings shared by both simple and full mappings. +# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. +# S: simple case folding, mappings to single characters where different from F. +# T: special case for uppercase I and dotted uppercase I +# - For non-Turkic languages, this mapping is normally not used. +# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. +# Note that the Turkic mappings do not maintain canonical equivalence without additional processing. +# See the discussions of case mapping in the Unicode Standard for more information. +# +# Usage: +# A. To do a simple case folding, use the mappings with status C + S. +# B. To do a full case folding, use the mappings with status C + F. +# +# The mappings with status T can be used or omitted depending on the desired case-folding +# behavior. (The default option is to exclude them.) +# +# ================================================================= + +# Property: Case_Folding + +# All code points not explicitly listed for Case_Folding +# have the value C for the status field, and the code point itself for the mapping field. + +# ================================================================= +0041; C; 0061; # LATIN CAPITAL LETTER A +0042; C; 0062; # LATIN CAPITAL LETTER B +0043; C; 0063; # LATIN CAPITAL LETTER C +0044; C; 0064; # LATIN CAPITAL LETTER D +0045; C; 0065; # LATIN CAPITAL LETTER E +0046; C; 0066; # LATIN CAPITAL LETTER F +0047; C; 0067; # LATIN CAPITAL LETTER G +0048; C; 0068; # LATIN CAPITAL LETTER H +0049; C; 0069; # LATIN CAPITAL LETTER I +0049; T; 0131; # LATIN CAPITAL LETTER I +004A; C; 006A; # LATIN CAPITAL LETTER J +004B; C; 006B; # LATIN CAPITAL LETTER K +004C; C; 006C; # LATIN CAPITAL LETTER L +004D; C; 006D; # LATIN CAPITAL LETTER M +004E; C; 006E; # LATIN CAPITAL LETTER N +004F; C; 006F; # LATIN CAPITAL LETTER O +0050; C; 0070; # LATIN CAPITAL LETTER P +0051; C; 0071; # LATIN CAPITAL LETTER Q +0052; C; 0072; # LATIN CAPITAL LETTER R +0053; C; 0073; # LATIN CAPITAL LETTER S +0054; C; 0074; # LATIN CAPITAL LETTER T +0055; C; 0075; # LATIN CAPITAL LETTER U +0056; C; 0076; # LATIN CAPITAL LETTER V +0057; C; 0077; # LATIN CAPITAL LETTER W +0058; C; 0078; # LATIN CAPITAL LETTER X +0059; C; 0079; # LATIN CAPITAL LETTER Y +005A; C; 007A; # LATIN CAPITAL LETTER Z +00B5; C; 03BC; # MICRO SIGN +00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE +00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE +00C2; C; 00E2; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE +00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS +00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE +00C6; C; 00E6; # LATIN CAPITAL LETTER AE +00C7; C; 00E7; # LATIN CAPITAL LETTER C WITH CEDILLA +00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE +00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE +00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS +00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE +00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE +00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS +00D0; C; 00F0; # LATIN CAPITAL LETTER ETH +00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE +00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE +00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE +00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE +00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS +00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE +00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE +00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE +00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS +00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE +00DE; C; 00FE; # LATIN CAPITAL LETTER THORN +00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S +0100; C; 0101; # LATIN CAPITAL LETTER A WITH MACRON +0102; C; 0103; # LATIN CAPITAL LETTER A WITH BREVE +0104; C; 0105; # LATIN CAPITAL LETTER A WITH OGONEK +0106; C; 0107; # LATIN CAPITAL LETTER C WITH ACUTE +0108; C; 0109; # LATIN CAPITAL LETTER C WITH CIRCUMFLEX +010A; C; 010B; # LATIN CAPITAL LETTER C WITH DOT ABOVE +010C; C; 010D; # LATIN CAPITAL LETTER C WITH CARON +010E; C; 010F; # LATIN CAPITAL LETTER D WITH CARON +0110; C; 0111; # LATIN CAPITAL LETTER D WITH STROKE +0112; C; 0113; # LATIN CAPITAL LETTER E WITH MACRON +0114; C; 0115; # LATIN CAPITAL LETTER E WITH BREVE +0116; C; 0117; # LATIN CAPITAL LETTER E WITH DOT ABOVE +0118; C; 0119; # LATIN CAPITAL LETTER E WITH OGONEK +011A; C; 011B; # LATIN CAPITAL LETTER E WITH CARON +011C; C; 011D; # LATIN CAPITAL LETTER G WITH CIRCUMFLEX +011E; C; 011F; # LATIN CAPITAL LETTER G WITH BREVE +0120; C; 0121; # LATIN CAPITAL LETTER G WITH DOT ABOVE +0122; C; 0123; # LATIN CAPITAL LETTER G WITH CEDILLA +0124; C; 0125; # LATIN CAPITAL LETTER H WITH CIRCUMFLEX +0126; C; 0127; # LATIN CAPITAL LETTER H WITH STROKE +0128; C; 0129; # LATIN CAPITAL LETTER I WITH TILDE +012A; C; 012B; # LATIN CAPITAL LETTER I WITH MACRON +012C; C; 012D; # LATIN CAPITAL LETTER I WITH BREVE +012E; C; 012F; # LATIN CAPITAL LETTER I WITH OGONEK +0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0132; C; 0133; # LATIN CAPITAL LIGATURE IJ +0134; C; 0135; # LATIN CAPITAL LETTER J WITH CIRCUMFLEX +0136; C; 0137; # LATIN CAPITAL LETTER K WITH CEDILLA +0139; C; 013A; # LATIN CAPITAL LETTER L WITH ACUTE +013B; C; 013C; # LATIN CAPITAL LETTER L WITH CEDILLA +013D; C; 013E; # LATIN CAPITAL LETTER L WITH CARON +013F; C; 0140; # LATIN CAPITAL LETTER L WITH MIDDLE DOT +0141; C; 0142; # LATIN CAPITAL LETTER L WITH STROKE +0143; C; 0144; # LATIN CAPITAL LETTER N WITH ACUTE +0145; C; 0146; # LATIN CAPITAL LETTER N WITH CEDILLA +0147; C; 0148; # LATIN CAPITAL LETTER N WITH CARON +0149; F; 02BC 006E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +014A; C; 014B; # LATIN CAPITAL LETTER ENG +014C; C; 014D; # LATIN CAPITAL LETTER O WITH MACRON +014E; C; 014F; # LATIN CAPITAL LETTER O WITH BREVE +0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0152; C; 0153; # LATIN CAPITAL LIGATURE OE +0154; C; 0155; # LATIN CAPITAL LETTER R WITH ACUTE +0156; C; 0157; # LATIN CAPITAL LETTER R WITH CEDILLA +0158; C; 0159; # LATIN CAPITAL LETTER R WITH CARON +015A; C; 015B; # LATIN CAPITAL LETTER S WITH ACUTE +015C; C; 015D; # LATIN CAPITAL LETTER S WITH CIRCUMFLEX +015E; C; 015F; # LATIN CAPITAL LETTER S WITH CEDILLA +0160; C; 0161; # LATIN CAPITAL LETTER S WITH CARON +0162; C; 0163; # LATIN CAPITAL LETTER T WITH CEDILLA +0164; C; 0165; # LATIN CAPITAL LETTER T WITH CARON +0166; C; 0167; # LATIN CAPITAL LETTER T WITH STROKE +0168; C; 0169; # LATIN CAPITAL LETTER U WITH TILDE +016A; C; 016B; # LATIN CAPITAL LETTER U WITH MACRON +016C; C; 016D; # LATIN CAPITAL LETTER U WITH BREVE +016E; C; 016F; # LATIN CAPITAL LETTER U WITH RING ABOVE +0170; C; 0171; # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0172; C; 0173; # LATIN CAPITAL LETTER U WITH OGONEK +0174; C; 0175; # LATIN CAPITAL LETTER W WITH CIRCUMFLEX +0176; C; 0177; # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +0178; C; 00FF; # LATIN CAPITAL LETTER Y WITH DIAERESIS +0179; C; 017A; # LATIN CAPITAL LETTER Z WITH ACUTE +017B; C; 017C; # LATIN CAPITAL LETTER Z WITH DOT ABOVE +017D; C; 017E; # LATIN CAPITAL LETTER Z WITH CARON +017F; C; 0073; # LATIN SMALL LETTER LONG S +0181; C; 0253; # LATIN CAPITAL LETTER B WITH HOOK +0182; C; 0183; # LATIN CAPITAL LETTER B WITH TOPBAR +0184; C; 0185; # LATIN CAPITAL LETTER TONE SIX +0186; C; 0254; # LATIN CAPITAL LETTER OPEN O +0187; C; 0188; # LATIN CAPITAL LETTER C WITH HOOK +0189; C; 0256; # LATIN CAPITAL LETTER AFRICAN D +018A; C; 0257; # LATIN CAPITAL LETTER D WITH HOOK +018B; C; 018C; # LATIN CAPITAL LETTER D WITH TOPBAR +018E; C; 01DD; # LATIN CAPITAL LETTER REVERSED E +018F; C; 0259; # LATIN CAPITAL LETTER SCHWA +0190; C; 025B; # LATIN CAPITAL LETTER OPEN E +0191; C; 0192; # LATIN CAPITAL LETTER F WITH HOOK +0193; C; 0260; # LATIN CAPITAL LETTER G WITH HOOK +0194; C; 0263; # LATIN CAPITAL LETTER GAMMA +0196; C; 0269; # LATIN CAPITAL LETTER IOTA +0197; C; 0268; # LATIN CAPITAL LETTER I WITH STROKE +0198; C; 0199; # LATIN CAPITAL LETTER K WITH HOOK +019C; C; 026F; # LATIN CAPITAL LETTER TURNED M +019D; C; 0272; # LATIN CAPITAL LETTER N WITH LEFT HOOK +019F; C; 0275; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE +01A0; C; 01A1; # LATIN CAPITAL LETTER O WITH HORN +01A2; C; 01A3; # LATIN CAPITAL LETTER OI +01A4; C; 01A5; # LATIN CAPITAL LETTER P WITH HOOK +01A6; C; 0280; # LATIN LETTER YR +01A7; C; 01A8; # LATIN CAPITAL LETTER TONE TWO +01A9; C; 0283; # LATIN CAPITAL LETTER ESH +01AC; C; 01AD; # LATIN CAPITAL LETTER T WITH HOOK +01AE; C; 0288; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK +01AF; C; 01B0; # LATIN CAPITAL LETTER U WITH HORN +01B1; C; 028A; # LATIN CAPITAL LETTER UPSILON +01B2; C; 028B; # LATIN CAPITAL LETTER V WITH HOOK +01B3; C; 01B4; # LATIN CAPITAL LETTER Y WITH HOOK +01B5; C; 01B6; # LATIN CAPITAL LETTER Z WITH STROKE +01B7; C; 0292; # LATIN CAPITAL LETTER EZH +01B8; C; 01B9; # LATIN CAPITAL LETTER EZH REVERSED +01BC; C; 01BD; # LATIN CAPITAL LETTER TONE FIVE +01C4; C; 01C6; # LATIN CAPITAL LETTER DZ WITH CARON +01C5; C; 01C6; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON +01C7; C; 01C9; # LATIN CAPITAL LETTER LJ +01C8; C; 01C9; # LATIN CAPITAL LETTER L WITH SMALL LETTER J +01CA; C; 01CC; # LATIN CAPITAL LETTER NJ +01CB; C; 01CC; # LATIN CAPITAL LETTER N WITH SMALL LETTER J +01CD; C; 01CE; # LATIN CAPITAL LETTER A WITH CARON +01CF; C; 01D0; # LATIN CAPITAL LETTER I WITH CARON +01D1; C; 01D2; # LATIN CAPITAL LETTER O WITH CARON +01D3; C; 01D4; # LATIN CAPITAL LETTER U WITH CARON +01D5; C; 01D6; # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON +01D7; C; 01D8; # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE +01D9; C; 01DA; # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON +01DB; C; 01DC; # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE +01DE; C; 01DF; # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON +01E0; C; 01E1; # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON +01E2; C; 01E3; # LATIN CAPITAL LETTER AE WITH MACRON +01E4; C; 01E5; # LATIN CAPITAL LETTER G WITH STROKE +01E6; C; 01E7; # LATIN CAPITAL LETTER G WITH CARON +01E8; C; 01E9; # LATIN CAPITAL LETTER K WITH CARON +01EA; C; 01EB; # LATIN CAPITAL LETTER O WITH OGONEK +01EC; C; 01ED; # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON +01EE; C; 01EF; # LATIN CAPITAL LETTER EZH WITH CARON +01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON +01F1; C; 01F3; # LATIN CAPITAL LETTER DZ +01F2; C; 01F3; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z +01F4; C; 01F5; # LATIN CAPITAL LETTER G WITH ACUTE +01F6; C; 0195; # LATIN CAPITAL LETTER HWAIR +01F7; C; 01BF; # LATIN CAPITAL LETTER WYNN +01F8; C; 01F9; # LATIN CAPITAL LETTER N WITH GRAVE +01FA; C; 01FB; # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE +01FC; C; 01FD; # LATIN CAPITAL LETTER AE WITH ACUTE +01FE; C; 01FF; # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE +0200; C; 0201; # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE +0202; C; 0203; # LATIN CAPITAL LETTER A WITH INVERTED BREVE +0204; C; 0205; # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE +0206; C; 0207; # LATIN CAPITAL LETTER E WITH INVERTED BREVE +0208; C; 0209; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE +020A; C; 020B; # LATIN CAPITAL LETTER I WITH INVERTED BREVE +020C; C; 020D; # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE +020E; C; 020F; # LATIN CAPITAL LETTER O WITH INVERTED BREVE +0210; C; 0211; # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE +0212; C; 0213; # LATIN CAPITAL LETTER R WITH INVERTED BREVE +0214; C; 0215; # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE +0216; C; 0217; # LATIN CAPITAL LETTER U WITH INVERTED BREVE +0218; C; 0219; # LATIN CAPITAL LETTER S WITH COMMA BELOW +021A; C; 021B; # LATIN CAPITAL LETTER T WITH COMMA BELOW +021C; C; 021D; # LATIN CAPITAL LETTER YOGH +021E; C; 021F; # LATIN CAPITAL LETTER H WITH CARON +0220; C; 019E; # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG +0222; C; 0223; # LATIN CAPITAL LETTER OU +0224; C; 0225; # LATIN CAPITAL LETTER Z WITH HOOK +0226; C; 0227; # LATIN CAPITAL LETTER A WITH DOT ABOVE +0228; C; 0229; # LATIN CAPITAL LETTER E WITH CEDILLA +022A; C; 022B; # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON +022C; C; 022D; # LATIN CAPITAL LETTER O WITH TILDE AND MACRON +022E; C; 022F; # LATIN CAPITAL LETTER O WITH DOT ABOVE +0230; C; 0231; # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON +0232; C; 0233; # LATIN CAPITAL LETTER Y WITH MACRON +023A; C; 2C65; # LATIN CAPITAL LETTER A WITH STROKE +023B; C; 023C; # LATIN CAPITAL LETTER C WITH STROKE +023D; C; 019A; # LATIN CAPITAL LETTER L WITH BAR +023E; C; 2C66; # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE +0241; C; 0242; # LATIN CAPITAL LETTER GLOTTAL STOP +0243; C; 0180; # LATIN CAPITAL LETTER B WITH STROKE +0244; C; 0289; # LATIN CAPITAL LETTER U BAR +0245; C; 028C; # LATIN CAPITAL LETTER TURNED V +0246; C; 0247; # LATIN CAPITAL LETTER E WITH STROKE +0248; C; 0249; # LATIN CAPITAL LETTER J WITH STROKE +024A; C; 024B; # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL +024C; C; 024D; # LATIN CAPITAL LETTER R WITH STROKE +024E; C; 024F; # LATIN CAPITAL LETTER Y WITH STROKE +0345; C; 03B9; # COMBINING GREEK YPOGEGRAMMENI +0370; C; 0371; # GREEK CAPITAL LETTER HETA +0372; C; 0373; # GREEK CAPITAL LETTER ARCHAIC SAMPI +0376; C; 0377; # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA +037F; C; 03F3; # GREEK CAPITAL LETTER YOT +0386; C; 03AC; # GREEK CAPITAL LETTER ALPHA WITH TONOS +0388; C; 03AD; # GREEK CAPITAL LETTER EPSILON WITH TONOS +0389; C; 03AE; # GREEK CAPITAL LETTER ETA WITH TONOS +038A; C; 03AF; # GREEK CAPITAL LETTER IOTA WITH TONOS +038C; C; 03CC; # GREEK CAPITAL LETTER OMICRON WITH TONOS +038E; C; 03CD; # GREEK CAPITAL LETTER UPSILON WITH TONOS +038F; C; 03CE; # GREEK CAPITAL LETTER OMEGA WITH TONOS +0390; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS +0391; C; 03B1; # GREEK CAPITAL LETTER ALPHA +0392; C; 03B2; # GREEK CAPITAL LETTER BETA +0393; C; 03B3; # GREEK CAPITAL LETTER GAMMA +0394; C; 03B4; # GREEK CAPITAL LETTER DELTA +0395; C; 03B5; # GREEK CAPITAL LETTER EPSILON +0396; C; 03B6; # GREEK CAPITAL LETTER ZETA +0397; C; 03B7; # GREEK CAPITAL LETTER ETA +0398; C; 03B8; # GREEK CAPITAL LETTER THETA +0399; C; 03B9; # GREEK CAPITAL LETTER IOTA +039A; C; 03BA; # GREEK CAPITAL LETTER KAPPA +039B; C; 03BB; # GREEK CAPITAL LETTER LAMDA +039C; C; 03BC; # GREEK CAPITAL LETTER MU +039D; C; 03BD; # GREEK CAPITAL LETTER NU +039E; C; 03BE; # GREEK CAPITAL LETTER XI +039F; C; 03BF; # GREEK CAPITAL LETTER OMICRON +03A0; C; 03C0; # GREEK CAPITAL LETTER PI +03A1; C; 03C1; # GREEK CAPITAL LETTER RHO +03A3; C; 03C3; # GREEK CAPITAL LETTER SIGMA +03A4; C; 03C4; # GREEK CAPITAL LETTER TAU +03A5; C; 03C5; # GREEK CAPITAL LETTER UPSILON +03A6; C; 03C6; # GREEK CAPITAL LETTER PHI +03A7; C; 03C7; # GREEK CAPITAL LETTER CHI +03A8; C; 03C8; # GREEK CAPITAL LETTER PSI +03A9; C; 03C9; # GREEK CAPITAL LETTER OMEGA +03AA; C; 03CA; # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA +03AB; C; 03CB; # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA +03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS +03C2; C; 03C3; # GREEK SMALL LETTER FINAL SIGMA +03CF; C; 03D7; # GREEK CAPITAL KAI SYMBOL +03D0; C; 03B2; # GREEK BETA SYMBOL +03D1; C; 03B8; # GREEK THETA SYMBOL +03D5; C; 03C6; # GREEK PHI SYMBOL +03D6; C; 03C0; # GREEK PI SYMBOL +03D8; C; 03D9; # GREEK LETTER ARCHAIC KOPPA +03DA; C; 03DB; # GREEK LETTER STIGMA +03DC; C; 03DD; # GREEK LETTER DIGAMMA +03DE; C; 03DF; # GREEK LETTER KOPPA +03E0; C; 03E1; # GREEK LETTER SAMPI +03E2; C; 03E3; # COPTIC CAPITAL LETTER SHEI +03E4; C; 03E5; # COPTIC CAPITAL LETTER FEI +03E6; C; 03E7; # COPTIC CAPITAL LETTER KHEI +03E8; C; 03E9; # COPTIC CAPITAL LETTER HORI +03EA; C; 03EB; # COPTIC CAPITAL LETTER GANGIA +03EC; C; 03ED; # COPTIC CAPITAL LETTER SHIMA +03EE; C; 03EF; # COPTIC CAPITAL LETTER DEI +03F0; C; 03BA; # GREEK KAPPA SYMBOL +03F1; C; 03C1; # GREEK RHO SYMBOL +03F4; C; 03B8; # GREEK CAPITAL THETA SYMBOL +03F5; C; 03B5; # GREEK LUNATE EPSILON SYMBOL +03F7; C; 03F8; # GREEK CAPITAL LETTER SHO +03F9; C; 03F2; # GREEK CAPITAL LUNATE SIGMA SYMBOL +03FA; C; 03FB; # GREEK CAPITAL LETTER SAN +03FD; C; 037B; # GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL +03FE; C; 037C; # GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL +03FF; C; 037D; # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL +0400; C; 0450; # CYRILLIC CAPITAL LETTER IE WITH GRAVE +0401; C; 0451; # CYRILLIC CAPITAL LETTER IO +0402; C; 0452; # CYRILLIC CAPITAL LETTER DJE +0403; C; 0453; # CYRILLIC CAPITAL LETTER GJE +0404; C; 0454; # CYRILLIC CAPITAL LETTER UKRAINIAN IE +0405; C; 0455; # CYRILLIC CAPITAL LETTER DZE +0406; C; 0456; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I +0407; C; 0457; # CYRILLIC CAPITAL LETTER YI +0408; C; 0458; # CYRILLIC CAPITAL LETTER JE +0409; C; 0459; # CYRILLIC CAPITAL LETTER LJE +040A; C; 045A; # CYRILLIC CAPITAL LETTER NJE +040B; C; 045B; # CYRILLIC CAPITAL LETTER TSHE +040C; C; 045C; # CYRILLIC CAPITAL LETTER KJE +040D; C; 045D; # CYRILLIC CAPITAL LETTER I WITH GRAVE +040E; C; 045E; # CYRILLIC CAPITAL LETTER SHORT U +040F; C; 045F; # CYRILLIC CAPITAL LETTER DZHE +0410; C; 0430; # CYRILLIC CAPITAL LETTER A +0411; C; 0431; # CYRILLIC CAPITAL LETTER BE +0412; C; 0432; # CYRILLIC CAPITAL LETTER VE +0413; C; 0433; # CYRILLIC CAPITAL LETTER GHE +0414; C; 0434; # CYRILLIC CAPITAL LETTER DE +0415; C; 0435; # CYRILLIC CAPITAL LETTER IE +0416; C; 0436; # CYRILLIC CAPITAL LETTER ZHE +0417; C; 0437; # CYRILLIC CAPITAL LETTER ZE +0418; C; 0438; # CYRILLIC CAPITAL LETTER I +0419; C; 0439; # CYRILLIC CAPITAL LETTER SHORT I +041A; C; 043A; # CYRILLIC CAPITAL LETTER KA +041B; C; 043B; # CYRILLIC CAPITAL LETTER EL +041C; C; 043C; # CYRILLIC CAPITAL LETTER EM +041D; C; 043D; # CYRILLIC CAPITAL LETTER EN +041E; C; 043E; # CYRILLIC CAPITAL LETTER O +041F; C; 043F; # CYRILLIC CAPITAL LETTER PE +0420; C; 0440; # CYRILLIC CAPITAL LETTER ER +0421; C; 0441; # CYRILLIC CAPITAL LETTER ES +0422; C; 0442; # CYRILLIC CAPITAL LETTER TE +0423; C; 0443; # CYRILLIC CAPITAL LETTER U +0424; C; 0444; # CYRILLIC CAPITAL LETTER EF +0425; C; 0445; # CYRILLIC CAPITAL LETTER HA +0426; C; 0446; # CYRILLIC CAPITAL LETTER TSE +0427; C; 0447; # CYRILLIC CAPITAL LETTER CHE +0428; C; 0448; # CYRILLIC CAPITAL LETTER SHA +0429; C; 0449; # CYRILLIC CAPITAL LETTER SHCHA +042A; C; 044A; # CYRILLIC CAPITAL LETTER HARD SIGN +042B; C; 044B; # CYRILLIC CAPITAL LETTER YERU +042C; C; 044C; # CYRILLIC CAPITAL LETTER SOFT SIGN +042D; C; 044D; # CYRILLIC CAPITAL LETTER E +042E; C; 044E; # CYRILLIC CAPITAL LETTER YU +042F; C; 044F; # CYRILLIC CAPITAL LETTER YA +0460; C; 0461; # CYRILLIC CAPITAL LETTER OMEGA +0462; C; 0463; # CYRILLIC CAPITAL LETTER YAT +0464; C; 0465; # CYRILLIC CAPITAL LETTER IOTIFIED E +0466; C; 0467; # CYRILLIC CAPITAL LETTER LITTLE YUS +0468; C; 0469; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS +046A; C; 046B; # CYRILLIC CAPITAL LETTER BIG YUS +046C; C; 046D; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS +046E; C; 046F; # CYRILLIC CAPITAL LETTER KSI +0470; C; 0471; # CYRILLIC CAPITAL LETTER PSI +0472; C; 0473; # CYRILLIC CAPITAL LETTER FITA +0474; C; 0475; # CYRILLIC CAPITAL LETTER IZHITSA +0476; C; 0477; # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT +0478; C; 0479; # CYRILLIC CAPITAL LETTER UK +047A; C; 047B; # CYRILLIC CAPITAL LETTER ROUND OMEGA +047C; C; 047D; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO +047E; C; 047F; # CYRILLIC CAPITAL LETTER OT +0480; C; 0481; # CYRILLIC CAPITAL LETTER KOPPA +048A; C; 048B; # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL +048C; C; 048D; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN +048E; C; 048F; # CYRILLIC CAPITAL LETTER ER WITH TICK +0490; C; 0491; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN +0492; C; 0493; # CYRILLIC CAPITAL LETTER GHE WITH STROKE +0494; C; 0495; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK +0496; C; 0497; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER +0498; C; 0499; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER +049A; C; 049B; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER +049C; C; 049D; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE +049E; C; 049F; # CYRILLIC CAPITAL LETTER KA WITH STROKE +04A0; C; 04A1; # CYRILLIC CAPITAL LETTER BASHKIR KA +04A2; C; 04A3; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER +04A4; C; 04A5; # CYRILLIC CAPITAL LIGATURE EN GHE +04A6; C; 04A7; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK +04A8; C; 04A9; # CYRILLIC CAPITAL LETTER ABKHASIAN HA +04AA; C; 04AB; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER +04AC; C; 04AD; # CYRILLIC CAPITAL LETTER TE WITH DESCENDER +04AE; C; 04AF; # CYRILLIC CAPITAL LETTER STRAIGHT U +04B0; C; 04B1; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE +04B2; C; 04B3; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER +04B4; C; 04B5; # CYRILLIC CAPITAL LIGATURE TE TSE +04B6; C; 04B7; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER +04B8; C; 04B9; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE +04BA; C; 04BB; # CYRILLIC CAPITAL LETTER SHHA +04BC; C; 04BD; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE +04BE; C; 04BF; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER +04C0; C; 04CF; # CYRILLIC LETTER PALOCHKA +04C1; C; 04C2; # CYRILLIC CAPITAL LETTER ZHE WITH BREVE +04C3; C; 04C4; # CYRILLIC CAPITAL LETTER KA WITH HOOK +04C5; C; 04C6; # CYRILLIC CAPITAL LETTER EL WITH TAIL +04C7; C; 04C8; # CYRILLIC CAPITAL LETTER EN WITH HOOK +04C9; C; 04CA; # CYRILLIC CAPITAL LETTER EN WITH TAIL +04CB; C; 04CC; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE +04CD; C; 04CE; # CYRILLIC CAPITAL LETTER EM WITH TAIL +04D0; C; 04D1; # CYRILLIC CAPITAL LETTER A WITH BREVE +04D2; C; 04D3; # CYRILLIC CAPITAL LETTER A WITH DIAERESIS +04D4; C; 04D5; # CYRILLIC CAPITAL LIGATURE A IE +04D6; C; 04D7; # CYRILLIC CAPITAL LETTER IE WITH BREVE +04D8; C; 04D9; # CYRILLIC CAPITAL LETTER SCHWA +04DA; C; 04DB; # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS +04DC; C; 04DD; # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS +04DE; C; 04DF; # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS +04E0; C; 04E1; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE +04E2; C; 04E3; # CYRILLIC CAPITAL LETTER I WITH MACRON +04E4; C; 04E5; # CYRILLIC CAPITAL LETTER I WITH DIAERESIS +04E6; C; 04E7; # CYRILLIC CAPITAL LETTER O WITH DIAERESIS +04E8; C; 04E9; # CYRILLIC CAPITAL LETTER BARRED O +04EA; C; 04EB; # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS +04EC; C; 04ED; # CYRILLIC CAPITAL LETTER E WITH DIAERESIS +04EE; C; 04EF; # CYRILLIC CAPITAL LETTER U WITH MACRON +04F0; C; 04F1; # CYRILLIC CAPITAL LETTER U WITH DIAERESIS +04F2; C; 04F3; # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE +04F4; C; 04F5; # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS +04F6; C; 04F7; # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER +04F8; C; 04F9; # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS +04FA; C; 04FB; # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK +04FC; C; 04FD; # CYRILLIC CAPITAL LETTER HA WITH HOOK +04FE; C; 04FF; # CYRILLIC CAPITAL LETTER HA WITH STROKE +0500; C; 0501; # CYRILLIC CAPITAL LETTER KOMI DE +0502; C; 0503; # CYRILLIC CAPITAL LETTER KOMI DJE +0504; C; 0505; # CYRILLIC CAPITAL LETTER KOMI ZJE +0506; C; 0507; # CYRILLIC CAPITAL LETTER KOMI DZJE +0508; C; 0509; # CYRILLIC CAPITAL LETTER KOMI LJE +050A; C; 050B; # CYRILLIC CAPITAL LETTER KOMI NJE +050C; C; 050D; # CYRILLIC CAPITAL LETTER KOMI SJE +050E; C; 050F; # CYRILLIC CAPITAL LETTER KOMI TJE +0510; C; 0511; # CYRILLIC CAPITAL LETTER REVERSED ZE +0512; C; 0513; # CYRILLIC CAPITAL LETTER EL WITH HOOK +0514; C; 0515; # CYRILLIC CAPITAL LETTER LHA +0516; C; 0517; # CYRILLIC CAPITAL LETTER RHA +0518; C; 0519; # CYRILLIC CAPITAL LETTER YAE +051A; C; 051B; # CYRILLIC CAPITAL LETTER QA +051C; C; 051D; # CYRILLIC CAPITAL LETTER WE +051E; C; 051F; # CYRILLIC CAPITAL LETTER ALEUT KA +0520; C; 0521; # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK +0522; C; 0523; # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK +0524; C; 0525; # CYRILLIC CAPITAL LETTER PE WITH DESCENDER +0526; C; 0527; # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER +0528; C; 0529; # CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK +052A; C; 052B; # CYRILLIC CAPITAL LETTER DZZHE +052C; C; 052D; # CYRILLIC CAPITAL LETTER DCHE +052E; C; 052F; # CYRILLIC CAPITAL LETTER EL WITH DESCENDER +0531; C; 0561; # ARMENIAN CAPITAL LETTER AYB +0532; C; 0562; # ARMENIAN CAPITAL LETTER BEN +0533; C; 0563; # ARMENIAN CAPITAL LETTER GIM +0534; C; 0564; # ARMENIAN CAPITAL LETTER DA +0535; C; 0565; # ARMENIAN CAPITAL LETTER ECH +0536; C; 0566; # ARMENIAN CAPITAL LETTER ZA +0537; C; 0567; # ARMENIAN CAPITAL LETTER EH +0538; C; 0568; # ARMENIAN CAPITAL LETTER ET +0539; C; 0569; # ARMENIAN CAPITAL LETTER TO +053A; C; 056A; # ARMENIAN CAPITAL LETTER ZHE +053B; C; 056B; # ARMENIAN CAPITAL LETTER INI +053C; C; 056C; # ARMENIAN CAPITAL LETTER LIWN +053D; C; 056D; # ARMENIAN CAPITAL LETTER XEH +053E; C; 056E; # ARMENIAN CAPITAL LETTER CA +053F; C; 056F; # ARMENIAN CAPITAL LETTER KEN +0540; C; 0570; # ARMENIAN CAPITAL LETTER HO +0541; C; 0571; # ARMENIAN CAPITAL LETTER JA +0542; C; 0572; # ARMENIAN CAPITAL LETTER GHAD +0543; C; 0573; # ARMENIAN CAPITAL LETTER CHEH +0544; C; 0574; # ARMENIAN CAPITAL LETTER MEN +0545; C; 0575; # ARMENIAN CAPITAL LETTER YI +0546; C; 0576; # ARMENIAN CAPITAL LETTER NOW +0547; C; 0577; # ARMENIAN CAPITAL LETTER SHA +0548; C; 0578; # ARMENIAN CAPITAL LETTER VO +0549; C; 0579; # ARMENIAN CAPITAL LETTER CHA +054A; C; 057A; # ARMENIAN CAPITAL LETTER PEH +054B; C; 057B; # ARMENIAN CAPITAL LETTER JHEH +054C; C; 057C; # ARMENIAN CAPITAL LETTER RA +054D; C; 057D; # ARMENIAN CAPITAL LETTER SEH +054E; C; 057E; # ARMENIAN CAPITAL LETTER VEW +054F; C; 057F; # ARMENIAN CAPITAL LETTER TIWN +0550; C; 0580; # ARMENIAN CAPITAL LETTER REH +0551; C; 0581; # ARMENIAN CAPITAL LETTER CO +0552; C; 0582; # ARMENIAN CAPITAL LETTER YIWN +0553; C; 0583; # ARMENIAN CAPITAL LETTER PIWR +0554; C; 0584; # ARMENIAN CAPITAL LETTER KEH +0555; C; 0585; # ARMENIAN CAPITAL LETTER OH +0556; C; 0586; # ARMENIAN CAPITAL LETTER FEH +0587; F; 0565 0582; # ARMENIAN SMALL LIGATURE ECH YIWN +10A0; C; 2D00; # GEORGIAN CAPITAL LETTER AN +10A1; C; 2D01; # GEORGIAN CAPITAL LETTER BAN +10A2; C; 2D02; # GEORGIAN CAPITAL LETTER GAN +10A3; C; 2D03; # GEORGIAN CAPITAL LETTER DON +10A4; C; 2D04; # GEORGIAN CAPITAL LETTER EN +10A5; C; 2D05; # GEORGIAN CAPITAL LETTER VIN +10A6; C; 2D06; # GEORGIAN CAPITAL LETTER ZEN +10A7; C; 2D07; # GEORGIAN CAPITAL LETTER TAN +10A8; C; 2D08; # GEORGIAN CAPITAL LETTER IN +10A9; C; 2D09; # GEORGIAN CAPITAL LETTER KAN +10AA; C; 2D0A; # GEORGIAN CAPITAL LETTER LAS +10AB; C; 2D0B; # GEORGIAN CAPITAL LETTER MAN +10AC; C; 2D0C; # GEORGIAN CAPITAL LETTER NAR +10AD; C; 2D0D; # GEORGIAN CAPITAL LETTER ON +10AE; C; 2D0E; # GEORGIAN CAPITAL LETTER PAR +10AF; C; 2D0F; # GEORGIAN CAPITAL LETTER ZHAR +10B0; C; 2D10; # GEORGIAN CAPITAL LETTER RAE +10B1; C; 2D11; # GEORGIAN CAPITAL LETTER SAN +10B2; C; 2D12; # GEORGIAN CAPITAL LETTER TAR +10B3; C; 2D13; # GEORGIAN CAPITAL LETTER UN +10B4; C; 2D14; # GEORGIAN CAPITAL LETTER PHAR +10B5; C; 2D15; # GEORGIAN CAPITAL LETTER KHAR +10B6; C; 2D16; # GEORGIAN CAPITAL LETTER GHAN +10B7; C; 2D17; # GEORGIAN CAPITAL LETTER QAR +10B8; C; 2D18; # GEORGIAN CAPITAL LETTER SHIN +10B9; C; 2D19; # GEORGIAN CAPITAL LETTER CHIN +10BA; C; 2D1A; # GEORGIAN CAPITAL LETTER CAN +10BB; C; 2D1B; # GEORGIAN CAPITAL LETTER JIL +10BC; C; 2D1C; # GEORGIAN CAPITAL LETTER CIL +10BD; C; 2D1D; # GEORGIAN CAPITAL LETTER CHAR +10BE; C; 2D1E; # GEORGIAN CAPITAL LETTER XAN +10BF; C; 2D1F; # GEORGIAN CAPITAL LETTER JHAN +10C0; C; 2D20; # GEORGIAN CAPITAL LETTER HAE +10C1; C; 2D21; # GEORGIAN CAPITAL LETTER HE +10C2; C; 2D22; # GEORGIAN CAPITAL LETTER HIE +10C3; C; 2D23; # GEORGIAN CAPITAL LETTER WE +10C4; C; 2D24; # GEORGIAN CAPITAL LETTER HAR +10C5; C; 2D25; # GEORGIAN CAPITAL LETTER HOE +10C7; C; 2D27; # GEORGIAN CAPITAL LETTER YN +10CD; C; 2D2D; # GEORGIAN CAPITAL LETTER AEN +13F8; C; 13F0; # CHEROKEE SMALL LETTER YE +13F9; C; 13F1; # CHEROKEE SMALL LETTER YI +13FA; C; 13F2; # CHEROKEE SMALL LETTER YO +13FB; C; 13F3; # CHEROKEE SMALL LETTER YU +13FC; C; 13F4; # CHEROKEE SMALL LETTER YV +13FD; C; 13F5; # CHEROKEE SMALL LETTER MV +1C80; C; 0432; # CYRILLIC SMALL LETTER ROUNDED VE +1C81; C; 0434; # CYRILLIC SMALL LETTER LONG-LEGGED DE +1C82; C; 043E; # CYRILLIC SMALL LETTER NARROW O +1C83; C; 0441; # CYRILLIC SMALL LETTER WIDE ES +1C84; C; 0442; # CYRILLIC SMALL LETTER TALL TE +1C85; C; 0442; # CYRILLIC SMALL LETTER THREE-LEGGED TE +1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN +1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT +1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK +1C90; C; 10D0; # GEORGIAN MTAVRULI CAPITAL LETTER AN +1C91; C; 10D1; # GEORGIAN MTAVRULI CAPITAL LETTER BAN +1C92; C; 10D2; # GEORGIAN MTAVRULI CAPITAL LETTER GAN +1C93; C; 10D3; # GEORGIAN MTAVRULI CAPITAL LETTER DON +1C94; C; 10D4; # GEORGIAN MTAVRULI CAPITAL LETTER EN +1C95; C; 10D5; # GEORGIAN MTAVRULI CAPITAL LETTER VIN +1C96; C; 10D6; # GEORGIAN MTAVRULI CAPITAL LETTER ZEN +1C97; C; 10D7; # GEORGIAN MTAVRULI CAPITAL LETTER TAN +1C98; C; 10D8; # GEORGIAN MTAVRULI CAPITAL LETTER IN +1C99; C; 10D9; # GEORGIAN MTAVRULI CAPITAL LETTER KAN +1C9A; C; 10DA; # GEORGIAN MTAVRULI CAPITAL LETTER LAS +1C9B; C; 10DB; # GEORGIAN MTAVRULI CAPITAL LETTER MAN +1C9C; C; 10DC; # GEORGIAN MTAVRULI CAPITAL LETTER NAR +1C9D; C; 10DD; # GEORGIAN MTAVRULI CAPITAL LETTER ON +1C9E; C; 10DE; # GEORGIAN MTAVRULI CAPITAL LETTER PAR +1C9F; C; 10DF; # GEORGIAN MTAVRULI CAPITAL LETTER ZHAR +1CA0; C; 10E0; # GEORGIAN MTAVRULI CAPITAL LETTER RAE +1CA1; C; 10E1; # GEORGIAN MTAVRULI CAPITAL LETTER SAN +1CA2; C; 10E2; # GEORGIAN MTAVRULI CAPITAL LETTER TAR +1CA3; C; 10E3; # GEORGIAN MTAVRULI CAPITAL LETTER UN +1CA4; C; 10E4; # GEORGIAN MTAVRULI CAPITAL LETTER PHAR +1CA5; C; 10E5; # GEORGIAN MTAVRULI CAPITAL LETTER KHAR +1CA6; C; 10E6; # GEORGIAN MTAVRULI CAPITAL LETTER GHAN +1CA7; C; 10E7; # GEORGIAN MTAVRULI CAPITAL LETTER QAR +1CA8; C; 10E8; # GEORGIAN MTAVRULI CAPITAL LETTER SHIN +1CA9; C; 10E9; # GEORGIAN MTAVRULI CAPITAL LETTER CHIN +1CAA; C; 10EA; # GEORGIAN MTAVRULI CAPITAL LETTER CAN +1CAB; C; 10EB; # GEORGIAN MTAVRULI CAPITAL LETTER JIL +1CAC; C; 10EC; # GEORGIAN MTAVRULI CAPITAL LETTER CIL +1CAD; C; 10ED; # GEORGIAN MTAVRULI CAPITAL LETTER CHAR +1CAE; C; 10EE; # GEORGIAN MTAVRULI CAPITAL LETTER XAN +1CAF; C; 10EF; # GEORGIAN MTAVRULI CAPITAL LETTER JHAN +1CB0; C; 10F0; # GEORGIAN MTAVRULI CAPITAL LETTER HAE +1CB1; C; 10F1; # GEORGIAN MTAVRULI CAPITAL LETTER HE +1CB2; C; 10F2; # GEORGIAN MTAVRULI CAPITAL LETTER HIE +1CB3; C; 10F3; # GEORGIAN MTAVRULI CAPITAL LETTER WE +1CB4; C; 10F4; # GEORGIAN MTAVRULI CAPITAL LETTER HAR +1CB5; C; 10F5; # GEORGIAN MTAVRULI CAPITAL LETTER HOE +1CB6; C; 10F6; # GEORGIAN MTAVRULI CAPITAL LETTER FI +1CB7; C; 10F7; # GEORGIAN MTAVRULI CAPITAL LETTER YN +1CB8; C; 10F8; # GEORGIAN MTAVRULI CAPITAL LETTER ELIFI +1CB9; C; 10F9; # GEORGIAN MTAVRULI CAPITAL LETTER TURNED GAN +1CBA; C; 10FA; # GEORGIAN MTAVRULI CAPITAL LETTER AIN +1CBD; C; 10FD; # GEORGIAN MTAVRULI CAPITAL LETTER AEN +1CBE; C; 10FE; # GEORGIAN MTAVRULI CAPITAL LETTER HARD SIGN +1CBF; C; 10FF; # GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN +1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW +1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE +1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW +1E06; C; 1E07; # LATIN CAPITAL LETTER B WITH LINE BELOW +1E08; C; 1E09; # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE +1E0A; C; 1E0B; # LATIN CAPITAL LETTER D WITH DOT ABOVE +1E0C; C; 1E0D; # LATIN CAPITAL LETTER D WITH DOT BELOW +1E0E; C; 1E0F; # LATIN CAPITAL LETTER D WITH LINE BELOW +1E10; C; 1E11; # LATIN CAPITAL LETTER D WITH CEDILLA +1E12; C; 1E13; # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW +1E14; C; 1E15; # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE +1E16; C; 1E17; # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE +1E18; C; 1E19; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW +1E1A; C; 1E1B; # LATIN CAPITAL LETTER E WITH TILDE BELOW +1E1C; C; 1E1D; # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE +1E1E; C; 1E1F; # LATIN CAPITAL LETTER F WITH DOT ABOVE +1E20; C; 1E21; # LATIN CAPITAL LETTER G WITH MACRON +1E22; C; 1E23; # LATIN CAPITAL LETTER H WITH DOT ABOVE +1E24; C; 1E25; # LATIN CAPITAL LETTER H WITH DOT BELOW +1E26; C; 1E27; # LATIN CAPITAL LETTER H WITH DIAERESIS +1E28; C; 1E29; # LATIN CAPITAL LETTER H WITH CEDILLA +1E2A; C; 1E2B; # LATIN CAPITAL LETTER H WITH BREVE BELOW +1E2C; C; 1E2D; # LATIN CAPITAL LETTER I WITH TILDE BELOW +1E2E; C; 1E2F; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE +1E30; C; 1E31; # LATIN CAPITAL LETTER K WITH ACUTE +1E32; C; 1E33; # LATIN CAPITAL LETTER K WITH DOT BELOW +1E34; C; 1E35; # LATIN CAPITAL LETTER K WITH LINE BELOW +1E36; C; 1E37; # LATIN CAPITAL LETTER L WITH DOT BELOW +1E38; C; 1E39; # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON +1E3A; C; 1E3B; # LATIN CAPITAL LETTER L WITH LINE BELOW +1E3C; C; 1E3D; # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW +1E3E; C; 1E3F; # LATIN CAPITAL LETTER M WITH ACUTE +1E40; C; 1E41; # LATIN CAPITAL LETTER M WITH DOT ABOVE +1E42; C; 1E43; # LATIN CAPITAL LETTER M WITH DOT BELOW +1E44; C; 1E45; # LATIN CAPITAL LETTER N WITH DOT ABOVE +1E46; C; 1E47; # LATIN CAPITAL LETTER N WITH DOT BELOW +1E48; C; 1E49; # LATIN CAPITAL LETTER N WITH LINE BELOW +1E4A; C; 1E4B; # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW +1E4C; C; 1E4D; # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE +1E4E; C; 1E4F; # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS +1E50; C; 1E51; # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE +1E52; C; 1E53; # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE +1E54; C; 1E55; # LATIN CAPITAL LETTER P WITH ACUTE +1E56; C; 1E57; # LATIN CAPITAL LETTER P WITH DOT ABOVE +1E58; C; 1E59; # LATIN CAPITAL LETTER R WITH DOT ABOVE +1E5A; C; 1E5B; # LATIN CAPITAL LETTER R WITH DOT BELOW +1E5C; C; 1E5D; # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON +1E5E; C; 1E5F; # LATIN CAPITAL LETTER R WITH LINE BELOW +1E60; C; 1E61; # LATIN CAPITAL LETTER S WITH DOT ABOVE +1E62; C; 1E63; # LATIN CAPITAL LETTER S WITH DOT BELOW +1E64; C; 1E65; # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE +1E66; C; 1E67; # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE +1E68; C; 1E69; # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE +1E6A; C; 1E6B; # LATIN CAPITAL LETTER T WITH DOT ABOVE +1E6C; C; 1E6D; # LATIN CAPITAL LETTER T WITH DOT BELOW +1E6E; C; 1E6F; # LATIN CAPITAL LETTER T WITH LINE BELOW +1E70; C; 1E71; # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW +1E72; C; 1E73; # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW +1E74; C; 1E75; # LATIN CAPITAL LETTER U WITH TILDE BELOW +1E76; C; 1E77; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW +1E78; C; 1E79; # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE +1E7A; C; 1E7B; # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS +1E7C; C; 1E7D; # LATIN CAPITAL LETTER V WITH TILDE +1E7E; C; 1E7F; # LATIN CAPITAL LETTER V WITH DOT BELOW +1E80; C; 1E81; # LATIN CAPITAL LETTER W WITH GRAVE +1E82; C; 1E83; # LATIN CAPITAL LETTER W WITH ACUTE +1E84; C; 1E85; # LATIN CAPITAL LETTER W WITH DIAERESIS +1E86; C; 1E87; # LATIN CAPITAL LETTER W WITH DOT ABOVE +1E88; C; 1E89; # LATIN CAPITAL LETTER W WITH DOT BELOW +1E8A; C; 1E8B; # LATIN CAPITAL LETTER X WITH DOT ABOVE +1E8C; C; 1E8D; # LATIN CAPITAL LETTER X WITH DIAERESIS +1E8E; C; 1E8F; # LATIN CAPITAL LETTER Y WITH DOT ABOVE +1E90; C; 1E91; # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX +1E92; C; 1E93; # LATIN CAPITAL LETTER Z WITH DOT BELOW +1E94; C; 1E95; # LATIN CAPITAL LETTER Z WITH LINE BELOW +1E96; F; 0068 0331; # LATIN SMALL LETTER H WITH LINE BELOW +1E97; F; 0074 0308; # LATIN SMALL LETTER T WITH DIAERESIS +1E98; F; 0077 030A; # LATIN SMALL LETTER W WITH RING ABOVE +1E99; F; 0079 030A; # LATIN SMALL LETTER Y WITH RING ABOVE +1E9A; F; 0061 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING +1E9B; C; 1E61; # LATIN SMALL LETTER LONG S WITH DOT ABOVE +1E9E; F; 0073 0073; # LATIN CAPITAL LETTER SHARP S +1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S +1EA0; C; 1EA1; # LATIN CAPITAL LETTER A WITH DOT BELOW +1EA2; C; 1EA3; # LATIN CAPITAL LETTER A WITH HOOK ABOVE +1EA4; C; 1EA5; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE +1EA6; C; 1EA7; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE +1EA8; C; 1EA9; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE +1EAA; C; 1EAB; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE +1EAC; C; 1EAD; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW +1EAE; C; 1EAF; # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE +1EB0; C; 1EB1; # LATIN CAPITAL LETTER A WITH BREVE AND GRAVE +1EB2; C; 1EB3; # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE +1EB4; C; 1EB5; # LATIN CAPITAL LETTER A WITH BREVE AND TILDE +1EB6; C; 1EB7; # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW +1EB8; C; 1EB9; # LATIN CAPITAL LETTER E WITH DOT BELOW +1EBA; C; 1EBB; # LATIN CAPITAL LETTER E WITH HOOK ABOVE +1EBC; C; 1EBD; # LATIN CAPITAL LETTER E WITH TILDE +1EBE; C; 1EBF; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE +1EC0; C; 1EC1; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE +1EC2; C; 1EC3; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE +1EC4; C; 1EC5; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE +1EC6; C; 1EC7; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW +1EC8; C; 1EC9; # LATIN CAPITAL LETTER I WITH HOOK ABOVE +1ECA; C; 1ECB; # LATIN CAPITAL LETTER I WITH DOT BELOW +1ECC; C; 1ECD; # LATIN CAPITAL LETTER O WITH DOT BELOW +1ECE; C; 1ECF; # LATIN CAPITAL LETTER O WITH HOOK ABOVE +1ED0; C; 1ED1; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE +1ED2; C; 1ED3; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE +1ED4; C; 1ED5; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE +1ED6; C; 1ED7; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE +1ED8; C; 1ED9; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW +1EDA; C; 1EDB; # LATIN CAPITAL LETTER O WITH HORN AND ACUTE +1EDC; C; 1EDD; # LATIN CAPITAL LETTER O WITH HORN AND GRAVE +1EDE; C; 1EDF; # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE +1EE0; C; 1EE1; # LATIN CAPITAL LETTER O WITH HORN AND TILDE +1EE2; C; 1EE3; # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW +1EE4; C; 1EE5; # LATIN CAPITAL LETTER U WITH DOT BELOW +1EE6; C; 1EE7; # LATIN CAPITAL LETTER U WITH HOOK ABOVE +1EE8; C; 1EE9; # LATIN CAPITAL LETTER U WITH HORN AND ACUTE +1EEA; C; 1EEB; # LATIN CAPITAL LETTER U WITH HORN AND GRAVE +1EEC; C; 1EED; # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE +1EEE; C; 1EEF; # LATIN CAPITAL LETTER U WITH HORN AND TILDE +1EF0; C; 1EF1; # LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW +1EF2; C; 1EF3; # LATIN CAPITAL LETTER Y WITH GRAVE +1EF4; C; 1EF5; # LATIN CAPITAL LETTER Y WITH DOT BELOW +1EF6; C; 1EF7; # LATIN CAPITAL LETTER Y WITH HOOK ABOVE +1EF8; C; 1EF9; # LATIN CAPITAL LETTER Y WITH TILDE +1EFA; C; 1EFB; # LATIN CAPITAL LETTER MIDDLE-WELSH LL +1EFC; C; 1EFD; # LATIN CAPITAL LETTER MIDDLE-WELSH V +1EFE; C; 1EFF; # LATIN CAPITAL LETTER Y WITH LOOP +1F08; C; 1F00; # GREEK CAPITAL LETTER ALPHA WITH PSILI +1F09; C; 1F01; # GREEK CAPITAL LETTER ALPHA WITH DASIA +1F0A; C; 1F02; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA +1F0B; C; 1F03; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA +1F0C; C; 1F04; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA +1F0D; C; 1F05; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA +1F0E; C; 1F06; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI +1F0F; C; 1F07; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI +1F18; C; 1F10; # GREEK CAPITAL LETTER EPSILON WITH PSILI +1F19; C; 1F11; # GREEK CAPITAL LETTER EPSILON WITH DASIA +1F1A; C; 1F12; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA +1F1B; C; 1F13; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA +1F1C; C; 1F14; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA +1F1D; C; 1F15; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA +1F28; C; 1F20; # GREEK CAPITAL LETTER ETA WITH PSILI +1F29; C; 1F21; # GREEK CAPITAL LETTER ETA WITH DASIA +1F2A; C; 1F22; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA +1F2B; C; 1F23; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA +1F2C; C; 1F24; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA +1F2D; C; 1F25; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA +1F2E; C; 1F26; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI +1F2F; C; 1F27; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI +1F38; C; 1F30; # GREEK CAPITAL LETTER IOTA WITH PSILI +1F39; C; 1F31; # GREEK CAPITAL LETTER IOTA WITH DASIA +1F3A; C; 1F32; # GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA +1F3B; C; 1F33; # GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA +1F3C; C; 1F34; # GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA +1F3D; C; 1F35; # GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA +1F3E; C; 1F36; # GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI +1F3F; C; 1F37; # GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI +1F48; C; 1F40; # GREEK CAPITAL LETTER OMICRON WITH PSILI +1F49; C; 1F41; # GREEK CAPITAL LETTER OMICRON WITH DASIA +1F4A; C; 1F42; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA +1F4B; C; 1F43; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA +1F4C; C; 1F44; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA +1F4D; C; 1F45; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA +1F50; F; 03C5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI +1F52; F; 03C5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA +1F54; F; 03C5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA +1F56; F; 03C5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI +1F59; C; 1F51; # GREEK CAPITAL LETTER UPSILON WITH DASIA +1F5B; C; 1F53; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA +1F5D; C; 1F55; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA +1F5F; C; 1F57; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI +1F68; C; 1F60; # GREEK CAPITAL LETTER OMEGA WITH PSILI +1F69; C; 1F61; # GREEK CAPITAL LETTER OMEGA WITH DASIA +1F6A; C; 1F62; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA +1F6B; C; 1F63; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA +1F6C; C; 1F64; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA +1F6D; C; 1F65; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA +1F6E; C; 1F66; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI +1F6F; C; 1F67; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI +1F80; F; 1F00 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI +1F81; F; 1F01 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI +1F82; F; 1F02 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F83; F; 1F03 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F84; F; 1F04 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F85; F; 1F05 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F86; F; 1F06 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F87; F; 1F07 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F88; F; 1F00 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F88; S; 1F80; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI +1F89; F; 1F01 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F89; S; 1F81; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI +1F8A; F; 1F02 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8A; S; 1F82; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F8B; F; 1F03 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8B; S; 1F83; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F8C; F; 1F04 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8C; S; 1F84; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F8D; F; 1F05 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8D; S; 1F85; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F8E; F; 1F06 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8E; S; 1F86; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; F; 1F07 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F90; F; 1F20 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI +1F91; F; 1F21 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI +1F92; F; 1F22 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1F93; F; 1F23 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1F94; F; 1F24 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1F95; F; 1F25 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1F96; F; 1F26 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1F97; F; 1F27 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1F98; F; 1F20 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F98; S; 1F90; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI +1F99; F; 1F21 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F99; S; 1F91; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI +1F9A; F; 1F22 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9A; S; 1F92; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1F9B; F; 1F23 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9B; S; 1F93; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1F9C; F; 1F24 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9C; S; 1F94; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1F9D; F; 1F25 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9D; S; 1F95; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1F9E; F; 1F26 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9E; S; 1F96; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; F; 1F27 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1F9F; S; 1F97; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FA0; F; 1F60 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI +1FA1; F; 1F61 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI +1FA2; F; 1F62 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI +1FA3; F; 1F63 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI +1FA4; F; 1F64 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI +1FA5; F; 1F65 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI +1FA6; F; 1F66 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI +1FA7; F; 1F67 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI +1FA8; F; 1F60 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA8; S; 1FA0; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI +1FA9; F; 1F61 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FA9; S; 1FA1; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI +1FAA; F; 1F62 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAA; S; 1FA2; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI +1FAB; F; 1F63 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAB; S; 1FA3; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI +1FAC; F; 1F64 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAC; S; 1FA4; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI +1FAD; F; 1F65 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAD; S; 1FA5; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI +1FAE; F; 1F66 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAE; S; 1FA6; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; F; 1F67 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FAF; S; 1FA7; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI +1FB2; F; 1F70 03B9; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI +1FB3; F; 03B1 03B9; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI +1FB4; F; 03AC 03B9; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI +1FB6; F; 03B1 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI +1FB7; F; 03B1 0342 03B9; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI +1FB8; C; 1FB0; # GREEK CAPITAL LETTER ALPHA WITH VRACHY +1FB9; C; 1FB1; # GREEK CAPITAL LETTER ALPHA WITH MACRON +1FBA; C; 1F70; # GREEK CAPITAL LETTER ALPHA WITH VARIA +1FBB; C; 1F71; # GREEK CAPITAL LETTER ALPHA WITH OXIA +1FBC; F; 03B1 03B9; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBC; S; 1FB3; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBE; C; 03B9; # GREEK PROSGEGRAMMENI +1FC2; F; 1F74 03B9; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI +1FC3; F; 03B7 03B9; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI +1FC4; F; 03AE 03B9; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI +1FC6; F; 03B7 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI +1FC7; F; 03B7 0342 03B9; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI +1FC8; C; 1F72; # GREEK CAPITAL LETTER EPSILON WITH VARIA +1FC9; C; 1F73; # GREEK CAPITAL LETTER EPSILON WITH OXIA +1FCA; C; 1F74; # GREEK CAPITAL LETTER ETA WITH VARIA +1FCB; C; 1F75; # GREEK CAPITAL LETTER ETA WITH OXIA +1FCC; F; 03B7 03B9; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA +1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI +1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI +1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY +1FD9; C; 1FD1; # GREEK CAPITAL LETTER IOTA WITH MACRON +1FDA; C; 1F76; # GREEK CAPITAL LETTER IOTA WITH VARIA +1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA +1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA +1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA +1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI +1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI +1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI +1FE8; C; 1FE0; # GREEK CAPITAL LETTER UPSILON WITH VRACHY +1FE9; C; 1FE1; # GREEK CAPITAL LETTER UPSILON WITH MACRON +1FEA; C; 1F7A; # GREEK CAPITAL LETTER UPSILON WITH VARIA +1FEB; C; 1F7B; # GREEK CAPITAL LETTER UPSILON WITH OXIA +1FEC; C; 1FE5; # GREEK CAPITAL LETTER RHO WITH DASIA +1FF2; F; 1F7C 03B9; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI +1FF3; F; 03C9 03B9; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI +1FF4; F; 03CE 03B9; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI +1FF6; F; 03C9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI +1FF7; F; 03C9 0342 03B9; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI +1FF8; C; 1F78; # GREEK CAPITAL LETTER OMICRON WITH VARIA +1FF9; C; 1F79; # GREEK CAPITAL LETTER OMICRON WITH OXIA +1FFA; C; 1F7C; # GREEK CAPITAL LETTER OMEGA WITH VARIA +1FFB; C; 1F7D; # GREEK CAPITAL LETTER OMEGA WITH OXIA +1FFC; F; 03C9 03B9; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +1FFC; S; 1FF3; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +2126; C; 03C9; # OHM SIGN +212A; C; 006B; # KELVIN SIGN +212B; C; 00E5; # ANGSTROM SIGN +2132; C; 214E; # TURNED CAPITAL F +2160; C; 2170; # ROMAN NUMERAL ONE +2161; C; 2171; # ROMAN NUMERAL TWO +2162; C; 2172; # ROMAN NUMERAL THREE +2163; C; 2173; # ROMAN NUMERAL FOUR +2164; C; 2174; # ROMAN NUMERAL FIVE +2165; C; 2175; # ROMAN NUMERAL SIX +2166; C; 2176; # ROMAN NUMERAL SEVEN +2167; C; 2177; # ROMAN NUMERAL EIGHT +2168; C; 2178; # ROMAN NUMERAL NINE +2169; C; 2179; # ROMAN NUMERAL TEN +216A; C; 217A; # ROMAN NUMERAL ELEVEN +216B; C; 217B; # ROMAN NUMERAL TWELVE +216C; C; 217C; # ROMAN NUMERAL FIFTY +216D; C; 217D; # ROMAN NUMERAL ONE HUNDRED +216E; C; 217E; # ROMAN NUMERAL FIVE HUNDRED +216F; C; 217F; # ROMAN NUMERAL ONE THOUSAND +2183; C; 2184; # ROMAN NUMERAL REVERSED ONE HUNDRED +24B6; C; 24D0; # CIRCLED LATIN CAPITAL LETTER A +24B7; C; 24D1; # CIRCLED LATIN CAPITAL LETTER B +24B8; C; 24D2; # CIRCLED LATIN CAPITAL LETTER C +24B9; C; 24D3; # CIRCLED LATIN CAPITAL LETTER D +24BA; C; 24D4; # CIRCLED LATIN CAPITAL LETTER E +24BB; C; 24D5; # CIRCLED LATIN CAPITAL LETTER F +24BC; C; 24D6; # CIRCLED LATIN CAPITAL LETTER G +24BD; C; 24D7; # CIRCLED LATIN CAPITAL LETTER H +24BE; C; 24D8; # CIRCLED LATIN CAPITAL LETTER I +24BF; C; 24D9; # CIRCLED LATIN CAPITAL LETTER J +24C0; C; 24DA; # CIRCLED LATIN CAPITAL LETTER K +24C1; C; 24DB; # CIRCLED LATIN CAPITAL LETTER L +24C2; C; 24DC; # CIRCLED LATIN CAPITAL LETTER M +24C3; C; 24DD; # CIRCLED LATIN CAPITAL LETTER N +24C4; C; 24DE; # CIRCLED LATIN CAPITAL LETTER O +24C5; C; 24DF; # CIRCLED LATIN CAPITAL LETTER P +24C6; C; 24E0; # CIRCLED LATIN CAPITAL LETTER Q +24C7; C; 24E1; # CIRCLED LATIN CAPITAL LETTER R +24C8; C; 24E2; # CIRCLED LATIN CAPITAL LETTER S +24C9; C; 24E3; # CIRCLED LATIN CAPITAL LETTER T +24CA; C; 24E4; # CIRCLED LATIN CAPITAL LETTER U +24CB; C; 24E5; # CIRCLED LATIN CAPITAL LETTER V +24CC; C; 24E6; # CIRCLED LATIN CAPITAL LETTER W +24CD; C; 24E7; # CIRCLED LATIN CAPITAL LETTER X +24CE; C; 24E8; # CIRCLED LATIN CAPITAL LETTER Y +24CF; C; 24E9; # CIRCLED LATIN CAPITAL LETTER Z +2C00; C; 2C30; # GLAGOLITIC CAPITAL LETTER AZU +2C01; C; 2C31; # GLAGOLITIC CAPITAL LETTER BUKY +2C02; C; 2C32; # GLAGOLITIC CAPITAL LETTER VEDE +2C03; C; 2C33; # GLAGOLITIC CAPITAL LETTER GLAGOLI +2C04; C; 2C34; # GLAGOLITIC CAPITAL LETTER DOBRO +2C05; C; 2C35; # GLAGOLITIC CAPITAL LETTER YESTU +2C06; C; 2C36; # GLAGOLITIC CAPITAL LETTER ZHIVETE +2C07; C; 2C37; # GLAGOLITIC CAPITAL LETTER DZELO +2C08; C; 2C38; # GLAGOLITIC CAPITAL LETTER ZEMLJA +2C09; C; 2C39; # GLAGOLITIC CAPITAL LETTER IZHE +2C0A; C; 2C3A; # GLAGOLITIC CAPITAL LETTER INITIAL IZHE +2C0B; C; 2C3B; # GLAGOLITIC CAPITAL LETTER I +2C0C; C; 2C3C; # GLAGOLITIC CAPITAL LETTER DJERVI +2C0D; C; 2C3D; # GLAGOLITIC CAPITAL LETTER KAKO +2C0E; C; 2C3E; # GLAGOLITIC CAPITAL LETTER LJUDIJE +2C0F; C; 2C3F; # GLAGOLITIC CAPITAL LETTER MYSLITE +2C10; C; 2C40; # GLAGOLITIC CAPITAL LETTER NASHI +2C11; C; 2C41; # GLAGOLITIC CAPITAL LETTER ONU +2C12; C; 2C42; # GLAGOLITIC CAPITAL LETTER POKOJI +2C13; C; 2C43; # GLAGOLITIC CAPITAL LETTER RITSI +2C14; C; 2C44; # GLAGOLITIC CAPITAL LETTER SLOVO +2C15; C; 2C45; # GLAGOLITIC CAPITAL LETTER TVRIDO +2C16; C; 2C46; # GLAGOLITIC CAPITAL LETTER UKU +2C17; C; 2C47; # GLAGOLITIC CAPITAL LETTER FRITU +2C18; C; 2C48; # GLAGOLITIC CAPITAL LETTER HERU +2C19; C; 2C49; # GLAGOLITIC CAPITAL LETTER OTU +2C1A; C; 2C4A; # GLAGOLITIC CAPITAL LETTER PE +2C1B; C; 2C4B; # GLAGOLITIC CAPITAL LETTER SHTA +2C1C; C; 2C4C; # GLAGOLITIC CAPITAL LETTER TSI +2C1D; C; 2C4D; # GLAGOLITIC CAPITAL LETTER CHRIVI +2C1E; C; 2C4E; # GLAGOLITIC CAPITAL LETTER SHA +2C1F; C; 2C4F; # GLAGOLITIC CAPITAL LETTER YERU +2C20; C; 2C50; # GLAGOLITIC CAPITAL LETTER YERI +2C21; C; 2C51; # GLAGOLITIC CAPITAL LETTER YATI +2C22; C; 2C52; # GLAGOLITIC CAPITAL LETTER SPIDERY HA +2C23; C; 2C53; # GLAGOLITIC CAPITAL LETTER YU +2C24; C; 2C54; # GLAGOLITIC CAPITAL LETTER SMALL YUS +2C25; C; 2C55; # GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL +2C26; C; 2C56; # GLAGOLITIC CAPITAL LETTER YO +2C27; C; 2C57; # GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS +2C28; C; 2C58; # GLAGOLITIC CAPITAL LETTER BIG YUS +2C29; C; 2C59; # GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS +2C2A; C; 2C5A; # GLAGOLITIC CAPITAL LETTER FITA +2C2B; C; 2C5B; # GLAGOLITIC CAPITAL LETTER IZHITSA +2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC +2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A +2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE +2C2F; C; 2C5F; # GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI +2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR +2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE +2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE +2C64; C; 027D; # LATIN CAPITAL LETTER R WITH TAIL +2C67; C; 2C68; # LATIN CAPITAL LETTER H WITH DESCENDER +2C69; C; 2C6A; # LATIN CAPITAL LETTER K WITH DESCENDER +2C6B; C; 2C6C; # LATIN CAPITAL LETTER Z WITH DESCENDER +2C6D; C; 0251; # LATIN CAPITAL LETTER ALPHA +2C6E; C; 0271; # LATIN CAPITAL LETTER M WITH HOOK +2C6F; C; 0250; # LATIN CAPITAL LETTER TURNED A +2C70; C; 0252; # LATIN CAPITAL LETTER TURNED ALPHA +2C72; C; 2C73; # LATIN CAPITAL LETTER W WITH HOOK +2C75; C; 2C76; # LATIN CAPITAL LETTER HALF H +2C7E; C; 023F; # LATIN CAPITAL LETTER S WITH SWASH TAIL +2C7F; C; 0240; # LATIN CAPITAL LETTER Z WITH SWASH TAIL +2C80; C; 2C81; # COPTIC CAPITAL LETTER ALFA +2C82; C; 2C83; # COPTIC CAPITAL LETTER VIDA +2C84; C; 2C85; # COPTIC CAPITAL LETTER GAMMA +2C86; C; 2C87; # COPTIC CAPITAL LETTER DALDA +2C88; C; 2C89; # COPTIC CAPITAL LETTER EIE +2C8A; C; 2C8B; # COPTIC CAPITAL LETTER SOU +2C8C; C; 2C8D; # COPTIC CAPITAL LETTER ZATA +2C8E; C; 2C8F; # COPTIC CAPITAL LETTER HATE +2C90; C; 2C91; # COPTIC CAPITAL LETTER THETHE +2C92; C; 2C93; # COPTIC CAPITAL LETTER IAUDA +2C94; C; 2C95; # COPTIC CAPITAL LETTER KAPA +2C96; C; 2C97; # COPTIC CAPITAL LETTER LAULA +2C98; C; 2C99; # COPTIC CAPITAL LETTER MI +2C9A; C; 2C9B; # COPTIC CAPITAL LETTER NI +2C9C; C; 2C9D; # COPTIC CAPITAL LETTER KSI +2C9E; C; 2C9F; # COPTIC CAPITAL LETTER O +2CA0; C; 2CA1; # COPTIC CAPITAL LETTER PI +2CA2; C; 2CA3; # COPTIC CAPITAL LETTER RO +2CA4; C; 2CA5; # COPTIC CAPITAL LETTER SIMA +2CA6; C; 2CA7; # COPTIC CAPITAL LETTER TAU +2CA8; C; 2CA9; # COPTIC CAPITAL LETTER UA +2CAA; C; 2CAB; # COPTIC CAPITAL LETTER FI +2CAC; C; 2CAD; # COPTIC CAPITAL LETTER KHI +2CAE; C; 2CAF; # COPTIC CAPITAL LETTER PSI +2CB0; C; 2CB1; # COPTIC CAPITAL LETTER OOU +2CB2; C; 2CB3; # COPTIC CAPITAL LETTER DIALECT-P ALEF +2CB4; C; 2CB5; # COPTIC CAPITAL LETTER OLD COPTIC AIN +2CB6; C; 2CB7; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE +2CB8; C; 2CB9; # COPTIC CAPITAL LETTER DIALECT-P KAPA +2CBA; C; 2CBB; # COPTIC CAPITAL LETTER DIALECT-P NI +2CBC; C; 2CBD; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI +2CBE; C; 2CBF; # COPTIC CAPITAL LETTER OLD COPTIC OOU +2CC0; C; 2CC1; # COPTIC CAPITAL LETTER SAMPI +2CC2; C; 2CC3; # COPTIC CAPITAL LETTER CROSSED SHEI +2CC4; C; 2CC5; # COPTIC CAPITAL LETTER OLD COPTIC SHEI +2CC6; C; 2CC7; # COPTIC CAPITAL LETTER OLD COPTIC ESH +2CC8; C; 2CC9; # COPTIC CAPITAL LETTER AKHMIMIC KHEI +2CCA; C; 2CCB; # COPTIC CAPITAL LETTER DIALECT-P HORI +2CCC; C; 2CCD; # COPTIC CAPITAL LETTER OLD COPTIC HORI +2CCE; C; 2CCF; # COPTIC CAPITAL LETTER OLD COPTIC HA +2CD0; C; 2CD1; # COPTIC CAPITAL LETTER L-SHAPED HA +2CD2; C; 2CD3; # COPTIC CAPITAL LETTER OLD COPTIC HEI +2CD4; C; 2CD5; # COPTIC CAPITAL LETTER OLD COPTIC HAT +2CD6; C; 2CD7; # COPTIC CAPITAL LETTER OLD COPTIC GANGIA +2CD8; C; 2CD9; # COPTIC CAPITAL LETTER OLD COPTIC DJA +2CDA; C; 2CDB; # COPTIC CAPITAL LETTER OLD COPTIC SHIMA +2CDC; C; 2CDD; # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA +2CDE; C; 2CDF; # COPTIC CAPITAL LETTER OLD NUBIAN NGI +2CE0; C; 2CE1; # COPTIC CAPITAL LETTER OLD NUBIAN NYI +2CE2; C; 2CE3; # COPTIC CAPITAL LETTER OLD NUBIAN WAU +2CEB; C; 2CEC; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI +2CED; C; 2CEE; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA +2CF2; C; 2CF3; # COPTIC CAPITAL LETTER BOHAIRIC KHEI +A640; C; A641; # CYRILLIC CAPITAL LETTER ZEMLYA +A642; C; A643; # CYRILLIC CAPITAL LETTER DZELO +A644; C; A645; # CYRILLIC CAPITAL LETTER REVERSED DZE +A646; C; A647; # CYRILLIC CAPITAL LETTER IOTA +A648; C; A649; # CYRILLIC CAPITAL LETTER DJERV +A64A; C; A64B; # CYRILLIC CAPITAL LETTER MONOGRAPH UK +A64C; C; A64D; # CYRILLIC CAPITAL LETTER BROAD OMEGA +A64E; C; A64F; # CYRILLIC CAPITAL LETTER NEUTRAL YER +A650; C; A651; # CYRILLIC CAPITAL LETTER YERU WITH BACK YER +A652; C; A653; # CYRILLIC CAPITAL LETTER IOTIFIED YAT +A654; C; A655; # CYRILLIC CAPITAL LETTER REVERSED YU +A656; C; A657; # CYRILLIC CAPITAL LETTER IOTIFIED A +A658; C; A659; # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS +A65A; C; A65B; # CYRILLIC CAPITAL LETTER BLENDED YUS +A65C; C; A65D; # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS +A65E; C; A65F; # CYRILLIC CAPITAL LETTER YN +A660; C; A661; # CYRILLIC CAPITAL LETTER REVERSED TSE +A662; C; A663; # CYRILLIC CAPITAL LETTER SOFT DE +A664; C; A665; # CYRILLIC CAPITAL LETTER SOFT EL +A666; C; A667; # CYRILLIC CAPITAL LETTER SOFT EM +A668; C; A669; # CYRILLIC CAPITAL LETTER MONOCULAR O +A66A; C; A66B; # CYRILLIC CAPITAL LETTER BINOCULAR O +A66C; C; A66D; # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O +A680; C; A681; # CYRILLIC CAPITAL LETTER DWE +A682; C; A683; # CYRILLIC CAPITAL LETTER DZWE +A684; C; A685; # CYRILLIC CAPITAL LETTER ZHWE +A686; C; A687; # CYRILLIC CAPITAL LETTER CCHE +A688; C; A689; # CYRILLIC CAPITAL LETTER DZZE +A68A; C; A68B; # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK +A68C; C; A68D; # CYRILLIC CAPITAL LETTER TWE +A68E; C; A68F; # CYRILLIC CAPITAL LETTER TSWE +A690; C; A691; # CYRILLIC CAPITAL LETTER TSSE +A692; C; A693; # CYRILLIC CAPITAL LETTER TCHE +A694; C; A695; # CYRILLIC CAPITAL LETTER HWE +A696; C; A697; # CYRILLIC CAPITAL LETTER SHWE +A698; C; A699; # CYRILLIC CAPITAL LETTER DOUBLE O +A69A; C; A69B; # CYRILLIC CAPITAL LETTER CROSSED O +A722; C; A723; # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF +A724; C; A725; # LATIN CAPITAL LETTER EGYPTOLOGICAL AIN +A726; C; A727; # LATIN CAPITAL LETTER HENG +A728; C; A729; # LATIN CAPITAL LETTER TZ +A72A; C; A72B; # LATIN CAPITAL LETTER TRESILLO +A72C; C; A72D; # LATIN CAPITAL LETTER CUATRILLO +A72E; C; A72F; # LATIN CAPITAL LETTER CUATRILLO WITH COMMA +A732; C; A733; # LATIN CAPITAL LETTER AA +A734; C; A735; # LATIN CAPITAL LETTER AO +A736; C; A737; # LATIN CAPITAL LETTER AU +A738; C; A739; # LATIN CAPITAL LETTER AV +A73A; C; A73B; # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR +A73C; C; A73D; # LATIN CAPITAL LETTER AY +A73E; C; A73F; # LATIN CAPITAL LETTER REVERSED C WITH DOT +A740; C; A741; # LATIN CAPITAL LETTER K WITH STROKE +A742; C; A743; # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE +A744; C; A745; # LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE +A746; C; A747; # LATIN CAPITAL LETTER BROKEN L +A748; C; A749; # LATIN CAPITAL LETTER L WITH HIGH STROKE +A74A; C; A74B; # LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY +A74C; C; A74D; # LATIN CAPITAL LETTER O WITH LOOP +A74E; C; A74F; # LATIN CAPITAL LETTER OO +A750; C; A751; # LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER +A752; C; A753; # LATIN CAPITAL LETTER P WITH FLOURISH +A754; C; A755; # LATIN CAPITAL LETTER P WITH SQUIRREL TAIL +A756; C; A757; # LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER +A758; C; A759; # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE +A75A; C; A75B; # LATIN CAPITAL LETTER R ROTUNDA +A75C; C; A75D; # LATIN CAPITAL LETTER RUM ROTUNDA +A75E; C; A75F; # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE +A760; C; A761; # LATIN CAPITAL LETTER VY +A762; C; A763; # LATIN CAPITAL LETTER VISIGOTHIC Z +A764; C; A765; # LATIN CAPITAL LETTER THORN WITH STROKE +A766; C; A767; # LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER +A768; C; A769; # LATIN CAPITAL LETTER VEND +A76A; C; A76B; # LATIN CAPITAL LETTER ET +A76C; C; A76D; # LATIN CAPITAL LETTER IS +A76E; C; A76F; # LATIN CAPITAL LETTER CON +A779; C; A77A; # LATIN CAPITAL LETTER INSULAR D +A77B; C; A77C; # LATIN CAPITAL LETTER INSULAR F +A77D; C; 1D79; # LATIN CAPITAL LETTER INSULAR G +A77E; C; A77F; # LATIN CAPITAL LETTER TURNED INSULAR G +A780; C; A781; # LATIN CAPITAL LETTER TURNED L +A782; C; A783; # LATIN CAPITAL LETTER INSULAR R +A784; C; A785; # LATIN CAPITAL LETTER INSULAR S +A786; C; A787; # LATIN CAPITAL LETTER INSULAR T +A78B; C; A78C; # LATIN CAPITAL LETTER SALTILLO +A78D; C; 0265; # LATIN CAPITAL LETTER TURNED H +A790; C; A791; # LATIN CAPITAL LETTER N WITH DESCENDER +A792; C; A793; # LATIN CAPITAL LETTER C WITH BAR +A796; C; A797; # LATIN CAPITAL LETTER B WITH FLOURISH +A798; C; A799; # LATIN CAPITAL LETTER F WITH STROKE +A79A; C; A79B; # LATIN CAPITAL LETTER VOLAPUK AE +A79C; C; A79D; # LATIN CAPITAL LETTER VOLAPUK OE +A79E; C; A79F; # LATIN CAPITAL LETTER VOLAPUK UE +A7A0; C; A7A1; # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE +A7A2; C; A7A3; # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE +A7A4; C; A7A5; # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE +A7A6; C; A7A7; # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE +A7A8; C; A7A9; # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE +A7AA; C; 0266; # LATIN CAPITAL LETTER H WITH HOOK +A7AB; C; 025C; # LATIN CAPITAL LETTER REVERSED OPEN E +A7AC; C; 0261; # LATIN CAPITAL LETTER SCRIPT G +A7AD; C; 026C; # LATIN CAPITAL LETTER L WITH BELT +A7AE; C; 026A; # LATIN CAPITAL LETTER SMALL CAPITAL I +A7B0; C; 029E; # LATIN CAPITAL LETTER TURNED K +A7B1; C; 0287; # LATIN CAPITAL LETTER TURNED T +A7B2; C; 029D; # LATIN CAPITAL LETTER J WITH CROSSED-TAIL +A7B3; C; AB53; # LATIN CAPITAL LETTER CHI +A7B4; C; A7B5; # LATIN CAPITAL LETTER BETA +A7B6; C; A7B7; # LATIN CAPITAL LETTER OMEGA +A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE +A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A +A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I +A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U +A7C0; C; A7C1; # LATIN CAPITAL LETTER OLD POLISH O +A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W +A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK +A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK +A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK +A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY +A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY +A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G +A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S +A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S +A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H +AB70; C; 13A0; # CHEROKEE SMALL LETTER A +AB71; C; 13A1; # CHEROKEE SMALL LETTER E +AB72; C; 13A2; # CHEROKEE SMALL LETTER I +AB73; C; 13A3; # CHEROKEE SMALL LETTER O +AB74; C; 13A4; # CHEROKEE SMALL LETTER U +AB75; C; 13A5; # CHEROKEE SMALL LETTER V +AB76; C; 13A6; # CHEROKEE SMALL LETTER GA +AB77; C; 13A7; # CHEROKEE SMALL LETTER KA +AB78; C; 13A8; # CHEROKEE SMALL LETTER GE +AB79; C; 13A9; # CHEROKEE SMALL LETTER GI +AB7A; C; 13AA; # CHEROKEE SMALL LETTER GO +AB7B; C; 13AB; # CHEROKEE SMALL LETTER GU +AB7C; C; 13AC; # CHEROKEE SMALL LETTER GV +AB7D; C; 13AD; # CHEROKEE SMALL LETTER HA +AB7E; C; 13AE; # CHEROKEE SMALL LETTER HE +AB7F; C; 13AF; # CHEROKEE SMALL LETTER HI +AB80; C; 13B0; # CHEROKEE SMALL LETTER HO +AB81; C; 13B1; # CHEROKEE SMALL LETTER HU +AB82; C; 13B2; # CHEROKEE SMALL LETTER HV +AB83; C; 13B3; # CHEROKEE SMALL LETTER LA +AB84; C; 13B4; # CHEROKEE SMALL LETTER LE +AB85; C; 13B5; # CHEROKEE SMALL LETTER LI +AB86; C; 13B6; # CHEROKEE SMALL LETTER LO +AB87; C; 13B7; # CHEROKEE SMALL LETTER LU +AB88; C; 13B8; # CHEROKEE SMALL LETTER LV +AB89; C; 13B9; # CHEROKEE SMALL LETTER MA +AB8A; C; 13BA; # CHEROKEE SMALL LETTER ME +AB8B; C; 13BB; # CHEROKEE SMALL LETTER MI +AB8C; C; 13BC; # CHEROKEE SMALL LETTER MO +AB8D; C; 13BD; # CHEROKEE SMALL LETTER MU +AB8E; C; 13BE; # CHEROKEE SMALL LETTER NA +AB8F; C; 13BF; # CHEROKEE SMALL LETTER HNA +AB90; C; 13C0; # CHEROKEE SMALL LETTER NAH +AB91; C; 13C1; # CHEROKEE SMALL LETTER NE +AB92; C; 13C2; # CHEROKEE SMALL LETTER NI +AB93; C; 13C3; # CHEROKEE SMALL LETTER NO +AB94; C; 13C4; # CHEROKEE SMALL LETTER NU +AB95; C; 13C5; # CHEROKEE SMALL LETTER NV +AB96; C; 13C6; # CHEROKEE SMALL LETTER QUA +AB97; C; 13C7; # CHEROKEE SMALL LETTER QUE +AB98; C; 13C8; # CHEROKEE SMALL LETTER QUI +AB99; C; 13C9; # CHEROKEE SMALL LETTER QUO +AB9A; C; 13CA; # CHEROKEE SMALL LETTER QUU +AB9B; C; 13CB; # CHEROKEE SMALL LETTER QUV +AB9C; C; 13CC; # CHEROKEE SMALL LETTER SA +AB9D; C; 13CD; # CHEROKEE SMALL LETTER S +AB9E; C; 13CE; # CHEROKEE SMALL LETTER SE +AB9F; C; 13CF; # CHEROKEE SMALL LETTER SI +ABA0; C; 13D0; # CHEROKEE SMALL LETTER SO +ABA1; C; 13D1; # CHEROKEE SMALL LETTER SU +ABA2; C; 13D2; # CHEROKEE SMALL LETTER SV +ABA3; C; 13D3; # CHEROKEE SMALL LETTER DA +ABA4; C; 13D4; # CHEROKEE SMALL LETTER TA +ABA5; C; 13D5; # CHEROKEE SMALL LETTER DE +ABA6; C; 13D6; # CHEROKEE SMALL LETTER TE +ABA7; C; 13D7; # CHEROKEE SMALL LETTER DI +ABA8; C; 13D8; # CHEROKEE SMALL LETTER TI +ABA9; C; 13D9; # CHEROKEE SMALL LETTER DO +ABAA; C; 13DA; # CHEROKEE SMALL LETTER DU +ABAB; C; 13DB; # CHEROKEE SMALL LETTER DV +ABAC; C; 13DC; # CHEROKEE SMALL LETTER DLA +ABAD; C; 13DD; # CHEROKEE SMALL LETTER TLA +ABAE; C; 13DE; # CHEROKEE SMALL LETTER TLE +ABAF; C; 13DF; # CHEROKEE SMALL LETTER TLI +ABB0; C; 13E0; # CHEROKEE SMALL LETTER TLO +ABB1; C; 13E1; # CHEROKEE SMALL LETTER TLU +ABB2; C; 13E2; # CHEROKEE SMALL LETTER TLV +ABB3; C; 13E3; # CHEROKEE SMALL LETTER TSA +ABB4; C; 13E4; # CHEROKEE SMALL LETTER TSE +ABB5; C; 13E5; # CHEROKEE SMALL LETTER TSI +ABB6; C; 13E6; # CHEROKEE SMALL LETTER TSO +ABB7; C; 13E7; # CHEROKEE SMALL LETTER TSU +ABB8; C; 13E8; # CHEROKEE SMALL LETTER TSV +ABB9; C; 13E9; # CHEROKEE SMALL LETTER WA +ABBA; C; 13EA; # CHEROKEE SMALL LETTER WE +ABBB; C; 13EB; # CHEROKEE SMALL LETTER WI +ABBC; C; 13EC; # CHEROKEE SMALL LETTER WO +ABBD; C; 13ED; # CHEROKEE SMALL LETTER WU +ABBE; C; 13EE; # CHEROKEE SMALL LETTER WV +ABBF; C; 13EF; # CHEROKEE SMALL LETTER YA +FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF +FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI +FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL +FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI +FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL +FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T +FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST +FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW +FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH +FB15; F; 0574 056B; # ARMENIAN SMALL LIGATURE MEN INI +FB16; F; 057E 0576; # ARMENIAN SMALL LIGATURE VEW NOW +FB17; F; 0574 056D; # ARMENIAN SMALL LIGATURE MEN XEH +FF21; C; FF41; # FULLWIDTH LATIN CAPITAL LETTER A +FF22; C; FF42; # FULLWIDTH LATIN CAPITAL LETTER B +FF23; C; FF43; # FULLWIDTH LATIN CAPITAL LETTER C +FF24; C; FF44; # FULLWIDTH LATIN CAPITAL LETTER D +FF25; C; FF45; # FULLWIDTH LATIN CAPITAL LETTER E +FF26; C; FF46; # FULLWIDTH LATIN CAPITAL LETTER F +FF27; C; FF47; # FULLWIDTH LATIN CAPITAL LETTER G +FF28; C; FF48; # FULLWIDTH LATIN CAPITAL LETTER H +FF29; C; FF49; # FULLWIDTH LATIN CAPITAL LETTER I +FF2A; C; FF4A; # FULLWIDTH LATIN CAPITAL LETTER J +FF2B; C; FF4B; # FULLWIDTH LATIN CAPITAL LETTER K +FF2C; C; FF4C; # FULLWIDTH LATIN CAPITAL LETTER L +FF2D; C; FF4D; # FULLWIDTH LATIN CAPITAL LETTER M +FF2E; C; FF4E; # FULLWIDTH LATIN CAPITAL LETTER N +FF2F; C; FF4F; # FULLWIDTH LATIN CAPITAL LETTER O +FF30; C; FF50; # FULLWIDTH LATIN CAPITAL LETTER P +FF31; C; FF51; # FULLWIDTH LATIN CAPITAL LETTER Q +FF32; C; FF52; # FULLWIDTH LATIN CAPITAL LETTER R +FF33; C; FF53; # FULLWIDTH LATIN CAPITAL LETTER S +FF34; C; FF54; # FULLWIDTH LATIN CAPITAL LETTER T +FF35; C; FF55; # FULLWIDTH LATIN CAPITAL LETTER U +FF36; C; FF56; # FULLWIDTH LATIN CAPITAL LETTER V +FF37; C; FF57; # FULLWIDTH LATIN CAPITAL LETTER W +FF38; C; FF58; # FULLWIDTH LATIN CAPITAL LETTER X +FF39; C; FF59; # FULLWIDTH LATIN CAPITAL LETTER Y +FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z +10400; C; 10428; # DESERET CAPITAL LETTER LONG I +10401; C; 10429; # DESERET CAPITAL LETTER LONG E +10402; C; 1042A; # DESERET CAPITAL LETTER LONG A +10403; C; 1042B; # DESERET CAPITAL LETTER LONG AH +10404; C; 1042C; # DESERET CAPITAL LETTER LONG O +10405; C; 1042D; # DESERET CAPITAL LETTER LONG OO +10406; C; 1042E; # DESERET CAPITAL LETTER SHORT I +10407; C; 1042F; # DESERET CAPITAL LETTER SHORT E +10408; C; 10430; # DESERET CAPITAL LETTER SHORT A +10409; C; 10431; # DESERET CAPITAL LETTER SHORT AH +1040A; C; 10432; # DESERET CAPITAL LETTER SHORT O +1040B; C; 10433; # DESERET CAPITAL LETTER SHORT OO +1040C; C; 10434; # DESERET CAPITAL LETTER AY +1040D; C; 10435; # DESERET CAPITAL LETTER OW +1040E; C; 10436; # DESERET CAPITAL LETTER WU +1040F; C; 10437; # DESERET CAPITAL LETTER YEE +10410; C; 10438; # DESERET CAPITAL LETTER H +10411; C; 10439; # DESERET CAPITAL LETTER PEE +10412; C; 1043A; # DESERET CAPITAL LETTER BEE +10413; C; 1043B; # DESERET CAPITAL LETTER TEE +10414; C; 1043C; # DESERET CAPITAL LETTER DEE +10415; C; 1043D; # DESERET CAPITAL LETTER CHEE +10416; C; 1043E; # DESERET CAPITAL LETTER JEE +10417; C; 1043F; # DESERET CAPITAL LETTER KAY +10418; C; 10440; # DESERET CAPITAL LETTER GAY +10419; C; 10441; # DESERET CAPITAL LETTER EF +1041A; C; 10442; # DESERET CAPITAL LETTER VEE +1041B; C; 10443; # DESERET CAPITAL LETTER ETH +1041C; C; 10444; # DESERET CAPITAL LETTER THEE +1041D; C; 10445; # DESERET CAPITAL LETTER ES +1041E; C; 10446; # DESERET CAPITAL LETTER ZEE +1041F; C; 10447; # DESERET CAPITAL LETTER ESH +10420; C; 10448; # DESERET CAPITAL LETTER ZHEE +10421; C; 10449; # DESERET CAPITAL LETTER ER +10422; C; 1044A; # DESERET CAPITAL LETTER EL +10423; C; 1044B; # DESERET CAPITAL LETTER EM +10424; C; 1044C; # DESERET CAPITAL LETTER EN +10425; C; 1044D; # DESERET CAPITAL LETTER ENG +10426; C; 1044E; # DESERET CAPITAL LETTER OI +10427; C; 1044F; # DESERET CAPITAL LETTER EW +104B0; C; 104D8; # OSAGE CAPITAL LETTER A +104B1; C; 104D9; # OSAGE CAPITAL LETTER AI +104B2; C; 104DA; # OSAGE CAPITAL LETTER AIN +104B3; C; 104DB; # OSAGE CAPITAL LETTER AH +104B4; C; 104DC; # OSAGE CAPITAL LETTER BRA +104B5; C; 104DD; # OSAGE CAPITAL LETTER CHA +104B6; C; 104DE; # OSAGE CAPITAL LETTER EHCHA +104B7; C; 104DF; # OSAGE CAPITAL LETTER E +104B8; C; 104E0; # OSAGE CAPITAL LETTER EIN +104B9; C; 104E1; # OSAGE CAPITAL LETTER HA +104BA; C; 104E2; # OSAGE CAPITAL LETTER HYA +104BB; C; 104E3; # OSAGE CAPITAL LETTER I +104BC; C; 104E4; # OSAGE CAPITAL LETTER KA +104BD; C; 104E5; # OSAGE CAPITAL LETTER EHKA +104BE; C; 104E6; # OSAGE CAPITAL LETTER KYA +104BF; C; 104E7; # OSAGE CAPITAL LETTER LA +104C0; C; 104E8; # OSAGE CAPITAL LETTER MA +104C1; C; 104E9; # OSAGE CAPITAL LETTER NA +104C2; C; 104EA; # OSAGE CAPITAL LETTER O +104C3; C; 104EB; # OSAGE CAPITAL LETTER OIN +104C4; C; 104EC; # OSAGE CAPITAL LETTER PA +104C5; C; 104ED; # OSAGE CAPITAL LETTER EHPA +104C6; C; 104EE; # OSAGE CAPITAL LETTER SA +104C7; C; 104EF; # OSAGE CAPITAL LETTER SHA +104C8; C; 104F0; # OSAGE CAPITAL LETTER TA +104C9; C; 104F1; # OSAGE CAPITAL LETTER EHTA +104CA; C; 104F2; # OSAGE CAPITAL LETTER TSA +104CB; C; 104F3; # OSAGE CAPITAL LETTER EHTSA +104CC; C; 104F4; # OSAGE CAPITAL LETTER TSHA +104CD; C; 104F5; # OSAGE CAPITAL LETTER DHA +104CE; C; 104F6; # OSAGE CAPITAL LETTER U +104CF; C; 104F7; # OSAGE CAPITAL LETTER WA +104D0; C; 104F8; # OSAGE CAPITAL LETTER KHA +104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA +104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA +104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA +10570; C; 10597; # VITHKUQI CAPITAL LETTER A +10571; C; 10598; # VITHKUQI CAPITAL LETTER BBE +10572; C; 10599; # VITHKUQI CAPITAL LETTER BE +10573; C; 1059A; # VITHKUQI CAPITAL LETTER CE +10574; C; 1059B; # VITHKUQI CAPITAL LETTER CHE +10575; C; 1059C; # VITHKUQI CAPITAL LETTER DE +10576; C; 1059D; # VITHKUQI CAPITAL LETTER DHE +10577; C; 1059E; # VITHKUQI CAPITAL LETTER EI +10578; C; 1059F; # VITHKUQI CAPITAL LETTER E +10579; C; 105A0; # VITHKUQI CAPITAL LETTER FE +1057A; C; 105A1; # VITHKUQI CAPITAL LETTER GA +1057C; C; 105A3; # VITHKUQI CAPITAL LETTER HA +1057D; C; 105A4; # VITHKUQI CAPITAL LETTER HHA +1057E; C; 105A5; # VITHKUQI CAPITAL LETTER I +1057F; C; 105A6; # VITHKUQI CAPITAL LETTER IJE +10580; C; 105A7; # VITHKUQI CAPITAL LETTER JE +10581; C; 105A8; # VITHKUQI CAPITAL LETTER KA +10582; C; 105A9; # VITHKUQI CAPITAL LETTER LA +10583; C; 105AA; # VITHKUQI CAPITAL LETTER LLA +10584; C; 105AB; # VITHKUQI CAPITAL LETTER ME +10585; C; 105AC; # VITHKUQI CAPITAL LETTER NE +10586; C; 105AD; # VITHKUQI CAPITAL LETTER NJE +10587; C; 105AE; # VITHKUQI CAPITAL LETTER O +10588; C; 105AF; # VITHKUQI CAPITAL LETTER PE +10589; C; 105B0; # VITHKUQI CAPITAL LETTER QA +1058A; C; 105B1; # VITHKUQI CAPITAL LETTER RE +1058C; C; 105B3; # VITHKUQI CAPITAL LETTER SE +1058D; C; 105B4; # VITHKUQI CAPITAL LETTER SHE +1058E; C; 105B5; # VITHKUQI CAPITAL LETTER TE +1058F; C; 105B6; # VITHKUQI CAPITAL LETTER THE +10590; C; 105B7; # VITHKUQI CAPITAL LETTER U +10591; C; 105B8; # VITHKUQI CAPITAL LETTER VE +10592; C; 105B9; # VITHKUQI CAPITAL LETTER XE +10594; C; 105BB; # VITHKUQI CAPITAL LETTER Y +10595; C; 105BC; # VITHKUQI CAPITAL LETTER ZE +10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A +10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA +10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB +10C83; C; 10CC3; # OLD HUNGARIAN CAPITAL LETTER AMB +10C84; C; 10CC4; # OLD HUNGARIAN CAPITAL LETTER EC +10C85; C; 10CC5; # OLD HUNGARIAN CAPITAL LETTER ENC +10C86; C; 10CC6; # OLD HUNGARIAN CAPITAL LETTER ECS +10C87; C; 10CC7; # OLD HUNGARIAN CAPITAL LETTER ED +10C88; C; 10CC8; # OLD HUNGARIAN CAPITAL LETTER AND +10C89; C; 10CC9; # OLD HUNGARIAN CAPITAL LETTER E +10C8A; C; 10CCA; # OLD HUNGARIAN CAPITAL LETTER CLOSE E +10C8B; C; 10CCB; # OLD HUNGARIAN CAPITAL LETTER EE +10C8C; C; 10CCC; # OLD HUNGARIAN CAPITAL LETTER EF +10C8D; C; 10CCD; # OLD HUNGARIAN CAPITAL LETTER EG +10C8E; C; 10CCE; # OLD HUNGARIAN CAPITAL LETTER EGY +10C8F; C; 10CCF; # OLD HUNGARIAN CAPITAL LETTER EH +10C90; C; 10CD0; # OLD HUNGARIAN CAPITAL LETTER I +10C91; C; 10CD1; # OLD HUNGARIAN CAPITAL LETTER II +10C92; C; 10CD2; # OLD HUNGARIAN CAPITAL LETTER EJ +10C93; C; 10CD3; # OLD HUNGARIAN CAPITAL LETTER EK +10C94; C; 10CD4; # OLD HUNGARIAN CAPITAL LETTER AK +10C95; C; 10CD5; # OLD HUNGARIAN CAPITAL LETTER UNK +10C96; C; 10CD6; # OLD HUNGARIAN CAPITAL LETTER EL +10C97; C; 10CD7; # OLD HUNGARIAN CAPITAL LETTER ELY +10C98; C; 10CD8; # OLD HUNGARIAN CAPITAL LETTER EM +10C99; C; 10CD9; # OLD HUNGARIAN CAPITAL LETTER EN +10C9A; C; 10CDA; # OLD HUNGARIAN CAPITAL LETTER ENY +10C9B; C; 10CDB; # OLD HUNGARIAN CAPITAL LETTER O +10C9C; C; 10CDC; # OLD HUNGARIAN CAPITAL LETTER OO +10C9D; C; 10CDD; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG OE +10C9E; C; 10CDE; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA OE +10C9F; C; 10CDF; # OLD HUNGARIAN CAPITAL LETTER OEE +10CA0; C; 10CE0; # OLD HUNGARIAN CAPITAL LETTER EP +10CA1; C; 10CE1; # OLD HUNGARIAN CAPITAL LETTER EMP +10CA2; C; 10CE2; # OLD HUNGARIAN CAPITAL LETTER ER +10CA3; C; 10CE3; # OLD HUNGARIAN CAPITAL LETTER SHORT ER +10CA4; C; 10CE4; # OLD HUNGARIAN CAPITAL LETTER ES +10CA5; C; 10CE5; # OLD HUNGARIAN CAPITAL LETTER ESZ +10CA6; C; 10CE6; # OLD HUNGARIAN CAPITAL LETTER ET +10CA7; C; 10CE7; # OLD HUNGARIAN CAPITAL LETTER ENT +10CA8; C; 10CE8; # OLD HUNGARIAN CAPITAL LETTER ETY +10CA9; C; 10CE9; # OLD HUNGARIAN CAPITAL LETTER ECH +10CAA; C; 10CEA; # OLD HUNGARIAN CAPITAL LETTER U +10CAB; C; 10CEB; # OLD HUNGARIAN CAPITAL LETTER UU +10CAC; C; 10CEC; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG UE +10CAD; C; 10CED; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA UE +10CAE; C; 10CEE; # OLD HUNGARIAN CAPITAL LETTER EV +10CAF; C; 10CEF; # OLD HUNGARIAN CAPITAL LETTER EZ +10CB0; C; 10CF0; # OLD HUNGARIAN CAPITAL LETTER EZS +10CB1; C; 10CF1; # OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN +10CB2; C; 10CF2; # OLD HUNGARIAN CAPITAL LETTER US +118A0; C; 118C0; # WARANG CITI CAPITAL LETTER NGAA +118A1; C; 118C1; # WARANG CITI CAPITAL LETTER A +118A2; C; 118C2; # WARANG CITI CAPITAL LETTER WI +118A3; C; 118C3; # WARANG CITI CAPITAL LETTER YU +118A4; C; 118C4; # WARANG CITI CAPITAL LETTER YA +118A5; C; 118C5; # WARANG CITI CAPITAL LETTER YO +118A6; C; 118C6; # WARANG CITI CAPITAL LETTER II +118A7; C; 118C7; # WARANG CITI CAPITAL LETTER UU +118A8; C; 118C8; # WARANG CITI CAPITAL LETTER E +118A9; C; 118C9; # WARANG CITI CAPITAL LETTER O +118AA; C; 118CA; # WARANG CITI CAPITAL LETTER ANG +118AB; C; 118CB; # WARANG CITI CAPITAL LETTER GA +118AC; C; 118CC; # WARANG CITI CAPITAL LETTER KO +118AD; C; 118CD; # WARANG CITI CAPITAL LETTER ENY +118AE; C; 118CE; # WARANG CITI CAPITAL LETTER YUJ +118AF; C; 118CF; # WARANG CITI CAPITAL LETTER UC +118B0; C; 118D0; # WARANG CITI CAPITAL LETTER ENN +118B1; C; 118D1; # WARANG CITI CAPITAL LETTER ODD +118B2; C; 118D2; # WARANG CITI CAPITAL LETTER TTE +118B3; C; 118D3; # WARANG CITI CAPITAL LETTER NUNG +118B4; C; 118D4; # WARANG CITI CAPITAL LETTER DA +118B5; C; 118D5; # WARANG CITI CAPITAL LETTER AT +118B6; C; 118D6; # WARANG CITI CAPITAL LETTER AM +118B7; C; 118D7; # WARANG CITI CAPITAL LETTER BU +118B8; C; 118D8; # WARANG CITI CAPITAL LETTER PU +118B9; C; 118D9; # WARANG CITI CAPITAL LETTER HIYO +118BA; C; 118DA; # WARANG CITI CAPITAL LETTER HOLO +118BB; C; 118DB; # WARANG CITI CAPITAL LETTER HORR +118BC; C; 118DC; # WARANG CITI CAPITAL LETTER HAR +118BD; C; 118DD; # WARANG CITI CAPITAL LETTER SSUU +118BE; C; 118DE; # WARANG CITI CAPITAL LETTER SII +118BF; C; 118DF; # WARANG CITI CAPITAL LETTER VIYO +16E40; C; 16E60; # MEDEFAIDRIN CAPITAL LETTER M +16E41; C; 16E61; # MEDEFAIDRIN CAPITAL LETTER S +16E42; C; 16E62; # MEDEFAIDRIN CAPITAL LETTER V +16E43; C; 16E63; # MEDEFAIDRIN CAPITAL LETTER W +16E44; C; 16E64; # MEDEFAIDRIN CAPITAL LETTER ATIU +16E45; C; 16E65; # MEDEFAIDRIN CAPITAL LETTER Z +16E46; C; 16E66; # MEDEFAIDRIN CAPITAL LETTER KP +16E47; C; 16E67; # MEDEFAIDRIN CAPITAL LETTER P +16E48; C; 16E68; # MEDEFAIDRIN CAPITAL LETTER T +16E49; C; 16E69; # MEDEFAIDRIN CAPITAL LETTER G +16E4A; C; 16E6A; # MEDEFAIDRIN CAPITAL LETTER F +16E4B; C; 16E6B; # MEDEFAIDRIN CAPITAL LETTER I +16E4C; C; 16E6C; # MEDEFAIDRIN CAPITAL LETTER K +16E4D; C; 16E6D; # MEDEFAIDRIN CAPITAL LETTER A +16E4E; C; 16E6E; # MEDEFAIDRIN CAPITAL LETTER J +16E4F; C; 16E6F; # MEDEFAIDRIN CAPITAL LETTER E +16E50; C; 16E70; # MEDEFAIDRIN CAPITAL LETTER B +16E51; C; 16E71; # MEDEFAIDRIN CAPITAL LETTER C +16E52; C; 16E72; # MEDEFAIDRIN CAPITAL LETTER U +16E53; C; 16E73; # MEDEFAIDRIN CAPITAL LETTER YU +16E54; C; 16E74; # MEDEFAIDRIN CAPITAL LETTER L +16E55; C; 16E75; # MEDEFAIDRIN CAPITAL LETTER Q +16E56; C; 16E76; # MEDEFAIDRIN CAPITAL LETTER HP +16E57; C; 16E77; # MEDEFAIDRIN CAPITAL LETTER NY +16E58; C; 16E78; # MEDEFAIDRIN CAPITAL LETTER X +16E59; C; 16E79; # MEDEFAIDRIN CAPITAL LETTER D +16E5A; C; 16E7A; # MEDEFAIDRIN CAPITAL LETTER OE +16E5B; C; 16E7B; # MEDEFAIDRIN CAPITAL LETTER N +16E5C; C; 16E7C; # MEDEFAIDRIN CAPITAL LETTER R +16E5D; C; 16E7D; # MEDEFAIDRIN CAPITAL LETTER O +16E5E; C; 16E7E; # MEDEFAIDRIN CAPITAL LETTER AI +16E5F; C; 16E7F; # MEDEFAIDRIN CAPITAL LETTER Y +1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF +1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI +1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM +1E903; C; 1E925; # ADLAM CAPITAL LETTER MIIM +1E904; C; 1E926; # ADLAM CAPITAL LETTER BA +1E905; C; 1E927; # ADLAM CAPITAL LETTER SINNYIIYHE +1E906; C; 1E928; # ADLAM CAPITAL LETTER PE +1E907; C; 1E929; # ADLAM CAPITAL LETTER BHE +1E908; C; 1E92A; # ADLAM CAPITAL LETTER RA +1E909; C; 1E92B; # ADLAM CAPITAL LETTER E +1E90A; C; 1E92C; # ADLAM CAPITAL LETTER FA +1E90B; C; 1E92D; # ADLAM CAPITAL LETTER I +1E90C; C; 1E92E; # ADLAM CAPITAL LETTER O +1E90D; C; 1E92F; # ADLAM CAPITAL LETTER DHA +1E90E; C; 1E930; # ADLAM CAPITAL LETTER YHE +1E90F; C; 1E931; # ADLAM CAPITAL LETTER WAW +1E910; C; 1E932; # ADLAM CAPITAL LETTER NUN +1E911; C; 1E933; # ADLAM CAPITAL LETTER KAF +1E912; C; 1E934; # ADLAM CAPITAL LETTER YA +1E913; C; 1E935; # ADLAM CAPITAL LETTER U +1E914; C; 1E936; # ADLAM CAPITAL LETTER JIIM +1E915; C; 1E937; # ADLAM CAPITAL LETTER CHI +1E916; C; 1E938; # ADLAM CAPITAL LETTER HA +1E917; C; 1E939; # ADLAM CAPITAL LETTER QAAF +1E918; C; 1E93A; # ADLAM CAPITAL LETTER GA +1E919; C; 1E93B; # ADLAM CAPITAL LETTER NYA +1E91A; C; 1E93C; # ADLAM CAPITAL LETTER TU +1E91B; C; 1E93D; # ADLAM CAPITAL LETTER NHA +1E91C; C; 1E93E; # ADLAM CAPITAL LETTER VA +1E91D; C; 1E93F; # ADLAM CAPITAL LETTER KHA +1E91E; C; 1E940; # ADLAM CAPITAL LETTER GBE +1E91F; C; 1E941; # ADLAM CAPITAL LETTER ZAL +1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO +1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA +''' + + +def _parse_unichr(s): + s = int(s, 16) + try: + return compat_chr(s) + except ValueError: + # work around "unichr() arg not in range(0x10000) (narrow Python build)" + return ('\\U%08x' % s).decode('unicode-escape') + + +_map = dict( + (_parse_unichr(from_), ''.join(map(_parse_unichr, to_.split(' ')))) + for from_, type_, to_, _ in ( + l.split('; ', 3) for l in _map_str.splitlines() if l and not l[0] == '#') + if type_ in ('C', 'F')) +del _map_str + + +def casefold(s): + assert isinstance(s, compat_str) + return ''.join((_map.get(c, c) for c in s)) + + +__all__ = [ + casefold +] diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9e45c454b..39551f810 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -21,6 +21,28 @@ import subprocess import sys import xml.etree.ElementTree +# naming convention +# 'compat_' + Python3_name.replace('.', '_') +# other aliases exist for convenience and/or legacy + +# deal with critical unicode/str things first +try: + # Python 2 + compat_str, compat_basestring, compat_chr = ( + unicode, basestring, unichr + ) + from .casefold import casefold as compat_casefold + +except NameError: + compat_str, compat_basestring, compat_chr = ( + str, str, chr + ) + compat_casefold = lambda s: s.casefold() + +try: + import collections.abc as compat_collections_abc +except ImportError: + import collections as compat_collections_abc try: import urllib.request as compat_urllib_request @@ -36,16 +58,15 @@ try: import urllib.parse as compat_urllib_parse except ImportError: # Python 2 import urllib as compat_urllib_parse + import urlparse as _urlparse + for a in dir(_urlparse): + if not hasattr(compat_urllib_parse, a): + setattr(compat_urllib_parse, a, getattr(_urlparse, a)) + del _urlparse -try: - from urllib.parse import urlparse as compat_urllib_parse_urlparse -except ImportError: # Python 2 - from urlparse import urlparse as compat_urllib_parse_urlparse - -try: - import urllib.parse as compat_urlparse -except ImportError: # Python 2 - import urlparse as compat_urlparse +# unfavoured aliases +compat_urlparse = compat_urllib_parse +compat_urllib_parse_urlparse = compat_urllib_parse.urlparse try: import urllib.response as compat_urllib_response @@ -56,6 +77,7 @@ try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 import cookielib as compat_cookiejar +compat_http_cookiejar = compat_cookiejar if sys.version_info[0] == 2: class compat_cookiejar_Cookie(compat_cookiejar.Cookie): @@ -67,11 +89,13 @@ if sys.version_info[0] == 2: compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs) else: compat_cookiejar_Cookie = compat_cookiejar.Cookie +compat_http_cookiejar_Cookie = compat_cookiejar_Cookie try: import http.cookies as compat_cookies except ImportError: # Python 2 import Cookie as compat_cookies +compat_http_cookies = compat_cookies if sys.version_info[0] == 2: class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): @@ -81,6 +105,7 @@ if sys.version_info[0] == 2: return super(compat_cookies_SimpleCookie, self).load(rawdata) else: compat_cookies_SimpleCookie = compat_cookies.SimpleCookie +compat_http_cookies_SimpleCookie = compat_cookies_SimpleCookie try: import html.entities as compat_html_entities @@ -2334,16 +2359,19 @@ try: from urllib.error import HTTPError as compat_HTTPError except ImportError: # Python 2 from urllib2 import HTTPError as compat_HTTPError +compat_urllib_HTTPError = compat_HTTPError try: from urllib.request import urlretrieve as compat_urlretrieve except ImportError: # Python 2 from urllib import urlretrieve as compat_urlretrieve +compat_urllib_request_urlretrieve = compat_urlretrieve try: from html.parser import HTMLParser as compat_HTMLParser except ImportError: # Python 2 from HTMLParser import HTMLParser as compat_HTMLParser +compat_html_parser_HTMLParser = compat_HTMLParser try: # Python 2 from HTMLParser import HTMLParseError as compat_HTMLParseError @@ -2357,6 +2385,7 @@ except ImportError: # Python <3.4 # and uniform cross-version exception handling class compat_HTMLParseError(Exception): pass +compat_html_parser_HTMLParseError = compat_HTMLParseError try: from subprocess import DEVNULL @@ -2369,15 +2398,12 @@ try: except ImportError: import BaseHTTPServer as compat_http_server -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str - try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus + from urllib.parse import urlencode as compat_urllib_parse_urlencode + from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') else re.compile(r'([\x00-\x7f]+)')) @@ -2444,9 +2470,6 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) -try: - from urllib.parse import urlencode as compat_urllib_parse_urlencode -except ImportError: # Python 2 # Python 2 will choke in urlencode on mixture of byte and unicode strings. # Possible solutions are to either port it from python 3 with all # the friends or manually ensure input query contains only byte strings. @@ -2468,7 +2491,62 @@ except ImportError: # Python 2 def encode_list(l): return [encode_elem(e) for e in l] - return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) + return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq) + + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + qs, _coerce_result = qs, compat_str + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError('bad query field: %r' % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = compat_urllib_parse_unquote( + name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = compat_urllib_parse_unquote( + value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + + def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + parsed_result = {} + pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result + + setattr(compat_urllib_parse, '_urlencode', + getattr(compat_urllib_parse, 'urlencode')) + for name, fix in ( + ('unquote_to_bytes', compat_urllib_parse_unquote_to_bytes), + ('parse_unquote', compat_urllib_parse_unquote), + ('unquote_plus', compat_urllib_parse_unquote_plus), + ('urlencode', compat_urllib_parse_urlencode), + ('parse_qs', compat_parse_qs)): + setattr(compat_urllib_parse, name, fix) + +compat_urllib_parse_parse_qs = compat_parse_qs try: from urllib.request import DataHandler as compat_urllib_request_DataHandler @@ -2504,21 +2582,11 @@ except ImportError: # Python < 3.4 return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) -try: - compat_basestring = basestring # Python 2 -except NameError: - compat_basestring = str - -try: - compat_chr = unichr # Python 2 -except NameError: - compat_chr = chr - try: from xml.etree.ElementTree import ParseError as compat_xml_parse_error except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error - +compat_xml_etree_ElementTree_ParseError = compat_xml_parse_error etree = xml.etree.ElementTree @@ -2532,10 +2600,11 @@ try: # xml.etree.ElementTree.Element is a method in Python <=2.6 and # the following will crash with: # TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types - isinstance(None, xml.etree.ElementTree.Element) + isinstance(None, etree.Element) from xml.etree.ElementTree import Element as compat_etree_Element except TypeError: # Python <=2.6 from xml.etree.ElementTree import _ElementInterface as compat_etree_Element +compat_xml_etree_ElementTree_Element = compat_etree_Element if sys.version_info[0] >= 3: def compat_etree_fromstring(text): @@ -2591,6 +2660,7 @@ else: if k == uri or v == prefix: del etree._namespace_map[k] etree._namespace_map[uri] = prefix +compat_xml_etree_register_namespace = compat_etree_register_namespace if sys.version_info < (2, 7): # Here comes the crazy part: In 2.6, if the xpath is a unicode, @@ -2602,53 +2672,6 @@ if sys.version_info < (2, 7): else: compat_xpath = lambda xpath: xpath -try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, compat_str - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError('bad query field: %r' % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = compat_urllib_parse_unquote( - name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = compat_urllib_parse_unquote( - value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result - compat_os_name = os._name if os.name == 'java' else os.name @@ -2773,6 +2796,8 @@ else: else: compat_expanduser = os.path.expanduser +compat_os_path_expanduser = compat_expanduser + if compat_os_name == 'nt' and sys.version_info < (3, 8): # os.path.realpath on Windows does not follow symbolic links @@ -2784,6 +2809,8 @@ if compat_os_name == 'nt' and sys.version_info < (3, 8): else: compat_realpath = os.path.realpath +compat_os_path_realpath = compat_realpath + if sys.version_info < (3, 0): def compat_print(s): @@ -2804,11 +2831,15 @@ if sys.version_info < (3, 0) and sys.platform == 'win32': else: compat_getpass = getpass.getpass +compat_getpass_getpass = compat_getpass + + try: compat_input = raw_input except NameError: # Python 3 compat_input = input + # Python < 2.6.5 require kwargs to be bytes try: def _testfunc(x): @@ -2886,6 +2917,7 @@ else: _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) def compat_get_terminal_size(fallback=(80, 24)): + from .utils import process_communicate_or_kill columns = compat_getenv('COLUMNS') if columns: columns = int(columns) @@ -2902,7 +2934,7 @@ else: sp = subprocess.Popen( ['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = sp.communicate() + out, err = process_communicate_or_kill(sp) _lines, _columns = map(int, out.split()) except Exception: _columns, _lines = _terminal_size(*fallback) @@ -2913,15 +2945,16 @@ else: lines = _lines return _terminal_size(columns, lines) + try: itertools.count(start=0, step=1) compat_itertools_count = itertools.count except TypeError: # Python 2.6 def compat_itertools_count(start=0, step=1): - n = start while True: - yield n - n += step + yield start + start += step + if sys.version_info >= (3, 0): from tokenize import tokenize as compat_tokenize_tokenize @@ -2962,6 +2995,24 @@ else: compat_Struct = struct.Struct +# compat_map/filter() returning an iterator, supposedly the +# same versioning as for zip below +try: + from future_builtins import map as compat_map +except ImportError: + try: + from itertools import imap as compat_map + except ImportError: + compat_map = map + +try: + from future_builtins import filter as compat_filter +except ImportError: + try: + from itertools import ifilter as compat_filter + except ImportError: + compat_filter = filter + try: from future_builtins import zip as compat_zip except ImportError: # not 2.6+ or is 3.x @@ -2971,6 +3022,82 @@ except ImportError: # not 2.6+ or is 3.x compat_zip = zip +# method renamed between Py2/3 +try: + from itertools import zip_longest as compat_itertools_zip_longest +except ImportError: + from itertools import izip_longest as compat_itertools_zip_longest + + +# new class in collections +try: + from collections import ChainMap as compat_collections_chain_map + # Py3.3's ChainMap is deficient + if sys.version_info < (3, 4): + raise ImportError +except ImportError: + # Py <= 3.3 + class compat_collections_chain_map(compat_collections_abc.MutableMapping): + + maps = [{}] + + def __init__(self, *maps): + self.maps = list(maps) or [{}] + + def __getitem__(self, k): + for m in self.maps: + if k in m: + return m[k] + raise KeyError(k) + + def __setitem__(self, k, v): + self.maps[0].__setitem__(k, v) + return + + def __contains__(self, k): + return any((k in m) for m in self.maps) + + def __delitem(self, k): + if k in self.maps[0]: + del self.maps[0][k] + return + raise KeyError(k) + + def __delitem__(self, k): + self.__delitem(k) + + def __iter__(self): + return itertools.chain(*reversed(self.maps)) + + def __len__(self): + return len(iter(self)) + + # to match Py3, don't del directly + def pop(self, k, *args): + if self.__contains__(k): + off = self.__getitem__(k) + self.__delitem(k) + return off + elif len(args) > 0: + return args[0] + raise KeyError(k) + + def new_child(self, m=None, **kwargs): + m = m or {} + m.update(kwargs) + return compat_collections_chain_map(m, *self.maps) + + @property + def parents(self): + return compat_collections_chain_map(*(self.maps[1:])) + + +# Pythons disagree on the type of a pattern (RegexObject, _sre.SRE_Pattern, Pattern, ...?) +compat_re_Pattern = type(re.compile('')) +# and on the type of a match +compat_re_Match = type(re.match('a', 'a')) + + if sys.version_info < (3, 3): def compat_b64decode(s, *args, **kwargs): if isinstance(s, compat_str): @@ -2979,6 +3106,8 @@ if sys.version_info < (3, 3): else: compat_b64decode = base64.b64decode +compat_base64_b64decode = compat_b64decode + if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): # PyPy2 prior to version 5.4.0 expects byte strings as Windows function @@ -2998,26 +3127,53 @@ else: return ctypes.WINFUNCTYPE(*args, **kwargs) -__all__ = [ +legacy = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', - 'compat_Struct', 'compat_b64decode', - 'compat_basestring', - 'compat_chr', 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', 'compat_cookies_SimpleCookie', - 'compat_ctypes_WINFUNCTYPE', 'compat_etree_Element', - 'compat_etree_fromstring', 'compat_etree_register_namespace', 'compat_expanduser', + 'compat_getpass', + 'compat_parse_qs', + 'compat_realpath', + 'compat_urllib_parse_parse_qs', + 'compat_urllib_parse_unquote', + 'compat_urllib_parse_unquote_plus', + 'compat_urllib_parse_unquote_to_bytes', + 'compat_urllib_parse_urlencode', + 'compat_urllib_parse_urlparse', + 'compat_urlparse', + 'compat_urlretrieve', + 'compat_xml_parse_error', +] + + +__all__ = [ + 'compat_html_parser_HTMLParseError', + 'compat_html_parser_HTMLParser', + 'compat_Struct', + 'compat_base64_b64decode', + 'compat_basestring', + 'compat_casefold', + 'compat_chr', + 'compat_collections_abc', + 'compat_collections_chain_map', + 'compat_http_cookiejar', + 'compat_http_cookiejar_Cookie', + 'compat_http_cookies', + 'compat_http_cookies_SimpleCookie', + 'compat_ctypes_WINFUNCTYPE', + 'compat_etree_fromstring', + 'compat_filter', 'compat_get_terminal_size', 'compat_getenv', - 'compat_getpass', + 'compat_getpass_getpass', 'compat_html_entities', 'compat_html_entities_html5', 'compat_http_client', @@ -3025,13 +3181,17 @@ __all__ = [ 'compat_input', 'compat_integer_types', 'compat_itertools_count', + 'compat_itertools_zip_longest', 'compat_kwargs', + 'compat_map', 'compat_numeric_types', 'compat_ord', 'compat_os_name', - 'compat_parse_qs', + 'compat_os_path_expanduser', + 'compat_os_path_realpath', 'compat_print', - 'compat_realpath', + 'compat_re_Match', + 'compat_re_Pattern', 'compat_setenv', 'compat_shlex_quote', 'compat_shlex_split', @@ -3043,17 +3203,14 @@ __all__ = [ 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', - 'compat_urllib_parse_unquote', - 'compat_urllib_parse_unquote_plus', - 'compat_urllib_parse_unquote_to_bytes', - 'compat_urllib_parse_urlencode', - 'compat_urllib_parse_urlparse', 'compat_urllib_request', 'compat_urllib_request_DataHandler', 'compat_urllib_response', - 'compat_urlparse', - 'compat_urlretrieve', - 'compat_xml_parse_error', + 'compat_urllib_request_urlretrieve', + 'compat_urllib_HTTPError', + 'compat_xml_etree_ElementTree_Element', + 'compat_xml_etree_ElementTree_ParseError', + 'compat_xml_etree_register_namespace', 'compat_xpath', 'compat_zip', 'workaround_optparse_bug9161', diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 2e485df9d..d701d6292 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -1,22 +1,31 @@ from __future__ import unicode_literals +from ..utils import ( + determine_protocol, +) + + +def get_suitable_downloader(info_dict, params={}): + info_dict['protocol'] = determine_protocol(info_dict) + info_copy = info_dict.copy() + return _get_suitable_downloader(info_copy, params) + + +# Some of these require get_suitable_downloader from .common import FileDownloader +from .dash import DashSegmentsFD from .f4m import F4mFD from .hls import HlsFD from .http import HttpFD from .rtmp import RtmpFD -from .dash import DashSegmentsFD from .rtsp import RtspFD from .ism import IsmFD +from .niconico import NiconicoDmcFD from .external import ( get_external_downloader, FFmpegFD, ) -from ..utils import ( - determine_protocol, -) - PROTOCOL_MAP = { 'rtmp': RtmpFD, 'm3u8_native': HlsFD, @@ -26,13 +35,12 @@ PROTOCOL_MAP = { 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, 'ism': IsmFD, + 'niconico_dmc': NiconicoDmcFD, } -def get_suitable_downloader(info_dict, params={}): +def _get_suitable_downloader(info_dict, params={}): """Get the downloader class that can handle the info dict.""" - protocol = determine_protocol(info_dict) - info_dict['protocol'] = protocol # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): # return FFmpegFD @@ -42,7 +50,11 @@ def get_suitable_downloader(info_dict, params={}): ed = get_external_downloader(external_downloader) if ed.can_download(info_dict): return ed + # Avoid using unwanted args since external_downloader was rejected + if params.get('external_downloader_args'): + params['external_downloader_args'] = None + protocol = info_dict['protocol'] if protocol.startswith('m3u8') and info_dict.get('is_live'): return FFmpegFD diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index c31f8910a..bffcd10b6 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -22,6 +22,7 @@ from ..utils import ( handle_youtubedl_headers, check_executable, is_outdated_version, + process_communicate_or_kill, ) @@ -104,7 +105,7 @@ class ExternalFD(FileDownloader): p = subprocess.Popen( cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate() + _, stderr = process_communicate_or_kill(p) if p.returncode != 0: self.to_stderr(stderr.decode('utf-8', 'replace')) return p.returncode @@ -141,7 +142,7 @@ class CurlFD(ExternalFD): # curl writes the progress to stderr so don't capture it. p = subprocess.Popen(cmd) - p.communicate() + process_communicate_or_kill(p) return p.returncode @@ -199,6 +200,64 @@ class Aria2cFD(ExternalFD): return cmd +class Aria2pFD(ExternalFD): + ''' Aria2pFD class + This class support to use aria2p as downloader. + (Aria2p, a command-line tool and Python library to interact with an aria2c daemon process + through JSON-RPC.) + It can help you to get download progress more easily. + To use aria2p as downloader, you need to install aria2c and aria2p, aria2p can download with pip. + Then run aria2c in the background and enable with the --enable-rpc option. + ''' + try: + import aria2p + __avail = True + except ImportError: + __avail = False + + @classmethod + def available(cls): + return cls.__avail + + def _call_downloader(self, tmpfilename, info_dict): + aria2 = self.aria2p.API( + self.aria2p.Client( + host='http://localhost', + port=6800, + secret='' + ) + ) + + options = { + 'min-split-size': '1M', + 'max-connection-per-server': 4, + 'auto-file-renaming': 'false', + } + options['dir'] = os.path.dirname(tmpfilename) or os.path.abspath('.') + options['out'] = os.path.basename(tmpfilename) + options['header'] = [] + for key, val in info_dict['http_headers'].items(): + options['header'].append('{0}: {1}'.format(key, val)) + download = aria2.add_uris([info_dict['url']], options) + status = { + 'status': 'downloading', + 'tmpfilename': tmpfilename, + } + started = time.time() + while download.status in ['active', 'waiting']: + download = aria2.get_download(download.gid) + status.update({ + 'downloaded_bytes': download.completed_length, + 'total_bytes': download.total_length, + 'elapsed': time.time() - started, + 'eta': download.eta.total_seconds(), + 'speed': download.download_speed, + }) + self._hook_progress(status) + time.sleep(.5) + return download.status != 'complete' + + class HttpieFD(ExternalFD): @classmethod def available(cls): @@ -336,14 +395,17 @@ class FFmpegFD(ExternalFD): proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) try: retval = proc.wait() - except KeyboardInterrupt: - # subprocces.run would send the SIGKILL signal to ffmpeg and the + except BaseException as e: + # subprocess.run would send the SIGKILL signal to ffmpeg and the # mp4 file couldn't be played, but if we ask ffmpeg to quit it # produces a file that is playable (this is mostly useful for live # streams). Note that Windows is not affected and produces playable # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). - if sys.platform != 'win32': - proc.communicate(b'q') + if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32': + process_communicate_or_kill(proc, b'q') + else: + proc.kill() + proc.wait() raise return retval diff --git a/youtube_dl/downloader/niconico.py b/youtube_dl/downloader/niconico.py new file mode 100644 index 000000000..6392c9989 --- /dev/null +++ b/youtube_dl/downloader/niconico.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +try: + import threading +except ImportError: + threading = None + +from .common import FileDownloader +from ..downloader import get_suitable_downloader +from ..extractor.niconico import NiconicoIE +from ..utils import sanitized_Request + + +class NiconicoDmcFD(FileDownloader): + """ Downloading niconico douga from DMC with heartbeat """ + + FD_NAME = 'niconico_dmc' + + def real_download(self, filename, info_dict): + self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) + + ie = NiconicoIE(self.ydl) + info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) + + fd = get_suitable_downloader(info_dict, params=self.params)(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + + if not threading: + self.to_screen('[%s] Threading for Heartbeat not available' % self.FD_NAME) + return fd.real_download(filename, info_dict) + + success = download_complete = False + timer = [None] + heartbeat_lock = threading.Lock() + heartbeat_url = heartbeat_info_dict['url'] + heartbeat_data = heartbeat_info_dict['data'].encode() + heartbeat_interval = heartbeat_info_dict.get('interval', 30) + + request = sanitized_Request(heartbeat_url, heartbeat_data) + + def heartbeat(): + try: + self.ydl.urlopen(request).read() + except Exception: + self.to_screen('[%s] Heartbeat failed' % self.FD_NAME) + + with heartbeat_lock: + if not download_complete: + timer[0] = threading.Timer(heartbeat_interval, heartbeat) + timer[0].start() + + heartbeat_info_dict['ping']() + self.to_screen('[%s] Heartbeat with %d second interval ...' % (self.FD_NAME, heartbeat_interval)) + try: + heartbeat() + if type(fd).__name__ == 'HlsFD': + info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0]) + success = fd.real_download(filename, info_dict) + finally: + if heartbeat_lock: + with heartbeat_lock: + timer[0].cancel() + download_complete = True + return success diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index fbb7f51b0..8a25dbc8d 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -89,11 +89,13 @@ class RtmpFD(FileDownloader): self.to_screen('') cursor_in_new_line = True self.to_screen('[rtmpdump] ' + line) - finally: + if not cursor_in_new_line: + self.to_screen('') + return proc.wait() + except BaseException: # Including KeyboardInterrupt + proc.kill() proc.wait() - if not cursor_in_new_line: - self.to_screen('') - return proc.returncode + raise url = info_dict['url'] player_url = info_dict.get('player_url') diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index a55ebbcbd..5ff419f19 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -31,30 +31,34 @@ from ..utils import ( class ADNIE(InfoExtractor): - IE_DESC = 'Anime Digital Network' - _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P\d+)' - _TEST = { - 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'md5': '0319c99885ff5547565cacb4f3f9348d', + IE_DESC = 'Animation Digital Network' + _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.fr/video/[^/]+/(?P\d+)' + _TESTS = [{ + 'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir', + 'md5': '1c9ef066ceb302c86f80c2b371615261', 'info_dict': { - 'id': '7778', + 'id': '9841', 'ext': 'mp4', - 'title': 'Blue Exorcist - Kyôto Saga - Episode 1', - 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', - 'series': 'Blue Exorcist - Kyôto Saga', - 'duration': 1467, - 'release_date': '20170106', + 'title': 'Fruits Basket - Episode 1', + 'description': 'md5:14be2f72c3c96809b0ca424b0097d336', + 'series': 'Fruits Basket', + 'duration': 1437, + 'release_date': '20190405', 'comment_count': int, 'average_rating': float, - 'season_number': 2, - 'episode': 'Début des hostilités', + 'season_number': 1, + 'episode': 'À ce soir !', 'episode_number': 1, - } - } + }, + 'skip': 'Only available in region (FR, ...)', + }, { + 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', + 'only_matching': True, + }] - _NETRC_MACHINE = 'animedigitalnetwork' - _BASE_URL = 'http://animedigitalnetwork.fr' - _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' + _NETRC_MACHINE = 'animationdigitalnetwork' + _BASE = 'animationdigitalnetwork.fr' + _API_BASE_URL = 'https://gw.api.' + _BASE + '/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' _HEADERS = {} _LOGIN_ERR_MESSAGE = 'Unable to log in' @@ -82,14 +86,14 @@ class ADNIE(InfoExtractor): if subtitle_location: enc_subtitles = self._download_webpage( subtitle_location, video_id, 'Downloading subtitles data', - fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'}) + fatal=False, headers={'Origin': 'https://' + self._BASE}) if not enc_subtitles: return None - # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js + # http://animationdigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')), + bytes_to_intlist(binascii.unhexlify(self._K + '7fac1178830cfe0c')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -138,9 +142,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' if not username: return try: + url = self._API_BASE_URL + 'authentication/login' access_token = (self._download_json( - self._API_BASE_URL + 'authentication/login', None, - 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, + url, None, 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, data=urlencode_postdata({ 'password': password, 'rememberMe': False, @@ -153,7 +157,8 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' message = None if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: resp = self._parse_json( - e.cause.read().decode(), None, fatal=False) or {} + self._webpage_read_content(e.cause, url, username), + username, fatal=False) or {} message = resp.get('message') or resp.get('code') self.report_warning(message or self._LOGIN_ERR_MESSAGE) @@ -211,7 +216,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' # This usually goes away with a different random pkcs1pad, so retry continue - error = self._parse_json(e.cause.read(), video_id) + error = self._parse_json( + self._webpage_read_content(e.cause, links_url, video_id), + video_id, fatal=False) or {} message = error.get('message') if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index e55c03fd7..2a1f08e39 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -20,8 +20,8 @@ class AENetworksBaseIE(ThePlatformIE): (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| fyi\.tv )/''' - _THEPLATFORM_KEY = 'crazyjava' - _THEPLATFORM_SECRET = 's3cr3t' + _THEPLATFORM_KEY = '43jXaGRQud' + _THEPLATFORM_SECRET = 'S10BPXHMlb' _DOMAIN_MAP = { 'history.com': ('HISTORY', 'history'), 'aetv.com': ('AETV', 'aetv'), diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py index 6f241e683..9722fe9ac 100644 --- a/youtube_dl/extractor/aliexpress.py +++ b/youtube_dl/extractor/aliexpress.py @@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor): 'id': '2800002704436634', 'ext': 'mp4', 'title': 'CASIMA7.22', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'uploader': 'CASIMA Official Store', 'timestamp': 1500717600, 'upload_date': '20170722', diff --git a/youtube_dl/extractor/alsace20tv.py b/youtube_dl/extractor/alsace20tv.py new file mode 100644 index 000000000..228cec3ec --- /dev/null +++ b/youtube_dl/extractor/alsace20tv.py @@ -0,0 +1,89 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + get_element_by_class, + int_or_none, + unified_strdate, + url_or_none, +) + + +class Alsace20TVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'duration': 1073, + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _extract_video(self, video_id, url=None): + info = self._download_json( + 'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ), + video_id) or {} + title = info['titre'] + + formats = [] + for res, fmt_url in (info.get('files') or {}).items(): + formats.extend( + self._extract_smil_formats(fmt_url, video_id, fatal=False) + if '/smil:_' in fmt_url + else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False)) + self._sort_formats(formats) + + webpage = (url and self._download_webpage(url, video_id, fatal=False)) or '' + thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage)) + upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None) + upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': clean_html(get_element_by_class('wysiwyg', webpage)), + 'upload_date': upload_date, + 'thumbnail': thumbnail, + 'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None), + 'view_count': int_or_none(info.get('nb_vues')), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id, url) + + +class Alsace20TVEmbedIE(Alsace20TVIE): + _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index be960c0f9..08d3604e9 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -15,7 +15,7 @@ from ..utils import ( class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?Pepisode|videos)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', @@ -23,15 +23,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5b400b9ee338f922cb06450c', 'title': 'Japanese Suppers', 'ext': 'mp4', + 'display_id': 'weeknight-japanese-suppers', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', - 'thumbnail': r're:^https?://', - 'timestamp': 1523318400, - 'upload_date': '20180410', - 'release_date': '20180410', + 'timestamp': 1523304000, + 'upload_date': '20180409', + 'release_date': '20180409', 'series': "America's Test Kitchen", + 'season': 'Season 18', 'season_number': 18, 'episode': 'Japanese Suppers', 'episode_number': 15, + 'duration': 1376, + 'thumbnail': r're:^https?://', + 'average_rating': 0, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -44,15 +49,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5fbe8c61bda2010001c6763b', 'title': 'Simple Chicken Dinner', 'ext': 'mp4', + 'display_id': 'atktv_2103_simple-chicken-dinner_full-episode_web-mp4', 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', - 'thumbnail': r're:^https?://', - 'timestamp': 1610755200, - 'upload_date': '20210116', - 'release_date': '20210116', + 'timestamp': 1610737200, + 'upload_date': '20210115', + 'release_date': '20210115', 'series': "America's Test Kitchen", + 'season': 'Season 21', 'season_number': 21, 'episode': 'Simple Chicken Dinner', 'episode_number': 3, + 'duration': 1397, + 'thumbnail': r're:^https?://', + 'view_count': int, + 'average_rating': 0, }, 'params': { 'skip_download': True, @@ -60,6 +70,12 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cookscountry/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', + 'only_matching': True, }, { 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', 'only_matching': True, @@ -94,7 +110,7 @@ class AmericasTestKitchenIE(InfoExtractor): class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|cookscountry)\.com/episodes/browse/season_(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|(?Pcooks(?:country|illustrated)))\.com(?:(?:/(?Pcooks(?:country|illustrated)))?(?:/?$|(?\d+)))' _TESTS = [{ # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', @@ -105,48 +121,93 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): 'playlist_count': 13, }, { # Cooks Country Season - 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'url': 'https://www.americastestkitchen.com/cookscountry/episodes/browse/season_12', 'info_dict': { 'id': 'season_12', 'title': 'Season 12', }, 'playlist_count': 13, + }, { + # America's Test Kitchen Series + 'url': 'https://www.americastestkitchen.com/', + 'info_dict': { + 'id': 'americastestkitchen', + 'title': 'America\'s Test Kitchen', + }, + 'playlist_count': 558, + }, { + # Cooks Country Series + 'url': 'https://www.americastestkitchen.com/cookscountry', + 'info_dict': { + 'id': 'cookscountry', + 'title': 'Cook\'s Country', + }, + 'playlist_count': 199, + }, { + 'url': 'https://www.americastestkitchen.com/cookscountry/', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com', + 'only_matching': True, }] def _real_extract(self, url): - show_name, season_number = re.match(self._VALID_URL, url).groups() - season_number = int(season_number) + match = re.match(self._VALID_URL, url).groupdict() + show = match.get('show2') + show_path = ('/' + show) if show else '' + show = show or match['show'] + season_number = int_or_none(match.get('season')) - slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + slug, title = { + 'americastestkitchen': ('atk', 'America\'s Test Kitchen'), + 'cookscountry': ('cco', 'Cook\'s Country'), + 'cooksillustrated': ('cio', 'Cook\'s Illustrated'), + }[show] - season = 'Season %d' % season_number + facet_filters = [ + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ] + + if season_number: + playlist_id = 'season_%d' % season_number + playlist_title = 'Season %d' % season_number + facet_filters.append('search_season_list:' + playlist_title) + else: + playlist_id = show + playlist_title = title season_search = self._download_json( 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, - season, headers={ - 'Origin': 'https://www.%s.com' % show_name, + playlist_id, headers={ + 'Origin': 'https://www.americastestkitchen.com', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ - 'facetFilters': json.dumps([ - 'search_season_list:' + season, - 'search_document_klass:episode', - 'search_show_slug:' + slug, - ]), - 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'facetFilters': json.dumps(facet_filters), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug, 'attributesToHighlight': '', 'hitsPerPage': 1000, }) def entries(): for episode in (season_search.get('hits') or []): - search_url = episode.get('search_url') + search_url = episode.get('search_url') # always formatted like '/episode/123-title-of-episode' if not search_url: continue yield { '_type': 'url', - 'url': 'https://www.%s.com%s' % (show_name, search_url), - 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'url': 'https://www.americastestkitchen.com%s%s' % (show_path, search_url), + 'id': try_get(episode, lambda e: e['objectID'].rsplit('_', 1)[-1]), 'title': episode.get('title'), 'description': episode.get('description'), 'timestamp': unified_timestamp(episode.get('search_document_date')), @@ -156,4 +217,4 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): } return self.playlist_result( - entries(), 'season_%d' % season_number, season) + entries(), playlist_id, playlist_title) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index 6a74de758..95e0f663c 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -3,8 +3,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + clean_html, clean_podcast_url, + get_element_by_class, int_or_none, + parse_codecs, parse_iso8601, try_get, ) @@ -14,16 +17,17 @@ class ApplePodcastsIE(InfoExtractor): _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P\d+)' _TESTS = [{ 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', - 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'md5': '41dc31cd650143e530d9423b6b5a344f', 'info_dict': { 'id': '1000482637777', 'ext': 'mp3', 'title': '207 - Whitney Webb Returns', - 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6', 'upload_date': '20200705', - 'timestamp': 1593921600, - 'duration': 6425, + 'timestamp': 1593932400, + 'duration': 6454, 'series': 'The Tim Dillon Show', + 'thumbnail': 're:.+[.](png|jpe?g|webp)', } }, { 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', @@ -39,19 +43,40 @@ class ApplePodcastsIE(InfoExtractor): def _real_extract(self, url): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) - ember_data = self._parse_json(self._search_regex( - r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', - webpage, 'ember data'), episode_id) - ember_data = ember_data.get(episode_id) or ember_data - episode = ember_data['data']['attributes'] + episode_data = {} + ember_data = {} + # new page type 2021-11 + amp_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<', + webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {} + amp_data = try_get(amp_data, + lambda a: self._parse_json( + next(a[x] for x in iter(a) if episode_id in x), + episode_id), + dict) or {} + amp_data = amp_data.get('d') or [] + episode_data = try_get( + amp_data, + lambda a: next(x for x in a + if x['type'] == 'podcast-episodes' and x['id'] == episode_id), + dict) + if not episode_data: + # try pre 2021-11 page type: TODO: consider deleting if no longer used + ember_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) or {} + ember_data = ember_data.get(episode_id) or ember_data + episode_data = try_get(ember_data, lambda x: x['data'], dict) + episode = episode_data['attributes'] description = episode.get('description') or {} series = None - for inc in (ember_data.get('included') or []): + for inc in (amp_data or ember_data.get('included') or []): if inc.get('type') == 'media/podcast': series = try_get(inc, lambda x: x['attributes']['name']) + series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) - return { + info = [{ 'id': episode_id, 'title': episode['name'], 'url': clean_podcast_url(episode['assetUrl']), @@ -59,4 +84,10 @@ class ApplePodcastsIE(InfoExtractor): 'timestamp': parse_iso8601(episode.get('releaseDateTime')), 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), 'series': series, - } + 'thumbnail': self._og_search_thumbnail(webpage), + }] + self._sort_formats(info) + info = info[0] + codecs = parse_codecs(info.get('ext', 'mp3')) + info.update(codecs) + return info diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index d45a9fe52..a5b1f54d5 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -332,9 +332,24 @@ class ARDIE(InfoExtractor): formats.append(f) self._sort_formats(formats) + _SUB_FORMATS = ( + ('./dataTimedText', 'ttml'), + ('./dataTimedTextNoOffset', 'ttml'), + ('./dataTimedTextVtt', 'vtt'), + ) + + subtitles = {} + for subsel, subext in _SUB_FORMATS: + for node in video_node.findall(subsel): + subtitles.setdefault('de', []).append({ + 'url': node.attrib['url'], + 'ext': subext, + }) + return { 'id': xpath_text(video_node, './videoId', default=display_id), 'formats': formats, + 'subtitles': subtitles, 'display_id': display_id, 'title': video_node.find('./title').text, 'duration': parse_duration(video_node.find('./duration').text), diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index e6f72852e..2c3d37e01 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -342,4 +342,4 @@ class ArteTVCategoryIE(ArteTVBaseIE): return merge_dicts( self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title), - {'description': self._og_search_description(webpage, default=None)}) + {'description': self._og_search_description(webpage, default=None)}) \ No newline at end of file diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index cc7771354..4d1fbad1f 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -14,7 +14,7 @@ from ..utils import ( class AudiomackIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P[\w/-]+)' IE_NAME = 'audiomack' _TESTS = [ # hosted on audiomack @@ -29,25 +29,27 @@ class AudiomackIE(InfoExtractor): } }, # audiomack wrapper around soundcloud song + # Needs new test URL. { 'add_ie': ['Soundcloud'], 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', - 'info_dict': { - 'id': '258901379', - 'ext': 'mp3', - 'description': 'mamba day freestyle for the legend Kobe Bryant ', - 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', - 'uploader': 'ILOVEMAKONNEN', - 'upload_date': '20160414', - } + 'only_matching': True, + # 'info_dict': { + # 'id': '258901379', + # 'ext': 'mp3', + # 'description': 'mamba day freestyle for the legend Kobe Bryant ', + # 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', + # 'uploader': 'ILOVEMAKONNEN', + # 'upload_date': '20160414', + # } }, ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/song/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/song/', '/') # Request the extended version of the api for extra fields like artist and title api_response = self._download_json( @@ -73,13 +75,13 @@ class AudiomackIE(InfoExtractor): class AudiomackAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P[\w/-]+)' IE_NAME = 'audiomack:album' _TESTS = [ # Standard album playlist { 'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape', - 'playlist_count': 15, + 'playlist_count': 11, 'info_dict': { 'id': '812251', @@ -95,24 +97,24 @@ class AudiomackAlbumIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)', - 'id': '837577', + 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)', + 'id': '837580', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. G Herbo', } }], 'params': { - 'playliststart': 9, - 'playlistend': 9, + 'playliststart': 2, + 'playlistend': 2, } } ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/album/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/album/', '/') result = {'_type': 'playlist', 'entries': []} # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata # Therefore we don't know how many songs the album has and must infi-loop until failure @@ -134,7 +136,7 @@ class AudiomackAlbumIE(InfoExtractor): # Pull out the album metadata and add to result (if it exists) for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: if apikey in api_response and resultkey not in result: - result[resultkey] = api_response[apikey] + result[resultkey] = compat_str(api_response[apikey]) song_id = url_basename(api_response['url']).rpartition('.')[0] result['entries'].append({ 'id': compat_str(api_response.get('id', song_id)), diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 247d982ce..378b52f4f 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -12,6 +12,7 @@ from ..compat import ( compat_HTTPError, compat_parse_qs, compat_str, + compat_urllib_error, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -39,7 +40,7 @@ from ..utils import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' + _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -395,9 +396,17 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_mpd_formats( href, programme_id, mpd_id=format_id, fatal=False)) elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + # TODO: let expected_status be passed into _extract_xxx_formats() instead + try: + fmts = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + except ExtractorError as e: + if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + and e.exc_info[1].code in (403, 404)): + raise + fmts = [] + formats.extend(fmts) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) @@ -775,21 +784,33 @@ class BBCIE(BBCCoUkIE): 'timestamp': 1437785037, 'upload_date': '20150725', }, + }, { + # video with window.__INITIAL_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/av/world-europe-59468682', + 'info_dict': { + 'id': 'p0b71qth', + 'ext': 'mp4', + 'title': 'Why France is making this woman a national hero', + 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1638230731, + 'upload_date': '20211130', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, }, { + # bbcthreeConfig 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', 'info_dict': { 'id': 'p06556y7', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + 'title': 'Things Not To Say to people that live on council estates', + 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.", + 'duration': 360, + 'thumbnail': r're:https?://.+/.+\.jpg', }, - 'params': { - 'skip_download': True, - } }, { # window.__PRELOADED_STATE__ 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', @@ -1162,9 +1183,16 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) - initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, + 'quoted preload state', default=None) + if initial_data is None: + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, + 'preload state', default={}) + else: + initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) + initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: def parse_media(media): if not media: @@ -1205,7 +1233,10 @@ class BBCIE(BBCCoUkIE): if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + for block in (try_get(resp, + (lambda x: x['data']['blocks'], + lambda x: x['data']['content']['model']['blocks'],), + list) or []): if block.get('type') != 'media': continue parse_media(block.get('model')) diff --git a/youtube_dl/extractor/bigo.py b/youtube_dl/extractor/bigo.py new file mode 100644 index 000000000..ddf76ac55 --- /dev/null +++ b/youtube_dl/extractor/bigo.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, urlencode_postdata + + +class BigoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P[^/]+)' + + _TESTS = [{ + 'url': 'https://www.bigo.tv/ja/221338632', + 'info_dict': { + 'id': '6576287577575737440', + 'title': '土よ〜💁‍♂️ 休憩室/REST room', + 'thumbnail': r're:https?://.+', + 'uploader': '✨Shin💫', + 'uploader_id': '221338632', + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://www.bigo.tv/th/Tarlerm1304', + 'only_matching': True, + }, { + 'url': 'https://bigo.tv/115976881', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + info_raw = self._download_json( + 'https://bigo.tv/studio/getInternalStudioInfo', + user_id, data=urlencode_postdata({'siteId': user_id})) + + if not isinstance(info_raw, dict): + raise ExtractorError('Received invalid JSON data') + if info_raw.get('code'): + raise ExtractorError( + 'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True) + info = info_raw.get('data') or {} + + if not info.get('alive'): + raise ExtractorError('This user is offline.', expected=True) + + return { + 'id': info.get('roomId') or user_id, + 'title': info.get('roomTopic') or info.get('nick_name') or user_id, + 'formats': [{ + 'url': info.get('hls_src'), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'thumbnail': info.get('snapshot'), + 'uploader': info.get('nick_name'), + 'uploader_id': user_id, + 'is_live': True, + } diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index bff6ea194..d42f0e98a 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -369,6 +369,11 @@ class BilibiliAudioIE(BilibiliAudioBaseIE): 'filesize': int_or_none(play_data.get('size')), }] + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + song = self._call_api('song/info', au_id) title = song['title'] statistic = song.get('statistic') or {} diff --git a/youtube_dl/extractor/blerp.py b/youtube_dl/extractor/blerp.py new file mode 100644 index 000000000..355daef6e --- /dev/null +++ b/youtube_dl/extractor/blerp.py @@ -0,0 +1,173 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from ..utils import ( + strip_or_none, + traverse_obj, +) +from .common import InfoExtractor + + +class BlerpIE(InfoExtractor): + IE_NAME = 'blerp' + _VALID_URL = r'https?://(?:www\.)?blerp\.com/soundbites/(?P[0-9a-zA-Z]+)' + _TESTS = [{ + 'url': 'https://blerp.com/soundbites/6320fe8745636cb4dd677a5a', + 'info_dict': { + 'id': '6320fe8745636cb4dd677a5a', + 'title': 'Samsung Galaxy S8 Over the Horizon Ringtone 2016', + 'uploader': 'luminousaj', + 'uploader_id': '5fb81e51aa66ae000c395478', + 'ext': 'mp3', + 'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'], + } + }, { + 'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f', + 'info_dict': { + 'id': '5bc94ef4796001000498429f', + 'title': 'Yee', + 'uploader': '179617322678353920', + 'uploader_id': '5ba99cf71386730004552c42', + 'ext': 'mp3', + 'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee'] + } + }] + + _GRAPHQL_OPERATIONNAME = "webBitePageGetBite" + _GRAPHQL_QUERY = ( + '''query webBitePageGetBite($_id: MongoID!) { + web { + biteById(_id: $_id) { + ...bitePageFrag + __typename + } + __typename + } + } + + fragment bitePageFrag on Bite { + _id + title + userKeywords + keywords + color + visibility + isPremium + owned + price + extraReview + isAudioExists + image { + filename + original { + url + __typename + } + __typename + } + userReactions { + _id + reactions + createdAt + __typename + } + topReactions + totalSaveCount + saved + blerpLibraryType + license + licenseMetaData + playCount + totalShareCount + totalFavoriteCount + totalAddedToBoardCount + userCategory + userAudioQuality + audioCreationState + transcription + userTranscription + description + createdAt + updatedAt + author + listingType + ownerObject { + _id + username + profileImage { + filename + original { + url + __typename + } + __typename + } + __typename + } + transcription + favorited + visibility + isCurated + sourceUrl + audienceRating + strictAudienceRating + ownerId + reportObject { + reportedContentStatus + __typename + } + giphy { + mp4 + gif + __typename + } + audio { + filename + original { + url + __typename + } + mp3 { + url + __typename + } + __typename + } + __typename + } + + ''') + + def _real_extract(self, url): + audio_id = self._match_id(url) + + data = { + 'operationName': self._GRAPHQL_OPERATIONNAME, + 'query': self._GRAPHQL_QUERY, + 'variables': { + '_id': audio_id + } + } + + headers = { + 'Content-Type': 'application/json' + } + + json_result = self._download_json('https://api.blerp.com/graphql', + audio_id, data=json.dumps(data).encode('utf-8'), headers=headers) + + bite_json = json_result['data']['web']['biteById'] + + info_dict = { + 'id': bite_json['_id'], + 'url': bite_json['audio']['mp3']['url'], + 'title': bite_json['title'], + 'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none), + 'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none), + 'ext': 'mp3', + 'tags': list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None) + } + + return info_dict diff --git a/youtube_dl/extractor/bongacams.py b/youtube_dl/extractor/bongacams.py index 180542fbc..016999d55 100644 --- a/youtube_dl/extractor/bongacams.py +++ b/youtube_dl/extractor/bongacams.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -12,13 +13,28 @@ from ..utils import ( class BongaCamsIE(InfoExtractor): - _VALID_URL = r'https?://(?P(?:[^/]+\.)?bongacams\d*\.com)/(?P[^/?&#]+)' + _VALID_URL = r'https?://(?P(?:[^/]+\.)?bongacams\d*\.(?:com|net))/(?P[^/?&#]+)' _TESTS = [{ 'url': 'https://de.bongacams.com/azumi-8', 'only_matching': True, }, { 'url': 'https://cn.bongacams.com/azumi-8', 'only_matching': True, + }, { + 'url': 'https://de.bongacams.net/claireashton', + 'info_dict': { + 'id': 'claireashton', + 'ext': 'mp4', + 'title': r're:ClaireAshton \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'age_limit': 18, + 'uploader_id': 'ClaireAshton', + 'uploader': 'ClaireAshton', + 'like_count': int, + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/callin.py b/youtube_dl/extractor/callin.py new file mode 100644 index 000000000..341be479f --- /dev/null +++ b/youtube_dl/extractor/callin.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + traverse_obj, + try_get, +) + + +class CallinIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?callin\.com/episode/(?:[^/#?-]+-)*(?P[^/#?-]+)' + _TESTS = [{ + 'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW', + 'md5': '14ede27ee2c957b7e4db93140fc0745c', + 'info_dict': { + 'id': 'PrumRdSQJW', + 'ext': 'mp4', + 'title': 'FCC Commissioner Brendan Carr on Elon’s Starlink', + 'description': 'Or, why the government doesn’t like SpaceX', + 'channel': 'The Pull Request', + 'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa', + } + }, { + 'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA', + 'md5': '16f704ddbf82a27e3930533b12062f07', + 'info_dict': { + 'id': 'lzxMidUnjA', + 'ext': 'mp4', + 'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?', + 'description': 'Let’s talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.', + 'channel': 'The DEBRIEF With Briahna Joy Gray', + 'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm', + } + }] + + def _search_nextjs_data(self, webpage, video_id, transform_source=None, fatal=True, **kw): + return self._parse_json( + self._search_regex( + r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', + webpage, 'next.js data', fatal=fatal, **kw), + video_id, transform_source=transform_source, fatal=fatal) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + next_data = self._search_nextjs_data(webpage, video_id) + episode = traverse_obj(next_data, ('props', 'pageProps', 'episode'), expected_type=dict) + if not episode: + raise ExtractorError('Failed to find episode data') + + title = episode.get('title') or self._og_search_title(webpage) + description = episode.get('description') or self._og_search_description(webpage) + + formats = [] + formats.extend(self._extract_m3u8_formats( + episode.get('m3u8'), video_id, 'mp4', + entry_protocol='m3u8_native', fatal=False)) + self._sort_formats(formats) + + channel = try_get(episode, lambda x: x['show']['title'], compat_str) + channel_url = try_get(episode, lambda x: x['show']['linkObj']['resourceUrl'], compat_str) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'channel': channel, + 'channel_url': channel_url, + } diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 1eb81b75e..d2e860b24 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - ExtractorError, int_or_none, url_or_none, ) @@ -20,32 +19,11 @@ class CamModelsIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage( - url, user_id, headers=self.geo_verification_headers()) - - manifest_root = self._html_search_regex( - r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) - - if not manifest_root: - ERRORS = ( - ("I'm offline, but let's stay connected", 'This user is currently offline'), - ('in a private show', 'This user is in a private show'), - ('is currently performing LIVE', 'This model is currently performing live'), - ) - for pattern, message in ERRORS: - if pattern in webpage: - error = message - expected = True - break - else: - error = 'Unable to find manifest URL root' - expected = False - raise ExtractorError(error, expected=expected) - manifest = self._download_json( - '%s%s.json' % (manifest_root, user_id), user_id) + 'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id) formats = [] + thumbnails = [] for format_id, format_dict in manifest['formats'].items(): if not isinstance(format_dict, dict): continue @@ -85,6 +63,13 @@ class CamModelsIE(InfoExtractor): 'preference': -1, }) else: + if format_id == 'jpeg': + thumbnails.append({ + 'url': f['url'], + 'width': f['width'], + 'height': f['height'], + 'format_id': f['format_id'], + }) continue formats.append(f) self._sort_formats(formats) @@ -92,6 +77,7 @@ class CamModelsIE(InfoExtractor): return { 'id': user_id, 'title': self._live_title(user_id), + 'thumbnails': thumbnails, 'is_live': True, 'formats': formats, 'age_limit': 18 diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 7cb4efb74..fe677d8e8 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -12,35 +12,21 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, - unescapeHTML, - update_url_query, + str_or_none, + traverse_obj, urlencode_postdata, USER_AGENTS, ) class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P[^/#?]+)' _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', - 'info_dict': { - 'id': '61924494877246241', - 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Život v Grónsku', - 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { 'id': '61924494877028507', 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Bonus 01 - En', + 'title': 'Bonus 01 - En - Hyde Park Civilizace', 'description': 'English Subtittles', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 81.3, @@ -51,31 +37,111 @@ class CeskaTelevizeIE(InfoExtractor): }, }, { # live stream - 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'url': 'http://www.ceskatelevize.cz/zive/ct1/', 'info_dict': { - 'id': 402, + 'id': '102', 'ext': 'mp4', - 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'title': r'ČT1 - živé vysílání online', + 'description': 'Sledujte živé vysílání kanálu ČT1 online. Vybírat si můžete i z dalších kanálů České televize na kterémkoli z vašich zařízení.', 'is_live': True, }, 'params': { # m3u8 download 'skip_download': True, }, - 'skip': 'Georestricted to Czech Republic', + }, { + # another + 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'only_matching': True, + 'info_dict': { + 'id': 402, + 'ext': 'mp4', + 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + }, + # 'skip': 'Georestricted to Czech Republic', }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', 'only_matching': True, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Bogotart - Queer', + 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494877311053', + 'ext': 'mp4', + 'title': 'Bogotart - Queer (Varování 18+)', + 'duration': 11.9, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Bogotart - Queer (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, }] + def _search_nextjs_data(self, webpage, video_id, **kw): + return self._parse_json( + self._search_regex( + r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', + webpage, 'next.js data', **kw), + video_id, **kw) + def _real_extract(self, url): playlist_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle(url, playlist_id) + parsed_url = compat_urllib_parse_urlparse(urlh.geturl()) + site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize') + playlist_title = self._og_search_title(webpage, default=None) + if site_name and playlist_title: + playlist_title = re.split(r'\s*[—|]\s*%s' % (site_name, ), playlist_title, 1)[0] + playlist_description = self._og_search_description(webpage, default=None) + if playlist_description: + playlist_description = playlist_description.replace('\xa0', ' ') - webpage = self._download_webpage(url, playlist_id) + type_ = 'IDEC' + if re.search(r'(^/porady|/zive)/', parsed_url.path): + next_data = self._search_nextjs_data(webpage, playlist_id) + if '/zive/' in parsed_url.path: + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'idec'), get_all=False) + else: + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) + if not idec: + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'videobonusDetail', 'bonusId'), get_all=False) + if idec: + type_ = 'bonus' + if not idec: + raise ExtractorError('Failed to find IDEC id') + iframe_hash = self._download_webpage( + 'https://www.ceskatelevize.cz/v-api/iframe-hash/', + playlist_id, note='Getting IFRAME hash') + query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec, } + webpage = self._download_webpage( + 'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', + playlist_id, note='Downloading player', query=query) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s

' % NOT_AVAILABLE_STRING in webpage: - raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + self.raise_geo_restricted(NOT_AVAILABLE_STRING) + if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )): + raise ExtractorError('no video with IDEC available', video_id=idec, expected=True) type_ = None episode_id = None @@ -100,7 +166,7 @@ class CeskaTelevizeIE(InfoExtractor): data = { 'playlist[0][type]': type_, 'playlist[0][id]': episode_id, - 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestUrl': parsed_url.path, 'requestSource': 'iVysilani', } @@ -108,7 +174,7 @@ class CeskaTelevizeIE(InfoExtractor): for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -130,9 +196,6 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage, default=None) - playlist_description = self._og_search_description(webpage, default=None) - playlist = self._download_json(req, playlist_id, fatal=False) if not playlist: continue @@ -167,7 +230,7 @@ class CeskaTelevizeIE(InfoExtractor): entries[num]['formats'].extend(formats) continue - item_id = item.get('id') or item['assetId'] + item_id = str_or_none(item.get('id') or item['assetId']) title = item['title'] duration = float_or_none(item.get('duration')) @@ -181,8 +244,6 @@ class CeskaTelevizeIE(InfoExtractor): if playlist_len == 1: final_title = playlist_title or title - if is_live: - final_title = self._live_title(final_title) else: final_title = '%s (%s)' % (playlist_title, title) @@ -200,6 +261,8 @@ class CeskaTelevizeIE(InfoExtractor): for e in entries: self._sort_formats(e['formats']) + if len(entries) == 1: + return entries[0] return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) def _get_subtitles(self, episode_id, subs): @@ -236,54 +299,3 @@ class CeskaTelevizeIE(InfoExtractor): yield line return '\r\n'.join(_fix_subtitle(subtitles)) - - -class CeskaTelevizePoradyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P[^/#?]+)' - _TESTS = [{ - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', - 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', - }, - 'playlist': [{ - 'info_dict': { - 'id': '61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # iframe embed - 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - data_url = update_url_query(unescapeHTML(self._search_regex( - (r']*\bdata-url=(["\'])(?P(?:(?!\1).)+)\1', - r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), - webpage, 'iframe player url', group='url')), query={ - 'autoStart': 'true', - }) - - return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 797c35fd5..7244e5df6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -70,6 +70,7 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, + try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -1086,7 +1087,7 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))' property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' @@ -2713,7 +2714,7 @@ class InfoExtractor(object): def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): mobj = re.search( - r'(?s)jwplayer\((?P[\'"])[^\'" ]+(?P=quote)\)(?!).*?\.setup\s*\((?P[^)]+)\)', + r'''(?s)jwplayer\s*\(\s*(?P'|")(?!(?P=q)).+(?P=q)\s*\)(?!).*?\.\s*setup\s*\(\s*(?P(?:\([^)]*\)|[^)])+)\s*\)''', webpage) if mobj: try: @@ -2734,9 +2735,14 @@ class InfoExtractor(object): def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + flat_pl = try_get(jwplayer_data, lambda x: x.get('playlist') or True) + if flat_pl is None: + # not even a dict + return [] + # JWPlayer backward compatibility: flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: + if flat_pl is True: jwplayer_data = {'playlist': [jwplayer_data]} entries = [] @@ -2784,6 +2790,13 @@ class InfoExtractor(object): 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, + 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ... + 'genre': clean_html(video_data.get('genre')), + 'channel': clean_html(dict_get(video_data, ('category', 'channel'))), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'release_year': int_or_none(video_data.get('releasedate')), + 'age_limit': int_or_none(video_data.get('age_restriction')), } # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): @@ -2792,7 +2805,9 @@ class InfoExtractor(object): 'url': formats[0]['url'], }) else: - self._sort_formats(formats) + # avoid exception in case of only sttls + if formats: + self._sort_formats(formats) entry['formats'] = formats entries.append(entry) if len(entries) == 1: @@ -2802,7 +2817,7 @@ class InfoExtractor(object): def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - urls = [] + urls = set() formats = [] for source in jwplayer_sources_data: if not isinstance(source, dict): @@ -2811,14 +2826,14 @@ class InfoExtractor(object): base_url, self._proto_relative_url(source.get('file'))) if not source_url or source_url in urls: continue - urls.append(source_url) + urls.add(source_url) source_type = source.get('type') or '' ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': + if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif source_type == 'dash' or ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url: formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil': @@ -2833,20 +2848,23 @@ class InfoExtractor(object): 'ext': ext, }) else: + format_id = str_or_none(source.get('label')) height = int_or_none(source.get('height')) - if height is None: + if height is None and format_id: # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. - height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), - 'height', default=None)) + height = parse_resolution(format_id).get('height') a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), 'height': height, - 'tbr': int_or_none(source.get('bitrate')), + 'tbr': int_or_none(source.get('bitrate'), scale=1000), + 'filesize': int_or_none(source.get('filesize')), 'ext': ext, } + if format_id: + a_format['format_id'] = format_id + if source_url.startswith('rtmp'): a_format['ext'] = 'flv' # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as diff --git a/youtube_dl/extractor/cpac.py b/youtube_dl/extractor/cpac.py new file mode 100644 index 000000000..22741152c --- /dev/null +++ b/youtube_dl/extractor/cpac.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + urljoin, +) + +# compat_range +try: + if callable(xrange): + range = xrange +except (NameError, TypeError): + pass + + +class CPACIE(InfoExtractor): + IE_NAME = 'cpac' + _VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?Pl-)?episode\?id=(?P[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})' + _TEST = { + # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909', + 'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'md5': 'e46ad699caafd7aa6024279f2614e8fa', + 'info_dict': { + 'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'ext': 'mp4', + 'upload_date': '20220215', + 'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022', + 'description': 'md5:466a206abd21f3a6f776cdef290c23fb', + 'timestamp': 1644901200, + }, + 'params': { + 'format': 'bestvideo', + 'hls_prefer_native': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if '/l-episode?' in url else 'en' + + content = self._download_json( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id, + video_id) + video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str) + formats = [] + if video_url: + content = content['page'] + title = str_or_none(content['details']['title_%s_t' % (url_lang, )]) + formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4') + for fmt in formats: + # prefer language to match URL + fmt_lang = fmt.get('language') + if fmt_lang == url_lang: + fmt['language_preference'] = 10 + elif not fmt_lang: + fmt['language_preference'] = -1 + else: + fmt['language_preference'] = -10 + + self._sort_formats(formats) + + category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) + + def is_live(v_type): + return (v_type == 'live') if v_type is not None else None + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))), + 'timestamp': unified_timestamp(content['details'].get('liveDateTime')), + 'category': [category] if category else None, + 'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))), + 'is_live': is_live(content['details'].get('type')), + } + + +class CPACPlaylistIE(InfoExtractor): + IE_NAME = 'cpac:playlist' + _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?Pemission|rechercher))\?(?:[^&]+&)*?(?P(?:id=\d+|programId=\d+|key=[^&]+))' + + _TESTS = [{ + 'url': 'https://www.cpac.ca/program?id=6', + 'info_dict': { + 'id': 'id=6', + 'title': 'Headline Politics', + 'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc', + 'info_dict': { + 'id': 'key=hudson', + 'title': 'hudson', + }, + 'playlist_count': 22, + }, { + 'url': 'https://www.cpac.ca/search?programId=50', + 'info_dict': { + 'id': 'programId=50', + 'title': '50', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.cpac.ca/emission?id=6', + 'only_matching': True, + }, { + 'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en' + pl_type, list_type = ('program', 'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult') + api_url = ( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s' + % (pl_type, video_id, )) + content = self._download_json(api_url, video_id) + entries = [] + total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1) + for page in range(1, total_pages + 1): + if page > 1: + api_url = update_url_query(api_url, {'page': '%d' % (page, ), }) + content = self._download_json( + api_url, video_id, + note='Downloading continuation - %d' % (page, ), + fatal=False) + + for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []: + episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )])) + if episode_url: + entries.append(episode_url) + + return self.playlist_result( + (self.url_result(entry) for entry in entries), + playlist_id=video_id, + playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1], + playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]), + ) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6e8fc3961..3a87f9e33 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -51,6 +51,10 @@ from .anvato import AnvatoIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) from .apa import APAIE from .aparat import AparatIE from .appleconnect import AppleConnectIE @@ -71,6 +75,7 @@ from .arte import ( ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, + ArteTVCategoryIE, ) from .arnes import ArnesIE from .asiancrush import ( @@ -114,6 +119,7 @@ from .bfmtv import ( ) from .bibeltv import BibelTVIE from .bigflix import BigflixIE +from .bigo import BigoIE from .bild import BildIE from .bilibili import ( BiliBiliIE, @@ -132,6 +138,7 @@ from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, ) +from .blerp import BlerpIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE @@ -152,6 +159,7 @@ from .businessinsider import BusinessInsiderIE from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE +from .callin import CallinIE from .camdemy import ( CamdemyIE, CamdemyFolderIE @@ -202,10 +210,7 @@ from .ccc import ( from .ccma import CCMAIE from .cctv import CCTVIE from .cda import CDAIE -from .ceskatelevize import ( - CeskaTelevizeIE, - CeskaTelevizePoradyIE, -) +from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .charlierose import CharlieRoseIE from .chaturbate import ChaturbateIE @@ -253,6 +258,10 @@ from .commonprotocols import ( from .condenast import CondeNastIE from .contv import CONtvIE from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) from .cracked import CrackedIE from .crackle import CrackleIE from .crooksandliars import CrooksAndLiarsIE @@ -367,6 +376,8 @@ from .fc2 import ( FC2EmbedIE, ) from .fczenit import FczenitIE +from .filemoon import FileMoonIE +from .fifa import FifaIE from .filmon import ( FilmOnIE, FilmOnChannelIE, @@ -469,6 +480,7 @@ from .hotstar import ( ) from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrfernsehen import HRFernsehenIE from .hrti import ( HRTiIE, HRTiPlaylistIE, @@ -545,8 +557,10 @@ from .khanacademy import ( from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE +from .kommunetv import KommunetvIE from .konserthusetplay import KonserthusetPlayIE from .krasview import KrasViewIE +from .kth import KTHIE from .ku6 import Ku6IE from .kusi import KUSIIE from .kuwo import ( @@ -716,6 +730,7 @@ from .myvi import ( MyviIE, MyviEmbedIE, ) +from .myvideoge import MyVideoGeIE from .myvidster import MyVidsterIE from .nationalgeographic import ( NationalGeographicVideoIE, @@ -789,7 +804,14 @@ from .nick import ( NickNightIE, NickRuIE, ) -from .niconico import NiconicoIE, NiconicoPlaylistIE +from .niconico import ( + NiconicoIE, + NiconicoPlaylistIE, + NiconicoUserIE, + NicovideoSearchIE, + NicovideoSearchDateIE, + NicovideoSearchURLIE, +) from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE @@ -893,6 +915,10 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .pearvideo import PearVideoIE +from .peekvids import ( + PeekVidsIE, + PlayVidsIE, +) from .peertube import PeerTubeIE from .people import PeopleIE from .performgroup import PerformGroupIE @@ -986,6 +1012,10 @@ from .raywenderlich import ( RayWenderlichIE, RayWenderlichCourseIE, ) +from .rbgtum import ( + RbgTumIE, + RbgTumCourseIE, +) from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redbulltv import ( @@ -1176,6 +1206,7 @@ from .storyfire import ( from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .streamsb import StreamsbIE from .streetvoice import StreetVoiceIE from .stretchinternet import StretchInternetIE from .stv import STVPlayerIE @@ -1245,6 +1276,11 @@ from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE +from .thisvid import ( + ThisVidIE, + ThisVidMemberIE, + ThisVidPlaylistIE, +) from .threeqsdn import ThreeQSDNIE from .tiktok import ( TikTokIE, @@ -1606,7 +1642,7 @@ from .youtube import ( YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, - #YoutubeSearchURLIE, + YoutubeSearchURLIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, @@ -1642,3 +1678,7 @@ from .zingmp3 import ( ) from .zoom import ZoomIE from .zype import ZypeIE +from .pr0gramm import ( + Pr0grammIE, + Pr0grammStaticIE, +) diff --git a/youtube_dl/extractor/fifa.py b/youtube_dl/extractor/fifa.py new file mode 100644 index 000000000..15157774e --- /dev/null +++ b/youtube_dl/extractor/fifa.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + int_or_none, + traverse_obj, + unified_timestamp, +) + +if not callable(getattr(InfoExtractor, '_match_valid_url', None)): + + BaseInfoExtractor = InfoExtractor + + import re + + class InfoExtractor(BaseInfoExtractor): + + @classmethod + def _match_valid_url(cls, url): + return re.match(cls._VALID_URL, url) + + +class FifaIE(InfoExtractor): + _VALID_URL = r'https?://www.fifa.com/fifaplus/(?P\w{2})/watch/([^#?]+/)?(?P\w+)' + _TESTS = [{ + 'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y', + 'info_dict': { + 'id': '7on10qPcnyLajDDU3ntg6y', + 'title': 'Italy v France | Final | 2006 FIFA World Cup Germany™ | Full Match Replay', + 'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments'], + 'thumbnail': 'https://digitalhub.fifa.com/transform/135e2656-3a51-407b-8810-6c34bec5b59b/FMR_2006_Italy_France_Final_Hero', + 'duration': 8165, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.fifa.com/fifaplus/pt/watch/1cg5r5Qt6Qt12ilkDgb1sV', + 'info_dict': { + 'id': '1cg5r5Qt6Qt12ilkDgb1sV', + 'title': 'Brazil v Germany | Semi-finals | 2014 FIFA World Cup Brazil™ | Extended Highlights', + 'description': 'md5:d908c74ee66322b804ae2e521b02a855', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments', 'Highlights'], + 'thumbnail': 'https://digitalhub.fifa.com/transform/d8fe6f61-276d-4a73-a7fe-6878a35fd082/FIFAPLS_100EXTHL_2014BRAvGER_TMB', + 'duration': 902, + 'release_timestamp': 1404777600, + 'release_date': '20140708', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.fifa.com/fifaplus/fr/watch/3C6gQH9C2DLwzNx7BMRQdp', + 'info_dict': { + 'id': '3C6gQH9C2DLwzNx7BMRQdp', + 'title': 'Josimar goal against Northern Ireland | Classic Goals', + 'description': 'md5:cbe7e7bb52f603c9f1fe9a4780fe983b', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments', 'Goal'], + 'duration': 28, + 'thumbnail': 'https://digitalhub.fifa.com/transform/f9301391-f8d9-48b5-823e-c093ac5e3e11/CG_MEN_1986_JOSIMAR', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id, locale = self._match_valid_url(url).group('id', 'locale') + webpage = self._download_webpage(url, video_id) + + preconnect_link = self._search_regex( + r']+\brel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link') + + video_details = self._download_json( + '{preconnect_link}/sections/videoDetails/{video_id}'.format(**locals()), video_id, 'Downloading Video Details', fatal=False) + + preplay_parameters = self._download_json( + '{preconnect_link}/videoPlayerData/{video_id}'.format(**locals()), video_id, 'Downloading Preplay Parameters')['preplayParameters'] + + content_data = self._download_json( + # 1. query string is expected to be sent as-is + # 2. `sig` must be appended + # 3. if absent, the call appears to work but the manifest is bad (404) + 'https://content.uplynk.com/preplay/{contentId}/multiple.json?{queryStr}&sig={signature}'.format(**preplay_parameters), + video_id, 'Downloading Content Data') + + # formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id) + formats, subtitles = self._extract_m3u8_formats(content_data['playURL'], video_id, ext='mp4', entry_protocol='m3u8_native'), None + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_details['title'], + 'description': video_details.get('description'), + 'duration': int_or_none(video_details.get('duration')), + 'release_timestamp': unified_timestamp(video_details.get('dateOfRelease')), + 'categories': traverse_obj(video_details, (('videoCategory', 'videoSubcategory'),)), + 'thumbnail': traverse_obj(video_details, ('backgroundImage', 'src')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/filemoon.py b/youtube_dl/extractor/filemoon.py new file mode 100644 index 000000000..654df9b69 --- /dev/null +++ b/youtube_dl/extractor/filemoon.py @@ -0,0 +1,43 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + js_to_json, +) + + +class FileMoonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filemoon\.sx/./(?P\w+)' + _TEST = { + 'url': 'https://filemoon.sx/e/dw40rxrzruqz', + 'md5': '5a713742f57ac4aef29b74733e8dda01', + 'info_dict': { + 'id': 'dw40rxrzruqz', + 'title': 'dw40rxrzruqz', + 'ext': 'mp4' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + matches = re.findall(r'(?s)(eval.*?)', webpage) + packed = matches[-1] + unpacked = decode_packed_codes(packed) + jwplayer_sources = self._parse_json( + self._search_regex( + r'(?s)player\s*\.\s*setup\s*\(\s*\{\s*sources\s*:\s*(.*?])', unpacked, 'jwplayer sources'), + video_id, transform_source=js_to_json) + + formats = self._parse_jwplayer_formats(jwplayer_sources, video_id) + + return { + 'id': video_id, + 'title': self._generic_title(url) or video_id, + 'formats': formats + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a9c064105..0e473e952 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -28,6 +28,7 @@ from ..utils import ( mimetype2ext, orderedSet, parse_duration, + parse_resolution, sanitized_Request, smuggle_url, unescapeHTML, @@ -35,6 +36,7 @@ from ..utils import ( unsmuggle_url, UnsupportedError, url_or_none, + urljoin, xpath_attr, xpath_text, xpath_with_ns, @@ -2227,6 +2229,97 @@ class GenericIE(InfoExtractor): # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed) 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', 'only_matching': True, + }, { + # KVS Player + 'url': 'https://www.kvs-demo.com/videos/105/kelis-4th-of-july/', + 'info_dict': { + 'id': '105', + 'display_id': 'kelis-4th-of-july', + 'ext': 'mp4', + 'title': 'Kelis - 4th Of July', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + }, + }, { + # KVS Player + 'url': 'https://www.kvs-demo.com/embed/105/', + 'info_dict': { + 'id': '105', + 'display_id': 'kelis-4th-of-july', + 'ext': 'mp4', + 'title': 'Kelis - 4th Of July / Embed Player', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # KVS Player (tested also in thisvid.py) + 'url': 'https://youix.com/video/leningrad-zoj/', + 'md5': '94f96ba95706dc3880812b27b7d8a2b8', + 'info_dict': { + 'id': '18485', + 'display_id': 'leningrad-zoj', + 'ext': 'mp4', + 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com', + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, + }, { + # KVS Player + 'url': 'https://youix.com/embed/18485', + 'md5': '94f96ba95706dc3880812b27b7d8a2b8', + 'info_dict': { + 'id': '18485', + 'display_id': 'leningrad-zoj', + 'ext': 'mp4', + 'title': 'Ленинград - ЗОЖ', + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, + }, { + # KVS Player + 'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/', + 'md5': '94166bdb26b4cb1fb9214319a629fc51', + 'info_dict': { + 'id': '21217', + 'display_id': '40-nochey-2016', + 'ext': 'mp4', + 'title': '40 ночей (2016) - BogMedia.org', + 'description': 'md5:4e6d7d622636eb7948275432eb256dc3', + 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', + }, + }, { + # KVS Player (for sites that serve kt_player.js via non-https urls) + 'url': 'http://www.camhub.world/embed/389508', + 'md5': 'fbe89af4cfb59c8fd9f34a202bb03e32', + 'info_dict': { + 'id': '389508', + 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source', + 'ext': 'mp4', + 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', + 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg', + }, + }, { + 'url': 'https://mrdeepfakes.com/video/5/selena-gomez-pov-deep-fakes', + 'md5': 'fec4ad5ec150f655e0c74c696a4a2ff4', + 'info_dict': { + 'id': '5', + 'display_id': 'selena-gomez-pov-deep-fakes', + 'ext': 'mp4', + 'title': 'Selena Gomez POV (Deep Fakes) DeepFake Porn - MrDeepFakes', + 'description': 'md5:17d1f84b578c9c26875ac5ef9a932354', + 'height': 720, + 'age_limit': 18, + }, + }, { + 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/', + 'md5': 'e2f0a4c329f7986280b7328e24036d60', + 'info_dict': { + 'id': '284002', + 'display_id': 'just-out-of-the-shower-joi', + 'ext': 'mp4', + 'title': 'Just Out Of The Shower JOI - Shooshtime', + 'height': 720, + 'age_limit': 18, + }, }, ] @@ -2332,6 +2425,88 @@ class GenericIE(InfoExtractor): 'title': title, } + def _extract_kvs(self, url, webpage, video_id): + + def getlicensetoken(license): + modlicense = license.replace('$', '').replace('0', '1') + center = int(len(modlicense) / 2) + fronthalf = int(modlicense[:center + 1]) + backhalf = int(modlicense[center:]) + + modlicense = compat_str(4 * abs(fronthalf - backhalf)) + + def parts(): + for o in range(0, center + 1): + for i in range(1, 5): + yield compat_str((int(license[o + i]) + int(modlicense[o])) % 10) + + return ''.join(parts()) + + def getrealurl(video_url, license_code): + if not video_url.startswith('function/0/'): + return video_url # not obfuscated + + url_path, _, url_query = video_url.partition('?') + urlparts = url_path.split('/')[2:] + license = getlicensetoken(license_code) + newmagic = urlparts[5][:32] + + def spells(x, o): + l = (o + sum(int(n) for n in license[o:])) % 32 + for i in range(0, len(x)): + yield {l: x[o], o: x[l]}.get(i, x[i]) + + for o in range(len(newmagic) - 1, -1, -1): + newmagic = ''.join(spells(newmagic, o)) + + urlparts[5] = newmagic + urlparts[5][32:] + return '/'.join(urlparts) + '?' + url_query + + flashvars = self._search_regex( + r'(?s)]*>.*?var\s+flashvars\s*=\s*(\{.+?\});.*?', + webpage, 'flashvars') + flashvars = self._parse_json(flashvars, video_id, transform_source=js_to_json) + + # extract the part after the last / as the display_id from the + # canonical URL. + display_id = self._search_regex( + r'(?:' + r'|)', + webpage, 'display_id', fatal=False + ) + title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)', webpage, 'title') + + thumbnail = flashvars['preview_url'] + if thumbnail.startswith('//'): + protocol, _, _ = url.partition('/') + thumbnail = protocol + thumbnail + + url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys())) + formats = [] + for key in url_keys: + if '/get_file/' not in flashvars[key]: + continue + format_id = flashvars.get(key + '_text', key) + formats.append(merge_dicts( + parse_resolution(format_id) or parse_resolution(flashvars[key]), { + 'url': urljoin(url, getrealurl(flashvars[key], flashvars['license_code'])), + 'format_id': format_id, + 'ext': 'mp4', + 'http_headers': {'Referer': url}, + })) + if not formats[-1].get('height'): + formats[-1]['quality'] = 1 + + self._sort_formats(formats) + + return { + 'id': flashvars['video_id'], + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } + def _real_extract(self, url): if url.startswith('//'): return self.url_result(self.http_scheme() + url) @@ -2540,9 +2715,16 @@ class GenericIE(InfoExtractor): # but actually don't. AGE_LIMIT_MARKERS = [ r'Proudly Labeled RTA', + r'>[^<]*you acknowledge you are at least (\d+) years old', + r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b', ] - if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): - age_limit = 18 + for marker in AGE_LIMIT_MARKERS: + m = re.search(marker, webpage) + if not m: + continue + age_limit = max( + age_limit or 0, + int_or_none(m.groups() and m.group(1), default=18)) # video uploader is domain name video_uploader = self._search_regex( @@ -3389,6 +3571,20 @@ class GenericIE(InfoExtractor): info_dict['formats'] = formats return info_dict + # Look for generic KVS player (before ld+json for tests) + found = self._search_regex( + (r']+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P\d+(?:\.\d+)+)\1[^>]*>', + # kt_player('kt_player', 'https://i.shoosh.co/player/kt_player.swf?v=5.5.1', ... + r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P\d+(?:\.\d+)+)\2\s*,', + ), webpage, 'KVS player', group='ver', default=False) + if found: + self.report_extraction('%s: KVS Player' % (video_id, )) + if found.split('.')[0] not in ('4', '5', '6'): + self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found, )) + return merge_dicts( + self._extract_kvs(url, webpage, video_id), + info_dict) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/hrfernsehen.py b/youtube_dl/extractor/hrfernsehen.py new file mode 100644 index 000000000..11b879dbd --- /dev/null +++ b/youtube_dl/extractor/hrfernsehen.py @@ -0,0 +1,101 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import json +import re + +from ..utils import ( + int_or_none, + unified_timestamp, + unescapeHTML +) +from .common import InfoExtractor + + +class HRFernsehenIE(InfoExtractor): + IE_NAME = 'hrfernsehen' + _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P[0-9]{6})\.html' + + _TESTS = [{ + 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', + 'md5': '5c4e0ba94677c516a2f65a84110fc536', + 'info_dict': { + 'id': '130546', + 'ext': 'mp4', + 'description': 'Sturmtief Kirsten fegt über Hessen / Die Corona-Pandemie – eine Chronologie / ' + 'Sterbehilfe: Die Lage in Hessen / Miss Hessen leitet zwei eigene Unternehmen / ' + 'Pop-Up Museum zeigt Schwarze Unterhaltung und Black Music', + 'subtitles': {'de': [{ + 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt' + }]}, + 'timestamp': 1598470200, + 'upload_date': '20200826', + 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg', + 'title': 'hessenschau vom 26.08.2020' + } + }, { + 'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html', + 'only_matching': True + }] + + _GEO_COUNTRIES = ['DE'] + + def extract_airdate(self, loader_data): + airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate') + + if airdate_str is None: + return None + + return unified_timestamp(airdate_str) + + def extract_formats(self, loader_data): + stream_formats = [] + for stream_obj in loader_data["videoResolutionLevels"]: + stream_format = { + 'format_id': str(stream_obj['verticalResolution']) + "p", + 'height': stream_obj['verticalResolution'], + 'url': stream_obj['url'], + } + + quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit', + stream_obj['url']) + if quality_information: + stream_format['width'] = int_or_none(quality_information.group(1)) + stream_format['height'] = int_or_none(quality_information.group(2)) + stream_format['fps'] = int_or_none(quality_information.group(3)) + stream_format['tbr'] = int_or_none(quality_information.group(4)) + + stream_formats.append(stream_format) + + self._sort_formats(stream_formats) + return stream_formats + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + ['og:title', 'twitter:title', 'name'], webpage) + description = self._html_search_meta( + ['description'], webpage) + + loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader")) + loader_data = json.loads(loader_str) + + info = { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': self.extract_formats(loader_data), + 'timestamp': self.extract_airdate(loader_data) + } + + if "subtitle" in loader_data: + info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]} + + thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()])) + if len(thumbnails) > 0: + info["thumbnails"] = [{"url": t} for t in thumbnails] + + return info diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 0d9f50ed2..c7daa30e5 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -1,19 +1,29 @@ +# coding: utf-8 + from __future__ import unicode_literals import re from .common import InfoExtractor from ..compat import ( + compat_filter as filter, + compat_HTTPError, compat_parse_qs, - compat_urllib_parse_urlparse, + compat_urlparse, ) from ..utils import ( - HEADRequest, determine_ext, + error_to_compat_str, + extract_attributes, + ExtractorError, int_or_none, + merge_dicts, + orderedSet, parse_iso8601, strip_or_none, - try_get, + traverse_obj, + url_or_none, + urljoin, ) @@ -22,14 +32,102 @@ class IGNBaseIE(InfoExtractor): return self._download_json( 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + e.cause.args = e.cause.args or [ + e.cause.geturl(), e.cause.getcode(), e.cause.reason] + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + raise + + def _extract_video_info(self, video, fatal=True): + video_id = video['videoId'] + + formats = [] + refs = traverse_obj(video, 'refs', expected_type=dict) or {} + + m3u8_url = url_or_none(refs.get('m3uUrl')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + f4m_url = url_or_none(refs.get('f4mUrl')) + if f4m_url: + formats.extend(self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False)) + + for asset in (video.get('assets') or []): + asset_url = url_or_none(asset.get('url')) + if not asset_url: + continue + formats.append({ + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': int_or_none(asset.get('frame_rate')), + 'height': int_or_none(asset.get('height')), + 'width': int_or_none(asset.get('width')), + }) + + mezzanine_url = traverse_obj( + video, ('system', 'mezzanineUrl'), expected_type=url_or_none) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'preference': 1, + 'url': mezzanine_url, + }) + + if formats or fatal: + self._sort_formats(formats) + else: + return + + thumbnails = traverse_obj( + video, ('thumbnails', Ellipsis, {'url': 'url'}), expected_type=url_or_none) + tags = traverse_obj( + video, ('tags', Ellipsis, 'displayName'), + expected_type=lambda x: x.strip() or None) + + metadata = traverse_obj(video, 'metadata', expected_type=dict) or {} + title = traverse_obj( + metadata, 'longTitle', 'title', 'name', + expected_type=lambda x: x.strip() or None) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), + 'timestamp': parse_iso8601(metadata.get('publishDate')), + 'duration': int_or_none(metadata.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, + 'tags': tags, + } + + # yt-dlp shim + @classmethod + def _extract_from_webpage(cls, url, webpage): + for embed_url in orderedSet( + cls._extract_embed_urls(url, webpage) or [], lazy=True): + yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls) + class IGNIE(IGNBaseIE): """ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Some videos of it.ign.com are also supported """ - - _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P[^/?&#]+)' + _VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P.+?)' + _PLAYLIST_PATH_RE = r'(?:/?\?(?P[^&#]+))?' + _VALID_URL = ( + r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)' + % '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE))) IE_NAME = 'ign.com' _PAGE_TYPE = 'video' @@ -44,7 +142,10 @@ class IGNIE(IGNBaseIE): 'timestamp': 1370440800, 'upload_date': '20130605', 'tags': 'count:9', - } + }, + 'params': { + 'nocheckcertificate': True, + }, }, { 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', 'md5': 'f1581a6fe8c5121be5b807684aeac3f6', @@ -56,86 +157,51 @@ class IGNIE(IGNBaseIE): 'timestamp': 1420571160, 'upload_date': '20150106', 'tags': 'count:4', - } + }, + 'skip': '404 Not Found', }, { 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', 'only_matching': True, }] + @classmethod + def _extract_embed_urls(cls, url, webpage): + grids = re.findall( + r'''(?s)]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)]*>''', + webpage) + return filter(None, + (urljoin(url, m.group('path')) for m in re.finditer( + r''']+\bhref\s*=\s*('|")(?P/videos%s)\1''' + % cls._VIDEO_PATH_RE, grids[0] if grids else ''))) + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + display_id = m.group('id') + if display_id: + return self._extract_video(url, display_id) + display_id = m.group('filt') or 'all' + return self._extract_playlist(url, display_id) + + def _extract_playlist(self, url, display_id): + webpage = self._download_webpage(url, display_id) + + return self.playlist_result( + (self.url_result(u, ie=self.ie_key()) + for u in self._extract_embed_urls(url, webpage)), + playlist_id=display_id) + + def _extract_video(self, url, display_id): display_id = self._match_id(url) - video = self._call_api(display_id) - video_id = video['videoId'] - metadata = video['metadata'] - title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] + video = self._checked_call_api(display_id) - formats = [] - refs = video.get('refs') or {} + info = self._extract_video_info(video) - m3u8_url = refs.get('m3uUrl') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - f4m_url = refs.get('f4mUrl') - if f4m_url: - formats.extend(self._extract_f4m_formats( - f4m_url, video_id, f4m_id='hds', fatal=False)) - - for asset in (video.get('assets') or []): - asset_url = asset.get('url') - if not asset_url: - continue - formats.append({ - 'url': asset_url, - 'tbr': int_or_none(asset.get('bitrate'), 1000), - 'fps': int_or_none(asset.get('frame_rate')), - 'height': int_or_none(asset.get('height')), - 'width': int_or_none(asset.get('width')), - }) - - mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) - if mezzanine_url: - formats.append({ - 'ext': determine_ext(mezzanine_url, 'mp4'), - 'format_id': 'mezzanine', - 'preference': 1, - 'url': mezzanine_url, - }) - - self._sort_formats(formats) - - thumbnails = [] - for thumbnail in (video.get('thumbnails') or []): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - }) - - tags = [] - for tag in (video.get('tags') or []): - display_name = tag.get('displayName') - if not display_name: - continue - tags.append(display_name) - - return { - 'id': video_id, - 'title': title, - 'description': strip_or_none(metadata.get('description')), - 'timestamp': parse_iso8601(metadata.get('publishDate')), - 'duration': int_or_none(metadata.get('duration')), + return merge_dicts({ 'display_id': display_id, - 'thumbnails': thumbnails, - 'formats': formats, - 'tags': tags, - } + }, info) -class IGNVideoIE(InfoExtractor): +class IGNVideoIE(IGNBaseIE): _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P\d+)/(?:video|trailer)/' _TESTS = [{ 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', @@ -147,7 +213,8 @@ class IGNVideoIE(InfoExtractor): 'description': 'Taking out assassination targets in Hitman has never been more stylish.', 'timestamp': 1444665600, 'upload_date': '20151012', - } + }, + 'expected_warnings': ['HTTP Error 400: Bad Request'], }, { 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'only_matching': True, @@ -167,22 +234,38 @@ class IGNVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') - url = self._request_webpage(req, video_id).geturl() + parsed_url = compat_urlparse.urlparse(url) + embed_url = compat_urlparse.urlunparse( + parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed')) + + webpage, urlh = self._download_webpage_handle(embed_url, video_id) + new_url = urlh.geturl() ign_url = compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + compat_urlparse.urlparse(new_url).query).get('url', [None])[-1] if ign_url: return self.url_result(ign_url, IGNIE.ie_key()) - return self.url_result(url) + video = self._search_regex(r'(]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False) + if not video: + if new_url == url: + raise ExtractorError('Redirect loop: ' + url) + return self.url_result(new_url) + video = extract_attributes(video) + video_data = video.get('data-settings') or '{}' + video_data = self._parse_json(video_data, video_id)['video'] + info = self._extract_video_info(video_data) + + return merge_dicts({ + 'display_id': video_id, + }, info) class IGNArticleIE(IGNBaseIE): - _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P[^/?&#]+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P[^/?&#]+)' _PAGE_TYPE = 'article' _TESTS = [{ 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': '524497489e4e8ff5848ece34', + 'id': '72113', 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', }, 'playlist': [ @@ -190,7 +273,7 @@ class IGNArticleIE(IGNBaseIE): 'info_dict': { 'id': '5ebbd138523268b93c9141af17bec937', 'ext': 'mp4', - 'title': 'GTA 5 Video Review', + 'title': 'Grand Theft Auto V Video Review', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', 'timestamp': 1379339880, 'upload_date': '20130916', @@ -200,7 +283,7 @@ class IGNArticleIE(IGNBaseIE): 'info_dict': { 'id': '638672ee848ae4ff108df2a296418ee2', 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'title': 'GTA 5 In Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', 'timestamp': 1386878820, 'upload_date': '20131212', @@ -208,16 +291,17 @@ class IGNArticleIE(IGNBaseIE): }, ], 'params': { - 'playlist_items': '2-3', 'skip_download': True, }, + 'expected_warnings': ['Backend fetch failed'], }, { 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { 'id': '53ee806780a81ec46e0790f8', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', }, - 'playlist_count': 2, + 'playlist_count': 1, + 'expected_warnings': ['Backend fetch failed'], }, { # videoId pattern 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', @@ -240,18 +324,91 @@ class IGNArticleIE(IGNBaseIE): 'only_matching': True, }] + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + e.cause.args = e.cause.args or [ + e.cause.geturl(), e.cause.getcode(), e.cause.reason] + if e.cause.code == 404: + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + elif e.cause.code == 503: + self.report_warning(error_to_compat_str(e.cause)) + return + raise + + def _search_nextjs_data(self, webpage, video_id, **kw): + return self._parse_json( + self._search_regex( + r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', + webpage, 'next.js data', **kw), + video_id, **kw) + def _real_extract(self, url): display_id = self._match_id(url) - article = self._call_api(display_id) + article = self._checked_call_api(display_id) - def entries(): - media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) - if media_url: - yield self.url_result(media_url, IGNIE.ie_key()) - for content in (article.get('content') or []): - for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): - yield self.url_result(video_url) + if article: + # obsolete ? + def entries(): + media_url = traverse_obj( + article, ('mediaRelations', 0, 'media', 'metadata', 'url'), + expected_type=url_or_none) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): + if url_or_none(video_url): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + traverse_obj( + article, ('metadata', 'headline'), + expected_type=lambda x: x.strip() or None)) + + webpage = self._download_webpage(url, display_id) + + playlist_id = self._html_search_meta('dable:item_id', webpage, default=None) + if playlist_id: + + def entries(): + for m in re.finditer( + r'''(?s)]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P.+?)]+\bname\s*=\s*("|')flashvars\2[^>]*>)''', + m.group('params'), 'flashvars', default='') + flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '') + v_url = url_or_none((flashvars.get('url') or [None])[-1]) + if v_url: + yield self.url_result(v_url) + else: + playlist_id = self._search_regex( + r'''\bdata-post-id\s*=\s*("|')(?P[\da-f]+)\1''', + webpage, 'id', group='id', default=None) + + nextjs_data = self._search_nextjs_data(webpage, display_id) + + def entries(): + for player in traverse_obj( + nextjs_data, + ('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')): + # skip promo links (which may not always be served, eg GH CI servers) + if traverse_obj(nextjs_data, + ('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')), + expected_type=dict): + continue + video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {} + info = self._extract_video_info(video, fatal=False) + if info: + yield merge_dicts({ + 'display_id': display_id, + }, info) return self.playlist_result( - entries(), article.get('articleId'), - strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) + entries(), playlist_id or display_id, + re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 0a70a1fb4..60b02b699 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +from ..utils import ( + ExtractorError, +) from ..compat import ( compat_b64decode, @@ -90,7 +93,11 @@ class InfoQIE(BokeCCBaseIE): }] def _extract_http_audio(self, webpage, video_id): - fields = self._form_hidden_inputs('mp3Form', webpage) + try: + fields = self._form_hidden_inputs('mp3Form', webpage) + except ExtractorError: + fields = {} + http_audio_url = fields.get('filename') if not http_audio_url: return [] diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index e86c40b42..7026139ea 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -3,123 +3,266 @@ from __future__ import unicode_literals import json import re +import sys from .common import InfoExtractor from .brightcove import BrightcoveNewIE +from ..compat import ( + compat_HTTPError, + compat_integer_types, + compat_kwargs, + compat_urlparse, +) from ..utils import ( clean_html, determine_ext, + error_to_compat_str, extract_attributes, - get_element_by_class, - JSON_LD_RE, + ExtractorError, + get_element_by_attribute, + int_or_none, merge_dicts, parse_duration, + parse_iso8601, + remove_start, smuggle_url, + strip_or_none, + traverse_obj, url_or_none, + urljoin, ) -class ITVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P[0-9a-zA-Z]+)' - _GEO_COUNTRIES = ['GB'] +class ITVBaseIE(InfoExtractor): + + def _search_nextjs_data(self, webpage, video_id, **kw): + transform_source = kw.pop('transform_source', None) + fatal = kw.pop('fatal', True) + return self._parse_json( + self._search_regex( + r''']+\bid=('|")__NEXT_DATA__\1[^>]*>(?P[^<]+)''', + webpage, 'next.js data', group='js', fatal=fatal, **kw), + video_id, transform_source=transform_source, fatal=fatal) + + def __handle_request_webpage_error(self, err, video_id=None, errnote=None, fatal=True): + if errnote is False: + return False + if errnote is None: + errnote = 'Unable to download webpage' + + errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) + if fatal: + raise ExtractorError(errmsg, sys.exc_info()[2], cause=err, video_id=video_id) + else: + self._downloader.report_warning(errmsg) + return False + + @staticmethod + def _vanilla_ua_header(): + return {'User-agent': 'Mozilla/5.0'} + + def _download_webpage_handle(self, url, video_id, *args, **kwargs): + # specialised to (a) use vanilla UA (b) detect geo-block + params = self._downloader.params + nkwargs = {} + if ( + 'user_agent' not in params + and not any(re.match(r'(?i)user-agent\s*:', h) + for h in (params.get('headers') or [])) + and 'User-agent' not in (kwargs.get('headers') or {})): + + kwargs.setdefault('headers', {}) + kwargs['headers'] = self._vanilla_ua_header() + nkwargs = kwargs + if kwargs.get('expected_status') is not None: + exp = kwargs['expected_status'] + if isinstance(exp, compat_integer_types): + exp = [exp] + if isinstance(exp, (list, tuple)) and 403 not in exp: + kwargs['expected_status'] = [403] + kwargs['expected_status'].extend(exp) + nkwargs = kwargs + else: + kwargs['expected_status'] = 403 + nkwargs = kwargs + + if nkwargs: + kwargs = compat_kwargs(kwargs) + + ret = super(ITVBaseIE, self)._download_webpage_handle(url, video_id, *args, **kwargs) + if ret is False: + return ret + webpage, urlh = ret + + if urlh.getcode() == 403: + # geo-block error is like this, with an unnecessary 'Of': + # '{\n "Message" : "Request Originated Outside Of Allowed Geographic Region",\ + # \n "TransactionId" : "oas-magni-475082-xbYF0W"\n}' + if '"Request Originated Outside Of Allowed Geographic Region"' in webpage: + self.raise_geo_restricted(countries=['GB']) + ret = self.__handle_request_webpage_error( + compat_HTTPError(urlh.geturl(), 403, 'HTTP Error 403: Forbidden', urlh.headers, urlh), + fatal=kwargs.get('fatal')) + + return ret + + +class ITVIE(ITVBaseIE): + _VALID_URL = r'https?://(?:www\.)?itv\.com/(?:(?Pwatch)|hub)/[^/]+/(?(w)[\w-]+/)(?P\w+)' + _IE_DESC = 'ITVX' _TESTS = [{ + 'note': 'Hub URLs redirect to ITVX', 'url': 'https://www.itv.com/hub/liar/2a4547a0012', - 'info_dict': { - 'id': '2a4547a0012', - 'ext': 'mp4', - 'title': 'Liar - Series 2 - Episode 6', - 'description': 'md5:d0f91536569dec79ea184f0a44cca089', - 'series': 'Liar', - 'season_number': 2, - 'episode_number': 6, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'only_matching': True, }, { - # unavailable via data-playlist-url + 'note': 'Hub page unavailable via data-playlist-url (404 now)', 'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033', 'only_matching': True, }, { - # InvalidVodcrid + 'note': 'Hub page with InvalidVodcrid (404 now)', 'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034', 'only_matching': True, }, { - # ContentUnavailable + 'note': 'Hub page with ContentUnavailable (404 now)', 'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024', 'only_matching': True, - }] + }, { + 'note': 'ITVX, or itvX, show', + 'url': 'https://www.itv.com/watch/vera/1a7314/1a7314a0014', + 'md5': 'bd0ad666b2c058fffe7d036785880064', + 'info_dict': { + 'id': '1a7314a0014', + 'ext': 'mp4', + 'title': 'Vera - Series 3 - Episode 4 - Prodigal Son', + 'description': 'Vera and her team investigate the fatal stabbing of an ex-Met police officer outside a busy Newcastle nightclub - but there aren\'t many clues.', + 'timestamp': 1653591600, + 'upload_date': '20220526', + 'uploader': 'ITVX', + 'thumbnail': r're:https://\w+\.itv\.com/images/(?:\w+/)+\d+x\d+\?', + 'duration': 5340.8, + 'age_limit': 16, + 'series': 'Vera', + 'series_number': 3, + 'episode': 'Prodigal Son', + 'episode_number': 4, + 'channel': 'ITV3', + 'categories': list, + }, + 'params': { + # m3u8 download + # 'skip_download': True, + }, + 'skip': 'only available in UK', + }, { + 'note': 'Latest ITV news bulletin: details change daily', + 'url': 'https://www.itv.com/watch/news/varies-but-is-not-checked/6js5d0f', + 'info_dict': { + 'id': '6js5d0f', + 'ext': 'mp4', + 'title': r're:The latest ITV News headlines - \S.+', + 'description': r'''re:.* today's top stories from the ITV News team.$''', + 'timestamp': int, + 'upload_date': r're:2\d\d\d(?:0[1-9]|1[0-2])(?:[012][1-9]|3[01])', + 'uploader': 'ITVX', + 'thumbnail': r're:https://images\.ctfassets\.net/(?:\w+/)+[\w.]+\.(?:jpg|png)', + 'duration': float, + 'age_limit': None, + }, + 'params': { + # variable download + # 'skip_download': True, + }, + 'skip': 'only available in UK', + } + ] + + def _og_extract(self, webpage, require_title=False): + return { + 'title': self._og_search_title(webpage, fatal=require_title), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'uploader': self._og_search_property('site_name', webpage, default=None), + } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - params = extract_attributes(self._search_regex( - r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) - ios_playlist_url = params.get('data-video-playlist') or params['data-video-id'] - hmac = params['data-video-hmac'] + webpage = self._download_webpage(url, video_id) + + # now quite different params! + params = extract_attributes(self._search_regex( + r'''(<[^>]+\b(?:class|data-testid)\s*=\s*("|')genie-container\2[^>]*>)''', + webpage, 'params')) + + ios_playlist_url = traverse_obj( + params, 'data-video-id', 'data-video-playlist', + get_all=False, expected_type=url_or_none) + headers = self.geo_verification_headers() headers.update({ 'Accept': 'application/vnd.itv.vod.playlist.v2+json', 'Content-Type': 'application/json', - 'hmac': hmac.upper(), }) ios_playlist = self._download_json( ios_playlist_url, video_id, data=json.dumps({ 'user': { - 'itvUserId': '', 'entitlements': [], - 'token': '' }, 'device': { - 'manufacturer': 'Safari', - 'model': '5', + 'manufacturer': 'Mobile Safari', + 'model': '5.1', 'os': { - 'name': 'Windows NT', - 'version': '6.1', - 'type': 'desktop' + 'name': 'iOS', + 'version': '5.0', + 'type': ' mobile' } }, 'client': { 'version': '4.1', - 'id': 'browser' + 'id': 'browser', + 'supportsAdPods': True, + 'service': 'itv.x', + 'appversion': '2.43.28', }, 'variantAvailability': { + 'player': 'hls', 'featureset': { 'min': ['hls', 'aes', 'outband-webvtt'], 'max': ['hls', 'aes', 'outband-webvtt'] }, - 'platformTag': 'dotcom' + 'platformTag': 'mobile' } }).encode(), headers=headers) video_data = ios_playlist['Playlist']['Video'] - ios_base_url = video_data.get('Base') + ios_base_url = traverse_obj(video_data, 'Base', expected_type=url_or_none) + + media_url = ( + (lambda u: url_or_none(urljoin(ios_base_url, u))) + if ios_base_url else url_or_none) formats = [] - for media_file in (video_data.get('MediaFiles') or []): - href = media_file.get('Href') + for media_file in traverse_obj(video_data, 'MediaFiles', expected_type=list) or []: + href = traverse_obj(media_file, 'Href', expected_type=media_url) if not href: continue - if ios_base_url: - href = ios_base_url + href ext = determine_ext(href) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - href, video_id, 'mp4', entry_protocol='m3u8_native', + href, video_id, 'mp4', entry_protocol='m3u8', m3u8_id='hls', fatal=False)) + else: formats.append({ 'url': href, }) self._sort_formats(formats) + for f in formats: + f.setdefault('http_headers', {}) + f['http_headers'].update(self._vanilla_ua_header()) subtitles = {} - subs = video_data.get('Subtitles') or [] - for sub in subs: - if not isinstance(sub, dict): - continue - href = url_or_none(sub.get('Href')) + for sub in traverse_obj(video_data, 'Subtitles', expected_type=list) or []: + href = traverse_obj(sub, 'Href', expected_type=url_or_none) if not href: continue subtitles.setdefault('en', []).append({ @@ -127,59 +270,132 @@ class ITVIE(InfoExtractor): 'ext': determine_ext(href, 'vtt'), }) - info = self._search_json_ld(webpage, video_id, default={}) - if not info: - json_ld = self._parse_json(self._search_regex( - JSON_LD_RE, webpage, 'JSON-LD', '{}', - group='json_ld'), video_id, fatal=False) - if json_ld and json_ld.get('@type') == 'BreadcrumbList': - for ile in (json_ld.get('itemListElement:') or []): - item = ile.get('item:') or {} - if item.get('@type') == 'TVEpisode': - item['@context'] = 'http://schema.org' - info = self._json_ld(item, video_id, fatal=False) or {} - break + next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default='{}') + video_data.update(traverse_obj(next_data, ('props', 'pageProps', ('title', 'episode')), expected_type=dict)[0] or {}) + title = traverse_obj(video_data, 'headerTitle', 'episodeTitle') + info = self._og_extract(webpage, require_title=not title) + tn = info.pop('thumbnail', None) + if tn: + info['thumbnails'] = [{'url': tn}] + + # num. episode title + num_ep_title = video_data.get('numberedEpisodeTitle') + if not num_ep_title: + num_ep_title = clean_html(get_element_by_attribute('data-testid', 'episode-hero-description-strong', webpage)) + num_ep_title = num_ep_title and num_ep_title.rstrip(' -') + ep_title = strip_or_none( + video_data.get('episodeTitle') + or (num_ep_title.split('.', 1)[-1] if num_ep_title else None)) + title = title or re.sub(r'\s+-\s+ITVX$', '', info['title']) + if ep_title and ep_title != title: + title = title + ' - ' + ep_title + + def get_thumbnails(): + tns = [] + for w, x in (traverse_obj(video_data, ('imagePresets'), expected_type=dict) or {}).items(): + if isinstance(x, dict): + for y, z in x.items(): + tns.append({'id': w + '_' + y, 'url': z}) + return tns or None + + video_str = lambda *x: traverse_obj( + video_data, *x, get_all=False, expected_type=strip_or_none) return merge_dicts({ 'id': video_id, - 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'title': title, 'formats': formats, 'subtitles': subtitles, - 'duration': parse_duration(video_data.get('Duration')), - 'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)), + # parsing hh:mm:ss:nnn not yet patched + 'duration': parse_duration(re.sub(r'(\d{2})(:)(\d{3}$)', r'\1.\3', video_data.get('Duration') or '')), + 'description': video_str('synopsis'), + 'timestamp': traverse_obj(video_data, 'broadcastDateTime', 'dateTime', expected_type=parse_iso8601), + 'thumbnails': get_thumbnails(), + 'series': video_str('showTitle', 'programmeTitle'), + 'series_number': int_or_none(video_data.get('seriesNumber')), + 'episode': ep_title, + 'episode_number': int_or_none((num_ep_title or '').split('.')[0]), + 'channel': video_str('channel'), + 'categories': traverse_obj(video_data, ('categories', 'formatted'), expected_type=list), + 'age_limit': {False: 16, True: 0}.get(video_data.get('isChildrenCategory')), }, info) -class ITVBTCCIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P[^/?#&]+)' - _TEST = { - 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', +class ITVBTCCIE(ITVBaseIE): + _VALID_URL = r'https?://(?:www\.)?itv\.com/(?!(?:watch|hub)/)(?:[^/]+/)+(?P[^/?#&]+)' + _IE_DESC = 'ITV articles: News, British Touring Car Championship' + _TESTS = [{ + 'note': 'British Touring Car Championship', + 'url': 'https://www.itv.com/btcc/articles/btcc-2018-all-the-action-from-brands-hatch', 'info_dict': { 'id': 'btcc-2018-all-the-action-from-brands-hatch', 'title': 'BTCC 2018: All the action from Brands Hatch', }, 'playlist_mincount': 9, - } - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s' + }, { + 'note': 'redirects to /btcc/articles/...', + 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch', + 'only_matching': True, + }, { + 'note': 'news article', + 'url': 'https://www.itv.com/news/wales/2020-07-23/sean-fletcher-shows-off-wales-coastline-in-new-itv-series-as-british-tourists-opt-for-staycations', + 'info_dict': { + 'id': 'sean-fletcher-shows-off-wales-coastline-in-new-itv-series-as-british-tourists-opt-for-staycations', + 'title': '''Sean Fletcher on why Wales' coastline should be your 'staycation' destination | ITV News''', + }, + 'playlist_mincount': 1, + }] + + # should really be a class var of the BC IE + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + BRIGHTCOVE_ACCOUNT = '1582188683001' + BRIGHTCOVE_PLAYER = 'HkiHLnNRx' def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage, urlh = self._download_webpage_handle(url, playlist_id) + link = compat_urlparse.urlparse(urlh.geturl()).path.strip('/') - entries = [ - self.url_result( - smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, { - # ITV does not like some GB IP ranges, so here are some - # IP blocks it accepts - 'geo_ip_blocks': [ - '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' - ], - 'referrer': url, - }), - ie=BrightcoveNewIE.ie_key(), video_id=video_id) - for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)] + next_data = self._search_nextjs_data(webpage, playlist_id, fatal=False, default='{}') + path_prefix = compat_urlparse.urlparse(next_data.get('assetPrefix') or '').path.strip('/') + link = remove_start(link, path_prefix).strip('/') + + content = traverse_obj( + next_data, ('props', 'pageProps', Ellipsis), + expected_type=lambda x: x if x['link'] == link else None, + get_all=False, default={}) + content = traverse_obj( + content, ('body', 'content', Ellipsis, 'data'), + expected_type=lambda x: x if x.get('name') == 'Brightcove' or x.get('type') == 'Brightcove' else None) + + contraband = { + # ITV does not like some GB IP ranges, so here are some + # IP blocks it accepts + 'geo_ip_blocks': [ + '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21' + ], + 'referrer': urlh.geturl(), + } + + def entries(): + + for data in content or []: + video_id = data.get('id') + if not video_id: + continue + account = data.get('accountId') or self.BRIGHTCOVE_ACCOUNT + player = data.get('playerId') or self.BRIGHTCOVE_PLAYER + yield self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account, player, video_id), contraband), + ie=BrightcoveNewIE.ie_key(), video_id=video_id) + + # obsolete ? + for video_id in re.findall(r'''data-video-id=["'](\d+)''', webpage): + yield self.url_result( + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (self.BRIGHTCOVE_ACCOUNT, self.BRIGHTCOVE_PLAYER, video_id), contraband), + ie=BrightcoveNewIE.ie_key(), video_id=video_id) title = self._og_search_title(webpage, fatal=False) - return self.playlist_result(entries, playlist_id, title) + return self.playlist_result(entries(), playlist_id, title) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index c731612c4..6d4d93394 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -373,5 +373,5 @@ class KalturaIE(InfoExtractor): 'duration': info.get('duration'), 'timestamp': info.get('createdAt'), 'uploader_id': info.get('userId') if info.get('userId') != 'None' else None, - 'view_count': info.get('plays'), + 'view_count': int_or_none(info.get('plays')), } diff --git a/youtube_dl/extractor/kommunetv.py b/youtube_dl/extractor/kommunetv.py new file mode 100644 index 000000000..91d06a74f --- /dev/null +++ b/youtube_dl/extractor/kommunetv.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import update_url + + +class KommunetvIE(InfoExtractor): + _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P\w+)' + _TEST = { + 'url': 'https://oslo.kommunetv.no/archive/921', + 'md5': '5f102be308ee759be1e12b63d5da4bbc', + 'info_dict': { + 'id': '921', + 'title': 'Bystyremøte', + 'ext': 'mp4' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'application/json' + } + data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers) + title = data['stream']['title'] + file = data['playlist'][0]['playlist'][0]['file'] + url = update_url(file, query=None, fragment=None) + formats = self._extract_m3u8_formats(url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + return { + 'id': video_id, + 'formats': formats, + 'title': title + } diff --git a/youtube_dl/extractor/kth.py b/youtube_dl/extractor/kth.py new file mode 100644 index 000000000..b8db461f5 --- /dev/null +++ b/youtube_dl/extractor/kth.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class KTHIE(InfoExtractor): + _VALID_URL = r'https?://play\.kth\.se/(?:[^/]+/)+(?P[a-z0-9_]+)' + _TEST = { + 'url': 'https://play.kth.se/media/Lunch+breakA+De+nya+aff%C3%A4rerna+inom+Fordonsdalen/0_uoop6oz9', + 'md5': 'd83ada6d00ca98b73243a88efe19e8a6', + 'info_dict': { + 'id': '0_uoop6oz9', + 'ext': 'mp4', + 'title': 'md5:bd1d6931facb6828762a33e6ce865f37', + 'thumbnail': 're:https?://.+/thumbnail/.+', + 'duration': 3516, + 'timestamp': 1647345358, + 'upload_date': '20220315', + 'uploader_id': 'md5:0ec23e33a89e795a4512930c8102509f', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + result = self.url_result( + smuggle_url('kaltura:308:%s' % video_id, { + 'service_url': 'https://api.kaltura.nordu.net'}), + 'Kaltura') + return result diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py index e8d7163e4..75978cfd6 100644 --- a/youtube_dl/extractor/manyvids.py +++ b/youtube_dl/extractor/manyvids.py @@ -1,11 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( determine_ext, + extract_attributes, int_or_none, str_to_int, + url_or_none, urlencode_postdata, ) @@ -20,17 +25,20 @@ class ManyVidsIE(InfoExtractor): 'id': '133957', 'ext': 'mp4', 'title': 'everthing about me (Preview)', + 'uploader': 'ellyxxix', 'view_count': int, 'like_count': int, }, }, { # full video 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', - 'md5': 'f3e8f7086409e9b470e2643edb96bdcc', + 'md5': 'bb47bab0e0802c2a60c24ef079dfe60f', 'info_dict': { 'id': '935718', 'ext': 'mp4', 'title': 'MY FACE REVEAL', + 'description': 'md5:ec5901d41808b3746fed90face161612', + 'uploader': 'Sarah Calanthe', 'view_count': int, 'like_count': int, }, @@ -39,17 +47,50 @@ class ManyVidsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + real_url = 'https://www.manyvids.com/video/%s/gtm.js' % (video_id, ) + try: + webpage = self._download_webpage(real_url, video_id) + except Exception: + # probably useless fallback + webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'video URL', group='url') + info = self._search_regex( + r'''(]*\bid\s*=\s*(['"])pageMetaDetails\2[^>]*>)''', + webpage, 'meta details', default='') + info = extract_attributes(info) - title = self._html_search_regex( - (r']+class=["\']item-title[^>]+>([^<]+)', - r']+class=["\']h2 m-0["\'][^>]*>([^<]+)'), - webpage, 'title', default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True) + player = self._search_regex( + r'''(]*\bid\s*=\s*(['"])rmpPlayerStream\2[^>]*>)''', + webpage, 'player details', default='') + player = extract_attributes(player) + + video_urls_and_ids = ( + (info.get('data-meta-video'), 'video'), + (player.get('data-video-transcoded'), 'transcoded'), + (player.get('data-video-filepath'), 'filepath'), + (self._og_search_video_url(webpage, secure=False, default=None), 'og_video'), + ) + + def txt_or_none(s, default=None): + return (s.strip() or default) if isinstance(s, compat_str) else default + + uploader = txt_or_none(info.get('data-meta-author')) + + def mung_title(s): + if uploader: + s = re.sub(r'^\s*%s\s+[|-]' % (re.escape(uploader), ), '', s) + return txt_or_none(s) + + title = ( + mung_title(info.get('data-meta-title')) + or self._html_search_regex( + (r']+class=["\']item-title[^>]+>([^<]+)', + r']+class=["\']h2 m-0["\'][^>]*>([^<]+)'), + webpage, 'title', default=None) + or self._html_search_meta( + 'twitter:title', webpage, 'title', fatal=True)) + + title = re.sub(r'\s*[|-]\s+ManyVids\s*$', '', title) or title if any(p in webpage for p in ('preview_videos', '_preview.mp4')): title += ' (Preview)' @@ -62,7 +103,8 @@ class ManyVidsIE(InfoExtractor): # Sets some cookies self._download_webpage( 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', - video_id, fatal=False, data=urlencode_postdata({ + video_id, note='Setting format cookies', fatal=False, + data=urlencode_postdata({ 'mvtoken': mv_token, 'vid': video_id, }), headers={ @@ -70,23 +112,56 @@ class ManyVidsIE(InfoExtractor): 'X-Requested-With': 'XMLHttpRequest' }) - if determine_ext(video_url) == 'm3u8': - formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - else: - formats = [{'url': video_url}] + formats = [] + for v_url, fmt in video_urls_and_ids: + v_url = url_or_none(v_url) + if not v_url: + continue + if determine_ext(v_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls')) + else: + formats.append({ + 'url': v_url, + 'format_id': fmt, + }) - like_count = int_or_none(self._search_regex( - r'data-likes=["\'](\d+)', webpage, 'like count', default=None)) - view_count = str_to_int(self._html_search_regex( - r'(?s)]+class="views-wrapper"[^>]*>(.+?)]*\bdata-id\s*=\s*(['"])%s\2[^>]*>)''' % (video_id, ), + webpage, 'likes', default='') + likes = extract_attributes(likes) + return int_or_none(likes.get('data-likes')) + + def get_views(): + return str_to_int(self._html_search_regex( + r'''(?s)]*\bclass\s*=["']views-wrapper\b[^>]+>.+?]+>\s*(\d[\d,.]*)\s*''', + webpage, 'view count', default=None)) return { 'id': video_id, 'title': title, - 'view_count': view_count, - 'like_count': like_count, 'formats': formats, + 'description': txt_or_none(info.get('data-meta-description')), + 'uploader': txt_or_none(info.get('data-meta-author')), + 'thumbnail': ( + url_or_none(info.get('data-meta-image')) + or url_or_none(player.get('data-video-screenshot'))), + 'view_count': get_views(), + 'like_count': get_likes(), } diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py index 2c16fc9e2..20048c6ab 100644 --- a/youtube_dl/extractor/mediaset.py +++ b/youtube_dl/extractor/mediaset.py @@ -24,7 +24,7 @@ class MediasetIE(ThePlatformBaseIE): (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| - player/index\.html\?.*?\bprogramGuid= + player(?:/v\d+)?/index\.html\?.*?\bprogramGuid= ) )(?P[0-9A-Z]{16,}) ''' @@ -73,6 +73,10 @@ class MediasetIE(ThePlatformBaseIE): # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104', 'only_matching': True, + }, { + # embedUrl (from https://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/) + 'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323&autoplay=true&purl=http://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/', + 'only_matching': True, }, { 'url': 'mediaset:FAFU000000665924', 'only_matching': True, diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py index 8e9f0f825..e8fd582aa 100644 --- a/youtube_dl/extractor/minds.py +++ b/youtube_dl/extractor/minds.py @@ -78,7 +78,7 @@ class MindsIE(MindsBaseIE): else: return self.url_result(entity['perma_url']) else: - assert(entity['subtype'] == 'video') + assert (entity['subtype'] == 'video') video_id = entity_id # 1080p and webm formats available only on the sources array video = self._call_api( diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index ef1e081f2..d352cb180 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import datetime @@ -71,7 +72,7 @@ class MotherlessIE(InfoExtractor): 'title': 'a/ Hot Teens', 'categories': list, 'upload_date': '20210104', - 'uploader_id': 'yonbiw', + 'uploader_id': 'anonymous', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, }, @@ -125,9 +126,10 @@ class MotherlessIE(InfoExtractor): kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') - comment_count = webpage.count('class="media-comment-contents"') + comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage)) uploader_id = self._html_search_regex( - r'"thumb-member-username">\s+]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)''', + r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*]+\bhref\s*=\s*['"]/m/([^"']+)'''), webpage, 'uploader_id') categories = self._html_search_meta('keywords', webpage, default=None) @@ -169,7 +171,18 @@ class MotherlessGroupIE(InfoExtractor): 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' 'any kind!' }, - 'playlist_mincount': 9, + 'playlist_mincount': 0, + 'expected_warnings': [ + 'This group has no videos.', + ] + }, { + 'url': 'https://motherless.com/g/beautiful_cock', + 'info_dict': { + 'id': 'beautiful_cock', + 'title': 'Beautiful Cock', + 'description': 'Group for lovely cocks yours, mine, a friends anything human', + }, + 'playlist_mincount': 2500, }] @classmethod @@ -208,16 +221,23 @@ class MotherlessGroupIE(InfoExtractor): r'([\w\s]+\w)\s+-', webpage, 'title', fatal=False) description = self._html_search_meta( 'description', webpage, fatal=False) - page_count = self._int(self._search_regex( - r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT', - webpage, 'page_count'), 'page_count') + page_count = str_to_int(self._search_regex( + r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b', + webpage, 'page_count', default=0)) + if not page_count: + message = self._search_regex( + r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''', + webpage, 'error_msg', default=None) or 'This group has no videos.' + self.report_warning(message, group_id) + page_count = 1 PAGE_SIZE = 80 def _get_page(idx): - webpage = self._download_webpage( - page_url, group_id, query={'page': idx + 1}, - note='Downloading page %d/%d' % (idx + 1, page_count) - ) + if idx > 0: + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) for entry in self._extract_entries(webpage, url): yield entry diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index db7ebc94c..f540c52ee 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -35,7 +35,9 @@ class MySpassIE(InfoExtractor): title = xpath_text(metadata, 'title', fatal=True) video_url = xpath_text(metadata, 'url_flv', 'download url', True) video_id_int = int(video_id) - for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups(): + + grps = re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url) + for group in grps.groups() if grps else []: group_int = int(group) if group_int > video_id_int: video_url = video_url.replace( diff --git a/youtube_dl/extractor/myvideoge.py b/youtube_dl/extractor/myvideoge.py new file mode 100644 index 000000000..efbfda7a6 --- /dev/null +++ b/youtube_dl/extractor/myvideoge.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_id, + get_element_by_class, + int_or_none, + js_to_json, + MONTH_NAMES, + qualities, + unified_strdate, +) + + +class MyVideoGeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?myvideo\.ge/v/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://www.myvideo.ge/v/3941048', + 'md5': '8c192a7d2b15454ba4f29dc9c9a52ea9', + 'info_dict': { + 'id': '3941048', + 'ext': 'mp4', + 'title': 'The best prikol', + 'upload_date': '20200611', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'chixa33', + 'description': 'md5:5b067801318e33c2e6eea4ab90b1fdd3', + }, + # working from local dev system + 'skip': 'site blocks CI servers', + } + _MONTH_NAMES_KA = ['იანვარი', 'თებერვალი', 'მარტი', 'აპრილი', 'მაისი', 'ივნისი', 'ივლისი', 'აგვისტო', 'სექტემბერი', 'ოქტომბერი', 'ნოემბერი', 'დეკემბერი'] + + _quality = staticmethod(qualities(('SD', 'HD'))) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = ( + self._og_search_title(webpage, default=None) + or clean_html(get_element_by_class('my_video_title', webpage)) + or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title\b', webpage, 'title')) + + jwplayer_sources = self._parse_json( + self._search_regex( + r'''(?s)jwplayer\s*\(\s*['"]mvplayer['"]\s*\)\s*\.\s*setup\s*\(.*?\bsources\s*:\s*(\[.*?])\s*[,});]''', webpage, 'jwplayer sources', fatal=False) + or '', + video_id, transform_source=js_to_json, fatal=False) + + formats = self._parse_jwplayer_formats(jwplayer_sources or [], video_id) + for f in formats or []: + f['preference'] = self._quality(f['format_id']) + self._sort_formats(formats) + + description = ( + self._og_search_description(webpage) + or get_element_by_id('long_desc_holder', webpage) + or self._html_search_meta('description', webpage)) + + uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) + + upload_date = get_element_by_class('mv_vid_upl_date', webpage) + # as ka locale may not be present roll a local date conversion + upload_date = (unified_strdate( + # translate any ka month to an en one + re.sub('|'.join(self._MONTH_NAMES_KA), + lambda m: MONTH_NAMES['en'][self._MONTH_NAMES_KA.index(m.group(0))], + upload_date, re.I)) + if upload_date else None) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'formats': formats, + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': upload_date, + 'view_count': int_or_none(get_element_by_class('mv_vid_views', webpage)), + 'like_count': int_or_none(get_element_by_id('likes_count', webpage)), + 'dislike_count': int_or_none(get_element_by_id('dislikes_count', webpage)), + } diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index ddd828d92..26627f8b0 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -4,8 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, + ExtractorError, int_or_none, merge_dicts, parse_iso8601, @@ -20,13 +22,13 @@ class NDRBaseIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) display_id = next(group for group in mobj.groups() if group) webpage = self._download_webpage(url, display_id) - return self._extract_embed(webpage, display_id) + return self._extract_embed(webpage, display_id, url) class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://(?:\w+\.)*ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', @@ -38,13 +40,14 @@ class NDRIE(NDRBaseIE): 'title': 'Party, Pötte und Parade', 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', 'uploader': 'ndrtv', - 'timestamp': 1431108900, + 'timestamp': 1431255671, 'upload_date': '20150510', 'duration': 3498, }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # httpVideo, different content id 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', @@ -63,6 +66,7 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpAudio, same content id 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', @@ -74,8 +78,8 @@ class NDRIE(NDRBaseIE): 'title': 'La Valette entgeht der Hinrichtung', 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', 'uploader': 'ndrinfo', - 'timestamp': 1290626100, - 'upload_date': '20140729', + 'timestamp': 1631711863, + 'upload_date': '20210915', 'duration': 884, }, 'params': { @@ -89,9 +93,10 @@ class NDRIE(NDRBaseIE): 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', 'ext': 'mp4', 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', - 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'description': 'md5:700f6de264010585012a72f97b0ac0c9', 'uploader': 'ndrtv', - 'upload_date': '20201113', + 'upload_date': '20201207', + 'timestamp': 1614349457, 'duration': 1749, 'subtitles': { 'de': [{ @@ -109,19 +114,38 @@ class NDRIE(NDRBaseIE): 'only_matching': True, }] - def _extract_embed(self, webpage, display_id): - embed_url = self._html_search_meta( - 'embedURL', webpage, 'embed URL', - default=None) or self._search_regex( - r'\bembedUrl["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'embed URL', group='url') + def _extract_embed(self, webpage, display_id, url): + embed_url = ( + self._html_search_meta( + 'embedURL', webpage, 'embed URL', + default=None) + or self._search_regex( + r'\bembedUrl["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url', default=None) + or self._search_regex( + r'\bvar\s*sophoraID\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url', default='')) + # some more work needed if we only found sophoraID + if re.match(r'^[a-z]+\d+$', embed_url): + # get the initial part of the url path,. eg /panorama/archiv/2022/ + parsed_url = compat_urllib_parse_urlparse(url) + path = self._search_regex(r'(.+/)%s' % display_id, parsed_url.path or '', 'embed URL', default='') + # find tell-tale image with the actual ID + ndr_id = self._search_regex(r'%s([a-z]+\d+)(?!\.)\b' % (path, ), webpage, 'embed URL', default=None) + # or try to use special knowledge! + NDR_INFO_URL_TPL = 'https://www.ndr.de/info/%s-player.html' + embed_url = 'ndr:%s' % (ndr_id, ) if ndr_id else NDR_INFO_URL_TPL % (embed_url, ) + if not embed_url: + raise ExtractorError('Unable to extract embedUrl') + description = self._search_regex( r'<p[^>]+itemprop="description">([^<]+)</p>', webpage, 'description', default=None) or self._og_search_description(webpage) timestamp = parse_iso8601( self._search_regex( - r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"', - webpage, 'upload date', default=None)) + (r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="(?P<cont>[^"]+)"', + r'\bvar\s*pdt\s*=\s*(?P<q>["\'])(?P<cont>(?:(?!(?P=q)).)+)(?P=q)', ), + webpage, 'upload date', group='cont', default=None)) info = self._search_json_ld(webpage, display_id, default={}) return merge_dicts({ '_type': 'url_transparent', @@ -153,19 +177,19 @@ class NJoyIE(NDRBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpVideo, different content id 'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html', 'md5': '417660fffa90e6df2fda19f1b40a64d8', 'info_dict': { - 'id': 'dockville882', + 'id': 'livestream283', 'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-', - 'ext': 'mp4', - 'title': '"Ich hab noch nie" mit Felix Jaehn', - 'description': 'md5:85dd312d53be1b99e1f998a16452a2f3', + 'ext': 'mp3', + 'title': 'Das frueheste DJ Set des Nordens live mit Felix Jaehn', + 'description': 'md5:681698f527b8601e511e7b79edde7d2c', 'uploader': 'njoy', - 'upload_date': '20150822', - 'duration': 211, + 'upload_date': '20210830', }, 'params': { 'skip_download': True, @@ -175,18 +199,25 @@ class NJoyIE(NDRBaseIE): 'only_matching': True, }] - def _extract_embed(self, webpage, display_id): + def _extract_embed(self, webpage, display_id, url=None): + # find tell-tale URL with the actual ID, or ... video_id = self._search_regex( - r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id') - description = self._search_regex( - r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>', - webpage, 'description', fatal=False) + (r'''\bsrc\s*=\s*["']?(?:/\w+)+/([a-z]+\d+)(?!\.)\b''', + r'<iframe[^>]+id="pp_([\da-z]+)"', ), + webpage, 'NDR id', default=None) + + description = ( + self._html_search_meta('description', webpage) + or self._search_regex( + r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>', + webpage, 'description', fatal=False)) return { '_type': 'url_transparent', 'ie_key': 'NDREmbedBase', 'url': 'ndr:%s' % video_id, 'display_id': display_id, 'description': description, + 'title': display_id.replace('-', ' ').strip(), } @@ -291,7 +322,7 @@ class NDREmbedBaseIE(InfoExtractor): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' + _VALID_URL = r'https?://(?:\w+\.)*ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:(?:ard)?player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', @@ -304,6 +335,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'upload_date': '20150907', 'duration': 132, }, + 'skip': 'No longer available', }, { 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html', 'md5': '002085c44bae38802d94ae5802a36e78', @@ -319,6 +351,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { 'url': 'http://www.ndr.de/info/audio51535-player.html', 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', @@ -328,7 +361,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'title': 'La Valette entgeht der Hinrichtung', 'is_live': False, 'uploader': 'ndrinfo', - 'upload_date': '20140729', + 'upload_date': '20210915', 'duration': 884, }, 'params': { @@ -349,15 +382,17 @@ class NDREmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpVideoLive 'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html', 'info_dict': { 'id': 'livestream217', - 'ext': 'flv', + 'ext': 'mp4', 'title': r're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, - 'upload_date': '20150910', + 'upload_date': '20210409', + 'uploader': 'ndrtv', }, 'params': { 'skip_download': True, @@ -395,9 +430,10 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'ext': 'mp4', 'title': 'Zehn Jahre Reeperbahn Festival - die Doku', 'is_live': False, - 'upload_date': '20150807', + 'upload_date': '20200826', 'duration': 1011, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { # httpAudio 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html', @@ -414,6 +450,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpAudioLive, no explicit ext 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html', @@ -423,7 +460,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'title': r're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, 'uploader': 'njoy', - 'upload_date': '20150810', + 'upload_date': '20210830', }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index 978a05841..5e5c6271b 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -1,20 +1,32 @@ # coding: utf-8 from __future__ import unicode_literals -from hashlib import md5 from base64 import b64encode +from binascii import hexlify from datetime import datetime +from hashlib import md5 +from random import randint +import json import re +import time from .common import InfoExtractor +from ..aes import aes_ecb_encrypt, pkcs7_padding from ..compat import ( compat_urllib_parse_urlencode, compat_str, compat_itertools_count, ) from ..utils import ( - sanitized_Request, + ExtractorError, + bytes_to_intlist, + error_to_compat_str, float_or_none, + int_or_none, + intlist_to_bytes, + sanitized_Request, + std_headers, + try_get, ) @@ -35,32 +47,106 @@ class NetEaseMusicBaseIE(InfoExtractor): result = b64encode(m.digest()).decode('ascii') return result.replace('/', '_').replace('+', '-') + @classmethod + def make_player_api_request_data_and_headers(cls, song_id, bitrate): + KEY = b'e82ckenh8dichen8' + URL = '/api/song/enhance/player/url' + now = int(time.time() * 1000) + rand = randint(0, 1000) + cookie = { + 'osver': None, + 'deviceId': None, + 'appver': '8.0.0', + 'versioncode': '140', + 'mobilename': None, + 'buildver': '1623435496', + 'resolution': '1920x1080', + '__csrf': '', + 'os': 'pc', + 'channel': None, + 'requestId': '{0}_{1:04}'.format(now, rand), + } + request_text = json.dumps( + {'ids': '[{0}]'.format(song_id), 'br': bitrate, 'header': cookie}, + separators=(',', ':')) + message = 'nobody{0}use{1}md5forencrypt'.format( + URL, request_text).encode('latin1') + msg_digest = md5(message).hexdigest() + + data = '{0}-36cd479b6b5-{1}-36cd479b6b5-{2}'.format( + URL, request_text, msg_digest) + data = pkcs7_padding(bytes_to_intlist(data)) + encrypted = intlist_to_bytes(aes_ecb_encrypt(data, bytes_to_intlist(KEY))) + encrypted_params = hexlify(encrypted).decode('ascii').upper() + + cookie = '; '.join( + ['{0}={1}'.format(k, v if v is not None else 'undefined') + for [k, v] in cookie.items()]) + + headers = { + 'User-Agent': std_headers['User-Agent'], + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': 'https://music.163.com', + 'Cookie': cookie, + } + return ('params={0}'.format(encrypted_params), headers) + + def _call_player_api(self, song_id, bitrate): + url = 'https://interface3.music.163.com/eapi/song/enhance/player/url' + data, headers = self.make_player_api_request_data_and_headers(song_id, bitrate) + try: + msg = 'empty result' + result = self._download_json( + url, song_id, data=data.encode('ascii'), headers=headers) + if result: + return result + except ExtractorError as e: + if type(e.cause) in (ValueError, TypeError): + # JSON load failure + raise + except Exception as e: + msg = error_to_compat_str(e) + self.report_warning('%s API call (%s) failed: %s' % ( + song_id, bitrate, msg)) + return {} + def extract_formats(self, info): + err = 0 formats = [] + song_id = info['id'] for song_format in self._FORMATS: details = info.get(song_format) if not details: continue - song_file_path = '/%s/%s.%s' % ( - self._encrypt(details['dfsId']), details['dfsId'], details['extension']) - # 203.130.59.9, 124.40.233.182, 115.231.74.139, etc is a reverse proxy-like feature - # from NetEase's CDN provider that can be used if m5.music.126.net does not - # work, especially for users outside of Mainland China - # via: https://github.com/JixunMoe/unblock-163/issues/3#issuecomment-163115880 - for host in ('http://m5.music.126.net', 'http://115.231.74.139/m1.music.126.net', - 'http://124.40.233.182/m1.music.126.net', 'http://203.130.59.9/m1.music.126.net'): - song_url = host + song_file_path + bitrate = int_or_none(details.get('bitrate')) or 999000 + data = self._call_player_api(song_id, bitrate) + for song in try_get(data, lambda x: x['data'], list) or []: + song_url = try_get(song, lambda x: x['url']) + if not song_url: + continue if self._is_valid_url(song_url, info['id'], 'song'): formats.append({ 'url': song_url, 'ext': details.get('extension'), - 'abr': float_or_none(details.get('bitrate'), scale=1000), + 'abr': float_or_none(song.get('br'), scale=1000), 'format_id': song_format, - 'filesize': details.get('size'), - 'asr': details.get('sr') + 'filesize': int_or_none(song.get('size')), + 'asr': int_or_none(details.get('sr')), }) - break + elif err == 0: + err = try_get(song, lambda x: x['code'], int) + + if not formats: + msg = 'No media links found' + if err != 0 and (err < 200 or err >= 400): + raise ExtractorError( + '%s (site code %d)' % (msg, err, ), expected=True) + else: + self.raise_geo_restricted( + msg + ': probably this video is not available from your location due to geo restriction.', + countries=['CN']) + return formats @classmethod @@ -76,33 +162,19 @@ class NetEaseMusicBaseIE(InfoExtractor): class NetEaseMusicIE(NetEaseMusicBaseIE): IE_NAME = 'netease:song' IE_DESC = '网易云音乐' - _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/song?id=32102397', - 'md5': 'f2e97280e6345c74ba9d5677dd5dcb45', + 'md5': '3e909614ce09b1ccef4a3eb205441190', 'info_dict': { 'id': '32102397', 'ext': 'mp3', - 'title': 'Bad Blood (feat. Kendrick Lamar)', + 'title': 'Bad Blood', 'creator': 'Taylor Swift / Kendrick Lamar', - 'upload_date': '20150517', - 'timestamp': 1431878400, - 'description': 'md5:a10a54589c2860300d02e1de821eb2ef', + 'upload_date': '20150516', + 'timestamp': 1431792000, + 'description': 'md5:25fc5f27e47aad975aa6d36382c7833c', }, - 'skip': 'Blocked outside Mainland China', - }, { - 'note': 'No lyrics translation.', - 'url': 'http://music.163.com/#/song?id=29822014', - 'info_dict': { - 'id': '29822014', - 'ext': 'mp3', - 'title': '听见下雨的声音', - 'creator': '周杰伦', - 'upload_date': '20141225', - 'timestamp': 1419523200, - 'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c', - }, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'No lyrics.', 'url': 'http://music.163.com/song?id=17241424', @@ -112,9 +184,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'title': 'Opus 28', 'creator': 'Dustin O\'Halloran', 'upload_date': '20080211', + 'description': 'md5:f12945b0f6e0365e3b73c5032e1b0ff4', 'timestamp': 1202745600, }, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'Has translated name.', 'url': 'http://music.163.com/#/song?id=22735043', @@ -128,7 +200,18 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'timestamp': 1264608000, 'alt_title': '说出愿望吧(Genie)', }, - 'skip': 'Blocked outside Mainland China', + }, { + 'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846', + 'md5': '95826c73ea50b1c288b22180ec9e754d', + 'info_dict': { + 'id': '95670', + 'ext': 'mp3', + 'title': '国际歌', + 'creator': '马备', + 'upload_date': '19911130', + 'timestamp': 691516800, + 'description': 'md5:1ba2f911a2b0aa398479f595224f2141', + }, }] def _process_lyrics(self, lyrics_info): diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 8a9331a79..f43d91cd5 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -7,7 +8,7 @@ from ..utils import urljoin class NhkBaseIE(InfoExtractor): - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' + _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json' _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand' _TYPE_REGEX = r'/(?P<type>video|audio)/' @@ -23,7 +24,7 @@ class NhkBaseIE(InfoExtractor): def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups() - if episode_id.isdigit(): + if len(episode_id) == 7: episode_id = episode_id[:4] + '-' + episode_id[4:] is_video = m_type == 'video' @@ -84,7 +85,8 @@ class NhkBaseIE(InfoExtractor): class NhkVodIE(NhkBaseIE): - _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg + _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ @@ -124,6 +126,19 @@ class NhkVodIE(NhkBaseIE): }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', 'only_matching': True, + }, { + # video, alphabetic character in ID #29670 + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/', + 'only_matching': True, + 'info_dict': { + 'id': 'qfjay6cg', + 'ext': 'mp4', + 'title': 'DESIGN TALKS plus - Fishermen’s Finery', + 'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448', + 'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$', + 'upload_date': '20210615', + 'timestamp': 1623722008, + } }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index a85fc3d5c..93f813968 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -2,25 +2,28 @@ from __future__ import unicode_literals import datetime -import functools +import itertools import json -import math +import re -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor +from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..compat import ( compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( - determine_ext, - dict_get, ExtractorError, + dict_get, float_or_none, - InAdvancePagedList, int_or_none, + OnDemandPagedList, parse_duration, parse_iso8601, + PostProcessingError, remove_start, + str_or_none, try_get, unified_timestamp, urlencode_postdata, @@ -34,7 +37,7 @@ class NiconicoIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', - 'md5': 'd1a75c0823e2f629128c43e1212760f9', + 'md5': 'a5bad06f1347452102953f323c69da34s', 'info_dict': { 'id': 'sm22312215', 'ext': 'mp4', @@ -157,11 +160,34 @@ class NiconicoIE(InfoExtractor): }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, + }, { + # DMC video with heartbeat + 'url': 'https://www.nicovideo.jp/watch/sm34815188', + 'md5': '9360c6e1f1519d7759e2fe8e1326ae83', + 'info_dict': { + 'id': 'sm34815188', + 'ext': 'mp4', + 'title': 'md5:aee93e9f3366db72f902f6cd5d389cb7', + 'description': 'md5:7b9149fc7a00ab053cafaf5c19662704', + 'thumbnail': r're:https?://.*', + 'uploader': 'md5:2762e18fa74dbb40aa1ad27c6291ee32', + 'uploader_id': '67449889', + 'upload_date': '20190322', + 'timestamp': int, # timestamp is unstable + 'duration': 1082.0, + 'view_count': int, + 'comment_count': int, + }, }] _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } + def _real_initialize(self): self._login() @@ -191,37 +217,89 @@ class NiconicoIE(InfoExtractor): self._downloader.report_warning('unable to log in: bad username or password') return login_ok - def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): - def yesno(boolean): - return 'yes' if boolean else 'no' + def _get_heartbeat_info(self, info_dict): - session_api_data = api_data['video']['dmcInfo']['session_api'] - session_api_endpoint = session_api_data['urls'][0] + video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/') - format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) + api_data = ( + info_dict.get('_api_data') + or self._parse_json( + self._html_search_regex( + 'data-api-data="([^"]+)"', + self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id), + 'API data', default='{}'), + video_id)) + + session_api_data = try_get(api_data, lambda x: x['media']['delivery']['movie']['session']) + session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0]) + + def ping(): + status = try_get( + self._download_json( + 'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id, + query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])}, + note='Acquiring permission for downloading video', + headers=self._API_HEADERS), + lambda x: x['meta']['status']) + if status != 200: + self.report_warning('Failed to acquire permission for playing video. The video may not download.') + + yesno = lambda x: 'yes' if x else 'no' + + # m3u8 (encryption) + if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None: + protocol = 'm3u8' + encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption'] + session_api_http_parameters = { + 'parameters': { + 'hls_parameters': { + 'encryption': { + encryption: { + 'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']), + 'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri']) + } + }, + 'transfer_preset': '', + 'use_ssl': yesno(session_api_endpoint['isSsl']), + 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']), + 'segment_duration': 6000, + } + } + } + # http + else: + protocol = 'http' + session_api_http_parameters = { + 'parameters': { + 'http_output_download_parameters': { + 'use_ssl': yesno(session_api_endpoint['isSsl']), + 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']), + } + } + } session_response = self._download_json( session_api_endpoint['url'], video_id, query={'_format': 'json'}, headers={'Content-Type': 'application/json'}, - note='Downloading JSON metadata for %s' % format_id, + note='Downloading JSON metadata for %s' % info_dict['format_id'], data=json.dumps({ 'session': { 'client_info': { - 'player_id': session_api_data['player_id'], + 'player_id': session_api_data.get('playerId'), }, 'content_auth': { - 'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]], - 'content_key_timeout': session_api_data['content_key_timeout'], + 'auth_type': try_get(session_api_data, lambda x: x['authTypes'][session_api_data['protocols'][0]]), + 'content_key_timeout': session_api_data.get('contentKeyTimeout'), 'service_id': 'nicovideo', - 'service_user_id': session_api_data['service_user_id'] + 'service_user_id': session_api_data.get('serviceUserId') }, - 'content_id': session_api_data['content_id'], + 'content_id': session_api_data.get('contentId'), 'content_src_id_sets': [{ 'content_src_ids': [{ 'src_id_to_mux': { - 'audio_src_ids': [audio_quality['id']], - 'video_src_ids': [video_quality['id']], + 'audio_src_ids': [audio_src_id], + 'video_src_ids': [video_src_id], } }] }], @@ -229,52 +307,81 @@ class NiconicoIE(InfoExtractor): 'content_uri': '', 'keep_method': { 'heartbeat': { - 'lifetime': session_api_data['heartbeat_lifetime'] + 'lifetime': session_api_data.get('heartbeatLifetime') } }, - 'priority': session_api_data['priority'], + 'priority': session_api_data.get('priority'), 'protocol': { 'name': 'http', 'parameters': { - 'http_parameters': { - 'parameters': { - 'http_output_download_parameters': { - 'use_ssl': yesno(session_api_endpoint['is_ssl']), - 'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']), - } - } - } + 'http_parameters': session_api_http_parameters } }, - 'recipe_id': session_api_data['recipe_id'], + 'recipe_id': session_api_data.get('recipeId'), 'session_operation_auth': { 'session_operation_auth_by_signature': { - 'signature': session_api_data['signature'], - 'token': session_api_data['token'], + 'signature': session_api_data.get('signature'), + 'token': session_api_data.get('token'), } }, 'timing_constraint': 'unlimited' } }).encode()) - resolution = video_quality.get('resolution', {}) + info_dict['url'] = session_response['data']['session']['content_uri'] + info_dict['protocol'] = protocol + + # get heartbeat info + heartbeat_info_dict = { + 'url': session_api_endpoint['url'] + '/' + session_response['data']['session']['id'] + '?_format=json&_method=PUT', + 'data': json.dumps(session_response['data']), + # interval, convert milliseconds to seconds, then halve to make a buffer. + 'interval': float_or_none(session_api_data.get('heartbeatLifetime'), scale=3000), + 'ping': ping + } + + return info_dict, heartbeat_info_dict + + def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): + def parse_format_id(id_code): + mobj = re.match(r'''(?x) + (?:archive_)? + (?:(?P<codec>[^_]+)_)? + (?:(?P<br>[\d]+)kbps_)? + (?:(?P<res>[\d+]+)p_)? + ''', '%s_' % id_code) + return mobj.groupdict() if mobj else {} + + protocol = 'niconico_dmc' + format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) + vdict = parse_format_id(video_quality['id']) + adict = parse_format_id(audio_quality['id']) + resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')} + vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float) return { - 'url': session_response['data']['session']['content_uri'], + 'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']), 'format_id': format_id, + 'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str), 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 - 'abr': float_or_none(audio_quality.get('bitrate'), 1000), - 'vbr': float_or_none(video_quality.get('bitrate'), 1000), - 'height': resolution.get('height'), - 'width': resolution.get('width'), + 'vcodec': vdict.get('codec'), + 'acodec': adict.get('codec'), + 'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')), + 'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')), + 'height': int_or_none(resolution.get('height', vdict.get('res'))), + 'width': int_or_none(resolution.get('width')), + 'quality': -2 if 'low' in format_id else -1, # Default quality value is -1 + 'protocol': protocol, + 'http_headers': { + 'Origin': 'https://www.nicovideo.jp', + 'Referer': 'https://www.nicovideo.jp/watch/' + video_id, + } } def _real_extract(self, url): video_id = self._match_id(url) - # Get video webpage. We are not actually interested in it for normal - # cases, but need the cookies in order to be able to download the - # info webpage + # Get video webpage for API data. webpage, handle = self._download_webpage_handle( 'http://www.nicovideo.jp/watch/' + video_id, video_id) if video_id.startswith('so'): @@ -284,86 +391,136 @@ class NiconicoIE(InfoExtractor): 'data-api-data="([^"]+)"', webpage, 'API data', default='{}'), video_id) - def _format_id_from_url(video_url): - return 'economy' if video_real_url.endswith('low') else 'normal' + def get_video_info_web(items): + return dict_get(api_data['video'], items) - try: - video_real_url = api_data['video']['smileInfo']['url'] - except KeyError: # Flash videos - # Get flv info - flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', - video_id, 'Downloading flv info') + # Get video info + video_info_xml = self._download_xml( + 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, + video_id, note='Downloading video info page') - flv_info = compat_parse_qs(flv_info_webpage) - if 'url' not in flv_info: - if 'deleted' in flv_info: - raise ExtractorError('The video has been deleted.', - expected=True) - elif 'closed' in flv_info: - raise ExtractorError('Niconico videos now require logging in', - expected=True) - elif 'error' in flv_info: - raise ExtractorError('%s reports error: %s' % ( - self.IE_NAME, flv_info['error'][0]), expected=True) - else: - raise ExtractorError('Unable to find video URL') + def get_video_info_xml(items): + if not isinstance(items, list): + items = [items] + for item in items: + ret = xpath_text(video_info_xml, './/' + item) + if ret: + return ret - video_info_xml = self._download_xml( - 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, - video_id, note='Downloading video info page') + if get_video_info_xml('error'): + error_code = get_video_info_xml('code') - def get_video_info(items): - if not isinstance(items, list): - items = [items] - for item in items: - ret = xpath_text(video_info_xml, './/' + item) - if ret: - return ret + if error_code == 'DELETED': + raise ExtractorError('The video has been deleted.', + expected=True) + elif error_code == 'NOT_FOUND': + raise ExtractorError('The video is not found.', + expected=True) + elif error_code == 'COMMUNITY': + self.to_screen('%s: The video is community members only.' % video_id) + else: + raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code)) - video_real_url = flv_info['url'][0] + # Start extracting video formats + formats = [] - extension = get_video_info('movie_type') - if not extension: - extension = determine_ext(video_real_url) + # Get HTML5 videos info + quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie']) + if not quality_info: + raise ExtractorError('The video can\'t be downloaded', expected=True) - formats = [{ - 'url': video_real_url, - 'ext': extension, - 'format_id': _format_id_from_url(video_real_url), - }] - else: - formats = [] + for audio_quality in quality_info.get('audios') or {}: + for video_quality in quality_info.get('videos') or {}: + if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): + continue + formats.append(self._extract_format_for_quality( + api_data, video_id, audio_quality, video_quality)) - dmc_info = api_data['video'].get('dmcInfo') - if dmc_info: # "New" HTML5 videos - quality_info = dmc_info['quality'] - for audio_quality in quality_info['audios']: - for video_quality in quality_info['videos']: - if not audio_quality['available'] or not video_quality['available']: - continue - formats.append(self._extract_format_for_quality( - api_data, video_id, audio_quality, video_quality)) + # Get flv/swf info + timestamp = None + video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url']) + if video_real_url: + is_economy = video_real_url.endswith('low') - self._sort_formats(formats) - else: # "Old" HTML5 videos - formats = [{ + if is_economy: + self.report_warning('Site is currently in economy mode! You will only have access to lower quality streams') + + # Invoking ffprobe to determine resolution + pp = FFmpegPostProcessor(self._downloader) + cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n') + + self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe')) + + try: + metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies]) + except PostProcessingError as err: + raise ExtractorError(err.msg, expected=True) + + v_stream = a_stream = {} + + # Some complex swf files doesn't have video stream (e.g. nm4809023) + for stream in metadata['streams']: + if stream['codec_type'] == 'video': + v_stream = stream + elif stream['codec_type'] == 'audio': + a_stream = stream + + # Community restricted videos seem to have issues with the thumb API not returning anything at all + filesize = int( + (get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low')) + or metadata['format']['size'] + ) + extension = ( + get_video_info_xml('movie_type') + or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name'] + ) + + # 'creation_time' tag on video stream of re-encoded SMILEVIDEO mp4 files are '1970-01-01T00:00:00.000000Z'. + timestamp = ( + parse_iso8601(get_video_info_web('first_retrieve')) + or unified_timestamp(get_video_info_web('postedDateTime')) + ) + metadata_timestamp = ( + parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time'])) + or timestamp if extension != 'mp4' else 0 + ) + + # According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts + smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00') + + is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0 + + # If movie file size is unstable, old server movie is not source movie. + if filesize > 1: + formats.append({ 'url': video_real_url, - 'ext': 'mp4', - 'format_id': _format_id_from_url(video_real_url), - }] + 'format_id': 'smile' if not is_economy else 'smile_low', + 'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality', + 'ext': extension, + 'container': extension, + 'vcodec': v_stream.get('codec_name'), + 'acodec': a_stream.get('codec_name'), + # Some complex swf files doesn't have total bit rate metadata (e.g. nm6049209) + 'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000), + 'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000), + 'abr': int_or_none(a_stream.get('bit_rate'), scale=1000), + 'height': int_or_none(v_stream.get('height')), + 'width': int_or_none(v_stream.get('width')), + 'source_preference': 5 if not is_economy else -2, + 'quality': 5 if is_source and not is_economy else None, + 'filesize': filesize + }) - def get_video_info(items): - return dict_get(api_data['video'], items) + self._sort_formats(formats) # Start extracting information - title = get_video_info('title') - if not title: - title = self._og_search_title(webpage, default=None) - if not title: - title = self._html_search_regex( + title = ( + get_video_info_xml('title') # prefer to get the untranslated original title + or get_video_info_web(['originalTitle', 'title']) + or self._og_search_title(webpage, default=None) + or self._html_search_regex( r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>', - webpage, 'video title') + webpage, 'video title')) watch_api_data_string = self._html_search_regex( r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>', @@ -372,14 +529,15 @@ class NiconicoIE(InfoExtractor): video_detail = watch_api_data.get('videoDetail', {}) thumbnail = ( - get_video_info(['thumbnail_url', 'thumbnailURL']) + self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None) + or dict_get( # choose highest from 720p to 240p + get_video_info_web('thumbnail'), + ['ogp', 'player', 'largeUrl', 'middleUrl', 'url']) or self._html_search_meta('image', webpage, 'thumbnail', default=None) or video_detail.get('thumbnail')) - description = get_video_info('description') + description = get_video_info_web('description') - timestamp = (parse_iso8601(get_video_info('first_retrieve')) - or unified_timestamp(get_video_info('postedDateTime'))) if not timestamp: match = self._html_search_meta('datePublished', webpage, 'date published', default=None) if match: @@ -388,19 +546,25 @@ class NiconicoIE(InfoExtractor): timestamp = parse_iso8601( video_detail['postedAt'].replace('/', '-'), delimiter=' ', timezone=datetime.timedelta(hours=9)) + timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt'])) - view_count = int_or_none(get_video_info(['view_counter', 'viewCount'])) + view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount'])) if not view_count: match = self._html_search_regex( r'>Views: <strong[^>]*>([^<]+)</strong>', webpage, 'view count', default=None) if match: view_count = int_or_none(match.replace(',', '')) - view_count = view_count or video_detail.get('viewCount') + view_count = ( + view_count + or video_detail.get('viewCount') + or try_get(api_data, lambda x: x['video']['count']['view'])) + + comment_count = ( + int_or_none(get_video_info_web('comment_num')) + or video_detail.get('commentCount') + or try_get(api_data, lambda x: x['video']['count']['comment'])) - comment_count = (int_or_none(get_video_info('comment_num')) - or video_detail.get('commentCount') - or try_get(api_data, lambda x: x['thread']['commentCount'])) if not comment_count: match = self._html_search_regex( r'>Comments: <strong[^>]*>([^<]+)</strong>', @@ -409,22 +573,41 @@ class NiconicoIE(InfoExtractor): comment_count = int_or_none(match.replace(',', '')) duration = (parse_duration( - get_video_info('length') + get_video_info_web('length') or self._html_search_meta( 'video:duration', webpage, 'video duration', default=None)) or video_detail.get('length') - or get_video_info('duration')) + or get_video_info_web('duration')) - webpage_url = get_video_info('watch_url') or url + webpage_url = get_video_info_web('watch_url') or url + + # for channel movie and community movie + channel_id = try_get( + api_data, + (lambda x: x['channel']['globalId'], + lambda x: x['community']['globalId'])) + channel = try_get( + api_data, + (lambda x: x['channel']['name'], + lambda x: x['community']['name'])) # Note: cannot use api_data.get('owner', {}) because owner may be set to "null" # in the JSON, which will cause None to be returned instead of {}. owner = try_get(api_data, lambda x: x.get('owner'), dict) or {} - uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id') - uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname') + uploader_id = str_or_none( + get_video_info_web(['ch_id', 'user_id']) + or owner.get('id') + or channel_id + ) + uploader = ( + get_video_info_web(['ch_name', 'user_nickname']) + or owner.get('nickname') + or channel + ) return { 'id': video_id, + '_api_data': api_data, 'title': title, 'formats': formats, 'thumbnail': thumbnail, @@ -432,6 +615,8 @@ class NiconicoIE(InfoExtractor): 'uploader': uploader, 'timestamp': timestamp, 'uploader_id': uploader_id, + 'channel': channel, + 'channel_id': channel_id, 'view_count': view_count, 'comment_count': comment_count, 'duration': duration, @@ -440,7 +625,7 @@ class NiconicoIE(InfoExtractor): class NiconicoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/|my/)?mylist/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.nicovideo.jp/mylist/27411728', @@ -456,60 +641,185 @@ class NiconicoPlaylistIE(InfoExtractor): 'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728', 'only_matching': True, }] - _PAGE_SIZE = 100 - def _call_api(self, list_id, resource, query): - return self._download_json( - 'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, - 'Downloading %s JSON metatdata' % resource, query=query, - headers={'X-Frontend-Id': 6})['data']['mylist'] - - def _parse_owner(self, item): - owner = item.get('owner') or {} - if owner: - return { - 'uploader': owner.get('name'), - 'uploader_id': owner.get('id'), - } - return {} - - def _fetch_page(self, list_id, page): - page += 1 - items = self._call_api(list_id, 'page %d' % page, { - 'page': page, - 'pageSize': self._PAGE_SIZE, - })['items'] - for item in items: - video = item.get('video') or {} - video_id = video.get('id') - if not video_id: - continue - count = video.get('count') or {} - get_count = lambda x: int_or_none(count.get(x)) - info = { - '_type': 'url', - 'id': video_id, - 'title': video.get('title'), - 'url': 'https://www.nicovideo.jp/watch/' + video_id, - 'description': video.get('shortDescription'), - 'duration': int_or_none(video.get('duration')), - 'view_count': get_count('view'), - 'comment_count': get_count('comment'), - 'ie_key': NiconicoIE.ie_key(), - } - info.update(self._parse_owner(video)) - yield info + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } def _real_extract(self, url): list_id = self._match_id(url) - mylist = self._call_api(list_id, 'list', { - 'pageSize': 1, - }) - entries = InAdvancePagedList( - functools.partial(self._fetch_page, list_id), - math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE), - self._PAGE_SIZE) - result = self.playlist_result( - entries, list_id, mylist.get('name'), mylist.get('description')) - result.update(self._parse_owner(mylist)) - return result + + def get_page_data(pagenum, pagesize): + return self._download_json( + 'http://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, + query={'page': 1 + pagenum, 'pageSize': pagesize}, + headers=self._API_HEADERS).get('data').get('mylist') + + data = get_page_data(0, 1) + title = data.get('name') + description = data.get('description') + uploader = data.get('owner').get('name') + uploader_id = data.get('owner').get('id') + + def pagefunc(pagenum): + data = get_page_data(pagenum, 25) + return ({ + '_type': 'url', + 'url': 'http://www.nicovideo.jp/watch/' + item.get('watchId'), + } for item in data.get('items')) + + return { + '_type': 'playlist', + 'id': list_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'entries': OnDemandPagedList(pagefunc, 25), + } + + +class NicovideoSearchBaseIE(InfoExtractor): + _MAX_RESULTS = float('inf') + + def _entries(self, url, item_id, query=None, note='Downloading page %(page)s'): + query = query or {} + pages = [query['page']] if 'page' in query else itertools.count(1) + for page_num in pages: + query['page'] = str(page_num) + webpage = self._download_webpage(url, item_id, query=query, note=note % {'page': page_num}) + results = re.findall(r'(?<=data-video-id=)["\']?(?P<videoid>.+?)(?=["\'])', webpage) + for item in results: + yield self.url_result('http://www.nicovideo.jp/watch/%s' % item, 'Niconico', item) + if not results: + break + + def _get_n_results(self, query, n): + entries = self._entries(self._proto_relative_url('//www.nicovideo.jp/search/%s' % query), query) + if n < self._MAX_RESULTS: + entries = itertools.islice(entries, 0, n) + return self.playlist_result(entries, query, query) + + +class NicovideoSearchIE(NicovideoSearchBaseIE, SearchInfoExtractor): + IE_DESC = 'Nico video search' + IE_NAME = 'nicovideo:search' + _SEARCH_KEY = 'nicosearch' + + def _search_results(self, query): + return self._entries( + self._proto_relative_url('//www.nicovideo.jp/search/%s' % query), query) + + +class NicovideoSearchURLIE(NicovideoSearchBaseIE): + IE_NAME = '%s_url' % NicovideoSearchIE.IE_NAME + IE_DESC = 'Nico video search URLs' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/search/(?P<id>[^?#&]+)?' + _TESTS = [{ + 'url': 'http://www.nicovideo.jp/search/sm9', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_mincount': 40, + }, { + 'url': 'https://www.nicovideo.jp/search/sm9?sort=h&order=d&end=2020-12-31&start=2020-01-01', + 'info_dict': { + 'id': 'sm9', + 'title': 'sm9' + }, + 'playlist_count': 31, + }] + + def _real_extract(self, url): + query = self._match_id(url) + return self.playlist_result(self._entries(url, query), query, query) + + +class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor): + IE_DESC = 'Nico video search, newest first' + IE_NAME = '%s:date' % NicovideoSearchIE.IE_NAME + _SEARCH_KEY = 'nicosearchdate' + + _TESTS = [{ + 'url': 'nicosearchdateall:a', + 'info_dict': { + 'id': 'a', + 'title': 'a' + }, + 'playlist_mincount': 1610, + }] + + _START_DATE = datetime.date(2007, 1, 1) + _RESULTS_PER_PAGE = 32 + _MAX_PAGES = 50 + + def _entries(self, url, item_id, start_date=None, end_date=None): + start_date, end_date = start_date or self._START_DATE, end_date or datetime.datetime.now().date() + + # If the last page has a full page of videos, we need to break down the query interval further + last_page_len = len(list(self._get_entries_for_date( + url, item_id, start_date, end_date, self._MAX_PAGES, + note='Checking number of videos from {0} to {1}'.format(start_date, end_date)))) + if (last_page_len == self._RESULTS_PER_PAGE and start_date != end_date): + midpoint = start_date + ((end_date - start_date) // 2) + for entry in itertools.chain( + iter(self._entries(url, item_id, midpoint, end_date)), + iter(self._entries(url, item_id, start_date, midpoint))): + yield entry + else: + self.to_screen('{0}: Downloading results from {1} to {2}'.format(item_id, start_date, end_date)) + for entry in iter(self._get_entries_for_date( + url, item_id, start_date, end_date, note=' Downloading page %(page)s')): + yield entry + + def _get_entries_for_date(self, url, item_id, start_date, end_date=None, page_num=None, note=None): + query = { + 'start': compat_str(start_date), + 'end': compat_str(end_date or start_date), + 'sort': 'f', + 'order': 'd', + } + if page_num: + query['page'] = compat_str(page_num) + + for entry in iter(super(NicovideoSearchDateIE, self)._entries(url, item_id, query=query, note=note)): + yield entry + + +class NiconicoUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])' + _TEST = { + 'url': 'https://www.nicovideo.jp/user/419948', + 'info_dict': { + 'id': '419948', + }, + 'playlist_mincount': 101, + } + _API_URL = "https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s" + _PAGE_SIZE = 100 + + _API_HEADERS = { + 'X-Frontend-ID': '6', + 'X-Frontend-Version': '0' + } + + def _entries(self, list_id): + total_count = 1 + count = page_num = 0 + while count < total_count: + json_parsed = self._download_json( + self._API_URL % (list_id, self._PAGE_SIZE, page_num + 1), list_id, + headers=self._API_HEADERS, + note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else '')) + if not page_num: + total_count = int_or_none(json_parsed['data'].get('totalCount')) + for entry in json_parsed["data"]["items"]: + count += 1 + yield self.url_result('https://www.nicovideo.jp/watch/%s' % entry['id']) + page_num += 1 + + def _real_extract(self, url): + list_id = self._match_id(url) + return self.playlist_result(self._entries(list_id), list_id) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 6d01a25c3..5a62b50fc 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -60,8 +60,7 @@ class NRKBaseIE(InfoExtractor): return self._download_json( urljoin('https://psapi.nrk.no/', path), video_id, note or 'Downloading %s JSON' % item, - fatal=fatal, query=query, - headers={'Accept-Encoding': 'gzip, deflate, br'}) + fatal=fatal, query=query) class NRKIE(NRKBaseIE): diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py index ab6bfcd7f..f6c94dd77 100644 --- a/youtube_dl/extractor/nuvid.py +++ b/youtube_dl/extractor/nuvid.py @@ -1,71 +1,113 @@ +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( parse_duration, + int_or_none, + try_get, + url_or_none, ) +import re + class NuvidIE(InfoExtractor): _VALID_URL = r'https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://m.nuvid.com/video/1310741/', - 'md5': 'eab207b7ac4fccfb4e23c86201f11277', + _TESTS = [{ + 'url': 'https://www.nuvid.com/video/6513023/italian-babe', + 'md5': '772d2f8288f3d3c5c45f7a41761c7844', 'info_dict': { - 'id': '1310741', + 'id': '6513023', 'ext': 'mp4', - 'title': 'Horny babes show their awesome bodeis and', - 'duration': 129, + 'title': 'italian babe', + 'format_id': '360p', + 'duration': 321.0, 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, } - } + }, { + 'url': 'https://m.nuvid.com/video/6523263', + 'md5': 'ebd22ce8e47e1d9a4d0756a15c67da52', + 'info_dict': { + 'id': '6523263', + 'ext': 'mp4', + 'title': 'Slut brunette college student anal dorm', + 'format_id': '720p', + 'duration': 421.0, + 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, + } + }, { + 'url': 'http://m.nuvid.com/video/6415801/', + 'md5': '638d5ececb138d5753593f751ae3f697', + 'info_dict': { + 'id': '6415801', + 'ext': 'mp4', + 'title': 'My best friend wanted to fuck my wife for a long time', + 'format_id': '720p', + 'duration': 1882, + 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, + } + }] def _real_extract(self, url): video_id = self._match_id(url) - page_url = 'http://m.nuvid.com/video/%s' % video_id + qualities = { + 'lq': '360p', + 'hq': '720p', + } + + json_url = 'https://www.nuvid.com/player_config_json/?vid={video_id}&aid=0&domain_id=0&embed=0&check_speed=0'.format(**locals()) + video_data = self._download_json( + json_url, video_id, headers={ + 'Accept': 'application/json, text/javascript, */*; q = 0.01', + 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', + }) or {} + + # nice to have, not required webpage = self._download_webpage( - page_url, video_id, 'Downloading video page') - # When dwnld_speed exists and has a value larger than the MP4 file's - # bitrate, Nuvid returns the MP4 URL - # It's unit is 100bytes/millisecond, see mobile-nuvid-min.js for the algorithm - self._set_cookie('nuvid.com', 'dwnld_speed', '10.0') - mp4_webpage = self._download_webpage( - page_url, video_id, 'Downloading video page for MP4 format') + 'http://m.nuvid.com/video/%s' % (video_id, ), + video_id, 'Downloading video page', fatal=False) or '' + + title = ( + try_get(video_data, lambda x: x['title'], compat_str) + or self._html_search_regex( + (r'''<span\s[^>]*?\btitle\s*=\s*(?P<q>"|'|\b)(?P<title>[^"]+)(?P=q)\s*>''', + r'''<div\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)thumb-holder video(?P=q)>\s*<h5\b[^>]*>(?P<title>[^<]+)</h5''', + r'''<span\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)title_thumb(?P=q)>(?P<title>[^<]+)</span'''), + webpage, 'title', group='title')).strip() - html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', - video_url = self._html_search_regex(html5_video_re, webpage, video_id) - mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id) formats = [{ - 'url': video_url, - }] - if mp4_video_url != video_url: - formats.append({ - 'url': mp4_video_url, - }) + 'url': source, + 'format_id': qualities.get(quality), + 'height': int_or_none(qualities.get(quality)[:-1]), + } for quality, source in video_data.get('files').items() if source] - title = self._html_search_regex( - [r'<span title="([^"]+)">', - r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>', - r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip() + self._check_formats(formats, video_id) + self._sort_formats(formats) + + duration = parse_duration(video_data.get('duration') or video_data.get('duration_format')) thumbnails = [ - { - 'url': thumb_url, - } for thumb_url in re.findall(r'<img src="([^"]+)" alt="" />', webpage) + {'url': thumb_url, } + for thumb_url in ( + url_or_none(src) for src in re.findall( + r'<div\s+class\s*=\s*"video-tmb-wrap"\s*>\s*<img\s+src\s*=\s*"([^"]+)"\s*/>', + webpage)) ] - thumbnail = thumbnails[0]['url'] if thumbnails else None - duration = parse_duration(self._html_search_regex( - [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', - r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False)) return { 'id': video_id, + 'formats': formats, 'title': title, + 'thumbnail': url_or_none(video_data.get('poster')), 'thumbnails': thumbnails, - 'thumbnail': thumbnail, 'duration': duration, 'age_limit': 18, - 'formats': formats, } diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 0c20d0177..b05d60435 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -16,6 +16,7 @@ from ..utils import ( ExtractorError, get_exe_version, is_outdated_version, + process_communicate_or_kill, std_headers, ) @@ -226,7 +227,7 @@ class PhantomJSwrapper(object): self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = p.communicate() + out, err = process_communicate_or_kill(p) if p.returncode != 0: raise ExtractorError( 'Executing JS failed\n:' + encodeArgument(err)) diff --git a/youtube_dl/extractor/peekvids.py b/youtube_dl/extractor/peekvids.py new file mode 100644 index 000000000..c8aad564b --- /dev/null +++ b/youtube_dl/extractor/peekvids.py @@ -0,0 +1,193 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_class, + int_or_none, + merge_dicts, + url_or_none, +) + + +class PeekVidsIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?peekvids\.com/ + (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=) + (?P<id>[^/?&#]*) + ''' + _TESTS = [{ + 'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd', + 'md5': '2ff6a357a9717dc9dc9894b51307e9a2', + 'info_dict': { + 'id': '1262717', + 'display_id': 'BSyLMbN0YCd', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', + 'timestamp': 1642579329, + 'upload_date': '20220119', + 'duration': 416, + 'view_count': int, + 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, + }, + }] + _DOMAIN = 'www.peekvids.com' + + def _get_detail(self, html): + return get_element_by_class('detail-video-block', html) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, expected_status=429) + if '>Rate Limit Exceeded' in webpage: + raise ExtractorError( + '[%s] %s: %s' % (self.IE_NAME, video_id, 'You are suspected as a bot. Wait, or pass the captcha test on the site and provide --cookies.'), + expected=True) + + title = self._html_search_regex(r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title') + + display_id = video_id + video_id = self._search_regex(r'(?s)<video\b[^>]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID') + srcs = self._download_json( + 'https://%s/v-alt/%s' % (self._DOMAIN, video_id), video_id, + note='Downloading list of source files') + formats = [{ + 'url': f_url, + 'format_id': f_id, + 'height': int_or_none(f_id), + } for f_url, f_id in ( + (url_or_none(f_v), f_match.group(1)) + for f_v, f_match in ( + (v, re.match(r'^data-src(\d{3,})$', k)) + for k, v in srcs.items() if v) if f_match) + if f_url + ] + if not formats: + formats = [{'url': url} for url in srcs.values()] + self._sort_formats(formats) + + info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={}) + info.pop('url', None) + # may not have found the thumbnail if it was in a list in the ld+json + info.setdefault('thumbnail', self._og_search_thumbnail(webpage)) + detail = self._get_detail(webpage) or '' + info['description'] = self._html_search_regex( + r'(?s)(.+?)(?:%s\s*<|<ul\b)' % (re.escape(info.get('description', '')), ), + detail, 'description', default=None) or None + info['title'] = re.sub(r'\s*[,-][^,-]+$', '', info.get('title') or title) or self._generic_title(url) + + def cat_tags(name, html): + l = self._html_search_regex( + r'(?s)<span\b[^>]*>\s*%s\s*:\s*</span>(.+?)</li>' % (re.escape(name), ), + html, name, default='') + return [x for x in re.split(r'\s+', l) if x] + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'age_limit': 18, + 'formats': formats, + 'categories': cat_tags('Categories', detail), + 'tags': cat_tags('Tags', detail), + 'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None), + }, info) + + +class PlayVidsIE(PeekVidsIE): + _VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|\w\w?/)?(?P<id>[^/?#]*)' + _TESTS = [{ + 'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', + 'md5': '2f12e50213dd65f142175da633c4564c', + 'info_dict': { + 'id': '1978030', + 'display_id': 'U3pBrYhsjXM', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', + 'timestamp': 1640435839, + 'upload_date': '20211225', + 'duration': 416, + 'view_count': int, + 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', + 'only_matching': True, + }, { + 'url': 'https://www.playvids.com/embed/U3pBrYhsjXM', + 'only_matching': True, + }, { + 'url': 'https://www.playvids.com/bKmGLe3IwjZ/sv/brazzers-800-phone-sex-madison-ivy-always-on-the-line', + 'md5': 'e783986e596cafbf46411a174ab42ba6', + 'info_dict': { + 'id': '762385', + 'display_id': 'bKmGLe3IwjZ', + 'ext': 'mp4', + 'title': 'Brazzers - 1 800 Phone Sex: Madison Ivy Always On The Line 6', + 'description': 'md5:bdcd2db2b8ad85831a491d7c8605dcef', + 'timestamp': 1516958544, + 'upload_date': '20180126', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 480, + 'uploader': 'Brazzers', + 'age_limit': 18, + 'view_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/v/47iUho33toY', + 'md5': 'b056b5049d34b648c1e86497cf4febce', + 'info_dict': { + 'id': '700621', + 'display_id': '47iUho33toY', + 'ext': 'mp4', + 'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE', + 'description': None, + 'timestamp': 1507052209, + 'upload_date': '20171003', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 332, + 'uploader': 'Cacerenele', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + } + }, { + 'url': 'https://www.playvids.com/z3_7iwWCmqt/sexy-teen-filipina-striptease-beautiful-pinay-bargirl-strips-and-dances', + 'md5': 'efa09be9f031314b7b7e3bc6510cd0df', + 'info_dict': { + 'id': '1523518', + 'display_id': 'z3_7iwWCmqt', + 'ext': 'mp4', + 'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances', + 'description': None, + 'timestamp': 1607470323, + 'upload_date': '20201208', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 593, + 'uploader': 'yorours', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + }, + }] + _DOMAIN = 'www.playvids.com' + + def _get_detail(self, html): + return get_element_by_class('detail-block', html) diff --git a/youtube_dl/extractor/pr0gramm.py b/youtube_dl/extractor/pr0gramm.py new file mode 100644 index 000000000..b68224fd5 --- /dev/null +++ b/youtube_dl/extractor/pr0gramm.py @@ -0,0 +1,105 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re +from ..utils import ( + merge_dicts, +) + + +class Pr0grammStaticIE(InfoExtractor): + # Possible urls: + # https://pr0gramm.com/static/5466437 + _VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://pr0gramm.com/static/5466437', + 'md5': '52fa540d70d3edc286846f8ca85938aa', + 'info_dict': { + 'id': '5466437', + 'ext': 'mp4', + 'title': 'pr0gramm-5466437 by g11st', + 'uploader': 'g11st', + 'upload_date': '20221221', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Fetch media sources + entries = self._parse_html5_media_entries(url, webpage, video_id) + media_info = entries[0] + + # this raises if there are no formats + self._sort_formats(media_info.get('formats') or []) + + # Fetch author + uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader') + + # Fetch approx upload timestamp from filename + # Have None-defaults in case the extraction fails + uploadDay = None + uploadMon = None + uploadYear = None + uploadTimestr = None + # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4) + m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage) + + if (m): + # Up to a day of accuracy should suffice... + uploadDay = m.groupdict().get('day') + uploadMon = m.groupdict().get('mon') + uploadYear = m.groupdict().get('year') + uploadTimestr = uploadYear + uploadMon + uploadDay + + return merge_dicts({ + 'id': video_id, + 'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''), + 'uploader': uploader, + 'upload_date': uploadTimestr + }, media_info) + + +# This extractor is for the primary url (used for sharing, and appears in the +# location bar) Since this page loads the DOM via JS, yt-dl can't find any +# video information here. So let's redirect to a compatibility version of +# the site, which does contain the <video>-element by itself, without requiring +# js to be ran. +class Pr0grammIE(InfoExtractor): + # Possible urls: + # https://pr0gramm.com/new/546637 + # https://pr0gramm.com/new/video/546637 + # https://pr0gramm.com/top/546637 + # https://pr0gramm.com/top/video/546637 + # https://pr0gramm.com/user/g11st/uploads/5466437 + # https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290 + # https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030 + # https://pr0gramm.com/user/froschler/1elf/5232030 + # https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id! + # https://pr0gramm.com/top/fruher war alles damals/5498175 + + _VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)' + _TEST = { + 'url': 'https://pr0gramm.com/new/video/5466437', + 'info_dict': { + 'id': '5466437', + 'ext': 'mp4', + 'title': 'pr0gramm-5466437 by g11st', + 'uploader': 'g11st', + 'upload_date': '20221221', + } + } + + def _generic_title(): + return "oof" + + def _real_extract(self, url): + video_id = self._match_id(url) + + return self.url_result( + 'https://pr0gramm.com/static/' + video_id, + video_id=video_id, + ie=Pr0grammStaticIE.ie_key()) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 67b86fc72..563d3400f 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -5,15 +5,16 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urlparse, compat_str, + compat_urlparse, ) from ..utils import ( - ExtractorError, determine_ext, + ExtractorError, find_xpath_attr, fix_xml_ampersands, GeoRestrictedError, + HEADRequest, int_or_none, parse_duration, remove_start, @@ -96,12 +97,100 @@ class RaiBaseIE(InfoExtractor): if not formats and geoprotection is True: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + formats.extend(self._create_http_urls(relinker_url, formats)) + return dict((k, v) for k, v in { 'is_live': is_live, 'duration': duration, 'formats': formats, }.items() if v is not None) + def _create_http_urls(self, relinker_url, fmts): + _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' + _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' + _QUALITY = { + # tbr: w, h + '250': [352, 198], + '400': [512, 288], + '700': [512, 288], + '800': [700, 394], + '1200': [736, 414], + '1800': [1024, 576], + '2400': [1280, 720], + '3200': [1440, 810], + '3600': [1440, 810], + '5000': [1920, 1080], + '10000': [1920, 1080], + } + + def test_url(url): + resp = self._request_webpage( + HEADRequest(url), None, headers={'User-Agent': 'Rai'}, + fatal=False, errnote=False, note=False) + + if resp is False: + return False + + if resp.code == 200: + return False if resp.url == url else resp.url + return None + + def get_format_info(tbr): + import math + br = int_or_none(tbr) + if len(fmts) == 1 and not br: + br = fmts[0].get('tbr') + if br > 300: + tbr = compat_str(math.floor(br / 100) * 100) + else: + tbr = '250' + + # try extracting info from available m3u8 formats + format_copy = None + for f in fmts: + if f.get('tbr'): + br_limit = math.floor(br / 100) + if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1: + format_copy = f.copy() + return { + 'width': format_copy.get('width'), + 'height': format_copy.get('height'), + 'tbr': format_copy.get('tbr'), + 'vcodec': format_copy.get('vcodec'), + 'acodec': format_copy.get('acodec'), + 'fps': format_copy.get('fps'), + 'format_id': 'https-%s' % tbr, + } if format_copy else { + 'width': _QUALITY[tbr][0], + 'height': _QUALITY[tbr][1], + 'format_id': 'https-%s' % tbr, + 'tbr': int(tbr), + } + + loc = test_url(_MP4_TMPL % (relinker_url, '*')) + if not isinstance(loc, compat_str): + return [] + + mobj = re.match( + _RELINKER_REG, + test_url(relinker_url) or '') + if not mobj: + return [] + + available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*'] + available_qualities = [i for i in available_qualities if i] + + formats = [] + for q in available_qualities: + fmt = { + 'url': _MP4_TMPL % (relinker_url, q), + 'protocol': 'https', + 'ext': 'mp4', + } + fmt.update(get_format_info(q)) + formats.append(fmt) + return formats + @staticmethod def _extract_subtitles(url, video_data): STL_EXT = 'stl' @@ -151,6 +240,22 @@ class RaiPlayIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # 1080p direct mp4 url + 'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html', + 'md5': '2e501e8651d72f05ffe8f5d286ad560b', + 'info_dict': { + 'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642', + 'ext': 'mp4', + 'title': 'Leonardo - S1E1', + 'alt_title': 'St 1 Ep 1 - Episodio 1', + 'description': 'md5:f5360cd267d2de146e4e3879a5a47d31', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 1', + 'duration': 3229, + 'series': 'Leonardo', + 'season': 'Season 1', + }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, @@ -158,6 +263,10 @@ class RaiPlayIE(RaiBaseIE): # subtitles at 'subtitlesArray' key (see #27698) 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', 'only_matching': True, + }, { + # DRM protected + 'url': 'https://www.raiplay.it/video/2020/09/Lo-straordinario-mondo-di-Zoey-S1E1-Lo-straordinario-potere-di-Zoey-ed493918-1d32-44b7-8454-862e473d00ff.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -166,6 +275,13 @@ class RaiPlayIE(RaiBaseIE): media = self._download_json( base + '.json', video_id, 'Downloading video JSON') + if try_get( + media, + (lambda x: x['rights_management']['rights']['drm'], + lambda x: x['program_info']['rights_management']['rights']['drm']), + dict): + raise ExtractorError('This video is DRM protected.', expected=True) + title = media['name'] video = media['video'] @@ -307,7 +423,7 @@ class RaiIE(RaiBaseIE): }, { # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', + 'md5': '06345bd97c932f19ffb129973d07a020', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', diff --git a/youtube_dl/extractor/rbgtum.py b/youtube_dl/extractor/rbgtum.py new file mode 100644 index 000000000..da48ebbc4 --- /dev/null +++ b/youtube_dl/extractor/rbgtum.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RbgTumIE(InfoExtractor): + _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)' + _TESTS = [{ + # Combined view + 'url': 'https://live.rbg.tum.de/w/cpp/22128', + 'md5': '53a5e7b3e07128e33bbf36687fe1c08f', + 'info_dict': { + 'id': 'cpp/22128', + 'ext': 'mp4', + 'title': 'Lecture: October 18. 2022', + 'series': 'Concepts of C++ programming (IN2377)', + } + }, { + # Presentation only + 'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES', + 'md5': '36c584272179f3e56b0db5d880639cba', + 'info_dict': { + 'id': 'I2DL/12349/PRES', + 'ext': 'mp4', + 'title': 'Lecture 3: Introduction to Neural Networks', + 'series': 'Introduction to Deep Learning (IN2346)', + } + }, { + # Camera only + 'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM', + 'md5': 'e04189d92ff2f56aedf5cede65d37aad', + 'info_dict': { + 'id': 'fvv-info/16130/CAM', + 'ext': 'mp4', + 'title': 'Fachschaftsvollversammlung', + 'series': 'Fachschaftsvollversammlung Informatik', + } + }, ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8') + lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title') + lecture_series_title = self._html_search_regex( + r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?', webpage, 'series') + + formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': lecture_title, + 'series': lecture_series_title, + 'formats': formats, + } + + +class RbgTumCourseIE(InfoExtractor): + _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P.+)' + _TESTS = [{ + 'url': 'https://live.rbg.tum.de/course/2022/S/fpv', + 'info_dict': { + 'title': 'Funktionale Programmierung und Verifikation (IN0003)', + 'id': '2022/S/fpv', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 13, + }, { + 'url': 'https://live.rbg.tum.de/course/2022/W/set', + 'info_dict': { + 'title': 'SET FSMPIC', + 'id': '2022/W/set', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 6, + }, ] + + def _real_extract(self, url): + course_id = self._match_id(url) + webpage = self._download_webpage(url, course_id) + + lecture_series_title = self._html_search_regex(r'(?si)(.*)', webpage, 'title') + + lecture_urls = [] + for lecture_url in re.findall(r'(?i)href="/w/(.+)(?[0-9]+)' - _API_URL = 'http://www.stream.cz/API' - + _VALID_URL = r'https?://(?:www\.)?(?:stream|televizeseznam)\.cz/[^?#]+/(?P[^?#]+)-(?P[0-9]+)' _TESTS = [{ - 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', - 'md5': '934bb6a6d220d99c010783c9719960d5', + 'url': 'https://www.televizeseznam.cz/video/lajna/buh-57953890', + 'md5': '40c41ade1464a390a0b447e333df4239', 'info_dict': { - 'id': '765767', + 'id': '57953890', 'ext': 'mp4', - 'title': 'Peklo na talíři: Éčka pro děti', - 'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE', - 'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100', - 'duration': 256, - }, + 'title': 'Bůh', + 'display_id': 'buh', + 'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165', + 'duration': 1369.6, + 'view_count': int, + } }, { - 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', - 'md5': '849a88c1e1ca47d41403c2ba5e59e261', + 'url': 'https://www.stream.cz/kdo-to-mluvi/kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna-64087937', + 'md5': '41fd358000086a1ccdb068c77809b158', 'info_dict': { - 'id': '10002447', + 'id': '64087937', 'ext': 'mp4', - 'title': 'Kancelář Blaník: Tři roky pro Mazánka', - 'description': 'md5:3862a00ba7bf0b3e44806b544032c859', - 'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000', - 'duration': 368, - }, + 'title': 'Kdo to mluví? Velké odhalení přináší nový pořad už od 25. srpna', + 'display_id': 'kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna', + 'description': 'md5:97a811000a6460266029d6c1c2ebcd59', + 'duration': 50.2, + 'view_count': int, + } + }, { + 'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267', + 'md5': '3ee4d0be040e8f4a543e67e509d55e3f', + 'info_dict': { + 'id': '64147267', + 'ext': 'mp4', + 'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. Badatelé vše objasnili', + 'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili', + 'description': 'md5:4b8ada6718d34bb011c4e04ca4bc19bf', + 'duration': 442.84, + 'view_count': int, + } }] + def _extract_formats(self, spl_url, video): + for ext, pref, streams in ( + ('ts', -1, video.get('http_stream', {}).get('qualities', {})), + ('mp4', 1, video.get('mp4'))): + for format_id, stream in streams.items(): + if not stream.get('url'): + continue + yield merge_dicts({ + 'format_id': '-'.join((format_id, ext)), + 'ext': ext, + 'source_preference': pref, + 'url': urljoin(spl_url, stream['url']), + 'tbr': float_or_none(stream.get('bandwidth'), scale=1000), + 'duration': float_or_none(stream.get('duration'), scale=1000), + 'width': stream.get('resolution', 2 * [0])[0] or None, + 'height': stream.get('resolution', 2 * [0])[1] or int_or_none(format_id.replace('p', '')), + }, parse_codecs(stream.get('codec'))) + def _real_extract(self, url): - video_id = self._match_id(url) - api_path = '/episode/%s' % video_id + display_id, video_id = re.match(self._VALID_URL, url).groups() - req = sanitized_Request(self._API_URL + api_path) - req.add_header('Api-Password', _get_api_key(api_path)) - data = self._download_json(req, video_id) + data = self._download_json( + 'https://www.televizeseznam.cz/api/graphql', video_id, 'Downloading GraphQL result', + data=json.dumps({ + 'variables': {'urlName': video_id}, + 'query': ''' + query LoadEpisode($urlName : String){ episode(urlName: $urlName){ ...VideoDetailFragmentOnEpisode } } + fragment VideoDetailFragmentOnEpisode on Episode { + id + spl + urlName + name + perex + duration + views + }''' + }).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=UTF-8'} + )['data']['episode'] - formats = [] - for quality, video in enumerate(data['video_qualities']): - for f in video['formats']: - typ = f['type'].partition('/')[2] - qlabel = video.get('quality_label') - formats.append({ - 'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ, - 'format_id': '%s-%s' % (typ, f['quality']), - 'url': f['source'], - 'height': int_or_none(f['quality'].rstrip('p')), - 'quality': quality, - }) - self._sort_formats(formats) - - image = data.get('image') - if image: - thumbnail = self._proto_relative_url( - image.replace('{width}', '1240').replace('{height}', '697'), - scheme='http:', - ) - else: - thumbnail = None - - stream = data.get('_embedded', {}).get('stream:show', {}).get('name') - if stream: - title = '%s: %s' % (stream, data['name']) - else: - title = data['name'] + spl_url = data['spl'] + 'spl2,3' + metadata = self._download_json(spl_url, video_id, 'Downloading playlist') + if 'Location' in metadata and 'data' not in metadata: + spl_url = metadata['Location'] + metadata = self._download_json(spl_url, video_id, 'Downloading redirected playlist') + video = metadata['data'] subtitles = {} - srt_url = data.get('subtitles_srt') - if srt_url: - subtitles['cs'] = [{ - 'ext': 'srt', - 'url': srt_url, - }] + for subs in video.get('subtitles', {}).values(): + if not subs.get('language'): + continue + for ext, sub_url in subs.get('urls').items(): + subtitles.setdefault(subs['language'], []).append({ + 'ext': ext, + 'url': urljoin(spl_url, sub_url) + }) + + formats = list(self._extract_formats(spl_url, video)) + self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'description': data.get('web_site_text'), - 'duration': int_or_none(data.get('duration')), + 'display_id': display_id, + 'title': data.get('name'), + 'description': data.get('perex'), + 'duration': float_or_none(data.get('duration')), 'view_count': int_or_none(data.get('views')), + 'formats': formats, 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/streamsb.py b/youtube_dl/extractor/streamsb.py new file mode 100644 index 000000000..bffcb3de1 --- /dev/null +++ b/youtube_dl/extractor/streamsb.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import binascii +import random +import re +import string + +from .common import InfoExtractor +from ..utils import urljoin, url_basename + + +def to_ascii_hex(str1): + return binascii.hexlify(str1.encode('utf-8')).decode('ascii') + + +def generate_random_string(length): + return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(length)) + + +class StreamsbIE(InfoExtractor): + _DOMAINS = ('viewsb.com', ) + _VALID_URL = r'https://(?P%s)/(?P.+)' % '|'.join(_DOMAINS) + _TEST = { + 'url': 'https://viewsb.com/dxfvlu4qanjx', + 'md5': '488d111a63415369bf90ea83adc8a325', + 'info_dict': { + 'id': 'dxfvlu4qanjx', + 'ext': 'mp4', + 'title': 'Sintel' + } + } + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).group('domain', 'id') + webpage = self._download_webpage(url, video_id) + + iframe_rel_url = self._search_regex(r'''(?i)]+\bsrc\s*=\s*('|")(?P/.*\.html)\1''', webpage, 'iframe', group='path') + iframe_url = urljoin('https://' + domain, iframe_rel_url) + + iframe_data = self._download_webpage(iframe_url, video_id) + app_version = self._search_regex(r''']+\bsrc\s*=\s*["|'].*/app\.min\.(\d+)\.js''', iframe_data, 'app version', fatal=False) or '50' + + video_code = url_basename(iframe_url).rsplit('.')[0] + + length = 12 + req = '||'.join((generate_random_string(length), video_code, generate_random_string(length), 'streamsb')) + ereq = 'https://{0}/sources{1}/{2}'.format(domain, app_version, to_ascii_hex(req)) + + video_data = self._download_webpage(ereq, video_id, headers={ + 'Referer': iframe_url, + 'watchsb': 'sbstream', + }) + player_data = self._parse_json(video_data, video_id) + title = player_data['stream_data']['title'] + formats = self._extract_m3u8_formats(player_data['stream_data']['file'], video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + return { + 'id': video_id, + 'formats': formats, + 'title': title, + } diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 3e1a7a9e6..df02dfc47 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -1,19 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from .jwplatform import JWPlatformIE -from .nexx import NexxIE from ..compat import compat_urlparse from ..utils import ( - NO_DEFAULT, - smuggle_url, + ExtractorError, + extract_attributes, ) +from .dplay import DPlayIE -class Tele5IE(InfoExtractor): + +class Tele5IE(DPlayIE): _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_COUNTRIES = ['DE'] _TESTS = [{ @@ -28,6 +25,7 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'No longer available: "404 Seite nicht gefunden"', }, { # jwplatform, nexx unavailable 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/', @@ -42,7 +40,20 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [JWPlatformIE.ie_key()], + 'skip': 'No longer available, redirects to Filme page', + }, { + 'url': 'https://tele5.de/mediathek/angel-of-mine/', + 'info_dict': { + 'id': '1252360', + 'ext': 'mp4', + 'upload_date': '20220109', + 'timestamp': 1641762000, + 'title': 'Angel of Mine', + 'description': 'md5:a72546a175e1286eb3251843a52d1ad7', + }, + 'params': { + 'format': 'bestvideo', + }, }, { 'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', 'only_matching': True, @@ -64,45 +75,18 @@ class Tele5IE(InfoExtractor): }] def _real_extract(self, url): - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - - NEXX_ID_RE = r'\d{6,}' - JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' - - def nexx_result(nexx_id): - return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, - ie=NexxIE.ie_key(), video_id=nexx_id) - - nexx_id = jwplatform_id = None - - if video_id: - if re.match(NEXX_ID_RE, video_id): - return nexx_result(video_id) - elif re.match(JWPLATFORM_ID_RE, video_id): - jwplatform_id = video_id - - if not nexx_id: - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def extract_id(pattern, name, default=NO_DEFAULT): - return self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, - r'\s+id\s*=\s*["\']player_(%s)' % pattern, - r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, - default=default) - - nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) - if nexx_id: - return nexx_result(nexx_id) - - if not jwplatform_id: - jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') - - return self.url_result( - smuggle_url( - 'jwplatform:%s' % jwplatform_id, - {'geo_countries': self._GEO_COUNTRIES}), - ie=JWPlatformIE.ie_key(), video_id=jwplatform_id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player_element = self._search_regex(r'(]+?>)', webpage, 'video player') + player_info = extract_attributes(player_element) + asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', )) + endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname + source_type = player_info.get('sourcetype') + if source_type: + endpoint = '%s-%s' % (source_type, endpoint) + try: + return self._get_disco_api_info(url, asset_id, endpoint, realm, country) + except ExtractorError as e: + if getattr(e, 'message', '') == 'Missing deviceId in context': + raise ExtractorError('DRM protected', cause=e, expected=True) + raise diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 2dc020537..5174898f2 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -34,7 +34,9 @@ class TelegraafIE(InfoExtractor): article_id = self._match_id(url) video_id = self._download_json( - 'https://www.telegraaf.nl/graphql', article_id, query={ + 'https://app.telegraaf.nl/graphql', article_id, + headers={'User-Agent': 'De Telegraaf/6.8.11 (Android 11; en_US)'}, + query={ 'query': '''{ article(uid: %s) { videos { diff --git a/youtube_dl/extractor/thisvid.py b/youtube_dl/extractor/thisvid.py new file mode 100644 index 000000000..bc4bcb2d1 --- /dev/null +++ b/youtube_dl/extractor/thisvid.py @@ -0,0 +1,218 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, +) +from ..utils import ( + clean_html, + get_element_by_class, + int_or_none, + merge_dicts, + url_or_none, + urljoin, +) + + +class ThisVidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?Pvideos|embed)/(?P[A-Za-z0-9-]+)' + _TESTS = [{ + 'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/', + 'md5': '839becb572995687e11a69dc4358a386', + 'info_dict': { + 'id': '3533241', + 'ext': 'mp4', + 'title': 'Sitting on ball tight jeans', + 'description': 'md5:372353bb995883d1b65fddf507489acd', + 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg', + 'uploader_id': '150629', + 'uploader': 'jeanslevisjeans', + 'age_limit': 18, + } + }, { + 'url': 'https://thisvid.com/embed/3533241/', + 'md5': '839becb572995687e11a69dc4358a386', + 'info_dict': { + 'id': '3533241', + 'ext': 'mp4', + 'title': 'Sitting on ball tight jeans', + 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg', + 'uploader_id': '150629', + 'uploader': 'jeanslevisjeans', + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type') + webpage = self._download_webpage(url, main_id) + + title = self._html_search_regex( + r']*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?', + webpage, 'title') + + if type_ == 'embed': + # look for more metadata + video_alt_url = url_or_none(self._search_regex( + r'''video_alt_url\s*:\s+'(%s/)',''' % (self._VALID_URL, ), + webpage, 'video_alt_url', default=None)) + if video_alt_url and video_alt_url != url: + webpage = self._download_webpage( + video_alt_url, main_id, + note='Redirecting embed to main page', fatal=False) or webpage + + video_holder = get_element_by_class('video-holder', webpage) or '' + if '>This video is a private video' in video_holder: + self.raise_login_required( + (clean_html(video_holder) or 'Private video').split('\n', 1)[0]) + + uploader = self._html_search_regex( + r'''(?s)]*>Added by:\s*]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*''', + webpage, 'uploader', default='') + uploader = re.split(r'''/["'][^>]*>\s*''', uploader) + if len(uploader) == 2: + # id must be non-empty, uploader could be '' + uploader_id, uploader = uploader + uploader = uploader or None + else: + uploader_id = uploader = None + + return merge_dicts({ + '_type': 'url_transparent', + 'title': title, + 'age_limit': 18, + 'uploader': uploader, + 'uploader_id': uploader_id, + }, self.url_result(url, ie='Generic')) + + +class ThisVidMemberIE(InfoExtractor): + _VALID_URL = r'https?://thisvid\.com/members/(?P\d+)' + _TESTS = [{ + 'url': 'https://thisvid.com/members/2140501/', + 'info_dict': { + 'id': '2140501', + 'title': 'Rafflesia\'s Profile', + }, + 'playlist_mincount': 16, + }, { + 'url': 'https://thisvid.com/members/2140501/favourite_videos/', + 'info_dict': { + 'id': '2140501', + 'title': 'Rafflesia\'s Favourite Videos', + }, + 'playlist_mincount': 15, + }, { + 'url': 'https://thisvid.com/members/636468/public_videos/', + 'info_dict': { + 'id': '636468', + 'title': 'Happymouth\'s Public Videos', + }, + 'playlist_mincount': 196, + }, + ] + + def _urls(self, html): + for m in re.finditer(r''']+\bhref\s*=\s*["'](?P%s\b)[^>]+>''' % (ThisVidIE._VALID_URL, ), html): + yield m.group('url') + + def _real_extract(self, url): + pl_id = self._match_id(url) + webpage = self._download_webpage(url, pl_id) + + title = re.split( + r'(?i)\s*\|\s*ThisVid\.com\s*$', + self._og_search_title(webpage, default=None) or self._html_search_regex(r'(?s)]*>(.+?)]+\bhref\s*=\s*("|')(?P(?!#)(?:(?!\1).)+)''', + next_page, 'next page link', group='url', default=None)) + # in case a member page should have pagination-next with empty link, not just `else:` + if next_page is None: + # playlist page + parsed_url = compat_urlparse.urlparse(page_url) + base_path, num = parsed_url.path.rsplit('/', 1) + num = int_or_none(num) + if num is None: + base_path, num = parsed_url.path.rstrip('/'), 1 + parsed_url = parsed_url._replace(path=base_path + ('/%d' % (num + 1, ))) + next_page = compat_urlparse.urlunparse(parsed_url) + if page_url == next_page: + next_page = None + if not next_page: + break + page_url, html = next_page, None + + return self.playlist_from_matches( + entries(url, webpage), playlist_id=pl_id, playlist_title=title, ie='ThisVid') + + +class ThisVidPlaylistIE(ThisVidMemberIE): + _VALID_URL = r'https?://thisvid\.com/playlist/(?P\d+)/video/(?P[A-Za-z0-9-]+)' + _TESTS = [{ + 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/', + 'info_dict': { + 'id': '6615', + 'title': 'Underwear Stuff', + }, + 'playlist_mincount': 200, + }, { + 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/', + 'info_dict': { + 'id': '1072387', + 'ext': 'mp4', + 'title': 'Big Italian Booty 28', + 'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2', + 'uploader_id': '367912', + 'uploader': 'Jcmusclefun', + 'age_limit': 18, + }, + 'params': { + 'noplaylist': True, + }, + }] + + def _get_video_url(self, pl_url): + video_id = re.match(self._VALID_URL, pl_url).group('video_id') + return urljoin(pl_url, '/videos/%s/' % (video_id, )) + + def _urls(self, html): + for m in re.finditer(r''']+\bhref\s*=\s*["'](?P%s\b)[^>]+>''' % (self._VALID_URL, ), html): + yield self._get_video_url(m.group('url')) + + def _real_extract(self, url): + pl_id = self._match_id(url) + + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just the featured video because of --no-playlist') + return self.url_result(self._get_video_url(url), 'ThisVid') + + self.to_screen( + 'Downloading playlist %s - add --no-playlist to download just the featured video' % (pl_id, )) + result = super(ThisVidPlaylistIE, self)._real_extract(url) + + # rework title returned as `the title - the title` + title = result['title'] + t_len = len(title) + if t_len > 5 and t_len % 2 != 0: + t_len = t_len // 2 + if title[t_len] == '-': + title = [t.strip() for t in (title[:t_len], title[t_len + 1:])] + if title[0] and title[0] == title[1]: + result['title'] = title[0] + return result diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 8bd5fd640..ec5cbdf03 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -41,8 +41,16 @@ class TV2DKIE(InfoExtractor): 'duration': 1347, 'view_count': int, }, - 'params': { - 'skip_download': True, + 'add_ie': ['Kaltura'], + }, { + 'url': 'https://www.tv2lorry.dk/gadekamp/gadekamp-6-hoejhuse-i-koebenhavn', + 'info_dict': { + 'id': '1_7iwll9n0', + 'ext': 'mp4', + 'upload_date': '20211027', + 'title': 'Gadekamp #6 - Højhuse i København', + 'uploader_id': 'tv2lorry', + 'timestamp': 1635345229, }, 'add_ie': ['Kaltura'], }, { @@ -91,11 +99,14 @@ class TV2DKIE(InfoExtractor): add_entry(partner_id, kaltura_id) if not entries: kaltura_id = self._search_regex( - r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + (r'entry_id\s*:\s*["\']([0-9a-z_]+)', + r'\\u002FentryId\\u002F(\w+)\\u002F'), webpage, 'kaltura id') partner_id = self._search_regex( (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, 'partner id') add_entry(partner_id, kaltura_id) + if len(entries) == 1: + return entries[0] return self.playlist_result(entries) diff --git a/youtube_dl/extractor/uktvplay.py b/youtube_dl/extractor/uktvplay.py index f28fd514d..9ef9638cd 100644 --- a/youtube_dl/extractor/uktvplay.py +++ b/youtube_dl/extractor/uktvplay.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class UKTVPlayIE(InfoExtractor): - _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P\d+)' + _VALID_URL = r'https?://uktvplay\.(?:uktv\.)?co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P\d+)' _TESTS = [{ 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', 'info_dict': { diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py index 628adf219..59f8e5dc3 100644 --- a/youtube_dl/extractor/uol.py +++ b/youtube_dl/extractor/uol.py @@ -95,7 +95,6 @@ class UOLIE(InfoExtractor): if v: query[k] = v f_url = update_url_query(f_url, query) - format_id = format_id if format_id == 'HLS': m3u8_formats = self._extract_m3u8_formats( f_url, media_id, 'mp4', 'm3u8_native', diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index d6c79147e..abd2bee84 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( dict_get, + ExtractorError, int_or_none, + ISO639Utils, + parse_age_limit, + try_get, unified_timestamp, ) @@ -23,9 +27,10 @@ class URPlayIE(InfoExtractor): 'upload_date': '20171214', 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', 'duration': 2269, - 'categories': ['Kultur & historia'], + 'categories': ['Vetenskap & teknik'], 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'], 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', + 'age_limit': 15, }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -50,11 +55,19 @@ class URPlayIE(InfoExtractor): video_id = self._match_id(url) url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - vid = int(video_id) - accessible_episodes = self._parse_json(self._html_search_regex( - r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['accessibleEpisodes'] - urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) + urplayer_data = self._search_regex( + r'(?s)\bid\s*=\s*"__NEXT_DATA__"[^>]*>\s*({.+?})\s*]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'] + urplayer_data = next(e for e in accessible_episodes if e.get('id') == int_or_none(video_id)) episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( @@ -72,6 +85,30 @@ class URPlayIE(InfoExtractor): video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) + subtitles = {} + + def parse_lang_code(code): + "3-character language code or None (utils candidate)" + if code is None: + return + lang = code.lower() + if not ISO639Utils.long2short(lang): + lang = ISO639Utils.short2long(lang) + return lang or None + + for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items(): + if (k in ('sd', 'hd') or not isinstance(v, dict)): + continue + lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) + if not sttl_url: + continue + lang = parse_lang_code(lang) + if not lang: + continue + sttl = subtitles.get(lang) or [] + sttl.append({'ext': k, 'url': sttl_url, }) + subtitles[lang] = sttl + image = urplayer_data.get('image') or {} thumbnails = [] for k, v in image.items(): @@ -104,4 +141,7 @@ class URPlayIE(InfoExtractor): 'season': series.get('label'), 'episode': episode, 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), + 'age_limit': parse_age_limit(min(try_get(a, lambda x: x['from'], int) or 0 + for a in urplayer_data.get('ageRanges', []))), + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index ab2c15cde..4589e78a1 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -12,6 +12,7 @@ from ..utils import ( mimetype2ext, parse_codecs, update_url_query, + urljoin, xpath_element, xpath_text, ) @@ -19,6 +20,7 @@ from ..compat import ( compat_b64decode, compat_ord, compat_struct_pack, + compat_urlparse, ) @@ -45,10 +47,24 @@ class VideaIE(InfoExtractor): }, }, { 'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', - 'only_matching': True, + 'md5': 'd57ccd8812c7fd491d33b1eab8c99975', + 'info_dict': { + 'id': 'jAHDWfWSJH5XuFhH', + 'ext': 'mp4', + 'title': 'Supercars előzés', + 'thumbnail': r're:^https?://.*', + 'duration': 64, + }, }, { 'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ', - 'only_matching': True, + 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', + 'info_dict': { + 'id': '8YfIAjxwWGwT8HVQ', + 'ext': 'mp4', + 'title': 'Az őrült kígyász 285 kígyót enged szabadon', + 'thumbnail': r're:^https?://.*', + 'duration': 21, + }, }, { 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', 'only_matching': True, @@ -91,13 +107,20 @@ class VideaIE(InfoExtractor): k = S[(S[i] + S[j]) % 256] res += compat_struct_pack('B', k ^ compat_ord(cipher_text[m])) - return res.decode() + return res.decode('utf-8') def _real_extract(self, url): video_id = self._match_id(url) - query = {'v': video_id} - player_page = self._download_webpage( - 'https://videa.hu/player', video_id, query=query) + video_page = self._download_webpage(url, video_id) + + if 'videa.hu/player' in url: + player_url = url + player_page = video_page + else: + player_url = self._search_regex( + r'[0-9]+v)' % VikiBaseIE._VALID_URL_BASE _TESTS = [{ + 'note': 'Free non-DRM video with storyboards in MPD', + 'url': 'https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1', + 'info_dict': { + 'id': '1175236v', + 'ext': 'mp4', + 'title': 'Choosing Spouse by Lottery - Episode 1', + 'timestamp': 1606463239, + 'age_limit': 12, + 'uploader': 'FCC', + 'upload_date': '20201127', + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], + 'params': { + 'format': 'bestvideo', + }, + }, { 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { 'id': '1023585v', @@ -146,7 +159,7 @@ class VikiIE(VikiBaseIE): 'params': { 'format': 'bestvideo', }, - 'skip': 'Blocked in the US', + 'skip': 'Content is only available to Viki Pass Plus subscribers', 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # clip @@ -178,11 +191,11 @@ class VikiIE(VikiBaseIE): 'like_count': int, 'age_limit': 13, }, - 'skip': 'Blocked in the US', + 'skip': 'Page not found!', }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '0a53dc252e6e690feccd756861495a8c', + 'md5': '670440c79f7109ca6564d4c7f24e3e81', 'info_dict': { 'id': '44699v', 'ext': 'mp4', @@ -193,7 +206,7 @@ class VikiIE(VikiBaseIE): 'upload_date': '20100405', 'uploader': 'group8', 'like_count': int, - 'age_limit': 13, + 'age_limit': 15, 'episode_number': 1, }, 'params': { @@ -224,7 +237,7 @@ class VikiIE(VikiBaseIE): }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '41faaba0de90483fb4848952af7c7d0d', + 'md5': '78bf49fdaa51f9e7f9150262a9ef9bdf', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -232,8 +245,8 @@ class VikiIE(VikiBaseIE): 'upload_date': '20111122', 'timestamp': 1321985454, 'description': 'md5:44b1e46619df3a072294645c770cef36', - 'title': 'Love In Magic', - 'age_limit': 13, + 'title': 'Love in Magic', + 'age_limit': 15, }, 'params': { 'format': 'bestvideo', @@ -244,45 +257,53 @@ class VikiIE(VikiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - resp = self._download_json( - 'https://www.viki.com/api/videos/' + video_id, - video_id, 'Downloading video JSON', headers={ - 'x-client-user-agent': std_headers['User-Agent'], - 'x-viki-app-ver': '3.0.0', - }) - video = resp['video'] + video = self._call_api('videos/{0}.json'.format(video_id), video_id, 'Downloading video JSON', query={}) self._check_errors(video) - title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) + title = try_get(video, lambda x: x['titles']['en'], str) episode_number = int_or_none(video.get('number')) if not title: title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} container_title = self.dict_selection(container_titles, 'en') - title = '%s - %s' % (container_title, title) + if container_title and title == video_id: + title = container_title + else: + title = '%s - %s' % (container_title, title) + + resp = self._call_api( + 'playback_streams/%s.json?drms=dt3&device_id=%s' % (video_id, self._DEVICE_ID), + video_id, 'Downloading video streams JSON')['main'][0] + + mpd_url = resp['url'] + # 720p is hidden in another MPD which can be found in the current manifest content + mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest') + mpd_url = self._search_regex( + r'(?mi)(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url) + if 'mpdhd_high' not in mpd_url: + # Modify the URL to get 1080p + mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high') + formats = self._extract_mpd_formats(mpd_url, video_id) + self._sort_formats(formats) description = self.dict_selection(video.get('descriptions', {}), 'en') - + thumbnails = [{ + 'id': thumbnail_id, + 'url': thumbnail['url'], + } for thumbnail_id, thumbnail in (video.get('images') or {}).items() if thumbnail.get('url')] like_count = int_or_none(try_get(video, lambda x: x['likes']['count'])) - thumbnails = [] - for thumbnail_id, thumbnail in (video.get('images') or {}).items(): - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail.get('url'), - }) + stream_id = try_get(resp, lambda x: x['properties']['track']['stream_id']) + subtitles = dict((lang, [{ + 'ext': ext, + 'url': self._API_URL_TEMPLATE % self._api_query( + 'videos/{0}/auth_subtitles/{1}.{2}'.format(video_id, lang, ext), stream_id=stream_id) + } for ext in ('srt', 'vtt')]) for lang in (video.get('subtitle_completions') or {}).keys()) - subtitles = {} - for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items(): - subtitles[subtitle_lang] = [{ - 'ext': subtitles_format, - 'url': self._prepare_call( - 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), - } for subtitles_format in ('srt', 'vtt')] - - result = { + return { 'id': video_id, + 'formats': formats, 'title': title, 'description': description, 'duration': int_or_none(video.get('duration')), @@ -296,79 +317,6 @@ class VikiIE(VikiBaseIE): 'episode_number': episode_number, } - formats = [] - - def add_format(format_id, format_dict, protocol='http'): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - return - format_url = format_dict.get('url') - if not format_url: - return - qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) - stream = qs.get('stream', [None])[0] - if stream: - format_url = base64.b64decode(stream).decode() - if format_id in ('m3u8', 'hls'): - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if '_drm/index_' in f['url']: - continue - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.append(f) - elif format_id in ('mpd', 'dash'): - formats.extend(self._extract_mpd_formats( - format_url, video_id, 'mpd-%s' % protocol, fatal=False)) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?Prtmp://[^/]+/(?P.+?))/(?Pmp4:.+)$', - format_url) - if not mobj: - return - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - }) - else: - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)), - }) - - for format_id, format_dict in (resp.get('streams') or {}).items(): - add_format(format_id, format_dict) - if not formats: - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result - - for format_id, stream_dict in streams.items(): - for protocol, format_dict in stream_dict.items(): - add_format(format_id, format_dict, protocol) - self._sort_formats(formats) - - result['formats'] = formats - return result - class VikiChannelIE(VikiBaseIE): IE_NAME = 'viki:channel' @@ -378,9 +326,9 @@ class VikiChannelIE(VikiBaseIE): 'info_dict': { 'id': '50c', 'title': 'Boys Over Flowers', - 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', + 'description': 'md5:f08b679c200e1a273c695fe9986f21d7', }, - 'playlist_mincount': 71, + 'playlist_mincount': 51, }, { 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', 'info_dict': { @@ -401,33 +349,38 @@ class VikiChannelIE(VikiBaseIE): 'only_matching': True, }] - _PER_PAGE = 25 + _video_types = ('episodes', 'movies', 'clips', 'trailers') + + def _entries(self, channel_id): + params = { + 'app': self._APP, 'token': self._token, 'only_ids': 'true', + 'direction': 'asc', 'sort': 'number', 'per_page': 30 + } + video_types = self._video_types + for video_type in video_types: + if video_type not in self._video_types: + self.report_warning('Unknown video_type: ' + video_type) + page_num = 0 + while True: + page_num += 1 + params['page'] = page_num + res = self._call_api( + 'containers/{channel_id}/{video_type}.json'.format(**locals()), channel_id, query=params, fatal=False, + note='Downloading %s JSON page %d' % (video_type.title(), page_num)) + + for video_id in res.get('response') or []: + yield self.url_result('https://www.viki.com/videos/' + video_id, VikiIE.ie_key(), video_id) + if not res.get('more'): + break def _real_extract(self, url): channel_id = self._match_id(url) - channel = self._call_api( - 'containers/%s.json' % channel_id, channel_id, - 'Downloading channel JSON') + channel = self._call_api('containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON') self._check_errors(channel) - title = self.dict_selection(channel['titles'], 'en') - - description = self.dict_selection(channel['descriptions'], 'en') - - entries = [] - for video_type in ('episodes', 'clips', 'movies'): - for page_num in itertools.count(1): - page = self._call_api( - 'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d' - % (channel_id, video_type, self._PER_PAGE, page_num), channel_id, - 'Downloading %s JSON page #%d' % (video_type, page_num)) - for video in page['response']: - video_id = video['id'] - entries.append(self.url_result( - 'https://www.viki.com/videos/%s' % video_id, 'Viki')) - if not page['pagination']['next']: - break - - return self.playlist_result(entries, channel_id, title, description) + return self.playlist_result( + self._entries(channel_id), channel_id, + self.dict_selection(channel['titles'], 'en'), + self.dict_selection(channel['descriptions'], 'en')) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 0b386f450..7f2731d83 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -261,27 +261,33 @@ class VimeoIE(VimeoBaseInfoExtractor): # _VALID_URL matches Vimeo URLs _VALID_URL = r'''(?x) - https?:// - (?: - (?: - www| - player - ) - \. - )? - vimeo(?:pro)?\.com/ - (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:.*?/)? - (?: - (?: - play_redirect_hls| - moogaloop\.swf)\?clip_id= - )? - (?:videos?/)? - (?P[0-9]+) - (?:/(?P[\da-f]{10}))? - /?(?:[?&].*)?(?:[#].*)?$ - ''' + https?:// + (?: + (?: + www| + player + ) + \. + )? + vimeo(?:pro)?\.com/ + (?: + (?Puser)| + (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) + (?:.*?/)?? + (?P + (?: + play_redirect_hls| + moogaloop\.swf)\?clip_id= + )? + (?:videos?/)? + ) + (?P[0-9]+) + (?(u) + /(?!videos|likes)[^/?#]+/?| + (?(q)|/(?P[\da-f]{10}))? + ) + (?:(?(q)[&]|(?(u)|/?)[?]).*?)?(?:[#].*)?$ + ''' IE_NAME = 'vimeo' _TESTS = [ { @@ -517,15 +523,34 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/7809605', 'only_matching': True, }, - { - 'url': 'https://vimeo.com/160743502/abd0e13fb4', - 'only_matching': True, - }, { # requires passing unlisted_hash(a52724358e) to load_download_config request 'url': 'https://vimeo.com/392479337/a52724358e', 'only_matching': True, - } + }, + { + # similar, but all numeric: ID must be 581039021, not 9603038895 + # issue #29690 + 'url': 'https://vimeo.com/581039021/9603038895', + 'info_dict': { + 'id': '581039021', + # these have to be provided but we don't care + 'ext': 'mp4', + 'timestamp': 1627621014, + 'title': 're:.+', + 'uploader_id': 're:.+', + 'uploader': 're:.+', + 'upload_date': r're:\d+', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # user playlist alias -> https://vimeo.com/258705797 + 'url': 'https://vimeo.com/user26785108/newspiritualguide', + 'only_matching': True, + }, # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header ] @@ -649,7 +674,7 @@ class VimeoIE(VimeoBaseInfoExtractor): if '//player.vimeo.com/video/' in url: config = self._parse_json(self._search_regex( - r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) + r'(?s)\b(?:playerC|c)onfig\s*=\s*({.+?})\s*[;\n]', webpage, 'info section'), video_id) if config.get('view') == 4: config = self._verify_player_video_password( redirect_url, video_id, headers) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index bc196f8a0..6a0d4e8f0 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -64,6 +64,18 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # video_type == 'video/dash' + 'url': 'https://www.vvvvid.it/show/683/made-in-abyss/1542/693786/nanachi', + 'info_dict': { + 'id': '693786', + 'ext': 'mp4', + 'title': 'Nanachi', + }, + 'params': { + 'skip_download': True, + 'format': 'mp4', + }, }, { 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', 'only_matching': True @@ -205,6 +217,9 @@ class VVVVIDIE(InfoExtractor): }) is_youtube = True break + elif video_type == 'video/dash': + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index f1bccc2d6..b15e03768 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -57,7 +57,7 @@ class WatIE(InfoExtractor): # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, - video_id, query={'context': 'MYTF1'}) + video_id, query={'context': 'MYTF1', 'pver': '4001000'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 2903d189e..10db73148 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -10,6 +10,7 @@ from ..compat import ( ) from ..utils import ( determine_ext, + dict_get, ExtractorError, js_to_json, strip_jsonp, @@ -22,9 +23,10 @@ from ..utils import ( class WDRIE(InfoExtractor): - _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P\d+)\.js' + __API_URL_TPL = '//deviceids-medp.wdr.de/ondemand/%s/%s' + _VALID_URL = (r'(?:https?:' + __API_URL_TPL) % (r'\d+', r'(?=\d+\.js)|wdr:)(?P\d{6,})') _GEO_COUNTRIES = ['DE'] - _TEST = { + _TESTS = [{ 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', 'info_dict': { 'id': 'mdb-1557833', @@ -32,11 +34,20 @@ class WDRIE(InfoExtractor): 'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe', 'upload_date': '20180112', }, - } + }, + ] + + def _asset_url(self, wdr_id): + id_len = max(len(wdr_id), 5) + return ''.join(('https:', self.__API_URL_TPL % (wdr_id[:id_len - 4], wdr_id, ), '.js')) def _real_extract(self, url): video_id = self._match_id(url) + if url.startswith('wdr:'): + video_id = url[4:] + url = self._asset_url(video_id) + metadata = self._download_json( url, video_id, transform_source=strip_jsonp) @@ -115,10 +126,10 @@ class WDRIE(InfoExtractor): } -class WDRPageIE(InfoExtractor): - _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' +class WDRPageIE(WDRIE): + _MAUS_REGEX = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/)*?(?P[^/?#.]+)(?:/?|/index\.php5|\.php5)$' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P[^/]+)\.html' - _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _MAUS_REGEX _TESTS = [ { @@ -159,11 +170,11 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { - 'id': 'mdb-1406149', + 'id': 'mdb-2296252', 'ext': 'mp4', - 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': r're:^WDR Fernsehen im Livestream (?:\(nur in Deutschland erreichbar\) )?[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'alt_title': 'WDR Fernsehen Live', - 'upload_date': '20150101', + 'upload_date': '20201112', 'is_live': True, }, 'params': { @@ -172,7 +183,7 @@ class WDRPageIE(InfoExtractor): }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 7, + 'playlist_mincount': 6, 'info_dict': { 'id': 'aktuelle-stunde-120', }, @@ -180,10 +191,10 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { - 'id': 'mdb-1552552', + 'id': 'mdb-2627637', 'ext': 'mp4', 'upload_date': 're:^[0-9]{8}$', - 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', + 'title': 're:^Die Sendung (?:mit der Maus )?vom [0-9.]{10}$', }, 'skip': 'The id changes from week to week because of the new episode' }, @@ -196,6 +207,7 @@ class WDRPageIE(InfoExtractor): 'upload_date': '20130919', 'title': 'Sachgeschichte - Achterbahn ', }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', @@ -221,6 +233,7 @@ class WDRPageIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', @@ -234,7 +247,7 @@ class WDRPageIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + display_id = dict_get(mobj.groupdict(), ('display_id', 'maus_id'), 'wdrmaus') webpage = self._download_webpage(url, display_id) entries = [] @@ -260,6 +273,14 @@ class WDRPageIE(InfoExtractor): jsonp_url = try_get( media_link_obj, lambda x: x['mediaObj']['url'], compat_str) if jsonp_url: + # metadata, or player JS with ['ref'] giving WDR id, or just media, perhaps + clip_id = media_link_obj['mediaObj'].get('ref') + if jsonp_url.endswith('.assetjsonp'): + asset = self._download_json( + jsonp_url, display_id, fatal=False, transform_source=strip_jsonp) + clip_id = try_get(asset, lambda x: x['trackerData']['trackerClipId'], compat_str) + if clip_id: + jsonp_url = self._asset_url(clip_id[4:]) entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key())) # Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html) @@ -279,16 +300,14 @@ class WDRPageIE(InfoExtractor): class WDRElefantIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P.+)' _TEST = { - 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'url': 'http://www.wdrmaus.de/elefantenseite/#elefantenkino_wippe', + # adaptive stream: unstable file MD5 'info_dict': { - 'title': 'Folge Oster-Spezial 2015', - 'id': 'mdb-1088195', + 'title': 'Wippe', + 'id': 'mdb-1198320', 'ext': 'mp4', 'age_limit': None, - 'upload_date': '20150406' - }, - 'params': { - 'skip_download': True, + 'upload_date': '20071003' }, } @@ -323,6 +342,7 @@ class WDRMobileIE(InfoExtractor): /[0-9]+/[0-9]+/ (?P[0-9]+)_(?P[0-9]+)''' IE_NAME = 'wdr:mobile' + _WORKING = False # no such domain _TEST = { 'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4', 'info_dict': { diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index f73b9778f..e17947fc6 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import itertools @@ -23,7 +24,7 @@ from ..utils import ( class XHamsterIE(InfoExtractor): - _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com)' + _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)' _VALID_URL = r'''(?x) https?:// (?:.+?\.)?%s/ @@ -34,7 +35,7 @@ class XHamsterIE(InfoExtractor): ''' % _DOMAINS _TESTS = [{ 'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', - 'md5': '98b4687efb1ffd331c4197854dc09e8f', + 'md5': '34e1ab926db5dc2750fed9e1f34304bb', 'info_dict': { 'id': '1509445', 'display_id': 'femaleagent-shy-beauty-takes-the-bait', @@ -43,6 +44,7 @@ class XHamsterIE(InfoExtractor): 'timestamp': 1350194821, 'upload_date': '20121014', 'uploader': 'Ruseful2011', + 'uploader_id': 'ruseful2011', 'duration': 893, 'age_limit': 18, }, @@ -72,6 +74,7 @@ class XHamsterIE(InfoExtractor): 'timestamp': 1454948101, 'upload_date': '20160208', 'uploader': 'parejafree', + 'uploader_id': 'parejafree', 'duration': 72, 'age_limit': 18, }, @@ -117,6 +120,12 @@ class XHamsterIE(InfoExtractor): }, { 'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx', 'only_matching': True, + }, { + 'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf', + 'only_matching': True, + }, { + 'url': 'https://xhvid.com/videos/lk-mm-xhc6wn6', + 'only_matching': True, }] def _real_extract(self, url): @@ -245,6 +254,7 @@ class XHamsterIE(InfoExtractor): else: categories = None + uploader_url = url_or_none(try_get(video, lambda x: x['author']['pageURL'])) return { 'id': video_id, 'display_id': display_id, @@ -253,6 +263,8 @@ class XHamsterIE(InfoExtractor): 'timestamp': int_or_none(video.get('created')), 'uploader': try_get( video, lambda x: x['author']['name'], compat_str), + 'uploader_url': uploader_url, + 'uploader_id': uploader_url.split('/')[-1] if uploader_url else None, 'thumbnail': video.get('thumbURL'), 'duration': int_or_none(video.get('duration')), 'view_count': int_or_none(video.get('views')), @@ -261,7 +273,7 @@ class XHamsterIE(InfoExtractor): 'dislike_count': int_or_none(try_get( video, lambda x: x['rating']['dislikes'], int)), 'comment_count': int_or_none(video.get('views')), - 'age_limit': age_limit, + 'age_limit': age_limit if age_limit is not None else 18, 'categories': categories, 'formats': formats, } @@ -352,6 +364,7 @@ class XHamsterIE(InfoExtractor): 'description': description, 'upload_date': upload_date, 'uploader': uploader, + 'uploader_id': uploader.lower() if uploader else None, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, @@ -420,6 +433,12 @@ class XHamsterUserIE(InfoExtractor): 'id': 'firatkaan', }, 'playlist_mincount': 1, + }, { + 'url': 'https://xhday.com/users/mobhunter', + 'only_matching': True, + }, { + 'url': 'https://xhvid.com/users/pelushe21', + 'only_matching': True, }] def _entries(self, user_id): diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 8fc64914c..e63d4690d 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -82,7 +82,7 @@ class XVideosIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'https://www.xvideos.com/video%s/' % video_id, video_id) + 'https://www.xvideos.com/video%s/0' % video_id, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 7084d3d12..31e8abb72 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -137,9 +138,10 @@ class YouPornIE(InfoExtractor): r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - [r'UPLOADED:\s*<span>([^<]+)', + (r'UPLOADED:\s*<span>([^<]+)', r'Date\s+[Aa]dded:\s*<span>([^<]+)', - r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], + r'''(?s)<div[^>]+class=["']videoInfo(?:Date|Time)\b[^>]*>(.+?)</div>''', + r'(?s)<label\b[^>]*>Uploaded[^<]*</label>\s*<span\b[^>]*>(.+?)</span>'), webpage, 'upload date', fatal=False)) age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index dc4bd4a77..ba0f5c8b6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -13,32 +13,39 @@ from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_chr, compat_HTTPError, - compat_parse_qs, + compat_map as map, compat_str, + compat_urllib_parse, + compat_urllib_parse_parse_qs as compat_parse_qs, compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, - compat_urlparse, ) from ..jsinterp import JSInterpreter from ..utils import ( ExtractorError, clean_html, dict_get, + error_to_compat_str, float_or_none, + extract_attributes, + get_element_by_attribute, int_or_none, + js_to_json, mimetype2ext, parse_codecs, parse_duration, + parse_qs, qualities, remove_start, smuggle_url, str_or_none, str_to_int, + traverse_obj, try_get, unescapeHTML, unified_strdate, unsmuggle_url, + update_url, update_url_query, url_or_none, urlencode_postdata, @@ -46,10 +53,6 @@ from ..utils import ( ) -def parse_qs(url): - return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - - class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' @@ -283,15 +286,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)' - def _call_api(self, ep, query, video_id, fatal=True): + def _call_api(self, ep, query, video_id, fatal=True, headers=None): data = self._DEFAULT_API_DATA.copy() data.update(query) + real_headers = {'content-type': 'application/json'} + if headers: + real_headers.update(headers) return self._download_json( 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id, note='Downloading API JSON', errnote='Unable to download API page', data=json.dumps(data).encode('utf8'), fatal=fatal, - headers={'content-type': 'application/json'}, + headers=real_headers, query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'}) def _extract_yt_initial_data(self, video_id, webpage): @@ -312,7 +318,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): title = try_get( renderer, (lambda x: x['title']['runs'][0]['text'], - lambda x: x['title']['simpleText']), compat_str) + lambda x: x['title']['simpleText'], + lambda x: x['headline']['simpleText']), compat_str) description = try_get( renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str) @@ -339,6 +346,60 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'uploader': uploader, } + def _search_results(self, query, params): + data = { + 'context': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20201021.03.00', + } + }, + 'query': query, + } + if params: + data['params'] = params + for page_num in itertools.count(1): + search = self._download_json( + 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + video_id='query "%s"' % query, + note='Downloading page %s' % page_num, + errnote='Unable to download API page', fatal=False, + data=json.dumps(data).encode('utf8'), + headers={'content-type': 'application/json'}) + if not search: + break + slr_contents = try_get( + search, + (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], + lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), + list) + if not slr_contents: + break + for slr_content in slr_contents: + isr_contents = try_get( + slr_content, + lambda x: x['itemSectionRenderer']['contents'], + list) + if not isr_contents: + continue + for content in isr_contents: + if not isinstance(content, dict): + continue + video = content.get('videoRenderer') + if not isinstance(video, dict): + continue + video_id = video.get('videoId') + if not video_id: + continue + yield self._extract_video(video) + token = try_get( + slr_contents, + lambda x: x[-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], + compat_str) + if not token: + break + data['continuation'] = token + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' @@ -415,6 +476,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ + |shorts/ |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! @@ -441,7 +503,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) - _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') + _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') _GEO_BYPASS = False @@ -456,6 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel': 'Philipp Hagemeister', 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', @@ -465,10 +528,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 10, 'view_count': int, 'like_count': int, - 'dislike_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg', 'start_time': 1, 'end_time': 9, - } + }, }, { 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', @@ -503,7 +566,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 10, 'view_count': int, 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -562,8 +624,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } }, - # Normal age-gate video (No vevo, embed allowed), available via embed page + # Age-gated videos { + 'note': 'Age-gated video (No vevo, embed allowed)', 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', 'info_dict': { 'id': 'HtVdAasjOgU', @@ -575,14 +638,98 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'WitcherGame', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', + 'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg', 'age_limit': 18, + 'categories': ['Gaming'], + 'tags': 'count:17', + 'channel': 'The Witcher', + 'channel_url': 'https://www.youtube.com/channel/UCzybXLxv08IApdjdN0mJhEg', + 'channel_id': 'UCzybXLxv08IApdjdN0mJhEg', + 'view_count': int, + 'like_count': int, }, }, { - # Age-gated video only available with authentication (unavailable - # via embed page workaround) + 'note': 'Age-gated video with embed allowed in public site', + 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U', + 'info_dict': { + 'id': 'HsUATh_Nc2U', + 'ext': 'mp4', + 'title': 'Godzilla 2 (Official Video)', + 'description': 'md5:bf77e03fcae5529475e500129b05668a', + 'duration': 177, + 'uploader': 'FlyingKitty', + 'uploader_id': 'FlyingKitty900', + 'upload_date': '20200408', + 'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg', + 'age_limit': 18, + 'categories': ['Entertainment'], + 'tags': ['Flyingkitty', 'godzilla 2'], + 'channel': 'FlyingKitty', + 'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg', + 'channel_id': 'UCYQT13AtrJC0gsM1far_zJg', + 'view_count': int, + 'like_count': int, + }, + }, + { + 'note': 'Age-gated video embeddable only with clientScreen=EMBED', + 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg', + 'info_dict': { + 'id': 'Tq92D6wQ1mg', + 'ext': 'mp4', + 'title': '[MMD] Adios - EVERGLOW [+Motion DL]', + 'description': 'md5:17eccca93a786d51bc67646756894066', + 'duration': 106, + 'uploader': 'Projekt Melody', + 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', + 'upload_date': '20191227', + 'age_limit': 18, + 'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg', + 'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'], + 'categories': ['Entertainment'], + 'channel': 'Projekt Melody', + 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', + 'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', + 'view_count': int, + 'like_count': int, + }, + }, + { + 'note': 'Non-Age-gated non-embeddable video', + 'url': 'https://youtube.com/watch?v=MeJVWBSsPAY', + 'info_dict': { + 'id': 'MeJVWBSsPAY', + 'ext': 'mp4', + 'title': 'OOMPH! - Such Mich Find Mich (Lyrics)', + 'description': 'Fan Video. Music & Lyrics by OOMPH!.', + 'duration': 210, + 'uploader': 'Herr Lurik', + 'uploader_id': 'st3in234', + 'upload_date': '20130730', + 'uploader_url': 'http://www.youtube.com/user/st3in234', + 'age_limit': 0, + 'thumbnail': 'https://i.ytimg.com/vi/MeJVWBSsPAY/hqdefault.jpg', + 'tags': ['oomph', 'such mich find mich', 'lyrics', 'german industrial', 'musica industrial'], + 'categories': ['Music'], + 'channel': 'Herr Lurik', + 'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA', + 'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA', + 'artist': 'OOMPH!', + 'view_count': int, + 'like_count': int, + }, + }, + { + 'note': 'Non-bypassable age-gated video', + 'url': 'https://youtube.com/watch?v=Cr381pDsSsA', + 'only_matching': True, + }, + { + 'note': 'Age-gated video only available with authentication (not via embed workaround)', 'url': 'XgnwCQzjau8', 'only_matching': True, + 'skip': '''This video has been removed for violating YouTube's Community Guidelines''', }, # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) # YouTube Red ad is not captured for creator @@ -611,17 +758,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'lqQg6PlCWgI', 'ext': 'mp4', + 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', + 'description': r're:(?s)(?:.+\s)?HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games\s*', 'duration': 6085, 'upload_date': '20150827', 'uploader_id': 'olympic', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympic', - 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', + 'uploader': r're:Olympics?', + 'age_limit': 0, + 'thumbnail': 'https://i.ytimg.com/vi/lqQg6PlCWgI/maxresdefault.jpg', + 'categories': ['Sports'], + 'tags': ['Hockey', '2012-07-31', '31 July 2012', 'Riverbank Arena', 'Session', 'Olympics', 'Olympic Games', 'London 2012', '2012 Summer Olympics', 'Summer Games'], + 'channel': 'Olympics', + 'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q', + 'channel_id': 'UCTl3QQTvqHFjurroKxexy2Q', + 'view_count': int, + 'like_count': int, }, - 'params': { - 'skip_download': 'requires avconv', - } }, # Non-square pixels { @@ -781,16 +934,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk - Position Music', + 'alt_title': 'Dark Walk', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', - 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'track': 'Dark Walk - Position Music', - 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'creator': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk', + 'artist': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan', 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', }, 'params': { @@ -1117,6 +1270,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # YT 'Shorts' + 'url': 'https://youtube.com/shorts/4L2J27mJ3Dc', + 'info_dict': { + 'id': '4L2J27mJ3Dc', + 'ext': 'mp4', + 'upload_date': '20211025', + 'uploader': 'Charlie Berens', + 'description': 'md5:976512b8a29269b93bbd8a61edc45a6d', + 'uploader_id': 'fivedlrmilkshake', + 'title': 'Midwest Squid Game #Shorts', + }, + 'params': { + 'skip_download': True, + }, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1226,11 +1395,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs - qs = parse_qs(url) - if qs.get('list', [None])[0]: + if parse_qs(url).get('list', [None])[0]: return False return super(YoutubeIE, cls).suitable(url) @@ -1253,6 +1418,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError('Cannot identify player %r' % player_url) return id_m.group('id') + def _get_player_code(self, video_id, player_url, player_id=None): + if not player_id: + player_id = self._extract_player_info(player_url) + + if player_id not in self._code_cache: + self._code_cache[player_id] = self._download_webpage( + player_url, video_id, + note='Downloading player ' + player_id, + errnote='Download of %s failed' % player_url) + return self._code_cache[player_id] + def _extract_signature_function(self, video_id, player_url, example_sig): player_id = self._extract_player_info(player_url) @@ -1265,12 +1441,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) - if player_id not in self._code_cache: - self._code_cache[player_id] = self._download_webpage( - player_url, video_id, - note='Downloading player ' + player_id, - errnote='Download of %s failed' % player_url) - code = self._code_cache[player_id] + code = self._get_player_code(video_id, player_url, player_id) res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) @@ -1323,10 +1494,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)', - r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)', - r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', - r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', + r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)', + r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', + r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', @@ -1349,11 +1520,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url is None: raise ExtractorError('Cannot decrypt signature without player_url') - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: @@ -1370,26 +1536,147 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) + def _extract_player_url(self, webpage): + player_url = self._search_regex( + r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', + webpage or '', 'player URL', fatal=False) + if not player_url: + return + if player_url.startswith('//'): + player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urllib_parse.urljoin( + 'https://www.youtube.com', player_url) + return player_url + + # from yt-dlp + # See also: + # 1. https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-894619419 + # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116 + # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377 + def _extract_n_function_name(self, jscode): + target = r'(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\[(?P<idx>\d+)\])?' + nfunc_and_idx = self._search_regex( + r'\.get\("n"\)\)&&\(b=(%s)\([\w$]+\)' % (target, ), + jscode, 'Initial JS player n function name') + nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx') + if not idx: + return nfunc + if int_or_none(idx) == 0: + real_nfunc = self._search_regex( + r'var %s\s*=\s*\[([a-zA-Z_$][\w$]*)\];' % (re.escape(nfunc), ), jscode, + 'Initial JS player n function alias ({nfunc}[{idx}])'.format(**locals())) + if real_nfunc: + return real_nfunc + return self._parse_json(self._search_regex( + r'var %s\s*=\s*(\[.+?\]);' % (re.escape(nfunc), ), jscode, + 'Initial JS player n function name ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)] + + def _extract_n_function(self, video_id, player_url): + player_id = self._extract_player_info(player_url) + func_code = self._downloader.cache.load('youtube-nsig', player_id) + + if func_code: + jsi = JSInterpreter(func_code) + else: + jscode = self._get_player_code(video_id, player_url, player_id) + funcname = self._extract_n_function_name(jscode) + jsi = JSInterpreter(jscode) + func_code = jsi.extract_function_code(funcname) + self._downloader.cache.store('youtube-nsig', player_id, func_code) + + if self._downloader.params.get('youtube_print_sig_code'): + self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format(player_id, func_code[1])) + + return lambda s: jsi.extract_function_from_code(*func_code)([s]) + + def _n_descramble(self, n_param, player_url, video_id): + """Compute the response to YT's "n" parameter challenge, + or None + + Args: + n_param -- challenge string that is the value of the + URL's "n" query parameter + player_url -- URL of YT player JS + video_id + """ + + sig_id = ('nsig_value', n_param) + if sig_id in self._player_cache: + return self._player_cache[sig_id] + + try: + player_id = ('nsig', player_url) + if player_id not in self._player_cache: + self._player_cache[player_id] = self._extract_n_function(video_id, player_url) + func = self._player_cache[player_id] + ret = func(n_param) + if ret.startswith('enhanced_except_'): + raise ExtractorError('Unhandled exception in decode') + self._player_cache[sig_id] = ret + if self._downloader.params.get('verbose', False): + self._downloader.to_screen('[debug] [%s] %s' % (self.IE_NAME, 'Decrypted nsig {0} => {1}'.format(n_param, self._player_cache[sig_id]))) + return self._player_cache[sig_id] + except Exception as e: + self._downloader.report_warning( + '[%s] %s (%s %s)' % ( + self.IE_NAME, + 'Unable to decode n-parameter: download likely to be throttled', + error_to_compat_str(e), + traceback.format_exc())) + + def _unthrottle_format_urls(self, video_id, player_url, formats): + for fmt in formats: + parsed_fmt_url = compat_urllib_parse.urlparse(fmt['url']) + n_param = compat_parse_qs(parsed_fmt_url.query).get('n') + if not n_param: + continue + n_param = n_param[-1] + n_response = self._n_descramble(n_param, player_url, video_id) + if n_response is None: + # give up if descrambling failed + break + fmt['url'] = update_url( + parsed_fmt_url, query_update={'n': [n_response]}) + + # from yt-dlp, with tweaks + def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): + """ + Extract signatureTimestamp (sts) + Required to tell API what sig/player version is in use. + """ + sts = int_or_none(ytcfg.get('STS')) if isinstance(ytcfg, dict) else None + if not sts: + # Attempt to extract from player + if player_url is None: + error_msg = 'Cannot extract signature timestamp without player_url.' + if fatal: + raise ExtractorError(error_msg) + self._downloader.report_warning(error_msg) + return + code = self._get_player_code(video_id, player_url) + sts = int_or_none(self._search_regex( + r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code or '', + 'JS player signature timestamp', group='sts', fatal=fatal)) + return sts + def _mark_watched(self, video_id, player_response): playback_url = url_or_none(try_get( player_response, lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) if not playback_url: return - parsed_playback_url = compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) # cpn generation algorithm is reverse engineered from base.js. # In fact it works even with dummy cpn. CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + playback_url = update_url( + playback_url, query_update={ + 'ver': ['2'], + 'cpn': [cpn], + }) self._download_webpage( playback_url, video_id, 'Marking watched', @@ -1494,6 +1781,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) player_response = None + player_url = None if webpage: player_response = self._extract_yt_initial_variable( webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, @@ -1502,27 +1790,61 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response = self._call_api( 'player', {'videoId': video_id}, video_id) - playability_status = player_response.get('playabilityStatus') or {} - if playability_status.get('reason') == 'Sign in to confirm your age': - video_info = self._download_webpage( - base_url + 'get_video_info', video_id, - 'Refetching age-gated info webpage', - 'unable to download video info webpage', query={ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'html5': 1, - # See https://github.com/ytdl-org/youtube-dl/issues/29333#issuecomment-864049544 - 'c': 'TVHTML5', - 'cver': '6.20180913', - }, fatal=False) - if video_info: - pr = self._parse_json( - try_get( - compat_parse_qs(video_info), - lambda x: x['player_response'][0], compat_str) or '{}', - video_id, fatal=False) - if pr and isinstance(pr, dict): - player_response = pr + def is_agegated(playability): + if not isinstance(playability, dict): + return + + if playability.get('desktopLegacyAgeGateReason'): + return True + + reasons = filter(None, (playability.get(r) for r in ('status', 'reason'))) + AGE_GATE_REASONS = ( + 'confirm your age', 'age-restricted', 'inappropriate', # reason + 'age_verification_required', 'age_check_required', # status + ) + return any(expected in reason for expected in AGE_GATE_REASONS for reason in reasons) + + def get_playability_status(response): + return try_get(response, lambda x: x['playabilityStatus'], dict) or {} + + playability_status = get_playability_status(player_response) + if (is_agegated(playability_status) + and int_or_none(self._downloader.params.get('age_limit'), default=18) >= 18): + + self.report_age_confirmation() + + # Thanks: https://github.com/yt-dlp/yt-dlp/pull/3233 + pb_context = {'html5Preference': 'HTML5_PREF_WANTS'} + + # Use signatureTimestamp if available + # Thanks https://github.com/ytdl-org/youtube-dl/issues/31034#issuecomment-1160718026 + player_url = self._extract_player_url(webpage) + ytcfg = self._extract_ytcfg(video_id, webpage) + sts = self._extract_signature_timestamp(video_id, player_url, ytcfg) + if sts: + pb_context['signatureTimestamp'] = sts + + query = { + 'playbackContext': {'contentPlaybackContext': pb_context}, + 'contentCheckOk': True, + 'racyCheckOk': True, + 'context': { + 'client': {'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0', 'hl': 'en', 'clientScreen': 'EMBED'}, + 'thirdParty': {'embedUrl': 'https://google.com'}, + }, + 'videoId': video_id, + } + headers = { + 'X-YouTube-Client-Name': '85', + 'X-YouTube-Client-Version': '2.0', + 'Origin': 'https://www.youtube.com' + } + + video_info = self._call_api('player', query, video_id, fatal=False, headers=headers) + age_gate_status = get_playability_status(video_info) + if age_gate_status.get('status') == 'OK': + player_response = video_info + playability_status = age_gate_status trailer_video_id = try_get( playability_status, @@ -1604,7 +1926,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): formats = [] itags = [] itag_qualities = {} - player_url = None q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) streaming_data = player_response.get('streamingData') or {} streaming_formats = streaming_data.get('formats') or [] @@ -1631,11 +1952,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not (sc and fmt_url and encrypted_sig): continue if not player_url: - if not webpage: - continue - player_url = self._search_regex( - r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"', - webpage, 'player URL', fatal=False) + player_url = self._extract_player_url(webpage) if not player_url: continue signature = self._decrypt_signature(sc['s'][0], video_id, player_url) @@ -1752,15 +2069,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): thumbnails = [] for container in (video_details, microformat): - for thumbnail in (try_get( + for thumbnail in try_get( container, - lambda x: x['thumbnail']['thumbnails'], list) or []): - thumbnail_url = thumbnail.get('url') + lambda x: x['thumbnail']['thumbnails'], list) or []: + thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue thumbnails.append({ 'height': int_or_none(thumbnail.get('height')), - 'url': thumbnail_url, + 'url': update_url(thumbnail_url, query=None, fragment=None), 'width': int_or_none(thumbnail.get('width')), }) if thumbnails: @@ -1779,7 +2096,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or microformat.get('lengthSeconds')) \ or parse_duration(search_meta('duration')) is_live = video_details.get('isLive') - owner_profile_url = microformat.get('ownerProfileUrl') + + def gen_owner_profile_url(): + yield microformat.get('ownerProfileUrl') + yield extract_attributes(self._search_regex( + r'''(?s)(<link\b[^>]+\bitemprop\s*=\s*("|')url\2[^>]*>)''', + get_element_by_attribute('itemprop', 'author', webpage), + 'owner_profile_url', default='')).get('href') + + owner_profile_url = next( + (x for x in map(url_or_none, gen_owner_profile_url()) if x), + None) + + if not player_url: + player_url = self._extract_player_url(webpage) + self._unthrottle_format_urls(video_id, player_url, formats) info = { 'id': video_id, @@ -1860,6 +2191,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info[d_k] = parse_duration(query[k][0]) if video_description: + # Youtube Music Auto-generated description mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: release_year = mobj.group('release_year') @@ -1934,7 +2266,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': info['location'] = stl else: - mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) + # •? doesn't match, but [•]? does; \xa0 = non-breaking space + mobj = re.search(r'([^\xa0\s].*?)[\xa0\s]*S(\d+)[\xa0\s]*[•]?[\xa0\s]*E(\d+)', stl) if mobj: info.update({ 'series': mobj.group(1), @@ -1945,7 +2278,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): vpir, lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], list) or []): - tbr = tlb.get('toggleButtonRenderer') or {} + tbr = traverse_obj(tlb, ('segmentedLikeDislikeButtonRenderer', 'likeButton', 'toggleButtonRenderer'), 'toggleButtonRenderer') or {} for getter, regex in [( lambda x: x['defaultText']['accessibility']['accessibilityData'], r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ @@ -1961,6 +2294,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sbr_tooltip = try_get( vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) if sbr_tooltip: + # however dislike_count was hidden by YT, as if there could ever be dislikable content on YT like_count, dislike_count = sbr_tooltip.split(' / ') info.update({ 'like_count': str_to_int(like_count), @@ -1998,6 +2332,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif mrr_title == 'Song': info['track'] = mrr_contents_text + # this is not extraction but spelunking! + carousel_lockups = traverse_obj( + initial_data, + ('engagementPanels', Ellipsis, 'engagementPanelSectionListRenderer', + 'content', 'structuredDescriptionContentRenderer', 'items', Ellipsis, + 'videoDescriptionMusicSectionRenderer', 'carouselLockups', Ellipsis), + expected_type=dict) or [] + # try to reproduce logic from metadataRowContainerRenderer above (if it still is) + fields = (('ALBUM', 'album'), ('ARTIST', 'artist'), ('SONG', 'track'), ('LICENSES', 'license')) + # multiple_songs ? + if len(carousel_lockups) > 1: + fields = fields[-1:] + for info_row in traverse_obj( + carousel_lockups, + (0, 'carouselLockupRenderer', 'infoRows', Ellipsis, 'infoRowRenderer'), + expected_type=dict): + row_title = traverse_obj(info_row, ('title', 'simpleText')) + row_text = traverse_obj(info_row, 'defaultMetadata', 'expandedMetadata', expected_type=get_text) + if not row_text: + continue + for name, field in fields: + if name == row_title and not info.get(field): + info[field] = row_text + for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: v = info.get(s_k) if v: @@ -2020,13 +2378,31 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): (?: (?:channel|c|user|feed|hashtag)/| (?:playlist|watch)\?.*?\blist=| - (?!(?:watch|embed|v|e)\b) + (?!(?:watch|embed|v|e|results)\b) ) (?P<id>[^/?\#&]+) ''' IE_NAME = 'youtube:tab' _TESTS = [{ + # Shorts + 'url': 'https://www.youtube.com/@SuperCooperShorts/shorts', + 'playlist_mincount': 5, + 'info_dict': { + 'description': 'Short clips from Super Cooper Sundays!', + 'id': 'UCKMA8kHZ8bPYpnMNaUSxfEQ', + 'title': 'Super Cooper Shorts - Shorts', + } + }, { + # Channel that does not have a Shorts tab. Test should just download videos on Home tab instead + 'url': 'https://www.youtube.com/@emergencyawesome/shorts', + 'info_dict': { + 'description': 'md5:592c080c06fef4de3c902c4a8eecd850', + 'id': 'UCDiFRMQWpcp8_KD4vwIVicw', + 'title': 'Emergency Awesome - Home', + }, + 'playlist_mincount': 5, + }, { # playlists, multipage 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', 'playlist_mincount': 94, @@ -2212,7 +2588,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'tags': list, 'view_count': int, 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -2239,7 +2614,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'categories': ['News & Politics'], 'tags': list, 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -2259,7 +2633,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'categories': ['News & Politics'], 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -2323,6 +2696,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', 'only_matching': True, + }, { + 'note': 'Search tab', + 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', + 'playlist_mincount': 40, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Search - linear algebra', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader': '3Blue1Brown', + 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + } }] @classmethod @@ -2489,7 +2873,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _rich_grid_entries(self, contents): for content in contents: - video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict) + video_renderer = try_get( + content, + (lambda x: x['richItemRenderer']['content']['videoRenderer'], + lambda x: x['richItemRenderer']['content']['reelItemRenderer']), + dict) if video_renderer: entry = self._video_entry(video_renderer) if entry: @@ -2720,8 +3108,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_selected_tab(tabs): for tab in tabs: - if try_get(tab, lambda x: x['tabRenderer']['selected'], bool): - return tab['tabRenderer'] + renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {} + if renderer.get('selected') is True: + return renderer else: raise ExtractorError('Unable to find selected tab') @@ -2778,6 +3167,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): title = channel_title or item_id if tab_title: title += ' - %s' % tab_title + if selected_tab.get('expandedText'): + title += ' - %s' % selected_tab['expandedText'] description = renderer.get('description') playlist_id = renderer.get('externalId') else: @@ -2826,8 +3217,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _real_extract(self, url): item_id = self._match_id(url) - url = compat_urlparse.urlunparse( - compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + url = update_url(url, netloc='www.youtube.com') # Handle both video/playlist URLs qs = parse_qs(url) video_id = qs.get('v', [None])[0] @@ -2927,11 +3317,7 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs - qs = parse_qs(url) - if qs.get('v', [None])[0]: + if parse_qs(url).get('v', [None])[0]: return False return super(YoutubePlaylistIE, cls).suitable(url) @@ -2961,7 +3347,6 @@ class YoutubeYtBeIE(InfoExtractor): 'categories': ['Nonprofits & Activism'], 'tags': list, 'like_count': int, - 'dislike_count': int, }, 'params': { 'noplaylist': True, @@ -3019,106 +3404,62 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com searches' - # there doesn't appear to be a real limit, for example if you search for - # 'python' you get more than 8.000.000 results - _MAX_RESULTS = float('inf') IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' - _SEARCH_PARAMS = None - _TESTS = [] - - def _entries(self, query, n): - data = { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20201021.03.00', - } - }, - 'query': query, + _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only + _MAX_RESULTS = float('inf') + _TESTS = [{ + 'url': 'ytsearch10:youtube-dl test video', + 'playlist_count': 10, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', } - if self._SEARCH_PARAMS: - data['params'] = self._SEARCH_PARAMS - total = 0 - for page_num in itertools.count(1): - search = self._download_json( - 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', - video_id='query "%s"' % query, - note='Downloading page %s' % page_num, - errnote='Unable to download API page', fatal=False, - data=json.dumps(data).encode('utf8'), - headers={'content-type': 'application/json'}) - if not search: - break - slr_contents = try_get( - search, - (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], - lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), - list) - if not slr_contents: - break - for slr_content in slr_contents: - isr_contents = try_get( - slr_content, - lambda x: x['itemSectionRenderer']['contents'], - list) - if not isr_contents: - continue - for content in isr_contents: - if not isinstance(content, dict): - continue - video = content.get('videoRenderer') - if not isinstance(video, dict): - continue - video_id = video.get('videoId') - if not video_id: - continue - yield self._extract_video(video) - total += 1 - if total == n: - return - token = try_get( - slr_contents, - lambda x: x[-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], - compat_str) - if not token: - break - data['continuation'] = token + }] def _get_n_results(self, query, n): """Get a specified number of results for a query""" - return self.playlist_result(self._entries(query, n), query) + entries = itertools.islice(self._search_results(query, self._SEARCH_PARAMS), 0, None if n == float('inf') else n) + return self.playlist_result(entries, query, query) class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 'ytsearchdate' IE_DESC = 'YouTube.com searches, newest videos first' - _SEARCH_PARAMS = 'CAI%3D' + _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date + _TESTS = [{ + 'url': 'ytsearchdate10:youtube-dl test video', + 'playlist_count': 10, + 'info_dict': { + 'id': 'youtube-dl test video', + 'title': 'youtube-dl test video', + } + }] -r""" -class YoutubeSearchURLIE(YoutubeSearchIE): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' +class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube search URLs with sorting and filter support' + IE_NAME = YoutubeSearchIE.IE_NAME + '_url' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, 'info_dict': { + 'id': 'youtube-dl test video', 'title': 'youtube-dl test video', - } + }, + 'params': {'playlistend': 5} }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) -""" + qs = parse_qs(url) + query = (qs.get('search_query') or qs.get('q'))[-1] + params = qs.get('sp', ('',))[-1] + return self.playlist_result(self._search_results(query, params), query, query) class YoutubeFeedsInfoExtractor(YoutubeTabIE): diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 4dd56f66d..fcc63ef52 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -7,13 +7,15 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( determine_ext, + ExtractorError, + extract_attributes, float_or_none, int_or_none, merge_dicts, NO_DEFAULT, - orderedSet, parse_codecs, qualities, + str_or_none, try_get, unified_timestamp, update_url_query, @@ -56,28 +58,39 @@ class ZDFBaseIE(InfoExtractor): format_urls.add(format_url) mime_type = meta.get('mimeType') ext = determine_ext(format_url) + + join_nonempty = lambda s, l: s.join(filter(None, l)) + meta_map = lambda t: map(lambda x: str_or_none(meta.get(x)), t) + if mime_type == 'application/x-mpegURL' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + new_formats = self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id='hls', - entry_protocol='m3u8_native', fatal=False)) + entry_protocol='m3u8_native', fatal=False) elif mime_type == 'application/f4m+xml' or ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)) + new_formats = self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False) else: f = parse_codecs(meta.get('mimeCodec')) + if not f: + data = meta.get('type', '').split('_') + if try_get(data, lambda x: x[2]) == ext: + f = dict(zip(('vcodec', 'acodec'), data[1])) + format_id = ['http'] - for p in (meta.get('type'), meta.get('quality')): - if p and isinstance(p, compat_str): - format_id.append(p) + format_id.extend(join_nonempty('-', meta_map(('type', 'quality')))) f.update({ 'url': format_url, 'format_id': '-'.join(format_id), - 'format_note': meta.get('quality'), - 'language': meta.get('language'), - 'quality': qualities(self._QUALITIES)(meta.get('quality')), - 'preference': -10, + 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)) }) - formats.append(f) + new_formats = [f] + + formats.extend(merge_dicts(f, { + 'format_note': join_nonempty(',', meta_map(('quality', 'class'))), + 'language': meta.get('language'), + 'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1, + 'quality': qualities(self._QUALITIES)(meta.get('quality')), + }) for f in new_formats) def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): ptmd = self._call_api( @@ -106,6 +119,7 @@ class ZDFBaseIE(InfoExtractor): 'type': f.get('type'), 'mimeType': f.get('mimeType'), 'quality': quality.get('quality'), + 'class': track.get('class'), 'language': track.get('language'), }) self._sort_formats(formats) @@ -145,6 +159,7 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1613948400, 'upload_date': '20210221', }, + 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', }, { # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', @@ -158,6 +173,7 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1608604200, 'upload_date': '20201222', }, + 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', }, { 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', 'info_dict': { @@ -168,6 +184,20 @@ class ZDFIE(ZDFBaseIE): 'duration': 2615, 'timestamp': 1465021200, 'upload_date': '20160604', + 'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806', + }, + }, { + 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', + 'md5': '1b93bdec7d02fc0b703c5e7687461628', + 'info_dict': { + 'ext': 'mp4', + 'id': 'video_funk_1770473', + 'duration': 1278, + 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.', + 'title': 'Alles ist verzaubert', + 'timestamp': 1635520560, + 'upload_date': '20211029', + 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-100~1920x1080?cb=1636466431799', }, }, { # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche @@ -190,6 +220,30 @@ class ZDFIE(ZDFBaseIE): }, { 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html', + 'info_dict': { + 'id': 'video_artede_083871-001-A', + 'ext': 'mp4', + 'title': 'Tödliche Flucht (1/6)', + 'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315', + 'duration': 3193.0, + 'timestamp': 1641355200, + 'upload_date': '20220105', + }, + 'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"' + }, { + 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', + 'info_dict': { + 'id': '191205_1800_sendung_sok8', + 'ext': 'mp4', + 'title': 'Das Geld anderer Leute', + 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', + 'duration': 2581.0, + 'timestamp': 1654790700, + 'upload_date': '20220609', + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350', + }, }] def _extract_entry(self, url, player, content, video_id): @@ -197,12 +251,18 @@ class ZDFIE(ZDFBaseIE): t = content['mainVideoContent']['http://zdf.de/rels/target'] - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + def get_ptmd_path(d): + return ( + d.get('http://zdf.de/rels/streams/ptmd') + or d.get('http://zdf.de/rels/streams/ptmd-template', + '').replace('{playerId}', 'ngplayer_2_4')) + + ptmd_path = get_ptmd_path(try_get(t, lambda x: x['streams']['default'], dict) or {}) + if not ptmd_path: + ptmd_path = get_ptmd_path(t) if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'ngplayer_2_4') + raise ExtractorError('Could not extract ptmd_path') info = self._extract_ptmd( urljoin(url, ptmd_path), video_id, player['apiToken'], url) @@ -245,15 +305,16 @@ class ZDFIE(ZDFBaseIE): 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, video_id) - document = video['document'] - - title = document['titel'] - content_id = document['basename'] - formats = [] - format_urls = set() - for f in document['formitaeten']: - self._extract_format(content_id, formats, format_urls, f) + formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) + document = formitaeten and video['document'] + if formitaeten: + title = document['titel'] + content_id = document['basename'] + + format_urls = set() + for f in formitaeten or []: + self._extract_format(content_id, formats, format_urls, f) self._sort_formats(formats) thumbnails = [] @@ -300,9 +361,9 @@ class ZDFChannelIE(ZDFBaseIE): 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { 'id': 'das-aktuelle-sportstudio', - 'title': 'das aktuelle sportstudio | ZDF', + 'title': 'das aktuelle sportstudio', }, - 'playlist_mincount': 23, + 'playlist_mincount': 18, }, { 'url': 'https://www.zdf.de/dokumentation/planet-e', 'info_dict': { @@ -310,6 +371,14 @@ class ZDFChannelIE(ZDFBaseIE): 'title': 'planet e.', }, 'playlist_mincount': 50, + }, { + 'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest', + 'info_dict': { + 'id': 'aktenzeichen-xy-ungeloest', + 'title': 'Aktenzeichen XY... ungelöst', + 'entries': "lambda x: not any('xy580-fall1-kindermoerder-gesucht-100' in e['url'] for e in x)", + }, + 'playlist_mincount': 2, }, { 'url': 'https://www.zdf.de/filme/taunuskrimi/', 'only_matching': True, @@ -319,60 +388,36 @@ class ZDFChannelIE(ZDFBaseIE): def suitable(cls, url): return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url) + def _og_search_title(self, webpage, fatal=False): + title = super(ZDFChannelIE, self)._og_search_title(webpage, fatal=fatal) + return re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', title or '')[0] or None + def _real_extract(self, url): channel_id = self._match_id(url) webpage = self._download_webpage(url, channel_id) - entries = [ - self.url_result(item_url, ie=ZDFIE.ie_key()) - for item_url in orderedSet(re.findall( - r'data-plusbar-url=["\'](http.+?\.html)', webpage))] + matches = re.finditer( + r'''<div\b[^>]*?\sdata-plusbar-id\s*=\s*(["'])(?P<p_id>[\w-]+)\1[^>]*?\sdata-plusbar-url=\1(?P<url>%s)\1''' % ZDFIE._VALID_URL, + webpage) - return self.playlist_result( - entries, channel_id, self._og_search_title(webpage, fatal=False)) + if self._downloader.params.get('noplaylist', False): + entry = next( + (self.url_result(m.group('url'), ie=ZDFIE.ie_key()) for m in matches), + None) + self.to_screen('Downloading just the main video because of --no-playlist') + if entry: + return entry + else: + self.to_screen('Downloading playlist %s - add --no-playlist to download just the main video' % (channel_id, )) - r""" - player = self._extract_player(webpage, channel_id) + def check_video(m): + v_ref = self._search_regex( + r'''(<a\b[^>]*?\shref\s*=[^>]+?\sdata-target-id\s*=\s*(["'])%s\2[^>]*>)''' % (m.group('p_id'), ), + webpage, 'check id', default='') + v_ref = extract_attributes(v_ref) + return v_ref.get('data-target-video-type') != 'novideo' - channel_id = self._search_regex( - r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage, - 'channel id', group='id') - - channel = self._call_api( - 'https://api.zdf.de/content/documents/%s.json' % channel_id, - player, url, channel_id) - - items = [] - for module in channel['module']: - for teaser in try_get(module, lambda x: x['teaser'], list) or []: - t = try_get( - teaser, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - items.extend(try_get( - t, - lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - items.extend(try_get( - module, - lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - - entries = [] - entry_urls = set() - for item in items: - t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - sharing_url = t.get('http://zdf.de/rels/sharing-url') - if not sharing_url or not isinstance(sharing_url, compat_str): - continue - if sharing_url in entry_urls: - continue - entry_urls.add(sharing_url) - entries.append(self.url_result( - sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id'))) - - return self.playlist_result(entries, channel_id, channel.get('title')) - """ + return self.playlist_from_matches( + (m.group('url') for m in matches if check_video(m)), + channel_id, self._og_search_title(webpage, fatal=False)) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 7bda59610..60fa2b1b9 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -1,214 +1,851 @@ from __future__ import unicode_literals +import itertools import json +import math import operator import re from .utils import ( + error_to_compat_str, ExtractorError, + js_to_json, remove_quotes, + unified_timestamp, +) +from .compat import ( + compat_basestring, + compat_collections_chain_map as ChainMap, + compat_itertools_zip_longest as zip_longest, + compat_str, ) -_OPERATORS = [ - ('|', operator.or_), - ('^', operator.xor), - ('&', operator.and_), - ('>>', operator.rshift), - ('<<', operator.lshift), - ('-', operator.sub), - ('+', operator.add), - ('%', operator.mod), - ('/', operator.truediv), - ('*', operator.mul), -] -_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] -_ASSIGN_OPERATORS.append(('=', lambda cur, right: right)) -_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' +def _js_bit_op(op): + + def zeroise(x): + return 0 if x in (None, JS_Undefined) else x + + def wrapped(a, b): + return op(zeroise(a), zeroise(b)) & 0xffffffff + + return wrapped + + +def _js_arith_op(op): + + def wrapped(a, b): + if JS_Undefined in (a, b): + return float('nan') + return op(a or 0, b or 0) + + return wrapped + + +def _js_div(a, b): + if JS_Undefined in (a, b) or not (a and b): + return float('nan') + return operator.truediv(a or 0, b) if b else float('inf') + + +def _js_mod(a, b): + if JS_Undefined in (a, b) or not b: + return float('nan') + return (a or 0) % b + + +def _js_exp(a, b): + if not b: + return 1 # even 0 ** 0 !! + elif JS_Undefined in (a, b): + return float('nan') + return (a or 0) ** b + + +def _js_eq_op(op): + + def wrapped(a, b): + if set((a, b)) <= set((None, JS_Undefined)): + return op(a, a) + return op(a, b) + + return wrapped + + +def _js_comp_op(op): + + def wrapped(a, b): + if JS_Undefined in (a, b): + return False + if isinstance(a, compat_basestring): + b = compat_str(b or 0) + elif isinstance(b, compat_basestring): + a = compat_str(a or 0) + return op(a or 0, b or 0) + + return wrapped + + +def _js_ternary(cndn, if_true=True, if_false=False): + """Simulate JS's ternary operator (cndn?if_true:if_false)""" + if cndn in (False, None, 0, '', JS_Undefined): + return if_false + try: + if math.isnan(cndn): # NB: NaN cannot be checked by membership + return if_false + except TypeError: + pass + return if_true + + +# (op, definition) in order of binding priority, tightest first +# avoid dict to maintain order +# definition None => Defined in JSInterpreter._operator +_OPERATORS = ( + ('>>', _js_bit_op(operator.rshift)), + ('<<', _js_bit_op(operator.lshift)), + ('+', _js_arith_op(operator.add)), + ('-', _js_arith_op(operator.sub)), + ('*', _js_arith_op(operator.mul)), + ('%', _js_mod), + ('/', _js_div), + ('**', _js_exp), +) + +_COMP_OPERATORS = ( + ('===', operator.is_), + ('!==', operator.is_not), + ('==', _js_eq_op(operator.eq)), + ('!=', _js_eq_op(operator.ne)), + ('<=', _js_comp_op(operator.le)), + ('>=', _js_comp_op(operator.ge)), + ('<', _js_comp_op(operator.lt)), + ('>', _js_comp_op(operator.gt)), +) + +_LOG_OPERATORS = ( + ('|', _js_bit_op(operator.or_)), + ('^', _js_bit_op(operator.xor)), + ('&', _js_bit_op(operator.and_)), +) + +_SC_OPERATORS = ( + ('?', None), + ('??', None), + ('||', None), + ('&&', None), +) + +_OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS)) + +_NAME_RE = r'[a-zA-Z_$][\w$]*' +_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) +_QUOTES = '\'"/' + + +class JS_Undefined(object): + pass + + +class JS_Break(ExtractorError): + def __init__(self): + ExtractorError.__init__(self, 'Invalid break') + + +class JS_Continue(ExtractorError): + def __init__(self): + ExtractorError.__init__(self, 'Invalid continue') + + +class JS_Throw(ExtractorError): + def __init__(self, e): + self.error = e + ExtractorError.__init__(self, 'Uncaught exception ' + error_to_compat_str(e)) + + +class LocalNameSpace(ChainMap): + def __getitem__(self, key): + try: + return super(LocalNameSpace, self).__getitem__(key) + except KeyError: + return JS_Undefined + + def __setitem__(self, key, value): + for scope in self.maps: + if key in scope: + scope[key] = value + return + self.maps[0][key] = value + + def __delitem__(self, key): + raise NotImplementedError('Deleting is not supported') + + def __repr__(self): + return 'LocalNameSpace%s' % (self.maps, ) class JSInterpreter(object): + __named_object_counter = 0 + + _OBJ_NAME = '__youtube_dl_jsinterp_obj' + + OP_CHARS = None + def __init__(self, code, objects=None): - if objects is None: - objects = {} - self.code = code - self._functions = {} - self._objects = objects + self.code, self._functions = code, {} + self._objects = {} if objects is None else objects + if type(self).OP_CHARS is None: + type(self).OP_CHARS = self.OP_CHARS = self.__op_chars() + + class Exception(ExtractorError): + def __init__(self, msg, *args, **kwargs): + expr = kwargs.pop('expr', None) + if expr is not None: + msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr) + super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) + + class JS_RegExp(object): + _RE_FLAGS = { + # special knowledge: Python's re flags are bitmask values, current max 128 + # invent new bitmask values well above that for literal parsing + # TODO: new pattern class to execute matches with these flags + 'd': 1024, # Generate indices for substring matches + 'g': 2048, # Global search + 'i': re.I, # Case-insensitive search + 'm': re.M, # Multi-line search + 's': re.S, # Allows . to match newline characters + 'u': re.U, # Treat a pattern as a sequence of unicode code points + 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string + } + + def __init__(self, pattern_txt, flags=''): + if isinstance(flags, compat_str): + flags, _ = self.regex_flags(flags) + # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern + # First, avoid https://github.com/python/cpython/issues/74534 + self.__self = re.compile(pattern_txt.replace('[[', r'[\['), flags) + for name in dir(self.__self): + # Only these? Obviously __class__, __init__. + # PyPy creates a __weakref__ attribute with value None + # that can't be setattr'd but also can't need to be copied. + if name in ('__class__', '__init__', '__weakref__'): + continue + setattr(self, name, getattr(self.__self, name)) + + @classmethod + def regex_flags(cls, expr): + flags = 0 + if not expr: + return flags, expr + for idx, ch in enumerate(expr): + if ch not in cls._RE_FLAGS: + break + flags |= cls._RE_FLAGS[ch] + return flags, expr[idx + 1:] + + @classmethod + def __op_chars(cls): + op_chars = set(';,[') + for op in cls._all_operators(): + for c in op[0]: + op_chars.add(c) + return op_chars + + def _named_object(self, namespace, obj): + self.__named_object_counter += 1 + name = '%s%d' % (self._OBJ_NAME, self.__named_object_counter) + namespace[name] = obj + return name + + @classmethod + def _separate(cls, expr, delim=',', max_split=None, skip_delims=None): + if not expr: + return + # collections.Counter() is ~10% slower in both 2.7 and 3.9 + counters = {k: 0 for k in _MATCHING_PARENS.values()} + start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 + in_quote, escaping, skipping = None, False, 0 + after_op, in_regex_char_group, skip_re = True, False, 0 + + for idx, char in enumerate(expr): + if skip_re > 0: + skip_re -= 1 + continue + if not in_quote: + if char in _MATCHING_PARENS: + counters[_MATCHING_PARENS[char]] += 1 + elif char in counters: + counters[char] -= 1 + if not escaping: + if char in _QUOTES and in_quote in (char, None): + if in_quote or after_op or char != '/': + in_quote = None if in_quote and not in_regex_char_group else char + elif in_quote == '/' and char in '[]': + in_regex_char_group = char == '[' + escaping = not escaping and in_quote and char == '\\' + after_op = not in_quote and (char in cls.OP_CHARS or (char.isspace() and after_op)) + + if char != delim[pos] or any(counters.values()) or in_quote: + pos = skipping = 0 + continue + elif skipping > 0: + skipping -= 1 + continue + elif pos == 0 and skip_delims: + here = expr[idx:] + for s in skip_delims if isinstance(skip_delims, (list, tuple)) else [skip_delims]: + if here.startswith(s) and s: + skipping = len(s) - 1 + break + if skipping > 0: + continue + if pos < delim_len: + pos += 1 + continue + yield expr[start: idx - delim_len] + start, pos = idx + 1, 0 + splits += 1 + if max_split and splits >= max_split: + break + yield expr[start:] + + @classmethod + def _separate_at_paren(cls, expr, delim=None): + if delim is None: + delim = expr and _MATCHING_PARENS[expr[0]] + separated = list(cls._separate(expr, delim, 1)) + + if len(separated) < 2: + raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals())) + return separated[0][1:].strip(), separated[1].strip() + + @staticmethod + def _all_operators(): + return itertools.chain( + # Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence + _SC_OPERATORS, _LOG_OPERATORS, _COMP_OPERATORS, _OPERATORS) + + def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion): + if op in ('||', '&&'): + if (op == '&&') ^ _js_ternary(left_val): + return left_val # short circuiting + elif op == '??': + if left_val not in (None, JS_Undefined): + return left_val + elif op == '?': + right_expr = _js_ternary(left_val, *self._separate(right_expr, ':', 1)) + + right_val = self.interpret_expression(right_expr, local_vars, allow_recursion) + opfunc = op and next((v for k, v in self._all_operators() if k == op), None) + if not opfunc: + return right_val + + try: + return opfunc(left_val, right_val) + except Exception as e: + raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e) + + def _index(self, obj, idx, allow_undefined=False): + if idx == 'length': + return len(obj) + try: + return obj[int(idx)] if isinstance(obj, list) else obj[idx] + except Exception as e: + if allow_undefined: + return JS_Undefined + raise self.Exception('Cannot get index {idx:.100}'.format(**locals()), expr=repr(obj), cause=e) + + def _dump(self, obj, namespace): + try: + return json.dumps(obj) + except TypeError: + return self._named_object(namespace, obj) def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: - raise ExtractorError('Recursion limit reached') + raise self.Exception('Recursion limit reached') + allow_recursion -= 1 - should_abort = False - stmt = stmt.lstrip() - stmt_m = re.match(r'var\s', stmt) - if stmt_m: - expr = stmt[len(stmt_m.group(0)):] - else: - return_m = re.match(r'return(?:\s+|$)', stmt) - if return_m: - expr = stmt[len(return_m.group(0)):] - should_abort = True + should_return = False + # fails on (eg) if (...) stmt1; else stmt2; + sub_statements = list(self._separate(stmt, ';')) or [''] + expr = stmt = sub_statements.pop().strip() + for sub_stmt in sub_statements: + ret, should_return = self.interpret_statement(sub_stmt, local_vars, allow_recursion) + if should_return: + return ret, should_return + + m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?P<throw>throw\s+)', stmt) + if m: + expr = stmt[len(m.group(0)):].strip() + if m.group('throw'): + raise JS_Throw(self.interpret_expression(expr, local_vars, allow_recursion)) + should_return = not m.group('var') + if not expr: + return None, should_return + + if expr[0] in _QUOTES: + inner, outer = self._separate(expr, expr[0], 1) + if expr[0] == '/': + flags, outer = self.JS_RegExp.regex_flags(outer) + inner = self.JS_RegExp(inner[1:], flags=flags) else: - # Try interpreting it as an expression - expr = stmt + inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) + if not outer: + return inner, should_return + expr = self._named_object(local_vars, inner) + outer - v = self.interpret_expression(expr, local_vars, allow_recursion) - return v, should_abort + new_kw, _, obj = expr.partition('new ') + if not new_kw: + for klass, konstr in (('Date', lambda x: int(unified_timestamp(x, False) * 1000)), + ('RegExp', self.JS_RegExp), + ('Error', self.Exception)): + if not obj.startswith(klass + '('): + continue + left, right = self._separate_at_paren(obj[len(klass):]) + argvals = self.interpret_iter(left, local_vars, allow_recursion) + expr = konstr(*argvals) + if not expr: + raise self.Exception('Failed to parse {klass} {left!r:.100}'.format(**locals()), expr=expr) + expr = self._dump(expr, local_vars) + right + break + else: + raise self.Exception('Unsupported object {obj:.100}'.format(**locals()), expr=expr) - def interpret_expression(self, expr, local_vars, allow_recursion): - expr = expr.strip() - if expr == '': # Empty expression - return None + if expr.startswith('void '): + left = self.interpret_expression(expr[5:], local_vars, allow_recursion) + return None, should_return + + if expr.startswith('{'): + inner, outer = self._separate_at_paren(expr) + # try for object expression (Map) + sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)] + if all(len(sub_expr) == 2 for sub_expr in sub_expressions): + return dict( + (key_expr if re.match(_NAME_RE, key_expr) else key_expr, + self.interpret_expression(val_expr, local_vars, allow_recursion)) + for key_expr, val_expr in sub_expressions), should_return + # or statement list + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) + if not outer or should_abort: + return inner, should_abort or should_return + else: + expr = self._dump(inner, local_vars) + outer if expr.startswith('('): - parens_count = 0 - for m in re.finditer(r'[()]', expr): - if m.group(0) == '(': - parens_count += 1 + inner, outer = self._separate_at_paren(expr) + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) + if not outer or should_abort: + return inner, should_abort or should_return + else: + expr = self._dump(inner, local_vars) + outer + + if expr.startswith('['): + inner, outer = self._separate_at_paren(expr) + name = self._named_object(local_vars, [ + self.interpret_expression(item, local_vars, allow_recursion) + for item in self._separate(inner)]) + expr = name + outer + + m = re.match(r'''(?x) + (?P<try>try)\s*\{| + (?P<if>if)\s*\(| + (?P<switch>switch)\s*\(| + (?P<for>for)\s*\(| + (?P<while>while)\s*\( + ''', expr) + md = m.groupdict() if m else {} + if md.get('if'): + cndn, expr = self._separate_at_paren(expr[m.end() - 1:]) + if expr.startswith('{'): + if_expr, expr = self._separate_at_paren(expr) + else: + # may lose ... else ... because of ll.368-374 + if_expr, expr = self._separate_at_paren(expr, delim=';') + else_expr = None + m = re.match(r'else\s*(?P<block>\{)?', expr) + if m: + if m.group('block'): + else_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) else: - parens_count -= 1 - if parens_count == 0: - sub_expr = expr[1:m.start()] - sub_result = self.interpret_expression( - sub_expr, local_vars, allow_recursion) - remaining_expr = expr[m.end():].strip() - if not remaining_expr: - return sub_result + # handle subset ... else if (...) {...} else ... + # TODO: make interpret_statement do this properly, if possible + exprs = list(self._separate(expr[m.end():], delim='}', max_split=2)) + if len(exprs) > 1: + if re.match(r'\s*if\s*\(', exprs[0]) and re.match(r'\s*else\b', exprs[1]): + else_expr = exprs[0] + '}' + exprs[1] + expr = (exprs[2] + '}') if len(exprs) == 3 else None else: - expr = json.dumps(sub_result) + remaining_expr + else_expr = exprs[0] + exprs.append('') + expr = '}'.join(exprs[1:]) + else: + else_expr = exprs[0] + expr = None + else_expr = else_expr.lstrip() + '}' + cndn = _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)) + ret, should_abort = self.interpret_statement( + if_expr if cndn else else_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + + elif md.get('try'): + try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + err = None + try: + ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + except Exception as e: + # XXX: This works for now, but makes debugging future issues very hard + err = e + + pending = (None, False) + m = re.match(r'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{'.format(**globals()), expr) + if m: + sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + if err: + catch_vars = {} + if m.group('err'): + catch_vars[m.group('err')] = err.error if isinstance(err, JS_Throw) else err + catch_vars = local_vars.new_child(m=catch_vars) + err = None + pending = self.interpret_statement(sub_expr, catch_vars, allow_recursion) + + m = re.match(r'finally\s*\{', expr) + if m: + sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + + ret, should_abort = pending + if should_abort: + return ret, True + + if err: + raise err + + elif md.get('for') or md.get('while'): + init_or_cond, remaining = self._separate_at_paren(expr[m.end() - 1:]) + if remaining.startswith('{'): + body, expr = self._separate_at_paren(remaining) + else: + switch_m = re.match(r'switch\s*\(', remaining) # FIXME + if switch_m: + switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:]) + body, expr = self._separate_at_paren(remaining, '}') + body = 'switch(%s){%s}' % (switch_val, body) + else: + body, expr = remaining, '' + if md.get('for'): + start, cndn, increment = self._separate(init_or_cond, ';') + self.interpret_expression(start, local_vars, allow_recursion) + else: + cndn, increment = init_or_cond, None + while _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): + try: + ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) + if should_abort: + return ret, True + except JS_Break: + break + except JS_Continue: + pass + if increment: + self.interpret_expression(increment, local_vars, allow_recursion) + + elif md.get('switch'): + switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:]) + switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion) + body, expr = self._separate_at_paren(remaining, '}') + items = body.replace('default:', 'case default:').split('case ')[1:] + for default in (False, True): + matched = False + for item in items: + case, stmt = (i.strip() for i in self._separate(item, ':', 1)) + if default: + matched = matched or case == 'default' + elif not matched: + matched = (case != 'default' + and switch_val == self.interpret_expression(case, local_vars, allow_recursion)) + if not matched: + continue + try: + ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion) + if should_abort: + return ret + except JS_Break: break - else: - raise ExtractorError('Premature end of parens in %r' % expr) + if matched: + break - for op, opfunc in _ASSIGN_OPERATORS: - m = re.match(r'''(?x) - (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])? - \s*%s - (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr) - if not m: - continue - right_val = self.interpret_expression( - m.group('expr'), local_vars, allow_recursion - 1) + if md: + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or should_return - if m.groupdict().get('index'): - lvar = local_vars[m.group('out')] - idx = self.interpret_expression( - m.group('index'), local_vars, allow_recursion) - assert isinstance(idx, int) - cur = lvar[idx] - val = opfunc(cur, right_val) - lvar[idx] = val - return val - else: - cur = local_vars.get(m.group('out')) - val = opfunc(cur, right_val) - local_vars[m.group('out')] = val - return val + # Comma separated statements + sub_expressions = list(self._separate(expr)) + if len(sub_expressions) > 1: + for sub_expr in sub_expressions: + ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + return ret, False - if expr.isdigit(): - return int(expr) + for m in re.finditer(r'''(?x) + (?P<pre_sign>\+\+|--)(?P<var1>{_NAME_RE})| + (?P<var2>{_NAME_RE})(?P<post_sign>\+\+|--)'''.format(**globals()), expr): + var = m.group('var1') or m.group('var2') + start, end = m.span() + sign = m.group('pre_sign') or m.group('post_sign') + ret = local_vars[var] + local_vars[var] += 1 if sign[0] == '+' else -1 + if m.group('pre_sign'): + ret = local_vars[var] + expr = expr[:start] + self._dump(ret, local_vars) + expr[end:] - var_m = re.match( - r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE, - expr) - if var_m: - return local_vars[var_m.group('name')] + if not expr: + return None, should_return + + m = re.match(r'''(?x) + (?P<assign> + (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])?\s* + (?P<op>{_OPERATOR_RE})? + =(?!=)(?P<expr>.*)$ + )|(?P<return> + (?!if|return|true|false|null|undefined)(?P<name>{_NAME_RE})$ + )|(?P<indexing> + (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ + )|(?P<attribute> + (?P<var>{_NAME_RE})(?:(?P<nullish>\?)?\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* + )|(?P<function> + (?P<fname>{_NAME_RE})\((?P<args>.*)\)$ + )'''.format(**globals()), expr) + md = m.groupdict() if m else {} + if md.get('assign'): + left_val = local_vars.get(m.group('out')) + + if not m.group('index'): + local_vars[m.group('out')] = self._operator( + m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion) + return local_vars[m.group('out')], should_return + elif left_val in (None, JS_Undefined): + raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr) + + idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) + if not isinstance(idx, (int, float)): + raise self.Exception('List index %s must be integer' % (idx, ), expr=expr) + idx = int(idx) + left_val[idx] = self._operator( + m.group('op'), self._index(left_val, idx), m.group('expr'), expr, local_vars, allow_recursion) + return left_val[idx], should_return + + elif expr.isdigit(): + return int(expr), should_return + + elif expr == 'break': + raise JS_Break() + elif expr == 'continue': + raise JS_Continue() + + elif expr == 'undefined': + return JS_Undefined, should_return + elif expr == 'NaN': + return float('NaN'), should_return + + elif md.get('return'): + return local_vars[m.group('name')], should_return try: - return json.loads(expr) + ret = json.loads(js_to_json(expr)) # strict=True) + if not md.get('attribute'): + return ret, should_return except ValueError: pass - m = re.match( - r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) - if m: + if md.get('indexing'): val = local_vars[m.group('in')] - idx = self.interpret_expression( - m.group('idx'), local_vars, allow_recursion - 1) - return val[idx] + idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion) + return self._index(val, idx), should_return - m = re.match( - r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, - expr) - if m: - variable = m.group('var') - member = remove_quotes(m.group('member') or m.group('member2')) - arg_str = m.group('args') - - if variable in local_vars: - obj = local_vars[variable] - else: - if variable not in self._objects: - self._objects[variable] = self.extract_object(variable) - obj = self._objects[variable] - - if arg_str is None: - # Member access - if member == 'length': - return len(obj) - return obj[member] - - assert expr.endswith(')') - # Function call - if arg_str == '': - argvals = tuple() - else: - argvals = tuple([ - self.interpret_expression(v, local_vars, allow_recursion) - for v in arg_str.split(',')]) - - if member == 'split': - assert argvals == ('',) - return list(obj) - if member == 'join': - assert len(argvals) == 1 - return argvals[0].join(obj) - if member == 'reverse': - assert len(argvals) == 0 - obj.reverse() - return obj - if member == 'slice': - assert len(argvals) == 1 - return obj[argvals[0]:] - if member == 'splice': - assert isinstance(obj, list) - index, howMany = argvals - res = [] - for i in range(index, min(index + howMany, len(obj))): - res.append(obj.pop(index)) - return res - - return obj[member](argvals) - - for op, opfunc in _OPERATORS: - m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr) - if not m: + for op, _ in self._all_operators(): + # hackety: </> have higher priority than <</>>, but don't confuse them + skip_delim = (op + op) if op in '<>*?' else None + if op == '?': + skip_delim = (skip_delim, '?.') + separated = list(self._separate(expr, op, skip_delims=skip_delim)) + if len(separated) < 2: continue - x, abort = self.interpret_statement( - m.group('x'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature left-side return of %s in %r' % (op, expr)) - y, abort = self.interpret_statement( - m.group('y'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature right-side return of %s in %r' % (op, expr)) - return opfunc(x, y) - m = re.match( - r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) - if m: - fname = m.group('func') - argvals = tuple([ - int(v) if v.isdigit() else local_vars[v] - for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple() - if fname not in self._functions: + right_expr = separated.pop() + while op == '-' and len(separated) > 1 and not separated[-1].strip(): + right_expr = '-' + right_expr + separated.pop() + left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion) + return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), should_return + + if md.get('attribute'): + variable, member, nullish = m.group('var', 'member', 'nullish') + if not member: + member = self.interpret_expression(m.group('member2'), local_vars, allow_recursion) + arg_str = expr[m.end():] + if arg_str.startswith('('): + arg_str, remaining = self._separate_at_paren(arg_str) + else: + arg_str, remaining = None, arg_str + + def assertion(cndn, msg): + """ assert, but without risk of getting optimized out """ + if not cndn: + memb = member + raise self.Exception('{memb} {msg}'.format(**locals()), expr=expr) + + def eval_method(): + if (variable, member) == ('console', 'debug'): + return + types = { + 'String': compat_str, + 'Math': float, + } + obj = local_vars.get(variable) + if obj in (JS_Undefined, None): + obj = types.get(variable, JS_Undefined) + if obj is JS_Undefined: + try: + if variable not in self._objects: + self._objects[variable] = self.extract_object(variable) + obj = self._objects[variable] + except self.Exception: + if not nullish: + raise + + if nullish and obj is JS_Undefined: + return JS_Undefined + + # Member access + if arg_str is None: + return self._index(obj, member, nullish) + + # Function call + argvals = [ + self.interpret_expression(v, local_vars, allow_recursion) + for v in self._separate(arg_str)] + + if obj == compat_str: + if member == 'fromCharCode': + assertion(argvals, 'takes one or more arguments') + return ''.join(map(chr, argvals)) + raise self.Exception('Unsupported string method ' + member, expr=expr) + elif obj == float: + if member == 'pow': + assertion(len(argvals) == 2, 'takes two arguments') + return argvals[0] ** argvals[1] + raise self.Exception('Unsupported Math method ' + member, expr=expr) + + if member == 'split': + assertion(argvals, 'takes one or more arguments') + assertion(len(argvals) == 1, 'with limit argument is not implemented') + return obj.split(argvals[0]) if argvals[0] else list(obj) + elif member == 'join': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(len(argvals) == 1, 'takes exactly one argument') + return argvals[0].join(obj) + elif member == 'reverse': + assertion(not argvals, 'does not take any arguments') + obj.reverse() + return obj + elif member == 'slice': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(len(argvals) == 1, 'takes exactly one argument') + return obj[argvals[0]:] + elif member == 'splice': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(argvals, 'takes one or more arguments') + index, howMany = map(int, (argvals + [len(obj)])[:2]) + if index < 0: + index += len(obj) + add_items = argvals[2:] + res = [] + for i in range(index, min(index + howMany, len(obj))): + res.append(obj.pop(index)) + for i, item in enumerate(add_items): + obj.insert(index + i, item) + return res + elif member == 'unshift': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(argvals, 'takes one or more arguments') + for item in reversed(argvals): + obj.insert(0, item) + return obj + elif member == 'pop': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(not argvals, 'does not take any arguments') + if not obj: + return + return obj.pop() + elif member == 'push': + assertion(argvals, 'takes one or more arguments') + obj.extend(argvals) + return obj + elif member == 'forEach': + assertion(argvals, 'takes one or more arguments') + assertion(len(argvals) <= 2, 'takes at-most 2 arguments') + f, this = (argvals + [''])[:2] + return [f((item, idx, obj), {'this': this}, allow_recursion) for idx, item in enumerate(obj)] + elif member == 'indexOf': + assertion(argvals, 'takes one or more arguments') + assertion(len(argvals) <= 2, 'takes at-most 2 arguments') + idx, start = (argvals + [0])[:2] + try: + return obj.index(idx, start) + except ValueError: + return -1 + elif member == 'charCodeAt': + assertion(isinstance(obj, compat_str), 'must be applied on a string') + # assertion(len(argvals) == 1, 'takes exactly one argument') # but not enforced + idx = argvals[0] if isinstance(argvals[0], int) else 0 + if idx >= len(obj): + return None + return ord(obj[idx]) + elif member == 'replace': + assertion(isinstance(obj, compat_str), 'must be applied on a string') + assertion(len(argvals) == 2, 'takes exactly two arguments') + return re.sub(argvals[0], argvals[1], obj) + + idx = int(member) if isinstance(obj, list) else member + return obj[idx](argvals, allow_recursion=allow_recursion) + + if remaining: + ret, should_abort = self.interpret_statement( + self._named_object(local_vars, eval_method()) + remaining, + local_vars, allow_recursion) + return ret, should_return or should_abort + else: + return eval_method(), should_return + + elif md.get('function'): + fname = m.group('fname') + argvals = [self.interpret_expression(v, local_vars, allow_recursion) + for v in self._separate(m.group('args'))] + if fname in local_vars: + return local_vars[fname](argvals, allow_recursion=allow_recursion), should_return + elif fname not in self._functions: self._functions[fname] = self.extract_function(fname) - return self._functions[fname](argvals) + return self._functions[fname](argvals, allow_recursion=allow_recursion), should_return - raise ExtractorError('Unsupported JS expression %r' % expr) + raise self.Exception( + 'Unsupported JS expression ' + (expr[:40] if expr != stmt else ''), expr=stmt) + + def interpret_expression(self, expr, local_vars, allow_recursion): + ret, should_return = self.interpret_statement(expr, local_vars, allow_recursion) + if should_return: + raise self.Exception('Cannot return from an expression', expr) + return ret + + def interpret_iter(self, list_txt, local_vars, allow_recursion): + for v in self._separate(list_txt): + yield self.interpret_expression(v, local_vars, allow_recursion) def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' @@ -220,43 +857,81 @@ class JSInterpreter(object): }\s*; ''' % (re.escape(objname), _FUNC_NAME_RE), self.code) + if not obj_m: + raise self.Exception('Could not find object ' + objname) fields = obj_m.group('fields') # Currently, it only supports function definitions fields_m = re.finditer( r'''(?x) - (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)} - ''' % _FUNC_NAME_RE, + (?P<key>%s)\s*:\s*function\s*\((?P<args>(?:%s|,)*)\){(?P<code>[^}]+)} + ''' % (_FUNC_NAME_RE, _NAME_RE), fields) for f in fields_m: - argnames = f.group('args').split(',') + argnames = self.build_arglist(f.group('args')) obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code')) return obj - def extract_function(self, funcname): + def extract_function_code(self, funcname): + """ @returns argnames, code """ func_m = re.search( - r'''(?x) - (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + r'''(?xs) + (?: + function\s+%(name)s| + [{;,]\s*%(name)s\s*=\s*function| + (?:var|const|let)\s+%(name)s\s*=\s*function + )\s* \((?P<args>[^)]*)\)\s* - \{(?P<code>[^}]+)\}''' % ( - re.escape(funcname), re.escape(funcname), re.escape(funcname)), + (?P<code>{.+})''' % {'name': re.escape(funcname)}, self.code) + code, _ = self._separate_at_paren(func_m.group('code')) # refine the match if func_m is None: - raise ExtractorError('Could not find JS function %r' % funcname) - argnames = func_m.group('args').split(',') + raise self.Exception('Could not find JS function "{funcname}"'.format(**locals())) + return self.build_arglist(func_m.group('args')), code - return self.build_function(argnames, func_m.group('code')) + def extract_function(self, funcname): + return self.extract_function_from_code(*self.extract_function_code(funcname)) + + def extract_function_from_code(self, argnames, code, *global_stack): + local_vars = {} + while True: + mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code) + if mobj is None: + break + start, body_start = mobj.span() + body, remaining = self._separate_at_paren(code[body_start - 1:], '}') + name = self._named_object(local_vars, self.extract_function_from_code( + [x.strip() for x in mobj.group('args').split(',')], + body, local_vars, *global_stack)) + code = code[:start] + name + remaining + return self.build_function(argnames, code, local_vars, *global_stack) def call_function(self, funcname, *args): - f = self.extract_function(funcname) - return f(args) + return self.extract_function(funcname)(args) - def build_function(self, argnames, code): - def resf(args): - local_vars = dict(zip(argnames, args)) - for stmt in code.split(';'): - res, abort = self.interpret_statement(stmt, local_vars) - if abort: - break - return res + @classmethod + def build_arglist(cls, arg_text): + if not arg_text: + return [] + + def valid_arg(y): + y = y.strip() + if not y: + raise cls.Exception('Missing arg in "%s"' % (arg_text, )) + return y + + return [valid_arg(x) for x in cls._separate(arg_text)] + + def build_function(self, argnames, code, *global_stack): + global_stack = list(global_stack) or [{}] + argnames = tuple(argnames) + + def resf(args, kwargs={}, allow_recursion=100): + global_stack[0].update( + zip_longest(argnames, args, fillvalue=None)) + global_stack[0].update(kwargs) + var_stack = LocalNameSpace(*global_stack) + ret, should_abort = self.interpret_statement(code.replace('\n', ' '), var_stack, allow_recursion - 1) + if should_abort: + return ret return resf diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 0a0641bd4..f6d2b0898 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -270,11 +270,11 @@ def parseOpts(overrideArguments=None): selection.add_option( '--match-title', dest='matchtitle', metavar='REGEX', - help='Download only matching titles (regex or caseless sub-string)') + help='Download only matching titles (case-insensitive regex or alphanumeric sub-string)') selection.add_option( '--reject-title', dest='rejecttitle', metavar='REGEX', - help='Skip download for matching titles (regex or caseless sub-string)') + help='Skip download for matching titles (case-insensitive regex or alphanumeric sub-string)') selection.add_option( '--max-downloads', dest='max_downloads', metavar='NUMBER', type=int, default=None, @@ -801,7 +801,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--postprocessor-args', dest='postprocessor_args', metavar='ARGS', - help='Give these arguments to the postprocessor') + help='Give these arguments to the postprocessor (if postprocessing is required)') postproc.add_option( '-k', '--keep-video', action='store_true', dest='keepvideo', default=False, diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 3990908b6..5e7b6e2df 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -13,8 +13,9 @@ from ..utils import ( encodeFilename, PostProcessingError, prepend_extension, + process_communicate_or_kill, replace_extension, - shell_quote + shell_quote, ) @@ -109,7 +110,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): self._downloader.to_screen('[debug] AtomicParsley command line: %s' % shell_quote(cmd)) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() + stdout, stderr = process_communicate_or_kill(p) if p.returncode != 0: msg = stderr.decode('utf-8', 'replace').strip() diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 9f76c9d4e..8c29c8d59 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -16,6 +16,7 @@ from ..utils import ( is_outdated_version, PostProcessingError, prepend_extension, + process_communicate_or_kill, shell_quote, subtitles_filename, dfxp2srt, @@ -180,7 +181,7 @@ class FFmpegPostProcessor(PostProcessor): handle = subprocess.Popen( cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, stdin=subprocess.PIPE) - stdout_data, stderr_data = handle.communicate() + stdout_data, stderr_data = process_communicate_or_kill(handle) expected_ret = 0 if self.probe_available else 1 if handle.wait() != expected_ret: return None @@ -228,7 +229,7 @@ class FFmpegPostProcessor(PostProcessor): if self._downloader.params.get('verbose', False): self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd)) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - stdout, stderr = p.communicate() + stdout, stderr = process_communicate_or_kill(p) if p.returncode != 0: stderr = stderr.decode('utf-8', 'replace') msgs = stderr.strip().split('\n') diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py index f5c14d974..6cd5bb70f 100644 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -40,6 +40,8 @@ class MetadataFromTitlePP(PostProcessor): % self._titleformat) return [], info for attribute, value in match.groupdict().items(): + if value is None: + continue info[attribute] = value self._downloader.to_screen( '[fromtitle] parsed %s: %s' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e722eed58..4edbfa27b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -33,6 +33,7 @@ import sys import tempfile import time import traceback +import unicodedata import xml.etree.ElementTree import zlib @@ -41,7 +42,9 @@ from .compat import ( compat_HTMLParser, compat_HTTPError, compat_basestring, + compat_casefold, compat_chr, + compat_collections_abc, compat_cookiejar, compat_ctypes_WINFUNCTYPE, compat_etree_fromstring, @@ -52,18 +55,18 @@ from .compat import ( compat_integer_types, compat_kwargs, compat_os_name, - compat_parse_qs, + compat_re_Match, compat_shlex_quote, compat_str, compat_struct_pack, compat_struct_unpack, compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_parse_qs as compat_parse_qs, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urllib_parse_unquote_plus, compat_urllib_request, - compat_urlparse, compat_xpath, ) @@ -78,12 +81,12 @@ def register_socks_protocols(): # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 # URLs with protocols not in urlparse.uses_netloc are not handled correctly for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): - if scheme not in compat_urlparse.uses_netloc: - compat_urlparse.uses_netloc.append(scheme) + if scheme not in compat_urllib_parse.uses_netloc: + compat_urllib_parse.uses_netloc.append(scheme) -# This is not clearly defined otherwise -compiled_regex_type = type(re.compile('')) +# Unfavoured alias +compiled_regex_type = compat_re_Match def random_user_agent(): @@ -1684,6 +1687,7 @@ USER_AGENTS = { NO_DEFAULT = object() +IDENTITY = lambda x: x ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', @@ -1696,6 +1700,17 @@ MONTH_NAMES = { 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], } +# Timezone names for RFC2822 obs-zone +# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42 +TIMEZONE_NAMES = { + 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0, + 'AST': -4, 'ADT': -3, # Atlantic (used in Canada) + 'EST': -5, 'EDT': -4, # Eastern + 'CST': -6, 'CDT': -5, # Central + 'MST': -7, 'MDT': -6, # Mountain + 'PST': -8, 'PDT': -7 # Pacific +} + KNOWN_EXTENSIONS = ( 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', 'flv', 'f4v', 'f4a', 'f4b', @@ -1735,12 +1750,17 @@ DATE_FORMATS = ( '%b %dth %Y %I:%M', '%Y %m %d', '%Y-%m-%d', + '%Y.%m.%d.', '%Y/%m/%d', '%Y/%m/%d %H:%M', '%Y/%m/%d %H:%M:%S', + '%Y%m%d%H%M', + '%Y%m%d%H%M%S', + '%Y%m%d', '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', + '%Y-%m-%d %H:%M:%S:%f', '%d.%m.%Y %H:%M', '%d.%m.%Y %H.%M', '%Y-%m-%dT%H:%M:%SZ', @@ -1753,6 +1773,7 @@ DATE_FORMATS = ( '%b %d %Y at %H:%M:%S', '%B %d %Y at %H:%M', '%B %d %Y at %H:%M:%S', + '%H:%M %d-%b-%Y', ) DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) @@ -1763,6 +1784,7 @@ DATE_FORMATS_DAY_FIRST.extend([ '%d/%m/%Y', '%d/%m/%y', '%d/%m/%Y %H:%M:%S', + '%d-%m-%Y %H:%M', ]) DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) @@ -2100,6 +2122,9 @@ def sanitize_filename(s, restricted=False, is_id=False): return '_' return char + # Replace look-alike Unicode glyphs + if restricted and not is_id: + s = unicodedata.normalize('NFKC', s) # Handle timestamps s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) result = ''.join(map(replace_insane, s)) @@ -2151,7 +2176,7 @@ def sanitize_url(url): for mistake, fixup in COMMON_TYPOS: if re.match(mistake, url): return re.sub(mistake, fixup, url) - return url + return escape_url(url) def sanitized_Request(url, *args, **kwargs): @@ -2212,6 +2237,15 @@ def unescapeHTML(s): r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) +def process_communicate_or_kill(p, *args, **kwargs): + try: + return p.communicate(*args, **kwargs) + except BaseException: # Including KeyboardInterrupt + p.kill() + p.wait() + raise + + def get_subprocess_encoding(): if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: # For subprocess calls, encode with locale encoding @@ -2292,12 +2326,30 @@ def formatSeconds(secs): def make_HTTPS_handler(params, **kwargs): + + # https://www.rfc-editor.org/info/rfc7301 + ALPN_PROTOCOLS = ['http/1.1'] + + def set_alpn_protocols(ctx): + # From https://github.com/yt-dlp/yt-dlp/commit/2c6dcb65fb612fc5bc5c61937bf438d3c473d8d0 + # Thanks @coletdjnz + # Some servers may (wrongly) reject requests if ALPN extension is not sent. See: + # https://github.com/python/cpython/issues/85140 + # https://github.com/yt-dlp/yt-dlp/issues/3878 + try: + ctx.set_alpn_protocols(ALPN_PROTOCOLS) + except (AttributeError, NotImplementedError): + # Python < 2.7.10, not ssl.HAS_ALPN + pass + opts_no_check_certificate = params.get('nocheckcertificate', False) if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) + set_alpn_protocols(context) if opts_no_check_certificate: context.check_hostname = False context.verify_mode = ssl.CERT_NONE + try: return YoutubeDLHTTPSHandler(params, context=context, **kwargs) except TypeError: @@ -2313,6 +2365,7 @@ def make_HTTPS_handler(params, **kwargs): if opts_no_check_certificate else ssl.CERT_REQUIRED) context.set_default_verify_paths() + set_alpn_protocols(context) return YoutubeDLHTTPSHandler(params, context=context, **kwargs) @@ -2673,7 +2726,7 @@ def make_socks_conn_class(base_class, socks_proxy): assert issubclass(base_class, ( compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) - url_components = compat_urlparse.urlparse(socks_proxy) + url_components = compat_urllib_parse.urlparse(socks_proxy) if url_components.scheme.lower() == 'socks5': socks_type = ProxyType.SOCKS5 elif url_components.scheme.lower() in ('socks', 'socks4'): @@ -2938,10 +2991,22 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): def extract_timezone(date_str): m = re.search( - r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', - date_str) + r'''(?x) + ^.{8,}? # >=8 char non-TZ prefix, if present + (?P<tz>Z| # just the UTC Z, or + (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or + (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits + [ ]? # optional space + (?P<sign>\+|-) # +/- + (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm + $) + ''', date_str) if not m: - timezone = datetime.timedelta() + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) + timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip()) + if timezone is not None: + date_str = date_str[:-len(m.group('tz'))] + timezone = datetime.timedelta(hours=timezone or 0) else: date_str = date_str[:-len(m.group('tz'))] if not m.group('sign'): @@ -3009,7 +3074,8 @@ def unified_timestamp(date_str, day_first=True): if date_str is None: return None - date_str = re.sub(r'[,|]', '', date_str) + date_str = re.sub(r'\s+', ' ', re.sub( + r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)) pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 timezone, date_str = extract_timezone(date_str) @@ -3035,7 +3101,7 @@ def unified_timestamp(date_str, day_first=True): pass timetuple = email.utils.parsedate_tz(date_str) if timetuple: - return calendar.timegm(timetuple) + pm_delta * 3600 + return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds() def determine_ext(url, default_ext='unknown_video'): @@ -3608,7 +3674,7 @@ def remove_quotes(s): def url_basename(url): - path = compat_urlparse.urlparse(url).path + path = compat_urllib_parse.urlparse(url).path return path.strip('/').split('/')[-1] @@ -3628,7 +3694,7 @@ def urljoin(base, path): if not isinstance(base, compat_str) or not re.match( r'^(?:https?:)?//', base): return None - return compat_urlparse.urljoin(base, path) + return compat_urllib_parse.urljoin(base, path) class HEADRequest(compat_urllib_request.Request): @@ -3645,13 +3711,11 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr: if v is not None: v = getattr(v, get_attr, None) - if v == '': - v = None - if v is None: + if v in (None, ''): return default try: return int(v) * invscale // scale - except (ValueError, TypeError): + except (ValueError, TypeError, OverflowError): return default @@ -3769,7 +3833,8 @@ def check_executable(exe, args=[]): """ Checks if the given binary is installed somewhere in PATH, and returns its name. args can be a list of arguments for a short output (like -version) """ try: - subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() + process_communicate_or_kill(subprocess.Popen( + [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)) except OSError: return False return exe @@ -3783,10 +3848,10 @@ def get_exe_version(exe, args=['--version'], # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers # SIGTTOU if youtube-dl is run in the background. # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 - out, _ = subprocess.Popen( + out, _ = process_communicate_or_kill(subprocess.Popen( [encodeArgument(exe)] + args, stdin=subprocess.PIPE, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() + stdout=subprocess.PIPE, stderr=subprocess.STDOUT)) except OSError: return False if isinstance(out, bytes): # Python 2.x @@ -3805,6 +3870,105 @@ def detect_exe_version(output, version_re=None, unrecognized='present'): return unrecognized +class LazyList(compat_collections_abc.Sequence): + """Lazy immutable list from an iterable + Note that slices of a LazyList are lists and not LazyList""" + + class IndexError(IndexError): + def __init__(self, cause=None): + if cause: + # reproduce `raise from` + self.__cause__ = cause + super(IndexError, self).__init__() + + def __init__(self, iterable, **kwargs): + # kwarg-only + reverse = kwargs.get('reverse', False) + _cache = kwargs.get('_cache') + + self._iterable = iter(iterable) + self._cache = [] if _cache is None else _cache + self._reversed = reverse + + def __iter__(self): + if self._reversed: + # We need to consume the entire iterable to iterate in reverse + for item in self.exhaust(): + yield item + return + for item in self._cache: + yield item + for item in self._iterable: + self._cache.append(item) + yield item + + def _exhaust(self): + self._cache.extend(self._iterable) + self._iterable = [] # Discard the emptied iterable to make it pickle-able + return self._cache + + def exhaust(self): + """Evaluate the entire iterable""" + return self._exhaust()[::-1 if self._reversed else 1] + + @staticmethod + def _reverse_index(x): + return None if x is None else ~x + + def __getitem__(self, idx): + if isinstance(idx, slice): + if self._reversed: + idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1)) + start, stop, step = idx.start, idx.stop, idx.step or 1 + elif isinstance(idx, int): + if self._reversed: + idx = self._reverse_index(idx) + start, stop, step = idx, idx, 0 + else: + raise TypeError('indices must be integers or slices') + if ((start or 0) < 0 or (stop or 0) < 0 + or (start is None and step < 0) + or (stop is None and step > 0)): + # We need to consume the entire iterable to be able to slice from the end + # Obviously, never use this with infinite iterables + self._exhaust() + try: + return self._cache[idx] + except IndexError as e: + raise self.IndexError(e) + n = max(start or 0, stop or 0) - len(self._cache) + 1 + if n > 0: + self._cache.extend(itertools.islice(self._iterable, n)) + try: + return self._cache[idx] + except IndexError as e: + raise self.IndexError(e) + + def __bool__(self): + try: + self[-1] if self._reversed else self[0] + except self.IndexError: + return False + return True + + def __len__(self): + self._exhaust() + return len(self._cache) + + def __reversed__(self): + return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache) + + def __copy__(self): + return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache) + + def __repr__(self): + # repr and str should mimic a list. So we exhaust the iterable + return repr(self.exhaust()) + + def __str__(self): + return repr(self.exhaust()) + + class PagedList(object): def __len__(self): # This is only useful for tests @@ -3912,7 +4076,8 @@ def escape_rfc3986(s): """Escape non-ASCII characters as suggested by RFC 3986""" if sys.version_info < (3, 0) and isinstance(s, compat_str): s = s.encode('utf-8') - return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") + # ensure unicode: after quoting, it can always be converted + return compat_str(compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")) def escape_url(url): @@ -3927,6 +4092,10 @@ def escape_url(url): ).geturl() +def parse_qs(url): + return compat_parse_qs(compat_urllib_parse.urlparse(url).query) + + def read_batch_urls(batch_fd): def fixup(url): if not isinstance(url, compat_str): @@ -3947,14 +4116,28 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') +def update_url(url, **kwargs): + """Replace URL components specified by kwargs + url: compat_str or parsed URL tuple + if query_update is in kwargs, update query with + its value instead of replacing (overrides any `query`) + returns: compat_str + """ + if not kwargs: + return compat_urllib_parse.urlunparse(url) if isinstance(url, tuple) else url + if not isinstance(url, tuple): + url = compat_urllib_parse.urlparse(url) + query = kwargs.pop('query_update', None) + if query: + qs = compat_parse_qs(url.query) + qs.update(query) + kwargs['query'] = compat_urllib_parse_urlencode(qs, True) + kwargs = compat_kwargs(kwargs) + return compat_urllib_parse.urlunparse(url._replace(**kwargs)) + + def update_url_query(url, query): - if not query: - return url - parsed_url = compat_urlparse.urlparse(url) - qs = compat_parse_qs(parsed_url.query) - qs.update(query) - return compat_urlparse.urlunparse(parsed_url._replace( - query=compat_urllib_parse_urlencode(qs, True))) + return update_url(url, query_update=query) def update_Request(req, url=None, data=None, headers={}, query={}): @@ -4029,6 +4212,10 @@ def multipart_encode(data, boundary=None): return out, content_type +def variadic(x, allowed_types=(compat_str, bytes, dict)): + return x if isinstance(x, compat_collections_abc.Iterable) and not isinstance(x, allowed_types) else (x,) + + def dict_get(d, key_or_keys, default=None, skip_false_values=True): if isinstance(key_or_keys, (list, tuple)): for key in key_or_keys: @@ -4039,6 +4226,23 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True): return d.get(key_or_keys, default) +def try_call(*funcs, **kwargs): + + # parameter defaults + expected_type = kwargs.get('expected_type') + fargs = kwargs.get('args', []) + fkwargs = kwargs.get('kwargs', {}) + + for f in funcs: + try: + val = f(*fargs, **fkwargs) + except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError): + pass + else: + if expected_type is None or isinstance(val, expected_type): + return val + + def try_get(src, getter, expected_type=None): if not isinstance(getter, (list, tuple)): getter = [getter] @@ -5401,7 +5605,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): if proxy == '__noproxy__': return None # No Proxy - if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): + if compat_urllib_parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): req.add_header('Ytdl-socks-proxy', proxy) # youtube-dl's http/https handlers do wrapping the socket with socks return None @@ -5725,7 +5929,7 @@ def write_xattr(path, key, value): cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) except EnvironmentError as e: raise XAttrMetadataError(e.errno, e.strerror) - stdout, stderr = p.communicate() + stdout, stderr = process_communicate_or_kill(p) stderr = stderr.decode('utf-8', 'replace') if p.returncode != 0: raise XAttrMetadataError(p.returncode, stderr) @@ -5772,3 +5976,212 @@ def clean_podcast_url(url): st\.fm # https://podsights.com/docs/ )/e )/''', '', url) + + +def traverse_obj(obj, *paths, **kwargs): + """ + Safely traverse nested `dict`s and `Sequence`s + + >>> obj = [{}, {"key": "value"}] + >>> traverse_obj(obj, (1, "key")) + "value" + + Each of the provided `paths` is tested and the first producing a valid result will be returned. + The next path will also be tested if the path branched but no results could be found. + Supported values for traversal are `Mapping`, `Sequence` and `re.Match`. + A value of None is treated as the absence of a value. + + The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. + + The keys in the path can be one of: + - `None`: Return the current object. + - `str`/`int`: Return `obj[key]`. For `re.Match, return `obj.group(key)`. + - `slice`: Branch out and return all values in `obj[key]`. + - `Ellipsis`: Branch out and return a list of all values. + - `tuple`/`list`: Branch out and return a list of all matching values. + Read as: `[traverse_obj(obj, branch) for branch in branches]`. + - `function`: Branch out and return values filtered by the function. + Read as: `[value for key, value in obj if function(key, value)]`. + For `Sequence`s, `key` is the index of the value. + - `dict` Transform the current object and return a matching dict. + Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. + + `tuple`, `list`, and `dict` all support nested paths and branches. + + @params paths Paths which to traverse by. + Keyword arguments: + @param default Value to return if the paths do not match. + @param expected_type If a `type`, only accept final values of this type. + If any other callable, try to call the function on each result. + @param get_all If `False`, return the first matching result, otherwise all matching ones. + @param casesense If `False`, consider string dictionary keys as case insensitive. + + The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API + + @param _is_user_input Whether the keys are generated from user input. + If `True` strings get converted to `int`/`slice` if needed. + @param _traverse_string Whether to traverse into objects as strings. + If `True`, any non-compatible object will first be + converted into a string and then traversed into. + + + @returns The result of the object traversal. + If successful, `get_all=True`, and the path branches at least once, + then a list of results is returned instead. + A list is always returned if the last path branches and no `default` is given. + """ + + # parameter defaults + default = kwargs.get('default', NO_DEFAULT) + expected_type = kwargs.get('expected_type') + get_all = kwargs.get('get_all', True) + casesense = kwargs.get('casesense', True) + _is_user_input = kwargs.get('_is_user_input', False) + _traverse_string = kwargs.get('_traverse_string', False) + + # instant compat + str = compat_str + + is_sequence = lambda x: isinstance(x, compat_collections_abc.Sequence) and not isinstance(x, (str, bytes)) + casefold = lambda k: compat_casefold(k) if isinstance(k, str) else k + + if isinstance(expected_type, type): + type_test = lambda val: val if isinstance(val, expected_type) else None + else: + type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) + + def from_iterable(iterables): + # chain.from_iterable(['ABC', 'DEF']) --> A B C D E F + for it in iterables: + for item in it: + yield item + + def apply_key(key, obj): + if obj is None: + return + + elif key is None: + yield obj + + elif isinstance(key, (list, tuple)): + for branch in key: + _, result = apply_path(obj, branch) + for item in result: + yield item + + elif key is Ellipsis: + result = [] + if isinstance(obj, compat_collections_abc.Mapping): + result = obj.values() + elif is_sequence(obj): + result = obj + elif isinstance(obj, compat_re_Match): + result = obj.groups() + elif _traverse_string: + result = str(obj) + for item in result: + yield item + + elif callable(key): + if is_sequence(obj): + iter_obj = enumerate(obj) + elif isinstance(obj, compat_collections_abc.Mapping): + iter_obj = obj.items() + elif isinstance(obj, compat_re_Match): + iter_obj = enumerate(itertools.chain([obj.group()], obj.groups())) + elif _traverse_string: + iter_obj = enumerate(str(obj)) + else: + return + for item in (v for k, v in iter_obj if try_call(key, args=(k, v))): + yield item + + elif isinstance(key, dict): + iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items()) + yield dict((k, v if v is not None else default) for k, v in iter_obj + if v is not None or default is not NO_DEFAULT) + + elif isinstance(obj, compat_collections_abc.Mapping): + yield (obj.get(key) if casesense or (key in obj) + else next((v for k, v in obj.items() if casefold(k) == key), None)) + + elif isinstance(obj, compat_re_Match): + if isinstance(key, int) or casesense: + try: + yield obj.group(key) + return + except IndexError: + pass + if not isinstance(key, str): + return + + yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) + + else: + if _is_user_input: + key = (int_or_none(key) if ':' not in key + else slice(*map(int_or_none, key.split(':')))) + + if not isinstance(key, (int, slice)): + return + + if not is_sequence(obj): + if not _traverse_string: + return + obj = str(obj) + + try: + yield obj[key] + except IndexError: + pass + + def apply_path(start_obj, path): + objs = (start_obj,) + has_branched = False + + for key in variadic(path): + if _is_user_input and key == ':': + key = Ellipsis + + if not casesense and isinstance(key, str): + key = compat_casefold(key) + + if key is Ellipsis or isinstance(key, (list, tuple)) or callable(key): + has_branched = True + + key_func = functools.partial(apply_key, key) + objs = from_iterable(map(key_func, objs)) + + return has_branched, objs + + def _traverse_obj(obj, path, use_list=True): + has_branched, results = apply_path(obj, path) + results = LazyList(x for x in map(type_test, results) if x is not None) + + if get_all and has_branched: + return results.exhaust() if results or use_list else None + + return results[0] if results else None + + for index, path in enumerate(paths, 1): + use_list = default is NO_DEFAULT and index == len(paths) + result = _traverse_obj(obj, path, use_list) + if result is not None: + return result + + return None if default is NO_DEFAULT else default + + +def get_first(obj, keys, **kwargs): + return traverse_obj(obj, (Ellipsis,) + tuple(variadic(keys)), get_all=False, **kwargs) + + +def join_nonempty(*values, **kwargs): + + # parameter defaults + delim = kwargs.get('delim', '-') + from_dict = kwargs.get('from_dict') + + if from_dict is not None: + values = (traverse_obj(from_dict, variadic(v)) for v in values) + return delim.join(map(compat_str, filter(None, values))) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 461dd87ca..b82fbc702 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.06.06' +__version__ = '2021.12.17'