[NHK] Support alphabetic characters in 7-char NhkVod IDs (#29682)

[doc] Clarify test naming
[streamcz] Remove empty '{}'.format() for Py2.6
2025-07-13 23:14:13 +09:00 · 2022-05-09 18:54:41 +01:00 · 2022-04-29 16:56:00 +01:00 · 2022-04-29 13:36:02 +01:00 · 2022-04-28 10:18:10 +01:00 · 2022-04-15 16:07:09 +01:00
7 changed files with 59 additions and 10 deletions
--- a/README.md
+++ b/README.md
@ -1069,9 +1069,11 @@ After you have ensured this site is distributing its content legally, you can fo
            }
    ```
 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
-6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in.
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test (actually, test case) then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note:
-7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want.
+    * the test names use the extractor class name **without the trailing `IE`**
-8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart):
+    * tests with `only_matching` key in test's dict are not counted.
 8. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want.
 9. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart):
        $ flake8 youtube_dl/extractor/yourextractor.py
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -557,6 +557,7 @@ from .kinja import KinjaEmbedIE
 from .kinopoisk import KinoPoiskIE
 from .konserthusetplay import KonserthusetPlayIE
 from .krasview import KrasViewIE
 from .kth import KTHIE
 from .ku6 import Ku6IE
 from .kusi import KUSIIE
 from .kuwo import (
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@ -373,5 +373,5 @@ class KalturaIE(InfoExtractor):
            'duration': info.get('duration'),
            'timestamp': info.get('createdAt'),
            'uploader_id': info.get('userId') if info.get('userId') != 'None' else None,
-            'view_count': info.get('plays'),
+            'view_count': int_or_none(info.get('plays')),
        }
--- a/youtube_dl/extractor/kth.py
+++ b/youtube_dl/extractor/kth.py
@ -0,0 +1,31 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import smuggle_url
 class KTHIE(InfoExtractor):
    # Extractor for pages on play.kth.se. It does no media extraction itself:
    # it only maps the page URL to a `kaltura:` URL and hands that over to
    # the 'Kaltura' extractor.
    _VALID_URL = r'https?://play\.kth\.se/(?:[^/]+/)+(?P<id>[a-z0-9_]+)'
    _TEST = {
        'url': 'https://play.kth.se/media/Lunch+breakA+De+nya+aff%C3%A4rerna+inom+Fordonsdalen/0_uoop6oz9',
        'md5': 'd83ada6d00ca98b73243a88efe19e8a6',
        'info_dict': {
            'id': '0_uoop6oz9',
            'ext': 'mp4',
            'title': 'md5:bd1d6931facb6828762a33e6ce865f37',
            'thumbnail': 're:https?://.+/thumbnail/.+',
            'duration': 3516,
            'timestamp': 1647345358,
            'upload_date': '20220315',
            'uploader_id': 'md5:0ec23e33a89e795a4512930c8102509f',
        },
    }
    def _real_extract(self, url):
        # The `id` group of _VALID_URL is used as the entry ID in the kaltura: URL.
        entry_id = self._match_id(url)
        # The service URL is smuggled alongside the kaltura: URL.
        # NOTE(review): 308 is presumably the Kaltura partner ID - confirm.
        kaltura_url = smuggle_url(
            'kaltura:308:%s' % entry_id,
            {'service_url': 'https://api.kaltura.nordu.net'})
        return self.url_result(kaltura_url, 'Kaltura')
--- a/youtube_dl/extractor/nhk.py
+++ b/youtube_dl/extractor/nhk.py
@ -1,3 +1,4 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
@ -23,7 +24,7 @@ class NhkBaseIE(InfoExtractor):
    def _extract_episode_info(self, url, episode=None):
        fetch_episode = episode is None
        lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups()
-        if episode_id.isdigit():
+        if len(episode_id) == 7:
            episode_id = episode_id[:4] + '-' + episode_id[4:]
        is_video = m_type == 'video'
@ -84,7 +85,8 @@ class NhkBaseIE(InfoExtractor):
 class NhkVodIE(NhkBaseIE):
-    _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
+    # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], e.g. 9999a34
    _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
    # Content available only for a limited period of time. Visit
    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
    _TESTS = [{
@ -124,6 +126,19 @@ class NhkVodIE(NhkBaseIE):
    }, {
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
        'only_matching': True,
    }, {
        # video, alphabetic character in ID #29670
        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
        'only_matching': True,
        'info_dict': {
            'id': 'qfjay6cg',
            'ext': 'mp4',
            'title': 'DESIGN TALKS plus - Fishermen’s Finery',
            'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
            'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
            'upload_date': '20210615',
            'timestamp': 1623722008,
        }
    }]
    def _real_extract(self, url):
--- a/youtube_dl/extractor/streamcz.py
+++ b/youtube_dl/extractor/streamcz.py
@ -62,7 +62,7 @@ class StreamCZIE(InfoExtractor):
                if not stream.get('url'):
                    continue
                yield merge_dicts({
-                    'format_id': '{}-{}'.format(format_id, ext),
+                    'format_id': '-'.join((format_id, ext)),
                    'ext': ext,
                    'source_preference': pref,
                    'url': urljoin(spl_url, stream['url']),
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -1464,15 +1464,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
    # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116
    # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377
    def _extract_n_function_name(self, jscode):
        # Find the name of the player's "n" transform function in jscode.
        # The name is read from a call of the form `.get("n"))&&(b=NAME(x)` or
        # `.get("n"))&&(b=NAME[IDX](x)`. Without an index, NAME is returned
        # as is; with one, NAME is looked up as a `var NAME = [...]` array and
        # its IDX-th element is returned.
        # `target` matches the name plus an optional `[IDX]`; the added (+)
        # pattern accepts any JS-identifier-like name (including `_` and `$`)
        # where the removed (-) one required exactly three characters.
-        target = r'(?P<nfunc>[a-zA-Z0-9$]{3})(?:\[(?P<idx>\d+)\])?'
+        target = r'(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\[(?P<idx>\d+)\])?'
        nfunc_and_idx = self._search_regex(
-            r'\.get\("n"\)\)&&\(b=(%s)\([a-zA-Z0-9]\)' % (target, ),
+            r'\.get\("n"\)\)&&\(b=(%s)\([\w$]+\)' % (target, ),
            jscode, 'Initial JS player n function name')
        # Re-match the captured text to split it into the name and the optional index.
        nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx')
        if not idx:
            return nfunc
        # Indexed form: locate the array literal assigned to `var <nfunc>`,
        # convert it with js_to_json, parse it, and pick element int(idx).
        # The added (+) line passes nfunc through re.escape() because the
        # name may contain `$`, which is a regex metacharacter.
        return self._parse_json(self._search_regex(
-            r'var %s\s*=\s*(\[.+?\]);' % (nfunc, ), jscode,
+            r'var %s\s*=\s*(\[.+?\]);' % (re.escape(nfunc), ), jscode,
            'Initial JS player n function list ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)]
    def _extract_n_function(self, video_id, player_url):
Author	SHA1	Message	Date
dirkf	c7965b9fc2	[NHK] Support alphabetic characters in 7-char NhkVod IDs (#29682 )	2022-05-09 18:54:41 +01:00
dirkf	e988fa4523	[doc] Clarify test naming	2022-04-29 16:56:00 +01:00
dirkf	e27d8d819f	[streamcz] Remove empty `'{}'.format()` for Py2.6 Use `'-'.join()` here, or `{0}`, ..., in general.	2022-04-29 13:36:02 +01:00
Árni Dagur	ebc627847c	[KTH] Add new extractor for KTH play (#30885 ) * Implement extractor for KTH play * Make KTH Play url regex more relaxed	2022-04-28 10:18:10 +01:00
dirkf	a0068bd6be	[Youtube] Fix "n" descrambling for player fae06c11 Resolves #30856.	2022-04-15 16:07:09 +01:00