Merge a2fd63ce22280cef3856d39d795ec91303152ff6 into 4e714f9df1ed2cccd51df60d45ff5504abe827b7

[Misc] Correct [_]IE_DESC/NAME in a few IEs
* thx seproDev, yt-dlp/yt-dlp/pull/12694/commits/ae69e3c
* also add documenting comment in `InfoExtractor`
2025-07-06 11:34:14 +09:00 · 2025-03-27 10:14:43 +02:00 · 2025-03-26 12:47:19 +00:00 · 2025-03-26 12:17:49 +00:00 · 2021-11-12 09:35:50 +02:00 · 2021-11-12 09:35:50 +02:00
6 changed files with 107 additions and 33 deletions
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -187,10 +187,10 @@ class TestInfoExtractor(unittest.TestCase):
        self.assertEqual(search['track']['id'], 'testid')

    def test_search_json_ld_realworld(self):
-        # https://github.com/ytdl-org/youtube-dl/issues/23306
-        expect_dict(
-            self,
-            self.ie._search_json_ld(r'''<script type="application/ld+json">
+        _TESTS = [
+            # https://github.com/ytdl-org/youtube-dl/issues/23306
+            (
+                r'''<script type="application/ld+json">
 {
 "@context": "http://schema.org/",
 "@type": "VideoObject",
@@ -223,17 +223,86 @@ class TestInfoExtractor(unittest.TestCase):
 "name": "Kleio Valentien",
 "url": "https://www.eporner.com/pornstar/kleio-valentien/"
 }]}
-</script>''', None),
-            {
-                'title': '1 On 1 With Kleio',
-                'description': 'Kleio Valentien',
-                'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
-                'timestamp': 1449347075,
-                'duration': 743.0,
-                'view_count': 1120958,
-                'width': 1920,
-                'height': 1080,
-            })
+                </script>''',
+                {
+                    'title': '1 On 1 With Kleio',
+                    'description': 'Kleio Valentien',
+                    'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
+                    'timestamp': 1449347075,
+                    'duration': 743.0,
+                    'view_count': 1120958,
+                    'width': 1920,
+                    'height': 1080,
+                },
+                {},
+            ),
+            (
+                r'''<script type="application/ld+json">
+      {
+      "@context": "https://schema.org",
+      "@graph": [
+      {
+      "@type": "NewsArticle",
+      "mainEntityOfPage": {
+      "@type": "WebPage",
+      "@id": "https://www.ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn"
+      },
+      "headline": "Συμμορία ανηλίκων – δικηγόρος θυμάτων: ήθελαν να τους αποτελειώσουν",
+      "name": "Συμμορία ανηλίκων – δικηγόρος θυμάτων: ήθελαν να τους αποτελειώσουν",
+      "description": "Τα παιδιά δέχθηκαν την επίθεση επειδή αρνήθηκαν να γίνουν μέλη της συμμορίας, ανέφερε ο Γ. Ζαχαρόπουλος.",
+      "image": {
+      "@type": "ImageObject",
+      "url": "https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg",
+      "width": 1100,
+      "height": 756            },
+      "datePublished": "2021-11-10T08:50:00+03:00",
+      "dateModified": "2021-11-10T08:52:53+03:00",
+      "author": {
+      "@type": "Person",
+      "@id": "https://www.ant1news.gr/",
+      "name": "Ant1news",
+      "image": "https://www.ant1news.gr/images/logo-e5d7e4b3e714c88e8d2eca96130142f6.png",
+      "url": "https://www.ant1news.gr/"
+      },
+      "publisher": {
+      "@type": "Organization",
+      "@id": "https://www.ant1news.gr#publisher",
+      "name": "Ant1news",
+      "url": "https://www.ant1news.gr",
+      "logo": {
+      "@type": "ImageObject",
+      "url": "https://www.ant1news.gr/images/logo-e5d7e4b3e714c88e8d2eca96130142f6.png",
+      "width": 400,
+      "height": 400                },
+      "sameAs": [
+      "https://www.facebook.com/Ant1news.gr",
+      "https://twitter.com/antennanews",
+      "https://www.youtube.com/channel/UC0smvAbfczoN75dP0Hw4Pzw",
+      "https://www.instagram.com/ant1news/"
+      ]
+      },
+
+      "keywords": "μαχαίρωμα,συμμορία ανηλίκων,ΕΙΔΗΣΕΙΣ,ΕΙΔΗΣΕΙΣ ΣΗΜΕΡΑ,ΝΕΑ,Κοινωνία - Ant1news",
+
+
+      "articleSection": "Κοινωνία"
+      }
+      ]
+      }
+                </script>''',
+                {
+                    'timestamp': 1636523400,
+                    'title': 'md5:91fe569e952e4d146485740ae927662b',
+                },
+                {'expected_type': 'NewsArticle'},
+            ),
+        ]
+        for html, expected_dict, search_json_ld_kwargs in _TESTS:
+            expect_dict(
+                self,
+                self.ie._search_json_ld(html, None, **search_json_ld_kwargs),
+                expected_dict
+            )

    def test_download_json(self):
        uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
--- a/youtube_dl/extractor/bokecc.py
+++ b/youtube_dl/extractor/bokecc.py
@@ -32,7 +32,7 @@ class BokeCCBaseIE(InfoExtractor):


 class BokeCCIE(BokeCCBaseIE):
-    _IE_DESC = 'CC视频'
+    IE_DESC = 'CC视频'
    _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'

    _TESTS = [{
--- a/youtube_dl/extractor/cloudy.py
+++ b/youtube_dl/extractor/cloudy.py
@@ -9,7 +9,7 @@ from ..utils import (


 class CloudyIE(InfoExtractor):
-    _IE_DESC = 'cloudy.ec'
+    IE_DESC = 'cloudy.ec'
    _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)'
    _TESTS = [{
        'url': 'https://www.cloudy.ec/v/af511e2527aac',
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -422,6 +422,8 @@ class InfoExtractor(object):
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
+    # supply this in public subclasses: used in supported sites list, etc
+    # IE_DESC = 'short description of IE'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
@@ -1438,8 +1440,16 @@ class InfoExtractor(object):
            })
            extract_interaction_statistic(e)

-        for e in json_ld:
-            if '@context' in e:
+        def traverse_json_ld(json_ld, info, at_top_level=True):
+            for e in json_ld:
+                if at_top_level and '@context' not in e:
+                    continue
+                if at_top_level and all(k in ('@context', '@graph') for k in e):
+                    graph = e['@graph']
+                    if isinstance(graph, dict):
+                        graph = [graph]
+                    traverse_json_ld(graph, info, at_top_level=False)
+                    break
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
@@ -1472,7 +1482,7 @@ class InfoExtractor(object):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
-                        'description': unescapeHTML(e.get('articleBody')),
+                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
@@ -1487,6 +1497,8 @@ class InfoExtractor(object):
                    continue
                else:
                    break
+        traverse_json_ld(json_ld, info)
+
        return dict((k, v) for k, v in info.items() if v is not None)

    def _search_nextjs_data(self, webpage, video_id, **kw):
--- a/youtube_dl/extractor/itv.py
+++ b/youtube_dl/extractor/itv.py
@@ -35,15 +35,6 @@ from ..utils import (

 class ITVBaseIE(InfoExtractor):

-    def _search_nextjs_data(self, webpage, video_id, **kw):
-        transform_source = kw.pop('transform_source', None)
-        fatal = kw.pop('fatal', True)
-        return self._parse_json(
-            self._search_regex(
-                r'''<script\b[^>]+\bid=('|")__NEXT_DATA__\1[^>]*>(?P<js>[^<]+)</script>''',
-                webpage, 'next.js data', group='js', fatal=fatal, **kw),
-            video_id, transform_source=transform_source, fatal=fatal)
-
    def __handle_request_webpage_error(self, err, video_id=None, errnote=None, fatal=True):
        if errnote is False:
            return False
@@ -109,7 +100,9 @@ class ITVBaseIE(InfoExtractor):

 class ITVIE(ITVBaseIE):
    _VALID_URL = r'https?://(?:www\.)?itv\.com/(?:(?P<w>watch)|hub)/[^/]+/(?(w)[\w-]+/)(?P<id>\w+)'
-    _IE_DESC = 'ITVX'
+    IE_DESC = 'ITVX'
+    _WORKING = False
+
    _TESTS = [{
        'note': 'Hub URLs redirect to ITVX',
        'url': 'https://www.itv.com/hub/liar/2a4547a0012',
@@ -270,7 +263,7 @@ class ITVIE(ITVBaseIE):
                'ext': determine_ext(href, 'vtt'),
            })

-        next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default='{}')
+        next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default={})
        video_data.update(traverse_obj(next_data, ('props', 'pageProps', ('title', 'episode')), expected_type=dict)[0] or {})
        title = traverse_obj(video_data, 'headerTitle', 'episodeTitle')
        info = self._og_extract(webpage, require_title=not title)
@@ -323,7 +316,7 @@ class ITVIE(ITVBaseIE):

 class ITVBTCCIE(ITVBaseIE):
    _VALID_URL = r'https?://(?:www\.)?itv\.com/(?!(?:watch|hub)/)(?:[^/]+/)+(?P<id>[^/?#&]+)'
-    _IE_DESC = 'ITV articles: News, British Touring Car Championship'
+    IE_DESC = 'ITV articles: News, British Touring Car Championship'
    _TESTS = [{
        'note': 'British Touring Car Championship',
        'url': 'https://www.itv.com/btcc/articles/btcc-2018-all-the-action-from-brands-hatch',
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@@ -47,7 +47,7 @@ class SenateISVPIE(InfoExtractor):
        ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'],
        ['arch', '', 'http://ussenate-f.akamaihd.net/']
    ]
-    _IE_NAME = 'senate.gov'
+    IE_NAME = 'senate.gov'
    _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
    _TESTS = [{
        'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
Author	SHA1	Message	Date
Zenon Mousmoulas	05b755ef35	Merge a2fd63ce22280cef3856d39d795ec91303152ff6 into 4e714f9df1ed2cccd51df60d45ff5504abe827b7	2025-03-27 10:14:43 +02:00
dirkf	4e714f9df1	[Misc] Correct [_]IE_DESC/NAME in a few IEs * thx seproDev, yt-dlp/yt-dlp/pull/12694/commits/ae69e3c * also add documenting comment in `InfoExtractor`	2025-03-26 12:47:19 +00:00
dirkf	c1ea7f5a24	[ITV] Mark ITVX not working * update old shim * correct [_]IE_DESC	2025-03-26 12:17:49 +00:00
Zenon Mousmoulas	a2fd63ce22	JSON-LD: Tweak (News)Article description extraction. Let JSON-LD extract the description from articleBody, falling back to the description field, when processing (News)Article typed nodes.	2021-11-12 09:35:50 +02:00
Zenon Mousmoulas	d6469de1da	Extend TestInfoExtractor.test_search_json_ld_realworld to cover @graph expressing JSON-LD implicit default graph * Refactor tests in a list of 3-tuples: test html string, expected dict, keyword args for InfoExtractor._search_json_ld * Adapt test code accordingly * Add test for @graph expressing JSON-LD implicit default graph	2021-11-12 09:35:50 +02:00
Zenon Mousmoulas	77e8f5353c	JSON-LD: Support top-level @graph expressing implicit default graph. Per W3C JSON-LD v1.1 §4.9 (non-normative ref): When a JSON-LD document's top-level structure is a map that contains no other keys than @graph and optionally @context (properties that are not mapped to an IRI or a keyword are ignored), @graph is considered to express the otherwise implicit default graph. Support such a structure in InfoExtractor._json_ld parsing: wrap the control flow block in a function, which is called recursively upon such a structure.	2021-11-12 09:30:17 +02:00