From 3bffd20f3391d5c18e5af141b204f800d893a27c Mon Sep 17 00:00:00 2001
From: Daenges
Date: Mon, 16 May 2022 21:28:19 +0200
Subject: [PATCH 1/8] Add extractor for MegaCartoons
---
youtube_dl/extractor/extractors.py | 1 +
youtube_dl/extractor/megacartoons.py | 45 ++++++++++++++++++++++++++++
2 files changed, 46 insertions(+)
create mode 100644 youtube_dl/extractor/megacartoons.py
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 452caeade..072ef38ac 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -690,6 +690,7 @@ from .mixcloud import (
MixcloudUserIE,
MixcloudPlaylistIE,
)
+from .megacartoons import MegaCartoonsIE
from .mlb import (
MLBIE,
MLBVideoIE,
diff --git a/youtube_dl/extractor/megacartoons.py b/youtube_dl/extractor/megacartoons.py
new file mode 100644
index 000000000..999594e1e
--- /dev/null
+++ b/youtube_dl/extractor/megacartoons.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+
+
+class MegaCartoonsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?megacartoons\.net/(?P<id>[a-z-]+)/'
+ _TEST = {
+ 'url': 'https://www.megacartoons.net/help-wanted/',
+ 'md5': '4ba9be574f9a17abe0c074e2f955fded',
+ 'info_dict': {
+ 'id': 'help-wanted',
+ 'title': 'help-wanted',
+ 'ext': 'mp4',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'description': 'Help Wanted: Encouraged by his best friend, Patrick Starfish, SpongeBob overcomes his fears and finally applies for that dream job as a fry cook at the Krusty Krab. Challenged by the owner, Mr. Krabs, and his assistant Squidward, to prove himself worthy of the job, SpongeBob rises to the occasion, with the help of one very special spatula, by feeding a sea of ravenous anchovies.'
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ # The id is equal to the title
+ title = video_id
+ # Video and thumbnail are
+ url_json = json.loads(self._html_search_regex(r'{.*})".*>', webpage, 'videourls'))
+
+ video_url = url_json['sources'][0]['src']
+ video_type = url_json['sources'][0]['type']
+ video_thumbnail = url_json['splash']
+
+ video_description = self._html_search_regex(r'(?P.*)
', webpage, 'videodescription')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'format': video_type,
+ 'url': video_url,
+ 'thumbnail': video_thumbnail,
+ 'description': video_description,
+ }
From 2b128c729ede8353fe4583651008a56f3b308ac4 Mon Sep 17 00:00:00 2001
From: Daenges
Date: Mon, 16 May 2022 21:37:35 +0200
Subject: [PATCH 2/8] Add further comments
---
youtube_dl/extractor/megacartoons.py | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/youtube_dl/extractor/megacartoons.py b/youtube_dl/extractor/megacartoons.py
index 999594e1e..c2789f4e9 100644
--- a/youtube_dl/extractor/megacartoons.py
+++ b/youtube_dl/extractor/megacartoons.py
@@ -21,18 +21,21 @@ class MegaCartoonsIE(InfoExtractor):
}
def _real_extract(self, url):
+ # ID is equal to the episode name
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
# The id is equal to the title
title = video_id
- # Video and thumbnail are
+
+ # Video data is stored in a json -> extract it from the raw html
url_json = json.loads(self._html_search_regex(r'{.*})".*>', webpage, 'videourls'))
- video_url = url_json['sources'][0]['src']
- video_type = url_json['sources'][0]['type']
- video_thumbnail = url_json['splash']
+ video_url = url_json['sources'][0]['src'] # Get the video url
+ video_type = url_json['sources'][0]['type'] # Get the video type -> 'video/mp4'
+ video_thumbnail = url_json['splash'] # Get the thumbnail
+ # Every video has a short summary -> save it as description
video_description = self._html_search_regex(r'(?P.*)
', webpage, 'videodescription')
return {
From 041abb92b3ca37ce477e4d4b4f4e6d33e6263338 Mon Sep 17 00:00:00 2001
From: Daenges
Date: Mon, 16 May 2022 22:32:47 +0200
Subject: [PATCH 3/8] Apply coding conventions
---
youtube_dl/extractor/megacartoons.py | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/youtube_dl/extractor/megacartoons.py b/youtube_dl/extractor/megacartoons.py
index c2789f4e9..5218c034e 100644
--- a/youtube_dl/extractor/megacartoons.py
+++ b/youtube_dl/extractor/megacartoons.py
@@ -13,10 +13,15 @@ class MegaCartoonsIE(InfoExtractor):
'md5': '4ba9be574f9a17abe0c074e2f955fded',
'info_dict': {
'id': 'help-wanted',
- 'title': 'help-wanted',
+ 'title': 'Help Wanted',
'ext': 'mp4',
'thumbnail': r're:^https?://.*\.jpg$',
- 'description': 'Help Wanted: Encouraged by his best friend, Patrick Starfish, SpongeBob overcomes his fears and finally applies for that dream job as a fry cook at the Krusty Krab. Challenged by the owner, Mr. Krabs, and his assistant Squidward, to prove himself worthy of the job, SpongeBob rises to the occasion, with the help of one very special spatula, by feeding a sea of ravenous anchovies.'
+ 'description': 'Help Wanted: Encouraged by his best friend, Patrick Starfish, '
+ 'SpongeBob overcomes his fears and finally applies for that '
+ 'dream job as a fry cook at the Krusty Krab. Challenged by the '
+ 'owner, Mr. Krabs, and his assistant Squidward, to prove himself '
+ 'worthy of the job, SpongeBob rises to the occasion, with the help '
+ 'of one very special spatula, by feeding a sea of ravenous anchovies.'
}
}
@@ -25,18 +30,18 @@ class MegaCartoonsIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- # The id is equal to the title
- title = video_id
+ # Try to find a good title or fallback to the ID
+ title = self._og_search_title(webpage) or video_id
# Video data is stored in a json -> extract it from the raw html
- url_json = json.loads(self._html_search_regex(r'{.*})".*>', webpage, 'videourls'))
+ url_json = json.loads(self._html_search_regex(r'{.*})["/\'].*>', webpage, 'videourls'))
- video_url = url_json['sources'][0]['src'] # Get the video url
- video_type = url_json['sources'][0]['type'] # Get the video type -> 'video/mp4'
- video_thumbnail = url_json['splash'] # Get the thumbnail
+ video_url = url_json.get('sources')[0].get('src') or self._og_search_video_url(webpage) # Get the video url
+ video_type = url_json.get('sources')[0].get('type') # Get the video type -> 'video/mp4'
+ video_thumbnail = url_json.get('splash') or self._og_search_thumbnail(webpage) # Get the thumbnail
# Every video has a short summary -> save it as description
- video_description = self._html_search_regex(r'(?P.*)
', webpage, 'videodescription')
+ video_description = self._html_search_regex(r'(?P.*)
', webpage, 'videodescription', fatal=False) or self._og_search_description(webpage)
return {
'id': video_id,
From 73771cd768bdff96745f4d80536c0f4e61a7f5e7 Mon Sep 17 00:00:00 2001
From: Daenges
Date: Sat, 21 May 2022 16:19:18 +0200
Subject: [PATCH 4/8] Commit suggested changes. - Verify description through
md5 - Implement robust detection of description - Remove format attribute to
allow auto detection - Allow conditioning of URLs
---
youtube_dl/extractor/megacartoons.py | 23 +++++++++++------------
1 file changed, 11 insertions(+), 12 deletions(-)
diff --git a/youtube_dl/extractor/megacartoons.py b/youtube_dl/extractor/megacartoons.py
index 5218c034e..80c17c100 100644
--- a/youtube_dl/extractor/megacartoons.py
+++ b/youtube_dl/extractor/megacartoons.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import json
+from ..utils import url_or_none
from .common import InfoExtractor
@@ -16,12 +17,7 @@ class MegaCartoonsIE(InfoExtractor):
'title': 'Help Wanted',
'ext': 'mp4',
'thumbnail': r're:^https?://.*\.jpg$',
- 'description': 'Help Wanted: Encouraged by his best friend, Patrick Starfish, '
- 'SpongeBob overcomes his fears and finally applies for that '
- 'dream job as a fry cook at the Krusty Krab. Challenged by the '
- 'owner, Mr. Krabs, and his assistant Squidward, to prove himself '
- 'worthy of the job, SpongeBob rises to the occasion, with the help '
- 'of one very special spatula, by feeding a sea of ravenous anchovies.'
+ 'description': 'md5:2c909daa6c6cb16b2d4d791dd1a31632'
}
}
@@ -36,17 +32,20 @@ class MegaCartoonsIE(InfoExtractor):
# Video data is stored in a json -> extract it from the raw html
url_json = json.loads(self._html_search_regex(r'{.*})["/\'].*>', webpage, 'videourls'))
- video_url = url_json.get('sources')[0].get('src') or self._og_search_video_url(webpage) # Get the video url
- video_type = url_json.get('sources')[0].get('type') # Get the video type -> 'video/mp4'
- video_thumbnail = url_json.get('splash') or self._og_search_thumbnail(webpage) # Get the thumbnail
+ video_url = url_or_none(url_json.get('sources')[0].get('src') or self._og_search_video_url(webpage)) # Get the video url
+ video_thumbnail = url_or_none(url_json.get('splash') or self._og_search_thumbnail(webpage)) # Get the thumbnail
- # Every video has a short summary -> save it as description
- video_description = self._html_search_regex(r'(?P.*)
', webpage, 'videodescription', fatal=False) or self._og_search_description(webpage)
+ # Find the class in the html
+ article = self._search_regex(
+ r'(?s)]*?\bclass\s*=\s*[^>]*?\bpost\b[^>]*>(.+?)\s*([^<]+)\s*
', article, 'videodescription', fatal=False)
+ or self._og_search_description(webpage))
return {
'id': video_id,
'title': title,
- 'format': video_type,
'url': video_url,
'thumbnail': video_thumbnail,
'description': video_description,
From cf4a829c138c100000086cb55c2b772fe2db47ac Mon Sep 17 00:00:00 2001
From: Daenges
Date: Mon, 23 May 2022 19:21:56 +0200
Subject: [PATCH 5/8] Implement _search_json_ld()
---
youtube_dl/extractor/megacartoons.py | 85 ++++++++++++++++++++++------
1 file changed, 68 insertions(+), 17 deletions(-)
diff --git a/youtube_dl/extractor/megacartoons.py b/youtube_dl/extractor/megacartoons.py
index 80c17c100..ea6161b7b 100644
--- a/youtube_dl/extractor/megacartoons.py
+++ b/youtube_dl/extractor/megacartoons.py
@@ -2,7 +2,16 @@
from __future__ import unicode_literals
import json
-from ..utils import url_or_none
+import re
+
+from ..utils import (
+ bug_reports_message,
+ JSON_LD_RE,
+ merge_dicts,
+ NO_DEFAULT,
+ RegexNotFoundError,
+ url_or_none,
+)
from .common import InfoExtractor
@@ -14,39 +23,81 @@ class MegaCartoonsIE(InfoExtractor):
'md5': '4ba9be574f9a17abe0c074e2f955fded',
'info_dict': {
'id': 'help-wanted',
- 'title': 'Help Wanted',
'ext': 'mp4',
+ 'title': 'Help Wanted - SpongeBob SquarePants',
+ 'upload_date': '20200223',
+ 'timestamp': 1582416000,
'thumbnail': r're:^https?://.*\.jpg$',
'description': 'md5:2c909daa6c6cb16b2d4d791dd1a31632'
}
}
+ # adapted from common.py pending yt-dlp back-port
+ def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
+ json_ld_list = list(re.finditer(JSON_LD_RE, html))
+ default = kwargs.get('default', NO_DEFAULT)
+ fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
+ json_ld = []
+ for mobj in json_ld_list:
+ json_ld_item = self._parse_json(
+ mobj.group('json_ld'), video_id, fatal=fatal)
+ if not json_ld_item:
+ continue
+ if isinstance(json_ld_item, dict):
+ json_ld.append(json_ld_item)
+ elif isinstance(json_ld_item, (list, tuple)):
+ json_ld.extend(json_ld_item)
+ if json_ld:
+ # handle initial '@graph' with one level of children
+ if len(json_ld) > 0 and '@graph' in json_ld[0] and '@context' in json_ld[0]:
+ # should always be hit here
+ context = json_ld[0]['@context']
+ json_ld_g = json_ld[0]['@graph'] or []
+ for item in json_ld_g:
+ item.setdefault('@context', context)
+ json_ld = json_ld_g + json_ld[1:]
+ json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+ if json_ld:
+ return json_ld
+ if default is not NO_DEFAULT:
+ return default
+ elif fatal:
+ raise RegexNotFoundError('Unable to extract JSON-LD')
+ else:
+ self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
+ return {}
+
def _real_extract(self, url):
# ID is equal to the episode name
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- # Try to find a good title or fallback to the ID
- title = self._og_search_title(webpage) or video_id
+ info = self._search_json_ld(webpage, video_id, fatal=False) or {}
- # Video data is stored in a json -> extract it from the raw html
- url_json = json.loads(self._html_search_regex(r'{.*})["/\'].*>', webpage, 'videourls'))
+ info.update({
+ 'id': video_id,
+ # Try to find a good title or fallback to the ID
+ 'title': info.get('title') or self._og_search_title(webpage) or video_id.replace('-', ' ').capitalize(),
+ })
- video_url = url_or_none(url_json.get('sources')[0].get('src') or self._og_search_video_url(webpage)) # Get the video url
- video_thumbnail = url_or_none(url_json.get('splash') or self._og_search_thumbnail(webpage)) # Get the thumbnail
+ if 'url' not in info or 'thumbnail' not in info:
+ # Video data is stored in a json -> extract it from the raw html
+ url_json = json.loads(self._html_search_regex(r'{.*})["/\'].*>', webpage, 'videourls'))
+
+ video_url = url_or_none(url_json.get('sources')[0].get('src') or self._og_search_video_url(webpage)) # Get the video url
+ video_thumbnail = url_or_none(url_json.get('splash') or self._og_search_thumbnail(webpage)) # Get the thumbnail
+ info = merge_dicts(info, {
+ 'url': video_url,
+ 'thumbnail': video_thumbnail,
+ })
# Find the class in the html
article = self._search_regex(
r'(?s)]*?\bclass\s*=\s*[^>]*?\bpost\b[^>]*>(.+?)\s*([^<]+)\s*', article, 'videodescription', fatal=False)
- or self._og_search_description(webpage))
+ info['description'] = (
+ self._html_search_regex(r'(?s)\s*([^<]+)\s*
', article, 'videodescription', fatal=False)
+ or self._og_search_description(webpage))
- return {
- 'id': video_id,
- 'title': title,
- 'url': video_url,
- 'thumbnail': video_thumbnail,
- 'description': video_description,
- }
+ return info
From e8aca87278f43ddd577d57b6955eaa609aa5760e Mon Sep 17 00:00:00 2001
From: dirkf
Date: Fri, 22 Jul 2022 15:48:26 +0100
Subject: [PATCH 6/8] Improve fallback
---
youtube_dl/extractor/megacartoons.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/youtube_dl/extractor/megacartoons.py b/youtube_dl/extractor/megacartoons.py
index ea6161b7b..672bccbdd 100644
--- a/youtube_dl/extractor/megacartoons.py
+++ b/youtube_dl/extractor/megacartoons.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import json
import re
from ..utils import (
@@ -10,8 +9,10 @@ from ..utils import (
merge_dicts,
NO_DEFAULT,
RegexNotFoundError,
+ try_get,
url_or_none,
)
+from ..compat import compat_str
from .common import InfoExtractor
@@ -82,9 +83,9 @@ class MegaCartoonsIE(InfoExtractor):
if 'url' not in info or 'thumbnail' not in info:
# Video data is stored in a json -> extract it from the raw html
- url_json = json.loads(self._html_search_regex(r'{.*})["/\'].*>', webpage, 'videourls'))
+ url_json = self._parse_json(self._html_search_regex(r''']+\bdata-item\s*=\s*(["'])(?P
\{.*})\1''', webpage, 'videourls', group='videourls', default='{}'), video_id, fatal=False)) or {}
- video_url = url_or_none(url_json.get('sources')[0].get('src') or self._og_search_video_url(webpage)) # Get the video url
+ video_url = url_or_none(try_get(url_json, lambda x: x['sources'][0]['src'], compat_str) or self._og_search_video_url(webpage)) # Get the video url
video_thumbnail = url_or_none(url_json.get('splash') or self._og_search_thumbnail(webpage)) # Get the thumbnail
info = merge_dicts(info, {
'url': video_url,
From 2804216e81de31ea33f65de3881f68f28ae54a59 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Fri, 22 Jul 2022 15:54:30 +0100
Subject: [PATCH 7/8] Improve fallback
---
youtube_dl/extractor/megacartoons.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/youtube_dl/extractor/megacartoons.py b/youtube_dl/extractor/megacartoons.py
index 672bccbdd..c6186e3f8 100644
--- a/youtube_dl/extractor/megacartoons.py
+++ b/youtube_dl/extractor/megacartoons.py
@@ -83,7 +83,7 @@ class MegaCartoonsIE(InfoExtractor):
if 'url' not in info or 'thumbnail' not in info:
# Video data is stored in a json -> extract it from the raw html
- url_json = self._parse_json(self._html_search_regex(r''']+\bdata-item\s*=\s*(["'])(?P
\{.*})\1''', webpage, 'videourls', group='videourls', default='{}'), video_id, fatal=False)) or {}
+ url_json = self._parse_json(self._html_search_regex(r''']+\bdata-item\s*=\s*(["'])(?P\{.*})\1''', webpage, 'videourls', group='videourls', default='{}'), video_id, fatal=False) or {}
video_url = url_or_none(try_get(url_json, lambda x: x['sources'][0]['src'], compat_str) or self._og_search_video_url(webpage)) # Get the video url
video_thumbnail = url_or_none(url_json.get('splash') or self._og_search_thumbnail(webpage)) # Get the thumbnail
From 4232427a8d4615085c3229b88c09d6adb90d2922 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Wed, 5 Jul 2023 20:51:58 +0100
Subject: [PATCH 8/8] Allow numbers and upper-case letters in ID
---
youtube_dl/extractor/megacartoons.py | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/youtube_dl/extractor/megacartoons.py b/youtube_dl/extractor/megacartoons.py
index c6186e3f8..621d4d64e 100644
--- a/youtube_dl/extractor/megacartoons.py
+++ b/youtube_dl/extractor/megacartoons.py
@@ -18,8 +18,8 @@ from .common import InfoExtractor
class MegaCartoonsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?megacartoons\.net/(?P<id>[a-z-]+)/'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?megacartoons\.net/(?P<id>[a-zA-Z\d-]+)/'
+ _TESTS = [{
'url': 'https://www.megacartoons.net/help-wanted/',
'md5': '4ba9be574f9a17abe0c074e2f955fded',
'info_dict': {
@@ -31,7 +31,13 @@ class MegaCartoonsIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$',
'description': 'md5:2c909daa6c6cb16b2d4d791dd1a31632'
}
- }
+ }, {
+ 'url': 'https://www.megacartoons.net/1000-years-of-courage/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.megacartoons.net/911-2/',
+ 'only_matching': True,
+ }]
# adapted from common.py pending yt-dlp back-port
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):