From aded139c4f9d9adb7a992fa24d87be8ef7fbe1f5 Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Thu, 31 Mar 2022 03:56:02 +0500 Subject: [PATCH 01/18] adding https://commons.wikimedia.org to youtube-dl --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/wikimedia.py | 35 ++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 youtube_dl/extractor/wikimedia.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 535080d0a..a6391997e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1659,3 +1659,4 @@ from .zingmp3 import ( ) from .zoom import ZoomIE from .zype import ZypeIE +from .wikimedia import WikimediaIE diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py new file mode 100644 index 000000000..fec867a12 --- /dev/null +++ b/youtube_dl/extractor/wikimedia.py @@ -0,0 +1,35 @@ +from .common import InfoExtractor +import re +import requests +import urllib.parse + + +class WikimediaIE(InfoExtractor): + _NETRC_MACHINE = 'wikimediaorg' + IE_NAME = 'wikimedia.org' + _API_BASE_URL = 'https://commons.wikimedia.org/' + _VALID_URL = r'https://commons.wikimedia.org/wiki/File:(?P[^/]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) + video_url = self._html_search_regex(r']*src="([^"]+)"', webpage, + u'video URL') + resp = {} + + subtitle_url = f'https://commons.wikimedia.org/w/api.php?' \ + f'action=timedtext&lang=nl&title=File%3A{urllib.parse.quote(video_id)}&trackformat=srt' + with open(video_id + '.srt', 'w+', encoding='utf') as f: + subtitles = requests.post(subtitle_url).text + if 'timedtext-notfound' not in subtitles: + f.write(subtitles) + else: + print("subtitles not found") + resp['url'] = video_url + resp['ext'] = 'webm' + resp['id'] = video_id + resp['title'] = video_id + return [resp] From ecc770b99eabfe0c8677a59da4b14f4f4b643cfe Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Thu, 31 Mar 2022 15:19:00 +0500 Subject: [PATCH 02/18] adding https://commons.wikimedia.org to youtube-dl --- youtube_dl/extractor/wikimedia.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index fec867a12..290db6baf 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -10,6 +10,14 @@ class WikimediaIE(InfoExtractor): _API_BASE_URL = 'https://commons.wikimedia.org/' _VALID_URL = r'https://commons.wikimedia.org/wiki/File:(?P[^/]+)' + _TEST = { + 'url': 'https://upload.wikimedia.org/wikipedia/commons/transcoded/d/d7/Die_Temperaturkurve_der_Erde_%28ZDF' + '%2C_Terra_X%29_720p_HD_50FPS.webm/Die_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS' + '.webm.480p.vp9.webm', + 'ext': 'webm', 'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm', + 'title': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm', + 'license': 'This file is licensed under the Creative Commons Attribution 4.0 International license.'} + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -18,6 +26,8 @@ class WikimediaIE(InfoExtractor): self.report_extraction(video_id) video_url = self._html_search_regex(r']*src="([^"]+)"', webpage, u'video URL') + licenze = self._html_search_regex(f"(?<=td>This)(.*)(?=license.)", webpage, u'video license') + licenze = "This " + licenze + " license." resp = {} subtitle_url = f'https://commons.wikimedia.org/w/api.php?' \ @@ -32,4 +42,5 @@ class WikimediaIE(InfoExtractor): resp['ext'] = 'webm' resp['id'] = video_id resp['title'] = video_id + resp['license'] = licenze return [resp] From e0dcf1db42e36113e638db7fbf669a2827b41acf Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Thu, 31 Mar 2022 15:21:38 +0500 Subject: [PATCH 03/18] adding https://commons.wikimedia.org to youtube-dl --- youtube_dl/extractor/wikimedia.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 290db6baf..0cf538464 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -32,12 +32,14 @@ class WikimediaIE(InfoExtractor): subtitle_url = f'https://commons.wikimedia.org/w/api.php?' \ f'action=timedtext&lang=nl&title=File%3A{urllib.parse.quote(video_id)}&trackformat=srt' - with open(video_id + '.srt', 'w+', encoding='utf') as f: - subtitles = requests.post(subtitle_url).text - if 'timedtext-notfound' not in subtitles: + + subtitles = requests.post(subtitle_url).text + if 'timedtext-notfound' not in subtitles: + with open(video_id + '.srt', 'w+', encoding='utf') as f: f.write(subtitles) - else: - print("subtitles not found") + else: + print("subtitles not found") + resp['url'] = video_url resp['ext'] = 'webm' resp['id'] = video_id From 7bf92fd726f4ebaf4752831263d91b8ef9fb0987 Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Fri, 1 Apr 2022 03:09:52 +0500 Subject: [PATCH 04/18] adding https://commons.wikimedia.org to youtube-dl with url author description extension id title license --- youtube_dl/extractor/wikimedia.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 0cf538464..6dc558b71 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -2,6 +2,7 @@ from .common import InfoExtractor import re import requests import urllib.parse +from ..utils import clean_html class WikimediaIE(InfoExtractor): @@ -14,20 +15,39 @@ class WikimediaIE(InfoExtractor): 'url': 'https://upload.wikimedia.org/wikipedia/commons/transcoded/d/d7/Die_Temperaturkurve_der_Erde_%28ZDF' '%2C_Terra_X%29_720p_HD_50FPS.webm/Die_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS' '.webm.480p.vp9.webm', + 'description': 'Deutsch: Beschreibung auf der Seite: "Im Verlauf der Erdgeschichte glich das Klima einer ' + 'Achterbahnfahrt. Die „Fieberkurve“ unseres Planeten zeigt die globalen Temperaturschwankungen ' + 'bis heute – rekonstruiert anhand von historischen Klimadaten."\nZu Wikimedia Commons ' + 'hochgeladen von: PantheraLeo1359531.\nHinweise zur Weiterverwendung: ' + 'https://www.zdf.de/dokumentation/terra-x/terra-x-creative-commons-cc-100.html' + '.\nVereinfachender Verlauf in der Geschichte der Erde, für die Zukunft spätestens ab dem Jahr ' + '2050 mit spekulativem Verlauf in der Prognose (ausgeprägtes Global-warming-Szenario ist ' + 'dargestellt).English: Climate change, Temperature in history of Earth, Video of Terra X.', 'ext': 'webm', 'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm', - 'title': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm', - 'license': 'This file is licensed under the Creative Commons Attribution 4.0 International license.'} + 'title': 'File:Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons', + 'license': 'This file is licensed under the Creative Commons Attribution 4.0 International license.', + 'author': 'ZDF/Terra X/Gruppe 5/Luise Wagner, Jonas Sichert, Andreas Hougardy'} def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + if not video_id.endswith('.webm'): + raise Exception("invalid video url") + webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) video_url = self._html_search_regex(r']*src="([^"]+)"', webpage, u'video URL') licenze = self._html_search_regex(f"(?<=td>This)(.*)(?=license.)", webpage, u'video license') licenze = "This " + licenze + " license." + + description = self._html_search_regex(f'(?<=
)(' + f'.*)(?=
)', webpage, + u'video description') + + author = re.search(r'([^\<]*?)<\/td>', str(webpage)) + author = clean_html(author.group(0)) resp = {} subtitle_url = f'https://commons.wikimedia.org/w/api.php?' \ @@ -41,8 +61,10 @@ class WikimediaIE(InfoExtractor): print("subtitles not found") resp['url'] = video_url + resp['description'] = description resp['ext'] = 'webm' resp['id'] = video_id - resp['title'] = video_id + resp['title'] = self._og_search_title(webpage) resp['license'] = licenze + resp['author'] = author return [resp] From bf428499a017d5021f24057e14fb36838062505a Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Fri, 1 Apr 2022 05:21:57 +0500 Subject: [PATCH 05/18] made changes according to request also I really appreciate help by @dirkf --- youtube_dl/extractor/wikimedia.py | 49 ++++++++++++------------------- 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 6dc558b71..0572760be 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -1,8 +1,5 @@ from .common import InfoExtractor -import re -import requests -import urllib.parse -from ..utils import clean_html +from ..utils import get_element_by_class, compat_urlparse class WikimediaIE(InfoExtractor): @@ -29,8 +26,7 @@ class WikimediaIE(InfoExtractor): 'author': 'ZDF/Terra X/Gruppe 5/Luise Wagner, Jonas Sichert, Andreas Hougardy'} def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) if not video_id.endswith('.webm'): raise Exception("invalid video url") @@ -39,32 +35,23 @@ class WikimediaIE(InfoExtractor): self.report_extraction(video_id) video_url = self._html_search_regex(r']*src="([^"]+)"', webpage, u'video URL') - licenze = self._html_search_regex(f"(?<=td>This)(.*)(?=license.)", webpage, u'video license') + licenze = self._html_search_regex(r'\bThis\s*(.*?)\s*license\b', webpage, u'video license') licenze = "This " + licenze + " license." - description = self._html_search_regex(f'(?<=
)(' - f'.*)(?=
)', webpage, - u'video description') + description = get_element_by_class('description', webpage) - author = re.search(r'([^\<]*?)<\/td>', str(webpage)) - author = clean_html(author.group(0)) - resp = {} + author = self._html_search_regex(r'([^\<]*?)<\/td>', str(webpage), u"video author") - subtitle_url = f'https://commons.wikimedia.org/w/api.php?' \ - f'action=timedtext&lang=nl&title=File%3A{urllib.parse.quote(video_id)}&trackformat=srt' - - subtitles = requests.post(subtitle_url).text - if 'timedtext-notfound' not in subtitles: - with open(video_id + '.srt', 'w+', encoding='utf') as f: - f.write(subtitles) - else: - print("subtitles not found") - - resp['url'] = video_url - resp['description'] = description - resp['ext'] = 'webm' - resp['id'] = video_id - resp['title'] = self._og_search_title(webpage) - resp['license'] = licenze - resp['author'] = author - return [resp] + info = {} + subtitles = 'https://commons.wikimedia.org/w/api.php?action=timedtext&lang=nl&title=File%3A{}' \ + '&trackformat=srt'.format(compat_urlparse.quote_plus(video_id)) + info['url'] = video_url + info['description'] = description + info['ext'] = 'webm' + info['id'] = video_id + info['title'] = self._og_search_title(webpage).replace("File:", "") + info['license'] = licenze + info['author'] = author + info['subtitles'] = {"nl": [{"ext": "srt", "url": subtitles}]} + print("ih") + return info From 810c1a56a04fab4daa2bdf1bfe888da2e976143c Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Fri, 1 Apr 2022 05:25:00 +0500 Subject: [PATCH 06/18] made changes according to request also I really appreciate help by @dirkf --- youtube_dl/extractor/wikimedia.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 0572760be..ef2536d0d 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -39,9 +39,7 @@ class WikimediaIE(InfoExtractor): licenze = "This " + licenze + " license." description = get_element_by_class('description', webpage) - author = self._html_search_regex(r'([^\<]*?)<\/td>', str(webpage), u"video author") - info = {} subtitles = 'https://commons.wikimedia.org/w/api.php?action=timedtext&lang=nl&title=File%3A{}' \ '&trackformat=srt'.format(compat_urlparse.quote_plus(video_id)) @@ -53,5 +51,4 @@ class WikimediaIE(InfoExtractor): info['license'] = licenze info['author'] = author info['subtitles'] = {"nl": [{"ext": "srt", "url": subtitles}]} - print("ih") return info From 42b0ca233da69291f63006b4f8be3abb94a73aca Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Fri, 1 Apr 2022 05:30:38 +0500 Subject: [PATCH 07/18] cleaned html tags inside description. --- youtube_dl/extractor/wikimedia.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index ef2536d0d..8478dbbc4 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import get_element_by_class, compat_urlparse +from ..utils import get_element_by_class, compat_urlparse, clean_html class WikimediaIE(InfoExtractor): @@ -21,9 +21,12 @@ class WikimediaIE(InfoExtractor): '2050 mit spekulativem Verlauf in der Prognose (ausgeprägtes Global-warming-Szenario ist ' 'dargestellt).English: Climate change, Temperature in history of Earth, Video of Terra X.', 'ext': 'webm', 'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm', - 'title': 'File:Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons', + 'title': 'Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons', 'license': 'This file is licensed under the Creative Commons Attribution 4.0 International license.', - 'author': 'ZDF/Terra X/Gruppe 5/Luise Wagner, Jonas Sichert, Andreas Hougardy'} + 'author': 'ZDF/Terra X/Gruppe 5/Luise Wagner, Jonas Sichert, Andreas Hougardy', 'subtitles': {'nl': [ + {'ext': 'srt', + 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&lang=nl&title=File' + '%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&trackformat=srt'}]}} def _real_extract(self, url): video_id = self._match_id(url) @@ -44,7 +47,7 @@ class WikimediaIE(InfoExtractor): subtitles = 'https://commons.wikimedia.org/w/api.php?action=timedtext&lang=nl&title=File%3A{}' \ '&trackformat=srt'.format(compat_urlparse.quote_plus(video_id)) info['url'] = video_url - info['description'] = description + info['description'] = clean_html(description) info['ext'] = 'webm' info['id'] = video_id info['title'] = self._og_search_title(webpage).replace("File:", "") From aeb5abe2b9d2512d39476d6ee292f9c0dc6f7fa1 Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Sat, 2 Apr 2022 01:02:09 +0500 Subject: [PATCH 08/18] video name fixed. --- youtube_dl/extractor/wikimedia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 8478dbbc4..339f7f8c0 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -49,7 +49,7 @@ class WikimediaIE(InfoExtractor): info['url'] = video_url info['description'] = clean_html(description) info['ext'] = 'webm' - info['id'] = video_id + info['id'] = video_id[:-5] info['title'] = self._og_search_title(webpage).replace("File:", "") info['license'] = licenze info['author'] = author From 6dfb8d4ec6786248bb6e2d833aaa41f616be0724 Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Sat, 2 Apr 2022 14:31:52 +0500 Subject: [PATCH 09/18] subtitles extraction for all language fixed. --- youtube_dl/extractor/wikimedia.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 339f7f8c0..b256c6879 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -1,5 +1,6 @@ from .common import InfoExtractor from ..utils import get_element_by_class, compat_urlparse, clean_html +import re class WikimediaIE(InfoExtractor): @@ -44,8 +45,7 @@ class WikimediaIE(InfoExtractor): description = get_element_by_class('description', webpage) author = self._html_search_regex(r'([^\<]*?)<\/td>', str(webpage), u"video author") info = {} - subtitles = 'https://commons.wikimedia.org/w/api.php?action=timedtext&lang=nl&title=File%3A{}' \ - '&trackformat=srt'.format(compat_urlparse.quote_plus(video_id)) + info['url'] = video_url info['description'] = clean_html(description) info['ext'] = 'webm' @@ -53,5 +53,12 @@ class WikimediaIE(InfoExtractor): info['title'] = self._og_search_title(webpage).replace("File:", "") info['license'] = licenze info['author'] = author - info['subtitles'] = {"nl": [{"ext": "srt", "url": subtitles}]} + + subtitles = re.findall(r'\bsrc=\"\/w\/api\s*(.*?)\s*srt\b', str(webpage)) + info['subtitles'] = {} + for sub in subtitles: + sub = 'https://commons.wikimedia.org/w/api' + sub + 'srt' + lang = sub[sub.find('lang=') + 5:] + lang = lang[:lang.find('&')] + info['subtitles'][lang] = [{"ext": "srt", "url": sub}] return info From 8f48f187f8496d05590295705b4b533697508399 Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Sun, 3 Apr 2022 01:31:15 +0500 Subject: [PATCH 10/18] subtitles extraction for all language fixed bad request --- youtube_dl/extractor/wikimedia.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index b256c6879..c864563b6 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -60,5 +60,6 @@ class WikimediaIE(InfoExtractor): sub = 'https://commons.wikimedia.org/w/api' + sub + 'srt' lang = sub[sub.find('lang=') + 5:] lang = lang[:lang.find('&')] + sub = sub.replace(';','&') info['subtitles'][lang] = [{"ext": "srt", "url": sub}] return info From a10f613d62eacbd0ddedfcc2f70ee34d088be4a6 Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Mon, 4 Apr 2022 17:47:05 +0500 Subject: [PATCH 11/18] fixed Test part for the wikimedia extractor python test/test_download.py TestDownload.test_Wikimedia [wikimedia.org] Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm: Downloading webpage [wikimedia.org] Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm: Extracting information [info] Writing video description metadata as JSON to: test_Wikimedia_Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.info.json [debug] Invoking downloader on 'https://upload.wikimedia.org/wikipedia/commons/transcoded/d/d7/Die_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm/Die_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm.480p.vp9.webm' [download] Destination: test_Wikimedia_Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm [download] 100% of 10.00KiB in 00:00 . ---------------------------------------------------------------------- Ran 1 test in 2.262s OK --- youtube_dl/extractor/wikimedia.py | 40 ++++++++++++++++--------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index c864563b6..b24410de2 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -10,24 +10,26 @@ class WikimediaIE(InfoExtractor): _VALID_URL = r'https://commons.wikimedia.org/wiki/File:(?P[^/]+)' _TEST = { - 'url': 'https://upload.wikimedia.org/wikipedia/commons/transcoded/d/d7/Die_Temperaturkurve_der_Erde_%28ZDF' - '%2C_Terra_X%29_720p_HD_50FPS.webm/Die_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS' - '.webm.480p.vp9.webm', - 'description': 'Deutsch: Beschreibung auf der Seite: "Im Verlauf der Erdgeschichte glich das Klima einer ' - 'Achterbahnfahrt. Die „Fieberkurve“ unseres Planeten zeigt die globalen Temperaturschwankungen ' - 'bis heute – rekonstruiert anhand von historischen Klimadaten."\nZu Wikimedia Commons ' - 'hochgeladen von: PantheraLeo1359531.\nHinweise zur Weiterverwendung: ' - 'https://www.zdf.de/dokumentation/terra-x/terra-x-creative-commons-cc-100.html' - '.\nVereinfachender Verlauf in der Geschichte der Erde, für die Zukunft spätestens ab dem Jahr ' - '2050 mit spekulativem Verlauf in der Prognose (ausgeprägtes Global-warming-Szenario ist ' - 'dargestellt).English: Climate change, Temperature in history of Earth, Video of Terra X.', - 'ext': 'webm', 'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm', - 'title': 'Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons', - 'license': 'This file is licensed under the Creative Commons Attribution 4.0 International license.', - 'author': 'ZDF/Terra X/Gruppe 5/Luise Wagner, Jonas Sichert, Andreas Hougardy', 'subtitles': {'nl': [ - {'ext': 'srt', - 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&lang=nl&title=File' - '%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&trackformat=srt'}]}} + 'url': 'https://commons.wikimedia.org/wiki/File:Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm', + 'info_dict': { + 'description': 'Deutsch: Beschreibung auf der Seite: "Im Verlauf der Erdgeschichte glich das Klima einer Achterbahnfahrt. Die „Fieberkurve“ unseres Planeten zeigt die globalen Temperaturschwankungen bis heute – rekonstruiert anhand von historischen Klimadaten."\nZu Wikimedia Commons hochgeladen von: PantheraLeo1359531.\nHinweise zur Weiterverwendung: https://www.zdf.de/dokumentation/terra-x/terra-x-creative-commons-cc-100.html.\nVereinfachender Verlauf in der Geschichte der Erde, für die Zukunft spätestens ab dem Jahr 2050 mit spekulativem Verlauf in der Prognose (ausgeprägtes Global-warming-Szenario ist dargestellt).English: Climate change, Temperature in history of Earth, Video of Terra X.', + 'ext': 'webm', + 'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS', + 'title': 'Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons', + 'license': 'This file is licensed under the Creative Commons Attribution 4.0 International license.', + 'author': 'ZDF/Terra X/Gruppe 5/Luise Wagner, Jonas Sichert, Andreas Hougardy', + 'subtitles': {'de': [ + {'ext': 'srt', + 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&&lang=de&&trackformat=srt'}], + 'en-gb': [ + {'ext': 'srt', + 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&&lang=en-gb&&trackformat=srt'}], + 'nl': [ + {'ext': 'srt', + 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&&lang=nl&&trackformat=srt'} + ]} + } + } def _real_extract(self, url): video_id = self._match_id(url) @@ -60,6 +62,6 @@ class WikimediaIE(InfoExtractor): sub = 'https://commons.wikimedia.org/w/api' + sub + 'srt' lang = sub[sub.find('lang=') + 5:] lang = lang[:lang.find('&')] - sub = sub.replace(';','&') + sub = sub.replace(';', '&') info['subtitles'][lang] = [{"ext": "srt", "url": sub}] return info From 961a3f3d8b2d88954ac3b63c0c36c4bf2d78533f Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Tue, 5 Apr 2022 05:30:07 +0500 Subject: [PATCH 12/18] fixed added support for multiple video format. --- youtube_dl/extractor/wikimedia.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index b24410de2..e5b6c3273 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import get_element_by_class, compat_urlparse, clean_html +from ..utils import get_element_by_class, determine_ext, clean_html, KNOWN_EXTENSIONS import re @@ -33,8 +33,8 @@ class WikimediaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - if not video_id.endswith('.webm'): + ext = determine_ext(url) + if not ext.lower() in KNOWN_EXTENSIONS: raise Exception("invalid video url") webpage = self._download_webpage(url, video_id) @@ -50,8 +50,8 @@ class WikimediaIE(InfoExtractor): info['url'] = video_url info['description'] = clean_html(description) - info['ext'] = 'webm' - info['id'] = video_id[:-5] + info['ext'] = ext + info['id'] = video_id.replace('.' + ext, "") info['title'] = self._og_search_title(webpage).replace("File:", "") info['license'] = licenze info['author'] = author From b3d2887ed81834e0c4afd4f98d6e770ca4f497ec Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Thu, 14 Apr 2022 17:11:14 +0500 Subject: [PATCH 13/18] fixed issues. --- youtube_dl/extractor/wikimedia.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index e5b6c3273..27c023a6e 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -1,3 +1,5 @@ +# coding: utf-8 +from __future__ import unicode_literals from .common import InfoExtractor from ..utils import get_element_by_class, determine_ext, clean_html, KNOWN_EXTENSIONS import re @@ -12,7 +14,7 @@ class WikimediaIE(InfoExtractor): _TEST = { 'url': 'https://commons.wikimedia.org/wiki/File:Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm', 'info_dict': { - 'description': 'Deutsch: Beschreibung auf der Seite: "Im Verlauf der Erdgeschichte glich das Klima einer Achterbahnfahrt. Die „Fieberkurve“ unseres Planeten zeigt die globalen Temperaturschwankungen bis heute – rekonstruiert anhand von historischen Klimadaten."\nZu Wikimedia Commons hochgeladen von: PantheraLeo1359531.\nHinweise zur Weiterverwendung: https://www.zdf.de/dokumentation/terra-x/terra-x-creative-commons-cc-100.html.\nVereinfachender Verlauf in der Geschichte der Erde, für die Zukunft spätestens ab dem Jahr 2050 mit spekulativem Verlauf in der Prognose (ausgeprägtes Global-warming-Szenario ist dargestellt).English: Climate change, Temperature in history of Earth, Video of Terra X.', + 'description': 'md5:D6F4C7BF1C0DB1EAE80371B1F93EA85E', 'ext': 'webm', 'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS', 'title': 'Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons', @@ -56,12 +58,13 @@ class WikimediaIE(InfoExtractor): info['license'] = licenze info['author'] = author - subtitles = re.findall(r'\bsrc=\"\/w\/api\s*(.*?)\s*srt\b', str(webpage)) - info['subtitles'] = {} - for sub in subtitles: + subtitles = {} + for sub in re.findall(r'\bsrc=\"\/w\/api\s*(.*?)\s*srt\b', str(webpage)): sub = 'https://commons.wikimedia.org/w/api' + sub + 'srt' lang = sub[sub.find('lang=') + 5:] lang = lang[:lang.find('&')] sub = sub.replace(';', '&') info['subtitles'][lang] = [{"ext": "srt", "url": sub}] + + info['subtitles'] = subtitles return info From 67c58034efe89efd864fbb9b14fcea8789ced0c3 Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Thu, 5 May 2022 07:05:38 +0500 Subject: [PATCH 14/18] fixed issues. --- youtube_dl/extractor/wikimedia.py | 87 +++++++++++++++---------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 27c023a6e..84c17c2a7 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -1,70 +1,69 @@ # coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor -from ..utils import get_element_by_class, determine_ext, clean_html, KNOWN_EXTENSIONS + import re +from ..utils import ( + clean_html, + determine_ext, + get_element_by_class, + urljoin, + compat_parse_qs, + ExtractorError) + class WikimediaIE(InfoExtractor): - _NETRC_MACHINE = 'wikimediaorg' IE_NAME = 'wikimedia.org' + _NETRC_MACHINE = 'wikimediaorg' _API_BASE_URL = 'https://commons.wikimedia.org/' - _VALID_URL = r'https://commons.wikimedia.org/wiki/File:(?P[^/]+)' + _VALID_URL = 'https://commons.wikimedia.org/wiki/File:(?P[^/]+)' _TEST = { 'url': 'https://commons.wikimedia.org/wiki/File:Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm', 'info_dict': { - 'description': 'md5:D6F4C7BF1C0DB1EAE80371B1F93EA85E', - 'ext': 'webm', - 'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS', + 'description': 'md5:7cd84f76e7081f1be033d0b155b4a460', + 'ext': 'webm', 'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS', 'title': 'Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons', - 'license': 'This file is licensed under the Creative Commons Attribution 4.0 International license.', - 'author': 'ZDF/Terra X/Gruppe 5/Luise Wagner, Jonas Sichert, Andreas Hougardy', - 'subtitles': {'de': [ - {'ext': 'srt', - 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&&lang=de&&trackformat=srt'}], - 'en-gb': [ - {'ext': 'srt', - 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&&lang=en-gb&&trackformat=srt'}], - 'nl': [ - {'ext': 'srt', - 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&&lang=nl&&trackformat=srt'} - ]} - } + 'license': 'md5:62907cddf705a9f7ae7076c15407a977', + 'author': None, 'subtitles': {'de': [{'ext': 'srt', + 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&lang=de&trackformat=vtt '}], + 'en-gb': [{'ext': 'srt', + 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&lang=en-gb&trackformat=vtt '}], + 'nl': [{'ext': 'srt', + 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&lang=nl&trackformat=vtt '}], + 'en': [{'ext': 'srt', + 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS_-_redub_NL.webm&lang=en&trackformat=vtt '}]}} } def _real_extract(self, url): video_id = self._match_id(url) - ext = determine_ext(url) - if not ext.lower() in KNOWN_EXTENSIONS: - raise Exception("invalid video url") - + ext = determine_ext(url, None) + if ext is None: + raise ExtractorError('invalid video url', expected=True) webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) - video_url = self._html_search_regex(r']*src="([^"]+)"', webpage, - u'video URL') - licenze = self._html_search_regex(r'\bThis\s*(.*?)\s*license\b', webpage, u'video license') - licenze = "This " + licenze + " license." + video_url = self._html_search_regex(']*src="([^"]+)"', webpage, + 'video URL') + license = get_element_by_class('layouttemplate licensetpl mw-content-ltr', webpage) + license = clean_html(license) description = get_element_by_class('description', webpage) - author = self._html_search_regex(r'([^\<]*?)<\/td>', str(webpage), u"video author") - info = {} - - info['url'] = video_url - info['description'] = clean_html(description) - info['ext'] = ext - info['id'] = video_id.replace('.' + ext, "") - info['title'] = self._og_search_title(webpage).replace("File:", "") - info['license'] = licenze - info['author'] = author + author = self._html_search_regex(']*>[\n\s]([^<]+?)\s*', webpage, 'video author', + default=None) + info = {'url': video_url, 'description': clean_html(description), 'ext': ext, + 'id': video_id.replace('.' + ext, ''), 'title': self._og_search_title(webpage).replace('File:', ''), + 'license': license, 'author': author} subtitles = {} - for sub in re.findall(r'\bsrc=\"\/w\/api\s*(.*?)\s*srt\b', str(webpage)): - sub = 'https://commons.wikimedia.org/w/api' + sub + 'srt' - lang = sub[sub.find('lang=') + 5:] - lang = lang[:lang.find('&')] - sub = sub.replace(';', '&') - info['subtitles'][lang] = [{"ext": "srt", "url": sub}] - + for sub in re.findall(r'''\bsrc\s*=\s*[\"\'](\/w\/api(.*?)[\s\"])\b''', webpage): + sub = sub[0].replace('"', '''''') + sub = urljoin('https://commons.wikimedia.org', sub) + qs = compat_parse_qs(sub) + lang = qs.get('lang', [None])[-1] + if not lang: + continue + subtitles[lang] = [{'ext': 'srt', 'url': sub}] info['subtitles'] = subtitles return info From 65274df94b482e60db50e915a0d910d0a9704924 Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir <36196667+EhtishamSabir@users.noreply.github.com> Date: Wed, 18 May 2022 18:30:33 +0500 Subject: [PATCH 15/18] Update youtube_dl/extractor/wikimedia.py Co-authored-by: dirkf --- youtube_dl/extractor/wikimedia.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 84c17c2a7..3529b432a 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -50,7 +50,8 @@ class WikimediaIE(InfoExtractor): license = clean_html(license) description = get_element_by_class('description', webpage) - author = self._html_search_regex(']*>[\n\s]([^<]+?)\s*', webpage, 'video author', + author = self._html_search_regex('>\s*Author\s*\s*]*>\s*([^<]+?)\s*', + webpage, 'video author', default=None) default=None) info = {'url': video_url, 'description': clean_html(description), 'ext': ext, 'id': video_id.replace('.' + ext, ''), 'title': self._og_search_title(webpage).replace('File:', ''), From 4a43765c84d7b2fe3589030120700f8a013af252 Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Wed, 18 May 2022 18:32:00 +0500 Subject: [PATCH 16/18] fixed issues. --- youtube_dl/extractor/wikimedia.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 3529b432a..7942142c3 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -52,7 +52,6 @@ class WikimediaIE(InfoExtractor): description = get_element_by_class('description', webpage) author = self._html_search_regex('>\s*Author\s*\s*]*>\s*([^<]+?)\s*', webpage, 'video author', default=None) - default=None) info = {'url': video_url, 'description': clean_html(description), 'ext': ext, 'id': video_id.replace('.' + ext, ''), 'title': self._og_search_title(webpage).replace('File:', ''), 'license': license, 'author': author} From 5df576b5a14514b413daa65ad7ad097d02fd40dd Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Wed, 18 May 2022 18:42:06 +0500 Subject: [PATCH 17/18] fixed issues. --- youtube_dl/extractor/wikimedia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 7942142c3..364c77a33 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -50,7 +50,7 @@ class WikimediaIE(InfoExtractor): license = clean_html(license) description = get_element_by_class('description', webpage) - author = self._html_search_regex('>\s*Author\s*\s*]*>\s*([^<]+?)\s*', + author = self._html_search_regex(r'>\s*Author\s*\s*]*>\s*([^<]+?)\s*', webpage, 'video author', default=None) info = {'url': video_url, 'description': clean_html(description), 'ext': ext, 'id': video_id.replace('.' + ext, ''), 'title': self._og_search_title(webpage).replace('File:', ''), From dce1e146109d60949e5660ea134fcec95441438e Mon Sep 17 00:00:00 2001 From: Ehtisham Sabir Date: Sat, 9 Jul 2022 15:07:13 +0500 Subject: [PATCH 18/18] fixed issues. --- youtube_dl/extractor/wikimedia.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/wikimedia.py b/youtube_dl/extractor/wikimedia.py index 364c77a33..bb05682ff 100644 --- a/youtube_dl/extractor/wikimedia.py +++ b/youtube_dl/extractor/wikimedia.py @@ -28,13 +28,13 @@ class WikimediaIE(InfoExtractor): 'title': 'Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons', 'license': 'md5:62907cddf705a9f7ae7076c15407a977', 'author': None, 'subtitles': {'de': [{'ext': 'srt', - 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&lang=de&trackformat=vtt '}], + 'url': 'https?://commons.wikimedia.org/w/api.php'}], 'en-gb': [{'ext': 'srt', - 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&lang=en-gb&trackformat=vtt '}], + 'url': 'https?://commons.wikimedia.org/w/api.php'}], 'nl': [{'ext': 'srt', - 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&lang=nl&trackformat=vtt '}], + 'url': 'https?://commons.wikimedia.org/w/api.php'}], 'en': [{'ext': 'srt', - 'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS_-_redub_NL.webm&lang=en&trackformat=vtt '}]}} + 'url': 're:https?://commons.wikimedia.org/w/api.php'}]}} } def _real_extract(self, url):