2022-03-31 07:56:02 +09:00
from . common import InfoExtractor
2022-04-05 09:30:07 +09:00
from ..utils import (
    KNOWN_EXTENSIONS,
    ExtractorError,
    clean_html,
    determine_ext,
    get_element_by_class,
)
2022-04-02 18:31:52 +09:00
import re
2022-03-31 07:56:02 +09:00
class WikimediaIE(InfoExtractor):
    """Extract media hosted on Wikimedia Commons ``File:`` description pages.

    The file page is downloaded once and scraped for the first ``<source>``
    URL, the licence sentence, the description element, an author cell and
    any SRT subtitle tracks served through ``/w/api.php``.
    """

    _NETRC_MACHINE = 'wikimediaorg'
    IE_NAME = 'wikimedia.org'
    _API_BASE_URL = 'https://commons.wikimedia.org/'
    # Dots are escaped so only the real host matches; the id stops at '?' or
    # '#' so a query string or fragment never leaks into the video id.
    _VALID_URL = r'https://commons\.wikimedia\.org/wiki/File:(?P<id>[^/#?]+)'

    _TEST = {
        'url': 'https://commons.wikimedia.org/wiki/File:Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm',
        'info_dict': {
            'description': 'Deutsch: Beschreibung auf der Seite: "Im Verlauf der Erdgeschichte glich das Klima einer Achterbahnfahrt. Die „Fieberkurve“ unseres Planeten zeigt die globalen Temperaturschwankungen bis heute – rekonstruiert anhand von historischen Klimadaten."\nZu Wikimedia Commons hochgeladen von: PantheraLeo1359531.\nHinweise zur Weiterverwendung: https://www.zdf.de/dokumentation/terra-x/terra-x-creative-commons-cc-100.html.\nVereinfachender Verlauf in der Geschichte der Erde, für die Zukunft spätestens ab dem Jahr 2050 mit spekulativem Verlauf in der Prognose (ausgeprägtes Global-warming-Szenario ist dargestellt).English: Climate change, Temperature in history of Earth, Video of Terra X.',
            'ext': 'webm',
            'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS',
            'title': 'Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons',
            'license': 'This file is licensed under the Creative Commons Attribution 4.0 International license.',
            'author': 'ZDF/Terra X/Gruppe 5/Luise Wagner, Jonas Sichert, Andreas Hougardy',
            'subtitles': {
                'de': [{
                    'ext': 'srt',
                    'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&&lang=de&&trackformat=srt',
                }],
                'en-gb': [{
                    'ext': 'srt',
                    'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&&lang=en-gb&&trackformat=srt',
                }],
                'nl': [{
                    'ext': 'srt',
                    'url': 'https://commons.wikimedia.org/w/api.php?action=timedtext&&title=File%3ADie_Temperaturkurve_der_Erde_%28ZDF%2C_Terra_X%29_720p_HD_50FPS.webm&&lang=nl&&trackformat=srt',
                }],
            },
        },
    }

    def _real_extract(self, url):
        """Scrape a Commons file page and return its info dict.

        Args:
            url: A URL matching ``_VALID_URL``.

        Returns:
            A dict with ``id``, ``url``, ``ext``, ``title``, ``description``,
            ``license``, ``author`` and ``subtitles``. ``license`` and
            ``author`` are ``None`` when the page does not expose them.

        Raises:
            ExtractorError: if the file extension in the URL is not in
                ``KNOWN_EXTENSIONS``, or if the media URL or title cannot be
                found on the page.
        """
        video_id = self._match_id(url)

        # Reject file pages for non-media files before downloading anything.
        ext = determine_ext(url)
        if ext.lower() not in KNOWN_EXTENSIONS:
            raise ExtractorError('invalid video url', expected=True)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<source [^>]*src="([^"]+)"', webpage, 'video URL')

        # Licence and author are optional metadata: a page without them must
        # not abort the whole extraction, hence default=None.
        license_text = self._html_search_regex(
            r'\bThis\s*(.*?)\s*license\b', webpage, 'video license',
            default=None)
        if license_text is not None:
            # The pattern captures only the middle of the sentence; put the
            # surrounding words back.
            license_text = 'This ' + license_text + ' license.'

        # NOTE(review): this takes the first <td> with no nested markup as
        # the author, which depends on the page layout - confirm.
        author = self._html_search_regex(
            r'<td>([^\<]*?)<\/td>', webpage, 'video author', default=None)

        description = get_element_by_class('description', webpage)

        # Drop only the trailing ".<ext>", not every occurrence in the name.
        suffix = '.' + ext
        if video_id.endswith(suffix):
            video_id = video_id[:-len(suffix)]

        return {
            'id': video_id,
            'url': video_url,
            'ext': ext,
            'title': self._og_search_title(webpage).replace('File:', ''),
            'description': clean_html(description),
            'license': license_text,
            'author': author,
            'subtitles': self._extract_srt_tracks(webpage),
        }

    def _extract_srt_tracks(self, webpage):
        """Collect the SRT subtitle tracks referenced by the page, keyed by language."""
        subtitles = {}
        # Each match is the part of a track URL between "/w/api" and "srt".
        for fragment in re.findall(r'\bsrc=\"\/w\/api\s*(.*?)\s*srt\b', webpage):
            sub_url = self._API_BASE_URL + 'w/api' + fragment + 'srt'

            # A track without a recognisable language code cannot be keyed,
            # so it is skipped rather than stored under a garbage key.
            lang_match = re.search(r'\blang=([^&;]+)', sub_url)
            if lang_match is None:
                continue

            # Kept from the original so the emitted URLs match the
            # expectations in _TEST; presumably works around escaped
            # separators in the page markup - TODO confirm.
            sub_url = sub_url.replace(';', '&')
            subtitles[lang_match.group(1)] = [{'ext': 'srt', 'url': sub_url}]
        return subtitles