2016-10-02 20:39:18 +09:00
# coding: utf-8
2014-01-07 18:04:48 +09:00
from __future__ import unicode_literals
2013-06-24 03:24:07 +09:00
import re
from . common import InfoExtractor
2023-02-16 01:37:05 +09:00
from . . compat import compat_str
2013-06-24 03:24:07 +09:00
from . . utils import (
2017-08-18 02:58:23 +09:00
ExtractorError ,
2023-02-15 03:18:47 +09:00
GeoRestrictedError ,
2014-10-20 22:27:59 +09:00
int_or_none ,
2023-02-15 03:18:47 +09:00
merge_dicts ,
parse_iso8601 ,
parse_qs ,
strip_or_none ,
traverse_obj ,
2020-11-19 05:26:49 +09:00
url_or_none ,
2023-02-15 03:18:47 +09:00
urljoin ,
2013-06-24 03:24:07 +09:00
)
2016-06-02 03:10:23 +09:00
class ArteTVBaseIE(InfoExtractor):
    """Shared constants and compatibility shims for the ARTE extractors."""

    # Regex alternation of the language codes accepted in arte.tv URLs;
    # it is interpolated into the subclasses' _VALID_URL patterns, so it
    # must not carry any surrounding whitespace.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
    # Prefix of the player API; '/config/...' and '/playlist/...' paths
    # are appended to it by the subclasses.
    _API_BASE = 'https://api.arte.tv/api/player/v2'

    # yt-dlp shims

    @classmethod
    def _match_valid_url(cls, url):
        """Match `url` against cls._VALID_URL; return the match object or None."""
        return re.match(cls._VALID_URL, url)

    def _extract_m3u8_formats_and_subtitles(self, *args, **kwargs):
        """Return a (formats, subtitles) pair for an HLS manifest.

        All arguments are forwarded to _extract_m3u8_formats(); the
        subtitles element is always an empty dict.
        """
        return self._extract_m3u8_formats(*args, **kwargs), {}
2020-11-19 07:02:04 +09:00
class ArteTVIE(ArteTVBaseIE):
    """Extractor for single ARTE programmes (and the LIVE stream).

    Accepts arte.tv video pages, player API config URLs and the internal
    `arte://program/<id>` scheme.
    """

    # NOTE: for the `arte://program` alternative neither `lang` nor
    # `lang_2` takes part in the match, so both groups are None.
    _VALID_URL = r'''(?x)
                    (?:https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                    |arte://program)
                        /(?P<id>\d{6}-\d{3}-[AF]|LIVE)
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'info_dict': {
            'id': '100103-000-A',
            'title': 'USA: Dyskryminacja na porodówce',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'alt_title': 'ARTE Reportage',
            'timestamp': 1604417980,
            'upload_date': '20201103',
            'duration': 554,
            # test format sort
            'height': 720,
            'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530',
            'ext': 'mp4',
        },
        'params': {
            'format': 'bestvideo',
            'skip_download': 'm3u8',
        },
    }, {
        'note': 'No alt_title',
        'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/',
        'info_dict': {
            'id': '110371-000-A',
            'ext': 'mp4',
            'upload_date': '20220718',
            'duration': 154,
            'timestamp': 1658162460,
            'description': 'md5:5890f36fe7dccfadb8b7c0891de54786',
            'title': 'La chaleur, supplice des arbres de rue',
            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530',
        },
        'params': {
            'format': 'bestvideo',
            'skip_download': 'm3u8',
        },
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
        'only_matching': True,
    }]

    _GEO_BYPASS = True

    # Keys must be the language codes of _ARTE_LANGUAGES (plus 'mul'),
    # since the map is looked up with the language matched from the URL.
    _LANG_MAP = {  # ISO639 -> French abbreviations
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
        # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/>
        # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed)
        'mul': 'EU',
    }

    # Parses the stream version code (`eStat.ml5`) into voice language,
    # audio-description flag and subtitle information.
    _VERSION_CODE_RE = re.compile(r'''(?x)
        V
        (?P<original_voice>O?)
        (?P<vlang>[FA]|E\[[A-Z]+\]|EU)?
        (?P<audio_desc>AUD|)
        (?:
            (?P<has_sub>-ST)
            (?P<sdh_sub>M?)
            (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU)
        )?
    ''')

    # all obtained by exhaustive testing
    _COUNTRIES_MAP = {
        'DE_FR': (
            'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC',
            'PF', 'PM', 'RE', 'WF', 'YT',
        ),
        # with both of the below 'BE' sometimes works, sometimes doesn't
        'EUR_DE_FR': (
            'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI',
            'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF',
            'YT',
        ),
        'SAT': (
            'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ',
            'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF',
            'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI',
            'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC',
            'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO',
            'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT',
        ),
    }

    def _real_extract(self, url):
        """Download the player config for `url` and build the info dict.

        Raises GeoRestrictedError when the config reports a restricted
        area, and an expected ExtractorError when the config carries no
        `rights` entry.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        lang = mobj.group('lang') or mobj.group('lang_2')
        language_code = self._LANG_MAP.get(lang)

        config = self._download_json(
            '{0}/config/{1}/{2}'.format(self._API_BASE, lang, video_id), video_id)
        attributes = config['data']['attributes']

        geoblocking = traverse_obj(
            config, ('data', 'attributes', 'restriction', 'geoblocking')) or {}
        if geoblocking.get('restrictedArea'):
            # .get(): a restriction without a 'code' should still be
            # reported as a geo restriction rather than as a KeyError
            geo_code = geoblocking.get('code')
            raise GeoRestrictedError(
                'Video restricted to {0!r}'.format(geo_code),
                countries=self._COUNTRIES_MAP.get(geo_code, ('DE', 'FR')))

        if not traverse_obj(config, ('data', 'attributes', 'rights')):
            # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten
            # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23
            raise ExtractorError(
                'Video is not available in this language edition of Arte or broadcast rights expired',
                expected=True)

        formats, subtitles = [], {}
        # HLS variants whose short label starts with 'cc' or 'OGsub';
        # they are appended after all other formats.
        secondary_formats = []
        for stream in attributes['streams']:
            # official player contains code like `e.get("versions")[0].eStat.ml5`
            stream_version = stream['versions'][0]
            stream_version_code = stream_version['eStat']['ml5']

            lang_pref = -1
            m = self._VERSION_CODE_RE.match(stream_version_code)
            if m:
                # Each criterion contributes one digit ('0' or '1'), the
                # most important criterion first, so a larger number
                # means a better match.
                lang_pref = int(''.join('01'[x] for x in (
                    m.group('vlang') == language_code,      # we prefer voice in the requested language
                    not m.group('audio_desc'),              # and not the audio description version
                    bool(m.group('original_voice')),        # but if voice is not in the requested language, at least choose the original voice
                    m.group('sub_lang') == language_code,   # if subtitles are present, we prefer them in the requested language
                    not m.group('has_sub'),                 # but we prefer no subtitles otherwise
                    not m.group('sdh_sub'),                 # and we prefer not the hard-of-hearing subtitles if there are subtitles
                )))

            short_label = traverse_obj(
                stream_version, 'shortLabel', expected_type=compat_str, default='?')
            format_note = '{0} [{1}]'.format(
                stream_version.get('label', 'unknown'), short_label)

            protocol = stream['protocol']
            if protocol.startswith('HLS'):
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    stream['url'], video_id=video_id, ext='mp4',
                    m3u8_id=stream_version_code, fatal=False)
                for fmt in fmts:
                    fmt.update({
                        'format_note': format_note,
                        'language_preference': lang_pref,
                    })
                if short_label.startswith(('cc', 'OGsub')):
                    secondary_formats.extend(fmts)
                else:
                    formats.extend(fmts)
                # `subs` is a subtitles dict: merge it as a whole instead
                # of iterating over its keys
                subtitles = self._merge_subtitles(subtitles, subs)

            elif protocol in ('HTTPS', 'RTMP'):
                formats.append({
                    'format_id': '{0}-{1}'.format(protocol, stream_version_code),
                    'url': stream['url'],
                    'format_note': format_note,
                    'language_preference': lang_pref,
                    # 'ext': 'mp4',  # XXX: may or may not be necessary, at least for HTTPS
                })

            else:
                self.report_warning(
                    'Skipping stream with unknown protocol {0}'.format(protocol))

            # TODO: chapters from stream['segments']?
            # The JS also looks for chapters in config['data']['attributes']['chapters'],
            # but I am yet to find a video having those

        formats.extend(secondary_formats)
        self._remove_duplicate_formats(formats)
        self._sort_formats(formats)

        metadata = attributes['metadata']

        return {
            'id': metadata['providerId'],
            'webpage_url': traverse_obj(metadata, ('link', 'url')),
            # prefer the subtitle as title; fall back to the title
            'title': traverse_obj(metadata, 'subtitle', 'title'),
            # the title becomes alt_title only when a subtitle exists
            'alt_title': metadata.get('subtitle') and metadata.get('title'),
            'description': metadata.get('description'),
            'duration': traverse_obj(metadata, ('duration', 'seconds')),
            'language': metadata.get('language'),
            'timestamp': traverse_obj(
                config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601),
            'is_live': attributes.get('live', False),
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': [
                {'url': image['url'], 'id': image.get('caption')}
                for image in metadata.get('images') or [] if url_or_none(image.get('url'))
            ],
        }
2013-10-13 20:54:31 +09:00
2016-01-22 02:47:43 +09:00
2020-11-19 07:02:04 +09:00
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for embedded ARTE players.

    The player URL carries the API config URL in its `json_url` query
    parameter; extraction is delegated to ArteTVIE with that URL.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    # Group 1 is the quote character of the src attribute, `url` the
    # player URL enclosed by it.
    _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1']
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'only_matching': True,
        'skip': 'Video is not available in this language edition of Arte or broadcast rights expired'
    }, {
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fpl%2F100103-000-A&lang=pl&autoplay=true&mute=100103-000-A',
        'info_dict': {
            'id': '100103-000-A',
            'ext': 'mp4',
            'title': 'USA: Dyskryminacja na porodówce',
            'timestamp': 1604417980,
            'upload_date': '20201103',
            'description': 'md5:242017b7cce59ffae340a54baefcafb1',
            'duration': 554,
        },
        'params': {
            'format': 'bestvideo',
            'skip_download': 'm3u8',
        },
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    @classmethod
    def _extract_urls(cls, webpage):
        """Return the list of player URLs matched by _EMBED_REGEX in `webpage`."""
        import itertools  # just until this is lifted into IE

        # every pattern has two groups, so findall() yields (quote, url) pairs
        return list(itertools.chain.from_iterable(
            (embed_url for _, embed_url in re.findall(erx, webpage))
            for erx in cls._EMBED_REGEX))

    def _real_extract(self, url):
        """Hand the `json_url` query parameter of `url` over to ArteTVIE."""
        qs = parse_qs(url)
        # take the last occurrence if the parameter is repeated
        json_url = qs['json_url'][-1]
        video_id = ArteTVIE._match_id(json_url)
        return self.url_result(
            json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
2016-10-16 02:24:06 +09:00
2016-06-02 03:10:23 +09:00
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for ARTE collections (`RC-<6 digits>` ids)."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'only_matching': True,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'playlist_mincount': 100,
        'info_dict': {
            'description': 'md5:84e7bf1feda248bc325ebfac818c476e',
            'id': 'RC-014123',
            'title': 'ARTE Reportage - najlepsze reportaże',
        },
        'skip': '404 Not Found',
    }, {
        'url': 'https://www.arte.tv/en/videos/RC-016979/war-in-ukraine/',
        'playlist_mincount': 79,
        'info_dict': {
            'id': 'RC-016979',
            'title': 'War in Ukraine',
            'description': 'On 24 February, Russian armed forces invaded Ukraine. We follow the war day by day and provide background information with special insights, reports and documentaries.',
        },
    }]

    def _real_extract(self, url):
        """Fetch the playlist from the player API and return a playlist result.

        Each entry is a `url_transparent` result pointing at the item's
        config URL, to be resolved by ArteTVIE.
        """
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        playlist = self._download_json(
            '{0}/playlist/{1}/{2}'.format(self._API_BASE, lang, playlist_id),
            playlist_id)['data']['attributes']

        entries = [{
            '_type': 'url_transparent',
            'url': video['config']['url'],
            'ie_key': ArteTVIE.ie_key(),
            'id': video.get('providerId'),
            'title': video.get('title'),
            'alt_title': video.get('subtitle'),
            'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))),
            'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))),
            # the filter keeps only the items that carry a config URL
        } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))]

        return self.playlist_result(
            entries, playlist_id,
            traverse_obj(playlist, ('metadata', 'title')),
            traverse_obj(playlist, ('metadata', 'description')))
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for ARTE category pages, returned as playlists.

    The page is scanned for links that ArteTVIE or ArteTVPlaylistIE can
    handle.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Watch documentaries and reportage about politics, society and current affairs.',
        },
        'playlist_mincount': 13,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept `url` only if neither ArteTVIE nor ArteTVPlaylistIE does."""
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super(ArteTVCategoryIE, cls).suitable(url))

    def _real_extract(self, url):
        """Collect the video and collection links of a category page."""
        lang, playlist_id = self._match_valid_url(url).group('lang', 'id')
        webpage = self._download_webpage(url, playlist_id)

        items = []
        for mobj in re.finditer(
                r'<a\b[^>]+\bhref\s*=\s*(?P<q>"|\'|\b)(?P<url>(?:https?://www\.arte\.tv)?/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video = urljoin(url, mobj.group('url'))
            if video == url:
                # skip links back to the category page itself
                continue
            if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video)

        # _html_search_regex() needs the text to search and a field name
        title = (self._og_search_title(webpage, default=None)
                 or self._html_search_regex(
                     r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None))
        # both lookups may come back empty, so guard before rsplit();
        # only the part before the last '|' is kept
        title = strip_or_none((title or '').rsplit('|', 1)[0]) or self._generic_title(url)

        return merge_dicts(
            self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title),
            {'description': self._og_search_description(webpage, default=None)})