2014-12-13 03:22:24 +09:00
# coding: utf-8
from __future__ import unicode_literals
from . common import InfoExtractor
2019-05-28 12:58:12 +09:00
from . . utils import (
2021-08-29 05:10:18 +09:00
ExtractorError ,
2019-05-28 12:58:12 +09:00
determine_ext ,
js_to_json ,
)
2021-02-19 23:10:10 +09:00
from . . compat import (
compat_b64decode ,
compat_urllib_parse_unquote ,
)
import re
2014-12-13 03:22:24 +09:00
class RTPIE(InfoExtractor):
    """Extractor for RTP Play (rtp.pt/play) and RTP Arquivos (arquivos.rtp.pt) pages.

    The page embeds an ``RTPPlayer({...})`` call whose argument is a JavaScript
    object (not strict JSON).  The media location is read from its ``fileKey``
    entry (possibly URL-/base64-encoded) or, failing that, from ``file``.
    """

    _VALID_URL = r'https?://(?:(?:(?:www\.)?rtp\.pt/play/(?P<subarea>.*/)?p(?P<program_id>[0-9]+)/(?P<episode_id>e[0-9]+/)?)|(?:arquivos\.rtp\.pt/conteudos/))(?P<id>[^/?#]+)/?'
    _TESTS = [{
        'url': 'https://www.rtp.pt/play/p117/e563265/os-contemporaneos',
        'info_dict': {
            'id': 'os-contemporaneos',
            'ext': 'mp4',
            'title': 'Os Contemporâneos Episódio 1',
            'description': 'Os Contemporâneos, um programa de humor com um olhar na sociedade portuguesa!',
            'thumbnail': r're:^https?://.*\.(jpg|png)',
        },
    }, {
        'url': 'https://www.rtp.pt/play/p8157/e541212/telejornal',
        'info_dict': {
            'id': 'telejornal',
            'ext': 'mp4',
            'title': 'Telejornal de 01 Mai 2021 PARTE 1',
            'description': 'A mais rigorosa seleção de notícias, todos os dias às 20h00. De segunda a domingo, João Adelino Faria e José Rodrigues dos Santos mostram-lhe o que de',
        },
    }, {
        'url': 'https://www.rtp.pt/play/p6646/e457262/grande-entrevista',
        'info_dict': {
            'id': 'grande-entrevista',
            'ext': 'mp4',
            'title': 'Grande Entrevista Episódio 7 - de 19 Fev 2020',
            'description': 'Bruno Nogueira - É um dos mais originais humoristas portugueses e de maior êxito! Bruno Nogueira na Grande Entrevista com Vítor Gonçalves.',
        },
    }, {
        'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/e539826/portugues-1-ano',
        'info_dict': {
            'id': 'portugues-1-ano',
            'ext': 'mp4',
            'title': 'Português - 1.º ano , aula 45 - 27 Abr 2021 - Estudo Em Casa - RTP',
            'description': 'A História do Pedrito Coelho, de Beatrix Potter. O dígrafo \'lh\' - A História do Pedrito Coelho, de Beatrix Potter. O dígrafo \'lh\'.',
        },
    }, {
        'url': 'https://www.rtp.pt/play/zigzag/p5449/e385973/banda-zig-zag',
        'info_dict': {
            'id': 'banda-zig-zag',
            'ext': 'mp4',
            'title': 'Banda Zig Zag Episódio 1 - Zig Zag Play - RTP',
            'description': 'A Amizade é o Nosso Mel - Zig: é a menina que além de tocar também canta. Adora aprender palavras novas e adora ler. Gosta de fazer palavras cruzadas',
        },
    }, {
        'url': 'https://arquivos.rtp.pt/conteudos/liga-dos-ultimos-152/',
        'info_dict': {
            'id': 'liga-dos-ultimos-152',
            'ext': 'mp4',
            'title': 'Liga dos Últimos – RTP Arquivos',
            'description': 'Magazine desportivo, com apresentação de Álvaro Costa e comentários em estúdio do professor Hernâni Gonçalves e do sociólogo João Nuno Coelho. Destaque para os jogos de futebol das equipas dos escalões secundários de Portugal, com momentos dos jogos: Agrário de Lamas vs Pampilhoense e Apúlia vs Fragoso.',
        },
    }, {
        'url': 'https://www.rtp.pt/play/p510/aleixo-fm',
        'only_matching': True,
    }]

    @staticmethod
    def _decode_file_key(stripped_line):
        """Return the decoded URL from a ``fileKey: decodeURIComponent(...[...])`` line.

        The encoded payload is the content of the first ``[...]`` on the line,
        with whitespace, quotes and commas removed.  When the line also
        mentions ``atob`` the unquoted payload is additionally base64-decoded.

        Raises ExtractorError if the line has no bracketed payload.
        """
        payload = re.match(r"[^[]*\[([^]]*)\]", stripped_line)
        if payload is None:
            # Fail with a clear message instead of an AttributeError on None
            raise ExtractorError('Unable to parse encoded fileKey in player config')
        encoded_url = re.sub(r'[\s"\',]', '', payload.group(1))

        unquoted = compat_urllib_parse_unquote(encoded_url)
        if 'atob' in stripped_line:
            # 'atob' is used most of the time, but not always
            return compat_b64decode(unquoted).decode('utf-8')
        return unquoted

    @classmethod
    def _sanitize_player_config(cls, js_object):
        """Rewrite the raw ``RTPPlayer`` JS object line by line into parseable text.

        - an encoded ``fileKey`` line is replaced by ``fileKey: "<decoded url>",``
        - lines starting with ``//`` and lines matching ``extraSettings`` are dropped
        - ``file`` lines seen after a ``fileKey`` line are dropped
        Every other line is kept (stripped).  Each kept line is prefixed with a
        newline, exactly as the result is later handed to the JSON parser.
        """
        kept_lines = []
        filekey_found = False

        for raw_line in js_object.splitlines():
            stripped_line = raw_line.strip()

            if re.match('fileKey ?:', stripped_line):
                filekey_found = True
                if 'decodeURIComponent' in stripped_line:
                    # 1) The value is an encoded (relative) URL
                    decoded_url = cls._decode_file_key(stripped_line)
                    kept_lines.append(f'\nfileKey: "{decoded_url}",')
                else:
                    # 2) ... or the value is not encoded, so keep it as is
                    kept_lines.append(f'\n{stripped_line}')
                continue

            if stripped_line.startswith("//"):
                continue
            if re.match('.*extraSettings ?:', stripped_line):
                continue
            if filekey_found and re.match('file ?:', stripped_line):
                # 'fileKey' already seen: the plain 'file' entry is redundant
                continue
            kept_lines.append(f'\n{stripped_line}')

        return ''.join(kept_lines)

    def _extract_formats(self, file_url, video_id):
        """Build the formats list for *file_url* based on its extension."""
        ext = determine_ext(file_url)

        if ext in ('mp4', 'm3u8'):
            if ext == 'mp4':
                # Due to recent changes, the URL must be hardcoded like this
                # and downloaded through HLS
                file_url = f'https://streaming-vod.rtp.pt/hls{file_url}/index-v1-a1.m3u8'
            # An 'm3u8' URL can be used without any further changes
            return self._extract_m3u8_formats(
                file_url, video_id, 'mp4', 'm3u8_native',
                m3u8_id='hls')

        # Any other extension: prepend the CDN base path
        return [{
            'url': f'https://cdn-ondemand.rtp.pt{file_url}',
            'ext': ext,
        }]

    def _real_extract(self, url):
        """Download the page at *url* and return its info dict.

        Raises ExtractorError when the episode is flagged unavailable in the
        page title, or when the player config has neither ``fileKey`` nor
        ``file``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Remove JS multi-line comments from webpage source
        # NOTE(review): '.*' is greedy under DOTALL, so this removes everything
        # from the first '/*' to the last '*/' - kept as is because changing it
        # alters which RTPPlayer(...) call survives; confirm against live pages.
        webpage = re.sub(r'(\/\*.*\*\/)', '', webpage, flags=re.DOTALL)

        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
        if 'Este episódio não se encontra disponível' in title:
            raise ExtractorError('Episode unavailable', expected=True)

        # Replace irrelevant text in title; strip so that removing the suffix
        # cannot leave dangling whitespace behind
        title = re.sub(r'- ?RTP Play - RTP', '', title).strip()

        # Check if it's a video split in parts, if so add part number to title
        part = self._html_search_regex(
            r'section\-parts.*<span.*>(.+?)</span>.*</ul>', webpage, 'part', default=None)
        if part:
            title = f'{title} {part}'

        # Get JS object and fix it up, since it isn't pure JSON
        js_object = self._search_regex(
            r'(?s)RTPPlayer *\( *({.+?}) *\);', webpage, 'player config')
        config = self._parse_json(
            self._sanitize_player_config(js_object), video_id, js_to_json)

        if 'fileKey' in config:
            # 'fileKey' has priority over 'file' on our end
            file_url = config['fileKey']
        elif 'file' in config:
            # 'RTP Arquivos' still uses old regular non-encoded 'file' key
            file_url = config['file']
        else:
            raise ExtractorError('No valid media source found in page')

        formats = self._extract_formats(file_url, video_id)

        # 'mediaType' may be absent from the config; treat that as "not audio"
        if config.get('mediaType') == 'audio':
            for f in formats:
                f['vcodec'] = 'none'

        # Each 'vtt' entry is unpacked as (language code, language name, URL);
        # collect every well-formed entry rather than only the first one
        subtitles = {}
        for entry in config.get('vtt') or []:
            if not isinstance(entry, (list, tuple)) or len(entry) != 3:
                continue
            sub_lang, _sub_lang_full, sub_url = entry
            subtitles.setdefault(sub_lang, []).append({
                'url': sub_url,
                'ext': 'vtt',
            })

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': self._html_search_meta(
                ['og:description', 'description', 'twitter:description'], webpage),
            # 'poster' may be missing or empty; fall back to the og:image tag
            'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
        }