2014-12-13 03:22:24 +09:00
# coding: utf-8
from __future__ import unicode_literals
from . common import InfoExtractor
2019-05-28 12:58:12 +09:00
from . . utils import (
2021-08-19 21:56:53 +09:00
ExtractorError ,
2019-05-28 12:58:12 +09:00
determine_ext ,
js_to_json ,
)
2021-02-19 23:10:10 +09:00
from . . compat import (
compat_b64decode ,
compat_urllib_parse_unquote ,
)
import re
2014-12-13 03:22:24 +09:00
2021-08-19 21:56:53 +09:00
def decode_b64_url(code):
    """Decode the media path embedded in a player-config ``fileKey`` line.

    ``code`` must contain a bracketed list of quoted string fragments, e.g.
    ``fileKey: atob(decodeURIComponent(["aGVs","bG8%3D"].join("")))``.
    The fragments are concatenated, percent-decoded and then base64-decoded.

    Returns the decoded value as text (decoded as UTF-8).
    Raises ExtractorError when ``code`` contains no bracketed list.
    """
    # Brackets are escaped inside the character classes so that newer Python
    # versions do not emit a "possible nested set" FutureWarning.
    mobj = re.match(r'[^\[]*\[([^\]]*)\]', code)
    if mobj is None:
        # Without this guard the failure surfaced as an AttributeError on None.
        raise ExtractorError('Unable to find encoded media URL in player config')

    encoded = mobj.group(1)
    # Strip the quoting and separators of the JS array literal so that only
    # the concatenated payload remains.
    for junk in ('"', '\'', ','):
        encoded = encoded.replace(junk, '')

    return compat_b64decode(compat_urllib_parse_unquote(encoded)).decode('utf-8')
2014-12-13 03:22:24 +09:00
class RTPIE(InfoExtractor):
    """Extractor for rtp.pt/play and arquivos.rtp.pt/conteudos pages."""

    _VALID_URL = r'https?://(?:(?:(?:www\.)?rtp\.pt/play/(?P<subarea>.*/)?p(?P<program_id>[0-9]+)/(?P<episode_id>e[0-9]+/)?)|(?:arquivos\.rtp\.pt/conteudos/))(?P<id>[^/?#]+)/?'
    _TESTS = [{
        'url': 'https://www.rtp.pt/play/p9165/e562949/por-do-sol',
        'info_dict': {
            'id': 'por-do-sol',
            'ext': 'mp4',
            'title': 'Pôr do Sol Episódio 1 - de 16 Ago 2021',
            'description': 'Madalena Bourbon de Linhaça vive atormentada pelo segredo que esconde desde 1990. Matilde Bourbon de Linhaça sonha fugir com o seu amor proibido. O en',
            'thumbnail': r're:^https?://.*\.jpg',
        },
    }, {
        'url': 'https://www.rtp.pt/play/p510/aleixo-fm',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the page at ``url``.

        The page embeds an ``RTPPlayer({...})`` call. Its ``fileKey`` entry
        carries the encoded media path; the remaining entries are parsed as
        the player configuration (used for ``mediaType`` and ``poster``).

        Raises ExtractorError when ``fileKey`` is empty or when no media
        source could be determined from the player configuration.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Remove JS multi-line comments from webpage source.
        # NOTE(review): `.*` is greedy under DOTALL, so this removes everything
        # from the first `/*` to the last `*/` - confirm that this is intended.
        webpage = re.sub(r'(\/\*.*\*\/)', '', webpage, flags=re.DOTALL)

        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
        # Replace irrelevant text in title
        title = title.replace(' - RTP Play - RTP', '')

        # Check if it's a video split in parts, if so add part number to title
        part = self._html_search_regex(
            r'section\-parts.*<span.*>(.+?)</span>.*</ul>', webpage, 'part', default=None)
        if part:
            title = f'{title} {part}'

        # Get JS object
        js_object = self._search_regex(
            r'(?s)RTPPlayer *\( *({.+?}) *\);', webpage, 'player config')

        # The JS object is not pure JSON: pick out `fileKey` and keep only the
        # lines that can be handed to the JSON parser afterwards.
        config_lines = []
        full_url = None
        for line in js_object.splitlines():
            stripped_line = line.strip()

            if re.match(r'fileKey:', stripped_line):
                # `fileKey` holds the encoded media path we are looking for
                if re.match(r'fileKey: *""', stripped_line):
                    raise ExtractorError("Episode not found (probably removed)", expected=True)
                media_path = decode_b64_url(stripped_line)
                if 'mp3' in media_path:
                    full_url = 'https://cdn-ondemand.rtp.pt' + media_path
                else:
                    full_url = 'https://streaming-vod.rtp.pt/dash{}/manifest.mpd'.format(media_path)
                continue

            # Ignore commented lines, `extraSettings` and `file`. The latter
            # seems to be some random unrelated video.
            if stripped_line.startswith("//"):
                continue
            if re.match(r'file *:', stripped_line) or re.match(r'.*extraSettings ?:', stripped_line):
                continue
            config_lines.append(line)

        if not full_url:
            raise ExtractorError("No valid media source found in page")

        # Finally send the cleaned-up string for JSON parsing. Every kept line
        # is prefixed with a newline, exactly as the incremental build did.
        json_string_for_config = ''.join('\n' + kept for kept in config_lines)
        config = self._parse_json(json_string_for_config, video_id, js_to_json)

        full_url = full_url.replace('drm-dash', 'dash')
        ext = determine_ext(full_url)

        if ext == 'mpd':
            # Download via mpd file
            formats = self._extract_mpd_formats(full_url, video_id)
            self._sort_formats(formats)
        else:
            formats = [{
                'url': full_url,
                'ext': ext,
            }]

        if config.get('mediaType') == 'audio':
            # Mark every format as having no video stream
            for f in formats:
                f['vcodec'] = 'none'

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': self._html_search_meta(['description', 'twitter:description'], webpage),
            'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
        }