Compare commits

...

2 Commits

Author SHA1 Message Date
nixxo
3a61e6d360
[rai] improve subtitles extraction (#27705)
closes #27698
2021-01-07 13:48:45 +00:00
Remita Amine
3d8e32dcc0 [canvas] Match only supported VRT NU URLs (#27707) 2021-01-07 12:35:04 +01:00
3 changed files with 44 additions and 20 deletions

View File

@@ -258,16 +258,24 @@ class TestNRKSubtitles(BaseTestSubtitles):
class TestRaiPlaySubtitles(BaseTestSubtitles): class TestRaiPlaySubtitles(BaseTestSubtitles):
url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
IE = RaiPlayIE IE = RaiPlayIE
def test_allsubtitles(self): def test_subtitles_key(self):
self.url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(set(subtitles.keys()), set(['it'])) self.assertEqual(set(subtitles.keys()), set(['it']))
self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a')
def test_subtitles_array_key(self):
self.url = 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(set(subtitles.keys()), set(['it']))
self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd')
class TestVikiSubtitles(BaseTestSubtitles): class TestVikiSubtitles(BaseTestSubtitles):
url = 'http://www.viki.com/videos/1060846v-punch-episode-18' url = 'http://www.viki.com/videos/1060846v-punch-episode-18'

View File

@@ -211,7 +211,7 @@ class CanvasEenIE(InfoExtractor):
class VrtNUIE(GigyaBaseIE): class VrtNUIE(GigyaBaseIE):
IE_DESC = 'VrtNU.be' IE_DESC = 'VrtNU.be'
_VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
_TESTS = [{ _TESTS = [{
# Available via old API endpoint # Available via old API endpoint
'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/',

View File

@@ -103,22 +103,28 @@ class RaiBaseIE(InfoExtractor):
}.items() if v is not None) }.items() if v is not None)
@staticmethod @staticmethod
def _extract_subtitles(url, subtitle_url): def _extract_subtitles(url, video_data):
STL_EXT = 'stl'
SRT_EXT = 'srt'
subtitles = {} subtitles = {}
if subtitle_url and isinstance(subtitle_url, compat_str): subtitles_array = video_data.get('subtitlesArray') or []
subtitle_url = urljoin(url, subtitle_url) for k in ('subtitles', 'subtitlesUrl'):
STL_EXT = '.stl' subtitles_array.append({'url': video_data.get(k)})
SRT_EXT = '.srt' for subtitle in subtitles_array:
subtitles['it'] = [{ sub_url = subtitle.get('url')
'ext': 'stl', if sub_url and isinstance(sub_url, compat_str):
'url': subtitle_url, sub_lang = subtitle.get('language') or 'it'
}] sub_url = urljoin(url, sub_url)
if subtitle_url.endswith(STL_EXT): sub_ext = determine_ext(sub_url, SRT_EXT)
srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT subtitles.setdefault(sub_lang, []).append({
subtitles['it'].append({ 'ext': sub_ext,
'ext': 'srt', 'url': sub_url,
'url': srt_url,
}) })
if STL_EXT == sub_ext:
subtitles[sub_lang].append({
'ext': SRT_EXT,
'url': sub_url[:-len(STL_EXT)] + SRT_EXT,
})
return subtitles return subtitles
@@ -138,6 +144,9 @@ class RaiPlayIE(RaiBaseIE):
'duration': 6160, 'duration': 6160,
'series': 'Report', 'series': 'Report',
'season': '2013/14', 'season': '2013/14',
'subtitles': {
'it': 'count:2',
},
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@@ -145,6 +154,10 @@ class RaiPlayIE(RaiBaseIE):
}, { }, {
'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
'only_matching': True, 'only_matching': True,
}, {
# subtitles at 'subtitlesArray' key (see #27698)
'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@@ -172,7 +185,7 @@ class RaiPlayIE(RaiBaseIE):
if date_published and time_published: if date_published and time_published:
date_published += ' ' + time_published date_published += ' ' + time_published
subtitles = self._extract_subtitles(url, video.get('subtitles')) subtitles = self._extract_subtitles(url, video)
program_info = media.get('program_info') or {} program_info = media.get('program_info') or {}
season = media.get('season') season = media.get('season')
@@ -327,7 +340,7 @@ class RaiIE(RaiBaseIE):
'skip_download': True, 'skip_download': True,
}, },
}, { }, {
# ContentItem in iframe (see #12652) # ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key
'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html', 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html',
'info_dict': { 'info_dict': {
'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd', 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd',
@@ -335,6 +348,9 @@ class RaiIE(RaiBaseIE):
'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015', 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015',
'description': 'md5:d291b03407ec505f95f27970c0b025f4', 'description': 'md5:d291b03407ec505f95f27970c0b025f4',
'upload_date': '20150913', 'upload_date': '20150913',
'subtitles': {
'it': 'count:2',
},
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@@ -379,7 +395,7 @@ class RaiIE(RaiBaseIE):
'url': compat_urlparse.urljoin(url, thumbnail_url), 'url': compat_urlparse.urljoin(url, thumbnail_url),
}) })
subtitles = self._extract_subtitles(url, media.get('subtitlesUrl')) subtitles = self._extract_subtitles(url, media)
info = { info = {
'id': content_id, 'id': content_id,