New extraction tactic for Kaltura ID using image URL

This commit is contained in:
dirkf 2022-02-08 01:48:32 +00:00
parent 09476ecdde
commit 5860937e17

View File

@ -12,7 +12,7 @@ class UNOIE(InfoExtractor):
_VALID_URL = r'https?://media\.un\.org/(?:\w+/)+(?P<id>k\d[\w]+)' _VALID_URL = r'https?://media\.un\.org/(?:\w+/)+(?P<id>k\d[\w]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://media.un.org/en/asset/k1r/k1r3vy9ikk', 'url': 'https://media.un.org/en/asset/k1r/k1r3vy9ikk',
# 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', 'md5': '981c41cb283227f079d1e5059fd0d30c',
'info_dict': { 'info_dict': {
'id': '1_r3vy9ikk', 'id': '1_r3vy9ikk',
'ext': 'mp4', 'ext': 'mp4',
@ -23,17 +23,31 @@ class UNOIE(InfoExtractor):
'timestamp': 1625216872, 'timestamp': 1625216872,
'upload_date': '20210702', 'upload_date': '20210702',
'uploader_id': 'UNWebTV_New_York', 'uploader_id': 'UNWebTV_New_York',
}
}, {
'url': 'https://media.un.org/en/asset/k12/k12gpkg3qx',
'md5': '5978503ca886a922a0f00cf5a7e82395',
'info_dict': {
'id': '1_vohfjqkj',
'ext': 'mp4',
'title': '1851st Meeting, 81st session Committee on the Elimination of Discrimination Against Women (CEDAW)',
'description': 'Informal meeting with NGOs and human rights institutions - 1851st Meeting, 81st session CEDAW',
'thumbnail': 're:https?://.+/thumbnail/.+',
'duration': 3502,
'timestamp': 1644235332,
'upload_date': '20220207',
'uploader_id': 'nathalie.minard@un.org',
}, },
}] }]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
video_id = video_id[1:2] + '_' + video_id[2:]
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
partner_id = self._search_regex(r'partnerId\s*:\s*(\d+)\b', webpage, 'Partner ID')
video_id = self._search_regex(r'/p/%s(?:/\w+)+?/entry_id/(\w+)/' % (partner_id, ), webpage, 'Kaltura ID')
title = ( title = (
self._html_search_meta(('title', 'og:title'), webpage) self._html_search_meta(('title', 'og:title'), webpage)
or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title\b', webpage, 'title').rsplit('|', 1)[0]).strip() or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title\b', webpage, 'title').rsplit('|', 1)[0]).strip()
partner_id = self._search_regex(r'partnerId\s*:\s*(\d+)\b', webpage, 'Partner ID')
result = self.url_result( result = self.url_result(
'kaltura:%s:%s' % (partner_id, video_id), 'Kaltura', 'kaltura:%s:%s' % (partner_id, video_id), 'Kaltura',
video_title=title, video_title=title,