code review fixes

This commit is contained in:
Yuval Hager 2021-01-27 16:03:01 -08:00
parent e3a900e707
commit e6c7b3c154
2 changed files with 78 additions and 59 deletions

View File

@ -521,7 +521,10 @@ from .joj import JojIE
from .jwplatform import JWPlatformIE from .jwplatform import JWPlatformIE
from .kakao import KakaoIE from .kakao import KakaoIE
from .kaltura import KalturaIE from .kaltura import KalturaIE
from .kan import KanIE from .kan import (
KanEpisodeIE,
KanPlaylistIE
)
from .kankan import KankanIE from .kankan import KankanIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE from .karrierevideos import KarriereVideosIE

View File

@ -3,7 +3,12 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import unified_strdate, parse_duration from ..utils import (
ExtractorError,
parse_duration,
try_get,
unified_strdate,
)
def get_thumbnail(data): def get_thumbnail(data):
@ -15,9 +20,49 @@ def get_thumbnail(data):
return thumbnail return thumbnail
class KanIE(InfoExtractor): class KanBaseIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?kan\.org\.il/(?:[iI]tem/\?item[iI]d|program/\?cat[iI]d)=(?P<id>[0-9]+)' _GEO_COUNTRIES = ['IL']
_TESTS = [{
def download_webpage(self, url, video_id):
    """Download *url* and return what ``self._download_webpage`` yields.

    The request is sent with whatever headers
    ``self.geo_verification_headers()`` returns; *video_id* is passed
    through unchanged to the underlying download helper.
    """
    geo_headers = self.geo_verification_headers()
    return self._download_webpage(url, video_id, headers=geo_headers)
def extract_item(self, video_id, webpage):
    """Build the info dict for a single video from its item page.

    Metadata is read from the JSON document embedded in the
    ``kan_app_search_data`` script tag of *webpage*. Title, description
    and creator fall back to the ``_og_search_*`` helpers when the JSON
    does not provide them.

    :param video_id: id stored in the result and passed to the helpers
    :param webpage: HTML of the item page
    :returns: dict with ``id``, ``title``, ``thumbnail``, ``formats``,
        ``description``, ``creator``, ``release_date`` and ``duration``
    :raises ExtractorError: if the JSON carries no ``content.src`` URL
    """
    # Imported locally so this method is self-contained. The module uses
    # ``unicode_literals`` and JSON strings are ``unicode`` on Python 2, so
    # passing the builtin ``str`` as the expected type would reject every
    # value there; ``compat_str`` is the text type on both major versions.
    from ..compat import compat_str

    data = self._parse_json(
        self._search_regex(
            r'<script id="kan_app_search_data" type="application/json">([^<]+)</script>',
            webpage,
            'data',
        ),
        video_id,
    )

    title = data.get('title') or self._og_search_title(webpage)
    description = data.get('summary') or \
        self._og_search_description(webpage, fatal=False)
    creator = try_get(data, lambda x: x['author']['name'], compat_str) or \
        self._og_search_property('site_name', webpage, fatal=False)
    thumbnail = get_thumbnail(data)

    m3u8_url = try_get(data, lambda x: x['content']['src'], compat_str)
    if not m3u8_url:
        raise ExtractorError('Unable to extract m3u8 url')

    # try_get also copes with "extensions": null, where chaining
    # ``.get()`` on the looked-up value would raise AttributeError.
    duration = try_get(data, lambda x: x['extensions']['duration'])

    return {
        'id': video_id,
        'title': title,
        'thumbnail': thumbnail,
        'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'),
        'description': description,
        'creator': creator,
        'release_date': unified_strdate(data.get('published')),
        'duration': parse_duration(duration),
    }
class KanEpisodeIE(KanBaseIE):
_VALID_URL = r'https?://(?:www\.)?kan\.org\.il/[iI]tem/\?item[iI]d=(?P<id>[0-9]+)'
_TEST = {
'url': 'https://www.kan.org.il/Item/?itemId=74658', 'url': 'https://www.kan.org.il/Item/?itemId=74658',
'md5': 'c28763bdb61c1bb7823528dd024e6129', 'md5': 'c28763bdb61c1bb7823528dd024e6129',
'info_dict': { 'info_dict': {
@ -28,74 +73,45 @@ class KanIE(InfoExtractor):
'description': 'הגופות ממשיכות להיערם, אך איזי עדיין מפקפק בחשדות נגד ברק', 'description': 'הגופות ממשיכות להיערם, אך איזי עדיין מפקפק בחשדות נגד ברק',
'creator': 'מערכת כאן', 'creator': 'מערכת כאן',
'release_date': '20200803', 'release_date': '20200803',
'duration': 2393} 'duration': 2393,
}, { },
}
def _real_extract(self, url):
    """Extract one episode.

    Resolves the id from *url* with ``self._match_id``, downloads the page
    through ``self.download_webpage`` and returns the result of
    ``self.extract_item`` for that id and page.
    """
    episode_id = self._match_id(url)
    page = self.download_webpage(url, episode_id)
    return self.extract_item(episode_id, page)
class KanPlaylistIE(KanBaseIE):
_VALID_URL = r'https?://(?:www\.)?kan\.org\.il/program/\?cat[iI]d=(?P<id>[0-9]+)'
_TEST = {
'url': 'https://www.kan.org.il/program/?catId=1636', 'url': 'https://www.kan.org.il/program/?catId=1636',
'playlist_mincount': 9, 'playlist_mincount': 9,
'info_dict': { 'info_dict': {
'id': '1636', 'id': '1636',
'title': 'מנאייכ - פרקים מלאים לצפייה ישירה | כאן', 'title': 'מנאייכ - פרקים מלאים לצפייה ישירה | כאן',
'description': 'md5:9dfbd501189d08674d20762464c5301b' 'description': 'md5:9dfbd501189d08674d20762464c5301b',
},
} }
}]
_GEO_COUNTRIES = ['IL']
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) list_id = self._match_id(url)
webpage = self._download_webpage( webpage = self.download_webpage(url, list_id)
url,
video_id,
headers=self.geo_verification_headers())
if 'itemid' in url.lower():
return self._extract_item(video_id, webpage)
elif 'catid' in url.lower():
return self._extract_list(video_id, webpage)
return {}
def _extract_list(self, list_id, webpage):
video_ids = re.findall(r'onclick="playVideo\(.*,\'([0-9]+)\'\)', webpage) video_ids = re.findall(r'onclick="playVideo\(.*,\'([0-9]+)\'\)', webpage)
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
entries = [] entries = []
for video_id in video_ids: for video_id in video_ids:
url = 'https://www.kan.org.il/Item/?itemId=%s' % video_id url = 'https://www.kan.org.il/Item/?itemId=%s' % video_id
webpage = self._download_webpage( entries.append(self.extract_item(
url,
video_id, video_id,
headers=self.geo_verification_headers()) self.download_webpage(url, video_id))
entries.append(self._extract_item(video_id, webpage)) )
if not entries:
raise ExtractorError('Unable to extract playlist entries')
return { return {
'_type': 'playlist', '_type': 'playlist',
'id': list_id, 'id': list_id,
'entries': entries, 'entries': entries,
'title': title, 'title': self._og_search_title(webpage, fatal=False),
'description': description 'description': self._og_search_description(webpage),
}
def _extract_item(self, video_id, webpage):
data = self._parse_json(
self._search_regex(
r'<script id="kan_app_search_data" type="application/json">([^<]+)</script>',
webpage, 'data'),
video_id)
title = data.get('title') or \
self._og_search_title(webpage) or \
self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
description = data.get('summary') or \
self._og_search_description(webpage, fatal=False)
creator = data.get('author', {}).get('name') or \
self._og_search_property('site_name', webpage, fatal=False)
thumbnail = get_thumbnail(data)
m3u8_url = data.get('content', {}).get('src')
formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
return {
'_type': 'video',
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'formats': formats,
'description': description,
'creator': creator,
'release_date': unified_strdate(data.get('published')),
'duration': parse_duration(data.get('extensions', {}).get('duration'))
} }