diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3da5f8020..06851ab34 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1154,7 +1154,10 @@ from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE from .slutload import SlutloadIE from .snotr import SnotrIE -from .sohu import SohuIE +from .sohu import ( + SohuIE, + SohuPlaylistIE, +) from .sonyliv import SonyLIVIE from .soundcloud import ( SoundcloudEmbedIE, diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index a62ed84f1..54094b52b 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import math import re from .common import InfoExtractor @@ -11,12 +12,13 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, - try_get, + try_get, NO_DEFAULT, HTMLAttributeParser, ) class SohuIE(InfoExtractor): - _VALID_URL = r'https?://(?Pmy\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P\d+)\.shtml.*?' + _VALID_URL = r'https?://(?:my\.)?tv\.sohu\.com/.+?/.+(?:\.html|\.shtml).*?' + _VALID_URL_OG_URL = r'https?://(?Pmy\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P\d+)\.shtml.*?' # Sohu videos give different MD5 sums on Travis CI and my machine _TESTS = [{ @@ -82,6 +84,58 @@ class SohuIE(InfoExtractor): 'params': { 'skip_download': True } + }, { + 'url': 'https://tv.sohu.com/v/dXMvMjMyNzk5ODg5Lzc4NjkzNDY0LnNodG1s.html', + 'info_dict': { + 'id': '78693464', + 'ext': 'mp4', + 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + } + }, { + 'note': 'Video in issue #18542: https://github.com/ytdl-org/youtube-dl/issues/18542', + 'url': 'https://tv.sohu.com/v/MjAxNzA3MTMvbjYwMDA1MzM4MS5zaHRtbA==.html', + 'info_dict': { + 'id': '600053381', + 'ext': 'mp4', + 'title': '试听:侯旭《孤鸟》', + }, + }, { + 'note': 'Video in issue #28944: https://github.com/ytdl-org/youtube-dl/issues/28944', + 'url': 'https://tv.sohu.com/v/dXMvNTAyMjA5MTMvNjg1NjIyNTYuc2h0bWw=.html?src=pl', + 'info_dict': { + 'id': '68562256', + 'ext': 'mp4', + 'title': "Cryin' [HD 1080p] Chris.Botti(feat. Steven Tyler", + }, + }, { + 'note': 'Multipart video with new address format', + 'url': 'https://tv.sohu.com/v/dXMvMjQyNTYyMTYzLzc4OTEwMzM5LnNodG1s.html?src=pl', + 'info_dict': { + 'id': '78910339', + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + }, + 'playlist': [{ + 'info_dict': { + 'id': '78910339_part1', + 'ext': 'mp4', + 'duration': 294, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'info_dict': { + 'id': '78910339_part2', + 'ext': 'mp4', + 'duration': 300, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }, { + 'info_dict': { + 'id': '78910339_part3', + 'ext': 'mp4', + 'duration': 150, + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + } + }] }] def _real_extract(self, url): @@ -97,7 +151,12 @@ class SohuIE(InfoExtractor): 'Downloading JSON data for %s' % vid_id, headers=self.geo_verification_headers()) - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL_OG_URL, url) + if mobj is None: + webpage = self._download_webpage(url, '') + url = self._og_search_url(webpage) + mobj = re.match(self._VALID_URL_OG_URL, url) + video_id = mobj.group('id') mytv = mobj.group('mytv') is not None @@ -200,3 +259,65 @@ class SohuIE(InfoExtractor): } return info + + +class SohuPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:my\.)?tv\.sohu\.com/pl/(?P\d+)$' + _URL_IN_PLAYLIST = re.compile(r'.*?') + parser = HTMLAttributeParser() + _TESTS = [{ + 'url': 'https://my.tv.sohu.com/pl/9637148', + 'info_dict': { + 'title': '【语文大师】初中必背、常考经典古诗词', + 'id': '9637148', + }, + 'playlist_count': 70, + }, { + 'url': 'https://my.tv.sohu.com/pl/9700995', + 'info_dict': { + 'title': '牛人游戏', + 'id': '9700995', + }, + 'playlist_count': 198, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('pl_id') + + webpage = self._download_webpage(url, playlist_id) + title = self._get_playlist_title(webpage) + all_pages = self._get_all_pages_in_playlist(webpage, url) + video_list = self._get_video_list(all_pages, playlist_id) + playlist = self.playlist_result(self._entries(video_list), playlist_id, title) + + return playlist + + def _get_playlist_title(self, webpage): + title = self._search_regex(r'(.*?)', webpage, 'title') + return re.sub(r'(?:^栏目:| -搜狐视频$)', '', title) + + def _entries(self, video_list): + entries = [] + for mobj in re.finditer(self._URL_IN_PLAYLIST, video_list): + self.parser.feed(mobj.group(0)) + url = self.parser.attrs['href'] + title = self.parser.attrs['title'] + entry = self.url_result(url, SohuIE.ie_key(), '', title) + entries.append(entry) + return entries + + def _get_all_pages_in_playlist(self, first_page, url): + pgcount = int(self._search_regex(r'var pgcount = \'(\d+)\'', first_page, 'pgcount')) + pgsize = int(self._search_regex(r'var pgsize = \'(\d+)\'', first_page, 'pgsize')) + return [url + '/index%d.shtml' % (i + 1) for i in range(0, math.ceil(pgcount / pgsize))] + + def _get_video_list(self, all_pages, playlist_id): + video_list = '' + for i, url in enumerate(all_pages): + webpage = self._download_webpage(url, "playlist " + playlist_id + " page: %d" % (1 + i)) + video_list += self._search_regex( + r'
    (.*?)
', + webpage, 'video list', NO_DEFAULT, True, re.DOTALL) + return video_list