From d44a707fdde6c0138e9e275ed5b4ffb0b8f72966 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Apr 2020 20:34:57 +0700 Subject: [PATCH 001/340] [spankwire] Fix extraction (closes #18924, closes #20648) --- youtube_dl/extractor/spankwire.py | 201 +++++++++++++++++++----------- 1 file changed, 125 insertions(+), 76 deletions(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 44d8fa52f..8f67463ed 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -3,34 +3,47 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, -) from ..utils import ( - sanitized_Request, + float_or_none, + int_or_none, + merge_dicts, + str_or_none, str_to_int, - unified_strdate, + url_or_none, ) -from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pspankwire\.com/[^/]*/video(?P[0-9]+)/?)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?spankwire\.com/ + (?: + [^/]+/video| + EmbedPlayer\.aspx/?\?.*?\bArticleId= + ) + (?P\d+) + ''' _TESTS = [{ # download URL pattern: */P_K_.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'md5': '8bbfde12b101204b39e4b9fe7eb67095', + 'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd', 'info_dict': { 'id': '103545', 'ext': 'mp4', 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', 'description': 'Crazy Bitch X rated music video.', + 'duration': 222, 'uploader': 'oreusz', 'uploader_id': '124697', - 'upload_date': '20070507', + 'timestamp': 1178587885, + 'upload_date': '20070508', + 'average_rating': float, + 'view_count': int, + 'comment_count': int, 'age_limit': 18, - } + 'categories': list, + 'tags': list, + }, }, { # download URL pattern: */mp4__.mp4 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', @@ -45,83 +58,119 @@ class SpankwireIE(InfoExtractor): 'upload_date': '20150822', 'age_limit': 18, }, + 'params': { + 'proxy': '127.0.0.1:8118' + }, + 'skip': 'removed', + }, { + 'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - req = sanitized_Request('http://www.' + mobj.group('url')) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + video = self._download_json( + 'https://www.spankwire.com/api/video/%s.json' % video_id, video_id) - title = self._html_search_regex( - r'

([^<]+)', webpage, 'title') - description = self._html_search_regex( - r'(?s)(.+?)', - webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', - webpage, 'thumbnail', fatal=False) - - uploader = self._html_search_regex( - r'by:\s*]*>(.+?)', - webpage, 'uploader', fatal=False) - uploader_id = self._html_search_regex( - r'by:\s* on (.+?) at \d+:\d+', - webpage, 'upload date', fatal=False)) - - view_count = str_to_int(self._html_search_regex( - r'
([\d,\.]+) views
', - webpage, 'view count', fatal=False)) - comment_count = str_to_int(self._html_search_regex( - r']*>([\d,\.]+)', - webpage, 'comment count', fatal=False)) - - videos = re.findall( - r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) - heights = [int(video[0]) for video in videos] - video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos])) - if webpage.find(r'flashvars\.encrypted = "true"') != -1: - password = self._search_regex( - r'flashvars\.video_title = "([^"]+)', - webpage, 'password').replace('+', ' ') - video_urls = list(map( - lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), - video_urls)) + title = video['title'] formats = [] - for height, video_url in zip(heights, video_urls): - path = compat_urllib_parse_urlparse(video_url).path - m = re.search(r'/(?P\d+)[pP]_(?P\d+)[kK]', path) - if m: - tbr = int(m.group('tbr')) - height = int(m.group('height')) - else: - tbr = None - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height, - 'height': height, - 'tbr': tbr, + videos = video.get('videos') + if isinstance(videos, dict): + for format_id, format_url in videos.items(): + video_url = url_or_none(format_url) + if not format_url: + continue + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + m = re.search( + r'/(?P\d+)[pP]_(?P\d+)[kK]', video_url) + if m: + tbr = int(m.group('tbr')) + height = height or int(m.group('height')) + else: + tbr = None + formats.append({ + 'url': video_url, + 'format_id': '%dp' % height if height else format_id, + 'height': height, + 'tbr': tbr, + }) + m3u8_url = url_or_none(video.get('HLS')) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id')) + + view_count = str_to_int(video.get('viewed')) + + thumbnails = [] + for preference, t in enumerate(('', '2x'), start=0): + thumbnail_url = url_or_none(video.get('poster%s' % t)) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'preference': preference, }) - self._sort_formats(formats) - age_limit = self._rta_search(webpage) + def extract_names(key): + entries_list = video.get(key) + if not isinstance(entries_list, list): + return + entries = [] + for entry in entries_list: + name = str_or_none(entry.get('name')) + if name: + entries.append(name) + return entries - return { + categories = extract_names('categories') + tags = extract_names('tags') + + uploader = None + info = {} + + webpage = self._download_webpage( + 'https://www.spankwire.com/_/video%s/' % video_id, video_id, + fatal=False) + if webpage: + info = self._search_json_ld(webpage, video_id, default={}) + thumbnail_url = None + if 'thumbnail' in info: + thumbnail_url = url_or_none(info['thumbnail']) + del info['thumbnail'] + if not thumbnail_url: + thumbnail_url = self._og_search_thumbnail(webpage) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'preference': 10, + }) + uploader = self._html_search_regex( + r'(?s)by\s*]+\bclass=["\']uploaded__by[^>]*>(.+?)
', + webpage, 'uploader', fatal=False) + if not view_count: + view_count = str_to_int(self._search_regex( + r'data-views=["\']([\d,.]+)', webpage, 'view count', + fatal=False)) + + return merge_dicts({ 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': video.get('description'), + 'duration': int_or_none(video.get('duration')), + 'thumbnails': thumbnails, 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'uploader_id': str_or_none(video.get('userId')), + 'timestamp': int_or_none(video.get('time_approved_on')), + 'average_rating': float_or_none(video.get('rating')), 'view_count': view_count, - 'comment_count': comment_count, + 'comment_count': int_or_none(video.get('comments')), + 'age_limit': 18, + 'categories': categories, + 'tags': tags, 'formats': formats, - 'age_limit': age_limit, - } + }, info) From 8fae1a04eb20279a76d6b1eccdb8249718ad9942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Apr 2020 20:42:10 +0700 Subject: [PATCH 002/340] [spankwire] Add support for generic embeds (refs #24633) --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/spankwire.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a495ee15a..63b52306a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -60,6 +60,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE from .tube8 import Tube8IE +from .spankwire import SpankwireIE from .vimeo import VimeoIE from .dailymotion import DailymotionIE from .dailymail import DailyMailIE @@ -2715,6 +2716,11 @@ class GenericIE(InfoExtractor): if tube8_urls: return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) + # Look for embedded Spankwire player + spankwire_urls = SpankwireIE._extract_urls(webpage) + if spankwire_urls: + return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 8f67463ed..35ab9ec37 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -67,6 +67,12 @@ class SpankwireIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) From 52c4c51556df15f98c9cda911e36995fe0fc0a47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Apr 2020 20:56:14 +0700 Subject: [PATCH 003/340] [youporn] Add support form generic embeds --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/youporn.py | 23 +++++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 63b52306a..0ada6354e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -61,6 +61,7 @@ from .drtuber import DrTuberIE from .redtube import RedTubeIE from .tube8 import Tube8IE from .spankwire import SpankwireIE +from .youporn import YouPornIE from .vimeo import VimeoIE from .dailymotion import DailymotionIE from .dailymail import DailyMailIE @@ -2721,6 +2722,11 @@ class GenericIE(InfoExtractor): if spankwire_urls: return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) + # Look for embedded YouPorn player + youporn_urls = YouPornIE._extract_urls(webpage) + if youporn_urls: + return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index d4eccb4b2..e7fca22de 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, - sanitized_Request, str_to_int, unescapeHTML, unified_strdate, @@ -15,7 +14,7 @@ from ..aes import aes_decrypt_text class YouPornIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P\d+)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P\d+)(?:/(?P[^/?#&]+))?' _TESTS = [{ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'md5': '3744d24c50438cf5b6f6d59feb5055c2', @@ -57,16 +56,28 @@ class YouPornIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', + 'only_matching': True, + }, { + 'url': 'http://www.youporn.com/watch/505835', + 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)', + webpage) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id - request = sanitized_Request(url) - request.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(request, display_id) + webpage = self._download_webpage( + 'http://www.youporn.com/watch/%s' % video_id, display_id, + headers={'Cookie': 'age_verified=1'}) title = self._html_search_regex( r'(?s)]+class=["\']watchVideoTitle[^>]+>(.+?)', From 4e7b5bba5fb73502476c61e4931284c9c3d3d232 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Apr 2020 21:27:36 +0700 Subject: [PATCH 004/340] [mofosex] Add support for generic embeds (closes #24633) --- youtube_dl/extractor/extractors.py | 5 ++++- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/mofosex.py | 23 +++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ef803b8a7..e407ab3d9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -636,7 +636,10 @@ from .mixcloud import ( from .mlb import MLBIE from .mnet import MnetIE from .moevideo import MoeVideoIE -from .mofosex import MofosexIE +from .mofosex import ( + MofosexIE, + MofosexEmbedIE, +) from .mojvideo import MojvideoIE from .morningstar import MorningstarIE from .motherless import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0ada6354e..ce8252f6a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -60,6 +60,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE from .tube8 import Tube8IE +from .mofosex import MofosexEmbedIE from .spankwire import SpankwireIE from .youporn import YouPornIE from .vimeo import VimeoIE @@ -2717,6 +2718,11 @@ class GenericIE(InfoExtractor): if tube8_urls: return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) + # Look for embedded Mofosex player + mofosex_urls = MofosexEmbedIE._extract_urls(webpage) + if mofosex_urls: + return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key()) + # Look for embedded Spankwire player spankwire_urls = SpankwireIE._extract_urls(webpage) if spankwire_urls: diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index 1c652813a..5234cac02 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -1,5 +1,8 @@ from __future__ import unicode_literals +import re + +from .common import InfoExtractor from ..utils import ( int_or_none, str_to_int, @@ -54,3 +57,23 @@ class MofosexIE(KeezMoviesIE): }) return info + + +class MofosexEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P\d+)' + _TESTS = [{ + 'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'http://www.mofosex.com/videos/{0}/{0}.html'.format(video_id), + ie=MofosexIE.ie_key(), video_id=video_id) From 6a6e1a0cd8bacf5a23f731eedaa1783503470227 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 6 Apr 2020 02:05:06 +0700 Subject: [PATCH 005/340] [tele5] Fix extraction (closes #24553) --- youtube_dl/extractor/tele5.py | 61 ++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py index 33a72083b..364556a1f 100644 --- a/youtube_dl/extractor/tele5.py +++ b/youtube_dl/extractor/tele5.py @@ -1,9 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .jwplatform import JWPlatformIE from .nexx import NexxIE -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + NO_DEFAULT, + try_get, +) class Tele5IE(InfoExtractor): @@ -44,14 +54,49 @@ class Tele5IE(InfoExtractor): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - if not video_id: + NEXX_ID_RE = r'\d{6,}' + JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' + + def nexx_result(nexx_id): + return self.url_result( + 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, + ie=NexxIE.ie_key(), video_id=nexx_id) + + nexx_id = jwplatform_id = None + + if video_id: + if re.match(NEXX_ID_RE, video_id): + return nexx_result(video_id) + elif re.match(JWPLATFORM_ID_RE, video_id): + jwplatform_id = video_id + + if not nexx_id: display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)', - r'\s+id\s*=\s*["\']player_(\d{6,})', - r'\bdata-id\s*=\s*["\'](\d{6,})'), webpage, 'video id') + + def extract_id(pattern, name, default=NO_DEFAULT): + return self._html_search_regex( + (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, + r'\s+id\s*=\s*["\']player_(%s)' % pattern, + r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, + default=default) + + nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) + if nexx_id: + return nexx_result(nexx_id) + + if not jwplatform_id: + jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') + + media = self._download_json( + 'https://cdn.jwplayer.com/v2/media/' + jwplatform_id, + display_id) + nexx_id = try_get( + media, lambda x: x['playlist'][0]['nexx_id'], compat_str) + + if nexx_id: + return nexx_result(nexx_id) return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id, - ie=NexxIE.ie_key(), video_id=video_id) + 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(), + video_id=jwplatform_id) From 13b08034b53efdcf7055df92199a0f35cf1e172e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 7 Apr 2020 22:54:34 +0700 Subject: [PATCH 006/340] [extractor/common] Skip malformed ISM manifest XMLs while extracting ISM formats (#24667) --- youtube_dl/extractor/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eaae5e484..c51a3a07d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2340,6 +2340,8 @@ class InfoExtractor(object): if res is False: return [] ism_doc, urlh = res + if ism_doc is None: + return [] return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) From 91bd3bd0194119fccc91b7eafb7afdcda646ad57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 7 Apr 2020 22:55:36 +0700 Subject: [PATCH 007/340] [tv4] Fix ISM formats extraction (closes #24667) --- youtube_dl/extractor/tv4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index a819d048c..c498b0191 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -99,7 +99,7 @@ class TV4IE(InfoExtractor): manifest_url.replace('.m3u8', '.f4m'), video_id, f4m_id='hds', fatal=False)) formats.extend(self._extract_ism_formats( - re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url), + re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url), video_id, ism_id='mss', fatal=False)) if not formats and info.get('is_geo_restricted'): From c9595ee78027ecf6bedbdc33c690228fa7d3a5bb Mon Sep 17 00:00:00 2001 From: Felix Stupp Date: Tue, 7 Apr 2020 16:21:25 +0000 Subject: [PATCH 008/340] [twitch:clips] Extend _VALID_URL (closes #24290) (#24642) --- youtube_dl/extractor/twitch.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 0db2dca41..78ee0115c 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -643,7 +643,14 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P[^/?#&]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)| + (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/ + ) + (?P[^/?#&]+) + ''' _TESTS = [{ 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -669,6 +676,12 @@ class TwitchClipsIE(TwitchBaseIE): }, { 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited', 'only_matching': True, + }, { + 'url': 'https://m.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', + 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank', + 'only_matching': True, }] def _real_extract(self, url): From dcc8522fdba4c9286ebc0548caf05b425bc68773 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 9 Apr 2020 02:11:19 +0700 Subject: [PATCH 009/340] [motherless] Fix extraction (closes #24699) --- youtube_dl/extractor/motherless.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 43fd70f11..b1615b4d8 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -26,7 +26,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], 'upload_date': '20100913', 'uploader_id': 'famouslyfuckedup', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, } }, { @@ -40,7 +40,7 @@ class MotherlessIE(InfoExtractor): 'game', 'hairy'], 'upload_date': '20140622', 'uploader_id': 'Sulivana7x', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, }, 'skip': '404', @@ -54,7 +54,7 @@ class MotherlessIE(InfoExtractor): 'categories': ['superheroine heroine superher'], 'upload_date': '20140827', 'uploader_id': 'shade0230', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, } }, { @@ -76,7 +76,8 @@ class MotherlessIE(InfoExtractor): raise ExtractorError('Video %s is for friends only' % video_id, expected=True) title = self._html_search_regex( - r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') + (r'(?s)]+\bclass=["\']media-meta-title[^>]+>(.+?)', + r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title') video_url = (self._html_search_regex( (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', r'fileurl\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), @@ -84,14 +85,15 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - r'Views\s+([^<]+)<', + (r'>(\d+)\s+Views<', r'Views\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - r'Favorited\s+([^<]+)<', + (r'>(\d+)\s+Favorites<', r'Favorited\s+([^<]+)<'), webpage, 'like count', fatal=False)) upload_date = self._html_search_regex( - r'Uploaded\s+([^<]+)<', webpage, 'upload date') + (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', + r'Uploaded\s+([^<]+)<'), webpage, 'upload date') if 'Ago' in upload_date: days = int(re.search(r'([0-9]+)', upload_date).group(1)) upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') From 5caf88ccb4bfe3d1b53885b78b2bc509ba333f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 9 Apr 2020 03:52:29 +0700 Subject: [PATCH 010/340] [nova:embed] Fix extraction (closes #24700) --- youtube_dl/extractor/nova.py | 106 +++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 2850af5db..47b9748f0 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + determine_ext, int_or_none, js_to_json, qualities, @@ -33,42 +34,76 @@ class NovaEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - bitrates = self._parse_json( - self._search_regex( - r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), - video_id, transform_source=js_to_json) - - QUALITIES = ('lq', 'mq', 'hq', 'hd') - quality_key = qualities(QUALITIES) - + duration = None formats = [] - for format_id, format_list in bitrates.items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_url in format_list: - format_url = url_or_none(format_url) - if not format_url: - continue - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue - f = { - 'url': format_url, - } - f_id = format_id - for quality in QUALITIES: - if '%s.mp4' % quality in format_url: - f_id += '-%s' % quality - f.update({ - 'quality': quality_key(quality), - 'format_note': quality.upper(), + + player = self._parse_json( + self._search_regex( + r'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;', + webpage, 'player', default='{}'), video_id, fatal=False) + if player: + for format_id, format_list in player['tracks'].items(): + if not isinstance(format_list, list): + format_list = [format_list] + for format_dict in format_list: + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('src')) + format_type = format_dict.get('type') + ext = determine_ext(format_url) + if (format_type == 'application/x-mpegURL' + or format_id == 'HLS' or ext == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + elif (format_type == 'application/dash+xml' + or format_id == 'DASH' or ext == 'mpd'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, }) - break - f['format_id'] = f_id - formats.append(f) + duration = int_or_none(player.get('duration')) + else: + # Old path, not actual as of 08.04.2020 + bitrates = self._parse_json( + self._search_regex( + r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), + video_id, transform_source=js_to_json) + + QUALITIES = ('lq', 'mq', 'hq', 'hd') + quality_key = qualities(QUALITIES) + + for format_id, format_list in bitrates.items(): + if not isinstance(format_list, list): + format_list = [format_list] + for format_url in format_list: + format_url = url_or_none(format_url) + if not format_url: + continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + continue + f = { + 'url': format_url, + } + f_id = format_id + for quality in QUALITIES: + if '%s.mp4' % quality in format_url: + f_id += '-%s' % quality + f.update({ + 'quality': quality_key(quality), + 'format_note': quality.upper(), + }) + break + f['format_id'] = f_id + formats.append(f) + self._sort_formats(formats) title = self._og_search_title( @@ -81,7 +116,8 @@ class NovaEmbedIE(InfoExtractor): r'poster\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'thumbnail', fatal=False, group='value') duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', + default=duration)) return { 'id': video_id, From 6b09401b0ba95da5669d249c8930b3adb873d96e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 9 Apr 2020 22:42:43 +0700 Subject: [PATCH 011/340] [youtube] Skip broken multifeed videos (closes #24711) --- youtube_dl/extractor/youtube.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 908defecd..633b839e0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1840,15 +1840,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + + def feed_entry(name): + return try_get(feed_data, lambda x: x[name][0], compat_str) + + feed_id = feed_entry('id') + if not feed_id: + continue + feed_title = feed_entry('title') + title = video_title + if feed_title: + title += ' (%s)' % feed_title entries.append({ '_type': 'url_transparent', 'ie_key': 'Youtube', 'url': smuggle_url( '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + 'title': title, }) - feed_ids.append(feed_data['id'][0]) + feed_ids.append(feed_id) self.to_screen( 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' % (', '.join(feed_ids), video_id)) From b9e5f872916a7d753ae237459b10622c1c2c3471 Mon Sep 17 00:00:00 2001 From: tom Date: Thu, 9 Apr 2020 21:50:45 +1000 Subject: [PATCH 012/340] [soundcloud] Extract AAC format --- youtube_dl/extractor/soundcloud.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index ff6be0b54..02d56184d 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -350,6 +350,8 @@ class SoundcloudIE(InfoExtractor): format_id_list = [] if protocol: format_id_list.append(protocol) + if f.get('ext') == 'aac': + f['abr'] = '256' for k in ('ext', 'abr'): v = f.get(k) if v: From 75294a5ed03f4443970478f3f4eac572239cec45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Apr 2020 17:24:21 +0700 Subject: [PATCH 013/340] [soundcloud] Improve AAC format extraction (closes #19173, closes #24708) --- youtube_dl/extractor/soundcloud.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 02d56184d..422ce1626 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -246,7 +246,12 @@ class SoundcloudIE(InfoExtractor): 'comment_count': int, 'repost_count': int, }, - } + }, + { + # with AAC HQ format available via OAuth token + 'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1', + 'only_matching': True, + }, ] _API_V2_BASE = 'https://api-v2.soundcloud.com/' @@ -350,7 +355,8 @@ class SoundcloudIE(InfoExtractor): format_id_list = [] if protocol: format_id_list.append(protocol) - if f.get('ext') == 'aac': + ext = f.get('ext') + if ext == 'aac': f['abr'] = '256' for k in ('ext', 'abr'): v = f.get(k) @@ -362,9 +368,13 @@ class SoundcloudIE(InfoExtractor): abr = f.get('abr') if abr: f['abr'] = int(abr) + if protocol == 'hls': + protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' + else: + protocol = 'http' f.update({ 'format_id': '_'.join(format_id_list), - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'protocol': protocol, 'preference': -10 if preview else None, }) formats.append(f) From 533f3e3557af85e28afd72d291cb51a769c7dd7a Mon Sep 17 00:00:00 2001 From: AndrewMBL <62922222+AndrewMBL@users.noreply.github.com> Date: Tue, 31 Mar 2020 15:25:04 +1100 Subject: [PATCH 014/340] [thisoldhouse] Fix video id extraction (closes #24548) Added support for: with of without "www." and either ".chorus.build" or ".com" It now validated correctly on older URL's ```