Compare commits

...

12 Commits

Author SHA1 Message Date
marieell
62f5c74713
Merge 48d0994b5b3ef6714dba9510656808883e40df40 into 2b4fbfce25902d557b86b003cf48f738129efce4 2025-03-26 07:33:56 +00:00
dirkf
2b4fbfce25 [YouTube] Support player 4fcd6e4a
thx seproDev, bashonly: yt-dlp/yt-dlp#12748
2025-03-26 02:27:25 +00:00
dirkf
1bc45b8b6c [JSInterp] Use , for join() with null/undefined argument
Eg: [1,2,3].join(null) -> '1,2,3'
2025-03-25 22:35:06 +00:00
dirkf
b982d77d0b [YouTube] Align signature tests with yt-dlp
thx bashonly, yt-dlp/yt-dlp#12725
2025-03-25 22:35:06 +00:00
dirkf
c55dbf4838 [YouTube] Update signature extraction for players 643afba4, 363db69b 2025-03-25 22:35:06 +00:00
dirkf
087d865230 [YouTube] Support new player URL patterns 2025-03-25 22:35:06 +00:00
dirkf
a4fc1151f1 [JSInterp] Improve indexing
* catch invalid list index with `ValueError` (eg [1, 2]['ab'] -> undefined)
* allow assignment outside existing list (eg var l = [1,2]; l[9] = 0;)
2025-03-25 22:35:05 +00:00
dirkf
a464c159e6 [YouTube] Make _extract_player_info() use _search_regex() 2025-03-25 22:35:05 +00:00
dirkf
7dca08eff0 [YouTube] Also get original of translated automatic captions 2025-03-25 22:35:05 +00:00
dirkf
2239ee7965 [YouTube] Get subtitles/automatic captions from both web and API responses 2025-03-25 22:35:05 +00:00
marieell
48d0994b5b [tube8] Fix extractor a bit, replace broken test 2022-02-15 12:22:58 +01:00
marieell
7561727db6 Remove extractors for dead pages
They all redirect to youporn.com.
2022-02-15 09:41:32 +01:00
9 changed files with 211 additions and 356 deletions

View File

@ -84,6 +84,21 @@ _SIG_TESTS = [
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q', '0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q',
), ),
(
'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'AAOAOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7vgpDL0QwbdV06sCIEzpWqMGkFR20CFOS21Tp-7vj_EMu-m37KtXJoOy1',
),
(
'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
),
(
'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js',
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0',
),
] ]
_NSIG_TESTS = [ _NSIG_TESTS = [
@ -153,7 +168,7 @@ _NSIG_TESTS = [
), ),
( (
'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js',
'-Txvy6bT5R6LqgnQNx', 'dcklJCnRUHbgSg', 'M92UUMHa8PdvPd3wyM', '3hPqLJsiNZx7yA',
), ),
( (
'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js',
@ -173,7 +188,7 @@ _NSIG_TESTS = [
), ),
( (
'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js',
'qO0NiMtYQ7TeJnfFG2', 'k9cuJDHNS5O7kQ', 'aCi3iElgd2kq0bxVbQ', 'QX1y8jGb2IbZ0w',
), ),
( (
'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js',
@ -231,10 +246,6 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/f6e09c70/player_ias_tce.vflset/en_US/base.js', 'https://www.youtube.com/s/player/f6e09c70/player_ias_tce.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'jHbbkcaxm54', 'W9HJZKktxuYoDTqW', 'jHbbkcaxm54',
), ),
(
'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'larxUlagTRAcSw',
),
( (
'https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', 'https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js',
'Sy4aDGc0VpYRR9ew_', '5UPOT1VhoZxNLQ', 'Sy4aDGc0VpYRR9ew_', '5UPOT1VhoZxNLQ',
@ -259,6 +270,22 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', 'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js',
'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA', 'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA',
), ),
(
'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'larxUlagTRAcSw',
),
(
'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js',
'eWYu5d5YeY_4LyEDc', 'XJQqf-N7Xra3gg',
),
(
'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js',
'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A',
),
(
'https://www.youtube.com/s/player/4fcd6e4a/tv-player-ias.vflset/tv-player-ias.js',
'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A',
),
] ]
@ -271,6 +298,8 @@ class TestPlayerInfo(unittest.TestCase):
('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'),
('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'),
('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'),
('https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', 'e7567ecf'),
('https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', '643afba4'),
# obsolete # obsolete
('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'),
('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'),
@ -280,8 +309,9 @@ class TestPlayerInfo(unittest.TestCase):
('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'),
('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'),
) )
ie = YoutubeIE(FakeYDL({'cachedir': False}))
for player_url, expected_player_id in PLAYER_URLS: for player_url, expected_player_id in PLAYER_URLS:
player_id = YoutubeIE._extract_player_info(player_url) player_id = ie._extract_player_info(player_url)
self.assertEqual(player_id, expected_player_id) self.assertEqual(player_id, expected_player_id)
@ -301,8 +331,8 @@ class TestSignature(unittest.TestCase):
def t_factory(name, sig_func, url_pattern): def t_factory(name, sig_func, url_pattern):
def make_tfunc(url, sig_input, expected_sig): def make_tfunc(url, sig_input, expected_sig):
m = url_pattern.match(url) m = url_pattern.match(url)
assert m, '%r should follow URL format' % url assert m, '{0!r} should follow URL format'.format(url)
test_id = m.group('id') test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id'))
def test_func(self): def test_func(self):
basename = 'player-{0}-{1}.js'.format(name, test_id) basename = 'player-{0}-{1}.js'.format(name, test_id)
@ -335,12 +365,16 @@ def n_sig(jscode, sig_input):
make_sig_test = t_factory( make_sig_test = t_factory(
'signature', signature, re.compile(r'.*(?:-|/player/)(?P<id>[a-zA-Z0-9_-]+)(?:/.+\.js|(?:/watch_as3|/html5player)?\.[a-z]+)$')) 'signature', signature,
re.compile(r'''(?x)
.+/(?P<h5>html5)?player(?(h5)(?:-en_US)?-|/)(?P<id>[a-zA-Z0-9/._-]+)
(?(h5)/(?:watch_as3|html5player))?\.js$
'''))
for test_spec in _SIG_TESTS: for test_spec in _SIG_TESTS:
make_sig_test(*test_spec) make_sig_test(*test_spec)
make_nsig_test = t_factory( make_nsig_test = t_factory(
'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$')) 'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_/.-]+)\.js$'))
for test_spec in _NSIG_TESTS: for test_spec in _NSIG_TESTS:
make_nsig_test(*test_spec) make_nsig_test(*test_spec)

View File

@ -371,7 +371,6 @@ from .esri import EsriVideoIE
from .europa import EuropaIE from .europa import EuropaIE
from .expotv import ExpoTVIE from .expotv import ExpoTVIE
from .expressen import ExpressenIE from .expressen import ExpressenIE
from .extremetube import ExtremeTubeIE
from .eyedotv import EyedoTVIE from .eyedotv import EyedoTVIE
from .facebook import ( from .facebook import (
FacebookIE, FacebookIE,
@ -562,7 +561,6 @@ from .kaltura import KalturaIE
from .kankan import KankanIE from .kankan import KankanIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE
from .ketnet import KetnetIE from .ketnet import KetnetIE
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
@ -713,10 +711,6 @@ from .mlb import (
) )
from .mnet import MnetIE from .mnet import MnetIE
from .moevideo import MoeVideoIE from .moevideo import MoeVideoIE
from .mofosex import (
MofosexIE,
MofosexEmbedIE,
)
from .mojvideo import MojvideoIE from .mojvideo import MojvideoIE
from .morningstar import MorningstarIE from .morningstar import MorningstarIE
from .motherless import ( from .motherless import (

View File

@ -1,50 +0,0 @@
from __future__ import unicode_literals
from ..utils import str_to_int
from .keezmovies import KeezMoviesIE
class ExtremeTubeIE(KeezMoviesIE):
_VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)'
_TESTS = [{
'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
'md5': '92feaafa4b58e82f261e5419f39c60cb',
'info_dict': {
'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431',
'ext': 'mp4',
'title': 'Music Video 14 british euro brit european cumshots swallow',
'uploader': 'anonim',
'view_count': int,
'age_limit': 18,
}
}, {
'url': 'http://www.extremetube.com/gay/video/abcde-1234',
'only_matching': True,
}, {
'url': 'http://www.extremetube.com/video/latina-slut-fucked-by-fat-black-dick',
'only_matching': True,
}, {
'url': 'http://www.extremetube.com/video/652431',
'only_matching': True,
}]
def _real_extract(self, url):
webpage, info = self._extract_info(url)
if not info['title']:
info['title'] = self._search_regex(
r'<h1[^>]+title="([^"]+)"[^>]*>', webpage, 'title')
uploader = self._html_search_regex(
r'Uploaded by:\s*</[^>]+>\s*<a[^>]+>(.+?)</a>',
webpage, 'uploader', fatal=False)
view_count = str_to_int(self._search_regex(
r'Views:\s*</[^>]+>\s*<[^>]+>([\d,\.]+)</',
webpage, 'view count', fatal=False))
info.update({
'uploader': uploader,
'view_count': view_count,
})
return info

View File

@ -66,7 +66,6 @@ from .tnaflix import TNAFlixNetworkEmbedIE
from .drtuber import DrTuberIE from .drtuber import DrTuberIE
from .redtube import RedTubeIE from .redtube import RedTubeIE
from .tube8 import Tube8IE from .tube8 import Tube8IE
from .mofosex import MofosexEmbedIE
from .spankwire import SpankwireIE from .spankwire import SpankwireIE
from .youporn import YouPornIE from .youporn import YouPornIE
from .vimeo import ( from .vimeo import (
@ -3044,11 +3043,6 @@ class GenericIE(InfoExtractor):
if tube8_urls: if tube8_urls:
return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key())
# Look for embedded Mofosex player
mofosex_urls = MofosexEmbedIE._extract_urls(webpage)
if mofosex_urls:
return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key())
# Look for embedded Spankwire player # Look for embedded Spankwire player
spankwire_urls = SpankwireIE._extract_urls(webpage) spankwire_urls = SpankwireIE._extract_urls(webpage)
if spankwire_urls: if spankwire_urls:

View File

@ -1,133 +0,0 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..aes import aes_decrypt_text
from ..compat import compat_urllib_parse_unquote
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
str_to_int,
strip_or_none,
url_or_none,
)
class KeezMoviesIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.keezmovies.com/video/arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money-18070681',
'md5': '2ac69cdb882055f71d82db4311732a1a',
'info_dict': {
'id': '18070681',
'display_id': 'arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money',
'ext': 'mp4',
'title': 'Arab wife want it so bad I see she thirsty and has tiny money.',
'thumbnail': None,
'view_count': int,
'age_limit': 18,
}
}, {
'url': 'http://www.keezmovies.com/video/18070681',
'only_matching': True,
}]
def _extract_info(self, url, fatal=True):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = (mobj.group('display_id')
if 'display_id' in mobj.groupdict()
else None) or mobj.group('id')
webpage = self._download_webpage(
url, display_id, headers={'Cookie': 'age_verified=1'})
formats = []
format_urls = set()
title = None
thumbnail = None
duration = None
encrypted = False
def extract_format(format_url, height=None):
format_url = url_or_none(format_url)
if not format_url or not format_url.startswith(('http', '//')):
return
if format_url in format_urls:
return
format_urls.add(format_url)
tbr = int_or_none(self._search_regex(
r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None))
if not height:
height = int_or_none(self._search_regex(
r'[/_](\d+)[pP][/_]', format_url, 'height', default=None))
if encrypted:
format_url = aes_decrypt_text(
video_url, title, 32).decode('utf-8')
formats.append({
'url': format_url,
'format_id': '%dp' % height if height else None,
'height': height,
'tbr': tbr,
})
flashvars = self._parse_json(
self._search_regex(
r'flashvars\s*=\s*({.+?});', webpage,
'flashvars', default='{}'),
display_id, fatal=False)
if flashvars:
title = flashvars.get('video_title')
thumbnail = flashvars.get('image_url')
duration = int_or_none(flashvars.get('video_duration'))
encrypted = flashvars.get('encrypted') is True
for key, value in flashvars.items():
mobj = re.search(r'quality_(\d+)[pP]', key)
if mobj:
extract_format(value, int(mobj.group(1)))
video_url = flashvars.get('video_url')
if video_url and determine_ext(video_url, None):
extract_format(video_url)
video_url = self._html_search_regex(
r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1',
webpage, 'video url', default=None, group='url')
if video_url:
extract_format(compat_urllib_parse_unquote(video_url))
if not formats:
if 'title="This video is no longer available"' in webpage:
raise ExtractorError(
'Video %s is no longer available' % video_id, expected=True)
try:
self._sort_formats(formats)
except ExtractorError:
if fatal:
raise
if not title:
title = self._html_search_regex(
r'<h1[^>]*>([^<]+)', webpage, 'title')
return webpage, {
'id': video_id,
'display_id': display_id,
'title': strip_or_none(title),
'thumbnail': thumbnail,
'duration': duration,
'age_limit': 18,
'formats': formats,
}
def _real_extract(self, url):
webpage, info = self._extract_info(url, fatal=False)
if not info['formats']:
return self.url_result(url, 'Generic')
info['view_count'] = str_to_int(self._search_regex(
r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False))
return info

View File

@ -1,79 +0,0 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
str_to_int,
unified_strdate,
)
from .keezmovies import KeezMoviesIE
class MofosexIE(KeezMoviesIE):
_VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P<id>\d+)/(?P<display_id>[^/?#&.]+)\.html'
_TESTS = [{
'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html',
'md5': '558fcdafbb63a87c019218d6e49daf8a',
'info_dict': {
'id': '318131',
'display_id': 'amateur-teen-playing-and-masturbating-318131',
'ext': 'mp4',
'title': 'amateur teen playing and masturbating',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20121114',
'view_count': int,
'like_count': int,
'dislike_count': int,
'age_limit': 18,
}
}, {
# This video is no longer available
'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html',
'only_matching': True,
}]
def _real_extract(self, url):
webpage, info = self._extract_info(url)
view_count = str_to_int(self._search_regex(
r'VIEWS:</span>\s*([\d,.]+)', webpage, 'view count', fatal=False))
like_count = int_or_none(self._search_regex(
r'id=["\']amountLikes["\'][^>]*>(\d+)', webpage,
'like count', fatal=False))
dislike_count = int_or_none(self._search_regex(
r'id=["\']amountDislikes["\'][^>]*>(\d+)', webpage,
'like count', fatal=False))
upload_date = unified_strdate(self._html_search_regex(
r'Added:</span>([^<]+)', webpage, 'upload date', fatal=False))
info.update({
'view_count': view_count,
'like_count': like_count,
'dislike_count': dislike_count,
'upload_date': upload_date,
'thumbnail': self._og_search_thumbnail(webpage),
})
return info
class MofosexEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)',
webpage)
def _real_extract(self, url):
video_id = self._match_id(url)
return self.url_result(
'http://www.mofosex.com/videos/{0}/{0}.html'.format(video_id),
ie=MofosexIE.ie_key(), video_id=video_id)

View File

@ -3,32 +3,33 @@ from __future__ import unicode_literals
import re import re
from ..utils import ( from ..utils import (
determine_ext,
ExtractorError,
int_or_none, int_or_none,
str_to_int, str_to_int,
url_or_none,
) )
from .keezmovies import KeezMoviesIE from .common import InfoExtractor
from ..aes import aes_decrypt_text
from ..compat import compat_urllib_parse_unquote
class Tube8IE(KeezMoviesIE): class Tube8IE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', 'url': 'https://www.tube8.com/erotic/playtime/81807731/',
'md5': '65e20c48e6abff62ed0c3965fff13a39', 'md5': 'fefa69ff76debaa63aa59374bfc51c95',
'info_dict': { 'info_dict': {
'id': '229795', 'id': '81807731',
'display_id': 'kasia-music-video', 'display_id': 'playtime',
'ext': 'mp4', 'ext': 'mp4',
'description': 'hot teen Kasia grinding', 'uploader': 'kikkijay-ph',
'uploader': 'unknown', 'title': 'Playtime',
'title': 'Kasia music video',
'age_limit': 18, 'age_limit': 18,
'duration': 230, 'duration': 988,
'categories': ['Teen'], 'categories': ['Erotic'],
'tags': ['dancing'], 'tags': ['adult toys', 'big boobs', 'butt', 'masturbate'],
}, },
}, {
'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/',
'only_matching': True,
}] }]
@staticmethod @staticmethod
@ -38,49 +39,121 @@ class Tube8IE(KeezMoviesIE):
webpage) webpage)
def _real_extract(self, url): def _real_extract(self, url):
webpage, info = self._extract_info(url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = mobj.group('display_id')
if not info['title']: webpage = self._download_webpage(
info['title'] = self._html_search_regex( url, display_id, headers={'Cookie': 'age_verified=1'})
r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
description = self._html_search_regex( formats = []
r'(?s)Description:</dt>\s*<dd>(.+?)</dd>', webpage, 'description', fatal=False) format_urls = set()
uploader = self._html_search_regex(
r'<span class="username">\s*(.+?)\s*<', title = None
webpage, 'uploader', fatal=False) thumbnail = None
duration = None
encrypted = False
def extract_format(format_url, height=None):
format_url = url_or_none(format_url)
if not format_url or not format_url.startswith(('http', '//')):
return
if format_url in format_urls:
return
format_urls.add(format_url)
tbr = int_or_none(self._search_regex(
r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None))
if not height:
height = int_or_none(self._search_regex(
r'[/_](\d+)[pP][/_]', format_url, 'height', default=None))
if encrypted:
format_url = aes_decrypt_text(
video_url, title, 32).decode('utf-8')
formats.append({
'url': format_url,
'format_id': '%dp' % height if height else None,
'height': height,
'tbr': tbr,
})
flashvars = self._parse_json(
self._search_regex(
r'flashvars\s*=\s*({.+?});', webpage,
'flashvars', default='{}'),
display_id, fatal=False)
if flashvars:
title = flashvars.get('video_title')
thumbnail = flashvars.get('image_url')
duration = int_or_none(flashvars.get('video_duration'))
encrypted = flashvars.get('encrypted') is True
uploader = flashvars.get('sponsor')
for key, value in flashvars.items():
mobj = re.search(r'quality_(\d+)[pP]', key)
if mobj:
extract_format(value, int(mobj.group(1)))
video_url = flashvars.get('video_url')
for key, value in flashvars.items():
mobj = re.search(r'quality_(\d+)[pP]', key)
if mobj:
extract_format(value, int(mobj.group(1)))
video_url = flashvars.get('video_url')
if video_url and determine_ext(video_url, None):
extract_format(video_url)
video_url = self._html_search_regex(
r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1',
webpage, 'video url', default=None, group='url')
if video_url:
extract_format(compat_urllib_parse_unquote(video_url))
if not formats:
if 'title="This video is no longer available"' in webpage:
raise ExtractorError(
'Video %s is no longer available' % video_id, expected=True)
self._sort_formats(formats)
if not title:
title = self._html_search_regex([
r'<h1[^>]*>([^<]+)'
r'videoTitle\s*=\s*"([^"]+)'], webpage, 'title', default=display_id.capitalize())
like_count = int_or_none(self._search_regex( like_count = int_or_none(self._search_regex(
r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False)) r'rupVar\s*=\s*(\d+);', webpage, 'like count', fatal=False))
dislike_count = int_or_none(self._search_regex( dislike_count = int_or_none(self._search_regex(
r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False)) r'rdownVar\s*=\s*(\d+);', webpage, 'dislike count', fatal=False))
view_count = str_to_int(self._search_regex( view_count = str_to_int(self._search_regex(
r'Views:\s*</dt>\s*<dd>([\d,\.]+)', r'Views:\s*</dt>\s*<dd>([\d,\.]+)',
webpage, 'view count', fatal=False)) webpage, 'view count', fatal=False))
comment_count = str_to_int(self._search_regex( comment_count = str_to_int(self._search_regex(
r'<span id="allCommentsCount">(\d+)</span>', r'<span id="allCommentsCount">\((\d+)\)</span>',
webpage, 'comment count', fatal=False)) webpage, 'comment count', fatal=False))
category = self._search_regex( category = self._search_regex(
r'Category:\s*</dt>\s*<dd>\s*<a[^>]+href=[^>]+>([^<]+)', r'videoCategoryByName\s*=\s*"([^"]+)";',
webpage, 'category', fatal=False) webpage, 'category', fatal=False)
categories = [category] if category else None categories = [category] if category else None
tags_str = self._search_regex( tags_str = self._search_regex(
r'(?s)Tags:\s*</dt>\s*<dd>(.+?)</(?!a)', r"(?s)<li\s+class\s*=\s*'video-tag'\s+data-esp-node\s*=\s*'tag'>(.*?)</div>",
webpage, 'tags', fatal=False) webpage, 'tags', fatal=False)
tags = [t for t in re.findall( tags = [t for t in re.findall(
r'<a[^>]+href=[^>]+>([^<]+)', tags_str)] if tags_str else None r'<a[^>]+href=[^>]+>([^<]+)', tags_str)] if tags_str else None
info.update({ return {
'description': description,
'uploader': uploader, 'uploader': uploader,
'id': video_id,
'display_id': display_id,
'title': title,
'thumbnail': thumbnail,
'duration': duration,
'age_limit': 18,
'formats': formats,
'view_count': view_count, 'view_count': view_count,
'like_count': like_count, 'like_count': like_count,
'dislike_count': dislike_count, 'dislike_count': dislike_count,
'comment_count': comment_count, 'comment_count': comment_count,
'categories': categories, 'categories': categories,
'tags': tags, 'tags': tags,
}) }
return info

View File

@ -692,9 +692,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'invidious': '|'.join(_INVIDIOUS_SITES), 'invidious': '|'.join(_INVIDIOUS_SITES),
} }
_PLAYER_INFO_RE = ( _PLAYER_INFO_RE = (
r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})//(?:tv-)?player', r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/(?:tv-)?player',
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias(?:_tce)?\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]{6,})\b.*?\.js$',
) )
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
@ -1626,15 +1626,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
""" Return a string representation of a signature """ """ Return a string representation of a signature """
return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
@classmethod def _extract_player_info(self, player_url):
def _extract_player_info(cls, player_url): try:
for player_re in cls._PLAYER_INFO_RE: return self._search_regex(
id_m = re.search(player_re, player_url) self._PLAYER_INFO_RE, player_url, 'player info', group='id')
if id_m: except ExtractorError as e:
break raise ExtractorError(
else: 'Cannot identify player %r' % (player_url,), cause=e)
raise ExtractorError('Cannot identify player %r' % player_url)
return id_m.group('id')
def _load_player(self, video_id, player_url, fatal=True, player_id=None): def _load_player(self, video_id, player_url, fatal=True, player_id=None):
if not player_id: if not player_id:
@ -1711,6 +1709,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
' return %s\n') % (signature_id_tuple, expr_code) ' return %s\n') % (signature_id_tuple, expr_code)
self.to_screen('Extracted signature function:\n' + code) self.to_screen('Extracted signature function:\n' + code)
def _extract_sig_fn(self, jsi, funcname):
var_ay = self._search_regex(
r'''(?x)
(?:\*/|\{|\n|^)\s*(?:'[^']+'\s*;\s*)
(var\s*[\w$]+\s*=\s*(?:
('|")(?:\\\2|(?!\2).)+\2\s*\.\s*split\(\s*('|")\W+\3\s*\)|
\[\s*(?:('|")(?:\\\4|(?!\4).)*\4\s*(?:(?=\])|,\s*))+\]
))(?=\s*[,;])
''', jsi.code, 'useful values', default='')
sig_fn = jsi.extract_function_code(funcname)
if var_ay:
sig_fn = (sig_fn[0], ';\n'.join((var_ay, sig_fn[1])))
return sig_fn
def _parse_sig_js(self, jscode): def _parse_sig_js(self, jscode):
# Examples where `sig` is funcname: # Examples where `sig` is funcname:
# sig=function(a){a=a.split(""); ... ;return a.join("")}; # sig=function(a){a=a.split(""); ... ;return a.join("")};
@ -1736,8 +1751,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
jscode, 'Initial JS player signature function name', group='sig') jscode, 'Initial JS player signature function name', group='sig')
jsi = JSInterpreter(jscode) jsi = JSInterpreter(jscode)
initial_function = jsi.extract_function(funcname)
return lambda s: initial_function([s]) initial_function = self._extract_sig_fn(jsi, funcname)
func = jsi.extract_function_from_code(*initial_function)
return lambda s: func([s])
def _cached(self, func, *cache_id): def _cached(self, func, *cache_id):
def inner(*args, **kwargs): def inner(*args, **kwargs):
@ -1856,15 +1875,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None): def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None):
var_ay = self._search_regex(
r'(?:[;\s]|^)\s*(var\s*[\w$]+\s*=\s*"(?:\\"|[^"])+"\s*\.\s*split\("\W+"\))(?=\s*[,;])',
jsi.code, 'useful values', default='')
func_name = self._extract_n_function_name(jsi.code) func_name = self._extract_n_function_name(jsi.code)
func_code = jsi.extract_function_code(func_name) func_code = self._extract_sig_fn(jsi, func_name)
if var_ay:
func_code = (func_code[0], ';\n'.join((var_ay, func_code[1])))
if player_id: if player_id:
self.cache.store('youtube-nsig', player_id, func_code) self.cache.store('youtube-nsig', player_id, func_code)
@ -2136,7 +2149,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_details = merge_dicts(*traverse_obj( video_details = merge_dicts(*traverse_obj(
(player_response, api_player_response), (player_response, api_player_response),
(Ellipsis, 'videoDetails', T(dict)))) (Ellipsis, 'videoDetails', T(dict))))
player_response.update(api_player_response or {}) player_response.update(filter_dict(
api_player_response or {}, cndn=lambda k, _: k != 'captions'))
player_response['videoDetails'] = video_details player_response['videoDetails'] = video_details
def is_agegated(playability): def is_agegated(playability):
@ -2566,8 +2580,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
} }
pctr = traverse_obj( pctr = traverse_obj(
player_response, (player_response, api_player_response),
('captions', 'playerCaptionsTracklistRenderer', T(dict))) (Ellipsis, 'captions', 'playerCaptionsTracklistRenderer', T(dict)))
if pctr: if pctr:
def process_language(container, base_url, lang_code, query): def process_language(container, base_url, lang_code, query):
lang_subs = [] lang_subs = []
@ -2584,20 +2598,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def process_subtitles(): def process_subtitles():
subtitles = {} subtitles = {}
for caption_track in traverse_obj(pctr, ( for caption_track in traverse_obj(pctr, (
'captionTracks', lambda _, v: v.get('baseUrl'))): Ellipsis, 'captionTracks', lambda _, v: (
v.get('baseUrl') and v.get('languageCode')))):
base_url = self._yt_urljoin(caption_track['baseUrl']) base_url = self._yt_urljoin(caption_track['baseUrl'])
if not base_url: if not base_url:
continue continue
lang_code = caption_track['languageCode']
if caption_track.get('kind') != 'asr': if caption_track.get('kind') != 'asr':
lang_code = caption_track.get('languageCode')
if not lang_code:
continue
process_language( process_language(
subtitles, base_url, lang_code, {}) subtitles, base_url, lang_code, {})
continue continue
automatic_captions = {} automatic_captions = {}
process_language(
automatic_captions, base_url, lang_code, {})
for translation_language in traverse_obj(pctr, ( for translation_language in traverse_obj(pctr, (
'translationLanguages', lambda _, v: v.get('languageCode'))): Ellipsis, 'translationLanguages', lambda _, v: v.get('languageCode'))):
translation_language_code = translation_language['languageCode'] translation_language_code = translation_language['languageCode']
process_language( process_language(
automatic_captions, base_url, translation_language_code, automatic_captions, base_url, translation_language_code,

View File

@ -678,7 +678,7 @@ class JSInterpreter(object):
return len(obj) return len(obj)
try: try:
return obj[int(idx)] if isinstance(obj, list) else obj[compat_str(idx)] return obj[int(idx)] if isinstance(obj, list) else obj[compat_str(idx)]
except (TypeError, KeyError, IndexError) as e: except (TypeError, KeyError, IndexError, ValueError) as e:
# allow_undefined is None gives correct behaviour # allow_undefined is None gives correct behaviour
if allow_undefined or ( if allow_undefined or (
allow_undefined is None and not isinstance(e, TypeError)): allow_undefined is None and not isinstance(e, TypeError)):
@ -1038,6 +1038,10 @@ class JSInterpreter(object):
left_val = self._index(left_val, idx) left_val = self._index(left_val, idx)
if isinstance(idx, float): if isinstance(idx, float):
idx = int(idx) idx = int(idx)
if isinstance(left_val, list) and len(left_val) <= int_or_none(idx, default=-1):
# JS Array is a sparsely assignable list
# TODO: handle extreme sparsity without memory bloat, eg using auxiliary dict
left_val.extend((idx - len(left_val) + 1) * [JS_Undefined])
left_val[idx] = self._operator( left_val[idx] = self._operator(
m.group('op'), self._index(left_val, idx) if m.group('op') else None, m.group('op'), self._index(left_val, idx) if m.group('op') else None,
m.group('expr'), expr, local_vars, allow_recursion) m.group('expr'), expr, local_vars, allow_recursion)
@ -1204,9 +1208,10 @@ class JSInterpreter(object):
elif member == 'join': elif member == 'join':
assertion(isinstance(obj, list), 'must be applied on a list') assertion(isinstance(obj, list), 'must be applied on a list')
assertion(len(argvals) <= 1, 'takes at most one argument') assertion(len(argvals) <= 1, 'takes at most one argument')
return (',' if len(argvals) == 0 else argvals[0]).join( return (',' if len(argvals) == 0 or argvals[0] in (None, JS_Undefined)
('' if x in (None, JS_Undefined) else _js_toString(x)) else argvals[0]).join(
for x in obj) ('' if x in (None, JS_Undefined) else _js_toString(x))
for x in obj)
elif member == 'reverse': elif member == 'reverse':
assertion(not argvals, 'does not take any arguments') assertion(not argvals, 'does not take any arguments')
obj.reverse() obj.reverse()
@ -1364,19 +1369,21 @@ class JSInterpreter(object):
code, _ = self._separate_at_paren(func_m.group('code')) # refine the match code, _ = self._separate_at_paren(func_m.group('code')) # refine the match
return self.build_arglist(func_m.group('args')), code return self.build_arglist(func_m.group('args')), code
def extract_function(self, funcname): def extract_function(self, funcname, *global_stack):
return function_with_repr( return function_with_repr(
self.extract_function_from_code(*self.extract_function_code(funcname)), self.extract_function_from_code(*itertools.chain(
self.extract_function_code(funcname), global_stack)),
'F<%s>' % (funcname,)) 'F<%s>' % (funcname,))
def extract_function_from_code(self, argnames, code, *global_stack): def extract_function_from_code(self, argnames, code, *global_stack):
local_vars = {} local_vars = {}
start = None
while True: while True:
mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code) mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code[start:])
if mobj is None: if mobj is None:
break break
start, body_start = mobj.span() start, body_start = ((start or 0) + x for x in mobj.span())
body, remaining = self._separate_at_paren(code[body_start - 1:]) body, remaining = self._separate_at_paren(code[body_start - 1:])
name = self._named_object(local_vars, self.extract_function_from_code( name = self._named_object(local_vars, self.extract_function_from_code(
[x.strip() for x in mobj.group('args').split(',')], [x.strip() for x in mobj.group('args').split(',')],