Merge pull request #32950 from ytdl-org/master

Merge from master
This commit is contained in:
dirkf
2024-10-14 14:09:51 +01:00
committed by GitHub
136 changed files with 17041 additions and 3547 deletions

View File

@@ -31,30 +31,34 @@ from ..utils import (
class ADNIE(InfoExtractor):
IE_DESC = 'Anime Digital Network'
_VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
_TEST = {
'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
'md5': '0319c99885ff5547565cacb4f3f9348d',
IE_DESC = 'Animation Digital Network'
_VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
_TESTS = [{
'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir',
'md5': '1c9ef066ceb302c86f80c2b371615261',
'info_dict': {
'id': '7778',
'id': '9841',
'ext': 'mp4',
'title': 'Blue Exorcist - Kyôto Saga - Episode 1',
'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
'series': 'Blue Exorcist - Kyôto Saga',
'duration': 1467,
'release_date': '20170106',
'title': 'Fruits Basket - Episode 1',
'description': 'md5:14be2f72c3c96809b0ca424b0097d336',
'series': 'Fruits Basket',
'duration': 1437,
'release_date': '20190405',
'comment_count': int,
'average_rating': float,
'season_number': 2,
'episode': 'Début des hostilités',
'season_number': 1,
'episode': 'À ce soir !',
'episode_number': 1,
}
}
},
'skip': 'Only available in region (FR, ...)',
}, {
'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
'only_matching': True,
}]
_NETRC_MACHINE = 'animedigitalnetwork'
_BASE_URL = 'http://animedigitalnetwork.fr'
_API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/'
_NETRC_MACHINE = 'animationdigitalnetwork'
_BASE = 'animationdigitalnetwork.fr'
_API_BASE_URL = 'https://gw.api.' + _BASE + '/'
_PLAYER_BASE_URL = _API_BASE_URL + 'player/'
_HEADERS = {}
_LOGIN_ERR_MESSAGE = 'Unable to log in'
@@ -82,14 +86,14 @@ class ADNIE(InfoExtractor):
if subtitle_location:
enc_subtitles = self._download_webpage(
subtitle_location, video_id, 'Downloading subtitles data',
fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'})
fatal=False, headers={'Origin': 'https://' + self._BASE})
if not enc_subtitles:
return None
# http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
# http://animationdigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
bytes_to_intlist(compat_b64decode(enc_subtitles[24:])),
bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')),
bytes_to_intlist(binascii.unhexlify(self._K + '7fac1178830cfe0c')),
bytes_to_intlist(compat_b64decode(enc_subtitles[:24]))
))
subtitles_json = self._parse_json(
@@ -138,9 +142,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
if not username:
return
try:
url = self._API_BASE_URL + 'authentication/login'
access_token = (self._download_json(
self._API_BASE_URL + 'authentication/login', None,
'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False,
url, None, 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False,
data=urlencode_postdata({
'password': password,
'rememberMe': False,
@@ -153,7 +157,8 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
message = None
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
resp = self._parse_json(
e.cause.read().decode(), None, fatal=False) or {}
self._webpage_read_content(e.cause, url, username),
username, fatal=False) or {}
message = resp.get('message') or resp.get('code')
self.report_warning(message or self._LOGIN_ERR_MESSAGE)
@@ -211,7 +216,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
# This usually goes away with a different random pkcs1pad, so retry
continue
error = self._parse_json(e.cause.read(), video_id)
error = self._parse_json(
self._webpage_read_content(e.cause, links_url, video_id),
video_id, fatal=False) or {}
message = error.get('message')
if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country':
self.raise_geo_restricted(msg=message)

View File

@@ -8,6 +8,8 @@ from ..utils import (
ExtractorError,
GeoRestrictedError,
int_or_none,
remove_start,
traverse_obj,
update_url_query,
urlencode_postdata,
)
@@ -20,8 +22,8 @@ class AENetworksBaseIE(ThePlatformIE):
(?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com|
fyi\.tv
)/'''
_THEPLATFORM_KEY = 'crazyjava'
_THEPLATFORM_SECRET = 's3cr3t'
_THEPLATFORM_KEY = '43jXaGRQud'
_THEPLATFORM_SECRET = 'S10BPXHMlb'
_DOMAIN_MAP = {
'history.com': ('HISTORY', 'history'),
'aetv.com': ('AETV', 'aetv'),
@@ -33,14 +35,17 @@ class AENetworksBaseIE(ThePlatformIE):
}
def _extract_aen_smil(self, smil_url, video_id, auth=None):
query = {'mbr': 'true'}
query = {
'mbr': 'true',
'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3',
}
if auth:
query['auth'] = auth
TP_SMIL_QUERY = [{
'assetTypes': 'high_video_ak',
'switch': 'hls_high_ak'
'switch': 'hls_high_ak',
}, {
'assetTypes': 'high_video_s3'
'assetTypes': 'high_video_s3',
}, {
'assetTypes': 'high_video_s3',
'switch': 'hls_high_fastly',
@@ -75,7 +80,14 @@ class AENetworksBaseIE(ThePlatformIE):
requestor_id, brand = self._DOMAIN_MAP[domain]
result = self._download_json(
'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand,
filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0]
filter_value, query={'filter[%s]' % filter_key: filter_value})
result = traverse_obj(
result, ('results',
lambda k, v: k == 0 and v[filter_key] == filter_value),
get_all=False)
if not result:
raise ExtractorError('Show not found in A&E feed (too new?)', expected=True,
video_id=remove_start(filter_value, '/'))
title = result['title']
video_id = result['id']
media_url = result['publicUrl']
@@ -126,7 +138,7 @@ class AENetworksIE(AENetworksBaseIE):
'skip_download': True,
},
'add_ie': ['ThePlatform'],
'skip': 'This video is only available for users of participating TV providers.',
'skip': 'Geo-restricted - This content is not available in your location.'
}, {
'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
'info_dict': {
@@ -143,6 +155,7 @@ class AENetworksIE(AENetworksBaseIE):
'skip_download': True,
},
'add_ie': ['ThePlatform'],
'skip': 'This video is only available for users of participating TV providers.',
}, {
'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
'only_matching': True

View File

@@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor):
'id': '2800002704436634',
'ext': 'mp4',
'title': 'CASIMA7.22',
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'uploader': 'CASIMA Official Store',
'timestamp': 1500717600,
'upload_date': '20170722',

View File

@@ -0,0 +1,89 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
dict_get,
get_element_by_class,
int_or_none,
unified_strdate,
url_or_none,
)
class Alsace20TVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P<id>[\w]+)'
_TESTS = [{
'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html',
# 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb',
'info_dict': {
'id': 'lyNHCXpYJh',
'ext': 'mp4',
'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7',
'title': 'Votre JT du jeudi 3 février',
'upload_date': '20220203',
'thumbnail': r're:https?://.+\.jpg',
'duration': 1073,
'view_count': int,
},
'params': {
'format': 'bestvideo',
},
}]
def _extract_video(self, video_id, url=None):
info = self._download_json(
'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ),
video_id) or {}
title = info['titre']
formats = []
for res, fmt_url in (info.get('files') or {}).items():
formats.extend(
self._extract_smil_formats(fmt_url, video_id, fatal=False)
if '/smil:_' in fmt_url
else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False))
self._sort_formats(formats)
webpage = (url and self._download_webpage(url, video_id, fatal=False)) or ''
thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage))
upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None)
upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None
return {
'id': video_id,
'title': title,
'formats': formats,
'description': clean_html(get_element_by_class('wysiwyg', webpage)),
'upload_date': upload_date,
'thumbnail': thumbnail,
'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None),
'view_count': int_or_none(info.get('nb_vues')),
}
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video(video_id, url)
class Alsace20TVEmbedIE(Alsace20TVIE):
_VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P<id>[\w]+)'
_TESTS = [{
'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh',
# 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb',
'info_dict': {
'id': 'lyNHCXpYJh',
'ext': 'mp4',
'title': 'Votre JT du jeudi 3 février',
'upload_date': '20220203',
'thumbnail': r're:https?://.+\.jpg',
'view_count': int,
},
'params': {
'format': 'bestvideo',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video(video_id)

View File

@@ -15,7 +15,7 @@ from ..utils import (
class AmericasTestKitchenIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers',
'md5': 'b861c3e365ac38ad319cfd509c30577f',
@@ -23,15 +23,20 @@ class AmericasTestKitchenIE(InfoExtractor):
'id': '5b400b9ee338f922cb06450c',
'title': 'Japanese Suppers',
'ext': 'mp4',
'display_id': 'weeknight-japanese-suppers',
'description': 'md5:64e606bfee910627efc4b5f050de92b3',
'thumbnail': r're:^https?://',
'timestamp': 1523318400,
'upload_date': '20180410',
'release_date': '20180410',
'timestamp': 1523304000,
'upload_date': '20180409',
'release_date': '20180409',
'series': "America's Test Kitchen",
'season': 'Season 18',
'season_number': 18,
'episode': 'Japanese Suppers',
'episode_number': 15,
'duration': 1376,
'thumbnail': r're:^https?://',
'average_rating': 0,
'view_count': int,
},
'params': {
'skip_download': True,
@@ -44,15 +49,20 @@ class AmericasTestKitchenIE(InfoExtractor):
'id': '5fbe8c61bda2010001c6763b',
'title': 'Simple Chicken Dinner',
'ext': 'mp4',
'display_id': 'atktv_2103_simple-chicken-dinner_full-episode_web-mp4',
'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7',
'thumbnail': r're:^https?://',
'timestamp': 1610755200,
'upload_date': '20210116',
'release_date': '20210116',
'timestamp': 1610737200,
'upload_date': '20210115',
'release_date': '20210115',
'series': "America's Test Kitchen",
'season': 'Season 21',
'season_number': 21,
'episode': 'Simple Chicken Dinner',
'episode_number': 3,
'duration': 1397,
'thumbnail': r're:^https?://',
'view_count': int,
'average_rating': 0,
},
'params': {
'skip_download': True,
@@ -60,6 +70,12 @@ class AmericasTestKitchenIE(InfoExtractor):
}, {
'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon',
'only_matching': True,
}, {
'url': 'https://www.americastestkitchen.com/cookscountry/episode/564-when-only-chocolate-will-do',
'only_matching': True,
}, {
'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington',
'only_matching': True,
}, {
'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do',
'only_matching': True,
@@ -94,7 +110,7 @@ class AmericasTestKitchenIE(InfoExtractor):
class AmericasTestKitchenSeasonIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|(?P<cooks>cooks(?:country|illustrated)))\.com(?:(?:/(?P<show2>cooks(?:country|illustrated)))?(?:/?$|(?<!ated)(?<!ated\.com)/episodes/browse/season_(?P<season>\d+)))'
_TESTS = [{
# ATK Season
'url': 'https://www.americastestkitchen.com/episodes/browse/season_1',
@@ -105,48 +121,93 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
'playlist_count': 13,
}, {
# Cooks Country Season
'url': 'https://www.cookscountry.com/episodes/browse/season_12',
'url': 'https://www.americastestkitchen.com/cookscountry/episodes/browse/season_12',
'info_dict': {
'id': 'season_12',
'title': 'Season 12',
},
'playlist_count': 13,
}, {
# America's Test Kitchen Series
'url': 'https://www.americastestkitchen.com/',
'info_dict': {
'id': 'americastestkitchen',
'title': 'America\'s Test Kitchen',
},
'playlist_count': 558,
}, {
# Cooks Country Series
'url': 'https://www.americastestkitchen.com/cookscountry',
'info_dict': {
'id': 'cookscountry',
'title': 'Cook\'s Country',
},
'playlist_count': 199,
}, {
'url': 'https://www.americastestkitchen.com/cookscountry/',
'only_matching': True,
}, {
'url': 'https://www.cookscountry.com/episodes/browse/season_12',
'only_matching': True,
}, {
'url': 'https://www.cookscountry.com',
'only_matching': True,
}, {
'url': 'https://www.americastestkitchen.com/cooksillustrated/',
'only_matching': True,
}, {
'url': 'https://www.cooksillustrated.com',
'only_matching': True,
}]
def _real_extract(self, url):
show_name, season_number = re.match(self._VALID_URL, url).groups()
season_number = int(season_number)
match = re.match(self._VALID_URL, url).groupdict()
show = match.get('show2')
show_path = ('/' + show) if show else ''
show = show or match['show']
season_number = int_or_none(match.get('season'))
slug = 'atk' if show_name == 'americastestkitchen' else 'cco'
slug, title = {
'americastestkitchen': ('atk', 'America\'s Test Kitchen'),
'cookscountry': ('cco', 'Cook\'s Country'),
'cooksillustrated': ('cio', 'Cook\'s Illustrated'),
}[show]
season = 'Season %d' % season_number
facet_filters = [
'search_document_klass:episode',
'search_show_slug:' + slug,
]
if season_number:
playlist_id = 'season_%d' % season_number
playlist_title = 'Season %d' % season_number
facet_filters.append('search_season_list:' + playlist_title)
else:
playlist_id = show
playlist_title = title
season_search = self._download_json(
'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug,
season, headers={
'Origin': 'https://www.%s.com' % show_name,
playlist_id, headers={
'Origin': 'https://www.americastestkitchen.com',
'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805',
'X-Algolia-Application-Id': 'Y1FNZXUI30',
}, query={
'facetFilters': json.dumps([
'search_season_list:' + season,
'search_document_klass:episode',
'search_show_slug:' + slug,
]),
'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug,
'facetFilters': json.dumps(facet_filters),
'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug,
'attributesToHighlight': '',
'hitsPerPage': 1000,
})
def entries():
for episode in (season_search.get('hits') or []):
search_url = episode.get('search_url')
search_url = episode.get('search_url') # always formatted like '/episode/123-title-of-episode'
if not search_url:
continue
yield {
'_type': 'url',
'url': 'https://www.%s.com%s' % (show_name, search_url),
'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]),
'url': 'https://www.americastestkitchen.com%s%s' % (show_path, search_url),
'id': try_get(episode, lambda e: e['objectID'].rsplit('_', 1)[-1]),
'title': episode.get('title'),
'description': episode.get('description'),
'timestamp': unified_timestamp(episode.get('search_document_date')),
@@ -156,4 +217,4 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
}
return self.playlist_result(
entries(), 'season_%d' % season_number, season)
entries(), playlist_id, playlist_title)

View File

@@ -0,0 +1,59 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import ExtractorError, urlencode_postdata
class BigoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P<id>[^/]+)'
_TESTS = [{
'url': 'https://www.bigo.tv/ja/221338632',
'info_dict': {
'id': '6576287577575737440',
'title': '土よ〜💁‍♂️ 休憩室/REST room',
'thumbnail': r're:https?://.+',
'uploader': '✨Shin💫',
'uploader_id': '221338632',
'is_live': True,
},
'skip': 'livestream',
}, {
'url': 'https://www.bigo.tv/th/Tarlerm1304',
'only_matching': True,
}, {
'url': 'https://bigo.tv/115976881',
'only_matching': True,
}]
def _real_extract(self, url):
user_id = self._match_id(url)
info_raw = self._download_json(
'https://bigo.tv/studio/getInternalStudioInfo',
user_id, data=urlencode_postdata({'siteId': user_id}))
if not isinstance(info_raw, dict):
raise ExtractorError('Received invalid JSON data')
if info_raw.get('code'):
raise ExtractorError(
'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True)
info = info_raw.get('data') or {}
if not info.get('alive'):
raise ExtractorError('This user is offline.', expected=True)
return {
'id': info.get('roomId') or user_id,
'title': info.get('roomTopic') or info.get('nick_name') or user_id,
'formats': [{
'url': info.get('hls_src'),
'ext': 'mp4',
'protocol': 'm3u8',
}],
'thumbnail': info.get('snapshot'),
'uploader': info.get('nick_name'),
'uploader_id': user_id,
'is_live': True,
}

View File

@@ -369,6 +369,11 @@ class BilibiliAudioIE(BilibiliAudioBaseIE):
'filesize': int_or_none(play_data.get('size')),
}]
for a_format in formats:
a_format.setdefault('http_headers', {}).update({
'Referer': url,
})
song = self._call_api('song/info', au_id)
title = song['title']
statistic = song.get('statistic') or {}

View File

@@ -0,0 +1,173 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from ..utils import (
strip_or_none,
traverse_obj,
)
from .common import InfoExtractor
class BlerpIE(InfoExtractor):
IE_NAME = 'blerp'
_VALID_URL = r'https?://(?:www\.)?blerp\.com/soundbites/(?P<id>[0-9a-zA-Z]+)'
_TESTS = [{
'url': 'https://blerp.com/soundbites/6320fe8745636cb4dd677a5a',
'info_dict': {
'id': '6320fe8745636cb4dd677a5a',
'title': 'Samsung Galaxy S8 Over the Horizon Ringtone 2016',
'uploader': 'luminousaj',
'uploader_id': '5fb81e51aa66ae000c395478',
'ext': 'mp3',
'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'],
}
}, {
'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f',
'info_dict': {
'id': '5bc94ef4796001000498429f',
'title': 'Yee',
'uploader': '179617322678353920',
'uploader_id': '5ba99cf71386730004552c42',
'ext': 'mp3',
'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee']
}
}]
_GRAPHQL_OPERATIONNAME = "webBitePageGetBite"
_GRAPHQL_QUERY = (
'''query webBitePageGetBite($_id: MongoID!) {
web {
biteById(_id: $_id) {
...bitePageFrag
__typename
}
__typename
}
}
fragment bitePageFrag on Bite {
_id
title
userKeywords
keywords
color
visibility
isPremium
owned
price
extraReview
isAudioExists
image {
filename
original {
url
__typename
}
__typename
}
userReactions {
_id
reactions
createdAt
__typename
}
topReactions
totalSaveCount
saved
blerpLibraryType
license
licenseMetaData
playCount
totalShareCount
totalFavoriteCount
totalAddedToBoardCount
userCategory
userAudioQuality
audioCreationState
transcription
userTranscription
description
createdAt
updatedAt
author
listingType
ownerObject {
_id
username
profileImage {
filename
original {
url
__typename
}
__typename
}
__typename
}
transcription
favorited
visibility
isCurated
sourceUrl
audienceRating
strictAudienceRating
ownerId
reportObject {
reportedContentStatus
__typename
}
giphy {
mp4
gif
__typename
}
audio {
filename
original {
url
__typename
}
mp3 {
url
__typename
}
__typename
}
__typename
}
''')
def _real_extract(self, url):
audio_id = self._match_id(url)
data = {
'operationName': self._GRAPHQL_OPERATIONNAME,
'query': self._GRAPHQL_QUERY,
'variables': {
'_id': audio_id
}
}
headers = {
'Content-Type': 'application/json'
}
json_result = self._download_json('https://api.blerp.com/graphql',
audio_id, data=json.dumps(data).encode('utf-8'), headers=headers)
bite_json = json_result['data']['web']['biteById']
info_dict = {
'id': bite_json['_id'],
'url': bite_json['audio']['mp3']['url'],
'title': bite_json['title'],
'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none),
'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none),
'ext': 'mp3',
'tags': list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None)
}
return info_dict

View File

@@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals
import re
@@ -12,13 +13,28 @@ from ..utils import (
class BongaCamsIE(InfoExtractor):
_VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)'
_VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.(?:com|net))/(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://de.bongacams.com/azumi-8',
'only_matching': True,
}, {
'url': 'https://cn.bongacams.com/azumi-8',
'only_matching': True,
}, {
'url': 'https://de.bongacams.net/claireashton',
'info_dict': {
'id': 'claireashton',
'ext': 'mp4',
'title': r're:ClaireAshton \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
'age_limit': 18,
'uploader_id': 'ClaireAshton',
'uploader': 'ClaireAshton',
'like_count': int,
'is_live': True,
},
'params': {
'skip_download': True,
},
}]
def _real_extract(self, url):

View File

@@ -0,0 +1,79 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
merge_dicts,
parse_iso8601,
T,
traverse_obj,
txt_or_none,
urljoin,
)
class CaffeineTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?caffeine\.tv/[^/]+/video/(?P<id>[0-9a-f-]+)'
_TESTS = [{
'url': 'https://www.caffeine.tv/TsuSurf/video/cffc0a00-e73f-11ec-8080-80017d29f26e',
'info_dict': {
'id': 'cffc0a00-e73f-11ec-8080-80017d29f26e',
'ext': 'mp4',
'title': 'GOOOOD MORNINNNNN #highlights',
'timestamp': 1654702180,
'upload_date': '20220608',
'uploader': 'TsuSurf',
'duration': 3145,
'age_limit': 17,
},
'params': {
'format': 'bestvideo',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
json_data = self._download_json(
'https://api.caffeine.tv/social/public/activity/' + video_id,
video_id)
broadcast_info = traverse_obj(json_data, ('broadcast_info', T(dict))) or {}
title = broadcast_info['broadcast_title']
video_url = broadcast_info['video_url']
ext = determine_ext(video_url)
if ext == 'm3u8':
formats = self._extract_m3u8_formats(
video_url, video_id, 'mp4', entry_protocol='m3u8',
fatal=False)
else:
formats = [{'url': video_url}]
self._sort_formats(formats)
return merge_dicts({
'id': video_id,
'title': title,
'formats': formats,
}, traverse_obj(json_data, {
'uploader': ((None, 'user'), 'username'),
}, get_all=False), traverse_obj(json_data, {
'like_count': ('like_count', T(int_or_none)),
'view_count': ('view_count', T(int_or_none)),
'comment_count': ('comment_count', T(int_or_none)),
'tags': ('tags', Ellipsis, T(txt_or_none)),
'is_live': 'is_live',
'uploader': ('user', 'name'),
}), traverse_obj(broadcast_info, {
'duration': ('content_duration', T(int_or_none)),
'timestamp': ('broadcast_start_time', T(parse_iso8601)),
'thumbnail': ('preview_image_path', T(lambda u: urljoin(url, u))),
'age_limit': ('content_rating', T(lambda r: r and {
# assume Apple Store ratings [1]
# 1. https://en.wikipedia.org/wiki/Mobile_software_content_rating_system
'FOUR_PLUS': 0,
'NINE_PLUS': 9,
'TWELVE_PLUS': 12,
'SEVENTEEN_PLUS': 17,
}.get(r, 17))),
}))

View File

@@ -0,0 +1,74 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
traverse_obj,
try_get,
)
class CallinIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?callin\.com/episode/(?:[^/#?-]+-)*(?P<id>[^/#?-]+)'
_TESTS = [{
'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW',
'md5': '14ede27ee2c957b7e4db93140fc0745c',
'info_dict': {
'id': 'PrumRdSQJW',
'ext': 'mp4',
'title': 'FCC Commissioner Brendan Carr on Elons Starlink',
'description': 'Or, why the government doesnt like SpaceX',
'channel': 'The Pull Request',
'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa',
}
}, {
'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA',
'md5': '16f704ddbf82a27e3930533b12062f07',
'info_dict': {
'id': 'lzxMidUnjA',
'ext': 'mp4',
'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?',
'description': 'Lets talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.',
'channel': 'The DEBRIEF With Briahna Joy Gray',
'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm',
}
}]
def _search_nextjs_data(self, webpage, video_id, transform_source=None, fatal=True, **kw):
return self._parse_json(
self._search_regex(
r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
webpage, 'next.js data', fatal=fatal, **kw),
video_id, transform_source=transform_source, fatal=fatal)
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
next_data = self._search_nextjs_data(webpage, video_id)
episode = traverse_obj(next_data, ('props', 'pageProps', 'episode'), expected_type=dict)
if not episode:
raise ExtractorError('Failed to find episode data')
title = episode.get('title') or self._og_search_title(webpage)
description = episode.get('description') or self._og_search_description(webpage)
formats = []
formats.extend(self._extract_m3u8_formats(
episode.get('m3u8'), video_id, 'mp4',
entry_protocol='m3u8_native', fatal=False))
self._sort_formats(formats)
channel = try_get(episode, lambda x: x['show']['title'], compat_str)
channel_url = try_get(episode, lambda x: x['show']['linkObj']['resourceUrl'], compat_str)
return {
'id': video_id,
'title': title,
'description': description,
'formats': formats,
'channel': channel,
'channel_url': channel_url,
}

View File

@@ -3,7 +3,6 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
url_or_none,
)
@@ -20,32 +19,11 @@ class CamModelsIE(InfoExtractor):
def _real_extract(self, url):
user_id = self._match_id(url)
webpage = self._download_webpage(
url, user_id, headers=self.geo_verification_headers())
manifest_root = self._html_search_regex(
r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None)
if not manifest_root:
ERRORS = (
("I'm offline, but let's stay connected", 'This user is currently offline'),
('in a private show', 'This user is in a private show'),
('is currently performing LIVE', 'This model is currently performing live'),
)
for pattern, message in ERRORS:
if pattern in webpage:
error = message
expected = True
break
else:
error = 'Unable to find manifest URL root'
expected = False
raise ExtractorError(error, expected=expected)
manifest = self._download_json(
'%s%s.json' % (manifest_root, user_id), user_id)
'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id)
formats = []
thumbnails = []
for format_id, format_dict in manifest['formats'].items():
if not isinstance(format_dict, dict):
continue
@@ -85,6 +63,13 @@ class CamModelsIE(InfoExtractor):
'preference': -1,
})
else:
if format_id == 'jpeg':
thumbnails.append({
'url': f['url'],
'width': f['width'],
'height': f['height'],
'format_id': f['format_id'],
})
continue
formats.append(f)
self._sort_formats(formats)
@@ -92,6 +77,7 @@ class CamModelsIE(InfoExtractor):
return {
'id': user_id,
'title': self._live_title(user_id),
'thumbnails': thumbnails,
'is_live': True,
'formats': formats,
'age_limit': 18

View File

@@ -12,35 +12,21 @@ from ..utils import (
ExtractorError,
float_or_none,
sanitized_Request,
unescapeHTML,
update_url_query,
str_or_none,
traverse_obj,
urlencode_postdata,
USER_AGENTS,
)
class CeskaTelevizeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
_VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
_TESTS = [{
'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
'info_dict': {
'id': '61924494877246241',
'ext': 'mp4',
'title': 'Hyde Park Civilizace: Život v Grónsku',
'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 3350,
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
'info_dict': {
'id': '61924494877028507',
'ext': 'mp4',
'title': 'Hyde Park Civilizace: Bonus 01 - En',
'title': 'Bonus 01 - En - Hyde Park Civilizace',
'description': 'English Subtittles',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 81.3,
@@ -51,31 +37,111 @@ class CeskaTelevizeIE(InfoExtractor):
},
}, {
# live stream
'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
'url': 'http://www.ceskatelevize.cz/zive/ct1/',
'info_dict': {
'id': 402,
'id': '102',
'ext': 'mp4',
'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
'title': r'ČT1 - živé vysílání online',
'description': 'Sledujte živé vysílání kanálu ČT1 online. Vybírat si můžete i z dalších kanálů České televize na kterémkoli z vašich zařízení.',
'is_live': True,
},
'params': {
# m3u8 download
'skip_download': True,
},
'skip': 'Georestricted to Czech Republic',
}, {
# another
'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
'only_matching': True,
'info_dict': {
'id': 402,
'ext': 'mp4',
'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
'is_live': True,
},
# 'skip': 'Georestricted to Czech Republic',
}, {
'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
'only_matching': True,
}, {
# video with 18+ caution trailer
'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
'info_dict': {
'id': '215562210900007-bogotart',
'title': 'Bogotart - Queer',
'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti',
},
'playlist': [{
'info_dict': {
'id': '61924494877311053',
'ext': 'mp4',
'title': 'Bogotart - Queer (Varování 18+)',
'duration': 11.9,
},
}, {
'info_dict': {
'id': '61924494877068022',
'ext': 'mp4',
'title': 'Bogotart - Queer (Queer)',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 1558.3,
},
}],
'params': {
# m3u8 download
'skip_download': True,
},
}, {
# iframe embed
'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
'only_matching': True,
}]
def _search_nextjs_data(self, webpage, video_id, **kw):
return self._parse_json(
self._search_regex(
r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
webpage, 'next.js data', **kw),
video_id, **kw)
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage, urlh = self._download_webpage_handle(url, playlist_id)
parsed_url = compat_urllib_parse_urlparse(urlh.geturl())
site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize')
playlist_title = self._og_search_title(webpage, default=None)
if site_name and playlist_title:
playlist_title = re.split(r'\s*[—|]\s*%s' % (site_name, ), playlist_title, 1)[0]
playlist_description = self._og_search_description(webpage, default=None)
if playlist_description:
playlist_description = playlist_description.replace('\xa0', ' ')
webpage = self._download_webpage(url, playlist_id)
type_ = 'IDEC'
if re.search(r'(^/porady|/zive)/', parsed_url.path):
next_data = self._search_nextjs_data(webpage, playlist_id)
if '/zive/' in parsed_url.path:
idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'idec'), get_all=False)
else:
idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False)
if not idec:
idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'videobonusDetail', 'bonusId'), get_all=False)
if idec:
type_ = 'bonus'
if not idec:
raise ExtractorError('Failed to find IDEC id')
iframe_hash = self._download_webpage(
'https://www.ceskatelevize.cz/v-api/iframe-hash/',
playlist_id, note='Getting IFRAME hash')
query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec, }
webpage = self._download_webpage(
'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php',
playlist_id, note='Downloading player', query=query)
NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
self.raise_geo_restricted(NOT_AVAILABLE_STRING)
if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )):
raise ExtractorError('no video with IDEC available', video_id=idec, expected=True)
type_ = None
episode_id = None
@@ -100,7 +166,7 @@ class CeskaTelevizeIE(InfoExtractor):
data = {
'playlist[0][type]': type_,
'playlist[0][id]': episode_id,
'requestUrl': compat_urllib_parse_urlparse(url).path,
'requestUrl': parsed_url.path,
'requestSource': 'iVysilani',
}
@@ -108,7 +174,7 @@ class CeskaTelevizeIE(InfoExtractor):
for user_agent in (None, USER_AGENTS['Safari']):
req = sanitized_Request(
'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/',
data=urlencode_postdata(data))
req.add_header('Content-type', 'application/x-www-form-urlencoded')
@@ -130,9 +196,6 @@ class CeskaTelevizeIE(InfoExtractor):
req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
req.add_header('Referer', url)
playlist_title = self._og_search_title(webpage, default=None)
playlist_description = self._og_search_description(webpage, default=None)
playlist = self._download_json(req, playlist_id, fatal=False)
if not playlist:
continue
@@ -167,7 +230,7 @@ class CeskaTelevizeIE(InfoExtractor):
entries[num]['formats'].extend(formats)
continue
item_id = item.get('id') or item['assetId']
item_id = str_or_none(item.get('id') or item['assetId'])
title = item['title']
duration = float_or_none(item.get('duration'))
@@ -181,8 +244,6 @@ class CeskaTelevizeIE(InfoExtractor):
if playlist_len == 1:
final_title = playlist_title or title
if is_live:
final_title = self._live_title(final_title)
else:
final_title = '%s (%s)' % (playlist_title, title)
@@ -200,6 +261,8 @@ class CeskaTelevizeIE(InfoExtractor):
for e in entries:
self._sort_formats(e['formats'])
if len(entries) == 1:
return entries[0]
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
def _get_subtitles(self, episode_id, subs):
@@ -236,54 +299,3 @@ class CeskaTelevizeIE(InfoExtractor):
yield line
return '\r\n'.join(_fix_subtitle(subtitles))
class CeskaTelevizePoradyIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
_TESTS = [{
# video with 18+ caution trailer
'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
'info_dict': {
'id': '215562210900007-bogotart',
'title': 'Queer: Bogotart',
'description': 'Alternativní průvodce současným queer světem',
},
'playlist': [{
'info_dict': {
'id': '61924494876844842',
'ext': 'mp4',
'title': 'Queer: Bogotart (Varování 18+)',
'duration': 10.2,
},
}, {
'info_dict': {
'id': '61924494877068022',
'ext': 'mp4',
'title': 'Queer: Bogotart (Queer)',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 1558.3,
},
}],
'params': {
# m3u8 download
'skip_download': True,
},
}, {
# iframe embed
'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data_url = update_url_query(unescapeHTML(self._search_regex(
(r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'),
webpage, 'iframe player url', group='url')), query={
'autoStart': 'true',
})
return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key())

View File

@@ -0,0 +1,69 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
merge_dicts,
T,
traverse_obj,
unified_timestamp,
url_or_none,
)
class ClipchampIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU',
'info_dict': {
'id': 'gRXZ4ZhdDaU',
'ext': 'mp4',
'title': 'Untitled video',
'uploader': 'Alexander Schwartz',
'timestamp': 1680805580,
'upload_date': '20230406',
'thumbnail': r're:^https?://.+\.jpg',
},
'params': {
'skip_download': 'm3u8',
'format': 'bestvideo',
},
}]
_STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'
_STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video']
storage_location = data.get('storage_location')
if storage_location != 'cf_stream':
raise ExtractorError('Unsupported clip storage location "%s"' % (storage_location,))
path = data['download_url']
iframe = self._download_webpage(
'https://iframe.cloudflarestream.com/' + path, video_id, 'Downloading player iframe')
subdomain = self._search_regex(
r'''\bcustomer-domain-prefix\s*=\s*("|')(?P<sd>[\w-]+)\1''', iframe,
'subdomain', group='sd', fatal=False) or 'customer-2ut9yn3y6fta1yxe'
formats = self._extract_mpd_formats(
self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id,
query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash')
formats.extend(self._extract_m3u8_formats(
self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4',
query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls'))
return merge_dicts({
'id': video_id,
'formats': formats,
'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), T(compat_str)))) or None,
}, traverse_obj(data, {
'title': ('project', 'project_name', T(compat_str)),
'timestamp': ('created_at', T(unified_timestamp)),
'thumbnail': ('thumbnail_url', T(url_or_none)),
}), rev=True)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,148 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
str_or_none,
try_get,
unified_timestamp,
update_url_query,
urljoin,
)
# compat_range
try:
if callable(xrange):
range = xrange
except (NameError, TypeError):
pass
class CPACIE(InfoExtractor):
IE_NAME = 'cpac'
_VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?P<fr>l-)?episode\?id=(?P<id>[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})'
_TEST = {
# 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909',
'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f',
'md5': 'e46ad699caafd7aa6024279f2614e8fa',
'info_dict': {
'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f',
'ext': 'mp4',
'upload_date': '20220215',
'title': 'News Conference to Celebrate National Kindness Week February 15, 2022',
'description': 'md5:466a206abd21f3a6f776cdef290c23fb',
'timestamp': 1644901200,
},
'params': {
'format': 'bestvideo',
'hls_prefer_native': True,
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
url_lang = 'fr' if '/l-episode?' in url else 'en'
content = self._download_json(
'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id,
video_id)
video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str)
formats = []
if video_url:
content = content['page']
title = str_or_none(content['details']['title_%s_t' % (url_lang, )])
formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4')
for fmt in formats:
# prefer language to match URL
fmt_lang = fmt.get('language')
if fmt_lang == url_lang:
fmt['language_preference'] = 10
elif not fmt_lang:
fmt['language_preference'] = -1
else:
fmt['language_preference'] = -10
self._sort_formats(formats)
category = str_or_none(content['details']['category_%s_t' % (url_lang, )])
def is_live(v_type):
return (v_type == 'live') if v_type is not None else None
return {
'id': video_id,
'formats': formats,
'title': title,
'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))),
'timestamp': unified_timestamp(content['details'].get('liveDateTime')),
'category': [category] if category else None,
'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))),
'is_live': is_live(content['details'].get('type')),
}
class CPACPlaylistIE(InfoExtractor):
IE_NAME = 'cpac:playlist'
_VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?P<fr>emission|rechercher))\?(?:[^&]+&)*?(?P<id>(?:id=\d+|programId=\d+|key=[^&]+))'
_TESTS = [{
'url': 'https://www.cpac.ca/program?id=6',
'info_dict': {
'id': 'id=6',
'title': 'Headline Politics',
'description': 'Watch CPACs signature long-form coverage of the days pressing political events as they unfold.',
},
'playlist_count': 10,
}, {
'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc',
'info_dict': {
'id': 'key=hudson',
'title': 'hudson',
},
'playlist_count': 22,
}, {
'url': 'https://www.cpac.ca/search?programId=50',
'info_dict': {
'id': 'programId=50',
'title': '50',
},
'playlist_count': 9,
}, {
'url': 'https://www.cpac.ca/emission?id=6',
'only_matching': True,
}, {
'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en'
pl_type, list_type = ('program', 'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult')
api_url = (
'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s'
% (pl_type, video_id, ))
content = self._download_json(api_url, video_id)
entries = []
total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1)
for page in range(1, total_pages + 1):
if page > 1:
api_url = update_url_query(api_url, {'page': '%d' % (page, ), })
content = self._download_json(
api_url, video_id,
note='Downloading continuation - %d' % (page, ),
fatal=False)
for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []:
episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )]))
if episode_url:
entries.append(episode_url)
return self.playlist_result(
(self.url_result(entry) for entry in entries),
playlist_id=video_id,
playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1],
playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]),
)

204
youtube_dl/extractor/dlf.py Normal file
View File

@@ -0,0 +1,204 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
)
from ..utils import (
determine_ext,
extract_attributes,
int_or_none,
merge_dicts,
traverse_obj,
url_or_none,
variadic,
)
class DLFBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/'
_BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)'
def _parse_button_attrs(self, button, audio_id=None):
attrs = extract_attributes(button)
audio_id = audio_id or attrs['data-audio-diraid']
url = traverse_obj(
attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference',
'data-audio-src', expected_type=url_or_none)
ext = determine_ext(url)
formats = (self._extract_m3u8_formats(url, audio_id, fatal=False)
if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}])
self._sort_formats(formats)
def traverse_attrs(path):
path = list(variadic(path))
t = path.pop() if callable(path[-1]) else None
return traverse_obj(attrs, path, expected_type=t, get_all=False)
def txt_or_none(v, default=None):
return default if v is None else (compat_str(v).strip() or default)
return merge_dicts(*reversed([{
'id': audio_id,
# 'extractor_key': DLFIE.ie_key(),
# 'extractor': DLFIE.IE_NAME,
'formats': formats,
}, dict((k, traverse_attrs(v)) for k, v in {
'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), txt_or_none),
'duration': (('data-audioduration', 'data-audio-duration'), int_or_none),
'thumbnail': ('data-audioimage', url_or_none),
'uploader': 'data-audio-producer',
'series': 'data-audio-series',
'channel': 'data-audio-origin-site-name',
'webpage_url': ('data-audio-download-tracking-path', url_or_none),
}.items())]))
class DLFIE(DLFBaseIE):
IE_NAME = 'dlf'
_VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html'
_TESTS = [
# Audio as an HLS stream
{
'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html',
'info_dict': {
'id': '03a3eb19',
'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien',
'ext': 'm4a',
'duration': 3298,
'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
'uploader': 'Deutschlandfunk',
'series': 'On Stage',
'channel': 'deutschlandfunk'
},
'params': {
'skip_download': 'm3u8'
},
'skip': 'This webpage no longer exists'
}, {
'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html',
'info_dict': {
'id': 'd9cc1856',
'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner',
'ext': 'mp3',
'duration': 291,
'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
'uploader': 'Deutschlandfunk',
'series': 'Kommentare und Themen der Woche',
'channel': 'deutschlandfunk'
}
},
]
def _real_extract(self, url):
audio_id = self._match_id(url)
webpage = self._download_webpage(url, audio_id)
return self._parse_button_attrs(
self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id)
class DLFCorpusIE(DLFBaseIE):
IE_NAME = 'dlf:corpus'
IE_DESC = 'DLF Multi-feed Archives'
_VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html'
_TESTS = [
# Recorded news broadcast with referrals to related broadcasts
{
'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html',
'info_dict': {
'id': 'fechten-russland-belarus-ukraine-protest-100',
'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad'
},
'playlist_mincount': 5,
'playlist': [{
'info_dict': {
'id': '1fc5d64a',
'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
'ext': 'mp3',
'duration': 252,
'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313',
'uploader': 'Deutschlandfunk',
'series': 'Sport',
'channel': 'deutschlandfunk'
}
}, {
'info_dict': {
'id': '2ada145f',
'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten',
'ext': 'mp3',
'duration': 336,
'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005',
'uploader': 'Deutschlandfunk',
'series': 'Deutschlandfunk Nova',
'channel': 'deutschlandfunk-nova'
}
}, {
'info_dict': {
'id': '5e55e8c9',
'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
'ext': 'mp3',
'duration': 187,
'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
'uploader': 'Deutschlandfunk',
'series': 'Sport am Samstag',
'channel': 'deutschlandfunk'
}
}, {
'info_dict': {
'id': '47e1a096',
'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"',
'ext': 'mp3',
'duration': 602,
'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770',
'uploader': 'Deutschlandfunk',
'series': 'Sport am Samstag',
'channel': 'deutschlandfunk'
}
}, {
'info_dict': {
'id': '5e55e8c9',
'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
'ext': 'mp3',
'duration': 187,
'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
'uploader': 'Deutschlandfunk',
'series': 'Sport am Samstag',
'channel': 'deutschlandfunk'
}
}]
},
# Podcast feed with tag buttons, playlist count fluctuates
{
'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html',
'info_dict': {
'id': 'kommentare-und-themen-der-woche-100',
'title': 'Meinung - Kommentare und Themen der Woche',
'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5',
},
'playlist_mincount': 10,
},
# Podcast feed with no description
{
'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html',
'info_dict': {
'id': 'podcast-tolle-idee-100',
'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?',
},
'playlist_mincount': 11,
},
]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
return self.playlist_result(
map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)),
playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage, default=None))

View File

@@ -0,0 +1,101 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
float_or_none,
T,
traverse_obj,
txt_or_none,
unified_timestamp,
url_or_none,
)
class EpidemicSoundIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?epidemicsound\.com/track/(?P<id>[0-9a-zA-Z]+)'
_TESTS = [{
'url': 'https://www.epidemicsound.com/track/yFfQVRpSPz/',
'md5': 'd98ff2ddb49e8acab9716541cbc9dfac',
'info_dict': {
'id': '45014',
'display_id': 'yFfQVRpSPz',
'ext': 'mp3',
'tags': ['foley', 'door', 'knock', 'glass', 'window', 'glass door knock'],
'title': 'Door Knock Door 1',
'duration': 1,
'thumbnail': 'https://cdn.epidemicsound.com/curation-assets/commercial-release-cover-images/default-sfx/3000x3000.jpg',
'timestamp': 1415320353,
'upload_date': '20141107',
'age_limit': None,
# check that the "best" format was found, since test file MD5 doesn't
# distinguish the formats
'format': 'full',
},
}, {
'url': 'https://www.epidemicsound.com/track/mj8GTTwsZd/',
'md5': 'c82b745890f9baf18dc2f8d568ee3830',
'info_dict': {
'id': '148700',
'display_id': 'mj8GTTwsZd',
'ext': 'mp3',
'tags': ['liquid drum n bass', 'energetic'],
'title': 'Noplace',
'duration': 237,
'thumbnail': 'https://cdn.epidemicsound.com/curation-assets/commercial-release-cover-images/11138/3000x3000.jpg',
'timestamp': 1694426482,
'release_timestamp': 1700535606,
'upload_date': '20230911',
'age_limit': None,
'format': 'full',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
json_data = self._download_json('https://www.epidemicsound.com/json/track/' + video_id, video_id)
def fmt_or_none(f):
if not f.get('format'):
f['format'] = f.get('format_id')
elif not f.get('format_id'):
f['format_id'] = f['format']
if not (f['url'] and f['format']):
return
if f.get('format_note'):
f['format_note'] = 'track ID ' + f['format_note']
f['preference'] = -1 if f['format'] == 'full' else -2
return f
formats = traverse_obj(json_data, (
'stems', T(dict.items), Ellipsis, {
'format': (0, T(txt_or_none)),
'format_note': (1, 's3TrackId', T(txt_or_none)),
'format_id': (1, 'stemType', T(txt_or_none)),
'url': (1, 'lqMp3Url', T(url_or_none)),
}, T(fmt_or_none)))
self._sort_formats(formats)
info = traverse_obj(json_data, {
'id': ('id', T(txt_or_none)),
'tags': ('metadataTags', Ellipsis, T(txt_or_none)),
'title': ('title', T(txt_or_none)),
'duration': ('length', T(float_or_none)),
'timestamp': ('added', T(unified_timestamp)),
'thumbnail': (('imageUrl', 'cover'), T(url_or_none)),
'age_limit': ('isExplicit', T(lambda b: 18 if b else None)),
'release_timestamp': ('releaseDate', T(unified_timestamp)),
}, get_all=False)
info.update(traverse_obj(json_data, {
'categories': ('genres', Ellipsis, 'tag', T(txt_or_none)),
'tags': ('metadataTags', Ellipsis, T(txt_or_none)),
}))
info.update({
'display_id': video_id,
'formats': formats,
})
return info

View File

@@ -51,6 +51,10 @@ from .anvato import AnvatoIE
from .aol import AolIE
from .allocine import AllocineIE
from .aliexpress import AliExpressLiveIE
from .alsace20tv import (
Alsace20TVIE,
Alsace20TVEmbedIE,
)
from .apa import APAIE
from .aparat import AparatIE
from .appleconnect import AppleConnectIE
@@ -115,6 +119,7 @@ from .bfmtv import (
)
from .bibeltv import BibelTVIE
from .bigflix import BigflixIE
from .bigo import BigoIE
from .bild import BildIE
from .bilibili import (
BiliBiliIE,
@@ -133,6 +138,7 @@ from .bleacherreport import (
BleacherReportIE,
BleacherReportCMSIE,
)
from .blerp import BlerpIE
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
from .bongacams import BongaCamsIE
@@ -153,6 +159,8 @@ from .businessinsider import BusinessInsiderIE
from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE
from .c56 import C56IE
from .caffeine import CaffeineTVIE
from .callin import CallinIE
from .camdemy import (
CamdemyIE,
CamdemyFolderIE
@@ -203,10 +211,7 @@ from .ccc import (
from .ccma import CCMAIE
from .cctv import CCTVIE
from .cda import CDAIE
from .ceskatelevize import (
CeskaTelevizeIE,
CeskaTelevizePoradyIE,
)
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
from .charlierose import CharlieRoseIE
from .chaturbate import ChaturbateIE
@@ -222,6 +227,7 @@ from .ciscolive import (
CiscoLiveSearchIE,
)
from .cjsw import CJSWIE
from .clipchamp import ClipchampIE
from .cliphunter import CliphunterIE
from .clippit import ClippitIE
from .cliprs import ClipRsIE
@@ -254,6 +260,10 @@ from .commonprotocols import (
from .condenast import CondeNastIE
from .contv import CONtvIE
from .corus import CorusIE
from .cpac import (
CPACIE,
CPACPlaylistIE,
)
from .cracked import CrackedIE
from .crackle import CrackleIE
from .crooksandliars import CrooksAndLiarsIE
@@ -287,6 +297,10 @@ from .dbtv import DBTVIE
from .dctp import DctpTvIE
from .deezer import DeezerPlaylistIE
from .democracynow import DemocracynowIE
from .dlf import (
DLFCorpusIE,
DLFIE,
)
from .dfb import DFBIE
from .dhm import DHMIE
from .digg import DiggIE
@@ -344,6 +358,7 @@ from .ellentube import (
from .elpais import ElPaisIE
from .embedly import EmbedlyIE
from .engadget import EngadgetIE
from .epidemicsound import EpidemicSoundIE
from .eporner import EpornerIE
from .eroprofile import EroProfileIE
from .escapist import EscapistIE
@@ -368,6 +383,7 @@ from .fc2 import (
FC2EmbedIE,
)
from .fczenit import FczenitIE
from .fifa import FifaIE
from .filmon import (
FilmOnIE,
FilmOnChannelIE,
@@ -427,6 +443,7 @@ from .gamespot import GameSpotIE
from .gamestar import GameStarIE
from .gaskrank import GaskrankIE
from .gazeta import GazetaIE
from .gbnews import GBNewsIE
from .gdcvault import GDCVaultIE
from .gedidigital import GediDigitalIE
from .generic import GenericIE
@@ -434,6 +451,13 @@ from .gfycat import GfycatIE
from .giantbomb import GiantBombIE
from .giga import GigaIE
from .glide import GlideIE
from .globalplayer import (
GlobalPlayerLiveIE,
GlobalPlayerLivePlaylistIE,
GlobalPlayerAudioIE,
GlobalPlayerAudioEpisodeIE,
GlobalPlayerVideoIE
)
from .globo import (
GloboIE,
GloboArticleIE,
@@ -470,6 +494,7 @@ from .hotstar import (
)
from .howcast import HowcastIE
from .howstuffworks import HowStuffWorksIE
from .hrfernsehen import HRFernsehenIE
from .hrti import (
HRTiIE,
HRTiPlaylistIE,
@@ -546,8 +571,10 @@ from .khanacademy import (
from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE
from .kommunetv import KommunetvIE
from .konserthusetplay import KonserthusetPlayIE
from .krasview import KrasViewIE
from .kth import KTHIE
from .ku6 import Ku6IE
from .kusi import KUSIIE
from .kuwo import (
@@ -717,6 +744,7 @@ from .myvi import (
MyviIE,
MyviEmbedIE,
)
from .myvideoge import MyVideoGeIE
from .myvidster import MyVidsterIE
from .nationalgeographic import (
NationalGeographicVideoIE,
@@ -870,21 +898,13 @@ from .ooyala import (
)
from .ora import OraTVIE
from .orf import (
ORFTVthekIE,
ORFFM4IE,
ORFONIE,
ORFONLiveIE,
ORFFM4StoryIE,
ORFOE1IE,
ORFOE3IE,
ORFNOEIE,
ORFWIEIE,
ORFBGLIE,
ORFOOEIE,
ORFSTMIE,
ORFKTNIE,
ORFSBGIE,
ORFTIRIE,
ORFVBGIE,
ORFIPTVIE,
ORFPodcastIE,
ORFRadioIE,
ORFRadioCollectionIE,
)
from .outsidetv import OutsideTVIE
from .packtpub import (
@@ -901,6 +921,10 @@ from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
from .pearvideo import PearVideoIE
from .peekvids import (
PeekVidsIE,
PlayVidsIE,
)
from .peertube import PeerTubeIE
from .people import PeopleIE
from .performgroup import PerformGroupIE
@@ -957,6 +981,10 @@ from .pornhub import (
from .pornotube import PornotubeIE
from .pornovoisines import PornoVoisinesIE
from .pornoxo import PornoXOIE
from .pr0gramm import (
Pr0grammIE,
Pr0grammStaticIE,
)
from .puhutv import (
PuhuTVIE,
PuhuTVSerieIE,
@@ -994,6 +1022,10 @@ from .raywenderlich import (
RayWenderlichIE,
RayWenderlichCourseIE,
)
from .rbgtum import (
RbgTumIE,
RbgTumCourseIE,
)
from .rbmaradio import RBMARadioIE
from .rds import RDSIE
from .redbulltv import (
@@ -1049,6 +1081,10 @@ from .rutube import (
from .rutv import RUTVIE
from .ruutu import RuutuIE
from .ruv import RuvIE
from .s4c import (
S4CIE,
S4CSeriesIE,
)
from .safari import (
SafariIE,
SafariApiIE,
@@ -1184,6 +1220,7 @@ from .storyfire import (
from .streamable import StreamableIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streamsb import StreamsbIE
from .streetvoice import StreetVoiceIE
from .stretchinternet import StretchInternetIE
from .stv import STVPlayerIE
@@ -1253,6 +1290,11 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
from .thisvid import (
ThisVidIE,
ThisVidMemberIE,
ThisVidPlaylistIE,
)
from .threeqsdn import ThreeQSDNIE
from .tiktok import (
TikTokIE,
@@ -1537,6 +1579,7 @@ from .weibo import (
WeiboMobileIE
)
from .weiqitv import WeiqiTVIE
from .whyp import WhypIE
from .wistia import (
WistiaIE,
WistiaPlaylistIE,
@@ -1602,7 +1645,15 @@ from .younow import (
YouNowChannelIE,
YouNowMomentIE,
)
from .youporn import YouPornIE
from .youporn import (
YouPornIE,
YouPornCategoryIE,
YouPornChannelIE,
YouPornCollectionIE,
YouPornStarIE,
YouPornTagIE,
YouPornVideosIE,
)
from .yourporn import YourPornIE
from .yourupload import YourUploadIE
from .youtube import (

View File

@@ -0,0 +1,101 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
traverse_obj,
unified_timestamp,
)
if not callable(getattr(InfoExtractor, '_match_valid_url', None)):
BaseInfoExtractor = InfoExtractor
import re
class InfoExtractor(BaseInfoExtractor):
@classmethod
def _match_valid_url(cls, url):
return re.match(cls._VALID_URL, url)
class FifaIE(InfoExtractor):
_VALID_URL = r'https?://www.fifa.com/fifaplus/(?P<locale>\w{2})/watch/([^#?]+/)?(?P<id>\w+)'
_TESTS = [{
'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y',
'info_dict': {
'id': '7on10qPcnyLajDDU3ntg6y',
'title': 'Italy v France | Final | 2006 FIFA World Cup Germany™ | Full Match Replay',
'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b',
'ext': 'mp4',
'categories': ['FIFA Tournaments'],
'thumbnail': 'https://digitalhub.fifa.com/transform/135e2656-3a51-407b-8810-6c34bec5b59b/FMR_2006_Italy_France_Final_Hero',
'duration': 8165,
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.fifa.com/fifaplus/pt/watch/1cg5r5Qt6Qt12ilkDgb1sV',
'info_dict': {
'id': '1cg5r5Qt6Qt12ilkDgb1sV',
'title': 'Brazil v Germany | Semi-finals | 2014 FIFA World Cup Brazil™ | Extended Highlights',
'description': 'md5:d908c74ee66322b804ae2e521b02a855',
'ext': 'mp4',
'categories': ['FIFA Tournaments', 'Highlights'],
'thumbnail': 'https://digitalhub.fifa.com/transform/d8fe6f61-276d-4a73-a7fe-6878a35fd082/FIFAPLS_100EXTHL_2014BRAvGER_TMB',
'duration': 902,
'release_timestamp': 1404777600,
'release_date': '20140708',
},
'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://www.fifa.com/fifaplus/fr/watch/3C6gQH9C2DLwzNx7BMRQdp',
'info_dict': {
'id': '3C6gQH9C2DLwzNx7BMRQdp',
'title': 'Josimar goal against Northern Ireland | Classic Goals',
'description': 'md5:cbe7e7bb52f603c9f1fe9a4780fe983b',
'ext': 'mp4',
'categories': ['FIFA Tournaments', 'Goal'],
'duration': 28,
'thumbnail': 'https://digitalhub.fifa.com/transform/f9301391-f8d9-48b5-823e-c093ac5e3e11/CG_MEN_1986_JOSIMAR',
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
video_id, locale = self._match_valid_url(url).group('id', 'locale')
webpage = self._download_webpage(url, video_id)
preconnect_link = self._search_regex(
r'<link\b[^>]+\brel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link')
video_details = self._download_json(
'{preconnect_link}/sections/videoDetails/{video_id}'.format(**locals()), video_id, 'Downloading Video Details', fatal=False)
preplay_parameters = self._download_json(
'{preconnect_link}/videoPlayerData/{video_id}'.format(**locals()), video_id, 'Downloading Preplay Parameters')['preplayParameters']
content_data = self._download_json(
# 1. query string is expected to be sent as-is
# 2. `sig` must be appended
# 3. if absent, the call appears to work but the manifest is bad (404)
'https://content.uplynk.com/preplay/{contentId}/multiple.json?{queryStr}&sig={signature}'.format(**preplay_parameters),
video_id, 'Downloading Content Data')
# formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id)
formats, subtitles = self._extract_m3u8_formats(content_data['playURL'], video_id, ext='mp4', entry_protocol='m3u8_native'), None
self._sort_formats(formats)
return {
'id': video_id,
'title': video_details['title'],
'description': video_details.get('description'),
'duration': int_or_none(video_details.get('duration')),
'release_timestamp': unified_timestamp(video_details.get('dateOfRelease')),
'categories': traverse_obj(video_details, (('videoCategory', 'videoSubcategory'),)),
'thumbnail': traverse_obj(video_details, ('backgroundImage', 'src')),
'formats': formats,
'subtitles': subtitles,
}

View File

@@ -0,0 +1,139 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
extract_attributes,
ExtractorError,
T,
traverse_obj,
txt_or_none,
url_or_none,
)
class GBNewsIE(InfoExtractor):
IE_DESC = 'GB News clips, features and live stream'
# \w+ is normally shows or news, but apparently any word redirects to the correct URL
_VALID_URL = r'https?://(?:www\.)?gbnews\.(?:uk|com)/(?:\w+/)?(?P<id>[^#?]+)'
_PLATFORM = 'safari'
_SSMP_URL = 'https://mm-v2.simplestream.com/ssmp/api.php'
_TESTS = [{
'url': 'https://www.gbnews.uk/shows/andrew-neils-message-to-companies-choosing-to-boycott-gb-news/106889',
'info_dict': {
'id': '106889',
'ext': 'mp4',
'title': "Andrew Neil's message to companies choosing to boycott GB News",
'description': 'md5:b281f5d22fd6d5eda64a4e3ba771b351',
},
'skip': '404 not found',
}, {
'url': 'https://www.gbnews.com/news/bbc-claudine-gay-harvard-university-antisemitism-row',
'info_dict': {
'id': '52264136',
'display_id': 'bbc-claudine-gay-harvard-university-antisemitism-row',
'ext': 'mp4',
'title': 'BBC deletes post after furious backlash over headline downplaying antisemitism',
'description': 'The post was criticised by former employers of the broadcaster',
},
}, {
'url': 'https://www.gbnews.uk/watchlive',
'info_dict': {
'id': '1069',
'display_id': 'watchlive',
'ext': 'mp4',
'title': 'GB News Live',
'is_live': True,
},
'params': {
'skip_download': 'm3u8',
},
}]
def _real_extract(self, url):
display_id = self._match_id(url).split('/')[-1]
webpage = self._download_webpage(url, display_id)
# extraction based on https://github.com/ytdl-org/youtube-dl/issues/29341
'''
<div id="video-106908"
class="simplestream"
data-id="GB001"
data-type="vod"
data-key="3Li3Nt2Qs8Ct3Xq9Fi5Uy0Mb2Bj0Qs"
data-token="f9c317c727dc07f515b20036c8ef14a6"
data-expiry="1624300052"
data-uvid="37900558"
data-poster="https://thumbnails.simplestreamcdn.com/gbnews/ondemand/37900558.jpg?width=700&"
data-npaw="false"
data-env="production">
'''
# exception if no match
video_data = self._search_regex(
r'(<div\s[^>]*\bclass\s*=\s*(\'|")(?!.*sidebar\b)simplestream(?:\s[\s\w$-]*)?\2[^>]*>)',
webpage, 'video data')
video_data = extract_attributes(video_data)
ss_id = video_data.get('data-id')
if not ss_id:
raise ExtractorError('Simplestream ID not found')
json_data = self._download_json(
self._SSMP_URL, display_id,
note='Downloading Simplestream JSON metadata',
errnote='Unable to download Simplestream JSON metadata',
query={
'id': ss_id,
'env': video_data.get('data-env', 'production'),
}, fatal=False)
meta_url = traverse_obj(json_data, ('response', 'api_hostname'))
if not meta_url:
raise ExtractorError('No API host found')
uvid = video_data['data-uvid']
dtype = video_data.get('data-type')
stream_data = self._download_json(
'%s/api/%s/stream/%s' % (meta_url, 'show' if dtype == 'vod' else dtype, uvid),
uvid,
query={
'key': video_data.get('data-key'),
'platform': self._PLATFORM,
},
headers={
'Token': video_data.get('data-token'),
'Token-Expiry': video_data.get('data-expiry'),
'Uvid': uvid,
}, fatal=False)
stream_url = traverse_obj(stream_data, (
'response', 'stream', T(url_or_none)))
if not stream_url:
raise ExtractorError('No stream data/URL')
# now known to be a dict
stream_data = stream_data['response']
drm = stream_data.get('drm')
if drm:
self.report_drm(uvid)
formats = self._extract_m3u8_formats(
stream_url, uvid, ext='mp4', entry_protocol='m3u8_native',
fatal=False)
# exception if no formats
self._sort_formats(formats)
return {
'id': uvid,
'display_id': display_id,
'title': (traverse_obj(stream_data, ('title', T(txt_or_none)))
or self._og_search_title(webpage, default=None)
or display_id.replace('-', ' ').capitalize()),
'description': self._og_search_description(webpage, default=None),
'thumbnail': (traverse_obj(video_data, ('data-poster', T(url_or_none)))
or self._og_search_thumbnail(webpage)),
'formats': formats,
'is_live': (dtype == 'live') or None,
}

View File

@@ -28,6 +28,7 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_duration,
parse_resolution,
sanitized_Request,
smuggle_url,
unescapeHTML,
@@ -35,6 +36,7 @@ from ..utils import (
unsmuggle_url,
UnsupportedError,
url_or_none,
urljoin,
xpath_attr,
xpath_text,
xpath_with_ns,
@@ -2227,6 +2229,116 @@ class GenericIE(InfoExtractor):
# Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed)
'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html',
'only_matching': True,
}, {
# KVS Player
'url': 'https://www.kvs-demo.com/videos/105/kelis-4th-of-july/',
'info_dict': {
'id': '105',
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July',
'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
}, {
# KVS Player
'url': 'https://www.kvs-demo.com/embed/105/',
'info_dict': {
'id': '105',
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player',
'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
'params': {
'skip_download': True,
},
}, {
# KVS Player (tested also in thisvid.py)
'url': 'https://youix.com/video/leningrad-zoj/',
'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': {
'id': '18485',
'display_id': 'leningrad-zoj',
'ext': 'mp4',
'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
},
}, {
# KVS Player
'url': 'https://youix.com/embed/18485',
'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': {
'id': '18485',
'display_id': 'leningrad-zoj',
'ext': 'mp4',
'title': 'Ленинград - ЗОЖ',
'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
},
}, {
# KVS Player
'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
'md5': '94166bdb26b4cb1fb9214319a629fc51',
'info_dict': {
'id': '21217',
'display_id': '40-nochey-2016',
'ext': 'mp4',
'title': '40 ночей (2016) - BogMedia.org',
'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
},
}, {
# KVS Player (for sites that serve kt_player.js via non-https urls)
'url': 'http://www.camhub.world/embed/389508',
'md5': 'fbe89af4cfb59c8fd9f34a202bb03e32',
'info_dict': {
'id': '389508',
'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
'ext': 'mp4',
'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
},
}, {
'url': 'https://mrdeepfakes.com/video/5/selena-gomez-pov-deep-fakes',
'md5': 'fec4ad5ec150f655e0c74c696a4a2ff4',
'info_dict': {
'id': '5',
'display_id': 'selena-gomez-pov-deep-fakes',
'ext': 'mp4',
'title': 'Selena Gomez POV (Deep Fakes) DeepFake Porn - MrDeepFakes',
'description': 'md5:17d1f84b578c9c26875ac5ef9a932354',
'height': 720,
'age_limit': 18,
},
}, {
'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
'md5': 'e2f0a4c329f7986280b7328e24036d60',
'info_dict': {
'id': '284002',
'display_id': 'just-out-of-the-shower-joi',
'ext': 'mp4',
'title': 'Just Out Of The Shower JOI - Shooshtime',
'height': 720,
'age_limit': 18,
},
}, {
# would like to use the yt-dl test video but searching for
# '"\'/\\ä↭𝕐' fails, so using an old vid from YouTube Korea
'note': 'Test default search',
'url': 'Shorts로 허락 필요없이 놀자! (BTS편)',
'info_dict': {
'id': 'usDGO4Zb-dc',
'ext': 'mp4',
'title': 'YouTube Shorts로 허락 필요없이 놀자! (BTS편)',
'description': 'md5:96e31607eba81ab441567b5e289f4716',
'upload_date': '20211107',
'uploader': 'YouTube Korea',
'location': '대한민국',
},
'params': {
'default_search': 'ytsearch',
'skip_download': True,
},
'expected_warnings': ['uploader id'],
},
]
@@ -2332,6 +2444,88 @@ class GenericIE(InfoExtractor):
'title': title,
}
def _extract_kvs(self, url, webpage, video_id):
def getlicensetoken(license):
modlicense = license.replace('$', '').replace('0', '1')
center = int(len(modlicense) / 2)
fronthalf = int(modlicense[:center + 1])
backhalf = int(modlicense[center:])
modlicense = compat_str(4 * abs(fronthalf - backhalf))
def parts():
for o in range(0, center + 1):
for i in range(1, 5):
yield compat_str((int(license[o + i]) + int(modlicense[o])) % 10)
return ''.join(parts())
def getrealurl(video_url, license_code):
if not video_url.startswith('function/0/'):
return video_url # not obfuscated
url_path, _, url_query = video_url.partition('?')
urlparts = url_path.split('/')[2:]
license = getlicensetoken(license_code)
newmagic = urlparts[5][:32]
def spells(x, o):
l = (o + sum(int(n) for n in license[o:])) % 32
for i in range(0, len(x)):
yield {l: x[o], o: x[l]}.get(i, x[i])
for o in range(len(newmagic) - 1, -1, -1):
newmagic = ''.join(spells(newmagic, o))
urlparts[5] = newmagic + urlparts[5][32:]
return '/'.join(urlparts) + '?' + url_query
flashvars = self._search_regex(
r'(?s)<script\b[^>]*>.*?var\s+flashvars\s*=\s*(\{.+?\});.*?</script>',
webpage, 'flashvars')
flashvars = self._parse_json(flashvars, video_id, transform_source=js_to_json)
# extract the part after the last / as the display_id from the
# canonical URL.
display_id = self._search_regex(
r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
webpage, 'display_id', fatal=False
)
title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
thumbnail = flashvars['preview_url']
if thumbnail.startswith('//'):
protocol, _, _ = url.partition('/')
thumbnail = protocol + thumbnail
url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
formats = []
for key in url_keys:
if '/get_file/' not in flashvars[key]:
continue
format_id = flashvars.get(key + '_text', key)
formats.append(merge_dicts(
parse_resolution(format_id) or parse_resolution(flashvars[key]), {
'url': urljoin(url, getrealurl(flashvars[key], flashvars['license_code'])),
'format_id': format_id,
'ext': 'mp4',
'http_headers': {'Referer': url},
}))
if not formats[-1].get('height'):
formats[-1]['quality'] = 1
self._sort_formats(formats)
return {
'id': flashvars['video_id'],
'display_id': display_id,
'title': title,
'thumbnail': thumbnail,
'formats': formats,
}
def _real_extract(self, url):
if url.startswith('//'):
return self.url_result(self.http_scheme() + url)
@@ -2540,9 +2734,16 @@ class GenericIE(InfoExtractor):
# but actually don't.
AGE_LIMIT_MARKERS = [
r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
r'>[^<]*you acknowledge you are at least (\d+) years old',
r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
]
if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
age_limit = 18
for marker in AGE_LIMIT_MARKERS:
m = re.search(marker, webpage)
if not m:
continue
age_limit = max(
age_limit or 0,
int_or_none(m.groups() and m.group(1), default=18))
# video uploader is domain name
video_uploader = self._search_regex(
@@ -3389,6 +3590,20 @@ class GenericIE(InfoExtractor):
info_dict['formats'] = formats
return info_dict
# Look for generic KVS player (before ld+json for tests)
found = self._search_regex(
(r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
# kt_player('kt_player', 'https://i.shoosh.co/player/kt_player.swf?v=5.5.1', ...
r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
), webpage, 'KVS player', group='ver', default=False)
if found:
self.report_extraction('%s: KVS Player' % (video_id, ))
if found.split('.')[0] not in ('4', '5', '6'):
self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found, ))
return merge_dicts(
self._extract_kvs(url, webpage, video_id),
info_dict)
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')

View File

@@ -0,0 +1,273 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
join_nonempty,
merge_dicts,
parse_duration,
str_or_none,
T,
traverse_obj,
unified_strdate,
unified_timestamp,
urlhandle_detect_ext,
)
class GlobalPlayerBaseIE(InfoExtractor):
def _get_page_props(self, url, video_id):
webpage = self._download_webpage(url, video_id)
return self._search_nextjs_data(webpage, video_id)['props']['pageProps']
def _request_ext(self, url, video_id):
return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests
url, video_id, note='Determining source extension'))
@staticmethod
def _clean_desc(x):
x = clean_html(x)
if x:
x = x.replace('\xa0', ' ')
return x
def _extract_audio(self, episode, series):
return merge_dicts({
'vcodec': 'none',
}, traverse_obj(series, {
'series': 'title',
'series_id': 'id',
'thumbnail': 'imageUrl',
'uploader': 'itunesAuthor', # podcasts only
}), traverse_obj(episode, {
'id': 'id',
'description': ('description', T(self._clean_desc)),
'duration': ('duration', T(parse_duration)),
'thumbnail': 'imageUrl',
'url': 'streamUrl',
'timestamp': (('pubDate', 'startDate'), T(unified_timestamp)),
'title': 'title',
}, get_all=False), rev=True)
class GlobalPlayerLiveIE(GlobalPlayerBaseIE):
_VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+'
_TESTS = [{
'url': 'https://www.globalplayer.com/live/smoothchill/uk/',
'info_dict': {
'id': '2mx1E',
'ext': 'aac',
'display_id': 'smoothchill-uk',
'title': 're:^Smooth Chill.+$',
'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png',
'description': 'Music To Chill To',
# 'live_status': 'is_live',
'is_live': True,
},
}, {
# national station
'url': 'https://www.globalplayer.com/live/heart/uk/',
'info_dict': {
'id': '2mwx4',
'ext': 'aac',
'description': 'turn up the feel good!',
'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
# 'live_status': 'is_live',
'is_live': True,
'title': 're:^Heart UK.+$',
'display_id': 'heart-uk',
},
}, {
# regional variation
'url': 'https://www.globalplayer.com/live/heart/london/',
'info_dict': {
'id': 'AMqg',
'ext': 'aac',
'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png',
'title': 're:^Heart London.+$',
# 'live_status': 'is_live',
'is_live': True,
'display_id': 'heart-london',
'description': 'turn up the feel good!',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
station = self._get_page_props(url, video_id)['station']
stream_url = station['streamUrl']
return merge_dicts({
'id': station['id'],
'display_id': (
join_nonempty('brandSlug', 'slug', from_dict=station)
or station.get('legacyStationPrefix')),
'url': stream_url,
'ext': self._request_ext(stream_url, video_id),
'vcodec': 'none',
'is_live': True,
}, {
'title': self._live_title(traverse_obj(
station, (('name', 'brandName'), T(str_or_none)),
get_all=False)),
}, traverse_obj(station, {
'description': 'tagline',
'thumbnail': 'brandLogo',
}), rev=True)
class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
_VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)'
_TESTS = [{
# "live playlist"
'url': 'https://www.globalplayer.com/playlists/8bLk/',
'info_dict': {
'id': '8bLk',
'ext': 'aac',
# 'live_status': 'is_live',
'is_live': True,
'description': r're:(?s).+\bclassical\b.+\bClassic FM Hall [oO]f Fame\b',
'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
'title': 're:Classic FM Hall of Fame.+$'
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
station = self._get_page_props(url, video_id)['playlistData']
stream_url = station['streamUrl']
return merge_dicts({
'id': video_id,
'url': stream_url,
'ext': self._request_ext(stream_url, video_id),
'vcodec': 'none',
'is_live': True,
}, traverse_obj(station, {
'title': 'title',
'description': ('description', T(self._clean_desc)),
'thumbnail': 'image',
}), rev=True)
class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
_VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])'
_TESTS = [{
# podcast
'url': 'https://www.globalplayer.com/podcasts/42KuaM/',
'playlist_mincount': 5,
'info_dict': {
'id': '42KuaM',
'title': 'Filthy Ritual',
'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
'categories': ['Society & Culture', 'True Crime'],
'uploader': 'Global',
'description': r're:(?s).+\bscam\b.+?\bseries available now\b',
},
}, {
# radio catchup
'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/',
'playlist_mincount': 2,
'info_dict': {
'id': '46vyD7z',
'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
'title': 'Nick Ferrari',
'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
},
}]
def _real_extract(self, url):
video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
props = self._get_page_props(url, video_id)
series = props['podcastInfo'] if podcast else props['catchupInfo']
return merge_dicts({
'_type': 'playlist',
'id': video_id,
'entries': [self._extract_audio(ep, series) for ep in traverse_obj(
series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))],
'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None,
}, traverse_obj(series, {
'description': ('description', T(self._clean_desc)),
'thumbnail': 'imageUrl',
'title': 'title',
'uploader': 'itunesAuthor', # podcasts only
}), rev=True)
class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE):
_VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])'
_TESTS = [{
# podcast
'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/',
'info_dict': {
'id': '7DrfNnE',
'ext': 'mp3',
'title': 'Filthy Ritual - Trailer',
'description': 'md5:1f1562fd0f01b4773b590984f94223e0',
'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
'duration': 225.0,
'timestamp': 1681254900,
'series': 'Filthy Ritual',
'series_id': '42KuaM',
'upload_date': '20230411',
'uploader': 'Global',
},
}, {
# radio catchup
'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/',
'only_matching': True,
# expired: refresh the details with a current show for a full test
'info_dict': {
'id': '2zGq26Vcv1fCWhddC4JAwETXWe',
'ext': 'm4a',
'timestamp': 1682056800,
'series': 'Nick Ferrari',
'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf',
'upload_date': '20230421',
'series_id': '46vyD7z',
'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.',
'title': 'Nick Ferrari',
'duration': 10800.0,
},
}]
def _real_extract(self, url):
video_id, podcast = self._match_valid_url(url).group('id', 'podcast')
props = self._get_page_props(url, video_id)
episode = props['podcastEpisode'] if podcast else props['catchupEpisode']
return self._extract_audio(
episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {})
class GlobalPlayerVideoIE(GlobalPlayerBaseIE):
_VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)'
_TESTS = [{
'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/',
'info_dict': {
'id': '2JsSZ7Gm2uP',
'ext': 'mp4',
'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd',
'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550',
'upload_date': '20230420',
'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
meta = self._get_page_props(url, video_id)['videoData']
return merge_dicts({
'id': video_id,
}, traverse_obj(meta, {
'url': 'url',
'thumbnail': ('image', 'url'),
'title': 'title',
'upload_date': ('publish_date', T(unified_strdate)),
'description': 'description',
}), rev=True)

View File

@@ -0,0 +1,101 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from ..utils import (
int_or_none,
unified_timestamp,
unescapeHTML
)
from .common import InfoExtractor
class HRFernsehenIE(InfoExtractor):
IE_NAME = 'hrfernsehen'
_VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html'
_TESTS = [{
'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html',
'md5': '5c4e0ba94677c516a2f65a84110fc536',
'info_dict': {
'id': '130546',
'ext': 'mp4',
'description': 'Sturmtief Kirsten fegt über Hessen / Die Corona-Pandemie eine Chronologie / '
'Sterbehilfe: Die Lage in Hessen / Miss Hessen leitet zwei eigene Unternehmen / '
'Pop-Up Museum zeigt Schwarze Unterhaltung und Black Music',
'subtitles': {'de': [{
'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt'
}]},
'timestamp': 1598470200,
'upload_date': '20200826',
'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg',
'title': 'hessenschau vom 26.08.2020'
}
}, {
'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html',
'only_matching': True
}]
_GEO_COUNTRIES = ['DE']
def extract_airdate(self, loader_data):
airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate')
if airdate_str is None:
return None
return unified_timestamp(airdate_str)
def extract_formats(self, loader_data):
stream_formats = []
for stream_obj in loader_data["videoResolutionLevels"]:
stream_format = {
'format_id': str(stream_obj['verticalResolution']) + "p",
'height': stream_obj['verticalResolution'],
'url': stream_obj['url'],
}
quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit',
stream_obj['url'])
if quality_information:
stream_format['width'] = int_or_none(quality_information.group(1))
stream_format['height'] = int_or_none(quality_information.group(2))
stream_format['fps'] = int_or_none(quality_information.group(3))
stream_format['tbr'] = int_or_none(quality_information.group(4))
stream_formats.append(stream_format)
self._sort_formats(stream_formats)
return stream_formats
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_meta(
['og:title', 'twitter:title', 'name'], webpage)
description = self._html_search_meta(
['description'], webpage)
loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
loader_data = json.loads(loader_str)
info = {
'id': video_id,
'title': title,
'description': description,
'formats': self.extract_formats(loader_data),
'timestamp': self.extract_airdate(loader_data)
}
if "subtitle" in loader_data:
info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]}
thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()]))
if len(thumbnails) > 0:
info["thumbnails"] = [{"url": t} for t in thumbnails]
return info

View File

@@ -1,19 +1,29 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_filter as filter,
compat_HTTPError,
compat_parse_qs,
compat_urllib_parse_urlparse,
compat_urlparse,
)
from ..utils import (
HEADRequest,
determine_ext,
error_to_compat_str,
extract_attributes,
ExtractorError,
int_or_none,
merge_dicts,
orderedSet,
parse_iso8601,
strip_or_none,
try_get,
traverse_obj,
url_or_none,
urljoin,
)
@@ -22,14 +32,102 @@ class IGNBaseIE(InfoExtractor):
return self._download_json(
'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug)
def _checked_call_api(self, slug):
try:
return self._call_api(slug)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
e.cause.args = e.cause.args or [
e.cause.geturl(), e.cause.getcode(), e.cause.reason]
raise ExtractorError(
'Content not found: expired?', cause=e.cause,
expected=True)
raise
def _extract_video_info(self, video, fatal=True):
video_id = video['videoId']
formats = []
refs = traverse_obj(video, 'refs', expected_type=dict) or {}
m3u8_url = url_or_none(refs.get('m3uUrl'))
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
f4m_url = url_or_none(refs.get('f4mUrl'))
if f4m_url:
formats.extend(self._extract_f4m_formats(
f4m_url, video_id, f4m_id='hds', fatal=False))
for asset in (video.get('assets') or []):
asset_url = url_or_none(asset.get('url'))
if not asset_url:
continue
formats.append({
'url': asset_url,
'tbr': int_or_none(asset.get('bitrate'), 1000),
'fps': int_or_none(asset.get('frame_rate')),
'height': int_or_none(asset.get('height')),
'width': int_or_none(asset.get('width')),
})
mezzanine_url = traverse_obj(
video, ('system', 'mezzanineUrl'), expected_type=url_or_none)
if mezzanine_url:
formats.append({
'ext': determine_ext(mezzanine_url, 'mp4'),
'format_id': 'mezzanine',
'preference': 1,
'url': mezzanine_url,
})
if formats or fatal:
self._sort_formats(formats)
else:
return
thumbnails = traverse_obj(
video, ('thumbnails', Ellipsis, {'url': 'url'}), expected_type=url_or_none)
tags = traverse_obj(
video, ('tags', Ellipsis, 'displayName'),
expected_type=lambda x: x.strip() or None)
metadata = traverse_obj(video, 'metadata', expected_type=dict) or {}
title = traverse_obj(
metadata, 'longTitle', 'title', 'name',
expected_type=lambda x: x.strip() or None)
return {
'id': video_id,
'title': title,
'description': strip_or_none(metadata.get('description')),
'timestamp': parse_iso8601(metadata.get('publishDate')),
'duration': int_or_none(metadata.get('duration')),
'thumbnails': thumbnails,
'formats': formats,
'tags': tags,
}
# yt-dlp shim
@classmethod
def _extract_from_webpage(cls, url, webpage):
for embed_url in orderedSet(
cls._extract_embed_urls(url, webpage) or [], lazy=True):
yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
class IGNIE(IGNBaseIE):
"""
Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
Some videos of it.ign.com are also supported
"""
_VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)'
_VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>.+?)'
_PLAYLIST_PATH_RE = r'(?:/?\?(?P<filt>[^&#]+))?'
_VALID_URL = (
r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)'
% '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE)))
IE_NAME = 'ign.com'
_PAGE_TYPE = 'video'
@@ -44,7 +142,10 @@ class IGNIE(IGNBaseIE):
'timestamp': 1370440800,
'upload_date': '20130605',
'tags': 'count:9',
}
},
'params': {
'nocheckcertificate': True,
},
}, {
'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
'md5': 'f1581a6fe8c5121be5b807684aeac3f6',
@@ -56,86 +157,51 @@ class IGNIE(IGNBaseIE):
'timestamp': 1420571160,
'upload_date': '20150106',
'tags': 'count:4',
}
},
'skip': '404 Not Found',
}, {
'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix',
'only_matching': True,
}]
@classmethod
def _extract_embed_urls(cls, url, webpage):
grids = re.findall(
r'''(?s)<section\b[^>]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)</section[^>]*>''',
webpage)
return filter(None,
(urljoin(url, m.group('path')) for m in re.finditer(
r'''<a\b[^>]+\bhref\s*=\s*('|")(?P<path>/videos%s)\1'''
% cls._VIDEO_PATH_RE, grids[0] if grids else '')))
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
display_id = m.group('id')
if display_id:
return self._extract_video(url, display_id)
display_id = m.group('filt') or 'all'
return self._extract_playlist(url, display_id)
def _extract_playlist(self, url, display_id):
webpage = self._download_webpage(url, display_id)
return self.playlist_result(
(self.url_result(u, ie=self.ie_key())
for u in self._extract_embed_urls(url, webpage)),
playlist_id=display_id)
def _extract_video(self, url, display_id):
display_id = self._match_id(url)
video = self._call_api(display_id)
video_id = video['videoId']
metadata = video['metadata']
title = metadata.get('longTitle') or metadata.get('title') or metadata['name']
video = self._checked_call_api(display_id)
formats = []
refs = video.get('refs') or {}
info = self._extract_video_info(video)
m3u8_url = refs.get('m3uUrl')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
f4m_url = refs.get('f4mUrl')
if f4m_url:
formats.extend(self._extract_f4m_formats(
f4m_url, video_id, f4m_id='hds', fatal=False))
for asset in (video.get('assets') or []):
asset_url = asset.get('url')
if not asset_url:
continue
formats.append({
'url': asset_url,
'tbr': int_or_none(asset.get('bitrate'), 1000),
'fps': int_or_none(asset.get('frame_rate')),
'height': int_or_none(asset.get('height')),
'width': int_or_none(asset.get('width')),
})
mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl'])
if mezzanine_url:
formats.append({
'ext': determine_ext(mezzanine_url, 'mp4'),
'format_id': 'mezzanine',
'preference': 1,
'url': mezzanine_url,
})
self._sort_formats(formats)
thumbnails = []
for thumbnail in (video.get('thumbnails') or []):
thumbnail_url = thumbnail.get('url')
if not thumbnail_url:
continue
thumbnails.append({
'url': thumbnail_url,
})
tags = []
for tag in (video.get('tags') or []):
display_name = tag.get('displayName')
if not display_name:
continue
tags.append(display_name)
return {
'id': video_id,
'title': title,
'description': strip_or_none(metadata.get('description')),
'timestamp': parse_iso8601(metadata.get('publishDate')),
'duration': int_or_none(metadata.get('duration')),
return merge_dicts({
'display_id': display_id,
'thumbnails': thumbnails,
'formats': formats,
'tags': tags,
}
}, info)
class IGNVideoIE(InfoExtractor):
class IGNVideoIE(IGNBaseIE):
_VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/'
_TESTS = [{
'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s',
@@ -147,7 +213,8 @@ class IGNVideoIE(InfoExtractor):
'description': 'Taking out assassination targets in Hitman has never been more stylish.',
'timestamp': 1444665600,
'upload_date': '20151012',
}
},
'expected_warnings': ['HTTP Error 400: Bad Request'],
}, {
'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
'only_matching': True,
@@ -167,22 +234,38 @@ class IGNVideoIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
req = HEADRequest(url.rsplit('/', 1)[0] + '/embed')
url = self._request_webpage(req, video_id).geturl()
parsed_url = compat_urlparse.urlparse(url)
embed_url = compat_urlparse.urlunparse(
parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed'))
webpage, urlh = self._download_webpage_handle(embed_url, video_id)
new_url = urlh.geturl()
ign_url = compat_parse_qs(
compat_urllib_parse_urlparse(url).query).get('url', [None])[0]
compat_urlparse.urlparse(new_url).query).get('url', [None])[-1]
if ign_url:
return self.url_result(ign_url, IGNIE.ie_key())
return self.url_result(url)
video = self._search_regex(r'(<div\b[^>]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False)
if not video:
if new_url == url:
raise ExtractorError('Redirect loop: ' + url)
return self.url_result(new_url)
video = extract_attributes(video)
video_data = video.get('data-settings') or '{}'
video_data = self._parse_json(video_data, video_id)['video']
info = self._extract_video_info(video_data)
return merge_dicts({
'display_id': video_id,
}, info)
class IGNArticleIE(IGNBaseIE):
_VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)'
_VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P<id>[^/?&#]+)'
_PAGE_TYPE = 'article'
_TESTS = [{
'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
'info_dict': {
'id': '524497489e4e8ff5848ece34',
'id': '72113',
'title': '100 Little Things in GTA 5 That Will Blow Your Mind',
},
'playlist': [
@@ -190,7 +273,7 @@ class IGNArticleIE(IGNBaseIE):
'info_dict': {
'id': '5ebbd138523268b93c9141af17bec937',
'ext': 'mp4',
'title': 'GTA 5 Video Review',
'title': 'Grand Theft Auto V Video Review',
'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
'timestamp': 1379339880,
'upload_date': '20130916',
@@ -200,7 +283,7 @@ class IGNArticleIE(IGNBaseIE):
'info_dict': {
'id': '638672ee848ae4ff108df2a296418ee2',
'ext': 'mp4',
'title': '26 Twisted Moments from GTA 5 in Slow Motion',
'title': 'GTA 5 In Slow Motion',
'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
'timestamp': 1386878820,
'upload_date': '20131212',
@@ -208,16 +291,17 @@ class IGNArticleIE(IGNBaseIE):
},
],
'params': {
'playlist_items': '2-3',
'skip_download': True,
},
'expected_warnings': ['Backend fetch failed'],
}, {
'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
'info_dict': {
'id': '53ee806780a81ec46e0790f8',
'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
},
'playlist_count': 2,
'playlist_count': 1,
'expected_warnings': ['Backend fetch failed'],
}, {
# videoId pattern
'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
@@ -240,18 +324,91 @@ class IGNArticleIE(IGNBaseIE):
'only_matching': True,
}]
def _checked_call_api(self, slug):
try:
return self._call_api(slug)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError):
e.cause.args = e.cause.args or [
e.cause.geturl(), e.cause.getcode(), e.cause.reason]
if e.cause.code == 404:
raise ExtractorError(
'Content not found: expired?', cause=e.cause,
expected=True)
elif e.cause.code == 503:
self.report_warning(error_to_compat_str(e.cause))
return
raise
def _search_nextjs_data(self, webpage, video_id, **kw):
return self._parse_json(
self._search_regex(
r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
webpage, 'next.js data', **kw),
video_id, **kw)
def _real_extract(self, url):
display_id = self._match_id(url)
article = self._call_api(display_id)
article = self._checked_call_api(display_id)
def entries():
media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url'])
if media_url:
yield self.url_result(media_url, IGNIE.ie_key())
for content in (article.get('content') or []):
for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content):
yield self.url_result(video_url)
if article:
# obsolete ?
def entries():
media_url = traverse_obj(
article, ('mediaRelations', 0, 'media', 'metadata', 'url'),
expected_type=url_or_none)
if media_url:
yield self.url_result(media_url, IGNIE.ie_key())
for content in (article.get('content') or []):
for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content):
if url_or_none(video_url):
yield self.url_result(video_url)
return self.playlist_result(
entries(), article.get('articleId'),
traverse_obj(
article, ('metadata', 'headline'),
expected_type=lambda x: x.strip() or None))
webpage = self._download_webpage(url, display_id)
playlist_id = self._html_search_meta('dable:item_id', webpage, default=None)
if playlist_id:
def entries():
for m in re.finditer(
r'''(?s)<object\b[^>]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P<params>.+?)</object''',
webpage):
flashvars = self._search_regex(
r'''(<param\b[^>]+\bname\s*=\s*("|')flashvars\2[^>]*>)''',
m.group('params'), 'flashvars', default='')
flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '')
v_url = url_or_none((flashvars.get('url') or [None])[-1])
if v_url:
yield self.url_result(v_url)
else:
playlist_id = self._search_regex(
r'''\bdata-post-id\s*=\s*("|')(?P<id>[\da-f]+)\1''',
webpage, 'id', group='id', default=None)
nextjs_data = self._search_nextjs_data(webpage, display_id)
def entries():
for player in traverse_obj(
nextjs_data,
('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')):
# skip promo links (which may not always be served, eg GH CI servers)
if traverse_obj(nextjs_data,
('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')),
expected_type=dict):
continue
video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {}
info = self._extract_video_info(video, fatal=False)
if info:
yield merge_dicts({
'display_id': display_id,
}, info)
return self.playlist_result(
entries(), article.get('articleId'),
strip_or_none(try_get(article, lambda x: x['metadata']['headline'])))
entries(), playlist_id or display_id,
re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None)

View File

@@ -1,101 +1,267 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
int_or_none,
js_to_json,
merge_dicts,
mimetype2ext,
ExtractorError,
parse_iso8601,
T,
traverse_obj,
txt_or_none,
url_or_none,
)
class ImgurIE(InfoExtractor):
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P<id>[a-zA-Z0-9]+)'
class ImgurBaseIE(InfoExtractor):
# hard-coded value, as also used by ArchiveTeam
_CLIENT_ID = '546c25a59c58ad7'
@classmethod
def _imgur_result(cls, item_id):
return cls.url_result('imgur:%s' % item_id, ImgurIE.ie_key(), item_id)
def _call_api(self, endpoint, video_id, **kwargs):
return self._download_json(
'https://api.imgur.com/post/v1/%s/%s?client_id=%s&include=media,account' % (endpoint, video_id, self._CLIENT_ID),
video_id, **kwargs)
@staticmethod
def get_description(s):
if 'Discover the magic of the internet at Imgur' in s:
return None
return txt_or_none(s)
class ImgurIE(ImgurBaseIE):
_VALID_URL = r'''(?x)
(?:
https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|t|topic|r)/)|
imgur:
)(?P<id>[a-zA-Z0-9]+)
'''
_TESTS = [{
'url': 'https://i.imgur.com/A61SaA1.gifv',
'url': 'https://imgur.com/A61SaA1',
'info_dict': {
'id': 'A61SaA1',
'ext': 'mp4',
'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
'timestamp': 1416446068,
'upload_date': '20141120',
},
}, {
'url': 'https://imgur.com/A61SaA1',
'url': 'https://i.imgur.com/A61SaA1.gifv',
'only_matching': True,
}, {
'url': 'https://i.imgur.com/crGpqCV.mp4',
'only_matching': True,
}, {
# no title
# previously, no title
'url': 'https://i.imgur.com/jxBXAMC.gifv',
'only_matching': True,
'info_dict': {
'id': 'jxBXAMC',
'ext': 'mp4',
'title': 'Fahaka puffer feeding',
'timestamp': 1533835503,
'upload_date': '20180809',
},
}]
def _extract_twitter_formats(self, html, tw_id='twitter', **kwargs):
fatal = kwargs.pop('fatal', False)
tw_stream = self._html_search_meta('twitter:player:stream', html, fatal=fatal, **kwargs)
if not tw_stream:
return []
ext = mimetype2ext(self._html_search_meta(
'twitter:player:stream:content_type', html, default=None))
width, height = (int_or_none(self._html_search_meta('twitter:player:' + v, html, default=None))
for v in ('width', 'height'))
return [{
'format_id': tw_id,
'url': tw_stream,
'ext': ext or determine_ext(tw_stream),
'width': width,
'height': height,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._call_api('media', video_id, fatal=False, expected_status=404)
webpage = self._download_webpage(
'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id)
'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id, fatal=not data) or ''
width = int_or_none(self._og_search_property(
'video:width', webpage, default=None))
height = int_or_none(self._og_search_property(
'video:height', webpage, default=None))
if not traverse_obj(data, ('media', 0, (
('type', T(lambda t: t == 'video' or None)),
('metadata', 'is_animated'))), get_all=False):
raise ExtractorError(
'%s is not a video or animated image' % video_id,
expected=True)
media_fmt = traverse_obj(data, ('media', 0, {
'url': ('url', T(url_or_none)),
'ext': 'ext',
'width': ('width', T(int_or_none)),
'height': ('height', T(int_or_none)),
'filesize': ('size', T(int_or_none)),
'acodec': ('metadata', 'has_sound', T(lambda b: None if b else 'none')),
}))
media_url = traverse_obj(media_fmt, 'url')
if media_url:
if not media_fmt.get('ext'):
media_fmt['ext'] = mimetype2ext(traverse_obj(
data, ('media', 0, 'mime_type'))) or determine_ext(media_url)
if traverse_obj(data, ('media', 0, 'type')) == 'image':
media_fmt['acodec'] = 'none'
media_fmt.setdefault('preference', -10)
tw_formats = self._extract_twitter_formats(webpage)
if traverse_obj(tw_formats, (0, 'url')) == media_url:
tw_formats = []
else:
# maybe this isn't an animated image/video?
self._check_formats(tw_formats, video_id)
video_elements = self._search_regex(
r'(?s)<div class="video-elements">(.*?)</div>',
webpage, 'video elements', default=None)
if not video_elements:
if not (video_elements or tw_formats or media_url):
raise ExtractorError(
'No sources found for video %s. Maybe an image?' % video_id,
'No sources found for video %s. Maybe a plain image?' % video_id,
expected=True)
formats = []
for m in re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements):
formats.append({
'format_id': m.group('type').partition('/')[2],
'url': self._proto_relative_url(m.group('src')),
'ext': mimetype2ext(m.group('type')),
'width': width,
'height': height,
def mung_format(fmt, *extra):
fmt.update({
'http_headers': {
'User-Agent': 'youtube-dl (like wget)',
},
})
for d in extra:
fmt.update(d)
return fmt
gif_json = self._search_regex(
r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
webpage, 'GIF code', fatal=False)
if gif_json:
gifd = self._parse_json(
gif_json, video_id, transform_source=js_to_json)
formats.append({
'format_id': 'gif',
'preference': -10,
'width': width,
'height': height,
'ext': 'gif',
'acodec': 'none',
'vcodec': 'gif',
'container': 'gif',
'url': self._proto_relative_url(gifd['gifUrl']),
'filesize': gifd.get('size'),
'http_headers': {
'User-Agent': 'youtube-dl (like wget)',
},
})
if video_elements:
def og_get_size(media_type):
return dict((p, int_or_none(self._og_search_property(
':'.join((media_type, p)), webpage, default=None)))
for p in ('width', 'height'))
size = og_get_size('video')
if all(v is None for v in size.values()):
size = og_get_size('image')
formats = traverse_obj(
re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements),
(Ellipsis, {
'format_id': ('type', T(lambda s: s.partition('/')[2])),
'url': ('src', T(self._proto_relative_url)),
'ext': ('type', T(mimetype2ext)),
}, T(lambda f: mung_format(f, size))))
gif_json = self._search_regex(
r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
webpage, 'GIF code', fatal=False)
MUST_BRANCH = (None, T(lambda _: None))
formats.extend(traverse_obj(gif_json, (
T(lambda j: self._parse_json(
j, video_id, transform_source=js_to_json, fatal=False)), {
'url': ('gifUrl', T(self._proto_relative_url)),
'filesize': ('size', T(int_or_none)),
}, T(lambda f: mung_format(f, size, {
'format_id': 'gif',
'preference': -10, # gifs are worse than videos
'ext': 'gif',
'acodec': 'none',
'vcodec': 'gif',
'container': 'gif',
})), MUST_BRANCH)))
else:
formats = []
# maybe add formats from JSON or page Twitter metadata
if not any((u == media_url) for u in traverse_obj(formats, (Ellipsis, 'url'))):
formats.append(mung_format(media_fmt))
tw_url = traverse_obj(tw_formats, (0, 'url'))
if not any((u == tw_url) for u in traverse_obj(formats, (Ellipsis, 'url'))):
formats.extend(mung_format(f) for f in tw_formats)
self._sort_formats(formats)
return {
return merge_dicts(traverse_obj(data, {
'uploader_id': ('account_id', T(txt_or_none),
T(lambda a: a if int_or_none(a) != 0 else None)),
'uploader': ('account', 'username', T(txt_or_none)),
'uploader_url': ('account', 'avatar_url', T(url_or_none)),
'like_count': ('upvote_count', T(int_or_none)),
'dislike_count': ('downvote_count', T(int_or_none)),
'comment_count': ('comment_count', T(int_or_none)),
'age_limit': ('is_mature', T(lambda x: 18 if x else None)),
'timestamp': (('updated_at', 'created_at'), T(parse_iso8601)),
'release_timestamp': ('created_at', T(parse_iso8601)),
}, get_all=False), traverse_obj(data, ('media', 0, 'metadata', {
'title': ('title', T(txt_or_none)),
'description': ('description', T(self.get_description)),
'duration': ('duration', T(float_or_none)),
'timestamp': (('updated_at', 'created_at'), T(parse_iso8601)),
'release_timestamp': ('created_at', T(parse_iso8601)),
})), {
'id': video_id,
'formats': formats,
'title': self._og_search_title(webpage, default=video_id),
}
'title': self._og_search_title(webpage, default='Imgur video ' + video_id),
'description': self.get_description(self._og_search_description(webpage)),
'thumbnail': url_or_none(self._html_search_meta('thumbnailUrl', webpage, default=None)),
})
class ImgurGalleryIE(InfoExtractor):
class ImgurGalleryBaseIE(ImgurBaseIE):
_GALLERY = True
def _real_extract(self, url):
gallery_id = self._match_id(url)
data = self._call_api('albums', gallery_id, fatal=False, expected_status=404)
info = traverse_obj(data, {
'title': ('title', T(txt_or_none)),
'description': ('description', T(self.get_description)),
})
if traverse_obj(data, 'is_album'):
def yield_media_ids():
for m_id in traverse_obj(data, (
'media', lambda _, v: v.get('type') == 'video' or v['metadata']['is_animated'],
'id', T(txt_or_none))):
yield m_id
# if a gallery with exactly one video, apply album metadata to video
media_id = (
self._GALLERY
and traverse_obj(data, ('image_count', T(lambda c: c == 1)))
and next(yield_media_ids(), None))
if not media_id:
result = self.playlist_result(
map(self._imgur_result, yield_media_ids()), gallery_id)
result.update(info)
return result
gallery_id = media_id
result = self._imgur_result(gallery_id)
info['_type'] = 'url_transparent'
result.update(info)
return result
class ImgurGalleryIE(ImgurGalleryBaseIE):
IE_NAME = 'imgur:gallery'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P<id>[a-zA-Z0-9]+)'
@@ -106,49 +272,93 @@ class ImgurGalleryIE(InfoExtractor):
'title': 'Adding faces make every GIF better',
},
'playlist_count': 25,
'skip': 'Zoinks! You\'ve taken a wrong turn.',
}, {
# TODO: static images - replace with animated/video gallery
'url': 'http://imgur.com/topic/Aww/ll5Vk',
'only_matching': True,
}, {
'url': 'https://imgur.com/gallery/YcAQlkx',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'YcAQlkx',
'ext': 'mp4',
'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....',
}
'timestamp': 1358554297,
'upload_date': '20130119',
'uploader_id': '1648642',
'uploader': 'wittyusernamehere',
},
}, {
# TODO: static image - replace with animated/video gallery
'url': 'http://imgur.com/topic/Funny/N8rOudd',
'only_matching': True,
}, {
'url': 'http://imgur.com/r/aww/VQcQPhM',
'only_matching': True,
'add_ies': ['Imgur'],
'info_dict': {
'id': 'VQcQPhM',
'ext': 'mp4',
'title': 'The boss is here',
'timestamp': 1476494751,
'upload_date': '20161015',
'uploader_id': '19138530',
'uploader': 'thematrixcam',
},
},
# from PR #16674
{
'url': 'https://imgur.com/t/unmuted/6lAn9VQ',
'info_dict': {
'id': '6lAn9VQ',
'title': 'Penguins !',
},
'playlist_count': 3,
}, {
'url': 'https://imgur.com/t/unmuted/kx2uD3C',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'ZVMv45i',
'ext': 'mp4',
'title': 'Intruder',
'timestamp': 1528129683,
'upload_date': '20180604',
},
}, {
'url': 'https://imgur.com/t/unmuted/wXSK0YH',
'add_ies': ['Imgur'],
'info_dict': {
'id': 'JCAP4io',
'ext': 'mp4',
'title': 're:I got the blues$',
'description': 'Lukas vocal stylings.\n\nFP edit: dont encourage me. Ill never stop posting Luka and friends.',
'timestamp': 1527809525,
'upload_date': '20180531',
},
}]
def _real_extract(self, url):
gallery_id = self._match_id(url)
data = self._download_json(
'https://imgur.com/gallery/%s.json' % gallery_id,
gallery_id)['data']['image']
if data.get('is_album'):
entries = [
self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash'])
for image in data['album_images']['images'] if image.get('hash')]
return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description'))
return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id)
class ImgurAlbumIE(ImgurGalleryIE):
class ImgurAlbumIE(ImgurGalleryBaseIE):
IE_NAME = 'imgur:album'
_VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)'
_GALLERY = False
_TESTS = [{
# TODO: only static images - replace with animated/video gallery
'url': 'http://imgur.com/a/j6Orj',
'only_matching': True,
},
# from PR #21693
{
'url': 'https://imgur.com/a/iX265HX',
'info_dict': {
'id': 'j6Orj',
'title': 'A Literary Analysis of "Star Wars: The Force Awakens"',
'id': 'iX265HX',
'title': 'enen-no-shouboutai'
},
'playlist_count': 12,
'playlist_count': 2,
}, {
'url': 'https://imgur.com/a/8pih2Ed',
'info_dict': {
'id': '8pih2Ed'
},
'playlist_mincount': 1,
}]

View File

@@ -1,6 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
from ..utils import (
ExtractorError,
)
from ..compat import (
compat_b64decode,
@@ -90,7 +93,11 @@ class InfoQIE(BokeCCBaseIE):
}]
def _extract_http_audio(self, webpage, video_id):
fields = self._form_hidden_inputs('mp3Form', webpage)
try:
fields = self._form_hidden_inputs('mp3Form', webpage)
except ExtractorError:
fields = {}
http_audio_url = fields.get('filename')
if not http_audio_url:
return []

View File

@@ -3,123 +3,266 @@ from __future__ import unicode_literals
import json
import re
import sys
from .common import InfoExtractor
from .brightcove import BrightcoveNewIE
from ..compat import (
compat_HTTPError,
compat_integer_types,
compat_kwargs,
compat_urlparse,
)
from ..utils import (
clean_html,
determine_ext,
error_to_compat_str,
extract_attributes,
get_element_by_class,
JSON_LD_RE,
ExtractorError,
get_element_by_attribute,
int_or_none,
merge_dicts,
parse_duration,
parse_iso8601,
remove_start,
smuggle_url,
strip_or_none,
traverse_obj,
url_or_none,
urljoin,
)
class ITVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
_GEO_COUNTRIES = ['GB']
class ITVBaseIE(InfoExtractor):
def _search_nextjs_data(self, webpage, video_id, **kw):
transform_source = kw.pop('transform_source', None)
fatal = kw.pop('fatal', True)
return self._parse_json(
self._search_regex(
r'''<script\b[^>]+\bid=('|")__NEXT_DATA__\1[^>]*>(?P<js>[^<]+)</script>''',
webpage, 'next.js data', group='js', fatal=fatal, **kw),
video_id, transform_source=transform_source, fatal=fatal)
def __handle_request_webpage_error(self, err, video_id=None, errnote=None, fatal=True):
if errnote is False:
return False
if errnote is None:
errnote = 'Unable to download webpage'
errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
if fatal:
raise ExtractorError(errmsg, sys.exc_info()[2], cause=err, video_id=video_id)
else:
self._downloader.report_warning(errmsg)
return False
@staticmethod
def _vanilla_ua_header():
return {'User-Agent': 'Mozilla/5.0'}
def _download_webpage_handle(self, url, video_id, *args, **kwargs):
# specialised to (a) use vanilla UA (b) detect geo-block
params = self._downloader.params
nkwargs = {}
if (
'user_agent' not in params
and not any(re.match(r'(?i)user-agent\s*:', h)
for h in (params.get('headers') or []))
and 'User-Agent' not in (kwargs.get('headers') or {})):
kwargs.setdefault('headers', {})
kwargs['headers'] = self._vanilla_ua_header()
nkwargs = kwargs
if kwargs.get('expected_status') is not None:
exp = kwargs['expected_status']
if isinstance(exp, compat_integer_types):
exp = [exp]
if isinstance(exp, (list, tuple)) and 403 not in exp:
kwargs['expected_status'] = [403]
kwargs['expected_status'].extend(exp)
nkwargs = kwargs
else:
kwargs['expected_status'] = 403
nkwargs = kwargs
if nkwargs:
kwargs = compat_kwargs(kwargs)
ret = super(ITVBaseIE, self)._download_webpage_handle(url, video_id, *args, **kwargs)
if ret is False:
return ret
webpage, urlh = ret
if urlh.getcode() == 403:
# geo-block error is like this, with an unnecessary 'Of':
# '{\n "Message" : "Request Originated Outside Of Allowed Geographic Region",\
# \n "TransactionId" : "oas-magni-475082-xbYF0W"\n}'
if '"Request Originated Outside Of Allowed Geographic Region"' in webpage:
self.raise_geo_restricted(countries=['GB'])
ret = self.__handle_request_webpage_error(
compat_HTTPError(urlh.geturl(), 403, 'HTTP Error 403: Forbidden', urlh.headers, urlh),
fatal=kwargs.get('fatal'))
return ret
class ITVIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?:(?P<w>watch)|hub)/[^/]+/(?(w)[\w-]+/)(?P<id>\w+)'
_IE_DESC = 'ITVX'
_TESTS = [{
'note': 'Hub URLs redirect to ITVX',
'url': 'https://www.itv.com/hub/liar/2a4547a0012',
'info_dict': {
'id': '2a4547a0012',
'ext': 'mp4',
'title': 'Liar - Series 2 - Episode 6',
'description': 'md5:d0f91536569dec79ea184f0a44cca089',
'series': 'Liar',
'season_number': 2,
'episode_number': 6,
},
'params': {
# m3u8 download
'skip_download': True,
},
'only_matching': True,
}, {
# unavailable via data-playlist-url
'note': 'Hub page unavailable via data-playlist-url (404 now)',
'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033',
'only_matching': True,
}, {
# InvalidVodcrid
'note': 'Hub page with InvalidVodcrid (404 now)',
'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034',
'only_matching': True,
}, {
# ContentUnavailable
'note': 'Hub page with ContentUnavailable (404 now)',
'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024',
'only_matching': True,
}]
}, {
'note': 'ITVX, or itvX, show',
'url': 'https://www.itv.com/watch/vera/1a7314/1a7314a0014',
'md5': 'bd0ad666b2c058fffe7d036785880064',
'info_dict': {
'id': '1a7314a0014',
'ext': 'mp4',
'title': 'Vera - Series 3 - Episode 4 - Prodigal Son',
'description': 'Vera and her team investigate the fatal stabbing of an ex-Met police officer outside a busy Newcastle nightclub - but there aren\'t many clues.',
'timestamp': 1653591600,
'upload_date': '20220526',
'uploader': 'ITVX',
'thumbnail': r're:https://\w+\.itv\.com/images/(?:\w+/)+\d+x\d+\?',
'duration': 5340.8,
'age_limit': 16,
'series': 'Vera',
'series_number': 3,
'episode': 'Prodigal Son',
'episode_number': 4,
'channel': 'ITV3',
'categories': list,
},
'params': {
# m3u8 download
# 'skip_download': True,
},
'skip': 'only available in UK',
}, {
'note': 'Latest ITV news bulletin: details change daily',
'url': 'https://www.itv.com/watch/news/varies-but-is-not-checked/6js5d0f',
'info_dict': {
'id': '6js5d0f',
'ext': 'mp4',
'title': r're:The latest ITV News headlines - \S.+',
'description': r'''re:.* today's top stories from the ITV News team.$''',
'timestamp': int,
'upload_date': r're:2\d\d\d(?:0[1-9]|1[0-2])(?:[012][1-9]|3[01])',
'uploader': 'ITVX',
'thumbnail': r're:https://images\.ctfassets\.net/(?:\w+/)+[\w.]+\.(?:jpg|png)',
'duration': float,
'age_limit': None,
},
'params': {
# variable download
# 'skip_download': True,
},
'skip': 'only available in UK',
}
]
def _og_extract(self, webpage, require_title=False):
return {
'title': self._og_search_title(webpage, fatal=require_title),
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'uploader': self._og_search_property('site_name', webpage, default=None),
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
params = extract_attributes(self._search_regex(
r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
ios_playlist_url = params.get('data-video-playlist') or params['data-video-id']
hmac = params['data-video-hmac']
webpage = self._download_webpage(url, video_id)
# now quite different params!
params = extract_attributes(self._search_regex(
r'''(<[^>]+\b(?:class|data-testid)\s*=\s*("|')genie-container\2[^>]*>)''',
webpage, 'params'))
ios_playlist_url = traverse_obj(
params, 'data-video-id', 'data-video-playlist',
get_all=False, expected_type=url_or_none)
headers = self.geo_verification_headers()
headers.update({
'Accept': 'application/vnd.itv.vod.playlist.v2+json',
'Content-Type': 'application/json',
'hmac': hmac.upper(),
})
ios_playlist = self._download_json(
ios_playlist_url, video_id, data=json.dumps({
'user': {
'itvUserId': '',
'entitlements': [],
'token': ''
},
'device': {
'manufacturer': 'Safari',
'model': '5',
'manufacturer': 'Mobile Safari',
'model': '5.1',
'os': {
'name': 'Windows NT',
'version': '6.1',
'type': 'desktop'
'name': 'iOS',
'version': '5.0',
'type': ' mobile'
}
},
'client': {
'version': '4.1',
'id': 'browser'
'id': 'browser',
'supportsAdPods': True,
'service': 'itv.x',
'appversion': '2.43.28',
},
'variantAvailability': {
'player': 'hls',
'featureset': {
'min': ['hls', 'aes', 'outband-webvtt'],
'max': ['hls', 'aes', 'outband-webvtt']
},
'platformTag': 'dotcom'
'platformTag': 'mobile'
}
}).encode(), headers=headers)
video_data = ios_playlist['Playlist']['Video']
ios_base_url = video_data.get('Base')
ios_base_url = traverse_obj(video_data, 'Base', expected_type=url_or_none)
media_url = (
(lambda u: url_or_none(urljoin(ios_base_url, u)))
if ios_base_url else url_or_none)
formats = []
for media_file in (video_data.get('MediaFiles') or []):
href = media_file.get('Href')
for media_file in traverse_obj(video_data, 'MediaFiles', expected_type=list) or []:
href = traverse_obj(media_file, 'Href', expected_type=media_url)
if not href:
continue
if ios_base_url:
href = ios_base_url + href
ext = determine_ext(href)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
href, video_id, 'mp4', entry_protocol='m3u8_native',
href, video_id, 'mp4', entry_protocol='m3u8',
m3u8_id='hls', fatal=False))
else:
formats.append({
'url': href,
})
self._sort_formats(formats)
for f in formats:
f.setdefault('http_headers', {})
f['http_headers'].update(self._vanilla_ua_header())
subtitles = {}
subs = video_data.get('Subtitles') or []
for sub in subs:
if not isinstance(sub, dict):
continue
href = url_or_none(sub.get('Href'))
for sub in traverse_obj(video_data, 'Subtitles', expected_type=list) or []:
href = traverse_obj(sub, 'Href', expected_type=url_or_none)
if not href:
continue
subtitles.setdefault('en', []).append({
@@ -127,59 +270,132 @@ class ITVIE(InfoExtractor):
'ext': determine_ext(href, 'vtt'),
})
info = self._search_json_ld(webpage, video_id, default={})
if not info:
json_ld = self._parse_json(self._search_regex(
JSON_LD_RE, webpage, 'JSON-LD', '{}',
group='json_ld'), video_id, fatal=False)
if json_ld and json_ld.get('@type') == 'BreadcrumbList':
for ile in (json_ld.get('itemListElement:') or []):
item = ile.get('item:') or {}
if item.get('@type') == 'TVEpisode':
item['@context'] = 'http://schema.org'
info = self._json_ld(item, video_id, fatal=False) or {}
break
next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default='{}')
video_data.update(traverse_obj(next_data, ('props', 'pageProps', ('title', 'episode')), expected_type=dict)[0] or {})
title = traverse_obj(video_data, 'headerTitle', 'episodeTitle')
info = self._og_extract(webpage, require_title=not title)
tn = info.pop('thumbnail', None)
if tn:
info['thumbnails'] = [{'url': tn}]
# num. episode title
num_ep_title = video_data.get('numberedEpisodeTitle')
if not num_ep_title:
num_ep_title = clean_html(get_element_by_attribute('data-testid', 'episode-hero-description-strong', webpage))
num_ep_title = num_ep_title and num_ep_title.rstrip(' -')
ep_title = strip_or_none(
video_data.get('episodeTitle')
or (num_ep_title.split('.', 1)[-1] if num_ep_title else None))
title = title or re.sub(r'\s+-\s+ITVX$', '', info['title'])
if ep_title and ep_title != title:
title = title + ' - ' + ep_title
def get_thumbnails():
tns = []
for w, x in (traverse_obj(video_data, ('imagePresets'), expected_type=dict) or {}).items():
if isinstance(x, dict):
for y, z in x.items():
tns.append({'id': w + '_' + y, 'url': z})
return tns or None
video_str = lambda *x: traverse_obj(
video_data, *x, get_all=False, expected_type=strip_or_none)
return merge_dicts({
'id': video_id,
'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
'title': title,
'formats': formats,
'subtitles': subtitles,
'duration': parse_duration(video_data.get('Duration')),
'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)),
# parsing hh:mm:ss:nnn not yet patched
'duration': parse_duration(re.sub(r'(\d{2})(:)(\d{3}$)', r'\1.\3', video_data.get('Duration') or '')),
'description': video_str('synopsis'),
'timestamp': traverse_obj(video_data, 'broadcastDateTime', 'dateTime', expected_type=parse_iso8601),
'thumbnails': get_thumbnails(),
'series': video_str('showTitle', 'programmeTitle'),
'series_number': int_or_none(video_data.get('seriesNumber')),
'episode': ep_title,
'episode_number': int_or_none((num_ep_title or '').split('.')[0]),
'channel': video_str('channel'),
'categories': traverse_obj(video_data, ('categories', 'formatted'), expected_type=list),
'age_limit': {False: 16, True: 0}.get(video_data.get('isChildrenCategory')),
}, info)
class ITVBTCCIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TEST = {
'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
class ITVBTCCIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?!(?:watch|hub)/)(?:[^/]+/)+(?P<id>[^/?#&]+)'
_IE_DESC = 'ITV articles: News, British Touring Car Championship'
_TESTS = [{
'note': 'British Touring Car Championship',
'url': 'https://www.itv.com/btcc/articles/btcc-2018-all-the-action-from-brands-hatch',
'info_dict': {
'id': 'btcc-2018-all-the-action-from-brands-hatch',
'title': 'BTCC 2018: All the action from Brands Hatch',
},
'playlist_mincount': 9,
}
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'
}, {
'note': 'redirects to /btcc/articles/...',
'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
'only_matching': True,
}, {
'note': 'news article',
'url': 'https://www.itv.com/news/wales/2020-07-23/sean-fletcher-shows-off-wales-coastline-in-new-itv-series-as-british-tourists-opt-for-staycations',
'info_dict': {
'id': 'sean-fletcher-shows-off-wales-coastline-in-new-itv-series-as-british-tourists-opt-for-staycations',
'title': '''Sean Fletcher on why Wales' coastline should be your 'staycation' destination | ITV News''',
},
'playlist_mincount': 1,
}]
# should really be a class var of the BC IE
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
BRIGHTCOVE_ACCOUNT = '1582188683001'
BRIGHTCOVE_PLAYER = 'HkiHLnNRx'
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
webpage, urlh = self._download_webpage_handle(url, playlist_id)
link = compat_urlparse.urlparse(urlh.geturl()).path.strip('/')
entries = [
self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
# ITV does not like some GB IP ranges, so here are some
# IP blocks it accepts
'geo_ip_blocks': [
'193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21'
],
'referrer': url,
}),
ie=BrightcoveNewIE.ie_key(), video_id=video_id)
for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)]
next_data = self._search_nextjs_data(webpage, playlist_id, fatal=False, default='{}')
path_prefix = compat_urlparse.urlparse(next_data.get('assetPrefix') or '').path.strip('/')
link = remove_start(link, path_prefix).strip('/')
content = traverse_obj(
next_data, ('props', 'pageProps', Ellipsis),
expected_type=lambda x: x if x['link'] == link else None,
get_all=False, default={})
content = traverse_obj(
content, ('body', 'content', Ellipsis, 'data'),
expected_type=lambda x: x if x.get('name') == 'Brightcove' or x.get('type') == 'Brightcove' else None)
contraband = {
# ITV does not like some GB IP ranges, so here are some
# IP blocks it accepts
'geo_ip_blocks': [
'193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21'
],
'referrer': urlh.geturl(),
}
def entries():
for data in content or []:
video_id = data.get('id')
if not video_id:
continue
account = data.get('accountId') or self.BRIGHTCOVE_ACCOUNT
player = data.get('playerId') or self.BRIGHTCOVE_PLAYER
yield self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account, player, video_id), contraband),
ie=BrightcoveNewIE.ie_key(), video_id=video_id)
# obsolete ?
for video_id in re.findall(r'''data-video-id=["'](\d+)''', webpage):
yield self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (self.BRIGHTCOVE_ACCOUNT, self.BRIGHTCOVE_PLAYER, video_id), contraband),
ie=BrightcoveNewIE.ie_key(), video_id=video_id)
title = self._og_search_title(webpage, fatal=False)
return self.playlist_result(entries, playlist_id, title)
return self.playlist_result(entries(), playlist_id, title)

View File

@@ -373,5 +373,5 @@ class KalturaIE(InfoExtractor):
'duration': info.get('duration'),
'timestamp': info.get('createdAt'),
'uploader_id': info.get('userId') if info.get('userId') != 'None' else None,
'view_count': info.get('plays'),
'view_count': int_or_none(info.get('plays')),
}

View File

@@ -0,0 +1,35 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import update_url
class KommunetvIE(InfoExtractor):
_VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P<id>\w+)'
_TEST = {
'url': 'https://oslo.kommunetv.no/archive/921',
'md5': '5f102be308ee759be1e12b63d5da4bbc',
'info_dict': {
'id': '921',
'title': 'Bystyremøte',
'ext': 'mp4'
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
headers = {
'Accept': 'application/json'
}
data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers)
title = data['stream']['title']
file = data['playlist'][0]['playlist'][0]['file']
url = update_url(file, query=None, fragment=None)
formats = self._extract_m3u8_formats(url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
'title': title
}

View File

@@ -0,0 +1,31 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import smuggle_url
class KTHIE(InfoExtractor):
_VALID_URL = r'https?://play\.kth\.se/(?:[^/]+/)+(?P<id>[a-z0-9_]+)'
_TEST = {
'url': 'https://play.kth.se/media/Lunch+breakA+De+nya+aff%C3%A4rerna+inom+Fordonsdalen/0_uoop6oz9',
'md5': 'd83ada6d00ca98b73243a88efe19e8a6',
'info_dict': {
'id': '0_uoop6oz9',
'ext': 'mp4',
'title': 'md5:bd1d6931facb6828762a33e6ce865f37',
'thumbnail': 're:https?://.+/thumbnail/.+',
'duration': 3516,
'timestamp': 1647345358,
'upload_date': '20220315',
'uploader_id': 'md5:0ec23e33a89e795a4512930c8102509f',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
result = self.url_result(
smuggle_url('kaltura:308:%s' % video_id, {
'service_url': 'https://api.kaltura.nordu.net'}),
'Kaltura')
return result

View File

@@ -1,11 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
extract_attributes,
int_or_none,
str_to_int,
url_or_none,
urlencode_postdata,
)
@@ -20,17 +25,20 @@ class ManyVidsIE(InfoExtractor):
'id': '133957',
'ext': 'mp4',
'title': 'everthing about me (Preview)',
'uploader': 'ellyxxix',
'view_count': int,
'like_count': int,
},
}, {
# full video
'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/',
'md5': 'f3e8f7086409e9b470e2643edb96bdcc',
'md5': 'bb47bab0e0802c2a60c24ef079dfe60f',
'info_dict': {
'id': '935718',
'ext': 'mp4',
'title': 'MY FACE REVEAL',
'description': 'md5:ec5901d41808b3746fed90face161612',
'uploader': 'Sarah Calanthe',
'view_count': int,
'like_count': int,
},
@@ -39,17 +47,50 @@ class ManyVidsIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
real_url = 'https://www.manyvids.com/video/%s/gtm.js' % (video_id, )
try:
webpage = self._download_webpage(real_url, video_id)
except Exception:
# probably useless fallback
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P<url>(?:(?!\1).)+)\1',
webpage, 'video URL', group='url')
info = self._search_regex(
r'''(<div\b[^>]*\bid\s*=\s*(['"])pageMetaDetails\2[^>]*>)''',
webpage, 'meta details', default='')
info = extract_attributes(info)
title = self._html_search_regex(
(r'<span[^>]+class=["\']item-title[^>]+>([^<]+)',
r'<h2[^>]+class=["\']h2 m-0["\'][^>]*>([^<]+)'),
webpage, 'title', default=None) or self._html_search_meta(
'twitter:title', webpage, 'title', fatal=True)
player = self._search_regex(
r'''(<div\b[^>]*\bid\s*=\s*(['"])rmpPlayerStream\2[^>]*>)''',
webpage, 'player details', default='')
player = extract_attributes(player)
video_urls_and_ids = (
(info.get('data-meta-video'), 'video'),
(player.get('data-video-transcoded'), 'transcoded'),
(player.get('data-video-filepath'), 'filepath'),
(self._og_search_video_url(webpage, secure=False, default=None), 'og_video'),
)
def txt_or_none(s, default=None):
return (s.strip() or default) if isinstance(s, compat_str) else default
uploader = txt_or_none(info.get('data-meta-author'))
def mung_title(s):
if uploader:
s = re.sub(r'^\s*%s\s+[|-]' % (re.escape(uploader), ), '', s)
return txt_or_none(s)
title = (
mung_title(info.get('data-meta-title'))
or self._html_search_regex(
(r'<span[^>]+class=["\']item-title[^>]+>([^<]+)',
r'<h2[^>]+class=["\']h2 m-0["\'][^>]*>([^<]+)'),
webpage, 'title', default=None)
or self._html_search_meta(
'twitter:title', webpage, 'title', fatal=True))
title = re.sub(r'\s*[|-]\s+ManyVids\s*$', '', title) or title
if any(p in webpage for p in ('preview_videos', '_preview.mp4')):
title += ' (Preview)'
@@ -62,7 +103,8 @@ class ManyVidsIE(InfoExtractor):
# Sets some cookies
self._download_webpage(
'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php',
video_id, fatal=False, data=urlencode_postdata({
video_id, note='Setting format cookies', fatal=False,
data=urlencode_postdata({
'mvtoken': mv_token,
'vid': video_id,
}), headers={
@@ -70,23 +112,56 @@ class ManyVidsIE(InfoExtractor):
'X-Requested-With': 'XMLHttpRequest'
})
if determine_ext(video_url) == 'm3u8':
formats = self._extract_m3u8_formats(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls')
else:
formats = [{'url': video_url}]
formats = []
for v_url, fmt in video_urls_and_ids:
v_url = url_or_none(v_url)
if not v_url:
continue
if determine_ext(v_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
v_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls'))
else:
formats.append({
'url': v_url,
'format_id': fmt,
})
like_count = int_or_none(self._search_regex(
r'data-likes=["\'](\d+)', webpage, 'like count', default=None))
view_count = str_to_int(self._html_search_regex(
r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage,
'view count', default=None))
self._remove_duplicate_formats(formats)
for f in formats:
if f.get('height') is None:
f['height'] = int_or_none(
self._search_regex(r'_(\d{2,3}[02468])_', f['url'], 'video height', default=None))
if '/preview/' in f['url']:
f['format_id'] = '_'.join(filter(None, (f.get('format_id'), 'preview')))
f['preference'] = -10
if 'transcoded' in f['format_id']:
f['preference'] = f.get('preference', -1) - 1
self._sort_formats(formats)
def get_likes():
likes = self._search_regex(
r'''(<a\b[^>]*\bdata-id\s*=\s*(['"])%s\2[^>]*>)''' % (video_id, ),
webpage, 'likes', default='')
likes = extract_attributes(likes)
return int_or_none(likes.get('data-likes'))
def get_views():
return str_to_int(self._html_search_regex(
r'''(?s)<span\b[^>]*\bclass\s*=["']views-wrapper\b[^>]+>.+?<span\b[^>]+>\s*(\d[\d,.]*)\s*</span>''',
webpage, 'view count', default=None))
return {
'id': video_id,
'title': title,
'view_count': view_count,
'like_count': like_count,
'formats': formats,
'description': txt_or_none(info.get('data-meta-description')),
'uploader': txt_or_none(info.get('data-meta-author')),
'thumbnail': (
url_or_none(info.get('data-meta-image'))
or url_or_none(player.get('data-video-screenshot'))),
'view_count': get_views(),
'like_count': get_likes(),
}

View File

@@ -24,7 +24,7 @@ class MediasetIE(ThePlatformBaseIE):
(?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
(?:
(?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_|
player/index\.html\?.*?\bprogramGuid=
player(?:/v\d+)?/index\.html\?.*?\bprogramGuid=
)
)(?P<id>[0-9A-Z]{16,})
'''
@@ -73,6 +73,10 @@ class MediasetIE(ThePlatformBaseIE):
# iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104',
'only_matching': True,
}, {
# embedUrl (from https://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/)
'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323&autoplay=true&purl=http://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/',
'only_matching': True,
}, {
'url': 'mediaset:FAFU000000665924',
'only_matching': True,

View File

@@ -78,7 +78,7 @@ class MindsIE(MindsBaseIE):
else:
return self.url_result(entity['perma_url'])
else:
assert(entity['subtype'] == 'video')
assert (entity['subtype'] == 'video')
video_id = entity_id
# 1080p and webm formats available only on the sources array
video = self._call_api(

View File

@@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
@@ -10,7 +11,7 @@ from ..compat import (
compat_ord,
compat_str,
compat_urllib_parse_unquote,
compat_zip
compat_zip as zip,
)
from ..utils import (
int_or_none,
@@ -24,7 +25,7 @@ class MixcloudBaseIE(InfoExtractor):
def _call_api(self, object_type, object_fields, display_id, username, slug=None):
lookup_key = object_type + 'Lookup'
return self._download_json(
'https://www.mixcloud.com/graphql', display_id, query={
'https://app.mixcloud.com/graphql', display_id, query={
'query': '''{
%s(lookup: {username: "%s"%s}) {
%s
@@ -44,7 +45,7 @@ class MixcloudIE(MixcloudBaseIE):
'ext': 'm4a',
'title': 'Cryptkeeper',
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
'uploader': 'Daniel Holbach',
'uploader': 'dholbach', # was: 'Daniel Holbach',
'uploader_id': 'dholbach',
'thumbnail': r're:https?://.*\.jpg',
'view_count': int,
@@ -57,7 +58,7 @@ class MixcloudIE(MixcloudBaseIE):
'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat',
'ext': 'mp3',
'title': 'Caribou 7 inch Vinyl Mix & Chat',
'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
'description': r're:Last week Dan Snaith aka Caribou swung by the Brownswood.{136}',
'uploader': 'Gilles Peterson Worldwide',
'uploader_id': 'gillespeterson',
'thumbnail': 're:https?://.*',
@@ -65,6 +66,23 @@ class MixcloudIE(MixcloudBaseIE):
'timestamp': 1422987057,
'upload_date': '20150203',
},
'params': {
'skip_download': '404 not found',
},
}, {
'url': 'https://www.mixcloud.com/gillespeterson/carnival-m%C3%BAsica-popular-brasileira-mix/',
'info_dict': {
'id': 'gillespeterson_carnival-música-popular-brasileira-mix',
'ext': 'm4a',
'title': 'Carnival Música Popular Brasileira Mix',
'description': r're:Gilles was recently in Brazil to play at Boiler Room.{208}',
'timestamp': 1454347174,
'upload_date': '20160201',
'uploader': 'Gilles Peterson Worldwide',
'uploader_id': 'gillespeterson',
'thumbnail': 're:https?://.*',
'view_count': int,
},
}, {
'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
'only_matching': True,
@@ -76,10 +94,10 @@ class MixcloudIE(MixcloudBaseIE):
"""Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR."""
return ''.join([
compat_chr(compat_ord(ch) ^ compat_ord(k))
for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
for ch, k in zip(ciphertext, itertools.cycle(key))])
def _real_extract(self, url):
username, slug = re.match(self._VALID_URL, url).groups()
username, slug = self._match_valid_url(url).groups()
username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
track_id = '%s_%s' % (username, slug)

View File

@@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals
import datetime
@@ -71,7 +72,7 @@ class MotherlessIE(InfoExtractor):
'title': 'a/ Hot Teens',
'categories': list,
'upload_date': '20210104',
'uploader_id': 'yonbiw',
'uploader_id': 'anonymous',
'thumbnail': r're:https?://.*\.jpg',
'age_limit': 18,
},
@@ -125,9 +126,10 @@ class MotherlessIE(InfoExtractor):
kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
comment_count = webpage.count('class="media-comment-contents"')
comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage))
uploader_id = self._html_search_regex(
r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
(r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''',
r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''),
webpage, 'uploader_id')
categories = self._html_search_meta('keywords', webpage, default=None)
@@ -169,7 +171,18 @@ class MotherlessGroupIE(InfoExtractor):
'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
'any kind!'
},
'playlist_mincount': 9,
'playlist_mincount': 0,
'expected_warnings': [
'This group has no videos.',
]
}, {
'url': 'https://motherless.com/g/beautiful_cock',
'info_dict': {
'id': 'beautiful_cock',
'title': 'Beautiful Cock',
'description': 'Group for lovely cocks yours, mine, a friends anything human',
},
'playlist_mincount': 2500,
}]
@classmethod
@@ -208,16 +221,23 @@ class MotherlessGroupIE(InfoExtractor):
r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
description = self._html_search_meta(
'description', webpage, fatal=False)
page_count = self._int(self._search_regex(
r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT',
webpage, 'page_count'), 'page_count')
page_count = str_to_int(self._search_regex(
r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b',
webpage, 'page_count', default=0))
if not page_count:
message = self._search_regex(
r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''',
webpage, 'error_msg', default=None) or 'This group has no videos.'
self.report_warning(message, group_id)
page_count = 1
PAGE_SIZE = 80
def _get_page(idx):
webpage = self._download_webpage(
page_url, group_id, query={'page': idx + 1},
note='Downloading page %d/%d' % (idx + 1, page_count)
)
if idx > 0:
webpage = self._download_webpage(
page_url, group_id, query={'page': idx + 1},
note='Downloading page %d/%d' % (idx + 1, page_count)
)
for entry in self._extract_entries(webpage, url):
yield entry

View File

@@ -35,7 +35,9 @@ class MySpassIE(InfoExtractor):
title = xpath_text(metadata, 'title', fatal=True)
video_url = xpath_text(metadata, 'url_flv', 'download url', True)
video_id_int = int(video_id)
for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups():
grps = re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url)
for group in grps.groups() if grps else []:
group_int = int(group)
if group_int > video_id_int:
video_url = video_url.replace(

View File

@@ -0,0 +1,87 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
get_element_by_id,
get_element_by_class,
int_or_none,
js_to_json,
MONTH_NAMES,
qualities,
unified_strdate,
)
class MyVideoGeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?myvideo\.ge/v/(?P<id>[0-9]+)'
_TEST = {
'url': 'https://www.myvideo.ge/v/3941048',
'md5': '8c192a7d2b15454ba4f29dc9c9a52ea9',
'info_dict': {
'id': '3941048',
'ext': 'mp4',
'title': 'The best prikol',
'upload_date': '20200611',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'chixa33',
'description': 'md5:5b067801318e33c2e6eea4ab90b1fdd3',
},
# working from local dev system
'skip': 'site blocks CI servers',
}
_MONTH_NAMES_KA = ['იანვარი', 'თებერვალი', 'მარტი', 'აპრილი', 'მაისი', 'ივნისი', 'ივლისი', 'აგვისტო', 'სექტემბერი', 'ოქტომბერი', 'ნოემბერი', 'დეკემბერი']
_quality = staticmethod(qualities(('SD', 'HD')))
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = (
self._og_search_title(webpage, default=None)
or clean_html(get_element_by_class('my_video_title', webpage))
or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title\b', webpage, 'title'))
jwplayer_sources = self._parse_json(
self._search_regex(
r'''(?s)jwplayer\s*\(\s*['"]mvplayer['"]\s*\)\s*\.\s*setup\s*\(.*?\bsources\s*:\s*(\[.*?])\s*[,});]''', webpage, 'jwplayer sources', fatal=False)
or '',
video_id, transform_source=js_to_json, fatal=False)
formats = self._parse_jwplayer_formats(jwplayer_sources or [], video_id)
for f in formats or []:
f['preference'] = self._quality(f['format_id'])
self._sort_formats(formats)
description = (
self._og_search_description(webpage)
or get_element_by_id('long_desc_holder', webpage)
or self._html_search_meta('description', webpage))
uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False)
upload_date = get_element_by_class('mv_vid_upl_date', webpage)
# as ka locale may not be present roll a local date conversion
upload_date = (unified_strdate(
# translate any ka month to an en one
re.sub('|'.join(self._MONTH_NAMES_KA),
lambda m: MONTH_NAMES['en'][self._MONTH_NAMES_KA.index(m.group(0))],
upload_date, re.I))
if upload_date else None)
return {
'id': video_id,
'title': title,
'description': description,
'uploader': uploader,
'formats': formats,
'thumbnail': self._og_search_thumbnail(webpage),
'upload_date': upload_date,
'view_count': int_or_none(get_element_by_class('mv_vid_views', webpage)),
'like_count': int_or_none(get_element_by_id('likes_count', webpage)),
'dislike_count': int_or_none(get_element_by_id('dislikes_count', webpage)),
}

View File

@@ -1,20 +1,32 @@
# coding: utf-8
from __future__ import unicode_literals
from hashlib import md5
from base64 import b64encode
from binascii import hexlify
from datetime import datetime
from hashlib import md5
from random import randint
import json
import re
import time
from .common import InfoExtractor
from ..aes import aes_ecb_encrypt, pkcs7_padding
from ..compat import (
compat_urllib_parse_urlencode,
compat_str,
compat_itertools_count,
)
from ..utils import (
sanitized_Request,
ExtractorError,
bytes_to_intlist,
error_to_compat_str,
float_or_none,
int_or_none,
intlist_to_bytes,
sanitized_Request,
std_headers,
try_get,
)
@@ -35,32 +47,106 @@ class NetEaseMusicBaseIE(InfoExtractor):
result = b64encode(m.digest()).decode('ascii')
return result.replace('/', '_').replace('+', '-')
@classmethod
def make_player_api_request_data_and_headers(cls, song_id, bitrate):
KEY = b'e82ckenh8dichen8'
URL = '/api/song/enhance/player/url'
now = int(time.time() * 1000)
rand = randint(0, 1000)
cookie = {
'osver': None,
'deviceId': None,
'appver': '8.0.0',
'versioncode': '140',
'mobilename': None,
'buildver': '1623435496',
'resolution': '1920x1080',
'__csrf': '',
'os': 'pc',
'channel': None,
'requestId': '{0}_{1:04}'.format(now, rand),
}
request_text = json.dumps(
{'ids': '[{0}]'.format(song_id), 'br': bitrate, 'header': cookie},
separators=(',', ':'))
message = 'nobody{0}use{1}md5forencrypt'.format(
URL, request_text).encode('latin1')
msg_digest = md5(message).hexdigest()
data = '{0}-36cd479b6b5-{1}-36cd479b6b5-{2}'.format(
URL, request_text, msg_digest)
data = pkcs7_padding(bytes_to_intlist(data))
encrypted = intlist_to_bytes(aes_ecb_encrypt(data, bytes_to_intlist(KEY)))
encrypted_params = hexlify(encrypted).decode('ascii').upper()
cookie = '; '.join(
['{0}={1}'.format(k, v if v is not None else 'undefined')
for [k, v] in cookie.items()])
headers = {
'User-Agent': std_headers['User-Agent'],
'Content-Type': 'application/x-www-form-urlencoded',
'Referer': 'https://music.163.com',
'Cookie': cookie,
}
return ('params={0}'.format(encrypted_params), headers)
def _call_player_api(self, song_id, bitrate):
url = 'https://interface3.music.163.com/eapi/song/enhance/player/url'
data, headers = self.make_player_api_request_data_and_headers(song_id, bitrate)
try:
msg = 'empty result'
result = self._download_json(
url, song_id, data=data.encode('ascii'), headers=headers)
if result:
return result
except ExtractorError as e:
if type(e.cause) in (ValueError, TypeError):
# JSON load failure
raise
except Exception as e:
msg = error_to_compat_str(e)
self.report_warning('%s API call (%s) failed: %s' % (
song_id, bitrate, msg))
return {}
def extract_formats(self, info):
err = 0
formats = []
song_id = info['id']
for song_format in self._FORMATS:
details = info.get(song_format)
if not details:
continue
song_file_path = '/%s/%s.%s' % (
self._encrypt(details['dfsId']), details['dfsId'], details['extension'])
# 203.130.59.9, 124.40.233.182, 115.231.74.139, etc is a reverse proxy-like feature
# from NetEase's CDN provider that can be used if m5.music.126.net does not
# work, especially for users outside of Mainland China
# via: https://github.com/JixunMoe/unblock-163/issues/3#issuecomment-163115880
for host in ('http://m5.music.126.net', 'http://115.231.74.139/m1.music.126.net',
'http://124.40.233.182/m1.music.126.net', 'http://203.130.59.9/m1.music.126.net'):
song_url = host + song_file_path
bitrate = int_or_none(details.get('bitrate')) or 999000
data = self._call_player_api(song_id, bitrate)
for song in try_get(data, lambda x: x['data'], list) or []:
song_url = try_get(song, lambda x: x['url'])
if not song_url:
continue
if self._is_valid_url(song_url, info['id'], 'song'):
formats.append({
'url': song_url,
'ext': details.get('extension'),
'abr': float_or_none(details.get('bitrate'), scale=1000),
'abr': float_or_none(song.get('br'), scale=1000),
'format_id': song_format,
'filesize': details.get('size'),
'asr': details.get('sr')
'filesize': int_or_none(song.get('size')),
'asr': int_or_none(details.get('sr')),
})
break
elif err == 0:
err = try_get(song, lambda x: x['code'], int)
if not formats:
msg = 'No media links found'
if err != 0 and (err < 200 or err >= 400):
raise ExtractorError(
'%s (site code %d)' % (msg, err, ), expected=True)
else:
self.raise_geo_restricted(
msg + ': probably this video is not available from your location due to geo restriction.',
countries=['CN'])
return formats
@classmethod
@@ -76,33 +162,19 @@ class NetEaseMusicBaseIE(InfoExtractor):
class NetEaseMusicIE(NetEaseMusicBaseIE):
IE_NAME = 'netease:song'
IE_DESC = '网易云音乐'
_VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)'
_VALID_URL = r'https?://(y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://music.163.com/#/song?id=32102397',
'md5': 'f2e97280e6345c74ba9d5677dd5dcb45',
'md5': '3e909614ce09b1ccef4a3eb205441190',
'info_dict': {
'id': '32102397',
'ext': 'mp3',
'title': 'Bad Blood (feat. Kendrick Lamar)',
'title': 'Bad Blood',
'creator': 'Taylor Swift / Kendrick Lamar',
'upload_date': '20150517',
'timestamp': 1431878400,
'description': 'md5:a10a54589c2860300d02e1de821eb2ef',
'upload_date': '20150516',
'timestamp': 1431792000,
'description': 'md5:25fc5f27e47aad975aa6d36382c7833c',
},
'skip': 'Blocked outside Mainland China',
}, {
'note': 'No lyrics translation.',
'url': 'http://music.163.com/#/song?id=29822014',
'info_dict': {
'id': '29822014',
'ext': 'mp3',
'title': '听见下雨的声音',
'creator': '周杰伦',
'upload_date': '20141225',
'timestamp': 1419523200,
'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c',
},
'skip': 'Blocked outside Mainland China',
}, {
'note': 'No lyrics.',
'url': 'http://music.163.com/song?id=17241424',
@@ -112,9 +184,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'title': 'Opus 28',
'creator': 'Dustin O\'Halloran',
'upload_date': '20080211',
'description': 'md5:f12945b0f6e0365e3b73c5032e1b0ff4',
'timestamp': 1202745600,
},
'skip': 'Blocked outside Mainland China',
}, {
'note': 'Has translated name.',
'url': 'http://music.163.com/#/song?id=22735043',
@@ -128,7 +200,18 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'timestamp': 1264608000,
'alt_title': '说出愿望吧(Genie)',
},
'skip': 'Blocked outside Mainland China',
}, {
'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846',
'md5': '95826c73ea50b1c288b22180ec9e754d',
'info_dict': {
'id': '95670',
'ext': 'mp3',
'title': '国际歌',
'creator': '马备',
'upload_date': '19911130',
'timestamp': 691516800,
'description': 'md5:1ba2f911a2b0aa398479f595224f2141',
},
}]
def _process_lyrics(self, lyrics_info):

View File

@@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals
import re
@@ -7,7 +8,7 @@ from ..utils import urljoin
class NhkBaseIE(InfoExtractor):
_API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'
_API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
_BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
_TYPE_REGEX = r'/(?P<type>video|audio)/'
@@ -23,7 +24,7 @@ class NhkBaseIE(InfoExtractor):
def _extract_episode_info(self, url, episode=None):
fetch_episode = episode is None
lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups()
if episode_id.isdigit():
if len(episode_id) == 7:
episode_id = episode_id[:4] + '-' + episode_id[4:]
is_video = m_type == 'video'
@@ -84,7 +85,8 @@ class NhkBaseIE(InfoExtractor):
class NhkVodIE(NhkBaseIE):
_VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
# the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg
_VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
# Content available only for a limited period of time. Visit
# https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
_TESTS = [{
@@ -124,6 +126,19 @@ class NhkVodIE(NhkBaseIE):
}, {
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
'only_matching': True,
}, {
# video, alphabetic character in ID #29670
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/',
'only_matching': True,
'info_dict': {
'id': 'qfjay6cg',
'ext': 'mp4',
'title': 'DESIGN TALKS plus - Fishermens Finery',
'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448',
'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$',
'upload_date': '20210615',
'timestamp': 1623722008,
}
}]
def _real_extract(self, url):

View File

@@ -60,8 +60,7 @@ class NRKBaseIE(InfoExtractor):
return self._download_json(
urljoin('https://psapi.nrk.no/', path),
video_id, note or 'Downloading %s JSON' % item,
fatal=fatal, query=query,
headers={'Accept-Encoding': 'gzip, deflate, br'})
fatal=fatal, query=query)
class NRKIE(NRKBaseIE):

View File

@@ -7,6 +7,7 @@ import subprocess
import tempfile
from ..compat import (
compat_open as open,
compat_urlparse,
compat_kwargs,
)
@@ -16,6 +17,7 @@ from ..utils import (
ExtractorError,
get_exe_version,
is_outdated_version,
process_communicate_or_kill,
std_headers,
)
@@ -226,7 +228,7 @@ class PhantomJSwrapper(object):
self.exe, '--ssl-protocol=any',
self._TMP_FILES['script'].name
], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
out, err = process_communicate_or_kill(p)
if p.returncode != 0:
raise ExtractorError(
'Executing JS failed\n:' + encodeArgument(err))

File diff suppressed because it is too large Load Diff

View File

@@ -8,7 +8,7 @@ from ..compat import compat_str
from ..utils import (
int_or_none,
str_or_none,
try_get,
traverse_obj,
)
@@ -109,7 +109,7 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE):
}
name'''
@ classmethod
@classmethod
def suitable(cls, url):
return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url)
@@ -118,7 +118,8 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE):
artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist']
def entries():
for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []):
for music in traverse_obj(artist, (
'musics', 'nodes', lambda _, m: m['musicID'])):
yield self._parse_music(music)
return self.playlist_result(
@@ -137,7 +138,7 @@ class PalcoMP3VideoIE(PalcoMP3BaseIE):
'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande',
'description': 'md5:7043342c09a224598e93546e98e49282',
'upload_date': '20161107',
'uploader_id': 'maiaramaraisaoficial',
'uploader_id': '@maiaramaraisaoficial',
'uploader': 'Maiara e Maraisa',
}
}]

View File

@@ -0,0 +1,193 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
get_element_by_class,
int_or_none,
merge_dicts,
url_or_none,
)
class PeekVidsIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://(?:www\.)?peekvids\.com/
(?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=)
(?P<id>[^/?&#]*)
'''
_TESTS = [{
'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd',
'md5': '2ff6a357a9717dc9dc9894b51307e9a2',
'info_dict': {
'id': '1262717',
'display_id': 'BSyLMbN0YCd',
'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp',
'ext': 'mp4',
'thumbnail': r're:^https?://.*\.jpg$',
'description': 'md5:0a61df3620de26c0af8963b1a730cd69',
'timestamp': 1642579329,
'upload_date': '20220119',
'duration': 416,
'view_count': int,
'age_limit': 18,
'uploader': 'SEXYhub.com',
'categories': list,
'tags': list,
},
}]
_DOMAIN = 'www.peekvids.com'
def _get_detail(self, html):
return get_element_by_class('detail-video-block', html)
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id, expected_status=429)
if '>Rate Limit Exceeded' in webpage:
raise ExtractorError(
'[%s] %s: %s' % (self.IE_NAME, video_id, 'You are suspected as a bot. Wait, or pass the captcha test on the site and provide --cookies.'),
expected=True)
title = self._html_search_regex(r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title')
display_id = video_id
video_id = self._search_regex(r'(?s)<video\b[^>]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID')
srcs = self._download_json(
'https://%s/v-alt/%s' % (self._DOMAIN, video_id), video_id,
note='Downloading list of source files')
formats = [{
'url': f_url,
'format_id': f_id,
'height': int_or_none(f_id),
} for f_url, f_id in (
(url_or_none(f_v), f_match.group(1))
for f_v, f_match in (
(v, re.match(r'^data-src(\d{3,})$', k))
for k, v in srcs.items() if v) if f_match)
if f_url
]
if not formats:
formats = [{'url': url} for url in srcs.values()]
self._sort_formats(formats)
info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
info.pop('url', None)
# may not have found the thumbnail if it was in a list in the ld+json
info.setdefault('thumbnail', self._og_search_thumbnail(webpage))
detail = self._get_detail(webpage) or ''
info['description'] = self._html_search_regex(
r'(?s)(.+?)(?:%s\s*<|<ul\b)' % (re.escape(info.get('description', '')), ),
detail, 'description', default=None) or None
info['title'] = re.sub(r'\s*[,-][^,-]+$', '', info.get('title') or title) or self._generic_title(url)
def cat_tags(name, html):
l = self._html_search_regex(
r'(?s)<span\b[^>]*>\s*%s\s*:\s*</span>(.+?)</li>' % (re.escape(name), ),
html, name, default='')
return [x for x in re.split(r'\s+', l) if x]
return merge_dicts({
'id': video_id,
'display_id': display_id,
'age_limit': 18,
'formats': formats,
'categories': cat_tags('Categories', detail),
'tags': cat_tags('Tags', detail),
'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None),
}, info)
class PlayVidsIE(PeekVidsIE):
_VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|\w\w?/)?(?P<id>[^/?#]*)'
_TESTS = [{
'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
'md5': '2f12e50213dd65f142175da633c4564c',
'info_dict': {
'id': '1978030',
'display_id': 'U3pBrYhsjXM',
'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp',
'ext': 'mp4',
'thumbnail': r're:^https?://.*\.jpg$',
'description': 'md5:0a61df3620de26c0af8963b1a730cd69',
'timestamp': 1640435839,
'upload_date': '20211225',
'duration': 416,
'view_count': int,
'age_limit': 18,
'uploader': 'SEXYhub.com',
'categories': list,
'tags': list,
},
}, {
'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
'only_matching': True,
}, {
'url': 'https://www.playvids.com/embed/U3pBrYhsjXM',
'only_matching': True,
}, {
'url': 'https://www.playvids.com/bKmGLe3IwjZ/sv/brazzers-800-phone-sex-madison-ivy-always-on-the-line',
'md5': 'e783986e596cafbf46411a174ab42ba6',
'info_dict': {
'id': '762385',
'display_id': 'bKmGLe3IwjZ',
'ext': 'mp4',
'title': 'Brazzers - 1 800 Phone Sex: Madison Ivy Always On The Line 6',
'description': 'md5:bdcd2db2b8ad85831a491d7c8605dcef',
'timestamp': 1516958544,
'upload_date': '20180126',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 480,
'uploader': 'Brazzers',
'age_limit': 18,
'view_count': int,
'age_limit': 18,
'categories': list,
'tags': list,
},
}, {
'url': 'https://www.playvids.com/v/47iUho33toY',
'md5': 'b056b5049d34b648c1e86497cf4febce',
'info_dict': {
'id': '700621',
'display_id': '47iUho33toY',
'ext': 'mp4',
'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE',
'description': None,
'timestamp': 1507052209,
'upload_date': '20171003',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 332,
'uploader': 'Cacerenele',
'age_limit': 18,
'view_count': int,
'categories': list,
'tags': list,
}
}, {
'url': 'https://www.playvids.com/z3_7iwWCmqt/sexy-teen-filipina-striptease-beautiful-pinay-bargirl-strips-and-dances',
'md5': 'efa09be9f031314b7b7e3bc6510cd0df',
'info_dict': {
'id': '1523518',
'display_id': 'z3_7iwWCmqt',
'ext': 'mp4',
'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances',
'description': None,
'timestamp': 1607470323,
'upload_date': '20201208',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 593,
'uploader': 'yorours',
'age_limit': 18,
'view_count': int,
'categories': list,
'tags': list,
},
}]
_DOMAIN = 'www.playvids.com'
def _get_detail(self, html):
return get_element_by_class('detail-block', html)

View File

@@ -0,0 +1,105 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
import re
from ..utils import (
merge_dicts,
)
class Pr0grammStaticIE(InfoExtractor):
# Possible urls:
# https://pr0gramm.com/static/5466437
_VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)'
_TEST = {
'url': 'https://pr0gramm.com/static/5466437',
'md5': '52fa540d70d3edc286846f8ca85938aa',
'info_dict': {
'id': '5466437',
'ext': 'mp4',
'title': 'pr0gramm-5466437 by g11st',
'uploader': 'g11st',
'upload_date': '20221221',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
# Fetch media sources
entries = self._parse_html5_media_entries(url, webpage, video_id)
media_info = entries[0]
# this raises if there are no formats
self._sort_formats(media_info.get('formats') or [])
# Fetch author
uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader')
# Fetch approx upload timestamp from filename
# Have None-defaults in case the extraction fails
uploadDay = None
uploadMon = None
uploadYear = None
uploadTimestr = None
# (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4)
m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage)
if (m):
# Up to a day of accuracy should suffice...
uploadDay = m.groupdict().get('day')
uploadMon = m.groupdict().get('mon')
uploadYear = m.groupdict().get('year')
uploadTimestr = uploadYear + uploadMon + uploadDay
return merge_dicts({
'id': video_id,
'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''),
'uploader': uploader,
'upload_date': uploadTimestr
}, media_info)
# This extractor is for the primary url (used for sharing, and appears in the
# location bar) Since this page loads the DOM via JS, yt-dl can't find any
# video information here. So let's redirect to a compatibility version of
# the site, which does contain the <video>-element by itself, without requiring
# js to be ran.
class Pr0grammIE(InfoExtractor):
# Possible urls:
# https://pr0gramm.com/new/546637
# https://pr0gramm.com/new/video/546637
# https://pr0gramm.com/top/546637
# https://pr0gramm.com/top/video/546637
# https://pr0gramm.com/user/g11st/uploads/5466437
# https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290
# https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030
# https://pr0gramm.com/user/froschler/1elf/5232030
# https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id!
# https://pr0gramm.com/top/fruher war alles damals/5498175
_VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)'
_TEST = {
'url': 'https://pr0gramm.com/new/video/5466437',
'info_dict': {
'id': '5466437',
'ext': 'mp4',
'title': 'pr0gramm-5466437 by g11st',
'uploader': 'g11st',
'upload_date': '20221221',
}
}
def _generic_title():
return "oof"
def _real_extract(self, url):
video_id = self._match_id(url)
return self.url_result(
'https://pr0gramm.com/static/' + video_id,
video_id=video_id,
ie=Pr0grammStaticIE.ie_key())

View File

@@ -5,15 +5,16 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_urlparse,
compat_str,
compat_urlparse,
)
from ..utils import (
ExtractorError,
determine_ext,
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
GeoRestrictedError,
HEADRequest,
int_or_none,
parse_duration,
remove_start,
@@ -96,12 +97,100 @@ class RaiBaseIE(InfoExtractor):
if not formats and geoprotection is True:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
formats.extend(self._create_http_urls(relinker_url, formats))
return dict((k, v) for k, v in {
'is_live': is_live,
'duration': duration,
'formats': formats,
}.items() if v is not None)
def _create_http_urls(self, relinker_url, fmts):
_RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
_MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
_QUALITY = {
# tbr: w, h
'250': [352, 198],
'400': [512, 288],
'700': [512, 288],
'800': [700, 394],
'1200': [736, 414],
'1800': [1024, 576],
'2400': [1280, 720],
'3200': [1440, 810],
'3600': [1440, 810],
'5000': [1920, 1080],
'10000': [1920, 1080],
}
def test_url(url):
resp = self._request_webpage(
HEADRequest(url), None, headers={'User-Agent': 'Rai'},
fatal=False, errnote=False, note=False)
if resp is False:
return False
if resp.code == 200:
return False if resp.url == url else resp.url
return None
def get_format_info(tbr):
import math
br = int_or_none(tbr)
if len(fmts) == 1 and not br:
br = fmts[0].get('tbr')
if br > 300:
tbr = compat_str(math.floor(br / 100) * 100)
else:
tbr = '250'
# try extracting info from available m3u8 formats
format_copy = None
for f in fmts:
if f.get('tbr'):
br_limit = math.floor(br / 100)
if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1:
format_copy = f.copy()
return {
'width': format_copy.get('width'),
'height': format_copy.get('height'),
'tbr': format_copy.get('tbr'),
'vcodec': format_copy.get('vcodec'),
'acodec': format_copy.get('acodec'),
'fps': format_copy.get('fps'),
'format_id': 'https-%s' % tbr,
} if format_copy else {
'width': _QUALITY[tbr][0],
'height': _QUALITY[tbr][1],
'format_id': 'https-%s' % tbr,
'tbr': int(tbr),
}
loc = test_url(_MP4_TMPL % (relinker_url, '*'))
if not isinstance(loc, compat_str):
return []
mobj = re.match(
_RELINKER_REG,
test_url(relinker_url) or '')
if not mobj:
return []
available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*']
available_qualities = [i for i in available_qualities if i]
formats = []
for q in available_qualities:
fmt = {
'url': _MP4_TMPL % (relinker_url, q),
'protocol': 'https',
'ext': 'mp4',
}
fmt.update(get_format_info(q))
formats.append(fmt)
return formats
@staticmethod
def _extract_subtitles(url, video_data):
STL_EXT = 'stl'
@@ -151,6 +240,22 @@ class RaiPlayIE(RaiBaseIE):
'params': {
'skip_download': True,
},
}, {
# 1080p direct mp4 url
'url': 'https://www.raiplay.it/video/2021/03/Leonardo-S1E1-b5703b02-82ee-475a-85b6-c9e4a8adf642.html',
'md5': '2e501e8651d72f05ffe8f5d286ad560b',
'info_dict': {
'id': 'b5703b02-82ee-475a-85b6-c9e4a8adf642',
'ext': 'mp4',
'title': 'Leonardo - S1E1',
'alt_title': 'St 1 Ep 1 - Episodio 1',
'description': 'md5:f5360cd267d2de146e4e3879a5a47d31',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Rai 1',
'duration': 3229,
'series': 'Leonardo',
'season': 'Season 1',
},
}, {
'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
'only_matching': True,
@@ -158,6 +263,10 @@ class RaiPlayIE(RaiBaseIE):
# subtitles at 'subtitlesArray' key (see #27698)
'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html',
'only_matching': True,
}, {
# DRM protected
'url': 'https://www.raiplay.it/video/2020/09/Lo-straordinario-mondo-di-Zoey-S1E1-Lo-straordinario-potere-di-Zoey-ed493918-1d32-44b7-8454-862e473d00ff.html',
'only_matching': True,
}]
def _real_extract(self, url):
@@ -166,6 +275,13 @@ class RaiPlayIE(RaiBaseIE):
media = self._download_json(
base + '.json', video_id, 'Downloading video JSON')
if try_get(
media,
(lambda x: x['rights_management']['rights']['drm'],
lambda x: x['program_info']['rights_management']['rights']['drm']),
dict):
raise ExtractorError('This video is DRM protected.', expected=True)
title = media['name']
video = media['video']
@@ -307,7 +423,7 @@ class RaiIE(RaiBaseIE):
}, {
# with ContentItem in og:url
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
'md5': '6865dd00cf0bbf5772fdd89d59bd768a',
'md5': '06345bd97c932f19ffb129973d07a020',
'info_dict': {
'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',
'ext': 'mp4',

View File

@@ -0,0 +1,97 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class RbgTumIE(InfoExtractor):
_VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)'
_TESTS = [{
# Combined view
'url': 'https://live.rbg.tum.de/w/cpp/22128',
'md5': '53a5e7b3e07128e33bbf36687fe1c08f',
'info_dict': {
'id': 'cpp/22128',
'ext': 'mp4',
'title': 'Lecture: October 18. 2022',
'series': 'Concepts of C++ programming (IN2377)',
}
}, {
# Presentation only
'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES',
'md5': '36c584272179f3e56b0db5d880639cba',
'info_dict': {
'id': 'I2DL/12349/PRES',
'ext': 'mp4',
'title': 'Lecture 3: Introduction to Neural Networks',
'series': 'Introduction to Deep Learning (IN2346)',
}
}, {
# Camera only
'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM',
'md5': 'e04189d92ff2f56aedf5cede65d37aad',
'info_dict': {
'id': 'fvv-info/16130/CAM',
'ext': 'mp4',
'title': 'Fachschaftsvollversammlung',
'series': 'Fachschaftsvollversammlung Informatik',
}
}, ]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8')
lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
lecture_series_title = self._html_search_regex(
r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>', webpage, 'series')
formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
self._sort_formats(formats)
return {
'id': video_id,
'title': lecture_title,
'series': lecture_series_title,
'formats': formats,
}
class RbgTumCourseIE(InfoExtractor):
_VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)'
_TESTS = [{
'url': 'https://live.rbg.tum.de/course/2022/S/fpv',
'info_dict': {
'title': 'Funktionale Programmierung und Verifikation (IN0003)',
'id': '2022/S/fpv',
},
'params': {
'noplaylist': False,
},
'playlist_count': 13,
}, {
'url': 'https://live.rbg.tum.de/course/2022/W/set',
'info_dict': {
'title': 'SET FSMPIC',
'id': '2022/W/set',
},
'params': {
'noplaylist': False,
},
'playlist_count': 6,
}, ]
def _real_extract(self, url):
course_id = self._match_id(url)
webpage = self._download_webpage(url, course_id)
lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
lecture_urls = []
for lecture_url in re.findall(r'(?i)href="/w/(.+)(?<!/cam)(?<!/pres)(?<!/chat)"', webpage):
lecture_urls.append(self.url_result('https://live.rbg.tum.de/w/' + lecture_url, ie=RbgTumIE.ie_key()))
return self.playlist_result(lecture_urls, course_id, lecture_series_title)

124
youtube_dl/extractor/s4c.py Normal file
View File

@@ -0,0 +1,124 @@
# coding: utf-8
from __future__ import unicode_literals
from functools import partial as partial_f
from .common import InfoExtractor
from ..utils import (
float_or_none,
merge_dicts,
T,
traverse_obj,
txt_or_none,
url_or_none,
)
class S4CIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/programme/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.s4c.cymru/clic/programme/861362209',
'info_dict': {
'id': '861362209',
'ext': 'mp4',
'title': 'Y Swn',
'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0',
'duration': 5340,
'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Y_Swn_2023S4C_099_ii.jpg',
},
}, {
'url': 'https://www.s4c.cymru/clic/programme/856636948',
'info_dict': {
'id': '856636948',
'ext': 'mp4',
'title': 'Am Dro',
'duration': 2880,
'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe',
'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Am_Dro_2022-23S4C_P6_4005.jpg',
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
details = self._download_json(
'https://www.s4c.cymru/df/full_prog_details',
video_id, query={
'lang': 'e',
'programme_id': video_id,
}, fatal=False)
player_config = self._download_json(
'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={
'programme_id': video_id,
'signed': '0',
'lang': 'en',
'mode': 'od',
'appId': 'clic',
'streamName': '',
}, note='Downloading player config JSON')
m3u8_url = self._download_json(
'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={
'mode': 'od',
'application': 'clic',
'region': 'WW',
'extra': 'false',
'thirdParty': 'false',
'filename': player_config['filename'],
}, note='Downloading streaming urls JSON')['hls']
formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native')
self._sort_formats(formats)
subtitles = {}
for sub in traverse_obj(player_config, ('subtitles', lambda _, v: url_or_none(v['0']))):
subtitles.setdefault(sub.get('3', 'en'), []).append({
'url': sub['0'],
'name': sub.get('1'),
})
return merge_dicts({
'id': video_id,
'formats': formats,
'subtitles': subtitles,
'thumbnail': url_or_none(player_config.get('poster')),
}, traverse_obj(details, ('full_prog_details', 0, {
'title': (('programme_title', 'series_title'), T(txt_or_none)),
'description': ('full_billing', T(txt_or_none)),
'duration': ('duration', T(partial_f(float_or_none, invscale=60))),
}), get_all=False),
rev=True)
class S4CSeriesIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/series/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.s4c.cymru/clic/series/864982911',
'playlist_mincount': 6,
'info_dict': {
'id': '864982911',
'title': 'Iaith ar Daith',
},
}, {
'url': 'https://www.s4c.cymru/clic/series/866852587',
'playlist_mincount': 8,
'info_dict': {
'id': '866852587',
'title': 'FFIT Cymru',
},
}]
def _real_extract(self, url):
series_id = self._match_id(url)
series_details = self._download_json(
'https://www.s4c.cymru/df/series_details', series_id, query={
'lang': 'e',
'series_id': series_id,
'show_prog_in_series': 'Y'
}, note='Downloading series details JSON')
return self.playlist_result(
(self.url_result('https://www.s4c.cymru/clic/programme/' + episode_id, S4CIE, episode_id)
for episode_id in traverse_obj(series_details, ('other_progs_in_series', Ellipsis, 'id'))),
playlist_id=series_id, playlist_title=traverse_obj(
series_details, ('full_prog_details', 0, 'series_title', T(txt_or_none))))

View File

@@ -62,7 +62,7 @@ class StreamCZIE(InfoExtractor):
if not stream.get('url'):
continue
yield merge_dicts({
'format_id': '{}-{}'.format(format_id, ext),
'format_id': '-'.join((format_id, ext)),
'ext': ext,
'source_preference': pref,
'url': urljoin(spl_url, stream['url']),

View File

@@ -0,0 +1,61 @@
# coding: utf-8
from __future__ import unicode_literals
import binascii
import random
import re
import string
from .common import InfoExtractor
from ..utils import urljoin, url_basename
def to_ascii_hex(str1):
return binascii.hexlify(str1.encode('utf-8')).decode('ascii')
def generate_random_string(length):
return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(length))
class StreamsbIE(InfoExtractor):
_DOMAINS = ('viewsb.com', )
_VALID_URL = r'https://(?P<domain>%s)/(?P<id>.+)' % '|'.join(_DOMAINS)
_TEST = {
'url': 'https://viewsb.com/dxfvlu4qanjx',
'md5': '488d111a63415369bf90ea83adc8a325',
'info_dict': {
'id': 'dxfvlu4qanjx',
'ext': 'mp4',
'title': 'Sintel'
}
}
def _real_extract(self, url):
domain, video_id = re.match(self._VALID_URL, url).group('domain', 'id')
webpage = self._download_webpage(url, video_id)
iframe_rel_url = self._search_regex(r'''(?i)<iframe\b[^>]+\bsrc\s*=\s*('|")(?P<path>/.*\.html)\1''', webpage, 'iframe', group='path')
iframe_url = urljoin('https://' + domain, iframe_rel_url)
iframe_data = self._download_webpage(iframe_url, video_id)
app_version = self._search_regex(r'''<script\b[^>]+\bsrc\s*=\s*["|'].*/app\.min\.(\d+)\.js''', iframe_data, 'app version', fatal=False) or '50'
video_code = url_basename(iframe_url).rsplit('.')[0]
length = 12
req = '||'.join((generate_random_string(length), video_code, generate_random_string(length), 'streamsb'))
ereq = 'https://{0}/sources{1}/{2}'.format(domain, app_version, to_ascii_hex(req))
video_data = self._download_webpage(ereq, video_id, headers={
'Referer': iframe_url,
'watchsb': 'sbstream',
})
player_data = self._parse_json(video_data, video_id)
title = player_data['stream_data']['title']
formats = self._extract_m3u8_formats(player_data['stream_data']['file'], video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
return {
'id': video_id,
'formats': formats,
'title': title,
}

View File

@@ -34,7 +34,9 @@ class TelegraafIE(InfoExtractor):
article_id = self._match_id(url)
video_id = self._download_json(
'https://www.telegraaf.nl/graphql', article_id, query={
'https://app.telegraaf.nl/graphql', article_id,
headers={'User-Agent': 'De Telegraaf/6.8.11 (Android 11; en_US)'},
query={
'query': '''{
article(uid: %s) {
videos {

View File

@@ -3,17 +3,23 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
float_or_none,
int_or_none,
url_or_none,
)
class TelewebionIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?telewebion\.com/#!/episode/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?telewebion\.com/(episode|clip)/(?P<id>[a-zA-Z0-9]+)'
_TEST = {
'url': 'http://www.telewebion.com/#!/episode/1263668/',
'url': 'http://www.telewebion.com/episode/0x1b3139c/',
'info_dict': {
'id': '1263668',
'id': '0x1b3139c',
'ext': 'mp4',
'title': 'قرعه\u200cکشی لیگ قهرمانان اروپا',
'thumbnail': r're:^https?://.*\.jpg',
'thumbnail': r're:^https?://static\.telewebion\.com/episodeImages/.*/default',
'view_count': int,
},
'params': {
@@ -25,31 +31,24 @@ class TelewebionIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
secure_token = self._download_webpage(
'http://m.s2.telewebion.com/op/op?action=getSecurityToken', video_id)
episode_details = self._download_json(
'http://m.s2.telewebion.com/op/op', video_id,
query={'action': 'getEpisodeDetails', 'episode_id': video_id})
episode_details = self._download_json('https://gateway.telewebion.ir/kandoo/episode/getEpisodeDetail/?EpisodeId={0}'.format(video_id), video_id)
episode_details = episode_details['body']['queryEpisode'][0]
m3u8_url = 'http://m.s1.telewebion.com/smil/%s.m3u8?filepath=%s&m3u8=1&secure_token=%s' % (
video_id, episode_details['file_path'], secure_token)
channel_id = episode_details['channel']['descriptor']
episode_image_id = episode_details.get('image')
episode_image = 'https://static.telewebion.com/episodeImages/{0}/default'.format(episode_image_id) if episode_image_id else None
m3u8_url = 'https://cdna.telewebion.com/{0}/episode/{1}/playlist.m3u8'.format(channel_id, video_id)
formats = self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4', m3u8_id='hls')
picture_paths = [
episode_details.get('picture_path'),
episode_details.get('large_picture_path'),
]
thumbnails = [{
'url': picture_path,
'preference': idx,
} for idx, picture_path in enumerate(picture_paths) if picture_path is not None]
m3u8_url, video_id, ext='mp4', m3u8_id='hls',
entry_protocol='m3u8_native')
self._sort_formats(formats)
return {
'id': video_id,
'title': episode_details['title'],
'formats': formats,
'thumbnails': thumbnails,
'view_count': episode_details.get('view_count'),
'thumbnail': url_or_none(episode_image),
'view_count': int_or_none(episode_details.get('view_count')),
'duration': float_or_none(episode_details.get('duration')),
}

View File

@@ -0,0 +1,218 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import itertools
from .common import InfoExtractor
from ..compat import (
compat_urlparse,
)
from ..utils import (
clean_html,
get_element_by_class,
int_or_none,
merge_dicts,
url_or_none,
urljoin,
)
class ThisVidIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)'
_TESTS = [{
'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/',
'md5': '839becb572995687e11a69dc4358a386',
'info_dict': {
'id': '3533241',
'ext': 'mp4',
'title': 'Sitting on ball tight jeans',
'description': 'md5:372353bb995883d1b65fddf507489acd',
'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
'uploader_id': '150629',
'uploader': 'jeanslevisjeans',
'age_limit': 18,
}
}, {
'url': 'https://thisvid.com/embed/3533241/',
'md5': '839becb572995687e11a69dc4358a386',
'info_dict': {
'id': '3533241',
'ext': 'mp4',
'title': 'Sitting on ball tight jeans',
'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
'uploader_id': '150629',
'uploader': 'jeanslevisjeans',
'age_limit': 18,
}
}]
def _real_extract(self, url):
main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type')
webpage = self._download_webpage(url, main_id)
title = self._html_search_regex(
r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>',
webpage, 'title')
if type_ == 'embed':
# look for more metadata
video_alt_url = url_or_none(self._search_regex(
r'''video_alt_url\s*:\s+'(%s/)',''' % (self._VALID_URL, ),
webpage, 'video_alt_url', default=None))
if video_alt_url and video_alt_url != url:
webpage = self._download_webpage(
video_alt_url, main_id,
note='Redirecting embed to main page', fatal=False) or webpage
video_holder = get_element_by_class('video-holder', webpage) or ''
if '>This video is a private video' in video_holder:
self.raise_login_required(
(clean_html(video_holder) or 'Private video').split('\n', 1)[0])
uploader = self._html_search_regex(
r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''',
webpage, 'uploader', default='')
uploader = re.split(r'''/["'][^>]*>\s*''', uploader)
if len(uploader) == 2:
# id must be non-empty, uploader could be ''
uploader_id, uploader = uploader
uploader = uploader or None
else:
uploader_id = uploader = None
return merge_dicts({
'_type': 'url_transparent',
'title': title,
'age_limit': 18,
'uploader': uploader,
'uploader_id': uploader_id,
}, self.url_result(url, ie='Generic'))
class ThisVidMemberIE(InfoExtractor):
_VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)'
_TESTS = [{
'url': 'https://thisvid.com/members/2140501/',
'info_dict': {
'id': '2140501',
'title': 'Rafflesia\'s Profile',
},
'playlist_mincount': 16,
}, {
'url': 'https://thisvid.com/members/2140501/favourite_videos/',
'info_dict': {
'id': '2140501',
'title': 'Rafflesia\'s Favourite Videos',
},
'playlist_mincount': 15,
}, {
'url': 'https://thisvid.com/members/636468/public_videos/',
'info_dict': {
'id': '636468',
'title': 'Happymouth\'s Public Videos',
},
'playlist_mincount': 196,
},
]
def _urls(self, html):
for m in re.finditer(r'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>%s\b)[^>]+>''' % (ThisVidIE._VALID_URL, ), html):
yield m.group('url')
def _real_extract(self, url):
pl_id = self._match_id(url)
webpage = self._download_webpage(url, pl_id)
title = re.split(
r'(?i)\s*\|\s*ThisVid\.com\s*$',
self._og_search_title(webpage, default=None) or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or '', 1)[0] or None
def entries(page_url, html=None):
for page in itertools.count(1):
if not html:
html = self._download_webpage(
page_url, pl_id, note='Downloading page %d' % (page, ),
fatal=False) or ''
for u in self._urls(html):
yield u
next_page = get_element_by_class('pagination-next', html) or ''
if next_page:
# member list page
next_page = urljoin(url, self._search_regex(
r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''',
next_page, 'next page link', group='url', default=None))
# in case a member page should have pagination-next with empty link, not just `else:`
if next_page is None:
# playlist page
parsed_url = compat_urlparse.urlparse(page_url)
base_path, num = parsed_url.path.rsplit('/', 1)
num = int_or_none(num)
if num is None:
base_path, num = parsed_url.path.rstrip('/'), 1
parsed_url = parsed_url._replace(path=base_path + ('/%d' % (num + 1, )))
next_page = compat_urlparse.urlunparse(parsed_url)
if page_url == next_page:
next_page = None
if not next_page:
break
page_url, html = next_page, None
return self.playlist_from_matches(
entries(url, webpage), playlist_id=pl_id, playlist_title=title, ie='ThisVid')
class ThisVidPlaylistIE(ThisVidMemberIE):
_VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)'
_TESTS = [{
'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
'info_dict': {
'id': '6615',
'title': 'Underwear Stuff',
},
'playlist_mincount': 200,
}, {
'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
'info_dict': {
'id': '1072387',
'ext': 'mp4',
'title': 'Big Italian Booty 28',
'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2',
'uploader_id': '367912',
'uploader': 'Jcmusclefun',
'age_limit': 18,
},
'params': {
'noplaylist': True,
},
}]
def _get_video_url(self, pl_url):
video_id = re.match(self._VALID_URL, pl_url).group('video_id')
return urljoin(pl_url, '/videos/%s/' % (video_id, ))
def _urls(self, html):
for m in re.finditer(r'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>%s\b)[^>]+>''' % (self._VALID_URL, ), html):
yield self._get_video_url(m.group('url'))
def _real_extract(self, url):
pl_id = self._match_id(url)
if self._downloader.params.get('noplaylist'):
self.to_screen('Downloading just the featured video because of --no-playlist')
return self.url_result(self._get_video_url(url), 'ThisVid')
self.to_screen(
'Downloading playlist %s - add --no-playlist to download just the featured video' % (pl_id, ))
result = super(ThisVidPlaylistIE, self)._real_extract(url)
# rework title returned as `the title - the title`
title = result['title']
t_len = len(title)
if t_len > 5 and t_len % 2 != 0:
t_len = t_len // 2
if title[t_len] == '-':
title = [t.strip() for t in (title[:t_len], title[t_len + 1:])]
if title[0] and title[0] == title[1]:
result['title'] = title[0]
return result

View File

@@ -5,7 +5,7 @@ from .common import InfoExtractor
class UKTVPlayIE(InfoExtractor):
_VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P<id>\d+)'
_VALID_URL = r'https?://uktvplay\.(?:uktv\.)?co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P<id>\d+)'
_TESTS = [{
'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001',
'info_dict': {

View File

@@ -2,9 +2,22 @@
from __future__ import unicode_literals
import re
import time
from .common import InfoExtractor
from ..utils import ExtractorError
from ..compat import compat_kwargs
from ..utils import (
base_url,
determine_ext,
ExtractorError,
float_or_none,
merge_dicts,
T,
traverse_obj,
txt_or_none,
url_basename,
url_or_none,
)
class Vbox7IE(InfoExtractor):
@@ -20,23 +33,27 @@ class Vbox7IE(InfoExtractor):
)
(?P<id>[\da-fA-F]+)
'''
_EMBED_REGEX = [r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)']
_GEO_COUNTRIES = ['BG']
_TESTS = [{
'url': 'http://vbox7.com/play:0946fff23c',
'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf',
# the http: URL just redirects here
'url': 'https://vbox7.com/play:0946fff23c',
'md5': '50ca1f78345a9c15391af47d8062d074',
'info_dict': {
'id': '0946fff23c',
'ext': 'mp4',
'title': 'Борисов: Притеснен съм за бъдещето на България',
'description': 'По думите му е опасно страната ни да бъде обявена за "сигурна"',
'thumbnail': r're:^https?://.*\.jpg$',
'timestamp': 1470982814,
'upload_date': '20160812',
'uploader': 'zdraveibulgaria',
'thumbnail': r're:^https?://.*\.jpg$',
'view_count': int,
'duration': 2640,
},
'params': {
'proxy': '127.0.0.1:8118',
},
'expected_warnings': [
'Unable to download webpage',
],
}, {
'url': 'http://vbox7.com/play:249bb972c2',
'md5': '99f65c0c9ef9b682b97313e052734c3f',
@@ -44,8 +61,15 @@ class Vbox7IE(InfoExtractor):
'id': '249bb972c2',
'ext': 'mp4',
'title': 'Смях! Чудо - чист за секунди - Скрита камера',
'description': 'Смях! Чудо - чист за секунди - Скрита камера',
'timestamp': 1360215023,
'upload_date': '20130207',
'uploader': 'svideteliat_ot_varshava',
'thumbnail': 'https://i49.vbox7.com/o/249/249bb972c20.jpg',
'view_count': int,
'duration': 83,
},
'skip': 'georestricted',
'expected_warnings': ['Failed to download m3u8 information'],
}, {
'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1',
'only_matching': True,
@@ -54,52 +78,127 @@ class Vbox7IE(InfoExtractor):
'only_matching': True,
}]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)',
webpage)
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(cls._EMBED_REGEX[0], webpage)
if mobj:
return mobj.group('url')
# specialisation to transform what looks like ld+json that
# may contain invalid character combinations
# transform_source=None, fatal=True
def _parse_json(self, json_string, video_id, *args, **kwargs):
if '"@context"' in json_string[:30]:
# this is ld+json, or that's the way to bet
transform_source = args[0] if len(args) > 0 else kwargs.get('transform_source')
if not transform_source:
def fix_chars(src):
# fix malformed ld+json: replace raw CRLFs with escaped LFs
return re.sub(
r'"[^"]+"', lambda m: re.sub(r'\r?\n', r'\\n', m.group(0)), src)
if len(args) > 0:
args = (fix_chars,) + args[1:]
else:
kwargs['transform_source'] = fix_chars
kwargs = compat_kwargs(kwargs)
return super(Vbox7IE, self)._parse_json(
json_string, video_id, *args, **kwargs)
def _real_extract(self, url):
video_id = self._match_id(url)
url = 'https://vbox7.com/play:%s' % (video_id,)
now = time.time()
response = self._download_json(
'https://www.vbox7.com/ajax/video/nextvideo.php?vid=%s' % video_id,
video_id)
'https://www.vbox7.com/aj/player/item/options', video_id,
query={'vid': video_id}, headers={'Referer': url})
# estimate time to which possible `ago` member is relative
now = now + 0.5 * (time.time() - now)
if 'error' in response:
if traverse_obj(response, 'error'):
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
video = response['options']
src_url = traverse_obj(response, ('options', 'src', T(url_or_none))) or ''
title = video['title']
video_url = video['src']
if '/na.mp4' in video_url:
fmt_base = url_basename(src_url).rsplit('.', 1)[0].rsplit('_', 1)[0]
if fmt_base in ('na', 'vn'):
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
uploader = video.get('uploader')
ext = determine_ext(src_url)
if ext == 'mpd':
# extract MPD
try:
formats, subtitles = self._extract_mpd_formats_and_subtitles(
src_url, video_id, 'dash', fatal=False)
except KeyError: # fatal doesn't catch this
self.report_warning('Failed to parse MPD manifest')
formats, subtitles = [], {}
elif ext != 'm3u8':
formats = [{
'url': src_url,
}] if src_url else []
subtitles = {}
webpage = self._download_webpage(
'http://vbox7.com/play:%s' % video_id, video_id, fatal=None)
if src_url:
# possibly extract HLS, based on https://github.com/yt-dlp/yt-dlp/pull/9100
fmt_base = base_url(src_url) + fmt_base
# prepare for _extract_m3u8_formats_and_subtitles()
# hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(
hls_formats = self._extract_m3u8_formats(
'{0}.m3u8'.format(fmt_base), video_id, m3u8_id='hls', fatal=False)
formats.extend(hls_formats)
# self._merge_subtitles(hls_subs, target=subtitles)
info = {}
# In case MPD/HLS cannot be parsed, or anyway, get mp4 combined
# formats usually provided to Safari, iOS, and old Windows
video = response['options']
resolutions = (1080, 720, 480, 240, 144)
highest_res = traverse_obj(video, (
'highestRes', T(int))) or resolutions[0]
resolutions = traverse_obj(video, (
'resolutions', lambda _, r: highest_res >= int(r) > 0)) or resolutions
mp4_formats = traverse_obj(resolutions, (
Ellipsis, T(lambda res: {
'url': '{0}_{1}.mp4'.format(fmt_base, res),
'format_id': 'http-{0}'.format(res),
'height': res,
})))
# if above formats are flaky, enable the line below
# self._check_formats(mp4_formats, video_id)
formats.extend(mp4_formats)
if webpage:
info = self._search_json_ld(
webpage.replace('"/*@context"', '"@context"'), video_id,
fatal=False)
self._sort_formats(formats)
info.update({
webpage = self._download_webpage(url, video_id, fatal=False) or ''
info = self._search_json_ld(
webpage.replace('"/*@context"', '"@context"'), video_id,
fatal=False) if webpage else {}
if not info.get('title'):
info['title'] = traverse_obj(response, (
'options', 'title', T(txt_or_none))) or self._og_search_title(webpage)
def if_missing(k):
return lambda x: None if k in info else x
info = merge_dicts(info, {
'id': video_id,
'title': title,
'url': video_url,
'uploader': uploader,
'thumbnail': self._proto_relative_url(
'formats': formats,
'subtitles': subtitles or None,
}, info, traverse_obj(response, ('options', {
'uploader': ('uploader', T(txt_or_none)),
'timestamp': ('ago', T(if_missing('timestamp')), T(lambda t: int(round((now - t) / 60.0)) * 60)),
'duration': ('duration', T(if_missing('duration')), T(float_or_none)),
})))
if 'thumbnail' not in info:
info['thumbnail'] = self._proto_relative_url(
info.get('thumbnail') or self._og_search_thumbnail(webpage),
'http:'),
})
'https:'),
return info

View File

@@ -6,22 +6,31 @@ import re
import string
from .common import InfoExtractor
from ..compat import (
compat_b64decode,
compat_ord,
compat_struct_pack,
)
from ..utils import (
ExtractorError,
int_or_none,
mimetype2ext,
parse_codecs,
parse_qs,
update_url_query,
urljoin,
xpath_element,
xpath_text,
)
from ..compat import (
compat_b64decode,
compat_ord,
compat_struct_pack,
compat_urlparse,
)
def compat_random_choices(population, *args, **kwargs):
# weights=None, *, cum_weights=None, k=1
# limited implementation needed here
weights = args[0] if args else kwargs.get('weights')
assert all(w is None for w in (weights, kwargs.get('cum_weights')))
k = kwargs.get('k', 1)
return ''.join(random.choice(population) for _ in range(k))
class VideaIE(InfoExtractor):
@@ -35,6 +44,7 @@ class VideaIE(InfoExtractor):
)
(?P<id>[^?#&]+)
'''
_EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1']
_TESTS = [{
'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ',
'md5': '97a7af41faeaffd9f1fc864a7c7e7603',
@@ -44,6 +54,7 @@ class VideaIE(InfoExtractor):
'title': 'Az őrült kígyász 285 kígyót enged szabadon',
'thumbnail': r're:^https?://.*',
'duration': 21,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH',
@@ -54,6 +65,7 @@ class VideaIE(InfoExtractor):
'title': 'Supercars előzés',
'thumbnail': r're:^https?://.*',
'duration': 64,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ',
@@ -64,6 +76,7 @@ class VideaIE(InfoExtractor):
'title': 'Az őrült kígyász 285 kígyót enged szabadon',
'thumbnail': r're:^https?://.*',
'duration': 21,
'age_limit': 0,
},
}, {
'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1',
@@ -80,11 +93,14 @@ class VideaIE(InfoExtractor):
}]
_STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p'
@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1',
webpage)]
@classmethod
def _extract_urls(cls, webpage):
def yield_urls():
for pattern in cls._EMBED_REGEX:
for m in re.finditer(pattern, webpage):
yield m.group('url')
return list(yield_urls())
@staticmethod
def rc4(cipher_text, key):
@@ -130,13 +146,13 @@ class VideaIE(InfoExtractor):
for i in range(0, 32):
result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)]
query = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query)
random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8))
query = parse_qs(player_url)
random_seed = ''.join(compat_random_choices(string.ascii_letters + string.digits, k=8))
query['_s'] = random_seed
query['_t'] = result[:16]
b64_info, handle = self._download_webpage_handle(
'http://videa.hu/videaplayer_get_xml.php', video_id, query=query)
'http://videa.hu/player/xml', video_id, query=query)
if b64_info.startswith('<?xml'):
info = self._parse_xml(b64_info, video_id)
else:

View File

@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
float_or_none,
get_element_by_id,
@@ -11,6 +12,7 @@ from ..utils import (
strip_or_none,
unified_strdate,
urljoin,
str_to_int,
)
@@ -35,6 +37,26 @@ class VidLiiIE(InfoExtractor):
'categories': ['News & Politics'],
'tags': ['Vidlii', 'Jan', 'Videogames'],
}
}, {
# HD
'url': 'https://www.vidlii.com/watch?v=2Ng8Abj2Fkl',
'md5': '450e7da379c884788c3a4fa02a3ce1a4',
'info_dict': {
'id': '2Ng8Abj2Fkl',
'ext': 'mp4',
'title': 'test',
'description': 'md5:cc55a86032a7b6b3cbfd0f6b155b52e9',
'thumbnail': 'https://www.vidlii.com/usfi/thmp/2Ng8Abj2Fkl.jpg',
'uploader': 'VidLii',
'uploader_url': 'https://www.vidlii.com/user/VidLii',
'upload_date': '20200927',
'duration': 5,
'view_count': int,
'comment_count': int,
'average_rating': float,
'categories': ['Film & Animation'],
'tags': list,
},
}, {
'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0',
'only_matching': True,
@@ -46,11 +68,32 @@ class VidLiiIE(InfoExtractor):
webpage = self._download_webpage(
'https://www.vidlii.com/watch?v=%s' % video_id, video_id)
video_url = self._search_regex(
r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage,
'video url', group='url')
formats = []
title = self._search_regex(
def add_format(format_url, height=None):
height = int(self._search_regex(r'(\d+)\.mp4',
format_url, 'height', default=360))
formats.append({
'url': format_url,
'format_id': '%dp' % height if height else None,
'height': height,
})
sources = re.findall(
r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1',
webpage)
formats = []
if len(sources) > 1:
add_format(sources[1][1])
self._check_formats(formats, video_id)
if len(sources) > 0:
add_format(sources[0][1])
self._sort_formats(formats)
title = self._html_search_regex(
(r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage,
'title')
@@ -82,9 +125,9 @@ class VidLiiIE(InfoExtractor):
default=None) or self._search_regex(
r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
view_count = int_or_none(self._search_regex(
(r'<strong>(\d+)</strong> views',
r'Views\s*:\s*<strong>(\d+)</strong>'),
view_count = str_to_int(self._html_search_regex(
(r'<strong>([\d,.]+)</strong> views',
r'Views\s*:\s*<strong>([\d,.]+)</strong>'),
webpage, 'view count', fatal=False))
comment_count = int_or_none(self._search_regex(
@@ -109,7 +152,7 @@ class VidLiiIE(InfoExtractor):
return {
'id': video_id,
'url': video_url,
'formats': formats,
'title': title,
'description': description,
'thumbnail': thumbnail,

View File

@@ -261,27 +261,33 @@ class VimeoIE(VimeoBaseInfoExtractor):
# _VALID_URL matches Vimeo URLs
_VALID_URL = r'''(?x)
https?://
(?:
(?:
www|
player
)
\.
)?
vimeo(?:pro)?\.com/
(?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
(?:.*?/)?
(?:
(?:
play_redirect_hls|
moogaloop\.swf)\?clip_id=
)?
(?:videos?/)?
(?P<id>[0-9]+)
(?:/(?P<unlisted_hash>[\da-f]{10}))?
/?(?:[?&].*)?(?:[#].*)?$
'''
https?://
(?:
(?:
www|
player
)
\.
)?
vimeo(?:pro)?\.com/
(?:
(?P<u>user)|
(?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
(?:.*?/)??
(?P<q>
(?:
play_redirect_hls|
moogaloop\.swf)\?clip_id=
)?
(?:videos?/)?
)
(?P<id>[0-9]+)
(?(u)
/(?!videos|likes)[^/?#]+/?|
(?(q)|/(?P<unlisted_hash>[\da-f]{10}))?
)
(?:(?(q)[&]|(?(u)|/?)[?]).+?)?(?:[#].*)?$
'''
IE_NAME = 'vimeo'
_TESTS = [
{
@@ -517,15 +523,34 @@ class VimeoIE(VimeoBaseInfoExtractor):
'url': 'https://vimeo.com/7809605',
'only_matching': True,
},
{
'url': 'https://vimeo.com/160743502/abd0e13fb4',
'only_matching': True,
},
{
# requires passing unlisted_hash(a52724358e) to load_download_config request
'url': 'https://vimeo.com/392479337/a52724358e',
'only_matching': True,
}
},
{
# similar, but all numeric: ID must be 581039021, not 9603038895
# issue #29690
'url': 'https://vimeo.com/581039021/9603038895',
'info_dict': {
'id': '581039021',
# these have to be provided but we don't care
'ext': 'mp4',
'timestamp': 1627621014,
'title': 're:.+',
'uploader_id': 're:.+',
'uploader': 're:.+',
'upload_date': r're:\d+',
},
'params': {
'skip_download': True,
},
},
{
# user playlist alias -> https://vimeo.com/258705797
'url': 'https://vimeo.com/user26785108/newspiritualguide',
'only_matching': True,
},
# https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header
]
@@ -648,8 +673,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
raise
if '//player.vimeo.com/video/' in url:
config = self._parse_json(self._search_regex(
r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
config = self._search_json(
r'\b(?:playerC|c)onfig\s*=', webpage, 'info section', video_id)
if config.get('view') == 4:
config = self._verify_player_video_password(
redirect_url, video_id, headers)

View File

@@ -64,6 +64,18 @@ class VVVVIDIE(InfoExtractor):
'params': {
'skip_download': True,
},
}, {
# video_type == 'video/dash'
'url': 'https://www.vvvvid.it/show/683/made-in-abyss/1542/693786/nanachi',
'info_dict': {
'id': '693786',
'ext': 'mp4',
'title': 'Nanachi',
},
'params': {
'skip_download': True,
'format': 'mp4',
},
}, {
'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048',
'only_matching': True
@@ -205,6 +217,9 @@ class VVVVIDIE(InfoExtractor):
})
is_youtube = True
break
elif video_type == 'video/dash':
formats.extend(self._extract_m3u8_formats(
embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False))
else:
formats.extend(self._extract_wowza_formats(
'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id))

View File

@@ -57,7 +57,7 @@ class WatIE(InfoExtractor):
# 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)
video_data = self._download_json(
'https://mediainfo.tf1.fr/mediainfocombo/' + video_id,
video_id, query={'context': 'MYTF1'})
video_id, query={'context': 'MYTF1', 'pver': '4001000'})
video_info = video_data['media']
error_desc = video_info.get('error_desc')

View File

@@ -0,0 +1,55 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
float_or_none,
merge_dicts,
str_or_none,
T,
traverse_obj,
url_or_none,
)
class WhypIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?whyp\.it/tracks/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.whyp.it/tracks/18337/home-page-example-track-b4kq7',
'md5': 'c1187b42ebf8605284e3dc92aeb33d16',
'info_dict': {
'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3',
'id': '18337',
'title': 'Home Page Example Track',
'description': r're:(?s).+\bexample track\b',
'ext': 'mp3',
'duration': 52.82,
'uploader': 'Brad',
'uploader_id': '1',
'thumbnail': 'https://cdn.whyp.it/a537bb36-3373-4c61-96c8-27fc1b2f427a.jpg',
},
}, {
'url': 'https://www.whyp.it/tracks/18337',
'only_matching': True,
}]
def _real_extract(self, url):
unique_id = self._match_id(url)
webpage = self._download_webpage(url, unique_id)
data = self._search_nuxt_data(webpage, unique_id)['rawTrack']
return merge_dicts({
'url': data['audio_url'],
'id': unique_id,
}, traverse_obj(data, {
'title': 'title',
'description': 'description',
'duration': ('duration', T(float_or_none)),
'uploader': ('user', 'username'),
'uploader_id': ('user', 'id', T(str_or_none)),
'thumbnail': ('artwork_url', T(url_or_none)),
}), {
'ext': 'mp3',
'vcodec': 'none',
'http_headers': {'Referer': 'https://whyp.it/'},
}, rev=True)

View File

@@ -4,20 +4,28 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_chr
from ..compat import (
compat_chr,
compat_zip as zip,
)
from ..utils import (
clean_html,
decode_packed_codes,
determine_ext,
ExtractorError,
get_element_by_id,
int_or_none,
js_to_json,
merge_dicts,
T,
traverse_obj,
url_or_none,
urlencode_postdata,
)
# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58
def aa_decode(aa_code):
symbol_table = [
symbol_table = (
('7', '((゚ー゚) + (o^_^o))'),
('6', '((o^_^o) +(o^_^o))'),
('5', '((゚ー゚) + (゚Θ゚))'),
@@ -26,84 +34,180 @@ def aa_decode(aa_code):
('3', '(o^_^o)'),
('1', '(゚Θ゚)'),
('0', '(c^_^o)'),
]
('+', ''),
)
delim = '(゚Д゚)[゚ε゚]+'
ret = ''
for aa_char in aa_code.split(delim):
def chr_from_code(c):
for val, pat in symbol_table:
aa_char = aa_char.replace(pat, val)
aa_char = aa_char.replace('+ ', '')
m = re.match(r'^\d+', aa_char)
if m:
ret += compat_chr(int(m.group(0), 8))
c = c.replace(pat, val)
if c.startswith(('u', 'U')):
base = 16
c = c[1:]
else:
m = re.match(r'^u([\da-f]+)', aa_char)
if m:
ret += compat_chr(int(m.group(1), 16))
return ret
base = 10
c = int_or_none(c, base=base)
return '' if c is None else compat_chr(c)
return ''.join(
chr_from_code(aa_char)
for aa_char in aa_code.split(delim))
class XFileShareIE(InfoExtractor):
_SITES = (
(r'aparat\.cam', 'Aparat'),
(r'clipwatching\.com', 'ClipWatching'),
(r'gounlimited\.to', 'GoUnlimited'),
(r'govid\.me', 'GoVid'),
(r'holavid\.com', 'HolaVid'),
(r'streamty\.com', 'Streamty'),
(r'thevideobee\.to', 'TheVideoBee'),
(r'uqload\.com', 'Uqload'),
(r'vidbom\.com', 'VidBom'),
(r'vidlo\.us', 'vidlo'),
(r'vidlocker\.xyz', 'VidLocker'),
(r'vidshare\.tv', 'VidShare'),
(r'vup\.to', 'VUp'),
# status check 2024-02: site availability, G site: search
(r'aparat\.cam', 'Aparat'), # Cloudflare says host error 522, apparently changed to wolfstreeam.tv
(r'filemoon\.sx/.', 'FileMoon'),
(r'gounlimited\.to', 'GoUnlimited'), # no media pages listed
(r'govid\.me', 'GoVid'), # no media pages listed
(r'highstream\.tv', 'HighStream'), # clipwatching.com redirects here
(r'holavid\.com', 'HolaVid'), # Cloudflare says host error 522
# (r'streamty\.com', 'Streamty'), # no media pages listed, connection timeout
# (r'thevideobee\.to', 'TheVideoBee'), # no pages listed, refuses connection
(r'uqload\.to', 'Uqload'), # .com, .co redirect here
(r'(?:vedbam\.xyz|vadbam.net)', 'V?dB?m'), # vidbom.com redirects here, but no valid media pages listed
(r'vidlo\.us', 'vidlo'), # no valid media pages listed
(r'vidlocker\.xyz', 'VidLocker'), # no media pages listed
(r'(?:w\d\.)?viidshar\.com', 'VidShare'), # vidshare.tv redirects here
# (r'vup\.to', 'VUp'), # domain not found
(r'wolfstream\.tv', 'WolfStream'),
(r'xvideosharing\.com', 'XVideoSharing'),
(r'xvideosharing\.com', 'XVideoSharing'), # just started showing 'maintenance mode'
)
IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
IE_DESC = 'XFileShare-based sites: %s' % ', '.join(list(zip(*_SITES))[1])
_VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
% '|'.join(site for site in list(zip(*_SITES))[0]))
_EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(site for site in list(zip(*_SITES))[0])]
_FILE_NOT_FOUND_REGEXES = (
r'>(?:404 - )?File Not Found<',
r'>The file was removed by administrator<',
)
_TITLE_REGEXES = (
r'style="z-index: [0-9]+;">([^<]+)</span>',
r'<td nowrap>([^<]+)</td>',
r'h4-fine[^>]*>([^<]+)<',
r'>Watch (.+)[ <]',
r'<h2 class="video-page-head">([^<]+)</h2>',
r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to (dead)
r'title\s*:\s*"([^"]+)"', # govid.me
)
_SOURCE_URL_REGEXES = (
r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',
r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
)
_THUMBNAIL_REGEXES = (
r'<video[^>]+poster="([^"]+)"',
r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],',
)
_TESTS = [{
'url': 'http://xvideosharing.com/fq65f94nd2ve',
'md5': '4181f63957e8fe90ac836fa58dc3c8a6',
'note': 'link in `sources`',
'url': 'https://uqload.to/dcsu06gdb45o',
'md5': '7f8db187b254379440bf4fcad094ae86',
'info_dict': {
'id': 'fq65f94nd2ve',
'id': 'dcsu06gdb45o',
'ext': 'mp4',
'title': 'sample',
'thumbnail': r're:http://.*\.jpg',
'title': 'f2e31015957e74c8c8427982e161c3fc mp4',
'thumbnail': r're:https://.*\.jpg'
},
'params': {
'nocheckcertificate': True,
},
'expected_warnings': ['Unable to extract JWPlayer data'],
}, {
'note': 'link in decoded `sources`',
'url': 'https://xvideosharing.com/1tlg6agrrdgc',
'md5': '2608ce41932c1657ae56258a64e647d9',
'info_dict': {
'id': '1tlg6agrrdgc',
'ext': 'mp4',
'title': '0121',
'thumbnail': r're:https?://.*\.jpg',
},
'skip': 'This server is in maintenance mode.',
}, {
'note': 'JWPlayer link in un-p,a,c,k,e,d JS',
'url': 'https://filemoon.sx/e/dw40rxrzruqz',
'md5': '5a713742f57ac4aef29b74733e8dda01',
'info_dict': {
'id': 'dw40rxrzruqz',
'title': 'dw40rxrzruqz',
'ext': 'mp4'
},
}, {
'note': 'JWPlayer link in un-p,a,c,k,e,d JS',
'url': 'https://vadbam.net/6lnbkci96wly.html',
'md5': 'a1616800076177e2ac769203957c54bc',
'info_dict': {
'id': '6lnbkci96wly',
'title': 'Heart Crime S01 E03 weciima autos',
'ext': 'mp4'
},
}, {
'note': 'JWPlayer link in clear',
'url': 'https://w1.viidshar.com/nnibe0xf0h79.html',
'md5': 'f0a580ce9df06cc61b4a5c979d672367',
'info_dict': {
'id': 'nnibe0xf0h79',
'title': 'JaGa 68ar',
'ext': 'mp4'
},
'params': {
'skip_download': 'ffmpeg',
},
'expected_warnings': ['hlsnative has detected features it does not support'],
}, {
'note': 'JWPlayer link in clear',
'url': 'https://wolfstream.tv/a3drtehyrg52.html',
'md5': '1901d86a79c5e0c6a51bdc9a4cfd3769',
'info_dict': {
'id': 'a3drtehyrg52',
'title': 'NFL 2023 W04 DET@GB',
'ext': 'mp4'
},
}, {
'url': 'https://aparat.cam/n4d6dh0wvlpr',
'only_matching': True,
}, {
'url': 'https://wolfstream.tv/nthme29v9u2x',
'url': 'https://uqload.to/ug5somm0ctnk.html',
'only_matching': True,
}, {
'url': 'https://highstream.tv/2owiyz3sjoux',
'only_matching': True,
}, {
'url': 'https://vedbam.xyz/6lnbkci96wly.html',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1'
% '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]),
webpage)]
@classmethod
def _extract_urls(cls, webpage):
def yield_urls():
for regex in cls._EMBED_REGEX:
for mobj in re.finditer(regex, webpage):
yield mobj.group('url')
return list(yield_urls())
def _real_extract(self, url):
host, video_id = re.match(self._VALID_URL, url).groups()
host, video_id = self._match_valid_url(url).group('host', 'id')
url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id)
url = 'https://%s/%s' % (
host,
'embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id)
webpage = self._download_webpage(url, video_id)
if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES):
container_div = get_element_by_id('container', webpage) or webpage
if self._search_regex(
r'>This server is in maintenance mode\.', container_div,
'maint error', group=0, default=None):
raise ExtractorError(clean_html(container_div), expected=True)
if self._search_regex(
self._FILE_NOT_FOUND_REGEXES, container_div,
'missing video error', group=0, default=None):
raise ExtractorError('Video %s does not exist' % video_id, expected=True)
fields = self._hidden_inputs(webpage)
@@ -122,59 +226,43 @@ class XFileShareIE(InfoExtractor):
'Content-type': 'application/x-www-form-urlencoded',
})
title = (self._search_regex(
(r'style="z-index: [0-9]+;">([^<]+)</span>',
r'<td nowrap>([^<]+)</td>',
r'h4-fine[^>]*>([^<]+)<',
r'>Watch (.+)[ <]',
r'<h2 class="video-page-head">([^<]+)</h2>',
r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to
r'title\s*:\s*"([^"]+)"'), # govid.me
webpage, 'title', default=None) or self._og_search_title(
webpage, default=None) or video_id).strip()
title = (
self._search_regex(self._TITLE_REGEXES, webpage, 'title', default=None)
or self._og_search_title(webpage, default=None)
or video_id).strip()
for regex, func in (
(r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes),
(r'(゚.+)', aa_decode)):
obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None)
if obf_code:
webpage = webpage.replace(obf_code, func(obf_code))
obf_code = True
while obf_code:
for regex, func in (
(r'(?s)(?<!-)\b(eval\(function\(p,a,c,k,e,d\)\{(?:(?!</script>).)+\)\))',
decode_packed_codes),
(r'(゚.+)', aa_decode)):
obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None)
if obf_code:
webpage = webpage.replace(obf_code, func(obf_code))
break
formats = []
jwplayer_data = self._find_jwplayer_data(
webpage.replace(r'\'', '\''), video_id)
result = self._parse_jwplayer_data(
jwplayer_data, video_id, require_title=False,
m3u8_id='hls', mpd_id='dash')
jwplayer_data = self._search_regex(
[
r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);',
r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);',
], webpage,
'jwplayer data', default=None)
if jwplayer_data:
jwplayer_data = self._parse_json(
jwplayer_data.replace(r"\'", "'"), video_id, js_to_json)
if not traverse_obj(result, 'formats'):
if jwplayer_data:
formats = self._parse_jwplayer_data(
jwplayer_data, video_id, False,
m3u8_id='hls', mpd_id='dash')['formats']
if not formats:
urls = []
for regex in (
r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',
r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'):
self.report_warning(
'Failed to extract JWPlayer formats', video_id=video_id)
urls = set()
for regex in self._SOURCE_URL_REGEXES:
for mobj in re.finditer(regex, webpage):
video_url = mobj.group('url')
if video_url not in urls:
urls.append(video_url)
urls.add(mobj.group('url'))
sources = self._search_regex(
r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None)
if sources:
urls.extend(self._parse_json(sources, video_id))
urls.update(traverse_obj(sources, (T(lambda s: self._parse_json(s, video_id)), Ellipsis)))
formats = []
for video_url in urls:
for video_url in traverse_obj(urls, (Ellipsis, T(url_or_none))):
if determine_ext(video_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4',
@@ -185,17 +273,19 @@ class XFileShareIE(InfoExtractor):
'url': video_url,
'format_id': 'sd',
})
self._sort_formats(formats)
result = {'formats': formats}
self._sort_formats(result['formats'])
thumbnail = self._search_regex(
[
r'<video[^>]+poster="([^"]+)"',
r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],',
], webpage, 'thumbnail', default=None)
self._THUMBNAIL_REGEXES, webpage, 'thumbnail', default=None)
return {
if not (title or result.get('title')):
title = self._generic_title(url) or video_id
return merge_dicts(result, {
'id': video_id,
'title': title,
'title': title or None,
'thumbnail': thumbnail,
'formats': formats,
}
'http_headers': {'Referer': url}
})

View File

@@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
@@ -23,7 +24,7 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
_DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com)'
_DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)'
_VALID_URL = r'''(?x)
https?://
(?:.+?\.)?%s/
@@ -34,7 +35,7 @@ class XHamsterIE(InfoExtractor):
''' % _DOMAINS
_TESTS = [{
'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
'md5': '98b4687efb1ffd331c4197854dc09e8f',
'md5': '34e1ab926db5dc2750fed9e1f34304bb',
'info_dict': {
'id': '1509445',
'display_id': 'femaleagent-shy-beauty-takes-the-bait',
@@ -43,6 +44,7 @@ class XHamsterIE(InfoExtractor):
'timestamp': 1350194821,
'upload_date': '20121014',
'uploader': 'Ruseful2011',
'uploader_id': 'ruseful2011',
'duration': 893,
'age_limit': 18,
},
@@ -72,6 +74,7 @@ class XHamsterIE(InfoExtractor):
'timestamp': 1454948101,
'upload_date': '20160208',
'uploader': 'parejafree',
'uploader_id': 'parejafree',
'duration': 72,
'age_limit': 18,
},
@@ -117,6 +120,12 @@ class XHamsterIE(InfoExtractor):
}, {
'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx',
'only_matching': True,
}, {
'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf',
'only_matching': True,
}, {
'url': 'https://xhvid.com/videos/lk-mm-xhc6wn6',
'only_matching': True,
}]
def _real_extract(self, url):
@@ -245,6 +254,7 @@ class XHamsterIE(InfoExtractor):
else:
categories = None
uploader_url = url_or_none(try_get(video, lambda x: x['author']['pageURL']))
return {
'id': video_id,
'display_id': display_id,
@@ -253,6 +263,8 @@ class XHamsterIE(InfoExtractor):
'timestamp': int_or_none(video.get('created')),
'uploader': try_get(
video, lambda x: x['author']['name'], compat_str),
'uploader_url': uploader_url,
'uploader_id': uploader_url.split('/')[-1] if uploader_url else None,
'thumbnail': video.get('thumbURL'),
'duration': int_or_none(video.get('duration')),
'view_count': int_or_none(video.get('views')),
@@ -261,7 +273,7 @@ class XHamsterIE(InfoExtractor):
'dislike_count': int_or_none(try_get(
video, lambda x: x['rating']['dislikes'], int)),
'comment_count': int_or_none(video.get('views')),
'age_limit': age_limit,
'age_limit': age_limit if age_limit is not None else 18,
'categories': categories,
'formats': formats,
}
@@ -352,6 +364,7 @@ class XHamsterIE(InfoExtractor):
'description': description,
'upload_date': upload_date,
'uploader': uploader,
'uploader_id': uploader.lower() if uploader else None,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,
@@ -420,6 +433,12 @@ class XHamsterUserIE(InfoExtractor):
'id': 'firatkaan',
},
'playlist_mincount': 1,
}, {
'url': 'https://xhday.com/users/mobhunter',
'only_matching': True,
}, {
'url': 'https://xhvid.com/users/pelushe21',
'only_matching': True,
}]
def _entries(self, user_id):

View File

@@ -106,6 +106,25 @@ class YandexMusicTrackIE(YandexMusicBaseIE):
}, {
'url': 'http://music.yandex.com/album/540508/track/4878838',
'only_matching': True,
}, {
'url': 'https://music.yandex.ru/album/16302456/track/85430762',
'md5': '11b8d50ab03b57738deeaadf661a0a48',
'info_dict': {
'id': '85430762',
'ext': 'mp3',
'abr': 128,
'title': 'Haddadi Von Engst, Phonic Youth, Super Flu - Til The End (Super Flu Remix)',
'filesize': int,
'duration': 431.14,
'track': 'Til The End (Super Flu Remix)',
'album': 'Til The End',
'album_artist': 'Haddadi Von Engst, Phonic Youth',
'artist': 'Haddadi Von Engst, Phonic Youth, Super Flu',
'release_year': 2021,
'genre': 'house',
'disc_number': 1,
'track_number': 2,
}
}]
def _real_extract(self, url):
@@ -116,10 +135,14 @@ class YandexMusicTrackIE(YandexMusicBaseIE):
'track', tld, url, track_id, 'Downloading track JSON',
{'track': '%s:%s' % (track_id, album_id)})['track']
track_title = track['title']
track_version = track.get('version')
if track_version:
track_title = '%s (%s)' % (track_title, track_version)
download_data = self._download_json(
'https://music.yandex.ru/api/v2.1/handlers/track/%s:%s/web-album_track-track-track-main/download/m' % (track_id, album_id),
track_id, 'Downloading track location url JSON',
query={'hq': 1},
headers={'X-Retpath-Y': url})
fd_data = self._download_json(

View File

@@ -1,19 +1,38 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
from time import sleep
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
ExtractorError,
get_element_by_class,
get_element_by_id,
int_or_none,
str_to_int,
merge_dicts,
parse_count,
parse_qs,
T,
traverse_obj,
unified_strdate,
url_or_none,
urljoin,
)
class YouPornIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
_VALID_URL = (
r'youporn:(?P<id>\d+)',
r'''(?x)
https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)
(?:/(?:(?P<display_id>[^/?#&]+)/?)?)?(?:[#?]|$)
'''
)
_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)']
_TESTS = [{
'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
'md5': '3744d24c50438cf5b6f6d59feb5055c2',
@@ -33,7 +52,7 @@ class YouPornIE(InfoExtractor):
'tags': list,
'age_limit': 18,
},
'skip': 'This video has been disabled',
'skip': 'This video has been deactivated',
}, {
# Unknown uploader
'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4',
@@ -65,57 +84,104 @@ class YouPornIE(InfoExtractor):
}, {
'url': 'https://www.youporn.com/watch/13922959/femdom-principal/',
'only_matching': True,
}, {
'url': 'https://www.youporn.com/watch/16290308/tinderspecial-trailer1/',
'info_dict': {
'id': '16290308',
'age_limit': 18,
'categories': [],
'description': None, # SEO spam using title removed
'display_id': 'tinderspecial-trailer1',
'duration': 298.0,
'ext': 'mp4',
'upload_date': '20201123',
'uploader': 'Ersties',
'tags': [],
'thumbnail': 'https://fi1.ypncdn.com/m=eaSaaTbWx/202011/23/16290308/original/3.jpg',
'timestamp': 1606147564,
'title': 'Tinder In Real Life',
'view_count': int,
}
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)',
webpage)
@classmethod
def _extract_urls(cls, webpage):
def yield_urls():
for p in cls._EMBED_REGEX:
for m in re.finditer(p, webpage):
yield m.group('url')
return list(yield_urls())
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
# A different video ID (data-video-id) is hidden in the page but
# never seems to be used
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
url = 'http://www.youporn.com/watch/%s' % (video_id,)
webpage = self._download_webpage(
url, video_id, headers={'Cookie': 'age_verified=1'})
definitions = self._download_json(
'https://www.youporn.com/api/video/media_definitions/%s/' % video_id,
display_id)
watchable = self._search_regex(
r'''(<div\s[^>]*\bid\s*=\s*('|")?watch-container(?(2)\2|(?!-)\b)[^>]*>)''',
webpage, 'watchability', default=None)
if not watchable:
msg = re.split(r'\s{4}', clean_html(get_element_by_id(
'mainContent', webpage)) or '')[0]
raise ExtractorError(
('%s says: %s' % (self.IE_NAME, msg))
if msg else 'Video unavailable: no reason found',
expected=True)
# internal ID ?
# video_id = extract_attributes(watchable).get('data-video-id')
playervars = self._search_json(
r'\bplayervars\s*:', webpage, 'playervars', video_id)
def get_fmt(x):
v_url = url_or_none(x.get('videoUrl'))
if v_url:
x['videoUrl'] = v_url
return (x['format'], x)
defs_by_format = dict(traverse_obj(playervars, (
'mediaDefinitions', lambda _, v: v.get('format'), T(get_fmt))))
def get_format_data(f):
if f not in defs_by_format:
return []
return self._download_json(
defs_by_format[f]['videoUrl'], video_id, '{0}-formats'.format(f))
formats = []
for definition in definitions:
if not isinstance(definition, dict):
continue
video_url = url_or_none(definition.get('videoUrl'))
if not video_url:
continue
f = {
'url': video_url,
'filesize': int_or_none(definition.get('videoSize')),
}
height = int_or_none(definition.get('quality'))
# Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s
for hls_url in traverse_obj(
get_format_data('hls'),
(lambda _, v: not isinstance(v['defaultQuality'], bool), 'videoUrl'),
(Ellipsis, 'videoUrl')):
formats.extend(self._extract_m3u8_formats(
hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls',
entry_protocol='m3u8_native'))
for f in traverse_obj(get_format_data('mp4'), (
lambda _, v: v.get('videoUrl'), {
'url': ('videoUrl', T(url_or_none)),
'filesize': ('videoSize', T(int_or_none)),
'height': ('quality', T(int_or_none)),
}, T(lambda x: x.get('videoUrl') and x))):
# Video URL's path looks like this:
# /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /videos/201703/11/109285532/1080P_4000K_109285532.mp4
# We will benefit from it by extracting some metadata
mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', f['videoUrl'])
if mobj:
if not height:
height = int(mobj.group('height'))
bitrate = int(mobj.group('bitrate'))
f.update({
'format_id': '%dp-%dk' % (height, bitrate),
'tbr': bitrate,
})
f['height'] = height
if not f.get('height'):
f['height'] = int(mobj.group('height'))
f['tbr'] = int(mobj.group('bitrate'))
f['format_id'] = '%dp-%dk' % (f['height'], f['tbr'])
formats.append(f)
self._sort_formats(formats)
webpage = self._download_webpage(
'http://www.youporn.com/watch/%s' % video_id, display_id,
headers={'Cookie': 'age_verified=1'})
title = self._html_search_regex(
r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
webpage, 'title', default=None) or self._og_search_title(
@@ -130,27 +196,30 @@ class YouPornIE(InfoExtractor):
thumbnail = self._search_regex(
r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1',
webpage, 'thumbnail', fatal=False, group='thumbnail')
duration = int_or_none(self._html_search_meta(
'video:duration', webpage, 'duration', fatal=False))
duration = traverse_obj(playervars, ('duration', T(int_or_none)))
if duration is None:
duration = int_or_none(self._html_search_meta(
'video:duration', webpage, 'duration', fatal=False))
uploader = self._html_search_regex(
r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
webpage, 'uploader', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
[r'UPLOADED:\s*<span>([^<]+)',
(r'UPLOADED:\s*<span>([^<]+)',
r'Date\s+[Aa]dded:\s*<span>([^<]+)',
r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'],
r'''(?s)<div[^>]+class=["']videoInfo(?:Date|Time)\b[^>]*>(.+?)</div>''',
r'(?s)<label\b[^>]*>Uploaded[^<]*</label>\s*<span\b[^>]*>(.+?)</span>'),
webpage, 'upload date', fatal=False))
age_limit = self._rta_search(webpage)
view_count = None
views = self._search_regex(
r'(<div[^>]+\bclass=["\']js_videoInfoViews["\']>)', webpage,
'views', default=None)
r'(<div\s[^>]*\bdata-value\s*=[^>]+>)\s*<label>Views:</label>',
webpage, 'views', default=None)
if views:
view_count = str_to_int(extract_attributes(views).get('data-value'))
comment_count = str_to_int(self._search_regex(
view_count = parse_count(extract_attributes(views).get('data-value'))
comment_count = parse_count(self._search_regex(
r'>All [Cc]omments? \(([\d,.]+)\)',
webpage, 'comment count', default=None))
@@ -166,7 +235,10 @@ class YouPornIE(InfoExtractor):
r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>',
'tags')
return {
data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False) or {}
data.pop('url', None)
result = merge_dicts(data, {
'id': video_id,
'display_id': display_id,
'title': title,
@@ -181,4 +253,442 @@ class YouPornIE(InfoExtractor):
'tags': tags,
'age_limit': age_limit,
'formats': formats,
}
})
# Remove promotional non-description
if result.get('description', '').startswith(
'Watch %s online' % (result['title'],)):
del result['description']
return result
class YouPornListBase(InfoExtractor):
# pattern in '.title-text' element of page section containing videos
_PLAYLIST_TITLEBAR_RE = r'\s+[Vv]ideos\s*$'
_PAGE_RETRY_COUNT = 0 # ie, no retry
_PAGE_RETRY_DELAY = 2 # seconds
def _get_next_url(self, url, pl_id, html):
return urljoin(url, self._search_regex(
r'''<a\s[^>]*?\bhref\s*=\s*("|')(?P<url>(?:(?!\1)[^>])+)\1''',
get_element_by_id('next', html) or '', 'next page',
group='url', default=None))
@classmethod
def _get_title_from_slug(cls, title_slug):
return re.sub(r'[_-]', ' ', title_slug)
def _entries(self, url, pl_id, html=None, page_num=None):
# separates page sections
PLAYLIST_SECTION_RE = (
r'''<div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?title-bar(?:\s+[\w$-]+|\s)*\1[^>]*>'''
)
# contains video link
VIDEO_URL_RE = r'''(?x)
<div\s[^>]*\bdata-video-id\s*=\s*('|")\d+\1[^>]*>\s*
(?:<div\b[\s\S]+?</div>\s*)*
<a\s[^>]*\bhref\s*=\s*('|")(?P<url>(?:(?!\2)[^>])+)\2
'''
def yield_pages(url, html=html, page_num=page_num):
fatal = not html
for pnum in itertools.count(start=page_num or 1):
if not html:
html = self._download_webpage(
url, pl_id, note='Downloading page %d' % pnum,
fatal=fatal)
if not html:
break
fatal = False
yield (url, html, pnum)
# explicit page: extract just that page
if page_num is not None:
break
next_url = self._get_next_url(url, pl_id, html)
if not next_url or next_url == url:
break
url, html = next_url, None
def retry_page(msg, tries_left, page_data):
if tries_left <= 0:
return
self.report_warning(msg, pl_id)
sleep(self._PAGE_RETRY_DELAY)
return next(
yield_pages(page_data[0], page_num=page_data[2]), None)
def yield_entries(html):
for frag in re.split(PLAYLIST_SECTION_RE, html):
if not frag:
continue
t_text = get_element_by_class('title-text', frag or '')
if not (t_text and re.search(self._PLAYLIST_TITLEBAR_RE, t_text)):
continue
for m in re.finditer(VIDEO_URL_RE, frag):
video_url = urljoin(url, m.group('url'))
if video_url:
yield self.url_result(video_url)
last_first_url = None
for page_data in yield_pages(url, html=html, page_num=page_num):
# page_data: url, html, page_num
first_url = None
tries_left = self._PAGE_RETRY_COUNT + 1
while tries_left > 0:
tries_left -= 1
for from_ in yield_entries(page_data[1]):
# may get the same page twice instead of empty page
# or (site bug) intead of actual next page
if not first_url:
first_url = from_['url']
if first_url == last_first_url:
# sometimes (/porntags/) the site serves the previous page
# instead but may provide the correct page after a delay
page_data = retry_page(
'Retrying duplicate page...', tries_left, page_data)
if page_data:
first_url = None
break
continue
yield from_
else:
if not first_url and 'no-result-paragarph1' in page_data[1]:
page_data = retry_page(
'Retrying empty page...', tries_left, page_data)
if page_data:
continue
else:
# success/failure
break
# may get an infinite (?) sequence of empty pages
if not first_url:
break
last_first_url = first_url
def _real_extract(self, url, html=None):
# exceptionally, id may be None
m_dict = self._match_valid_url(url).groupdict()
pl_id, page_type, sort = (m_dict.get(k) for k in ('id', 'type', 'sort'))
qs = parse_qs(url)
for q, v in qs.items():
if v:
qs[q] = v[-1]
else:
del qs[q]
base_id = pl_id or 'YouPorn'
title = self._get_title_from_slug(base_id)
if page_type:
title = '%s %s' % (page_type.capitalize(), title)
base_id = [base_id.lower()]
if sort is None:
title += ' videos'
else:
title = '%s videos by %s' % (title, re.sub(r'[_-]', ' ', sort))
base_id.append(sort)
if qs:
ps = ['%s=%s' % item for item in sorted(qs.items())]
title += ' (%s)' % ','.join(ps)
base_id.extend(ps)
pl_id = '/'.join(base_id)
return self.playlist_result(
self._entries(url, pl_id, html=html,
page_num=int_or_none(qs.get('page'))),
playlist_id=pl_id, playlist_title=title)
class YouPornCategoryIE(YouPornListBase):
IE_DESC = 'YouPorn category, with sorting, filtering and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
(?P<type>category)/(?P<id>[^/?#&]+)
(?:/(?P<sort>popular|views|rating|time|duration))?/?(?:[#?]|$)
'''
_TESTS = [{
'note': 'Full list with pagination',
'url': 'https://www.youporn.com/category/lingerie/popular/',
'info_dict': {
'id': 'lingerie/popular',
'title': 'Category lingerie videos by popular',
},
'playlist_mincount': 39,
}, {
'note': 'Filtered paginated list with single page result',
'url': 'https://www.youporn.com/category/lingerie/duration/?min_minutes=10',
'info_dict': {
'id': 'lingerie/duration/min_minutes=10',
'title': 'Category lingerie videos by duration (min_minutes=10)',
},
'playlist_maxcount': 30,
}, {
'note': 'Single page of full list',
'url': 'https://www.youporn.com/category/lingerie/popular?page=1',
'info_dict': {
'id': 'lingerie/popular/page=1',
'title': 'Category lingerie videos by popular (page=1)',
},
'playlist_count': 30,
}]
class YouPornChannelIE(YouPornListBase):
IE_DESC = 'YouPorn channel, with sorting and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
(?P<type>channel)/(?P<id>[^/?#&]+)
(?:/(?P<sort>rating|views|duration))?/?(?:[#?]|$)
'''
_TESTS = [{
'note': 'Full list with pagination',
'url': 'https://www.youporn.com/channel/x-feeds/',
'info_dict': {
'id': 'x-feeds',
'title': 'Channel X-Feeds videos',
},
'playlist_mincount': 37,
}, {
'note': 'Single page of full list (no filters here)',
'url': 'https://www.youporn.com/channel/x-feeds/duration?page=1',
'info_dict': {
'id': 'x-feeds/duration/page=1',
'title': 'Channel X-Feeds videos by duration (page=1)',
},
'playlist_count': 24,
}]
@staticmethod
def _get_title_from_slug(title_slug):
return re.sub(r'_', ' ', title_slug).title()
class YouPornCollectionIE(YouPornListBase):
IE_DESC = 'YouPorn collection (user playlist), with sorting and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
(?P<type>collection)s/videos/(?P<id>\d+)
(?:/(?P<sort>rating|views|time|duration))?/?(?:[#?]|$)
'''
_PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+in\s'
_TESTS = [{
'note': 'Full list with pagination',
'url': 'https://www.youporn.com/collections/videos/33044251/',
'info_dict': {
'id': '33044251',
'title': 'Collection Sexy Lips videos',
'uploader': 'ph-littlewillyb',
},
'playlist_mincount': 50,
}, {
'note': 'Single page of full list (no filters here)',
'url': 'https://www.youporn.com/collections/videos/33044251/time?page=1',
'info_dict': {
'id': '33044251/time/page=1',
'title': 'Collection Sexy Lips videos by time (page=1)',
'uploader': 'ph-littlewillyb',
},
'playlist_count': 20,
}]
def _real_extract(self, url):
pl_id = self._match_id(url)
html = self._download_webpage(url, pl_id)
playlist = super(YouPornCollectionIE, self)._real_extract(url, html=html)
infos = re.sub(r'\s+', ' ', clean_html(get_element_by_class(
'collection-infos', html)) or '')
title, uploader = self._search_regex(
r'^\s*Collection: (?P<title>.+?) \d+ VIDEOS \d+ VIEWS \d+ days LAST UPDATED From: (?P<uploader>[\w_-]+)',
infos, 'title/uploader', group=('title', 'uploader'), default=(None, None))
return merge_dicts({
'title': playlist['title'].replace(playlist['id'].split('/')[0], title),
'uploader': uploader,
}, playlist) if title else playlist
class YouPornTagIE(YouPornListBase):
IE_DESC = 'YouPorn tag (porntags), with sorting, filtering and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
porn(?P<type>tag)s/(?P<id>[^/?#&]+)
(?:/(?P<sort>views|rating|time|duration))?/?(?:[#?]|$)
'''
_PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+tagged\s'
_PAGE_RETRY_COUNT = 1
_TESTS = [{
'note': 'Full list with pagination',
'url': 'https://www.youporn.com/porntags/austrian',
'info_dict': {
'id': 'austrian',
'title': 'Tag austrian videos',
},
'playlist_mincount': 35,
'expected_warnings': ['Retrying duplicate page'],
}, {
'note': 'Filtered paginated list with single page result',
'url': 'https://www.youporn.com/porntags/austrian/duration/?min_minutes=10',
'info_dict': {
'id': 'austrian/duration/min_minutes=10',
'title': 'Tag austrian videos by duration (min_minutes=10)',
},
# number of videos per page is (row x col) 2x3 + 6x4 + 2, or + 3,
# or more, varying with number of ads; let's set max as 9x4
# NB col 1 may not be shown in non-JS page with site CSS and zoom 100%
'playlist_maxcount': 32,
'expected_warnings': ['Retrying duplicate page', 'Retrying empty page'],
}, {
'note': 'Single page of full list',
'url': 'https://www.youporn.com/porntags/austrian/?page=1',
'info_dict': {
'id': 'austrian/page=1',
'title': 'Tag austrian videos (page=1)',
},
'playlist_mincount': 32,
'playlist_maxcount': 34,
'expected_warnings': ['Retrying duplicate page', 'Retrying empty page'],
}]
# YP tag navigation is broken, loses sort
def _get_next_url(self, url, pl_id, html):
next_url = super(YouPornTagIE, self)._get_next_url(url, pl_id, html)
if next_url:
n = self._match_valid_url(next_url)
if n:
s = n.groupdict().get('sort')
if s:
u = self._match_valid_url(url)
if u:
u = u.groupdict().get('sort')
if s and not u:
n = n.end('sort')
next_url = next_url[:n] + '/' + u + next_url[n:]
return next_url
class YouPornStarIE(YouPornListBase):
IE_DESC = 'YouPorn Pornstar, with description, sorting and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
(?P<type>pornstar)/(?P<id>[^/?#&]+)
(?:/(?P<sort>rating|views|duration))?/?(?:[#?]|$)
'''
_PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+[fF]eaturing\s'
_TESTS = [{
'note': 'Full list with pagination',
'url': 'https://www.youporn.com/pornstar/daynia/',
'info_dict': {
'id': 'daynia',
'title': 'Pornstar Daynia videos',
'description': r're:Daynia Rank \d+ Videos \d+ Views [\d,.]+ .+ Subscribers \d+',
},
'playlist_mincount': 45,
}, {
'note': 'Single page of full list (no filters here)',
'url': 'https://www.youporn.com/pornstar/daynia/?page=1',
'info_dict': {
'id': 'daynia/page=1',
'title': 'Pornstar Daynia videos (page=1)',
'description': 're:.{180,}',
},
'playlist_count': 26,
}]
@staticmethod
def _get_title_from_slug(title_slug):
return re.sub(r'_', ' ', title_slug).title()
def _real_extract(self, url):
pl_id = self._match_id(url)
html = self._download_webpage(url, pl_id)
playlist = super(YouPornStarIE, self)._real_extract(url, html=html)
INFO_ELEMENT_RE = r'''(?x)
<div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?pornstar-info-wrapper(?:\s+[\w$-]+|\s)*\1[^>]*>
(?P<info>[\s\S]+?)(?:</div>\s*){6,}
'''
infos = self._search_regex(INFO_ELEMENT_RE, html, 'infos', group='info', default='')
if infos:
infos = re.sub(
r'(?:\s*nl=nl)+\s*', ' ',
re.sub(r'(?u)\s+', ' ', clean_html(
re.sub('\n', 'nl=nl', infos)))).replace('ribe Subsc', '')
return merge_dicts({
'description': infos.strip() or None,
}, playlist)
class YouPornVideosIE(YouPornListBase):
IE_DESC = 'YouPorn video (browse) playlists, with sorting, filtering and pagination'
_VALID_URL = r'''(?x)
https?://(?:www\.)?youporn\.com/
(?:(?P<id>browse)/)?
(?P<sort>(?(id)
(?:duration|rating|time|views)|
(?:most_(?:favou?rit|view)ed|recommended|top_rated)?))
(?:[/#?]|$)
'''
_PLAYLIST_TITLEBAR_RE = r'\s+(?:[Vv]ideos|VIDEOS)\s*$'
_TESTS = [{
'note': 'Full list with pagination (too long for test)',
'url': 'https://www.youporn.com/',
'info_dict': {
'id': 'youporn',
'title': 'YouPorn videos',
},
'only_matching': True,
}, {
'note': 'Full list with pagination (too long for test)',
'url': 'https://www.youporn.com/recommended',
'info_dict': {
'id': 'youporn/recommended',
'title': 'YouPorn videos by recommended',
},
'only_matching': True,
}, {
'note': 'Full list with pagination (too long for test)',
'url': 'https://www.youporn.com/top_rated',
'info_dict': {
'id': 'youporn/top_rated',
'title': 'YouPorn videos by top rated',
},
'only_matching': True,
}, {
'note': 'Full list with pagination (too long for test)',
'url': 'https://www.youporn.com/browse/time',
'info_dict': {
'id': 'browse/time',
'title': 'YouPorn videos by time',
},
'only_matching': True,
}, {
'note': 'Filtered paginated list with single page result',
'url': 'https://www.youporn.com/most_favorited/?res=VR&max_minutes=2',
'info_dict': {
'id': 'youporn/most_favorited/max_minutes=2/res=VR',
'title': 'YouPorn videos by most favorited (max_minutes=2,res=VR)',
},
'playlist_mincount': 10,
'playlist_maxcount': 28,
}, {
'note': 'Filtered paginated list with several pages',
'url': 'https://www.youporn.com/most_favorited/?res=VR&max_minutes=5',
'info_dict': {
'id': 'youporn/most_favorited/max_minutes=5/res=VR',
'title': 'YouPorn videos by most favorited (max_minutes=5,res=VR)',
},
'playlist_mincount': 45,
}, {
'note': 'Single page of full list',
'url': 'https://www.youporn.com/browse/time?page=1',
'info_dict': {
'id': 'browse/time/page=1',
'title': 'YouPorn videos by time (page=1)',
},
'playlist_count': 36,
}]
@staticmethod
def _get_title_from_slug(title_slug):
return 'YouPorn' if title_slug == 'browse' else title_slug

File diff suppressed because it is too large Load Diff

View File

@@ -8,13 +8,14 @@ from ..compat import compat_str
from ..utils import (
determine_ext,
ExtractorError,
extract_attributes,
float_or_none,
int_or_none,
merge_dicts,
NO_DEFAULT,
orderedSet,
parse_codecs,
qualities,
str_or_none,
try_get,
unified_timestamp,
update_url_query,
@@ -57,28 +58,39 @@ class ZDFBaseIE(InfoExtractor):
format_urls.add(format_url)
mime_type = meta.get('mimeType')
ext = determine_ext(format_url)
join_nonempty = lambda s, l: s.join(filter(None, l))
meta_map = lambda t: map(lambda x: str_or_none(meta.get(x)), t)
if mime_type == 'application/x-mpegURL' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
new_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4', m3u8_id='hls',
entry_protocol='m3u8_native', fatal=False))
entry_protocol='m3u8_native', fatal=False)
elif mime_type == 'application/f4m+xml' or ext == 'f4m':
formats.extend(self._extract_f4m_formats(
update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False))
new_formats = self._extract_f4m_formats(
update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)
else:
f = parse_codecs(meta.get('mimeCodec'))
if not f:
data = meta.get('type', '').split('_')
if try_get(data, lambda x: x[2]) == ext:
f = dict(zip(('vcodec', 'acodec'), data[1]))
format_id = ['http']
for p in (meta.get('type'), meta.get('quality')):
if p and isinstance(p, compat_str):
format_id.append(p)
format_id.extend(join_nonempty('-', meta_map(('type', 'quality'))))
f.update({
'url': format_url,
'format_id': '-'.join(format_id),
'format_note': meta.get('quality'),
'language': meta.get('language'),
'quality': qualities(self._QUALITIES)(meta.get('quality')),
'preference': -10,
'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None))
})
formats.append(f)
new_formats = [f]
formats.extend(merge_dicts(f, {
'format_note': join_nonempty(',', meta_map(('quality', 'class'))),
'language': meta.get('language'),
'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1,
'quality': qualities(self._QUALITIES)(meta.get('quality')),
}) for f in new_formats)
def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer):
ptmd = self._call_api(
@@ -107,6 +119,7 @@ class ZDFBaseIE(InfoExtractor):
'type': f.get('type'),
'mimeType': f.get('mimeType'),
'quality': quality.get('quality'),
'class': track.get('class'),
'language': track.get('language'),
})
self._sort_formats(formats)
@@ -171,6 +184,20 @@ class ZDFIE(ZDFBaseIE):
'duration': 2615,
'timestamp': 1465021200,
'upload_date': '20160604',
'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806',
},
}, {
'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html',
'md5': '1b93bdec7d02fc0b703c5e7687461628',
'info_dict': {
'ext': 'mp4',
'id': 'video_funk_1770473',
'duration': 1278,
'description': 'Die Neue an der Schule verdreht Ismail den Kopf.',
'title': 'Alles ist verzaubert',
'timestamp': 1635520560,
'upload_date': '20211029',
'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-100~1920x1080?cb=1636466431799',
},
}, {
# Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
@@ -204,6 +231,19 @@ class ZDFIE(ZDFBaseIE):
'timestamp': 1641355200,
'upload_date': '20220105',
},
'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"'
}, {
'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html',
'info_dict': {
'id': '191205_1800_sendung_sok8',
'ext': 'mp4',
'title': 'Das Geld anderer Leute',
'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d',
'duration': 2581.0,
'timestamp': 1654790700,
'upload_date': '20220609',
'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350',
},
}]
def _extract_entry(self, url, player, content, video_id):
@@ -265,15 +305,16 @@ class ZDFIE(ZDFBaseIE):
'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id,
video_id)
document = video['document']
title = document['titel']
content_id = document['basename']
formats = []
format_urls = set()
for f in document['formitaeten']:
self._extract_format(content_id, formats, format_urls, f)
formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list)
document = formitaeten and video['document']
if formitaeten:
title = document['titel']
content_id = document['basename']
format_urls = set()
for f in formitaeten or []:
self._extract_format(content_id, formats, format_urls, f)
self._sort_formats(formats)
thumbnails = []
@@ -320,9 +361,9 @@ class ZDFChannelIE(ZDFBaseIE):
'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio',
'info_dict': {
'id': 'das-aktuelle-sportstudio',
'title': 'das aktuelle sportstudio | ZDF',
'title': 'das aktuelle sportstudio',
},
'playlist_mincount': 23,
'playlist_mincount': 18,
}, {
'url': 'https://www.zdf.de/dokumentation/planet-e',
'info_dict': {
@@ -330,6 +371,14 @@ class ZDFChannelIE(ZDFBaseIE):
'title': 'planet e.',
},
'playlist_mincount': 50,
}, {
'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest',
'info_dict': {
'id': 'aktenzeichen-xy-ungeloest',
'title': 'Aktenzeichen XY... ungelöst',
'entries': "lambda x: not any('xy580-fall1-kindermoerder-gesucht-100' in e['url'] for e in x)",
},
'playlist_mincount': 2,
}, {
'url': 'https://www.zdf.de/filme/taunuskrimi/',
'only_matching': True,
@@ -339,60 +388,36 @@ class ZDFChannelIE(ZDFBaseIE):
def suitable(cls, url):
return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url)
def _og_search_title(self, webpage, fatal=False):
title = super(ZDFChannelIE, self)._og_search_title(webpage, fatal=fatal)
return re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', title or '')[0] or None
def _real_extract(self, url):
channel_id = self._match_id(url)
webpage = self._download_webpage(url, channel_id)
entries = [
self.url_result(item_url, ie=ZDFIE.ie_key())
for item_url in orderedSet(re.findall(
r'data-plusbar-url=["\'](http.+?\.html)', webpage))]
matches = re.finditer(
r'''<div\b[^>]*?\sdata-plusbar-id\s*=\s*(["'])(?P<p_id>[\w-]+)\1[^>]*?\sdata-plusbar-url=\1(?P<url>%s)\1''' % ZDFIE._VALID_URL,
webpage)
return self.playlist_result(
entries, channel_id, self._og_search_title(webpage, fatal=False))
if self._downloader.params.get('noplaylist', False):
entry = next(
(self.url_result(m.group('url'), ie=ZDFIE.ie_key()) for m in matches),
None)
self.to_screen('Downloading just the main video because of --no-playlist')
if entry:
return entry
else:
self.to_screen('Downloading playlist %s - add --no-playlist to download just the main video' % (channel_id, ))
r"""
player = self._extract_player(webpage, channel_id)
def check_video(m):
v_ref = self._search_regex(
r'''(<a\b[^>]*?\shref\s*=[^>]+?\sdata-target-id\s*=\s*(["'])%s\2[^>]*>)''' % (m.group('p_id'), ),
webpage, 'check id', default='')
v_ref = extract_attributes(v_ref)
return v_ref.get('data-target-video-type') != 'novideo'
channel_id = self._search_regex(
r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage,
'channel id', group='id')
channel = self._call_api(
'https://api.zdf.de/content/documents/%s.json' % channel_id,
player, url, channel_id)
items = []
for module in channel['module']:
for teaser in try_get(module, lambda x: x['teaser'], list) or []:
t = try_get(
teaser, lambda x: x['http://zdf.de/rels/target'], dict)
if not t:
continue
items.extend(try_get(
t,
lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'],
list) or [])
items.extend(try_get(
module,
lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'],
list) or [])
entries = []
entry_urls = set()
for item in items:
t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict)
if not t:
continue
sharing_url = t.get('http://zdf.de/rels/sharing-url')
if not sharing_url or not isinstance(sharing_url, compat_str):
continue
if sharing_url in entry_urls:
continue
entry_urls.add(sharing_url)
entries.append(self.url_result(
sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id')))
return self.playlist_result(entries, channel_id, channel.get('title'))
"""
return self.playlist_from_matches(
(m.group('url') for m in matches if check_video(m)),
channel_id, self._og_search_title(webpage, fatal=False))