Compare commits

...

7 Commits

Author  SHA1  Message  Date

Remita Amine  4ef1fc9707  [youtube] fix automatic captions extraction(closes #27162)(closes #27388)  2020-12-24 16:05:03 +01:00
Remita Amine  f9e6aa1dcf  [sonyliv] fix title for movies  2020-12-24 13:33:12 +01:00
Remita Amine  f83db9064b  [sonyliv] fix extraction(closes #25667)  2020-12-24 13:10:20 +01:00
Remita Amine  2da9a86399  [streetvoice] fix extraction(closes #27455)(closes #27492)  2020-12-24 13:10:20 +01:00
Remita Amine  ecaa535cf4  [facebook] add support for watchparty pages(closes #27507)  2020-12-24 13:10:20 +01:00
Remita Amine  79dd92b1fe  [cbslocal] fix video extraction  2020-12-24 13:10:20 +01:00
Remita Amine  bd3844c9c2  [brightcove] add another method to extract policyKey  2020-12-24 13:10:20 +01:00
7 changed files with 378 additions and 180 deletions

youtube_dl/extractor/brightcove.py

@@ -28,6 +28,7 @@ from ..utils import (
parse_iso8601,
smuggle_url,
str_or_none,
try_get,
unescapeHTML,
unsmuggle_url,
UnsupportedError,
@@ -600,11 +601,14 @@ class BrightcoveNewIE(AdobePassIE):
store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x)
def extract_policy_key():
base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed)
config = self._download_json(
base_url + 'config.json', video_id, fatal=False) or {}
policy_key = try_get(
config, lambda x: x['video_cloud']['policy_key'])
if not policy_key:
webpage = self._download_webpage(
'http://players.brightcove.net/%s/%s_%s/index.min.js'
% (account_id, player_id, embed), video_id)
policy_key = None
webpage = self._download_webpage(
base_url + 'index.min.js', video_id)
catalog = self._search_regex(
r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
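The new lookup order is: fetch the player's config.json, read video_cloud.policy_key from it, and only fall back to scraping index.min.js when that key is missing. A standalone sketch of the same two-step lookup (plain urllib, outside youtube-dl; the intermediate catalog() JSON step is skipped here, and the final regex is only modeled on the extractor's existing JavaScript fallback):

import json
import re
from urllib.request import urlopen


def fetch_policy_key(account_id, player_id, embed='default'):
    # Two-step lookup added by this commit: try the JSON player config first,
    # then fall back to the player JavaScript. Not youtube-dl API; a sketch only.
    base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed)
    try:
        config = json.load(urlopen(base_url + 'config.json'))
    except Exception:
        config = {}
    policy_key = (config.get('video_cloud') or {}).get('policy_key')
    if policy_key:
        return policy_key
    js = urlopen(base_url + 'index.min.js').read().decode('utf-8', 'replace')
    match = re.search(r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', js)
    return match.group('pk') if match else None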

youtube_dl/extractor/cbslocal.py

@@ -11,7 +11,47 @@ from ..utils import (
class CBSLocalIE(AnvatoIE):
_VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P<id>[0-9a-z-]+)'
_VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/'
_VALID_URL = _VALID_URL_BASE + r'video/(?P<id>\d+)'
_TESTS = [{
'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
'info_dict': {
'id': '3580809',
'ext': 'mp4',
'title': 'A Very Blue Anniversary',
'description': 'CBS2’s Cindy Hsu has more.',
'thumbnail': 're:^https?://.*',
'timestamp': int,
'upload_date': r're:^\d{8}$',
'uploader': 'CBS',
'subtitles': {
'en': 'mincount:5',
},
'categories': [
'Stations\\Spoken Word\\WCBSTV',
'Syndication\\AOL',
'Syndication\\MSN',
'Syndication\\NDN',
'Syndication\\Yahoo',
'Content\\News',
'Content\\News\\Local News',
],
'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
},
'params': {
'skip_download': True,
},
}]
def _real_extract(self, url):
mcp_id = self._match_id(url)
return self.url_result(
'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id)
class CBSLocalArticleIE(AnvatoIE):
_VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'
_TESTS = [{
# Anvato backend
@@ -52,31 +92,6 @@ class CBSLocalIE(AnvatoIE):
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
'info_dict': {
'id': '3580809',
'ext': 'mp4',
'title': 'A Very Blue Anniversary',
'description': 'CBS2’s Cindy Hsu has more.',
'thumbnail': 're:^https?://.*',
'timestamp': int,
'upload_date': r're:^\d{8}$',
'uploader': 'CBS',
'subtitles': {
'en': 'mincount:5',
},
'categories': [
'Stations\\Spoken Word\\WCBSTV',
'Syndication\\AOL',
'Syndication\\MSN',
'Syndication\\NDN',
'Syndication\\Yahoo',
'Content\\News',
'Content\\News\\Local News',
],
'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
},
}]
def _real_extract(self, url):
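After this split, plain /video/<id> pages are handled by CBSLocalIE (which hands the numeric mcp_id straight to the Anvato extractor), while the dated article URLs move to the new CBSLocalArticleIE. A small check of which pattern claims which URL, using the regexes shown above (the article URL below is made up for illustration):

import re

VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/'
VIDEO_URL = VALID_URL_BASE + r'video/(?P<id>\d+)'                   # CBSLocalIE
ARTICLE_URL = VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'    # CBSLocalArticleIE

urls = [
    'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
    'http://newyork.cbslocal.com/2020/12/22/some-article-slug/',  # hypothetical article URL
]
for candidate in urls:
    for name, pattern in (('video', VIDEO_URL), ('article', ARTICLE_URL)):
        match = re.match(pattern, candidate)
        if match:
            print(name, match.group('id'))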

youtube_dl/extractor/extractors.py

@@ -163,7 +163,10 @@ from .cbc import (
CBCOlympicsIE,
)
from .cbs import CBSIE
from .cbslocal import CBSLocalIE
from .cbslocal import (
CBSLocalIE,
CBSLocalArticleIE,
)
from .cbsinteractive import CBSInteractiveIE
from .cbsnews import (
CBSNewsEmbedIE,
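The extractors.py registration is what makes the new class visible to youtube-dl's URL matching. A quick way to confirm both keys are exported (assumes a youtube-dl checkout or install that already includes this change):

from youtube_dl.extractor import gen_extractor_classes

keys = {ie.ie_key() for ie in gen_extractor_classes()}
print('CBSLocal' in keys, 'CBSLocalArticle' in keys)  # expect: True True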

youtube_dl/extractor/facebook.py

@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
import socket
@@ -8,6 +9,7 @@ from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
@@ -47,7 +49,8 @@ class FacebookIE(InfoExtractor):
)\?(?:.*?)(?:v|video_id|story_fbid)=|
[^/]+/videos/(?:[^/]+/)?|
[^/]+/posts/|
groups/[^/]+/permalink/
groups/[^/]+/permalink/|
watchparty/
)|
facebook:
)
@@ -280,8 +283,18 @@ class FacebookIE(InfoExtractor):
# data.video.creation_story.attachments[].media
'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
'only_matching': True,
}, {
'url': 'https://www.facebook.com/watchparty/211641140192478',
'info_dict': {
'id': '211641140192478',
},
'playlist_count': 1,
'skip': 'Requires logging in',
}]
_SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
_api_config = {
'graphURI': '/api/graphql/'
}
@staticmethod
def _extract_urls(webpage):
@@ -405,6 +418,17 @@ class FacebookIE(InfoExtractor):
self._sort_formats(formats)
def extract_relay_data(_filter):
return self._parse_json(self._search_regex(
r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter,
webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
def extract_relay_prefetched_data(_filter):
replay_data = extract_relay_data(_filter)
for require in (replay_data.get('require') or []):
if require[0] == 'RelayPrefetchedStreamCache':
return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
if not video_data:
server_js_data = self._parse_json(self._search_regex([
r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
@@ -413,11 +437,9 @@ class FacebookIE(InfoExtractor):
video_data = extract_from_jsmods_instances(server_js_data)
if not video_data:
graphql_data = self._parse_json(self._search_regex(
r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);',
webpage, 'graphql data', default='{}'), video_id, fatal=False) or {}
for require in (graphql_data.get('require') or []):
if require[0] == 'RelayPrefetchedStreamCache':
data = extract_relay_prefetched_data(
r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"')
if data:
entries = []
def parse_graphql_video(video):
@@ -459,8 +481,6 @@ class FacebookIE(InfoExtractor):
if media.get('__typename') == 'Video':
return parse_graphql_video(media)
data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
nodes = data.get('nodes') or []
node = data.get('node') or {}
if not nodes and node:
@@ -504,6 +524,43 @@ class FacebookIE(InfoExtractor):
elif '>You must log in to continue' in webpage:
self.raise_login_required()
if not video_data and '/watchparty/' in url:
post_data = {
'doc_id': 3731964053542869,
'variables': json.dumps({
'livingRoomID': video_id,
}),
}
prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{')
if prefetched_data:
lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict)
if lsd:
post_data[lsd['name']] = lsd['value']
relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')
for define in (relay_data.get('define') or []):
if define[0] == 'RelayAPIConfigDefaults':
self._api_config = define[2]
living_room = self._download_json(
urljoin(url, self._api_config['graphURI']), video_id,
data=urlencode_postdata(post_data))['data']['living_room']
entries = []
for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []):
video = try_get(edge, lambda x: x['node']['video']) or {}
v_id = video.get('id')
if not v_id:
continue
v_id = compat_str(v_id)
entries.append(self.url_result(
self._VIDEO_PAGE_TEMPLATE % v_id,
self.ie_key(), v_id, video.get('name')))
return self.playlist_result(entries, video_id)
if not video_data:
# Video info not in first request, do a secondary request using
# tahoe player specific URL
tahoe_data = self._download_webpage(
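For /watchparty/ URLs the extractor now issues a GraphQL POST (doc_id 3731964053542869 with a livingRoomID variable, plus the lsd anti-CSRF token scraped from the page) and turns the recap.watched_content edges into a playlist of regular video pages. A rough standalone sketch of that request shape (session cookies and login handling are omitted, so this only illustrates the payload; watch parties require a logged-in session):

import json
from urllib.parse import urlencode
from urllib.request import Request, urlopen


def watchparty_video_ids(living_room_id, lsd_name, lsd_value,
                         graph_uri='https://www.facebook.com/api/graphql/'):
    # Same payload shape the extractor builds; without cookies the real endpoint
    # will not return data, so treat this as an illustration of the request only.
    post_data = {
        'doc_id': 3731964053542869,
        'variables': json.dumps({'livingRoomID': living_room_id}),
        lsd_name: lsd_value,
    }
    request = Request(graph_uri, data=urlencode(post_data).encode())
    living_room = json.load(urlopen(request))['data']['living_room']
    edges = ((living_room.get('recap') or {}).get('watched_content') or {}).get('edges') or []
    ids = []
    for edge in edges:
        video = (edge.get('node') or {}).get('video') or {}
        if video.get('id'):
            ids.append(str(video['id']))
    return ids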

youtube_dl/extractor/sonyliv.py

@@ -1,40 +1,112 @@
# coding: utf-8
from __future__ import unicode_literals
import time
import uuid
from .common import InfoExtractor
from ..utils import smuggle_url
from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
int_or_none,
)
class SonyLIVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-(?P<id>\d+)'
_TESTS = [{
'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight",
'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true',
'info_dict': {
'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight",
'id': 'ref:5024612095001',
'title': 'Bachelors Delight - Achaari Cheese Toast',
'id': '1000022678',
'ext': 'mp4',
'upload_date': '20170923',
'description': 'md5:7f28509a148d5be9d0782b4d5106410d',
'uploader_id': '5182475815001',
'timestamp': 1506200547,
'upload_date': '20200411',
'description': 'md5:3957fa31d9309bf336ceb3f37ad5b7cb',
'timestamp': 1586632091,
'duration': 185,
'season_number': 1,
'episode': 'Achaari Cheese Toast',
'episode_number': 1,
'release_year': 2016,
},
'params': {
'skip_download': True,
},
'add_ie': ['BrightcoveNew'],
}, {
'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)',
'url': 'https://www.sonyliv.com/movies/tahalka-1000050121?watch=true',
'only_matching': True,
}, {
'url': 'https://www.sonyliv.com/clip/jigarbaaz-1000098925',
'only_matching': True,
}, {
'url': 'https://www.sonyliv.com/trailer/sandwiched-forever-1000100286?watch=true',
'only_matching': True,
}, {
'url': 'https://www.sonyliv.com/sports/india-tour-of-australia-2020-21-1700000286/cricket-hls-day-3-1st-test-aus-vs-ind-19-dec-2020-1000100959?watch=true',
'only_matching': True,
}, {
'url': 'https://www.sonyliv.com/music-videos/yeh-un-dinon-ki-baat-hai-1000018779',
'only_matching': True,
}]
_GEO_COUNTRIES = ['IN']
_TOKEN = None
# BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s'
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s'
def _call_api(self, version, path, video_id):
headers = {}
if self._TOKEN:
headers['security_token'] = self._TOKEN
try:
return self._download_json(
'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path),
video_id, headers=headers)['resultObj']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
message = self._parse_json(
e.cause.read().decode(), video_id)['message']
if message == 'Geoblocked Country':
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
raise ExtractorError(message)
raise
def _real_initialize(self):
self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None)
def _real_extract(self, url):
brightcove_id = self._match_id(url)
return self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {
'geo_countries': ['IN'],
'referrer': url,
}),
'BrightcoveNew', brightcove_id)
video_id = self._match_id(url)
content = self._call_api(
'1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id)
if content.get('isEncrypted'):
raise ExtractorError('This video is DRM protected.', expected=True)
dash_url = content['videoURL']
headers = {
'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000)
}
formats = self._extract_mpd_formats(
dash_url, video_id, mpd_id='dash', headers=headers, fatal=False)
formats.extend(self._extract_m3u8_formats(
dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/'),
video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False))
for f in formats:
f.setdefault('http_headers', {}).update(headers)
self._sort_formats(formats)
metadata = self._call_api(
'1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata']
title = metadata['title']
episode = metadata.get('episodeTitle')
if episode and title != episode:
title += ' - ' + episode
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': content.get('posterURL'),
'description': metadata.get('longDescription') or metadata.get('shortDescription'),
'timestamp': int_or_none(metadata.get('creationDate'), 1000),
'duration': int_or_none(metadata.get('duration')),
'season_number': int_or_none(metadata.get('season')),
'episode': episode,
'episode_number': int_or_none(metadata.get('episodeNumber')),
'release_year': int_or_none(metadata.get('year')),
}
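The rewrite drops the Brightcove hand-off entirely: it fetches a security token from GETTOKEN, asks the VIDEOURL endpoint for a DASH manifest (deriving an HLS URL from it), reads metadata from DETAIL, and sends a fresh x-playback-session-id header for playback. A condensed standalone sketch of that call sequence (urllib only; the 403/geo-block handling from _call_api is left out):

import json
import time
import uuid
from urllib.request import Request, urlopen

API_URL = 'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s'


def call_api(version, path, token=None):
    # Same endpoint layout the extractor uses; error and geo handling omitted.
    headers = {'security_token': token} if token else {}
    return json.load(urlopen(Request(API_URL % (version, path), headers=headers)))['resultObj']


def stream_info(video_id):
    token = call_api('1.4', 'ALL/GETTOKEN')
    content = call_api('1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, token)
    if content.get('isEncrypted'):
        raise RuntimeError('DRM protected')
    dash_url = content['videoURL']
    hls_url = dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/')
    metadata = call_api('1.6', 'IN/DETAIL/' + video_id, token)['containers'][0]['metadata']
    playback_headers = {
        'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000),
    }
    return dash_url, hls_url, metadata['title'], playback_headers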

youtube_dl/extractor/streetvoice.py

@@ -2,25 +2,40 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import unified_strdate
from ..utils import (
int_or_none,
parse_iso8601,
str_or_none,
strip_or_none,
try_get,
urljoin,
)
class StreetVoiceIE(InfoExtractor):
_VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://streetvoice.com/skippylu/songs/94440/',
'md5': '15974627fc01a29e492c98593c2fd472',
'url': 'https://streetvoice.com/skippylu/songs/123688/',
'md5': '0eb535970629a5195685355f3ed60bfd',
'info_dict': {
'id': '94440',
'id': '123688',
'ext': 'mp3',
'title': '輸',
'description': 'Crispy脆樂團 - 輸',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 260,
'upload_date': '20091018',
'title': '流浪',
'description': 'md5:8eb0bfcc9dcd8aa82bd6efca66e3fea6',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 270,
'upload_date': '20100923',
'uploader': 'Crispy脆樂團',
'uploader_id': '627810',
'uploader_url': 're:^https?://streetvoice.com/skippylu/',
'timestamp': 1285261661,
'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
'track': '流浪',
'track_id': '123688',
'album': '2010',
}
}, {
'url': 'http://tw.streetvoice.com/skippylu/songs/94440/',
@@ -29,21 +44,57 @@ class StreetVoiceIE(InfoExtractor):
def _real_extract(self, url):
song_id = self._match_id(url)
song = self._download_json(
'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'')
base_url = 'https://streetvoice.com/api/v4/song/%s/' % song_id
song = self._download_json(base_url, song_id, query={
'fields': 'album,comments_count,created_at,id,image,length,likes_count,name,nickname,plays_count,profile,share_count,synopsis,user,username',
})
title = song['name']
author = song['user']['nickname']
formats = []
for suffix, format_id in [('hls/file', 'hls'), ('file', 'http'), ('file/original', 'original')]:
f_url = (self._download_json(
base_url + suffix + '/', song_id,
'Downloading %s format URL' % format_id,
data=b'', fatal=False) or {}).get('file')
if not f_url:
continue
f = {
'ext': 'mp3',
'format_id': format_id,
'url': f_url,
'vcodec': 'none',
}
if format_id == 'hls':
f['protocol'] = 'm3u8_native'
abr = self._search_regex(r'\.mp3\.(\d+)k', f_url, 'bitrate', default=None)
if abr:
abr = int(abr)
f.update({
'abr': abr,
'tbr': abr,
})
formats.append(f)
user = song.get('user') or {}
username = user.get('username')
get_count = lambda x: int_or_none(song.get(x + '_count'))
return {
'id': song_id,
'url': song['file'],
'formats': formats,
'title': title,
'description': '%s - %s' % (author, title),
'thumbnail': self._proto_relative_url(song.get('image'), 'http:'),
'duration': song.get('length'),
'upload_date': unified_strdate(song.get('created_at')),
'uploader': author,
'uploader_id': compat_str(song['user']['id']),
'description': strip_or_none(song.get('synopsis')),
'thumbnail': song.get('image'),
'duration': int_or_none(song.get('length')),
'timestamp': parse_iso8601(song.get('created_at')),
'uploader': try_get(user, lambda x: x['profile']['nickname']),
'uploader_id': str_or_none(user.get('id')),
'uploader_url': urljoin(url, '/%s/' % username) if username else None,
'view_count': get_count('plays'),
'like_count': get_count('likes'),
'comment_count': get_count('comments'),
'repost_count': get_count('share'),
'track': title,
'track_id': song_id,
'album': try_get(song, lambda x: x['album']['name']),
}
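The move to the v4 API means one metadata request plus one empty POST per format endpoint (hls/file, file, file/original), with the audio bitrate read out of the returned file name when present. A standalone sketch of just the format probing (urllib; failed endpoints are skipped, as in the extractor):

import json
import re
from urllib.request import Request, urlopen


def streetvoice_formats(song_id):
    base_url = 'https://streetvoice.com/api/v4/song/%s/' % song_id
    formats = []
    # The same three endpoints the extractor probes; each expects an empty POST body.
    for suffix, format_id in (('hls/file', 'hls'), ('file', 'http'), ('file/original', 'original')):
        try:
            resp = json.load(urlopen(Request(base_url + suffix + '/', data=b'')))
        except Exception:
            continue
        f_url = resp.get('file')
        if not f_url:
            continue
        f = {'format_id': format_id, 'url': f_url, 'ext': 'mp3', 'vcodec': 'none'}
        if format_id == 'hls':
            f['protocol'] = 'm3u8_native'
        bitrate = re.search(r'\.mp3\.(\d+)k', f_url)
        if bitrate:
            f['abr'] = f['tbr'] = int(bitrate.group(1))
        formats.append(f)
    return formats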

youtube_dl/extractor/youtube.py

@@ -1322,17 +1322,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self._parse_json(
uppercase_escape(config), video_id, fatal=False)
def _get_automatic_captions(self, video_id, webpage):
def _get_automatic_captions(self, video_id, player_response, player_config):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
self.to_screen('%s: Looking for automatic captions' % video_id)
player_config = self._get_ytplayer_config(video_id, webpage)
err_msg = 'Couldn\'t find automatic captions for %s' % video_id
if not player_config:
if not (player_response or player_config):
self._downloader.report_warning(err_msg)
return {}
try:
args = player_config['args']
args = player_config.get('args') if player_config else {}
caption_url = args.get('ttsurl')
if caption_url:
timestamp = args['timestamp']
@@ -1391,10 +1390,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return captions
# New captions format as of 22.06.2017
player_response = args.get('player_response')
if player_response and isinstance(player_response, compat_str):
player_response = self._parse_json(
player_response, video_id, fatal=False)
if player_response:
renderer = player_response['captions']['playerCaptionsTracklistRenderer']
base_url = renderer['captionTracks'][0]['baseUrl']
@@ -1652,6 +1647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Get video info
video_info = {}
embed_webpage = None
ytplayer_config = None
if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None:
age_gate = True
@@ -2276,7 +2272,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage)
automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
video_duration = try_get(
video_info, lambda x: int_or_none(x['length_seconds'][0]))
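With this change _get_automatic_captions no longer re-downloads the player config from the webpage; the caller passes the already-parsed player_response (and, if available, ytplayer_config) straight in, and the caption URL comes out of the tracklist renderer. A minimal sketch of that lookup on a parsed player_response dict:

def automatic_caption_base_url(player_response):
    # Same path the extractor reads after this change; returns None instead of
    # raising when the structure is missing (e.g. no captions on the video).
    try:
        renderer = player_response['captions']['playerCaptionsTracklistRenderer']
        return renderer['captionTracks'][0]['baseUrl']
    except (KeyError, IndexError, TypeError):
        return None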