Compare commits


7 Commits

SHA1        Author        Message                                                                       Date
4ef1fc9707  Remita Amine  [youtube] fix automatic captions extraction(closes #27162)(closes #27388)    2020-12-24 16:05:03 +01:00
f9e6aa1dcf  Remita Amine  [sonyliv] fix title for movies                                                2020-12-24 13:33:12 +01:00
f83db9064b  Remita Amine  [sonyliv] fix extraction(closes #25667)                                       2020-12-24 13:10:20 +01:00
2da9a86399  Remita Amine  [streetvoice] fix extraction(closes #27455)(closes #27492)                    2020-12-24 13:10:20 +01:00
ecaa535cf4  Remita Amine  [facebook] add support for watchparty pages(closes #27507)                    2020-12-24 13:10:20 +01:00
79dd92b1fe  Remita Amine  [cbslocal] fix video extraction                                               2020-12-24 13:10:20 +01:00
bd3844c9c2  Remita Amine  [brightcove] add another method to extract policyKey                         2020-12-24 13:10:20 +01:00
7 changed files with 378 additions and 180 deletions

youtube_dl/extractor/brightcove.py

@@ -28,6 +28,7 @@ from ..utils import (
     parse_iso8601,
     smuggle_url,
     str_or_none,
+    try_get,
     unescapeHTML,
     unsmuggle_url,
     UnsupportedError,
@@ -600,24 +601,27 @@ class BrightcoveNewIE(AdobePassIE):
         store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x)

         def extract_policy_key():
-            webpage = self._download_webpage(
-                'http://players.brightcove.net/%s/%s_%s/index.min.js'
-                % (account_id, player_id, embed), video_id)
-
-            policy_key = None
-
-            catalog = self._search_regex(
-                r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
-            if catalog:
-                catalog = self._parse_json(
-                    js_to_json(catalog), video_id, fatal=False)
-                if catalog:
-                    policy_key = catalog.get('policyKey')
-
+            base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed)
+            config = self._download_json(
+                base_url + 'config.json', video_id, fatal=False) or {}
+            policy_key = try_get(
+                config, lambda x: x['video_cloud']['policy_key'])
             if not policy_key:
-                policy_key = self._search_regex(
-                    r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
-                    webpage, 'policy key', group='pk')
+                webpage = self._download_webpage(
+                    base_url + 'index.min.js', video_id)
+
+                catalog = self._search_regex(
+                    r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
+                if catalog:
+                    catalog = self._parse_json(
+                        js_to_json(catalog), video_id, fatal=False)
+                    if catalog:
+                        policy_key = catalog.get('policyKey')
+
+                if not policy_key:
+                    policy_key = self._search_regex(
+                        r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
+                        webpage, 'policy key', group='pk')

             store_pk(policy_key)
             return policy_key
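The change above makes the player's config.json the primary source of the Brightcove policy key and keeps the index.min.js scrape as a fallback. Below is a minimal standalone sketch of that lookup order using only the Python standard library; the function name and arguments are illustrative, and unlike the extractor it skips the catalog({...}) JSON step and the key caching.

```python
import json
import re
from urllib.request import urlopen


def fetch_policy_key(account_id, player_id, embed='default'):
    base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed)
    # 1. Preferred source: the player's config.json exposes the key directly.
    try:
        config = json.load(urlopen(base_url + 'config.json'))
        policy_key = (config.get('video_cloud') or {}).get('policy_key')
        if policy_key:
            return policy_key
    except Exception:
        pass
    # 2. Fallback: scrape index.min.js for a policyKey assignment, mirroring the
    #    regex used above (the extractor also tries the catalog({...}) JSON first).
    js = urlopen(base_url + 'index.min.js').read().decode('utf-8', 'replace')
    m = re.search(r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', js)
    return m.group('pk') if m else None
```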

youtube_dl/extractor/cbslocal.py

@@ -11,7 +11,47 @@ from ..utils import (

 class CBSLocalIE(AnvatoIE):
-    _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P<id>[0-9a-z-]+)'
+    _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/'
+    _VALID_URL = _VALID_URL_BASE + r'video/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
+        'info_dict': {
+            'id': '3580809',
+            'ext': 'mp4',
+            'title': 'A Very Blue Anniversary',
+            'description': 'CBS2s Cindy Hsu has more.',
+            'thumbnail': 're:^https?://.*',
+            'timestamp': int,
+            'upload_date': r're:^\d{8}$',
+            'uploader': 'CBS',
+            'subtitles': {
+                'en': 'mincount:5',
+            },
+            'categories': [
+                'Stations\\Spoken Word\\WCBSTV',
+                'Syndication\\AOL',
+                'Syndication\\MSN',
+                'Syndication\\NDN',
+                'Syndication\\Yahoo',
+                'Content\\News',
+                'Content\\News\\Local News',
+            ],
+            'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mcp_id = self._match_id(url)
+        return self.url_result(
+            'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id)
+
+
+class CBSLocalArticleIE(AnvatoIE):
+    _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'

     _TESTS = [{
         # Anvato backend
@@ -52,31 +92,6 @@ class CBSLocalIE(AnvatoIE):
             # m3u8 download
             'skip_download': True,
         },
-    }, {
-        'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
-        'info_dict': {
-            'id': '3580809',
-            'ext': 'mp4',
-            'title': 'A Very Blue Anniversary',
-            'description': 'CBS2s Cindy Hsu has more.',
-            'thumbnail': 're:^https?://.*',
-            'timestamp': int,
-            'upload_date': r're:^\d{8}$',
-            'uploader': 'CBS',
-            'subtitles': {
-                'en': 'mincount:5',
-            },
-            'categories': [
-                'Stations\\Spoken Word\\WCBSTV',
-                'Syndication\\AOL',
-                'Syndication\\MSN',
-                'Syndication\\NDN',
-                'Syndication\\Yahoo',
-                'Content\\News',
-                'Content\\News\\Local News',
-            ],
-            'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
-        },
     }]

     def _real_extract(self, url):
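The split above routes cbslocal.com video pages (numeric /video/<id> URLs) through CBSLocalIE, which hands the id straight to the Anvato extractor, while dated article URLs go through the new CBSLocalArticleIE. A quick standalone check of how the two regexes divide the URLs; the article URL below is a made-up example for illustration.

```python
import re

VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/'
VIDEO_URL = VALID_URL_BASE + r'video/(?P<id>\d+)'                  # -> CBSLocalIE
ARTICLE_URL = VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'   # -> CBSLocalArticleIE

urls = [
    'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
    'http://newyork.cbslocal.com/2020/12/24/some-local-news-story/',  # hypothetical article URL
]
for url in urls:
    for name, pattern in (('CBSLocalIE', VIDEO_URL), ('CBSLocalArticleIE', ARTICLE_URL)):
        m = re.match(pattern, url)
        if m:
            print(name, '->', m.group('id'))
```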

youtube_dl/extractor/extractors.py

@@ -163,7 +163,10 @@ from .cbc import (
     CBCOlympicsIE,
 )
 from .cbs import CBSIE
-from .cbslocal import CBSLocalIE
+from .cbslocal import (
+    CBSLocalIE,
+    CBSLocalArticleIE,
+)
 from .cbsinteractive import CBSInteractiveIE
 from .cbsnews import (
     CBSNewsEmbedIE,

youtube_dl/extractor/facebook.py

@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import json
 import re
 import socket
@@ -8,6 +9,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_etree_fromstring,
     compat_http_client,
+    compat_str,
     compat_urllib_error,
     compat_urllib_parse_unquote,
     compat_urllib_parse_unquote_plus,
@@ -47,7 +49,8 @@ class FacebookIE(InfoExtractor):
                         )\?(?:.*?)(?:v|video_id|story_fbid)=|
                         [^/]+/videos/(?:[^/]+/)?|
                         [^/]+/posts/|
-                        groups/[^/]+/permalink/
+                        groups/[^/]+/permalink/|
+                        watchparty/
                     )|
                     facebook:
                 )
@@ -280,8 +283,18 @@ class FacebookIE(InfoExtractor):
         # data.video.creation_story.attachments[].media
         'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
         'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/watchparty/211641140192478',
+        'info_dict': {
+            'id': '211641140192478',
+        },
+        'playlist_count': 1,
+        'skip': 'Requires logging in',
     }]
     _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
+    _api_config = {
+        'graphURI': '/api/graphql/'
+    }

     @staticmethod
     def _extract_urls(webpage):
@@ -405,6 +418,17 @@ class FacebookIE(InfoExtractor):
         self._sort_formats(formats)

+        def extract_relay_data(_filter):
+            return self._parse_json(self._search_regex(
+                r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter,
+                webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
+
+        def extract_relay_prefetched_data(_filter):
+            replay_data = extract_relay_data(_filter)
+            for require in (replay_data.get('require') or []):
+                if require[0] == 'RelayPrefetchedStreamCache':
+                    return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
+
         if not video_data:
             server_js_data = self._parse_json(self._search_regex([
                 r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
@@ -413,87 +437,83 @@ class FacebookIE(InfoExtractor):
             video_data = extract_from_jsmods_instances(server_js_data)

         if not video_data:
-            graphql_data = self._parse_json(self._search_regex(
-                r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);',
-                webpage, 'graphql data', default='{}'), video_id, fatal=False) or {}
-            for require in (graphql_data.get('require') or []):
-                if require[0] == 'RelayPrefetchedStreamCache':
-                    entries = []
+            data = extract_relay_prefetched_data(
+                r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"')
+            if data:
+                entries = []

                 def parse_graphql_video(video):
                     formats = []
                     q = qualities(['sd', 'hd'])
                     for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
                         playable_url = video.get('playable_url' + suffix)
                         if not playable_url:
                             continue
                         formats.append({
                             'format_id': format_id,
                             'quality': q(format_id),
                             'url': playable_url,
                         })
                     extract_dash_manifest(video, formats)
                     process_formats(formats)
                     v_id = video.get('videoId') or video.get('id') or video_id
                     info = {
                         'id': v_id,
                         'formats': formats,
                         'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
                         'uploader_id': try_get(video, lambda x: x['owner']['id']),
                         'timestamp': int_or_none(video.get('publish_time')),
                         'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
                     }
                     description = try_get(video, lambda x: x['savable_description']['text'])
                     title = video.get('name')
                     if title:
                         info.update({
                             'title': title,
                             'description': description,
                         })
                     else:
                         info['title'] = description or 'Facebook video #%s' % v_id
                     entries.append(info)

                 def parse_attachment(attachment, key='media'):
                     media = attachment.get(key) or {}
                     if media.get('__typename') == 'Video':
                         return parse_graphql_video(media)

-                    data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
-
                 nodes = data.get('nodes') or []
                 node = data.get('node') or {}
                 if not nodes and node:
                     nodes.append(node)
                 for node in nodes:
                     story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
                     attachments = try_get(story, [
                         lambda x: x['attached_story']['attachments'],
                         lambda x: x['attachments']
                     ], list) or []
                     for attachment in attachments:
                         attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
                         ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
                         for n in ns:
                             parse_attachment(n)
                         parse_attachment(attachment)

                 edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
                 for edge in edges:
                     parse_attachment(edge, key='node')

                 video = data.get('video') or {}
                 if video:
                     attachments = try_get(video, [
                         lambda x: x['story']['attachments'],
                         lambda x: x['creation_story']['attachments']
                     ], list) or []
                     for attachment in attachments:
                         parse_attachment(attachment)
                     if not entries:
                         parse_graphql_video(video)

                 return self.playlist_result(entries, video_id)

         if not video_data:
             m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
@@ -504,6 +524,43 @@ class FacebookIE(InfoExtractor):
             elif '>You must log in to continue' in webpage:
                 self.raise_login_required()

+        if not video_data and '/watchparty/' in url:
+            post_data = {
+                'doc_id': 3731964053542869,
+                'variables': json.dumps({
+                    'livingRoomID': video_id,
+                }),
+            }
+
+            prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{')
+            if prefetched_data:
+                lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict)
+                if lsd:
+                    post_data[lsd['name']] = lsd['value']
+
+            relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')
+            for define in (relay_data.get('define') or []):
+                if define[0] == 'RelayAPIConfigDefaults':
+                    self._api_config = define[2]
+
+            living_room = self._download_json(
+                urljoin(url, self._api_config['graphURI']), video_id,
+                data=urlencode_postdata(post_data))['data']['living_room']
+
+            entries = []
+            for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []):
+                video = try_get(edge, lambda x: x['node']['video']) or {}
+                v_id = video.get('id')
+                if not v_id:
+                    continue
+                v_id = compat_str(v_id)
+                entries.append(self.url_result(
+                    self._VIDEO_PAGE_TEMPLATE % v_id,
+                    self.ie_key(), v_id, video.get('name')))
+
+            return self.playlist_result(entries, video_id)
+
         if not video_data:
             # Video info not in first request, do a secondary request using
             # tahoe player specific URL
             tahoe_data = self._download_webpage(
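For watch-party URLs the new code does not find video data in the page itself; it posts a GraphQL query (doc_id 3731964053542869 with a livingRoomID variable) to the endpoint taken from RelayAPIConfigDefaults and turns each watched_content edge into a regular video page URL. Below is a rough standard-library sketch of the request shape only; in practice it succeeds only with logged-in cookies and the lsd anti-CSRF token scraped from the page, which is why the test above is skipped.

```python
import json
from urllib.parse import urlencode
from urllib.request import Request


def build_watchparty_request(living_room_id, lsd_name=None, lsd_value=None):
    post_data = {
        'doc_id': 3731964053542869,  # GraphQL query id from the diff
        'variables': json.dumps({'livingRoomID': living_room_id}),
    }
    if lsd_name and lsd_value:  # anti-CSRF token scraped from the page markup
        post_data[lsd_name] = lsd_value
    return Request(
        'https://www.facebook.com/api/graphql/',  # default graphURI from _api_config
        data=urlencode(post_data).encode(),
        headers={'Content-Type': 'application/x-www-form-urlencoded'})

# The JSON response carries data.living_room.recap.watched_content.edges; each
# edge's node.video.id is then re-emitted as a normal facebook.com video URL.
```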

youtube_dl/extractor/sonyliv.py

@@ -1,40 +1,112 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import time
+import uuid
+
 from .common import InfoExtractor
-from ..utils import smuggle_url
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)


 class SonyLIVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-(?P<id>\d+)'
     _TESTS = [{
-        'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight",
+        'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true',
         'info_dict': {
-            'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight",
-            'id': 'ref:5024612095001',
+            'title': 'Bachelors Delight - Achaari Cheese Toast',
+            'id': '1000022678',
             'ext': 'mp4',
-            'upload_date': '20170923',
-            'description': 'md5:7f28509a148d5be9d0782b4d5106410d',
-            'uploader_id': '5182475815001',
-            'timestamp': 1506200547,
+            'upload_date': '20200411',
+            'description': 'md5:3957fa31d9309bf336ceb3f37ad5b7cb',
+            'timestamp': 1586632091,
+            'duration': 185,
+            'season_number': 1,
+            'episode': 'Achaari Cheese Toast',
+            'episode_number': 1,
+            'release_year': 2016,
         },
         'params': {
             'skip_download': True,
         },
-        'add_ie': ['BrightcoveNew'],
     }, {
-        'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)',
+        'url': 'https://www.sonyliv.com/movies/tahalka-1000050121?watch=true',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/clip/jigarbaaz-1000098925',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/trailer/sandwiched-forever-1000100286?watch=true',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/sports/india-tour-of-australia-2020-21-1700000286/cricket-hls-day-3-1st-test-aus-vs-ind-19-dec-2020-1000100959?watch=true',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/music-videos/yeh-un-dinon-ki-baat-hai-1000018779',
         'only_matching': True,
     }]
+    _GEO_COUNTRIES = ['IN']
+    _TOKEN = None

-    # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s'
-    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s'
+    def _call_api(self, version, path, video_id):
+        headers = {}
+        if self._TOKEN:
+            headers['security_token'] = self._TOKEN
+        try:
+            return self._download_json(
+                'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path),
+                video_id, headers=headers)['resultObj']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                message = self._parse_json(
+                    e.cause.read().decode(), video_id)['message']
+                if message == 'Geoblocked Country':
+                    self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+                raise ExtractorError(message)
+            raise
+
+    def _real_initialize(self):
+        self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None)

     def _real_extract(self, url):
-        brightcove_id = self._match_id(url)
-        return self.url_result(
-            smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {
-                'geo_countries': ['IN'],
-                'referrer': url,
-            }),
-            'BrightcoveNew', brightcove_id)
+        video_id = self._match_id(url)
+        content = self._call_api(
+            '1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id)
+        if content.get('isEncrypted'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+        dash_url = content['videoURL']
+        headers = {
+            'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000)
+        }
+        formats = self._extract_mpd_formats(
+            dash_url, video_id, mpd_id='dash', headers=headers, fatal=False)
+        formats.extend(self._extract_m3u8_formats(
+            dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/'),
+            video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False))
+        for f in formats:
+            f.setdefault('http_headers', {}).update(headers)
+        self._sort_formats(formats)
+
+        metadata = self._call_api(
+            '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata']
+        title = metadata['title']
+        episode = metadata.get('episodeTitle')
+        if episode and title != episode:
+            title += ' - ' + episode
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': content.get('posterURL'),
+            'description': metadata.get('longDescription') or metadata.get('shortDescription'),
+            'timestamp': int_or_none(metadata.get('creationDate'), 1000),
+            'duration': int_or_none(metadata.get('duration')),
+            'season_number': int_or_none(metadata.get('season')),
+            'episode': episode,
+            'episode_number': int_or_none(metadata.get('episodeNumber')),
+            'release_year': int_or_none(metadata.get('year')),
+        }
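The rewritten extractor replaces the old Brightcove hand-off with direct calls to the sonyliv.com API: fetch a token, resolve the manifest URL, then pull the metadata. A condensed standard-library sketch of that flow follows; the endpoints and the security_token header come from the diff, while error handling, the x-playback-session-id header and the HLS/DASH format parsing are left out.

```python
import json
from urllib.request import Request, urlopen

API = 'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s'


def call_api(version, path, token=None):
    headers = {'security_token': token} if token else {}
    with urlopen(Request(API % (version, path), headers=headers)) as resp:
        return json.load(resp)['resultObj']


def get_stream(video_id):
    token = call_api('1.4', 'ALL/GETTOKEN')                                   # _real_initialize
    content = call_api('1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, token)   # manifest URL
    metadata = call_api('1.6', 'IN/DETAIL/' + video_id, token)['containers'][0]['metadata']
    # content['videoURL'] is the DASH manifest; swapping /DASH/ for /HLS/ and
    # .mpd for .m3u8 yields the HLS variant, as the extractor does above.
    return content['videoURL'], metadata.get('title')
```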

youtube_dl/extractor/streetvoice.py

@@ -2,25 +2,40 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import unified_strdate
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    str_or_none,
+    strip_or_none,
+    try_get,
+    urljoin,
+)


 class StreetVoiceIE(InfoExtractor):
     _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)'
     _TESTS = [{
-        'url': 'http://streetvoice.com/skippylu/songs/94440/',
-        'md5': '15974627fc01a29e492c98593c2fd472',
+        'url': 'https://streetvoice.com/skippylu/songs/123688/',
+        'md5': '0eb535970629a5195685355f3ed60bfd',
         'info_dict': {
-            'id': '94440',
+            'id': '123688',
             'ext': 'mp3',
-            'title': '輸',
-            'description': 'Crispy脆樂團 - 輸',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 260,
-            'upload_date': '20091018',
+            'title': '流浪',
+            'description': 'md5:8eb0bfcc9dcd8aa82bd6efca66e3fea6',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 270,
+            'upload_date': '20100923',
             'uploader': 'Crispy脆樂團',
             'uploader_id': '627810',
+            'uploader_url': 're:^https?://streetvoice.com/skippylu/',
+            'timestamp': 1285261661,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'repost_count': int,
+            'track': '流浪',
+            'track_id': '123688',
+            'album': '2010',
         }
     }, {
         'url': 'http://tw.streetvoice.com/skippylu/songs/94440/',
@@ -29,21 +44,57 @@ class StreetVoiceIE(InfoExtractor):

     def _real_extract(self, url):
         song_id = self._match_id(url)
-
-        song = self._download_json(
-            'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'')
-
+        base_url = 'https://streetvoice.com/api/v4/song/%s/' % song_id
+        song = self._download_json(base_url, song_id, query={
+            'fields': 'album,comments_count,created_at,id,image,length,likes_count,name,nickname,plays_count,profile,share_count,synopsis,user,username',
+        })
         title = song['name']
-        author = song['user']['nickname']
+
+        formats = []
+        for suffix, format_id in [('hls/file', 'hls'), ('file', 'http'), ('file/original', 'original')]:
+            f_url = (self._download_json(
+                base_url + suffix + '/', song_id,
+                'Downloading %s format URL' % format_id,
+                data=b'', fatal=False) or {}).get('file')
+            if not f_url:
+                continue
+            f = {
+                'ext': 'mp3',
+                'format_id': format_id,
+                'url': f_url,
+                'vcodec': 'none',
+            }
+            if format_id == 'hls':
+                f['protocol'] = 'm3u8_native'
+            abr = self._search_regex(r'\.mp3\.(\d+)k', f_url, 'bitrate', default=None)
+            if abr:
+                abr = int(abr)
+                f.update({
+                    'abr': abr,
+                    'tbr': abr,
+                })
+            formats.append(f)
+
+        user = song.get('user') or {}
+        username = user.get('username')
+        get_count = lambda x: int_or_none(song.get(x + '_count'))

         return {
             'id': song_id,
-            'url': song['file'],
+            'formats': formats,
             'title': title,
-            'description': '%s - %s' % (author, title),
-            'thumbnail': self._proto_relative_url(song.get('image'), 'http:'),
-            'duration': song.get('length'),
-            'upload_date': unified_strdate(song.get('created_at')),
-            'uploader': author,
-            'uploader_id': compat_str(song['user']['id']),
+            'description': strip_or_none(song.get('synopsis')),
+            'thumbnail': song.get('image'),
+            'duration': int_or_none(song.get('length')),
+            'timestamp': parse_iso8601(song.get('created_at')),
+            'uploader': try_get(user, lambda x: x['profile']['nickname']),
+            'uploader_id': str_or_none(user.get('id')),
+            'uploader_url': urljoin(url, '/%s/' % username) if username else None,
+            'view_count': get_count('plays'),
+            'like_count': get_count('likes'),
+            'comment_count': get_count('comments'),
+            'repost_count': get_count('share'),
+            'track': title,
+            'track_id': song_id,
+            'album': try_get(song, lambda x: x['album']['name']),
         }
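The extractor now talks to the v4 API: one GET for the song metadata (with an explicit fields list) and one empty-body POST per format endpoint (hls/file, file, file/original). A rough standalone sketch of those calls is below; the fields list is shortened for brevity and the helper name is illustrative.

```python
import json
from urllib.request import Request, urlopen


def streetvoice_song(song_id):
    base_url = 'https://streetvoice.com/api/v4/song/%s/' % song_id
    # Song metadata: a plain GET, limited to a few of the fields requested above.
    song = json.load(urlopen(base_url + '?fields=name,length,synopsis,user'))
    # Per-format file URLs: each endpoint expects an empty POST body.
    urls = {}
    for suffix, format_id in (('hls/file', 'hls'), ('file', 'http'), ('file/original', 'original')):
        try:
            info = json.load(urlopen(Request(base_url + suffix + '/', data=b'')))
        except Exception:
            continue
        if info.get('file'):
            urls[format_id] = info['file']
    return song.get('name'), urls
```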

youtube_dl/extractor/youtube.py

@@ -1322,17 +1322,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         return self._parse_json(
             uppercase_escape(config), video_id, fatal=False)

-    def _get_automatic_captions(self, video_id, webpage):
+    def _get_automatic_captions(self, video_id, player_response, player_config):
         """We need the webpage for getting the captions url, pass it as an
         argument to speed up the process."""
         self.to_screen('%s: Looking for automatic captions' % video_id)
-        player_config = self._get_ytplayer_config(video_id, webpage)
         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
-        if not player_config:
+        if not (player_response or player_config):
             self._downloader.report_warning(err_msg)
             return {}
         try:
-            args = player_config['args']
+            args = player_config.get('args') if player_config else {}
             caption_url = args.get('ttsurl')
             if caption_url:
                 timestamp = args['timestamp']
@@ -1391,19 +1390,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 return captions

             # New captions format as of 22.06.2017
-            player_response = args.get('player_response')
-            if player_response and isinstance(player_response, compat_str):
-                player_response = self._parse_json(
-                    player_response, video_id, fatal=False)
-                if player_response:
-                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
-                    base_url = renderer['captionTracks'][0]['baseUrl']
-                    sub_lang_list = []
-                    for lang in renderer['translationLanguages']:
-                        lang_code = lang.get('languageCode')
-                        if lang_code:
-                            sub_lang_list.append(lang_code)
-                    return make_captions(base_url, sub_lang_list)
+            if player_response:
+                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+                base_url = renderer['captionTracks'][0]['baseUrl']
+                sub_lang_list = []
+                for lang in renderer['translationLanguages']:
+                    lang_code = lang.get('languageCode')
+                    if lang_code:
+                        sub_lang_list.append(lang_code)
+                return make_captions(base_url, sub_lang_list)

             # Some videos don't provide ttsurl but rather caption_tracks and
             # caption_translation_languages (e.g. 20LmZk1hakA)
@@ -1652,6 +1647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         # Get video info
         video_info = {}
         embed_webpage = None
+        ytplayer_config = None

         if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None:
             age_gate = True
@@ -2276,7 +2272,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):

         # subtitles
         video_subtitles = self.extract_subtitles(video_id, video_webpage)
-        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
+        automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)

         video_duration = try_get(
             video_info, lambda x: int_or_none(x['length_seconds'][0]))
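The automatic-captions path no longer re-downloads the player config from the webpage; the already-parsed player_response (and, when available, the ytplayer config) is passed in from _real_extract. A minimal sketch of the new lookup, given a player_response dict with the fields the diff reads:

```python
def automatic_caption_langs(player_response):
    # Caption track base URL plus the list of translation language codes, exactly
    # the fields the updated _get_automatic_captions reads from player_response.
    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
    base_url = renderer['captionTracks'][0]['baseUrl']
    langs = [lang['languageCode']
             for lang in renderer['translationLanguages']
             if lang.get('languageCode')]
    return base_url, langs
```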