[youtube] fix automatic captions extraction(closes #27162)(closes #27388)

[sonyliv] fix title for movies
[sonyliv] fix extraction(closes #25667)
2025-07-13 06:54:15 +09:00 · 2020-12-24 16:05:03 +01:00 · 2020-12-24 13:33:12 +01:00 · 2020-12-24 13:10:20 +01:00 · 2020-12-24 13:10:20 +01:00 · 2020-12-24 13:10:20 +01:00
7 changed files with 378 additions and 180 deletions
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -28,6 +28,7 @@ from ..utils import (
    parse_iso8601,
    smuggle_url,
    str_or_none,
+    try_get,
    unescapeHTML,
    unsmuggle_url,
    UnsupportedError,
@@ -600,24 +601,27 @@ class BrightcoveNewIE(AdobePassIE):
        store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x)

        def extract_policy_key():
-            webpage = self._download_webpage(
-                'http://players.brightcove.net/%s/%s_%s/index.min.js'
-                % (account_id, player_id, embed), video_id)
-
-            policy_key = None
-
-            catalog = self._search_regex(
-                r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
-            if catalog:
-                catalog = self._parse_json(
-                    js_to_json(catalog), video_id, fatal=False)
-                if catalog:
-                    policy_key = catalog.get('policyKey')
-
+            base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed)
+            config = self._download_json(
+                base_url + 'config.json', video_id, fatal=False) or {}
+            policy_key = try_get(
+                config, lambda x: x['video_cloud']['policy_key'])
            if not policy_key:
-                policy_key = self._search_regex(
-                    r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
-                    webpage, 'policy key', group='pk')
+                webpage = self._download_webpage(
+                    base_url + 'index.min.js', video_id)
+
+                catalog = self._search_regex(
+                    r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
+                if catalog:
+                    catalog = self._parse_json(
+                        js_to_json(catalog), video_id, fatal=False)
+                    if catalog:
+                        policy_key = catalog.get('policyKey')
+
+                if not policy_key:
+                    policy_key = self._search_regex(
+                        r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
+                        webpage, 'policy key', group='pk')

            store_pk(policy_key)
            return policy_key
--- a/youtube_dl/extractor/cbslocal.py
+++ b/youtube_dl/extractor/cbslocal.py
@@ -11,7 +11,47 @@ from ..utils import (


 class CBSLocalIE(AnvatoIE):
-    _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P<id>[0-9a-z-]+)'
+    _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/'
+    _VALID_URL = _VALID_URL_BASE + r'video/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
+        'info_dict': {
+            'id': '3580809',
+            'ext': 'mp4',
+            'title': 'A Very Blue Anniversary',
+            'description': 'CBS2’s Cindy Hsu has more.',
+            'thumbnail': 're:^https?://.*',
+            'timestamp': int,
+            'upload_date': r're:^\d{8}$',
+            'uploader': 'CBS',
+            'subtitles': {
+                'en': 'mincount:5',
+            },
+            'categories': [
+                'Stations\\Spoken Word\\WCBSTV',
+                'Syndication\\AOL',
+                'Syndication\\MSN',
+                'Syndication\\NDN',
+                'Syndication\\Yahoo',
+                'Content\\News',
+                'Content\\News\\Local News',
+            ],
+            'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mcp_id = self._match_id(url)
+        return self.url_result(
+            'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id)
+
+
+class CBSLocalArticleIE(AnvatoIE):
+    _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'

    _TESTS = [{
        # Anvato backend
@@ -52,31 +92,6 @@ class CBSLocalIE(AnvatoIE):
            # m3u8 download
            'skip_download': True,
        },
-    }, {
-        'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
-        'info_dict': {
-            'id': '3580809',
-            'ext': 'mp4',
-            'title': 'A Very Blue Anniversary',
-            'description': 'CBS2’s Cindy Hsu has more.',
-            'thumbnail': 're:^https?://.*',
-            'timestamp': int,
-            'upload_date': r're:^\d{8}$',
-            'uploader': 'CBS',
-            'subtitles': {
-                'en': 'mincount:5',
-            },
-            'categories': [
-                'Stations\\Spoken Word\\WCBSTV',
-                'Syndication\\AOL',
-                'Syndication\\MSN',
-                'Syndication\\NDN',
-                'Syndication\\Yahoo',
-                'Content\\News',
-                'Content\\News\\Local News',
-            ],
-            'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
-        },
    }]

    def _real_extract(self, url):
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -163,7 +163,10 @@ from .cbc import (
    CBCOlympicsIE,
 )
 from .cbs import CBSIE
-from .cbslocal import CBSLocalIE
+from .cbslocal import (
+    CBSLocalIE,
+    CBSLocalArticleIE,
+)
 from .cbsinteractive import CBSInteractiveIE
 from .cbsnews import (
    CBSNewsEmbedIE,
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import json
 import re
 import socket

@@ -8,6 +9,7 @@ from .common import InfoExtractor
 from ..compat import (
    compat_etree_fromstring,
    compat_http_client,
+    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_unquote_plus,
@@ -47,7 +49,8 @@ class FacebookIE(InfoExtractor):
                            )\?(?:.*?)(?:v|video_id|story_fbid)=|
                            [^/]+/videos/(?:[^/]+/)?|
                            [^/]+/posts/|
-                            groups/[^/]+/permalink/
+                            groups/[^/]+/permalink/|
+                            watchparty/
                        )|
                    facebook:
                )
@@ -280,8 +283,18 @@ class FacebookIE(InfoExtractor):
        # data.video.creation_story.attachments[].media
        'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
        'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/watchparty/211641140192478',
+        'info_dict': {
+            'id': '211641140192478',
+        },
+        'playlist_count': 1,
+        'skip': 'Requires logging in',
    }]
    _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
+    _api_config = {
+        'graphURI': '/api/graphql/'
+    }

    @staticmethod
    def _extract_urls(webpage):
@@ -405,6 +418,17 @@ class FacebookIE(InfoExtractor):

            self._sort_formats(formats)

+        def extract_relay_data(_filter):
+            return self._parse_json(self._search_regex(
+                r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter,
+                webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
+
+        def extract_relay_prefetched_data(_filter):
+            replay_data = extract_relay_data(_filter)
+            for require in (replay_data.get('require') or []):
+                if require[0] == 'RelayPrefetchedStreamCache':
+                    return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
+
        if not video_data:
            server_js_data = self._parse_json(self._search_regex([
                r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
@@ -413,87 +437,83 @@ class FacebookIE(InfoExtractor):
            video_data = extract_from_jsmods_instances(server_js_data)

        if not video_data:
-            graphql_data = self._parse_json(self._search_regex(
-                r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);',
-                webpage, 'graphql data', default='{}'), video_id, fatal=False) or {}
-            for require in (graphql_data.get('require') or []):
-                if require[0] == 'RelayPrefetchedStreamCache':
-                    entries = []
+            data = extract_relay_prefetched_data(
+                r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"')
+            if data:
+                entries = []

-                    def parse_graphql_video(video):
-                        formats = []
-                        q = qualities(['sd', 'hd'])
-                        for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
-                            playable_url = video.get('playable_url' + suffix)
-                            if not playable_url:
-                                continue
-                            formats.append({
-                                'format_id': format_id,
-                                'quality': q(format_id),
-                                'url': playable_url,
-                            })
-                        extract_dash_manifest(video, formats)
-                        process_formats(formats)
-                        v_id = video.get('videoId') or video.get('id') or video_id
-                        info = {
-                            'id': v_id,
-                            'formats': formats,
-                            'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
-                            'uploader_id': try_get(video, lambda x: x['owner']['id']),
-                            'timestamp': int_or_none(video.get('publish_time')),
-                            'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
-                        }
-                        description = try_get(video, lambda x: x['savable_description']['text'])
-                        title = video.get('name')
-                        if title:
-                            info.update({
-                                'title': title,
-                                'description': description,
-                            })
-                        else:
-                            info['title'] = description or 'Facebook video #%s' % v_id
-                        entries.append(info)
+                def parse_graphql_video(video):
+                    formats = []
+                    q = qualities(['sd', 'hd'])
+                    for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
+                        playable_url = video.get('playable_url' + suffix)
+                        if not playable_url:
+                            continue
+                        formats.append({
+                            'format_id': format_id,
+                            'quality': q(format_id),
+                            'url': playable_url,
+                        })
+                    extract_dash_manifest(video, formats)
+                    process_formats(formats)
+                    v_id = video.get('videoId') or video.get('id') or video_id
+                    info = {
+                        'id': v_id,
+                        'formats': formats,
+                        'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
+                        'uploader_id': try_get(video, lambda x: x['owner']['id']),
+                        'timestamp': int_or_none(video.get('publish_time')),
+                        'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
+                    }
+                    description = try_get(video, lambda x: x['savable_description']['text'])
+                    title = video.get('name')
+                    if title:
+                        info.update({
+                            'title': title,
+                            'description': description,
+                        })
+                    else:
+                        info['title'] = description or 'Facebook video #%s' % v_id
+                    entries.append(info)

-                    def parse_attachment(attachment, key='media'):
-                        media = attachment.get(key) or {}
-                        if media.get('__typename') == 'Video':
-                            return parse_graphql_video(media)
+                def parse_attachment(attachment, key='media'):
+                    media = attachment.get(key) or {}
+                    if media.get('__typename') == 'Video':
+                        return parse_graphql_video(media)

-                    data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
+                nodes = data.get('nodes') or []
+                node = data.get('node') or {}
+                if not nodes and node:
+                    nodes.append(node)
+                for node in nodes:
+                    story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
+                    attachments = try_get(story, [
+                        lambda x: x['attached_story']['attachments'],
+                        lambda x: x['attachments']
+                    ], list) or []
+                    for attachment in attachments:
+                        attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
+                        ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
+                        for n in ns:
+                            parse_attachment(n)
+                        parse_attachment(attachment)

-                    nodes = data.get('nodes') or []
-                    node = data.get('node') or {}
-                    if not nodes and node:
-                        nodes.append(node)
-                    for node in nodes:
-                        story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
-                        attachments = try_get(story, [
-                            lambda x: x['attached_story']['attachments'],
-                            lambda x: x['attachments']
-                        ], list) or []
-                        for attachment in attachments:
-                            attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
-                            ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
-                            for n in ns:
-                                parse_attachment(n)
-                            parse_attachment(attachment)
+                edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
+                for edge in edges:
+                    parse_attachment(edge, key='node')

-                    edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
-                    for edge in edges:
-                        parse_attachment(edge, key='node')
+                video = data.get('video') or {}
+                if video:
+                    attachments = try_get(video, [
+                        lambda x: x['story']['attachments'],
+                        lambda x: x['creation_story']['attachments']
+                    ], list) or []
+                    for attachment in attachments:
+                        parse_attachment(attachment)
+                    if not entries:
+                        parse_graphql_video(video)

-                    video = data.get('video') or {}
-                    if video:
-                        attachments = try_get(video, [
-                            lambda x: x['story']['attachments'],
-                            lambda x: x['creation_story']['attachments']
-                        ], list) or []
-                        for attachment in attachments:
-                            parse_attachment(attachment)
-                        if not entries:
-                            parse_graphql_video(video)
-
-                    return self.playlist_result(entries, video_id)
+                return self.playlist_result(entries, video_id)

        if not video_data:
            m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
@@ -504,6 +524,43 @@ class FacebookIE(InfoExtractor):
            elif '>You must log in to continue' in webpage:
                self.raise_login_required()

+        if not video_data and '/watchparty/' in url:
+            post_data = {
+                'doc_id': 3731964053542869,
+                'variables': json.dumps({
+                    'livingRoomID': video_id,
+                }),
+            }
+
+            prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{')
+            if prefetched_data:
+                lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict)
+                if lsd:
+                    post_data[lsd['name']] = lsd['value']
+
+            relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')
+            for define in (relay_data.get('define') or []):
+                if define[0] == 'RelayAPIConfigDefaults':
+                    self._api_config = define[2]
+
+            living_room = self._download_json(
+                urljoin(url, self._api_config['graphURI']), video_id,
+                data=urlencode_postdata(post_data))['data']['living_room']
+
+            entries = []
+            for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []):
+                video = try_get(edge, lambda x: x['node']['video']) or {}
+                v_id = video.get('id')
+                if not v_id:
+                    continue
+                v_id = compat_str(v_id)
+                entries.append(self.url_result(
+                    self._VIDEO_PAGE_TEMPLATE % v_id,
+                    self.ie_key(), v_id, video.get('name')))
+
+            return self.playlist_result(entries, video_id)
+
+        if not video_data:
            # Video info not in first request, do a secondary request using
            # tahoe player specific URL
            tahoe_data = self._download_webpage(
--- a/youtube_dl/extractor/sonyliv.py
+++ b/youtube_dl/extractor/sonyliv.py
@@ -1,40 +1,112 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import time
+import uuid
+
 from .common import InfoExtractor
-from ..utils import smuggle_url
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)


 class SonyLIVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-(?P<id>\d+)'
    _TESTS = [{
-        'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight",
+        'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true',
        'info_dict': {
-            'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight",
-            'id': 'ref:5024612095001',
+            'title': 'Bachelors Delight - Achaari Cheese Toast',
+            'id': '1000022678',
            'ext': 'mp4',
-            'upload_date': '20170923',
-            'description': 'md5:7f28509a148d5be9d0782b4d5106410d',
-            'uploader_id': '5182475815001',
-            'timestamp': 1506200547,
+            'upload_date': '20200411',
+            'description': 'md5:3957fa31d9309bf336ceb3f37ad5b7cb',
+            'timestamp': 1586632091,
+            'duration': 185,
+            'season_number': 1,
+            'episode': 'Achaari Cheese Toast',
+            'episode_number': 1,
+            'release_year': 2016,
        },
        'params': {
            'skip_download': True,
        },
-        'add_ie': ['BrightcoveNew'],
    }, {
-        'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)',
+        'url': 'https://www.sonyliv.com/movies/tahalka-1000050121?watch=true',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/clip/jigarbaaz-1000098925',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/trailer/sandwiched-forever-1000100286?watch=true',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/sports/india-tour-of-australia-2020-21-1700000286/cricket-hls-day-3-1st-test-aus-vs-ind-19-dec-2020-1000100959?watch=true',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/music-videos/yeh-un-dinon-ki-baat-hai-1000018779',
        'only_matching': True,
    }]
+    _GEO_COUNTRIES = ['IN']
+    _TOKEN = None

-    # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s'
-    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s'
+    def _call_api(self, version, path, video_id):
+        headers = {}
+        if self._TOKEN:
+            headers['security_token'] = self._TOKEN
+        try:
+            return self._download_json(
+                'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path),
+                video_id, headers=headers)['resultObj']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                message = self._parse_json(
+                    e.cause.read().decode(), video_id)['message']
+                if message == 'Geoblocked Country':
+                    self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+                raise ExtractorError(message)
+            raise
+
+    def _real_initialize(self):
+        self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None)

    def _real_extract(self, url):
-        brightcove_id = self._match_id(url)
-        return self.url_result(
-            smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {
-                'geo_countries': ['IN'],
-                'referrer': url,
-            }),
-            'BrightcoveNew', brightcove_id)
+        video_id = self._match_id(url)
+        content = self._call_api(
+            '1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id)
+        if content.get('isEncrypted'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+        dash_url = content['videoURL']
+        headers = {
+            'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000)
+        }
+        formats = self._extract_mpd_formats(
+            dash_url, video_id, mpd_id='dash', headers=headers, fatal=False)
+        formats.extend(self._extract_m3u8_formats(
+            dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/'),
+            video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False))
+        for f in formats:
+            f.setdefault('http_headers', {}).update(headers)
+        self._sort_formats(formats)
+
+        metadata = self._call_api(
+            '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata']
+        title = metadata['title']
+        episode = metadata.get('episodeTitle')
+        if episode and title != episode:
+            title += ' - ' + episode
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': content.get('posterURL'),
+            'description': metadata.get('longDescription') or metadata.get('shortDescription'),
+            'timestamp': int_or_none(metadata.get('creationDate'), 1000),
+            'duration': int_or_none(metadata.get('duration')),
+            'season_number': int_or_none(metadata.get('season')),
+            'episode': episode,
+            'episode_number': int_or_none(metadata.get('episodeNumber')),
+            'release_year': int_or_none(metadata.get('year')),
+        }
--- a/youtube_dl/extractor/streetvoice.py
+++ b/youtube_dl/extractor/streetvoice.py
@@ -2,25 +2,40 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import unified_strdate
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    str_or_none,
+    strip_or_none,
+    try_get,
+    urljoin,
+)


 class StreetVoiceIE(InfoExtractor):
    _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)'
    _TESTS = [{
-        'url': 'http://streetvoice.com/skippylu/songs/94440/',
-        'md5': '15974627fc01a29e492c98593c2fd472',
+        'url': 'https://streetvoice.com/skippylu/songs/123688/',
+        'md5': '0eb535970629a5195685355f3ed60bfd',
        'info_dict': {
-            'id': '94440',
+            'id': '123688',
            'ext': 'mp3',
-            'title': '輸',
-            'description': 'Crispy脆樂團 - 輸',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 260,
-            'upload_date': '20091018',
+            'title': '流浪',
+            'description': 'md5:8eb0bfcc9dcd8aa82bd6efca66e3fea6',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 270,
+            'upload_date': '20100923',
            'uploader': 'Crispy脆樂團',
            'uploader_id': '627810',
+            'uploader_url': 're:^https?://streetvoice.com/skippylu/',
+            'timestamp': 1285261661,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'repost_count': int,
+            'track': '流浪',
+            'track_id': '123688',
+            'album': '2010',
        }
    }, {
        'url': 'http://tw.streetvoice.com/skippylu/songs/94440/',
@@ -29,21 +44,57 @@ class StreetVoiceIE(InfoExtractor):

    def _real_extract(self, url):
        song_id = self._match_id(url)
-
-        song = self._download_json(
-            'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'')
-
+        base_url = 'https://streetvoice.com/api/v4/song/%s/' % song_id
+        song = self._download_json(base_url, song_id, query={
+            'fields': 'album,comments_count,created_at,id,image,length,likes_count,name,nickname,plays_count,profile,share_count,synopsis,user,username',
+        })
        title = song['name']
-        author = song['user']['nickname']
+
+        formats = []
+        for suffix, format_id in [('hls/file', 'hls'), ('file', 'http'), ('file/original', 'original')]:
+            f_url = (self._download_json(
+                base_url + suffix + '/', song_id,
+                'Downloading %s format URL' % format_id,
+                data=b'', fatal=False) or {}).get('file')
+            if not f_url:
+                continue
+            f = {
+                'ext': 'mp3',
+                'format_id': format_id,
+                'url': f_url,
+                'vcodec': 'none',
+            }
+            if format_id == 'hls':
+                f['protocol'] = 'm3u8_native'
+            abr = self._search_regex(r'\.mp3\.(\d+)k', f_url, 'bitrate', default=None)
+            if abr:
+                abr = int(abr)
+                f.update({
+                    'abr': abr,
+                    'tbr': abr,
+                })
+            formats.append(f)
+
+        user = song.get('user') or {}
+        username = user.get('username')
+        get_count = lambda x: int_or_none(song.get(x + '_count'))

        return {
            'id': song_id,
-            'url': song['file'],
+            'formats': formats,
            'title': title,
-            'description': '%s - %s' % (author, title),
-            'thumbnail': self._proto_relative_url(song.get('image'), 'http:'),
-            'duration': song.get('length'),
-            'upload_date': unified_strdate(song.get('created_at')),
-            'uploader': author,
-            'uploader_id': compat_str(song['user']['id']),
+            'description': strip_or_none(song.get('synopsis')),
+            'thumbnail': song.get('image'),
+            'duration': int_or_none(song.get('length')),
+            'timestamp': parse_iso8601(song.get('created_at')),
+            'uploader': try_get(user, lambda x: x['profile']['nickname']),
+            'uploader_id': str_or_none(user.get('id')),
+            'uploader_url': urljoin(url, '/%s/' % username) if username else None,
+            'view_count': get_count('plays'),
+            'like_count': get_count('likes'),
+            'comment_count': get_count('comments'),
+            'repost_count': get_count('share'),
+            'track': title,
+            'track_id': song_id,
+            'album': try_get(song, lambda x: x['album']['name']),
        }
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1322,17 +1322,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)

-    def _get_automatic_captions(self, video_id, webpage):
+    def _get_automatic_captions(self, video_id, player_response, player_config):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        self.to_screen('%s: Looking for automatic captions' % video_id)
-        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
-        if not player_config:
+        if not (player_response or player_config):
            self._downloader.report_warning(err_msg)
            return {}
        try:
-            args = player_config['args']
+            args = player_config.get('args') if player_config else {}
            caption_url = args.get('ttsurl')
            if caption_url:
                timestamp = args['timestamp']
@@ -1391,19 +1390,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                return captions

            # New captions format as of 22.06.2017
-            player_response = args.get('player_response')
-            if player_response and isinstance(player_response, compat_str):
-                player_response = self._parse_json(
-                    player_response, video_id, fatal=False)
-                if player_response:
-                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
-                    base_url = renderer['captionTracks'][0]['baseUrl']
-                    sub_lang_list = []
-                    for lang in renderer['translationLanguages']:
-                        lang_code = lang.get('languageCode')
-                        if lang_code:
-                            sub_lang_list.append(lang_code)
-                    return make_captions(base_url, sub_lang_list)
+            if player_response:
+                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+                base_url = renderer['captionTracks'][0]['baseUrl']
+                sub_lang_list = []
+                for lang in renderer['translationLanguages']:
+                    lang_code = lang.get('languageCode')
+                    if lang_code:
+                        sub_lang_list.append(lang_code)
+                return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
@@ -1652,6 +1647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        # Get video info
        video_info = {}
        embed_webpage = None
+        ytplayer_config = None

        if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None:
            age_gate = True
@@ -2276,7 +2272,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)
-        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
+        automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)

        video_duration = try_get(
            video_info, lambda x: int_or_none(x['length_seconds'][0]))
Author	SHA1	Message	Date
Remita Amine	4ef1fc9707	[youtube] fix automatic captions extraction(closes #27162)(closes #27388)	2020-12-24 16:05:03 +01:00
Remita Amine	f9e6aa1dcf	[sonyliv] fix title for movies	2020-12-24 13:33:12 +01:00
Remita Amine	f83db9064b	[sonyliv] fix extraction(closes #25667)	2020-12-24 13:10:20 +01:00
Remita Amine	2da9a86399	[streetvoice] fix extraction(closes #27455)(closes #27492)	2020-12-24 13:10:20 +01:00
Remita Amine	ecaa535cf4	[facebook] add support for watchparty pages(closes #27507)	2020-12-24 13:10:20 +01:00
Remita Amine	79dd92b1fe	[cbslocal] fix video extraction	2020-12-24 13:10:20 +01:00
Remita Amine	bd3844c9c2	[brightcove] add another method to extract policyKey	2020-12-24 13:10:20 +01:00