[viki] improve format extraction

[viki] fix stream extraction from mpd (#27092 )
Co-authored-by: beefchop <beefchop@users.noreply.github.com>
2025-06-04 20:12:40 +09:00 · 2020-11-19 22:49:28 +01:00 · 2020-11-19 21:38:09 +01:00 · 2020-11-19 20:01:24 +01:00 · 2020-11-19 17:29:30 +01:00 · 2020-11-19 17:26:53 +01:00
4 changed files with 208 additions and 93 deletions
--- a/youtube_dl/extractor/amara.py
+++ b/youtube_dl/extractor/amara.py
@ -0,0 +1,103 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from .youtube import YoutubeIE
 from .vimeo import VimeoIE
 from ..utils import (
    int_or_none,
    parse_iso8601,
    update_url_query,
 )
 class AmaraIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)'
    _TESTS = [{
        # Youtube
        'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video',
        'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae',
        'info_dict': {
            'id': 'h6ZuVdvYnfE',
            'ext': 'mp4',
            'title': 'Why jury trials are becoming less common',
            'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': dict,
            'upload_date': '20160813',
            'uploader': 'PBS NewsHour',
            'uploader_id': 'PBSNewsHour',
            'timestamp': 1549639570,
        }
    }, {
        # Vimeo
        'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011',
        'md5': '99392c75fa05d432a8f11df03612195e',
        'info_dict': {
            'id': '18622084',
            'ext': 'mov',
            'title': 'Vimeo at CES 2011!',
            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': dict,
            'timestamp': 1294763658,
            'upload_date': '20110111',
            'uploader': 'Sam Morrill',
            'uploader_id': 'sammorrill'
        }
    }, {
        # Direct Link
        'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/',
        'md5': 'd3970f08512738ee60c5807311ff5d3f',
        'info_dict': {
            'id': 's8KL7I3jLmh6',
            'ext': 'mp4',
            'title': 'The danger of a single story',
            'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': dict,
            'upload_date': '20091007',
            'timestamp': 1254942511,
        }
    }]
    def _real_extract(self, url):
        video_id = self._match_id(url)
        meta = self._download_json(
            'https://amara.org/api/videos/%s/' % video_id,
            video_id, query={'format': 'json'})
        title = meta['title']
        video_url = meta['all_urls'][0]
        subtitles = {}
        for language in (meta.get('languages') or []):
            subtitles_uri = language.get('subtitles_uri')
            if not (subtitles_uri and language.get('published')):
                continue
            subtitle = subtitles.setdefault(language.get('code') or 'en', [])
            for f in ('json', 'srt', 'vtt'):
                subtitle.append({
                    'ext': f,
                    'url': update_url_query(subtitles_uri, {'format': f}),
                })
        info = {
            'url': video_url,
            'id': video_id,
            'subtitles': subtitles,
            'title': title,
            'description': meta.get('description'),
            'thumbnail': meta.get('thumbnail'),
            'duration': int_or_none(meta.get('duration')),
            'timestamp': parse_iso8601(meta.get('created')),
        }
        for ie in (YoutubeIE, VimeoIE):
            if ie.suitable(video_url):
                info.update({
                    '_type': 'url_transparent',
                    'ie_key': ie.ie_key(),
                })
                break
        return info
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -36,6 +36,7 @@ from .afreecatv import AfreecaTVIE
 from .airmozilla import AirMozillaIE
 from .aljazeera import AlJazeeraIE
 from .alphaporno import AlphaPornoIE
 from .amara import AmaraIE
 from .amcnetworks import AMCNetworksIE
 from .americastestkitchen import AmericasTestKitchenIE
 from .animeondemand import AnimeOnDemandIE
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@ -3,11 +3,13 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..compat import compat_parse_qs
 from ..utils import (
    determine_ext,
    ExtractorError,
    int_or_none,
    lowercase_escape,
    try_get,
    update_url_query,
 )
@ -38,21 +40,10 @@ class GoogleDriveIE(InfoExtractor):
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
-        'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
+        'only_matching': True,
        'info_dict': {
            'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
            'ext': 'mp4',
            'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
        }
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'info_dict': {
            'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
            'ext': 'mp4',
            'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
            'duration': 189,
        },
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
@ -171,23 +162,21 @@ class GoogleDriveIE(InfoExtractor):
    def _real_extract(self, url):
        video_id = self._match_id(url)
-        webpage = self._download_webpage(
+        video_info = compat_parse_qs(self._download_webpage(
-            'http://docs.google.com/file/d/%s' % video_id, video_id)
+            'https://drive.google.com/get_video_info',
            video_id, query={'docid': video_id}))
-        title = self._search_regex(
+        def get_value(key):
-            r'"title"\s*,\s*"([^"]+)', webpage, 'title',
+            return try_get(video_info, lambda x: x[key][0])
-            default=None) or self._og_search_title(webpage)
+
-        duration = int_or_none(self._search_regex(
+        reason = get_value('reason')
-            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
+        title = get_value('title')
-            default=None))
+        if not title and reason:
            raise ExtractorError(reason, expected=True)
        formats = []
-        fmt_stream_map = self._search_regex(
+        fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
-            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
+        fmt_list = (get_value('fmt_list') or '').split(',')
            'fmt stream map', default='').split(',')
        fmt_list = self._search_regex(
            r'"fmt_list"\s*,\s*"([^"]+)', webpage,
            'fmt_list', default='').split(',')
        if fmt_stream_map and fmt_list:
            resolutions = {}
            for fmt in fmt_list:
@ -257,19 +246,14 @@ class GoogleDriveIE(InfoExtractor):
                        if urlh and urlh.headers.get('Content-Disposition'):
                            add_source_format(urlh)
-        if not formats:
+        if not formats and reason:
-            reason = self._search_regex(
+            raise ExtractorError(reason, expected=True)
                r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
            if reason:
                raise ExtractorError(reason, expected=True)
        self._sort_formats(formats)
-        hl = self._search_regex(
+        hl = get_value('hl')
            r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
        subtitles_id = None
-        ttsurl = self._search_regex(
+        ttsurl = get_value('ttsurl')
            r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
        if ttsurl:
            # the video Id for subtitles will be the last value in the ttsurl
            # query string
@ -279,8 +263,8 @@ class GoogleDriveIE(InfoExtractor):
        return {
            'id': video_id,
            'title': title,
-            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
-            'duration': duration,
+            'duration': int_or_none(get_value('length_seconds')),
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
            'automatic_captions': self.extract_automatic_captions(
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import base64
 import hashlib
 import hmac
 import itertools
@ -9,6 +10,10 @@ import re
 import time
 from .common import InfoExtractor
 from ..compat import (
    compat_parse_qs,
    compat_urllib_parse_urlparse,
 )
 from ..utils import (
    ExtractorError,
    int_or_none,
@ -165,19 +170,20 @@ class VikiIE(VikiBaseIE):
    }, {
        # episode
        'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
-        'md5': '5fa476a902e902783ac7a4d615cdbc7a',
+        'md5': '94e0e34fd58f169f40c184f232356cfe',
        'info_dict': {
            'id': '44699v',
            'ext': 'mp4',
            'title': 'Boys Over Flowers - Episode 1',
            'description': 'md5:b89cf50038b480b88b5b3c93589a9076',
-            'duration': 4204,
+            'duration': 4172,
            'timestamp': 1270496524,
            'upload_date': '20100405',
            'uploader': 'group8',
            'like_count': int,
            'age_limit': 13,
-        }
+        },
        'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
    }, {
        # youtube external
        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@ -194,14 +200,15 @@ class VikiIE(VikiBaseIE):
            'uploader_id': 'ad14065n',
            'like_count': int,
            'age_limit': 13,
-        }
+        },
        'skip': 'Page not found!',
    }, {
        'url': 'http://www.viki.com/player/44699v',
        'only_matching': True,
    }, {
        # non-English description
        'url': 'http://www.viki.com/videos/158036v-love-in-magic',
-        'md5': '1713ae35df5a521b31f6dc40730e7c9c',
+        'md5': 'adf9e321a0ae5d0aace349efaaff7691',
        'info_dict': {
            'id': '158036v',
            'ext': 'mp4',
@ -217,8 +224,11 @@ class VikiIE(VikiBaseIE):
    def _real_extract(self, url):
        video_id = self._match_id(url)
-        video = self._call_api(
+        resp = self._download_json(
-            'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
+            'https://www.viki.com/api/videos/' + video_id,
            video_id, 'Downloading video JSON',
            headers={'x-viki-app-ver': '4.0.57'})
        video = resp['video']
        self._check_errors(video)
@ -265,57 +275,74 @@ class VikiIE(VikiBaseIE):
            'subtitles': subtitles,
        }
        streams = self._call_api(
            'videos/%s/streams.json' % video_id, video_id,
            'Downloading video streams JSON')
        if 'external' in streams:
            result.update({
                '_type': 'url_transparent',
                'url': streams['external']['url'],
            })
            return result
        formats = []
-        for format_id, stream_dict in streams.items():
+
-            height = int_or_none(self._search_regex(
+        def add_format(format_id, format_dict, protocol='http'):
-                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            # rtmps URLs does not seem to work
-            for protocol, format_dict in stream_dict.items():
+            if protocol == 'rtmps':
-                # rtmps URLs does not seem to work
+                return
-                if protocol == 'rtmps':
+            format_url = format_dict.get('url')
-                    continue
+            if not format_url:
-                format_url = format_dict['url']
+                return
-                if format_id == 'm3u8':
+            qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query)
-                    m3u8_formats = self._extract_m3u8_formats(
+            stream = qs.get('stream', [None])[0]
-                        format_url, video_id, 'mp4',
+            if stream:
-                        entry_protocol='m3u8_native',
+                format_url = base64.b64decode(stream).decode()
-                        m3u8_id='m3u8-%s' % protocol, fatal=False)
+            if format_id in ('m3u8', 'hls'):
-                    # Despite CODECS metadata in m3u8 all video-only formats
+                m3u8_formats = self._extract_m3u8_formats(
-                    # are actually video+audio
+                    format_url, video_id, 'mp4',
-                    for f in m3u8_formats:
+                    entry_protocol='m3u8_native',
-                        if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
+                    m3u8_id='m3u8-%s' % protocol, fatal=False)
-                            f['acodec'] = None
+                # Despite CODECS metadata in m3u8 all video-only formats
-                    formats.extend(m3u8_formats)
+                # are actually video+audio
-                elif format_url.startswith('rtmp'):
+                for f in m3u8_formats:
-                    mobj = re.search(
+                    if '_drm/index_' in f['url']:
                        r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
                        format_url)
                    if not mobj:
                        continue
-                    formats.append({
+                    if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
-                        'format_id': 'rtmp-%s' % format_id,
+                        f['acodec'] = None
-                        'ext': 'flv',
+                    formats.append(f)
-                        'url': mobj.group('url'),
+            elif format_id in ('mpd', 'dash'):
-                        'play_path': mobj.group('playpath'),
+                formats.extend(self._extract_mpd_formats(
-                        'app': mobj.group('app'),
+                    format_url, video_id, 'mpd-%s' % protocol, fatal=False))
-                        'page_url': url,
+            elif format_url.startswith('rtmp'):
-                    })
+                mobj = re.search(
-                else:
+                    r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
-                    formats.append({
+                    format_url)
-                        'url': format_url,
+                if not mobj:
-                        'format_id': '%s-%s' % (format_id, protocol),
+                    return
-                        'height': height,
+                formats.append({
-                    })
+                    'format_id': 'rtmp-%s' % format_id,
                    'ext': 'flv',
                    'url': mobj.group('url'),
                    'play_path': mobj.group('playpath'),
                    'app': mobj.group('app'),
                    'page_url': url,
                })
            else:
                formats.append({
                    'url': format_url,
                    'format_id': '%s-%s' % (format_id, protocol),
                    'height': int_or_none(self._search_regex(
                        r'^(\d+)[pP]$', format_id, 'height', default=None)),
                })
        for format_id, format_dict in (resp.get('streams') or {}).items():
            add_format(format_id, format_dict)
        if not formats:
            streams = self._call_api(
                'videos/%s/streams.json' % video_id, video_id,
                'Downloading video streams JSON')
            if 'external' in streams:
                result.update({
                    '_type': 'url_transparent',
                    'url': streams['external']['url'],
                })
                return result
            for format_id, stream_dict in streams.items():
                for protocol, format_dict in stream_dict.items():
                    add_format(format_id, format_dict, protocol)
        self._sort_formats(formats)
        result['formats'] = formats
Author	SHA1	Message	Date
Remita Amine	59e583f7e8	[viki] improve format extraction	2020-11-19 22:49:28 +01:00
beefchop	daa25d4142	[viki] fix stream extraction from mpd (#27092 ) Co-authored-by: beefchop <beefchop@users.noreply.github.com>	2020-11-19 21:38:09 +01:00
Remita Amine	25a35cb38a	[googledrive] fix format extraction(closes #26979 )	2020-11-19 20:01:24 +01:00
Remita Amine	2cf8003638	[amara] improve extraction	2020-11-19 17:29:30 +01:00
Joost Verdoorn	cf1a8668e8	[Amara] Add new extractor (#20618 ) * [Amara] Add new extractor	2020-11-19 17:26:53 +01:00