[viki] improve format extraction

[viki] fix stream extraction from mpd (#27092)
Co-authored-by: beefchop <beefchop@users.noreply.github.com>
2025-06-04 20:12:40 +09:00 · 2020-11-19 22:49:28 +01:00 · 2020-11-19 21:38:09 +01:00 · 2020-11-19 20:01:24 +01:00 · 2020-11-19 17:29:30 +01:00 · 2020-11-19 17:26:53 +01:00
4 changed files with 208 additions and 93 deletions
--- a/youtube_dl/extractor/amara.py
+++ b/youtube_dl/extractor/amara.py
@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from .vimeo import VimeoIE
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    update_url_query,
+)
+
+
+class AmaraIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)'
+    _TESTS = [{
+        # Youtube
+        'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video',
+        'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae',
+        'info_dict': {
+            'id': 'h6ZuVdvYnfE',
+            'ext': 'mp4',
+            'title': 'Why jury trials are becoming less common',
+            'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': dict,
+            'upload_date': '20160813',
+            'uploader': 'PBS NewsHour',
+            'uploader_id': 'PBSNewsHour',
+            'timestamp': 1549639570,
+        }
+    }, {
+        # Vimeo
+        'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011',
+        'md5': '99392c75fa05d432a8f11df03612195e',
+        'info_dict': {
+            'id': '18622084',
+            'ext': 'mov',
+            'title': 'Vimeo at CES 2011!',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': dict,
+            'timestamp': 1294763658,
+            'upload_date': '20110111',
+            'uploader': 'Sam Morrill',
+            'uploader_id': 'sammorrill'
+        }
+    }, {
+        # Direct Link
+        'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/',
+        'md5': 'd3970f08512738ee60c5807311ff5d3f',
+        'info_dict': {
+            'id': 's8KL7I3jLmh6',
+            'ext': 'mp4',
+            'title': 'The danger of a single story',
+            'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': dict,
+            'upload_date': '20091007',
+            'timestamp': 1254942511,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        meta = self._download_json(
+            'https://amara.org/api/videos/%s/' % video_id,
+            video_id, query={'format': 'json'})
+        title = meta['title']
+        video_url = meta['all_urls'][0]
+
+        subtitles = {}
+        for language in (meta.get('languages') or []):
+            subtitles_uri = language.get('subtitles_uri')
+            if not (subtitles_uri and language.get('published')):
+                continue
+            subtitle = subtitles.setdefault(language.get('code') or 'en', [])
+            for f in ('json', 'srt', 'vtt'):
+                subtitle.append({
+                    'ext': f,
+                    'url': update_url_query(subtitles_uri, {'format': f}),
+                })
+
+        info = {
+            'url': video_url,
+            'id': video_id,
+            'subtitles': subtitles,
+            'title': title,
+            'description': meta.get('description'),
+            'thumbnail': meta.get('thumbnail'),
+            'duration': int_or_none(meta.get('duration')),
+            'timestamp': parse_iso8601(meta.get('created')),
+        }
+
+        for ie in (YoutubeIE, VimeoIE):
+            if ie.suitable(video_url):
+                info.update({
+                    '_type': 'url_transparent',
+                    'ie_key': ie.ie_key(),
+                })
+                break
+
+        return info
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -36,6 +36,7 @@ from .afreecatv import AfreecaTVIE
 from .airmozilla import AirMozillaIE
 from .aljazeera import AlJazeeraIE
 from .alphaporno import AlphaPornoIE
+from .amara import AmaraIE
 from .amcnetworks import AMCNetworksIE
 from .americastestkitchen import AmericasTestKitchenIE
 from .animeondemand import AnimeOnDemandIE
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@ -3,11 +3,13 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
+from ..compat import compat_parse_qs
 from ..utils import (
    determine_ext,
    ExtractorError,
    int_or_none,
    lowercase_escape,
+    try_get,
    update_url_query,
 )

@ -38,21 +40,10 @@ class GoogleDriveIE(InfoExtractor):
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
-        'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
-        'info_dict': {
-            'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
-            'ext': 'mp4',
-            'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
-        }
+        'only_matching': True,
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
-        'info_dict': {
-            'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
-            'ext': 'mp4',
-            'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
-            'duration': 189,
-        },
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
@ -171,23 +162,21 @@ class GoogleDriveIE(InfoExtractor):

    def _real_extract(self, url):
        video_id = self._match_id(url)
-        webpage = self._download_webpage(
-            'http://docs.google.com/file/d/%s' % video_id, video_id)
+        video_info = compat_parse_qs(self._download_webpage(
+            'https://drive.google.com/get_video_info',
+            video_id, query={'docid': video_id}))

-        title = self._search_regex(
-            r'"title"\s*,\s*"([^"]+)', webpage, 'title',
-            default=None) or self._og_search_title(webpage)
-        duration = int_or_none(self._search_regex(
-            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
-            default=None))
+        def get_value(key):
+            return try_get(video_info, lambda x: x[key][0])
+
+        reason = get_value('reason')
+        title = get_value('title')
+        if not title and reason:
+            raise ExtractorError(reason, expected=True)

        formats = []
-        fmt_stream_map = self._search_regex(
-            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
-            'fmt stream map', default='').split(',')
-        fmt_list = self._search_regex(
-            r'"fmt_list"\s*,\s*"([^"]+)', webpage,
-            'fmt_list', default='').split(',')
+        fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
+        fmt_list = (get_value('fmt_list') or '').split(',')
        if fmt_stream_map and fmt_list:
            resolutions = {}
            for fmt in fmt_list:
@ -257,19 +246,14 @@ class GoogleDriveIE(InfoExtractor):
                        if urlh and urlh.headers.get('Content-Disposition'):
                            add_source_format(urlh)

-        if not formats:
-            reason = self._search_regex(
-                r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
-            if reason:
-                raise ExtractorError(reason, expected=True)
+        if not formats and reason:
+            raise ExtractorError(reason, expected=True)

        self._sort_formats(formats)

-        hl = self._search_regex(
-            r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
+        hl = get_value('hl')
        subtitles_id = None
-        ttsurl = self._search_regex(
-            r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
+        ttsurl = get_value('ttsurl')
        if ttsurl:
            # the video Id for subtitles will be the last value in the ttsurl
            # query string
@ -279,8 +263,8 @@ class GoogleDriveIE(InfoExtractor):
        return {
            'id': video_id,
            'title': title,
-            'thumbnail': self._og_search_thumbnail(webpage, default=None),
-            'duration': duration,
+            'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
+            'duration': int_or_none(get_value('length_seconds')),
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
            'automatic_captions': self.extract_automatic_captions(
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import base64
 import hashlib
 import hmac
 import itertools
@ -9,6 +10,10 @@ import re
 import time

 from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
 from ..utils import (
    ExtractorError,
    int_or_none,
@ -165,19 +170,20 @@ class VikiIE(VikiBaseIE):
    }, {
        # episode
        'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
-        'md5': '5fa476a902e902783ac7a4d615cdbc7a',
+        'md5': '94e0e34fd58f169f40c184f232356cfe',
        'info_dict': {
            'id': '44699v',
            'ext': 'mp4',
            'title': 'Boys Over Flowers - Episode 1',
            'description': 'md5:b89cf50038b480b88b5b3c93589a9076',
-            'duration': 4204,
+            'duration': 4172,
            'timestamp': 1270496524,
            'upload_date': '20100405',
            'uploader': 'group8',
            'like_count': int,
            'age_limit': 13,
-        }
+        },
+        'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
    }, {
        # youtube external
        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@ -194,14 +200,15 @@ class VikiIE(VikiBaseIE):
            'uploader_id': 'ad14065n',
            'like_count': int,
            'age_limit': 13,
-        }
+        },
+        'skip': 'Page not found!',
    }, {
        'url': 'http://www.viki.com/player/44699v',
        'only_matching': True,
    }, {
        # non-English description
        'url': 'http://www.viki.com/videos/158036v-love-in-magic',
-        'md5': '1713ae35df5a521b31f6dc40730e7c9c',
+        'md5': 'adf9e321a0ae5d0aace349efaaff7691',
        'info_dict': {
            'id': '158036v',
            'ext': 'mp4',
@ -217,8 +224,11 @@ class VikiIE(VikiBaseIE):
    def _real_extract(self, url):
        video_id = self._match_id(url)

-        video = self._call_api(
-            'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
+        resp = self._download_json(
+            'https://www.viki.com/api/videos/' + video_id,
+            video_id, 'Downloading video JSON',
+            headers={'x-viki-app-ver': '4.0.57'})
+        video = resp['video']

        self._check_errors(video)

@ -265,57 +275,74 @@ class VikiIE(VikiBaseIE):
            'subtitles': subtitles,
        }

-        streams = self._call_api(
-            'videos/%s/streams.json' % video_id, video_id,
-            'Downloading video streams JSON')
-
-        if 'external' in streams:
-            result.update({
-                '_type': 'url_transparent',
-                'url': streams['external']['url'],
-            })
-            return result
-
        formats = []
-        for format_id, stream_dict in streams.items():
-            height = int_or_none(self._search_regex(
-                r'^(\d+)[pP]$', format_id, 'height', default=None))
-            for protocol, format_dict in stream_dict.items():
-                # rtmps URLs does not seem to work
-                if protocol == 'rtmps':
-                    continue
-                format_url = format_dict['url']
-                if format_id == 'm3u8':
-                    m3u8_formats = self._extract_m3u8_formats(
-                        format_url, video_id, 'mp4',
-                        entry_protocol='m3u8_native',
-                        m3u8_id='m3u8-%s' % protocol, fatal=False)
-                    # Despite CODECS metadata in m3u8 all video-only formats
-                    # are actually video+audio
-                    for f in m3u8_formats:
-                        if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
-                            f['acodec'] = None
-                    formats.extend(m3u8_formats)
-                elif format_url.startswith('rtmp'):
-                    mobj = re.search(
-                        r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
-                        format_url)
-                    if not mobj:
+
+        def add_format(format_id, format_dict, protocol='http'):
+            # rtmps URLs do not seem to work
+            if protocol == 'rtmps':
+                return
+            format_url = format_dict.get('url')
+            if not format_url:
+                return
+            qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query)
+            stream = qs.get('stream', [None])[0]
+            if stream:
+                format_url = base64.b64decode(stream).decode()
+            if format_id in ('m3u8', 'hls'):
+                m3u8_formats = self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4',
+                    entry_protocol='m3u8_native',
+                    m3u8_id='m3u8-%s' % protocol, fatal=False)
+                # Despite CODECS metadata in m3u8 all video-only formats
+                # are actually video+audio
+                for f in m3u8_formats:
+                    if '_drm/index_' in f['url']:
                        continue
-                    formats.append({
-                        'format_id': 'rtmp-%s' % format_id,
-                        'ext': 'flv',
-                        'url': mobj.group('url'),
-                        'play_path': mobj.group('playpath'),
-                        'app': mobj.group('app'),
-                        'page_url': url,
-                    })
-                else:
-                    formats.append({
-                        'url': format_url,
-                        'format_id': '%s-%s' % (format_id, protocol),
-                        'height': height,
-                    })
+                    if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
+                        f['acodec'] = None
+                    formats.append(f)
+            elif format_id in ('mpd', 'dash'):
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, 'mpd-%s' % protocol, fatal=False))
+            elif format_url.startswith('rtmp'):
+                mobj = re.search(
+                    r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
+                    format_url)
+                if not mobj:
+                    return
+                formats.append({
+                    'format_id': 'rtmp-%s' % format_id,
+                    'ext': 'flv',
+                    'url': mobj.group('url'),
+                    'play_path': mobj.group('playpath'),
+                    'app': mobj.group('app'),
+                    'page_url': url,
+                })
+            else:
+                formats.append({
+                    'url': format_url,
+                    'format_id': '%s-%s' % (format_id, protocol),
+                    'height': int_or_none(self._search_regex(
+                        r'^(\d+)[pP]$', format_id, 'height', default=None)),
+                })
+
+        for format_id, format_dict in (resp.get('streams') or {}).items():
+            add_format(format_id, format_dict)
+        if not formats:
+            streams = self._call_api(
+                'videos/%s/streams.json' % video_id, video_id,
+                'Downloading video streams JSON')
+
+            if 'external' in streams:
+                result.update({
+                    '_type': 'url_transparent',
+                    'url': streams['external']['url'],
+                })
+                return result
+
+            for format_id, stream_dict in streams.items():
+                for protocol, format_dict in stream_dict.items():
+                    add_format(format_id, format_dict, protocol)
        self._sort_formats(formats)

        result['formats'] = formats
Author	SHA1	Message	Date
Remita Amine	59e583f7e8	[viki] improve format extraction	2020-11-19 22:49:28 +01:00
beefchop	daa25d4142	[viki] fix stream extraction from mpd (#27092) Co-authored-by: beefchop <beefchop@users.noreply.github.com>	2020-11-19 21:38:09 +01:00
Remita Amine	25a35cb38a	[googledrive] fix format extraction (closes #26979)	2020-11-19 20:01:24 +01:00
Remita Amine	2cf8003638	[amara] improve extraction	2020-11-19 17:29:30 +01:00
Joost Verdoorn	cf1a8668e8	[Amara] Add new extractor (#20618) * [Amara] Add new extractor	2020-11-19 17:26:53 +01:00