Compare commits

...

5 Commits

Author SHA1 Message Date
Remita Amine
59e583f7e8 [viki] improve format extraction 2020-11-19 22:49:28 +01:00
beefchop
daa25d4142
[viki] fix stream extraction from mpd (#27092)
Co-authored-by: beefchop <beefchop@users.noreply.github.com>
2020-11-19 21:38:09 +01:00
Remita Amine
25a35cb38a [googledrive] fix format extraction(closes #26979) 2020-11-19 20:01:24 +01:00
Remita Amine
2cf8003638 [amara] improve extraction 2020-11-19 17:29:30 +01:00
Joost Verdoorn
cf1a8668e8
[Amara] Add new extractor (#20618)
* [Amara] Add new extractor
2020-11-19 17:26:53 +01:00
4 changed files with 208 additions and 93 deletions

View File

@ -0,0 +1,103 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .youtube import YoutubeIE
from .vimeo import VimeoIE
from ..utils import (
int_or_none,
parse_iso8601,
update_url_query,
)
class AmaraIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)'
_TESTS = [{
# Youtube
'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video',
'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae',
'info_dict': {
'id': 'h6ZuVdvYnfE',
'ext': 'mp4',
'title': 'Why jury trials are becoming less common',
'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1',
'thumbnail': r're:^https?://.*\.jpg$',
'subtitles': dict,
'upload_date': '20160813',
'uploader': 'PBS NewsHour',
'uploader_id': 'PBSNewsHour',
'timestamp': 1549639570,
}
}, {
# Vimeo
'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011',
'md5': '99392c75fa05d432a8f11df03612195e',
'info_dict': {
'id': '18622084',
'ext': 'mov',
'title': 'Vimeo at CES 2011!',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$',
'subtitles': dict,
'timestamp': 1294763658,
'upload_date': '20110111',
'uploader': 'Sam Morrill',
'uploader_id': 'sammorrill'
}
}, {
# Direct Link
'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/',
'md5': 'd3970f08512738ee60c5807311ff5d3f',
'info_dict': {
'id': 's8KL7I3jLmh6',
'ext': 'mp4',
'title': 'The danger of a single story',
'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23',
'thumbnail': r're:^https?://.*\.jpg$',
'subtitles': dict,
'upload_date': '20091007',
'timestamp': 1254942511,
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
meta = self._download_json(
'https://amara.org/api/videos/%s/' % video_id,
video_id, query={'format': 'json'})
title = meta['title']
video_url = meta['all_urls'][0]
subtitles = {}
for language in (meta.get('languages') or []):
subtitles_uri = language.get('subtitles_uri')
if not (subtitles_uri and language.get('published')):
continue
subtitle = subtitles.setdefault(language.get('code') or 'en', [])
for f in ('json', 'srt', 'vtt'):
subtitle.append({
'ext': f,
'url': update_url_query(subtitles_uri, {'format': f}),
})
info = {
'url': video_url,
'id': video_id,
'subtitles': subtitles,
'title': title,
'description': meta.get('description'),
'thumbnail': meta.get('thumbnail'),
'duration': int_or_none(meta.get('duration')),
'timestamp': parse_iso8601(meta.get('created')),
}
for ie in (YoutubeIE, VimeoIE):
if ie.suitable(video_url):
info.update({
'_type': 'url_transparent',
'ie_key': ie.ie_key(),
})
break
return info

View File

@ -36,6 +36,7 @@ from .afreecatv import AfreecaTVIE
from .airmozilla import AirMozillaIE from .airmozilla import AirMozillaIE
from .aljazeera import AlJazeeraIE from .aljazeera import AlJazeeraIE
from .alphaporno import AlphaPornoIE from .alphaporno import AlphaPornoIE
from .amara import AmaraIE
from .amcnetworks import AMCNetworksIE from .amcnetworks import AMCNetworksIE
from .americastestkitchen import AmericasTestKitchenIE from .americastestkitchen import AmericasTestKitchenIE
from .animeondemand import AnimeOnDemandIE from .animeondemand import AnimeOnDemandIE

View File

@ -3,11 +3,13 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
int_or_none, int_or_none,
lowercase_escape, lowercase_escape,
try_get,
update_url_query, update_url_query,
) )
@ -38,21 +40,10 @@ class GoogleDriveIE(InfoExtractor):
# video can't be watched anonymously due to view count limit reached, # video can't be watched anonymously due to view count limit reached,
# but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
'md5': 'bfbd670d03a470bb1e6d4a257adec12e', 'only_matching': True,
'info_dict': {
'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
'ext': 'mp4',
'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
}
}, { }, {
# video id is longer than 28 characters # video id is longer than 28 characters
'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
'info_dict': {
'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
'ext': 'mp4',
'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
'duration': 189,
},
'only_matching': True, 'only_matching': True,
}, { }, {
'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
@ -171,23 +162,21 @@ class GoogleDriveIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage( video_info = compat_parse_qs(self._download_webpage(
'http://docs.google.com/file/d/%s' % video_id, video_id) 'https://drive.google.com/get_video_info',
video_id, query={'docid': video_id}))
title = self._search_regex( def get_value(key):
r'"title"\s*,\s*"([^"]+)', webpage, 'title', return try_get(video_info, lambda x: x[key][0])
default=None) or self._og_search_title(webpage)
duration = int_or_none(self._search_regex( reason = get_value('reason')
r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', title = get_value('title')
default=None)) if not title and reason:
raise ExtractorError(reason, expected=True)
formats = [] formats = []
fmt_stream_map = self._search_regex( fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, fmt_list = (get_value('fmt_list') or '').split(',')
'fmt stream map', default='').split(',')
fmt_list = self._search_regex(
r'"fmt_list"\s*,\s*"([^"]+)', webpage,
'fmt_list', default='').split(',')
if fmt_stream_map and fmt_list: if fmt_stream_map and fmt_list:
resolutions = {} resolutions = {}
for fmt in fmt_list: for fmt in fmt_list:
@ -257,19 +246,14 @@ class GoogleDriveIE(InfoExtractor):
if urlh and urlh.headers.get('Content-Disposition'): if urlh and urlh.headers.get('Content-Disposition'):
add_source_format(urlh) add_source_format(urlh)
if not formats: if not formats and reason:
reason = self._search_regex( raise ExtractorError(reason, expected=True)
r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
if reason:
raise ExtractorError(reason, expected=True)
self._sort_formats(formats) self._sort_formats(formats)
hl = self._search_regex( hl = get_value('hl')
r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
subtitles_id = None subtitles_id = None
ttsurl = self._search_regex( ttsurl = get_value('ttsurl')
r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
if ttsurl: if ttsurl:
# the video Id for subtitles will be the last value in the ttsurl # the video Id for subtitles will be the last value in the ttsurl
# query string # query string
@ -279,8 +263,8 @@ class GoogleDriveIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'thumbnail': self._og_search_thumbnail(webpage, default=None), 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
'duration': duration, 'duration': int_or_none(get_value('length_seconds')),
'formats': formats, 'formats': formats,
'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
'automatic_captions': self.extract_automatic_captions( 'automatic_captions': self.extract_automatic_captions(

View File

@ -1,6 +1,7 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import base64
import hashlib import hashlib
import hmac import hmac
import itertools import itertools
@ -9,6 +10,10 @@ import re
import time import time
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
@ -165,19 +170,20 @@ class VikiIE(VikiBaseIE):
}, { }, {
# episode # episode
'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
'md5': '5fa476a902e902783ac7a4d615cdbc7a', 'md5': '94e0e34fd58f169f40c184f232356cfe',
'info_dict': { 'info_dict': {
'id': '44699v', 'id': '44699v',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Boys Over Flowers - Episode 1', 'title': 'Boys Over Flowers - Episode 1',
'description': 'md5:b89cf50038b480b88b5b3c93589a9076', 'description': 'md5:b89cf50038b480b88b5b3c93589a9076',
'duration': 4204, 'duration': 4172,
'timestamp': 1270496524, 'timestamp': 1270496524,
'upload_date': '20100405', 'upload_date': '20100405',
'uploader': 'group8', 'uploader': 'group8',
'like_count': int, 'like_count': int,
'age_limit': 13, 'age_limit': 13,
} },
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, { }, {
# youtube external # youtube external
'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@ -194,14 +200,15 @@ class VikiIE(VikiBaseIE):
'uploader_id': 'ad14065n', 'uploader_id': 'ad14065n',
'like_count': int, 'like_count': int,
'age_limit': 13, 'age_limit': 13,
} },
'skip': 'Page not found!',
}, { }, {
'url': 'http://www.viki.com/player/44699v', 'url': 'http://www.viki.com/player/44699v',
'only_matching': True, 'only_matching': True,
}, { }, {
# non-English description # non-English description
'url': 'http://www.viki.com/videos/158036v-love-in-magic', 'url': 'http://www.viki.com/videos/158036v-love-in-magic',
'md5': '1713ae35df5a521b31f6dc40730e7c9c', 'md5': 'adf9e321a0ae5d0aace349efaaff7691',
'info_dict': { 'info_dict': {
'id': '158036v', 'id': '158036v',
'ext': 'mp4', 'ext': 'mp4',
@ -217,8 +224,11 @@ class VikiIE(VikiBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
video = self._call_api( resp = self._download_json(
'videos/%s.json' % video_id, video_id, 'Downloading video JSON') 'https://www.viki.com/api/videos/' + video_id,
video_id, 'Downloading video JSON',
headers={'x-viki-app-ver': '4.0.57'})
video = resp['video']
self._check_errors(video) self._check_errors(video)
@ -265,57 +275,74 @@ class VikiIE(VikiBaseIE):
'subtitles': subtitles, 'subtitles': subtitles,
} }
streams = self._call_api(
'videos/%s/streams.json' % video_id, video_id,
'Downloading video streams JSON')
if 'external' in streams:
result.update({
'_type': 'url_transparent',
'url': streams['external']['url'],
})
return result
formats = [] formats = []
for format_id, stream_dict in streams.items():
height = int_or_none(self._search_regex( def add_format(format_id, format_dict, protocol='http'):
r'^(\d+)[pP]$', format_id, 'height', default=None)) # rtmps URLs does not seem to work
for protocol, format_dict in stream_dict.items(): if protocol == 'rtmps':
# rtmps URLs does not seem to work return
if protocol == 'rtmps': format_url = format_dict.get('url')
continue if not format_url:
format_url = format_dict['url'] return
if format_id == 'm3u8': qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query)
m3u8_formats = self._extract_m3u8_formats( stream = qs.get('stream', [None])[0]
format_url, video_id, 'mp4', if stream:
entry_protocol='m3u8_native', format_url = base64.b64decode(stream).decode()
m3u8_id='m3u8-%s' % protocol, fatal=False) if format_id in ('m3u8', 'hls'):
# Despite CODECS metadata in m3u8 all video-only formats m3u8_formats = self._extract_m3u8_formats(
# are actually video+audio format_url, video_id, 'mp4',
for f in m3u8_formats: entry_protocol='m3u8_native',
if f.get('acodec') == 'none' and f.get('vcodec') != 'none': m3u8_id='m3u8-%s' % protocol, fatal=False)
f['acodec'] = None # Despite CODECS metadata in m3u8 all video-only formats
formats.extend(m3u8_formats) # are actually video+audio
elif format_url.startswith('rtmp'): for f in m3u8_formats:
mobj = re.search( if '_drm/index_' in f['url']:
r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
format_url)
if not mobj:
continue continue
formats.append({ if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
'format_id': 'rtmp-%s' % format_id, f['acodec'] = None
'ext': 'flv', formats.append(f)
'url': mobj.group('url'), elif format_id in ('mpd', 'dash'):
'play_path': mobj.group('playpath'), formats.extend(self._extract_mpd_formats(
'app': mobj.group('app'), format_url, video_id, 'mpd-%s' % protocol, fatal=False))
'page_url': url, elif format_url.startswith('rtmp'):
}) mobj = re.search(
else: r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
formats.append({ format_url)
'url': format_url, if not mobj:
'format_id': '%s-%s' % (format_id, protocol), return
'height': height, formats.append({
}) 'format_id': 'rtmp-%s' % format_id,
'ext': 'flv',
'url': mobj.group('url'),
'play_path': mobj.group('playpath'),
'app': mobj.group('app'),
'page_url': url,
})
else:
formats.append({
'url': format_url,
'format_id': '%s-%s' % (format_id, protocol),
'height': int_or_none(self._search_regex(
r'^(\d+)[pP]$', format_id, 'height', default=None)),
})
for format_id, format_dict in (resp.get('streams') or {}).items():
add_format(format_id, format_dict)
if not formats:
streams = self._call_api(
'videos/%s/streams.json' % video_id, video_id,
'Downloading video streams JSON')
if 'external' in streams:
result.update({
'_type': 'url_transparent',
'url': streams['external']['url'],
})
return result
for format_id, stream_dict in streams.items():
for protocol, format_dict in stream_dict.items():
add_format(format_id, format_dict, protocol)
self._sort_formats(formats) self._sort_formats(formats)
result['formats'] = formats result['formats'] = formats