[youtube] Add alternative automatic captions extraction approach (Closes #8667)

This commit is contained in:
Sergey M․ 2016-02-26 22:21:47 +06:00
parent efbd6fb8bb
commit b78b292f0c

View File

@@ -975,40 +975,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return {} return {}
try: try:
args = player_config['args'] args = player_config['args']
caption_url = args['ttsurl'] caption_url = args.get('ttsurl')
if not caption_url: if caption_url:
self._downloader.report_warning(err_msg) timestamp = args['timestamp']
return {} # We get the available subtitles
timestamp = args['timestamp'] list_params = compat_urllib_parse.urlencode({
# We get the available subtitles 'type': 'list',
list_params = compat_urllib_parse.urlencode({ 'tlangs': 1,
'type': 'list', 'asrs': 1,
'tlangs': 1, })
'asrs': 1, list_url = caption_url + '&' + list_params
}) caption_list = self._download_xml(list_url, video_id)
list_url = caption_url + '&' + list_params original_lang_node = caption_list.find('track')
caption_list = self._download_xml(list_url, video_id) if original_lang_node is None:
original_lang_node = caption_list.find('track') self._downloader.report_warning('Video doesn\'t have automatic captions')
if original_lang_node is None: return {}
self._downloader.report_warning('Video doesn\'t have automatic captions') original_lang = original_lang_node.attrib['lang_code']
return {} caption_kind = original_lang_node.attrib.get('kind', '')
original_lang = original_lang_node.attrib['lang_code']
caption_kind = original_lang_node.attrib.get('kind', '') sub_lang_list = {}
for lang_node in caption_list.findall('target'):
sub_lang = lang_node.attrib['lang_code']
sub_formats = []
for ext in self._SUBTITLE_FORMATS:
params = compat_urllib_parse.urlencode({
'lang': original_lang,
'tlang': sub_lang,
'fmt': ext,
'ts': timestamp,
'kind': caption_kind,
})
sub_formats.append({
'url': caption_url + '&' + params,
'ext': ext,
})
sub_lang_list[sub_lang] = sub_formats
return sub_lang_list
# Some videos don't provide ttsurl but rather caption_tracks and
# caption_translation_languages (e.g. 20LmZk1hakA)
caption_tracks = args['caption_tracks']
caption_translation_languages = args['caption_translation_languages']
caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
parsed_caption_url = compat_urlparse.urlparse(caption_url)
caption_qs = compat_parse_qs(parsed_caption_url.query)
sub_lang_list = {} sub_lang_list = {}
for lang_node in caption_list.findall('target'): for lang in caption_translation_languages.split(','):
sub_lang = lang_node.attrib['lang_code'] lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
sub_lang = lang_qs.get('lc', [None])[0]
if not sub_lang:
continue
sub_formats = [] sub_formats = []
for ext in self._SUBTITLE_FORMATS: for ext in self._SUBTITLE_FORMATS:
params = compat_urllib_parse.urlencode({ caption_qs.update({
'lang': original_lang, 'tlang': [sub_lang],
'tlang': sub_lang, 'fmt': [ext],
'fmt': ext,
'ts': timestamp,
'kind': caption_kind,
}) })
sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace(
query=compat_urllib_parse.urlencode(caption_qs, True)))
sub_formats.append({ sub_formats.append({
'url': caption_url + '&' + params, 'url': sub_url,
'ext': ext, 'ext': ext,
}) })
sub_lang_list[sub_lang] = sub_formats sub_lang_list[sub_lang] = sub_formats