[youtube] Support automatic captions whose original language is not English (fixes #1225) and allow downloading them in multiple languages.

This commit is contained in:
Jaime Marquínez Ferrándiz 2013-09-11 19:02:01 +02:00
parent ac4f319ba1
commit 055e6f3657
2 changed files with 47 additions and 41 deletions

View File

@@ -15,15 +15,20 @@ class SubtitlesInfoExtractor(InfoExtractor):
self.to_screen(u'%s: Available subtitles for video: %s' % self.to_screen(u'%s: Available subtitles for video: %s' %
(video_id, sub_lang)) (video_id, sub_lang))
def _extract_subtitles(self, video_id): def extract_subtitles(self, video_id, video_webpage=None):
""" returns {sub_lang: sub} or {} if subtitles not found """ """ returns {sub_lang: sub} or {} if subtitles not found """
if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
available_subs_list = self._get_available_subtitles(video_id) available_subs_list = self._get_available_subtitles(video_id)
elif self._downloader.params.get('writeautomaticsub', False):
available_subs_list = self._get_available_automatic_caption(video_id, video_webpage)
else:
return None
if not available_subs_list: # error, it didn't get the available subtitles if not available_subs_list: # error, it didn't get the available subtitles
return {} return {}
if self._downloader.params.get('allsubtitles', False): if self._downloader.params.get('allsubtitles', False):
sub_lang_list = available_subs_list sub_lang_list = available_subs_list
else: else:
if self._downloader.params.get('writesubtitles', False):
if self._downloader.params.get('subtitleslangs', False): if self._downloader.params.get('subtitleslangs', False):
requested_langs = self._downloader.params.get('subtitleslangs') requested_langs = self._downloader.params.get('subtitleslangs')
elif 'en' in available_subs_list: elif 'en' in available_subs_list:
@@ -64,23 +69,11 @@ class SubtitlesInfoExtractor(InfoExtractor):
""" """
pass pass
def _request_automatic_caption(self, video_id, webpage): def _get_available_automatic_caption(self, video_id, webpage):
""" """
returns {sub_lang: sub} or {} if not available returns {sub_lang: url} or {} if not available
Must be redefined by the subclasses that support automatic captions, Must be redefined by the subclasses that support automatic captions,
otherwise it will return {} otherwise it will return {}
""" """
self._downloader.report_warning(u'Automatic Captions not supported by this server') self._downloader.report_warning(u'Automatic Captions not supported by this server')
return {} return {}
def extract_subtitles(self, video_id, video_webpage=None):
"""
Extract the subtitles and/or the automatic captions if requested.
Returns None or a dictionary in the format {sub_lang: sub}
"""
video_subtitles = None
if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
video_subtitles = self._extract_subtitles(video_id)
elif self._downloader.params.get('writeautomaticsub', False):
video_subtitles = self._request_automatic_caption(video_id, video_webpage)
return video_subtitles

View File

@@ -5,6 +5,7 @@ import netrc
import re import re
import socket import socket
import itertools import itertools
import xml.etree.ElementTree
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor from .subtitles import SubtitlesInfoExtractor
@@ -478,14 +479,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return {} return {}
return sub_lang_list return sub_lang_list
def _request_automatic_caption(self, video_id, webpage): def _get_available_automatic_caption(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an """We need the webpage for getting the captions url, pass it as an
argument to speed up the process.""" argument to speed up the process."""
sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0]
sub_format = self._downloader.params.get('subtitlesformat') sub_format = self._downloader.params.get('subtitlesformat')
self.to_screen(u'%s: Looking for automatic captions' % video_id) self.to_screen(u'%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage) mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang err_msg = u'Couldn\'t find automatic captions for %s' % video_id
if mobj is None: if mobj is None:
self._downloader.report_warning(err_msg) self._downloader.report_warning(err_msg)
return {} return {}
@@ -494,16 +494,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
args = player_config[u'args'] args = player_config[u'args']
caption_url = args[u'ttsurl'] caption_url = args[u'ttsurl']
timestamp = args[u'timestamp'] timestamp = args[u'timestamp']
# We get the available subtitles
list_params = compat_urllib_parse.urlencode({
'type': 'list',
'tlangs': 1,
'asrs': 1,
})
list_url = caption_url + '&' + list_params
list_page = self._download_webpage(list_url, video_id)
caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
original_lang = caption_list.find('track').attrib['lang_code']
sub_lang_list = {}
for lang_node in caption_list.findall('target'):
sub_lang = lang_node.attrib['lang_code']
params = compat_urllib_parse.urlencode({ params = compat_urllib_parse.urlencode({
'lang': 'en', 'lang': original_lang,
'tlang': sub_lang, 'tlang': sub_lang,
'fmt': sub_format, 'fmt': sub_format,
'ts': timestamp, 'ts': timestamp,
'kind': 'asr', 'kind': 'asr',
}) })
subtitles_url = caption_url + '&' + params sub_lang_list[sub_lang] = caption_url + '&' + params
sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') return sub_lang_list
return {sub_lang: sub}
# An extractor error can be raised by the download process if there are # An extractor error can be raised by the download process if there are
# no automatic captions but there are subtitles # no automatic captions but there are subtitles
except (KeyError, ExtractorError): except (KeyError, ExtractorError):