youtube-dl/youtube_dl/extractor/ted.py

from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor

from ..compat import (
    compat_str,
)


class TEDIE(InfoExtractor):
    # URL pattern for this extractor.  It is written in verbose mode ((?x)),
    # so the "# ..." notes inside the string are regex comments.
    # It accepts http/https URLs on www.ted.com, embed.ted.com and
    # embed-ssl.ted.com that point at a playlist (optionally with a numeric
    # id), a talk, or a /watch/<x>/<y>/ page, with an optional /lang/...
    # segment before the final name.
    # _real_extract() reads these named groups:
    #   proto, type, urlmain - used to rebuild the www URL for embed hosts
    #   type_talk, type_watch - select the extraction routine
    #   name - passed on to the selected extraction routine
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    # Test definitions for this extractor: two regular talks, a /watch/ page,
    # a playlist and a talk whose video is hosted on YouTube.
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                            'argument that not only don\'t we understand our own '
                            'consciousness, but that half the time our brains are '
                            'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            # Raw string: "\." is a regex escape, not a valid string escape.
            'thumbnail': r're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra fields for the HTTP download variants, keyed by format id.
    # _talk_info() looks up each 'nativeDownloads' format id here and merges
    # the matching preference/width/height into that format's dict.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
                                       webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type').startswith('embed'):
            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
            return self.url_result(desktop_url, 'TED')
        name = m.group('name')
        if m.group('type_talk'):
            return self._talk_info(url, name)
        elif m.group('type_watch'):
            return self._watch_info(url, name)
        else:
            return self._playlist_videos_info(url, name)

    def _playlist_videos_info(self, url, name):
        """Return a playlist result listing every talk of a TED playlist.

        Each talk in the page's embedded JSON becomes a URL entry pointing at
        ``http://www.ted.com/talks/<slug>`` that is handled by this extractor
        again.  The playlist id and title come from the JSON's ``playlist``
        object.
        """
        page = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        data = self._extract_info(page)
        meta = data['playlist']

        entries = []
        for talk in data['talks']:
            talk_url = 'http://www.ted.com/talks/' + talk['slug']
            entries.append(self.url_result(talk_url, self.ie_key()))

        # The id is converted with compat_str before being handed on.
        return self.playlist_result(
            entries,
            playlist_id=compat_str(meta['id']),
            playlist_title=meta['title'])

    def _talk_info(self, url, video_name):
        """Extract a single talk from its ``/talks/`` page.

        Downloads ``url`` and reads the first entry of the ``talks`` list in
        the page's embedded JSON.  A talk hosted by an external service is
        returned as a ``url`` redirection to that service; otherwise an info
        dict with formats, subtitles and metadata is built.

        HTTP downloads listed under ``nativeDownloads`` are preferred.  When
        that entry is missing, null or holds no usable URL, the RTMP
        resources are used instead.  The thumbnail is optional.
        """
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external is not None:
            self.to_screen('Found video from %s' % external['service'])
            return {
                '_type': 'url',
                'url': external['uri'],
            }

        # 'nativeDownloads' may be absent or null; treat both as "no HTTP
        # downloads" so that the RTMP fallback below still gets a chance.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = []
        for format_id, format_url in native_downloads.items():
            if format_url is None:
                continue
            fmt = {
                'url': format_url,
                'format_id': format_id,
                'format': format_id,
            }
            # Add preference and dimensions for the known format ids.
            fmt.update(self._NATIVE_FORMATS.get(format_id) or {})
            formats.append(fmt)

        if not formats:
            # Use rtmp downloads
            formats = [{
                'format_id': res['name'],
                'url': talk_info['streamer'],
                'play_path': res['file'],
                'ext': 'flv',
                'width': res['width'],
                'height': res['height'],
                'tbr': res['bitrate'],
            } for res in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)

        # The thumbnail is optional metadata: do not fail when it is missing.
        thumbnail = talk_info.get('thumb') or None
        if thumbnail:
            if thumbnail.startswith('//'):
                # Protocol-relative URL: only the scheme is missing.
                thumbnail = 'http:' + thumbnail
            elif not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail

        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_subtitles(self, video_id, talk_info):
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if languages:
            sub_lang_list = {}
            for l in languages:
                sub_lang_list[l] = [
                    {
                        'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
                        'ext': ext,
                    }
                    for ext in ['ted', 'srt']
                ]
            return sub_lang_list
        else:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

    def _watch_info(self, url, name):
        webpage = self._download_webpage(url, name)

        config_json = self._html_search_regex(
            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
            webpage, 'config')
        config = json.loads(config_json)['config']
        video_url = config['video']['url']
        thumbnail = config.get('image', {}).get('url')

        title = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
        description = self._html_search_regex(
            [
                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
            ],
            webpage, 'description', fatal=False)

        return {
            'id': name,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
[ted] Use unicode_literals 2014-01-17 11:52:17 +09:00			`from __future__ import unicode_literals`

Move TED IE into its own file 2013-06-24 04:55:53 +09:00			`import json`
			`import re`

Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive. * It allows to easily support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. 2015-02-16 02:03:41 +09:00			`from .common import InfoExtractor`
Move TED IE into its own file 2013-06-24 04:55:53 +09:00
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 20:24:42 +09:00			`from ..compat import (`
[ted] Fix playlist extraction and add a test 2014-03-05 21:22:10 +09:00			`compat_str,`
[ted] fixed error in case of no subtitles present I created a test, but I leave it commented since TED videos get new subtitles frequently. 2013-11-05 20:00:13 +09:00			`)`

[ted] Use unicode_literals 2014-01-17 11:52:17 +09:00
Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive. * It allows to easily support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. 2015-02-16 02:03:41 +09:00			`class TEDIE(InfoExtractor):`
[ted] Simplify embed code (#2587) 2014-03-21 00:33:23 +09:00			`_VALID_URL = r'''(?x)`
			`(?P<proto>https?://)`
[ted] Add support for embed-ssl.ted.com embedded videos 2015-01-05 21:11:13 +09:00			`(?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/`
[ted] Style fixes 2014-03-05 21:27:26 +09:00			`(`
			`(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist`
			`|`
			`((?P<type_talk>talks)) # We have a simple talk`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 10:22:40 +09:00			`|`
			`(?P<type_watch>watch)/[^/]+/[^/]+`
[ted] Style fixes 2014-03-05 21:27:26 +09:00			`)`
			`(/lang/(.*?))? # The url may contain the language`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 10:22:40 +09:00			`/(?P<name>[\w-]+) # Here goes the name and then ".html"`
[ted] Simplify embed code (#2587) 2014-03-21 00:33:23 +09:00			`.*)$`
[ted] Style fixes 2014-03-05 21:27:26 +09:00			`'''`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 10:22:40 +09:00			`_TESTS = [{`
[ted] Use unicode_literals 2014-01-17 11:52:17 +09:00			`'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',`
[ted] Update test md5 2014-06-12 22:33:53 +09:00			`'md5': 'fc94ac279feebbce69f21c0c6ee82810',`
[ted] Use unicode_literals 2014-01-17 11:52:17 +09:00			`'info_dict': {`
[ted] Remove unused import and modernize test 2014-03-05 22:27:45 +09:00			`'id': '102',`
			`'ext': 'mp4',`
[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`'title': 'The illusion of consciousness',`
[ted] Style fixes 2014-03-05 21:27:26 +09:00			`'description': ('Philosopher Dan Dennett makes a compelling '`
PEP8: applied even more rules 2014-11-24 05:39:15 +09:00			`'argument that not only don\'t we understand our own '`
			`'consciousness, but that half the time our brains are '`
			`'actively fooling us.'),`
[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`'uploader': 'Dan Dennett',`
[ted] Add width and height (Fixes #2716) 2014-04-07 20:07:07 +09:00			`'width': 854,`
[ted] Extract duration (closes #4155) 2014-11-12 17:30:57 +09:00			`'duration': 1308,`
Move tests to the IE definitions 2013-06-28 03:46:46 +09:00			`}`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 10:22:40 +09:00			`}, {`
			`'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',`
			`'md5': '226f4fb9c62380d11b7995efa4c87994',`
			`'info_dict': {`
			`'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',`
			`'ext': 'mp4',`
			`'title': 'Vishal Sikka: The beauty and power of algorithms',`
			`'thumbnail': 're:^https?://.+\.jpg',`
			`'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',`
			`}`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 22:23:12 +09:00			`}, {`
			`'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',`
			`'info_dict': {`
			`'id': '1972',`
[ted] Update test 2014-04-22 21:49:41 +09:00			`'ext': 'mp4',`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 22:23:12 +09:00			`'title': 'Be passionate. Be courageous. Be your best.',`
			`'uploader': 'Gabby Giffords and Mark Kelly',`
[ted] Update test 2014-04-22 21:49:41 +09:00			`'description': 'md5:5174aed4d0f16021b704120360f72b92',`
[ted] Extract duration (closes #4155) 2014-11-12 17:30:57 +09:00			`'duration': 1128,`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 22:23:12 +09:00			`},`
Move playlist tests to extractors. From now on, test_download will run these tests. That means we benefit not only from the networking setup in there, but also from the other tests (for example test_all_urls to find problems with _VALID_URLs). 2014-08-28 07:58:24 +09:00			`}, {`
			`'url': 'http://www.ted.com/playlists/who_are_the_hackers',`
			`'info_dict': {`
			`'id': '10',`
			`'title': 'Who are the hackers?',`
			`},`
			`'playlist_mincount': 6,`
[ted] Add support for external videos (fixes #3948) 2014-10-15 19:24:11 +09:00			`}, {`
			`# contains a youtube video`
			`'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',`
			`'add_ie': ['Youtube'],`
			`'info_dict': {`
			`'id': '_ZG8HBuDjgc',`
			`'ext': 'mp4',`
			`'title': 'Douglas Adams: Parrots the Universe and Everything',`
			`'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',`
			`'uploader': 'University of California Television (UCTV)',`
			`'uploader_id': 'UCtelevision',`
			`'upload_date': '20080522',`
			`},`
			`'params': {`
			`'skip_download': True,`
			`},`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 10:22:40 +09:00			`}]`
Move TED IE into its own file 2013-06-24 04:55:53 +09:00
[ted] Add width and height (Fixes #2716) 2014-04-07 20:07:07 +09:00			`_NATIVE_FORMATS = {`
			`'low': {'preference': 1, 'width': 320, 'height': 180},`
			`'medium': {'preference': 2, 'width': 512, 'height': 288},`
			`'high': {'preference': 3, 'width': 854, 'height': 480},`
[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`}`
Move TED IE into its own file 2013-06-24 04:55:53 +09:00
[ted] Fix playlist extraction and add a test 2014-03-05 21:22:10 +09:00			`def _extract_info(self, webpage):`
[ted] Style fixes 2014-03-05 21:27:26 +09:00			`info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',`
PEP8: applied even more rules 2014-11-24 05:39:15 +09:00			`webpage, 'info json')`
[ted] Fix playlist extraction and add a test 2014-03-05 21:22:10 +09:00			`return json.loads(info_json)`

Move TED IE into its own file 2013-06-24 04:55:53 +09:00			`def _real_extract(self, url):`
[ted] Style fixes 2014-03-05 21:27:26 +09:00			`m = re.match(self._VALID_URL, url, re.VERBOSE)`
[ted] Add support for embed-ssl.ted.com embedded videos 2015-01-05 21:11:13 +09:00			`if m.group('type').startswith('embed'):`
[ted] Simplify embed code (#2587) 2014-03-21 00:33:23 +09:00			`desktop_url = m.group('proto') + 'www' + m.group('urlmain')`
			`return self.url_result(desktop_url, 'TED')`
[ted] Style fixes 2014-03-05 21:27:26 +09:00			`name = m.group('name')`
Move TED IE into its own file 2013-06-24 04:55:53 +09:00			`if m.group('type_talk'):`
[ted] Style fixes 2014-03-05 21:27:26 +09:00			`return self._talk_info(url, name)`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 10:22:40 +09:00			`elif m.group('type_watch'):`
			`return self._watch_info(url, name)`
[ted] Style fixes 2014-03-05 21:27:26 +09:00			`else:`
[ted] Fix playlist extraction and add a test 2014-03-05 21:22:10 +09:00			`return self._playlist_videos_info(url, name)`
Move TED IE into its own file 2013-06-24 04:55:53 +09:00
[ted] Fix playlist extraction and add a test 2014-03-05 21:22:10 +09:00			`def _playlist_videos_info(self, url, name):`
Move TED IE into its own file 2013-06-24 04:55:53 +09:00			`'''Returns the videos of the playlist'''`
[ted] Fix playlists (Fixes #1770) 2013-11-15 22:33:51 +09:00
[ted] Fix playlist extraction and add a test 2014-03-05 21:22:10 +09:00			`webpage = self._download_webpage(url, name,`
PEP8: applied even more rules 2014-11-24 05:39:15 +09:00			`'Downloading playlist webpage')`
[ted] Fix playlist extraction and add a test 2014-03-05 21:22:10 +09:00			`info = self._extract_info(webpage)`
			`playlist_info = info['playlist']`
Move TED IE into its own file 2013-06-24 04:55:53 +09:00
[ted] Fix playlists (Fixes #1770) 2013-11-15 22:33:51 +09:00			`playlist_entries = [`
[ted] Remove superfluous u prefixes 2014-04-21 19:34:32 +09:00			`self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())`
[ted] Fix playlist extraction and add a test 2014-03-05 21:22:10 +09:00			`for talk in info['talks']`
[ted] Fix playlists (Fixes #1770) 2013-11-15 22:33:51 +09:00			`]`
			`return self.playlist_result(`
[ted] Fix playlist extraction and add a test 2014-03-05 21:22:10 +09:00			`playlist_entries,`
			`playlist_id=compat_str(playlist_info['id']),`
			`playlist_title=playlist_info['title'])`
Move TED IE into its own file 2013-06-24 04:55:53 +09:00
[ted] Style fixes 2014-03-05 21:27:26 +09:00			`def _talk_info(self, url, video_name):`
			`webpage = self._download_webpage(url, video_name)`
Move TED IE into its own file 2013-06-24 04:55:53 +09:00			`self.report_extraction(video_name)`
[ted] Added support for subtitle download 2013-11-03 03:48:39 +09:00
[ted] Fix playlist extraction and add a test 2014-03-05 21:22:10 +09:00			`talk_info = self._extract_info(webpage)['talks'][0]`
[ted] Added support for subtitle download 2013-11-03 03:48:39 +09:00
[ted] Add support for external videos (fixes #3948) 2014-10-15 19:24:11 +09:00			`if talk_info.get('external') is not None:`
			`self.to_screen('Found video from %s' % talk_info['external']['service'])`
			`return {`
			`'_type': 'url',`
			`'url': talk_info['external']['uri'],`
			`}`

[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`formats = [{`
			`'url': format_url,`
			`'format_id': format_id,`
			`'format': format_id,`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 22:23:12 +09:00			`} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]`
			`if formats:`
			`for f in formats:`
			`finfo = self._NATIVE_FORMATS.get(f['format_id'])`
			`if finfo:`
			`f.update(finfo)`
			`else:`
			`# Use rtmp downloads`
			`formats = [{`
			`'format_id': f['name'],`
			`'url': talk_info['streamer'],`
			`'play_path': f['file'],`
			`'ext': 'flv',`
			`'width': f['width'],`
			`'height': f['height'],`
			`'tbr': f['bitrate'],`
			`} for f in talk_info['resources']['rtmp']]`
[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`self._sort_formats(formats)`

[ted] Remove unused import and modernize test 2014-03-05 22:27:45 +09:00			`video_id = compat_str(talk_info['id'])`
[ted] Added support for subtitle download 2013-11-03 03:48:39 +09:00			`# subtitles`
[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`video_subtitles = self.extract_subtitles(video_id, talk_info)`
[ted] Added support for subtitle download 2013-11-03 03:48:39 +09:00
[ted] Add 'http://' to the thumbnail url if it's missing 2014-03-16 19:24:11 +09:00			`thumbnail = talk_info['thumb']`
			`if not thumbnail.startswith('http'):`
			`thumbnail = 'http://' + thumbnail`
[ted] simplify 2013-11-15 22:06:38 +09:00			`return {`
[ted] Added support for subtitle download 2013-11-03 03:48:39 +09:00			`'id': video_id,`
[generic] Fix testcases 2014-09-29 12:12:57 +09:00			`'title': talk_info['title'].strip(),`
[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`'uploader': talk_info['speaker'],`
[ted] Add 'http://' to the thumbnail url if it's missing 2014-03-16 19:24:11 +09:00			`'thumbnail': thumbnail,`
[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`'description': self._og_search_description(webpage),`
[ted] Added support for subtitle download 2013-11-03 03:48:39 +09:00			`'subtitles': video_subtitles,`
[ted] Prepare #980 merge 2013-10-04 17:32:34 +09:00			`'formats': formats,`
[ted] Extract duration (closes #4155) 2014-11-12 17:30:57 +09:00			`'duration': talk_info.get('duration'),`
[ted] Prepare #980 merge 2013-10-04 17:32:34 +09:00			`}`

Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive. * It allows to easily support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. 2015-02-16 02:03:41 +09:00			`def _get_subtitles(self, video_id, talk_info):`
[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]`
			`if languages:`
			`sub_lang_list = {}`
			`for l in languages:`
Improve subtitles support For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive. * It allows to easily support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works. 2015-02-16 02:03:41 +09:00			`sub_lang_list[l] = [`
			`{`
			`'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),`
			`'ext': ext,`
			`}`
			`for ext in ['ted', 'srt']`
			`]`
[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`return sub_lang_list`
			`else:`
[ted] Remove superfluous u prefixes 2014-04-21 19:34:32 +09:00			`self._downloader.report_warning('video doesn\'t have subtitles')`
[ted] Fix video extraction The site has been redesigned 2014-03-05 05:47:01 +09:00			`return {}`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 10:22:40 +09:00
			`def _watch_info(self, url, name):`
			`webpage = self._download_webpage(url, name)`

			`config_json = self._html_search_regex(`
[ted] Fix type_watch links extraction 2014-12-04 00:17:11 +09:00			`r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',`
			`webpage, 'config')`
			`config = json.loads(config_json)['config']`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 10:22:40 +09:00			`video_url = config['video']['url']`
			`thumbnail = config.get('image', {}).get('url')`

			`title = self._html_search_regex(`
			`r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')`
			`description = self._html_search_regex(`
[ted] Extend search for description 2014-04-21 19:37:16 +09:00			`[`
			`r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',`
			`r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',`
			`],`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 10:22:40 +09:00			`webpage, 'description', fatal=False)`

			`return {`
			`'id': name,`
			`'url': video_url,`
			`'title': title,`
			`'thumbnail': thumbnail,`
			`'description': description,`
			`}`