diff --git a/youtube_dl/extractor/mediathekviewweb.py b/youtube_dl/extractor/mediathekviewweb.py
new file mode 100644
index 000000000..e96e5eb4d
--- /dev/null
+++ b/youtube_dl/extractor/mediathekviewweb.py
@@ -0,0 +1,194 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import datetime
+import itertools
+import json
+
+from .common import InfoExtractor, SearchInfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import ExtractorError, int_or_none
+
+
+class MediathekViewWebSearchIE(SearchInfoExtractor):
+    IE_NAME = 'mediathekviewweb:search'
+    IE_DESC = 'MediathekViewWeb search'
+    _SEARCH_KEY = 'mvwsearch'
+    _MAX_RESULTS = float('inf')
+    _MAX_RESULTS_PER_PAGE = 50
+    # _GEO_COUNTRIES = ['DE']
+
+    # _TESTS = [{
+    #     'url': 'mvwsearch:tagesschau',
+    #     'info_dict': {
+    #         'title': 'post-avant jazzcore',
+    #     },
+    #     'playlist_count': 15,
+    # }]
+
+    # Map of title affixes indicating video variants.
+    _variants = {
+        'audio_description': '(Audiodeskription)',
+        'sign_language': '(mit Gebärdensprache)',
+    }
+
+    def _build_conditions(self, search):
+        # @note So far, there is no API endpoint to convert a query string
+        #   into a complete query object, as required by the /api/query
+        #   endpoint.
+        # @see https://github.com/mediathekview/mediathekviewweb/blob/master/client/index.ts#L144
+        #   for parsing the search string into properties.
+        # @see https://github.com/mediathekview/mediathekviewweb/blob/master/client/index.ts#L389
+        #   for converting properties into field queries.
+        filters = {}
+        extra = {}
+        for component in search.lower().split():
+            if len(component) == 0:
+                continue
+
+            field = None
+            operator = component[0:1]
+            value = component[1:]
+            # Extra, non-field settings: duration filters are given in
+            # minutes in the search string and converted to seconds here.
+            if operator == '>':
+                value = int(value.split(',')[0]) * 60
+                extra['duration_min'] = max(extra.get('duration_min', 0), value)
+                continue
+            elif operator == '<':
+                value = int(value.split(',')[0]) * 60
+                extra['duration_max'] = min(extra.get('duration_max', float('inf')), value)
+                continue
+
+            # Field query operators.
+            elif operator == '!':
+                field = 'channel'
+            elif operator == '#':
+                field = 'topic'
+            elif operator == '+':
+                field = 'title'
+            elif operator == '*':
+                field = 'description'
+            else:
+                field = 'topic,title'
+                operator = ''
+                value = component
+
+            if field:
+                # @todo In theory, comma-joined values are for AND queries.
+                #   But so far, each is an AND component, even without comma.
+                filters.setdefault(field, []).append(' '.join(value.split(',')))
+
+        conditions = []
+        for field, keys in filters.items():
+            for query in keys:
+                conditions.append({
+                    'fields': field.split(','),
+                    'query': query,
+                })
+
+        return conditions, extra
+
+    def _extract_playlist_entries(self, results):
+        entries = []
+        for item in results:
+            # Check whether the title marks an accessibility variant.
+            variant = None
+            for key, value in self._variants.items():
+                if value in item['title']:
+                    variant = key
+
+            formats = [{
+                'url': item['url_video'],
+                'format': ('medium ' + self._variants[variant]) if variant else None,
+                'format_id': ('medium-' + variant) if variant else 'medium',
+                'language_preference': -10 if variant else 10,
+                'quality': -2,
+                'filesize': int_or_none(item.get('size')),
+            }]
+            if item.get('url_video_low'):
+                formats.append({
+                    'url': item['url_video_low'],
+                    'format': ('low ' + self._variants[variant]) if variant else None,
+                    'format_id': ('low-' + variant) if variant else 'low',
+                    'language_preference': -10 if variant else 10,
+                    'quality': -3,
+                })
+            if item.get('url_video_hd'):
+                formats.append({
+                    'url': item['url_video_hd'],
+                    'format': ('high ' + self._variants[variant]) if variant else None,
+                    'format_id': ('high-' + variant) if variant else 'high',
+                    'language_preference': -10 if variant else 10,
+                    'quality': -1,
+                })
+            self._sort_formats(formats)
+
+            video = {
+                '_type': 'video',
+                'formats': formats,
+                'id': item['id'],
+                'title': item['title'],
+                'description': item.get('description'),
+                'series': item.get('topic'),
+                'channel': item.get('channel'),
+                'uploader': item.get('channel'),
+                'duration': int_or_none(item.get('duration')),
+                'webpage_url': item.get('url_website'),
+            }
+
+            upload_date = datetime.datetime.utcfromtimestamp(item['timestamp'])
+            video['upload_date'] = upload_date.strftime('%Y%m%d')
+            if item.get('url_subtitle'):
+                video.setdefault('subtitles', {}).setdefault('de', []).append({
+                    'url': item['url_subtitle'],
+                })
+            entries.append(video)
+
+        return entries
+
+    def _get_n_results(self, query, n):
+        # @todo Add support for everywhere/future options.
+        queries, extra = self._build_conditions(query)
+        query_object = {
+            'queries': queries,
+            'sortBy': 'timestamp',
+            'sortOrder': 'desc',
+            'future': True,
+            'duration_min': extra.get('duration_min'),
+            'duration_max': extra.get('duration_max'),
+            'offset': 0,
+            'size': min(n, self._MAX_RESULTS_PER_PAGE),
+        }
+
+        entries = []
+        for page_num in itertools.count(1):
+            query_object['offset'] = (page_num - 1) * query_object['size']
+            results = self._download_json(
+                'https://mediathekviewweb.de/api/query', query,
+                note='Fetching page %d' % page_num,
+                data=json.dumps(query_object).encode('utf-8'),
+                headers={'Content-Type': 'text/plain'})
+            if results['err'] is not None:
+                raise ExtractorError('API returned an error: %s' % results['err'][0])
+
+            meta = results['result']['queryInfo']
+            entries.extend(self._extract_playlist_entries(results['result']['results']))
+
+            # @todo This returns full pages: 100 results if 51 are requested.
+            if meta['resultCount'] == 0 or meta['resultCount'] + query_object['offset'] >= n:
+                break
+
+        return self.playlist_result(entries, playlist_title=query)
+
+
+class MediathekViewWebIE(InfoExtractor):
+    # @see https://github.com/mediathekview/mediathekviewweb
+    IE_NAME = 'mediathekviewweb'
+    _VALID_URL = r'https?://mediathekviewweb\.de/\#query=(?P<id>.+)'
+
+    # @todo Specify test cases.
+
+    def _real_extract(self, url):
+        query = self._match_id(url)
+        search = compat_urllib_parse_unquote(query)
+        return {
+            '_type': 'url',
+            'url': 'mvwsearchall:' + search,
+            'ie_key': 'MediathekViewWebSearch',
+        }
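
Once the patch is applied, the search extractor can be exercised through the regular youtube-dl Python API. This is only a usage sketch: the "mvwsearch5:"/"mvwsearchall:" prefixes follow the standard SearchInfoExtractor convention for the _SEARCH_KEY declared above, and the query string "tagesschau" is an arbitrary example, not a value taken from the patch.

    import youtube_dl

    # Fetch metadata for the five most recent matches without downloading.
    with youtube_dl.YoutubeDL({'quiet': True}) as ydl:
        result = ydl.extract_info('mvwsearch5:tagesschau', download=False)
        for entry in result['entries']:
            print('%s (%s)' % (entry['title'], entry['webpage_url']))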