diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 7538b73c4..f84d55118 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -7,19 +7,22 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_kwargs,
-    compat_str,
+    compat_urlparse,
 )
 from ..utils import (
     clean_html,
     determine_ext,
-    dict_get,
     extract_attributes,
     ExtractorError,
     float_or_none,
     int_or_none,
+    join_nonempty,
+    merge_dicts,
     parse_duration,
-    str_or_none,
-    try_get,
+    parse_qs,
+    T,
+    traverse_obj,
+    txt_or_none,
     unified_strdate,
     url_or_none,
     urljoin,
@@ -160,10 +163,119 @@ class XHamsterIE(XHamsterBaseIE):
         'only_matching': True,
     }]
 
+    def _get_height(self, s):
+        """Return the leading height (e.g. 720 for '720p') of `s`, or None."""
+        return int_or_none(self._search_regex(
+            r'^(\d+)[pP]', s, 'height', default=None))
+
+    def _extract_initials(self, initials, video_id, display_id, url, referrer, age_limit):
+        """Build the info dict from the parsed `window.initials` JSON.
+
+        `initials['videoModel']['title']` is required (plain indexing);
+        all other fields are optional. `url` is the base for resolving
+        player source URLs and `referrer` is the Referer header value
+        (a string) sent with the `sources` formats.
+        """
+        video = initials['videoModel']
+        title = video['title']
+        formats = []
+        format_urls = set()
+        format_sizes = {}
+        http_headers = {'Referer': referrer}
+        for quality, size in traverse_obj(video, (
+                'sources', 'download', T(dict.items), Ellipsis,
+                T(lambda kv: (kv[0], float_or_none(kv[1]['size']))),
+                T(lambda kv: (kv[1] is not None) and kv))):
+            format_sizes[quality] = size
+        # Download link takes some time to be generated,
+        # skipping for now
+        for format_id, formats_dict in traverse_obj(video, (
+                'sources', T(dict.items),
+                lambda _, kv: kv[0] != 'download' and isinstance(kv[1], dict))):
+            for quality, format_url in traverse_obj(formats_dict, (
+                    T(dict.items), Ellipsis,
+                    T(lambda kv: (kv[0], url_or_none(kv[1]))))):
+                # url_or_none() gives None for unusable values: skip those
+                if not format_url or format_url in format_urls:
+                    continue
+                format_urls.add(format_url)
+                formats.append({
+                    'format_id': '%s-%s' % (format_id, quality),
+                    'url': format_url,
+                    'ext': determine_ext(format_url, 'mp4'),
+                    'height': self._get_height(quality),
+                    'filesize': format_sizes.get(quality),
+                    'http_headers': http_headers,
+                })
+        xplayer_sources = traverse_obj(
+            initials, ('xplayerSettings', 'sources', T(dict)))
+        for hls_url in traverse_obj(
+                xplayer_sources,
+                ('hls', ('url', 'fallback'), T(lambda u: urljoin(url, u)))):
+            if hls_url in format_urls:
+                continue
+            format_urls.add(hls_url)
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        for format_id, formats_list in traverse_obj(
+                xplayer_sources, ('standard', T(dict.items), Ellipsis)):
+            for standard_format in traverse_obj(formats_list, Ellipsis):
+                for standard_url in traverse_obj(
+                        standard_format,
+                        (('url', 'fallback'), T(lambda u: urljoin(url, u)))):
+                    # skip URLs already collected, as for the other sources
+                    if standard_url in format_urls:
+                        continue
+                    format_urls.add(standard_url)
+                    ext = determine_ext(standard_url, 'mp4')
+                    if ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            standard_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                            m3u8_id='hls', fatal=False))
+                    else:
+                        quality = traverse_obj(standard_format, (('quality', 'label'), T(txt_or_none)), get_all=False) or ''
+                        formats.append({
+                            'format_id': '%s-%s' % (format_id, quality),
+                            'url': standard_url,
+                            'ext': ext,
+                            'height': self._get_height(quality),
+                            'filesize': format_sizes.get(quality),
+                            'http_headers': {
+                                'Referer': standard_url,
+                            },
+                        })
+        self._sort_formats(
+            formats, field_preference=('height', 'width', 'tbr', 'format_id'))
+
+        categories = traverse_obj(video, ('categories', Ellipsis, 'name', T(txt_or_none))) or None
+        uploader_url = traverse_obj(video, ('author', 'pageURL', T(url_or_none)))
+
+        return merge_dicts({
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'uploader_url': uploader_url,
+            'uploader_id': uploader_url.split('/')[-1] if uploader_url else None,
+            'age_limit': age_limit if age_limit is not None else 18,
+            'categories': categories,
+            'formats': formats,
+        }, traverse_obj(video, {
+            'description': ('description', T(txt_or_none)),
+            'timestamp': ('created', T(int_or_none)),
+            'uploader': ('author', 'name', T(txt_or_none)),
+            'thumbnail': ('thumbURL', T(url_or_none)),
+            'duration': ('duration', T(int_or_none)),
+            'view_count': ('views', T(int_or_none)),
+            'like_count': ('rating', 'likes', T(int_or_none)),
+            'dislike_count': ('rating', 'dislikes', T(int_or_none)),
+            'comment_count': ('comments', T(int_or_none)),
+        }))
+
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id') or mobj.group('id_2')
-        display_id = mobj.group('display_id') or mobj.group('display_id_2')
+        mobj = self._match_valid_url(url)
+        video_id = traverse_obj(mobj, 'id', 'id_2')
+        display_id = traverse_obj(mobj, 'display_id', 'display_id_2')
 
         desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url)
         webpage, urlh = self._download_webpage_handle(desktop_url, video_id)
@@ -176,139 +288,20 @@ class XHamsterIE(XHamsterBaseIE):
 
         age_limit = self._rta_search(webpage)
 
-        def get_height(s):
-            return int_or_none(self._search_regex(
-                r'^(\d+)[pP]', s, 'height', default=None))
-
         initials = self._parse_json(
             self._search_regex(
                 (r'window\.initials\s*=\s*({.+?})\s*;\s*',
                  r'window\.initials\s*=\s*({.+?})\s*;'), webpage, 'initials',
                 default='{}'),
             video_id, fatal=False)
+
         if initials:
-            video = initials['videoModel']
-            title = video['title']
-            formats = []
-            format_urls = set()
-            format_sizes = {}
-            sources = try_get(video, lambda x: x['sources'], dict) or {}
-            for format_id, formats_dict in sources.items():
-                if not isinstance(formats_dict, dict):
-                    continue
-                download_sources = try_get(sources, lambda x: x['download'], dict) or {}
-                for quality, format_dict in download_sources.items():
-                    if not isinstance(format_dict, dict):
-                        continue
-                    format_sizes[quality] = float_or_none(format_dict.get('size'))
-                for quality, format_item in formats_dict.items():
-                    if format_id == 'download':
-                        # Download link takes some time to be generated,
-                        # skipping for now
-                        continue
-                    format_url = format_item
-                    format_url = url_or_none(format_url)
-                    if not format_url or format_url in format_urls:
-                        continue
-                    format_urls.add(format_url)
-                    formats.append({
-                        'format_id': '%s-%s' % (format_id, quality),
-                        'url': format_url,
-                        'ext': determine_ext(format_url, 'mp4'),
-                        'height': get_height(quality),
-                        'filesize': format_sizes.get(quality),
-                        'http_headers': {
-                            'Referer': urlh.geturl(),
-                        },
-                    })
-            xplayer_sources = try_get(
-                initials, lambda x: x['xplayerSettings']['sources'], dict)
-            if xplayer_sources:
-                hls_sources = xplayer_sources.get('hls')
-                if isinstance(hls_sources, dict):
-                    for hls_format_key in ('url', 'fallback'):
-                        hls_url = hls_sources.get(hls_format_key)
-                        if not hls_url:
-                            continue
-                        hls_url = urljoin(url, hls_url)
-                        if not hls_url or hls_url in format_urls:
-                            continue
-                        format_urls.add(hls_url)
-                        formats.extend(self._extract_m3u8_formats(
-                            hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                            m3u8_id='hls', fatal=False))
-                standard_sources = xplayer_sources.get('standard')
-                if isinstance(standard_sources, dict):
-                    for format_id, formats_list in standard_sources.items():
-                        if not isinstance(formats_list, list):
-                            continue
-                        for standard_format in formats_list:
-                            if not isinstance(standard_format, dict):
-                                continue
-                            for standard_format_key in ('url', 'fallback'):
-                                standard_url = standard_format.get(standard_format_key)
-                                if not standard_url:
-                                    continue
-                                standard_url = urljoin(url, standard_url)
-                                if not standard_url or standard_url in format_urls:
-                                    continue
-                                format_urls.add(standard_url)
-                                ext = determine_ext(standard_url, 'mp4')
-                                if ext == 'm3u8':
-                                    formats.extend(self._extract_m3u8_formats(
-                                        standard_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                                        m3u8_id='hls', fatal=False))
-                                    continue
-                                quality = (str_or_none(standard_format.get('quality'))
-                                           or str_or_none(standard_format.get('label'))
-                                           or '')
-                                formats.append({
-                                    'format_id': '%s-%s' % (format_id, quality),
-                                    'url': standard_url,
-                                    'ext': ext,
-                                    'height': get_height(quality),
-                                    'filesize': format_sizes.get(quality),
-                                    'http_headers': {
-                                        'Referer': standard_url,
-                                    },
-                                })
-            self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
+            return self._extract_initials(initials, video_id, display_id, url, urlh.geturl(), age_limit)
 
-            categories_list = video.get('categories')
-            if isinstance(categories_list, list):
-                categories = []
-                for c in categories_list:
-                    if not isinstance(c, dict):
-                        continue
-                    c_name = c.get('name')
-                    if isinstance(c_name, compat_str):
-                        categories.append(c_name)
-            else:
-                categories = None
+        return self._old_real_extract(webpage, video_id, display_id, age_limit)
 
-            uploader_url = url_or_none(try_get(video, lambda x: x['author']['pageURL']))
-            return {
-                'id': video_id,
-                'display_id': display_id,
-                'title': title,
-                'description': video.get('description'),
-                'timestamp': int_or_none(video.get('created')),
-                'uploader': try_get(
-                    video, lambda x: x['author']['name'], compat_str),
-                'uploader_url': uploader_url,
-                'uploader_id': uploader_url.split('/')[-1] if uploader_url else None,
-                'thumbnail': video.get('thumbURL'),
-                'duration': int_or_none(video.get('duration')),
-                'view_count': int_or_none(video.get('views')),
-                'like_count': int_or_none(try_get(
-                    video, lambda x: x['rating']['likes'], int)),
-                'dislike_count': int_or_none(try_get(
-                    video, lambda x: x['rating']['dislikes'], int)),
-                'comment_count': int_or_none(video.get('views')),
-                'age_limit': age_limit if age_limit is not None else 18,
-                'categories': categories,
-                'formats': formats,
-            }
+    def _old_real_extract(self, webpage, video_id, display_id, age_limit):
+        """Build the info dict from the old page layout (no `window.initials` data)."""
 
         # Old layout fallback
 
@@ -326,17 +319,17 @@ class XHamsterIE(XHamsterBaseIE):
                 r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources',
                 default='{}'),
             video_id, fatal=False)
-        for format_id, format_url in sources.items():
-            format_url = url_or_none(format_url)
-            if not format_url:
-                continue
+        for format_id, format_url in traverse_obj(sources, (
+                T(dict.items), Ellipsis,
+                T(lambda kv: (kv[0], url_or_none(kv[1]))),
+                T(lambda kv: kv[1] and kv))):
             if format_url in format_urls:
                 continue
             format_urls.add(format_url)
             formats.append({
                 'format_id': format_id,
                 'url': format_url,
-                'height': get_height(format_id),
+                'height': self._get_height(format_id),
             })
 
         video_url = self._search_regex(
@@ -351,62 +344,49 @@ class XHamsterIE(XHamsterBaseIE):
 
         self._sort_formats(formats)
 
-        # Only a few videos have an description
-        mobj = re.search(r'Description: ([^<]+)', webpage)
-        description = mobj.group(1) if mobj else None
-
-        upload_date = unified_strdate(self._search_regex(
-            r'hint=["\'](\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}',
-            webpage, 'upload date', fatal=False))
-
         uploader = self._html_search_regex(
             r']+itemprop=["\']author[^>]+>]+>]+>([^<]+)',
             webpage, 'uploader', default='anonymous')
 
-        thumbnail = self._search_regex(
-            [r'''["']thumbUrl["']\s*:\s*(?P["'])(?P.+?)(?P=q)''',
-             r''']+"poster"=(?P["'])(?P.+?)(?P=q)[^>]*>'''],
-            webpage, 'thumbnail', fatal=False, group='thumbnail')
-
-        duration = parse_duration(self._search_regex(
-            [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']',
-             r'Runtime:\s*\s*([\d:]+)'], webpage,
-            'duration', fatal=False))
-
-        view_count = int_or_none(self._search_regex(
-            r'content=["\']User(?:View|Play)s:(\d+)',
-            webpage, 'view count', fatal=False))
-
-        mobj = re.search(r'hint=[\'"](?P\d+) Likes / (?P\d+) Dislikes', webpage)
-        (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)
-
-        mobj = re.search(r'Comments \((?P\d+)\)', webpage)
-        comment_count = mobj.group('commentcount') if mobj else 0
-
-        categories_html = self._search_regex(
-            r'(?s)Categories:.+?)', webpage,
-            'categories', default=None)
         categories = [clean_html(category) for category in re.findall(
-            r']+>(.+?)', categories_html)] if categories_html else None
+            r']+>(.+?)', self._search_regex(
+                r'(?s)Categories:.+?)', webpage,
+                'categories', default=''))]
 
-        return {
+        return merge_dicts({
             'id': video_id,
             'display_id': display_id,
             'title': title,
-            'description': description,
-            'upload_date': upload_date,
+            # Only a few videos have a description
+            'description': traverse_obj(
+                re.search(r'Description:\s*([^<]+)', webpage), 1),
+            'upload_date': unified_strdate(self._search_regex(
+                r'hint=["\'](\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}',
+                webpage, 'upload date', fatal=False)),
             'uploader': uploader,
-            'uploader_id': uploader.lower() if uploader else None,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'view_count': view_count,
-            'like_count': int_or_none(like_count),
-            'dislike_count': int_or_none(dislike_count),
-            'comment_count': int_or_none(comment_count),
+            'uploader_id': (uploader or '').lower() or None,
+            'thumbnail': url_or_none(self._search_regex(
+                (r'''["']thumbUrl["']\s*:\s*(?P["'])(?P.+?)(?P=q)''',
+                 r''']+"poster"=(?P["'])(?P.+?)(?P=q)[^>]*>'''),
+                webpage, 'thumbnail', fatal=False, group='thumbnail')),
+            'duration': parse_duration(self._search_regex(
+                (r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']',
+                 r'Runtime:\s*\s*([\d:]+)'), webpage,
+                'duration', fatal=False)),
+            'view_count': int_or_none(self._search_regex(
+                r'content=["\']User(?:View|Play)s:\s*(\d+)',
+                webpage, 'view count', fatal=False)),
+            'comment_count': traverse_obj(
+                re.search(r'Comments \((?P\d+)\)', webpage),
+                ('commentcount', T(int_or_none))),
             'age_limit': age_limit,
-            'categories': categories,
+            'categories': categories or None,
             'formats': formats,
-        }
+        }, traverse_obj(
+            re.search(r'hint=[\'"](?P\d+) Likes / (?P\d+) Dislikes', webpage), {
+                'like_count': ('likecount', T(int_or_none)),
+                'dislike_count': ('dislikecount', T(int_or_none)),
+            }))
 
 
 class XHamsterEmbedIE(XHamsterBaseIE):
@@ -420,6 +400,7 @@ class XHamsterEmbedIE(XHamsterBaseIE):
             'timestamp': 1406581861,
             'upload_date': '20140728',
             'uploader': 'ManyakisArt',
+            'uploader_id': 'manyakisart',
             'duration': 5,
             'age_limit': 18,
         }
@@ -444,7 +425,7 @@ class XHamsterEmbedIE(XHamsterBaseIE):
             vars = self._parse_json(
                 self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'),
                 video_id)
-            video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl'))
+            video_url = traverse_obj(vars, 'downloadLink', 'homepageLink', 'commentsLink', 'shareUrl', expected_type=url_or_none)
 
         return self.url_result(video_url, 'XHamster')
 