youtube-dl/youtube_dl/extractor/ninegag.py

131 lines
4.1 KiB
Python
Raw Normal View History

2014-01-29 02:55:06 +09:00
from __future__ import unicode_literals
2013-12-05 22:29:08 +09:00
from .common import InfoExtractor
2021-01-19 18:21:37 +09:00
from ..utils import (
2021-01-19 18:23:02 +09:00
ExtractorError,
2021-02-19 19:55:14 +09:00
determine_ext,
2021-01-19 18:21:37 +09:00
int_or_none,
2021-01-19 18:23:02 +09:00
try_get,
2021-02-19 19:55:14 +09:00
unescapeHTML,
2021-01-19 18:23:02 +09:00
url_or_none,
2021-01-19 18:21:37 +09:00
)
2013-12-05 22:29:08 +09:00
class NineGagIE(InfoExtractor):
    """Extractor for animated posts hosted at 9gag.com/gag/<id>."""

    IE_NAME = '9gag'
    _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[^/?&#]+)'

    _TESTS = [{
        'url': 'https://9gag.com/gag/ae5Ag7B',
        'info_dict': {
            'id': 'ae5Ag7B',
            'ext': 'mp4',
            'title': 'Capybara Agility Training',
            'upload_date': '20191108',
            'timestamp': 1573237208,
            'categories': ['Awesome'],
            'tags': ['Weimaraner', 'American Pit Bull Terrier'],
            'duration': 44,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
        }
    }, {
        # HTML escaped title
        'url': 'https://9gag.com/gag/av5nvyb',
        'only_matching': True,
    }]

    # Prefix carried by the keys of the post's ``images`` mapping.
    _IMAGE_KEY_PREFIX = 'image'

    @classmethod
    def _image_id(cls, key):
        """Return ``key`` with a leading 'image' prefix removed, if present.

        ``str.strip('image')`` must not be used here: it treats its argument
        as a set of characters and strips them from both ends, so a key such
        as 'image460svwm' would lose its final 'm' as well.
        """
        prefix = cls._IMAGE_KEY_PREFIX
        if key.startswith(prefix):
            return key[len(prefix):]
        return key

    def _real_extract(self, url):
        """Fetch the post's JSON and build the info dict for ``url``.

        Raises ExtractorError (expected=True) when the post's ``type`` is
        not 'Animated', i.e. the post carries no video.
        """
        post_id = self._match_id(url)
        post = self._download_json(
            'https://9gag.com/v1/post', post_id, query={
                'id': post_id
            })['data']['post']

        if post.get('type') != 'Animated':
            raise ExtractorError(
                'The given url does not contain a video',
                expected=True)

        title = unescapeHTML(post['title'])

        duration = None
        formats = []
        thumbnails = []
        for key, image in (post.get('images') or {}).items():
            # Skip malformed entries instead of failing on ``.get``.
            if not isinstance(image, dict):
                continue
            image_url = url_or_none(image.get('url'))
            if not image_url:
                continue
            ext = determine_ext(image_url)
            image_id = self._image_id(key)
            common = {
                'url': image_url,
                'width': int_or_none(image.get('width')),
                'height': int_or_none(image.get('height')),
            }
            if ext in ('jpg', 'png'):
                # Still images become thumbnails; a WebP variant, when
                # present, is listed as a separate thumbnail entry.
                webp_url = image.get('webpUrl')
                if webp_url:
                    t = common.copy()
                    t.update({
                        'id': image_id + '-webp',
                        'url': webp_url,
                    })
                    thumbnails.append(t)
                common.update({
                    'id': image_id,
                    'ext': ext,
                })
                thumbnails.append(common)
            elif ext in ('webm', 'mp4'):
                # The first non-zero duration found wins.
                if not duration:
                    duration = int_or_none(image.get('duration'))
                # Only an explicit hasAudio == 0 marks the stream as silent.
                common['acodec'] = 'none' if image.get('hasAudio') == 0 else None
                # Per-codec alternative URLs are exposed as extra formats.
                for vcodec in ('vp8', 'vp9', 'h265'):
                    c_url = image.get(vcodec + 'Url')
                    if not c_url:
                        continue
                    c_f = common.copy()
                    c_f.update({
                        'format_id': image_id + '-' + vcodec,
                        'url': c_url,
                        'vcodec': vcodec,
                    })
                    formats.append(c_f)
                common.update({
                    'ext': ext,
                    'format_id': image_id,
                })
                formats.append(common)
        self._sort_formats(formats)

        section = try_get(post, lambda x: x['postSection']['name'])

        tags = None
        post_tags = post.get('tags')
        if post_tags:
            # Keep only well-formed tag entries that carry a non-empty key.
            tags = [
                tag['key'] for tag in post_tags
                if isinstance(tag, dict) and tag.get('key')]

        def get_count(name):
            # Counters are stored in the post as '<name>Count'.
            return int_or_none(post.get(name + 'Count'))

        return {
            'id': post_id,
            'title': title,
            'timestamp': int_or_none(post.get('creationTs')),
            'duration': duration,
            'formats': formats,
            'thumbnails': thumbnails,
            'like_count': get_count('upVote'),
            'dislike_count': get_count('downVote'),
            'comment_count': get_count('comments'),
            'age_limit': 18 if post.get('nsfw') == 1 else None,
            'categories': [section] if section else None,
            'tags': tags,
        }