[youtube] Improve xsrf token extraction (closes #27442)

This commit is contained in:
Sergey M․ 2020-12-20 00:48:44 +07:00
parent 3729c52f9d
commit 942b8ca3be
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D

View File

@@ -300,6 +300,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
video_id) video_id)
def _extract_ytcfg(self, video_id, webpage):
    """Locate the argument of a ``ytcfg.set({...});`` call in *webpage* and
    hand it to the JSON parser.

    The string '{}' is supplied as the search default, and parsing is
    requested with ``fatal=False``; *video_id* is forwarded to the parser.
    """
    ytcfg_json = self._search_regex(
        r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
        default='{}')
    return self._parse_json(ytcfg_json, video_id, fatal=False)
class YoutubeIE(YoutubeBaseInfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com' IE_DESC = 'YouTube.com'
@@ -2283,12 +2289,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# annotations # annotations
video_annotations = None video_annotations = None
if self._downloader.params.get('writeannotations', False): if self._downloader.params.get('writeannotations', False):
xsrf_token = None
ytcfg = self._extract_ytcfg(video_id, video_webpage)
if ytcfg:
xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
if not xsrf_token:
xsrf_token = self._search_regex( xsrf_token = self._search_regex(
r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2', r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
video_webpage, 'xsrf token', group='xsrf_token', fatal=False) video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
invideo_url = try_get( invideo_url = try_get(
player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
if xsrf_token and invideo_url: if xsrf_token and invideo_url:
xsrf_field_name = None
if ytcfg:
xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
if not xsrf_field_name:
xsrf_field_name = self._search_regex( xsrf_field_name = self._search_regex(
r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2', r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
video_webpage, 'xsrf field name', video_webpage, 'xsrf field name',
@@ -3130,10 +3145,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
playlist_title=title) playlist_title=title)
def _extract_identity_token(self, webpage, item_id): def _extract_identity_token(self, webpage, item_id):
ytcfg = self._parse_json( ytcfg = self._extract_ytcfg(item_id, webpage)
self._search_regex(
r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
default='{}'), item_id, fatal=False)
if ytcfg: if ytcfg:
token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
if token: if token: