From fd61f317bf71b91abbc5cd256027877b70f6dbce Mon Sep 17 00:00:00 2001 From: Tim Mann Date: Sat, 13 Feb 2021 16:26:33 -0800 Subject: [PATCH] Step 1 of a rewrite to find and parse embedded JSON instead of just running a regexp over the whole page. This version passes the tests, but more work is needed. --- youtube_dl/extractor/pac12.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pac12.py b/youtube_dl/extractor/pac12.py index 534261e15..07da749d3 100644 --- a/youtube_dl/extractor/pac12.py +++ b/youtube_dl/extractor/pac12.py @@ -1,5 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +from __future__ import print_function #XXX +import pprint #XXX import re @@ -32,9 +34,15 @@ class Pac12IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_url = \ - self._search_regex(r'"manifest_url":"(?P<url>https:[^"]+)"', - webpage, 'url', group='url', default=None) + + drupal_settings = self._parse_json( + self._search_regex( + r'<script[^>]+type="application/json"[^>]*data-drupal-selector="drupal-settings-json">([^<]+)', + webpage, 'drupal settings'), video_id) + pprint.pprint(drupal_settings.get('currentVideo')) + + video_url = drupal_settings.get('currentVideo', {}).get('manifest_url') + vod_url = None if (video_url is None) or ('vod-' not in url): vod_url = self._search_regex(r'(https?://(?:embed\.)?pac-12\.com/(?:embed/)?vod-[0-9a-zA-Z]+)',