From af3434b839b2aae4cdf8ae607b34caa3560c663a Mon Sep 17 00:00:00 2001
From: Henrik Heimbuerger <henrik@heimbuerger.de>
Date: Sat, 18 Apr 2020 06:15:03 +0200
Subject: [PATCH] [nebula] Relax meta data lookups

---
 youtube_dl/extractor/nebula.py | 48 ++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 19 deletions(-)
diff --git a/youtube_dl/extractor/nebula.py b/youtube_dl/extractor/nebula.py
index 828ea1c6f..038863348 100644
--- a/youtube_dl/extractor/nebula.py
+++ b/youtube_dl/extractor/nebula.py
@@ -4,7 +4,8 @@ from __future__ import unicode_literals
 import os
 
 from .common import InfoExtractor
-from ..utils import parse_iso8601
+from ..compat import compat_str
+from ..utils import parse_iso8601, try_get
 
 COOKIE_NEBULA_AUTH = os.environ.get('COOKIE_NEBULA_AUTH')   # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
 
@@ -74,9 +75,13 @@ class NebulaIE(InfoExtractor):
         """
         As of 2020-04-07, every Nebula video page is a React base page, containing an initial state JSON in a script
         tag. This function is extracting this script tag, parsing it as JSON.
+
+        May return None if no state object could be found or it didn't contain valid JSON.
         """
-        initial_state_object = self._search_regex(r'<script id="initial-app-state" type="application/json">(.+?)</script>', webpage, 'initial_state')
-        metadata = self._parse_json(initial_state_object, video_id=display_id)   # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
+        initial_state_object = self._search_regex(
+            r'<script[^>]*id="initial-app-state"[^>]*>(.+?)</script>', webpage,
+            'initial_state', fatal=False, default=None)
+        metadata = self._parse_json(initial_state_object, video_id=display_id) if initial_state_object else None   # TODO: we don't have the real video ID yet, is it okay to pass the display_id instead?
 
         return metadata
 
@@ -84,9 +89,12 @@ class NebulaIE(InfoExtractor):
         """
         The state object contains a videos.byURL dictionary, which maps URL display IDs to video IDs. Using the
         video ID, we can then extract a dictionary with various meta data about the video itself.
+
+        May return (None, {}) if no state object was given or it didn't contain the expected lookup table or
+        meta data.
         """
-        video_id = state_object['videos']['byURL'][display_id]
-        video_meta = state_object['videos']['byID'][video_id]
+        video_id = try_get(state_object, lambda x: x['videos']['byURL'][display_id], compat_str)
+        video_meta = try_get(state_object, lambda x: x['videos']['byID'][video_id], dict) or {}
 
         return video_id, video_meta
 
@@ -100,8 +108,10 @@ class NebulaIE(InfoExtractor):
 
         # fallback: reconstruct using video ID and access token from state object
         if not video_url:
-            access_token = state_object['account']['userInfo']['zypeAuthInfo']['accessToken']
-            video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(video_id=video_id, access_token=access_token)
+            access_token = try_get(state_object, lambda x: x['account']['userInfo']['zypeAuthInfo']['accessToken'],
+                                   compat_str)
+            video_url = 'https://player.zype.com/embed/{video_id}.html?access_token={access_token}'.format(
+                video_id=video_id, access_token=access_token)
 
         return video_url
 
@@ -125,12 +135,13 @@ class NebulaIE(InfoExtractor):
         kind of ID) via an additional API call.
 
         TODO: Implement the API calls giving us the channel list, so that we can do the title lookup and then figure out the channel URL
+
+        May return None if no category list could be found or no category had a label ('value').
         """
-        categories = video_meta['categories']
+        categories = video_meta.get('categories', []) if video_meta else []
         for category in categories:
-            if category['value']:
+            if category.get('value'):   # we're intentionally not using "'value' in category" here, because the expression is supposed to be falsy for empty lists in category['value'] as well!
                 return category['value'][0]
-        return None
 
     def _real_extract(self, url):
         # FIXME: a workaround for testing, because I couldn't figure out how to supply a cookiejar when running the unit tests
@@ -163,18 +174,17 @@ class NebulaIE(InfoExtractor):
             'url': video_url,
 
             # the meta data we were able to extract from Nebula
-            'title': video_meta['title'],
-            'description': video_meta['description'],
-            'timestamp': parse_iso8601(video_meta['published_at']),
+            'title': video_meta.get('title'),
+            'description': video_meta.get('description'),
+            'timestamp': parse_iso8601(video_meta.get('published_at')),
             'thumbnails': [
                 {
-                    'id': tn['name'],   # this appears to be null in all cases I've seen
+                    'id': tn.get('name'),   # this appears to be null in all cases I've seen
                     'url': tn['url'],
-                    'width': tn['width'],
-                    'height': tn['height'],
-                } for tn in video_meta['thumbnails']
-            ],
-            'duration': video_meta['duration'],
+                    'width': tn.get('width'),
+                    'height': tn.get('height'),
+                } for tn in video_meta.get('thumbnails', [])],
+            'duration': video_meta.get('duration'),
             'channel': channel_title,
             'uploader': channel_title,   # we chose here to declare the channel name as the 'uploader' -- that's certainly arguable, as sometimes it's more of a series
             # TODO: uploader_url: the video page clearly links to this (in the example case: /lindsayellis), but I cannot figure out where it gets it from!