[InfoExtractor] Add _match_valid_url() class method and refactor

* API compatible with yt-dlp
* also support Sequence of patterns in _VALID_URL
* one place to compile _VALID_URL
* TODO: remove existing extractor shims
This commit is contained in:
dirkf 2023-07-19 14:14:50 +01:00
parent a190b55964
commit b2ba24bb02
3 changed files with 49 additions and 22 deletions

View File

@ -4,6 +4,7 @@ from inspect import getsource
import io import io
import os import os
from os.path import dirname as dirn from os.path import dirname as dirn
import re
import sys import sys
print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr) print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr)
@ -29,11 +30,18 @@ from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
with open('devscripts/lazy_load_template.py', 'rt') as f: with open('devscripts/lazy_load_template.py', 'rt') as f:
module_template = f.read() module_template = f.read()
def get_source(m):
return re.sub(r'(?m)^\s*#.*\n', '', getsource(m))
module_contents = [ module_contents = [
module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', module_template,
get_source(InfoExtractor.suitable),
get_source(InfoExtractor._match_valid_url) + '\n',
'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n', 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n',
# needed for suitable() methods of Youtube extractor (see #28780) # needed for suitable() methods of Youtube extractor (see #28780)
'from youtube_dl.utils import parse_qs\n', 'from youtube_dl.utils import parse_qs, variadic\n',
] ]
ie_template = ''' ie_template = '''
@ -66,7 +74,7 @@ def build_lazy_ie(ie, name):
valid_url=valid_url, valid_url=valid_url,
module=ie.__module__) module=ie.__module__)
if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
s += '\n' + getsource(ie.suitable) s += '\n' + get_source(ie.suitable)
if hasattr(ie, '_make_valid_url'): if hasattr(ie, '_make_valid_url'):
# search extractors # search extractors
s += make_valid_template.format(valid_url=ie._make_valid_url()) s += make_valid_template.format(valid_url=ie._make_valid_url())

View File

@ -83,6 +83,7 @@ from ..utils import (
urljoin, urljoin,
url_basename, url_basename,
url_or_none, url_or_none,
variadic,
xpath_element, xpath_element,
xpath_text, xpath_text,
xpath_with_ns, xpath_with_ns,
@ -371,9 +372,22 @@ class InfoExtractor(object):
title, description etc. title, description etc.
Subclasses of this one should re-define the _real_initialize() and A subclass of InfoExtractor must be defined to handle each specific site (or
_real_extract() methods and define a _VALID_URL regexp. several sites). Such a concrete subclass should be added to the list of
Probably, they should also be added to the list of extractors. extractors. It should also:
* define its _VALID_URL attribute as a regexp, or a Sequence of alternative
regexps (but see below)
* re-define the _real_extract() method
* optionally re-define the _real_initialize() method.
An extractor subclass may also override suitable() if necessary, but the
function signature must be preserved and the function must import everything
it needs (except other extractors), so that lazy_extractors works correctly.
If the subclass's suitable() and _real_extract() functions avoid using
_VALID_URL, the subclass need not set that class attribute.
An abstract subclass of InfoExtractor may be used to simplify implementation
within an extractor module; it should not be added to the list of extractors.
_GEO_BYPASS attribute may be set to False in order to disable _GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor. geo restriction bypass mechanisms for a particular extractor.
@ -408,22 +422,33 @@ class InfoExtractor(object):
self._x_forwarded_for_ip = None self._x_forwarded_for_ip = None
self.set_downloader(downloader) self.set_downloader(downloader)
@classmethod
def __match_valid_url(cls, url):
# This does not use has/getattr intentionally - we want to know whether
# we have cached the regexp for cls, whereas getattr would also
# match its superclass
if '_VALID_URL_RE' not in cls.__dict__:
# _VALID_URL can now be a list/tuple of patterns
cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
# 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in 2.7
for p in cls._VALID_URL_RE:
p = p.match(url)
if p:
return p
# The public alias can safely be overridden, as in some back-ports
_match_valid_url = __match_valid_url
@classmethod @classmethod
def suitable(cls, url): def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE.""" """Receives a URL and returns True if suitable for this IE."""
# This function must import everything it needs (except other extractors),
# This does not use has/getattr intentionally - we want to know whether # so that lazy_extractors works correctly
# we have cached the regexp for *this* class, whereas getattr would also return cls.__match_valid_url(url) is not None
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
return cls._VALID_URL_RE.match(url) is not None
@classmethod @classmethod
def _match_id(cls, url): def _match_id(cls, url):
if '_VALID_URL_RE' not in cls.__dict__: m = cls.__match_valid_url(url)
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
m = cls._VALID_URL_RE.match(url)
assert m assert m
return compat_str(m.group('id')) return compat_str(m.group('id'))

View File

@ -18,12 +18,6 @@ from ..utils import (
class GlobalPlayerBaseIE(InfoExtractor): class GlobalPlayerBaseIE(InfoExtractor):
import re
@classmethod
def _match_valid_url(cls, url):
return cls.re.match(cls._VALID_URL, url)
def _get_page_props(self, url, video_id): def _get_page_props(self, url, video_id):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
return self._search_nextjs_data(webpage, video_id)['props']['pageProps'] return self._search_nextjs_data(webpage, video_id)['props']['pageProps']