Compare commits

...

22 Commits

Author SHA1 Message Date
dirkf
dd2406deb8
Merge f7dc45d3a643063e653e0dcda9faeb9973ad93b5 into 420d53387cff54ea1fccca061438d59bdb50a39c 2025-03-20 02:14:49 +05:30
dirkf
420d53387c [JSInterp] Improve tests
* from yt-dlp/yt-dlp#12313
* also fix d7c2708
2025-03-11 02:00:24 +00:00
dirkf
32f89de92b [YouTube] Update TVHTML5 client parameters
* resolves #33078
2025-03-11 02:00:24 +00:00
dirkf
283dca56fe [YouTube] Initially support tce-style player JS
* resolves #33079
2025-03-11 02:00:24 +00:00
dirkf
422b1b31cf [YouTube] Temporarily redirect from tce-style player JS 2025-03-11 02:00:24 +00:00
dirkf
1dc27e1c3b [JSInterp] Make indexing error handling more conformant
* by default TypeError -> undefined, else raise
* set allow_undefined=True/False to override
2025-03-11 02:00:24 +00:00
dirkf
af049e309b [JSInterp] Handle undefined, etc, passed to JS_RegExp and Exception 2025-03-11 02:00:24 +00:00
dirkf
94849bc997 [JSInterp] Improve Date processing
* add JS_Date class implementing JS Date
* support constructor args other than date string
* support static methods of Date
* Date objects are still automatically coerced to timestamp before using in JS.
2025-03-11 02:00:24 +00:00
dirkf
974c7d7f34 [compat] Fix inheriting from compat_collections_chain_map
* see ytdl-org/youtube-dl#33079#issuecomment-2704038049
2025-03-11 02:00:24 +00:00
dirkf
8738407d77 [compat] Support zstd Content-Encoding
* see RFC 8878 7.2
2025-03-11 02:00:24 +00:00
dirkf
cecaa18b80 [compat] Clean-up
* make workaround_optparse_bug9161 private
* add comments
* avoid leaving test objects behind
2025-03-11 02:00:24 +00:00
dirkf
f7dc45d3a6 [Generic] Update KVS extraction for player v10
* support "renamed" flashvars variable
* use `_search_json()`
* support `rnd` query parameter
* extract tags, categories, age_limit, more thumbnails
* closes #31007
2024-10-22 11:18:55 +01:00
dirkf
06996aca12 [utils] Don't raise in js_to_json() template substitution when non-strict
* template expression should be evaluated with the same strictness
2024-10-22 11:18:55 +01:00
dirkf
13b0e81f17 [utils] Correctly match class names in get_element[s]_by_class()
* reproduce CSS .classname behaviour ("bar" matches "bar", "foo bar baz", etc)
* add tests
2024-10-22 11:18:55 +01:00
dirkf
01b80a0802 [XFileShare] Re-factor and fix tests
* update site list
* support page with player data in <iframe>
* use `_search_json()`
* improve "not found" detection
* improve title extraction
2024-10-22 11:18:55 +01:00
dirkf
ae0cbb84f2 [XFileShare] Add geo-block detection 2024-10-22 11:18:55 +01:00
dirkf
94f181f9f5 [YandexMusic] Fix CAPTCHA check
* correct logic in _download_webpage() hook (yt-dlp/yt-dlp#4432)
* improve error message.
2024-10-22 11:18:55 +01:00
dirkf
620298e0ff [core] Fix jwplayer format parsing
* thx yt-dlp/yt-dlp#10956
2024-10-22 11:18:55 +01:00
dirkf
c445489a46 [Mgoon,Kaltura] Fix regex typo `(:?` (should be `(?:`)
* thx yt-dlp/yt-dlp#10807 (584d455)
2024-10-22 11:18:55 +01:00
dirkf
ac0c9c8f9f [HentaiStigma] Support new frame format with HTML5 video
* resolves #25019
2024-10-22 11:18:55 +01:00
dirkf
515c8b85b1 [ORFRadio] Support /programm/ URL format
* fixes yt-dlp/yt-dlp#11014
2024-10-22 11:18:55 +01:00
dirkf
48082c9091 [core] Let Git ignore __pycache__, .pytest_cache 2024-10-22 11:18:55 +01:00
17 changed files with 528 additions and 176 deletions

2
.gitignore vendored
View File

@ -1,3 +1,4 @@
__pycache__/
*.pyc *.pyc
*.pyo *.pyo
*.class *.class
@ -5,6 +6,7 @@
*.DS_Store *.DS_Store
wine-py2exe/ wine-py2exe/
py2exe.log py2exe.log
.pytest_cache/
*.kate-swp *.kate-swp
build/ build/
dist/ dist/

View File

@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import math import math
import re import re
import time
from youtube_dl.compat import compat_str as str from youtube_dl.compat import compat_str as str
from youtube_dl.jsinterp import JS_Undefined, JSInterpreter from youtube_dl.jsinterp import JS_Undefined, JSInterpreter
@ -208,6 +209,34 @@ class TestJSInterpreter(unittest.TestCase):
self._test(jsi, 86000, args=['12/31/1969 18:01:26 MDT']) self._test(jsi, 86000, args=['12/31/1969 18:01:26 MDT'])
# epoch 0 # epoch 0
self._test(jsi, 0, args=['1 January 1970 00:00:00 UTC']) self._test(jsi, 0, args=['1 January 1970 00:00:00 UTC'])
# undefined
self._test(jsi, NaN, args=[JS_Undefined])
# y,m,d, ... - may fail with older dates lacking DST data
jsi = JSInterpreter(
'function f() { return new Date(%s); }'
% ('2024, 5, 29, 2, 52, 12, 42',))
self._test(jsi, (
1719625932042 # UK value
+ (
+ 3600 # back to GMT
+ (time.altzone if time.daylight # host's DST
else time.timezone)
) * 1000))
# no arg
self.assertAlmostEqual(JSInterpreter(
'function f() { return new Date() - 0; }').call_function('f'),
time.time() * 1000, delta=100)
# Date.now()
self.assertAlmostEqual(JSInterpreter(
'function f() { return Date.now(); }').call_function('f'),
time.time() * 1000, delta=100)
# Date.parse()
jsi = JSInterpreter('function f(dt) { return Date.parse(dt); }')
self._test(jsi, 0, args=['1 January 1970 00:00:00 UTC'])
# Date.UTC()
jsi = JSInterpreter('function f() { return Date.UTC(%s); }'
% ('1970, 0, 1, 0, 0, 0, 0',))
self._test(jsi, 0)
def test_call(self): def test_call(self):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
@ -463,6 +492,14 @@ class TestJSInterpreter(unittest.TestCase):
self._test('function f(){return NaN << 42}', 0) self._test('function f(){return NaN << 42}', 0)
self._test('function f(){return "21.9" << 1}', 42) self._test('function f(){return "21.9" << 1}', 42)
self._test('function f(){return 21 << 4294967297}', 42) self._test('function f(){return 21 << 4294967297}', 42)
self._test('function f(){return true << "5";}', 32)
self._test('function f(){return true << true;}', 2)
self._test('function f(){return "19" & "21.9";}', 17)
self._test('function f(){return "19" & false;}', 0)
self._test('function f(){return "11.0" >> "2.1";}', 2)
self._test('function f(){return 5 ^ 9;}', 12)
self._test('function f(){return 0.0 << NaN}', 0)
self._test('function f(){return null << undefined}', 0)
def test_negative(self): def test_negative(self):
self._test('function f(){return 2 * -2.0 ;}', -4) self._test('function f(){return 2 * -2.0 ;}', -4)

View File

@ -1603,11 +1603,14 @@ Line 1
def test_get_element_by_class(self): def test_get_element_by_class(self):
html = ''' html = '''
<span class="foo bar">nice</span> <span class="foo bar baz-bam">nice</span>
''' '''
self.assertEqual(get_element_by_class('foo', html), 'nice') self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('bar', html), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None) self.assertEqual(get_element_by_class('no-such-class', html), None)
self.assertEqual(get_element_by_class('baz', html), None)
self.assertEqual(get_element_by_class('bam', html), None)
def test_get_element_by_attribute(self): def test_get_element_by_attribute(self):
html = ''' html = '''
@ -1626,10 +1629,13 @@ Line 1
def test_get_elements_by_class(self): def test_get_elements_by_class(self):
html = ''' html = '''
<span class="foo bar">nice</span><span class="foo bar">also nice</span> <span class="foo bar baz-bam">nice</span><span class="foo bar">also nice</span>
''' '''
self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice']) self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_class('bar', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_class('baz', html), [])
self.assertEqual(get_elements_by_class('bam', html), [])
self.assertEqual(get_elements_by_class('no-such-class', html), []) self.assertEqual(get_elements_by_class('no-such-class', html), [])
def test_get_elements_by_attribute(self): def test_get_elements_by_attribute(self):

View File

@ -223,6 +223,18 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/9c6dfc4a/player_ias.vflset/en_US/base.js', 'https://www.youtube.com/s/player/9c6dfc4a/player_ias.vflset/en_US/base.js',
'jbu7ylIosQHyJyJV', 'uwI0ESiynAmhNg', 'jbu7ylIosQHyJyJV', 'uwI0ESiynAmhNg',
), ),
(
'https://www.youtube.com/s/player/f6e09c70/player_ias.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'jHbbkcaxm54',
),
(
'https://www.youtube.com/s/player/f6e09c70/player_ias_tce.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'jHbbkcaxm54',
),
(
'https://www.youtube.com/s/player/91201489/player_ias_tce.vflset/en_US/base.js',
'W9HJZKktxuYoDTqW', 'U48vOZHaeYS6vO',
),
] ]
@ -284,7 +296,7 @@ def t_factory(name, sig_func, url_pattern):
def signature(jscode, sig_input): def signature(jscode, sig_input):
func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) func = YoutubeIE(FakeYDL({'cachedir': False}))._parse_sig_js(jscode)
src_sig = ( src_sig = (
compat_str(string.printable[:sig_input]) compat_str(string.printable[:sig_input])
if isinstance(sig_input, int) else sig_input) if isinstance(sig_input, int) else sig_input)
@ -292,9 +304,10 @@ def signature(jscode, sig_input):
def n_sig(jscode, sig_input): def n_sig(jscode, sig_input):
funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) ie = YoutubeIE(FakeYDL({'cachedir': False}))
return JSInterpreter(jscode).call_function( jsi = JSInterpreter(jscode)
funcname, sig_input, _ytdl_do_not_return=sig_input) jsi, _, func_code = ie._extract_n_function_code_jsi(sig_input, jsi)
return ie._extract_n_function_from_code(jsi, func_code)(sig_input)
make_sig_test = t_factory( make_sig_test = t_factory(

View File

@ -18,7 +18,7 @@ from .compat import (
compat_getpass, compat_getpass,
compat_register_utf8, compat_register_utf8,
compat_shlex_split, compat_shlex_split,
workaround_optparse_bug9161, _workaround_optparse_bug9161,
) )
from .utils import ( from .utils import (
_UnsafeExtensionError, _UnsafeExtensionError,
@ -50,7 +50,7 @@ def _real_main(argv=None):
# Compatibility fix for Windows # Compatibility fix for Windows
compat_register_utf8() compat_register_utf8()
workaround_optparse_bug9161() _workaround_optparse_bug9161()
setproctitle('youtube-dl') setproctitle('youtube-dl')

View File

@ -16,7 +16,6 @@ import os
import platform import platform
import re import re
import shlex import shlex
import shutil
import socket import socket
import struct import struct
import subprocess import subprocess
@ -24,11 +23,15 @@ import sys
import types import types
import xml.etree.ElementTree import xml.etree.ElementTree
_IDENTITY = lambda x: x
# naming convention # naming convention
# 'compat_' + Python3_name.replace('.', '_') # 'compat_' + Python3_name.replace('.', '_')
# other aliases exist for convenience and/or legacy # other aliases exist for convenience and/or legacy
# wrap disposable test values in type() to reclaim storage
# deal with critical unicode/str things first # deal with critical unicode/str things first:
# compat_str, compat_basestring, compat_chr
try: try:
# Python 2 # Python 2
compat_str, compat_basestring, compat_chr = ( compat_str, compat_basestring, compat_chr = (
@ -39,18 +42,23 @@ except NameError:
str, (str, bytes), chr str, (str, bytes), chr
) )
# casefold
# compat_casefold
try: try:
compat_str.casefold compat_str.casefold
compat_casefold = lambda s: s.casefold() compat_casefold = lambda s: s.casefold()
except AttributeError: except AttributeError:
from .casefold import _casefold as compat_casefold from .casefold import _casefold as compat_casefold
# compat_collections_abc
try: try:
import collections.abc as compat_collections_abc import collections.abc as compat_collections_abc
except ImportError: except ImportError:
import collections as compat_collections_abc import collections as compat_collections_abc
# compat_urllib_request
try: try:
import urllib.request as compat_urllib_request import urllib.request as compat_urllib_request
except ImportError: # Python 2 except ImportError: # Python 2
@ -79,11 +87,15 @@ except TypeError:
_add_init_method_arg(compat_urllib_request.Request) _add_init_method_arg(compat_urllib_request.Request)
del _add_init_method_arg del _add_init_method_arg
# compat_urllib_error
try: try:
import urllib.error as compat_urllib_error import urllib.error as compat_urllib_error
except ImportError: # Python 2 except ImportError: # Python 2
import urllib2 as compat_urllib_error import urllib2 as compat_urllib_error
# compat_urllib_parse
try: try:
import urllib.parse as compat_urllib_parse import urllib.parse as compat_urllib_parse
except ImportError: # Python 2 except ImportError: # Python 2
@ -98,17 +110,23 @@ except ImportError: # Python 2
compat_urlparse = compat_urllib_parse compat_urlparse = compat_urllib_parse
compat_urllib_parse_urlparse = compat_urllib_parse.urlparse compat_urllib_parse_urlparse = compat_urllib_parse.urlparse
# compat_urllib_response
try: try:
import urllib.response as compat_urllib_response import urllib.response as compat_urllib_response
except ImportError: # Python 2 except ImportError: # Python 2
import urllib as compat_urllib_response import urllib as compat_urllib_response
# compat_urllib_response.addinfourl
try: try:
compat_urllib_response.addinfourl.status compat_urllib_response.addinfourl.status
except AttributeError: except AttributeError:
# .getcode() is deprecated in Py 3. # .getcode() is deprecated in Py 3.
compat_urllib_response.addinfourl.status = property(lambda self: self.getcode()) compat_urllib_response.addinfourl.status = property(lambda self: self.getcode())
# compat_http_cookiejar
try: try:
import http.cookiejar as compat_cookiejar import http.cookiejar as compat_cookiejar
except ImportError: # Python 2 except ImportError: # Python 2
@ -127,12 +145,16 @@ else:
compat_cookiejar_Cookie = compat_cookiejar.Cookie compat_cookiejar_Cookie = compat_cookiejar.Cookie
compat_http_cookiejar_Cookie = compat_cookiejar_Cookie compat_http_cookiejar_Cookie = compat_cookiejar_Cookie
# compat_http_cookies
try: try:
import http.cookies as compat_cookies import http.cookies as compat_cookies
except ImportError: # Python 2 except ImportError: # Python 2
import Cookie as compat_cookies import Cookie as compat_cookies
compat_http_cookies = compat_cookies compat_http_cookies = compat_cookies
# compat_http_cookies_SimpleCookie
if sys.version_info[0] == 2 or sys.version_info < (3, 3): if sys.version_info[0] == 2 or sys.version_info < (3, 3):
class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie):
def load(self, rawdata): def load(self, rawdata):
@ -155,11 +177,15 @@ else:
compat_cookies_SimpleCookie = compat_cookies.SimpleCookie compat_cookies_SimpleCookie = compat_cookies.SimpleCookie
compat_http_cookies_SimpleCookie = compat_cookies_SimpleCookie compat_http_cookies_SimpleCookie = compat_cookies_SimpleCookie
# compat_html_entities, probably useless now
try: try:
import html.entities as compat_html_entities import html.entities as compat_html_entities
except ImportError: # Python 2 except ImportError: # Python 2
import htmlentitydefs as compat_html_entities import htmlentitydefs as compat_html_entities
# compat_html_entities_html5
try: # Python >= 3.3 try: # Python >= 3.3
compat_html_entities_html5 = compat_html_entities.html5 compat_html_entities_html5 = compat_html_entities.html5
except AttributeError: except AttributeError:
@ -2408,18 +2434,24 @@ except AttributeError:
# Py < 3.1 # Py < 3.1
compat_http_client.HTTPResponse.getcode = lambda self: self.status compat_http_client.HTTPResponse.getcode = lambda self: self.status
# compat_urllib_HTTPError
try: try:
from urllib.error import HTTPError as compat_HTTPError from urllib.error import HTTPError as compat_HTTPError
except ImportError: # Python 2 except ImportError: # Python 2
from urllib2 import HTTPError as compat_HTTPError from urllib2 import HTTPError as compat_HTTPError
compat_urllib_HTTPError = compat_HTTPError compat_urllib_HTTPError = compat_HTTPError
# compat_urllib_request_urlretrieve
try: try:
from urllib.request import urlretrieve as compat_urlretrieve from urllib.request import urlretrieve as compat_urlretrieve
except ImportError: # Python 2 except ImportError: # Python 2
from urllib import urlretrieve as compat_urlretrieve from urllib import urlretrieve as compat_urlretrieve
compat_urllib_request_urlretrieve = compat_urlretrieve compat_urllib_request_urlretrieve = compat_urlretrieve
# compat_html_parser_HTMLParser, compat_html_parser_HTMLParseError
try: try:
from HTMLParser import ( from HTMLParser import (
HTMLParser as compat_HTMLParser, HTMLParser as compat_HTMLParser,
@ -2432,22 +2464,33 @@ except ImportError: # Python 3
# HTMLParseError was deprecated in Python 3.3 and removed in # HTMLParseError was deprecated in Python 3.3 and removed in
# Python 3.5. Introducing dummy exception for Python >3.5 for compatible # Python 3.5. Introducing dummy exception for Python >3.5 for compatible
# and uniform cross-version exception handling # and uniform cross-version exception handling
class compat_HTMLParseError(Exception): class compat_HTMLParseError(Exception):
pass pass
compat_html_parser_HTMLParser = compat_HTMLParser compat_html_parser_HTMLParser = compat_HTMLParser
compat_html_parser_HTMLParseError = compat_HTMLParseError compat_html_parser_HTMLParseError = compat_HTMLParseError
# compat_subprocess_get_DEVNULL
try: try:
_DEVNULL = subprocess.DEVNULL _DEVNULL = subprocess.DEVNULL
compat_subprocess_get_DEVNULL = lambda: _DEVNULL compat_subprocess_get_DEVNULL = lambda: _DEVNULL
except AttributeError: except AttributeError:
compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
# compat_http_server
try: try:
import http.server as compat_http_server import http.server as compat_http_server
except ImportError: except ImportError:
import BaseHTTPServer as compat_http_server import BaseHTTPServer as compat_http_server
# compat_urllib_parse_unquote_to_bytes,
# compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus,
# compat_urllib_parse_urlencode,
# compat_urllib_parse_parse_qs
try: try:
from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
from urllib.parse import unquote as compat_urllib_parse_unquote from urllib.parse import unquote as compat_urllib_parse_unquote
@ -2598,6 +2641,8 @@ except ImportError: # Python 2
compat_urllib_parse_parse_qs = compat_parse_qs compat_urllib_parse_parse_qs = compat_parse_qs
# compat_urllib_request_DataHandler
try: try:
from urllib.request import DataHandler as compat_urllib_request_DataHandler from urllib.request import DataHandler as compat_urllib_request_DataHandler
except ImportError: # Python < 3.4 except ImportError: # Python < 3.4
@ -2632,16 +2677,20 @@ except ImportError: # Python < 3.4
return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url)
# compat_xml_etree_ElementTree_ParseError
try: try:
from xml.etree.ElementTree import ParseError as compat_xml_parse_error from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError: # Python 2.6 except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error from xml.parsers.expat import ExpatError as compat_xml_parse_error
compat_xml_etree_ElementTree_ParseError = compat_xml_parse_error compat_xml_etree_ElementTree_ParseError = compat_xml_parse_error
etree = xml.etree.ElementTree
# compat_xml_etree_ElementTree_Element
_etree = xml.etree.ElementTree
class _TreeBuilder(etree.TreeBuilder): class _TreeBuilder(_etree.TreeBuilder):
def doctype(self, name, pubid, system): def doctype(self, name, pubid, system):
pass pass
@ -2650,7 +2699,7 @@ try:
# xml.etree.ElementTree.Element is a method in Python <=2.6 and # xml.etree.ElementTree.Element is a method in Python <=2.6 and
# the following will crash with: # the following will crash with:
# TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types # TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types
isinstance(None, etree.Element) isinstance(None, _etree.Element)
from xml.etree.ElementTree import Element as compat_etree_Element from xml.etree.ElementTree import Element as compat_etree_Element
except TypeError: # Python <=2.6 except TypeError: # Python <=2.6
from xml.etree.ElementTree import _ElementInterface as compat_etree_Element from xml.etree.ElementTree import _ElementInterface as compat_etree_Element
@ -2658,12 +2707,12 @@ compat_xml_etree_ElementTree_Element = compat_etree_Element
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
def compat_etree_fromstring(text): def compat_etree_fromstring(text):
return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) return _etree.XML(text, parser=_etree.XMLParser(target=_TreeBuilder()))
else: else:
# python 2.x tries to encode unicode strings with ascii (see the # python 2.x tries to encode unicode strings with ascii (see the
# XMLParser._fixtext method) # XMLParser._fixtext method)
try: try:
_etree_iter = etree.Element.iter _etree_iter = _etree.Element.iter
except AttributeError: # Python <=2.6 except AttributeError: # Python <=2.6
def _etree_iter(root): def _etree_iter(root):
for el in root.findall('*'): for el in root.findall('*'):
@ -2675,27 +2724,29 @@ else:
# 2.7 source # 2.7 source
def _XML(text, parser=None): def _XML(text, parser=None):
if not parser: if not parser:
parser = etree.XMLParser(target=_TreeBuilder()) parser = _etree.XMLParser(target=_TreeBuilder())
parser.feed(text) parser.feed(text)
return parser.close() return parser.close()
def _element_factory(*args, **kwargs): def _element_factory(*args, **kwargs):
el = etree.Element(*args, **kwargs) el = _etree.Element(*args, **kwargs)
for k, v in el.items(): for k, v in el.items():
if isinstance(v, bytes): if isinstance(v, bytes):
el.set(k, v.decode('utf-8')) el.set(k, v.decode('utf-8'))
return el return el
def compat_etree_fromstring(text): def compat_etree_fromstring(text):
doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory))) doc = _XML(text, parser=_etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory)))
for el in _etree_iter(doc): for el in _etree_iter(doc):
if el.text is not None and isinstance(el.text, bytes): if el.text is not None and isinstance(el.text, bytes):
el.text = el.text.decode('utf-8') el.text = el.text.decode('utf-8')
return doc return doc
if hasattr(etree, 'register_namespace'):
compat_etree_register_namespace = etree.register_namespace # compat_xml_etree_register_namespace
else: try:
compat_etree_register_namespace = _etree.register_namespace
except AttributeError:
def compat_etree_register_namespace(prefix, uri): def compat_etree_register_namespace(prefix, uri):
"""Register a namespace prefix. """Register a namespace prefix.
The registry is global, and any existing mapping for either the The registry is global, and any existing mapping for either the
@ -2704,14 +2755,16 @@ else:
attributes in this namespace will be serialized with prefix if possible. attributes in this namespace will be serialized with prefix if possible.
ValueError is raised if prefix is reserved or is invalid. ValueError is raised if prefix is reserved or is invalid.
""" """
if re.match(r"ns\d+$", prefix): if re.match(r'ns\d+$', prefix):
raise ValueError("Prefix format reserved for internal use") raise ValueError('Prefix format reserved for internal use')
for k, v in list(etree._namespace_map.items()): for k, v in list(_etree._namespace_map.items()):
if k == uri or v == prefix: if k == uri or v == prefix:
del etree._namespace_map[k] del _etree._namespace_map[k]
etree._namespace_map[uri] = prefix _etree._namespace_map[uri] = prefix
compat_xml_etree_register_namespace = compat_etree_register_namespace compat_xml_etree_register_namespace = compat_etree_register_namespace
# compat_xpath, compat_etree_iterfind
if sys.version_info < (2, 7): if sys.version_info < (2, 7):
# Here comes the crazy part: In 2.6, if the xpath is a unicode, # Here comes the crazy part: In 2.6, if the xpath is a unicode,
# .//node does not match if a node is a direct child of . ! # .//node does not match if a node is a direct child of . !
@ -2898,7 +2951,6 @@ if sys.version_info < (2, 7):
def __init__(self, root): def __init__(self, root):
self.root = root self.root = root
##
# Generate all matching objects. # Generate all matching objects.
def compat_etree_iterfind(elem, path, namespaces=None): def compat_etree_iterfind(elem, path, namespaces=None):
@ -2933,13 +2985,15 @@ if sys.version_info < (2, 7):
else: else:
compat_xpath = lambda xpath: xpath
compat_etree_iterfind = lambda element, match: element.iterfind(match) compat_etree_iterfind = lambda element, match: element.iterfind(match)
compat_xpath = _IDENTITY
# compat_os_name
compat_os_name = os._name if os.name == 'java' else os.name compat_os_name = os._name if os.name == 'java' else os.name
# compat_shlex_quote
if compat_os_name == 'nt': if compat_os_name == 'nt':
def compat_shlex_quote(s): def compat_shlex_quote(s):
return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"')
@ -2954,6 +3008,7 @@ else:
return "'" + s.replace("'", "'\"'\"'") + "'" return "'" + s.replace("'", "'\"'\"'") + "'"
# compat_shlex.split
try: try:
args = shlex.split('中文') args = shlex.split('中文')
assert (isinstance(args, list) assert (isinstance(args, list)
@ -2969,6 +3024,7 @@ except (AssertionError, UnicodeEncodeError):
return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix))) return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix)))
# compat_ord
def compat_ord(c): def compat_ord(c):
if isinstance(c, int): if isinstance(c, int):
return c return c
@ -2976,6 +3032,7 @@ def compat_ord(c):
return ord(c) return ord(c)
# compat_getenv, compat_os_path_expanduser, compat_setenv
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
compat_getenv = os.getenv compat_getenv = os.getenv
compat_expanduser = os.path.expanduser compat_expanduser = os.path.expanduser
@ -3063,6 +3120,7 @@ else:
compat_os_path_expanduser = compat_expanduser compat_os_path_expanduser = compat_expanduser
# compat_os_path_realpath
if compat_os_name == 'nt' and sys.version_info < (3, 8): if compat_os_name == 'nt' and sys.version_info < (3, 8):
# os.path.realpath on Windows does not follow symbolic links # os.path.realpath on Windows does not follow symbolic links
# prior to Python 3.8 (see https://bugs.python.org/issue9949) # prior to Python 3.8 (see https://bugs.python.org/issue9949)
@ -3076,6 +3134,7 @@ else:
compat_os_path_realpath = compat_realpath compat_os_path_realpath = compat_realpath
# compat_print
if sys.version_info < (3, 0): if sys.version_info < (3, 0):
def compat_print(s): def compat_print(s):
from .utils import preferredencoding from .utils import preferredencoding
@ -3086,6 +3145,7 @@ else:
print(s) print(s)
# compat_getpass_getpass
if sys.version_info < (3, 0) and sys.platform == 'win32': if sys.version_info < (3, 0) and sys.platform == 'win32':
def compat_getpass(prompt, *args, **kwargs): def compat_getpass(prompt, *args, **kwargs):
if isinstance(prompt, compat_str): if isinstance(prompt, compat_str):
@ -3098,22 +3158,22 @@ else:
compat_getpass_getpass = compat_getpass compat_getpass_getpass = compat_getpass
# compat_input
try: try:
compat_input = raw_input compat_input = raw_input
except NameError: # Python 3 except NameError: # Python 3
compat_input = input compat_input = input
# compat_kwargs
# Python < 2.6.5 require kwargs to be bytes # Python < 2.6.5 require kwargs to be bytes
try: try:
def _testfunc(x): (lambda x: x)(**{'x': 0})
pass
_testfunc(**{'x': 0})
except TypeError: except TypeError:
def compat_kwargs(kwargs): def compat_kwargs(kwargs):
return dict((bytes(k), v) for k, v in kwargs.items()) return dict((bytes(k), v) for k, v in kwargs.items())
else: else:
compat_kwargs = lambda kwargs: kwargs compat_kwargs = _IDENTITY
# compat_numeric_types # compat_numeric_types
@ -3132,6 +3192,8 @@ except NameError: # Python 3
# compat_int # compat_int
compat_int = compat_integer_types[-1] compat_int = compat_integer_types[-1]
# compat_socket_create_connection
if sys.version_info < (2, 7): if sys.version_info < (2, 7):
def compat_socket_create_connection(address, timeout, source_address=None): def compat_socket_create_connection(address, timeout, source_address=None):
host, port = address host, port = address
@ -3158,6 +3220,7 @@ else:
compat_socket_create_connection = socket.create_connection compat_socket_create_connection = socket.create_connection
# compat_contextlib_suppress
try: try:
from contextlib import suppress as compat_contextlib_suppress from contextlib import suppress as compat_contextlib_suppress
except ImportError: except ImportError:
@ -3205,7 +3268,7 @@ except AttributeError:
# Fix https://github.com/ytdl-org/youtube-dl/issues/4223 # Fix https://github.com/ytdl-org/youtube-dl/issues/4223
# See http://bugs.python.org/issue9161 for what is broken # See http://bugs.python.org/issue9161 for what is broken
def workaround_optparse_bug9161(): def _workaround_optparse_bug9161():
op = optparse.OptionParser() op = optparse.OptionParser()
og = optparse.OptionGroup(op, 'foo') og = optparse.OptionGroup(op, 'foo')
try: try:
@ -3224,9 +3287,10 @@ def workaround_optparse_bug9161():
optparse.OptionGroup.add_option = _compat_add_option optparse.OptionGroup.add_option = _compat_add_option
if hasattr(shutil, 'get_terminal_size'): # Python >= 3.3 # compat_shutil_get_terminal_size
compat_get_terminal_size = shutil.get_terminal_size try:
else: from shutil import get_terminal_size as compat_get_terminal_size # Python >= 3.3
except ImportError:
_terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines'])
def compat_get_terminal_size(fallback=(80, 24)): def compat_get_terminal_size(fallback=(80, 24)):
@ -3256,27 +3320,33 @@ else:
columns = _columns columns = _columns
if lines is None or lines <= 0: if lines is None or lines <= 0:
lines = _lines lines = _lines
return _terminal_size(columns, lines) return _terminal_size(columns, lines)
compat_shutil_get_terminal_size = compat_get_terminal_size
# compat_itertools_count
try: try:
itertools.count(start=0, step=1) type(itertools.count(start=0, step=1))
compat_itertools_count = itertools.count compat_itertools_count = itertools.count
except TypeError: # Python 2.6 except TypeError: # Python 2.6 lacks step
def compat_itertools_count(start=0, step=1): def compat_itertools_count(start=0, step=1):
while True: while True:
yield start yield start
start += step start += step
# compat_tokenize_tokenize
if sys.version_info >= (3, 0): if sys.version_info >= (3, 0):
from tokenize import tokenize as compat_tokenize_tokenize from tokenize import tokenize as compat_tokenize_tokenize
else: else:
from tokenize import generate_tokens as compat_tokenize_tokenize from tokenize import generate_tokens as compat_tokenize_tokenize
# compat_struct_pack, compat_struct_unpack, compat_Struct
try: try:
struct.pack('!I', 0) type(struct.pack('!I', 0))
except TypeError: except TypeError:
# In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
# See https://bugs.python.org/issue19099 # See https://bugs.python.org/issue19099
@ -3308,8 +3378,10 @@ else:
compat_Struct = struct.Struct compat_Struct = struct.Struct
# compat_map/filter() returning an iterator, supposedly the # builtins returning an iterator
# same versioning as for zip below
# compat_map, compat_filter
# supposedly the same versioning as for zip below
try: try:
from future_builtins import map as compat_map from future_builtins import map as compat_map
except ImportError: except ImportError:
@ -3326,6 +3398,7 @@ except ImportError:
except ImportError: except ImportError:
compat_filter = filter compat_filter = filter
# compat_zip
try: try:
from future_builtins import zip as compat_zip from future_builtins import zip as compat_zip
except ImportError: # not 2.6+ or is 3.x except ImportError: # not 2.6+ or is 3.x
@ -3335,6 +3408,7 @@ except ImportError: # not 2.6+ or is 3.x
compat_zip = zip compat_zip = zip
# compat_itertools_zip_longest
# method renamed between Py2/3 # method renamed between Py2/3
try: try:
from itertools import zip_longest as compat_itertools_zip_longest from itertools import zip_longest as compat_itertools_zip_longest
@ -3342,7 +3416,8 @@ except ImportError:
from itertools import izip_longest as compat_itertools_zip_longest from itertools import izip_longest as compat_itertools_zip_longest
# new class in collections # compat_collections_chain_map
# collections.ChainMap: new class
try: try:
from collections import ChainMap as compat_collections_chain_map from collections import ChainMap as compat_collections_chain_map
# Py3.3's ChainMap is deficient # Py3.3's ChainMap is deficient
@ -3398,19 +3473,22 @@ except ImportError:
def new_child(self, m=None, **kwargs): def new_child(self, m=None, **kwargs):
m = m or {} m = m or {}
m.update(kwargs) m.update(kwargs)
return compat_collections_chain_map(m, *self.maps) # build the result with type(self) so that subclasses get their own class back
return type(self)(m, *self.maps)
@property @property
def parents(self): def parents(self):
return compat_collections_chain_map(*(self.maps[1:])) return type(self)(*(self.maps[1:]))
# compat_re_Pattern, compat_re_Match
# Pythons disagree on the type of a pattern (RegexObject, _sre.SRE_Pattern, Pattern, ...?) # Pythons disagree on the type of a pattern (RegexObject, _sre.SRE_Pattern, Pattern, ...?)
compat_re_Pattern = type(re.compile('')) compat_re_Pattern = type(re.compile(''))
# and on the type of a match # and on the type of a match
compat_re_Match = type(re.match('a', 'a')) compat_re_Match = type(re.match('a', 'a'))
# compat_base64_b64decode
if sys.version_info < (3, 3): if sys.version_info < (3, 3):
def compat_b64decode(s, *args, **kwargs): def compat_b64decode(s, *args, **kwargs):
if isinstance(s, compat_str): if isinstance(s, compat_str):
@ -3422,6 +3500,7 @@ else:
compat_base64_b64decode = compat_b64decode compat_base64_b64decode = compat_b64decode
# compat_ctypes_WINFUNCTYPE
if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0):
# PyPy2 prior to version 5.4.0 expects byte strings as Windows function # PyPy2 prior to version 5.4.0 expects byte strings as Windows function
# names, see the original PyPy issue [1] and the youtube-dl one [2]. # names, see the original PyPy issue [1] and the youtube-dl one [2].
@ -3440,6 +3519,7 @@ else:
return ctypes.WINFUNCTYPE(*args, **kwargs) return ctypes.WINFUNCTYPE(*args, **kwargs)
# compat_open
if sys.version_info < (3, 0): if sys.version_info < (3, 0):
# open(file, mode='r', buffering=- 1, encoding=None, errors=None, newline=None, closefd=True) not: opener=None # open(file, mode='r', buffering=- 1, encoding=None, errors=None, newline=None, closefd=True) not: opener=None
def compat_open(file_, *args, **kwargs): def compat_open(file_, *args, **kwargs):
@ -3467,18 +3547,28 @@ except AttributeError:
def compat_datetime_timedelta_total_seconds(td): def compat_datetime_timedelta_total_seconds(td):
return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6 return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6
# optional decompression packages # optional decompression packages
# compat_brotli
# PyPi brotli package implements 'br' Content-Encoding # PyPi brotli package implements 'br' Content-Encoding
try: try:
import brotli as compat_brotli import brotli as compat_brotli
except ImportError: except ImportError:
compat_brotli = None compat_brotli = None
# compat_ncompress
# PyPi ncompress package implements 'compress' Content-Encoding # PyPi ncompress package implements 'compress' Content-Encoding
try: try:
import ncompress as compat_ncompress import ncompress as compat_ncompress
except ImportError: except ImportError:
compat_ncompress = None compat_ncompress = None
# compat_zstandard
# PyPi zstandard package implements 'zstd' Content-Encoding (RFC 8878 7.2)
try:
import zstandard as compat_zstandard
except ImportError:
compat_zstandard = None
legacy = [ legacy = [
'compat_HTMLParseError', 'compat_HTMLParseError',
@ -3495,6 +3585,7 @@ legacy = [
'compat_getpass', 'compat_getpass',
'compat_parse_qs', 'compat_parse_qs',
'compat_realpath', 'compat_realpath',
'compat_shlex_split',
'compat_urllib_parse_parse_qs', 'compat_urllib_parse_parse_qs',
'compat_urllib_parse_unquote', 'compat_urllib_parse_unquote',
'compat_urllib_parse_unquote_plus', 'compat_urllib_parse_unquote_plus',
@ -3508,8 +3599,6 @@ legacy = [
__all__ = [ __all__ = [
'compat_html_parser_HTMLParseError',
'compat_html_parser_HTMLParser',
'compat_Struct', 'compat_Struct',
'compat_base64_b64decode', 'compat_base64_b64decode',
'compat_basestring', 'compat_basestring',
@ -3518,13 +3607,9 @@ __all__ = [
'compat_chr', 'compat_chr',
'compat_collections_abc', 'compat_collections_abc',
'compat_collections_chain_map', 'compat_collections_chain_map',
'compat_datetime_timedelta_total_seconds',
'compat_http_cookiejar',
'compat_http_cookiejar_Cookie',
'compat_http_cookies',
'compat_http_cookies_SimpleCookie',
'compat_contextlib_suppress', 'compat_contextlib_suppress',
'compat_ctypes_WINFUNCTYPE', 'compat_ctypes_WINFUNCTYPE',
'compat_datetime_timedelta_total_seconds',
'compat_etree_fromstring', 'compat_etree_fromstring',
'compat_etree_iterfind', 'compat_etree_iterfind',
'compat_filter', 'compat_filter',
@ -3533,6 +3618,12 @@ __all__ = [
'compat_getpass_getpass', 'compat_getpass_getpass',
'compat_html_entities', 'compat_html_entities',
'compat_html_entities_html5', 'compat_html_entities_html5',
'compat_html_parser_HTMLParseError',
'compat_html_parser_HTMLParser',
'compat_http_cookiejar',
'compat_http_cookiejar_Cookie',
'compat_http_cookies',
'compat_http_cookies_SimpleCookie',
'compat_http_client', 'compat_http_client',
'compat_http_server', 'compat_http_server',
'compat_input', 'compat_input',
@ -3555,7 +3646,7 @@ __all__ = [
'compat_register_utf8', 'compat_register_utf8',
'compat_setenv', 'compat_setenv',
'compat_shlex_quote', 'compat_shlex_quote',
'compat_shlex_split', 'compat_shutil_get_terminal_size',
'compat_socket_create_connection', 'compat_socket_create_connection',
'compat_str', 'compat_str',
'compat_struct_pack', 'compat_struct_pack',
@ -3575,5 +3666,5 @@ __all__ = [
'compat_xml_etree_register_namespace', 'compat_xml_etree_register_namespace',
'compat_xpath', 'compat_xpath',
'compat_zip', 'compat_zip',
'workaround_optparse_bug9161', 'compat_zstandard',
] ]

View File

@ -3128,7 +3128,8 @@ class InfoExtractor(object):
continue continue
urls.add(source_url) urls.add(source_url)
source_type = source.get('type') or '' source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url) # https://github.com/yt-dlp/yt-dlp/pull/10956
ext = determine_ext(source_url, default_ext=mimetype2ext(source_type))
if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native', source_url, video_id, 'mp4', entry_protocol='m3u8_native',

View File

@ -31,10 +31,14 @@ from ..utils import (
parse_resolution, parse_resolution,
sanitized_Request, sanitized_Request,
smuggle_url, smuggle_url,
strip_or_none,
T,
traverse_obj,
unescapeHTML, unescapeHTML,
unified_timestamp, unified_timestamp,
unsmuggle_url, unsmuggle_url,
UnsupportedError, UnsupportedError,
update_url_query,
url_or_none, url_or_none,
urljoin, urljoin,
xpath_attr, xpath_attr,
@ -2237,6 +2241,7 @@ class GenericIE(InfoExtractor):
'display_id': 'kelis-4th-of-july', 'display_id': 'kelis-4th-of-july',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Kelis - 4th Of July', 'title': 'Kelis - 4th Of July',
'description': 'Kelis - 4th Of July',
'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
}, },
}, { }, {
@ -2246,7 +2251,7 @@ class GenericIE(InfoExtractor):
'id': '105', 'id': '105',
'display_id': 'kelis-4th-of-july', 'display_id': 'kelis-4th-of-july',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player', 'title': r're:Kelis - 4th Of July(?: / Embed Player)?$',
'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
}, },
'params': { 'params': {
@ -2297,6 +2302,32 @@ class GenericIE(InfoExtractor):
'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg', 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
}, },
'skip': 'needs Referer ?',
}, {
# KVS Player v10
'url': 'https://www.cambro.tv/588174/marleny-1/',
'md5': '759d2050590986c6fc341da0592c4d8e',
'info_dict': {
'id': '588174',
'display_id': 'marleny-1',
'ext': 'mp4',
'title': 'marleny 1',
'description': 'la maestra de tic toc',
'thumbnail': r're:https?://www\.cambro\.tv/contents/videos_screenshots/588000/588174/preview\.jpg',
'age_limit': 18,
},
}, {
# KVS Player v10 embed, NSFW
'url': 'https://www.cambro.tv/embed/436185',
'md5': '24338dc8b182900a2c9eda075a0a46c0',
'info_dict': {
'id': '436185',
'display_id': 'jaeandbailey-chaturbate-webcam-porn-videos',
'ext': 'mp4',
'title': 'jaeandbailey Chaturbate webcam porn videos',
'thumbnail': r're:https?://www\.cambro\.tv/contents/videos_screenshots/436000/436185/preview\.jpg',
'age_limit': 18,
},
}, { }, {
'url': 'https://mrdeepfakes.com/video/5/selena-gomez-pov-deep-fakes', 'url': 'https://mrdeepfakes.com/video/5/selena-gomez-pov-deep-fakes',
'md5': 'fec4ad5ec150f655e0c74c696a4a2ff4', 'md5': 'fec4ad5ec150f655e0c74c696a4a2ff4',
@ -2309,14 +2340,16 @@ class GenericIE(InfoExtractor):
'height': 720, 'height': 720,
'age_limit': 18, 'age_limit': 18,
}, },
# 'skip': 'Geo-blocked in some jurisdictions',
}, { }, {
# KVS Player v2
'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/', 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
'md5': 'e2f0a4c329f7986280b7328e24036d60', 'md5': 'e2f0a4c329f7986280b7328e24036d60',
'info_dict': { 'info_dict': {
'id': '284002', 'id': '284002',
'display_id': 'just-out-of-the-shower-joi', 'display_id': 'just-out-of-the-shower-joi',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Just Out Of The Shower JOI - Shooshtime', 'title': r're:Just Out Of The Shower JOI(?: - Shooshtime)?$',
'height': 720, 'height': 720,
'age_limit': 18, 'age_limit': 18,
}, },
@ -2482,9 +2515,12 @@ class GenericIE(InfoExtractor):
return '/'.join(urlparts) + '?' + url_query return '/'.join(urlparts) + '?' + url_query
flashvars = self._search_regex( flashvars = self._search_regex(
r'(?s)<script\b[^>]*>.*?var\s+flashvars\s*=\s*(\{.+?\});.*?</script>', r'''(?<![=!+*-])=\s*kt_player\s*\(\s*'kt_player'\s*,\s*[^)]+,\s*([\w$]+)\s*\)''',
webpage, 'flashvars') webpage, 'flashvars name', default='flashvars')
flashvars = self._parse_json(flashvars, video_id, transform_source=js_to_json) flashvars = self._search_json(
r'<script(?:\s[^>]*)?>[\s\S]*?var\s+%s\s*=' % (flashvars,),
webpage, 'flashvars', video_id, end_pattern=r';[\s\S]*?</script>',
transform_source=js_to_json)
# extract the part after the last / as the display_id from the # extract the part after the last / as the display_id from the
# canonical URL. # canonical URL.
@ -2493,12 +2529,7 @@ class GenericIE(InfoExtractor):
r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)', r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
webpage, 'display_id', fatal=False webpage, 'display_id', fatal=False
) )
title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title') title = flashvars.get('video_title') or self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
thumbnail = flashvars['preview_url']
if thumbnail.startswith('//'):
protocol, _, _ = url.partition('/')
thumbnail = protocol + thumbnail
url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys())) url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
formats = [] formats = []
@ -2506,9 +2537,13 @@ class GenericIE(InfoExtractor):
if '/get_file/' not in flashvars[key]: if '/get_file/' not in flashvars[key]:
continue continue
format_id = flashvars.get(key + '_text', key) format_id = flashvars.get(key + '_text', key)
f_url = urljoin(url, getrealurl(flashvars[key], flashvars['license_code']))
rnd = flashvars.get('rnd', key)
if rnd:
f_url = update_url_query(f_url, {'rnd': rnd})
formats.append(merge_dicts( formats.append(merge_dicts(
parse_resolution(format_id) or parse_resolution(flashvars[key]), { parse_resolution(format_id) or parse_resolution(flashvars[key]), {
'url': urljoin(url, getrealurl(flashvars[key], flashvars['license_code'])), 'url': f_url,
'format_id': format_id, 'format_id': format_id,
'ext': 'mp4', 'ext': 'mp4',
'http_headers': {'Referer': url}, 'http_headers': {'Referer': url},
@ -2518,13 +2553,31 @@ class GenericIE(InfoExtractor):
self._sort_formats(formats) self._sort_formats(formats)
return { csv2list = (T(lambda s: s.split(',')), Ellipsis, T(strip_or_none))
info = traverse_obj(flashvars, {
'tags': ('video_tags',) + csv2list,
'categories': ('video_categories',) + csv2list,
'thumbnails': (
T(dict.items), lambda _, k_v: k_v[0].startswith('preview_url'), {
'url': (1, T(lambda u: urljoin(url, u))),
'preference': (0, T(lambda k: 100 - len(k))),
}),
})
info = merge_dicts(info, {
'id': flashvars['video_id'], 'id': flashvars['video_id'],
'display_id': display_id, 'display_id': display_id,
'title': title, 'title': title,
'thumbnail': thumbnail,
'formats': formats, 'formats': formats,
} })
# check-porn test for embed pages
if 'age_limit' not in info and traverse_obj(info, (
('title', (('tags', 'categories'), Ellipsis) or []),
T(lambda t: bool(re.search(r'(?i)(?:^|\s+)porn(?:$|\s+)', t)) or None)),
get_all=False):
info['age_limit'] = 18
return info
def _real_extract(self, url): def _real_extract(self, url):
if url.startswith('//'): if url.startswith('//'):
@ -3598,7 +3651,7 @@ class GenericIE(InfoExtractor):
), webpage, 'KVS player', group='ver', default=False) ), webpage, 'KVS player', group='ver', default=False)
if found: if found:
self.report_extraction('%s: KVS Player' % (video_id, )) self.report_extraction('%s: KVS Player' % (video_id, ))
if found.split('.')[0] not in ('4', '5', '6'): if found.split('.')[0] not in ('2', '4', '5', '6', '10'):
self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found, )) self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found, ))
return merge_dicts( return merge_dicts(
self._extract_kvs(url, webpage, video_id), self._extract_kvs(url, webpage, video_id),

View File

@ -1,6 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import (
merge_dicts,
traverse_obj,
)
class HentaiStigmaIE(InfoExtractor): class HentaiStigmaIE(InfoExtractor):
@ -24,16 +29,17 @@ class HentaiStigmaIE(InfoExtractor):
title = self._html_search_regex( title = self._html_search_regex(
r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>', r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>',
webpage, 'title') webpage, 'title')
wrap_url = self._html_search_regex(
wrap_url = self._search_regex(
r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url') r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url')
wrap_webpage = self._download_webpage(wrap_url, video_id)
video_url = self._html_search_regex( vid_page = self._download_webpage(wrap_url, video_id)
r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url')
return { entries = self._parse_html5_media_entries(wrap_url, vid_page, video_id)
self._sort_formats(traverse_obj(entries, (0, 'formats')) or [])
return merge_dicts({
'id': video_id, 'id': video_id,
'url': video_url,
'title': title, 'title': title,
'age_limit': 18, 'age_limit': 18,
} }, entries[0])

View File

@ -23,7 +23,7 @@ class KalturaIE(InfoExtractor):
(?: (?:
kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)| kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|
https?:// https?://
(:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/ (?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/
(?: (?:
(?: (?:
# flash player # flash player

View File

@ -13,7 +13,7 @@ from ..utils import (
class MgoonIE(InfoExtractor): class MgoonIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:www\.)? _VALID_URL = r'''(?x)https?://(?:www\.)?
(?:(:?m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)| (?:(?:m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)|
video\.mgoon\.com)/(?P<id>[0-9]+)''' video\.mgoon\.com)/(?P<id>[0-9]+)'''
_API_URL = 'http://mpos.mgoon.com/player/video?id={0:}' _API_URL = 'http://mpos.mgoon.com/player/video?id={0:}'
_TESTS = [ _TESTS = [

View File

@ -112,7 +112,7 @@ class ORFRadioIE(ORFRadioBase):
_VALID_URL = ( _VALID_URL = (
r'https?://sound\.orf\.at/radio/(?P<station>{0})/sendung/(?P<id>\d+)(?:/(?P<show>\w+))?'.format(_STATION_RE), r'https?://sound\.orf\.at/radio/(?P<station>{0})/sendung/(?P<id>\d+)(?:/(?P<show>\w+))?'.format(_STATION_RE),
r'https?://(?P<station>{0})\.orf\.at/player/(?P<date>\d{{8}})/(?P<id>\d+)'.format(_STATION_RE), r'https?://(?P<station>{0})\.orf\.at/(?:player|programm)/(?P<date>\d{{8}})/(?P<id>\d+)'.format(_STATION_RE),
) )
_TESTS = [{ _TESTS = [{
@ -150,6 +150,10 @@ class ORFRadioIE(ORFRadioBase):
'duration': 1500, 'duration': 1500,
}, },
'skip': 'Shows from ORF Sound are only available for 30 days.' 'skip': 'Shows from ORF Sound are only available for 30 days.'
}, {
# yt-dlp/yt-dlp#11014
'url': 'https://oe1.orf.at/programm/20240916/769302/Playgrounds',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -12,7 +12,9 @@ from ..utils import (
clean_html, clean_html,
decode_packed_codes, decode_packed_codes,
determine_ext, determine_ext,
extract_attributes,
ExtractorError, ExtractorError,
get_element_by_class,
get_element_by_id, get_element_by_id,
int_or_none, int_or_none,
merge_dicts, merge_dicts,
@ -56,39 +58,40 @@ def aa_decode(aa_code):
class XFileShareIE(InfoExtractor): class XFileShareIE(InfoExtractor):
_SITES = ( _SITES = (
# status check 2024-02: site availability, G site: search # status check 2024-10: site availability, G site: search
(r'aparat\.cam', 'Aparat'), # Cloudflare says host error 522, apparently changed to wolfstreeam.tv (r'aparat\.cam', 'Aparat'), # Cloudflare says host error 522, apparently changed to wolfstream.tv
(r'filemoon\.sx/.', 'FileMoon'), (r'filemoon\.(?:sx|to|in)', 'FileMoon'),
(r'gounlimited\.to', 'GoUnlimited'), # no media pages listed # (r'gounlimited\.to', 'GoUnlimited'), # domain not found
(r'govid\.me', 'GoVid'), # no media pages listed (r'govid\.me', 'GoVid'), # no media pages listed
(r'highstream\.tv', 'HighStream'), # clipwatching.com redirects here (r'highstream\.tv', 'HighStream'), # Cloudflare says host error 522, clipwatching.com now dead
(r'holavid\.com', 'HolaVid'), # Cloudflare says host error 522 (r'holavid\.com', 'HolaVid'), # hoster default home page
# (r'streamty\.com', 'Streamty'), # no media pages listed, connection timeout # (r'streamty\.com', 'Streamty'), # spam parking domain
# (r'thevideobee\.to', 'TheVideoBee'), # no pages listed, refuses connection # (r'thevideobee\.to', 'TheVideoBee'), # domain for sale
(r'uqload\.to', 'Uqload'), # .com, .co redirect here (r'uqload\.ws', 'Uqload'), # .com, .co, .to redirect here
(r'(?:vedbam\.xyz|vadbam.net)', 'V?dB?m'), # vidbom.com redirects here, but no valid media pages listed # (r'vadbam\.net', 'VadBam'), # domain not found
(r'(?:vedbam\.xyz|vadbam\.net|vbn\d\.vdbtm\.shop)', 'V?dB?m'), # vidbom.com redirects here, but no valid media pages listed
(r'vidlo\.us', 'vidlo'), # no valid media pages listed (r'vidlo\.us', 'vidlo'), # no valid media pages listed
(r'vidlocker\.xyz', 'VidLocker'), # no media pages listed (r'vidlocker\.xyz', 'VidLocker'), # no media pages listed
(r'(?:w\d\.)?viidshar\.com', 'VidShare'), # vidshare.tv redirects here (r'(?:w\d\.)?viidshar\.com', 'VidShare'), # vidshare.tv parked
# (r'vup\.to', 'VUp'), # domain not found # (r'vup\.to', 'VUp'), # domain not found
(r'wolfstream\.tv', 'WolfStream'), # (r'wolfstream\.tv', 'WolfStream'), # domain not found
(r'xvideosharing\.com', 'XVideoSharing'), # just started showing 'maintenance mode' (r'xvideosharing\.com', 'XVideoSharing'),
) )
IE_DESC = 'XFileShare-based sites: %s' % ', '.join(list(zip(*_SITES))[1]) IE_DESC = 'XFileShare-based sites: %s' % ', '.join(list(zip(*_SITES))[1])
_VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?P<sub>[a-z]/|)(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
% '|'.join(site for site in list(zip(*_SITES))[0])) % '|'.join(site for site in list(zip(*_SITES))[0]))
_EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(site for site in list(zip(*_SITES))[0])] _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(site for site in list(zip(*_SITES))[0])]
_FILE_NOT_FOUND_REGEXES = ( _FILE_NOT_FOUND_REGEXES = (
r'>(?:404 - )?File Not Found<', r'>\s*(?:404 - )?File Not Found\s*<',
r'>The file was removed by administrator<', r'>\s*The file was removed by administrator\s*<',
) )
_TITLE_REGEXES = ( _TITLE_REGEXES = (
r'style="z-index: [0-9]+;">([^<]+)</span>', r'style="z-index: [0-9]+;">([^<]+)</span>',
r'<td nowrap>([^<]+)</td>', r'<td nowrap>([^<]+)</td>',
r'h4-fine[^>]*>([^<]+)<', r'h4-fine[^>]*>([^<]+)<',
r'>Watch (.+)[ <]', r'>Watch (.+?)(?: mp4)?(?: The Ultimate Free Video Hosting Solution for Webmasters and Bloggers)?<',
r'<h2 class="video-page-head">([^<]+)</h2>', r'<h2 class="video-page-head">([^<]+)</h2>',
r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to (dead) r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to (dead)
r'title\s*:\s*"([^"]+)"', # govid.me r'title\s*:\s*"([^"]+)"', # govid.me
@ -106,38 +109,41 @@ class XFileShareIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'note': 'link in `sources`', 'note': 'link in `sources`',
'url': 'https://uqload.to/dcsu06gdb45o', 'url': 'https://uqload.ws/4sah252totrk.html',
'md5': '7f8db187b254379440bf4fcad094ae86', 'md5': '1f11151b5044862fbc3c112732f9f7d8',
'info_dict': { 'info_dict': {
'id': 'dcsu06gdb45o', 'id': '4sah252totrk',
'ext': 'mp4', 'ext': 'mp4',
'title': 'f2e31015957e74c8c8427982e161c3fc mp4', 'title': 'JEONGHAN WONWOO Interview With Allure Korea Arabic Sub',
'thumbnail': r're:https://.*\.jpg' 'thumbnail': r're:https://.*\.jpg'
}, },
'params': { 'params': {
'nocheckcertificate': True, 'nocheckcertificate': True,
}, },
'expected_warnings': ['Unable to extract JWPlayer data'], # 'expected_warnings': ['Unable to extract JWPlayer data'],
}, { }, {
'note': 'link in decoded `sources`', 'note': 'link in Playerjs', # need test with 'link in decoded `sources`'
'url': 'https://xvideosharing.com/1tlg6agrrdgc', 'url': 'https://xvideosharing.com/8cnupzc1z8xq.html',
'md5': '2608ce41932c1657ae56258a64e647d9', 'md5': '9725ca7229e8f3046f2417da3bd5eddc',
'info_dict': { 'info_dict': {
'id': '1tlg6agrrdgc', 'id': '8cnupzc1z8xq',
'ext': 'mp4', 'ext': 'mp4',
'title': '0121', 'title': 'HEVC X265 Big Buck Bunny 1080 10s 20MB',
'thumbnail': r're:https?://.*\.jpg', 'thumbnail': r're:https?://.*\.jpg',
}, },
'skip': 'This server is in maintenance mode.',
}, { }, {
'note': 'JWPlayer link in un-p,a,c,k,e,d JS', 'note': 'JWPlayer link in un-p,a,c,k,e,d JS, in player frame',
'url': 'https://filemoon.sx/e/dw40rxrzruqz', 'url': 'https://filemoon.sx/d/fbsxidybremo',
'md5': '5a713742f57ac4aef29b74733e8dda01', 'md5': '82007a71661630f60e866f0d6ed31b2a',
'info_dict': { 'info_dict': {
'id': 'dw40rxrzruqz', 'id': 'fbsxidybremo',
'title': 'dw40rxrzruqz', 'title': 'Uchouten',
'ext': 'mp4' 'ext': 'mp4'
}, },
'params': {
'skip_download': 'ffmpeg',
},
'expected_warnings': ['hlsnative has detected features it does not support'],
}, { }, {
'note': 'JWPlayer link in un-p,a,c,k,e,d JS', 'note': 'JWPlayer link in un-p,a,c,k,e,d JS',
'url': 'https://vadbam.net/6lnbkci96wly.html', 'url': 'https://vadbam.net/6lnbkci96wly.html',
@ -150,7 +156,7 @@ class XFileShareIE(InfoExtractor):
}, { }, {
'note': 'JWPlayer link in clear', 'note': 'JWPlayer link in clear',
'url': 'https://w1.viidshar.com/nnibe0xf0h79.html', 'url': 'https://w1.viidshar.com/nnibe0xf0h79.html',
'md5': 'f0a580ce9df06cc61b4a5c979d672367', 'md5': 'b95b97978093bc287c322307c689bd94',
'info_dict': { 'info_dict': {
'id': 'nnibe0xf0h79', 'id': 'nnibe0xf0h79',
'title': 'JaGa 68ar', 'title': 'JaGa 68ar',
@ -160,27 +166,21 @@ class XFileShareIE(InfoExtractor):
'skip_download': 'ffmpeg', 'skip_download': 'ffmpeg',
}, },
'expected_warnings': ['hlsnative has detected features it does not support'], 'expected_warnings': ['hlsnative has detected features it does not support'],
}, {
'note': 'JWPlayer link in clear',
'url': 'https://wolfstream.tv/a3drtehyrg52.html',
'md5': '1901d86a79c5e0c6a51bdc9a4cfd3769',
'info_dict': {
'id': 'a3drtehyrg52',
'title': 'NFL 2023 W04 DET@GB',
'ext': 'mp4'
},
}, { }, {
'url': 'https://aparat.cam/n4d6dh0wvlpr', 'url': 'https://aparat.cam/n4d6dh0wvlpr',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://uqload.to/ug5somm0ctnk.html',
'only_matching': True,
}, { }, {
'url': 'https://highstream.tv/2owiyz3sjoux', 'url': 'https://highstream.tv/2owiyz3sjoux',
'only_matching': True, 'only_matching': True,
}, { }, {
'url': 'https://vedbam.xyz/6lnbkci96wly.html', 'url': 'https://vedbam.xyz/6lnbkci96wly.html',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://vbn2.vdbtm.shop/6lnbkci96wly.html',
'only_matching': True,
}, {
'url': 'https://filemoon.in/e/5abn1ze9jifb',
'only_matching': True,
}] }]
@classmethod @classmethod
@ -194,17 +194,26 @@ class XFileShareIE(InfoExtractor):
return list(yield_urls()) return list(yield_urls())
def _real_extract(self, url): def _real_extract(self, url):
host, video_id = self._match_valid_url(url).group('host', 'id') host, sub, video_id = self._match_valid_url(url).group('host', 'sub', 'id')
url = 'https://%s/%s' % ( url = 'https://%s/%s%s' % (
host, host, sub,
'embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) 'embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
container_div = get_element_by_id('container', webpage) or webpage main = self._search_regex(
r'(?s)<main>(.+)</main>', webpage, 'main', default=webpage)
container_div = (
get_element_by_id('container', main)
or get_element_by_class('container', main)
or webpage)
if self._search_regex( if self._search_regex(
r'>This server is in maintenance mode\.', container_div, r'>This server is in maintenance mode\.', container_div,
'maint error', group=0, default=None): 'maint error', group=0, default=None):
raise ExtractorError(clean_html(container_div), expected=True) raise ExtractorError(clean_html(container_div), expected=True)
if self._search_regex(
'not available in your country', container_div,
'geo block', group=0, default=None):
self.raise_geo_restricted()
if self._search_regex( if self._search_regex(
self._FILE_NOT_FOUND_REGEXES, container_div, self._FILE_NOT_FOUND_REGEXES, container_div,
'missing video error', group=0, default=None): 'missing video error', group=0, default=None):
@ -228,38 +237,41 @@ class XFileShareIE(InfoExtractor):
title = ( title = (
self._search_regex(self._TITLE_REGEXES, webpage, 'title', default=None) self._search_regex(self._TITLE_REGEXES, webpage, 'title', default=None)
or self._og_search_title(webpage, default=None) or self._og_search_title(webpage, default='')).strip()
or video_id).strip()
obf_code = True def deobfuscate(html):
while obf_code:
for regex, func in ( for regex, func in (
(r'(?s)(?<!-)\b(eval\(function\(p,a,c,k,e,d\)\{(?:(?!</script>).)+\)\))', (r'(?s)(?<!-)\b(eval\(function\(p,a,c,k,e,d\)\{(?:(?!</script>).)+\)\))',
decode_packed_codes), decode_packed_codes),
(r'(゚.+)', aa_decode)): (r'(゚.+)', aa_decode)):
obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) obf_code = self._search_regex(regex, html, 'obfuscated code', default=None)
if obf_code: if obf_code:
webpage = webpage.replace(obf_code, func(obf_code)) return html.replace(obf_code, func(obf_code))
break
def jw_extract(html):
jwplayer_data = self._find_jwplayer_data( jwplayer_data = self._find_jwplayer_data(
webpage.replace(r'\'', '\''), video_id) html.replace(r'\'', '\''), video_id)
result = self._parse_jwplayer_data( result = self._parse_jwplayer_data(
jwplayer_data, video_id, require_title=False, jwplayer_data, video_id, require_title=False,
m3u8_id='hls', mpd_id='dash') m3u8_id='hls', mpd_id='dash')
result = traverse_obj(result, (
if not traverse_obj(result, 'formats'): (None, ('entries', 0)), T(lambda r: r if r['formats'] else None)),
if jwplayer_data: get_all=False) or {}
if not result and jwplayer_data:
self.report_warning( self.report_warning(
'Failed to extract JWPlayer formats', video_id=video_id) 'Failed to extract JWPlayer formats', video_id=video_id)
return result
def extract_from_links(html):
urls = set() urls = set()
for regex in self._SOURCE_URL_REGEXES: for regex in self._SOURCE_URL_REGEXES:
for mobj in re.finditer(regex, webpage): for mobj in re.finditer(regex, html):
urls.add(mobj.group('url')) urls.add(mobj.group('url'))
sources = self._search_regex( sources = self._search_json(
r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None) r'\bsources\s*:', webpage, 'sources', video_id,
urls.update(traverse_obj(sources, (T(lambda s: self._parse_json(s, video_id)), Ellipsis))) contains_pattern=r'\[(?!{)[^\]]+\]', default=[])
urls.update(sources)
formats = [] formats = []
for video_url in traverse_obj(urls, (Ellipsis, T(url_or_none))): for video_url in traverse_obj(urls, (Ellipsis, T(url_or_none))):
@ -273,7 +285,33 @@ class XFileShareIE(InfoExtractor):
'url': video_url, 'url': video_url,
'format_id': 'sd', 'format_id': 'sd',
}) })
result = {'formats': formats} return {'formats': formats}
def extract_info(html):
html = deobfuscate(html) or html
result = jw_extract(html)
if not result.get('formats'):
result = extract_from_links(html)
return result
def pages_to_extract(html):
yield html
# page with separate protected download page also has player link
player_iframe = self._search_regex(
r'(<iframe\s[^>]+>)',
get_element_by_id('iframe-holder', html) or '',
'player iframe', default='')
player_url = extract_attributes(player_iframe).get('src')
if player_url:
html = self._download_webpage(player_url, video_id, note='Downloading player page', fatal=False)
if html:
yield html
result = {}
for html in pages_to_extract(webpage):
result = extract_info(html)
if result.get('formats'):
break
self._sort_formats(result['formats']) self._sort_formats(result['formats'])

View File

@ -30,17 +30,20 @@ class YandexMusicBaseIE(InfoExtractor):
@staticmethod @staticmethod
def _raise_captcha(): def _raise_captcha():
raise ExtractorError( raise ExtractorError(
'YandexMusic has considered youtube-dl requests automated and ' 'YandexMusic has considered youtube-dl requests automated '
'asks you to solve a CAPTCHA. You can either wait for some ' 'and asks you to solve a CAPTCHA. You can wait for some time '
'time until unblocked and optionally use --sleep-interval ' 'until unblocked and optionally use --sleep-interval in future; '
'in future or alternatively you can go to https://music.yandex.ru/ ' 'otherwise solve the CAPTCHA at https://music.yandex.ru/, '
'solve CAPTCHA, then export cookies and pass cookie file to ' 'then export cookies and pass the cookie file to youtube-dl '
'youtube-dl with --cookies', 'with --cookies.',
expected=True) expected=True)
def _download_webpage_handle(self, *args, **kwargs): def _download_webpage_handle(self, *args, **kwargs):
webpage = super(YandexMusicBaseIE, self)._download_webpage_handle(*args, **kwargs) webpage = super(YandexMusicBaseIE, self)._download_webpage_handle(*args, **kwargs)
if 'Нам очень жаль, но&nbsp;запросы, поступившие с&nbsp;вашего IP-адреса, похожи на&nbsp;автоматические.' in webpage: blocked_ip_msg = (
'Нам очень жаль, но&nbsp;запросы, поступившие с&nbsp;'
'вашего IP-адреса, похожи на&nbsp;автоматические.')
if blocked_ip_msg in (webpage or [''])[0]:
self._raise_captcha() self._raise_captcha()
return webpage return webpage

View File

@ -122,7 +122,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'INNERTUBE_CONTEXT': { 'INNERTUBE_CONTEXT': {
'client': { 'client': {
'clientName': 'TVHTML5', 'clientName': 'TVHTML5',
'clientVersion': '7.20241201.18.00', 'clientVersion': '7.20250120.19.00',
'userAgent': 'Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version',
}, },
}, },
'INNERTUBE_CONTEXT_CLIENT_NAME': 7, 'INNERTUBE_CONTEXT_CLIENT_NAME': 7,
@ -1851,11 +1852,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if func_code: if func_code:
return jsi, player_id, func_code return jsi, player_id, func_code
return self._extract_n_function_code_jsi(video_id, jsi, player_id)
func_name = self._extract_n_function_name(jscode) def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None):
var_ay = self._search_regex(
r'(?:[;\s]|^)\s*(var\s*[\w$]+\s*=\s*"[^"]+"\s*\.\s*split\("\{"\))(?=\s*[,;])',
jsi.code, 'useful values', default='')
func_name = self._extract_n_function_name(jsi.code)
func_code = jsi.extract_function_code(func_name) func_code = jsi.extract_function_code(func_name)
if var_ay:
func_code = (func_code[0], ';\n'.join((var_ay, func_code[1])))
if player_id:
self.cache.store('youtube-nsig', player_id, func_code) self.cache.store('youtube-nsig', player_id, func_code)
return jsi, player_id, func_code return jsi, player_id, func_code

View File

@ -1,10 +1,12 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import calendar
import itertools import itertools
import json import json
import operator import operator
import re import re
import time
from functools import update_wrapper, wraps from functools import update_wrapper, wraps
@ -12,8 +14,10 @@ from .utils import (
error_to_compat_str, error_to_compat_str,
ExtractorError, ExtractorError,
float_or_none, float_or_none,
int_or_none,
js_to_json, js_to_json,
remove_quotes, remove_quotes,
str_or_none,
unified_timestamp, unified_timestamp,
variadic, variadic,
write_string, write_string,
@ -150,6 +154,7 @@ def _js_to_primitive(v):
) )
# more exact: yt-dlp/yt-dlp#12110
def _js_toString(v): def _js_toString(v):
return ( return (
'undefined' if v is JS_Undefined 'undefined' if v is JS_Undefined
@ -158,7 +163,7 @@ def _js_toString(v):
else 'null' if v is None else 'null' if v is None
# bool <= int: do this first # bool <= int: do this first
else ('false', 'true')[v] if isinstance(v, bool) else ('false', 'true')[v] if isinstance(v, bool)
else '{0:.7f}'.format(v).rstrip('.0') if isinstance(v, compat_numeric_types) else re.sub(r'(?<=\d)\.?0*$', '', '{0:.7f}'.format(v)) if isinstance(v, compat_numeric_types)
else _js_to_primitive(v)) else _js_to_primitive(v))
@ -404,6 +409,7 @@ class JSInterpreter(object):
class Exception(ExtractorError): class Exception(ExtractorError):
def __init__(self, msg, *args, **kwargs): def __init__(self, msg, *args, **kwargs):
expr = kwargs.pop('expr', None) expr = kwargs.pop('expr', None)
msg = str_or_none(msg, default='"None"')
if expr is not None: if expr is not None:
msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr) msg = '{0} in: {1!r:.100}'.format(msg.rstrip(), expr)
super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs)
@ -431,6 +437,7 @@ class JSInterpreter(object):
flags, _ = self.regex_flags(flags) flags, _ = self.regex_flags(flags)
# First, avoid https://github.com/python/cpython/issues/74534 # First, avoid https://github.com/python/cpython/issues/74534
self.__self = None self.__self = None
pattern_txt = str_or_none(pattern_txt) or '(?:)'
self.__pattern_txt = pattern_txt.replace('[[', r'[\[') self.__pattern_txt = pattern_txt.replace('[[', r'[\[')
self.__flags = flags self.__flags = flags
@ -475,6 +482,73 @@ class JSInterpreter(object):
flags |= cls.RE_FLAGS[ch] flags |= cls.RE_FLAGS[ch]
return flags, expr[idx + 1:] return flags, expr[idx + 1:]
class JS_Date(object):
    """Partial emulation of the JS Date object

    The time value is held in `_t` as milliseconds since the Unix epoch,
    or as None for an invalid date.
    """
    _t = None

    @staticmethod
    def __ymd_etc(*args, **kw_is_utc):
        """Convert JS date components to a ms timestamp, or None if invalid

        args: year, monthIndex, day, hours, minutes, seconds, milliseconds
        is_utc (keyword only): treat the components as UTC, not local time
        """
        is_utc = kw_is_utc.get('is_utc', False)
        # JS defaults for missing components: monthIndex 0, day 1 (the day
        # of the month is 1-based), everything else 0
        defaults = [0, 0, 1, 0, 0, 0, 0]
        fields = list(args[:7])
        fields += defaults[len(fields):]
        ms = fields.pop()
        # TODO: [MDN] When a segment overflows or underflows its expected
        # range, it usually "carries over to" or "borrows from" the higher segment.
        try:
            fields[1] += 1  # month 0..11 -> 1..12
            # tm_wday, tm_yday: don't know; tm_isdst: don't know for local time
            fields += [-1, -1, 0 if is_utc else -1]
            mktime = calendar.timegm if is_utc else time.mktime
            return mktime(time.struct_time(fields)) * 1000 + ms
        except (OverflowError, TypeError, ValueError):
            # out-of-range or non-numeric component: invalid date
            return None

    @classmethod
    def UTC(cls, *args):
        """JS Date.UTC(): ms timestamp for UTC date components, or NaN"""
        t = cls.__ymd_etc(*args, is_utc=True)
        return _NaN if t is None else t

    @staticmethod
    def parse(date_str, **kw_is_raw):
        """JS Date.parse(): ms timestamp for a date string

        When unified_timestamp() gives None, return None if the keyword
        is_raw is true, or else NaN.
        """
        is_raw = kw_is_raw.get('is_raw', False)
        t = unified_timestamp(str_or_none(date_str), False)
        if t is not None:
            return int(t * 1000)
        return None if is_raw else _NaN

    @staticmethod
    def now(**kw_is_raw):
        """JS Date.now(): current time as an integer ms timestamp

        The keyword is_raw is accepted for symmetry with parse(); it has
        no effect since the current time is always available.
        """
        return int(time.time() * 1000)

    def __init__(self, *args):
        """Emulate `new Date(...)`

        No args: the current time. One arg: another JS_Date, a date string,
        or a ms timestamp. Several args: local date components as for
        __ymd_etc(). Anything else leaves the date invalid.
        """
        if not args:
            args = [self.now(is_raw=True)]
        if len(args) == 1:
            if isinstance(args[0], JSInterpreter.JS_Date):
                self._t = int_or_none(args[0].valueOf(), default=None)
            else:
                arg_type = _js_typeof(args[0])
                if arg_type == 'string':
                    self._t = self.parse(args[0], is_raw=True)
                elif arg_type == 'number':
                    try:
                        self._t = int(args[0])
                    except (OverflowError, ValueError):
                        # NaN, +/-Infinity: invalid date
                        self._t = None
        else:
            self._t = self.__ymd_etc(*args)

    def toString(self):
        """Return the local date and time as text, or 'Invalid Date'"""
        try:
            # strftime() needs a time tuple, not the ms timestamp itself
            local = time.localtime(self._t / 1000.0)
            # %d is already zero-padded (the `0` flag in `%0d` is not portable)
            return time.strftime('%a %b %d %Y %H:%M:%S %Z%z', local).rstrip()
        except (OSError, OverflowError, TypeError, ValueError):
            # _t is None, or outside the platform's supported range
            return "Invalid Date"

    def valueOf(self):
        """Return the ms timestamp, or NaN for an invalid date"""
        return _NaN if self._t is None else self._t
@classmethod @classmethod
def __op_chars(cls): def __op_chars(cls):
op_chars = set(';,[') op_chars = set(';,[')
@ -599,14 +673,15 @@ class JSInterpreter(object):
except Exception as e: except Exception as e:
raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e) raise self.Exception('Failed to evaluate {left_val!r:.50} {op} {right_val!r:.50}'.format(**locals()), expr, cause=e)
def _index(self, obj, idx, allow_undefined=True): def _index(self, obj, idx, allow_undefined=None):
if idx == 'length' and isinstance(obj, list): if idx == 'length' and isinstance(obj, list):
return len(obj) return len(obj)
try: try:
return obj[int(idx)] if isinstance(obj, list) else obj[compat_str(idx)] return obj[int(idx)] if isinstance(obj, list) else obj[compat_str(idx)]
except (TypeError, KeyError, IndexError) as e: except (TypeError, KeyError, IndexError) as e:
if allow_undefined: # allow_undefined is None gives correct behaviour
# when is not allowed? if allow_undefined or (
allow_undefined is None and not isinstance(e, TypeError)):
return JS_Undefined return JS_Undefined
raise self.Exception('Cannot get index {idx!r:.100}'.format(**locals()), expr=repr(obj), cause=e) raise self.Exception('Cannot get index {idx!r:.100}'.format(**locals()), expr=repr(obj), cause=e)
@ -715,7 +790,7 @@ class JSInterpreter(object):
new_kw, _, obj = expr.partition('new ') new_kw, _, obj = expr.partition('new ')
if not new_kw: if not new_kw:
for klass, konstr in (('Date', lambda x: int(unified_timestamp(x, False) * 1000)), for klass, konstr in (('Date', lambda *x: self.JS_Date(*x).valueOf()),
('RegExp', self.JS_RegExp), ('RegExp', self.JS_RegExp),
('Error', self.Exception)): ('Error', self.Exception)):
if not obj.startswith(klass + '('): if not obj.startswith(klass + '('):
@ -1034,6 +1109,7 @@ class JSInterpreter(object):
'String': compat_str, 'String': compat_str,
'Math': float, 'Math': float,
'Array': list, 'Array': list,
'Date': self.JS_Date,
} }
obj = local_vars.get(variable) obj = local_vars.get(variable)
if obj in (JS_Undefined, None): if obj in (JS_Undefined, None):
@ -1086,6 +1162,8 @@ class JSInterpreter(object):
assertion(len(argvals) == 2, 'takes two arguments') assertion(len(argvals) == 2, 'takes two arguments')
return argvals[0] ** argvals[1] return argvals[0] ** argvals[1]
raise self.Exception('Unsupported Math method ' + member, expr=expr) raise self.Exception('Unsupported Math method ' + member, expr=expr)
elif obj is self.JS_Date:
return getattr(obj, member)(*argvals)
if member == 'split': if member == 'split':
assertion(len(argvals) <= 2, 'takes at most two arguments') assertion(len(argvals) <= 2, 'takes at most two arguments')

View File

@ -1960,7 +1960,7 @@ def get_element_by_attribute(attribute, value, html, escape_value=True):
def get_elements_by_class(class_name, html): def get_elements_by_class(class_name, html):
"""Return the content of all tags with the specified class in the passed HTML document as a list""" """Return the content of all tags with the specified class in the passed HTML document as a list"""
return get_elements_by_attribute( return get_elements_by_attribute(
'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), 'class', r'[^\'"]*(?<!-)\b%s\b(?!-)[^\'"]*' % re.escape(class_name),
html, escape_value=False) html, escape_value=False)
@ -4498,7 +4498,7 @@ def strip_jsonp(code):
def js_to_json(code, *args, **kwargs): def js_to_json(code, *args, **kwargs):
# vars is a dict of (var, val) pairs to substitute # vars is a dict of (var, val) pairs to substitute
vars = args[0] if len(args) > 0 else kwargs.get('vars', {}) js_vars = args[0] if len(args) > 0 else kwargs.get('vars', {})
strict = kwargs.get('strict', False) strict = kwargs.get('strict', False)
STRING_QUOTES = '\'"`' STRING_QUOTES = '\'"`'
@ -4523,9 +4523,13 @@ def js_to_json(code, *args, **kwargs):
else escape) else escape)
def template_substitute(match): def template_substitute(match):
evaluated = js_to_json(match.group(1), vars, strict=strict) evaluated = js_to_json(match.group(1), js_vars, strict=strict)
if evaluated[0] == '"': if evaluated[0] == '"':
try:
return json.loads(evaluated) return json.loads(evaluated)
except JSONDecodeError:
if strict:
raise
return evaluated return evaluated
def fix_kv(m): def fix_kv(m):
@ -4559,14 +4563,14 @@ def js_to_json(code, *args, **kwargs):
i = int(im.group(1), base) i = int(im.group(1), base)
return ('"%s":' if v.endswith(':') else '%s') % inv(i) return ('"%s":' if v.endswith(':') else '%s') % inv(i)
if v in vars: if v in js_vars:
try: try:
if not strict: if not strict:
json.loads(vars[v]) json.loads(js_vars[v])
except JSONDecodeError: except JSONDecodeError:
return inv(json.dumps(vars[v])) return inv(json.dumps(js_vars[v]))
else: else:
return inv(vars[v]) return inv(js_vars[v])
if not strict: if not strict:
v = try_call(inv, args=(v,), default=v) v = try_call(inv, args=(v,), default=v)
@ -4577,7 +4581,7 @@ def js_to_json(code, *args, **kwargs):
raise ValueError('Unknown value: ' + v) raise ValueError('Unknown value: ' + v)
def create_map(mobj): def create_map(mobj):
return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=js_vars))))
code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
if not strict: if not strict:
@ -6715,3 +6719,8 @@ class _UnsafeExtensionError(Exception):
raise cls(extension) raise cls(extension)
return extension return extension
def json_stringify(json_data, **kwargs):
    """Serialise json_data as compact JSON text

    kwargs are passed to json.dumps(); `separators` defaults to
    (',', ':'), ie no whitespace between tokens, unless given.
    Returns a text (unicode) string in both Py2 and Py3.
    """
    kwargs.setdefault('separators', (',', ':'))
    dumped = json.dumps(json_data, **kwargs)
    # Py2 json.dumps() may return a byte string; in Py3 it is already text
    # and has no .decode()
    if isinstance(dumped, bytes):
        dumped = dumped.decode('utf-8')
    return dumped