From 58f15bb6c8733635cbb2486ebb49bde3b17675a2 Mon Sep 17 00:00:00 2001 From: df Date: Sat, 13 Nov 2021 22:03:19 +0000 Subject: [PATCH] [utils] Don't find classname as part of class="... x-classname ...", etc Eg, in [1], the class with name 'plist-info' was found when searching for 'info'. 1. https://github.com/ytdl-org/youtube-dl/issues/30230 --- test/test_utils.py | 5 ++++- youtube_dl/utils.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 8df794c13..1186dd46b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1619,7 +1619,10 @@ Line 1 def test_get_elements_by_class(self): html = ''' - nicealso nice + nasty + nice + "also nice" + also nasty ''' self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice']) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 653fe9fcd..19bdd3049 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1986,7 +1986,9 @@ def get_element_by_attribute(attribute, value, html, escape_value=True): def get_elements_by_class(class_name, html): """Return the content of all tags with the specified class in the passed HTML document as a list""" return get_elements_by_attribute( - 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + # class names can contain alphanumeric, -, _ and \ for escapes + # don't allow a word break at - + 'class', r'(?:[\w\s\\-]*?[\w\s])?\b%s\b(?:[\w\s\\][\w\s\\-]*?)?' % re.escape(class_name), html, escape_value=False) @@ -1997,11 +1999,13 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True): retlist = [] for m in re.finditer(r'''(?xs) - <([a-zA-Z0-9:._-]+) + <([a-zA-Z0-9:._-]+) # conservative pattern: HTML tags don't have :._- + # (?:\s[^>]+) # this seems to be simpler than the below and work the same? (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s+%s=['"]?%s['"]? + \s*\b%s\s*=\s*(?P<__q>'|"|\b)%s(?P=__q) + # (?:\s[^>]+)? 
# as above (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s*> + \s*> (?P<content>.*?) </\1> ''' % (re.escape(attribute), value), html):