[SpankBang] Rework SpankBangPlaylistIE with pagination

This commit is contained in:
dirkf 2022-06-06 14:23:54 +01:00
parent 30a954bad9
commit 836463013c

View File

@ -1,11 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import itertools
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
extract_attributes,
ExtractorError, ExtractorError,
get_element_by_class,
get_element_by_id,
merge_dicts, merge_dicts,
parse_duration, parse_duration,
parse_resolution, parse_resolution,
@ -173,32 +178,56 @@ class SpankBangIE(InfoExtractor):
class SpankBangPlaylistIE(InfoExtractor):
    """Extractor for SpankBang playlist pages.

    Collects the video links of a playlist, following the "next" link from
    page to page until no further page is advertised.
    """

    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/(?P<display_id>[^/]+)'
    _TESTS = [{
        'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties',
        'info_dict': {
            'id': 'ug0k',
            'title': 'Big Ass Titties',
        },
        'playlist_mincount': 35,
    }, {
        # pagination required
        'url': 'https://spankbang.com/51wxk/playlist/dance',
        'info_dict': {
            'id': '51wxk',
            'title': 'Dance',
        },
        'playlist_mincount': 60,
    }]

    def _real_extract(self, url):
        """Build the playlist result for the playlist at `url`.

        The first page is downloaded eagerly (it is also needed for the
        title); the remaining pages are fetched lazily by the `_entries`
        generator as the result is consumed.
        """
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        display_id = mobj.group('display_id')

        webpage = self._download_webpage(url, playlist_id)

        def _entries(url, webpage=None):
            """Yield absolute item URLs from every page of the playlist.

            `webpage` may carry the already-downloaded first page so that it
            is not fetched a second time; later pages are downloaded here.
            """
            for ii in itertools.count(1):
                if not webpage:
                    # non-fatal: a failed continuation page ends the
                    # listing instead of aborting the whole extraction
                    webpage = self._download_webpage(
                        url, playlist_id,
                        note='Downloading playlist page %d' % (ii, ),
                        fatal=False)
                    if not webpage:
                        break
                # search <main id="container">...</main>.innerHTML
                # (falls back to the whole page if that element is not found)
                for mobj in re.finditer(
                        r'''<a\b[^>]*?\bclass\s*=\s*('|")(?:(?:(?!\1).)+?\s)?\s*thumb\b[^>]*>''',
                        get_element_by_id('container', webpage) or webpage):
                    item_url = extract_attributes(mobj.group(0)).get('href')
                    if item_url:
                        yield urljoin(url, item_url)
                # the pattern drops an optional trailing '/' from the href,
                # which is re-appended below when building the next URL
                next_url = self._search_regex(
                    r'''\bhref\s*=\s*(["'])(?P<path>(?!\1).+?)/?\1''',
                    get_element_by_class('next', webpage) or '',
                    'continuation page', group='path', default=None)
                # stop when there is no "next" link, or when it points back
                # at the page just processed (substring match on the URL)
                if next_url is None or next_url in url:
                    break
                # clear `webpage` so the next iteration downloads the new page
                url, webpage = urljoin(url, next_url + '/'), None

        # fall back to a title derived from the URL slug, turning a '+'
        # between word characters into a space and title-casing the result
        title = self._html_search_regex(
            r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title',
            fatal=False) or re.sub(r'(\w)\+(\w)', r'\1 \2', display_id).title()

        return self.playlist_from_matches(_entries(url, webpage), playlist_id, title, ie=SpankBangIE.ie_key())