[motherless] Fixed the broken uploader_id in the extractor (#31243)

* Fixed the broken uploader_id in the extractor.
* Make uploader_id RE looser
* Fix uploader_id in test Motherless_3
* Fix group pagination
* # coding: utf-8

Co-authored-by: Andy Xuming <xuminic@gmail.com>
Co-authored-by: dirkf <fieldhouse@gmx.net>
This commit is contained in:
Xiyue 2022-10-11 09:52:48 +11:00 committed by GitHub
parent 1b1442887e
commit 82e4eca711
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import datetime import datetime
@ -71,7 +72,7 @@ class MotherlessIE(InfoExtractor):
'title': 'a/ Hot Teens', 'title': 'a/ Hot Teens',
'categories': list, 'categories': list,
'upload_date': '20210104', 'upload_date': '20210104',
'uploader_id': 'yonbiw', 'uploader_id': 'anonymous',
'thumbnail': r're:https?://.*\.jpg', 'thumbnail': r're:https?://.*\.jpg',
'age_limit': 18, 'age_limit': 18,
}, },
@ -127,7 +128,7 @@ class MotherlessIE(InfoExtractor):
comment_count = webpage.count('class="media-comment-contents"') comment_count = webpage.count('class="media-comment-contents"')
uploader_id = self._html_search_regex( uploader_id = self._html_search_regex(
r'"thumb-member-username">\s+<a href="/m/([^"]+)"', r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)''',
webpage, 'uploader_id') webpage, 'uploader_id')
categories = self._html_search_meta('keywords', webpage, default=None) categories = self._html_search_meta('keywords', webpage, default=None)
@ -169,7 +170,7 @@ class MotherlessGroupIE(InfoExtractor):
'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
'any kind!' 'any kind!'
}, },
'playlist_mincount': 9, 'playlist_mincount': 0,
}] }]
@classmethod @classmethod
@ -208,9 +209,9 @@ class MotherlessGroupIE(InfoExtractor):
r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
description = self._html_search_meta( description = self._html_search_meta(
'description', webpage, fatal=False) 'description', webpage, fatal=False)
page_count = self._int(self._search_regex( page_count = str_to_int(self._search_regex(
r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT', r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b',
webpage, 'page_count'), 'page_count') webpage, 'page_count', default='1'))
PAGE_SIZE = 80 PAGE_SIZE = 80
def _get_page(idx): def _get_page(idx):