Compare commits

...

2 Commits

Author SHA1 Message Date
Sergey M․
ecae54a98d
[motherless] Fix review issues and improve extraction (closes #26495, closes #27450) 2021-01-05 07:40:30 +07:00
cladmi
f318882955
[motherless] Fix recent videos upload date extraction (closes #27661)
Less than a week old videos use a '20h ago' or '1d ago' format.

I kept the support for 'Ago' with uppercase start at is was already in the code.
2021-01-05 07:40:25 +07:00

View File

@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor):
# no keywords # no keywords
'url': 'http://motherless.com/8B4BBC1', 'url': 'http://motherless.com/8B4BBC1',
'only_matching': True, 'only_matching': True,
}, {
# see https://motherless.com/videos/recent for recent videos with
# uploaded date in "ago" format
'url': 'https://motherless.com/3C3E2CF',
'info_dict': {
'id': '3C3E2CF',
'ext': 'mp4',
'title': 'a/ Hot Teens',
'categories': list,
'upload_date': '20210104',
'uploader_id': 'yonbiw',
'thumbnail': r're:https?://.*\.jpg',
'age_limit': 18,
},
'params': {
'skip_download': True,
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -85,20 +102,28 @@ class MotherlessIE(InfoExtractor):
or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
age_limit = self._rta_search(webpage) age_limit = self._rta_search(webpage)
view_count = str_to_int(self._html_search_regex( view_count = str_to_int(self._html_search_regex(
(r'>(\d+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'), (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
webpage, 'view count', fatal=False)) webpage, 'view count', fatal=False))
like_count = str_to_int(self._html_search_regex( like_count = str_to_int(self._html_search_regex(
(r'>(\d+)\s+Favorites<', r'<strong>Favorited</strong>\s+([^<]+)<'), (r'>([\d,.]+)\s+Favorites<',
r'<strong>Favorited</strong>\s+([^<]+)<'),
webpage, 'like count', fatal=False)) webpage, 'like count', fatal=False))
upload_date = self._html_search_regex( upload_date = unified_strdate(self._search_regex(
(r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date') 'upload date', default=None))
if 'Ago' in upload_date: if not upload_date:
days = int(re.search(r'([0-9]+)', upload_date).group(1)) uploaded_ago = self._search_regex(
upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
else: default=None)
upload_date = unified_strdate(upload_date) if uploaded_ago:
delta = int(uploaded_ago[:-1])
_AGO_UNITS = {
'h': 'hours',
'd': 'days',
}
kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
comment_count = webpage.count('class="media-comment-contents"') comment_count = webpage.count('class="media-comment-contents"')
uploader_id = self._html_search_regex( uploader_id = self._html_search_regex(