[common] Extract author from SocialMediaPosting in ld+json

This commit is contained in:
dirkf 2022-02-27 13:31:42 +00:00
parent ab2df904e0
commit 078f268317

View File

@ -1273,9 +1273,21 @@ class InfoExtractor(object):
continue continue
info[count_key] = interaction_count info[count_key] = interaction_count
def extract_author(e):
author = e.get('author')
if not author:
return
info.update({
# author can be an instance of 'Organization' or 'Person' types.
# both types can have 'name' property(inherited from 'Thing' type). [1]
# however some websites are using 'Text' type instead.
# 1. https://schema.org/VideoObject
'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
})
def extract_video_object(e): def extract_video_object(e):
assert e['@type'] == 'VideoObject' assert e['@type'] == 'VideoObject'
author = e.get('author') extract_author(e)
info.update({ info.update({
'url': url_or_none(e.get('contentUrl')), 'url': url_or_none(e.get('contentUrl')),
'title': unescapeHTML(e.get('name')), 'title': unescapeHTML(e.get('name')),
@ -1283,11 +1295,6 @@ class InfoExtractor(object):
'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
'duration': parse_duration(e.get('duration')), 'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('uploadDate')), 'timestamp': unified_timestamp(e.get('uploadDate')),
# author can be an instance of 'Organization' or 'Person' types.
# both types can have 'name' property(inherited from 'Thing' type). [1]
# however some websites are using 'Text' type instead.
# 1. https://schema.org/VideoObject
'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
'filesize': float_or_none(e.get('contentSize')), 'filesize': float_or_none(e.get('contentSize')),
'tbr': int_or_none(e.get('bitrate')), 'tbr': int_or_none(e.get('bitrate')),
'width': int_or_none(e.get('width')), 'width': int_or_none(e.get('width')),
@ -1332,6 +1339,8 @@ class InfoExtractor(object):
'title': unescapeHTML(e.get('headline')), 'title': unescapeHTML(e.get('headline')),
'description': unescapeHTML(e.get('articleBody')), 'description': unescapeHTML(e.get('articleBody')),
}) })
elif item_type == 'SocialMediaPosting':
extract_author(e)
elif item_type == 'VideoObject': elif item_type == 'VideoObject':
extract_video_object(e) extract_video_object(e)
if expected_type is None: if expected_type is None: