[yahoo] Extract all <iframe>s

Fixes test_yahoo_6

(https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html)
This commit is contained in:
Yen Chi Hsuan 2016-04-24 17:46:25 +08:00
parent 4f54958097
commit d9ed362116
No known key found for this signature in database
GPG Key ID: 3FDDD575826C5C30

View File

@@ -92,14 +92,28 @@ class YahooIE(InfoExtractor):
} }
}, { }, {
'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html', 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html',
'md5': '226a895aae7e21b0129e2a2006fe9690', 'info_dict': {
'id': '154609075',
},
'playlist': [{
'md5': 'f8e336c6b66f503282e5f719641d6565',
'info_dict': { 'info_dict': {
'id': 'e624c4bc-3389-34de-9dfc-025f74943409', 'id': 'e624c4bc-3389-34de-9dfc-025f74943409',
'ext': 'mp4', 'ext': 'mp4',
'title': '\'The Interview\' TV Spot: War', 'title': '\'The Interview\' TV Spot: War',
'description': 'The Interview', 'description': 'The Interview',
'duration': 30, 'duration': 30,
} },
}, {
'md5': '958bcb90b4d6df71c56312137ee1cd5a',
'info_dict': {
'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9',
'ext': 'mp4',
'title': '\'The Interview\' TV Spot: Guys',
'description': 'The Interview',
'duration': 30,
},
}],
}, { }, {
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
'md5': '88e209b417f173d86186bef6e4d1f160', 'md5': '88e209b417f173d86186bef6e4d1f160',
@@ -191,16 +205,21 @@ class YahooIE(InfoExtractor):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
# Look for iframed media first # Look for iframed media first
iframe_m = re.search(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) entries = []
if iframe_m: iframe_urls = re.findall(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
for idx, iframe_url in enumerate(iframe_urls):
iframepage = self._download_webpage( iframepage = self._download_webpage(
host + iframe_m.group(1), display_id, 'Downloading iframe webpage') host + iframe_url, display_id,
note='Downloading iframe webpage for video #%d' % idx)
items_json = self._search_regex( items_json = self._search_regex(
r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None) r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None)
if items_json: if items_json:
items = json.loads(items_json) items = json.loads(items_json)
video_id = items[0]['id'] video_id = items[0]['id']
return self._get_info(video_id, display_id, webpage) entries.append(self._get_info(video_id, display_id, webpage))
if entries:
return self.playlist_result(entries, page_id)
# Look for NBCSports iframes # Look for NBCSports iframes
nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
if nbc_sports_url: if nbc_sports_url: