Compare commits

..

30 Commits

Author SHA1 Message Date
Philipp Hagemeister
fd87ff26b9 release 2013.07.11 2013-07-11 21:04:59 +02:00
Jaime Marquínez Ferrándiz
85347e1cb6 YoutubeIE: a new algo for length 83 2013-07-11 20:21:45 +02:00
Jaime Marquínez Ferrándiz
41897817cc GametrailersIE: support multipart videos
Use xml.etree.ElementTree instead of re when possible
2013-07-11 18:24:53 +02:00
Philipp Hagemeister
45ff2d51d0 [brightcove] add import 2013-07-11 16:31:29 +02:00
Philipp Hagemeister
5de3ece225 [brightcove] fix on Python 2.6 2013-07-11 16:16:02 +02:00
Philipp Hagemeister
df50a41289 [arte] Fix on 2.6 2013-07-11 16:12:16 +02:00
Philipp Hagemeister
59ae56fad5 Add helper function find_path_attr 2013-07-11 16:12:08 +02:00
Philipp Hagemeister
690e872c51 Remove video_result helper method
Calling it was more complex than actually including the type in the video info
2013-07-11 12:12:30 +02:00
Philipp Hagemeister
81082e046e [ehow] improve minor bits 2013-07-11 12:11:00 +02:00
Philipp Hagemeister
3fa9550837 Merge remote-tracking branch 'yasoob/master' 2013-07-11 12:02:16 +02:00
M.Yasoob Khalid
b1082f01a6 added test for ehow 2013-07-11 14:30:25 +05:00
M.Yasoob Khalid
f35b84c807 added an IE for Ehow videos 2013-07-11 14:25:14 +05:00
Jaime Marquínez Ferrándiz
117adb0f0f GenericIE: detect more Brightcove videos
In some sites "class" contains more than BrightcoveExperience
2013-07-11 00:25:38 +02:00
Jaime Marquínez Ferrándiz
abb285fb1b BrightcoveIE: add support for playlists 2013-07-11 00:04:33 +02:00
Jaime Marquínez Ferrándiz
a431154706 Set the playlist_index and playlist fields for already resolved video results. 2013-07-10 23:36:30 +02:00
Jaime Marquínez Ferrándiz
cfe50f04ed GenericIE: Detect videos from Brightcove
Brightcove videos info is usually found in an <object class="BrightcoveExperience"></object> node, this is passed to a new method of BrightcoveIE that builds a url to extract the video.
2013-07-10 17:49:11 +02:00
Jaime Marquínez Ferrándiz
a7055eb956 YoutubeIE: show a more meaningful error when it finds an rtmpe download (related #343) 2013-07-10 14:35:11 +02:00
Philipp Hagemeister
0a1be1e997 release 2013.07.10 2013-07-10 11:36:11 +02:00
Jaime Marquínez Ferrándiz
c93898dae9 YoutubeIE: new algo for length 83 (closes #1017 and closes #1016) 2013-07-10 10:44:04 +02:00
Jaime Marquínez Ferrándiz
ebdf2af727 GameSpotIE: support more urls and download videos in the best quality 2013-07-09 20:07:52 +02:00
Jaime Marquínez Ferrándiz
c108eb73cc YoutubeIE: Fix vevo explicit videos (closes #956)
When an age restricted video is detected it simulates accessing the video from www.youtube.com/v/{video_id}
2013-07-09 15:43:44 +02:00
Jaime Marquínez Ferrándiz
3a1375dacf VeohIE: remove debug logging 2013-07-09 11:11:55 +02:00
Jaime Marquínez Ferrándiz
41bece30b4 DotsubIE: simplify and extract the upload date
Do not declare variables for fields in the info dictionary.
2013-07-08 22:40:42 +02:00
Jaime Marquínez Ferrándiz
16ea58cbda Merge pull request #1009 from yasoob/master
Added an IE and test for dotsub.com videos. ( closes #1008 )
2013-07-08 22:21:06 +02:00
Jaime Marquínez Ferrándiz
99e350d902 Add VeohIE (closes #1006) 2013-07-08 22:02:23 +02:00
M.Yasoob Khalid
13e06d298c added an IE and test for dotsub. 2013-07-09 00:05:52 +05:00
Jaime Marquínez Ferrándiz
81f0259b9e YoutubeSubscriptionsIE: raise an error if there's no login information. 2013-07-08 11:24:11 +02:00
Jaime Marquínez Ferrándiz
fefcb5d314 YoutubeIE: use the new method in the base IE for getting the login info 2013-07-08 11:24:11 +02:00
Philipp Hagemeister
345b0c9b46 Remove dead code 2013-07-08 02:13:50 +02:00
Philipp Hagemeister
20c3893f0e Do not redefine variables in list comprehensions 2013-07-08 02:12:20 +02:00
22 changed files with 385 additions and 113 deletions

View File

@@ -20,9 +20,9 @@ tests = [
# 84
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<",
"<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"),
# 83
# 83 - vflcaqGO8 2013/07/11
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
"D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS<FGHJKLPOIUYTREWZ0987654321mnbvcxzasdfghjklpoiuytrQ"),
"urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"),
# 82
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
"Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"),

View File

@@ -4,6 +4,7 @@
import sys
import unittest
import xml.etree.ElementTree
# Allow direct execution
import os
@@ -16,6 +17,7 @@ from youtube_dl.utils import unescapeHTML
from youtube_dl.utils import orderedSet
from youtube_dl.utils import DateRange
from youtube_dl.utils import unified_strdate
from youtube_dl.utils import find_xpath_attr
if sys.version_info < (3, 0):
_compat_str = lambda b: b.decode('unicode-escape')
@@ -112,5 +114,18 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
def test_find_xpath_attr(self):
    """Check find_xpath_attr against a small document with four <node> children."""
    testxml = u'''<root>
<node/>
<node x="a"/>
<node x="a" y="c" />
<node x="b" y="d" />
</root>'''
    doc = xml.etree.ElementTree.fromstring(testxml)

    # ((xpath, attribute name, attribute value), expected element)
    cases = [
        (('.//fourohfour', 'n', 'v'), None),
        (('.//node', 'x', 'a'), doc[1]),
        (('.//node', 'y', 'c'), doc[2]),
    ]
    for (xpath, key, val), expected in cases:
        self.assertEqual(find_xpath_attr(doc, xpath, key, val), expected)
if __name__ == '__main__':
unittest.main()

View File

@@ -45,7 +45,7 @@ class TestYoutubeSig(unittest.TestCase):
def test_83(self):
wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<"
right = "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS<FGHJKLPOIUYTREWZ0987654321mnbvcxzasdfghjklpoiuytrQ"
right = "urty8ioplkjhgfdsazxcvbqm1234567S90QWERTYUIOPLKJHGFDnAZXCVBNM!#$%^&*()_+={[};?/>.<"
self.assertEqual(sig(wrong), right)
def test_82(self):

View File

@@ -348,6 +348,7 @@ class YoutubeDL(object):
result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
if result_type == 'video':
ie_result.update(extra_info)
if 'playlist' not in ie_result:
# It isn't part of a playlist
ie_result['playlist'] = None

View File

@@ -1,4 +1,3 @@
from .archiveorg import ArchiveOrgIE
from .ard import ARDIE
from .arte import ArteTvIE
@@ -12,7 +11,9 @@ from .comedycentral import ComedyCentralIE
from .cspan import CSpanIE
from .dailymotion import DailymotionIE
from .depositfiles import DepositFilesIE
from .dotsub import DotsubIE
from .dreisat import DreiSatIE
from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .escapist import EscapistIE
from .facebook import FacebookIE
@@ -58,6 +59,7 @@ from .tumblr import TumblrIE
from .tutv import TutvIE
from .ustream import UstreamIE
from .vbox7 import Vbox7IE
from .veoh import VeohIE
from .vevo import VevoIE
from .vimeo import VimeoIE
from .vine import VineIE

View File

@@ -48,6 +48,7 @@ class ArchiveOrgIE(InfoExtractor):
formats.sort(key=lambda fdata: fdata['file_size'])
info = {
'_type': 'video',
'id': video_id,
'title': title,
'formats': formats,
@@ -63,4 +64,4 @@ class ArchiveOrgIE(InfoExtractor):
info['url'] = formats[-1]['url']
info['ext'] = determine_ext(formats[-1]['url'])
return self.video_result(info)
return info

View File

@@ -32,7 +32,7 @@ class ARDIE(InfoExtractor):
# determine title and media streams from webpage
html = self._download_webpage(url, video_id)
title = re.search(self._TITLE, html).group('title')
streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
if not streams:
assert '"fsk"' in html
raise ExtractorError(u'This video is only available after 8:00 pm')

View File

@@ -4,10 +4,8 @@ import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
# This is used by the not implemented extractLiveStream method
compat_urllib_parse,
ExtractorError,
find_xpath_attr,
unified_strdate,
)
@@ -28,6 +26,7 @@ class ArteTvIE(InfoExtractor):
return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL))
# TODO implement Live Stream
# from ..utils import compat_urllib_parse
# def extractLiveStream(self, url):
# video_lang = url.split('/')[-4]
# info = self.grep_webpage(
@@ -57,7 +56,6 @@ class ArteTvIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._EMISSION_URL, url)
if mobj is not None:
name = mobj.group('name')
lang = mobj.group('lang')
# This is not a real id, it can be for example AJT for the news
# http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
@@ -122,7 +120,7 @@ class ArteTvIE(InfoExtractor):
ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
config_node = ref_xml_doc.find('.//video[@lang="%s"]' % lang)
config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
config_xml_url = config_node.attrib['ref']
config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')

View File

@@ -189,5 +189,5 @@ class BlipTVUserIE(InfoExtractor):
pagenum += 1
urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
url_entries = [self.url_result(url, 'BlipTV') for url in urls]
url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
return [self.playlist_result(url_entries, playlist_title = username)]

View File

@@ -1,28 +1,81 @@
import re
import json
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
find_xpath_attr,
)
class BrightcoveIE(InfoExtractor):
_VALID_URL = r'http://.*brightcove\.com/.*\?(?P<query>.*videoPlayer=(?P<id>\d*).*)'
_VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
_FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
_PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'
# There is a test for Brightcove in GenericIE, that way we test both the download
# and the detection of videos, and we don't have to find an URL that is always valid
@classmethod
def _build_brighcove_url(cls, object_str):
"""
Build a Brightcove url from a xml string containing
<object class="BrightcoveExperience">{params}</object>
"""
object_doc = xml.etree.ElementTree.fromstring(object_str)
assert u'BrightcoveExperience' in object_doc.attrib['class']
params = {'flashID': object_doc.attrib['id'],
'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
}
playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
# Not all pages define this value
if playerKey is not None:
params['playerKey'] = playerKey.attrib['value']
videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
if videoPlayer is not None:
params['@videoPlayer'] = videoPlayer.attrib['value']
data = compat_urllib_parse.urlencode(params)
return cls._FEDERATED_URL_TEMPLATE % data
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
query = mobj.group('query')
video_id = mobj.group('id')
request_url = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' % query
m_video_id = re.search(r'videoPlayer=(\d+)', query)
if m_video_id is not None:
video_id = m_video_id.group(1)
return self._get_video_info(video_id, query)
else:
player_key = self._search_regex(r'playerKey=(.+?)(&|$)', query, 'playlist_id')
return self._get_playlist_info(player_key)
def _get_video_info(self, video_id, query):
request_url = self._FEDERATED_URL_TEMPLATE % query
webpage = self._download_webpage(request_url, video_id)
self.report_extraction(video_id)
info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
info = json.loads(info)['data']
video_info = info['programmedContent']['videoPlayer']['mediaDTO']
return self._extract_video_info(video_info)
def _get_playlist_info(self, player_key):
playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
player_key, u'Downloading playlist information')
playlist_info = json.loads(playlist_info)['videoList']
videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
return self.playlist_result(videos, playlist_id=playlist_info['id'],
playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
def _extract_video_info(self, video_info):
renditions = video_info['renditions']
renditions = sorted(renditions, key=lambda r: r['size'])
best_format = renditions[-1]
return {'id': video_id,
return {'id': video_info['id'],
'title': video_info['displayName'],
'url': best_format['defaultURL'],
'ext': 'mp4',

View File

@@ -169,11 +169,6 @@ class InfoExtractor(object):
self.to_screen(u'Logging in')
#Methods for following #608
#They set the correct value of the '_type' key
def video_result(self, video_info):
"""Returns a video"""
video_info['_type'] = 'video'
return video_info
def url_result(self, url, ie=None):
"""Returns a url that points to a page that should be processed"""
#TODO: ie should be the class used for getting the info

View File

@@ -0,0 +1,41 @@
import re
import json
import time
from .common import InfoExtractor
class DotsubIE(InfoExtractor):
    """Information extractor for dotsub.com video pages.

    The video id is taken from the page URL and every other field is read
    from the JSON document returned by the dotsub media metadata endpoint.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?dotsub\.com/view/([^/]+)'
    _TEST = {
        u'url': u'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
        u'file': u'aed3b8b2-1889-4df5-ae63-ad85f5572f27.flv',
        u'md5': u'0914d4d69605090f623b7ac329fea66e',
        u'info_dict': {
            u"title": u"Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary",
            u"uploader": u"4v4l0n42",
            u'description': u'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
            u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
            u'upload_date': u'20101213',
        }
    }

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for ``url``."""
        video_id = re.match(self._VALID_URL, url).group(1)
        metadata_url = "https://dotsub.com/api/media/%s/metadata" % video_id
        metadata = json.loads(self._download_webpage(metadata_url, video_id))

        # 'dateCreated' is a timestamp in milliseconds, gmtime works in seconds
        created = time.gmtime(metadata['dateCreated']/1000)
        upload_date = u'%04i%02i%02i' % (created.tm_year, created.tm_mon, created.tm_mday)

        video_info = {
            'id': video_id,
            'url': metadata['mediaURI'],
            'ext': 'flv',
            'title': metadata['title'],
            'thumbnail': metadata['screenshotURI'],
            'description': metadata['description'],
            'uploader': metadata['user'],
            'view_count': metadata['numberOfViews'],
            'upload_date': upload_date,
        }
        return [video_info]

View File

@@ -6,7 +6,6 @@ import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
determine_ext,
ExtractorError,
unified_strdate,
)
@@ -68,6 +67,7 @@ class DreiSatIE(InfoExtractor):
formats.sort(key=_sortkey)
info = {
'_type': 'video',
'id': video_id,
'title': video_title,
'formats': formats,
@@ -82,4 +82,4 @@ class DreiSatIE(InfoExtractor):
info['url'] = formats[-1]['url']
info['ext'] = determine_ext(formats[-1]['url'])
return self.video_result(info)
return info

View File

@@ -0,0 +1,51 @@
import re
from ..utils import (
compat_urllib_parse,
determine_ext
)
from .common import InfoExtractor
class EHowIE(InfoExtractor):
    """Information extractor for videos on ehow.com.

    Everything is scraped from the video page itself: the media URL from a
    ``file=``/``source=`` parameter and the metadata from ``<meta>`` tags.
    """

    IE_NAME = u'eHow'
    _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
    _TEST = {
        u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
        u'file': u'12245069.flv',
        u'md5': u'9809b4e3f115ae2088440bcb4efbf371',
        u'info_dict': {
            u"title": u"Hardwood Flooring Basics",
            u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...",
            u"uploader": u"Erick Nathan"
        }
    }

    def _real_extract(self, url):
        """Download the page for ``url`` and return its video info dictionary."""
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)

        # The media URL is stored quoted inside the page, so unquote it
        quoted_url = self._search_regex(
            r'(?:file|source)=(http[^\'"&]*)', page, u'video URL')
        media_url = compat_urllib_parse.unquote(quoted_url)

        thumbnail = self._search_regex(
            r'<meta property="og:image" content="(.+?)" />',
            page, u'thumbnail URL')
        uploader = self._search_regex(
            r'<meta name="uploader" content="(.+?)" />',
            page, u'uploader')
        page_title = self._search_regex(
            r'<meta property="og:title" content="(.+?)" />',
            page, u'Video title')
        # Drop the site suffix from the og:title value
        title = page_title.replace(' | eHow', '')
        description = self._search_regex(
            r'<meta property="og:description" content="(.+?)" />',
            page, u'video description')

        return {
            '_type': 'video',
            'id': video_id,
            'url': media_url,
            'ext': determine_ext(media_url),
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
            'uploader': uploader,
        }

View File

@@ -4,14 +4,15 @@ import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
unified_strdate,
compat_urllib_parse,
)
class GameSpotIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/([^/]+)/videos/([^/]+)-([^/d]+)/'
_VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
_TEST = {
u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
u"file": u"6410818.mp4",
u"md5": u"5569d64ca98db01f0177c934fe8c1e9b",
u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
u"info_dict": {
u"title": u"Arma III - Community Guide: SITREP I",
u"upload_date": u"20130627",
@@ -21,13 +22,22 @@ class GameSpotIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(3).split("-")[-1]
info_url = "http://www.gamespot.com/pages/video_player/xml.php?id="+str(video_id)
page_id = mobj.group('page_id')
webpage = self._download_webpage(url, page_id)
video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"',
r'http://www\.gamespot\.com/videoembed/(\d+)'],
webpage, 'video id')
data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'})
info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data
info_xml = self._download_webpage(info_url, video_id)
doc = xml.etree.ElementTree.fromstring(info_xml)
clip_el = doc.find('./playList/clip')
video_url = clip_el.find('./URI').text
http_urls = [{'url': node.find('filePath').text,
'rate': int(node.find('rate').text)}
for node in clip_el.find('./httpURI')]
best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1]
video_url = best_quality['url']
title = clip_el.find('./title').text
ext = video_url.rpartition('.')[2]
thumbnail_url = clip_el.find('./screenGrabURI').text

View File

@@ -1,4 +1,5 @@
import re
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -11,7 +12,7 @@ class GametrailersIE(InfoExtractor):
_VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
_TEST = {
u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
u'file': u'zbvr8i.flv',
u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.flv',
u'md5': u'c3edbc995ab4081976e16779bd96a878',
u'info_dict': {
u"title": u"E3 2013: Debut Trailer"
@@ -24,45 +25,39 @@ class GametrailersIE(InfoExtractor):
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
video_type = mobj.group('type')
webpage = self._download_webpage(url, video_id)
if video_type == 'full-episodes':
mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
else:
mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
mgid = self._search_regex(mgid_re, webpage, u'mgid')
data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"',
r'data-contentId=\'(?P<mgid>mgid:.*?)\''],
webpage, u'mgid')
data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
video_id, u'Downloading video info')
links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
default_thumb = doc.find('./channel/image/url').text
media_namespace = {'media': 'http://search.yahoo.com/mrss/'}
parts = [{
'title': video_doc.find('title').text,
'ext': 'flv',
'id': video_doc.find('guid').text.rpartition(':')[2],
# Videos are actually flv not mp4
'url': self._get_video_url(video_doc.find('media:group/media:content', media_namespace).attrib['url'], video_id),
# The thumbnail may not be defined, it would be ''
'thumbnail': video_doc.find('media:group/media:thumbnail', media_namespace).attrib['url'] or default_thumb,
'description': video_doc.find('description').text,
} for video_doc in doc.findall('./channel/item')]
return parts
def _get_video_url(self, mediagen_url, video_id):
if 'acceptMethods' not in mediagen_url:
mediagen_url += '&acceptMethods=fms'
links_webpage = self._download_webpage(mediagen_url,
video_id, u'Downloading video urls info')
self.report_extraction(video_id)
info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
<description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
<image>.*
<url>(?P<thumb>.*?)</url>.*
</image>'''
m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
if m_info is None:
raise ExtractorError(u'Unable to extract video info')
video_title = m_info.group('title')
video_description = m_info.group('description')
video_thumb = m_info.group('thumb')
m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
if m_urls is None or len(m_urls) == 0:
doc = xml.etree.ElementTree.fromstring(links_webpage)
urls = list(doc.iter('src'))
if len(urls) == 0:
raise ExtractorError(u'Unable to extract video url')
# They are sorted from worst to best quality
video_url = m_urls[-1].group('url')
return urls[-1].text
return {'url': video_url,
'id': video_id,
'title': video_title,
# Videos are actually flv not mp4
'ext': 'flv',
'thumbnail': video_thumb,
'description': video_description,
}

View File

@@ -1,3 +1,5 @@
# encoding: utf-8
import os
import re
@@ -9,20 +11,34 @@ from ..utils import (
ExtractorError,
)
from .brightcove import BrightcoveIE
class GenericIE(InfoExtractor):
IE_DESC = u'Generic downloader that works on some sites'
_VALID_URL = r'.*'
IE_NAME = u'generic'
_TEST = {
u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
u'file': u'13601338388002.mp4',
u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
u'info_dict': {
u"uploader": u"www.hodiho.fr",
u"title": u"R\u00e9gis plante sa Jeep"
}
}
_TESTS = [
{
u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
u'file': u'13601338388002.mp4',
u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
u'info_dict': {
u"uploader": u"www.hodiho.fr",
u"title": u"R\u00e9gis plante sa Jeep"
}
},
{
u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/',
u'file': u'2371591881001.mp4',
u'md5': u'9e80619e0a94663f0bdc849b4566af19',
u'note': u'Test Brightcove downloads and detection in GenericIE',
u'info_dict': {
u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
u'uploader': u'8TV',
u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
}
},
]
def report_download_webpage(self, video_id):
"""Report webpage download."""
@@ -103,6 +119,13 @@ class GenericIE(InfoExtractor):
raise ExtractorError(u'Invalid URL: %s' % url)
self.report_extraction(video_id)
# Look for Brightcove:
m_brightcove = re.search(r'<object.+?class=".*?BrightcoveExperience.*?".+?</object>', webpage, re.DOTALL)
if m_brightcove is not None:
self.to_screen(u'Brightcove video detected.')
bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
return self.url_result(bc_url, 'Brightcove')
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:

View File

@@ -23,14 +23,16 @@ class SteamIE(InfoExtractor):
u"file": u"81300.flv",
u"md5": u"f870007cee7065d7c76b88f0a45ecc07",
u"info_dict": {
u"title": u"Terraria 1.1 Trailer"
u"title": u"Terraria 1.1 Trailer",
u'playlist_index': 1,
}
},
{
u"file": u"80859.flv",
u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751",
u"info_dict": {
u"title": u"Terraria Trailer"
u"title": u"Terraria Trailer",
u'playlist_index': 2,
}
}
]

View File

@@ -0,0 +1,47 @@
import re
import json
from .common import InfoExtractor
from ..utils import (
determine_ext,
)
class VeohIE(InfoExtractor):
    """Information extractor for veoh.com watch pages.

    Pages that reference a Youtube player URL are handed over to the Youtube
    extractor; otherwise the fields come from the ``videoDetailsJSON`` object
    embedded in the page.
    """

    _VALID_URL = r'http://www\.veoh\.com/watch/v(?P<id>\d*)'
    _TEST = {
        u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3',
        u'file': u'56314296.mp4',
        u'md5': u'620e68e6a3cff80086df3348426c9ca3',
        u'info_dict': {
            u'title': u'Straight Backs Are Stronger',
            u'uploader': u'LUMOback',
            u'description': u'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
        }
    }

    def _real_extract(self, url):
        """Return a Youtube url result or the video info dictionary for ``url``."""
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, video_id)

        youtube_match = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', page)
        if youtube_match is not None:
            # The page points at a Youtube video: let the Youtube IE handle it
            self.to_screen(u'%s: detected Youtube video.' % video_id)
            return self.url_result(youtube_match.group(1), 'Youtube')

        self.report_extraction(video_id)
        details_json = self._search_regex(
            r'videoDetailsJSON = \'({.*?})\';', page, 'info')
        details = json.loads(details_json)

        # Try the "High" entries first and fall back to the alternative keys
        video_url = details.get('fullPreviewHashHighPath') or details.get('fullPreviewHashLowPath')
        thumbnail = details.get('highResImage') or details.get('medResImage')

        return {
            'id': details['videoId'],
            'title': details['title'],
            'ext': determine_ext(video_url),
            'url': video_url,
            'uploader': details['username'],
            'thumbnail': thumbnail,
            'description': details['description'],
            'view_count': details['views'],
        }

View File

@@ -117,7 +117,19 @@ class YoutubeIE(InfoExtractor):
u"uploader": u"IconaPop",
u"uploader_id": u"IconaPop"
}
}
},
{
u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
u"file": u"07FYdnEawAQ.mp4",
u"note": u"Test VEVO video with age protection (#956)",
u"info_dict": {
u"upload_date": u"20130703",
u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
u"description": u"md5:64249768eec3bc4276236606ea996373",
u"uploader": u"justintimberlakeVEVO",
u"uploader_id": u"justintimberlakeVEVO"
}
},
]
@@ -131,10 +143,6 @@ class YoutubeIE(InfoExtractor):
"""Report attempt to set language."""
self.to_screen(u'Setting language')
def report_login(self):
"""Report attempt to log in."""
self.to_screen(u'Logging in')
def report_video_webpage_download(self, video_id):
"""Report attempt to download video webpage."""
self.to_screen(u'%s: Downloading video webpage' % video_id)
@@ -182,7 +190,7 @@ class YoutubeIE(InfoExtractor):
elif len(s) == 84:
return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26]
elif len(s) == 83:
return s[52] + s[81:55:-1] + s[2] + s[54:52:-1] + s[82] + s[51:36:-1] + s[55] + s[35:2:-1] + s[36]
return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[53] + s[34:53] + s[24] + s[54:]
elif len(s) == 82:
return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
@@ -296,26 +304,6 @@ class YoutubeIE(InfoExtractor):
if self._downloader is None:
return
username = None
password = None
downloader_params = self._downloader.params
# Attempt to use provided username and password or .netrc data
if downloader_params.get('username', None) is not None:
username = downloader_params['username']
password = downloader_params['password']
elif downloader_params.get('usenetrc', False):
try:
info = netrc.netrc().authenticators(self._NETRC_MACHINE)
if info is not None:
username = info[0]
password = info[2]
else:
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError) as err:
self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
return
# Set language
request = compat_urllib_request.Request(self._LANG_URL)
try:
@@ -325,6 +313,8 @@ class YoutubeIE(InfoExtractor):
self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
return
(username, password) = self._get_login_info()
# No authentication to be performed
if username is None:
return
@@ -432,15 +422,35 @@ class YoutubeIE(InfoExtractor):
# Get video info
self.report_video_info_webpage_download(video_id)
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (video_id, el_type))
if re.search(r'player-age-gate-content">', video_webpage) is not None:
self.report_age_confirmation()
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
data = compat_urllib_parse.urlencode({'video_id': video_id,
'el': 'embedded',
'gl': 'US',
'hl': 'en',
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
'asv': 3,
'sts':'1588',
})
video_info_url = 'https://www.youtube.com/get_video_info?' + data
video_info_webpage = self._download_webpage(video_info_url, video_id,
note=False,
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
if 'token' in video_info:
break
else:
age_gate = False
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (video_id, el_type))
video_info_webpage = self._download_webpage(video_info_url, video_id,
note=False,
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
if 'token' in video_info:
break
if 'token' not in video_info:
if 'reason' in video_info:
raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
@@ -557,6 +567,8 @@ class YoutubeIE(InfoExtractor):
self.report_rtmp_download()
video_url_list = [(None, video_info['conn'][0])]
elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
url_map = {}
for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
url_data = compat_parse_qs(url_data_str)
@@ -567,9 +579,15 @@ class YoutubeIE(InfoExtractor):
elif 's' in url_data:
if self._downloader.params.get('verbose'):
s = url_data['s'][0]
player = self._search_regex(r'html5player-(.+?)\.js', video_webpage,
'html5 player', fatal=False)
self.to_screen('encrypted signature length %d (%d.%d), itag %s, html5 player %s' %
if age_gate:
player_version = self._search_regex(r'ad3-(.+?)\.swf',
video_info['ad3_module'][0], 'flash player',
fatal=False)
player = 'flash player %s' % player_version
else:
player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
'html5 player', fatal=False)
self.to_screen('encrypted signature length %d (%d.%d), itag %s, %s' %
(len(s), len(s.split('.')[0]), len(s.split('.')[1]), url_data['itag'][0], player))
signature = self._decrypt_signature(url_data['s'][0])
url += '&signature=' + signature
@@ -697,7 +715,7 @@ class YoutubePlaylistIE(InfoExtractor):
videos = [v[1] for v in sorted(videos)]
url_results = [self.url_result(url, 'Youtube') for url in videos]
url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
return [self.playlist_result(url_results, playlist_id, playlist_title)]
@@ -755,7 +773,7 @@ class YoutubeChannelIE(InfoExtractor):
self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
url_entries = [self.url_result(url, 'Youtube') for url in urls]
url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
return [self.playlist_result(url_entries, channel_id)]
@@ -812,7 +830,7 @@ class YoutubeUserIE(InfoExtractor):
pagenum += 1
urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
url_results = [self.url_result(url, 'Youtube') for url in urls]
url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
@@ -887,6 +905,12 @@ class YoutubeSubscriptionsIE(YoutubeIE):
def suitable(cls, url):
return re.match(cls._VALID_URL, url) is not None
def _real_initialize(self):
    """Require login information, then run the inherited initialization.

    Raises ExtractorError (marked as expected) when ``_get_login_info``
    returns no username.
    """
    username, _ = self._get_login_info()
    if username is not None:
        super(YoutubeSubscriptionsIE, self)._real_initialize()
        return
    raise ExtractorError(u'No login info available, needed for downloading the Youtube subscriptions.', expected=True)
def _real_extract(self, url):
feed_entries = []
# The step argument is available only in 2.7 or higher

View File

@@ -198,6 +198,20 @@ else:
with open(fn, 'w', encoding='utf-8') as f:
json.dump(obj, f)
def find_xpath_attr(node, xpath, key, val):
    """Find the first element matching xpath[@key=val].

    The elements selected by ``xpath`` (relative to ``node``) are scanned in
    document order and the first one whose attribute ``key`` equals ``val``
    is returned; None is returned when no element matches.

    The attribute is compared in Python instead of being interpolated into
    an XPath predicate. That keeps the behaviour identical on every Python
    version, does not rely on ``assert`` (which is stripped under ``-O``) to
    validate the arguments, and accepts values containing digits, hyphens
    or quotes.
    """
    for element in node.findall(xpath):
        if element.attrib.get(key) == val:
            return element
    return None
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a character.

View File

@@ -1,2 +1,2 @@
__version__ = '2013.07.08.1'
__version__ = '2013.07.11'