Restore accidentally deleted commits

That's what happens if you let Windows machines write :(
This commit is contained in:
Philipp Hagemeister 2013-09-21 14:19:30 +02:00
parent b00ca882a4
commit e0df6211cc
4 changed files with 677 additions and 21 deletions

1
.gitignore vendored
View File

@ -24,3 +24,4 @@ updates_key.pem
*.flv
*.mp4
*.part
test/testdata

View File

@ -0,0 +1,80 @@
#!/usr/bin/env python
import io
import re
import string
import sys
import unittest
# Allow direct execution
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from youtube_dl.extractor import YoutubeIE
from youtube_dl.utils import compat_str, compat_urlretrieve
_TESTS = [
(
u'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js',
u'js',
86,
u'>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321',
),
(
u'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js',
u'js',
85,
u'3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@',
),
(
u'https://s.ytimg.com/yts/swfbin/watch_as3-vflg5GhxU.swf',
u'swf',
82,
u'23456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!?#$%&\'()*+,-./:;<=>"'
),
]
class TestSignature(unittest.TestCase):
def setUp(self):
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata')
if not os.path.exists(self.TESTDATA_DIR):
os.mkdir(self.TESTDATA_DIR)
def make_testfunc(url, stype, sig_length, expected_sig):
basename = url.rpartition('/')[2]
m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename)
assert m, '%r should follow URL format' % basename
test_id = m.group(1)
def test_func(self):
fn = os.path.join(self.TESTDATA_DIR, basename)
if not os.path.exists(fn):
compat_urlretrieve(url, fn)
ie = YoutubeIE()
if stype == 'js':
with io.open(fn, encoding='utf-8') as testf:
jscode = testf.read()
func = ie._parse_sig_js(jscode)
else:
assert stype == 'swf'
with open(fn, 'rb') as testf:
swfcode = testf.read()
func = ie._parse_sig_swf(swfcode)
src_sig = compat_str(string.printable[:sig_length])
got_sig = func(src_sig)
self.assertEqual(got_sig, expected_sig)
test_func.__name__ = str('test_signature_' + stype + '_' + test_id)
setattr(TestSignature, test_func.__name__, test_func)
for test_spec in _TESTS:
make_testfunc(*test_spec)
if __name__ == '__main__':
unittest.main()

View File

@ -1,11 +1,16 @@
# coding: utf-8
import collections
import itertools
import io
import json
import netrc
import re
import socket
import itertools
import xml.etree.ElementTree
import string
import struct
import traceback
import zlib
from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
@ -393,6 +398,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if YoutubePlaylistIE.suitable(url): return False
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def __init__(self, *args, **kwargs):
super(YoutubeIE, self).__init__(*args, **kwargs)
self._jsplayer_cache = {}
def report_video_webpage_download(self, video_id):
"""Report attempt to download video webpage."""
self.to_screen(u'%s: Downloading video webpage' % video_id)
@ -413,15 +422,565 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
"""Indicate the download will use the RTMP protocol."""
self.to_screen(u'RTMP download detected')
def _decrypt_signature(self, s):
def _extract_signature_function(self, video_id, player_url):
id_m = re.match(r'.*-(?P<id>[^.]+)\.(?P<ext>[^.]+)$', player_url)
player_type = id_m.group('ext')
player_id = id_m.group('id')
if player_type == 'js':
code = self._download_webpage(
player_url, video_id,
note=u'Downloading %s player %s' % (player_type, jsplayer_id),
errnote=u'Download of %s failed' % player_url)
return self._parse_sig_js(code)
elif player_tpye == 'swf':
urlh = self._request_webpage(
player_url, video_id,
note=u'Downloading %s player %s' % (player_type, jsplayer_id),
errnote=u'Download of %s failed' % player_url)
code = urlh.read()
return self._parse_sig_swf(code)
else:
assert False, 'Invalid player type %r' % player_type
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
r'signature=([a-zA-Z]+)', jscode,
u'Initial JS player signature function name')
functions = {}
def argidx(varname):
return string.lowercase.index(varname)
def interpret_statement(stmt, local_vars, allow_recursion=20):
if allow_recursion < 0:
raise ExctractorError(u'Recursion limit reached')
if stmt.startswith(u'var '):
stmt = stmt[len(u'var '):]
ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
r'=(?P<expr>.*)$', stmt)
if ass_m:
if ass_m.groupdict().get('index'):
def assign(val):
lvar = local_vars[ass_m.group('out')]
idx = interpret_expression(ass_m.group('index'),
local_vars, allow_recursion)
assert isinstance(idx, int)
lvar[idx] = val
return val
expr = ass_m.group('expr')
else:
def assign(val):
local_vars[ass_m.group('out')] = val
return val
expr = ass_m.group('expr')
elif stmt.startswith(u'return '):
assign = lambda v: v
expr = stmt[len(u'return '):]
else:
raise ExtractorError(
u'Cannot determine left side of statement in %r' % stmt)
v = interpret_expression(expr, local_vars, allow_recursion)
return assign(v)
def interpret_expression(expr, local_vars, allow_recursion):
if expr.isdigit():
return int(expr)
if expr.isalpha():
return local_vars[expr]
m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
if m:
member = m.group('member')
val = local_vars[m.group('in')]
if member == 'split("")':
return list(val)
if member == 'join("")':
return u''.join(val)
if member == 'length':
return len(val)
if member == 'reverse()':
return val[::-1]
slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
if slice_m:
idx = interpret_expression(
slice_m.group('idx'), local_vars, allow_recursion-1)
return val[idx:]
m = re.match(
r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
if m:
val = local_vars[m.group('in')]
idx = interpret_expression(m.group('idx'), local_vars,
allow_recursion-1)
return val[idx]
m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
if m:
a = interpret_expression(m.group('a'),
local_vars, allow_recursion)
b = interpret_expression(m.group('b'),
local_vars, allow_recursion)
return a % b
m = re.match(
r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
if m:
fname = m.group('func')
if fname not in functions:
functions[fname] = extract_function(fname)
argvals = [int(v) if v.isdigit() else local_vars[v]
for v in m.group('args').split(',')]
return functions[fname](argvals)
raise ExtractorError(u'Unsupported JS expression %r' % expr)
def extract_function(funcname):
func_m = re.search(
r'function ' + re.escape(funcname) +
r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
jscode)
argnames = func_m.group('args').split(',')
def resf(args):
local_vars = dict(zip(argnames, args))
for stmt in func_m.group('code').split(';'):
res = interpret_statement(stmt, local_vars)
return res
return resf
initial_function = extract_function(funcname)
return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents):
if file_contents[1:3] != b'WS':
raise ExtractorError(
u'Not an SWF file; header is %r' % file_contents[:3])
if file_contents[:1] == b'C':
content = zlib.decompress(file_contents[8:])
else:
raise NotImplementedError(u'Unsupported compression format %r' %
file_contents[:1])
def extract_tags(content):
pos = 0
while pos < len(content):
header16 = struct.unpack('<H', content[pos:pos+2])[0]
pos += 2
tag_code = header16 >> 6
tag_len = header16 & 0x3f
if tag_len == 0x3f:
tag_len = struct.unpack('<I', content[pos:pos+4])[0]
pos += 4
assert pos+tag_len <= len(content)
yield (tag_code, content[pos:pos+tag_len])
pos += tag_len
code_tag = next(tag
for tag_code, tag in extract_tags(content)
if tag_code == 82)
p = code_tag.index(b'\0', 4) + 1
# Parse ABC (AVM2 ByteCode)
def read_int(data=None, pos=None):
if hasattr(data, 'read'):
assert pos is None
res = 0
shift = 0
for _ in range(5):
buf = data.read(1)
assert len(buf) == 1
b = struct.unpack('<B', buf)[0]
res = res | ((b & 0x7f) << shift)
if b & 0x80 == 0:
break
shift += 7
return res
if data is None:
data = code_tag
if pos is None:
pos = p
res = 0
shift = 0
for _ in range(5):
b = struct.unpack('<B', data[pos:pos+1])[0]
pos += 1
res = res | ((b & 0x7f) << shift)
if b & 0x80 == 0:
break
shift += 7
return (res, pos)
assert read_int(b'\x00', 0) == (0, 1)
assert read_int(b'\x10', 0) == (16, 1)
assert read_int(b'\x34', 0) == (0x34, 1)
assert read_int(b'\xb4\x12', 0) == (0x12 * 0x80 + 0x34, 2)
assert read_int(b'\xff\xff\xff\x00', 0) == (0x1fffff, 4)
def u30(*args, **kwargs):
res = read_int(*args, **kwargs)
if isinstance(res, tuple):
assert res[0] & 0xf0000000 == 0
else:
assert res & 0xf0000000 == 0
return res
u32 = read_int
def s32(data=None, pos=None):
v, pos = read_int(data, pos)
if v & 0x80000000 != 0:
v = - ((v ^ 0xffffffff) + 1)
return (v, pos)
assert s32(b'\xff\xff\xff\xff\x0f', 0) == (-1, 5)
def string():
slen, p = u30()
return (code_tag[p:p+slen].decode('utf-8'), p + slen)
def read_byte(data=None, pos=None):
if data is None:
data = code_tag
if pos is None:
pos = p
res = struct.unpack('<B', data[pos:pos+1])[0]
return (res, pos + 1)
# minor_version + major_version
p += 2 + 2
# Constant pool
int_count, p = u30()
for _c in range(1, int_count):
_, p = s32()
uint_count, p = u30()
for _c in range(1, uint_count):
_, p = u32()
double_count, p = u30()
p += (double_count-1) * 8
string_count, p = u30()
constant_strings = [u'']
for _c in range(1, string_count):
s, p = string()
constant_strings.append(s)
namespace_count, p = u30()
for _c in range(1, namespace_count):
p += 1 # kind
_, p = u30() # name
ns_set_count, p = u30()
for _c in range(1, ns_set_count):
count, p = u30()
for _c2 in range(count):
_, p = u30()
multiname_count, p = u30()
MULTINAME_SIZES = {
0x07: 2, # QName
0x0d: 2, # QNameA
0x0f: 1, # RTQName
0x10: 1, # RTQNameA
0x11: 0, # RTQNameL
0x12: 0, # RTQNameLA
0x09: 2, # Multiname
0x0e: 2, # MultinameA
0x1b: 1, # MultinameL
0x1c: 1, # MultinameLA
}
multinames = [u'']
for _c in range(1, multiname_count):
kind, p = u30()
assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
if kind == 0x07:
namespace_idx, p = u30()
name_idx, p = u30()
multinames.append(constant_strings[name_idx])
else:
multinames.append('[MULTINAME kind: %d]' % kind)
for _c2 in range(MULTINAME_SIZES[kind]):
_, p = u30()
# Methods
method_count, p = u30()
MethodInfo = collections.namedtuple(
'MethodInfo',
['NEED_ARGUMENTS', 'NEED_REST'])
method_infos = []
for method_id in range(method_count):
param_count, p = u30()
_, p = u30() # return type
for _ in range(param_count):
_, p = u30() # param type
_, p = u30() # name index (always 0 for youtube)
flags, p = read_byte()
if flags & 0x08 != 0:
# Options present
option_count, p = u30()
for c in range(option_count):
_, p = u30() # val
p += 1 # kind
if flags & 0x80 != 0:
# Param names present
for _ in range(param_count):
_, p = u30() # param name
mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
method_infos.append(mi)
# Metadata
metadata_count, p = u30()
for _c in range(metadata_count):
_, p = u30() # name
item_count, p = u30()
for _c2 in range(item_count):
_, p = u30() # key
_, p = u30() # value
def parse_traits_info(pos=None):
if pos is None:
pos = p
trait_name_idx, pos = u30(pos=pos)
kind_full, pos = read_byte(pos=pos)
kind = kind_full & 0x0f
attrs = kind_full >> 4
methods = {}
if kind in [0x00, 0x06]: # Slot or Const
_, pos = u30(pos=pos) # Slot id
type_name_idx, pos = u30(pos=pos)
vindex, pos = u30(pos=pos)
if vindex != 0:
_, pos = read_byte(pos=pos) # vkind
elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
_, pos = u30(pos=pos) # disp_id
method_idx, pos = u30(pos=pos)
methods[multinames[trait_name_idx]] = method_idx
elif kind == 0x04: # Class
_, pos = u30(pos=pos) # slot_id
_, pos = u30(pos=pos) # classi
elif kind == 0x05: # Function
_, pos = u30(pos=pos) # slot_id
function_idx, pos = u30(pos=pos)
methods[function_idx] = multinames[trait_name_idx]
else:
raise ExtractorError(u'Unsupported trait kind %d' % kind)
if attrs & 0x4 != 0: # Metadata present
metadata_count, pos = u30(pos=pos)
for _c3 in range(metadata_count):
_, pos = u30(pos=pos)
return (methods, pos)
# Classes
TARGET_CLASSNAME = u'SignatureDecipher'
searched_idx = multinames.index(TARGET_CLASSNAME)
searched_class_id = None
class_count, p = u30()
for class_id in range(class_count):
name_idx, p = u30()
if name_idx == searched_idx:
# We found the class we're looking for!
searched_class_id = class_id
_, p = u30() # super_name idx
flags, p = read_byte()
if flags & 0x08 != 0: # Protected namespace is present
protected_ns_idx, p = u30()
intrf_count, p = u30()
for _c2 in range(intrf_count):
_, p = u30()
_, p = u30() # iinit
trait_count, p = u30()
for _c2 in range(trait_count):
_, p = parse_traits_info()
if searched_class_id is None:
raise ExtractorError(u'Target class %r not found' %
TARGET_CLASSNAME)
method_names = {}
method_idxs = {}
for class_id in range(class_count):
_, p = u30() # cinit
trait_count, p = u30()
for _c2 in range(trait_count):
trait_methods, p = parse_traits_info()
if class_id == searched_class_id:
method_names.update(trait_methods.items())
method_idxs.update(dict(
(idx, name)
for name, idx in trait_methods.items()))
# Scripts
script_count, p = u30()
for _c in range(script_count):
_, p = u30() # init
trait_count, p = u30()
for _c2 in range(trait_count):
_, p = parse_traits_info()
# Method bodies
method_body_count, p = u30()
Method = collections.namedtuple('Method', ['code', 'local_count'])
methods = {}
for _c in range(method_body_count):
method_idx, p = u30()
max_stack, p = u30()
local_count, p = u30()
init_scope_depth, p = u30()
max_scope_depth, p = u30()
code_length, p = u30()
if method_idx in method_idxs:
m = Method(code_tag[p:p+code_length], local_count)
methods[method_idxs[method_idx]] = m
p += code_length
exception_count, p = u30()
for _c2 in range(exception_count):
_, p = u30() # from
_, p = u30() # to
_, p = u30() # target
_, p = u30() # exc_type
_, p = u30() # var_name
trait_count, p = u30()
for _c2 in range(trait_count):
_, p = parse_traits_info()
assert p == len(code_tag)
assert len(methods) == len(method_idxs)
method_pyfunctions = {}
def extract_function(func_name):
if func_name in method_pyfunctions:
return method_pyfunctions[func_name]
if func_name not in methods:
raise ExtractorError(u'Cannot find function %r' % func_name)
m = methods[func_name]
def resfunc(args):
print('Entering function %s(%r)' % (func_name, args))
registers = ['(this)'] + list(args) + [None] * m.local_count
stack = []
coder = io.BytesIO(m.code)
while True:
opcode = struct.unpack('!B', coder.read(1))[0]
if opcode == 208: # getlocal_0
stack.append(registers[0])
elif opcode == 209: # getlocal_1
stack.append(registers[1])
elif opcode == 210: # getlocal_2
stack.append(registers[2])
elif opcode == 36: # pushbyte
v = struct.unpack('!B', coder.read(1))[0]
stack.append(v)
elif opcode == 44: # pushstring
idx = u30(coder)
stack.append(constant_strings[idx])
elif opcode == 48: # pushscope
# We don't implement the scope register, so we'll just
# ignore the popped value
stack.pop()
elif opcode == 70: # callproperty
index = u30(coder)
mname = multinames[index]
arg_count = u30(coder)
args = list(reversed(
[stack.pop() for _ in range(arg_count)]))
obj = stack.pop()
if mname == u'split':
assert len(args) == 1
assert isinstance(args[0], compat_str)
assert isinstance(obj, compat_str)
if args[0] == u'':
res = list(obj)
else:
res = obj.split(args[0])
stack.append(res)
elif mname in method_pyfunctions:
stack.append(method_pyfunctions[mname](args))
else:
raise NotImplementedError(
u'Unsupported property %r on %r'
% (mname, obj))
elif opcode == 93: # findpropstrict
index = u30(coder)
mname = multinames[index]
res = extract_function(mname)
stack.append(res)
elif opcode == 97: # setproperty
index = u30(coder)
value = stack.pop()
idx = stack.pop()
obj = stack.pop()
assert isinstance(obj, list)
assert isinstance(idx, int)
obj[idx] = value
elif opcode == 98: # getlocal
index = u30(coder)
stack.append(registers[index])
elif opcode == 99: # setlocal
index = u30(coder)
value = stack.pop()
registers[index] = value
elif opcode == 102: # getproperty
index = u30(coder)
pname = multinames[index]
if pname == u'length':
obj = stack.pop()
assert isinstance(obj, list)
stack.append(len(obj))
else: # Assume attribute access
idx = stack.pop()
assert isinstance(idx, int)
obj = stack.pop()
assert isinstance(obj, list)
stack.append(obj[idx])
elif opcode == 128: # coerce
_ = u30(coder)
elif opcode == 133: # coerce_s
assert isinstance(stack[-1], (type(None), compat_str))
elif opcode == 164: # modulo
value2 = stack.pop()
value1 = stack.pop()
res = value1 % value2
stack.append(res)
elif opcode == 214: # setlocal_2
registers[2] = stack.pop()
elif opcode == 215: # setlocal_3
registers[3] = stack.pop()
else:
raise NotImplementedError(
u'Unsupported opcode %d' % opcode)
method_pyfunctions[func_name] = resfunc
return resfunc
initial_function = extract_function(u'decipher')
return lambda s: initial_function([s])
def _decrypt_signature(self, s, video_id, jsplayer_url, age_gate=False):
"""Turn the encrypted s field into a working signature"""
if len(s) == 93:
return s[86:29:-1] + s[88] + s[28:5:-1]
elif len(s) == 92:
if jsplayer_url is not None:
try:
if jsplayer_url not in self._jsplayer_cache:
self._jsplayer_cache[jsplayer_url] = self._extract_signature_function(
video_id, jsplayer_url
)
return self._jsplayer_cache[jsplayer_url]([s])
except Exception as e:
tb = traceback.format_exc()
self._downloader.report_warning(u'Automatic signature extraction failed: ' + tb)
self._downloader.report_warning(u'Warning: Falling back to static signature algorithm')
if age_gate:
# The videos with age protection use another player, so the
# algorithms can be different.
if len(s) == 86:
return s[2:63] + s[82] + s[64:82] + s[63]
if len(s) == 92:
return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
elif len(s) == 91:
return s[84:27:-1] + s[86] + s[26:5:-1]
elif len(s) == 90:
return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
elif len(s) == 89:
@ -631,7 +1190,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
# Attempt to extract SWF player URL
mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
if mobj is not None:
player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
else:
@ -784,21 +1343,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if 'sig' in url_data:
url += '&signature=' + url_data['sig'][0]
elif 's' in url_data:
if self._downloader.params.get('verbose'):
s = url_data['s'][0]
if age_gate:
player = 'flash player'
else:
player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
'html5 player', fatal=False)
parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
(len(s), parts_sizes, url_data['itag'][0], player))
encrypted_sig = url_data['s'][0]
if self._downloader.params.get('verbose'):
if age_gate:
signature = self._decrypt_signature_age_gate(encrypted_sig)
player_version = self._search_regex(r'-(.+)\.swf$',
player_url if player_url else 'NOT FOUND',
'flash player', fatal=False)
player_desc = 'flash player %s' % player_version
else:
signature = self._decrypt_signature(encrypted_sig)
player_version = self._search_regex(r'html5player-(.+?)\.js', video_webpage,
'html5 player', fatal=False)
player_desc = u'html5 player %s' % player_version
parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
(len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
if age_gate:
jsplayer_url = None
else:
jsplayer_url_json = self._search_regex(
r'"assets":.+?"js":\s*("[^"]+")',
video_webpage, u'JS player URL')
jsplayer_url = json.loads(jsplayer_url_json)
signature = self._decrypt_signature(encrypted_sig, video_id, jsplayer_url, age_gate)
url += '&signature=' + signature
if 'ratebypass' not in url:
url += '&ratebypass=yes'

View File

@ -66,6 +66,12 @@ try:
except ImportError: # Python 2
from urllib2 import HTTPError as compat_HTTPError
try:
from urllib.request import urlretrieve as compat_urlretrieve
except ImportError: # Python 2
from urllib import urlretrieve as compat_urlretrieve
try:
from subprocess import DEVNULL
compat_subprocess_get_DEVNULL = lambda: DEVNULL