From 2b25cb5d7693b62736d4cdfa656289cc429c4c81 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister
Date: Sun, 30 Mar 2014 07:02:58 +0200
Subject: [PATCH] [youtube] Move JavaScript interpreter into its own module

---
 youtube_dl/extractor/youtube.py | 110 ++-----------------------------
 youtube_dl/jsinterp.py          | 113 ++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+), 106 deletions(-)
 create mode 100644 youtube_dl/jsinterp.py

diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 3a3a5a39e..2d1a19123 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -14,6 +14,7 @@ import zlib
 
 from .common import InfoExtractor, SearchInfoExtractor
 from .subtitles import SubtitlesInfoExtractor
+from ..jsinterp import JSInterpreter
 from ..utils import (
     compat_chr,
     compat_parse_qs,
@@ -438,113 +439,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
     def _parse_sig_js(self, jscode):
         funcname = self._search_regex(
             r'signature=([a-zA-Z]+)', jscode,
-            u'Initial JS player signature function name')
+             u'Initial JS player signature function name')
 
-        functions = {}
-
-        def argidx(varname):
-            return string.lowercase.index(varname)
-
-        def interpret_statement(stmt, local_vars, allow_recursion=20):
-            if allow_recursion < 0:
-                raise ExtractorError(u'Recursion limit reached')
-
-            if stmt.startswith(u'var '):
-                stmt = stmt[len(u'var '):]
-            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
-                             r'=(?P<expr>.*)$', stmt)
-            if ass_m:
-                if ass_m.groupdict().get('index'):
-                    def assign(val):
-                        lvar = local_vars[ass_m.group('out')]
-                        idx = interpret_expression(ass_m.group('index'),
-                                                   local_vars, allow_recursion)
-                        assert isinstance(idx, int)
-                        lvar[idx] = val
-                        return val
-                    expr = ass_m.group('expr')
-                else:
-                    def assign(val):
-                        local_vars[ass_m.group('out')] = val
-                        return val
-                    expr = ass_m.group('expr')
-            elif stmt.startswith(u'return '):
-                assign = lambda v: v
-                expr = stmt[len(u'return '):]
-            else:
-                raise ExtractorError(
-                    u'Cannot determine left side of statement in %r' % stmt)
-
-            v = interpret_expression(expr, local_vars, allow_recursion)
-            return assign(v)
-
-        def interpret_expression(expr, local_vars, allow_recursion):
-            if expr.isdigit():
-                return int(expr)
-
-            if expr.isalpha():
-                return local_vars[expr]
-
-            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
-            if m:
-                member = m.group('member')
-                val = local_vars[m.group('in')]
-                if member == 'split("")':
-                    return list(val)
-                if member == 'join("")':
-                    return u''.join(val)
-                if member == 'length':
-                    return len(val)
-                if member == 'reverse()':
-                    return val[::-1]
-                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
-                if slice_m:
-                    idx = interpret_expression(
-                        slice_m.group('idx'), local_vars, allow_recursion-1)
-                    return val[idx:]
-
-            m = re.match(
-                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
-            if m:
-                val = local_vars[m.group('in')]
-                idx = interpret_expression(m.group('idx'), local_vars,
-                                           allow_recursion-1)
-                return val[idx]
-
-            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
-            if m:
-                a = interpret_expression(m.group('a'),
-                                         local_vars, allow_recursion)
-                b = interpret_expression(m.group('b'),
-                                         local_vars, allow_recursion)
-                return a % b
-
-            m = re.match(
-                r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
-            if m:
-                fname = m.group('func')
-                if fname not in functions:
-                    functions[fname] = extract_function(fname)
-                argvals = [int(v) if v.isdigit() else local_vars[v]
-                           for v in m.group('args').split(',')]
-                return functions[fname](argvals)
-            raise ExtractorError(u'Unsupported JS expression %r' % expr)
-
-        def extract_function(funcname):
-            func_m = re.search(
-                r'function ' + re.escape(funcname) +
-                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
-                jscode)
-            argnames = func_m.group('args').split(',')
-
-            def resf(args):
-                local_vars = dict(zip(argnames, args))
-                for stmt in func_m.group('code').split(';'):
-                    res = interpret_statement(stmt, local_vars)
-                return res
-            return resf
-
-        initial_function = extract_function(funcname)
+        jsi = JSInterpreter(jscode)
+        initial_function = jsi.extract_function(funcname)
         return lambda s: initial_function([s])
 
     def _parse_sig_swf(self, file_contents):
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
new file mode 100644
index 000000000..129a4027b
--- /dev/null
+++ b/youtube_dl/jsinterp.py
@@ -0,0 +1,113 @@
+from __future__ import unicode_literals
+
+import re
+
+from .utils import (
+    ExtractorError,
+)
+
+
+class JSInterpreter(object):
+    def __init__(self, code):
+        self.code = code
+        self._functions = {}
+
+    def interpret_statement(self, stmt, local_vars, allow_recursion=20):
+        if allow_recursion < 0:
+            raise ExtractorError('Recursion limit reached')
+
+        if stmt.startswith('var '):
+            stmt = stmt[len('var '):]
+        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
+                         r'=(?P<expr>.*)$', stmt)
+        if ass_m:
+            if ass_m.groupdict().get('index'):
+                def assign(val):
+                    lvar = local_vars[ass_m.group('out')]
+                    idx = self.interpret_expression(
+                        ass_m.group('index'), local_vars, allow_recursion)
+                    assert isinstance(idx, int)
+                    lvar[idx] = val
+                    return val
+                expr = ass_m.group('expr')
+            else:
+                def assign(val):
+                    local_vars[ass_m.group('out')] = val
+                    return val
+                expr = ass_m.group('expr')
+        elif stmt.startswith('return '):
+            assign = lambda v: v
+            expr = stmt[len('return '):]
+        else:
+            raise ExtractorError(
+                'Cannot determine left side of statement in %r' % stmt)
+
+        v = self.interpret_expression(expr, local_vars, allow_recursion)
+        return assign(v)
+
+    def interpret_expression(self, expr, local_vars, allow_recursion):
+        if expr.isdigit():
+            return int(expr)
+
+        if expr.isalpha():
+            return local_vars[expr]
+
+        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
+        if m:
+            member = m.group('member')
+            val = local_vars[m.group('in')]
+            if member == 'split("")':
+                return list(val)
+            if member == 'join("")':
+                return u''.join(val)
+            if member == 'length':
+                return len(val)
+            if member == 'reverse()':
+                return val[::-1]
+            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
+            if slice_m:
+                idx = self.interpret_expression(
+                    slice_m.group('idx'), local_vars, allow_recursion - 1)
+                return val[idx:]
+
+        m = re.match(
+            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
+        if m:
+            val = local_vars[m.group('in')]
+            idx = self.interpret_expression(
+                m.group('idx'), local_vars, allow_recursion - 1)
+            return val[idx]
+
+        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
+        if m:
+            a = self.interpret_expression(
+                m.group('a'), local_vars, allow_recursion)
+            b = self.interpret_expression(
+                m.group('b'), local_vars, allow_recursion)
+            return a % b
+
+        m = re.match(
+            r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
+        if m:
+            fname = m.group('func')
+            if fname not in self._functions:
+                self._functions[fname] = self.extract_function(fname)
+            argvals = [int(v) if v.isdigit() else local_vars[v]
+                       for v in m.group('args').split(',')]
+            return self._functions[fname](argvals)
+        raise ExtractorError('Unsupported JS expression %r' % expr)
+
+    def extract_function(self, funcname):
+        func_m = re.search(
+            r'function ' + re.escape(funcname) +
+            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
+            self.code)
+        argnames = func_m.group('args').split(',')
+
+        def resf(args):
+            local_vars = dict(zip(argnames, args))
+            for stmt in func_m.group('code').split(';'):
+                res = self.interpret_statement(stmt, local_vars)
+            return res
+        return resf
+