2012-03-25 10:07:37 +09:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
2013-08-28 19:57:10 +09:00
import datetime
import email . utils
2013-05-13 16:20:08 +09:00
import errno
2012-03-25 10:07:37 +09:00
import gzip
2012-11-28 08:09:17 +09:00
import io
2012-12-20 21:13:24 +09:00
import json
2012-03-25 10:07:37 +09:00
import locale
import os
2013-10-12 20:49:27 +09:00
import pipes
2013-08-28 19:57:10 +09:00
import platform
2012-03-25 10:07:37 +09:00
import re
2013-08-28 19:57:10 +09:00
import socket
2012-03-25 10:07:37 +09:00
import sys
2013-01-03 23:39:55 +09:00
import traceback
2012-03-25 10:07:37 +09:00
import zlib
2012-11-28 07:54:09 +09:00
try :
2012-11-28 10:04:46 +09:00
import urllib . request as compat_urllib_request
2012-11-28 07:54:09 +09:00
except ImportError : # Python 2
2012-11-28 10:04:46 +09:00
import urllib2 as compat_urllib_request
2012-11-28 07:54:09 +09:00
try :
2012-11-28 10:04:46 +09:00
import urllib . error as compat_urllib_error
2012-11-28 07:54:09 +09:00
except ImportError : # Python 2
2012-11-28 10:04:46 +09:00
import urllib2 as compat_urllib_error
2012-11-28 07:54:09 +09:00
try :
2012-11-28 10:04:46 +09:00
import urllib . parse as compat_urllib_parse
2012-11-28 07:54:09 +09:00
except ImportError : # Python 2
2012-11-28 10:04:46 +09:00
import urllib as compat_urllib_parse
2012-11-28 07:54:09 +09:00
2012-11-28 12:51:27 +09:00
try :
from urllib . parse import urlparse as compat_urllib_parse_urlparse
except ImportError : # Python 2
from urlparse import urlparse as compat_urllib_parse_urlparse
2013-07-12 21:53:28 +09:00
try :
import urllib . parse as compat_urlparse
except ImportError : # Python 2
import urlparse as compat_urlparse
2012-11-28 07:54:09 +09:00
try :
2012-11-28 10:04:46 +09:00
import http . cookiejar as compat_cookiejar
2012-11-28 07:54:09 +09:00
except ImportError : # Python 2
2012-11-28 10:04:46 +09:00
import cookielib as compat_cookiejar
2012-11-28 07:54:09 +09:00
2012-11-28 08:02:55 +09:00
try :
2012-11-28 10:04:46 +09:00
import html . entities as compat_html_entities
2012-11-28 08:17:12 +09:00
except ImportError : # Python 2
2012-11-28 10:04:46 +09:00
import htmlentitydefs as compat_html_entities
2012-11-28 08:02:55 +09:00
2012-11-28 08:06:28 +09:00
try :
2012-11-28 10:04:46 +09:00
import html . parser as compat_html_parser
2012-11-28 08:17:12 +09:00
except ImportError : # Python 2
2012-11-28 10:04:46 +09:00
import HTMLParser as compat_html_parser
2012-11-28 08:06:28 +09:00
2012-11-28 08:13:00 +09:00
try :
2012-11-28 10:04:46 +09:00
import http . client as compat_http_client
2012-11-28 08:17:12 +09:00
except ImportError : # Python 2
2012-11-28 10:04:46 +09:00
import httplib as compat_http_client
2012-11-28 08:13:00 +09:00
2013-08-28 11:25:38 +09:00
try :
2013-08-28 17:18:39 +09:00
from urllib . error import HTTPError as compat_HTTPError
2013-08-28 11:25:38 +09:00
except ImportError : # Python 2
from urllib2 import HTTPError as compat_HTTPError
2013-09-21 21:19:30 +09:00
try :
from urllib . request import urlretrieve as compat_urlretrieve
except ImportError : # Python 2
from urllib import urlretrieve as compat_urlretrieve
2012-12-16 20:29:03 +09:00
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a writable handle on the null device.

        Used when the subprocess module does not provide DEVNULL; every call
        opens os.devnull anew.
        """
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return subprocess.DEVNULL."""
        return DEVNULL
2012-11-28 08:17:12 +09:00
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from
    # cpython 3's stdlib. Python 2's version is apparently totally broken.

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode."""
        if string == '':
            return string
        chunks = string.split('%')
        if len(chunks) == 1:
            # No percent sign at all, nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pending: contiguous sequence of percent-encoded bytes that has
        # been read but not yet decoded to text
        pending = b''
        string = chunks[0]
        for chunk in chunks[1:]:
            try:
                if not chunk:
                    raise ValueError
                pending += chunk[:2].decode('hex')
                tail = chunk[2:]
                if not tail:
                    # This segment was just a single percent-encoded
                    # character. It may be part of a sequence of code
                    # units, so decoding is delayed.
                    continue
            except ValueError:
                tail = '%' + chunk
            # Encountered non-percent-encoded characters: flush the bytes
            # collected so far, then append the plain text.
            string += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush whatever is left at the end of the input
            string += pending.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs."""
        to_text = unicode
        fields = [piece
                  for amp_part in qs.split('&')
                  for piece in amp_part.split(';')]
        decoded = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            parts = field.split('=', 1)
            if len(parts) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
                parts.append('')
            if len(parts[1]) or keep_blank_values:
                name = _unquote(
                    parts[0].replace('+', ' '),
                    encoding=encoding, errors=errors)
                value = _unquote(
                    parts[1].replace('+', ' '),
                    encoding=encoding, errors=errors)
                decoded.append((to_text(name), to_text(value)))
        return decoded

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping names to value lists."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            grouped.setdefault(name, []).append(value)
        return grouped
2012-11-28 08:13:00 +09:00
2012-11-28 08:02:55 +09:00
# Text type: ``unicode`` where that builtin exists (Python 2), else ``str``.
try:
    unicode
except NameError:
    compat_str = str
else:
    compat_str = unicode  # Python 2

# Code point -> one-character text string: ``unichr`` where it exists
# (Python 2), else ``chr``.
try:
    unichr
except NameError:
    compat_chr = chr
else:
    compat_chr = unichr  # Python 2
2012-11-28 08:02:55 +09:00
2013-05-20 18:57:10 +09:00
def compat_ord(c):
    """Return c unchanged if it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
2013-06-06 21:35:08 +09:00
# The class of compiled regular expressions is not clearly defined
# otherwise, so take it from an actual compiled (empty) pattern.
compiled_regex_type = re.compile('').__class__
2012-11-28 08:02:55 +09:00
# Default header name -> value table; YoutubeDLHandler.http_request adds
# every entry to the requests it processes.
std_headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) '
        'Gecko/20100101 Firefox/10.0 (Chrome)'
    ),
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;'
        'q=0.9,*/*;q=0.8'
    ),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
2012-12-31 02:22:36 +09:00
2012-03-25 10:07:37 +09:00
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually be used to
    encode text, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the codec really exists and works before trusting it
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad best-effort fallback, but no longer a bare
        # except: KeyboardInterrupt and SystemExit must still propagate.
        pref = 'UTF-8'

    return pref
2012-03-25 10:07:37 +09:00
2012-11-28 08:46:21 +09:00
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded in the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
2012-03-25 10:07:37 +09:00
2012-12-20 21:13:24 +09:00
# In Python 3.x, json.dump writes to a character stream.
# In Python 2.x, it expects a bytestream.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as json_file:
            json.dump(obj, json_file)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (binary mode)."""
        with open(fn, 'wb') as json_file:
            json.dump(obj, json_file)
2013-07-11 23:12:08 +09:00
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first node matching xpath whose attribute key equals
        val, or None if there is none."""
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression unescaped, so
        # only a conservative character set is accepted.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
2013-10-13 04:34:04 +09:00
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{uri}tag'.

    ns_map maps prefixes to namespace URIs. Steps without a colon are kept
    as they are; an unknown prefix raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
2012-03-25 10:07:37 +09:00
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity body,
    i.e. the text between '&' and ';'.

    Returns the decoded character for known named entities and for decimal
    (#NNN) or hexadecimal (#xHH) character references. Anything else,
    including numeric references outside the valid code point range, is
    returned as its literal '&...;' representation.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references need the full hex digit set; matching only \d after
    # the 'x' would cut '#x2F' down to '#x2' and decode the wrong character.
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a representable code point: keep the text as it was
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
2012-03-25 10:07:37 +09:00
2012-11-28 08:06:28 +09:00
# backport bugfix: replace the module-level locatestarttagend pattern of the
# HTML parser module
compat_html_parser.locatestarttagend = re.compile(
    r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""",
    re.VERBOSE,
)
2013-09-14 05:05:29 +09:00
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse.

    The raw text passed to loads() is kept in self.html so that subclasses
    can slice the original document afterwards.
    """

    # This used to be spelled "__init", which Python never calls as a
    # constructor; self.html therefore did not exist before loads().
    def __init__(self):
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html, feed all of it to the parser and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        # What to look for
        self.attribute = attribute
        self.value = value
        # Parsing state. result grows to
        # [tag, position after the opening tag, position of the closing tag]
        self.result = None
        self.started = False
        self.watch_startpos = False
        self.depth = {}
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by skipping one line of input.

        Gives up (raises HTMLParseError) after more than ten errors or as
        soon as the wanted tag has been entered.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        self.rawdata = '\n'.join(self.html.split('\n')[current_line:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            # Count open tags per name to find the matching closing tag
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        if self.depth[self.result[0]] == 0:
            # The wanted tag has just been closed
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())

    # Any other kind of content marks the start position as well
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if the tag was not found or never closed."""
        if self.result is None or len(self.result) != 3:
            return None
        start_line, start_col = self.result[1][0], self.result[1][1]
        end_line, end_col = self.result[2][0], self.result[2][1]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line: the end column must be shifted by
            # what has just been cut off at the front
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
2013-02-02 01:29:50 +09:00
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _parse_endtag_hack(self, i):
        """Skip over a literal </scr'+'ipt> at position i; delegate every
        other end tag to the stock HTMLParser implementation."""
        split_marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(split_marker):
            return i + len(split_marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _parse_endtag_hack
    del _parse_endtag_hack
2012-04-11 07:22:51 +09:00
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals the given ID
    in the passed HTML document"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the
    passed HTML document"""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the error
        pass
    return attr_parser.get_result()
2012-04-11 07:22:51 +09:00
2013-09-14 05:05:29 +09:00
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                # A later matching tag overwrites an earlier one
                self.result = attributes.get('content')

    def get_result(self):
        """Return the content attribute found, or None."""
        return self.result
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the error
        pass
    return meta_parser.get_result()
2012-04-11 07:22:51 +09:00
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; line breaks come from markup only
    html = html.replace('\n', ' ')
    substitutions = (
        # <br> and a paragraph boundary (</p><p>) become newlines
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip the remaining html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
2012-04-11 07:22:51 +09:00
2012-03-25 10:07:37 +09:00
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename u'-' stands for standard output (its binary buffer where
    one exists).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final path component is rewritten: the pattern also
        # matches path separators and the drive colon, so applying it to
        # the directory part would yield a different, bogus location.
        # (Previously a generator was handed to os.path.join(), which never
        # produced a usable path.)
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name; retrying the original name would
            # only fail the same way again.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
2012-03-25 10:07:37 +09:00
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
2012-11-27 07:58:46 +09:00
2012-12-03 23:36:24 +09:00
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible (the underscore clean-up is skipped then).
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them at both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
2012-03-25 10:07:37 +09:00
def orderedSet(iterable):
    """Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element in its
    original order. Elements are compared with ==, so they need not be
    hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
2012-03-25 10:07:37 +09:00
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s using htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
2012-03-25 10:07:37 +09:00
def encodeFilename(s):
    """
    Return s in the form expected by the file functions of this platform.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    # Characters the encoding cannot represent are dropped
    return s.encode(fs_encoding, 'ignore')
2012-03-25 10:07:37 +09:00
2013-02-22 01:09:39 +09:00
def decodeOption(optval):
    """Return optval as text; None is passed through unchanged.

    Byte strings are decoded with the preferred encoding.
    """
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
2013-01-02 04:27:53 +09:00
2013-05-04 19:02:18 +09:00
def formatSeconds(secs):
    """Format a duration given in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used. The boundaries are inclusive, so
    exactly 60 seconds reads '1:00' and exactly 3600 seconds '1:00:00'
    (a strict comparison used to yield '60' and '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
2013-05-04 19:19:02 +09:00
def make_HTTPS_handler(opts):
    """Create the HTTPS handler for the opener.

    From Python 3.2 on, the handler gets an SSL context that loads the
    default verification paths and requires a valid certificate unless
    opts.no_check_certificate is set.
    """
    if sys.version_info >= (3, 2):
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()
        if opts.no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        return compat_urllib_request.HTTPSHandler(context=context)

    # Python's 2.x handler is very simplistic
    return compat_urllib_request.HTTPSHandler()
2013-05-04 19:19:02 +09:00
2013-01-02 04:27:53 +09:00
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        active = sys.exc_info()
        # Errors raised while one of these exceptions is being handled are
        # always treated as expected
        caused_by_known_error = active[0] in (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if not (caused_by_known_error or expected):
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = active  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback as text, or None if there is none."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
2013-01-02 04:27:53 +09:00
2013-10-23 21:38:03 +09:00
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
2012-03-25 10:07:37 +09:00
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
2012-03-25 10:07:37 +09:00
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
2012-03-25 10:07:37 +09:00
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is kept on the instance as .msg
        self.msg = msg
2012-03-25 10:07:37 +09:00
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
2012-03-25 10:07:37 +09:00
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
2012-03-25 10:07:37 +09:00
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults for the two sizes, both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
2012-03-25 10:07:37 +09:00
2013-08-28 06:15:01 +09:00
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to a
        zlib-wrapped stream if that fails."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries the status code, also
        where addinfourl itself does not take a code argument."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        response = addinfourl(stream, headers, url)
        response.code = code
        return response

    @staticmethod
    def _gunzip(content):
        """Return the gunzipped bytes of content."""
        gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
        try:
            return gz.read()
        except IOError as original_ioerror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            # Retry with up to 1023 trailing bytes cut off
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                    return gz.read()
                except IOError:
                    continue
            raise original_ioerror

    def http_request(self, req):
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)

        # Internal pseudo-headers steer this handler and are never sent
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            uncompressed = io.BytesIO(self._gunzip(resp.read()))
            resp = self.addinfourl_wrapper(
                uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(
                inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic is processed exactly like HTTP traffic
    https_request = http_request
    https_response = http_response
2013-04-27 22:14:20 +09:00
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a list of known date formats; None is
    returned when none of them matches.
    """
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # strptime signals "format does not match" with ValueError;
            # anything else is a real error and must not be hidden
            pass
    return upload_date
2013-07-13 04:52:59 +09:00
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The candidate is whatever follows the last '.' of the part before the
    first '?'. It is returned if it is purely alphanumeric; otherwise
    default_ext is returned.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
2013-07-08 08:13:55 +09:00
2013-07-20 19:48:57 +09:00
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with everything after its last '.' replaced by
    '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
2013-04-27 21:01:55 +09:00
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today

    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Not a relative date, so it has to be an absolute one
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # A bad approximation? Months and years are converted to a fixed
    # number of days.
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    delta = datetime.timedelta(**{unit + 's': amount})
    return today + delta
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A bound left as None extends to the earliest / latest date that can
        be represented.
        """
        self.start = (datetime.datetime.min.date() if start is None
                      else date_from_str(start))
        self.end = (datetime.datetime.max.date() if end is None
                    else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            # Anything that is not a date is parsed like the bounds
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
2013-08-28 19:57:10 +09:00
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
2013-08-29 01:22:28 +09:00
2013-09-16 13:55:33 +09:00
def write_string(s, out=None):
    """Write the text string s to out (sys.stderr by default) and flush.

    The text is encoded with the preferred encoding first if out is a
    binary-mode stream or if running on Python 2.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == type(u'')

    is_binary = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if is_binary or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    out.write(s)
    out.flush()
2013-08-28 21:28:55 +09:00
def bytes_to_intlist(bs):
    """Return the items of the byte sequence bs as a list of integers"""
    if not bs:
        return []
    first = bs[0]
    if not isinstance(first, int):
        # Indexing produced a one-character string, so map each through ord
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3: indexing bytes already yields ints
2013-08-29 01:22:28 +09:00
2013-08-28 22:59:07 +09:00
def intlist_to_bytes(xs):
    """Return a byte string built from the integers (0..255) in xs.

    Raises ValueError if a value does not fit into a single byte.
    """
    if not xs:
        return b''
    # bytearray accepts an iterable of ints on Python 2 and 3 alike, and
    # bytes() of a bytearray is its raw content on both, so no interpreter
    # sniffing is needed.
    return bytes(bytearray(xs))
2013-10-02 15:41:03 +09:00
def get_cachedir(params=None):
    """Return the directory used for cached data.

    params is an optional options mapping; its 'cachedir' entry, when
    present, is returned as is. Otherwise the result is the 'youtube-dl'
    directory below $XDG_CACHE_HOME, falling back to ~/.cache when that
    variable is unset or empty.
    """
    if params is None:  # avoid a shared mutable default argument
        params = {}
    # An empty XDG_CACHE_HOME counts as unset
    cache_root = (os.environ.get('XDG_CACHE_HOME') or
                  os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
2013-10-06 11:27:09 +09:00
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the running
# platform: LockFileEx/UnlockFileEx via ctypes on Windows, fcntl.lockf
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raises OSError if LockFileEx fails"""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the very
        # same structure back to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure"""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl
    except ImportError:
        # fcntl is a Unix-only module. Do not make the whole module
        # unimportable where it is missing; fail only when a lock is
        # actually requested.
        def _lock_file(f, exclusive):
            """Always raises IOError: no locking primitive is available"""
            raise IOError('file locking is not supported on this platform')

        def _unlock_file(f):
            """Always raises IOError: no locking primitive is available"""
            raise IOError('file locking is not supported on this platform')
    else:
        def _lock_file(f, exclusive):
            """Take an exclusive or shared fcntl.lockf lock on f"""
            fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the fcntl.lockf lock held on f"""
            fcntl.lockf(f, fcntl.LOCK_UN)
class locked_file(object):
    """File wrapper that holds a lock on the file inside a with block.

    The file is opened in the constructor; entering the context takes the
    lock (shared for mode 'r', exclusive for 'a' and 'w'), leaving it
    releases the lock and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the handle is never leaked.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
2013-10-12 20:49:27 +09:00
def shell_quote(args):
    """Return the arguments in args as one shell-escaped command line string"""
    # pipes.quote is undocumented and the pipes module is gone in recent
    # Python versions; shlex.quote is the supported spelling where available.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote
    return ' '.join(map(quote, args))
2013-10-15 19:05:13 +09:00
2013-10-18 07:46:35 +09:00
def takewhile_inclusive(pred, seq):
    """Yield the items of seq up to and including the first one for which
    pred is false (itertools.takewhile would drop that last item)"""
    for item in seq:
        yield item
        if pred(item):
            continue
        # item was the first one failing pred: it has been yielded, stop here
        break
2013-10-15 19:05:13 +09:00
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    data is serialized as JSON, URL-encoded under the __youtubedl_smuggle
    key and appended to url after a '#'. Returns the resulting URL.
    """
    sdata = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return url + u'#' + sdata
def unsmuggle_url(smug_url):
    """Split a URL carrying smuggled data into (url, data).

    Returns (smug_url, None) when the URL carries no smuggled data;
    otherwise the part before the last '#' and the decoded JSON payload.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data