2012-03-25 10:07:37 +09:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
2014-11-02 19:37:49 +09:00
from __future__ import unicode_literals
2014-03-24 09:40:09 +09:00
import calendar
2014-04-05 06:00:51 +09:00
import codecs
2014-02-25 09:43:17 +09:00
import contextlib
2013-12-16 13:04:12 +09:00
import ctypes
2013-08-28 19:57:10 +09:00
import datetime
import email . utils
2013-05-13 16:20:08 +09:00
import errno
2012-03-25 10:07:37 +09:00
import gzip
2014-01-20 19:36:47 +09:00
import itertools
2012-11-28 08:09:17 +09:00
import io
2012-12-20 21:13:24 +09:00
import json
2012-03-25 10:07:37 +09:00
import locale
2013-11-25 11:12:26 +09:00
import math
2012-03-25 10:07:37 +09:00
import os
2013-10-12 20:49:27 +09:00
import pipes
2013-08-28 19:57:10 +09:00
import platform
2012-03-25 10:07:37 +09:00
import re
2013-11-24 14:37:14 +09:00
import ssl
2013-08-28 19:57:10 +09:00
import socket
2014-02-16 00:24:43 +09:00
import struct
2013-12-10 02:29:07 +09:00
import subprocess
2012-03-25 10:07:37 +09:00
import sys
2014-08-21 20:01:13 +09:00
import tempfile
2013-01-03 23:39:55 +09:00
import traceback
2014-03-11 01:31:32 +09:00
import xml . etree . ElementTree
2012-03-25 10:07:37 +09:00
import zlib
2014-11-02 19:23:40 +09:00
from . compat import (
compat_chr ,
compat_getenv ,
compat_html_entities ,
compat_parse_qs ,
compat_str ,
compat_urllib_error ,
compat_urllib_parse ,
compat_urllib_parse_urlparse ,
compat_urllib_request ,
compat_urlparse ,
)
2014-10-01 00:27:53 +09:00
2013-06-06 21:35:08 +09:00
# The type of a compiled regular expression is not exposed under a stable
# public name, so derive it from a throwaway pattern.
compiled_regex_type = type(re.compile(''))

# Default headers; YoutubeDLHandler adds them to every outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
2012-12-31 02:22:36 +09:00
2012-03-25 10:07:37 +09:00
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.

    Falls back to 'UTF-8' when the locale lookup fails or when the reported
    encoding cannot be used to encode text.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown encoding name raises LookupError here.
        'TEST'.encode(pref)
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit still propagate.
        pref = 'UTF-8'

    return pref
2012-03-25 10:07:37 +09:00
2012-12-20 21:13:24 +09:00
2014-08-21 20:01:13 +09:00
def write_json_file(obj, fn):
    """Encode obj as JSON and write it to fn, atomically if possible.

    The data is first written to a temporary file created next to ``fn``
    (same directory, ``<basename>.`` prefix, ``.tmp`` suffix) and then
    renamed over ``fn``. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """
    if sys.version_info < (3, 0):
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # os.rename refuses to overwrite an existing file on Windows,
            # so remove the target first (not atomic on that platform).
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except BaseException:
        # Clean up the temporary file on any failure, then re-raise.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching xpath whose attribute key equals val, or None."""
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        return next(
            (elem for elem in node.findall(xpath)
             if elem.attrib.get(key) == val),
            None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression unescaped, so
        # only a conservative character set is accepted.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
2013-10-13 04:34:04 +09:00
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every ``prefix:tag`` step of path to ``{namespace}tag``.

    ns_map maps prefixes to namespace URIs. Steps without a colon are kept
    unchanged.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
2012-03-25 10:07:37 +09:00
2014-09-13 16:09:55 +09:00
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    If nothing matches, return None, or raise ExtractorError when fatal is
    set. name is used in the error message and defaults to the xpath.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % label)
2012-04-11 07:22:51 +09:00
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    # Thin wrapper: an id lookup is an attribute lookup on "id".
    return get_element_by_attribute(attribute="id", value=id, html=html)
2014-11-05 07:20:39 +09:00
2012-12-19 23:21:14 +09:00
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Returns None when no tag carries attribute=value. The content found is
    passed through unescapeHTML; one surrounding pair of quote characters
    is stripped first when the content starts with a quote.
    """
    # Verbose-mode pattern: a tag name, any attributes, the wanted
    # attribute (optionally quoted), any attributes, the content, and the
    # matching closing tag.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if not found:
        return None

    content = found.group('content')
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
2013-09-14 05:05:29 +09:00
2012-04-11 07:22:51 +09:00
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Newlines in the input become spaces, <br> tags and </p><p> boundaries
    become newlines, all remaining tags are removed, HTML entities are
    unescaped and the result is stripped.

    Returns None when html is None, mirroring unescapeHTML.
    """
    if html is None:
        return None

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
2012-04-11 07:22:51 +09:00
2012-03-25 10:07:37 +09:00
def _sanitize_path_components(path):
    """Replace win32 forbidden characters with '#' in each component of path.

    The drive prefix (if any) and the path separators are kept as they are.
    """
    drive, rest = os.path.splitdrive(path)
    separators = os.sep + (os.altsep or '')
    # Splitting on a capturing group yields text, separator, text, ...
    # so the pieces at odd indices are the separators themselves.
    pieces = re.split('([%s])' % re.escape(separators), rest)
    return drive + ''.join(
        piece if index % 2
        else re.sub('[/<>:"\\|\\\\?\\*]', '#', piece)
        for index, piece in enumerate(pieces))


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with the win32 forbidden characters of every path component replaced,
    and if that fails too (or there is nothing to replace) it raises the
    exception, like the standard open() function.

    The special name '-' selects standard output (its binary buffer where
    available). Permission errors (EACCES) are re-raised immediately.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = _sanitize_path_components(filename)
        if alt_filename == filename:
            raise

        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
2012-03-25 10:07:37 +09:00
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
2012-11-27 07:58:46 +09:00
2012-12-03 23:36:24 +09:00
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible: the underscore clean-up at the end is then skipped.
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (
                char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
2012-03-25 10:07:37 +09:00
def orderedSet(iterable):
    """Return a list of the elements of iterable without duplicates, in first-seen order"""
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
2012-03-25 10:07:37 +09:00
2014-03-24 09:40:09 +09:00
2014-08-28 02:11:45 +09:00
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up in
    compat_html_entities.name2codepoint; numeric references may be decimal
    (#233) or hexadecimal (#xe9 / #XE9). Anything that cannot be resolved
    is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references need the digits a-f as well as 0-9.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in 'xX':
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: fall through and keep
            # the reference as written.
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
2014-08-28 02:11:45 +09:00
2012-03-25 10:07:37 +09:00
def unescapeHTML(s):
    """Replace every '&...;' entity in s using _htmlentity_transform.

    None is passed through unchanged; any other input must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def substitute(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', substitute, s)
2012-03-25 10:07:37 +09:00
2014-01-05 11:07:55 +09:00
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call

    On Python 3 the name is returned unchanged. On Python 2 it is encoded
    to bytes, except on Windows 2000 and up when not used for a subprocess.
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)

    if unicode_windows and not for_subprocess:
        # Pass the text through directly to use Unicode APIs on
        # Windows 2000 and up
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
2014-05-16 22:47:54 +09:00
def encodeArgument(s):
    """Encode a command line argument via encodeFilename(..., for_subprocess=True).

    Byte strings are still accepted and decoded as ASCII first.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # TODO: turn this into an assertion once all post processors pass
        # compat_str values.
        text = s.decode('ascii')
    return encodeFilename(text, True)
2013-02-22 01:09:39 +09:00
def decodeOption(optval):
    """Return optval as a compat_str, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return None

    if isinstance(optval, bytes):
        decoded = optval.decode(preferredencoding())
    else:
        decoded = optval

    assert isinstance(decoded, compat_str)
    return decoded
2013-01-02 04:27:53 +09:00
2013-05-04 19:02:18 +09:00
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The boundaries are inclusive: exactly one hour is '1:00:00' (not
    '60:00') and exactly one minute is '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
2013-12-29 23:28:32 +09:00
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Return an HTTPS handler suited to the running Python version.

    opts_no_check_certificate disables certificate verification on the
    branches that build an SSL context. Extra keyword arguments are passed
    to the handler constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to SSLv23 negotiation.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # SERVER_AUTH is the purpose for client-side sockets that verify
        # a server; CLIENT_AUTH would build a server-side context that
        # never checks the peer certificate.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        # NOTE(security): SSLv3 is deliberately left enabled here.
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname must be off before verify_mode may be relaxed.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
2013-05-04 19:19:02 +09:00
2013-01-02 04:27:53 +09:00
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.

        video_id, if given, is prepended to the message; cause, if truthy,
        is appended to it.
        """
        # Errors raised while one of these exceptions is being handled are
        # always treated as expected.
        expected_types = (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg = msg + '; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
2013-01-03 23:39:55 +09:00
2013-01-02 04:27:53 +09:00
2013-10-23 21:38:03 +09:00
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
2012-03-25 10:07:37 +09:00
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
2012-03-25 10:07:37 +09:00
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
2012-03-25 10:07:37 +09:00
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is kept on the instance under .msg.
        setattr(self, 'msg', msg)
2012-03-25 10:07:37 +09:00
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
2012-03-25 10:07:37 +09:00
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available for
    that video.
    """
2012-03-25 10:07:37 +09:00
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    # Sizes, both in bytes; None until an instance sets them.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
2012-03-25 10:07:37 +09:00
2013-08-28 06:15:01 +09:00
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting code by hand where the constructor does not take it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers and resolve the Youtubedl-* pseudo headers."""
        for h, v in std_headers.items():
            # Request.add_header stores names through str.capitalize()
            # (see http://bugs.python.org/issue2275), so the lookup has to
            # use the same form or caller-supplied headers get overwritten.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate Content-encoding transparently decoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment.
    https_request = http_request
    https_response = http_response
2013-04-27 22:14:20 +09:00
2014-02-06 19:29:46 +09:00
2014-05-18 02:04:02 +09:00
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally followed
    by fractional seconds (which are ignored) and by 'Z' or a UTC offset
    such as +0100 or -05:00. Without an offset the time is taken as UTC.
    Returns None when date_str is None.
    """

    if date_str is None:
        return None

    # Fraction and timezone are each optional, so fractional seconds
    # without a designator are stripped too. The pattern always matches,
    # at worst as an empty string at the end.
    m = re.search(
        r'(\.[0-9]+)?(?:Z| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}))?$',
        date_str)
    date_str = date_str[:m.start()]

    if not m.group('sign'):
        timezone = datetime.timedelta()
    else:
        sign = 1 if m.group('sign') == '+' else -1
        timezone = datetime.timedelta(
            hours=sign * int(m.group('hours')),
            minutes=sign * int(m.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
2013-04-27 22:14:20 +09:00
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A list of known formats is tried in order; if none fits, the string is
    parsed as an RFC 2822 date. Returns None when date_str is None or
    cannot be parsed.
    """

    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %drd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue
        # A format matched; no need to try the remaining ones.
        break
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
2014-11-17 15:16:12 +09:00
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last '.' before the first '?'. It is
    returned only if it is purely alphanumeric; otherwise, or when url is
    None, default_ext is returned.
    """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
2013-07-08 08:13:55 +09:00
2013-07-20 19:48:57 +09:00
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'"""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
2013-07-20 19:48:57 +09:00
2013-04-27 21:01:55 +09:00
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today

    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount

    # timedelta keyword and multiplier for every supported unit.
    # A bad approximation?
    conversions = {
        'day': ('days', 1),
        'week': ('weeks', 1),
        'month': ('days', 30),
        'year': ('days', 365),
    }
    keyword, factor = conversions[match.group('unit')]
    return today + datetime.timedelta(**{keyword: amount * factor})
2014-01-02 21:47:28 +09:00
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
2013-04-27 21:01:55 +09:00
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing bound defaults to the earliest / latest representable date.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date
        if not isinstance(candidate, datetime.date):
            candidate = date_from_str(candidate)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
2013-08-28 19:57:10 +09:00
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode with the preferred encoding when platform() returns bytes.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
2013-08-29 01:22:28 +09:00
2014-04-08 05:48:13 +09:00
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> GetStdHandle identifier (1 -> -11, 2 -> -12).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    kernel32 = ctypes.windll.kernel32
    wintypes = ctypes.wintypes

    GetStdHandle = ctypes.WINFUNCTYPE(
        wintypes.HANDLE, wintypes.DWORD)(("GetStdHandle", kernel32))
    handle = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, wintypes.LPWSTR,
        wintypes.DWORD, ctypes.POINTER(wintypes.DWORD),
        wintypes.LPVOID)(("WriteConsoleW", kernel32))
    written = wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(
        wintypes.DWORD, wintypes.DWORD)(("GetFileType", kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE,
        ctypes.POINTER(wintypes.DWORD))(("GetConsoleMode", kernel32))
    INVALID_HANDLE_VALUE = wintypes.DWORD(-1).value

    def not_a_console(h):
        if h == INVALID_HANDLE_VALUE or h is None:
            return True
        if (GetFileType(h) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
            return True
        return GetConsoleMode(h, ctypes.byref(wintypes.DWORD())) == 0

    if not_a_console(handle):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none.
        for index, char in enumerate(text):
            if ord(char) > 0xffff:
                return index
        return len(text)

    remaining = s
    while remaining:
        # Write at most 1024 characters, stopping before any non-BMP one.
        bmp_run = min(next_nonbmp_pos(remaining), 1024)

        # An empty run means the text starts with a non-BMP character,
        # which is written on its own with a count of 2.
        ret = WriteConsoleW(
            handle, remaining, bmp_run if bmp_run else 2,
            ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if bmp_run:
            assert written.value > 0
            remaining = remaining[written.value:]
        else:  # We just wrote a non-BMP character
            assert written.value == 2
            remaining = remaining[1:]
    return True
2014-04-08 02:57:42 +09:00
def write_string(s, out=None, encoding=None):
    """Write the text `s` to `out` (sys.stderr by default) and flush it.

    `s` must be a compat_str. On win32, when no encoding is requested and the
    stream has a fileno, the Windows console writer is tried first.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    if 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
2013-08-28 21:28:55 +09:00
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and 1-character strings on Python 2
    if not isinstance(bs[0], int):
        return [ord(ch) for ch in bs]
    return list(bs)
2013-08-29 01:22:28 +09:00
2013-08-28 22:59:07 +09:00
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one 'B' item each."""
    if xs:
        fmt = '%dB' % len(xs)
        return struct_pack(fmt, *xs)
    return b''
2013-10-02 15:41:03 +09:00
2013-10-06 11:27:09 +09:00
# Cross-platform file locking
if sys.platform != 'win32':
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock() lock on `f`."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """Release the flock() lock held on `f`."""
        fcntl.flock(f, fcntl.LOCK_UN)
else:
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Byte-range size (low/high DWORD) passed to lock and unlock calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock `f` with LockFileEx; raises OSError if the call fails."""
        ov = OVERLAPPED()
        ov.Offset = 0
        ov.OffsetHigh = 0
        ov.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(ov)
        os_handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(os_handle, flags, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock `f` with UnlockFileEx; raises OSError if the call fails."""
        assert f._lock_file_overlapped_p
        os_handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(os_handle, 0,
                                whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2013-10-06 11:27:09 +09:00
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened in __init__ with io.open; entering the context takes a
    shared lock for mode 'r' and an exclusive lock for 'a'/'w'; leaving it
    unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError on
            # Python 2; catch both so the handle is never leaked.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
2013-10-12 20:49:27 +09:00
2014-10-01 00:27:53 +09:00
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when it is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2013-10-12 20:49:27 +09:00
def shell_quote(args):
    """Quote every argument with pipes.quote and join them with spaces."""
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        if isinstance(arg, bytes):
            return arg.decode(fs_encoding)
        return arg

    return ' '.join([pipes.quote(as_text(a)) for a in args])
2013-10-15 19:05:13 +09:00
2013-10-18 07:46:35 +09:00
def takewhile_inclusive(pred, seq):
    """Like itertools.takewhile, but also yield the first element for which
    pred(e) is false before stopping."""
    for item in seq:
        yield item
        if pred(item):
            continue
        break
2013-10-15 19:05:13 +09:00
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    `data` is JSON-encoded, url-encoded under the key '__youtubedl_smuggle'
    and appended to `url` after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + '#' + sdata
2013-10-15 19:05:13 +09:00
2014-01-07 13:34:14 +09:00
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled payload.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default

    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
2013-11-25 11:12:26 +09:00
def format_bytes(bytes):
    """Format a byte count with a binary-prefix suffix, e.g. '1.50KiB'.

    Returns 'N/A' for None. A str argument is converted with float() first.
    The suffix is clamped to the available range, so very large values are
    expressed in YiB and positive values below 1 in B instead of raising
    IndexError or picking a suffix from the wrong end of the table.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
2013-12-06 21:36:36 +09:00
2013-12-10 02:29:07 +09:00
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable takes precedence; otherwise the second
    field of the output of `stty size` is used.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but do
        # not swallow KeyboardInterrupt/SystemExit as a bare except would.
        pass
    return None
2013-12-10 03:39:41 +09:00
def month_by_name(name):
    """Return the number (1-12) of a month given its English name
    (locale-independent), or None if the name is not recognised."""
    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']

    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
2013-12-11 05:03:53 +09:00
2014-01-21 06:11:34 +09:00
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in an XML string with the '&amp;' entity.

    Ampersands that already start one of the predefined entities (amp, lt,
    gt, apos, quot) or a numeric character reference are left untouched.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
2013-12-16 13:04:12 +09:00
def setproctitle(title):
    """Set the process name via libc's prctl(); silently does nothing when
    libc.so.6 cannot be loaded or exposes no prctl."""
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return

    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes allocates len + 1 and appends
    # the NUL terminator; a buffer of exactly len(title_bytes) would hand
    # prctl an unterminated string.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2013-12-16 21:56:13 +09:00
def remove_start(s, start):
    """Return `s` without the prefix `start`, or `s` unchanged if absent."""
    return s[len(start):] if s.startswith(start) else s
2013-12-17 12:13:36 +09:00
2014-08-23 01:40:26 +09:00
def remove_end(s, end):
    """Return `s` without the suffix `end`, or `s` unchanged if absent.

    An empty `end` leaves `s` untouched (s[:-0] would otherwise yield '').
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
2013-12-17 12:13:36 +09:00
def url_basename(url):
    """Return the last '/'-separated segment of the URL's path."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
2013-12-21 01:05:28 +09:00
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD."""

    def get_method(self):
        method = "HEAD"
        return method
2013-12-25 23:18:40 +09:00
2014-07-21 19:02:44 +09:00
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to int, multiply by `invscale` and floor-divide by `scale`.

    With `get_attr`, the named attribute of `v` is converted instead.
    None and '' produce `default`.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
2014-08-10 20:04:45 +09:00
2014-08-10 18:00:14 +09:00
def str_or_none(v, default=None):
    """Return `v` converted with compat_str, or `default` when `v` is None."""
    if v is None:
        return default
    return compat_str(v)
2014-07-21 19:02:44 +09:00
def str_to_int(int_str):
    """A more relaxed version of int_or_none: ',', '.' and '+' characters are
    removed before conversion. None is passed through."""
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
2013-12-26 21:49:44 +09:00
2014-07-21 19:02:44 +09:00
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or `default` when `v` is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
2014-03-29 07:06:34 +09:00
2013-12-26 21:49:44 +09:00
def parse_duration(s):
    """Parse a duration such as '1:02:03', '2h 5m 3s' or '12.5' into seconds.

    Returns None for None or for a string that does not match; the result is
    a float only when a fractional part is present.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
            (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        )?
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''',
        s.strip())
    if not m:
        return None

    hours, mins, fraction = m.group('hours'), m.group('mins'), m.group('ms')
    total = int(m.group('secs'))
    if mins:
        total += int(mins) * 60
    if hours:
        total += int(hours) * 60 * 60
    if fraction:
        # The 'ms' group includes the leading dot, e.g. '.05'
        total += float(fraction)
    return total
2014-01-03 20:52:27 +09:00
def prepend_extension(filename, ext):
    """Insert `ext` in front of the file's real extension:
    ('video.mp4', 'temp') gives 'video.temp.mp4'."""
    root, suffix = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(root, ext, suffix)
2014-01-07 14:23:41 +09:00
def check_executable(exe, args=[]):
    """Check whether the given binary can be started, and return its name.

    args can be a list of arguments for a short output (like -version).
    Returns False when launching the process raises OSError.
    """
    command = [exe] + args
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
2014-01-20 19:36:47 +09:00
2014-11-02 18:50:30 +09:00
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized='present'):
    """Return the version of the specified executable,
    or False if the executable is not present.

    The executable is run with `args`; `version_re` is searched in the first
    line of its combined stdout/stderr and group 1 is returned. When nothing
    matches, `unrecognized` is returned.

    The default pattern lists '-' first in the character class so that it is
    a literal hyphen: written as '_-a' it formed a range and versions such as
    '1.2-beta' were cut at the hyphen.
    """
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False

    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    return unrecognized
2014-01-20 19:36:47 +09:00
class PagedList(object):
    """Base class for paged lists; relies on a getslice() method that is not
    defined here."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
2014-09-29 07:36:06 +09:00
class OnDemandPagedList(PagedList):
    """Paged list that calls `pagefunc(pagenum)` for one page at a time and
    stops as soon as the requested slice is complete or a short page is seen."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list (to the end if None)."""
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            firstid = pagenum * size
            nextfirstid = pagenum * size + size
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` inside this page (0 unless the page holds it)
            if firstid <= start < nextfirstid:
                startv = start % size
            else:
                startv = 0
            # Cut-off inside this page when `end` falls on or within it
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % size) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
2014-02-10 01:56:10 +09:00
2014-09-29 07:36:06 +09:00
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is given up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list (to the last page if
        `end` is None)."""
        size = self._pagesize
        first_page = start // size
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // size + 1
            remaining = end - start
        # Items to drop from the front of the first fetched page
        to_skip = start - first_page * size

        collected = []
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) < remaining:
                    remaining -= len(page)
                else:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
            collected.extend(page)
        return collected
2014-02-10 01:56:10 +09:00
def uppercase_escape(s):
    """Decode every literal \\UXXXXXXXX escape sequence found in `s`."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed); keep the text
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2014-02-16 00:24:43 +09:00
2014-09-13 22:59:16 +09:00
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    is_py2 = sys.version_info < (3, 0)
    if is_py2 and isinstance(s, unicode):
        s = s.encode('utf-8')
    # Characters in the second argument are passed to quote() as safe
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
2014-09-13 22:59:16 +09:00
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only these four components are escaped; the rest is kept as parsed
    escaped = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
2014-02-16 00:24:43 +09:00
# Pick struct_pack/struct_unpack depending on whether struct accepts the
# kind of format string this module's literals produce.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack wrapper that encodes a text format spec to ASCII."""
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(byte_spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack wrapper that encodes a text format spec to ASCII."""
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(byte_spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
2014-02-25 09:43:17 +09:00
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line, and close it.

    Lines that are empty or start with '#', ';' or ']' are dropped. Byte
    lines are decoded as UTF-8, and a leading byte order mark is removed both
    in its decoded form ('\\ufeff') and in the form it takes when the three
    BOM bytes were each read as a separate character.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # strip() does not remove U+FEFF, so both spellings are cut off here
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2014-03-07 23:25:33 +09:00
def urlencode_postdata(*args, **kargs):
    """url-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2014-03-11 01:31:32 +09:00
2014-08-26 01:03:01 +09:00
# Element.iter is missing on Python <=2.6; fall back to a findall query there
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:
    etree_iter = lambda n: n.findall('.//*')
2014-03-11 01:31:32 +09:00
def parse_xml(s):
    """Parse the text `s` as XML and return the root element.

    Doctype declarations are ignored. On Python 2, element texts that are
    not compat_str are decoded from UTF-8.
    """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    # The parser keyword is only passed on Python 2.7 and later
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        kwargs = {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
2014-03-18 22:27:42 +09:00
2014-03-21 08:59:51 +09:00
# Rating label -> age limit used by parse_age_limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Turn an age string such as '18' or '16+', or a US_RATINGS label, into
    an integer age limit. Returns None for None and for unknown values."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
2014-10-03 21:37:25 +09:00
2014-03-25 07:21:20 +09:00
def strip_jsonp(code):
    """Strip a JSONP wrapper `callback( ... );`, with optional trailing
    `//` comments, and return what was between the parentheses."""
    wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper, r'\1', code)
2014-04-21 14:12:02 +09:00
2014-08-22 09:33:29 +09:00
def js_to_json(code):
    """Rewrite a JavaScript object/array literal so that it parses as JSON.

    Single-quoted strings become double-quoted, bare identifiers are quoted
    (true/false/null are kept), and a trailing comma before ']' is removed.
    """
    def fix_kv(m):
        token = m.group(0)
        # Literals and strings that are already double-quoted pass through
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = token[1:-1]
            # Re-escape for double quotes: \' loses its backslash, " gains one
            token = re.sub(r"\\\\|\\'|\"", lambda esc: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[esc.group(0)], token)
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
2014-04-21 14:12:02 +09:00
def qualities(quality_ids):
    """Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in `quality_ids`,
    or to -1 when the id is not in the list.
    """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2014-04-30 17:02:03 +09:00
# Default output template; a %-format string with title, id and ext fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2014-05-16 19:03:59 +09:00
2014-09-15 22:10:24 +09:00
def limit_length(s, length):
    """Add ellipses to overly long strings.

    Strings longer than `length` are cut so that the result, including the
    trailing '...', is `length` characters long. None is passed through.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
2014-10-27 00:46:34 +09:00
def version_tuple(v):
    """Split a dotted version string into a list of ints.

    Raises ValueError if a component is not an integer.
    """
    components = v.split('.')
    return [int(part) for part in components]
def is_outdated_version(version, limit, assume_new=True):
    """Return True if `version` is lower than `limit`.

    When `version` is empty or either version cannot be parsed, the answer is
    `not assume_new`.
    """
    unknown_result = not assume_new
    if not version:
        return unknown_result
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return unknown_result