class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Subclasses that call _login() must additionally provide the attributes
    and methods it reads, none of which are defined here: _LOGIN_URL,
    _LOGIN_FORM, _FAILED_LOGIN, report_login(), and -- depending on the
    code path -- _NETRC_MACHINE, _LANG_URL and report_lang().
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() runs at most once per instance; later calls are
        no-ops because _ready is set after the first successful run.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    def _build_login_form(self, username, password):
        """Return a copy of _LOGIN_FORM with the credentials filled in.

        Values equal to the placeholder strings "username" and "password"
        are replaced by the given credentials; every other entry is kept.
        The class-level _LOGIN_FORM is never modified, so the placeholders
        survive for later logins and credentials do not linger in shared
        class state.
        """
        form = dict(self._LOGIN_FORM)
        for key, value in list(form.items()):
            if value == "username":
                form[key] = username
            elif value == "password":
                form[key] = password
        return form

    def _login(self):
        """Log in with the downloader's credentials, if any.

        Credentials come from the downloader params 'username' and
        'password' or, when 'usenetrc' is set, from the .netrc entry for
        _NETRC_MACHINE. If the subclass defines _LANG_URL, that URL is
        fetched before logging in.

        Returns the urllib2.Request that was used to log in on success,
        and False in every other case (no downloader, no credentials,
        .netrc problems, network errors, rejected credentials). Problems
        are reported as warnings through the downloader's to_stderr().
        """
        if self._downloader is None:
            return False

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) and \
                downloader_params.get('password', None):
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                username = info[0]
                password = info[2]
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return False

        # Set language (done even when there are no credentials)
        if hasattr(self, "_LANG_URL"):
            request = urllib2.Request(self._LANG_URL)
            try:
                self.report_lang()
                urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                return False

        # No authentication to be performed
        if username is None:
            return False

        # Set login credentials on a private copy of the form
        login_form = self._build_login_form(username, password)
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            if re.search(self._FAILED_LOGIN, login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                # Same failure value as every other error path.
                return False
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return False
        return request