2015-10-18 18:07:48 +09:00
# coding: utf-8
2015-06-28 04:22:25 +09:00
from __future__ import unicode_literals
import re
from . common import InfoExtractor
from . . utils import (
float_or_none ,
2015-10-18 18:11:55 +09:00
xpath_text ,
2015-10-18 19:04:13 +09:00
remove_end ,
2015-11-12 03:13:42 +09:00
int_or_none ,
ExtractorError ,
2015-11-22 01:18:17 +09:00
sanitized_Request ,
2015-06-28 04:22:25 +09:00
)
2016-02-21 17:41:24 +09:00
class TwitterBaseIE(InfoExtractor):
    """Common base for the Twitter extractors that resolve VMAP documents."""

    def _get_vmap_video_url(self, vmap_url, video_id):
        """Return the media URL advertised by a VMAP document.

        Downloads the XML at ``vmap_url`` (``video_id`` is passed through to
        the download helper) and returns the whitespace-stripped text of the
        ``MediaFile`` element located with the ``.//MediaFile`` XPath.

        Raises ExtractorError when no ``MediaFile`` text is available.
        """
        vmap_data = self._download_xml(vmap_url, video_id)
        media_url = xpath_text(vmap_data, './/MediaFile')
        # Without this guard a document lacking <MediaFile> would fail on
        # ``None.strip()`` with an AttributeError that says nothing about the
        # actual problem.
        if media_url is None:
            raise ExtractorError('Unable to find MediaFile in VMAP data')
        return media_url.strip()
class TwitterCardIE(TwitterBaseIE):
    """Extractor for Twitter card pages (``/i/cards/tfw/v1/<id>``).

    A card either embeds a YouTube/Vine iframe (the extraction is then handed
    over to that URL) or carries a ``data-player-config`` JSON blob describing
    the video directly.
    """

    IE_NAME = 'twitter:card'
    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
            # MD5 checksums are different in different places
            'info_dict': {
                'id': '560070183650213889',
                'ext': 'mp4',
                'title': 'TwitterCard',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 30.033,
            }
        },
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
            'md5': '7ee2a553b63d1bccba97fbed97d9e1c8',
            'info_dict': {
                'id': '623160978427936768',
                'ext': 'mp4',
                'title': 'TwitterCard',
                'thumbnail': r're:^https?://.*\.jpg',
                'duration': 80.155,
            },
        },
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
            'md5': 'd4724ffe6d2437886d004fa5de1043b3',
            'info_dict': {
                'id': 'dq4Oj5quskI',
                'ext': 'mp4',
                'title': 'Ubuntu 11.10 Overview',
                'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/',
                'upload_date': '20111013',
                'uploader': 'OMG! Ubuntu!',
                'uploader_id': 'omgubuntu',
            },
            'add_ie': ['Youtube'],
        },
        {
            'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568',
            'md5': 'ab2745d0b0ce53319a534fccaa986439',
            'info_dict': {
                'id': 'iBb2x00UVlv',
                'ext': 'mp4',
                'upload_date': '20151113',
                'uploader_id': '1189339351084113920',
                'uploader': 'ArsenalTerje',
                'title': 'Vine by ArsenalTerje',
            },
            'add_ie': ['Vine'],
        }
    ]

    def _real_extract(self, url):
        """Extract the video behind a card URL.

        The card page is requested once per entry in ``USER_AGENTS``. For each
        response:

        * an embedded YouTube/Vine iframe short-circuits everything and its
          URL is returned via ``url_result``;
        * otherwise the ``data-player-config`` JSON is parsed; a config with a
          ``playlist`` contributes one format (with width/height taken from
          the URL when it contains ``/<w>x<h>/``), a config with only a
          ``vmapUrl`` contributes the VMAP media URL and ends the loop.

        Returns an info dict with id, fixed title, thumbnail, duration and
        the collected formats.
        """
        video_id = self._match_id(url)

        # Different formats served for different User-Agents
        USER_AGENTS = [
            'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',  # mp4
            'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',  # webm
        ]

        config = None
        formats = []
        for user_agent in USER_AGENTS:
            request = sanitized_Request(url)
            request.add_header('User-Agent', user_agent)
            webpage = self._download_webpage(request, video_id)

            # Dots in the YouTube host are escaped so that they only match a
            # literal '.', in line with the vine.co alternative.
            iframe_url = self._html_search_regex(
                r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
                webpage, 'video iframe', default=None)
            if iframe_url:
                return self.url_result(iframe_url)

            config = self._parse_json(self._html_search_regex(
                r'data-player-config="([^"]+)"', webpage, 'data player config'),
                video_id)

            if 'playlist' not in config:
                if 'vmapUrl' in config:
                    formats.append({
                        'url': self._get_vmap_video_url(config['vmapUrl'], video_id),
                    })
                    break   # same video regardless of UA
                continue

            video_url = config['playlist'][0]['source']

            f = {
                'url': video_url,
            }

            # Dimensions are only known when encoded in the URL path.
            m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
            if m:
                f.update({
                    'width': int(m.group('width')),
                    'height': int(m.group('height')),
                })
            formats.append(f)
        self._sort_formats(formats)

        # Metadata comes from the config of the last page that was parsed.
        thumbnail = config.get('posterImageUrl')
        duration = float_or_none(config.get('duration'))

        return {
            'id': video_id,
            'title': 'TwitterCard',
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
2015-07-22 06:38:40 +09:00
2015-10-18 18:16:57 +09:00
class TwitterIE(InfoExtractor):
    """Extractor for individual tweets (``twitter.com/<user>/status/<id>``).

    Tweets that reference a Twitter card are delegated to the ``TwitterCard``
    extractor; tweets carrying an animated GIF are resolved directly from the
    ``<video class="animated-gif">`` markup.
    """

    IE_NAME = 'twitter'
    _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)'
    _TEMPLATE_URL = 'https://twitter.com/%s/status/%s'

    _TESTS = [{
        'url': 'https://twitter.com/freethenipple/status/643211948184596480',
        # MD5 checksums are different in different places
        'info_dict': {
            'id': '643211948184596480',
            'ext': 'mp4',
            'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
            'thumbnail': r're:^https?://.*\.jpg',
            'duration': 12.922,
            'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"',
            'uploader': 'FREE THE NIPPLE',
            'uploader_id': 'freethenipple',
        },
    }, {
        'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
        'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
        'info_dict': {
            'id': '657991469417025536',
            'ext': 'mp4',
            'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai',
            'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"',
            'thumbnail': r're:^https?://.*\.png',
            'uploader': 'Gifs',
            'uploader_id': 'giphz',
        },
        'expected_warnings': ['height', 'width'],
    }, {
        'url': 'https://twitter.com/starwars/status/665052190608723968',
        'md5': '39b7199856dee6cd4432e72c74bc69d4',
        'info_dict': {
            'id': '665052190608723968',
            'ext': 'mp4',
            'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.',
            'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."',
            'uploader_id': 'starwars',
            'uploader': 'Star Wars',
        },
    }]

    def _real_extract(self, url):
        """Extract the video attached to a tweet.

        The canonical tweet page (built from ``_TEMPLATE_URL``) is downloaded
        and the uploader name, title and description are derived from its
        OpenGraph title/description.

        Returns a ``url_transparent`` result pointing at the Twitter card when
        the page references one, or a direct info dict for an animated GIF.

        Raises ExtractorError when neither kind of video is found.
        """
        mobj = re.match(self._VALID_URL, url)
        user_id = mobj.group('user_id')
        twid = mobj.group('id')

        webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid)

        username = remove_end(self._og_search_title(webpage), ' on Twitter')

        # NOTE(review): strip('') removes nothing; it is kept so behaviour is
        # unchanged -- confirm whether a plain strip() was intended.
        title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”')

        # strip 'https -_t.co_BJYgOjSeGA' junk from filenames
        title = re.sub(r'\s+(https?://[^ ]+)', '', title)

        info = {
            'uploader_id': user_id,
            'uploader': username,
            'webpage_url': url,
            'description': '%s on Twitter: "%s"' % (username, description),
            'title': username + ' - ' + title,
        }

        # A referenced card takes precedence; the metadata gathered above is
        # carried along in the url_transparent result.
        card_id = self._search_regex(
            r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None)
        if card_id:
            card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id
            info.update({
                '_type': 'url_transparent',
                'ie_key': 'TwitterCard',
                'url': card_url,
            })
            return info

        mobj = re.search(r'''(?x)
            <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*
                <source[^>]+video-src="(?P<url>[^"]+)"
        ''', webpage)

        if mobj:
            # Remaining attributes of the <video> tag; each lookup below is
            # non-fatal, so a missing attribute does not abort extraction.
            more_info = mobj.group('more_info')
            height = int_or_none(self._search_regex(
                r'data-height="(\d+)"', more_info, 'height', fatal=False))
            width = int_or_none(self._search_regex(
                r'data-width="(\d+)"', more_info, 'width', fatal=False))
            thumbnail = self._search_regex(
                r'poster="([^"]+)"', more_info, 'poster', fatal=False)
            info.update({
                'id': twid,
                'url': mobj.group('url'),
                'height': height,
                'width': width,
                'thumbnail': thumbnail,
            })
            return info

        raise ExtractorError('There\'s no video in this tweet.')
2016-02-21 17:41:24 +09:00
class TwitterAmplifyIE(TwitterBaseIE):
    """Extractor for Twitter Amplify videos hosted on ``amp.twimg.com``."""

    IE_NAME = 'twitter:amplify'
    # Raw literal: the pattern contains the regex escapes '\.' and '\-'.
    _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})'

    _TEST = {
        'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
        'md5': '7df102d0b9fd7066b86f3159f8e81bf6',
        'info_dict': {
            'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
            'ext': 'mp4',
            'title': 'Twitter Video',
            'thumbnail': 're:^https?://.*',
        },
    }

    def _real_extract(self, url):
        """Extract an Amplify video from its landing page.

        The video URL is resolved through the VMAP document named by the
        ``twitter:amplify:vmap`` meta tag. The thumbnail and the image/player
        dimensions are read from the corresponding ``twitter:*`` meta tags;
        those lookups are non-fatal.

        Returns an info dict with a single format and zero or one thumbnails.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        vmap_url = self._html_search_meta(
            'twitter:amplify:vmap', webpage, 'vmap url')
        video_url = self._get_vmap_video_url(vmap_url, video_id)

        thumbnails = []
        thumbnail = self._html_search_meta(
            'twitter:image:src', webpage, 'thumbnail', fatal=False)

        def _find_dimension(target):
            # Read the twitter:<target>:width / :height meta tags as a
            # (width, height) pair passed through int_or_none.
            w = int_or_none(self._html_search_meta(
                'twitter:%s:width' % target, webpage, fatal=False))
            h = int_or_none(self._html_search_meta(
                'twitter:%s:height' % target, webpage, fatal=False))
            return w, h

        if thumbnail:
            thumbnail_w, thumbnail_h = _find_dimension('image')
            thumbnails.append({
                'url': thumbnail,
                'width': thumbnail_w,
                'height': thumbnail_h,
            })

        video_w, video_h = _find_dimension('player')
        formats = [{
            'url': video_url,
            'width': video_w,
            'height': video_h,
        }]

        return {
            'id': video_id,
            'title': 'Twitter Video',
            'formats': formats,
            'thumbnails': thumbnails,
        }