diff --git a/devscripts/wine-py2exe.sh b/devscripts/wine-py2exe.sh
index 319ffcbc8..dc2d6501a 100644
--- a/devscripts/wine-py2exe.sh
+++ b/devscripts/wine-py2exe.sh
@@ -18,7 +18,6 @@ if [ ! -d wine-py2exe ]; then
axel -a "http://www.python.org/ftp/python/2.7/python-2.7.msi"
axel -a "http://downloads.sourceforge.net/project/py2exe/py2exe/0.6.9/py2exe-0.6.9.win32-py2.7.exe"
- axel -a "http://pypi.python.org/packages/2.7/l/lxml/lxml-2.3.win32-py2.7.exe"
#axel -a "http://winetricks.org/winetricks"
# http://appdb.winehq.org/objectManager.php?sClass=version&iId=21957
@@ -28,13 +27,9 @@ if [ ! -d wine-py2exe ]; then
echo "Follow py2exe setup on screen"
wine py2exe-0.6.9.win32-py2.7.exe
- echo "Follow lxml setup on screen"
- wine lxml-2.3.win32-py2.7.exe
-
#echo "Follow Microsoft Visual C++ 2008 Redistributable Package setup on screen"
#bash winetricks vcrun2008
- rm lxml-2.3.win32-py2.7.exe
rm py2exe-0.6.9.win32-py2.7.exe
rm python-2.7.msi
#rm winetricks
diff --git a/youtube-dl b/youtube-dl
index 7e4640c66..c4b5c07ca 100755
Binary files a/youtube-dl and b/youtube-dl differ
diff --git a/youtube-dl.exe b/youtube-dl.exe
index ec793ecee..cb9654283 100755
Binary files a/youtube-dl.exe and b/youtube-dl.exe differ
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 4314f1402..d77154dcb 100644
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -24,11 +24,6 @@ try:
except ImportError:
from cgi import parse_qs
-try:
- import lxml.etree
-except ImportError:
- pass # Handled below
-
try:
import xml.etree.ElementTree
except ImportError: # Python<2.5: Not officially supported, but let it slip
@@ -193,8 +188,8 @@ class YoutubeIE(InfoExtractor):
end = start + float(dur)
start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
- caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
- caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
+ caption = unescapeHTML(caption)
+ caption = unescapeHTML(caption) # double cycle, intentional
srt += str(n) + '\n'
srt += start + ' --> ' + end + '\n'
srt += caption + '\n\n'
@@ -364,18 +359,9 @@ class YoutubeIE(InfoExtractor):
pass
# description
- try:
- lxml.etree
- except NameError:
- video_description = u'No description available.'
- mobj = re.search(r'', video_webpage)
- if mobj is not None:
- video_description = mobj.group(1).decode('utf-8')
- else:
- html_parser = lxml.etree.HTMLParser(encoding='utf-8')
- vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
- video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
- # TODO use another parser
+ video_description = get_element_by_id("eow-description", video_webpage)
+ if video_description: video_description = clean_html(video_description.decode('utf8'))
+ else: video_description = ''
# closed captions
video_subtitles = None
@@ -992,7 +978,7 @@ class YahooIE(InfoExtractor):
self._downloader.trouble(u'ERROR: Unable to extract media URL')
return
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
- video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
+ video_url = unescapeHTML(video_url)
return [{
'id': video_id.decode('utf-8'),
@@ -1069,18 +1055,9 @@ class VimeoIE(InfoExtractor):
video_thumbnail = config["video"]["thumbnail"]
# Extract video description
- try:
- lxml.etree
- except NameError:
- video_description = u'No description available.'
- mobj = re.search(r'', webpage, re.MULTILINE)
- if mobj is not None:
- video_description = mobj.group(1)
- else:
- html_parser = lxml.etree.HTMLParser()
- vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
- video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
- # TODO use another parser
+ video_description = get_element_by_id("description", webpage)
+ if video_description: video_description = clean_html(video_description.decode('utf8'))
+ else: video_description = ''
# Extract upload date
video_upload_date = u'NA'
@@ -2248,8 +2225,6 @@ class EscapistIE(InfoExtractor):
self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
def _real_extract(self, url):
- htmlParser = HTMLParser.HTMLParser()
-
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -2265,11 +2240,11 @@ class EscapistIE(InfoExtractor):
return
descMatch = re.search('
+ html = html.replace('\n', ' ')
+ html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ # Strip html tags
+ html = re.sub('<.*?>', '', html)
+ # Replace html entities
+ html = unescapeHTML(html)
+ return html
+
+
def sanitize_title(utitle):
"""Sanitizes a video title so it could be used as part of a filename."""
- utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
+ utitle = unescapeHTML(utitle)
return utitle.replace(unicode(os.sep), u'%')
@@ -133,8 +210,8 @@ def unescapeHTML(s):
"""
assert type(s) == type(u'')
- htmlParser = HTMLParser.HTMLParser()
- return htmlParser.unescape(s)
+ result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
+ return result
def encodeFilename(s):
"""