[extractor/common] improve Akamai HTTP format extraction

- Allow m3u8 manifest without an additional audio format - Fix extraction for qualities starting with a number Solution provided by @nixxo based on: https://stackoverflow.com/a/5984688
[tver] Add new extractor (closes #26662 )(closes #27284 )
2025-09-18 15:46:28 +09:00 · 2020-12-02 21:49:09 +01:00 · 2020-12-02 21:49:09 +01:00 · 2020-12-03 01:30:08 +07:00
5 changed files with 114 additions and 7 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -2623,7 +2623,7 @@ class InfoExtractor(object):
            REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
-            if len(formats) in (qualities_length + 1, qualities_length * 2 + 1):
+            if len(formats) in (qualities_length, qualities_length + 1, qualities_length * 2, qualities_length * 2 + 1):
                i = 0
                http_formats = []
                for f in formats:
@ -2632,7 +2632,7 @@ class InfoExtractor(object):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
-                                REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url'])
+                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -393,6 +393,7 @@ from .frontendmasters import (
    FrontendMastersLessonIE,
    FrontendMastersCourseIE
 )
+from .fujitv import FujiTVFODPlus7IE
 from .funimation import FunimationIE
 from .funk import FunkIE
 from .fusion import FusionIE
@ -1233,6 +1234,7 @@ from .tvc import (
    TVCIE,
    TVCArticleIE,
 )
+from .tver import TVerIE
 from .tvigle import TvigleIE
 from .tvland import TVLandIE
 from .tvn24 import TVN24IE
--- a/youtube_dl/extractor/fujitv.py
+++ b/youtube_dl/extractor/fujitv.py
@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class FujiTVFODPlus7IE(InfoExtractor):
+    _VALID_URL = r'https?://i\.fod\.fujitv\.co\.jp/plus7/web/[0-9a-z]{4}/(?P<id>[0-9a-z]+)'
+    _BASE_URL = 'http://i.fod.fujitv.co.jp/'
+    _BITRATE_MAP = {
+        300: (320, 180),
+        800: (640, 360),
+        1200: (1280, 720),
+        2000: (1280, 720),
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        formats = self._extract_m3u8_formats(
+            self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id)
+        for f in formats:
+            wh = self._BITRATE_MAP.get(f.get('tbr'))
+            if wh:
+                f.update({
+                    'width': wh[0],
+                    'height': wh[1],
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'formats': formats,
+            'thumbnail': self._BASE_URL + 'pc/image/wbtn/wbtn_%s.jpg' % video_id,
+        }
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@ -53,7 +53,7 @@ class PornHubIE(PornHubBaseIE):
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
-                            (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
+                            (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                            (?:www\.)?thumbzilla\.com/video/
                        )
                        (?P<id>[\da-z]+)
@ -152,6 +152,9 @@ class PornHubIE(PornHubBaseIE):
    }, {
        'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933',
+        'only_matching': True,
    }, {
        'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
        'only_matching': True,
@ -160,7 +163,7 @@ class PornHubIE(PornHubBaseIE):
    @staticmethod
    def _extract_urls(webpage):
        return re.findall(
-            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)',
+            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)',
            webpage)

    def _extract_count(self, pattern, webpage, name):
@ -422,7 +425,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE):


 class PornHubUserIE(PornHubPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
    _TESTS = [{
        'url': 'https://www.pornhub.com/model/zoe_ph',
        'playlist_mincount': 118,
@ -490,7 +493,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):


 class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
+    _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.pornhub.com/model/zoe_ph/videos',
        'only_matching': True,
@ -605,7 +608,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):


 class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
    _TESTS = [{
        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
        'info_dict': {
--- a/youtube_dl/extractor/tver.py
+++ b/youtube_dl/extractor/tver.py
@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    remove_start,
+    smuggle_url,
+    try_get,
+)
+
+
+class TVerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))'
+    # videos are only available for 7 days
+    _TESTS = [{
+        'url': 'https://tver.jp/corner/f0062178',
+        'only_matching': True,
+    }, {
+        'url': 'https://tver.jp/feature/f0062413',
+        'only_matching': True,
+    }, {
+        'url': 'https://tver.jp/episode/79622438',
+        'only_matching': True,
+    }]
+    _TOKEN = None
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+
+    def _real_initialize(self):
+        self._TOKEN = self._download_json(
+            'https://tver.jp/api/access_token.php', None)['token']
+
+    def _real_extract(self, url):
+        path, video_id = re.match(self._VALID_URL, url).groups()
+        main = self._download_json(
+            'https://api.tver.jp/v4/' + path, video_id,
+            query={'token': self._TOKEN})['main']
+        p_id = main['publisher_id']
+        service = remove_start(main['service'], 'ts_')
+        info = {
+            '_type': 'url_transparent',
+            'description': try_get(main, lambda x: x['note'][0]['text'], compat_str),
+            'episode': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])),
+        }
+
+        if service == 'cx':
+            info.update({
+                'title': main.get('subtitle') or main['title'],
+                'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id),
+                'ie_key': 'FujiTVFODPlus7',
+            })
+        else:
+            r_id = main['reference_id']
+            if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'):
+                r_id = 'ref:' + r_id
+            bc_url = smuggle_url(
+                self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id),
+                {'geo_countries': ['JP']})
+            info.update({
+                'url': bc_url,
+                'ie_key': 'BrightcoveNew',
+            })
+
+        return info
Author	SHA1	Message	Date
Remita Amine	664dd8ba85	[extractor/common] improve Akamai HTTP format extraction - Allow m3u8 manifest without an additional audio format - Fix extraction for qualities starting with a number Solution provided by @nixxo based on: https://stackoverflow.com/a/5984688	2020-12-02 21:49:09 +01:00
Remita Amine	64554c12e1	[tver] Add new extractor (closes #26662 )(closes #27284 )	2020-12-02 21:49:09 +01:00
opusforlife2	4ded9c0f00	[pornhub] Add support for pornhub.org (#27276 ) Most ISPs block the other two TLDs through deep packet inspection	2020-12-03 01:30:08 +07:00