[udn] Add new extractor
This commit is contained in:
parent
de5c545648
commit
418c5cc3fc
5 changed files with 103 additions and 0 deletions
|
@ -53,6 +53,7 @@ from youtube_dl.utils import (
|
|||
uppercase_escape,
|
||||
url_basename,
|
||||
urlencode_postdata,
|
||||
url_infer_protocol,
|
||||
version_tuple,
|
||||
xpath_with_ns,
|
||||
xpath_text,
|
||||
|
@ -296,6 +297,10 @@ class TestUtil(unittest.TestCase):
|
|||
url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4'),
|
||||
'trailer.mp4')
|
||||
|
||||
def test_url_infer_protocol(self):
    """Check scheme inference for protocol-relative and absolute targets."""
    # Each case is (reference URL, target URL, expected result).
    cases = (
        # A protocol-relative target borrows the reference URL's scheme.
        ('http://foo.com/', '//bar.com/', 'http://bar.com/'),
        # A target that already carries a scheme is returned untouched.
        ('http://foo.com/', 'https://bar.com/', 'https://bar.com/'),
    )
    for ref_url, target_url, expected in cases:
        self.assertEqual(url_infer_protocol(ref_url, target_url), expected)
|
||||
|
||||
def test_parse_duration(self):
|
||||
self.assertEqual(parse_duration(None), None)
|
||||
self.assertEqual(parse_duration(False), None)
|
||||
|
|
|
@ -557,6 +557,7 @@ from .udemy import (
|
|||
UdemyIE,
|
||||
UdemyCourseIE
|
||||
)
|
||||
from .udn import UDNEmbedIE
|
||||
from .ultimedia import UltimediaIE
|
||||
from .unistra import UnistraIE
|
||||
from .urort import UrortIE
|
||||
|
|
|
@ -26,6 +26,7 @@ from ..utils import (
|
|||
unsmuggle_url,
|
||||
UnsupportedError,
|
||||
url_basename,
|
||||
url_infer_protocol,
|
||||
xpath_text,
|
||||
)
|
||||
from .brightcove import BrightcoveIE
|
||||
|
@ -34,6 +35,7 @@ from .ooyala import OoyalaIE
|
|||
from .rutv import RUTVIE
|
||||
from .smotri import SmotriIE
|
||||
from .condenast import CondeNastIE
|
||||
from .udn import UDNEmbedIE
|
||||
|
||||
|
||||
class GenericIE(InfoExtractor):
|
||||
|
@ -650,6 +652,17 @@ class GenericIE(InfoExtractor):
|
|||
'title': "PFT Live: New leader in the 'new-look' defense",
|
||||
'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
|
||||
},
|
||||
},
|
||||
# UDN embed
|
||||
{
|
||||
'url': 'http://www.udn.com/news/story/7314/822787',
|
||||
'md5': 'de06b4c90b042c128395a88f0384817e',
|
||||
'info_dict': {
|
||||
'id': '300040',
|
||||
'ext': 'mp4',
|
||||
'title': '生物老師男變女 全校挺"做自己"',
|
||||
'thumbnail': 're:^https?://.*\.jpg$',
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
|
@ -1268,6 +1281,13 @@ class GenericIE(InfoExtractor):
|
|||
if nbc_sports_url:
|
||||
return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
|
||||
|
||||
# Look for UDN embeds
|
||||
mobj = re.search(
|
||||
r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
|
||||
if mobj is not None:
|
||||
return self.url_result(
|
||||
url_infer_protocol(url, mobj.group('url')), 'UDNEmbed')
|
||||
|
||||
def check_video(vurl):
|
||||
if YoutubeIE.suitable(vurl):
|
||||
return True
|
||||
|
|
66
youtube_dl/extractor/udn.py
Normal file
66
youtube_dl/extractor/udn.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import json
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
url_infer_protocol,
|
||||
js_to_json
|
||||
)
|
||||
|
||||
|
||||
class UDNEmbedIE(InfoExtractor):
    """Extractor for video.udn.com embedded news player pages."""

    # The scheme is optional so protocol-relative embed URLs ("//video.udn.com/...")
    # found in third-party pages match as well.
    _VALID_URL = r'(?:https?:)?//video\.udn\.com/embed/news/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://video.udn.com/embed/news/300040',
        'md5': 'de06b4c90b042c128395a88f0384817e',
        'info_dict': {
            'id': '300040',
            'ext': 'mp4',
            'title': '生物老師男變女 全校挺"做自己"',
            'thumbnail': 're:^https?://.*\.jpg$',
        }
    }, {
        'url': '//video.udn.com/embed/news/300040',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the video described by the embed page at ``url``.

        Returns a URL result delegating to the Youtube extractor when the
        player options carry a non-empty ``youtube`` entry; otherwise an
        info dict with ``id``, ``formats``, ``title`` and ``thumbnail``.
        """
        video_id = self._match_id(url)

        page = self._download_webpage(url, video_id)

        # The page assigns the player configuration to a JavaScript variable
        # named "options"; convert that literal to JSON and parse it.
        options = json.loads(js_to_json(self._html_search_regex(
            r'var options\s*=\s*([^;]+);', page, 'video urls dictionary')))

        video_urls = options['video']

        youtube_url = video_urls.get('youtube')
        if youtube_url:
            return self.url_result(youtube_url, 'Youtube')

        formats = []
        for video_type, api_url in video_urls.items():
            # An empty "youtube" entry is not a downloadable format, and a
            # blank API URL cannot be resolved to a media URL, so skip both
            # instead of issuing a request that is bound to fail.
            if video_type == 'youtube' or not api_url:
                continue
            formats.append({
                # Each entry points at an API endpoint whose response body
                # is used as the media URL.
                'url': self._download_webpage(
                    url_infer_protocol(url, api_url), video_id,
                    'retrieve url for %s video' % video_type),
                'format_id': video_type,
                'preference': 0 if video_type == 'mp4' else -1,
            })

        self._sort_formats(formats)

        # The thumbnail is the "original" image of the first gallery item,
        # when a non-empty gallery is present.
        gallery = options.get('gallery')
        thumbnail = gallery[0].get('original') if gallery else None

        return {
            'id': video_id,
            'formats': formats,
            'title': options['title'],
            'thumbnail': thumbnail
        }
|
|
@ -1711,6 +1711,17 @@ def determine_protocol(info_dict):
|
|||
return compat_urllib_parse_urlparse(url).scheme
|
||||
|
||||
|
||||
def url_infer_protocol(ref_url, target_url):
    """ Infer protocol for protocol independent target urls

    ref_url is the URL whose scheme is borrowed; target_url is the URL to
    complete. target_url is returned unchanged when it already has a scheme
    or when it has no network location (i.e. it is not of the form
    "//host/..."); otherwise it is returned with ref_url's scheme.
    """
    parsed_target_url = compat_urllib_parse_urlparse(target_url)

    # Only protocol-relative URLs ("//host/path") can meaningfully borrow a
    # scheme. Adding one to a host-less target such as "/path" would yield
    # a malformed URL like "http:///path", so leave those untouched.
    if parsed_target_url.scheme or not parsed_target_url.netloc:
        return target_url

    ref_scheme = compat_urllib_parse_urlparse(ref_url).scheme

    return compat_urlparse.urlunparse(
        parsed_target_url._replace(scheme=ref_scheme))
|
||||
|
||||
|
||||
def render_table(header_row, data):
|
||||
""" Render a list of rows, each as a list of values """
|
||||
table = [header_row] + data
|
||||
|
|
Loading…
Reference in a new issue