[udn] Add new extractor

This commit is contained in:
Yen Chi Hsuan 2015-04-08 17:26:51 +08:00
parent de5c545648
commit 418c5cc3fc
5 changed files with 103 additions and 0 deletions

View file

@ -53,6 +53,7 @@ from youtube_dl.utils import (
uppercase_escape, uppercase_escape,
url_basename, url_basename,
urlencode_postdata, urlencode_postdata,
url_infer_protocol,
version_tuple, version_tuple,
xpath_with_ns, xpath_with_ns,
xpath_text, xpath_text,
@ -296,6 +297,10 @@ class TestUtil(unittest.TestCase):
url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4'), url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4'),
'trailer.mp4') 'trailer.mp4')
def test_url_infer_protocol(self):
    # Protocol-relative targets take over the scheme of the reference URL.
    self.assertEqual(url_infer_protocol('http://foo.com/', '//bar.com/'), 'http://bar.com/')
    self.assertEqual(url_infer_protocol('https://foo.com/', '//bar.com/'), 'https://bar.com/')
    # Targets that already carry a scheme are returned untouched,
    # whatever the scheme of the reference URL is.
    self.assertEqual(url_infer_protocol('http://foo.com/', 'https://bar.com/'), 'https://bar.com/')
    self.assertEqual(url_infer_protocol('https://foo.com/', 'http://bar.com/'), 'http://bar.com/')
    # Path and query of the target survive the scheme inference.
    self.assertEqual(
        url_infer_protocol('http://foo.com/a', '//bar.com/v.mp4?x=1'),
        'http://bar.com/v.mp4?x=1')
def test_parse_duration(self): def test_parse_duration(self):
self.assertEqual(parse_duration(None), None) self.assertEqual(parse_duration(None), None)
self.assertEqual(parse_duration(False), None) self.assertEqual(parse_duration(False), None)

View file

@ -557,6 +557,7 @@ from .udemy import (
UdemyIE, UdemyIE,
UdemyCourseIE UdemyCourseIE
) )
from .udn import UDNEmbedIE
from .ultimedia import UltimediaIE from .ultimedia import UltimediaIE
from .unistra import UnistraIE from .unistra import UnistraIE
from .urort import UrortIE from .urort import UrortIE

View file

@ -26,6 +26,7 @@ from ..utils import (
unsmuggle_url, unsmuggle_url,
UnsupportedError, UnsupportedError,
url_basename, url_basename,
url_infer_protocol,
xpath_text, xpath_text,
) )
from .brightcove import BrightcoveIE from .brightcove import BrightcoveIE
@ -34,6 +35,7 @@ from .ooyala import OoyalaIE
from .rutv import RUTVIE from .rutv import RUTVIE
from .smotri import SmotriIE from .smotri import SmotriIE
from .condenast import CondeNastIE from .condenast import CondeNastIE
from .udn import UDNEmbedIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -650,6 +652,17 @@ class GenericIE(InfoExtractor):
'title': "PFT Live: New leader in the 'new-look' defense", 'title': "PFT Live: New leader in the 'new-look' defense",
'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
}, },
},
# UDN embed
{
'url': 'http://www.udn.com/news/story/7314/822787',
'md5': 'de06b4c90b042c128395a88f0384817e',
'info_dict': {
'id': '300040',
'ext': 'mp4',
'title': '生物老師男變女 全校挺"做自己"',
'thumbnail': 're:^https?://.*\.jpg$',
}
} }
] ]
@ -1268,6 +1281,13 @@ class GenericIE(InfoExtractor):
if nbc_sports_url: if nbc_sports_url:
return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
# Look for UDN embeds
mobj = re.search(
r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
if mobj is not None:
return self.url_result(
url_infer_protocol(url, mobj.group('url')), 'UDNEmbed')
def check_video(vurl): def check_video(vurl):
if YoutubeIE.suitable(vurl): if YoutubeIE.suitable(vurl):
return True return True

View file

@ -0,0 +1,66 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..utils import (
url_infer_protocol,
js_to_json
)
class UDNEmbedIE(InfoExtractor):
    """Extractor for the embeddable news video player of video.udn.com."""

    # The scheme is optional so that protocol-relative URLs
    # (``//video.udn.com/...``) match as well; see the second test below.
    _VALID_URL = r'(?:https?:)?//video\.udn\.com/embed/news/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://video.udn.com/embed/news/300040',
        'md5': 'de06b4c90b042c128395a88f0384817e',
        'info_dict': {
            'id': '300040',
            'ext': 'mp4',
            'title': '生物老師男變女 全校挺"做自己"',
            # Raw string: '\.' is not a valid escape in a normal literal
            'thumbnail': r're:^https?://.*\.jpg$',
        }
    }, {
        'url': '//video.udn.com/embed/news/300040',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for a single embedded video.

        The embed page defines a JavaScript ``options`` object. Its
        ``video`` member maps a video type to a URL; a ``youtube`` entry is
        delegated to the Youtube extractor, every other entry is requested
        and the response body is used as the URL of that format.
        """
        video_id = self._match_id(url)

        # _VALID_URL also accepts scheme-less URLs, which cannot be
        # downloaded as they are; default those to plain http.
        url = url_infer_protocol('http://', url)

        page = self._download_webpage(url, video_id)

        options = json.loads(js_to_json(self._html_search_regex(
            r'var options\s*=\s*([^;]+);', page, 'video urls dictionary')))

        video_urls = options['video']

        youtube_url = video_urls.get('youtube')
        if youtube_url:
            return self.url_result(youtube_url, 'Youtube')

        formats = []
        for video_type, api_url in video_urls.items():
            # An empty 'youtube' entry is not a format, and entries without
            # an API URL leave nothing to request.
            if video_type == 'youtube' or not api_url:
                continue
            formats.append({
                # The response body of the API URL is taken as the media URL
                'url': self._download_webpage(
                    url_infer_protocol(url, api_url), video_id,
                    'retrieve url for %s video' % video_type),
                'format_id': video_type,
                'preference': 0 if video_type == 'mp4' else -1,
            })

        self._sort_formats(formats)

        # The first gallery entry, when there is one, provides the thumbnail
        gallery = options.get('gallery')
        thumbnail = gallery[0].get('original') if gallery else None

        return {
            'id': video_id,
            'formats': formats,
            'title': options['title'],
            'thumbnail': thumbnail
        }

View file

@ -1711,6 +1711,17 @@ def determine_protocol(info_dict):
return compat_urllib_parse_urlparse(url).scheme return compat_urllib_parse_urlparse(url).scheme
def url_infer_protocol(ref_url, target_url):
    """ Infer protocol for protocol independent target urls

    A protocol independent (protocol-relative) URL has a host but no scheme,
    e.g. '//bar.com/video'. Such a target gets the scheme of ref_url.
    Every other target is returned unchanged: URLs that already have a
    scheme, and host-less references such as '/path' for which prepending
    a scheme would only produce a malformed URL like 'http:///path'.
    """
    parsed_target_url = compat_urllib_parse_urlparse(target_url)
    if parsed_target_url.scheme or not parsed_target_url.netloc:
        return target_url

    ref_scheme = compat_urllib_parse_urlparse(ref_url).scheme
    if not ref_scheme:
        # Nothing to infer from; keep the target as it is
        return target_url

    # Swap in the scheme and keep the remaining five components as parsed
    return compat_urlparse.urlunparse(
        [ref_scheme] + list(parsed_target_url[1:]))
def render_table(header_row, data): def render_table(header_row, data):
""" Render a list of rows, each as a list of values """ """ Render a list of rows, each as a list of values """
table = [header_row] + data table = [header_row] + data