[rtl2] improve _VALID_URL regex

This commit is contained in:
Remita Amine 2019-04-03 01:00:02 +01:00
parent d7d86fdd49
commit 4f7db46887

View file

@@ -21,7 +21,7 @@ from ..utils import (
class RTL2IE(InfoExtractor): class RTL2IE(InfoExtractor):
IE_NAME = 'rtl2' IE_NAME = 'rtl2'
_VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))' _VALID_URL = r'https?://(?:www\.)?rtl2\.de/sendung/[^/]+/(?:video/(?P<vico_id>\d+)[^/]+/(?P<vivi_id>\d+)-|folge/)(?P<id>[^/?#]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0',
'info_dict': { 'info_dict': {
@@ -34,10 +34,11 @@ class RTL2IE(InfoExtractor):
# rtmp download # rtmp download
'skip_download': True, 'skip_download': True,
}, },
'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'],
}, { }, {
'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/',
'info_dict': { 'info_dict': {
'id': '21040-anna-erwischt-alex', 'id': 'anna-erwischt-alex',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Anna erwischt Alex!', 'title': 'Anna erwischt Alex!',
'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' 'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.'
@@ -46,31 +47,29 @@ class RTL2IE(InfoExtractor):
# rtmp download # rtmp download
'skip_download': True, 'skip_download': True,
}, },
'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'],
}] }]
def _real_extract(self, url): def _real_extract(self, url):
# Some rtl2 urls have no slash at the end, so append it. vico_id, vivi_id, display_id = re.match(self._VALID_URL, url).groups()
if not url.endswith('/'): if not vico_id:
url += '/' webpage = self._download_webpage(url, display_id)
video_id = self._match_id(url) mobj = re.search(
webpage = self._download_webpage(url, video_id) r'data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"',
webpage)
mobj = re.search( if mobj:
r'<div[^>]+data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', vico_id = mobj.group('vico_id')
webpage) vivi_id = mobj.group('vivi_id')
if mobj: else:
vico_id = mobj.group('vico_id') vico_id = self._html_search_regex(
vivi_id = mobj.group('vivi_id') r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id')
else: vivi_id = self._html_search_regex(
vico_id = self._html_search_regex( r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id')
r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id')
vivi_id = self._html_search_regex(
r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id')
info = self._download_json( info = self._download_json(
'http://www.rtl2.de/sites/default/modules/rtl2/mediathek/php/get_video_jw.php', 'https://service.rtl2.de/api-player-vipo/video.php',
video_id, query={ display_id, query={
'vico_id': vico_id, 'vico_id': vico_id,
'vivi_id': vivi_id, 'vivi_id': vivi_id,
}) })
@@ -99,12 +98,12 @@ class RTL2IE(InfoExtractor):
m3u8_url = video_info.get('streamurl_hls') m3u8_url = video_info.get('streamurl_hls')
if m3u8_url: if m3u8_url:
formats.extend(self._extract_akamai_formats(m3u8_url, video_id)) formats.extend(self._extract_akamai_formats(m3u8_url, display_id))
self._sort_formats(formats) self._sort_formats(formats)
return { return {
'id': video_id, 'id': display_id,
'title': title, 'title': title,
'thumbnail': video_info.get('image'), 'thumbnail': video_info.get('image'),
'description': video_info.get('beschreibung'), 'description': video_info.get('beschreibung'),