From 06f1de2daff8351f572974dafccebefd378b9f99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 15 Feb 2020 02:16:26 +0700 Subject: [PATCH] [nova] Improve extraction (refs #23690) --- youtube_dl/extractor/nova.py | 37 +++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 62d0552e9..2850af5db 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -97,7 +97,7 @@ class NovaIE(InfoExtractor): _VALID_URL = r'https?://(?:[^.]+\.)?(?Ptv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P[^/]+?)(?:\.html|/|$)' _TESTS = [{ 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', - 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', + 'md5': '249baab7d0104e186e78b0899c7d5f28', 'info_dict': { 'id': '1757139', 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', @@ -119,7 +119,8 @@ class NovaIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': 'gone', }, { # media.cms.nova.cz embed 'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil', @@ -134,6 +135,7 @@ class NovaIE(InfoExtractor): 'skip_download': True, }, 'add_ie': [NovaEmbedIE.ie_key()], + 'skip': 'CHYBA 404: STRÁNKA NENALEZENA', }, { 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', 'only_matching': True, @@ -158,14 +160,29 @@ class NovaIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + description = clean_html(self._og_search_description(webpage, default=None)) + if site == 'novaplus': + upload_date = unified_strdate(self._search_regex( + r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) + elif site == 'fanda': + upload_date = unified_strdate(self._search_regex( + r'(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) + else: + upload_date = None + # novaplus embed_id = self._search_regex( r']+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)', webpage, 'embed url', default=None) if embed_id: - return self.url_result( - 'https://media.cms.nova.cz/embed/%s' % embed_id, - ie=NovaEmbedIE.ie_key(), video_id=embed_id) + return { + '_type': 'url_transparent', + 'url': 'https://media.cms.nova.cz/embed/%s' % embed_id, + 'ie_key': NovaEmbedIE.ie_key(), + 'id': embed_id, + 'description': description, + 'upload_date': upload_date + } video_id = self._search_regex( [r"(?:media|video_id)\s*:\s*'(\d+)'", @@ -239,18 +256,8 @@ class NovaIE(InfoExtractor): self._sort_formats(formats) title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) - description = clean_html(self._og_search_description(webpage, default=None)) thumbnail = config.get('poster') - if site == 'novaplus': - upload_date = unified_strdate(self._search_regex( - r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) - elif site == 'fanda': - upload_date = unified_strdate(self._search_regex( - r'(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) - else: - upload_date = None - return { 'id': video_id, 'display_id': display_id,