From d8d24a922adb0646f9f78b2fc4a287a72fa7ff73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 00:36:23 +0600 Subject: [PATCH 1/6] [youtube] Extract formats from multiple DASH manifests (Closes #6093) DASH manifest pointed by dashmpd from the video webpage and one pointed by get_video_info may be different (namely different itag set) - some itags are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH manifest pointed by get_video_info's dashmpd). The general idea is to take a union of itags of both DASH manifests (for example video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093). --- youtube_dl/extractor/youtube.py | 77 ++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3da56c14..fa1a2e544 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -853,6 +853,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_url = None + dash_mpds = [] + + def add_dash_mpd(video_info): + dash_mpd = video_info.get('dashmpd') + if dash_mpd and dash_mpd[0] not in dash_mpds: + dash_mpds.append(dash_mpd[0]) + # Get video info embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: @@ -873,34 +880,40 @@ class YoutubeIE(YoutubeBaseInfoExtractor): note='Refetching age-gated info webpage', errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) + add_dash_mpd(video_info) else: age_gate = False - try: - # Try looking directly into the video webpage - mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) - if not mobj: - raise ValueError('Could not find ytplayer.config') # caught below + # Try looking directly into the video webpage + mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) + if mobj: json_code = uppercase_escape(mobj.group(1)) ytplayer_config = json.loads(json_code) args = ytplayer_config['args'] - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - if not args.get('url_encoded_fmt_stream_map'): - raise ValueError('No stream_map present') # caught below - except ValueError: - # We fallback to the get_video_info pages (used by the embed page) - self.report_video_info_webpage_download(video_id) - for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ( - '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (proto, video_id, el_type)) - video_info_webpage = self._download_webpage( - video_info_url, - video_id, note=False, - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - if 'token' in video_info: - break + if args.get('url_encoded_fmt_stream_map'): + # Convert to the same format returned by compat_parse_qs + video_info = dict((k, [v]) for k, v in args.items()) + add_dash_mpd(video_info) + # We also try looking in get_video_info since it may contain different dashmpd + # URL that points to a DASH manifest with possibly different itag set (some itags + # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH + # manifest pointed by get_video_info's dashmpd). + # The general idea is to take a union of itags of both DASH manifests (for example + # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) + self.report_video_info_webpage_download(video_id) + for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: + video_info_url = ( + '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' + % (proto, video_id, el_type)) + video_info_webpage = self._download_webpage( + video_info_url, + video_id, note=False, + errnote='unable to download video info webpage') + get_video_info = compat_parse_qs(video_info_webpage) + add_dash_mpd(get_video_info) + if not video_info: + video_info = get_video_info + if 'token' in get_video_info: + break if 'token' not in video_info: if 'reason' in video_info: raise ExtractorError( @@ -1118,24 +1131,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd = video_info.get('dashmpd') - if dash_mpd: - dash_manifest_url = dash_mpd[0] + for dash_manifest_url in dash_mpds: + dash_formats = {} try: - dash_formats = self._parse_dash_manifest( - video_id, dash_manifest_url, player_url, age_gate) + for df in self._parse_dash_manifest( + video_id, dash_manifest_url, player_url, age_gate): + # Do not overwrite DASH format found in some previous DASH manifest + if df['format_id'] not in dash_formats: + dash_formats[df['format_id']] = df except (ExtractorError, KeyError) as e: self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) - else: + if dash_formats: # Remove the formats we found through non-DASH, they # contain less info and it can be wrong, because we use # fixed values (for example the resolution). See # https://github.com/rg3/youtube-dl/issues/5774 for an # example. - dash_keys = set(df['format_id'] for df in dash_formats) + dash_keys = set(df['format_id'] for df in dash_formats.values()) formats = [f for f in formats if f['format_id'] not in dash_keys] - formats.extend(dash_formats) + formats.extend(dash_formats.values()) # Check for malformed aspect ratio stretched_m = re.search( From d80265ccd6ff08ef273d83aff847a457f051071f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 02:48:50 +0600 Subject: [PATCH 2/6] [youtube] Simplify non-DASH formats exclusion --- youtube_dl/extractor/youtube.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fa1a2e544..46841617a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1148,8 +1148,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # fixed values (for example the resolution). See # https://github.com/rg3/youtube-dl/issues/5774 for an # example. - dash_keys = set(df['format_id'] for df in dash_formats.values()) - formats = [f for f in formats if f['format_id'] not in dash_keys] + formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] formats.extend(dash_formats.values()) # Check for malformed aspect ratio From bc93bdb5bbf48b121977a5088ab9607f0fbeb83e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 13:19:46 +0600 Subject: [PATCH 3/6] [youtube] Fix reference before assignment for video_info --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 46841617a..45e5cb80e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -883,6 +883,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): add_dash_mpd(video_info) else: age_gate = False + video_info = None # Try looking directly into the video webpage mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) if mobj: From 0a3cf9ad3dc1a038018ff347473c0e2d8fc10a69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 14:31:18 +0600 Subject: [PATCH 4/6] [youtube] Skip get_video_info requests when --youtube-skip-dash-manifest is specified --- youtube_dl/extractor/youtube.py | 43 +++++++++++++++++---------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 45e5cb80e..2c9ad5e92 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -894,27 +894,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) - # We also try looking in get_video_info since it may contain different dashmpd - # URL that points to a DASH manifest with possibly different itag set (some itags - # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH - # manifest pointed by get_video_info's dashmpd). - # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) - self.report_video_info_webpage_download(video_id) - for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ( - '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (proto, video_id, el_type)) - video_info_webpage = self._download_webpage( - video_info_url, - video_id, note=False, - errnote='unable to download video info webpage') - get_video_info = compat_parse_qs(video_info_webpage) - add_dash_mpd(get_video_info) - if not video_info: - video_info = get_video_info - if 'token' in get_video_info: - break + if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): + # We also try looking in get_video_info since it may contain different dashmpd + # URL that points to a DASH manifest with possibly different itag set (some itags + # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH + # manifest pointed by get_video_info's dashmpd). + # The general idea is to take a union of itags of both DASH manifests (for example + # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) + self.report_video_info_webpage_download(video_id) + for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: + video_info_url = ( + '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' + % (proto, video_id, el_type)) + video_info_webpage = self._download_webpage( + video_info_url, + video_id, note=False, + errnote='unable to download video info webpage') + get_video_info = compat_parse_qs(video_info_webpage) + add_dash_mpd(get_video_info) + if not video_info: + video_info = get_video_info + if 'token' in get_video_info: + break if 'token' not in video_info: if 'reason' in video_info: raise ExtractorError( From b2575b38e76e14b7552a43cd707a03fe78748f40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 14:38:41 +0600 Subject: [PATCH 5/6] [options] Clarify --youtube-skip-dash-manifest --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 6aeca61ee..e3dfb7af9 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -346,7 +346,7 @@ def parseOpts(overrideArguments=None): video_format.add_option( '--youtube-skip-dash-manifest', action='store_false', dest='youtube_include_dash_manifest', - help='Do not download the DASH manifest on YouTube videos') + help='Do not download the DASH manifests and related data on YouTube videos') video_format.add_option( '--merge-output-format', action='store', dest='merge_output_format', metavar='FORMAT', default=None, From da77d856a1310b52975abbad82121a1b4c6597a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 14:55:46 +0600 Subject: [PATCH 6/6] [youtube] Add test for #6093 --- youtube_dl/extractor/youtube.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2c9ad5e92..20e1781f8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -518,6 +518,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': 'requires avconv', } }, + # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) + { + 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', + 'info_dict': { + 'id': 'FIl7x6_3R5Y', + 'ext': 'mp4', + 'title': 'md5:7b81415841e02ecd4313668cde88737a', + 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'upload_date': '20150625', + 'uploader_id': 'dorappi2000', + 'uploader': 'dorappi2000', + 'formats': 'mincount:33', + }, + } ] def __init__(self, *args, **kwargs):