[youtube] Convert to new subtitles system

The automatic captions are stored in the 'automatic_captions' field, which is used if no normal subtitles are found for a specific language.
This commit is contained in:
Jaime Marquínez Ferrándiz 2015-02-16 21:44:17 +01:00
parent a1f2a06b34
commit 360e1ca5cc
4 changed files with 61 additions and 37 deletions

View file

@ -50,11 +50,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
url = 'QRS8MkLhQmM' url = 'QRS8MkLhQmM'
IE = YoutubeIE IE = YoutubeIE
def test_youtube_no_writesubtitles(self):
self.DL.params['writesubtitles'] = False
subtitles = self.getSubtitles()
self.assertEqual(subtitles, None)
def test_youtube_subtitles(self): def test_youtube_subtitles(self):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()

View file

@ -1020,9 +1020,13 @@ class YoutubeDL(object):
info_dict['upload_date'] = upload_date.strftime('%Y%m%d') info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
if self.params.get('listsubtitles', False): if self.params.get('listsubtitles', False):
self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) if 'automatic_captions' in info_dict:
self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
return return
info_dict['requested_subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) info_dict['requested_subtitles'] = self.process_subtitles(
info_dict['id'], info_dict.get('subtitles'),
info_dict.get('automatic_captions'))
# This extractors handle format selection themselves # This extractors handle format selection themselves
if info_dict['extractor'] in ['Youku']: if info_dict['extractor'] in ['Youku']:
@ -1152,8 +1156,14 @@ class YoutubeDL(object):
info_dict.update(formats_to_download[-1]) info_dict.update(formats_to_download[-1])
return info_dict return info_dict
def process_subtitles(self, video_id, available_subs): def process_subtitles(self, video_id, available_subs, available_autocaps):
"""Select the requested subtitles and their format""" """Select the requested subtitles and their format"""
if available_autocaps and self.params.get('writeautomaticsub'):
available_subs = available_subs.copy()
for lang, cap_info in available_autocaps.items():
if lang not in available_subs:
available_subs[lang] = cap_info
if not available_subs: if not available_subs:
return available_subs return available_subs
@ -1645,17 +1655,17 @@ class YoutubeDL(object):
['ID', 'width', 'height', 'URL'], ['ID', 'width', 'height', 'URL'],
[[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
def list_subtitles(self, video_id, subtitles): def list_subtitles(self, video_id, subtitles, name='subtitles'):
if not subtitles: if not subtitles:
self.to_screen('%s has no subtitles' % video_id) self.to_screen('%s has no %s' % (video_id, name))
return return
header_line = 'Language formats' header_line = 'Language formats'
sub_lines = [ sub_lines = [
'%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats)))
for lang, formats in subtitles.items()] for lang, formats in subtitles.items()]
self.to_screen( self.to_screen(
'Available subtitles for %s:\n%s\n%s' % 'Available %s for %s:\n%s\n%s' %
(video_id, header_line, '\n'.join(sub_lines))) (name, video_id, header_line, '\n'.join(sub_lines)))
def urlopen(self, req): def urlopen(self, req):
""" Start an HTTP download """ """ Start an HTTP download """

View file

@ -157,6 +157,8 @@ class InfoExtractor(object):
with the "ext" entry and one of: with the "ext" entry and one of:
* "data": The subtitles file contents * "data": The subtitles file contents
* "url": A url pointing to the subtitles file * "url": A url pointing to the subtitles file
automatic_captions: Like 'subtitles', used by the YoutubeIE for
automatically generated captions
duration: Length of the video in seconds, as an integer. duration: Length of the video in seconds, as an integer.
view_count: How many users have watched the video on the platform. view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video like_count: Number of positive ratings of the video
@ -1007,6 +1009,16 @@ class InfoExtractor(object):
def _get_subtitles(self, *args, **kwargs): def _get_subtitles(self, *args, **kwargs):
raise NotImplementedError("This method must be implemented by subclasses") raise NotImplementedError("This method must be implemented by subclasses")
def extract_automatic_captions(self, *args, **kwargs):
automatic_captions = {}
list_subtitles = self._downloader.params.get('listsubtitles')
if self._downloader.params.get('writeautomaticsub', False) or list_subtitles:
automatic_captions.update(self._get_automatic_captions(*args, **kwargs))
return automatic_captions
def _get_automatic_captions(self, *args, **kwargs):
raise NotImplementedError("This method must be implemented by subclasses")
class SearchInfoExtractor(InfoExtractor): class SearchInfoExtractor(InfoExtractor):
""" """

View file

@ -11,7 +11,6 @@ import time
import traceback import traceback
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..jsinterp import JSInterpreter from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter from ..swfinterp import SWFInterpreter
from ..compat import ( from ..compat import (
@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return return
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com' IE_DESC = 'YouTube.com'
_VALID_URL = r"""(?x)^ _VALID_URL = r"""(?x)^
( (
@ -644,7 +643,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
raise ExtractorError( raise ExtractorError(
'Signature extraction failed: ' + tb, cause=e) 'Signature extraction failed: ' + tb, cause=e)
def _get_available_subtitles(self, video_id, webpage): def _get_subtitles(self, video_id, webpage):
try: try:
subs_doc = self._download_xml( subs_doc = self._download_xml(
'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@ -658,23 +657,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
lang = track.attrib['lang_code'] lang = track.attrib['lang_code']
if lang in sub_lang_list: if lang in sub_lang_list:
continue continue
sub_formats = []
for ext in ['sbv', 'vtt', 'srt']:
params = compat_urllib_parse.urlencode({ params = compat_urllib_parse.urlencode({
'lang': lang, 'lang': lang,
'v': video_id, 'v': video_id,
'fmt': self._downloader.params.get('subtitlesformat', 'srt'), 'fmt': ext,
'name': track.attrib['name'].encode('utf-8'), 'name': track.attrib['name'].encode('utf-8'),
}) })
url = 'https://www.youtube.com/api/timedtext?' + params sub_formats.append({
sub_lang_list[lang] = url 'url': 'https://www.youtube.com/api/timedtext?' + params,
'ext': ext,
})
sub_lang_list[lang] = sub_formats
if not sub_lang_list: if not sub_lang_list:
self._downloader.report_warning('video doesn\'t have subtitles') self._downloader.report_warning('video doesn\'t have subtitles')
return {} return {}
return sub_lang_list return sub_lang_list
def _get_available_automatic_caption(self, video_id, webpage): def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an """We need the webpage for getting the captions url, pass it as an
argument to speed up the process.""" argument to speed up the process."""
sub_format = self._downloader.params.get('subtitlesformat', 'srt')
self.to_screen('%s: Looking for automatic captions' % video_id) self.to_screen('%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage) mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
err_msg = 'Couldn\'t find automatic captions for %s' % video_id err_msg = 'Couldn\'t find automatic captions for %s' % video_id
@ -704,14 +707,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
sub_lang_list = {} sub_lang_list = {}
for lang_node in caption_list.findall('target'): for lang_node in caption_list.findall('target'):
sub_lang = lang_node.attrib['lang_code'] sub_lang = lang_node.attrib['lang_code']
sub_formats = []
for ext in ['sbv', 'vtt', 'srt']:
params = compat_urllib_parse.urlencode({ params = compat_urllib_parse.urlencode({
'lang': original_lang, 'lang': original_lang,
'tlang': sub_lang, 'tlang': sub_lang,
'fmt': sub_format, 'fmt': ext,
'ts': timestamp, 'ts': timestamp,
'kind': caption_kind, 'kind': caption_kind,
}) })
sub_lang_list[sub_lang] = caption_url + '&' + params sub_formats.append({
'url': caption_url + '&' + params,
'ext': ext,
})
sub_lang_list[sub_lang] = sub_formats
return sub_lang_list return sub_lang_list
# An extractor error can be raise by the download process if there are # An extractor error can be raise by the download process if there are
# no automatic captions but there are subtitles # no automatic captions but there are subtitles
@ -966,10 +975,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# subtitles # subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage) video_subtitles = self.extract_subtitles(video_id, video_webpage)
automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, video_webpage)
return
if 'length_seconds' not in video_info: if 'length_seconds' not in video_info:
self._downloader.report_warning('unable to extract video duration') self._downloader.report_warning('unable to extract video duration')
@ -1118,6 +1124,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'description': video_description, 'description': video_description,
'categories': video_categories, 'categories': video_categories,
'subtitles': video_subtitles, 'subtitles': video_subtitles,
'automatic_captions': automatic_captions,
'duration': video_duration, 'duration': video_duration,
'age_limit': 18 if age_gate else 0, 'age_limit': 18 if age_gate else 0,
'annotations': video_annotations, 'annotations': video_annotations,