[YoutubeDL] Add generic video filtering (Fixes #4916)

This functionality is intended to eventually encompass the current format filtering.
This commit is contained in:
Philipp Hagemeister 2015-02-10 03:32:21 +01:00
parent 8829650513
commit 347de4931c
5 changed files with 147 additions and 2 deletions

View file

@ -53,6 +53,7 @@ from youtube_dl.utils import (
version_tuple, version_tuple,
xpath_with_ns, xpath_with_ns,
render_table, render_table,
match_str,
) )
@ -459,6 +460,37 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
'123 4\n' '123 4\n'
'9999 51') '9999 51')
def test_match_str(self):
self.assertRaises(ValueError, match_str, 'xy>foobar', {})
self.assertFalse(match_str('xy', {'x': 1200}))
self.assertTrue(match_str('!xy', {'x': 1200}))
self.assertTrue(match_str('x', {'x': 1200}))
self.assertFalse(match_str('!x', {'x': 1200}))
self.assertTrue(match_str('x', {'x': 0}))
self.assertFalse(match_str('x>0', {'x': 0}))
self.assertFalse(match_str('x>0', {}))
self.assertTrue(match_str('x>?0', {}))
self.assertTrue(match_str('x>1K', {'x': 1200}))
self.assertFalse(match_str('x>2K', {'x': 1200}))
self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200}))
self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200}))
self.assertFalse(match_str('y=a212', {'y': 'foobar42'}))
self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'}))
self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'}))
self.assertTrue(match_str('y!=foobar2', {'y': 'foobar42'}))
self.assertFalse(match_str(
'like_count > 100 & dislike_count <? 50 & description',
{'like_count': 90, 'description': 'foo'}))
self.assertTrue(match_str(
'like_count > 100 & dislike_count <? 50 & description',
{'like_count': 190, 'description': 'foo'}))
self.assertFalse(match_str(
'like_count > 100 & dislike_count <? 50 & description',
{'like_count': 190, 'dislike_count': 60, 'description': 'foo'}))
self.assertFalse(match_str(
'like_count > 100 & dislike_count <? 50 & description',
{'like_count': 190, 'dislike_count': 10}))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View file

@ -228,6 +228,11 @@ class YoutubeDL(object):
external_downloader: Executable of the external downloader to call. external_downloader: Executable of the external downloader to call.
listformats: Print an overview of available video formats and exit. listformats: Print an overview of available video formats and exit.
list_thumbnails: Print a table of all thumbnails and exit. list_thumbnails: Print a table of all thumbnails and exit.
match_filter: A function that gets called with the info_dict of
every video.
If it returns a message, the video is ignored.
If it returns None, the video is downloaded.
match_filter_func in utils.py is one example for this.
The following parameters are not used by YoutubeDL itself, they are used by The following parameters are not used by YoutubeDL itself, they are used by
@ -583,9 +588,16 @@ class YoutubeDL(object):
if max_views is not None and view_count > max_views: if max_views is not None and view_count > max_views:
return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
return 'Skipping "%s" because it is age restricted' % title return 'Skipping "%s" because it is age restricted' % video_title
if self.in_download_archive(info_dict): if self.in_download_archive(info_dict):
return '%s has already been recorded in archive' % video_title return '%s has already been recorded in archive' % video_title
match_filter = self.params.get('match_filter')
if match_filter is not None:
ret = match_filter(info_dict)
if ret is not None:
return ret
return None return None
@staticmethod @staticmethod

View file

@ -23,9 +23,10 @@ from .compat import (
) )
from .utils import ( from .utils import (
DateRange, DateRange,
DEFAULT_OUTTMPL,
decodeOption, decodeOption,
DEFAULT_OUTTMPL,
DownloadError, DownloadError,
match_filter_func,
MaxDownloadsReached, MaxDownloadsReached,
preferredencoding, preferredencoding,
read_batch_urls, read_batch_urls,
@ -247,6 +248,9 @@ def _real_main(argv=None):
xattr # Confuse flake8 xattr # Confuse flake8
except ImportError: except ImportError:
parser.error('setting filesize xattr requested but python-xattr is not available') parser.error('setting filesize xattr requested but python-xattr is not available')
match_filter = (
None if opts.match_filter is None
else match_filter_func(opts.match_filter))
ydl_opts = { ydl_opts = {
'usenetrc': opts.usenetrc, 'usenetrc': opts.usenetrc,
@ -344,6 +348,7 @@ def _real_main(argv=None):
'list_thumbnails': opts.list_thumbnails, 'list_thumbnails': opts.list_thumbnails,
'playlist_items': opts.playlist_items, 'playlist_items': opts.playlist_items,
'xattr_set_filesize': opts.xattr_set_filesize, 'xattr_set_filesize': opts.xattr_set_filesize,
'match_filter': match_filter,
} }
with YoutubeDL(ydl_opts) as ydl: with YoutubeDL(ydl_opts) as ydl:

View file

@ -244,6 +244,25 @@ def parseOpts(overrideArguments=None):
'--max-views', '--max-views',
metavar='COUNT', dest='max_views', default=None, type=int, metavar='COUNT', dest='max_views', default=None, type=int,
help='Do not download any videos with more than COUNT views') help='Do not download any videos with more than COUNT views')
selection.add_option(
'--match-filter',
metavar='FILTER', dest='match_filter', default=None,
help=(
'(Experimental) Generic video filter. '
'Specify any key (see help for -o for a list of available keys) to'
' match if the key is present, '
'!key to check if the key is not present,'
'key > NUMBER (like "comment_count > 12", also works with '
'>=, <, <=, !=, =) to compare against a number, and '
'& to require multiple matches. '
'Values which are not known are excluded unless you'
' put a question mark (?) after the operator.'
'For example, to only match videos that have been liked more than '
'100 times and disliked less than 50 times (or the dislike '
'functionality is not available at the given service), but who '
'also have a description, use --match-filter '
'"like_count > 100 & dislike_count <? 50 & description" .'
))
selection.add_option( selection.add_option(
'--no-playlist', '--no-playlist',
action='store_true', dest='noplaylist', default=False, action='store_true', dest='noplaylist', default=False,

View file

@ -17,6 +17,7 @@ import io
import json import json
import locale import locale
import math import math
import operator
import os import os
import pipes import pipes
import platform import platform
@ -1678,3 +1679,79 @@ def render_table(header_row, data):
max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)] max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s' format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
return '\n'.join(format_str % tuple(row) for row in table) return '\n'.join(format_str % tuple(row) for row in table)
def _match_one(filter_part, dct):
COMPARISON_OPERATORS = {
'<': operator.lt,
'<=': operator.le,
'>': operator.gt,
'>=': operator.ge,
'=': operator.eq,
'!=': operator.ne,
}
operator_rex = re.compile(r'''(?x)\s*
(?P<key>[a-z_]+)
\s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
(?:
(?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
(?P<strval>(?![0-9.])[a-z0-9A-Z]*)
)
\s*$
''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
m = operator_rex.search(filter_part)
if m:
op = COMPARISON_OPERATORS[m.group('op')]
if m.group('strval') is not None:
if m.group('op') not in ('=', '!='):
raise ValueError(
'Operator %s does not support string values!' % m.group('op'))
comparison_value = m.group('strval')
else:
try:
comparison_value = int(m.group('intval'))
except ValueError:
comparison_value = parse_filesize(m.group('intval'))
if comparison_value is None:
comparison_value = parse_filesize(m.group('intval') + 'B')
if comparison_value is None:
raise ValueError(
'Invalid integer value %r in filter part %r' % (
m.group('intval'), filter_part))
actual_value = dct.get(m.group('key'))
if actual_value is None:
return m.group('none_inclusive')
return op(actual_value, comparison_value)
UNARY_OPERATORS = {
'': lambda v: v is not None,
'!': lambda v: v is None,
}
operator_rex = re.compile(r'''(?x)\s*
(?P<op>%s)\s*(?P<key>[a-z_]+)
\s*$
''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
m = operator_rex.search(filter_part)
if m:
op = UNARY_OPERATORS[m.group('op')]
actual_value = dct.get(m.group('key'))
return op(actual_value)
raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
""" Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
return all(
_match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
def _match_func(info_dict):
if match_str(filter_str, info_dict):
return None
else:
video_title = info_dict.get('title', info_dict.get('id', 'video'))
return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
return _match_func