[generic] Ignore some non-video file extensions during generic extraction (Closes #3900)
This commit is contained in:
parent
842cca7d56
commit
ced659bb4d
1 changed files with 19 additions and 15 deletions
|
@ -847,47 +847,51 @@ class GenericIE(InfoExtractor):
|
||||||
if mobj is not None:
|
if mobj is not None:
|
||||||
return self.url_result(mobj.group('url'), 'MLB')
|
return self.url_result(mobj.group('url'), 'MLB')
|
||||||
|
|
||||||
|
def check_video(vurl):
|
||||||
|
vpath = compat_urlparse.urlparse(vurl).path
|
||||||
|
vext = determine_ext(vpath)
|
||||||
|
return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
|
||||||
|
|
||||||
|
def filter_video(urls):
|
||||||
|
return list(filter(check_video, urls))
|
||||||
|
|
||||||
# Start with something easy: JW Player in SWFObject
|
# Start with something easy: JW Player in SWFObject
|
||||||
found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
|
found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
|
||||||
if not found:
|
if not found:
|
||||||
# Look for gorilla-vid style embedding
|
# Look for gorilla-vid style embedding
|
||||||
found = re.findall(r'''(?sx)
|
found = filter_video(re.findall(r'''(?sx)
|
||||||
(?:
|
(?:
|
||||||
jw_plugins|
|
jw_plugins|
|
||||||
JWPlayerOptions|
|
JWPlayerOptions|
|
||||||
jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
|
jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
|
||||||
)
|
)
|
||||||
.*?file\s*:\s*["\'](.*?)["\']''', webpage)
|
.*?file\s*:\s*["\'](.*?)["\']''', webpage))
|
||||||
if not found:
|
if not found:
|
||||||
# Broaden the search a little bit
|
# Broaden the search a little bit
|
||||||
found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
|
found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
|
||||||
if not found:
|
if not found:
|
||||||
# Broaden the findall a little bit: JWPlayer JS loader
|
# Broaden the findall a little bit: JWPlayer JS loader
|
||||||
found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
|
found = filter_video(re.findall(
|
||||||
|
r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
|
||||||
if not found:
|
if not found:
|
||||||
# Flow player
|
# Flow player
|
||||||
found = re.findall(r'''(?xs)
|
found = filter_video(re.findall(r'''(?xs)
|
||||||
flowplayer\("[^"]+",\s*
|
flowplayer\("[^"]+",\s*
|
||||||
\{[^}]+?\}\s*,
|
\{[^}]+?\}\s*,
|
||||||
\s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
|
\s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
|
||||||
["']?url["']?\s*:\s*["']([^"']+)["']
|
["']?url["']?\s*:\s*["']([^"']+)["']
|
||||||
''', webpage)
|
''', webpage))
|
||||||
if not found:
|
if not found:
|
||||||
# Try to find twitter cards info
|
# Try to find twitter cards info
|
||||||
found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
|
found = filter_video(re.findall(
|
||||||
|
r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
|
||||||
if not found:
|
if not found:
|
||||||
# We look for Open Graph info:
|
# We look for Open Graph info:
|
||||||
# We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
|
# We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
|
||||||
m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
|
m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
|
||||||
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
|
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
|
||||||
if m_video_type is not None:
|
if m_video_type is not None:
|
||||||
def check_video(vurl):
|
found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
|
||||||
vpath = compat_urlparse.urlparse(vurl).path
|
|
||||||
vext = determine_ext(vpath)
|
|
||||||
return '.' in vpath and vext not in ('swf', 'png', 'jpg')
|
|
||||||
found = list(filter(
|
|
||||||
check_video,
|
|
||||||
re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)))
|
|
||||||
if not found:
|
if not found:
|
||||||
# HTML5 video
|
# HTML5 video
|
||||||
found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
|
found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
|
||||||
|
|
Loading…
Reference in a new issue