[vk] improve extraction

- add support for Odnoklassniki embeds
- update tests
- extract more video from user lists(closes #4470)
- fix wall post audio extraction(closes #18332)
- improve error detection(closes #22568)
This commit is contained in:
Remita Amine 2019-10-25 19:35:07 +01:00
parent 416c3ca7f5
commit 3c989818e7

View file

@ -12,7 +12,6 @@ from ..utils import (
get_element_by_class, get_element_by_class,
int_or_none, int_or_none,
orderedSet, orderedSet,
remove_start,
str_or_none, str_or_none,
str_to_int, str_to_int,
unescapeHTML, unescapeHTML,
@ -21,6 +20,7 @@ from ..utils import (
urlencode_postdata, urlencode_postdata,
) )
from .dailymotion import DailymotionIE from .dailymotion import DailymotionIE
from .odnoklassniki import OdnoklassnikiIE
from .pladform import PladformIE from .pladform import PladformIE
from .vimeo import VimeoIE from .vimeo import VimeoIE
from .youtube import YoutubeIE from .youtube import YoutubeIE
@ -60,6 +60,18 @@ class VKBaseIE(InfoExtractor):
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
def _download_payload(self, path, video_id, data, fatal=True):
data['al'] = 1
code, payload = self._download_json(
'https://vk.com/%s.php' % path, video_id,
data=urlencode_postdata(data), fatal=fatal,
headers={'X-Requested-With': 'XMLHttpRequest'})['payload']
if code == '3':
self.raise_login_required()
elif code == '8':
raise ExtractorError(clean_html(payload[0][1:-1]), expected=True)
return payload
class VKIE(VKBaseIE): class VKIE(VKBaseIE):
IE_NAME = 'vk' IE_NAME = 'vk'
@ -96,7 +108,6 @@ class VKIE(VKBaseIE):
}, },
{ {
'url': 'http://vk.com/video205387401_165548505', 'url': 'http://vk.com/video205387401_165548505',
'md5': '6c0aeb2e90396ba97035b9cbde548700',
'info_dict': { 'info_dict': {
'id': '205387401_165548505', 'id': '205387401_165548505',
'ext': 'mp4', 'ext': 'mp4',
@ -110,18 +121,18 @@ class VKIE(VKBaseIE):
}, },
{ {
'note': 'Embedded video', 'note': 'Embedded video',
'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', 'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa',
'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', 'md5': '7babad3b85ea2e91948005b1b8b0cb84',
'info_dict': { 'info_dict': {
'id': '32194266_162925554', 'id': '-77521_162222515',
'ext': 'mp4', 'ext': 'mp4',
'uploader': 'Vladimir Gavrin', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
'title': 'Lin Dan', 'title': 'ProtivoGunz - Хуёвая песня',
'duration': 101, 'duration': 195,
'upload_date': '20120730', 'upload_date': '20120212',
'view_count': int, 'timestamp': 1329049880,
'uploader_id': '-77521',
}, },
'skip': 'This video has been removed from public access.',
}, },
{ {
# VIDEO NOW REMOVED # VIDEO NOW REMOVED
@ -138,18 +149,19 @@ class VKIE(VKBaseIE):
'upload_date': '20121218', 'upload_date': '20121218',
'view_count': int, 'view_count': int,
}, },
'skip': 'Requires vk account credentials', 'skip': 'Removed',
}, },
{ {
'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d',
'md5': '4d7a5ef8cf114dfa09577e57b2993202',
'info_dict': { 'info_dict': {
'id': '-43215063_168067957', 'id': '-43215063_168067957',
'ext': 'mp4', 'ext': 'mp4',
'uploader': 'Киномания - лучшее из мира кино', 'uploader': 'Bro Mazter',
'title': ' ', 'title': ' ',
'duration': 7291, 'duration': 7291,
'upload_date': '20140328', 'upload_date': '20140328',
'uploader_id': '223413403',
'timestamp': 1396018030,
}, },
'skip': 'Requires vk account credentials', 'skip': 'Requires vk account credentials',
}, },
@ -165,7 +177,7 @@ class VKIE(VKBaseIE):
'upload_date': '20140626', 'upload_date': '20140626',
'view_count': int, 'view_count': int,
}, },
'skip': 'Only works from Russia', 'skip': 'Removed',
}, },
{ {
# video (removed?) only available with list id # video (removed?) only available with list id
@ -247,6 +259,9 @@ class VKIE(VKBaseIE):
'uploader_id': '-387766', 'uploader_id': '-387766',
'timestamp': 1475137527, 'timestamp': 1475137527,
}, },
'params': {
'skip_download': True,
},
}, },
{ {
# live stream, hls and rtmp links, most likely already finished live # live stream, hls and rtmp links, most likely already finished live
@ -288,17 +303,27 @@ class VKIE(VKBaseIE):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid') video_id = mobj.group('videoid')
mv_data = {}
if video_id: if video_id:
info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id data = {
'act': 'show_inline',
'video': video_id,
}
# Some videos (removed?) can only be downloaded with list id specified # Some videos (removed?) can only be downloaded with list id specified
list_id = mobj.group('list_id') list_id = mobj.group('list_id')
if list_id: if list_id:
info_url += '&list=%s' % list_id data['list'] = list_id
payload = self._download_payload('al_video', video_id, data)
info_page = payload[1]
opts = payload[-1]
mv_data = opts.get('mvData') or {}
player = opts.get('player') or {}
else: else:
info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
info_page = self._download_webpage(info_url, video_id) info_page = self._download_webpage(
'http://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id)
error_message = self._html_search_regex( error_message = self._html_search_regex(
[r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
@ -351,17 +376,21 @@ class VKIE(VKBaseIE):
if re.search(error_re, info_page): if re.search(error_re, info_page):
raise ExtractorError(error_msg % video_id, expected=True) raise ExtractorError(error_msg % video_id, expected=True)
player = self._parse_json(self._search_regex(
r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n',
info_page, 'player params'), video_id)
youtube_url = YoutubeIE._extract_url(info_page) youtube_url = YoutubeIE._extract_url(info_page)
if youtube_url: if youtube_url:
return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) return self.url_result(youtube_url, YoutubeIE.ie_key())
vimeo_url = VimeoIE._extract_url(url, info_page) vimeo_url = VimeoIE._extract_url(url, info_page)
if vimeo_url is not None: if vimeo_url is not None:
return self.url_result(vimeo_url) return self.url_result(vimeo_url, VimeoIE.ie_key())
pladform_url = PladformIE._extract_url(info_page) pladform_url = PladformIE._extract_url(info_page)
if pladform_url: if pladform_url:
return self.url_result(pladform_url) return self.url_result(pladform_url, PladformIE.ie_key())
m_rutube = re.search( m_rutube = re.search(
r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page) r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page)
@ -374,6 +403,10 @@ class VKIE(VKBaseIE):
if dailymotion_urls: if dailymotion_urls:
return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key())
odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page)
if odnoklassniki_url:
return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
if m_opts: if m_opts:
m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
@ -383,38 +416,7 @@ class VKIE(VKBaseIE):
opts_url = 'http:' + opts_url opts_url = 'http:' + opts_url
return self.url_result(opts_url) return self.url_result(opts_url)
# vars does not look to be served anymore since 24.10.2016 data = player['params'][0]
data = self._parse_json(
self._search_regex(
r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'),
video_id, fatal=False)
# <!json> is served instead
if not data:
data = self._parse_json(
self._search_regex(
[r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'],
info_page, 'json', default='{}'),
video_id)
if data:
data = data['player']['params'][0]
if not data:
data = self._parse_json(
self._search_regex(
r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page,
'player params', default='{}'),
video_id)
if data:
data = data['params'][0]
# <!--{...}
if not data:
data = self._parse_json(
self._search_regex(
r'<!--\s*({.+})', info_page, 'payload'),
video_id)['payload'][-1][-1]['player']['params'][0]
title = unescapeHTML(data['md_title']) title = unescapeHTML(data['md_title'])
# 2 = live # 2 = live
@ -463,12 +465,12 @@ class VKIE(VKBaseIE):
'title': title, 'title': title,
'thumbnail': data.get('jpg'), 'thumbnail': data.get('jpg'),
'uploader': data.get('md_author'), 'uploader': data.get('md_author'),
'uploader_id': str_or_none(data.get('author_id')), 'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')),
'duration': data.get('duration'), 'duration': int_or_none(data.get('duration') or mv_data.get('duration')),
'timestamp': timestamp, 'timestamp': timestamp,
'view_count': view_count, 'view_count': view_count,
'like_count': int_or_none(data.get('liked')), 'like_count': int_or_none(mv_data.get('likes')),
'dislike_count': int_or_none(data.get('nolikes')), 'comment_count': int_or_none(mv_data.get('commcount')),
'is_live': is_live, 'is_live': is_live,
} }
@ -482,7 +484,6 @@ class VKUserVideosIE(VKBaseIE):
'url': 'http://vk.com/videos205387401', 'url': 'http://vk.com/videos205387401',
'info_dict': { 'info_dict': {
'id': '205387401', 'id': '205387401',
'title': "Tom Cruise's Videos",
}, },
'playlist_mincount': 4, 'playlist_mincount': 4,
}, { }, {
@ -498,22 +499,25 @@ class VKUserVideosIE(VKBaseIE):
'url': 'http://new.vk.com/videos205387401', 'url': 'http://new.vk.com/videos205387401',
'only_matching': True, 'only_matching': True,
}] }]
_VIDEO = collections.namedtuple(
'Video', ['owner_id', 'id', 'thumb', 'title', 'flags', 'duration', 'hash', 'moder_acts', 'owner', 'date', 'views', 'platform', 'blocked', 'music_video_meta'])
def _real_extract(self, url): def _real_extract(self, url):
page_id = self._match_id(url) page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id) l = self._download_payload('al_video', page_id, {
'act': 'load_videos_silent',
'oid': page_id,
})[0]['']['list']
entries = [ entries = []
self.url_result( for video in l:
'http://vk.com/video' + video_id, 'VK', video_id=video_id) v = self._VIDEO._make(video)
for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))] video_id = '%d_%d' % (v.owner_id, v.id)
entries.append(self.url_result(
'http://vk.com/video' + video_id, 'VK', video_id=video_id))
title = unescapeHTML(self._search_regex( return self.playlist_result(entries, page_id)
r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos',
webpage, 'title', default=page_id))
return self.playlist_result(entries, page_id, title)
class VKWallPostIE(VKBaseIE): class VKWallPostIE(VKBaseIE):
@ -523,15 +527,15 @@ class VKWallPostIE(VKBaseIE):
# public page URL, audio playlist # public page URL, audio playlist
'url': 'https://vk.com/bs.official?w=wall-23538238_35', 'url': 'https://vk.com/bs.official?w=wall-23538238_35',
'info_dict': { 'info_dict': {
'id': '23538238_35', 'id': '-23538238_35',
'title': 'Black Shadow - Wall post 23538238_35', 'title': 'Black Shadow - Wall post -23538238_35',
'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c',
}, },
'playlist': [{ 'playlist': [{
'md5': '5ba93864ec5b85f7ce19a9af4af080f6', 'md5': '5ba93864ec5b85f7ce19a9af4af080f6',
'info_dict': { 'info_dict': {
'id': '135220665_111806521', 'id': '135220665_111806521',
'ext': 'mp3', 'ext': 'mp4',
'title': 'Black Shadow - Слепое Верование', 'title': 'Black Shadow - Слепое Верование',
'duration': 370, 'duration': 370,
'uploader': 'Black Shadow', 'uploader': 'Black Shadow',
@ -542,18 +546,16 @@ class VKWallPostIE(VKBaseIE):
'md5': '4cc7e804579122b17ea95af7834c9233', 'md5': '4cc7e804579122b17ea95af7834c9233',
'info_dict': { 'info_dict': {
'id': '135220665_111802303', 'id': '135220665_111802303',
'ext': 'mp3', 'ext': 'mp4',
'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!',
'duration': 423, 'duration': 423,
'uploader': 'Black Shadow', 'uploader': 'Black Shadow',
'artist': 'Black Shadow', 'artist': 'Black Shadow',
'track': 'Война - Негасимое Бездны Пламя!', 'track': 'Война - Негасимое Бездны Пламя!',
}, },
'params': {
'skip_download': True,
},
}], }],
'params': { 'params': {
'skip_download': True,
'usenetrc': True, 'usenetrc': True,
}, },
'skip': 'Requires vk account credentials', 'skip': 'Requires vk account credentials',
@ -562,7 +564,7 @@ class VKWallPostIE(VKBaseIE):
'url': 'https://vk.com/wall85155021_6319', 'url': 'https://vk.com/wall85155021_6319',
'info_dict': { 'info_dict': {
'id': '85155021_6319', 'id': '85155021_6319',
'title': 'Sergey Gorbunov - Wall post 85155021_6319', 'title': 'Сергей Горбунов - Wall post 85155021_6319',
}, },
'playlist_count': 1, 'playlist_count': 1,
'params': { 'params': {
@ -578,57 +580,72 @@ class VKWallPostIE(VKBaseIE):
'url': 'https://m.vk.com/wall-23538238_35', 'url': 'https://m.vk.com/wall-23538238_35',
'only_matching': True, 'only_matching': True,
}] }]
_BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/='
_AUDIO = collections.namedtuple(
'Audio', ['id', 'owner_id', 'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads', 'subtitle', 'main_artists', 'feat_artists', 'album', 'track_code', 'restriction', 'album_part', 'new_stats', 'access_key'])
def _decode(self, enc):
dec = ''
e = n = 0
for c in enc:
r = self._BASE64_CHARS.index(c)
cond = n % 4
e = 64 * e + r if cond else r
n += 1
if cond:
dec += chr(255 & e >> (-2 * n & 6))
return dec
def _unmask_url(self, mask_url, vk_id):
if 'audio_api_unavailable' in mask_url:
extra = mask_url.split('?extra=')[1].split('#')
func, base = self._decode(extra[1]).split(chr(11))
assert (func == 'i')
mask_url = list(self._decode(extra[0]))
url_len = len(mask_url)
indexes = [None] * url_len
index = int(base) ^ vk_id
for n in range(url_len - 1, -1, -1):
index = (url_len * (n + 1) ^ index + n) % url_len
indexes[n] = index
for n in range(1, url_len):
c = mask_url[n]
index = indexes[url_len - 1 - n]
mask_url[n] = mask_url[index]
mask_url[index] = c
mask_url = ''.join(mask_url)
return mask_url
def _real_extract(self, url): def _real_extract(self, url):
post_id = self._match_id(url) post_id = self._match_id(url)
wall_url = 'https://vk.com/wall%s' % post_id webpage = self._download_payload('wkview', post_id, {
'act': 'show',
post_id = remove_start(post_id, '-') 'w': 'wall' + post_id,
})[1]
webpage = self._download_webpage(wall_url, post_id)
error = self._html_search_regex(
r'>Error</div>\s*<div[^>]+class=["\']body["\'][^>]*>([^<]+)',
webpage, 'error', default=None)
if error:
raise ExtractorError('VK said: %s' % error, expected=True)
description = clean_html(get_element_by_class('wall_post_text', webpage)) description = clean_html(get_element_by_class('wall_post_text', webpage))
uploader = clean_html(get_element_by_class('author', webpage)) uploader = clean_html(get_element_by_class('author', webpage))
thumbnail = self._og_search_thumbnail(webpage)
entries = [] entries = []
audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage) for audio in re.findall(r'data-audio="([^"]+)', webpage):
if audio_ids: audio = self._parse_json(unescapeHTML(audio), post_id)
al_audio = self._download_webpage( a = self._AUDIO._make(audio)
'https://vk.com/al_audio.php', post_id, if not a.url:
note='Downloading audio info', fatal=False, continue
data=urlencode_postdata({ title = unescapeHTML(a.title)
'act': 'reload_audio',
'al': '1',
'ids': ','.join(audio_ids)
}))
if al_audio:
Audio = collections.namedtuple(
'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration'])
audios = self._parse_json(
self._search_regex(
r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'),
post_id, fatal=False, transform_source=unescapeHTML)
if isinstance(audios, list):
for audio in audios:
a = Audio._make(audio[:6])
entries.append({ entries.append({
'id': '%s_%s' % (a.user_id, a.id), 'id': '%s_%s' % (a.owner_id, a.id),
'url': a.url, 'url': self._unmask_url(a.url, a.ads['vk_id']),
'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id, 'title': '%s - %s' % (a.performer, title) if a.performer else title,
'thumbnail': thumbnail, 'thumbnail': a.cover_url.split(',') if a.cover_url else None,
'duration': a.duration, 'duration': a.duration,
'uploader': uploader, 'uploader': uploader,
'artist': a.artist, 'artist': a.performer,
'track': a.track, 'track': title,
'ext': 'mp4',
'protocol': 'm3u8',
}) })
for video in re.finditer( for video in re.finditer(