[tiktok] Improve extraction and add support for user pages (closes #18135)

This commit is contained in:
Sergey M․ 2018-12-02 02:39:22 +07:00
parent 1ead840d2c
commit ce18a19be9
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
2 changed files with 91 additions and 50 deletions

View file

@ -1124,7 +1124,10 @@ from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE from .thisoldhouse import ThisOldHouseIE
from .threeqsdn import ThreeQSDNIE from .threeqsdn import ThreeQSDNIE
from .tiktok import TikTokIE from .tiktok import (
TikTokIE,
TikTokUserIE,
)
from .tinypic import TinyPicIE from .tinypic import TinyPicIE
from .tmz import ( from .tmz import (
TMZIE, TMZIE,

View file

@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
compat_str, compat_str,
ExtractorError,
int_or_none, int_or_none,
str_or_none, str_or_none,
try_get, try_get,
@ -11,69 +12,106 @@ from ..utils import (
) )
class TikTokIE(InfoExtractor): class TikTokBaseIE(InfoExtractor):
_VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>[0-9]+)' def _extract_aweme(self, data):
video = data['video']
description = str_or_none(try_get(data, lambda x: x['desc']))
width = int_or_none(try_get(data, lambda x: video['width']))
height = int_or_none(try_get(data, lambda x: video['height']))
format_urls = set()
formats = []
for format_id in (
'play_addr_lowbr', 'play_addr', 'play_addr_h264',
'download_addr'):
for format in try_get(
video, lambda x: x[format_id]['url_list'], list) or []:
format_url = url_or_none(format)
if not format_url:
continue
if format_url in format_urls:
continue
format_urls.add(format_url)
formats.append({
'url': format_url,
'ext': 'mp4',
'height': height,
'width': width,
})
self._sort_formats(formats)
thumbnail = url_or_none(try_get(
video, lambda x: x['cover']['url_list'][0], compat_str))
uploader = try_get(data, lambda x: x['author']['nickname'], compat_str)
timestamp = int_or_none(data.get('create_time'))
comment_count = int_or_none(data.get('comment_count')) or int_or_none(
try_get(data, lambda x: x['statistics']['comment_count']))
repost_count = int_or_none(try_get(
data, lambda x: x['statistics']['share_count']))
aweme_id = data['aweme_id']
return {
'id': aweme_id,
'title': uploader or aweme_id,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'timestamp': timestamp,
'comment_count': comment_count,
'repost_count': repost_count,
'formats': formats,
}
class TikTokIE(TikTokBaseIE):
_VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>\d+)'
_TEST = { _TEST = {
'url': 'https://m.tiktok.com/v/6606727368545406213.html', 'url': 'https://m.tiktok.com/v/6606727368545406213.html',
'md5': 'd584b572e92fcd48888051f238022420', 'md5': 'd584b572e92fcd48888051f238022420',
'info_dict': { 'info_dict': {
'id': '6606727368545406213', 'id': '6606727368545406213',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Zureeal on TikTok', 'title': 'Zureeal',
'thumbnail': r're:^https?://.*~noop.image',
'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay',
'thumbnail': r're:^https?://.*~noop.image',
'uploader': 'Zureeal', 'uploader': 'Zureeal',
'width': 540, 'timestamp': 1538248586,
'height': 960, 'upload_date': '20180929',
'comment_count': int,
'repost_count': int,
} }
} }
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
data = self._parse_json(self._search_regex(
r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id)
return self._extract_aweme(data)
data = self._parse_json(
self._search_regex(
r'var\s+data\s*=\s*({.+?});', webpage, 'data'
), video_id)
title = self._og_search_title(webpage) class TikTokUserIE(TikTokBaseIE):
_VALID_URL = r'https?://(?:m\.)?tiktok\.com/h5/share/usr/(?P<id>\d+)'
_TEST = {
'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html',
'info_dict': {
'id': '188294915489964032',
},
'playlist_mincount': 24,
}
description = str_or_none(try_get(data, lambda x: x['desc'])) def _real_extract(self, url):
width = int_or_none(try_get(data, lambda x: x['video']['width'])) user_id = self._match_id(url)
height = int_or_none(try_get(data, lambda x: x['video']['height'])) data = self._download_json(
'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id,
formats = [] query={'_signature': '_'})
entries = []
for count, (key, label) in enumerate((('play_addr_lowbr', 'Low'), ('play_addr', 'Normal'), ('download_addr', 'Download')), -2): for aweme in data['aweme_list']:
for format in try_get(data, lambda x: x['video'][key]['url_list']): try:
format_url = url_or_none(format) entry = self._extract_aweme(aweme)
if not format_url: except ExtractorError:
continue continue
formats.append({ entry['extractor_key'] = TikTokIE.ie_key()
'url': format_url, entries.append(entry)
'ext': 'mp4', return self.playlist_result(entries, user_id)
'height': height,
'width': width,
'format_note': label,
'quality': count
})
self._sort_formats(formats)
uploader = try_get(data, lambda x: x['author']['nickname'], compat_str)
thumbnail = url_or_none(
try_get(
data, lambda x: x['video']['cover']['url_list'][0], compat_str))
return {
'id': video_id,
'title': title,
'description': description,
'uploader': uploader,
'formats': formats,
'thumbnail': thumbnail,
'width': width,
'height': height,
}