from __future__ import unicode_literals import json import random import re from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_HTTPError, compat_str, ) from ..utils import ( ExtractorError, orderedSet, unescapeHTML, urlencode_postdata, urljoin, ) class LinuxAcademyIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?:www\.)?linuxacademy\.com/cp/ (?: courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)| modules/view/id/(?P<course_id>\d+) ) ''' _TESTS = [{ 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154', 'info_dict': { 'id': '1498-2', 'ext': 'mp4', 'title': "Introduction to the Practitioner's Brief", }, 'params': { 'skip_download': True, }, 'skip': 'Requires Linux Academy account credentials', }, { 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', 'only_matching': True, }, { 'url': 'https://linuxacademy.com/cp/modules/view/id/154', 'info_dict': { 'id': '154', 'title': 'AWS Certified Cloud Practitioner', 'description': 'md5:039db7e60e4aac9cf43630e0a75fa834', }, 'playlist_count': 41, 'skip': 'Requires Linux Academy account credentials', }] _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' _ORIGIN_URL = 'https://linuxacademy.com' _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' _NETRC_MACHINE = 'linuxacademy' def _real_initialize(self): self._login() def _login(self): username, password = self._get_login_info() if username is None: return def random_string(): return ''.join([ random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') for _ in range(32)]) webpage, urlh = self._download_webpage_handle( self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ 'client_id': self._CLIENT_ID, 'response_type': 'token id_token', 'redirect_uri': self._ORIGIN_URL, 'scope': 'openid email user_impersonation profile', 'audience': self._ORIGIN_URL, 'state': random_string(), 'nonce': random_string(), }) login_data = self._parse_json( self._search_regex( r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'login info', group='value'), None, transform_source=lambda x: compat_b64decode(x).decode('utf-8') )['extraParams'] login_data.update({ 'client_id': self._CLIENT_ID, 'redirect_uri': self._ORIGIN_URL, 'tenant': 'lacausers', 'connection': 'Username-Password-Authentication', 'username': username, 'password': password, 'sso': 'true', }) login_state_url = compat_str(urlh.geturl()) try: login_page = self._download_webpage( 'https://login.linuxacademy.com/usernamepassword/login', None, 'Downloading login page', data=json.dumps(login_data).encode(), headers={ 'Content-Type': 'application/json', 'Origin': 'https://login.linuxacademy.com', 'Referer': login_state_url, }) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: error = self._parse_json(e.cause.read(), None) message = error.get('description') or error['code'] raise ExtractorError( '%s said: %s' % (self.IE_NAME, message), expected=True) raise callback_page, urlh = self._download_webpage_handle( 'https://login.linuxacademy.com/login/callback', None, 'Downloading callback page', data=urlencode_postdata(self._hidden_inputs(login_page)), headers={ 'Content-Type': 'application/x-www-form-urlencoded', 'Origin': 'https://login.linuxacademy.com', 'Referer': login_state_url, }) access_token = self._search_regex( r'access_token=([^=&]+)', compat_str(urlh.geturl()), 'access token') self._download_webpage( 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' % access_token, None, 'Downloading token validation page') def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id) webpage = self._download_webpage(url, item_id) # course path if course_id: entries = [ self.url_result( urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key()) for lesson_url in orderedSet(re.findall( r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)', webpage))] title = unescapeHTML(self._html_search_regex( (r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)', r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, 'title', default=None, group='value')) description = unescapeHTML(self._html_search_regex( r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'description', default=None, group='value')) return self.playlist_result(entries, course_id, title, description) # single video path info = self._extract_jwplayer_data( webpage, item_id, require_title=False, m3u8_id='hls',) title = self._search_regex( (r'>Lecture\s*:\s*(?P<value>[^<]+)', r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, 'title', group='value') info.update({ 'id': item_id, 'title': title, }) return info