Merge remote-tracking branch 'Huarong/master'

This commit is contained in:
Philipp Hagemeister 2013-08-28 13:10:59 +02:00
commit f8b362739e
3 changed files with 98 additions and 2 deletions
youtube_dl/extractor

View file

@ -70,6 +70,7 @@ from .roxwel import RoxwelIE
from .rtlnow import RTLnowIE from .rtlnow import RTLnowIE
from .sina import SinaIE from .sina import SinaIE
from .slashdot import SlashdotIE from .slashdot import SlashdotIE
from .sohu import SohuIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE from .soundcloud import SoundcloudIE, SoundcloudSetIE
from .spiegel import SpiegelIE from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE from .stanfordoc import StanfordOpenClassroomIE

View file

@ -21,8 +21,10 @@ class KankanIE(InfoExtractor):
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, u'video title')
gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0)
gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls)
gcid = gcids[-1]
video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid, video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid,
video_id, u'Downloading video url info') video_id, u'Downloading video url info')

View file

@ -0,0 +1,93 @@
# encoding: utf-8
import re
import json
import time
import logging
import urllib2
from .common import InfoExtractor
from ..utils import compat_urllib_request, clean_html
class SohuIE(InfoExtractor):
_VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?'
_TEST = {
u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
u'file': u'382479172.flv',
u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b',
u'info_dict': {
u'title': u'The Illest - Far East Movement Riff Raff',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
pattern = r'<title>(.+?)</title>'
compiled = re.compile(pattern, re.DOTALL)
title = self._search_regex(compiled, webpage, u'video title')
title = clean_html(title).split('-')[0].strip()
self.to_screen('Title: %s' % title)
pattern = re.compile(r'var vid="(\d+)"')
result = re.search(pattern, webpage)
if not result:
logging.info('[Sohu] could not get vid')
return None
vid = result.group(1)
logging.info('vid: %s' % vid)
base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
url_1 = base_url_1 + vid
logging.info('json url: %s' % url_1)
webpage = self._download_webpage(url_1, vid)
json_1 = json.loads(webpage)
# get the highest definition video vid and json infomation.
vids = []
qualities = ('oriVid', 'superVid', 'highVid', 'norVid')
for vid_name in qualities:
vids.append(json_1['data'][vid_name])
clearest_vid = 0
for i, v in enumerate(vids):
if v != 0:
clearest_vid = v
logging.info('quality definition: %s' % qualities[i][:-3])
break
if not clearest_vid:
logging.warning('could not find valid clearest_vid')
return None
if vid != clearest_vid:
url_1 = '%s%d' % (base_url_1, clearest_vid)
logging.info('highest definition json url: %s' % url_1)
json_1 = json.loads(urllib2.urlopen(url_1).read())
allot = json_1['allot']
prot = json_1['prot']
clipsURL = json_1['data']['clipsURL']
su = json_1['data']['su']
num_of_parts = json_1['data']['totalBlocks']
logging.info('Total parts: %d' % num_of_parts)
base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]'
files_info = []
for i in range(num_of_parts):
self.to_screen('Geting json infomation of part %s/%s' % (i + 1, num_of_parts))
middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i])
logging.info('middle url part %d: %s' % (i, middle_url))
middle_info = urllib2.urlopen(middle_url).read().split('|')
middle_part_1 = middle_info[0]
download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3])
info = {
'id': '%s_part%02d' % (video_id, i + 1),
'title': title,
'url': download_url,
'ext': 'mp4',
}
files_info.append(info)
time.sleep(1)
if num_of_parts == 1:
info = files_info[0]
info['id'] = video_id
return info
return files_info