diff --git a/src/you_get/extractors/icourses.py b/src/you_get/extractors/icourses.py index 5f9b8edf..5c2f8cda 100644 --- a/src/you_get/extractors/icourses.py +++ b/src/you_get/extractors/icourses.py @@ -13,8 +13,9 @@ __all__ = ['icourses_download'] def icourses_download(url, info_only, merge=False, output_dir='.', **kwargs): - title, real_url = icourses_cn_url_parser( - url, info_only=info_only, **kwargs) + icourses_parser = ICousesExactor(url=url) + real_url = icourses_parser.icourses_cn_url_parser(**kwargs) + title = icourses_parser.title if real_url is not None: for tries in range(0, 3): try: @@ -22,108 +23,120 @@ def icourses_download(url, info_only, merge=False, output_dir='.', **kwargs): break except error.HTTPError: logging.warning('Failed to fetch the video file! Retrying...') - title, real_url = icourses_cn_url_parser(url) + real_url = icourses_parser.icourses_cn_url_parser() + title = icourses_parser.title print_info(site_info, title, type_, size) if not info_only: download_urls([real_url], title, 'flv', total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True) -def icourses_playlist_download(url, **kwargs): - import random - from time import sleep - html = get_content(url) - page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' - video_js_number = r'changeforvideo\((.*?)\)' - fs_flag = r'' - page_navi_vars = re.search(pattern=page_type_patt, string=html) - dummy_page = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'.format( - page_navi_vars.group(2), page_navi_vars.group(1)) - html = get_content(dummy_page) - fs_status = match1(html, fs_flag) - video_list = re.findall(pattern=video_js_number, string=html) - for video in video_list: - video_args = video.replace('\'', '').split(',') - video_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}&firstShowFlag={}'.format( - video_args[0], video_args[1], fs_status or '1') - sleep(random.Random().randint(0, 5)) # Prevent from blockage - icourses_download(url=video_url, **kwargs) +# Why not using VideoExtractor: This site needs specical download method +class ICousesExactor(object): + def __init__(self, url): + self.url = url + self.title = '' + return -def icourses_cn_url_parser(url, **kwargs): - PLAYER_BASE_VER = '150606-1' - ENCRYPT_MOD_VER = '151020' - ENCRYPT_SALT = '3DAPmXsZ4o' # It took really long time to find this... - html = get_content(url) - if re.search(pattern=r'showSectionNode\(.*\)', string=html): - logging.warning('Switching to playlist mode!') - return icourses_playlist_download(url, **kwargs) - flashvars_patt = r'var\ flashvars\=((.|\n)*)};' - server_time_patt = r'MPlayer.swf\?v\=(\d+)' - uuid_patt = r'uuid:(\d+)' - other_args_patt = r'other:"(.*)"' - res_url_patt = r'IService:\'([^\']+)' - title_a_patt = r'
(.*?)' - title_b_patt = r'
((.|\n)*?)
' - title_a = match1(html, title_a_patt).strip() - title_b = match1(html, title_b_patt).strip() - title = title_a + title_b # WIP, FIXME - title = re.sub('( +|\n|\t|\r|\ \;)', '', - unescape_html(title).replace(' ', '')) - server_time = match1(html, server_time_patt) - flashvars = match1(html, flashvars_patt) - uuid = match1(flashvars, uuid_patt) - other_args = match1(flashvars, other_args_patt) - res_url = match1(flashvars, res_url_patt) - url_parts = {'v': server_time, 'other': other_args, - 'uuid': uuid, 'IService': res_url} - req_url = '%s?%s' % (res_url, parse.urlencode(url_parts)) - logging.debug('Requesting video resource location...') - xml_resp = get_html(req_url) - xml_obj = ET.fromstring(xml_resp) - logging.debug('The result was {}'.format(xml_obj.get('status'))) - if xml_obj.get('status') != 'success': - raise ValueError('Server returned error!') - common_args = {'lv': PLAYER_BASE_VER, 'ls': 'play', - 'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'), - 'start': 0} - media_host = xml_obj.find(".//*[@name='host']").text - media_url = media_host + xml_obj.find(".//*[@name='url']").text - # This is what they called `SSLModule`... But obviously, just a kind of - # encryption, takes absolutely no effect in protecting data intergrity - if xml_obj.find(".//*[@name='ssl']").text != 'true': - logging.debug('The encryption mode is disabled') - # when the so-called `SSLMode` is not activated, the parameters, `h` - # and `p` can be found in response - arg_h = xml_obj.find(".//*[@name='h']").text - assert arg_h - arg_r = xml_obj.find(".//*[@name='p']").text or ENCRYPT_MOD_VER + def icourses_playlist_download(self, **kwargs): + import random + from time import sleep + html = get_content(url) + page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' + video_js_number = r'changeforvideo\((.*?)\)' + fs_flag = r'' + page_navi_vars = re.search(pattern=page_type_patt, string=html) + dummy_page = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'.format( + page_navi_vars.group(2), page_navi_vars.group(1)) + html = get_content(dummy_page) + fs_status = match1(html, fs_flag) + video_list = re.findall(pattern=video_js_number, string=html) + for video in video_list: + video_args = video.replace('\'', '').split(',') + video_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}&firstShowFlag={}'.format( + video_args[0], video_args[1], fs_status or '1') + sleep(random.Random().randint(0, 5)) # Prevent from blockage + icourses_download(video_url, **kwargs) + + def icourses_cn_url_parser(self, **kwargs): + PLAYER_BASE_VER = '150606-1' + ENCRYPT_MOD_VER = '151020' + ENCRYPT_SALT = '3DAPmXsZ4o' # It took really long time to find this... + html = get_content(self.url) + if re.search(pattern=r'showSectionNode\(.*\)', string=html): + logging.warning('Switching to playlist mode!') + return self.icourses_playlist_download(**kwargs) + flashvars_patt = r'var\ flashvars\=((.|\n)*)};' + server_time_patt = r'MPlayer.swf\?v\=(\d+)' + uuid_patt = r'uuid:(\d+)' + other_args_patt = r'other:"(.*)"' + res_url_patt = r'IService:\'([^\']+)' + title_a_patt = r'
(.*?)' + title_b_patt = r'
((.|\n)*?)
' + title_a = match1(html, title_a_patt).strip() + title_b = match1(html, title_b_patt).strip() + title = title_a + title_b # WIP, FIXME + title = re.sub('( +|\n|\t|\r|\ \;)', '', + unescape_html(title).replace(' ', '')) + server_time = match1(html, server_time_patt) + flashvars = match1(html, flashvars_patt) + uuid = match1(flashvars, uuid_patt) + other_args = match1(flashvars, other_args_patt) + res_url = match1(flashvars, res_url_patt) + url_parts = {'v': server_time, 'other': other_args, + 'uuid': uuid, 'IService': res_url} + req_url = '%s?%s' % (res_url, parse.urlencode(url_parts)) + logging.debug('Requesting video resource location...') + xml_resp = get_html(req_url) + xml_obj = ET.fromstring(xml_resp) + logging.debug('The result was {}'.format(xml_obj.get('status'))) + if xml_obj.get('status') != 'success': + raise ValueError('Server returned error!') + common_args = {'lv': PLAYER_BASE_VER, 'ls': 'play', + 'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'), + 'start': 0} + media_host = xml_obj.find(".//*[@name='host']").text + media_url = media_host + xml_obj.find(".//*[@name='url']").text + # This is what they called `SSLModule`... But obviously, just a kind of + # encryption, takes absolutely no effect in protecting data intergrity + if xml_obj.find(".//*[@name='ssl']").text != 'true': + logging.debug('The encryption mode is disabled') + # when the so-called `SSLMode` is not activated, the parameters, `h` + # and `p` can be found in response + arg_h = xml_obj.find(".//*[@name='h']").text + assert arg_h + arg_r = xml_obj.find(".//*[@name='p']").text or ENCRYPT_MOD_VER + url_args = common_args.copy() + url_args.update({'h': arg_h, 'r': arg_r}) + final_url = '{}?{}'.format( + media_url, parse.urlencode(url_args)) + self.title = title + return final_url + # when the `SSLMode` is activated, we need to receive the timestamp and the + # time offset (?) value from the server + logging.debug('The encryption mode is in effect') + ssl_callback = get_html( + '{}/ssl/ssl.shtml'.format(media_host)).split(',') + ssl_timestamp = int(datetime.datetime.strptime( + ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0])) + sign_this = ENCRYPT_SALT + \ + parse.urlparse(media_url).path + str(ssl_timestamp) + arg_h = base64.b64encode(hashlib.md5( + bytes(sign_this, 'utf-8')).digest()) + # Post-processing, may subject to change, so leaving this alone... + arg_h = arg_h.decode('utf-8').strip('=').replace('+', + '-').replace('/', '_') + arg_r = ssl_timestamp url_args = common_args.copy() - url_args.update({'h': arg_h, 'r': arg_r}) + url_args.update({'h': arg_h, 'r': arg_r, 'p': ENCRYPT_MOD_VER}) final_url = '{}?{}'.format( media_url, parse.urlencode(url_args)) - return title, final_url - # when the `SSLMode` is activated, we need to receive the timestamp and the - # time offset (?) value from the server - logging.debug('The encryption mode is in effect') - ssl_callback = get_html('{}/ssl/ssl.shtml'.format(media_host)).split(',') - ssl_timestamp = int(datetime.datetime.strptime( - ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0])) - sign_this = ENCRYPT_SALT + \ - parse.urlparse(media_url).path + str(ssl_timestamp) - arg_h = base64.b64encode(hashlib.md5(bytes(sign_this, 'utf-8')).digest()) - # Post-processing, may subject to change, so leaving this alone... - arg_h = arg_h.decode('utf-8').strip('=').replace('+', - '-').replace('/', '_') - arg_r = ssl_timestamp - url_args = common_args.copy() - url_args.update({'h': arg_h, 'r': arg_r, 'p': ENCRYPT_MOD_VER}) - final_url = '{}?{}'.format( - media_url, parse.urlencode(url_args)) - logging.debug('Concat`ed URL: {}'.format(final_url)) - return title, final_url + logging.debug('Crafted URL: {}'.format(final_url)) + self.title = title + return final_url site_info = 'icourses.cn' download = icourses_download -download_playlist = icourses_playlist_download +# download_playlist = icourses_playlist_download