#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Youku video extractor and playlist download helpers."""

from ..common import *
from ..extractor import VideoExtractor

import time
import traceback
import json
import urllib.request
import urllib.parse


def fetch_cna():
    """Return a URL-quoted ``cna`` cookie value to use as the ``utid``.

    Lookup order:
      1. a ``cna`` cookie for ``.youku.com`` among the imported ``cookies``;
      2. a ``cna`` cookie set by ``http://log.mmstat.com/eg.js``;
      3. a hard-coded fallback value (a warning is logged).
    """

    def quote_cna(val):
        # A value that already contains '%' is treated as already quoted.
        if '%' in val:
            return val
        return urllib.parse.quote(val)

    if cookies:
        for cookie in cookies:
            if cookie.name == 'cna' and cookie.domain == '.youku.com':
                log.i('Found cna in imported cookies. Use it')
                return quote_cna(cookie.value)

    url = 'http://log.mmstat.com/eg.js'
    # Close the response as soon as the headers have been read.
    with urllib.request.urlopen(url) as resp:
        headers = resp.getheaders()

    for header_name, header_value in headers:
        if header_name.lower() != 'set-cookie':
            continue
        # Only the leading "name=value" pair matters.  Split on the first
        # '=' only, because the value itself may contain '=' characters.
        first_pair = header_value.split(';')[0]
        name, _, value = first_pair.partition('=')
        if name == 'cna':
            return quote_cna(value)

    log.w('It seems that the client failed to fetch a cna cookie. Please load your own cookie if possible')
    return quote_cna('DOG4EdW4qzsCAbZyXbU+t7Jt')


class Youku(VideoExtractor):
    """Extractor that resolves Youku stream URLs through the UPS JSON API."""

    name = "优酷 (Youku)"
    mobile_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    dispatcher_url = 'vali.cp31.ott.cibntv.net'

    # Last updated: 2017-10-13
    stream_types = [
        {'id': 'hd3', 'container': 'flv', 'video_profile': '1080P'},
        {'id': 'hd3v2', 'container': 'flv', 'video_profile': '1080P'},
        {'id': 'mp4hd3', 'container': 'mp4', 'video_profile': '1080P'},
        {'id': 'mp4hd3v2', 'container': 'mp4', 'video_profile': '1080P'},

        {'id': 'hd2', 'container': 'flv', 'video_profile': '超清'},
        {'id': 'hd2v2', 'container': 'flv', 'video_profile': '超清'},
        {'id': 'mp4hd2', 'container': 'mp4', 'video_profile': '超清'},
        {'id': 'mp4hd2v2', 'container': 'mp4', 'video_profile': '超清'},

        {'id': 'mp4hd', 'container': 'mp4', 'video_profile': '高清'},
        # not really equivalent to mp4hd
        {'id': 'flvhd', 'container': 'flv', 'video_profile': '渣清'},
        {'id': '3gphd', 'container': 'mp4', 'video_profile': '渣清'},

        {'id': 'mp4sd', 'container': 'mp4', 'video_profile': '标清'},
        # obsolete?
        {'id': 'flv', 'container': 'flv', 'video_profile': '标清'},
        {'id': 'mp4', 'container': 'mp4', 'video_profile': '标清'},
    ]

    def __init__(self):
        """Initialise request headers and the per-video API state."""
        super().__init__()

        self.ua = self.__class__.mobile_ua
        self.referer = 'http://v.youku.com'

        self.page = None
        self.video_list = None
        self.video_next = None
        self.password = None
        self.api_data = None
        self.api_error_code = None
        self.api_error_msg = None

        self.ccode = '0510'
        # Found in http://g.alicdn.com/player/ykplayer/0.5.28/youku-player.min.js
        # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js
        self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND'
        self.utid = None

    def youku_ups(self):
        """Query the UPS API for ``self.vid`` and store the response.

        Sets ``self.api_data``; when the response carries an ``error`` entry,
        also sets ``self.api_error_code`` and ``self.api_error_msg``; when it
        carries a ``videos`` entry, sets ``self.video_list`` and
        ``self.video_next`` from its ``list`` / ``next`` members if present.
        """
        url = 'https://ups.youku.com/ups/get.json?vid={}&ccode={}'.format(self.vid, self.ccode)
        url += '&client_ip=192.168.1.1'
        url += '&utid=' + self.utid
        url += '&client_ts=' + str(int(time.time()))
        url += '&ckey=' + urllib.parse.quote(self.ckey)
        if self.password_protected:
            # Quote the password so characters such as '&', '#' or spaces
            # cannot corrupt the query string.
            url += '&password=' + urllib.parse.quote(self.password)

        headers = {'Referer': self.referer, 'User-Agent': self.ua}
        api_meta = json.loads(get_content(url, headers=headers))

        self.api_data = api_meta['data']
        data_error = self.api_data.get('error')
        if data_error:
            self.api_error_code = data_error.get('code')
            self.api_error_msg = data_error.get('note')

        videos = self.api_data.get('videos')
        if videos is not None:
            if 'list' in videos:
                self.video_list = videos['list']
            if 'next' in videos:
                self.video_next = videos['next']

    @classmethod
    def change_cdn(cls, url):
        """Return ``url`` with its host replaced by ``cls.dispatcher_url``.

        URLs that already contain the dispatcher host or ``k.youku.com`` are
        returned unchanged.
        """
        # If the cdn_url starts with an IP address it should be Youku's old
        # CDN, which randomly rejects HTTP requests with status codes > 400.
        # Switching to the aliCDN dispatcher does better: it is at least a
        # little more recoverable from HTTP 403.
        if cls.dispatcher_url in url or 'k.youku.com' in url:
            return url
        url_seg_list = list(urllib.parse.urlsplit(url))
        url_seg_list[1] = cls.dispatcher_url
        return urllib.parse.urlunsplit(url_seg_list)

    def get_vid_from_url(self):
        """Set ``self.vid`` from ``self.url`` if a known URL pattern matches.

        Raises:
            Exception: if ``self.url`` is empty.
        """
        # It's unreliable. check #1633
        b64p = r'([a-zA-Z0-9=]+)'
        p_list = [r'youku\.com/v_show/id_' + b64p,
                  r'player\.youku\.com/player\.php/sid/' + b64p + r'/v\.swf',
                  r'loader\.swf\?VideoIDS=' + b64p,
                  r'player\.youku\.com/embed/' + b64p]
        if not self.url:
            raise Exception('No url')
        for p in p_list:
            hit = re.search(p, self.url)
            if hit is not None:
                self.vid = hit.group(1)
                return

    def get_vid_from_page(self):
        """Fetch ``self.url`` and set ``self.vid`` from ``videoId2`` in the page.

        The fetched page is kept in ``self.page``.

        Raises:
            Exception: if ``self.url`` is empty.
        """
        if not self.url:
            raise Exception('No url')
        self.page = get_content(self.url)
        hit = re.search(r'videoId2:"([A-Za-z0-9=]+)"', self.page)
        if hit is not None:
            self.vid = hit.group(1)

    def _collect_seg_urls(self, segs):
        """Return ``(urls, is_preview)`` for a list of stream segments.

        ``urls`` holds the ``cdn_url`` of each segment passed through
        ``change_cdn``; ``is_preview`` is True when at least one segment has
        no ``cdn_url``.
        """
        urls = []
        is_preview = False
        for seg in segs:
            if seg.get('cdn_url'):
                urls.append(self.__class__.change_cdn(seg['cdn_url']))
            else:
                is_preview = True
        return urls, is_preview

    def prepare(self, **kwargs):
        """Resolve the vid, query the API and fill title, streams and audiolang.

        Recognised keyword arguments:
            src: when equal to ``'tudou'``, ``ccode`` is switched to ``'0512'``.
            password: when truthy, sent to the API as the video password.
        """
        assert self.url or self.vid

        if self.url and not self.vid:
            self.get_vid_from_url()
            if self.vid is None:
                self.get_vid_from_page()
                if self.vid is None:
                    log.wtf('Cannot fetch vid')

        if kwargs.get('src') == 'tudou':
            self.ccode = '0512'

        if kwargs.get('password'):
            self.password_protected = True
            self.password = kwargs['password']

        self.utid = fetch_cna()
        time.sleep(3)
        self.youku_ups()

        if self.api_data.get('stream') is None:
            if self.api_error_code == -6001:  # wrong vid parsed from the page
                vid_from_url = self.vid
                self.get_vid_from_page()
                if vid_from_url == self.vid:
                    log.wtf(self.api_error_msg)
                self.youku_ups()

        if self.api_data.get('stream') is None:
            if self.api_error_code == -2002:  # wrong password
                # password_protected can be True already (from cli);
                # offer another chance to retry.
                self.password_protected = True
                self.password = input(log.sprint('Password: ', log.YELLOW))
                self.youku_ups()

        if self.api_data.get('stream') is None:
            if self.api_error_msg:
                log.wtf(self.api_error_msg)
            else:
                log.wtf('Unknown error')

        self.title = self.api_data['video']['title']
        stream_types = {t['id']: t for t in self.stream_types}
        # Only streams in the same audio language as the first one are kept.
        audio_lang = self.api_data['stream'][0]['audio_lang']

        for stream in self.api_data['stream']:
            stream_id = stream['stream_type']
            is_preview = False
            if stream_id in stream_types and stream['audio_lang'] == audio_lang:
                if 'alias-of' in stream_types[stream_id]:
                    stream_id = stream_types[stream_id]['alias-of']

                src, is_preview = self._collect_seg_urls(stream['segs'])
                if stream_id not in self.streams:
                    self.streams[stream_id] = {
                        'container': stream_types[stream_id]['container'],
                        'video_profile': stream_types[stream_id]['video_profile'],
                        'size': stream['size'],
                        'pieces': [{
                            'segs': stream['segs']
                        }],
                        'm3u8_url': stream['m3u8_url'],
                        'src': src,
                    }
                else:
                    # Another piece of an already-known stream: accumulate.
                    known = self.streams[stream_id]
                    known['size'] += stream['size']
                    known['pieces'].append({
                        'segs': stream['segs']
                    })
                    known['src'].extend(src)

            if is_preview:
                log.w('{} is a preview'.format(stream_id))

        # Audio languages
        if 'dvd' in self.api_data:
            al = self.api_data['dvd'].get('audiolang')
            if al:
                self.audiolang = al
                for i in self.audiolang:
                    i['url'] = 'http://v.youku.com/v_show/id_{}'.format(i['vid'])


def youku_download_playlist_by_url(url, **kwargs):
    """Download every video of the playlist that ``url`` refers to.

    Three URL shapes are handled:
      * a video page (``v.youku.com/v_show/id_...``): episodes are downloaded
        by following each video's ``next`` entry, at most ``episode_total``
        times;
      * an official show page (``list.youku.com/show/id_...``): the first
        video URL found in the show info is used as the video-page entry;
      * a UGC album page (``list.youku.com/albumlist/show/id_N.html``): all
        pages of the album listing are fetched and each video downloaded.

    Any other URL is ignored.
    """
    video_page_pt = r'https?://v.youku.com/v_show/id_([A-Za-z0-9=]+)'
    js_cb_pt = r'\(({.+})\)'
    album_pt = r'https?://list.youku.com/albumlist/show/id_(\d+)\.html'
    video_url_pt = r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)'

    if re.match(video_page_pt, url):
        youku_obj = Youku()
        youku_obj.url = url
        youku_obj.prepare(**kwargs)
        total_episode = None
        try:
            total_episode = youku_obj.api_data['show']['episode_total']
        except KeyError:
            log.wtf('Cannot get total_episode for {}'.format(url))
        next_vid = youku_obj.vid
        for _ in range(total_episode):
            this_extractor = Youku()
            this_extractor.download_by_vid(next_vid, keep_obj=True, **kwargs)
            # video_next stays None when the API reports no following video
            # (e.g. after the last episode); stop instead of subscripting None.
            if not this_extractor.video_next:
                break
            next_vid = this_extractor.video_next['encodevid']

    elif re.match(r'https?://list.youku.com/show/id_', url):
        # http://list.youku.com/show/id_z2ae8ee1c837b11e18195.html
        # official playlist
        page = get_content(url)
        show_id = re.search(r'showid:"(\d+)"', page).group(1)
        ep = 'http://list.youku.com/show/module?id={}&tab=showInfo&callback=jQuery'.format(show_id)
        # Undo the JSON escaping of '/' and '"' in the JSONP payload.
        xhr_page = get_content(ep).replace('\\/', '/').replace('\\"', '"')
        video_url = re.search(video_url_pt, xhr_page).group(1)
        youku_download_playlist_by_url('http://' + video_url, **kwargs)

    elif re.match(album_pt, url):
        # http://list.youku.com/albumlist/show/id_2336634.html
        # UGC playlist
        list_id = re.search(album_pt, url).group(1)
        ep = 'http://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=tuijsonp6'
        page_size = 20  # must match the size= parameter in ``ep``

        def load_page(page_no):
            # Strip the JSONP callback wrapper and decode the JSON inside;
            # json.loads already handles the escaped '/' characters.
            xhr_page = get_content(ep.format(list_id, page_no))
            return json.loads(re.search(js_cb_pt, xhr_page).group(1))

        json_data = load_page(1)
        video_cnt = json_data['data']['total']
        v_urls = re.findall(video_url_pt, json_data['html'])

        # Ceiling division: number of pages needed to cover video_cnt items.
        total_pages = -(-video_cnt // page_size)
        for page_no in range(2, total_pages + 1):
            v_urls.extend(re.findall(video_url_pt, load_page(page_no)['html']))

        # Every other match is used; presumably each video URL appears twice
        # in the returned HTML -- TODO confirm.
        for u in v_urls[0::2]:
            Youku().download_by_url('http://' + u, **kwargs)


def youku_download_by_url(url, **kwargs):
    """Download the Youku video at ``url``."""
    Youku().download_by_url(url, **kwargs)


def youku_download_by_vid(vid, **kwargs):
    """Download the Youku video identified by ``vid``."""
    Youku().download_by_vid(vid, **kwargs)


download = youku_download_by_url
download_playlist = youku_download_playlist_by_url