# you-get/src/you_get/extractors/youku.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from ..common import *
from ..extractor import VideoExtractor

import time
import traceback
import json
import urllib.request
import urllib.parse


def fetch_cna():
    """Return a ``cna`` cookie value, percent-encoded for use in a URL.

    The value is looked up in this order:

    1. a ``cna`` cookie for the ``.youku.com`` domain in the imported
       ``cookies``;
    2. a ``cna`` cookie set by the response of
       ``http://log.mmstat.com/eg.js``;
    3. a hard-coded fallback value (a warning is logged first).

    Returns:
        str: the cookie value; it is passed through ``urllib.parse.quote``
        unless it already contains a ``%``.
    """

    def quote_cna(val):
        # A '%' is taken to mean the value is already percent-encoded.
        if '%' in val:
            return val
        return urllib.parse.quote(val)

    if cookies:
        for cookie in cookies:
            if cookie.name == 'cna' and cookie.domain == '.youku.com':
                log.i('Found cna in imported cookies. Use it')
                return quote_cna(cookie.value)

    url = 'http://log.mmstat.com/eg.js'
    try:
        # Only the headers are needed, so release the connection right away.
        with urllib.request.urlopen(url) as resp:
            headers = resp.getheaders()
    except OSError:
        # URLError/HTTPError are OSError subclasses. A failed request is
        # treated like a response without the cookie: use the fallback below.
        headers = []

    for header_name, header_value in headers:
        if header_name.lower() != 'set-cookie':
            continue
        # Only the leading "name=value" pair matters. partition() keeps any
        # '=' that is part of the value and tolerates a pair without '='.
        name, _, value = header_value.split(';')[0].partition('=')
        if name == 'cna':
            return quote_cna(value)

    log.w('It seems that the client failed to fetch a cna cookie. Please load your own cookie if possible')
    return quote_cna('DOG4EdW4qzsCAbZyXbU+t7Jt')


class Youku(VideoExtractor):
    """Extractor for Youku videos, driven by the ``ups.youku.com`` API."""

    name = "优酷 (Youku)"
    # NOTE(review): despite the attribute name, this is a desktop Chrome
    # user-agent string.
    mobile_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    # Host substituted into segment URLs by change_cdn().
    dispatcher_url = 'vali.cp31.ott.cibntv.net'

    # Last updated: 2017-10-13
    stream_types = [
        {'id': 'hd3',      'container': 'flv', 'video_profile': '1080P'},
        {'id': 'hd3v2',    'container': 'flv', 'video_profile': '1080P'},
        {'id': 'mp4hd3',   'container': 'mp4', 'video_profile': '1080P'},
        {'id': 'mp4hd3v2', 'container': 'mp4', 'video_profile': '1080P'},

        {'id': 'hd2',      'container': 'flv', 'video_profile': '超清'},
        {'id': 'hd2v2',    'container': 'flv', 'video_profile': '超清'},
        {'id': 'mp4hd2',   'container': 'mp4', 'video_profile': '超清'},
        {'id': 'mp4hd2v2', 'container': 'mp4', 'video_profile': '超清'},

        {'id': 'mp4hd',    'container': 'mp4', 'video_profile': '高清'},
        # not really equivalent to mp4hd
        {'id': 'flvhd',    'container': 'flv', 'video_profile': '渣清'},
        {'id': '3gphd',    'container': 'mp4', 'video_profile': '渣清'},

        {'id': 'mp4sd',    'container': 'mp4', 'video_profile': '标清'},
        # obsolete?
        {'id': 'flv',      'container': 'flv', 'video_profile': '标清'},
        {'id': 'mp4',      'container': 'mp4', 'video_profile': '标清'},
    ]

    def __init__(self):
        """Initialise request settings and the per-video API state."""
        super().__init__()

        self.ua = self.__class__.mobile_ua
        self.referer = 'http://v.youku.com'

        self.page = None
        self.video_list = None
        self.video_next = None
        self.password = None
        self.api_data = None
        self.api_error_code = None
        self.api_error_msg = None

        self.ccode = '0519'
        # Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js
        # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js
        self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND'
        self.utid = None

    def youku_ups(self):
        """Query the UPS API for ``self.vid`` and store the reply on ``self``.

        Always sets ``api_data``, ``api_error_code`` and ``api_error_msg``
        (the latter two are ``None`` when the reply carries no error).
        ``video_list`` and ``video_next`` are updated only when the reply
        contains them.
        """
        url = 'https://ups.youku.com/ups/get.json?vid={}&ccode={}'.format(self.vid, self.ccode)
        url += '&client_ip=192.168.1.1'
        url += '&utid=' + self.utid
        url += '&client_ts=' + str(int(time.time()))
        url += '&ckey=' + urllib.parse.quote(self.ckey)
        if self.password_protected:
            # The password is user input; encode it so characters such as
            # '&', '#' or '+' cannot corrupt the query string.
            url += '&password=' + urllib.parse.quote(self.password, safe='')
        headers = {'Referer': self.referer, 'User-Agent': self.ua}
        api_meta = json.loads(get_content(url, headers=headers))

        self.api_data = api_meta['data']

        # prepare() calls this method repeatedly; clear the previous error so
        # a later error-free reply is not reported with a stale code/message.
        self.api_error_code = None
        self.api_error_msg = None
        data_error = self.api_data.get('error')
        if data_error:
            self.api_error_code = data_error.get('code')
            self.api_error_msg = data_error.get('note')

        # "or {}" also covers a reply in which "videos" is present but null.
        videos = self.api_data.get('videos') or {}
        if 'list' in videos:
            self.video_list = videos['list']
        if 'next' in videos:
            self.video_next = videos['next']

    @classmethod
    def change_cdn(cls, url):
        """Return *url* with its host replaced by ``dispatcher_url``.

        URLs that already contain ``dispatcher_url`` or ``k.youku.com`` are
        returned unchanged.
        """
        # if the cdn_url starts with an ip addr, it should be youku's old CDN
        # which rejects http requests randomly with status code > 400
        # change it to the dispatcher of aliCDN can do better
        # at least a little more recoverable from HTTP 403
        if cls.dispatcher_url in url or 'k.youku.com' in url:
            return url
        url_seg_list = list(urllib.parse.urlsplit(url))
        url_seg_list[1] = cls.dispatcher_url  # index 1 is the netloc
        return urllib.parse.urlunsplit(url_seg_list)

    def get_vid_from_url(self):
        """Set ``self.vid`` from ``self.url`` if a known URL pattern matches.

        ``self.vid`` is left untouched when no pattern matches.

        Raises:
            Exception: if ``self.url`` is empty.
        """
        # It's unreliable. check #1633
        b64p = r'([a-zA-Z0-9=]+)'
        p_list = [r'youku\.com/v_show/id_'+b64p,
                  r'player\.youku\.com/player\.php/sid/'+b64p+r'/v\.swf',
                  r'loader\.swf\?VideoIDS='+b64p,
                  r'player\.youku\.com/embed/'+b64p]
        if not self.url:
            raise Exception('No url')
        for p in p_list:
            hit = re.search(p, self.url)
            if hit is not None:
                self.vid = hit.group(1)
                return

    def get_vid_from_page(self):
        """Fetch ``self.url`` and set ``self.vid`` from its ``videoId2`` field.

        The page text is kept in ``self.page``. ``self.vid`` is left untouched
        when the field is not found.

        Raises:
            Exception: if ``self.url`` is empty.
        """
        if not self.url:
            raise Exception('No url')
        self.page = get_content(self.url)
        hit = re.search(r'videoId2:"([A-Za-z0-9=]+)"', self.page)
        if hit is not None:
            self.vid = hit.group(1)

    def _collect_segment_urls(self, segs):
        """Return ``(urls, is_preview)`` for the segments of one stream.

        ``urls`` holds the CDN-adjusted URL of every segment that has a
        ``cdn_url``; ``is_preview`` is True when at least one segment has none.
        """
        urls = []
        is_preview = False
        for seg in segs:
            cdn_url = seg.get('cdn_url')
            if cdn_url:
                urls.append(self.change_cdn(cdn_url))
            else:
                is_preview = True
        return urls, is_preview

    def prepare(self, **kwargs):
        """Resolve the video id, query the API and fill in title and streams.

        Keyword Args:
            src: when equal to ``'tudou'``, ``ccode`` is switched to ``'0512'``.
            password: password to send for a protected video.

        Errors are reported through ``log.wtf``.
        NOTE(review): the control flow assumes ``log.wtf`` does not return
        (presumably it exits) - confirm against the log module.
        """
        assert self.url or self.vid

        if self.url and not self.vid:
            self.get_vid_from_url()

            if self.vid is None:
                self.get_vid_from_page()

                if self.vid is None:
                    log.wtf('Cannot fetch vid')

        if kwargs.get('src') == 'tudou':
            self.ccode = '0512'

        if kwargs.get('password'):
            self.password_protected = True
            self.password = kwargs['password']

        self.utid = fetch_cna()
        # NOTE(review): the reason for this 3-second pause is not documented.
        time.sleep(3)
        self.youku_ups()

        if self.api_data.get('stream') is None:
            if self.api_error_code == -6001:  # wrong vid parsed from the page
                vid_from_url = self.vid
                self.get_vid_from_page()
                if vid_from_url == self.vid:
                    log.wtf(self.api_error_msg)
                self.youku_ups()

        if self.api_data.get('stream') is None:
            if self.api_error_code == -2002:  # wrong password
                self.password_protected = True
                # it can be True already(from cli). offer another chance to retry
                self.password = input(log.sprint('Password: ', log.YELLOW))
                self.youku_ups()

        # "not" rather than "is None": an empty stream list would otherwise
        # slip through and fail with IndexError on ['stream'][0] below.
        if not self.api_data.get('stream'):
            if self.api_error_msg:
                log.wtf(self.api_error_msg)
            else:
                log.wtf('Unknown error')

        self.title = self.api_data['video']['title']
        stream_types = {i['id']: i for i in self.stream_types}
        # Only streams in the audio language of the first stream are kept.
        audio_lang = self.api_data['stream'][0]['audio_lang']

        for stream in self.api_data['stream']:
            stream_id = stream['stream_type']
            if stream_id not in stream_types or stream['audio_lang'] != audio_lang:
                continue
            # No entry of stream_types currently defines 'alias-of'; the
            # lookup is kept so that such entries can be added to the table.
            if 'alias-of' in stream_types[stream_id]:
                stream_id = stream_types[stream_id]['alias-of']

            src, is_preview = self._collect_segment_urls(stream['segs'])
            if stream_id not in self.streams:
                self.streams[stream_id] = {
                    'container': stream_types[stream_id]['container'],
                    'video_profile': stream_types[stream_id]['video_profile'],
                    'size': stream['size'],
                    'pieces': [{
                        'segs': stream['segs']
                    }],
                    'm3u8_url': stream['m3u8_url'],
                    'src': src,
                }
            else:
                # Another API entry for an already known stream id: merge it.
                known = self.streams[stream_id]
                known['size'] += stream['size']
                known['pieces'].append({
                    'segs': stream['segs']
                })
                known['src'].extend(src)
            if is_preview:
                log.w('{} is a preview'.format(stream_id))

        # Audio languages
        if 'dvd' in self.api_data:
            al = self.api_data['dvd'].get('audiolang')
            if al:
                self.audiolang = al
                for i in self.audiolang:
                    i['url'] = 'http://v.youku.com/v_show/id_{}'.format(i['vid'])


def youku_download_playlist_by_url(url, **kwargs):
    """Download all videos of the playlist that *url* belongs to.

    Three URL shapes are recognised; any other URL is ignored:

    * ``v.youku.com/v_show/id_...`` - a video page. Starting at that video,
      episodes are downloaded one after another by following the ``next``
      entry of the API reply, at most ``episode_total`` times.
    * ``list.youku.com/show/id_...`` - an official show page. The first
      video link found for the show is resolved and handled as above.
    * ``list.youku.com/albumlist/show/id_<n>.html`` - a UGC album. The item
      listing is fetched page by page and each video is downloaded.

    All *kwargs* are forwarded to the extractor.
    """
    video_page_pt = r'https?://v.youku.com/v_show/id_([A-Za-z0-9=]+)'
    show_page_pt = r'https?://list.youku.com/show/id_'
    album_page_pt = r'https?://list.youku.com/albumlist/show/id_(\d+)\.html'
    video_link_pt = r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)'
    js_cb_pt = r'\(({.+})\)'  # JSON payload inside a JSONP callback
    page_size = 20

    if re.match(video_page_pt, url):
        youku_obj = Youku()
        youku_obj.url = url
        youku_obj.prepare(**kwargs)
        try:
            total_episode = youku_obj.api_data['show']['episode_total']
        except KeyError:
            log.wtf('Cannot get total_episode for {}'.format(url))
            return
        next_vid = youku_obj.vid
        for _ in range(total_episode):
            this_extractor = Youku()
            this_extractor.download_by_vid(next_vid, keep_obj=True, **kwargs)
            # video_next stays None when the reply has no "next" entry (for
            # instance after the last episode); stop instead of indexing None.
            video_next = this_extractor.video_next
            if not video_next or not video_next.get('encodevid'):
                break
            next_vid = video_next['encodevid']
        return

    if re.match(show_page_pt, url):
        # http://list.youku.com/show/id_z2ae8ee1c837b11e18195.html
        # official playlist
        page = get_content(url)
        show_hit = re.search(r'showid:"(\d+)"', page)
        if show_hit is None:
            log.wtf('Cannot find showid for {}'.format(url))
            return
        ep = 'http://list.youku.com/show/module?id={}&tab=showInfo&callback=jQuery'.format(show_hit.group(1))
        # Undo the JSON escaping of '/' and '"'. The backslashes are written
        # out explicitly: '\"' is just '"', which made the replace a no-op.
        xhr_page = get_content(ep).replace('\\/', '/').replace('\\"', '"')
        video_hit = re.search(video_link_pt, xhr_page)
        if video_hit is None:
            log.wtf('Cannot find any video for {}'.format(url))
            return
        youku_download_playlist_by_url('http://' + video_hit.group(1), **kwargs)
        return

    album_hit = re.match(album_page_pt, url)
    if album_hit is None:
        return

    # http://list.youku.com/albumlist/show/id_2336634.html
    # UGC playlist
    list_id = album_hit.group(1)
    ep = 'http://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=tuijsonp6'

    xhr_page = get_content(ep.format(list_id, 1))
    json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1))
    video_cnt = json_data['data']['total']
    v_urls = re.findall(video_link_pt, json_data['html'])

    # Ceiling division: an exact multiple of page_size needs no extra page.
    total_pages = -(-video_cnt // page_size)
    for page_no in range(2, total_pages + 1):
        xhr_page = get_content(ep.format(list_id, page_no))
        # json.loads already decodes the '\/' escape, so the payload is
        # parsed as-is, exactly like the first page.
        json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1))
        v_urls.extend(re.findall(video_link_pt, json_data['html']))

    # NOTE(review): only every other match is used, presumably because each
    # video is linked twice in the returned HTML - confirm against a reply.
    for u in v_urls[0::2]:
        Youku().download_by_url('http://' + u, **kwargs)
    return


def youku_download_by_url(url, **kwargs):
    """Download the video at *url* with a fresh ``Youku`` extractor.

    All *kwargs* are forwarded to ``Youku.download_by_url``.
    """
    extractor = Youku()
    extractor.download_by_url(url, **kwargs)


def youku_download_by_vid(vid, **kwargs):
    """Download the video with id *vid* using a fresh ``Youku`` extractor.

    All *kwargs* are forwarded to ``Youku.download_by_vid``.
    """
    extractor = Youku()
    extractor.download_by_vid(vid, **kwargs)

# Module-level aliases for the single-video and playlist download functions.
# NOTE(review): presumably these are the names the you-get dispatcher looks
# up on an extractor module - confirm against the caller.
download = youku_download_by_url
download_playlist = youku_download_playlist_by_url