you-get/src/you_get/extractors/youku.py
2016-03-21 16:38:36 +01:00

321 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from ..common import *
from ..extractor import VideoExtractor
import base64
import ssl
import time
import traceback
class Youku(VideoExtractor):
name = "优酷 (Youku)"
# Last updated: 2015-11-24
stream_types = [
{'id': 'mp4hd3', 'alias-of' : 'hd3'},
{'id': 'hd3', 'container': 'flv', 'video_profile': '1080P'},
{'id': 'mp4hd2', 'alias-of' : 'hd2'},
{'id': 'hd2', 'container': 'flv', 'video_profile': '超清'},
{'id': 'mp4hd', 'alias-of' : 'mp4'},
{'id': 'mp4', 'container': 'mp4', 'video_profile': '高清'},
{'id': 'flvhd', 'container': 'flv', 'video_profile': '标清'},
{'id': 'flv', 'container': 'flv', 'video_profile': '标清'},
{'id': '3gphd', 'container': '3gp', 'video_profile': '标清3GP'},
]
f_code_1 = 'becaf9be'
f_code_2 = 'bf7e5f01'
def trans_e(a, c):
f = h = 0
b = list(range(256))
result = ''
while h < 256:
f = (f + b[h] + ord(a[h % len(a)])) % 256
b[h], b[f] = b[f], b[h]
h += 1
q = f = h = 0
while q < len(c):
h = (h + 1) % 256
f = (f + b[h]) % 256
b[h], b[f] = b[f], b[h]
if isinstance(c[q], int):
result += chr(c[q] ^ b[(b[h] + b[f]) % 256])
else:
result += chr(ord(c[q]) ^ b[(b[h] + b[f]) % 256])
q += 1
return result
def generate_ep(no, streamfileids, sid, token):
number = hex(int(str(no), 10))[2:].upper()
if len(number) == 1:
number = '0' + number
fileid = streamfileids[0:8] + number + streamfileids[10:]
ep = parse.quote(base64.b64encode(
''.join(Youku.trans_e(
Youku.f_code_2,
sid + '_' + fileid + '_' + token)).encode('latin1')),
safe='~()*!.\''
)
return fileid, ep
# Obsolete -- used to parse m3u8 on pl.youku.com
def parse_m3u8(m3u8):
return re.findall(r'(http://[^?]+)\?ts_start=0', m3u8)
def oset(xs):
"""Turns a list into an ordered set. (removes duplicates)"""
mem = set()
for x in xs:
if x not in mem:
mem.add(x)
yield(x)
def get_vid_from_url(url):
"""Extracts video ID from URL.
"""
return match1(url, r'youku\.com/v_show/id_([a-zA-Z0-9=]+)') or \
match1(url, r'player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf') or \
match1(url, r'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)') or \
match1(url, r'player\.youku\.com/embed/([a-zA-Z0-9=]+)')
def get_playlist_id_from_url(url):
"""Extracts playlist ID from URL.
"""
return match1(url, r'youku\.com/playlist_show/id_([a-zA-Z0-9=]+)')
def download_playlist_by_url(self, url, **kwargs):
self.url = url
try:
playlist_id = self.__class__.get_playlist_id_from_url(self.url)
assert playlist_id
video_page = get_content('http://www.youku.com/playlist_show/id_%s' % playlist_id)
videos = Youku.oset(re.findall(r'href="(http://v\.youku\.com/[^?"]+)', video_page))
# Parse multi-page playlists
for extra_page_url in Youku.oset(re.findall('href="(http://www\.youku\.com/playlist_show/id_%s_[^?"]+)' % playlist_id, video_page)):
extra_page = get_content(extra_page_url)
videos |= Youku.oset(re.findall(r'href="(http://v\.youku\.com/[^?"]+)', extra_page))
except:
# Show full list of episodes
if match1(url, r'youku\.com/show_page/id_([a-zA-Z0-9=]+)'):
ep_id = match1(url, r'youku\.com/show_page/id_([a-zA-Z0-9=]+)')
url = 'http://www.youku.com/show_episode/id_%s' % ep_id
video_page = get_content(url)
videos = Youku.oset(re.findall(r'href="(http://v\.youku\.com/[^?"]+)', video_page))
self.title = r1(r'<meta name="title" content="([^"]+)"', video_page) or \
r1(r'<title>([^<]+)', video_page)
self.p_playlist()
for video in videos:
index = parse_query_param(video, 'f')
try:
self.__class__().download_by_url(video, index=index, **kwargs)
except KeyboardInterrupt:
raise
except:
exc_type, exc_value, exc_traceback = sys.exc_info()
traceback.print_exception(exc_type, exc_value, exc_traceback)
def prepare(self, **kwargs):
# Hot-plug cookie handler
ssl_context = request.HTTPSHandler(
context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
cookie_handler = request.HTTPCookieProcessor()
if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']:
proxy = parse_host(kwargs['extractor_proxy'])
proxy_handler = request.ProxyHandler({
'http': '%s:%s' % proxy,
'https': '%s:%s' % proxy,
})
else:
proxy_handler = request.ProxyHandler({})
opener = request.build_opener(ssl_context, cookie_handler, proxy_handler)
opener.addheaders = [('Cookie','__ysuid={}'.format(time.time()))]
request.install_opener(opener)
assert self.url or self.vid
if self.url and not self.vid:
self.vid = self.__class__.get_vid_from_url(self.url)
if self.vid is None:
self.download_playlist_by_url(self.url, **kwargs)
exit(0)
api_url = 'http://play.youku.com/play/get.json?vid=%s&ct=10' % self.vid
api12_url = 'http://play.youku.com/play/get.json?vid=%s&ct=12' % self.vid
try:
meta = json.loads(get_content(
api_url,
headers={'Referer': 'http://static.youku.com/'}
))
meta12 = json.loads(get_content(
api12_url,
headers={'Referer': 'http://static.youku.com/'}
))
data = meta['data']
data12 = meta12['data']
assert 'stream' in data
except AssertionError:
if 'error' in data:
if data['error']['code'] == -202:
# Password protected
self.password_protected = True
self.password = input(log.sprint('Password: ', log.YELLOW))
api_url += '&pwd={}'.format(self.password)
api_url12 += '&pwd={}'.format(self.password)
meta = json.loads(get_content(
api_url,
headers={'Referer': 'http://static.youku.com/'}
))
meta12 = json.loads(get_content(
api_url12,
headers={'Referer': 'http://static.youku.com/'}
))
data = meta['data']
data12 = meta12['data']
else:
log.wtf('[Failed] ' + data['error']['note'])
else:
log.wtf('[Failed] Video not found.')
self.title = data['video']['title']
self.ep = data12['security']['encrypt_string']
self.ip = data12['security']['ip']
if 'stream' not in data and self.password_protected:
log.wtf('[Failed] Wrong password.')
stream_types = dict([(i['id'], i) for i in self.stream_types])
audio_lang = data['stream'][0]['audio_lang']
for stream in data['stream']:
stream_id = stream['stream_type']
if stream_id in stream_types and stream['audio_lang'] == audio_lang:
if 'alias-of' in stream_types[stream_id]:
stream_id = stream_types[stream_id]['alias-of']
if stream_id not in self.streams:
self.streams[stream_id] = {
'container': stream_types[stream_id]['container'],
'video_profile': stream_types[stream_id]['video_profile'],
'size': stream['size'],
'pieces': [{
'fileid': stream['stream_fileid'],
'segs': stream['segs']
}]
}
else:
self.streams[stream_id]['size'] += stream['size']
self.streams[stream_id]['pieces'].append({
'fileid': stream['stream_fileid'],
'segs': stream['segs']
})
self.streams_fallback = {}
for stream in data12['stream']:
stream_id = stream['stream_type']
if stream_id in stream_types and stream['audio_lang'] == audio_lang:
if 'alias-of' in stream_types[stream_id]:
stream_id = stream_types[stream_id]['alias-of']
if stream_id not in self.streams_fallback:
self.streams_fallback[stream_id] = {
'container': stream_types[stream_id]['container'],
'video_profile': stream_types[stream_id]['video_profile'],
'size': stream['size'],
'pieces': [{
'fileid': stream['stream_fileid'],
'segs': stream['segs']
}]
}
else:
self.streams_fallback[stream_id]['size'] += stream['size']
self.streams_fallback[stream_id]['pieces'].append({
'fileid': stream['stream_fileid'],
'segs': stream['segs']
})
# Audio languages
if 'dvd' in data and 'audiolang' in data['dvd']:
self.audiolang = data['dvd']['audiolang']
for i in self.audiolang:
i['url'] = 'http://v.youku.com/v_show/id_{}'.format(i['vid'])
def extract(self, **kwargs):
if 'stream_id' in kwargs and kwargs['stream_id']:
# Extract the stream
stream_id = kwargs['stream_id']
if stream_id not in self.streams:
log.e('[Error] Invalid video format.')
log.e('Run \'-i\' command with no specific video format to view all available formats.')
exit(2)
else:
# Extract stream with the best quality
stream_id = self.streams_sorted[0]['id']
e_code = self.__class__.trans_e(
self.__class__.f_code_1,
base64.b64decode(bytes(self.ep, 'ascii'))
)
sid, token = e_code.split('_')
while True:
try:
ksegs = []
pieces = self.streams[stream_id]['pieces']
for piece in pieces:
segs = piece['segs']
streamfileid = piece['fileid']
for no in range(0, len(segs)):
k = segs[no]['key']
if k == -1: break # we hit the paywall; stop here
fileid, ep = self.__class__.generate_ep(no, streamfileid,
sid, token)
q = parse.urlencode(dict(
ctype = 12,
ev = 1,
K = k,
ep = parse.unquote(ep),
oip = str(self.ip),
token = token,
yxon = 1
))
u = 'http://k.youku.com/player/getFlvPath/sid/{sid}_00' \
'/st/{container}/fileid/{fileid}?{q}'.format(
sid = sid,
container = self.streams[stream_id]['container'],
fileid = fileid,
q = q
)
ksegs += [i['server'] for i in json.loads(get_content(u))]
except error.HTTPError as e:
# Use fallback stream data in case of HTTP 404
log.e('[Error] ' + str(e))
self.streams = {}
self.streams = self.streams_fallback
except KeyError:
# Move on to next stream if best quality not available
del self.streams_sorted[0]
stream_id = self.streams_sorted[0]['id']
else: break
if not kwargs['info_only']:
self.streams[stream_id]['src'] = ksegs
site = Youku()
download = site.download_by_url
download_playlist = site.download_playlist_by_url
youku_download_by_vid = site.download_by_vid
# Used by: acfun.py bilibili.py miomio.py tudou.py