From 1411c8986e32d5d555d5af9e5727e0d34a5b8b1a Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Tue, 24 Jun 2014 03:59:47 +0200
Subject: [PATCH] Youku: fix #331, refactoring

---
 src/you_get/common.py             | 121 +++++++++++++-
 src/you_get/extractor/acfun.py    |   4 +-
 src/you_get/extractor/bilibili.py |   4 +-
 src/you_get/extractor/miomio.py   |   8 +-
 src/you_get/extractor/tudou.py    |  22 +--
 src/you_get/extractor/youku.py    | 260 +++++++-----------------
 6 files changed, 194 insertions(+), 225 deletions(-)

diff --git a/src/you_get/common.py b/src/you_get/common.py
index d80f39c5..dfccd436 100644
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -16,6 +16,7 @@ from .util import log, sogou_proxy_server, get_filename, unescape_html
 dry_run = False
 force = False
 player = None
+extractor_proxy = None
 sogou_proxy = None
 sogou_env = None
 cookies_txt = None
@@ -824,14 +825,15 @@ def script_main(script_name, download, download_playlist = None):
   -o | --output-dir        Set the output directory for downloaded videos.
   -p | --player            Directly play the video with PLAYER like vlc/smplayer.
   -x | --http-proxy        Use specific HTTP proxy for downloading.
+  -y | --extractor-proxy   Use specific HTTP proxy for extracting stream data.
        --no-proxy          Don't use any proxy. (ignore $http_proxy)
   -S | --sogou             Use a Sogou proxy server for downloading.
       --sogou-proxy        Run a standalone Sogou proxy server.
       --debug              Show traceback on KeyboardInterrupt.
 '''

-    short_opts = 'Vhfiuc:nSo:p:x:'
-    opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-merge', 'no-proxy', 'debug', 'sogou', 'output-dir=', 'player=', 'http-proxy=', 'sogou-proxy=', 'sogou-env=']
+    short_opts = 'Vhfiuc:nSo:p:x:y:'
+    opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-merge', 'no-proxy', 'debug', 'sogou', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'sogou-proxy=', 'sogou-env=']
     if download_playlist:
         short_opts = 'l' + short_opts
         opts = ['playlist'] + opts
@@ -846,6 +848,7 @@ def script_main(script_name, download, download_playlist = None):
     global force
     global dry_run
     global player
+    global extractor_proxy
     global sogou_proxy
     global sogou_env
     global cookies_txt
@@ -856,6 +859,7 @@ def script_main(script_name, download, download_playlist = None):
     merge = True
     output_dir = '.'
     proxy = None
+    extractor_proxy = None
     traceback = False
     for o, a in opts:
         if o in ('-V', '--version'):
@@ -889,6 +893,8 @@ def script_main(script_name, download, download_playlist = None):
             player = a
         elif o in ('-x', '--http-proxy'):
             proxy = a
+        elif o in ('-y', '--extractor-proxy'):
+            extractor_proxy = a
         elif o in ('-S', '--sogou'):
             sogou_proxy = ("0.0.0.0", 0)
         elif o in ('--sogou-proxy',):
@@ -924,3 +930,114 @@ def script_main(script_name, download, download_playlist = None):
             raise
         else:
             sys.exit(1)
+
+
+
+class VideoExtractor():
+    def __init__(self, *args):
+        self.url = None
+        self.title = None
+        self.vid = None
+        self.streams = {}
+        self.streams_sorted = []
+
+        if args:
+            self.url = args[0]
+
+    def download_by_url(self, url, **kwargs):
+        self.url = url
+
+        self.prepare(**kwargs)
+
+        self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams]
+
+        global extractor_proxy
+        if extractor_proxy:
+            set_proxy(parse_host(extractor_proxy))
+        self.extract(**kwargs)
+        if extractor_proxy:
+            unset_proxy()
+
+        self.download(**kwargs)
+
+    def download_by_vid(self, vid, **kwargs):
+        self.vid = vid
+
+        self.prepare(**kwargs)
+
+        self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams]
+
+        global extractor_proxy
+        if extractor_proxy:
+            set_proxy(parse_host(extractor_proxy))
+        self.extract(**kwargs)
+        if extractor_proxy:
+            unset_proxy()
+
+        self.download(**kwargs)
+
+    def prepare(self, **kwargs):
+        pass
+        #raise NotImplementedError()
+
+    def extract(self, **kwargs):
+        pass
+        #raise NotImplementedError()
+
+    def p_stream(self, stream_id):
+        stream = self.streams[stream_id]
+        print("    - id: \033[7m%s\033[0m" % stream_id)
+        print("      container: %s" % stream['container'])
+        print("      video-profile: %s" % stream['video_profile'])
+        print("      size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
+        #print("    # download-with: \033[4myou-get --stream=%s\033[0m" % stream_id)
+        print()
+
+    def p(self, stream_id=None):
+        print("site: %s" % self.__class__.name)
+        print("title: %s" % self.title)
+        if stream_id:
+            # Print the stream
+            print("stream:")
+            self.p_stream(stream_id)
+
+        elif stream_id is None:
+            # Print stream with best quality
+            print("stream: # Best quality")
+            stream_id = self.streams_sorted[0]['id']
+            self.p_stream(stream_id)
+
+        elif stream_id == []:
+            # Print all available streams
+            print("streams: # Available quality and codecs")
+            for stream in self.streams_sorted:
+                self.p_stream(stream['id'])
+
+    def download(self, **kwargs):
+        if 'info_only' in kwargs and kwargs['info_only']:
+            if 'stream_id' in kwargs and kwargs['stream_id']:
+                # Display the stream
+                stream_id = kwargs['stream_id']
+                self.p(stream_id)
+            else:
+                # Display all available streams
+                self.p([])
+        else:
+            if 'stream_id' in kwargs and kwargs['stream_id']:
+                # Download the stream
+                stream_id = kwargs['stream_id']
+            else:
+                # Download stream with the best quality
+                stream_id = self.streams_sorted[0]['id']
+
+            self.p(None)
+
+            urls = self.streams[stream_id]['src']
+            if not urls:
+                log.e('[Failed] Cannot extract video source.')
+                log.e('This is most likely because the video has not been made available in your country.')
+                log.e('You may try to use a proxy via \'-y\' for extracting stream data.')
+                exit(1)
+            download_urls(urls, self.title, self.streams[stream_id]['container'], self.streams[stream_id]['size'], output_dir=kwargs['output_dir'], merge=kwargs['merge'])
+
+        self.__init__()
diff --git a/src/you_get/extractor/acfun.py b/src/you_get/extractor/acfun.py
index 31b4cd06..00a2d21b 100644
--- a/src/you_get/extractor/acfun.py
+++ b/src/you_get/extractor/acfun.py
@@ -7,7 +7,7 @@ from ..common import *
 from .qq import qq_download_by_id
 from .sina import sina_download_by_vid
 from .tudou import tudou_download_by_iid
-from .youku import youku_download_by_id
+from .youku import youku_download_by_vid
 
 import json, re
 
@@ -27,7 +27,7 @@ def acfun_download_by_vid(vid, title=None, output_dir='.', merge=True, info_only
     if sourceType == 'sina':
         sina_download_by_vid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only)
     elif sourceType == 'youku':
-        youku_download_by_id(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only)
+        youku_download_by_vid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only)
     elif sourceType == 'tudou':
         tudou_download_by_iid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only)
     elif sourceType == 'qq':
diff --git a/src/you_get/extractor/bilibili.py b/src/you_get/extractor/bilibili.py
index b10ae7da..934afdd6 100644
--- a/src/you_get/extractor/bilibili.py
+++ b/src/you_get/extractor/bilibili.py
@@ -6,7 +6,7 @@ from ..common import *
 
 from .sina import sina_download_by_vid
 from .tudou import tudou_download_by_id
-from .youku import youku_download_by_id
+from .youku import youku_download_by_vid
 
 import re
 
@@ -91,7 +91,7 @@ def bilibili_download(url, output_dir = '.', merge = True, info_only = False):
     elif t == 'vid':
         sina_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only)
     elif t == 'ykid':
-        youku_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only)
+        youku_download_by_vid(id, title, output_dir = output_dir, merge = merge, info_only = info_only)
     elif t == 'uid':
         tudou_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only)
     else:
diff --git a/src/you_get/extractor/miomio.py b/src/you_get/extractor/miomio.py
index bac1f64c..4c23c929 100644
--- a/src/you_get/extractor/miomio.py
+++ b/src/you_get/extractor/miomio.py
@@ -6,18 +6,18 @@ from ..common import *
 
 from .sina import sina_download_by_vid
 from .tudou import tudou_download_by_id
-from .youku import youku_download_by_id
+from .youku import youku_download_by_vid
 
 def miomio_download(url, output_dir = '.', merge = True, info_only = False):
     html = get_html(url)
-    
+
     title = r1(r'[^<]', r'([^<>]*)'], page)
-    else:
-        title = r1_of([r'[^<]', r'([^-]+)—在线播放.*', r'([^<>]*)', page)
-    if subtitle:
-        subtitle = subtitle.group(1).strip()
-    if subtitle == title:
-        subtitle = None
-    if subtitle:
-        title += '-' + subtitle
-    return title
-
-def parse_playlist_title(url, page):
-    if re.search(r'v_playlist', url):
-        # if we are playing a video from play list, the meta title might be incorrect
-        title = re.search(r'([^<>]*)', page).group(1)
-    else:
-        title = re.search(r'> 16
-        c = source.pop(index)
-        mixed += c
-
-    ids = info['data'][0]['streamfileids'][stream_type].split('*')[:-1]
-    vid = ''.join(mixed[int(i)] for i in ids)
-
-    sid = '%s%s%s' % (int(time() * 1000), randint(1000, 1999), randint(1000, 9999))
-
-    urls = []
-    for s in segs[stream_type]:
-        no = '%02x' % int(s['no'])
-        url = 'http://f.youku.com/player/getFlvPath/sid/%s_%s/st/%s/fileid/%s%s%s?K=%s&ts=%s' % (sid, no, file_type, vid[:8], no.upper(), vid[10:], s['k'], s['seconds'])
-        urls.append((url, int(s['size'])))
-    return urls
+        return None
-def file_type_of_url(url):
-    return str(re.search(r'/st/([^/]+)/', url).group(1))
+    def parse_m3u8(m3u8):
+        return re.findall(r'(http://[^?]+)\?ts_start=0', m3u8)
-def youku_download_by_id(id, title, output_dir = '.', stream_type = None, merge = True, info_only = False):
-    # Open Sogou proxy if required
-    if get_sogou_proxy() is not None:
-        server = sogou_proxy_server(get_sogou_proxy(), ostream=open(os.devnull, 'w'))
-        server_thread = threading.Thread(target=server.serve_forever)
-        server_thread.daemon = True
-        server_thread.start()
-        set_proxy(server.server_address)
-
-    info = get_info(id)
-
-    # Close Sogou proxy if required
-    if get_sogou_proxy() is not None:
-        server.shutdown()
-        unset_proxy()
-
-    urls, sizes = zip(*find_video(info, stream_type))
-    ext = file_type_of_url(urls[0])
-    total_size = sum(sizes)
-
-    print_info(site_info, title, ext, total_size)
-    if not info_only:
-        download_urls(urls, title, ext, total_size, output_dir, merge = merge)
+    def prepare(self, **kwargs):
+        assert self.url or self.vid
+        if self.url and not self.vid:
+            self.vid = __class__.get_vid_from_url(self.url)
-def parse_playlist_videos(html):
-    return re.findall(r'id="A_(\w+)"', html)
+        meta = json.loads(get_html('http://v.youku.com/player/getPlayList/VideoIDS/%s' % self.vid))
+        metadata0 = meta['data'][0]
-def parse_playlist_pages(html):
-    m = re.search(r'.*?', html, flags = re.S)
-    if m:
-        urls = re.findall(r'href="([^"]+)"', m.group())
-        x1, x2, x3 = re.match(r'^(.*page_)(\d+)(_.*)$', urls[-1]).groups()
-        return ['http://v.youku.com%s%s%s?__rt=1&__ro=listShow' % (x1, i, x3) for i in range(2, int(x2) + 1)]
-    else:
-        return []
+        self.title = metadata0['title']
-def parse_playlist(url):
-    html = get_html(url)
-    video_id = re.search(r"var\s+videoId\s*=\s*'(\d+)'", html).group(1)
-    show_id = re.search(r'var\s+showid\s*=\s*"(\d+)"', html).group(1)
-    list_url = 'http://v.youku.com/v_vpofficiallist/page_1_showid_%s_id_%s.html?__rt=1&__ro=listShow' % (show_id, video_id)
-    html = get_html(list_url)
-    ids = parse_playlist_videos(html)
-    for url in parse_playlist_pages(html):
-        ids.extend(parse_playlist_videos(get_html(url)))
-    return ids
+        for stream_type in self.stream_types:
+            if stream_type['id'] in metadata0['streamsizes']:
+                stream_id = stream_type['id']
+                stream_size = int(metadata0['streamsizes'][stream_id])
+                self.streams[stream_id] = {'container': stream_type['container'], 'video_profile': stream_type['video_profile'], 'size': stream_size}
-def parse_vplaylist(url):
-    id = r1_of([r'^http://www.youku.com/playlist_show/id_(\d+)(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html',
-                r'^http://v.youku.com/v_playlist/f(\d+)o[01]p\d+.html',
-                r'^http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html'],
-               url)
-    assert id, 'not valid vplaylist url: ' + url
-    url = 'http://www.youku.com/playlist_show/id_%s.html' % id
-    n = int(re.search(r'(\d+)', get_html(url)).group(1))
-    return ['http://v.youku.com/v_playlist/f%so0p%s.html' % (id, i) for i in range(n)]
+    def extract(self, **kwargs):
+        if 'stream_id' in kwargs and kwargs['stream_id']:
+            # Extract the stream
+            stream_id = kwargs['stream_id']
+        else:
+            # Extract stream with the best quality
+            stream_id = self.streams_sorted[0]['id']
-def youku_download_playlist(url, output_dir='.', merge=True, info_only=False):
-    """Downloads a Youku playlist.
-    """
-
-    if re.match(r'http://www.youku.com/playlist_show/id_\d+(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html', url):
-        ids = parse_vplaylist(url)
-    elif re.match(r'http://v.youku.com/v_playlist/f\d+o[01]p\d+.html', url):
-        ids = parse_vplaylist(url)
-    elif re.match(r'http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html', url):
-        ids = parse_vplaylist(url)
-    elif re.match(r'http://www.youku.com/show_page/id_\w+.html', url):
-        url = find_video_id_from_show_page(url)
-        assert re.match(r'http://v.youku.com/v_show/id_([\w=]+).html', url), 'URL not supported as playlist'
-        ids = parse_playlist(url)
-    else:
-        ids = []
-    assert ids != []
-
-    title = parse_playlist_title(url, get_html(url))
-    title = filenameable(title)
-    output_dir = os.path.join(output_dir, title)
-
-    for i, id in enumerate(ids):
-        print('Processing %s of %s videos...' % (i + 1, len(ids)))
-        try:
-            id, title = parse_page(youku_url(id))
-            youku_download_by_id(id, title, output_dir=output_dir, merge=merge, info_only=info_only)
-        except:
-            continue
+        m3u8_url = "http://v.youku.com/player/getM3U8/vid/{vid}/type/{stream_id}/video.m3u8".format(vid=self.vid, stream_id=stream_id)
+        m3u8 = get_html(m3u8_url)
+        if not m3u8:
+            log.w('[Warning] This video can only be streamed within Mainland China!')
+            log.w('Use \'-y\' to specify a proxy server for extracting stream data.\n')
-def youku_download(url, output_dir='.', merge=True, info_only=False):
-    """Downloads Youku videos by URL.
- """ - - try: - youku_download_playlist(url, output_dir=output_dir, merge=merge, info_only=info_only) - except: - id, title = parse_page(url) - youku_download_by_id(id, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + self.streams[stream_id]['src'] = __class__.parse_m3u8(m3u8) -site_info = "Youku.com" -download = youku_download -download_playlist = youku_download_playlist +site = Youku() +download = site.download_by_url +download_playlist = playlist_not_supported('youku') + +youku_download_by_vid = site.download_by_vid +# Used by: acfun.py bilibili.py miomio.py tudou.py