From b50cc2338f21714e9b6ef7c43f2818c5eb6abf63 Mon Sep 17 00:00:00 2001 From: Zhang Ning Date: Thu, 3 Sep 2015 11:28:43 +0800 Subject: [PATCH 1/2] support embed player for youku/tudou Try sites one by one to search for videos, but not in video order. Signed-off-by: Zhang Ning --- src/you_get/common.py | 3 ++- src/you_get/extractors/embed.py | 44 +++++++++++++++++++++++++++++++++ src/you_get/extractors/youku.py | 3 ++- 3 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 src/you_get/extractors/embed.py diff --git a/src/you_get/common.py b/src/you_get/common.py index 0a79ab98..9435f113 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -1011,7 +1011,8 @@ def url_to_module(url): res = conn.getresponse() location = res.getheader('location') if location is None: - raise NotImplementedError(url) + from .extractors import embed + return embed, url else: return url_to_module(location) diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py new file mode 100644 index 00000000..423d85cd --- /dev/null +++ b/src/you_get/extractors/embed.py @@ -0,0 +1,44 @@ +__all__ = ['embed_download'] + +from ..common import * + +from .letv import letvcloud_download_by_vu +from .qq import qq_download_by_vid +from .sina import sina_download_by_vid +from .tudou import tudou_download_by_id +from .youku import youku_download_by_vid +from .youku import Youku + +""" +refer to http://open.youku.com/tools +""" +youku_api_pattern = 'YKU.Player\(\'[a-zA-Z0-9]+\',{ client_id: \'[a-zA-Z0-9]+\', vid: \'([a-zA-Z0-9]+)\'' +""" +http://www.tudou.com/programs/view/html5embed.action?type=0&code=3LS_URGvl54&lcode=&resourceId=0_06_05_99 +""" +tudou_embed_pattern = 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([[a-zA-Z0-9_]+)\&' + +""" +refer to http://open.tudou.com/wiki/video/info +""" + +def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwargs): + content = get_content(url) + found = False + title = match1(content, '<title>([^<>]+)</title>') + vid = 
Youku.get_vid_from_url(content) or \ + match1(content, youku_api_pattern) + if vid is not None: + found = True + youku_download_by_vid(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + + vid = match1(content, tudou_embed_pattern) + if vid is not None: + found = True + tudou_download_by_id(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + if not found: + raise NotImplementedError(url) + +site_info = "any.any" +download = embed_download +download_playlist = playlist_not_supported('any.any') diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index c9d98bfd..448feeb5 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -57,7 +57,8 @@ class Youku(VideoExtractor): """ return match1(url, r'youku\.com/v_show/id_([a-zA-Z0-9=]+)') or \ match1(url, r'player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf') or \ - match1(url, r'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)') + match1(url, r'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)') or \ + match1(url, r'player\.youku\.com/embed/([a-zA-Z0-9=]+)') def get_playlist_id_from_url(url): """Extracts playlist ID from URL. From c73a636d43dc0efe93f08cff0da51290db6317e6 Mon Sep 17 00:00:00 2001 From: Zhang Ning Date: Thu, 3 Sep 2015 15:46:29 +0800 Subject: [PATCH 2/2] support multi video in one page Add matchall API. matchall: almost the same as match1, but it returns a list of all matches. --- src/you_get/common.py | 18 ++++++++++++++++++ src/you_get/extractors/embed.py | 23 +++++++++++++++-------- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 9435f113..36be1999 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -79,6 +79,24 @@ def match1(text, *patterns): ret.append(match.group(1)) return ret +def matchall(text, patterns): + """Scans through a string for substrings matched some patterns. + + Args: + text: A string to be scanned. 
+ patterns: a list of regex pattern. + + Returns: + a list if matched. empty if not. + """ + + ret = [] + for pattern in patterns: + match = re.findall(pattern, text) + ret += match + + return ret + def launch_player(player, urls): import subprocess import shlex diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py index 423d85cd..e3a929b4 100644 --- a/src/you_get/extractors/embed.py +++ b/src/you_get/extractors/embed.py @@ -7,35 +7,42 @@ from .qq import qq_download_by_vid from .sina import sina_download_by_vid from .tudou import tudou_download_by_id from .youku import youku_download_by_vid -from .youku import Youku """ refer to http://open.youku.com/tools """ -youku_api_pattern = 'YKU.Player\(\'[a-zA-Z0-9]+\',{ client_id: \'[a-zA-Z0-9]+\', vid: \'([a-zA-Z0-9]+)\'' +youku_embed_patterns = [ 'youku\.com/v_show/id_([a-zA-Z0-9=]+)', + 'player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf', + 'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)', + 'player\.youku\.com/embed/([a-zA-Z0-9=]+)', + 'YKU.Player\(\'[a-zA-Z0-9]+\',{ client_id: \'[a-zA-Z0-9]+\', vid: \'([a-zA-Z0-9]+)\'' + ] + """ http://www.tudou.com/programs/view/html5embed.action?type=0&code=3LS_URGvl54&lcode=&resourceId=0_06_05_99 """ -tudou_embed_pattern = 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([[a-zA-Z0-9_]+)\&' +tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([[a-zA-Z0-9_]+)\&' + ] """ refer to http://open.tudou.com/wiki/video/info """ +tudou_api_patterns = [ ] def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwargs): content = get_content(url) found = False title = match1(content, '<title>([^<>]+)</title>') - vid = Youku.get_vid_from_url(content) or \ - match1(content, youku_api_pattern) - if vid is not None: + vids = matchall(content, youku_embed_patterns) + for vid in vids: found = True youku_download_by_vid(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) - vid = match1(content, tudou_embed_pattern) - if vid is not None: + vids = 
matchall(content, tudou_embed_patterns) + for vid in vids: found = True tudou_download_by_id(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + if not found: raise NotImplementedError(url)