Merge remote-tracking branch 'upstream/develop' into develop

2025-02-11 12:42:29 +03:00 · 2015-09-07 07:47:22 +08:00 · 2015-09-07 07:47:22 +08:00 · 7885926815
commit 7885926815
parent 351abbad06 7bdf8af620
12 changed files with 292 additions and 30 deletions
--- a/README.md
+++ b/README.md
@ -43,6 +43,7 @@ Fork me on GitHub: <https://github.com/soimort/you-get>
 * DouyuTV (斗鱼) <http://www.douyutv.com>
 * eHow <http://www.ehow.com>
 * Facebook <http://facebook.com>
 * Fun.tv (风行, Funshion) <http://www.fun.tv/>
 * Google Drive <http://docs.google.com>
 * ifeng (凤凰视频) <http://v.ifeng.com>
 * iQIYI (爱奇艺) <http://www.iqiyi.com>
@ -61,6 +62,7 @@ Fork me on GitHub: <https://github.com/soimort/you-get>
 * QianMo (阡陌视频) <http://qianmo.com/>
 * QQ (腾讯视频) <http://v.qq.com>
 * Sina (新浪视频) <http://video.sina.com.cn>
 * Weibo Miaopai (新浪微博秒拍视频) <http://video.weibo.com/>
 * Sohu (搜狐视频) <http://tv.sohu.com>
 * SongTaste <http://www.songtaste.com>
 * SoundCloud <http://soundcloud.com>
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@ -79,6 +79,24 @@ def match1(text, *patterns):
                ret.append(match.group(1))
        return ret
 def matchall(text, patterns):
    """Collect every match of several regex patterns in one string.

    Args:
        text: The string to scan.
        patterns: An iterable of regex patterns, applied in order.

    Returns:
        A flat list holding the ``re.findall`` results of each pattern,
        in pattern order. The list is empty when nothing matched.
    """
    found = []
    for regex in patterns:
        found.extend(re.findall(regex, text))
    return found
 def launch_player(player, urls):
    import subprocess
    import shlex
@ -922,7 +940,7 @@ def script_main(script_name, download, download_playlist = None):
            sys.exit(1)
 def url_to_module(url):
-    from .extractors import netease, w56, acfun, baidu, baomihua, bilibili, blip, catfun, cntv, cbs, coursera, dailymotion, dongting, douban, douyutv, ehow, facebook, freesound, google, sina, ifeng, alive, instagram, iqiyi, joy, jpopsuki, khan, ku6, kugou, kuwo, letv, lizhi, magisto, miomio, mixcloud, mtv81, nicovideo, pptv, qianmo, qq, sohu, songtaste, soundcloud, ted, theplatform, tudou, tucao, tumblr, twitter, vid48, videobam, vidto, vimeo, vine, vk, xiami, yinyuetai, youku, youtube, zhanqi
+    from .extractors import netease, w56, acfun, baidu, baomihua, bilibili, blip, catfun, cntv, cbs, coursera, dailymotion, dongting, douban, douyutv, ehow, facebook, freesound, funshion, google, sina, ifeng, alive, instagram, iqiyi, joy, jpopsuki, khan, ku6, kugou, kuwo, letv, lizhi, magisto, miaopai, miomio, mixcloud, mtv81, nicovideo, pptv, qianmo, qq, sohu, songtaste, soundcloud, ted, theplatform, tudou, tucao, tumblr, twitter, vid48, videobam, vidto, vimeo, vine, vk, xiami, yinyuetai, youku, youtube, zhanqi
    video_host = r1(r'https?://([^/]+)/', url)
    video_url = r1(r'https?://[^/]+(.*)', url)
@ -953,6 +971,7 @@ def url_to_module(url):
        'ehow': ehow,
        'facebook': facebook,
        'freesound': freesound,
        'fun': funshion,
        'google': google,
        'iask': sina,
        'ifeng': ifeng,
@ -991,6 +1010,7 @@ def url_to_module(url):
        'videobam': videobam,
        'vidto': vidto,
        'vimeo': vimeo,
        'weibo': miaopai,
        'vine': vine,
        'vk': vk,
        'xiami': xiami,
@ -1009,7 +1029,8 @@ def url_to_module(url):
        res = conn.getresponse()
        location = res.getheader('location')
        if location is None:
-            raise NotImplementedError(url)
+            from .extractors import embed
            return embed, url
        else:
            return url_to_module(location)
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@ -15,6 +15,7 @@ from .douyutv import *
 from .ehow import *
 from .facebook import *
 from .freesound import *
 from .funshion import *
 from .google import *
 from .ifeng import *
 from .instagram import *
@ -27,6 +28,7 @@ from .kuwo import *
 from .letv import *
 from .lizhi import *
 from .magisto import *
 from .miaopai import *
 from .miomio import *
 from .mixcloud import *
 from .mtv81 import *
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@ -121,7 +121,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False):
    id = id.split('&')[0]
    if t == 'cid':
        # Multi-P
-        cids = [id]
+        cids = []
        p = re.findall('<option value=\'([^\']*)\'>', html)
        if not p:
            bilibili_download_by_cid(id, title, output_dir=output_dir, merge=merge, info_only=info_only)
--- a/src/you_get/extractors/cntv.py
+++ b/src/you_get/extractors/cntv.py
@ -12,9 +12,9 @@ def cntv_download_by_id(id, title = None, output_dir = '.', merge = True, info_o
    info = json.loads(get_html('http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid=' + id))
    title = title or info['title']
    video = info['video']
-    alternatives = [x for x in video.keys() if x.startswith('chapters')]
+    alternatives = [x for x in video.keys() if x.endswith('hapters')]
-    #assert alternatives in (['chapters'], ['chapters', 'chapters2']), alternatives
+    #assert alternatives in (['chapters'], ['lowChapters', 'chapters'], ['chapters', 'lowChapters']), alternatives
-    chapters = video['chapters2'] if 'chapters2' in video else video['chapters']
+    chapters = video['chapters'] if 'chapters' in video else video['lowChapters']
    urls = [x['url'] for x in chapters]
    ext = r1(r'\.([^.]+)$', urls[0])
    assert ext in ('flv', 'mp4')
@ -29,7 +29,7 @@ def cntv_download_by_id(id, title = None, output_dir = '.', merge = True, info_o
 def cntv_download(url, output_dir = '.', merge = True, info_only = False):
    if re.match(r'http://\w+\.cntv\.cn/(\w+/\w+/(classpage/video/)?)?\d+/\d+\.shtml', url) or re.match(r'http://\w+.cntv.cn/(\w+/)*VIDE\d+.shtml', url):
-        id = r1(r'<!--repaste.video.code.begin-->(\w+)<!--repaste.video.code.end-->', get_html(url))
+        id = r1(r'videoCenterId","(\w+)"', get_html(url))
    elif re.match(r'http://xiyou.cntv.cn/v-[\w-]+\.html', url):
        id = r1(r'http://xiyou.cntv.cn/v-([\w-]+)\.html', url)
    else:
--- a/src/you_get/extractors/dailymotion.py
+++ b/src/you_get/extractors/dailymotion.py
@ -8,18 +8,17 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False)
    """Downloads Dailymotion videos by URL.
    """
-    id = match1(url, r'/video/([^\?]+)') or match1(url, r'video=([^\?]+)')
+    html = get_content(url)
-    embed_url = 'http://www.dailymotion.com/embed/video/%s' % id
+    info = json.loads(match1(html, r'qualities":({.+?}),"'))
-    html = get_content(embed_url)
+    title = match1(html, r'"video_title"\s*:\s*"(.+?)",')
-    info = json.loads(match1(html, r'var\s*info\s*=\s*({.+}),\n'))
+    for quality in ['720','480','380','240','auto']:
-
+        try:
-    title = info['title']
+            real_url = info[quality][0]["url"]
-
+            if real_url:
-    for quality in ['stream_h264_hd1080_url', 'stream_h264_hd_url', 'stream_h264_hq_url', 'stream_h264_url', 'stream_h264_ld_url']:
+                break
-        real_url = info[quality]
+        except KeyError:
-        if real_url:
+            pass
            break
    type, ext, size = url_info(real_url)
--- a/src/you_get/extractors/embed.py
+++ b/src/you_get/extractors/embed.py
@ -0,0 +1,51 @@
 __all__ = ['embed_download']
 from ..common import *
 from .letv import letvcloud_download_by_vu
 from .qq import qq_download_by_vid
 from .sina import sina_download_by_vid
 from .tudou import tudou_download_by_id
 from .youku import youku_download_by_vid
 """
 refer to http://open.youku.com/tools
 """
 youku_embed_patterns = [
    # Raw strings keep the regex escapes intact without relying on Python
    # passing unknown escape sequences such as "\." through unchanged.
    r"youku\.com/v_show/id_([a-zA-Z0-9=]+)",
    r"player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf",
    r"loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)",
    r"player\.youku\.com/embed/([a-zA-Z0-9=]+)",
    # The dot in "YKU.Player" is escaped so it only matches a literal dot.
    r"YKU\.Player\('[a-zA-Z0-9]+',{ client_id: '[a-zA-Z0-9]+', vid: '([a-zA-Z0-9]+)'",
 ]
 # Example embed URL:
 # http://www.tudou.com/programs/view/html5embed.action?type=0&amp;code=3LS_URGvl54&amp;lcode=&amp;resourceId=0_06_05_99
 tudou_embed_patterns = [
    # The capture group is a plain character class; the former "[[...]" form
    # was a nested set that also accepted a literal "[" inside the code.
    r"tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_]+)\&",
 ]
 # Refer to http://open.tudou.com/wiki/video/info
 tudou_api_patterns = []
 def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwargs):
    """Download every Youku/Tudou video embedded in the page at ``url``.

    The page is fetched once and its ``<title>`` text is passed as the title
    of every video found. Youku embeds are handled first, then Tudou embeds.

    Args:
        url: Address of the page to scan for embedded players.
        output_dir: Forwarded to the site-specific downloader.
        merge: Forwarded to the site-specific downloader.
        info_only: Forwarded to the site-specific downloader.
        **kwargs: Accepted but not used here.

    Raises:
        NotImplementedError: No known embed pattern matched the page.
    """
    page = get_content(url)
    page_title = match1(page, '<title>([^<>]+)</title>')
    # Each entry pairs a pattern table with the downloader for its video ids.
    handlers = (
        (youku_embed_patterns, youku_download_by_vid),
        (tudou_embed_patterns, tudou_download_by_id),
    )
    matched_any = False
    for patterns, downloader in handlers:
        for vid in matchall(page, patterns):
            matched_any = True
            downloader(vid, title=page_title, output_dir=output_dir, merge=merge, info_only=info_only)
    if not matched_any:
        raise NotImplementedError(url)
 site_info = "any.any"
 download = embed_download
 download_playlist = playlist_not_supported('any.any')
--- a/src/you_get/extractors/funshion.py
+++ b/src/you_get/extractors/funshion.py
@ -0,0 +1,154 @@
 #!/usr/bin/env python
 __all__ = ['funshion_download']
 from ..common import *
 import urllib.error
 import json
 #----------------------------------------------------------------------
 def funshion_download(url, output_dir = '.', merge = False, info_only = False, **kwargs):
    """Download from fun.tv, dispatching on the form of the URL.

    ``http://www.fun.tv/vplay/v-<id>`` is treated as a single video and
    ``http://www.fun.tv/vplay/g-<id>`` as a whole drama. Any other URL is
    ignored and the function returns None without doing anything.

    Args:
        url: The fun.tv page URL.
        output_dir: Directory to save into; forwarded to the handler.
        merge: Whether to merge segments; forwarded to the handler.
        info_only: Only print information, do not download; forwarded.
        **kwargs: Accepted so extra keyword options do not raise; unused.
    """
    # The caller's options are forwarded; previously fixed defaults were
    # passed instead, so e.g. info_only=True still triggered a download.
    if re.match(r'http://www\.fun\.tv/vplay/v-(\w+)', url):  # single video
        funshion_download_by_url(url, output_dir=output_dir, merge=merge, info_only=info_only)
    elif re.match(r'http://www\.fun\.tv/vplay/g-(\w+)', url):  # whole drama
        funshion_download_by_drama_url(url, output_dir=output_dir, merge=merge, info_only=info_only)
 # Logics for single video until drama
 #----------------------------------------------------------------------
 def funshion_download_by_url(url, output_dir = '.', merge = False, info_only = False):
    """Download the single fun.tv video a ``/vplay/v-<digits>`` URL points to.

    Args:
        url: A URL of the form ``http://www.fun.tv/vplay/v-<digits>...``.
        output_dir: Forwarded to ``funshion_download_by_vid``.
        merge: Forwarded to ``funshion_download_by_vid``.
        info_only: Forwarded to ``funshion_download_by_vid``.

    Raises:
        ValueError: ``url`` does not contain a numeric single-video id.
    """
    match = re.match(r'http://www\.fun\.tv/vplay/v-(\d+)', url)
    if match is None:
        # Fail with a clear message instead of tripping over an unbound or
        # None ``match`` further down.
        raise ValueError('not a fun.tv single-video URL: %s' % url)
    # Forward the caller's options rather than fixed defaults.
    funshion_download_by_vid(match.group(1), output_dir=output_dir, merge=merge, info_only=info_only)
 #----------------------------------------------------------------------
 def funshion_download_by_vid(vid, output_dir = '.', merge = False, info_only = False):
    """Print information for a single video and, unless told not to, fetch it.

    The title and the list of part URLs are looked up from ``vid``; info is
    printed once per part URL. The download uses the extension reported for
    the last part.

    Args:
        vid: Id of the single video.
        output_dir: Passed to ``download_urls``.
        merge: Passed to ``download_urls``.
        info_only: When true, stop after printing the information.
    """
    video_title = funshion_get_title_by_vid(vid)
    part_urls = funshion_vid_to_urls(vid)
    for part_url in part_urls:
        media_type, ext, size = url_info(part_url)
        print_info(site_info, video_title, media_type, size)
    if info_only:
        return
    download_urls(part_urls, video_title, ext, total_size=None, output_dir=output_dir, merge=merge)
 #----------------------------------------------------------------------
 def funshion_get_title_by_vid(vid):
    """Return the title of a single video.

    Fetches the video profile JSON for ``vid`` and returns its ``name``
    field.
    """
    profile_url = 'http://pv.funshion.com/v5/video/profile?id={vid}&cl=aphone&uc=5'.format(vid = vid)
    profile = json.loads(get_content(profile_url))
    return profile['name']
 #----------------------------------------------------------------------
 def funshion_vid_to_urls(vid):
    """Resolve a single-video id to its download URLs.

    Fetches the play API response for ``vid`` and returns whatever
    ``select_url_from_video_api`` picks from it.
    """
    play_api_url = 'http://pv.funshion.com/v5/video/play/?id={vid}&cl=aphone&uc=5'.format(vid = vid)
    api_response = get_content(play_api_url)
    return select_url_from_video_api(api_response)
 #Logics for drama until helper functions
 #----------------------------------------------------------------------
 def funshion_download_by_drama_url(url, output_dir = '.', merge = False, info_only = False):
    """Download every episode of the drama a ``/vplay/g-<digits>`` URL names.

    Example URL: ``http://www.fun.tv/vplay/g-95785/``.

    Args:
        url: A URL of the form ``http://www.fun.tv/vplay/g-<digits>...``.
        output_dir: Forwarded to ``funshion_download_by_id``.
        merge: Forwarded to ``funshion_download_by_id``.
        info_only: Forwarded to ``funshion_download_by_id``.

    Raises:
        ValueError: ``url`` does not contain a numeric drama id.
    """
    match = re.match(r'http://www\.fun\.tv/vplay/g-(\d+)', url)
    if match is None:
        # Fail with a clear message instead of tripping over an unbound or
        # None ``match`` further down.
        raise ValueError('not a fun.tv drama URL: %s' % url)
    drama_id = match.group(1)
    for episode in funshion_drama_id_to_vid(drama_id):
        # episode[0] is the per-episode id within the drama; it is not the
        # same kind of id as the one used for single videos. The caller's
        # options are forwarded rather than fixed defaults.
        funshion_download_by_id((episode[0], drama_id), output_dir=output_dir, merge=merge, info_only=info_only)
 #----------------------------------------------------------------------
 def funshion_download_by_id(vid_id_tuple, output_dir = '.', merge = False, info_only = False):
    """Print information for one drama episode and, unless told not to, fetch it.

    Args:
        vid_id_tuple: A ``(single_episode_id, drama_id)`` pair.
        output_dir: Passed to ``download_urls``.
        merge: Passed to ``download_urls``.
        info_only: When true, stop after printing the information.
    """
    episode_id, drama_id = vid_id_tuple
    episode_title = funshion_get_title_by_id(episode_id, drama_id)
    part_urls = funshion_id_to_urls(episode_id)
    for part_url in part_urls:
        # The extension of the last part is the one used for the download.
        media_type, ext, size = url_info(part_url)
        print_info(site_info, episode_title, media_type, size)
    if info_only:
        return
    download_urls(part_urls, episode_title, ext, total_size=None, output_dir=output_dir, merge=merge)
 #----------------------------------------------------------------------
 def funshion_drama_id_to_vid(episode_id):
    """List the episodes of a drama as ``(id, num)`` pairs.

    Example: drama id 95785 gives
    ``[('626464', '1'), ('626466', '2'), ('626468', '3'), ...]``.

    NOTE: the ids returned here are NOT the same as the ids used for single
    videos.

    Args:
        episode_id: The drama id sent to the episode-list API.

    Returns:
        One ``(episode['id'], episode['num'])`` tuple per entry of the
        response's ``'episodes'`` list.
    """
    listing_url = 'http://pm.funshion.com/v5/media/episode?id={episode_id}&cl=aphone&uc=5'.format(episode_id = episode_id)
    listing = json.loads(get_content(listing_url))
    # Only the 'episodes' list of the response is used; each entry supplies
    # its 'id' and 'num' fields.
    pairs = []
    for episode in listing['episodes']:
        pairs.append((episode['id'], episode['num']))
    return pairs
 #----------------------------------------------------------------------
 def funshion_id_to_urls(id):
    """Resolve a drama-episode id to its download URLs.

    Fetches the media play API response for ``id`` and returns whatever
    ``select_url_from_video_api`` picks from it.
    """
    play_api_url = 'http://pm.funshion.com/v5/media/play/?id={id}&cl=aphone&uc=5'.format(id = id)
    api_response = get_content(play_api_url)
    return select_url_from_video_api(api_response)
 #----------------------------------------------------------------------
 def funshion_get_title_by_id(single_episode_id, drama_id):
    """Build the title of one episode of a drama.

    Fetches the episode list of ``drama_id`` and looks for the entry whose
    ``'id'`` equals ``str(single_episode_id)``.

    Returns:
        ``"<drama name> - <episode name>"`` for the matching episode, or
        None when no episode in the list has that id.
    """
    listing_url = 'http://pm.funshion.com/v5/media/episode?id={id}&cl=aphone&uc=5'.format(id = drama_id)
    listing = json.loads(get_content(listing_url))
    wanted = str(single_episode_id)
    for episode in listing['episodes']:
        if episode['id'] != wanted:
            continue
        return listing['name'] + ' - ' + episode['name']
    return None
 # Helper functions.
 #----------------------------------------------------------------------
 def select_url_from_video_api(html):
    """Pick the best available quality and return its download URLs.

    Used for both single-video and drama downloads.

    Quality codes, lowest to highest:
        'tv'   (liuchang)
        'dvd'  (biaoqing)
        'hd'   (gaoqing)
        'sdvd' (chaoqing)

    Args:
        html: JSON text of a play API response. Its ``'mp4'`` list holds one
            entry per quality, each with a ``'code'`` and an ``'http'`` URL
            that points to a second JSON document.

    Returns:
        A list with the first URL of every entry in the ``'playlist'`` of
        the second JSON document.

    Raises:
        ValueError: None of the known quality codes is offered.
    """
    api_response = json.loads(html)
    sources = {entry['code']: entry['http'] for entry in api_response['mp4']}
    # Best quality first. 'tv' is the lowest documented code and was missing
    # from the list, so a response offering only 'tv' used to fail with a
    # bare IndexError. 'sd' is kept from the previous preference list.
    for quality in ('sdvd', 'hd', 'dvd', 'sd', 'tv'):
        if quality in sources:
            query_url = sources[quality]
            break
    else:
        raise ValueError('no known quality among: %s' % ', '.join(sorted(sources)))
    # The selected URL returns JSON whose 'playlist' entries each carry a
    # list of mirror URLs under 'urls'; the first one of each is used.
    playlist = json.loads(get_html(query_url))['playlist']
    return [entry['urls'][0] for entry in playlist]
 site_info = "funshion"
 download = funshion_download
 download_playlist = playlist_not_supported('funshion')
--- a/src/you_get/extractors/miaopai.py
+++ b/src/you_get/extractors/miaopai.py
@ -0,0 +1,36 @@
 #!/usr/bin/env python
 __all__ = ['miaopai_download']
 from ..common import *
 import urllib.error
 def miaopai_download(url, output_dir = '.', merge = False, info_only = False):
    '''Download a Weibo Miaopai video, fetching the page as an Android mobile browser.

    URLs that do not look like ``http://video.weibo.com/show?fid=<4 digits>:<32 word chars>``
    are ignored and the function returns None without doing anything.

    Args:
        url: The video page URL.
        output_dir: Passed to ``download_urls``.
        merge: Passed to ``download_urls``.
        info_only: When true, stop after printing the information.
    '''
    if not re.match(r'http://video.weibo.com/show\?fid=(\d{4}:\w{32})\w*', url):
        return
    mobile_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'UTF-8,*;q=0.5',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'
    }
    # Keep only the show?fid=... prefix of the URL and ask for the mp4 variant.
    page_prefix = re.search(r'(http://video.weibo.com/show\?fid=\d{4}:\w{32})\w*', url).group(1)
    webpage_url = page_prefix + '&type=mp4'
    # The download URL is read from the page fetched with the mobile headers.
    mobile_page = get_content(webpage_url, headers=mobile_headers, decoded=True)
    video_url = match1(mobile_page, r'<video src="(.*?)\"\W')
    # The title is read from the page fetched without them.
    normal_page = get_content(webpage_url)
    title = match1(normal_page, r'<meta name="description" content="(.*?)\"\W')
    type_, ext, size = url_info(video_url)
    print_info(site_info, title, type_, size)
    if info_only:
        return
    download_urls([video_url], title, ext, total_size=None, output_dir=output_dir, merge=merge)
 site_info = "miaopai"
 download = miaopai_download
 download_playlist = playlist_not_supported('miaopai')
--- a/src/you_get/extractors/tudou.py
+++ b/src/you_get/extractors/tudou.py
@ -7,7 +7,7 @@ from xml.dom.minidom import parseString
 def tudou_download_by_iid(iid, title, output_dir = '.', merge = True, info_only = False):
    data = json.loads(get_decoded_html('http://www.tudou.com/outplay/goto/getItemSegs.action?iid=%s' % iid))
-    temp = max([data[i] for i in data if 'size' in data[i][0]], key=lambda x:x[0]["size"])
+    temp = max([data[i] for i in data if 'size' in data[i][0]], key=lambda x:sum([part['size'] for part in x]))
    vids, size = [t["k"] for t in temp], sum([t["size"] for t in temp])
    urls = [[n.firstChild.nodeValue.strip()
             for n in
--- a/src/you_get/extractors/yinyuetai.py
+++ b/src/you_get/extractors/yinyuetai.py
@ -4,15 +4,11 @@ __all__ = ['yinyuetai_download', 'yinyuetai_download_by_id']
 from ..common import *
-def yinyuetai_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
+def yinyuetai_download_by_id(vid, title=None, output_dir='.', merge=True, info_only=False):
-    assert title
+    video_info = json.loads(get_html('http://www.yinyuetai.com/insite/get-video-info?json=true&videoId=%s' % vid))
-    html = get_html('http://www.yinyuetai.com/insite/get-video-info?flex=true&videoId=' + id)
+    url_models = video_info['videoInfo']['coreVideoInfo']['videoUrlModels']
-
+    url_models = sorted(url_models, key=lambda i: i['qualityLevel'])
-    for quality in ['he\w*', 'hd\w*', 'hc\w*', '\w+']:
+    url = url_models[-1]['videoUrl']
        url = r1(r'(http://' + quality + '\.yinyuetai\.com/uploads/videos/common/\w+\.(?:flv|mp4)\?(?:sc=[a-f0-9]{16}|v=\d{12}))', html)
        if url:
            break
    assert url
    type = ext = r1(r'\.(flv|mp4)', url)
    _, _, size = url_info(url)
@ -20,7 +16,7 @@ def yinyuetai_download_by_id(id, title = None, output_dir = '.', merge = True, i
    if not info_only:
        download_urls([url], title, ext, size, output_dir, merge = merge)
-def yinyuetai_download(url, output_dir = '.', merge = True, info_only = False):
+def yinyuetai_download(url, output_dir='.', merge=True, info_only=False):
    id = r1(r'http://\w+.yinyuetai.com/video/(\d+)$', url.split('?')[0])
    assert id
    html = get_html(url, 'utf-8')
--- a/src/you_get/extractors/youku.py
+++ b/src/you_get/extractors/youku.py
@ -57,7 +57,8 @@ class Youku(VideoExtractor):
        """
        return match1(url, r'youku\.com/v_show/id_([a-zA-Z0-9=]+)') or \
          match1(url, r'player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf') or \
-          match1(url, r'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)')
+          match1(url, r'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)') or \
          match1(url, r'player\.youku\.com/embed/([a-zA-Z0-9=]+)')
    def get_playlist_id_from_url(url):
        """Extracts playlist ID from URL.