Mirror of https://github.com/soimort/you-get.git
Commit 991a608b09
src/you_get/common.py:

@@ -79,6 +79,7 @@ SITES = {
     'videomega' : 'videomega',
     'vidto'     : 'vidto',
     'vimeo'     : 'vimeo',
+    'wanmen'    : 'wanmen',
     'weibo'     : 'miaopai',
     'veoh'      : 'veoh',
     'vine'      : 'vine',
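Note: SITES maps a URL keyword to the extractor module that handles it, which is how the new 'wanmen' entry routes wanmen.org links to the extractor added below. A minimal sketch of the lookup idea (simplified; the real resolution lives in you-get's common.py and also parses the URL host, so treat this as an illustration, not the actual implementation):

    import importlib

    SITES = {'wanmen': 'wanmen'}  # URL keyword -> extractor module name

    def resolve_extractor(keyword):
        # imports src/you_get/extractors/<name>.py for the matched keyword
        return importlib.import_module('you_get.extractors.' + SITES[keyword])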
src/you_get/extractors/__init__.py:

@@ -7,6 +7,7 @@ from .baidu import *
 from .bandcamp import *
 from .bigthink import *
 from .bilibili import *
+from .bokecc import *
 from .cbs import *
 from .ckplayer import *
 from .cntv import *
@@ -73,6 +74,7 @@ from .vimeo import *
 from .vine import *
 from .vk import *
 from .w56 import *
+from .wanmen import *
 from .xiami import *
 from .yinyuetai import *
 from .yixia import *
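Note: each extractor module exposes site_info, download and download_playlist and lists its public entry points in __all__, so the star-imports above are what make the new functions visible at package level. A usage sketch (the course URL shape here is hypothetical):

    from you_get.extractors import wanmen_download  # exported via __all__

    wanmen_download('http://www.wanmen.org/course/1?tIndex=1&pIndex=1',
                    info_only=True)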
src/you_get/extractors/bilibili.py:

@@ -2,17 +2,96 @@
 
 __all__ = ['bilibili_download']
 
-import json
-import re
-
 from ..common import *
+from .sina import sina_download_by_vid
+from .tudou import tudou_download_by_id
+from .youku import youku_download_by_vid
+
+import hashlib
+import re
+
+appkey = 'f3bb208b3d081dc8'
+SECRETKEY_MINILOADER = '1c15888dc316e05a15fdd0a02ed6584f'
 
-def get_srt_xml(cid):
-    return get_html('http://comment.bilibili.com/%s.xml' % cid)
+def get_srt_xml(id):
+    url = 'http://comment.bilibili.com/%s.xml' % id
+    return get_html(url)
+
+
+def parse_srt_p(p):
+    fields = p.split(',')
+    assert len(fields) == 8, fields
+    time, mode, font_size, font_color, pub_time, pool, user_id, history = fields
+    time = float(time)
+
+    mode = int(mode)
+    assert 1 <= mode <= 8
+    # mode 1~3: scrolling
+    # mode 4: bottom
+    # mode 5: top
+    # mode 6: reverse?
+    # mode 7: position
+    # mode 8: advanced
+
+    pool = int(pool)
+    assert 0 <= pool <= 2
+    # pool 0: normal
+    # pool 1: srt
+    # pool 2: special?
+
+    font_size = int(font_size)
+
+    font_color = '#%06x' % int(font_color)
+
+    return pool, mode, font_size, font_color
+
+
+def parse_srt_xml(xml):
+    d = re.findall(r'<d p="([^"]+)">(.*)</d>', xml)
+    for x, y in d:
+        p = parse_srt_p(x)
+    raise NotImplementedError()
+
+
+def parse_cid_playurl(xml):
+    from xml.dom.minidom import parseString
+    try:
+        doc = parseString(xml.encode('utf-8'))
+        urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')]
+        return urls
+    except:
+        return []
+
+
+def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False):
+    urls = []
+    for cid in cids:
+        sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
+        url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
+        urls += [i
+                 if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
+                 else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
+                 for i in parse_cid_playurl(get_content(url))]
+
+    type_ = ''
+    size = 0
+    for url in urls:
+        _, type_, temp = url_info(url)
+        size += temp
+
+    print_info(site_info, title, type_, size)
+    if not info_only:
+        download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
+
 
-def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    title = r1(r'cid=(\d+)', url)
-    info = json.loads(get_content(url))
-    urls = [i['url'] for i in info['durl']]
+def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
+    sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
+    url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
+    urls = [i
+            if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
+            else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
+            for i in parse_cid_playurl(get_content(url))]
+
     type_ = ''
     size = 0
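Note: the playurl requests above are signed by appending the miniloader secret key to the query string and hashing with md5. A standalone sketch of the same computation (the cid value is made up for illustration):

    import hashlib

    SECRETKEY_MINILOADER = '1c15888dc316e05a15fdd0a02ed6584f'
    cid = '1036'  # hypothetical cid scraped from a video page
    sign = hashlib.md5(
        bytes('cid={}&from=miniplay&player=1{}'.format(cid, SECRETKEY_MINILOADER),
              'utf-8')).hexdigest()
    url = ('http://interface.bilibili.com/playurl?&cid=' + cid +
           '&from=miniplay&player=1&sign=' + sign)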
@@ -27,50 +106,82 @@ def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, **kwargs):
     if not info_only:
         download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
 
+
+def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
+    api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid
+    urls = parse_cid_playurl(get_content(api_url))
+
+    for url in urls:
+        _, type_, _ = url_info(url)
+        size = 0
+        print_info(site_info, title, type_, size)
+        if not info_only:
+            download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge)
+
+
 def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    if re.match(r'https?://interface\.bilibili\.com/', url):
-        # quick hack for explicit API
-        bilibili_download_by_api(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
-        return
-
     html = get_content(url)
-    main_title = r1_of([r'<meta name="title" content="\s*([^<>]{1,999})\s*" />',
+
+    if re.match(r'https?://bangumi\.bilibili\.com/', url):
+        # quick hack for bangumi URLs
+        url = r1(r'"([^"]+)" class="v-av-link"', html)
+        html = get_content(url)
+
+    title = r1_of([r'<meta name="title" content="\s*([^<>]{1,999})\s*" />',
                    r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
-    cid = r1(r'cid=(\d+)', html)
-
-    aid = r1(r'av(\d+)', url)
-    page = r1(r'index_(\d+)', url)
-    sub_titles = re.findall('<option value=.*>\s*([^<>]+)\s*</option>', html)
-    if page is None and sub_titles: # download all
-        for t in enumerate(sub_titles):
-            page, sub_title = t[0] + 1, t[1]
-            title = main_title + ' - ' + sub_title
-            api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
-            info = json.loads(get_content(api))
-            src = info['src']
-            _, type_, size = url_info(src)
-            print_info(site_info, title, type_, size)
-            if not info_only:
-                download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
-
-    else: # download selected
-        if page is None: page = 1
-        sub_title = r1('<option value=.* selected>\s*([^<>]+)\s*</option>', html)
-        if sub_title is None:
-            sub_title = r1('<option value=.*>\s*([^<>]+)\s*</option>', html)
-        if sub_title:
-            title = main_title + ' - ' + sub_title
+    if title:
+        title = unescape_html(title)
+        title = escape_file_path(title)
+
+    flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"',
+                       r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
+    assert flashvars
+    flashvars = flashvars.replace(': ', '=')
+    t, cid = flashvars.split('=', 1)
+    cid = cid.split('&')[0]
+    if t == 'cid':
+        if re.match(r'https?://live\.bilibili\.com/', url):
+            title = r1(r'<title>\s*([^<>]+)\s*</title>', html)
+            bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
+
         else:
-            title = main_title
-        api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
-        info = json.loads(get_content(api))
-        src = info['src']
-        _, type_, size = url_info(src)
-        print_info(site_info, title, type_, size)
-        if not info_only:
-            download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
+            # multi-P
+            cids = []
+            pages = re.findall('<option value=\'([^\']*)\'', html)
+            titles = re.findall('<option value=.*>\s*([^<>]+)\s*</option>', html)
+            for i, page in enumerate(pages):
+                html = get_html("http://www.bilibili.com%s" % page)
+                flashvars = r1_of([r'(cid=\d+)',
+                                   r'flashvars="([^"]+)"',
+                                   r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
+                if flashvars:
+                    t, cid = flashvars.split('=', 1)
+                    cids.append(cid.split('&')[0])
+                if url.endswith(page):
+                    cids = [cid.split('&')[0]]
+                    titles = [titles[i]]
+                    break
+
+            # no multi-P
+            if not pages:
+                cids = [cid]
+                titles = [r1(r'<option value=.* selected>\s*([^<>]+)\s*</option>', html) or title]
+
+            for i in range(len(cids)):
+                bilibili_download_by_cid(cids[i],
+                                         titles[i],
+                                         output_dir=output_dir,
+                                         merge=merge,
+                                         info_only=info_only)
+
+    elif t == 'vid':
+        sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
+    elif t == 'ykid':
+        youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
+    elif t == 'uid':
+        tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
+    else:
+        raise NotImplementedError(flashvars)
 
     if not info_only and not dry_run:
         if not kwargs['caption']:
@@ -82,6 +193,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
         with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x:
             x.write(xml)
 
+
 site_info = "bilibili.com"
 download = bilibili_download
 download_playlist = bilibili_download
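Note: bilibili_download now dispatches on the flashvars key: 'cid' is handled natively (live rooms and multi-P pages included), while 'vid', 'ykid' and 'uid' delegate to the sina, youku and tudou extractors. A tiny sketch of the parsing step (the input string is made up):

    flashvars = 'cid=1234567&aid=89'  # hypothetical match from the page
    t, cid = flashvars.split('=', 1)  # t == 'cid', cid == '1234567&aid=89'
    cid = cid.split('&')[0]           # cid == '1234567'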
src/you_get/extractors/bokecc.py (new file, 95 lines):

@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+from ..common import *
+from ..extractor import VideoExtractor
+import xml.etree.ElementTree as ET
+
+
+class BokeCC(VideoExtractor):
+    name = "BokeCC"
+
+    stream_types = [  # we do not know for now, as we have to check the
+                      # output from the API
+    ]
+
+    API_ENDPOINT = 'http://p.bokecc.com/'
+
+    def download_by_id(self, vid='', title=None, output_dir='.', merge=True, info_only=False, **kwargs):
+        """self, str->None
+
+        Keyword arguments:
+        self: self
+        vid: The video ID for BokeCC cloud, something like
+        FE3BB999594978049C33DC5901307461
+
+        Calls the prepare() to download the video.
+
+        If no title is provided, this method shall try to find a proper title
+        with the information provided within the
+        returned content of the API."""
+
+        assert vid
+
+        self.prepare(vid=vid, title=title, **kwargs)
+        self.extract(**kwargs)
+        self.download(output_dir=output_dir,
+                      merge=merge,
+                      info_only=info_only, **kwargs)
+
+    def prepare(self, vid='', title=None, **kwargs):
+        assert vid
+
+        api_url = self.API_ENDPOINT + \
+                  'servlet/playinfo?vid={vid}&m=0'.format(vid=vid)  # returns XML
+
+        html = get_content(api_url)
+        self.tree = ET.ElementTree(ET.fromstring(html))
+
+        if self.tree.find('result').text != '1':
+            log.wtf('API result says failed!')
+            raise
+
+        if title is None:
+            self.title = '_'.join([i.text for i in self.tree.iterfind('video/videomarks/videomark/markdesc')])
+        else:
+            self.title = title
+
+        for i in self.tree.iterfind('video/quality'):
+            quality = i.attrib['value']
+            url = i[0].attrib['playurl']
+            self.stream_types.append({'id': quality,
+                                      'video_profile': i.attrib['desp']})
+            self.streams[quality] = {'url': url,
+                                     'video_profile': i.attrib['desp']}
+        self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams]
+
+    def extract(self, **kwargs):
+        for i in self.streams:
+            s = self.streams[i]
+            _, s['container'], s['size'] = url_info(s['url'])
+            s['src'] = [s['url']]
+        if 'stream_id' in kwargs and kwargs['stream_id']:
+            # Extract the stream
+            stream_id = kwargs['stream_id']
+
+            if stream_id not in self.streams:
+                log.e('[Error] Invalid video format.')
+                log.e('Run \'-i\' command with no specific video format to view all available formats.')
+                exit(2)
+        else:
+            # Extract stream with the best quality
+            stream_id = self.streams_sorted[0]['id']
+        _, s['container'], s['size'] = url_info(s['url'])
+        s['src'] = [s['url']]
+
+
+site = BokeCC()
+
+# I don't know how to call the player directly so I just put it here
+# just in case anyone touches it -- Beining@Aug.24.2016
+#download = site.download_by_url
+#download_playlist = site.download_by_url
+
+bokecc_download_by_id = site.download_by_id
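Note: only bokecc_download_by_id is wired up, since BokeCC-hosted videos are reached through embedding sites (wanmen below) rather than by a direct URL. A minimal usage sketch, reusing the vid format quoted in the docstring:

    from you_get.extractors.bokecc import bokecc_download_by_id

    # info_only=True prints container/size instead of downloading
    bokecc_download_by_id(vid='FE3BB999594978049C33DC5901307461',
                          info_only=True)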
src/you_get/extractors/universal.py:

@@ -6,6 +6,8 @@ from ..common import *
 from .embed import *
 
 def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    content_type = get_head(url, headers=fake_headers)['Content-Type']
+    if content_type.startswith('text/html'):
         try:
             embed_download(url, output_dir, merge=merge, info_only=info_only)
         except: pass
@@ -15,11 +17,9 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
     if len(domains) > 2: domains = domains[1:]
     site_info = '.'.join(domains)
 
-    response = get_response(url, faker=True)
-    content_type = response.headers['Content-Type']
-
     if content_type.startswith('text/html'):
         # extract an HTML page
+        response = get_response(url, faker=True)
         page = str(response.data)
 
         page_title = r1(r'<title>([^<]*)', page)
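Note: the change probes the Content-Type with a HEAD request before the page body is fetched, so the embed pass only runs for HTML and the later code can reuse the result. A minimal sketch of the gate, assuming you-get's get_head helper and fake_headers dict:

    from you_get.common import get_head, fake_headers

    content_type = get_head('http://example.com/',
                            headers=fake_headers)['Content-Type']
    if content_type.startswith('text/html'):
        pass  # parse the page and try embedded players first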
src/you_get/extractors/wanmen.py (new executable file, 123 lines):

@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+__all__ = ['wanmen_download', 'wanmen_download_by_course', 'wanmen_download_by_course_topic', 'wanmen_download_by_course_topic_part']
+
+from ..common import *
+from .bokecc import bokecc_download_by_id
+from json import loads
+
+
+##Helper functions
+def _wanmen_get_json_api_content_by_courseID(courseID):
+    """int->JSON
+
+    Return a parsed JSON tree of WanMen's API."""
+
+    return loads(get_content('http://api.wanmen.org/course/getCourseNested/{courseID}'.format(courseID = courseID)))
+
+
+def _wanmen_get_title_by_json_topic_part(json_content, tIndex, pIndex):
+    """JSON, int, int->str
+
+    Get a proper title with courseid+topicID+partID."""
+
+    return '_'.join([json_content[0]['name'],
+                     json_content[0]['Topics'][tIndex]['name'],
+                     json_content[0]['Topics'][tIndex]['Parts'][pIndex]['name']])
+
+
+def _wanmen_get_boke_id_by_json_topic_part(json_content, tIndex, pIndex):
+    """JSON, int, int->str
+
+    Get one BokeCC video ID with courseid+topicID+partID."""
+
+    return json_content[0]['Topics'][tIndex]['Parts'][pIndex]['ccVideoLink']
+
+
+##Parsers
+def wanmen_download_by_course(json_api_content, output_dir='.', merge=True, info_only=False, **kwargs):
+    """int->None
+
+    Download a WHOLE course.
+    Reuse the API call to save time."""
+
+    for tIndex in range(len(json_api_content[0]['Topics'])):
+        for pIndex in range(len(json_api_content[0]['Topics'][tIndex]['Parts'])):
+            wanmen_download_by_course_topic_part(json_api_content,
+                                                 tIndex,
+                                                 pIndex,
+                                                 output_dir=output_dir,
+                                                 merge=merge,
+                                                 info_only=info_only,
+                                                 **kwargs)
+
+
+def wanmen_download_by_course_topic(json_api_content, tIndex, output_dir='.', merge=True, info_only=False, **kwargs):
+    """int, int->None
+
+    Download a TOPIC of a course.
+    Reuse the API call to save time."""
+
+    for pIndex in range(len(json_api_content[0]['Topics'][tIndex]['Parts'])):
+        wanmen_download_by_course_topic_part(json_api_content,
+                                             tIndex,
+                                             pIndex,
+                                             output_dir=output_dir,
+                                             merge=merge,
+                                             info_only=info_only,
+                                             **kwargs)
+
+
+def wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex, output_dir='.', merge=True, info_only=False, **kwargs):
+    """int, int, int->None
+
+    Download ONE PART of the course."""
+
+    html = json_api_content
+
+    title = _wanmen_get_title_by_json_topic_part(html,
+                                                 tIndex,
+                                                 pIndex)
+
+    bokeccID = _wanmen_get_boke_id_by_json_topic_part(html,
+                                                      tIndex,
+                                                      pIndex)
+
+    bokecc_download_by_id(vid = bokeccID, title = title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
+
+
+##Main entrance
+def wanmen_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+
+    if 'wanmen.org' not in url:
+        log.wtf('You are at the wrong place dude. This is for WanMen University!')
+        raise
+
+    courseID = int(match1(url, r'course\/(\d+)'))
+    assert courseID > 0  # without courseID we cannot do anything
+
+    tIndex = int(match1(url, r'tIndex=(\d+)'))
+
+    pIndex = int(match1(url, r'pIndex=(\d+)'))
+
+    json_api_content = _wanmen_get_json_api_content_by_courseID(courseID)
+
+    if pIndex:  # only download ONE single part
+        assert tIndex >= 0
+        wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex,
+                                             output_dir = output_dir,
+                                             merge = merge,
+                                             info_only = info_only)
+    elif tIndex:  # download a topic
+        wanmen_download_by_course_topic(json_api_content, tIndex,
+                                        output_dir = output_dir,
+                                        merge = merge,
+                                        info_only = info_only)
+    else:  # download the whole course
+        wanmen_download_by_course(json_api_content,
+                                  output_dir = output_dir,
+                                  merge = merge,
+                                  info_only = info_only)
+
+
+site_info = "WanMen University"
+download = wanmen_download
+download_playlist = wanmen_download_by_course
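Note: wanmen_download picks its granularity from the URL: pIndex selects one part, tIndex alone selects a topic, and a bare course URL downloads the whole course, reusing a single getCourseNested API response throughout. A usage sketch (course id and indices are hypothetical; the URL shape follows the regexes above):

    from you_get.extractors.wanmen import wanmen_download

    # part 2 of topic 1 of course 42
    wanmen_download('http://www.wanmen.org/course/42?tIndex=1&pIndex=2',
                    info_only=True)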