Merge pull request #1 from soimort/develop

merge this
This commit is contained in:
Yohohaha 2016-08-29 19:25:53 +08:00 committed by GitHub
commit 991a608b09
6 changed files with 389 additions and 56 deletions

View File

@ -79,6 +79,7 @@ SITES = {
'videomega' : 'videomega',
'vidto' : 'vidto',
'vimeo' : 'vimeo',
'wanmen' : 'wanmen',
'weibo' : 'miaopai',
'veoh' : 'veoh',
'vine' : 'vine',
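(For context: SITES maps a second-level-domain keyword to an extractor module
name under you_get.extractors. A minimal sketch of how such a table can drive
dispatch; the actual resolution logic in common.py may differ:)

from importlib import import_module

def url_to_extractor(url, sites=SITES):
    domain = url.split('/')[2]                # e.g. 'www.wanmen.org'
    for part in reversed(domain.split('.')):  # try 'org', then 'wanmen', ...
        if part in sites:
            return import_module('you_get.extractors.' + sites[part])
    return None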

View File

@ -7,6 +7,7 @@ from .baidu import *
from .bandcamp import *
from .bigthink import *
from .bilibili import *
from .bokecc import *
from .cbs import *
from .ckplayer import *
from .cntv import *
@ -73,6 +74,7 @@ from .vimeo import *
from .vine import *
from .vk import *
from .w56 import *
from .wanmen import *
from .xiami import *
from .yinyuetai import *
from .yixia import *

View File

@ -2,17 +2,96 @@
__all__ = ['bilibili_download']
import json
import re
from ..common import *
def get_srt_xml(cid):
return get_html('http://comment.bilibili.com/%s.xml' % cid)
from .sina import sina_download_by_vid
from .tudou import tudou_download_by_id
from .youku import youku_download_by_vid
def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, **kwargs):
title = r1(r'cid=(\d+)', url)
info = json.loads(get_content(url))
urls = [i['url'] for i in info['durl']]
import hashlib
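# SECRETKEY_MINILOADER is appended to the playurl query string and MD5-hashed
# to produce the 'sign' parameter (see bilibili_download_by_cids below).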
appkey = 'f3bb208b3d081dc8'
SECRETKEY_MINILOADER = '1c15888dc316e05a15fdd0a02ed6584f'
def get_srt_xml(id):
url = 'http://comment.bilibili.com/%s.xml' % id
return get_html(url)
def parse_srt_p(p):
fields = p.split(',')
assert len(fields) == 8, fields
time, mode, font_size, font_color, pub_time, pool, user_id, history = fields
time = float(time)
mode = int(mode)
assert 1 <= mode <= 8
# mode 1~3: scrolling
# mode 4: bottom
# mode 5: top
# mode 6: reverse?
# mode 7: position
# mode 8: advanced
pool = int(pool)
assert 0 <= pool <= 2
# pool 0: normal
# pool 1: srt
# pool 2: special?
font_size = int(font_size)
font_color = '#%06x' % int(font_color)
return pool, mode, font_size, font_color
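# For illustration, a hypothetical attribute string and its parse:
#   parse_srt_p('3.802,1,25,16777215,1472468605,0,12345678,0')
#   returns (0, 1, 25, '#ffffff'), i.e. normal pool, scrolling, size 25, white.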
def parse_srt_xml(xml):
d = re.findall(r'<d p="([^"]+)">(.*)</d>', xml)
for x, y in d:
p = parse_srt_p(x)
raise NotImplementedError()
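# NOTE: parse_srt_xml() is a stub; it parses each comment's attributes but
# producing actual subtitle output is not implemented yet.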
def parse_cid_playurl(xml):
from xml.dom.minidom import parseString
try:
doc = parseString(xml.encode('utf-8'))
urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')]
return urls
except:
return []
def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False):
urls = []
for cid in cids:
sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
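    # URLs on *.qqvideo.tc.qq.com may not be directly downloadable;
    # rewrite them to the vsrc.store.qq.com origin.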
urls += [i
if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
for i in parse_cid_playurl(get_content(url))]
type_ = ''
size = 0
for url in urls:
_, type_, temp = url_info(url)
size += temp
print_info(site_info, title, type_, size)
if not info_only:
download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
urls = [i
if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
for i in parse_cid_playurl(get_content(url))]
type_ = ''
size = 0
@ -27,50 +106,82 @@ def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, *
if not info_only:
download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    if re.match(r'https?://interface\.bilibili\.com/', url):
        # quick hack for explicit API
        bilibili_download_by_api(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
        return

    html = get_content(url)
    main_title = r1_of([r'<meta name="title" content="\s*([^<>]{1,999})\s*" />',
                        r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
    cid = r1(r'cid=(\d+)', html)
    aid = r1(r'av(\d+)', url)
    page = r1(r'index_(\d+)', url)
    sub_titles = re.findall(r'<option value=.*>\s*([^<>]+)\s*</option>', html)
    if page is None and sub_titles:  # download all
        for t in enumerate(sub_titles):
            page, sub_title = t[0] + 1, t[1]
            title = main_title + ' - ' + sub_title
            api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
            info = json.loads(get_content(api))
            src = info['src']
            _, type_, size = url_info(src)
            print_info(site_info, title, type_, size)
            if not info_only:
                download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
    else:  # download selected
        if page is None: page = 1
        sub_title = r1(r'<option value=.* selected>\s*([^<>]+)\s*</option>', html)
        if sub_title is None:
            sub_title = r1(r'<option value=.*>\s*([^<>]+)\s*</option>', html)
        if sub_title:
            title = main_title + ' - ' + sub_title
        else:
            title = main_title
        api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
        info = json.loads(get_content(api))
        src = info['src']
        _, type_, size = url_info(src)
        print_info(site_info, title, type_, size)
        if not info_only:
            download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)

def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
    api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid
    urls = parse_cid_playurl(get_content(api_url))

    for url in urls:
        _, type_, _ = url_info(url)
        size = 0
        print_info(site_info, title, type_, size)
        if not info_only:
            download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge)
def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
html = get_content(url)
if re.match(r'https?://bangumi\.bilibili\.com/', url):
# quick hack for bangumi URLs
url = r1(r'"([^"]+)" class="v-av-link"', html)
html = get_content(url)
title = r1_of([r'<meta name="title" content="\s*([^<>]{1,999})\s*" />',
r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
if title:
title = unescape_html(title)
title = escape_file_path(title)
flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"',
r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
assert flashvars
flashvars = flashvars.replace(': ', '=')
t, cid = flashvars.split('=', 1)
cid = cid.split('&')[0]
if t == 'cid':
if re.match(r'https?://live\.bilibili\.com/', url):
title = r1(r'<title>\s*([^<>]+)\s*</title>', html)
bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
else:
# multi-P
cids = []
pages = re.findall(r'<option value=\'([^\']*)\'', html)
titles = re.findall(r'<option value=.*>\s*([^<>]+)\s*</option>', html)
for i, page in enumerate(pages):
html = get_html("http://www.bilibili.com%s" % page)
flashvars = r1_of([r'(cid=\d+)',
r'flashvars="([^"]+)"',
r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
if flashvars:
t, cid = flashvars.split('=', 1)
cids.append(cid.split('&')[0])
if url.endswith(page):
cids = [cid.split('&')[0]]
titles = [titles[i]]
break
# no multi-P
if not pages:
cids = [cid]
titles = [r1(r'<option value=.* selected>\s*([^<>]+)\s*</option>', html) or title]
for i in range(len(cids)):
bilibili_download_by_cid(cids[i],
titles[i],
output_dir=output_dir,
merge=merge,
info_only=info_only)
elif t == 'vid':
sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
elif t == 'ykid':
youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
elif t == 'uid':
tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
else:
raise NotImplementedError(flashvars)
if not info_only and not dry_run:
if not kwargs['caption']:
@ -82,6 +193,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs
with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x:
x.write(xml)
site_info = "bilibili.com"
download = bilibili_download
download_playlist = bilibili_download
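For reference, a hypothetical invocation of the entry point defined above
(the av number is made up; info_only=True only prints format and size):

from you_get.extractors.bilibili import bilibili_download

bilibili_download('http://www.bilibili.com/video/av1234567/',
                  output_dir='.', merge=True, info_only=True,
                  caption=False)  # the danmaku branch reads kwargs['caption']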

View File

@ -0,0 +1,95 @@
#!/usr/bin/env python
from ..common import *
from ..extractor import VideoExtractor
import xml.etree.ElementTree as ET
class BokeCC(VideoExtractor):
name = "BokeCC"
    stream_types = []  # not known in advance; filled in from the API output in prepare()
API_ENDPOINT = 'http://p.bokecc.com/'
    def download_by_id(self, vid='', title=None, output_dir='.', merge=True, info_only=False, **kwargs):
        """str->None

        Keyword arguments:
        vid: the video ID on the BokeCC cloud, something like
             FE3BB999594978049C33DC5901307461

        Calls prepare() to download the video.
        If no title is provided, this method tries to find a proper title
        within the content returned by the API."""

        assert vid

        self.prepare(vid=vid, title=title, **kwargs)
        self.extract(**kwargs)
        self.download(output_dir=output_dir,
                      merge=merge,
                      info_only=info_only, **kwargs)
    def prepare(self, vid='', title=None, **kwargs):
        assert vid

        api_url = self.API_ENDPOINT + \
                  'servlet/playinfo?vid={vid}&m=0'.format(vid=vid)  # returns XML

        html = get_content(api_url)
        self.tree = ET.ElementTree(ET.fromstring(html))

        if self.tree.find('result').text != '1':
            log.wtf('API result says failed!')  # log.wtf() aborts, so no bare raise is needed

        if title is None:
            self.title = '_'.join([i.text for i in self.tree.iterfind('video/videomarks/videomark/markdesc')])
        else:
            self.title = title

        for i in self.tree.iterfind('video/quality'):
            quality = i.attrib['value']
            url = i[0].attrib['playurl']
            self.stream_types.append({'id': quality,
                                      'video_profile': i.attrib['desp']})
            self.streams[quality] = {'url': url,
                                     'video_profile': i.attrib['desp']}

        self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items()))
                               for stream_type in self.__class__.stream_types
                               if stream_type['id'] in self.streams]
    def extract(self, **kwargs):
        for i in self.streams:
            s = self.streams[i]
            _, s['container'], s['size'] = url_info(s['url'])
            s['src'] = [s['url']]

        if 'stream_id' in kwargs and kwargs['stream_id']:
            # Extract the stream
            stream_id = kwargs['stream_id']
            if stream_id not in self.streams:
                log.e('[Error] Invalid video format.')
                log.e('Run \'-i\' command with no specific video format to view all available formats.')
                exit(2)
        else:
            # Extract stream with the best quality
            stream_id = self.streams_sorted[0]['id']

        s = self.streams[stream_id]  # select the chosen stream explicitly
        _, s['container'], s['size'] = url_info(s['url'])
        s['src'] = [s['url']]
site = BokeCC()
# I don't know how to call the player directly so I just put it here
# just in case anyone touches it -- Beining@Aug.24.2016
#download = site.download_by_url
#download_playlist = site.download_by_url
bokecc_download_by_id = site.download_by_id
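For illustration, a hypothetical direct use of the exported helper (the vid is
the example from the docstring above):

from you_get.extractors.bokecc import bokecc_download_by_id

bokecc_download_by_id(vid='FE3BB999594978049C33DC5901307461',
                      title='some_lecture',  # hypothetical title
                      info_only=True)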

View File

@ -6,20 +6,20 @@ from ..common import *
from .embed import *
def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    content_type = get_head(url, headers=fake_headers)['Content-Type']
    if content_type.startswith('text/html'):
        try:
            embed_download(url, output_dir, merge=merge, info_only=info_only)
        except: pass
        else: return

    domains = url.split('/')[2].split('.')
    if len(domains) > 2: domains = domains[1:]
    site_info = '.'.join(domains)

    if content_type.startswith('text/html'):
        # extract an HTML page
        response = get_response(url, faker=True)
        page = str(response.data)
        page_title = r1(r'<title>([^<]*)', page)
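# Net effect of this hunk: universal_download() checks the Content-Type with a
# HEAD request first, and only tries the embed extractors (and fetches the full
# page body) when the target is an HTML document.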

src/you_get/extractors/wanmen.py Executable file
View File

@ -0,0 +1,123 @@
#!/usr/bin/env python
__all__ = ['wanmen_download', 'wanmen_download_by_course', 'wanmen_download_by_course_topic', 'wanmen_download_by_course_topic_part']
from ..common import *
from .bokecc import bokecc_download_by_id
from json import loads
##Helper functions
def _wanmen_get_json_api_content_by_courseID(courseID):
"""int->JSON
Return a parsed JSON tree of WanMen's API."""
return loads(get_content('http://api.wanmen.org/course/getCourseNested/{courseID}'.format(courseID = courseID)))
def _wanmen_get_title_by_json_topic_part(json_content, tIndex, pIndex):
    """JSON, int, int->str

    Get a proper title out of the course, topic and part names."""
return '_'.join([json_content[0]['name'],
json_content[0]['Topics'][tIndex]['name'],
json_content[0]['Topics'][tIndex]['Parts'][pIndex]['name']])
def _wanmen_get_boke_id_by_json_topic_part(json_content, tIndex, pIndex):
    """JSON, int, int->str

    Get the BokeCC video ID for the given topic and part of a course."""
return json_content[0]['Topics'][tIndex]['Parts'][pIndex]['ccVideoLink']
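(The two helpers above assume the getCourseNested response is shaped roughly
like this; a sketch inferred from the indexing, not the documented API:)

# [
#   {'name': 'Course name',
#    'Topics': [
#        {'name': 'Topic name',
#         'Parts': [
#             {'name': 'Part name',
#              'ccVideoLink': 'FE3BB999594978049C33DC5901307461'},  # BokeCC vid
#         ]},
#    ]},
# ]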
##Parsers
def wanmen_download_by_course(json_api_content, output_dir='.', merge=True, info_only=False, **kwargs):
"""int->None
Download a WHOLE course.
Reuse the API call to save time."""
for tIndex in range(len(json_api_content[0]['Topics'])):
for pIndex in range(len(json_api_content[0]['Topics'][tIndex]['Parts'])):
wanmen_download_by_course_topic_part(json_api_content,
tIndex,
pIndex,
output_dir=output_dir,
merge=merge,
info_only=info_only,
**kwargs)
def wanmen_download_by_course_topic(json_api_content, tIndex, output_dir='.', merge=True, info_only=False, **kwargs):
"""int, int->None
Download a TOPIC of a course.
Reuse the API call to save time."""
for pIndex in range(len(json_api_content[0]['Topics'][tIndex]['Parts'])):
wanmen_download_by_course_topic_part(json_api_content,
tIndex,
pIndex,
output_dir=output_dir,
merge=merge,
info_only=info_only,
**kwargs)
def wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex, output_dir='.', merge=True, info_only=False, **kwargs):
"""int, int, int->None
Download ONE PART of the course."""
    title = _wanmen_get_title_by_json_topic_part(json_api_content,
                                                 tIndex,
                                                 pIndex)

    bokeccID = _wanmen_get_boke_id_by_json_topic_part(json_api_content,
                                                      tIndex,
                                                      pIndex)
bokecc_download_by_id(vid = bokeccID, title = title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
##Main entry point
def wanmen_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    if 'wanmen.org' not in url:
        log.wtf('You are at the wrong place dude. This is for WanMen University!')  # log.wtf() aborts

    courseID = int(match1(url, r'course\/(\d+)'))
    assert courseID > 0  # without courseID we cannot do anything

    # tIndex/pIndex are optional query parameters; match1() returns None when absent
    tIndex = match1(url, r'tIndex=(\d+)')
    pIndex = match1(url, r'pIndex=(\d+)')
    tIndex = int(tIndex) if tIndex is not None else None
    pIndex = int(pIndex) if pIndex is not None else None

    json_api_content = _wanmen_get_json_api_content_by_courseID(courseID)

    if pIndex:  # only download ONE single part
        assert tIndex is not None
wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex,
output_dir = output_dir,
merge = merge,
info_only = info_only)
elif tIndex: #download a topic
wanmen_download_by_course_topic(json_api_content, tIndex,
output_dir = output_dir,
merge = merge,
info_only = info_only)
else: #download the whole course
wanmen_download_by_course(json_api_content,
output_dir = output_dir,
merge = merge,
info_only = info_only)
site_info = "WanMen University"
download = wanmen_download
download_playlist = wanmen_download_by_course
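For illustration, the three URL forms the entry point distinguishes (course ID
and indices are made up):

wanmen_download('http://www.wanmen.org/course/101')                    # whole course
wanmen_download('http://www.wanmen.org/course/101?tIndex=2')           # one topic
wanmen_download('http://www.wanmen.org/course/101?tIndex=2&pIndex=3')  # one part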