From 7cecd0368dea3e8e7318f48908e39d3d8405c95f Mon Sep 17 00:00:00 2001
From: Mort Yao <soi@mort.ninja>
Date: Sun, 21 Aug 2016 21:44:05 +0200
Subject: [PATCH] [bilibili] close #1340

---
 src/you_get/extractors/bilibili.py | 206 +++++++----------------------
 1 file changed, 48 insertions(+), 158 deletions(-)
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index 3fbf946f..c7c4fac9 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -2,94 +2,17 @@
 
 __all__ = ['bilibili_download']
 
+import json
+import re
 from ..common import *
 
-from .sina import sina_download_by_vid
-from .tudou import tudou_download_by_id
-from .youku import youku_download_by_vid
+def get_srt_xml(cid):
+    return get_html('http://comment.bilibili.com/%s.xml' % cid)
 
-import hashlib
-import re
-
-appkey = 'f3bb208b3d081dc8'
-
-
-def get_srt_xml(id):
-    url = 'http://comment.bilibili.com/%s.xml' % id
-    return get_html(url)
-
-
-def parse_srt_p(p):
-    fields = p.split(',')
-    assert len(fields) == 8, fields
-    time, mode, font_size, font_color, pub_time, pool, user_id, history = fields
-    time = float(time)
-
-    mode = int(mode)
-    assert 1 <= mode <= 8
-    # mode 1~3: scrolling
-    # mode 4: bottom
-    # mode 5: top
-    # mode 6: reverse?
-    # mode 7: position
-    # mode 8: advanced
-
-    pool = int(pool)
-    assert 0 <= pool <= 2
-    # pool 0: normal
-    # pool 1: srt
-    # pool 2: special?
-
-    font_size = int(font_size)
-
-    font_color = '#%06x' % int(font_color)
-
-    return pool, mode, font_size, font_color
-
-
-def parse_srt_xml(xml):
-    d = re.findall(r'<d p="([^"]+)">(.*)</d>', xml)
-    for x, y in d:
-        p = parse_srt_p(x)
-    raise NotImplementedError()
-
-
-def parse_cid_playurl(xml):
-    from xml.dom.minidom import parseString
-    try:
-        doc = parseString(xml.encode('utf-8'))
-        urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')]
-        return urls
-    except:
-        return []
-
-
-def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False):
-    urls = []
-    for cid in cids:
-        url = 'http://interface.bilibili.com/playurl?appkey=' + appkey + '&cid=' + cid
-        urls += [i
-                 if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
-                 else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
-                 for i in parse_cid_playurl(get_content(url))]
-
-    type_ = ''
-    size = 0
-    for url in urls:
-        _, type_, temp = url_info(url)
-        size += temp
-
-    print_info(site_info, title, type_, size)
-    if not info_only:
-        download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
-
-
-def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
-    url = 'http://interface.bilibili.com/playurl?appkey=' + appkey + '&cid=' + cid
-    urls = [i
-            if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
-            else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
-            for i in parse_cid_playurl(get_content(url))]
+def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    title = r1(r'cid=(\d+)', url)
+    info = json.loads(get_content(url))
+    urls = [i['url'] for i in info['durl']]
 
     type_ = ''
     size = 0
@@ -104,82 +27,50 @@ def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=F
     if not info_only:
         download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
 
+def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    if re.match(r'https?://interface\.bilibili\.com/', url):
+        # quick hack for explicit API
+        bilibili_download_by_api(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
+        return
 
-def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
-    api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid
-    urls = parse_cid_playurl(get_content(api_url))
+    html = get_content(url)
+    main_title = r1_of([r'<meta name="title" content="\s*([^<>]{1,999})\s*" />',
+                        r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
+    cid = r1(r'cid=(\d+)', html)
 
-    for url in urls:
-        _, type_, _ = url_info(url)
-        size = 0
+    aid = r1(r'av(\d+)', url)
+    page = r1(r'index_(\d+)', url)
+    sub_titles = re.findall('<option value=.*>\s*([^<>]+)\s*</option>', html)
+    if page is None and sub_titles: # download all
+        for t in enumerate(sub_titles):
+            page, sub_title = t[0] + 1, t[1]
+            title = main_title + ' - ' + sub_title
+
+            api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
+            info = json.loads(get_content(api))
+            src = info['src']
+            _, type_, size = url_info(src)
+            print_info(site_info, title, type_, size)
+            if not info_only:
+                download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
+
+    else: # download selected
+        if page is None: page = 1
+        sub_title = r1('<option value=.* selected>\s*([^<>]+)\s*</option>', html)
+        if sub_title is None:
+            sub_title = r1('<option value=.*>\s*([^<>]+)\s*</option>', html)
+        if sub_title:
+            title = main_title + ' - ' + sub_title
+        else:
+            title = main_title
+
+        api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
+        info = json.loads(get_content(api))
+        src = info['src']
+        _, type_, size = url_info(src)
         print_info(site_info, title, type_, size)
         if not info_only:
-            download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge)
-
-
-def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    html = get_content(url)
-
-    if re.match(r'https?://bangumi\.bilibili\.com/', url):
-        # quick hack for bangumi URLs
-        url = r1(r'"([^"]+)" class="v-av-link"', html)
-        html = get_content(url)
-
-    title = r1_of([r'<meta name="title" content="\s*([^<>]{1,999})\s*" />',
-                   r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
-    if title:
-        title = unescape_html(title)
-        title = escape_file_path(title)
-
-    flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"',
-                       r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
-    assert flashvars
-    flashvars = flashvars.replace(': ', '=')
-    t, cid = flashvars.split('=', 1)
-    cid = cid.split('&')[0]
-    if t == 'cid':
-        if re.match(r'https?://live\.bilibili\.com/', url):
-            title = r1(r'<title>\s*([^<>]+)\s*</title>', html)
-            bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
-
-        else:
-            # multi-P
-            cids = []
-            pages = re.findall('<option value=\'([^\']*)\'', html)
-            titles = re.findall('<option value=.*>\s*([^<>]+)\s*</option>', html)
-            for i, page in enumerate(pages):
-                html = get_html("http://www.bilibili.com%s" % page)
-                flashvars = r1_of([r'(cid=\d+)',
-                                   r'flashvars="([^"]+)"',
-                                   r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
-                if flashvars:
-                    t, cid = flashvars.split('=', 1)
-                    cids.append(cid.split('&')[0])
-                if url.endswith(page):
-                    cids = [cid.split('&')[0]]
-                    titles = [titles[i]]
-                    break
-
-            # no multi-P
-            if not pages:
-                cids = [cid]
-                titles = [r1(r'<option value=.* selected>\s*([^<>]+)\s*</option>', html) or title]
-
-            for i in range(len(cids)):
-                bilibili_download_by_cid(cids[i],
-                                         titles[i],
-                                         output_dir=output_dir,
-                                         merge=merge,
-                                         info_only=info_only)
-
-    elif t == 'vid':
-        sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
-    elif t == 'ykid':
-        youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
-    elif t == 'uid':
-        tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
-    else:
-        raise NotImplementedError(flashvars)
+            download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
 
     if not info_only and not dry_run:
         if not kwargs['caption']:
@@ -191,7 +82,6 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs
         with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x:
             x.write(xml)
 
-
 site_info = "bilibili.com"
 download = bilibili_download
 download_playlist = bilibili_download