diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index 3fbf946f..c7c4fac9 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -2,94 +2,17 @@
__all__ = ['bilibili_download']
+import json
+import re
from ..common import *
-from .sina import sina_download_by_vid
-from .tudou import tudou_download_by_id
-from .youku import youku_download_by_vid
+def get_srt_xml(cid):
+ return get_html('http://comment.bilibili.com/%s.xml' % cid)
-import hashlib
-import re
-
-appkey = 'f3bb208b3d081dc8'
-
-
-def get_srt_xml(id):
- url = 'http://comment.bilibili.com/%s.xml' % id
- return get_html(url)
-
-
-def parse_srt_p(p):
- fields = p.split(',')
- assert len(fields) == 8, fields
- time, mode, font_size, font_color, pub_time, pool, user_id, history = fields
- time = float(time)
-
- mode = int(mode)
- assert 1 <= mode <= 8
- # mode 1~3: scrolling
- # mode 4: bottom
- # mode 5: top
- # mode 6: reverse?
- # mode 7: position
- # mode 8: advanced
-
- pool = int(pool)
- assert 0 <= pool <= 2
- # pool 0: normal
- # pool 1: srt
- # pool 2: special?
-
- font_size = int(font_size)
-
- font_color = '#%06x' % int(font_color)
-
- return pool, mode, font_size, font_color
-
-
-def parse_srt_xml(xml):
- d = re.findall(r'(.*)', xml)
- for x, y in d:
- p = parse_srt_p(x)
- raise NotImplementedError()
-
-
-def parse_cid_playurl(xml):
- from xml.dom.minidom import parseString
- try:
- doc = parseString(xml.encode('utf-8'))
- urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')]
- return urls
- except:
- return []
-
-
-def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False):
- urls = []
- for cid in cids:
- url = 'http://interface.bilibili.com/playurl?appkey=' + appkey + '&cid=' + cid
- urls += [i
- if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
- else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
- for i in parse_cid_playurl(get_content(url))]
-
- type_ = ''
- size = 0
- for url in urls:
- _, type_, temp = url_info(url)
- size += temp
-
- print_info(site_info, title, type_, size)
- if not info_only:
- download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
-
-
-def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
- url = 'http://interface.bilibili.com/playurl?appkey=' + appkey + '&cid=' + cid
- urls = [i
- if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
- else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
- for i in parse_cid_playurl(get_content(url))]
+def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, **kwargs):
+ title = r1(r'cid=(\d+)', url)
+ info = json.loads(get_content(url))
+ urls = [i['url'] for i in info['durl']]
type_ = ''
size = 0
@@ -104,82 +27,50 @@ def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=F
if not info_only:
download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
+def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+ if re.match(r'https?://interface\.bilibili\.com/', url):
+ # quick hack for explicit API
+ bilibili_download_by_api(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
+ return
-def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
- api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid
- urls = parse_cid_playurl(get_content(api_url))
+ html = get_content(url)
+ main_title = r1_of([r'',
+ r'
]*>\s*([^<>]+)\s*
'], html)
+ cid = r1(r'cid=(\d+)', html)
- for url in urls:
- _, type_, _ = url_info(url)
- size = 0
+ aid = r1(r'av(\d+)', url)
+ page = r1(r'index_(\d+)', url)
+ sub_titles = re.findall('', html)
+ if page is None and sub_titles: # download all
+ for t in enumerate(sub_titles):
+ page, sub_title = t[0] + 1, t[1]
+ title = main_title + ' - ' + sub_title
+
+ api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
+ info = json.loads(get_content(api))
+ src = info['src']
+ _, type_, size = url_info(src)
+ print_info(site_info, title, type_, size)
+ if not info_only:
+ download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
+
+ else: # download selected
+ if page is None: page = 1
+ sub_title = r1('', html)
+ if sub_title is None:
+ sub_title = r1('', html)
+ if sub_title:
+ title = main_title + ' - ' + sub_title
+ else:
+ title = main_title
+
+ api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
+ info = json.loads(get_content(api))
+ src = info['src']
+ _, type_, size = url_info(src)
print_info(site_info, title, type_, size)
if not info_only:
- download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge)
-
-
-def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
- html = get_content(url)
-
- if re.match(r'https?://bangumi\.bilibili\.com/', url):
- # quick hack for bangumi URLs
- url = r1(r'"([^"]+)" class="v-av-link"', html)
- html = get_content(url)
-
- title = r1_of([r'',
- r']*>\s*([^<>]+)\s*
'], html)
- if title:
- title = unescape_html(title)
- title = escape_file_path(title)
-
- flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"',
- r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
- assert flashvars
- flashvars = flashvars.replace(': ', '=')
- t, cid = flashvars.split('=', 1)
- cid = cid.split('&')[0]
- if t == 'cid':
- if re.match(r'https?://live\.bilibili\.com/', url):
- title = r1(r'\s*([^<>]+)\s*', html)
- bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
-
- else:
- # multi-P
- cids = []
- pages = re.findall('', html)
- for i, page in enumerate(pages):
- html = get_html("http://www.bilibili.com%s" % page)
- flashvars = r1_of([r'(cid=\d+)',
- r'flashvars="([^"]+)"',
- r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
- if flashvars:
- t, cid = flashvars.split('=', 1)
- cids.append(cid.split('&')[0])
- if url.endswith(page):
- cids = [cid.split('&')[0]]
- titles = [titles[i]]
- break
-
- # no multi-P
- if not pages:
- cids = [cid]
- titles = [r1(r'', html) or title]
-
- for i in range(len(cids)):
- bilibili_download_by_cid(cids[i],
- titles[i],
- output_dir=output_dir,
- merge=merge,
- info_only=info_only)
-
- elif t == 'vid':
- sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
- elif t == 'ykid':
- youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
- elif t == 'uid':
- tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
- else:
- raise NotImplementedError(flashvars)
+ download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
if not info_only and not dry_run:
if not kwargs['caption']:
@@ -191,7 +82,6 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs
with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x:
x.write(xml)
-
site_info = "bilibili.com"
download = bilibili_download
download_playlist = bilibili_download