From d66749801f6fbb0aee4e2030e93448e9d46e13f8 Mon Sep 17 00:00:00 2001 From: David Zhuang Date: Sun, 28 Aug 2016 23:39:19 -0400 Subject: [PATCH] [Bilibili] Change back to extract via interface API with Miniloader s key --- src/you_get/extractors/bilibili.py | 203 ++++++++++++++++++++++------- 1 file changed, 155 insertions(+), 48 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index a9caf8ed..e9d3e7ad 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -2,17 +2,96 @@ __all__ = ['bilibili_download'] -import json -import re from ..common import * -def get_srt_xml(cid): - return get_html('http://comment.bilibili.com/%s.xml' % cid) +from .sina import sina_download_by_vid +from .tudou import tudou_download_by_id +from .youku import youku_download_by_vid -def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, **kwargs): - title = r1(r'cid=(\d+)', url) - info = json.loads(get_content(url)) - urls = [i['url'] for i in info['durl']] +import hashlib +import re + +appkey = 'f3bb208b3d081dc8' +SECRETKEY_MINILOADER = '1c15888dc316e05a15fdd0a02ed6584f' + +def get_srt_xml(id): + url = 'http://comment.bilibili.com/%s.xml' % id + return get_html(url) + + +def parse_srt_p(p): + fields = p.split(',') + assert len(fields) == 8, fields + time, mode, font_size, font_color, pub_time, pool, user_id, history = fields + time = float(time) + + mode = int(mode) + assert 1 <= mode <= 8 + # mode 1~3: scrolling + # mode 4: bottom + # mode 5: top + # mode 6: reverse? + # mode 7: position + # mode 8: advanced + + pool = int(pool) + assert 0 <= pool <= 2 + # pool 0: normal + # pool 1: srt + # pool 2: special? + + font_size = int(font_size) + + font_color = '#%06x' % int(font_color) + + return pool, mode, font_size, font_color + + +def parse_srt_xml(xml): + d = re.findall(r'(.*)', xml) + for x, y in d: + p = parse_srt_p(x) + raise NotImplementedError() + + +def parse_cid_playurl(xml): + from xml.dom.minidom import parseString + try: + doc = parseString(xml.encode('utf-8')) + urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')] + return urls + except: + return [] + + +def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False): + urls = [] + for cid in cids: + sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest() + url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this + urls += [i + if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i) + else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i) + for i in parse_cid_playurl(get_content(url))] + + type_ = '' + size = 0 + for url in urls: + _, type_, temp = url_info(url) + size += temp + + print_info(site_info, title, type_, size) + if not info_only: + download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge) + + +def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False): + sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest() + url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this + urls = [i + if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i) + else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i) + for i in parse_cid_playurl(get_content(url))] type_ = '' size = 0 @@ -27,55 +106,82 @@ def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, * if not info_only: download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge) + +def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False): + api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid + urls = parse_cid_playurl(get_content(api_url)) + + for url in urls: + _, type_, _ = url_info(url) + size = 0 + print_info(site_info, title, type_, size) + if not info_only: + download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge) + + def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - if re.match(r'https?://interface\.bilibili\.com/', url): - # quick hack for explicit API - bilibili_download_by_api(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) - return + html = get_content(url) if re.match(r'https?://bangumi\.bilibili\.com/', url): # quick hack for bangumi URLs - html = get_content(url) url = r1(r'"([^"]+)" class="v-av-link"', html) + html = get_content(url) - html = get_content(url) - main_title = r1_of([r'', - r']*>\s*([^<>]+)\s*'], html) - cid = r1(r'cid=(\d+)', html) + title = r1_of([r'', + r']*>\s*([^<>]+)\s*'], html) + if title: + title = unescape_html(title) + title = escape_file_path(title) - aid = r1(r'av(\d+)', url) - page = r1(r'index_(\d+)', url) - sub_titles = re.findall('', html) - if page is None and sub_titles: # download all - for t in enumerate(sub_titles): - page, sub_title = t[0] + 1, t[1] - title = main_title + ' - ' + sub_title + flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"', + r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) + assert flashvars + flashvars = flashvars.replace(': ', '=') + t, cid = flashvars.split('=', 1) + cid = cid.split('&')[0] + if t == 'cid': + if re.match(r'https?://live\.bilibili\.com/', url): + title = r1(r'\s*([^<>]+)\s*', html) + bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) - api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page) - info = json.loads(get_content(api)) - src = info['src'] - _, type_, size = url_info(src) - print_info(site_info, title, type_, size) - if not info_only: - download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge) - - else: # download selected - if page is None: page = 1 - sub_title = r1('', html) - if sub_title is None: - sub_title = r1('', html) - if sub_title: - title = main_title + ' - ' + sub_title else: - title = main_title + # multi-P + cids = [] + pages = re.findall('', html) + for i, page in enumerate(pages): + html = get_html("http://www.bilibili.com%s" % page) + flashvars = r1_of([r'(cid=\d+)', + r'flashvars="([^"]+)"', + r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) + if flashvars: + t, cid = flashvars.split('=', 1) + cids.append(cid.split('&')[0]) + if url.endswith(page): + cids = [cid.split('&')[0]] + titles = [titles[i]] + break - api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page) - info = json.loads(get_content(api)) - src = info['src'] - _, type_, size = url_info(src) - print_info(site_info, title, type_, size) - if not info_only: - download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge) + # no multi-P + if not pages: + cids = [cid] + titles = [r1(r'', html) or title] + + for i in range(len(cids)): + bilibili_download_by_cid(cids[i], + titles[i], + output_dir=output_dir, + merge=merge, + info_only=info_only) + + elif t == 'vid': + sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + elif t == 'ykid': + youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + elif t == 'uid': + tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) + else: + raise NotImplementedError(flashvars) if not info_only and not dry_run: if not kwargs['caption']: @@ -87,6 +193,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x: x.write(xml) + site_info = "bilibili.com" download = bilibili_download -download_playlist = bilibili_download +download_playlist = bilibili_download \ No newline at end of file