From 7cecd0368dea3e8e7318f48908e39d3d8405c95f Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 21 Aug 2016 21:44:05 +0200 Subject: [PATCH] [bilibili] close #1340 --- src/you_get/extractors/bilibili.py | 206 +++++++---------------------- 1 file changed, 48 insertions(+), 158 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 3fbf946f..c7c4fac9 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -2,94 +2,17 @@ __all__ = ['bilibili_download'] +import json +import re from ..common import * -from .sina import sina_download_by_vid -from .tudou import tudou_download_by_id -from .youku import youku_download_by_vid +def get_srt_xml(cid): + return get_html('http://comment.bilibili.com/%s.xml' % cid) -import hashlib -import re - -appkey = 'f3bb208b3d081dc8' - - -def get_srt_xml(id): - url = 'http://comment.bilibili.com/%s.xml' % id - return get_html(url) - - -def parse_srt_p(p): - fields = p.split(',') - assert len(fields) == 8, fields - time, mode, font_size, font_color, pub_time, pool, user_id, history = fields - time = float(time) - - mode = int(mode) - assert 1 <= mode <= 8 - # mode 1~3: scrolling - # mode 4: bottom - # mode 5: top - # mode 6: reverse? - # mode 7: position - # mode 8: advanced - - pool = int(pool) - assert 0 <= pool <= 2 - # pool 0: normal - # pool 1: srt - # pool 2: special? - - font_size = int(font_size) - - font_color = '#%06x' % int(font_color) - - return pool, mode, font_size, font_color - - -def parse_srt_xml(xml): - d = re.findall(r'(.*)', xml) - for x, y in d: - p = parse_srt_p(x) - raise NotImplementedError() - - -def parse_cid_playurl(xml): - from xml.dom.minidom import parseString - try: - doc = parseString(xml.encode('utf-8')) - urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')] - return urls - except: - return [] - - -def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False): - urls = [] - for cid in cids: - url = 'http://interface.bilibili.com/playurl?appkey=' + appkey + '&cid=' + cid - urls += [i - if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i) - else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i) - for i in parse_cid_playurl(get_content(url))] - - type_ = '' - size = 0 - for url in urls: - _, type_, temp = url_info(url) - size += temp - - print_info(site_info, title, type_, size) - if not info_only: - download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge) - - -def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False): - url = 'http://interface.bilibili.com/playurl?appkey=' + appkey + '&cid=' + cid - urls = [i - if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i) - else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i) - for i in parse_cid_playurl(get_content(url))] +def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, **kwargs): + title = r1(r'cid=(\d+)', url) + info = json.loads(get_content(url)) + urls = [i['url'] for i in info['durl']] type_ = '' size = 0 @@ -104,82 +27,50 @@ def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=F if not info_only: download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge) +def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs): + if re.match(r'https?://interface\.bilibili\.com/', url): + # quick hack for explicit API + bilibili_download_by_api(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) + return -def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False): - api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid - urls = parse_cid_playurl(get_content(api_url)) + html = get_content(url) + main_title = r1_of([r'', + r']*>\s*([^<>]+)\s*'], html) + cid = r1(r'cid=(\d+)', html) - for url in urls: - _, type_, _ = url_info(url) - size = 0 + aid = r1(r'av(\d+)', url) + page = r1(r'index_(\d+)', url) + sub_titles = re.findall('', html) + if page is None and sub_titles: # download all + for t in enumerate(sub_titles): + page, sub_title = t[0] + 1, t[1] + title = main_title + ' - ' + sub_title + + api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page) + info = json.loads(get_content(api)) + src = info['src'] + _, type_, size = url_info(src) + print_info(site_info, title, type_, size) + if not info_only: + download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge) + + else: # download selected + if page is None: page = 1 + sub_title = r1('', html) + if sub_title is None: + sub_title = r1('', html) + if sub_title: + title = main_title + ' - ' + sub_title + else: + title = main_title + + api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page) + info = json.loads(get_content(api)) + src = info['src'] + _, type_, size = url_info(src) print_info(site_info, title, type_, size) if not info_only: - download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge) - - -def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - html = get_content(url) - - if re.match(r'https?://bangumi\.bilibili\.com/', url): - # quick hack for bangumi URLs - url = r1(r'"([^"]+)" class="v-av-link"', html) - html = get_content(url) - - title = r1_of([r'', - r']*>\s*([^<>]+)\s*'], html) - if title: - title = unescape_html(title) - title = escape_file_path(title) - - flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"', - r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) - assert flashvars - flashvars = flashvars.replace(': ', '=') - t, cid = flashvars.split('=', 1) - cid = cid.split('&')[0] - if t == 'cid': - if re.match(r'https?://live\.bilibili\.com/', url): - title = r1(r'\s*([^<>]+)\s*', html) - bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) - - else: - # multi-P - cids = [] - pages = re.findall('', html) - for i, page in enumerate(pages): - html = get_html("http://www.bilibili.com%s" % page) - flashvars = r1_of([r'(cid=\d+)', - r'flashvars="([^"]+)"', - r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) - if flashvars: - t, cid = flashvars.split('=', 1) - cids.append(cid.split('&')[0]) - if url.endswith(page): - cids = [cid.split('&')[0]] - titles = [titles[i]] - break - - # no multi-P - if not pages: - cids = [cid] - titles = [r1(r'', html) or title] - - for i in range(len(cids)): - bilibili_download_by_cid(cids[i], - titles[i], - output_dir=output_dir, - merge=merge, - info_only=info_only) - - elif t == 'vid': - sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) - elif t == 'ykid': - youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) - elif t == 'uid': - tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) - else: - raise NotImplementedError(flashvars) + download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge) if not info_only and not dry_run: if not kwargs['caption']: @@ -191,7 +82,6 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x: x.write(xml) - site_info = "bilibili.com" download = bilibili_download download_playlist = bilibili_download