diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py
index af7cc824..1a68dbaf 100644
--- a/src/you_get/extractor.py
+++ b/src/you_get/extractor.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-from .common import match1, maybe_print, download_urls, get_filename, parse_host, set_proxy, unset_proxy
+from .common import match1, maybe_print, download_urls, get_filename, parse_host, set_proxy, unset_proxy, get_content, dry_run
 from .common import print_more_compatible as print
 from .util import log
 from . import json_output
@@ -28,6 +28,10 @@ class VideoExtractor():
         self.password_protected = False
         self.dash_streams = {}
         self.caption_tracks = {}
+        self.out = False
+        self.ua = None
+        self.referer = None
+        self.danmuku = None
 
         if args:
             self.url = args[0]
@@ -39,6 +43,8 @@ class VideoExtractor():
         if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']:
             set_proxy(parse_host(kwargs['extractor_proxy']))
         self.prepare(**kwargs)
+        if self.out:
+            return
         if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']:
             unset_proxy()
@@ -99,7 +105,8 @@ class VideoExtractor():
             print("      quality:       %s" % stream['quality'])
 
         if 'size' in stream and stream['container'].lower() != 'm3u8':
-            print("      size:          %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
+            if stream['size'] != float('inf') and stream['size'] != 0:
+                print("      size:          %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
 
         if 'itag' in stream:
             print("    # download-with: %s" % log.sprint("you-get --itag=%s [URL]" % stream_id, log.UNDERLINE))
@@ -202,12 +209,17 @@ class VideoExtractor():
             if not urls:
                 log.wtf('[Failed] Cannot extract video source.')
             # For legacy main()
-            download_urls(urls, self.title, ext, total_size,
+            headers = {}
+            if self.ua is not None:
+                headers['User-Agent'] = self.ua
+            if self.referer is not None:
+                headers['Referer'] = self.referer
+            download_urls(urls, self.title, ext, total_size, headers=headers,
                           output_dir=kwargs['output_dir'],
                           merge=kwargs['merge'],
                           av=stream_id in self.dash_streams)
             if 'caption' not in kwargs or not kwargs['caption']:
-                print('Skipping captions.')
+                print('Skipping captions or danmuku.')
                 return
             for lang in self.caption_tracks:
                 filename = '%s.%s.srt' % (get_filename(self.title), lang)
@@ -217,6 +229,11 @@ class VideoExtractor():
                           'w', encoding='utf-8') as x:
                     x.write(srt)
                 print('Done.')
+            if self.danmuku is not None and not dry_run:
+                filename = '{}.cmt.xml'.format(get_filename(self.title))
+                print('Downloading {} ...\n'.format(filename))
+                with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp:
+                    fp.write(self.danmuku)
 
         # For main_dev()
         #download_urls(urls, self.title, self.streams[stream_id]['container'], self.streams[stream_id]['size'])
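# --- Example (editor's sketch, not part of the patch) -----------------------
# The extractor.py hunks above thread a per-extractor User-Agent/Referer into
# download_urls() by building the headers dict only from the fields the
# extractor actually set. A minimal standalone sketch of the same pattern,
# assuming nothing beyond the standard library; the function name and URL are
# placeholders:
import urllib.request

def fetch_with_optional_headers(url, ua=None, referer=None):
    headers = {}
    if ua is not None:
        headers['User-Agent'] = ua
    if referer is not None:
        headers['Referer'] = referer
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as res:
        return res.read()
# -----------------------------------------------------------------------------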
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index ccb395cb..f904ea49 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -2,210 +2,315 @@
 
 __all__ = ['bilibili_download']
 
-from ..common import *
+import hashlib
+import re
+import time
+import json
+import http.cookiejar
+import urllib.request
+import urllib.parse
+from xml.dom.minidom import parseString
+
+from ..common import *
+from ..util.log import *
+from ..extractor import *
+
+from .qq import qq_download_by_vid
 from .sina import sina_download_by_vid
 from .tudou import tudou_download_by_id
 from .youku import youku_download_by_vid
 
-import hashlib
-import re
+class Bilibili(VideoExtractor):
+    name = 'Bilibili'
+    live_api = 'http://live.bilibili.com/api/playurl?cid={}&otype=json'
+    api_url = 'http://interface.bilibili.com/playurl?'
+    bangumi_api_url = 'http://bangumi.bilibili.com/player/web_api/playurl?'
+
+    SEC1 = '1c15888dc316e05a15fdd0a02ed6584f'
+    SEC2 = '9b288147e5474dd2aa67085f716c560d'
+    stream_types = [
+        {'id': 'hdflv'},
+        {'id': 'flv'},
+        {'id': 'hdmp4'},
+        {'id': 'mp4'},
+        {'id': 'live'}
+    ]
+    fmt2qlt = dict(hdflv=4, flv=3, hdmp4=2, mp4=1)
 
-appkey = 'f3bb208b3d081dc8'
-SECRETKEY_MINILOADER = '1c15888dc316e05a15fdd0a02ed6584f'
+    @staticmethod
+    def bilibili_stream_type(urls):
+        url = urls[0]
+        if 'hd.flv?' in url:
+            return 'hdflv', 'flv'
+        if '.flv?' in url:
+            return 'flv', 'flv'
+        if 'hd.mp4?' in url:
+            return 'hdmp4', 'mp4'
+        if '.mp4?' in url:
+            return 'mp4', 'mp4'
+        raise Exception('Unknown stream type')
 
-def get_srt_xml(id):
-    url = 'http://comment.bilibili.com/%s.xml' % id
-    return get_html(url)
+    def api_req(self, cid, quality, bangumi):
+        ts = str(int(time.time()))
+        if not bangumi:
+            params_str = 'cid={}&player=1&quality={}&ts={}'.format(cid, quality, ts)
+            chksum = hashlib.md5(bytes(params_str + self.SEC1, 'utf8')).hexdigest()
+            api_url = self.api_url + params_str + '&sign=' + chksum
+        else:
+            params_str = 'cid={}&module=bangumi&player=1&quality={}&ts={}'.format(cid, quality, ts)
+            chksum = hashlib.md5(bytes(params_str + self.SEC2, 'utf8')).hexdigest()
+            api_url = self.bangumi_api_url + params_str + '&sign=' + chksum
+
+        xml_str = get_content(api_url)
+        return xml_str
+
+    def parse_bili_xml(self, xml_str):
+        urls_list = []
+        total_size = 0
+        doc = parseString(xml_str.encode('utf8'))
+        durls = doc.getElementsByTagName('durl')
+        for durl in durls:
+            size = durl.getElementsByTagName('size')[0]
+            total_size += int(size.firstChild.nodeValue)
+            url = durl.getElementsByTagName('url')[0]
+            urls_list.append(url.firstChild.nodeValue)
+        stream_type, container = self.bilibili_stream_type(urls_list)
+        if stream_type not in self.streams:
+            self.streams[stream_type] = {}
+            self.streams[stream_type]['src'] = urls_list
+            self.streams[stream_type]['size'] = total_size
+            self.streams[stream_type]['container'] = container
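# --- Example (editor's sketch, not part of the patch) -----------------------
# api_req() above signs the playurl query by MD5-hashing the literal query
# string with a secret appended (SEC1 for regular videos, SEC2 for bangumi).
# A standalone version of the non-bangumi branch; SEC1 is the constant from
# the patch, the function name is an illustration:
import hashlib
import time

SEC1 = '1c15888dc316e05a15fdd0a02ed6584f'

def signed_playurl(cid, quality):
    ts = str(int(time.time()))
    # Parameter order matters: the sign is an MD5 over this exact string.
    params_str = 'cid={}&player=1&quality={}&ts={}'.format(cid, quality, ts)
    chksum = hashlib.md5((params_str + SEC1).encode('utf-8')).hexdigest()
    return 'http://interface.bilibili.com/playurl?' + params_str + '&sign=' + chksum

# signed_playurl(1036, 3)
#  -> 'http://interface.bilibili.com/playurl?cid=1036&player=1&quality=3&ts=...&sign=...'
# -----------------------------------------------------------------------------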
 
+    def download_by_vid(self, cid, bangumi, **kwargs):
+        stream_id = kwargs.get('stream_id')
+        # Guard here: if stream_id is invalid, fall back as if no stream_id was given.
+        if stream_id and stream_id in self.fmt2qlt:
+            quality = stream_id
+        else:
+            quality = 'hdflv' if bangumi else 'flv'
+
+        info_only = kwargs.get('info_only')
+        if not info_only or stream_id:
+            # quality is a valid key at this point, so this won't be None
+            qlt = self.fmt2qlt.get(quality)
+            api_xml = self.api_req(cid, qlt, bangumi)
+            self.parse_bili_xml(api_xml)
+            self.danmuku = get_danmuku_xml(cid)
+        else:
+            for qlt in range(4, 0, -1):
+                api_xml = self.api_req(cid, qlt, bangumi)
+                self.parse_bili_xml(api_xml)
+
+    def prepare(self, **kwargs):
+        self.ua = fake_headers['User-Agent']
+        self.url = url_locations([self.url])[0]
+        frag = urllib.parse.urlparse(self.url).fragment
+        # e.g. http://www.bilibili.com/video/av3141144/index_2.html#page=3
+        if frag:
+            hit = re.search(r'page=(\d+)', frag)
+            if hit is not None:
+                page = hit.group(1)
+                aid = re.search(r'av(\d+)', self.url).group(1)
+                self.url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, page)
+        self.referer = self.url
+        self.page = get_content(self.url)
+
+        try:
+            self.title = re.search(r'<title>([^<]+)</title>', self.page).group(1)
+        except AttributeError:
+            log.wtf('[Failed] Cannot extract video title.')
+        if 'subtitle' in kwargs:
+            self.title = '{} {}'.format(self.title, kwargs['subtitle'])
+
+        if 'bangumi.bilibili' in self.url:
+            self.bangumi_entry(**kwargs)
+        elif 'live.bilibili' in self.url:
+            self.live_entry(**kwargs)
+        else:
+            self.entry(**kwargs)
+
+    def entry(self, **kwargs):
+        # Tencent player?
+        tc_flashvars = re.search(r'"bili-cid=\d+&bili-aid=\d+&vid=([^"]+)"', self.page)
+        if tc_flashvars is not None:
+            self.out = True
+            qq_download_by_vid(tc_flashvars.group(1), self.title,
+                               output_dir=kwargs['output_dir'],
+                               merge=kwargs['merge'],
+                               info_only=kwargs['info_only'])
+            return
+
+        cid = re.search(r'cid=(\d+)', self.page)
+        if cid is not None:
+            self.download_by_vid(cid.group(1), False, **kwargs)
+            return
+
+        # Fall back to an embedded third-party player.
+        flashvars = re.search(r'flashvars="([^"]+)"', self.page).group(1)
+        t, vid = flashvars.split('&')[0].split('=', 1)
+        self.out = True
+        if t == 'vid':
+            sina_download_by_vid(vid, self.title, output_dir=kwargs['output_dir'], merge=kwargs['merge'], info_only=kwargs['info_only'])
+        elif t == 'ykid':
+            youku_download_by_vid(vid, self.title, output_dir=kwargs['output_dir'], merge=kwargs['merge'], info_only=kwargs['info_only'])
+        elif t == 'uid':
+            tudou_download_by_id(vid, self.title, output_dir=kwargs['output_dir'], merge=kwargs['merge'], info_only=kwargs['info_only'])
+        else:
+            raise NotImplementedError('Unknown flashvars {}'.format(flashvars))
+
+    def live_entry(self, **kwargs):
+        self.room_id = re.search(r'ROOMID\s*=\s*(\d+)', self.page).group(1)
+        api_url = self.live_api.format(self.room_id)
+        json_data = json.loads(get_content(api_url))
+        urls = [json_data['durl'][0]['url']]
+
+        self.streams['live'] = {}
+        self.streams['live']['src'] = urls
+        self.streams['live']['container'] = 'flv'
+        self.streams['live']['size'] = 0
+
+    def bangumi_entry(self, **kwargs):
+        bangumi_id = re.search(r'(\d+)', self.url).group(1)
+        bangumi_data = get_bangumi_info(bangumi_id)
+        bangumi_payment = bangumi_data.get('payment')
+        if bangumi_payment and bangumi_payment['price'] != '0':
+            log.w("It's a paid item")
+        ep_ids = collect_bangumi_epids(bangumi_data)
+
+        frag = urllib.parse.urlparse(self.url).fragment
+        if frag:
+            episode_id = frag
+        else:
+            episode_id = re.search(r'first_ep_id\s*=\s*"(\d+)"', self.page).group(1)
+        cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data=dict(episode_id=episode_id))
+        cid = json.loads(cont)['result']['cid']
+        cont = get_content('http://bangumi.bilibili.com/web_api/episode/{}.json'.format(episode_id))
+        ep_info = json.loads(cont)['result']['currentEpisode']
+
+        long_title = ep_info['longTitle']
+        aid = ep_info['avId']
+
+        idx = 0
+        while ep_ids[idx] != episode_id:
+            idx += 1
+
+        self.title = '{} [{} {}]'.format(self.title, idx + 1, long_title)
+        self.download_by_vid(cid, bangumi=True, **kwargs)
 
-def parse_srt_p(p):
-    fields = p.split(',')
-    assert len(fields) == 8, fields
-    time, mode, font_size, font_color, pub_time, pool, user_id, history = fields
-    time = float(time)
+def check_oversea():
+    url = 'https://interface.bilibili.com/player?id=cid:17778881'
+    xml_lines = get_content(url).split('\n')
+    for line in xml_lines:
+        key = line.split('>')[0][1:]
+        if key == 'country':
+            value = line.split('>')[1].split('<')[0]
+            if value != '中国':
+                return True
+            else:
+                return False
+    return False
 
-    mode = int(mode)
-    assert 1 <= mode <= 8
-    # mode 1~3: scrolling
-    # mode 4: bottom
-    # mode 5: top
-    # mode 6: reverse?
-    # mode 7: position
-    # mode 8: advanced
+def check_sid():
+    if not cookies:
+        return False
+    for cookie in cookies:
+        if cookie.domain == '.bilibili.com' and cookie.name == 'sid':
+            return True
+    return False
 
-    pool = int(pool)
-    assert 0 <= pool <= 2
-    # pool 0: normal
-    # pool 1: srt
-    # pool 2: special?
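# --- Example (editor's sketch, not part of the patch) -----------------------
# prepare() above rewrites a '#page=N' fragment into a concrete sub-page URL
# before fetching, so every page of a multi-P video resolves to its own
# index_N.html. The same transformation in isolation:
import re
import urllib.parse

def normalize_page_fragment(url):
    frag = urllib.parse.urlparse(url).fragment
    if frag:
        hit = re.search(r'page=(\d+)', frag)
        if hit is not None:
            page = hit.group(1)
            aid = re.search(r'av(\d+)', url).group(1)
            return 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, page)
    return url

# normalize_page_fragment('http://www.bilibili.com/video/av3141144/index_2.html#page=3')
#  -> 'http://www.bilibili.com/video/av3141144/index_3.html'
# -----------------------------------------------------------------------------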
+def fetch_sid(cid, aid):
+    url = 'http://interface.bilibili.com/player?id=cid:{}&aid={}'.format(cid, aid)
+    cookies = http.cookiejar.CookieJar()
+    req = urllib.request.Request(url)
+    res = urllib.request.urlopen(url)
+    cookies.extract_cookies(res, req)
+    for c in cookies:
+        if c.domain == '.bilibili.com' and c.name == 'sid':
+            return c.value
+    raise Exception('Cannot find sid in cookies from {}'.format(url))
 
-    font_size = int(font_size)
+def collect_bangumi_epids(json_data):
+    eps = json_data['result']['episodes']
+    eps = sorted(eps, key=lambda item: int(item['index']))
+    result = []
+    for ep in eps:
+        result.append(ep['episode_id'])
+    return result
 
-    font_color = '#%06x' % int(font_color)
-
-    return pool, mode, font_size, font_color
-
-
-def parse_srt_xml(xml):
-    d = re.findall(r'<d p="([^"]+)">(.*)</d>', xml)
-    for x, y in d:
-        p = parse_srt_p(x)
-    raise NotImplementedError()
+def get_bangumi_info(bangumi_id):
+    BASE_URL = 'http://bangumi.bilibili.com/jsonp/seasoninfo/'
+    long_epoch = int(time.time() * 1000)
+    req_url = BASE_URL + bangumi_id + '.ver?callback=seasonListCallback&jsonp=jsonp&_=' + str(long_epoch)
+    season_data = get_content(req_url)
+    season_data = season_data[len('seasonListCallback('):]
+    season_data = season_data[: -1 * len(');')]
+    json_data = json.loads(season_data)
+    return json_data
 
+def get_danmuku_xml(cid):
+    return get_content('http://comment.bilibili.com/{}.xml'.format(cid))
 
 def parse_cid_playurl(xml):
     from xml.dom.minidom import parseString
     try:
+        urls_list = []
+        total_size = 0
         doc = parseString(xml.encode('utf-8'))
-        urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')]
-        return urls
-    except:
-        return []
+        durls = doc.getElementsByTagName('durl')
+        cdn_cnt = len(durls[0].getElementsByTagName('url'))
+        for i in range(cdn_cnt):
+            urls_list.append([])
+        for durl in durls:
+            size = durl.getElementsByTagName('size')[0]
+            total_size += int(size.firstChild.nodeValue)
+            cnt = len(durl.getElementsByTagName('url'))
+            for i in range(cnt):
+                u = durl.getElementsByTagName('url')[i].firstChild.nodeValue
+                urls_list[i].append(u)
+        return urls_list, total_size
+    except Exception as e:
+        log.w(e)
+        return [], 0
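# --- Example (editor's sketch, not part of the patch) -----------------------
# get_bangumi_info() above unwraps a JSONP response by slicing off the
# 'seasonListCallback(' prefix and the ');' suffix before handing the rest
# to json.loads. The same unwrapping, shown with a synthetic payload (not
# real API output):
import json

def unwrap_jsonp(payload, callback='seasonListCallback'):
    body = payload[len(callback) + 1:]   # drop 'seasonListCallback('
    body = body[:-len(');')]             # drop trailing ');'
    return json.loads(body)

assert unwrap_jsonp('seasonListCallback({"result": {"episodes": []}});') == \
    {'result': {'episodes': []}}
# -----------------------------------------------------------------------------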
 
+def bilibili_download_playlist_by_url(url, **kwargs):
+    url = url_locations([url])[0]
+    # could this still be a bangumi URL at this point?
+    if 'live.bilibili' in url:
+        site.download_by_url(url)
+    elif 'bangumi.bilibili' in url:
+        bangumi_id = re.search(r'(\d+)', url).group(1)
+        bangumi_data = get_bangumi_info(bangumi_id)
+        ep_ids = collect_bangumi_epids(bangumi_data)
 
-def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False):
-    urls = []
-    for cid in cids:
-        sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid=cid, SECRETKEY_MINILOADER=SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
-        url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
-        urls += [i
-                 if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
-                 else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
-                 for i in parse_cid_playurl(get_content(url))]
-
-    type_ = ''
-    size = 0
-    for url in urls:
-        _, type_, temp = url_info(url)
-        size += temp
-
-    print_info(site_info, title, type_, size)
-    if not info_only:
-        download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge, headers={'Referer': 'http://www.bilibili.com/'})
-
-
-def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
-    while True:
-        try:
-            sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid=cid, SECRETKEY_MINILOADER=SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
-            url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
-            urls = [i
-                    if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
-                    else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
-                    for i in parse_cid_playurl(get_content(url))]
-
-            type_ = ''
-            size = 0
-            for url in urls:
-                _, type_, temp = url_info(url, headers={'Referer': 'http://www.bilibili.com/'})
-                size += temp or 0
-
-            print_info(site_info, title, type_, size)
-            if not info_only:
-                download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge, timeout=1, headers={'Referer': 'http://www.bilibili.com/'})
-        except socket.timeout:
-            continue
-        else:
-            break
-
-
-def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
-    api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid
-    urls = parse_cid_playurl(get_content(api_url))
-
-    for url in urls:
-        _, type_, _ = url_info(url)
-        size = 0
-        print_info(site_info, title, type_, size)
-        if not info_only:
-            download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge)
-
-
-def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    html = get_content(url)
-
-    title = r1_of([r'<meta name="title" content="([^<>]+)" />',
-                   r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
-    if title:
-        title = unescape_html(title)
-        title = escape_file_path(title)
-
-    if re.match(r'https?://bangumi\.bilibili\.com/', url):
-        # quick hack for bangumi URLs
-        episode_id = r1(r'#(\d+)$', url) or r1(r'first_ep_id = "(\d+)"', html)
-        cont = post_content('http://bangumi.bilibili.com/web_api/get_source',
-                            post_data={'episode_id': episode_id})
-        cid = json.loads(cont)['result']['cid']
-        title = '%s [%s]' % (title, episode_id)
-        bilibili_download_by_cid(str(cid), title, output_dir=output_dir, merge=merge, info_only=info_only)
-
+        base_url = url.split('#')[0]
+        for ep_id in ep_ids:
+            ep_url = '#'.join([base_url, ep_id])
+            Bilibili().download_by_url(ep_url, **kwargs)
     else:
-        flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"',
-                           r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
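# --- Example (editor's sketch, not part of the patch) -----------------------
# For a bangumi season, the playlist branch above builds one URL per episode
# by re-attaching each episode_id as a URL fragment. The same loop, isolated;
# the season URL and episode ids below are illustrative values only:
def episode_urls(url, ep_ids):
    base_url = url.split('#')[0]
    return ['#'.join([base_url, ep_id]) for ep_id in ep_ids]

# episode_urls('http://bangumi.bilibili.com/anime/5042#85754', ['85754', '85755'])
#  -> ['http://bangumi.bilibili.com/anime/5042#85754',
#      'http://bangumi.bilibili.com/anime/5042#85755']
# -----------------------------------------------------------------------------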
-        assert flashvars
-        flashvars = flashvars.replace(': ', '=')
-        t, cid = flashvars.split('=', 1)
-        cid = cid.split('&')[0]
-        if t == 'cid':
-            if re.match(r'https?://live\.bilibili\.com/', url):
-                title = r1(r'<title>\s*([^<>]+)\s*</title>', html)
-                bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
+        aid = re.search(r'av(\d+)', url).group(1)
+        page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid)))
+        page_cnt = len(page_list)
+        for no in range(1, page_cnt + 1):
+            page_url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, no)
+            subtitle = page_list[no - 1]['pagename']
+            Bilibili().download_by_url(page_url, subtitle=subtitle, **kwargs)
 
-        else:
-            # multi-P
-            cids = []
-            pages = re.findall('<option value=\'([^\']*)\'', html)
-            titles = re.findall('<option value=.*?>\s*([^<>]+)\s*</option>', html)
-            for i, page in enumerate(pages):
-                html = get_html("http://www.bilibili.com%s" % page)
-                flashvars = r1_of([r'(cid=\d+)',
-                                   r'flashvars="([^"]+)"',
-                                   r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
-                if flashvars:
-                    t, cid = flashvars.split('=', 1)
-                    cids.append(cid.split('&')[0])
-                if url.endswith(page):
-                    cids = [cid.split('&')[0]]
-                    titles = [titles[i]]
-                    break
+site = Bilibili()
+download = site.download_by_url
+download_playlist = bilibili_download_playlist_by_url
 
-            # no multi-P
-            if not pages:
-                cids = [cid]
-                titles = [r1(r'<option value=.*?>\s*([^<>]+)\s*</option>', html) or title]
-            for i in range(len(cids)):
-                completeTitle = None
-                if (title == titles[i]):
-                    completeTitle = title
-                else:
-                    completeTitle = title + "-" + titles[i]  # build a better title
-                bilibili_download_by_cid(cids[i],
-                                         completeTitle,
-                                         output_dir=output_dir,
-                                         merge=merge,
-                                         info_only=info_only)
-
-        elif t == 'vid':
-            sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
-        elif t == 'ykid':
-            youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
-        elif t == 'uid':
-            tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
-        else:
-            raise NotImplementedError(flashvars)
-
-    if not info_only and not dry_run:
-        if not kwargs['caption']:
-            print('Skipping danmaku.')
-            return
-        title = get_filename(title)
-        print('Downloading %s ...\n' % (title + '.cmt.xml'))
-        xml = get_srt_xml(cid)
-        with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x:
-            x.write(xml)
-
-site_info = "bilibili.com"
-download = bilibili_download
-download_playlist = bilibili_download
+bilibili_download = download
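# --- Usage example (editor's sketch, not part of the patch) ------------------
# The rewritten module keeps the legacy bilibili_download name as an alias for
# Bilibili.download_by_url, so existing callers keep working. Hypothetical
# calls, assuming the you_get package layout above; the keyword names follow
# the patch (output_dir, merge, info_only):
from you_get.extractors.bilibili import bilibili_download, download_playlist

# Single video page: probe the available streams, then download the default one.
bilibili_download('http://www.bilibili.com/video/av3141144/',
                  output_dir='.', merge=True, info_only=False)

# Multi-P video or bangumi season: one download per page/episode.
download_playlist('http://www.bilibili.com/video/av3141144/',
                  output_dir='.', merge=True, info_only=False)
# -----------------------------------------------------------------------------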