diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py
index af7cc824..1a68dbaf 100644
--- a/src/you_get/extractor.py
+++ b/src/you_get/extractor.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
-from .common import match1, maybe_print, download_urls, get_filename, parse_host, set_proxy, unset_proxy
+from .common import match1, maybe_print, download_urls, get_filename, parse_host, set_proxy, unset_proxy, get_content, dry_run
from .common import print_more_compatible as print
from .util import log
from . import json_output
@@ -28,6 +28,10 @@ class VideoExtractor():
self.password_protected = False
self.dash_streams = {}
self.caption_tracks = {}
+ self.out = False
+ self.ua = None
+ self.referer = None
+ self.danmaku = None
if args:
self.url = args[0]
@@ -39,6 +43,8 @@ class VideoExtractor():
if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']:
set_proxy(parse_host(kwargs['extractor_proxy']))
self.prepare(**kwargs)
if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']:
unset_proxy()
+ if self.out:
+ return
@@ -99,7 +105,8 @@ class VideoExtractor():
print(" quality: %s" % stream['quality'])
if 'size' in stream and stream['container'].lower() != 'm3u8':
- print(" size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
+ if stream['size'] != float('inf') and stream['size'] != 0:
+ print(" size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
if 'itag' in stream:
print(" # download-with: %s" % log.sprint("you-get --itag=%s [URL]" % stream_id, log.UNDERLINE))
@@ -202,12 +209,17 @@ class VideoExtractor():
if not urls:
log.wtf('[Failed] Cannot extract video source.')
# For legacy main()
- download_urls(urls, self.title, ext, total_size,
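+ # forward any extractor-specified User-Agent / Referer to the downloader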
+ headers = {}
+ if self.ua is not None:
+ headers['User-Agent'] = self.ua
+ if self.referer is not None:
+ headers['Referer'] = self.referer
+ download_urls(urls, self.title, ext, total_size, headers=headers,
output_dir=kwargs['output_dir'],
merge=kwargs['merge'],
av=stream_id in self.dash_streams)
if 'caption' not in kwargs or not kwargs['caption']:
- print('Skipping captions.')
+ print('Skipping captions or danmaku.')
return
for lang in self.caption_tracks:
filename = '%s.%s.srt' % (get_filename(self.title), lang)
@@ -217,6 +229,11 @@ class VideoExtractor():
'w', encoding='utf-8') as x:
x.write(srt)
print('Done.')
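+ # danmaku ("bullet" comments) are saved as raw XML next to the video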
+ if self.danmaku is not None and not dry_run:
+ filename = '{}.cmt.xml'.format(get_filename(self.title))
+ print('Downloading {} ...\n'.format(filename))
+ with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp:
+ fp.write(self.danmaku)
# For main_dev()
#download_urls(urls, self.title, self.streams[stream_id]['container'], self.streams[stream_id]['size'])
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index ccb395cb..f904ea49 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -2,210 +2,315 @@
__all__ = ['bilibili_download']
-from ..common import *
+import hashlib
+import re
+import time
+import json
+import http.cookiejar
+import urllib.request
+import urllib.parse
+from xml.dom.minidom import parseString
+from ..common import *
+from ..util.log import *
+from ..extractor import *
+
+from .qq import qq_download_by_vid
from .sina import sina_download_by_vid
from .tudou import tudou_download_by_id
from .youku import youku_download_by_vid
-import hashlib
-import re
+class Bilibili(VideoExtractor):
+ name = 'Bilibili'
+ live_api = 'http://live.bilibili.com/api/playurl?cid={}&otype=json'
+ api_url = 'http://interface.bilibili.com/playurl?'
+ bangumi_api_url = 'http://bangumi.bilibili.com/player/web_api/playurl?'
+
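+ # salts used to sign playurl API requests (SEC1: normal videos, SEC2: bangumi)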
+ SEC1 = '1c15888dc316e05a15fdd0a02ed6584f'
+ SEC2 = '9b288147e5474dd2aa67085f716c560d'
+ stream_types = [
+ {'id': 'hdflv'},
+ {'id': 'flv'},
+ {'id': 'hdmp4'},
+ {'id': 'mp4'},
+ {'id': 'live'}
+ ]
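+ # map stream id to the numeric quality value expected by the playurl API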
+ fmt2qlt = dict(hdflv=4, flv=3, hdmp4=2, mp4=1)
-appkey = 'f3bb208b3d081dc8'
-SECRETKEY_MINILOADER = '1c15888dc316e05a15fdd0a02ed6584f'
+ @staticmethod
+ def bilibili_stream_type(urls):
+ url = urls[0]
+ if 'hd.flv?' in url:
+ return 'hdflv', 'flv'
+ if '.flv?' in url:
+ return 'flv', 'flv'
+ if 'hd.mp4?' in url:
+ return 'hdmp4', 'mp4'
+ if '.mp4?' in url:
+ return 'mp4', 'mp4'
+ raise Exception('Unknown stream type')
-def get_srt_xml(id):
- url = 'http://comment.bilibili.com/%s.xml' % id
- return get_html(url)
+ def api_req(self, cid, quality, bangumi):
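+ # sign the request: md5(query_string + salt) is appended as the 'sign' parameter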
+ ts = str(int(time.time()))
+ if not bangumi:
+ params_str = 'cid={}&player=1&quality={}&ts={}'.format(cid, quality, ts)
+ chksum = hashlib.md5(bytes(params_str+self.SEC1, 'utf8')).hexdigest()
+ api_url = self.api_url + params_str + '&sign=' + chksum
+ else:
+ params_str = 'cid={}&module=bangumi&player=1&quality={}&ts={}'.format(cid, quality, ts)
+ chksum = hashlib.md5(bytes(params_str+self.SEC2, 'utf8')).hexdigest()
+ api_url = self.bangumi_api_url + params_str + '&sign=' + chksum
+
+ xml_str = get_content(api_url)
+ return xml_str
+
+ def parse_bili_xml(self, xml_str):
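+ # each <durl> element holds one segment's <url> and <size>; collect urls, sum sizes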
+ urls_list = []
+ total_size = 0
+ doc = parseString(xml_str.encode('utf8'))
+ durls = doc.getElementsByTagName('durl')
+ for durl in durls:
+ size = durl.getElementsByTagName('size')[0]
+ total_size += int(size.firstChild.nodeValue)
+ url = durl.getElementsByTagName('url')[0]
+ urls_list.append(url.firstChild.nodeValue)
+ stream_type, container = self.bilibili_stream_type(urls_list)
+ if stream_type not in self.streams:
+ self.streams[stream_type] = {}
+ self.streams[stream_type]['src'] = urls_list
+ self.streams[stream_type]['size'] = total_size
+ self.streams[stream_type]['container'] = container
+
+ def download_by_vid(self, cid, bangumi, **kwargs):
+ stream_id = kwargs.get('stream_id')
+ # guard: if stream_id is invalid, fall back to the default quality
+ if stream_id and stream_id in self.fmt2qlt:
+ quality = stream_id
+ else:
+ quality = 'hdflv' if bangumi else 'flv'
+
+ info_only = kwargs.get('info_only')
+ if not info_only or stream_id:
+ # quality is a valid fmt2qlt key here, so qlt won't be None
+ qlt = self.fmt2qlt.get(quality)
+ api_xml = self.api_req(cid, qlt, bangumi)
+ self.parse_bili_xml(api_xml)
+ self.danmaku = get_danmaku_xml(cid)
+ else:
+ for qlt in range(4, 0, -1):
+ api_xml = self.api_req(cid, qlt, bangumi)
+ self.parse_bili_xml(api_xml)
+
+ def prepare(self, **kwargs):
+ self.ua = fake_headers['User-Agent']
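+ # resolve redirects so the fragment/page parsing below sees the final URL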
+ self.url = url_locations([self.url])[0]
+ frag = urllib.parse.urlparse(self.url).fragment
+# http://www.bilibili.com/video/av3141144/index_2.html#page=3
+ if frag:
+ hit = re.search(r'page=(\d+)', frag)
+ if hit is not None:
+ page = hit.group(1)
+ aid = re.search(r'av(\d+)', self.url).group(1)
+ self.url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, page)
+ self.referer = self.url
+ self.page = get_content(self.url)
+ try:
+ self.title = re.search(r'<h1[^>]*>([^<]+)</h1>', self.page).group(1)
+ except AttributeError:
+ self.title = re.search(r'<title>([^<]+)</title>', self.page).group(1)
+ if 'subtitle' in kwargs:
+ self.title = '{} {}'.format(self.title, kwargs['subtitle'])
+
+ if 'live.bilibili' in self.url:
+ self.live_entry(**kwargs)
+ elif 'bangumi.bilibili' in self.url:
+ self.bangumi_entry(**kwargs)
+ else:
+ self.entry(**kwargs)
+
+ def entry(self, **kwargs):
+ # videos embedded via the Tencent player are delegated to the qq extractor
+ tc_flashvars = re.search(r'"bili-cid=\d+&bili-aid=\d+&vid=([^"]+)"', self.page)
+ if tc_flashvars is not None:
+ self.out = True
+ qq_download_by_vid(tc_flashvars.group(1), self.title, output_dir=kwargs['output_dir'], merge=kwargs['merge'], info_only=kwargs['info_only'])
+ return
+ cid = re.search(r'cid=(\d+)', self.page).group(1)
+ self.download_by_vid(cid, False, **kwargs)
+
+ def live_entry(self, **kwargs):
+ self.room_id = re.search(r'ROOMID\s*=\s*(\d+)', self.page).group(1)
+ api_url = self.live_api.format(self.room_id)
+ json_data = json.loads(get_content(api_url))
+ urls = [json_data['durl'][0]['url']]
+
+ self.streams['live'] = {}
+ self.streams['live']['src'] = urls
+ self.streams['live']['container'] = 'flv'
+ self.streams['live']['size'] = 0
+
+ def bangumi_entry(self, **kwargs):
+ bangumi_id = re.search(r'(\d+)', self.url).group(1)
+ bangumi_data = get_bangumi_info(bangumi_id)
+ bangumi_payment = bangumi_data.get('payment')
+ if bangumi_payment and bangumi_payment['price'] != '0':
+ log.w("It's a paid item")
+ ep_ids = collect_bangumi_epids(bangumi_data)
+
+ frag = urllib.parse.urlparse(self.url).fragment
+ if frag:
+ episode_id = frag
+ else:
+ episode_id = re.search(r'first_ep_id\s*=\s*"(\d+)"', self.page).group(1)
+ cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data=dict(episode_id=episode_id))
+ cid = json.loads(cont)['result']['cid']
+ cont = get_content('http://bangumi.bilibili.com/web_api/episode/{}.json'.format(episode_id))
+ ep_info = json.loads(cont)['result']['currentEpisode']
+
+ long_title = ep_info['longTitle']
+ aid = ep_info['avId']
+
+ idx = 0
+ while ep_ids[idx] != episode_id:
+ idx += 1
+
+ self.title = '{} [{} {}]'.format(self.title, idx+1, long_title)
+ self.download_by_vid(cid, bangumi=True, **kwargs)
-def parse_srt_p(p):
- fields = p.split(',')
- assert len(fields) == 8, fields
- time, mode, font_size, font_color, pub_time, pool, user_id, history = fields
- time = float(time)
+def check_oversea():
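+ # probe the player API; its <country> field reveals whether this IP is outside mainland China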
+ url = 'https://interface.bilibili.com/player?id=cid:17778881'
+ xml_lines = get_content(url).split('\n')
+ for line in xml_lines:
+ key = line.split('>')[0][1:]
+ if key == 'country':
+ value = line.split('>')[1].split('<')[0]
+ if value != '中国':
+ return True
+ else:
+ return False
+ return False
- mode = int(mode)
- assert 1 <= mode <= 8
- # mode 1~3: scrolling
- # mode 4: bottom
- # mode 5: top
- # mode 6: reverse?
- # mode 7: position
- # mode 8: advanced
+def check_sid():
+ if not cookies:
+ return False
+ for cookie in cookies:
+ if cookie.domain == '.bilibili.com' and cookie.name == 'sid':
+ return True
+ return False
- pool = int(pool)
- assert 0 <= pool <= 2
- # pool 0: normal
- # pool 1: srt
- # pool 2: special?
+def fetch_sid(cid, aid):
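+ # fetch the player page for (cid, aid) and extract the 'sid' cookie set for .bilibili.com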
+ url = 'http://interface.bilibili.com/player?id=cid:{}&aid={}'.format(cid, aid)
+ cookies = http.cookiejar.CookieJar()
+ req = urllib.request.Request(url)
+ res = urllib.request.urlopen(url)
+ cookies.extract_cookies(res, req)
+ for c in cookies:
+ if c.domain == '.bilibili.com' and c.name == 'sid':
+ return c.value
+ raise Exception('cannot fetch sid cookie')
- font_size = int(font_size)
+def collect_bangumi_epids(json_data):
+ eps = json_data['result']['episodes']
+ eps = sorted(eps, key=lambda item: int(item['index']))
+ result = []
+ for ep in eps:
+ result.append(ep['episode_id'])
+ return result
- font_color = '#%06x' % int(font_color)
-
- return pool, mode, font_size, font_color
-
-
-def parse_srt_xml(xml):
- d = re.findall(r'<d p="([^"]+)">(.*)</d>', xml)
- for x, y in d:
- p = parse_srt_p(x)
- raise NotImplementedError()
+def get_bangumi_info(bangumi_id):
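+ # season info is served as JSONP; strip the seasonListCallback(...) wrapper before json.loads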
+ BASE_URL = 'http://bangumi.bilibili.com/jsonp/seasoninfo/'
+ long_epoch = int(time.time() * 1000)
+ req_url = BASE_URL + bangumi_id + '.ver?callback=seasonListCallback&jsonp=jsonp&_=' + str(long_epoch)
+ season_data = get_content(req_url)
+ season_data = season_data[len('seasonListCallback('):]
+ season_data = season_data[: -1 * len(');')]
+ json_data = json.loads(season_data)
+ return json_data
+def get_danmaku_xml(cid):
+ return get_content('http://comment.bilibili.com/{}.xml'.format(cid))
def parse_cid_playurl(xml):
from xml.dom.minidom import parseString
try:
+ urls_list = []
+ total_size = 0
doc = parseString(xml.encode('utf-8'))
- urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')]
- return urls
- except:
- return []
+ durls = doc.getElementsByTagName('durl')
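+ # each <durl> may list one <url> per CDN mirror; keep a separate url list per mirror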
+ cdn_cnt = len(durls[0].getElementsByTagName('url'))
+ for i in range(cdn_cnt):
+ urls_list.append([])
+ for durl in durls:
+ size = durl.getElementsByTagName('size')[0]
+ total_size += int(size.firstChild.nodeValue)
+ cnt = len(durl.getElementsByTagName('url'))
+ for i in range(cnt):
+ u = durl.getElementsByTagName('url')[i].firstChild.nodeValue
+ urls_list[i].append(u)
+ return urls_list, total_size
+ except Exception as e:
+ log.w(e)
+ return [], 0
+def bilibili_download_playlist_by_url(url, **kwargs):
+ url = url_locations([url])[0]
+ # the URL may be a live room or a bangumi rather than a plain video
+ if 'live.bilibili' in url:
+ site.download_by_url(url, **kwargs)
+ elif 'bangumi.bilibili' in url:
+ bangumi_id = re.search(r'(\d+)', url).group(1)
+ bangumi_data = get_bangumi_info(bangumi_id)
+ ep_ids = collect_bangumi_epids(bangumi_data)
-def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False):
- urls = []
- for cid in cids:
- sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
- url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
- urls += [i
- if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
- else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
- for i in parse_cid_playurl(get_content(url))]
-
- type_ = ''
- size = 0
- for url in urls:
- _, type_, temp = url_info(url)
- size += temp
-
- print_info(site_info, title, type_, size)
- if not info_only:
- download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge, headers={'Referer': 'http://www.bilibili.com/'})
-
-
-def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
- while True:
- try:
- sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
- url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
- urls = [i
- if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
- else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
- for i in parse_cid_playurl(get_content(url))]
-
- type_ = ''
- size = 0
- for url in urls:
- _, type_, temp = url_info(url, headers={'Referer': 'http://www.bilibili.com/'})
- size += temp or 0
-
- print_info(site_info, title, type_, size)
- if not info_only:
- download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge, timeout=1, headers={'Referer': 'http://www.bilibili.com/'})
- except socket.timeout:
- continue
- else:
- break
-
-
-def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
- api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid
- urls = parse_cid_playurl(get_content(api_url))
-
- for url in urls:
- _, type_, _ = url_info(url)
- size = 0
- print_info(site_info, title, type_, size)
- if not info_only:
- download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge)
-
-
-def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
- html = get_content(url)
-
- title = r1_of([r'<meta name="title" content="\s*([^<>]{1,999})\s*" />',
- r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
- if title:
- title = unescape_html(title)
- title = escape_file_path(title)
-
- if re.match(r'https?://bangumi\.bilibili\.com/', url):
- # quick hack for bangumi URLs
- episode_id = r1(r'#(\d+)$', url) or r1(r'first_ep_id = "(\d+)"', html)
- cont = post_content('http://bangumi.bilibili.com/web_api/get_source',
- post_data={'episode_id': episode_id})
- cid = json.loads(cont)['result']['cid']
- title = '%s [%s]' % (title, episode_id)
- bilibili_download_by_cid(str(cid), title, output_dir=output_dir, merge=merge, info_only=info_only)
-
+ base_url = url.split('#')[0]
+ for ep_id in ep_ids:
+ ep_url = '#'.join([base_url, ep_id])
+ Bilibili().download_by_url(ep_url, **kwargs)
else:
- flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"',
- r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
- assert flashvars
- flashvars = flashvars.replace(': ', '=')
- t, cid = flashvars.split('=', 1)
- cid = cid.split('&')[0]
- if t == 'cid':
- if re.match(r'https?://live\.bilibili\.com/', url):
- title = r1(r'<title>\s*([^<>]+)\s*</title>', html)
- bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
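+ # plain video: enumerate every part via the getPageList widget API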
+ aid = re.search(r'av(\d+)', url).group(1)
+ page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid)))
+ page_cnt = len(page_list)
+ for no in range(1, page_cnt+1):
+ page_url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, no)
+ subtitle = page_list[no-1]['pagename']
+ Bilibili().download_by_url(page_url, subtitle=subtitle, **kwargs)
- else:
- # multi-P
- cids = []
- pages = re.findall('<option value=\'([^\']*)\'', html)
- titles = re.findall('<option value=.*>(.+)</option>', html)
- for i, page in enumerate(pages):
- html = get_html("http://www.bilibili.com%s" % page)
- flashvars = r1_of([r'(cid=\d+)',
- r'flashvars="([^"]+)"',
- r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
- if flashvars:
- t, cid = flashvars.split('=', 1)
- cids.append(cid.split('&')[0])
- if url.endswith(page):
- cids = [cid.split('&')[0]]
- titles = [titles[i]]
- break
+site = Bilibili()
+download = site.download_by_url
+download_playlist = bilibili_download_playlist_by_url
- # no multi-P
- if not pages:
- cids = [cid]
- titles = [r1(r'<option value=.* selected>(.+)</option>', html) or title]
- for i in range(len(cids)):
- completeTitle=None
- if (title == titles[i]):
- completeTitle=title
- else:
- completeTitle=title+"-"+titles[i]#Build Better Title
- bilibili_download_by_cid(cids[i],
- completeTitle,
- output_dir=output_dir,
- merge=merge,
- info_only=info_only)
-
- elif t == 'vid':
- sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
- elif t == 'ykid':
- youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
- elif t == 'uid':
- tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
- else:
- raise NotImplementedError(flashvars)
-
- if not info_only and not dry_run:
- if not kwargs['caption']:
- print('Skipping danmaku.')
- return
- title = get_filename(title)
- print('Downloading %s ...\n' % (title + '.cmt.xml'))
- xml = get_srt_xml(cid)
- with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x:
- x.write(xml)
-
-
-site_info = "bilibili.com"
-download = bilibili_download
-download_playlist = bilibili_download
+bilibili_download = download