you-get/src/you_get/extractors/bilibili.py

#!/usr/bin/env python

__all__ = ['bilibili_download']

from ..common import *

from .sina import sina_download_by_vid
from .tudou import tudou_download_by_id
from .youku import youku_download_by_vid

import hashlib
import re

# API key provided by cnbeining
appkey = '85eb6835b0a1034e'
secretkey = '2ad42749773c441109bdc0191257a664'

client = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'UTF-8,*;q=0.5',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Biligrab /0.8 (cnbeining@gmail.com)'
}

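# Each video part is identified by a cid; its danmaku (overlay comment) track
# is served as XML from comment.bilibili.com and is saved alongside the video
# as <title>.cmt.xml by bilibili_download() below.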
def get_srt_xml(id):
    url = 'http://comment.bilibili.com/%s.xml' % id
    return get_html(url)

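# The 'p' attribute of each <d> element in the danmaku XML is a comma-separated
# record: time, mode, font_size, font_color, pub_time, pool, user_id, history.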
def parse_srt_p(p):
    fields = p.split(',')
    assert len(fields) == 8, fields
    time, mode, font_size, font_color, pub_time, pool, user_id, history = fields
    time = float(time)

    mode = int(mode)
    assert 1 <= mode <= 8
    # mode 1~3: scrolling
    # mode 4: bottom
    # mode 5: top
    # mode 6: reverse?
    # mode 7: position
    # mode 8: advanced

    pool = int(pool)
    assert 0 <= pool <= 2
    # pool 0: normal
    # pool 1: srt
    # pool 2: special?

    font_size = int(font_size)
    font_color = '#%06x' % int(font_color)

    return pool, mode, font_size, font_color

def parse_srt_xml(xml):
    d = re.findall(r'<d p="([^"]+)">(.*)</d>', xml)
    for x, y in d:
        p = parse_srt_p(x)
    raise NotImplementedError()

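# The playurl API responds with XML in which each <durl> element carries one
# <url> segment, so a single cid may expand to several downloadable URLs.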
def parse_cid_playurl(xml):
    from xml.dom.minidom import parseString
    try:
        doc = parseString(xml.encode('utf-8'))
        urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')]
        return urls
    except:
        return []

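# Each playurl request is signed: sign = md5('appkey=<appkey>&cid=<cid>' + secretkey).
# URLs served from *.qqvideo.tc.qq.com hosts are rewritten to http://vsrc.store.qq.com.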
def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False):
    urls = []
    for cid in cids:
        sign_this = hashlib.md5(bytes('appkey=' + appkey + '&cid=' + cid + secretkey, 'utf-8')).hexdigest()
        url = 'http://interface.bilibili.com/playurl?appkey=' + appkey + '&cid=' + cid + '&sign=' + sign_this
        urls += [i
                 if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
                 else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
                 for i in parse_cid_playurl(get_content(url, headers=client))]

    if re.search(r'\.(flv|hlv)\b', urls[0]):
        type = 'flv'
    elif re.search(r'/flv/', urls[0]):
        type = 'flv'
    elif re.search(r'/mp4/', urls[0]):
        type = 'mp4'
    else:
        type = 'flv'

    size = 0
    for url in urls:
        _, _, temp = url_info(url)
        size += temp or 0  # url_info() can report None for size

    print_info(site_info, title, type, size)
    if not info_only:
        download_urls(urls, title, type, total_size=None, output_dir=output_dir, merge=merge)

def bilibili_download_by_cid(id, title, output_dir='.', merge=True, info_only=False):
    sign_this = hashlib.md5(bytes('appkey=' + appkey + '&cid=' + id + secretkey, 'utf-8')).hexdigest()
    url = 'http://interface.bilibili.com/playurl?appkey=' + appkey + '&cid=' + id + '&sign=' + sign_this
    urls = [i
            if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
            else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
            for i in parse_cid_playurl(get_content(url, headers=client))]

    if re.search(r'\.(flv|hlv)\b', urls[0]):
        type = 'flv'
    elif re.search(r'/flv/', urls[0]):
        type = 'flv'
    elif re.search(r'/mp4/', urls[0]):
        type = 'mp4'
    else:
        type = 'flv'

    size = 0
    for url in urls:
        _, _, temp = url_info(url)
        size += temp or 0

    print_info(site_info, title, type, size)
    if not info_only:
        download_urls(urls, title, type, total_size=None, output_dir=output_dir, merge=merge)

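# Dispatch on the player's flashvars: a cid is handled natively (including
# multi-P pages listed as <option> elements), while vid/ykid/uid ids are
# delegated to the Sina, Youku and Tudou extractors respectively.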
def bilibili_download(url, output_dir='.', merge=True, info_only=False):
    html = get_html(url)

    title = r1_of([r'<meta name="title" content="([^<>]{1,999})" />', r'<h2[^>]*>([^<>]+)</h2>'], html)
    title = unescape_html(title)
    title = escape_file_path(title)

    flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"', r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
    assert flashvars
    flashvars = flashvars.replace(': ', '=')
    t, id = flashvars.split('=', 1)
    id = id.split('&')[0]

    if t == 'cid':
        # Multi-P
        cids = [id]
        p = re.findall('<option value=\'([^\']*)\'>', html)
        if not p:
            bilibili_download_by_cid(id, title, output_dir=output_dir, merge=merge, info_only=info_only)
        else:
            for i in p:
                html = get_html("http://www.bilibili.com%s" % i)
                flashvars = r1_of([r'(cid=\d+)', r'flashvars="([^"]+)"', r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
                if flashvars:
                    t, cid = flashvars.split('=', 1)
                    cids.append(cid.split('&')[0])
            bilibili_download_by_cids(cids, title, output_dir=output_dir, merge=merge, info_only=info_only)
    elif t == 'vid':
        sina_download_by_vid(id, title, output_dir=output_dir, merge=merge, info_only=info_only)
    elif t == 'ykid':
        youku_download_by_vid(id, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
    elif t == 'uid':
        tudou_download_by_id(id, title, output_dir=output_dir, merge=merge, info_only=info_only)
    else:
        raise NotImplementedError(flashvars)

    if not info_only:
        title = get_filename(title)
        print('Downloading %s ...\n' % (title + '.cmt.xml'))
        xml = get_srt_xml(id)
        with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x:
            x.write(xml)

site_info = "bilibili.com"
download = bilibili_download
download_playlist = playlist_not_supported('bilibili')
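
# Minimal usage sketch (the URL is a hypothetical example; assumes the you_get
# package and its common helpers are importable):
#   from you_get.extractors.bilibili import bilibili_download
#   bilibili_download('http://www.bilibili.com/video/av123456/', info_only=True)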