2012-09-02 00:02:14 +04:00
#!/usr/bin/env python
__all__ = [ ' bilibili_download ' ]
2017-07-03 08:37:58 +03:00
import hashlib
import re
import time
import json
import http . cookiejar
import urllib . request
import urllib . parse
from xml . dom . minidom import parseString
2016-08-21 22:44:05 +03:00
from . . common import *
2017-07-03 08:37:58 +03:00
from . . util . log import *
from . . extractor import *
2012-09-02 00:02:14 +04:00
2017-07-03 08:37:58 +03:00
from . qq import qq_download_by_vid
2016-08-29 06:39:19 +03:00
from . sina import sina_download_by_vid
from . tudou import tudou_download_by_id
from . youku import youku_download_by_vid
2017-07-03 08:37:58 +03:00
class Bilibili(VideoExtractor):
    """Extractor for bilibili.com: regular videos, bangumi (anime/movie),
    live rooms and vc (short clip) URLs.

    `prepare` dispatches on the URL shape to one of the *_entry methods,
    each of which fills `self.streams` (and `self.title`).
    """
    name = 'Bilibili'
    live_api = 'http://live.bilibili.com/api/playurl?cid={}&otype=json'
    api_url = 'http://interface.bilibili.com/v2/playurl?'
    bangumi_api_url = 'http://bangumi.bilibili.com/player/web_api/playurl?'
    live_room_init_api_url = 'https://api.live.bilibili.com/room/v1/Room/room_init?id={}'
    live_room_info_api_url = 'https://api.live.bilibili.com/room/v1/Room/get_info?room_id={}'

    # Signing secrets for the playurl APIs (tied to the appkey below).
    #SEC1 = '1c15888dc316e05a15fdd0a02ed6584f'
    SEC1 = '94aba54af9065f71de72f5508f1cd42e'
    SEC2 = '9b288147e5474dd2aa67085f716c560d'

    # Known stream qualities, best first.
    stream_types = [
        {'id': 'hdflv'},
        {'id': 'flv720'},
        {'id': 'flv'},
        {'id': 'hdmp4'},
        {'id': 'mp4'},
        {'id': 'live'},
        {'id': 'vc'}
    ]
    fmt2qlt = dict(hdflv=4, flv=3, hdmp4=2, mp4=1)

    @staticmethod
    def bilibili_stream_type(urls):
        """Guess (stream_type, container) from the first media URL.

        Raises Exception when no known suffix pattern matches.
        """
        url = urls[0]
        if 'hd.flv' in url or '-80.flv' in url:
            return 'hdflv', 'flv'
        if '-64.flv' in url:
            return 'flv720', 'flv'
        if '.flv' in url:
            return 'flv', 'flv'
        if 'hd.mp4' in url or '-48.mp4' in url:
            return 'hdmp4', 'mp4'
        if '.mp4' in url:
            return 'mp4', 'mp4'
        raise Exception('Unknown stream type')

    def api_req(self, cid, quality, bangumi, bangumi_movie=False, **kwargs):
        """Query the signed playurl API for `cid` at `quality`; return raw XML.

        Regular videos use SEC1 + interface.bilibili.com; bangumi (and
        bangumi movies) use SEC2 + bangumi.bilibili.com with a module flag.
        """
        ts = str(int(time.time()))
        if not bangumi:
            #params_str = 'cid={}&player=1&quality={}&ts={}'.format(cid, quality, ts)
            params_str = 'appkey=84956560bc028eb7&cid={}&otype=xml&qn={}&quality={}&type='.format(cid, quality, quality)
            chksum = hashlib.md5(bytes(params_str + self.SEC1, 'utf8')).hexdigest()
            api_url = self.api_url + params_str + '&sign=' + chksum
        else:
            mod = 'movie' if bangumi_movie else 'bangumi'
            params_str = 'cid={}&module={}&player=1&quality={}&ts={}'.format(cid, mod, quality, ts)
            chksum = hashlib.md5(bytes(params_str + self.SEC2, 'utf8')).hexdigest()
            api_url = self.bangumi_api_url + params_str + '&sign=' + chksum

        xml_str = get_content(api_url, headers={'referer': self.url, 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'})
        return xml_str

    def parse_bili_xml(self, xml_str):
        """Parse one playurl XML response and register its stream.

        Only the first response seen for a given stream type is kept, so
        calling this with qualities in descending order keeps the best one.
        """
        urls_list = []
        total_size = 0
        doc = parseString(xml_str.encode('utf8'))
        durls = doc.getElementsByTagName('durl')
        for durl in durls:
            size = durl.getElementsByTagName('size')[0]
            total_size += int(size.firstChild.nodeValue)
            url = durl.getElementsByTagName('url')[0]
            urls_list.append(url.firstChild.nodeValue)
        stream_type, container = self.bilibili_stream_type(urls_list)
        if stream_type not in self.streams:
            self.streams[stream_type] = {}
            self.streams[stream_type]['src'] = urls_list
            self.streams[stream_type]['size'] = total_size
            self.streams[stream_type]['container'] = container

    def download_by_vid(self, cid, bangumi, **kwargs):
        """Collect streams for every available quality of `cid`."""
        stream_id = kwargs.get('stream_id')
        # guard here. if stream_id invalid, fallback as not stream_id
        if stream_id and stream_id in self.fmt2qlt:
            quality = stream_id
        else:
            quality = 'hdflv' if bangumi else 'flv'

        info_only = kwargs.get('info_only')
        # Probe every known qn value; parse_bili_xml keeps the best per type.
        for qlt in [116, 112, 80, 74, 64, 32, 16, 15]:
            api_xml = self.api_req(cid, qlt, bangumi, **kwargs)
            self.parse_bili_xml(api_xml)
        if not info_only or stream_id:
            self.danmuku = get_danmuku_xml(cid)

    def prepare(self, **kwargs):
        if socket.getdefaulttimeout() == 600:  # no timeout specified
            socket.setdefaulttimeout(2)  # fail fast, very speedy!

        # handle "watchlater" URLs
        if '/watchlater/' in self.url:
            aid = re.search(r'av(\d+)', self.url).group(1)
            self.url = 'http://www.bilibili.com/video/av{}/'.format(aid)

        self.ua = fake_headers['User-Agent']
        self.url = url_locations([self.url])[0]
        frag = urllib.parse.urlparse(self.url).fragment
        # http://www.bilibili.com/video/av3141144/index_2.html#page=3
        if frag:
            hit = re.search(r'page=(\d+)', frag)
            if hit is not None:
                page = hit.group(1)
                aid = re.search(r'av(\d+)', self.url).group(1)
                self.url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, page)
        self.referer = self.url
        self.page = get_content(self.url)

        # Title: prefer <h1>; a nested <span> holds the unescaped title.
        m = re.search(r'<h1.*?>(.*?)</h1>', self.page) or re.search(r'<h1 title="([^"]+)">', self.page)
        if m is not None:
            self.title = m.group(1)
            s = re.search(r'<span>([^<]+)</span>', m.group(1))
            if s:
                self.title = unescape_html(s.group(1))
        if self.title is None:
            m = re.search(r'property="og:title" content="([^"]+)"', self.page)
            if m is not None:
                self.title = m.group(1)
        if 'subtitle' in kwargs:
            subtitle = kwargs['subtitle']
            self.title = '{} {}'.format(self.title, subtitle)

        # Dispatch on URL shape.
        if 'bangumi.bilibili.com/movie' in self.url:
            self.movie_entry(**kwargs)
        elif 'bangumi.bilibili.com' in self.url:
            self.bangumi_entry(**kwargs)
        elif 'bangumi/' in self.url:
            self.bangumi_entry(**kwargs)
        elif 'live.bilibili.com' in self.url:
            self.live_entry(**kwargs)
        elif 'vc.bilibili.com' in self.url:
            self.vc_entry(**kwargs)
        else:
            self.entry(**kwargs)

    def movie_entry(self, **kwargs):
        """Handle bangumi movie pages (aid embedded in the page script)."""
        patt = r"var\s*aid\s*=\s*'(\d+)'"
        aid = re.search(patt, self.page).group(1)
        page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid)))
        # better ideas for bangumi_movie titles?
        self.title = page_list[0]['pagename']
        self.download_by_vid(page_list[0]['cid'], True, bangumi_movie=True, **kwargs)

    def entry(self, **kwargs):
        """Handle a regular video page."""
        # tencent player
        tc_flashvars = re.search(r'"bili-cid=\d+&bili-aid=\d+&vid=([^"]+)"', self.page)
        if tc_flashvars:
            tc_flashvars = tc_flashvars.group(1)
        if tc_flashvars is not None:
            self.out = True
            qq_download_by_vid(tc_flashvars, self.title, True, output_dir=kwargs['output_dir'], merge=kwargs['merge'], info_only=kwargs['info_only'])
            return

        has_plist = re.search(r'"page":2', self.page)
        if has_plist and not kwargs.get('playlist'):
            log.w('This page contains a playlist. (use --playlist to download all videos.)')

        try:
            page_list = json.loads(re.search(r'"pages":(\[.*?\])', self.page).group(1))
            index_id = int(re.search(r'index_(\d+)', self.url).group(1))
            cid = page_list[index_id - 1]['cid']  # change cid match rule
        except Exception:  # narrowed from bare except; fallback still best-effort
            cid = re.search(r'"cid":(\d+)', self.page).group(1)

        if cid is not None:
            self.download_by_vid(cid, re.search('bangumi', self.url) is not None, **kwargs)
        else:
            # flashvars?
            flashvars = re.search(r'flashvars="([^"]+)"', self.page).group(1)
            if flashvars is None:
                raise Exception('Unsupported page {}'.format(self.url))
            param = flashvars.split('&')[0]
            t, cid = param.split('=')
            t = t.strip()
            cid = cid.strip()
            if t == 'vid':
                sina_download_by_vid(cid, self.title, output_dir=kwargs['output_dir'], merge=kwargs['merge'], info_only=kwargs['info_only'])
            elif t == 'ykid':
                youku_download_by_vid(cid, self.title, output_dir=kwargs['output_dir'], merge=kwargs['merge'], info_only=kwargs['info_only'])
            elif t == 'uid':
                tudou_download_by_id(cid, self.title, output_dir=kwargs['output_dir'], merge=kwargs['merge'], info_only=kwargs['info_only'])
            else:
                raise NotImplementedError('Unknown flashvars {}'.format(flashvars))
            return

    def live_entry(self, **kwargs):
        # Extract room ID from the short display ID (seen in the room
        # URL). The room ID is usually the same as the short ID, but not
        # always; case in point: https://live.bilibili.com/48, with 48
        # as the short ID and 63727 as the actual ID.
        room_short_id = re.search(r'live.bilibili.com/([^?]+)', self.url).group(1)
        room_init_api_response = json.loads(get_content(self.live_room_init_api_url.format(room_short_id)))
        self.room_id = room_init_api_response['data']['room_id']
        room_info_api_response = json.loads(get_content(self.live_room_info_api_url.format(self.room_id)))
        self.title = room_info_api_response['data']['title']

        api_url = self.live_api.format(self.room_id)
        json_data = json.loads(get_content(api_url))
        urls = [json_data['durl'][0]['url']]

        self.streams['live'] = {}
        self.streams['live']['src'] = urls
        self.streams['live']['container'] = 'flv'
        self.streams['live']['size'] = 0

    def vc_entry(self, **kwargs):
        """Handle vc.bilibili.com short clips."""
        vc_id = re.search(r'video/(\d+)', self.url)
        if not vc_id:
            vc_id = re.search(r'vcdetail\?vc=(\d+)', self.url)
            if not vc_id:
                log.wtf('Unknown url pattern')
        endpoint = 'http://api.vc.bilibili.com/clip/v1/video/detail?video_id={}&need_playurl=1'.format(vc_id.group(1))
        vc_meta = json.loads(get_content(endpoint, headers=fake_headers))
        if vc_meta['code'] != 0:
            log.wtf('{}\n{}'.format(vc_meta['msg'], vc_meta['message']))
        item = vc_meta['data']['item']
        self.title = item['description']

        self.streams['vc'] = {}
        self.streams['vc']['src'] = [item['video_playurl']]
        self.streams['vc']['container'] = 'mp4'
        self.streams['vc']['size'] = int(item['video_size'])

    def bangumi_entry(self, **kwargs):
        """Handle bangumi episode pages."""
        bangumi_id = re.search(r'(\d+)', self.url).group(1)
        frag = urllib.parse.urlparse(self.url).fragment
        if frag:
            episode_id = frag
        else:
            # FIX: `.group(1)` must apply to whichever pattern matched.
            # The old `re.search(A) or re.search(B).group(1)` bound the
            # call to B only, so a hit on A left a Match object here.
            hit = re.search(r'first_ep_id\s*=\s*"(\d+)"', self.page) or re.search(r'\/ep(\d+)', self.url)
            episode_id = hit.group(1)
        # cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data=dict(episode_id=episode_id))
        # cid = json.loads(cont)['result']['cid']
        cont = get_content('http://bangumi.bilibili.com/web_api/episode/{}.json'.format(episode_id))
        ep_info = json.loads(cont)['result']['currentEpisode']

        bangumi_data = get_bangumi_info(str(ep_info['seasonId']))
        bangumi_payment = bangumi_data.get('payment')
        if bangumi_payment and bangumi_payment['price'] != '0':
            log.w("It's a paid item")
        # ep_ids = collect_bangumi_epids(bangumi_data)

        index_title = ep_info['indexTitle']
        long_title = ep_info['longTitle'].strip()
        cid = ep_info['danmaku']

        self.title = '{} [{} {}]'.format(self.title, index_title, long_title)
        self.download_by_vid(cid, bangumi=True, **kwargs)
2016-08-29 06:39:19 +03:00
2017-07-03 08:37:58 +03:00
def check_oversea():
    """Return True when the probe request reports a country other than China.

    Scans the line-oriented XML reply of a fixed cid probe for the
    <country> element; absence of the element yields False.
    """
    probe_url = 'https://interface.bilibili.com/player?id=cid:17778881'
    for line in get_content(probe_url).split('\n'):
        tag = line.split('>')[0][1:]
        if tag != 'country':
            continue
        country = line.split('>')[1].split('<')[0]
        return country != '中国'
    return False
def check_sid():
    """Return True iff the shared cookie jar holds a bilibili 'sid' cookie.

    `cookies` is the module-level jar pulled in from ..common; an empty or
    missing jar counts as no sid.
    """
    if not cookies:
        return False
    return any(c.domain == '.bilibili.com' and c.name == 'sid' for c in cookies)
def fetch_sid(cid, aid):
    """Fetch the 'sid' session cookie for a (cid, aid) pair.

    Issues one request to the player interface and extracts the cookie the
    server sets. Raises Exception when no sid cookie is returned.

    Fixes: the original ended with a bare `raise` outside any except block,
    which raises `RuntimeError: No active exception to re-raise` instead of
    a meaningful error; the response object was also never closed.
    """
    url = 'http://interface.bilibili.com/player?id=cid:{}&aid={}'.format(cid, aid)
    cookies = http.cookiejar.CookieJar()
    req = urllib.request.Request(url)
    with urllib.request.urlopen(url) as res:
        cookies.extract_cookies(res, req)
    for c in cookies:
        if c.domain == '.bilibili.com' and c.name == 'sid':
            return c.value
    raise Exception('Failed to fetch sid cookie for cid {}'.format(cid))
def collect_bangumi_epids(json_data):
    """Return the season's episode IDs in reverse of their listed order."""
    return [episode['episode_id'] for episode in reversed(json_data['episodes'])]
2017-07-03 08:37:58 +03:00
2018-02-10 01:53:04 +03:00
def get_bangumi_info(season_id):
    """Fetch season metadata via the JSONP seasoninfo endpoint.

    Strips the `seasonListCallback(...);` wrapper and returns the decoded
    'result' object.
    """
    base_url = 'http://bangumi.bilibili.com/jsonp/seasoninfo/'
    epoch_ms = int(time.time() * 1000)
    req_url = base_url + season_id + '.ver?callback=seasonListCallback&jsonp=jsonp&_=' + str(epoch_ms)
    payload = get_content(req_url)
    # Unwrap: seasonListCallback(<json>);
    payload = payload[len('seasonListCallback('):-len(');')]
    return json.loads(payload)['result']
2017-07-03 08:37:58 +03:00
def get_danmuku_xml(cid):
    """Fetch the danmaku (comment overlay) XML document for `cid`."""
    return get_content('http://comment.bilibili.com/%s.xml' % cid)
2016-08-29 06:39:19 +03:00
def parse_cid_playurl(xml):
    """Parse a playurl XML document into (urls_list, total_size).

    urls_list has one sub-list per CDN mirror (the url count of the first
    <durl> fixes the mirror count); each sub-list holds that mirror's URL
    for every segment. On any parse error a warning is logged and
    ([], 0) is returned (best-effort by design).
    """
    from xml.dom.minidom import parseString
    try:
        doc = parseString(xml.encode('utf-8'))
        durls = doc.getElementsByTagName('durl')
        mirror_cnt = len(durls[0].getElementsByTagName('url'))
        urls_list = [[] for _ in range(mirror_cnt)]
        total_size = 0
        for durl in durls:
            total_size += int(durl.getElementsByTagName('size')[0].firstChild.nodeValue)
            for idx, url_node in enumerate(durl.getElementsByTagName('url')):
                urls_list[idx].append(url_node.firstChild.nodeValue)
        return urls_list, total_size
    except Exception as e:
        log.w(e)
        return [], 0
def bilibili_download_playlist_by_url(url, **kwargs):
    """Download every item of a playlist-style URL (bangumi season or
    multi-page video); live URLs fall through to a single download."""
    url = url_locations([url])[0]
    kwargs['playlist'] = True
    # a bangumi here? possible?
    if 'live.bilibili' in url:
        site.download_by_url(url)
    elif 'bangumi.bilibili' in url:
        bangumi_id = re.search(r'(\d+)', url).group(1)
        ep_ids = collect_bangumi_epids(get_bangumi_info(bangumi_id))
        base_url = url.split('#')[0]
        for ep_id in ep_ids:
            # Episode is addressed via the URL fragment.
            Bilibili().download_by_url('#'.join([base_url, ep_id]), **kwargs)
    else:
        aid = re.search(r'av(\d+)', url).group(1)
        page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid)))
        for no, page in enumerate(page_list, start=1):
            page_url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, no)
            Bilibili().download_by_url(page_url, subtitle=page['pagename'], **kwargs)
# Module-level singleton and the entry points you-get's registry expects.
site = Bilibili()
download = site.download_by_url
download_playlist = bilibili_download_playlist_by_url
bilibili_download = download