diff --git a/src/you_get/common.py b/src/you_get/common.py
index 2ff61d55..a5a0fbab 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -79,6 +79,7 @@ SITES = {
'videomega' : 'videomega',
'vidto' : 'vidto',
'vimeo' : 'vimeo',
+ 'wanmen' : 'wanmen',
'weibo' : 'miaopai',
'veoh' : 'veoh',
'vine' : 'vine',
diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py
index 97ab0b41..e69bc2fd 100755
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@@ -7,6 +7,7 @@ from .baidu import *
from .bandcamp import *
from .bigthink import *
from .bilibili import *
+from .bokecc import *
from .cbs import *
from .ckplayer import *
from .cntv import *
@@ -73,6 +74,7 @@ from .vimeo import *
from .vine import *
from .vk import *
from .w56 import *
+from .wanmen import *
from .xiami import *
from .yinyuetai import *
from .yixia import *
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index c7c4fac9..e9d3e7ad 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -2,17 +2,96 @@
__all__ = ['bilibili_download']
-import json
-import re
from ..common import *
-def get_srt_xml(cid):
- return get_html('http://comment.bilibili.com/%s.xml' % cid)
+from .sina import sina_download_by_vid
+from .tudou import tudou_download_by_id
+from .youku import youku_download_by_vid
-def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, **kwargs):
- title = r1(r'cid=(\d+)', url)
- info = json.loads(get_content(url))
- urls = [i['url'] for i in info['durl']]
+import hashlib
+import re
+
+appkey = 'f3bb208b3d081dc8'
+SECRETKEY_MINILOADER = '1c15888dc316e05a15fdd0a02ed6584f'
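+# sign scheme used below: sign = md5('cid=<cid>&from=miniplay&player=1' + SECRETKEY_MINILOADER);
+# appkey is defined for completeness but is not used in this module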
+
+def get_srt_xml(id):
+ url = 'http://comment.bilibili.com/%s.xml' % id
+ return get_html(url)
+
+
+def parse_srt_p(p):
+ fields = p.split(',')
+ assert len(fields) == 8, fields
+ time, mode, font_size, font_color, pub_time, pool, user_id, history = fields
+ time = float(time)
+
+ mode = int(mode)
+ assert 1 <= mode <= 8
+ # mode 1~3: scrolling
+ # mode 4: bottom
+ # mode 5: top
+ # mode 6: reverse?
+ # mode 7: position
+ # mode 8: advanced
+
+ pool = int(pool)
+ assert 0 <= pool <= 2
+ # pool 0: normal
+ # pool 1: srt
+ # pool 2: special?
+
+ font_size = int(font_size)
+
+ font_color = '#%06x' % int(font_color)
+
+ return pool, mode, font_size, font_color
+
+
+def parse_srt_xml(xml):
+ d = re.findall(r'<d p="([^"]+)">(.*)</d>', xml)
+ for x, y in d:
+ p = parse_srt_p(x)
+ raise NotImplementedError()
+
+
+def parse_cid_playurl(xml):
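+ # both the VOD and live playurl APIs answer with XML in which each
+ # <durl><url>...</url></durl> node carries one downloadable segment URL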
+ from xml.dom.minidom import parseString
+ try:
+ doc = parseString(xml.encode('utf-8'))
+ urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')]
+ return urls
+ except:
+ return []
+
+
+def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False):
+ urls = []
+ for cid in cids:
+ sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
+ url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
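+ # URLs on the Tencent CDN (*.qqvideo.tc.qq.com) are rewritten to
+ # vsrc.store.qq.com, which appears to serve the same files directly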
+ urls += [i
+ if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
+ else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
+ for i in parse_cid_playurl(get_content(url))]
+
+ type_ = ''
+ size = 0
+ for url in urls:
+ _, type_, temp = url_info(url)
+ size += temp
+
+ print_info(site_info, title, type_, size)
+ if not info_only:
+ download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
+
+
+def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
+ sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
+ url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
+ urls = [i
+ if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
+ else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
+ for i in parse_cid_playurl(get_content(url))]
type_ = ''
size = 0
@@ -27,50 +106,82 @@ def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, *
if not info_only:
download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
-def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
- if re.match(r'https?://interface\.bilibili\.com/', url):
- # quick hack for explicit API
- bilibili_download_by_api(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
- return
- html = get_content(url)
- main_title = r1_of([r'<meta name="title" content="([^<>]{1,999})" />',
- r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
- cid = r1(r'cid=(\d+)', html)
+def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
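+ # live streams go through a separate playurl API; unlike the VOD endpoint
+ # above, no signature is attached to the request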
+ api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid
+ urls = parse_cid_playurl(get_content(api_url))
- aid = r1(r'av(\d+)', url)
- page = r1(r'index_(\d+)', url)
- sub_titles = re.findall('<option value=.*>(.+)</option>', html)
- if page is None and sub_titles: # download all
- for t in enumerate(sub_titles):
- page, sub_title = t[0] + 1, t[1]
- title = main_title + ' - ' + sub_title
-
- api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
- info = json.loads(get_content(api))
- src = info['src']
- _, type_, size = url_info(src)
- print_info(site_info, title, type_, size)
- if not info_only:
- download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
-
- else: # download selected
- if page is None: page = 1
- sub_title = r1('<option value=.* selected>(.+)</option>', html)
- if sub_title is None:
- sub_title = r1('<option value=.*>(.+)</option>', html)
- if sub_title:
- title = main_title + ' - ' + sub_title
- else:
- title = main_title
-
- api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
- info = json.loads(get_content(api))
- src = info['src']
- _, type_, size = url_info(src)
+ for url in urls:
+ _, type_, _ = url_info(url)
+ size = 0
print_info(site_info, title, type_, size)
if not info_only:
- download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
+ download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge)
+
+
+def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+ html = get_content(url)
+
+ if re.match(r'https?://bangumi\.bilibili\.com/', url):
+ # quick hack for bangumi URLs
+ url = r1(r'"([^"]+)" class="v-av-link"', html)
+ html = get_content(url)
+
+ title = r1_of([r'<meta name="title" content="([^<>]{1,999})" />',
+ r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
+ if title:
+ title = unescape_html(title)
+ title = escape_file_path(title)
+
+ flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"',
+ r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
+ assert flashvars
+ flashvars = flashvars.replace(': ', '=')
+ t, cid = flashvars.split('=', 1)
+ cid = cid.split('&')[0]
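+ # the flashvars key tells us which backend hosts the video: cid means
+ # bilibili itself (VOD or live), vid Sina, ykid Youku, uid Tudou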
+ if t == 'cid':
+ if re.match(r'https?://live\.bilibili\.com/', url):
+ title = r1(r'<title>\s*([^<>]+)\s*</title>', html)
+ bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
+
+ else:
+ # multi-P
+ cids = []
+ pages = re.findall('<option value=\'([^\']*)\'', html)
+ titles = re.findall('<option value=.*>(.+)</option>', html)
+ for i, page in enumerate(pages):
+ html = get_html("http://www.bilibili.com%s" % page)
+ flashvars = r1_of([r'(cid=\d+)',
+ r'flashvars="([^"]+)"',
+ r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
+ if flashvars:
+ t, cid = flashvars.split('=', 1)
+ cids.append(cid.split('&')[0])
+ if url.endswith(page):
+ cids = [cid.split('&')[0]]
+ titles = [titles[i]]
+ break
+
+ # no multi-P
+ if not pages:
+ cids = [cid]
+ titles = [r1(r'<option value=.* selected>(.+)</option>', html) or title]
+
+ for i in range(len(cids)):
+ bilibili_download_by_cid(cids[i],
+ titles[i],
+ output_dir=output_dir,
+ merge=merge,
+ info_only=info_only)
+
+ elif t == 'vid':
+ sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
+ elif t == 'ykid':
+ youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
+ elif t == 'uid':
+ tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
+ else:
+ raise NotImplementedError(flashvars)
if not info_only and not dry_run:
if not kwargs['caption']:
@@ -82,6 +193,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs
with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x:
x.write(xml)
+
site_info = "bilibili.com"
download = bilibili_download
-download_playlist = bilibili_download
+download_playlist = bilibili_download
\ No newline at end of file
diff --git a/src/you_get/extractors/bokecc.py b/src/you_get/extractors/bokecc.py
new file mode 100644
index 00000000..8566e828
--- /dev/null
+++ b/src/you_get/extractors/bokecc.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+from ..common import *
+from ..extractor import VideoExtractor
+import xml.etree.ElementTree as ET
+
+class BokeCC(VideoExtractor):
+ name = "BokeCC"
+
+ stream_types = [ # not known in advance; we have to check the
+ # output from the API
+ ]
+
+ API_ENDPOINT = 'http://p.bokecc.com/'
+
+
+ def download_by_id(self, vid='', title=None, output_dir='.', merge=True, info_only=False, **kwargs):
+ """self, str->None
+
+ Keyword arguments:
+ self: self
+ vid: The video ID for BokeCC cloud, something like
+ FE3BB999594978049C33DC5901307461
+
+ Calls the prepare() to download the video.
+
+ If no title is provided, this method shall try to find a proper title
+ with the information providin within the
+ returned content of the API."""
+
+ assert vid
+
+ self.prepare(vid=vid, title=title, **kwargs)
+
+ self.extract(**kwargs)
+
+ self.download(output_dir=output_dir,
+ merge=merge,
+ info_only=info_only, **kwargs)
+
+ def prepare(self, vid='', title=None, **kwargs):
+ assert vid
+
+ api_url = self.API_ENDPOINT + \
+ 'servlet/playinfo?vid={vid}&m=0'.format(vid=vid) # returns XML
+
+ html = get_content(api_url)
+ self.tree = ET.ElementTree(ET.fromstring(html))
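+ # layout of the response, inferred from the lookups below: <result> holds
+ # '1' on success; video/videomarks/videomark/markdesc holds title parts;
+ # each video/quality node has 'value'/'desp' attributes and a first child
+ # whose 'playurl' attribute is the stream URL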
+
+ if self.tree.find('result').text != '1':
+ log.wtf('API result says failed!')
+ raise
+
+ if title is None:
+ self.title = '_'.join([i.text for i in self.tree.iterfind('video/videomarks/videomark/markdesc')])
+ else:
+ self.title = title
+
+ for i in self.tree.iterfind('video/quality'):
+ quality = i.attrib['value']
+ url = i[0].attrib['playurl']
+ self.stream_types.append({'id': quality,
+ 'video_profile': i.attrib['desp']})
+ self.streams[quality] = {'url': url,
+ 'video_profile': i.attrib['desp']}
+ self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items()))
+ for stream_type in self.__class__.stream_types
+ if stream_type['id'] in self.streams]
+
+
+ def extract(self, **kwargs):
+ for i in self.streams:
+ s = self.streams[i]
+ _, s['container'], s['size'] = url_info(s['url'])
+ s['src'] = [s['url']]
+ if 'stream_id' in kwargs and kwargs['stream_id']:
+ # Extract the stream
+ stream_id = kwargs['stream_id']
+
+ if stream_id not in self.streams:
+ log.e('[Error] Invalid video format.')
+ log.e('Run \'-i\' command with no specific video format to view all available formats.')
+ exit(2)
+ else:
+ # Extract stream with the best quality
+ stream_id = self.streams_sorted[0]['id']
+
+site = BokeCC()
+
+# I don't know how to call the player directly so I just put it here
+# just in case anyone touchs it -- Beining@Aug.24.2016
+#download = site.download_by_url
+#download_playlist = site.download_by_url
+
+bokecc_download_by_id = site.download_by_id
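+
+# a minimal smoke test (the vid is the example from the docstring above):
+# bokecc_download_by_id(vid='FE3BB999594978049C33DC5901307461', info_only=True)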
diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py
index b0d929c9..ebab70f8 100644
--- a/src/you_get/extractors/universal.py
+++ b/src/you_get/extractors/universal.py
@@ -6,20 +6,20 @@ from ..common import *
from .embed import *
def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
- try:
- embed_download(url, output_dir, merge=merge, info_only=info_only)
- except: pass
- else: return
+ content_type = get_head(url, headers=fake_headers)['Content-Type']
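+ # probing the Content-Type first means embed extraction is only attempted on
+ # HTML pages; binary resources fall through to the direct-download path below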
+ if content_type.startswith('text/html'):
+ try:
+ embed_download(url, output_dir, merge=merge, info_only=info_only)
+ except: pass
+ else: return
domains = url.split('/')[2].split('.')
if len(domains) > 2: domains = domains[1:]
site_info = '.'.join(domains)
- response = get_response(url, faker=True)
- content_type = response.headers['Content-Type']
-
if content_type.startswith('text/html'):
# extract an HTML page
+ response = get_response(url, faker=True)
page = str(response.data)
page_title = r1(r'<title>([^<]*)</title>', page)
diff --git a/src/you_get/extractors/wanmen.py b/src/you_get/extractors/wanmen.py
new file mode 100755
index 00000000..20c543c1
--- /dev/null
+++ b/src/you_get/extractors/wanmen.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+__all__ = ['wanmen_download', 'wanmen_download_by_course', 'wanmen_download_by_course_topic', 'wanmen_download_by_course_topic_part']
+
+from ..common import *
+from .bokecc import bokecc_download_by_id
+from json import loads
+
+
+##Helper functions
+def _wanmen_get_json_api_content_by_courseID(courseID):
+ """int->JSON
+
+ Return a parsed JSON tree of WanMen's API."""
+
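+ # the response, as accessed by the helpers below, is a one-element list:
+ # [0]['name'] is the course name, [0]['Topics'][t]['name'] the topic name,
+ # and [0]['Topics'][t]['Parts'][p] holds 'name' and 'ccVideoLink' (BokeCC vid)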
+ return loads(get_content('http://api.wanmen.org/course/getCourseNested/{courseID}'.format(courseID = courseID)))
+
+def _wanmen_get_title_by_json_topic_part(json_content, tIndex, pIndex):
+ """JSON, int, int, int->str
+
+ Get a proper title with courseid+topicID+partID."""
+
+ return '_'.join([json_content[0]['name'],
+ json_content[0]['Topics'][tIndex]['name'],
+ json_content[0]['Topics'][tIndex]['Parts'][pIndex]['name']])
+
+
+def _wanmen_get_boke_id_by_json_topic_part(json_content, tIndex, pIndex):
+ """JSON, int, int, int->str
+
+ Get one BokeCC video ID with courseid+topicID+partID."""
+
+ return json_content[0]['Topics'][tIndex]['Parts'][pIndex]['ccVideoLink']
+
+
+##Parsers
+def wanmen_download_by_course(json_api_content, output_dir='.', merge=True, info_only=False, **kwargs):
+ """int->None
+
+ Download a WHOLE course.
+ Reuse the API call to save time."""
+
+ for tIndex in range(len(json_api_content[0]['Topics'])):
+ for pIndex in range(len(json_api_content[0]['Topics'][tIndex]['Parts'])):
+ wanmen_download_by_course_topic_part(json_api_content,
+ tIndex,
+ pIndex,
+ output_dir=output_dir,
+ merge=merge,
+ info_only=info_only,
+ **kwargs)
+
+
+def wanmen_download_by_course_topic(json_api_content, tIndex, output_dir='.', merge=True, info_only=False, **kwargs):
+ """int, int->None
+
+ Download a TOPIC of a course.
+ Reuse the API call to save time."""
+
+ for pIndex in range(len(json_api_content[0]['Topics'][tIndex]['Parts'])):
+ wanmen_download_by_course_topic_part(json_api_content,
+ tIndex,
+ pIndex,
+ output_dir=output_dir,
+ merge=merge,
+ info_only=info_only,
+ **kwargs)
+
+def wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex, output_dir='.', merge=True, info_only=False, **kwargs):
+ """int, int, int->None
+
+ Download ONE PART of the course."""
+
+ title = _wanmen_get_title_by_json_topic_part(json_api_content,
+ tIndex,
+ pIndex)
+
+ bokeccID = _wanmen_get_boke_id_by_json_topic_part(json_api_content,
+ tIndex,
+ pIndex)
+
+ bokecc_download_by_id(vid=bokeccID, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
+
+
+##Main entrance
+def wanmen_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+
+ if 'wanmen.org' not in url:
+ log.wtf('You are at the wrong place dude. This is for WanMen University!')
+ raise
+
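+ # URL shape inferred from the regexes below:
+ # http://www.wanmen.org/.../course/<courseID>?tIndex=<n>&pIndex=<m>
+ # where tIndex and pIndex are optional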
+ courseID = int(match1(url, r'course\/(\d+)'))
+ assert courseID > 0 # without courseID we cannot do anything
+
+ tIndex = match1(url, r'tIndex=(\d+)')
+ tIndex = int(tIndex) if tIndex else 0 # absent in course-level URLs
+
+ pIndex = match1(url, r'pIndex=(\d+)')
+ pIndex = int(pIndex) if pIndex else 0 # absent in course- and topic-level URLs
+
+ json_api_content = _wanmen_get_json_api_content_by_courseID(courseID)
+
+ if pIndex: #only download ONE single part
+ assert tIndex >= 0
+ wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex,
+ output_dir = output_dir,
+ merge = merge,
+ info_only = info_only)
+ elif tIndex: #download a topic
+ wanmen_download_by_course_topic(json_api_content, tIndex,
+ output_dir = output_dir,
+ merge = merge,
+ info_only = info_only)
+ else: #download the whole course
+ wanmen_download_by_course(json_api_content,
+ output_dir = output_dir,
+ merge = merge,
+ info_only = info_only)
+
+
+site_info = "WanMen University"
+download = wanmen_download
+download_playlist = wanmen_download_by_course
\ No newline at end of file