diff --git a/src/you_get/common.py b/src/you_get/common.py
index 2ff61d55..a5a0fbab 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -79,6 +79,7 @@ SITES = {
     'videomega'        : 'videomega',
     'vidto'            : 'vidto',
     'vimeo'            : 'vimeo',
+    'wanmen'           : 'wanmen',
     'weibo'            : 'miaopai',
     'veoh'             : 'veoh',
     'vine'             : 'vine',
diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py
index 97ab0b41..e69bc2fd 100755
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@@ -7,6 +7,7 @@ from .baidu import *
 from .bandcamp import *
 from .bigthink import *
 from .bilibili import *
+from .bokecc import *
 from .cbs import *
 from .ckplayer import *
 from .cntv import *
@@ -73,6 +74,7 @@ from .vimeo import *
 from .vine import *
 from .vk import *
 from .w56 import *
+from .wanmen import *
 from .xiami import *
 from .yinyuetai import *
 from .yixia import *
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index c7c4fac9..e9d3e7ad 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -2,17 +2,96 @@
 
 __all__ = ['bilibili_download']
 
-import json
-import re
 from ..common import *
 
-def get_srt_xml(cid):
-    return get_html('http://comment.bilibili.com/%s.xml' % cid)
+from .sina import sina_download_by_vid
+from .tudou import tudou_download_by_id
+from .youku import youku_download_by_vid
 
-def bilibili_download_by_api(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    title = r1(r'cid=(\d+)', url)
-    info = json.loads(get_content(url))
-    urls = [i['url'] for i in info['durl']]
+import hashlib
+import re
+
+appkey = 'f3bb208b3d081dc8'
+SECRETKEY_MINILOADER = '1c15888dc316e05a15fdd0a02ed6584f'
+
+def get_srt_xml(id):
+    url = 'http://comment.bilibili.com/%s.xml' % id
+    return get_html(url)
+
+
+def parse_srt_p(p):
+    fields = p.split(',')
+    assert len(fields) == 8, fields
+    time, mode, font_size, font_color, pub_time, pool, user_id, history = fields
+    time = float(time)
+
+    mode = int(mode)
+    assert 1 <= mode <= 8
+    # mode 1~3: scrolling
+    # mode 4: bottom
+    # mode 5: top
+    # mode 6: reverse?
+    # mode 7: position
+    # mode 8: advanced
+
+    pool = int(pool)
+    assert 0 <= pool <= 2
+    # pool 0: normal
+    # pool 1: srt
+    # pool 2: special?
+
+    font_size = int(font_size)
+
+    font_color = '#%06x' % int(font_color)
+
+    return pool, mode, font_size, font_color
+
+
+def parse_srt_xml(xml):
+    d = re.findall(r'<d p="([^"]*)">(.*)</d>', xml)
+    for x, y in d:
+        p = parse_srt_p(x)
+    raise NotImplementedError()
+
+
+def parse_cid_playurl(xml):
+    from xml.dom.minidom import parseString
+    try:
+        doc = parseString(xml.encode('utf-8'))
+        urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')]
+        return urls
+    except:
+        return []
+
+
+def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False):
+    urls = []
+    for cid in cids:
+        sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid=cid, SECRETKEY_MINILOADER=SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
+        url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
+        urls += [i
+                 if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
+                 else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
+                 for i in parse_cid_playurl(get_content(url))]
+
+    type_ = ''
+    size = 0
+    for url in urls:
+        _, type_, temp = url_info(url)
+        size += temp
+
+    print_info(site_info, title, type_, size)
+    if not info_only:
+        download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
+
+
+def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
+    sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid=cid, SECRETKEY_MINILOADER=SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
+    url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
+    urls = [i
+            if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
+            else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
+            for i in parse_cid_playurl(get_content(url))]
 
     type_ = ''
     size = 0
@@ -27,50 +106,82 @@
     if not info_only:
         download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
 
-def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    if re.match(r'https?://interface\.bilibili\.com/', url):
-        # quick hack for explicit API
-        bilibili_download_by_api(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
-        return
-
-    html = get_content(url)
-
-    main_title = r1_of([r'<meta name="title" content="([^<>]{1,999})" />',
-                        r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
-    cid = r1(r'cid=(\d+)', html)
+def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
+    api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid
+    urls = parse_cid_playurl(get_content(api_url))
 
-    aid = r1(r'av(\d+)', url)
-    page = r1(r'index_(\d+)', url)
-    sub_titles = re.findall('<option value=.*>(.+)</option>', html)
-    if page is None and sub_titles: # download all
-        for t in enumerate(sub_titles):
-            page, sub_title = t[0] + 1, t[1]
-            title = main_title + ' - ' + sub_title
-
-            api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
-            info = json.loads(get_content(api))
-            src = info['src']
-            _, type_, size = url_info(src)
-            print_info(site_info, title, type_, size)
-            if not info_only:
-                download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
-
-    else: # download selected
-        if page is None: page = 1
-        sub_title = r1('<option value=.* selected>(.+)</option>', html)
-        if sub_title is None:
-            sub_title = r1('<option value=.*>(.+)</option>', html)
-        if sub_title:
-            title = main_title + ' - ' + sub_title
-        else:
-            title = main_title
-
-        api = 'http://www.bilibili.com/m/html5?aid=%s&page=%s' % (aid, page)
-        info = json.loads(get_content(api))
-        src = info['src']
-        _, type_, size = url_info(src)
+    for url in urls:
+        _, type_, _ = url_info(url)
+        size = 0
         print_info(site_info, title, type_, size)
         if not info_only:
-            download_urls([src], title, type_, total_size=size, output_dir=output_dir, merge=merge)
+            download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge)
+
+
+def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    html = get_content(url)
+
+    if re.match(r'https?://bangumi\.bilibili\.com/', url):
+        # quick hack for bangumi URLs
+        url = r1(r'"([^"]+)" class="v-av-link"', html)
+        html = get_content(url)
+
+    title = r1_of([r'<meta name="title" content="([^<>]{1,999})" />',
+                   r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
+    if title:
+        title = unescape_html(title)
+        title = escape_file_path(title)
+
+    flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"',
+                       r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
+    assert flashvars
+    flashvars = flashvars.replace(': ', '=')
+    t, cid = flashvars.split('=', 1)
+    cid = cid.split('&')[0]
+    if t == 'cid':
+        if re.match(r'https?://live\.bilibili\.com/', url):
+            title = r1(r'<title>\s*([^<>]+)\s*</title>', html)
+            bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
+
+        else:
+            # multi-P
+            cids = []
+            pages = re.findall('<option value=\'([^\']*)\'', html)
+            titles = re.findall('<option value=.*>(.+)</option>', html)
+            for i, page in enumerate(pages):
+                html = get_html("http://www.bilibili.com%s" % page)
+                flashvars = r1_of([r'(cid=\d+)',
+                                   r'flashvars="([^"]+)"',
+                                   r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
+                if flashvars:
+                    t, cid = flashvars.split('=', 1)
+                    cids.append(cid.split('&')[0])
+                    if url.endswith(page):
+                        cids = [cid.split('&')[0]]
+                        titles = [titles[i]]
+                        break
+
+            # no multi-P
+            if not pages:
+                cids = [cid]
+                titles = [r1(r'<option value=.* selected>(.+)</option>', html) or title]
+
+            for i in range(len(cids)):
+                bilibili_download_by_cid(cids[i],
+                                         titles[i],
+                                         output_dir=output_dir,
+                                         merge=merge,
+                                         info_only=info_only)
+
+    elif t == 'vid':
+        sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
+    elif t == 'ykid':
+        youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
+    elif t == 'uid':
+        tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only)
+    else:
+        raise NotImplementedError(flashvars)
 
     if not info_only and not dry_run:
         if not kwargs['caption']:
@@ -82,6 +193,7 @@
         with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x:
             x.write(xml)
 
+
 site_info = "bilibili.com"
 download = bilibili_download
-download_playlist = bilibili_download
+download_playlist = bilibili_download
\ No newline at end of file
diff --git a/src/you_get/extractors/bokecc.py b/src/you_get/extractors/bokecc.py
new file mode 100644
index 00000000..8566e828
--- /dev/null
+++ b/src/you_get/extractors/bokecc.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+from ..common import *
+from ..extractor import VideoExtractor
+import xml.etree.ElementTree as ET
+
+class BokeCC(VideoExtractor):
+    name = "BokeCC"
+
+    stream_types = [  # we do not know for now, as we have to check the
+                      # output from the API
+    ]
+
+    API_ENDPOINT = 'http://p.bokecc.com/'
+
+
+    def download_by_id(self, vid='', title=None, output_dir='.', merge=True, info_only=False, **kwargs):
+        """self, str->None
+
+        Keyword arguments:
+        self: self
+        vid: The video ID for BokeCC cloud, something like
+        FE3BB999594978049C33DC5901307461
+
+        Calls prepare() to download the video.
+
+        If no title is provided, this method shall try to find a proper title
+        with the information provided within the
+        returned content of the API."""
+
+        assert vid
+
+        self.prepare(vid=vid, title=title, **kwargs)
+
+        self.extract(**kwargs)
+
+        self.download(output_dir=output_dir,
+                      merge=merge,
+                      info_only=info_only, **kwargs)
+
+    def prepare(self, vid='', title=None, **kwargs):
+        assert vid
+
+        api_url = self.API_ENDPOINT + \
+                  'servlet/playinfo?vid={vid}&m=0'.format(vid=vid)  # returns XML
+
+        html = get_content(api_url)
+        self.tree = ET.ElementTree(ET.fromstring(html))
+
+        if self.tree.find('result').text != '1':
+            log.wtf('API result says failed!')
+            raise
+
+        if title is None:
+            self.title = '_'.join([i.text for i in self.tree.iterfind('video/videomarks/videomark/markdesc')])
+        else:
+            self.title = title
+
+        for i in self.tree.iterfind('video/quality'):
+            quality = i.attrib['value']
+            url = i[0].attrib['playurl']
+            self.stream_types.append({'id': quality,
+                                      'video_profile': i.attrib['desp']})
+            self.streams[quality] = {'url': url,
+                                     'video_profile': i.attrib['desp']}
+        self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams]
+
+
+    def extract(self, **kwargs):
+        for i in self.streams:
+            s = self.streams[i]
+            _, s['container'], s['size'] = url_info(s['url'])
+            s['src'] = [s['url']]
+        if 'stream_id' in kwargs and kwargs['stream_id']:
+            # Extract the stream
+            stream_id = kwargs['stream_id']
+
+            if stream_id not in self.streams:
+                log.e('[Error] Invalid video format.')
+                log.e('Run \'-i\' command with no specific video format to view all available formats.')
+                exit(2)
+        else:
+            # Extract the stream with the best quality
+            stream_id = self.streams_sorted[0]['id']
+            _, s['container'], s['size'] = url_info(s['url'])
+            s['src'] = [s['url']]
+
+site = BokeCC()
+
+# I don't know how to call the player directly so I just put it here
+# just in case anyone touches it -- Beining@Aug.24.2016
+#download = site.download_by_url
+#download_playlist = site.download_by_url
+
+bokecc_download_by_id = site.download_by_id
diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py
index b0d929c9..ebab70f8 100644
--- a/src/you_get/extractors/universal.py
+++ b/src/you_get/extractors/universal.py
@@ -6,20 +6,20 @@ from ..common import *
 from .embed import *
 
 def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    try:
-        embed_download(url, output_dir, merge=merge, info_only=info_only)
-    except: pass
-    else: return
+    content_type = get_head(url, headers=fake_headers)['Content-Type']
+    if content_type.startswith('text/html'):
+        try:
+            embed_download(url, output_dir, merge=merge, info_only=info_only)
+        except: pass
+        else: return
 
     domains = url.split('/')[2].split('.')
     if len(domains) > 2: domains = domains[1:]
     site_info = '.'.join(domains)
 
-    response = get_response(url, faker=True)
-    content_type = response.headers['Content-Type']
-
     if content_type.startswith('text/html'):
         # extract an HTML page
+        response = get_response(url, faker=True)
         page = str(response.data)
         page_title = r1(r'<title>([^<]*)</title>', page)
diff --git a/src/you_get/extractors/wanmen.py b/src/you_get/extractors/wanmen.py
new file mode 100755
index 00000000..20c543c1
--- /dev/null
+++ b/src/you_get/extractors/wanmen.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+__all__ = ['wanmen_download', 'wanmen_download_by_course', 'wanmen_download_by_course_topic', 'wanmen_download_by_course_topic_part']
+
+from ..common import *
+from .bokecc import bokecc_download_by_id
+from json import loads
+
+
+## Helper functions
+def _wanmen_get_json_api_content_by_courseID(courseID):
+    """int->JSON
+
+    Return a parsed JSON tree of WanMen's API."""
+
+    return loads(get_content('http://api.wanmen.org/course/getCourseNested/{courseID}'.format(courseID=courseID)))
+
+def _wanmen_get_title_by_json_topic_part(json_content, tIndex, pIndex):
+    """JSON, int, int->str
+
+    Get a proper title with courseID+topicID+partID."""
+
+    return '_'.join([json_content[0]['name'],
+                     json_content[0]['Topics'][tIndex]['name'],
+                     json_content[0]['Topics'][tIndex]['Parts'][pIndex]['name']])
+
+
+def _wanmen_get_boke_id_by_json_topic_part(json_content, tIndex, pIndex):
+    """JSON, int, int->str
+
+    Get one BokeCC video ID with courseID+topicID+partID."""
+
+    return json_content[0]['Topics'][tIndex]['Parts'][pIndex]['ccVideoLink']
+
+
+## Parsers
+def wanmen_download_by_course(json_api_content, output_dir='.', merge=True, info_only=False, **kwargs):
+    """JSON->None
+
+    Download a WHOLE course.
+    Reuse the API call to save time."""
+
+    for tIndex in range(len(json_api_content[0]['Topics'])):
+        for pIndex in range(len(json_api_content[0]['Topics'][tIndex]['Parts'])):
+            wanmen_download_by_course_topic_part(json_api_content,
+                                                 tIndex,
+                                                 pIndex,
+                                                 output_dir=output_dir,
+                                                 merge=merge,
+                                                 info_only=info_only,
+                                                 **kwargs)
+
+
+def wanmen_download_by_course_topic(json_api_content, tIndex, output_dir='.', merge=True, info_only=False, **kwargs):
+    """JSON, int->None
+
+    Download a TOPIC of a course.
+    Reuse the API call to save time."""
+
+    for pIndex in range(len(json_api_content[0]['Topics'][tIndex]['Parts'])):
+        wanmen_download_by_course_topic_part(json_api_content,
+                                             tIndex,
+                                             pIndex,
+                                             output_dir=output_dir,
+                                             merge=merge,
+                                             info_only=info_only,
+                                             **kwargs)
+
+def wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex, output_dir='.', merge=True, info_only=False, **kwargs):
+    """JSON, int, int->None
+
+    Download ONE PART of the course."""
+
+    json_content = json_api_content
+
+    title = _wanmen_get_title_by_json_topic_part(json_content,
+                                                 tIndex,
+                                                 pIndex)
+
+    bokeccID = _wanmen_get_boke_id_by_json_topic_part(json_content,
+                                                      tIndex,
+                                                      pIndex)
+
+    bokecc_download_by_id(vid=bokeccID, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
+
+
+## Main entrance
+def wanmen_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+
+    if 'wanmen.org' not in url:
+        log.wtf('You are at the wrong place dude. This is for WanMen University!')
+        raise
+
+    courseID = int(match1(url, r'course/(\d+)') or 0)
+    assert courseID > 0  # without a courseID we cannot do anything
+
+    tIndex = int(match1(url, r'tIndex=(\d+)') or -1)
+
+    pIndex = int(match1(url, r'pIndex=(\d+)') or -1)
+
+    json_api_content = _wanmen_get_json_api_content_by_courseID(courseID)
+
+    if pIndex >= 0:  # only download ONE single part
+        assert tIndex >= 0
+        wanmen_download_by_course_topic_part(json_api_content, tIndex, pIndex,
+                                             output_dir=output_dir,
+                                             merge=merge,
+                                             info_only=info_only)
+    elif tIndex >= 0:  # download a whole topic
+        wanmen_download_by_course_topic(json_api_content, tIndex,
+                                        output_dir=output_dir,
+                                        merge=merge,
+                                        info_only=info_only)
+    else:  # download the whole course
+        wanmen_download_by_course(json_api_content,
+                                  output_dir=output_dir,
+                                  merge=merge,
+                                  info_only=info_only)
+
+
+site_info = "WanMen University"
+download = wanmen_download
+download_playlist = wanmen_download
\ No newline at end of file
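
A note on the playurl signing used in bilibili_download_by_cids / bilibili_download_by_cid above: the sign parameter is just the MD5 hex digest of the unsigned query string with SECRETKEY_MINILOADER appended. A minimal standalone sketch of that computation (the cid value here is made up for illustration):

import hashlib

SECRETKEY_MINILOADER = '1c15888dc316e05a15fdd0a02ed6584f'

def signed_playurl(cid):
    # MD5 over the bare query string plus the secret key, mirroring
    # bilibili_download_by_cid in the diff above.
    payload = 'cid={cid}&from=miniplay&player=1{key}'.format(cid=cid, key=SECRETKEY_MINILOADER)
    sign = hashlib.md5(payload.encode('utf-8')).hexdigest()
    return 'http://interface.bilibili.com/playurl?&cid=%s&from=miniplay&player=1&sign=%s' % (cid, sign)

print(signed_playurl('1234567'))  # '1234567' is a hypothetical cid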
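BokeCC.prepare() implies a rough shape for the playinfo response: a result flag, markdesc nodes used for the fallback title, and quality nodes whose first child carries a playurl attribute. The sample document below is an assumption pieced together from that parsing code, not a captured API response; element names other than the ones prepare() reads are guesses:

import xml.etree.ElementTree as ET

SAMPLE = '''<response>
  <result>1</result>
  <video>
    <videomarks>
      <videomark><markdesc>Lecture 1</markdesc></videomark>
    </videomarks>
    <quality value="1" desp="SD">
      <copy playurl="http://example.com/video-sd.mp4"/>
    </quality>
    <quality value="2" desp="HD">
      <copy playurl="http://example.com/video-hd.mp4"/>
    </quality>
  </video>
</response>'''

tree = ET.ElementTree(ET.fromstring(SAMPLE))
assert tree.find('result').text == '1'
# Fallback title, as in prepare() when no title is passed in.
title = '_'.join(i.text for i in tree.iterfind('video/videomarks/videomark/markdesc'))
for q in tree.iterfind('video/quality'):
    # prepare() reads the id/description from attributes and the URL
    # from the first child element's playurl attribute (q[0]).
    print(q.attrib['value'], q.attrib['desp'], q[0].attrib['playurl'])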
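wanmen_download routes purely on the URL query string: a pIndex selects a single part, a tIndex alone selects a whole topic, and neither means the whole course. A sketch of that dispatch under the assumed URL shape (the example URL is invented to satisfy the regexes above; real wanmen.org URLs may differ):

import re

def wanmen_dispatch(url):
    course_id = int(re.search(r'course/(\d+)', url).group(1))
    t = re.search(r'tIndex=(\d+)', url)
    p = re.search(r'pIndex=(\d+)', url)
    if p:  # -> wanmen_download_by_course_topic_part
        return ('part', course_id, int(t.group(1)), int(p.group(1)))
    if t:  # -> wanmen_download_by_course_topic
        return ('topic', course_id, int(t.group(1)), None)
    return ('course', course_id, None, None)  # -> wanmen_download_by_course

print(wanmen_dispatch('http://www.wanmen.org/course/57?tIndex=1&pIndex=2'))
# ('part', 57, 1, 2)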