diff --git a/.gitignore b/.gitignore
index d22d3afe..0888e5ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -81,3 +81,5 @@ _*
 *.xml
 /.env
 /.idea
+*.m4a
+*.DS_Store
diff --git a/src/you_get/common.py b/src/you_get/common.py
index 12f15fae..55659986 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -75,6 +75,7 @@ SITES = {
     'tumblr'           : 'tumblr',
     'twimg'            : 'twitter',
     'twitter'          : 'twitter',
+    'ucas'             : 'ucas',
     'videomega'        : 'videomega',
     'vidto'            : 'vidto',
     'vimeo'            : 'vimeo',
@@ -86,8 +87,10 @@ SITES = {
     'xiami'            : 'xiami',
     'xiaokaxiu'        : 'yixia',
     'xiaojiadianvideo' : 'fc2video',
+    'ximalaya'         : 'ximalaya',
     'yinyuetai'        : 'yinyuetai',
     'miaopai'          : 'yixia',
+    'yizhibo'          : 'yizhibo',
     'youku'            : 'youku',
     'youtu'            : 'youtube',
     'youtube'          : 'youtube',
@@ -256,6 +259,8 @@ def undeflate(data):
 
 # DEPRECATED in favor of get_content()
 def get_response(url, faker = False):
+    logging.debug('get_response: %s' % url)
+
     # install cookies
     if cookies:
         opener = request.build_opener(request.HTTPCookieProcessor(cookies))
@@ -290,6 +295,8 @@ def get_decoded_html(url, faker = False):
     return data
 
 def get_location(url):
+    logging.debug('get_location: %s' % url)
+
     response = request.urlopen(url)
     # urllib will follow redirections and it's too much code to tell urllib
     # not to do that
@@ -395,6 +402,8 @@ def urls_size(urls, faker = False, headers = {}):
     return sum([url_size(url, faker=faker, headers=headers) for url in urls])
 
 def get_head(url, headers = {}, get_method = 'HEAD'):
+    logging.debug('get_head: %s' % url)
+
     if headers:
         req = request.Request(url, headers=headers)
     else:
@@ -404,6 +413,8 @@ def get_head(url, headers = {}, get_method = 'HEAD'):
     return dict(res.headers)
 
 def url_info(url, faker = False, headers = {}):
+    logging.debug('url_info: %s' % url)
+
     if faker:
         response = urlopen_with_retry(request.Request(url, headers=fake_headers))
     elif headers:
@@ -457,6 +468,8 @@ def url_info(url, faker = False, headers = {}):
 def url_locations(urls, faker = False, headers = {}):
     locations = []
     for url in urls:
+        logging.debug('url_locations: %s' % url)
+
         if faker:
             response = urlopen_with_retry(request.Request(url, headers=fake_headers))
         elif headers:
@@ -467,7 +480,10 @@ def url_locations(urls, faker = False, headers = {}):
         locations.append(response.url)
     return locations
 
-def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, headers = {}):
+def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, headers = {}, timeout = None, **kwargs):
+    # When a referer is specified via the refer param, the header key must be 'Referer' for the hack below to work
+    if refer is not None:
+        headers['Referer'] = refer
     file_size = url_size(url, faker = faker, headers = headers)
 
     if os.path.exists(filepath):
@@ -507,12 +523,14 @@
             headers = headers
         else:
             headers = {}
-        if received:
-            headers['Range'] = 'bytes=' + str(received) + '-'
+        headers['Range'] = 'bytes=' + str(received) + '-'
         if refer:
             headers['Referer'] = refer
 
-        response = urlopen_with_retry(request.Request(url, headers=headers))
+        if timeout:
+            response = urlopen_with_retry(request.Request(url, headers=headers), timeout=timeout)
+        else:
+            response = urlopen_with_retry(request.Request(url, headers=headers))
         try:
             range_start = int(response.headers['content-range'][6:].split('/')[0].split('-')[0])
             end_length = int(response.headers['content-range'][6:].split('/')[1])
@@ -635,7 +653,7 @@ class SimpleProgressBar:
         total_str = '%5s' % round(self.total_size / 1048576, 1)
        total_str_width = max(len(total_str), 5)
         self.bar_size = self.term_size - 27 - 2*total_pieces_len - 2*total_str_width
-        self.bar = '{:>4}%% ({:>%s}/%sMB) ├{:─<%s}┤[{:>%s}/{:>%s}] {}' % (
+        self.bar = '{:>4}%% ({:>%s}/%sMB) ├{:─<%s}┤[{:>%s}/{:>%s}] {}' % (
             total_str_width, total_str, self.bar_size, total_pieces_len, total_pieces_len)
 
     def update(self):
@@ -647,12 +665,12 @@ class SimpleProgressBar:
         dots = bar_size * int(percent) // 100
         plus = int(percent) - dots // bar_size * 100
         if plus > 0.8:
-            plus = '█'
+            plus = '█'
         elif plus > 0.4:
             plus = '>'
         else:
             plus = ''
-        bar = '█' * dots + plus
+        bar = '█' * dots + plus
         bar = self.bar.format(percent, round(self.received / 1048576, 1), bar, self.current_piece, self.total_pieces, self.speed)
         sys.stdout.write('\r' + bar)
         sys.stdout.flush()
@@ -719,7 +737,10 @@ class DummyProgressBar:
 def get_output_filename(urls, title, ext, output_dir, merge, **kwargs):
     # lame hack for the --output-filename option
     global output_filename
-    if output_filename: return output_filename
+    if output_filename:
+        if ext:
+            return output_filename + '.' + ext
+        return output_filename
 
     merged_ext = ext
     if (len(urls) > 1) and merge:
@@ -781,7 +802,7 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg
         url = urls[0]
         print('Downloading %s ...' % tr(output_filename))
         bar.update()
-        url_save(url, output_filepath, bar, refer = refer, faker = faker, headers = headers)
+        url_save(url, output_filepath, bar, refer = refer, faker = faker, headers = headers, **kwargs)
         bar.done()
     else:
         parts = []
@@ -793,7 +814,7 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg
             parts.append(filepath)
             #print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls))
             bar.update_piece(i + 1)
-            url_save(url, filepath, bar, refer = refer, is_part = True, faker = faker, headers = headers)
+            url_save(url, filepath, bar, refer = refer, is_part = True, faker = faker, headers = headers, **kwargs)
             bar.done()
 
         if not merge:
@@ -1042,7 +1063,7 @@ def print_info(site_info, title, type, size):
         type_info = "Advanced Systems Format (%s)" % type
     #elif type in ['video/mpeg']:
    #    type_info = "MPEG video (%s)" % type
-    elif type in ['audio/mp4']:
+    elif type in ['audio/mp4', 'audio/m4a']:
         type_info = "MPEG-4 audio (%s)" % type
     elif type in ['audio/mpeg']:
         type_info = "MP3 (%s)" % type
@@ -1175,18 +1196,20 @@ def script_main(script_name, download, download_playlist, **kwargs):
     -s | --socks-proxy <HOST:PORT>       Use an SOCKS5 proxy for downloading.
     -t | --timeout <SECONDS>             Set socket timeout.
     -d | --debug                         Show traceback and other debug info.
+    -I | --input-file <FILE>             Read non-playlist URLs from FILE.
     '''
-    short_opts = 'Vhfiuc:ndF:O:o:p:x:y:s:t:'
-    opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'socks-proxy=', 'extractor-proxy=', 'lang=', 'timeout=']
+    short_opts = 'Vhfiuc:ndF:O:o:p:x:y:s:t:I:'
+    opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'socks-proxy=', 'extractor-proxy=', 'lang=', 'timeout=', 'input-file=']
     EXTRA_OPTS = 'sortbyidx tofile beginidx='.split()
     opts += EXTRA_OPTS
-    if download_playlist:
-        short_opts = 'l' + short_opts
-        opts = ['playlist'] + opts
+    # dead code? download_playlist is a function here, hence always truthy
+    # if download_playlist:
+    short_opts = 'l' + short_opts
+    opts = ['playlist'] + opts
 
     try:
-        opts, args = getopt.getopt(sys.argv[1:], short_opts, opts)
+        opts, args = getopt.gnu_getopt(sys.argv[1:], short_opts, opts)
     except getopt.GetoptError as err:
         log.e(err)
         log.e("try 'you-get --help' for more options")
         sys.exit(2)
@@ -1212,6 +1235,7 @@ def script_main(script_name, download, download_playlist, **kwargs):
     extractor_proxy = None
     traceback = False
     timeout = 600
+    urls_from_file = []
     extra_opts = {}
     for o, a in opts:
         if o in ('-V', '--version'):
@@ -1290,6 +1314,15 @@ def script_main(script_name, download, download_playlist, **kwargs):
             lang = a
         elif o in ('-t', '--timeout'):
             timeout = int(a)
+        elif o in ('-I', '--input-file'):
+            logging.debug('you are trying to load urls from {}'.format(a))
+            if playlist:
+                log.e("reading playlist from a file is unsupported and won't make your life easier")
+                sys.exit(2)
+            with open(a, 'r') as input_file:
+                for line in input_file:
+                    url = line.strip()
+                    urls_from_file.append(url)
         else:
             oky = o.strip('-')
             if oky in EXTRA_OPTS or oky + '=' in EXTRA_OPTS:
@@ -1297,9 +1330,10 @@ def script_main(script_name, download, download_playlist, **kwargs):
             else:
                 log.e("try 'you-get --help' for more options")
                 sys.exit(2)
-    if not args:
+    if not args and not urls_from_file:
         print(help)
         sys.exit()
+    args.extend(urls_from_file)
 
     if socks_proxy:
         try:
@@ -1333,16 +1367,20 @@ def script_main(script_name, download, download_playlist, **kwargs):
             else:
                 download_main(download, download_playlist, args, playlist, stream_id=stream_id, extractor_proxy=extractor_proxy, output_dir=output_dir, merge=merge, info_only=info_only, json_output=json_output, caption=caption)
         else:
-            if not extractor_proxy:
-                download_main(download, download_playlist, args, playlist, output_dir=output_dir, merge=merge, info_only=info_only, json_output=json_output, caption=caption)
-            else:
-                download_main(download, download_playlist, args, playlist, extractor_proxy=extractor_proxy, output_dir=output_dir, merge=merge, info_only=info_only, json_output=json_output, caption=caption)
+            download_main(download, download_playlist, args, playlist, stream_id=stream_id, extractor_proxy=extractor_proxy, output_dir=output_dir, merge=merge, info_only=info_only, json_output=json_output, caption=caption)
+        else:
+            if not extractor_proxy:
+                download_main(download, download_playlist, args, playlist, output_dir=output_dir, merge=merge, info_only=info_only, json_output=json_output, caption=caption)
+            else:
+                download_main(download, download_playlist, args, playlist, extractor_proxy=extractor_proxy, output_dir=output_dir, merge=merge, info_only=info_only, json_output=json_output, caption=caption)
     except KeyboardInterrupt:
         if traceback:
             raise
         else:
             sys.exit(1)
     except UnicodeEncodeError:
+        if traceback:
+            raise
         log.e('[error] oops, the current environment does not seem to support Unicode.')
         log.e('please set it to a UTF-8-aware locale first,')
         log.e('so as to save the video (with some Unicode characters) correctly.')
@@ -1397,7 +1435,7 @@ def url_to_module(url):
     video_host = r1(r'https?://([^/]+)/', url)
     video_url = r1(r'https?://[^/]+(.*)', url)
 
-    if video_host.endswith('.com.cn'):
+    if video_host.endswith('.com.cn') or video_host.endswith('.ac.cn'):
         video_host = video_host[:-3]
     domain = r1(r'(\.[^.]+\.[^.]+)$', video_host) or video_host
     assert domain, 'unsupported url: ' + url
diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py
index 268c3105..a9938c0b 100644
--- a/src/you_get/extractor.py
+++ b/src/you_get/extractor.py
@@ -105,7 +105,7 @@ class VideoExtractor():
             if 'quality' in stream:
                 print("    quality:       %s" % stream['quality'])
 
-            if 'size' in stream:
+            if 'size' in stream and stream['container'].lower() != 'm3u8':
                 print("    size:          %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
 
             if 'itag' in stream:
diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py
index a027c396..6d6f56cf 100755
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@@ -66,6 +66,7 @@ from .tucao import *
 from .tudou import *
 from .tumblr import *
 from .twitter import *
+from .ucas import *
 from .veoh import *
 from .videomega import *
 from .vimeo import *
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index 47dfe045..ccb395cb 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -82,26 +82,32 @@ def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only
         _, type_, temp = url_info(url)
         size += temp or 0
 
     print_info(site_info, title, type_, size)
     if not info_only:
-        download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
+        download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge, headers={'Referer': 'http://www.bilibili.com/'})
 
 def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
-    sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
-    url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
-    urls = [i
-            if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
-            else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
-            for i in parse_cid_playurl(get_content(url))]
+    while True:
+        try:
+            sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest()
+            url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this
+            urls = [i
+                    if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i)
+                    else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i)
+                    for i in parse_cid_playurl(get_content(url))]
 
-    type_ = ''
-    size = 0
-    for url in urls:
-        _, type_, temp = url_info(url)
-        size += temp or 0
+            type_ = ''
+            size = 0
+            for url in urls:
+                _, type_, temp = url_info(url, headers={'Referer': 'http://www.bilibili.com/'})
+                size += temp or 0
 
-    print_info(site_info, title, type_, size)
-    if not info_only:
-        download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge)
+            print_info(site_info, title, type_, size)
+            if not info_only:
+                download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge, timeout=1, headers={'Referer': 'http://www.bilibili.com/'})
+        except socket.timeout:
+            continue
+        else:
+            break
 
 def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False):
diff --git a/src/you_get/extractors/ckplayer.py b/src/you_get/extractors/ckplayer.py
index 09e95557..91159897 100644
--- a/src/you_get/extractors/ckplayer.py
+++ b/src/you_get/extractors/ckplayer.py
@@ -9,7 +9,6 @@ __all__ = ['ckplayer_download']
 from xml.etree import cElementTree as ET
 from copy import copy
 from ..common import *
-
 #----------------------------------------------------------------------
 def ckplayer_get_info_by_xml(ckinfo):
     """str->dict
@@ -20,20 +19,22 @@ def ckplayer_get_info_by_xml(ckinfo):
     video_dict = {'title': '',
                   'links': [],
                   'size': 0,
                   'flashvars': '',}
-    if '_text' in dictify(e)['ckplayer']['info'][0]['title'][0]:  #title
-        video_dict['title'] = dictify(e)['ckplayer']['info'][0]['title'][0]['_text'].strip()
+    dictified = dictify(e)['ckplayer']
+    if 'info' in dictified:
+        if '_text' in dictified['info'][0]['title'][0]:  #title
+            video_dict['title'] = dictified['info'][0]['title'][0]['_text'].strip()
 
     #if dictify(e)['ckplayer']['info'][0]['title'][0]['_text'].strip():  #duration
         #video_dict['title'] = dictify(e)['ckplayer']['info'][0]['title'][0]['_text'].strip()
 
-    if '_text' in dictify(e)['ckplayer']['video'][0]['size'][0]:  #size exists for 1 piece
-        video_dict['size'] = sum([int(i['size'][0]['_text']) for i in dictify(e)['ckplayer']['video']])
+    if '_text' in dictified['video'][0]['size'][0]:  #size exists for 1 piece
+        video_dict['size'] = sum([int(i['size'][0]['_text']) for i in dictified['video']])
 
-    if '_text' in dictify(e)['ckplayer']['video'][0]['file'][0]:  #link exist
-        video_dict['links'] = [i['file'][0]['_text'].strip() for i in dictify(e)['ckplayer']['video']]
+    if '_text' in dictified['video'][0]['file'][0]:  #link exist
+        video_dict['links'] = [i['file'][0]['_text'].strip() for i in dictified['video']]
 
-    if '_text' in dictify(e)['ckplayer']['flashvars'][0]:
-        video_dict['flashvars'] = dictify(e)['ckplayer']['flashvars'][0]['_text'].strip()
+    if '_text' in dictified['flashvars'][0]:
+        video_dict['flashvars'] = dictified['flashvars'][0]['_text'].strip()
 
     return video_dict
diff --git a/src/you_get/extractors/cntv.py b/src/you_get/extractors/cntv.py
index e25fa961..87f1984f 100644
--- a/src/you_get/extractors/cntv.py
+++ b/src/you_get/extractors/cntv.py
@@ -32,6 +32,8 @@ def cntv_download_by_id(id, title = None, output_dir = '.', merge = True, info_o
 def cntv_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
     if re.match(r'http://tv\.cntv\.cn/video/(\w+)/(\w+)', url):
         id = match1(url, r'http://tv\.cntv\.cn/video/\w+/(\w+)')
+    elif re.match(r'http://tv\.cctv\.com/\d+/\d+/\d+/\w+.shtml', url):
+        id = r1(r'var guid = "(\w+)"', get_html(url))
     elif re.match(r'http://\w+\.cntv\.cn/(\w+/\w+/(classpage/video/)?)?\d+/\d+\.shtml', url) or \
          re.match(r'http://\w+.cntv.cn/(\w+/)*VIDE\d+.shtml', url) or \
          re.match(r'http://(\w+).cntv.cn/(\w+)/classpage/video/(\d+)/(\d+).shtml', url) or \
diff --git a/src/you_get/extractors/dilidili.py b/src/you_get/extractors/dilidili.py
index 082f84e1..f7b5922d 100644
--- a/src/you_get/extractors/dilidili.py
+++ b/src/you_get/extractors/dilidili.py
@@ -21,8 +21,9 @@ headers = {
 #----------------------------------------------------------------------
 def dilidili_parser_data_to_stream_types(typ ,vid ,hd2 ,sign, tmsign, ulk):
     """->list"""
+    another_url = 'https://newplayer.jfrft.com/parse.php?xmlurl=null&type={typ}&vid={vid}&hd={hd2}&sign={sign}&tmsign={tmsign}&userlink={ulk}'.format(typ = typ, vid = vid, hd2 = hd2, sign = sign, tmsign = tmsign, ulk = ulk)
     parse_url = 'http://player.005.tv/parse.php?xmlurl=null&type={typ}&vid={vid}&hd={hd2}&sign={sign}&tmsign={tmsign}&userlink={ulk}'.format(typ = typ, vid = vid, hd2 = hd2, sign = sign, tmsign = tmsign, ulk = ulk)
-    html = get_content(parse_url, headers=headers)
+    html = get_content(another_url, headers=headers)
 
     info = re.search(r'(\{[^{]+\})(\{[^{]+\})(\{[^{]+\})(\{[^{]+\})(\{[^{]+\})', html).groups()
     info = [i.strip('{}').split('->') for i in info]
@@ -35,13 +36,22 @@ def dilidili_parser_data_to_stream_types(typ ,vid ,hd2 ,sign, tmsign, ulk):
 
 #----------------------------------------------------------------------
 def dilidili_download(url, output_dir = '.', merge = False, info_only = False, **kwargs):
-    if re.match(r'http://www.dilidili.com/watch\S+', url):
+    global headers
+    re_str = r'http://www.dilidili.com/watch\S+'
+    if re.match(r'http://www.dilidili.wang', url):
+        re_str = r'http://www.dilidili.wang/watch\S+'
+        headers['Referer'] = 'http://www.dilidili.wang/'
+    elif re.match(r'http://www.dilidili.mobi', url):
+        re_str = r'http://www.dilidili.mobi/watch\S+'
+        headers['Referer'] = 'http://www.dilidili.mobi/'
+
+    if re.match(re_str, url):
         html = get_content(url)
         title = match1(html, r'<title>(.+)丨(.+)</title>')  #title
 
         # player loaded via internal iframe
         frame_url = re.search(r'