diff --git a/README.md b/README.md index 40a26803..98c403c3 100644 --- a/README.md +++ b/README.md @@ -408,6 +408,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | Naver
네이버 | |✓| | | | 芒果TV | |✓| | | | 火猫TV | |✓| | | +| 全民Tv | |✓| | | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. diff --git a/src/you_get/common.py b/src/you_get/common.py index 0100cae7..f320f6ab 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -65,6 +65,7 @@ SITES = { 'pptv' : 'pptv', 'qianmo' : 'qianmo', 'qq' : 'qq', + 'quanmin' : 'quanmin', 'showroom-live' : 'showroom', 'sina' : 'sina', 'smgbb' : 'bilibili', @@ -338,7 +339,7 @@ def get_content(url, headers={}, decoded=True): if charset is not None: data = data.decode(charset) else: - data = data.decode('utf-8') + data = data.decode('utf-8', 'ignore') return data @@ -395,12 +396,12 @@ def url_size(url, faker = False, headers = {}): def urls_size(urls, faker = False, headers = {}): return sum([url_size(url, faker=faker, headers=headers) for url in urls]) -def get_head(url, headers = {}): +def get_head(url, headers = {}, get_method = 'HEAD'): if headers: req = request.Request(url, headers = headers) else: req = request.Request(url) - req.get_method = lambda : 'HEAD' + req.get_method = lambda : get_method res = request.urlopen(req) return dict(res.headers) @@ -968,11 +969,15 @@ def download_url_ffmpeg(url,title, ext,params={}, total_size=0, output_dir='.', from .processor.ffmpeg import has_ffmpeg_installed, ffmpeg_download_stream assert has_ffmpeg_installed(), "FFmpeg not installed." + global output_filename - if(output_filename): + if output_filename: dotPos = output_filename.rfind(".") title = output_filename[:dotPos] ext = output_filename[dotPos+1:] + + title = tr(get_filename(title)) + ffmpeg_download_stream(url, title, ext, params, output_dir) def playlist_not_supported(name): diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py index 594b908e..332440dd 100644 --- a/src/you_get/extractor.py +++ b/src/you_get/extractor.py @@ -206,7 +206,7 @@ class VideoExtractor(): output_dir=kwargs['output_dir'], merge=kwargs['merge'], av=stream_id in self.dash_streams) - if not kwargs['caption']: + if 'caption' not in kwargs or not kwargs['caption']: print('Skipping captions.') return for lang in self.caption_tracks: diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 122dea0b..5f00ffe9 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -127,10 +127,11 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs if re.match(r'https?://bangumi\.bilibili\.com/', url): # quick hack for bangumi URLs - episode_id = r1(r'data-current-episode-id="(\d+)"', html) + episode_id = r1(r'#(\d+)$', url) or r1(r'first_ep_id = "(\d+)"', html) cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data={'episode_id': episode_id}) cid = json.loads(cont)['result']['cid'] + title = '%s [%s]' % (title, episode_id) bilibili_download_by_cid(str(cid), title, output_dir=output_dir, merge=merge, info_only=info_only) else: diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py index fc4015c4..3bdb924c 100644 --- a/src/you_get/extractors/embed.py +++ b/src/you_get/extractors/embed.py @@ -2,6 +2,7 @@ __all__ = ['embed_download'] from ..common import * +from .bilibili import bilibili_download from .iqiyi import iqiyi_download_by_vid from .le import letvcloud_download_by_vu from .netease import netease_download @@ -42,6 +43,11 @@ netease_embed_patterns = [ '(http://\w+\.163\.com/movie/[^\'"]+)' ] vimeo_embed_patters = [ 'player\.vimeo\.com/video/(\d+)' ] +""" +check the share button on http://www.bilibili.com/video/av5079467/ +""" +bilibili_embed_patterns = [ 'static\.hdslb\.com/miniloader\.swf.*aid=(\d+)' ] + def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwargs): content = get_content(url, headers=fake_headers) @@ -78,6 +84,12 @@ def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwa found = True vimeo_download_by_id(url, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + aids = matchall(content, bilibili_embed_patterns) + for aid in aids: + found = True + url = 'http://www.bilibili.com/video/av%s/' % aid + bilibili_download(url, output_dir=output_dir, merge=merge, info_only=info_only) + if not found: raise NotImplementedError(url) diff --git a/src/you_get/extractors/lizhi.py b/src/you_get/extractors/lizhi.py index 56dbf756..65988a9f 100644 --- a/src/you_get/extractors/lizhi.py +++ b/src/you_get/extractors/lizhi.py @@ -4,37 +4,55 @@ __all__ = ['lizhi_download'] import json from ..common import * -def lizhi_download_playlist(url, output_dir = '.', merge = True, info_only = False, **kwargs): - # like this http://www.lizhi.fm/#/31365/ - #api desc: s->start l->length band->some radio - #http://www.lizhi.fm/api/radio_audios?s=0&l=100&band=31365 - band_id = match1(url,r'#/(\d+)') - #try to get a considerable large l to reduce html parsing task. - api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band='+band_id - content_json = json.loads(get_content(api_url)) - for sound in content_json: - title = sound["name"] - res_url = sound["url"] - songtype, ext, size = url_info(res_url,faker=True) - print_info(site_info, title, songtype, size) - if not info_only: - #no referer no speed! - download_urls([res_url], title, ext, size, output_dir, merge=merge ,refer = 'http://www.lizhi.fm',faker=True) - pass +# radio_id: e.g. 549759 from http://www.lizhi.fm/549759/ +# +# Returns a list of tuples (audio_id, title, url) for each episode +# (audio) in the radio playlist. url is the direct link to the audio +# file. +def lizhi_extract_playlist_info(radio_id): + # /api/radio_audios API parameters: + # + # - s: starting episode + # - l: count (per page) + # - band: radio_id + # + # We use l=65535 for poor man's pagination (that is, no pagination + # at all -- hope all fits on a single page). + # + # TODO: Use /api/radio?band={radio_id} to get number of episodes + # (au_cnt), then handle pagination properly. + api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band=%s' % radio_id + api_response = json.loads(get_content(api_url)) + return [(ep['id'], ep['name'], ep['url']) for ep in api_response] -def lizhi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): - # url like http://www.lizhi.fm/#/549759/18864883431656710 - api_id = match1(url,r'#/(\d+/\d+)') - api_url = 'http://www.lizhi.fm/api/audio/'+api_id - content_json = json.loads(get_content(api_url)) - title = content_json["audio"]["name"] - res_url = content_json["audio"]["url"] - songtype, ext, size = url_info(res_url,faker=True) - print_info(site_info, title, songtype, size) +def lizhi_download_audio(audio_id, title, url, output_dir='.', info_only=False): + filetype, ext, size = url_info(url) + print_info(site_info, title, filetype, size) if not info_only: - #no referer no speed! - download_urls([res_url], title, ext, size, output_dir, merge=merge ,refer = 'http://www.lizhi.fm',faker=True) + download_urls([url], title, ext, size, output_dir=output_dir) +def lizhi_download_playlist(url, output_dir='.', info_only=False, **kwargs): + # Sample URL: http://www.lizhi.fm/549759/ + radio_id = match1(url,r'/(\d+)') + if not radio_id: + raise NotImplementedError('%s not supported' % url) + for audio_id, title, url in lizhi_extract_playlist_info(radio_id): + lizhi_download_audio(audio_id, title, url, output_dir=output_dir, info_only=info_only) + +def lizhi_download(url, output_dir='.', info_only=False, **kwargs): + # Sample URL: http://www.lizhi.fm/549759/18864883431656710/ + m = re.search(r'/(?P\d+)/(?P\d+)', url) + if not m: + raise NotImplementedError('%s not supported' % url) + radio_id = m.group('radio_id') + audio_id = m.group('audio_id') + # Look for the audio_id among the full list of episodes + for aid, title, url in lizhi_extract_playlist_info(radio_id): + if aid == audio_id: + lizhi_download_audio(audio_id, title, url, output_dir=output_dir, info_only=info_only) + break + else: + raise NotImplementedError('Audio #%s not found in playlist #%s' % (audio_id, radio_id)) site_info = "lizhi.fm" download = lizhi_download diff --git a/src/you_get/extractors/magisto.py b/src/you_get/extractors/magisto.py index 2a53be02..b2e8e502 100644 --- a/src/you_get/extractors/magisto.py +++ b/src/you_get/extractors/magisto.py @@ -3,15 +3,19 @@ __all__ = ['magisto_download'] from ..common import * +import json def magisto_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) - - title1 = r1(r'None - + Arguments: client_id: An ID per client. For now we only know Acfun's such ID. - + vid: An video ID for each video, starts with "C". - + kwargs['embsig']: Youku COOP's anti hotlinking. For Acfun, an API call must be done to Acfun's server, or the "playsign" of the content of sign_url shall be empty. - + Misc: Override the original one with VideoExtractor. - + Author: Most of the credit are to @ERioK, who gave his POC. - + History: Jul.28.2016 Youku COOP now have anti hotlinking via embsig. """ self.f_code_1 = '10ehfkbv' #can be retrived by running r.translate with the keys and the list e self.f_code_2 = 'msjv7h2b' - + # as in VideoExtractor self.url = None self.vid = vid self.name = "优酷开放平台 (Youku COOP)" #A little bit of work before self.prepare - + #Change as Jul.28.2016 Youku COOP updates its platform to add ant hotlinking if kwargs['embsig']: sign_url = "https://api.youku.com/players/custom.json?client_id={client_id}&video_id={video_id}&embsig={embsig}".format(client_id = client_id, video_id = vid, embsig = kwargs['embsig']) @@ -371,9 +371,9 @@ class Youku(VideoExtractor): #to be injected and replace ct10 and 12 api85_url = 'http://play.youku.com/partner/get.json?cid={client_id}&vid={vid}&ct=85&sign={playsign}'.format(client_id = client_id, vid = vid, playsign = playsign) api86_url = 'http://play.youku.com/partner/get.json?cid={client_id}&vid={vid}&ct=86&sign={playsign}'.format(client_id = client_id, vid = vid, playsign = playsign) - + self.prepare(api_url = api85_url, api12_url = api86_url, ctype = 86, **kwargs) - + #exact copy from original VideoExtractor if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']: unset_proxy() diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 64af5c14..c403cb74 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -148,6 +148,17 @@ class YouTube(VideoExtractor): elif video_info['status'] == ['ok']: if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']: self.title = parse.unquote_plus(video_info['title'][0]) + + # YouTube Live + if 'url_encoded_fmt_stream_map' not in video_info: + hlsvp = video_info['hlsvp'][0] + + if 'info_only' in kwargs and kwargs['info_only']: + return + else: + download_url_ffmpeg(hlsvp, self.title, 'mp4') + exit(0) + stream_list = video_info['url_encoded_fmt_stream_map'][0].split(',') # Parse video page (for DASH) @@ -258,11 +269,17 @@ class YouTube(VideoExtractor): burls = rep.getElementsByTagName('BaseURL') dash_mp4_a_url = burls[0].firstChild.nodeValue dash_mp4_a_size = burls[0].getAttribute('yt:contentLength') + if not dash_mp4_a_size: + try: dash_mp4_a_size = url_size(dash_mp4_a_url) + except: continue elif mimeType == 'audio/webm': rep = aset.getElementsByTagName('Representation')[-1] burls = rep.getElementsByTagName('BaseURL') dash_webm_a_url = burls[0].firstChild.nodeValue dash_webm_a_size = burls[0].getAttribute('yt:contentLength') + if not dash_webm_a_size: + try: dash_webm_a_size = url_size(dash_webm_a_url) + except: continue elif mimeType == 'video/mp4': for rep in aset.getElementsByTagName('Representation'): w = int(rep.getAttribute('width')) @@ -271,6 +288,9 @@ class YouTube(VideoExtractor): burls = rep.getElementsByTagName('BaseURL') dash_url = burls[0].firstChild.nodeValue dash_size = burls[0].getAttribute('yt:contentLength') + if not dash_size: + try: dash_size = url_size(dash_url) + except: continue self.dash_streams[itag] = { 'quality': '%sx%s' % (w, h), 'itag': itag, @@ -288,6 +308,9 @@ class YouTube(VideoExtractor): burls = rep.getElementsByTagName('BaseURL') dash_url = burls[0].firstChild.nodeValue dash_size = burls[0].getAttribute('yt:contentLength') + if not dash_size: + try: dash_size = url_size(dash_url) + except: continue self.dash_streams[itag] = { 'quality': '%sx%s' % (w, h), 'itag': itag, diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py old mode 100644 new mode 100755 index a8599e52..da7c076c --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +import logging import os.path import subprocess from ..util.strings import parameterize @@ -21,11 +22,26 @@ def get_usable_ffmpeg(cmd): return None FFMPEG, FFMPEG_VERSION = get_usable_ffmpeg('ffmpeg') or get_usable_ffmpeg('avconv') or (None, None) -LOGLEVEL = ['-loglevel', 'quiet'] +if logging.getLogger().isEnabledFor(logging.DEBUG): + LOGLEVEL = ['-loglevel', 'info'] +else: + LOGLEVEL = ['-loglevel', 'quiet'] def has_ffmpeg_installed(): return FFMPEG is not None +# Given a list of segments and the output path, generates the concat +# list and returns the path to the concat list. +def generate_concat_list(files, output): + concat_list_path = output + '.txt' + concat_list_dir = os.path.dirname(concat_list_path) + with open(concat_list_path, 'w', encoding='utf-8') as concat_list: + for file in files: + if os.path.isfile(file): + relpath = os.path.relpath(file, start=concat_list_dir) + concat_list.write('file %s\n' % parameterize(relpath)) + return concat_list_path + def ffmpeg_concat_av(files, output, ext): print('Merging video parts... ', end="", flush=True) params = [FFMPEG] + LOGLEVEL @@ -52,17 +68,9 @@ def ffmpeg_convert_ts_to_mkv(files, output='output.mkv'): def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): - concat_list = open(output + '.txt', 'w', encoding="utf-8") - for file in files: - if os.path.isfile(file): - concat_list.write("file %s\n" % parameterize(file)) - concat_list.close() - - params = [FFMPEG] + LOGLEVEL - params.extend(['-f', 'concat', '-safe', '-1', '-y', '-i']) - params.append(output + '.txt') - params += ['-c', 'copy', output] - + concat_list = generate_concat_list(files, output) + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + '-i', concat_list, '-c', 'copy', output] if subprocess.call(params) == 0: os.remove(output + '.txt') return True @@ -115,18 +123,10 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): print('Merging video parts... ', end="", flush=True) # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): - concat_list = open(output + '.txt', 'w', encoding="utf-8") - for file in files: - if os.path.isfile(file): - # for escaping rules, see: - # https://www.ffmpeg.org/ffmpeg-utils.html#Quoting-and-escaping - concat_list.write("file %s\n" % parameterize(file)) - concat_list.close() - - params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i'] - params.append(output + '.txt') - params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output] - + concat_list = generate_concat_list(files, output) + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + '-i', concat_list, '-c', 'copy', + '-bsf:a', 'aac_adtstoasc', output] subprocess.check_call(params) os.remove(output + '.txt') return True @@ -162,16 +162,10 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): print('Merging video parts... ', end="", flush=True) # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): - concat_list = open(output + '.txt', 'w', encoding="utf-8") - for file in files: - if os.path.isfile(file): - concat_list.write("file %s\n" % parameterize(file)) - concat_list.close() - - params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i'] - params.append(output + '.txt') - params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output] - + concat_list = generate_concat_list(files, output) + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + '-i', concat_list, '-c', 'copy', + '-bsf:a', 'aac_adtstoasc', output] subprocess.check_call(params) os.remove(output + '.txt') return True diff --git a/src/you_get/version.py b/src/you_get/version.py index 28919906..2e8e4f41 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.595' +__version__ = '0.4.626' diff --git a/tests/test.py b/tests/test.py index 0fa2979a..020455b0 100644 --- a/tests/test.py +++ b/tests/test.py @@ -18,9 +18,6 @@ class YouGetTests(unittest.TestCase): def test_magisto(self): magisto.download("http://www.magisto.com/album/video/f3x9AAQORAkfDnIFDA", info_only=True) - def test_mixcloud(self): - mixcloud.download("http://www.mixcloud.com/DJVadim/north-america-are-you-ready/", info_only=True) - def test_youtube(self): youtube.download("http://www.youtube.com/watch?v=pzKerr0JIPA", info_only=True) youtube.download("http://youtu.be/pzKerr0JIPA", info_only=True) diff --git a/you-get.json b/you-get.json index 084657d9..594742c2 100644 --- a/you-get.json +++ b/you-get.json @@ -24,6 +24,7 @@ "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", "Topic :: Internet", "Topic :: Internet :: WWW/HTTP", "Topic :: Multimedia",