From 606e0a786e2ab631288d2f4567ed1d37334ae52e Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Sun, 4 Dec 2016 19:36:17 -0500 Subject: [PATCH 1/8] [lizhi] overhaul Lizhi extractor has stopped working. In particular, there are two major changes: - URL format change: no more #/ in URL paths; - The /api/audio/{radio_id}/{audio_id} API now returns 404. This is a rewrite based on the /api/radio_audios API. --- src/you_get/extractors/lizhi.py | 74 ++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/src/you_get/extractors/lizhi.py b/src/you_get/extractors/lizhi.py index 56dbf756..65988a9f 100644 --- a/src/you_get/extractors/lizhi.py +++ b/src/you_get/extractors/lizhi.py @@ -4,37 +4,55 @@ __all__ = ['lizhi_download'] import json from ..common import * -def lizhi_download_playlist(url, output_dir = '.', merge = True, info_only = False, **kwargs): - # like this http://www.lizhi.fm/#/31365/ - #api desc: s->start l->length band->some radio - #http://www.lizhi.fm/api/radio_audios?s=0&l=100&band=31365 - band_id = match1(url,r'#/(\d+)') - #try to get a considerable large l to reduce html parsing task. - api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band='+band_id - content_json = json.loads(get_content(api_url)) - for sound in content_json: - title = sound["name"] - res_url = sound["url"] - songtype, ext, size = url_info(res_url,faker=True) - print_info(site_info, title, songtype, size) - if not info_only: - #no referer no speed! - download_urls([res_url], title, ext, size, output_dir, merge=merge ,refer = 'http://www.lizhi.fm',faker=True) - pass +# radio_id: e.g. 549759 from http://www.lizhi.fm/549759/ +# +# Returns a list of tuples (audio_id, title, url) for each episode +# (audio) in the radio playlist. url is the direct link to the audio +# file. 
+def lizhi_extract_playlist_info(radio_id): + # /api/radio_audios API parameters: + # + # - s: starting episode + # - l: count (per page) + # - band: radio_id + # + # We use l=65535 for poor man's pagination (that is, no pagination + # at all -- hope all fits on a single page). + # + # TODO: Use /api/radio?band={radio_id} to get number of episodes + # (au_cnt), then handle pagination properly. + api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band=%s' % radio_id + api_response = json.loads(get_content(api_url)) + return [(ep['id'], ep['name'], ep['url']) for ep in api_response] -def lizhi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): - # url like http://www.lizhi.fm/#/549759/18864883431656710 - api_id = match1(url,r'#/(\d+/\d+)') - api_url = 'http://www.lizhi.fm/api/audio/'+api_id - content_json = json.loads(get_content(api_url)) - title = content_json["audio"]["name"] - res_url = content_json["audio"]["url"] - songtype, ext, size = url_info(res_url,faker=True) - print_info(site_info, title, songtype, size) +def lizhi_download_audio(audio_id, title, url, output_dir='.', info_only=False): + filetype, ext, size = url_info(url) + print_info(site_info, title, filetype, size) if not info_only: - #no referer no speed! 
- download_urls([res_url], title, ext, size, output_dir, merge=merge ,refer = 'http://www.lizhi.fm',faker=True) + download_urls([url], title, ext, size, output_dir=output_dir) +def lizhi_download_playlist(url, output_dir='.', info_only=False, **kwargs): + # Sample URL: http://www.lizhi.fm/549759/ + radio_id = match1(url,r'/(\d+)') + if not radio_id: + raise NotImplementedError('%s not supported' % url) + for audio_id, title, url in lizhi_extract_playlist_info(radio_id): + lizhi_download_audio(audio_id, title, url, output_dir=output_dir, info_only=info_only) + +def lizhi_download(url, output_dir='.', info_only=False, **kwargs): + # Sample URL: http://www.lizhi.fm/549759/18864883431656710/ + m = re.search(r'/(?P<radio_id>\d+)/(?P<audio_id>\d+)', url) + if not m: + raise NotImplementedError('%s not supported' % url) + radio_id = m.group('radio_id') + audio_id = m.group('audio_id') + # Look for the audio_id among the full list of episodes + for aid, title, url in lizhi_extract_playlist_info(radio_id): + if aid == audio_id: + lizhi_download_audio(audio_id, title, url, output_dir=output_dir, info_only=info_only) + break + else: + raise NotImplementedError('Audio #%s not found in playlist #%s' % (audio_id, radio_id)) site_info = "lizhi.fm" download = lizhi_download From a6d3c13684cff5811e3c1c6bac93698355cc3a43 Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Mon, 5 Dec 2016 23:45:28 -0500 Subject: [PATCH 2/8] [embed] add support for bilibili's embedded player Sample embed: for http://www.bilibili.com/video/av5079467/: --- src/you_get/extractors/embed.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py index fc4015c4..3bdb924c 100644 --- a/src/you_get/extractors/embed.py +++ b/src/you_get/extractors/embed.py @@ -2,6 +2,7 @@ __all__ = ['embed_download'] from ..common import * +from .bilibili import bilibili_download from .iqiyi import iqiyi_download_by_vid from .le import letvcloud_download_by_vu from .netease 
import netease_download @@ -42,6 +43,11 @@ netease_embed_patterns = [ '(http://\w+\.163\.com/movie/[^\'"]+)' ] vimeo_embed_patters = [ 'player\.vimeo\.com/video/(\d+)' ] +""" +check the share button on http://www.bilibili.com/video/av5079467/ +""" +bilibili_embed_patterns = [ 'static\.hdslb\.com/miniloader\.swf.*aid=(\d+)' ] + def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwargs): content = get_content(url, headers=fake_headers) @@ -78,6 +84,12 @@ def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwa found = True vimeo_download_by_id(url, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + aids = matchall(content, bilibili_embed_patterns) + for aid in aids: + found = True + url = 'http://www.bilibili.com/video/av%s/' % aid + bilibili_download(url, output_dir=output_dir, merge=merge, info_only=info_only) + if not found: raise NotImplementedError(url) From 9905620b5297483e5e10195aad90a14be1d360fd Mon Sep 17 00:00:00 2001 From: Valdemar Erk Date: Fri, 16 Dec 2016 09:36:29 +0100 Subject: [PATCH 3/8] Fix for magisto --- src/you_get/extractors/magisto.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/you_get/extractors/magisto.py b/src/you_get/extractors/magisto.py index 2a53be02..b2e8e502 100644 --- a/src/you_get/extractors/magisto.py +++ b/src/you_get/extractors/magisto.py @@ -3,15 +3,19 @@ __all__ = ['magisto_download'] from ..common import * +import json def magisto_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) - - title1 = r1(r' Date: Sat, 24 Dec 2016 15:49:47 +0100 Subject: [PATCH 4/8] [test] remove mixcloud --- tests/test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 0fa2979a..020455b0 100644 --- a/tests/test.py +++ b/tests/test.py @@ -18,9 +18,6 @@ class YouGetTests(unittest.TestCase): def test_magisto(self): 
magisto.download("http://www.magisto.com/album/video/f3x9AAQORAkfDnIFDA", info_only=True) - def test_mixcloud(self): - mixcloud.download("http://www.mixcloud.com/DJVadim/north-america-are-you-ready/", info_only=True) - def test_youtube(self): youtube.download("http://www.youtube.com/watch?v=pzKerr0JIPA", info_only=True) youtube.download("http://youtu.be/pzKerr0JIPA", info_only=True) From b493af9a69878544ddc6a1fdb71ca61b48bd57ab Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Thu, 15 Dec 2016 23:37:35 -0500 Subject: [PATCH 5/8] [ffmpeg] fix concat list when output dir is not pwd Relative paths in the concat list are considered relative to the parent directory of the script, not the calling directory. This isn't entirely obvious from the documentation, but it is easy to infer from the concat demuxer's concept of "safety", and easy to test (confirmed on FFmpeg 3.2.2). See https://ffmpeg.org/ffmpeg-all.html#concat-1 for details. This commit fixes the wrong relative paths when --output-dir is specified and not pwd. This commit also - Factors out common concat list writer code; - Slightly simplifies the code to collect FFmpeg params (on Py35+ we can further simplify by unpacking LOGLEVEL with the star operator right in the list literal). --- src/you_get/processor/ffmpeg.py | 56 ++++++++++++++------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py index a8599e52..433aff3f 100644 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -26,6 +26,18 @@ LOGLEVEL = ['-loglevel', 'quiet'] def has_ffmpeg_installed(): return FFMPEG is not None +# Given a list of segments and the output path, generates the concat +# list and returns the path to the concat list. 
+def generate_concat_list(files, output): + concat_list_path = output + '.txt' + concat_list_dir = os.path.dirname(concat_list_path) + with open(concat_list_path, 'w', encoding='utf-8') as concat_list: + for file in files: + if os.path.isfile(file): + relpath = os.path.relpath(file, start=concat_list_dir) + concat_list.write('file %s\n' % parameterize(relpath)) + return concat_list_path + def ffmpeg_concat_av(files, output, ext): print('Merging video parts... ', end="", flush=True) params = [FFMPEG] + LOGLEVEL @@ -52,17 +64,9 @@ def ffmpeg_convert_ts_to_mkv(files, output='output.mkv'): def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): - concat_list = open(output + '.txt', 'w', encoding="utf-8") - for file in files: - if os.path.isfile(file): - concat_list.write("file %s\n" % parameterize(file)) - concat_list.close() - - params = [FFMPEG] + LOGLEVEL - params.extend(['-f', 'concat', '-safe', '-1', '-y', '-i']) - params.append(output + '.txt') - params += ['-c', 'copy', output] - + concat_list = generate_concat_list(files, output) + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + '-i', concat_list, '-c', 'copy', output] if subprocess.call(params) == 0: os.remove(output + '.txt') return True @@ -115,18 +119,10 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): print('Merging video parts... 
', end="", flush=True) # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): - concat_list = open(output + '.txt', 'w', encoding="utf-8") - for file in files: - if os.path.isfile(file): - # for escaping rules, see: - # https://www.ffmpeg.org/ffmpeg-utils.html#Quoting-and-escaping - concat_list.write("file %s\n" % parameterize(file)) - concat_list.close() - - params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i'] - params.append(output + '.txt') - params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output] - + concat_list = generate_concat_list(files, output) + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + '-i', concat_list, '-c', 'copy', + '-bsf:a', 'aac_adtstoasc', output] subprocess.check_call(params) os.remove(output + '.txt') return True @@ -162,16 +158,10 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): print('Merging video parts... ', end="", flush=True) # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): - concat_list = open(output + '.txt', 'w', encoding="utf-8") - for file in files: - if os.path.isfile(file): - concat_list.write("file %s\n" % parameterize(file)) - concat_list.close() - - params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i'] - params.append(output + '.txt') - params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output] - + concat_list = generate_concat_list(files, output) + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + '-i', concat_list, '-c', 'copy', + '-bsf:a', 'aac_adtstoasc', output] subprocess.check_call(params) os.remove(output + '.txt') return True From f7b6f6b40f97813206252f9c41dbe05bda592918 Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Sun, 25 Dec 2016 13:48:00 -0500 Subject: [PATCH 6/8] ffmpeg: set loglevel to info in debug mode Occasionally, 
the FFmpeg invocation fails (which could be due to bugs in you-get; see #1558 for instance), but -loglevel quiet means nothing is printed other than the exit status (pretty much always 1) in Python's traceback, which is not helpful at all. This commit restores FFmpeg's regular output (-loglevel info) when --debug is specified. We're not using verbose, debug or trace because those levels are mostly only useful for debugging FFmpeg itself, which is not our goal. Due to lack of meaningful API to access the global logging level, this is a hack based on two assumptions: 1. When --debug is enabled, the root logger level is set to DEBUG; 2. processor.ffmpeg is lazily imported, after command line options are parsed. --- src/you_get/processor/ffmpeg.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) mode change 100644 => 100755 src/you_get/processor/ffmpeg.py diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py old mode 100644 new mode 100755 index a8599e52..f5b3cd38 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +import logging import os.path import subprocess from ..util.strings import parameterize @@ -21,7 +22,10 @@ def get_usable_ffmpeg(cmd): return None FFMPEG, FFMPEG_VERSION = get_usable_ffmpeg('ffmpeg') or get_usable_ffmpeg('avconv') or (None, None) -LOGLEVEL = ['-loglevel', 'quiet'] +if logging.getLogger().isEnabledFor(logging.DEBUG): + LOGLEVEL = ['-loglevel', 'info'] +else: + LOGLEVEL = ['-loglevel', 'quiet'] def has_ffmpeg_installed(): return FFMPEG is not None From 927a1cb91f854cb5260f67b15d9811f763955407 Mon Sep 17 00:00:00 2001 From: liujianshan Date: Thu, 29 Dec 2016 19:47:53 +0800 Subject: [PATCH 7/8] Fix soku.com vid download error problem --- src/you_get/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py index 594b908e..332440dd 100644 --- a/src/you_get/extractor.py +++ 
b/src/you_get/extractor.py @@ -206,7 +206,7 @@ class VideoExtractor(): output_dir=kwargs['output_dir'], merge=kwargs['merge'], av=stream_id in self.dash_streams) - if not kwargs['caption']: + if 'caption' not in kwargs or not kwargs['caption']: print('Skipping captions.') return for lang in self.caption_tracks: From 76399e8561c421ead7a590ef857a98eccb16af61 Mon Sep 17 00:00:00 2001 From: ChenYuan Date: Sun, 1 Jan 2017 00:44:56 +0800 Subject: [PATCH 8/8] fix bilibili bangumi modify the regex to get episode id --- src/you_get/extractors/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 122dea0b..aecb072c 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -127,7 +127,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs if re.match(r'https?://bangumi\.bilibili\.com/', url): # quick hack for bangumi URLs - episode_id = r1(r'data-current-episode-id="(\d+)"', html) + episode_id = r1(r'first_ep_id = "(\d+)"', html) cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data={'episode_id': episode_id}) cid = json.loads(cont)['result']['cid']