diff --git a/.travis.yml b/.travis.yml
index 9df327b0..c11cbe34 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,11 +6,13 @@ python:
   - "3.4"
   - "3.5"
   - "3.6"
+  - "3.7-dev"
   - "nightly"
   - "pypy3"
-before_install: pip install flake8
+before_install:
+  - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then pip install flake8; fi
 before_script:
-  - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
+  - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
 script: make test
 sudo: false
 notifications:
diff --git a/README.md b/README.md
index f6f8efdc..f524c60d 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,14 @@ You can install `you-get` easily via:
 $ brew install you-get
 ```
 
+### Option 8: pkg (FreeBSD only)
+
+You can install `you-get` easily via:
+
+```
+# pkg install you-get
+```
+
 ### Shell completion
 
 Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](https://github.com/soimort/you-get/tree/develop/contrib/completion). Please consult your shell's manual for how to take advantage of them.
@@ -416,7 +424,9 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
 | 西瓜视频 | <https://www.ixigua.com/> |✓| | |
 | 快手 | <https://www.kuaishou.com/> |✓|✓| |
 | 抖音 | <https://www.douyin.com/> |✓| | |
+| TikTok | <https://www.tiktok.com/> |✓| | |
 | 中国体育(TV) | <http://v.zhibo.tv/>
|✓| | |
+| 知乎 | <https://www.zhihu.com/> |✓| | |
 
 For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.
 
diff --git a/src/you_get/common.py b/src/you_get/common.py
index b19d602f..78182163 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -102,6 +102,7 @@ SITES = {
     'soundcloud'       : 'soundcloud',
     'ted'              : 'ted',
     'theplatform'      : 'theplatform',
+    'tiktok'           : 'tiktok',
     'tucao'            : 'tucao',
     'tudou'            : 'tudou',
     'tumblr'           : 'tumblr',
@@ -127,6 +128,7 @@ SITES = {
     'youtube'          : 'youtube',
     'zhanqi'           : 'zhanqi',
     'zhibo'            : 'zhibo',
+    'zhihu'            : 'zhihu',
 }
 
 dry_run = False
@@ -429,7 +431,7 @@ def get_content(url, headers={}, decoded=True):
     # Decode the response body
     if decoded:
         charset = match1(
-            response.getheader('Content-Type'), r'charset=([\w-]+)'
+            response.getheader('Content-Type', ''), r'charset=([\w-]+)'
         )
         if charset is not None:
             data = data.decode(charset)
@@ -439,7 +441,7 @@ def get_content(url, headers={}, decoded=True):
     return data
 
 
-def post_content(url, headers={}, post_data={}, decoded=True):
+def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
     """Post the content of a URL via sending a HTTP POST request.
 
     Args:
@@ -450,14 +452,19 @@ def post_content(url, headers={}, post_data={}, decoded=True):
     Returns:
         The content as a string.
     """
-
-    logging.debug('post_content: %s \n post_data: %s' % (url, post_data))
+    if kwargs.get('post_data_raw'):
+        logging.debug('post_content: %s\npost_data_raw: %s' % (url, kwargs['post_data_raw']))
+    else:
+        logging.debug('post_content: %s\npost_data: %s' % (url, post_data))
 
     req = request.Request(url, headers=headers)
     if cookies:
         cookies.add_cookie_header(req)
         req.headers.update(req.unredirected_hdrs)
-    post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
+    if kwargs.get('post_data_raw'):
+        post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
+    else:
+        post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
     response = urlopen_with_retry(req, data=post_data_enc)
     data = response.read()
 
@@ -602,7 +609,12 @@ def url_save(
     # the key must be 'Referer' for the hack here
     if refer is not None:
         tmp_headers['Referer'] = refer
-    file_size = url_size(url, faker=faker, headers=tmp_headers)
+    if type(url) is list:
+        file_size = urls_size(url, faker=faker, headers=tmp_headers)
+        is_chunked, urls = True, url
+    else:
+        file_size = url_size(url, faker=faker, headers=tmp_headers)
+        is_chunked, urls = False, [url]
 
     continue_renameing = True
     while continue_renameing:
@@ -612,7 +624,7 @@ def url_save(
             if not is_part:
                 if bar:
                     bar.done()
-                print(
+                log.w(
                     'Skipping {}: file already exists'.format(
                         tr(os.path.basename(filepath))
                     )
@@ -638,7 +650,10 @@ def url_save(
                         print('Changing name to %s' % tr(os.path.basename(filepath)), '...')
                         continue_renameing = True
                         continue
-                    print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
+                    if log.yes_or_no('File with this name already exists. Overwrite?'):
+                        log.w('Overwriting %s ...' % tr(os.path.basename(filepath)))
+                    else:
+                        return
         elif not os.path.exists(os.path.dirname(filepath)):
             os.mkdir(os.path.dirname(filepath))
 
@@ -655,70 +670,78 @@ def url_save(
     else:
         open_mode = 'wb'
 
-    if received < file_size:
-        if faker:
-            tmp_headers = fake_headers
-        '''
-        if parameter headers passed in, we have it copied as tmp_header
-        elif headers:
-            headers = headers
-        else:
-            headers = {}
-        '''
-        if received:
-            tmp_headers['Range'] = 'bytes=' + str(received) + '-'
-        if refer:
-            tmp_headers['Referer'] = refer
+    for url in urls:
+        received_chunk = 0
+        if received < file_size:
+            if faker:
+                tmp_headers = fake_headers
+            '''
+            if parameter headers passed in, we have it copied as tmp_header
+            elif headers:
+                headers = headers
+            else:
+                headers = {}
+            '''
+            if received and not is_chunked:  # only request a range when not chunked
+                tmp_headers['Range'] = 'bytes=' + str(received) + '-'
+            if refer:
+                tmp_headers['Referer'] = refer
 
-        if timeout:
-            response = urlopen_with_retry(
-                request.Request(url, headers=tmp_headers), timeout=timeout
-            )
-        else:
-            response = urlopen_with_retry(
-                request.Request(url, headers=tmp_headers)
-            )
-        try:
-            range_start = int(
-                response.headers[
-                    'content-range'
-                ][6:].split('/')[0].split('-')[0]
-            )
-            end_length = int(
-                response.headers['content-range'][6:].split('/')[1]
-            )
-            range_length = end_length - range_start
-        except:
-            content_length = response.headers['content-length']
-            range_length = int(content_length) if content_length is not None \
-                else float('inf')
+            if timeout:
+                response = urlopen_with_retry(
+                    request.Request(url, headers=tmp_headers), timeout=timeout
+                )
+            else:
+                response = urlopen_with_retry(
+                    request.Request(url, headers=tmp_headers)
+                )
+            try:
+                range_start = int(
+                    response.headers[
+                        'content-range'
+                    ][6:].split('/')[0].split('-')[0]
+                )
+                end_length = int(
+                    response.headers['content-range'][6:].split('/')[1]
+                )
+                range_length = end_length - range_start
+            except:
+                content_length = response.headers['content-length']
+                range_length = int(content_length) if content_length is not None \
+                    else float('inf')
 
-        if file_size != received + range_length:
-            received = 0
-            if bar:
-                bar.received = 0
-            open_mode = 'wb'
-
-        with open(temp_filepath, open_mode) as output:
-            while True:
-                buffer = None
-                try:
-                    buffer = response.read(1024 * 256)
-                except socket.timeout:
-                    pass
-                if not buffer:
-                    if received == file_size:  # Download finished
-                        break
-                    # Unexpected termination. Retry request
-                    tmp_headers['Range'] = 'bytes=' + str(received) + '-'
-                    response = urlopen_with_retry(
-                        request.Request(url, headers=tmp_headers)
-                    )
-                    continue
-                output.write(buffer)
-                received += len(buffer)
+            if is_chunked:  # always append if chunked
+                open_mode = 'ab'
+            elif file_size != received + range_length:  # is it ever necessary?
+                received = 0
                 if bar:
-                    bar.update_received(len(buffer))
+                    bar.received = 0
+                open_mode = 'wb'
+
+            with open(temp_filepath, open_mode) as output:
+                while True:
+                    buffer = None
+                    try:
+                        buffer = response.read(1024 * 256)
+                    except socket.timeout:
+                        pass
+                    if not buffer:
+                        if is_chunked and received_chunk == range_length:
+                            break
+                        elif not is_chunked and received == file_size:  # Download finished
+                            break
+                        # Unexpected termination. Retry request
+                        if not is_chunked:  # when
+                            tmp_headers['Range'] = 'bytes=' + str(received) + '-'
+                        response = urlopen_with_retry(
+                            request.Request(url, headers=tmp_headers)
+                        )
+                        continue
+                    output.write(buffer)
+                    received += len(buffer)
+                    received_chunk += len(buffer)
+                    if bar:
+                        bar.update_received(len(buffer))
 
     assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (
         received, os.path.getsize(temp_filepath), temp_filepath
@@ -907,7 +930,7 @@ def download_urls(
     if total_size:
         if not force and os.path.exists(output_filepath) and not auto_rename\
                 and os.path.getsize(output_filepath) >= total_size * 0.9:
-            print('Skipping %s: file already exists' % output_filepath)
+            log.w('Skipping %s: file already exists' % output_filepath)
             print()
             return
         bar = SimpleProgressBar(total_size, len(urls))
@@ -1554,9 +1577,9 @@ def google_search(url):
     url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
     page = get_content(url, headers=fake_headers)
     videos = re.findall(
-        r'([^<]+)<', page
+        r'<a href="(https?://[^"]+)" onmousedown="[^"]+"><h3 class="[^"]*">([^<]+)<', page
     )
-    vdurs = re.findall(r'([^<]+)<', page)
+    vdurs = re.findall(r'<span class="vdur[^"]*">([^<]+)<', page)
     durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
     print('Google Videos search:')
     for v in zip(videos, durs):
diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py
index 4c9ccaa5..8aeed560 100644
--- a/src/you_get/extractor.py
+++ b/src/you_get/extractor.py
@@ -211,7 +211,7 @@ class VideoExtractor():
             ext = self.dash_streams[stream_id]['container']
             total_size = self.dash_streams[stream_id]['size']
 
-        if ext == 'm3u8':
+        if ext == 'm3u8' or ext == 'm4a':
             ext = 'mp4'
 
         if not urls:
diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py
index 649a911f..d2c4c7b7 100755
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@@ -67,6 +67,7 @@ from .sohu import *
 from .soundcloud import *
 from .suntv import *
 from .theplatform import *
+from .tiktok import *
 from .tucao import *
 from .tudou import *
 from .tumblr import *
@@ -88,4 +89,5 @@ from .ted import *
 from .khan import *
 from .zhanqi import *
 from .kuaishou import *
-from .zhibo import *
\ No newline at end of file
+from .zhibo import *
+from .zhihu import *
diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py
index 4b45c5e9..200a3f54 100644
--- a/src/you_get/extractors/acfun.py
+++ b/src/you_get/extractors/acfun.py
@@ -85,9 +85,13 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
             _, _, seg_size = url_info(url)
             size += seg_size
         #fallback to flvhd is not quite possible
-        print_info(site_info, title, 'mp4', size)
+        if re.search(r'fid=[0-9A-Z\-]*.flv', preferred[0][0]):
+            ext = 'flv'
+        else:
+            ext = 'mp4'
+        print_info(site_info, title, ext, size)
         if not info_only:
-            download_urls(preferred[0], title, 'mp4', size, output_dir=output_dir, merge=merge)
+            download_urls(preferred[0], title, ext, size, output_dir=output_dir, merge=merge)
     else:
         raise NotImplementedError(sourceType)
@@ -105,27 +109,42 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
         pass
 
 def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url)
-    html = get_content(url)
+    assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url)
 
-    title = r1(r'data-title="([^"]+)"', html)
+    if re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url):
+        html = get_content(url)
+        title = r1(r'data-title="([^"]+)"', html)
+        if match1(url, r'_(\d+)$'): # current P
+            title = title + " " + r1(r'active">([^<]*)', html)
+        vid = r1('data-vid="(\d+)"', html)
+        up = r1('data-name="([^"]+)"', html)
+    # bangumi
+    elif re.match("http://[^\.]*\.*acfun\.[^\.]+/bangumi/ab(\d+)", url):
+        html = get_content(url)
+        title = match1(html, r'"newTitle"\s*:\s*"([^"]+)"')
+        if match1(url, r'_(\d+)$'): # current P
+            title = title + " " + r1(r'active">([^<]*)', html)
+        vid = match1(html, r'videoId="(\d+)"')
+        up = "acfun"
+    else:
+        raise NotImplemented
+
+    assert title and vid
     title = unescape_html(title)
     title = escape_file_path(title)
-    assert title
-    if match1(url, r'_(\d+)$'): # current P
-        title = title + " " + r1(r'active">([^<]*)', html)
-
-    vid = r1('data-vid="(\d+)"', html)
-    up = r1('data-name="([^"]+)"', html)
     p_title = r1('active">([^<]+)', html)
     title = '%s (%s)' % (title, up)
-    if p_title: title = '%s - %s' % (title, p_title)
+    if p_title:
+        title = '%s - %s' % (title, p_title)
+
+    acfun_download_by_vid(vid, title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
+
 site_info = "AcFun.tv"
 download = acfun_download
 download_playlist = playlist_not_supported('acfun')
diff --git a/src/you_get/extractors/baidu.py b/src/you_get/extractors/baidu.py
index b30c9d86..a8cb3d5d 100644
--- a/src/you_get/extractors/baidu.py
+++ b/src/you_get/extractors/baidu.py
@@ -129,8 +129,9 @@ def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=
         html = get_html(url)
         title = r1(r'title:"([^"]+)"', html)
 
-        vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+)"', html) or re.findall(r'vhsrc="([^"]+)"', html)
-        if vhsrc is not None:
+        vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+\.mp4)"', html) or \
+            re.findall(r'vhsrc="([^"]+)"', html)
+        if len(vhsrc) > 0:
             ext = 'mp4'
             size = url_size(vhsrc[0])
             print_info(site_info, title, ext, size)
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index 96fc60c8..5ed7f28d 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -22,7 +22,7 @@ from .youku import youku_download_by_vid
 
 class Bilibili(VideoExtractor):
     name = 'Bilibili'
-    live_api = 'http://live.bilibili.com/api/playurl?cid={}&otype=json'
+    live_api = 'https://api.live.bilibili.com/room/v1/Room/playUrl?cid={}&quality=0&platform=web'
     api_url = 'http://interface.bilibili.com/v2/playurl?'
     bangumi_api_url = 'http://bangumi.bilibili.com/player/web_api/playurl?'
     live_room_init_api_url = 'https://api.live.bilibili.com/room/v1/Room/room_init?id={}'
@@ -115,7 +115,7 @@ class Bilibili(VideoExtractor):
             self.url = 'http://www.bilibili.com/video/av{}/'.format(aid)
 
         self.ua = fake_headers['User-Agent']
-        self.url = url_locations([self.url])[0]
+        self.url = url_locations([self.url], faker=True)[0]
         frag = urllib.parse.urlparse(self.url).fragment
         # http://www.bilibili.com/video/av3141144/index_2.html#page=3
         if frag:
@@ -125,30 +125,31 @@ class Bilibili(VideoExtractor):
                 aid = re.search(r'av(\d+)', self.url).group(1)
                 self.url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, page)
         self.referer = self.url
-        self.page = get_content(self.url)
+        self.page = get_content(self.url, headers=fake_headers)
 
         m = re.search(r'<h1.*?>(.*?)</h1>', self.page) or re.search(r'<h1 title="([^"]+)">', self.page)
         if m is not None:
            self.title = m.group(1)
-            s = re.search(r'([^<]+)', m.group(1))
+            s = re.search(r'([^<]+)', m.group(1))
            if s:
                self.title = unescape_html(s.group(1))
         if self.title is None:
             m = re.search(r'property="og:title" content="([^"]+)"', self.page)
             if m is not None:
                 self.title = m.group(1)
-
         if 'subtitle' in kwargs:
             subtitle = kwargs['subtitle']
             self.title = '{} {}'.format(self.title, subtitle)
         else:
             playinfo = re.search(r'__INITIAL_STATE__=(.*?);\(function\(\)', self.page)
             if playinfo is not None:
-                pages = json.loads(playinfo.group(1))['videoData']['pages']
-                if len(pages) > 1:
-                    qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query))
-                    page = pages[int(qs.get('p', 1)) - 1]
-                    self.title = '{} #{}. {}'.format(self.title, page['page'], page['part'])
+                jsonPlayinfo = json.loads(playinfo.group(1))
+                if 'videoData' in jsonPlayinfo:
+                    pages = jsonPlayinfo['videoData']['pages']
+                    if len(pages) > 1:
+                        qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query))
+                        page = pages[int(qs.get('p', 1)) - 1]
+                        self.title = '{} #{}. {}'.format(self.title, page['page'], page['part'])
 
         if 'bangumi.bilibili.com/movie' in self.url:
             self.movie_entry(**kwargs)
@@ -160,6 +161,8 @@ class Bilibili(VideoExtractor):
             self.live_entry(**kwargs)
         elif 'vc.bilibili.com' in self.url:
             self.vc_entry(**kwargs)
+        elif 'audio/au' in self.url:
+            self.audio_entry(**kwargs)
         else:
             self.entry(**kwargs)
@@ -171,6 +174,30 @@ class Bilibili(VideoExtractor):
         self.title = page_list[0]['pagename']
         self.download_by_vid(page_list[0]['cid'], True, bangumi_movie=True, **kwargs)
 
+    def audio_entry(self, **kwargs):
+        assert re.match(r'https?://www.bilibili.com/audio/au\d+', self.url)
+        patt = r"(\d+)"
+        audio_id = re.search(patt, self.url).group(1)
+        audio_info_url = \
+            'https://www.bilibili.com/audio/music-service-c/web/song/info?sid={}'.format(audio_id)
+        audio_info_response = json.loads(get_content(audio_info_url))
+        if audio_info_response['msg'] != 'success':
+            log.wtf('fetch audio information failed!')
+            sys.exit(2)
+        self.title = audio_info_response['data']['title']
+        # TODO:there is no quality option for now
+        audio_download_url = \
+            'https://www.bilibili.com/audio/music-service-c/web/url?sid={}&privilege=2&quality=2'.format(audio_id)
+        audio_download_response = json.loads(get_content(audio_download_url))
+        if audio_download_response['msg'] != 'success':
+            log.wtf('fetch audio resource failed!')
+            sys.exit(2)
+        self.streams['mp4'] = {}
+        self.streams['mp4']['src'] = [audio_download_response['data']['cdns'][0]]
+        self.streams['mp4']['container'] = 'm4a'
+        self.streams['mp4']['size'] = audio_download_response['data']['size']
+
     def entry(self, **kwargs):
         # tencent player
         tc_flashvars = re.search(r'"bili-cid=\d+&bili-aid=\d+&vid=([^"]+)"', self.page)
@@ -190,7 +217,12 @@ class Bilibili(VideoExtractor):
                 index_id = int(re.search(r'index_(\d+)', self.url).group(1))
                 cid = page_list[index_id-1]['cid'] # change cid match rule
             except:
-                cid = re.search(r'"cid":(\d+)', self.page).group(1)
+                page = re.search(r'p=(\d+)', self.url)
+                if page is None:
+                    p = 1
+                else:
+                    p = int(page.group(1))
+                cid = re.search(r'"cid":(\d+),"page":%s' % p, self.page).group(1)
             if cid is not None:
                 self.download_by_vid(cid, re.search('bangumi', self.url) is not None, **kwargs)
             else:
@@ -226,7 +258,7 @@ class Bilibili(VideoExtractor):
 
         api_url = self.live_api.format(self.room_id)
         json_data = json.loads(get_content(api_url))
-        urls = [json_data['durl'][0]['url']]
+        urls = [json_data['data']['durl'][0]['url']]
 
         self.streams['live'] = {}
         self.streams['live']['src'] = urls
@@ -252,28 +284,9 @@ class Bilibili(VideoExtractor):
             self.streams['vc']['size'] = int(item['video_size'])
 
     def bangumi_entry(self, **kwargs):
-        bangumi_id = re.search(r'(\d+)', self.url).group(1)
-        frag = urllib.parse.urlparse(self.url).fragment
-        if frag:
-            episode_id = frag
-        else:
-            episode_id = re.search(r'first_ep_id\s*=\s*"(\d+)"', self.page) or re.search(r'\/ep(\d+)', self.url).group(1)
-        # cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data=dict(episode_id=episode_id))
-        # cid = json.loads(cont)['result']['cid']
-        cont = get_content('http://bangumi.bilibili.com/web_api/episode/{}.json'.format(episode_id))
-        ep_info = json.loads(cont)['result']['currentEpisode']
-
-        bangumi_data = get_bangumi_info(str(ep_info['seasonId']))
-        bangumi_payment = bangumi_data.get('payment')
-        if bangumi_payment and bangumi_payment['price'] != '0':
-            log.w("It's a paid item")
-        # ep_ids = collect_bangumi_epids(bangumi_data)
-
-        index_title = ep_info['indexTitle']
-        long_title = ep_info['longTitle'].strip()
-        cid = ep_info['danmaku']
-
-        self.title = '{} [{} {}]'.format(self.title, index_title, long_title)
+        data = json.loads(re.search(r'__INITIAL_STATE__=(.+);\(function', self.page).group(1))
+        cid = data['epInfo']['cid']
+        # index_title = data['epInfo']['index_title']
 
         self.download_by_vid(cid, bangumi=True, **kwargs)
 
@@ -376,10 +389,82 @@ def download_video_from_favlist(url, **kwargs):
     else:
         log.wtf("Fail to parse the fav title" + url, "")
 
+def download_music_from_favlist(url, page, **kwargs):
+    m = re.search(r'https?://www.bilibili.com/audio/mycollection/(\d+)', url)
+    if m is not None:
+        sid = m.group(1)
+        json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-coll?"
+ "sid={}&pn={}&ps=100".format(sid, page))) + if json_result['msg'] == 'success': + music_list = json_result['data']['data'] + music_count = len(music_list) + for i in range(music_count): + audio_id = music_list[i]['id'] + audio_title = music_list[i]['title'] + audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id) + print("Start downloading music ", audio_title) + Bilibili().download_by_url(audio_url, **kwargs) + if page < json_result['data']['pageCount']: + page += 1 + download_music_from_favlist(url, page, **kwargs) + else: + log.wtf("Fail to get music list of page " + json_result) + sys.exit(2) + else: + log.wtf("Fail to parse the sid from " + url, "") +def download_video_from_totallist(url, page, **kwargs): + # the url has format: https://space.bilibili.com/64169458/#/video + m = re.search(r'space\.bilibili\.com/(\d+)/.*?video', url) + mid = "" + if m is not None: + mid = m.group(1) + jsonresult = json.loads(get_content("https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=100&tid=0&page={}&keyword=&order=pubdate&jsonp=jsonp".format(mid, page))) + if jsonresult['status']: + videos = jsonresult['data']['vlist'] + videocount = len(videos) + for i in range(videocount): + videoid = videos[i]["aid"] + videotitle = videos[i]["title"] + videourl = "https://www.bilibili.com/video/av{}".format(videoid) + print("Start downloading ", videotitle, " video ", videotitle) + Bilibili().download_by_url(videourl, subtitle=videotitle, **kwargs) + if page < jsonresult['data']['pages']: + page += 1 + download_video_from_totallist(url, page, **kwargs) + else: + log.wtf("Fail to get the files of page " + jsonresult) + sys.exit(2) + + else: + log.wtf("Fail to parse the video title" + url, "") + +def download_music_from_totallist(url, page, **kwargs): + m = re.search(r'https?://www.bilibili.com/audio/am(\d+)\?type=\d', url) + if m is not None: + sid = m.group(1) + json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-menu?" + "sid={}&pn={}&ps=100".format(sid, page))) + if json_result['msg'] == 'success': + music_list = json_result['data']['data'] + music_count = len(music_list) + for i in range(music_count): + audio_id = music_list[i]['id'] + audio_title = music_list[i]['title'] + audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id) + print("Start downloading music ",audio_title) + Bilibili().download_by_url(audio_url, **kwargs) + if page < json_result['data']['pageCount']: + page += 1 + download_music_from_totallist(url, page, **kwargs) + else: + log.wtf("Fail to get music list of page " + json_result) + sys.exit(2) + else: + log.wtf("Fail to parse the sid from " + url, "") def bilibili_download_playlist_by_url(url, **kwargs): - url = url_locations([url])[0] + url = url_locations([url], faker=True)[0] kwargs['playlist'] = True # a bangumi here? possible? 
     if 'live.bilibili' in url:
@@ -396,6 +481,12 @@ def bilibili_download_playlist_by_url(url, **kwargs):
     elif 'favlist' in url:
         # this a fav list folder
         download_video_from_favlist(url, **kwargs)
+    elif re.match(r'https?://space.bilibili.com/\d+/#/video', url):
+        download_video_from_totallist(url, 1, **kwargs)
+    elif re.match(r'https://www.bilibili.com/audio/mycollection/\d+', url):
+        download_music_from_favlist(url, 1, **kwargs)
+    elif re.match(r'https?://www.bilibili.com/audio/am\d+\?type=\d', url):
+        download_music_from_totallist(url, 1, **kwargs)
     else:
         aid = re.search(r'av(\d+)', url).group(1)
         page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid)))
diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py
index 65fc01f5..567e0dd7 100755
--- a/src/you_get/extractors/instagram.py
+++ b/src/you_get/extractors/instagram.py
@@ -29,9 +29,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
                 image_url = edge['node']['display_url']
                 if 'video_url' in edge['node']:
                     image_url = edge['node']['video_url']
-                image_url = image_url.split('?')[0]
-                ext = image_url.split('.')[-1]
+                ext = image_url.split('?')[0].split('.')[-1]
                 size = int(get_head(image_url)['Content-Length'])
+
                 print_info(site_info, title, ext, size)
                 if not info_only:
                     download_urls(urls=[image_url],
@@ -44,9 +44,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
         image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
         if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
             image_url =info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
-        image_url = image_url.split('?')[0]
-        ext = image_url.split('.')[-1]
+        ext = image_url.split('?')[0].split('.')[-1]
         size = int(get_head(image_url)['Content-Length'])
+
         print_info(site_info, title, ext, size)
         if not info_only:
             download_urls(urls=[image_url],
diff --git a/src/you_get/extractors/iwara.py b/src/you_get/extractors/iwara.py
index 50d14fb8..a30159d7 100644
--- a/src/you_get/extractors/iwara.py
+++ b/src/you_get/extractors/iwara.py
@@ -17,20 +17,20 @@ headers = {
 
 def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
     global headers
-    video_hash=match1(url, r'http://\w+.iwara.tv/videos/(\w+)')
-    video_url=match1(url, r'(http://\w+.iwara.tv)/videos/\w+')
-    html = get_content(url,headers=headers)
+    video_hash = match1(url, r'https?://\w+.iwara.tv/videos/(\w+)')
+    video_url = match1(url, r'(https?://\w+.iwara.tv)/videos/\w+')
+    html = get_content(url, headers=headers)
     title = r1(r'<title>(.*)</title>', html)
-    api_url=video_url+'/api/video/'+video_hash
-    content=get_content(api_url,headers=headers)
-    data=json.loads(content)
-    type,ext,size=url_info(data[0]['uri'], headers=headers)
-    down_urls=data[0]['uri']
-    print_info(down_urls,title+data[0]['resolution'],type,size)
+    api_url = video_url + '/api/video/' + video_hash
+    content = get_content(api_url, headers=headers)
+    data = json.loads(content)
+    down_urls = 'https:' + data[0]['uri']
+    type, ext, size = url_info(down_urls, headers=headers)
+    print_info(site_info, title+data[0]['resolution'], type, size)
     if not info_only:
-        download_urls([down_urls], title, ext, size, output_dir, merge = merge,headers=headers)
+        download_urls([down_urls], title, ext, size, output_dir, merge=merge, headers=headers)
 
-site_info = "iwara"
+site_info = "Iwara"
 download = iwara_download
 download_playlist = playlist_not_supported('iwara')
diff --git a/src/you_get/extractors/ixigua.py b/src/you_get/extractors/ixigua.py
index 59133442..3cf07b09 100644
--- a/src/you_get/extractors/ixigua.py
+++ b/src/you_get/extractors/ixigua.py
@@ -1,14 +1,132 @@
 #!/usr/bin/env python
 
-__all__ = ['ixigua_download']
+import base64
 
-from .toutiao import download as toutiao_download
-from .toutiao import download_playlist as toutiao_download_playlist
+import binascii
+
+from ..common import *
+import random
+import ctypes
+from json import loads
+
+__all__ = ['ixigua_download', 'ixigua_download_playlist_by_url']
+
+headers = {
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 "
+                  "Safari/537.36",
+}
+
+
+def int_overflow(val):
+    maxint = 2147483647
+    if not -maxint - 1 <= val <= maxint:
+        val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
+    return val
+
+
+def unsigned_right_shitf(n, i):
+    if n < 0:
+        n = ctypes.c_uint32(n).value
+    if i < 0:
+        return -int_overflow(n << abs(i))
+    return int_overflow(n >> i)
+
+
+def get_video_url_from_video_id(video_id):
+    """Splicing URLs according to video ID to get video details"""
+    # from js
+    data = [""] * 256
+    for index, _ in enumerate(data):
+        t = index
+        for i in range(8):
+            t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1)
+        data[index] = t
+
+    def tmp():
+        rand_num = random.random()
+        path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id,
+                                                                              random_num=str(rand_num)[2:])
+        e = o = r = -1
+        i, a = 0, len(path)
+        while i < a:
+            e = ord(path[i])
+            i += 1
+            if e < 128:
+                r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)]
+            else:
+                if e < 2048:
+                    r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))]
+                    r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
+                else:
+                    if 55296 <= e < 57344:
+                        e = (1023 & e) + 64
+                        i += 1
+                        o = 1023 & t.url(i)
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))]
+                    else:
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
+
+        return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0))
+
+    while 1:
+        url = tmp()
+        if url.split("=")[-1][0] != "-": # 参数s不能为负数
+            return url
 
 
 def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    return toutiao_download(url.replace('ixigua', '365yg'))
+    # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422
+    html = get_html(url, faker=True)
+    video_id = match1(html, r"videoId\s*:\s*'([^']+)'")
+    title = match1(html, r"title: '(\S+)',")
+    if not video_id:
+        log.e("video_id not found, url:{}".format(url))
+        return
+    video_info_url = get_video_url_from_video_id(video_id)
+    video_info = loads(get_content(video_info_url))
+    if video_info.get("code", 1) != 0:
+        log.e("Get video info from {} error: server return code {}".format(video_info_url, video_info.get("code", 1)))
+        return
+    if not video_info.get("data", None):
+        log.e("Get video info from {} error: The server returns JSON value"
+              " without data or data is empty".format(video_info_url))
+        return
+    if not video_info["data"].get("video_list", None):
+        log.e("Get video info from {} error: The server returns JSON value"
+              " without data.video_list or data.video_list is empty".format(video_info_url))
+        return
+    if not video_info["data"]["video_list"].get("video_1", None):
+        log.e("Get video info from {} error: The server returns JSON value"
+              " without data.video_list.video_1 or data.video_list.video_1 is empty".format(video_info_url))
+        return
+    size = int(video_info["data"]["video_list"]["video_1"]["size"])
+    print_info(site_info=site_info, title=title, type="mp4", size=size)  # 该网站只有mp4类型文件
+    if not info_only:
+        video_url = base64.b64decode(video_info["data"]["video_list"]["video_1"]["main_url"].encode("utf-8"))
+        download_urls([video_url.decode("utf-8")], title, "mp4", size, output_dir, merge=merge, headers=headers, **kwargs)
+
+
+def ixigua_download_playlist_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    assert "user" in url, "Only support users to publish video list,Please provide a similar url:" \
+                          "https://www.ixigua.com/c/user/6907091136/"
+
+    user_id = url.split("/")[-2] if url[-1] == "/" else url.split("/")[-1]
+    params = {"max_behot_time": "0", "max_repin_time": "0", "count": "20", "page_type": "0", "user_id": user_id}
+    while 1:
+        url = "https://www.ixigua.com/c/user/article/?" + "&".join(["{}={}".format(k, v) for k, v in params.items()])
+        video_list = loads(get_content(url, headers=headers))
+        params["max_behot_time"] = video_list["next"]["max_behot_time"]
+        for video in video_list["data"]:
+            ixigua_download("https://www.ixigua.com/i{}/".format(video["item_id"]), output_dir, merge, info_only,
+                            **kwargs)
+        if video_list["next"]["max_behot_time"] == 0:
+            break
 
 
 site_info = "ixigua.com"
 download = ixigua_download
-download_playlist = toutiao_download_playlist
+download_playlist = ixigua_download_playlist_by_url
diff --git a/src/you_get/extractors/lizhi.py b/src/you_get/extractors/lizhi.py
index 65988a9f..4991df31 100644
--- a/src/you_get/extractors/lizhi.py
+++ b/src/you_get/extractors/lizhi.py
@@ -2,8 +2,17 @@
 __all__ = ['lizhi_download']
 
 import json
+import datetime
 
 from ..common import *
 
+#
+# Worked well but not perfect.
+# TODO: add option --format={sd|hd}
+#
+def get_url(ep):
+    readable = datetime.datetime.fromtimestamp(int(ep['create_time']) / 1000).strftime('%Y/%m/%d')
+    return 'http://cdn5.lizhi.fm/audio/{}/{}_hd.mp3'.format(readable, ep['id'])
+
 # radio_id: e.g. 549759 from http://www.lizhi.fm/549759/
 #
 # Returns a list of tuples (audio_id, title, url) for each episode
@@ -23,7 +32,7 @@ def lizhi_extract_playlist_info(radio_id):
     # (au_cnt), then handle pagination properly.
     api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band=%s' % radio_id
     api_response = json.loads(get_content(api_url))
-    return [(ep['id'], ep['name'], ep['url']) for ep in api_response]
+    return [(ep['id'], ep['name'], get_url(ep)) for ep in api_response]
 
 def lizhi_download_audio(audio_id, title, url, output_dir='.', info_only=False):
     filetype, ext, size = url_info(url)
diff --git a/src/you_get/extractors/miaopai.py b/src/you_get/extractors/miaopai.py
index f37d45b0..304ac176 100644
--- a/src/you_get/extractors/miaopai.py
+++ b/src/you_get/extractors/miaopai.py
@@ -2,9 +2,12 @@
 __all__ = ['miaopai_download']
 
+import string
+import random
 from ..common import *
 import urllib.error
 import urllib.parse
+from ..util import fs
 
 fake_headers_mobile = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -20,6 +23,10 @@ def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = Fa
     mobile_page = get_content(page_url, headers=fake_headers_mobile)
     url = match1(mobile_page, r'')
diff --git a/src/you_get/util/fs.py b/src/you_get/util/fs.py
--- a/src/you_get/util/fs.py
+++ b/src/you_get/util/fs.py
             ord('>'): '-',
             ord('['): '(',
             ord(']'): ')',
+            ord('\t'): ' ',
         })
     else:  # *nix
diff --git a/src/you_get/util/log.py b/src/you_get/util/log.py
index a2c77ab5..67b26b78 100644
--- a/src/you_get/util/log.py
+++ b/src/you_get/util/log.py
@@ -96,3 +96,9 @@ def wtf(message, exit_code=1):
     print_log(message, RED, BOLD)
     if exit_code is not None:
         sys.exit(exit_code)
+
+def yes_or_no(message):
+    ans = str(input('%s (y/N) ' % message)).lower().strip()
+    if ans == 'y':
+        return True
+    return False
diff --git a/src/you_get/util/os.py b/src/you_get/util/os.py
index 11730e28..1a00d2b5 100644
--- a/src/you_get/util/os.py
+++ b/src/you_get/util/os.py
@@ -19,9 +19,11 @@ def detect_os():
     elif 'linux' in syst:
         os = 'linux'
         # detect WSL https://github.com/Microsoft/BashOnWindows/issues/423
-        with open('/proc/version', 'r') as f:
-            if 'microsoft' in f.read().lower():
-                os = 'wsl'
+        try:
+            with open('/proc/version', 'r') as f:
+                if 'microsoft' in f.read().lower():
+                    os = 'wsl'
+        except: pass
     elif 'windows' in syst:
         os = 'windows'
     elif 'bsd' in syst:
diff --git a/src/you_get/version.py b/src/you_get/version.py
index 76969dc3..e89eb41a 100644
--- a/src/you_get/version.py
+++ b/src/you_get/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python
 
 script_name = 'you-get'
-__version__ = '0.4.1128'
+__version__ = '0.4.1193'
diff --git a/you-get.json b/you-get.json
index 594742c2..56f8212a 100644
--- a/you-get.json
+++ b/you-get.json
@@ -25,6 +25,7 @@
         "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
         "Topic :: Internet",
         "Topic :: Internet :: WWW/HTTP",
         "Topic :: Multimedia",