From c14605f7011ebe40c01c10e3063a4c25c88a4f04 Mon Sep 17 00:00:00 2001 From: MMMartt Date: Tue, 18 Aug 2020 17:05:58 +0800 Subject: [PATCH 001/235] fix acfun download fail --- src/you_get/extractors/acfun.py | 39 ++++++++++++++------------------- tests/test.py | 1 + 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py index b83c2859..9205b1b8 100644 --- a/src/you_get/extractors/acfun.py +++ b/src/you_get/extractors/acfun.py @@ -111,6 +111,18 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs): assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url) + def getM3u8UrlFromCurrentVideoInfo(currentVideoInfo): + if 'playInfos' in currentVideoInfo: + return currentVideoInfo['playInfos'][0]['playUrls'][0] + elif 'ksPlayJson' in currentVideoInfo: + ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] ) + representation = ksPlayJson.get('adaptationSet')[0].get('representation') + reps = [] + for one in representation: + reps.append( (one['width']* one['height'], one['url'], one['backupUrl']) ) + return max(reps)[1] + + if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url): html = get_content(url, headers=fake_headers) json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});") @@ -122,37 +134,18 @@ def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs): if len(video_list) > 1: title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0] currentVideoInfo = json_data.get('currentVideoInfo') - if 'playInfos' in currentVideoInfo: - m3u8_url = currentVideoInfo['playInfos'][0]['playUrls'][0] - elif 'ksPlayJson' in currentVideoInfo: - ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] ) - representation = ksPlayJson.get('adaptationSet').get('representation') - reps = [] - for one in representation: - reps.append( (one['width']* one['height'], one['url'], one['backupUrl']) ) - m3u8_url = max(reps)[1] - + m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo) elif re.match("https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", url): html = get_content(url, headers=fake_headers) - tag_script = match1(html, r'') + tag_script = match1(html, r'') json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1] json_data = json.loads(json_text) title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title'] vid = str(json_data['videoId']) up = "acfun" - play_info = get_content("https://www.acfun.cn/rest/pc-direct/play/playInfo/m3u8Auto?videoId=" + vid, headers=fake_headers) - play_url = json.loads(play_info)['playInfo']['streams'][0]['playUrls'][0] - m3u8_all_qualities_file = get_content(play_url) - m3u8_all_qualities_lines = m3u8_all_qualities_file.split('#EXT-X-STREAM-INF:')[1:] - highest_quality_line = m3u8_all_qualities_lines[0] - for line in m3u8_all_qualities_lines: - bandwith = int(match1(line, r'BANDWIDTH=(\d+)')) - if bandwith > int(match1(highest_quality_line, r'BANDWIDTH=(\d+)')): - highest_quality_line = line - #TODO: 应由用户指定清晰度 - m3u8_url = match1(highest_quality_line, r'\n([^#\n]+)$') - m3u8_url = play_url[:play_url.rfind("/")+1] + m3u8_url + currentVideoInfo = json_data.get('currentVideoInfo') + m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo) else: raise NotImplemented diff --git a/tests/test.py b/tests/test.py index 00bd4cbb..33503b8d 100644 --- a/tests/test.py +++ b/tests/test.py @@ -39,6 +39,7 @@ class YouGetTests(unittest.TestCase): def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) + acfun.download('https://www.acfun.cn/bangumi/aa6002986', info_only=True) def test_bilibil(self): bilibili.download( From 5adb4caa738464a0730cf70fee34ae8191f100b5 Mon Sep 17 00:00:00 2001 From: Shen <960821@gmail.com> Date: Mon, 7 Sep 2020 14:29:00 +0800 Subject: [PATCH 002/235] fix bilibili 4k --- src/you_get/extractors/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 94e5479f..7ea626f8 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -62,7 +62,7 @@ class Bilibili(VideoExtractor): @staticmethod def bilibili_api(avid, cid, qn=0): - return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=16' % (avid, cid, qn) + return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=16&fourk=1' % (avid, cid, qn) @staticmethod def bilibili_audio_api(sid): From 398068e13e8b7052903f89d1caf30023e8639e78 Mon Sep 17 00:00:00 2001 From: jseagull Date: Mon, 14 Sep 2020 15:28:29 +0800 Subject: [PATCH 003/235] fix iqiyi playlist extrator --- src/you_get/extractors/iqiyi.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/you_get/extractors/iqiyi.py b/src/you_get/extractors/iqiyi.py index 3fe93209..2a48791c 100644 --- a/src/you_get/extractors/iqiyi.py +++ b/src/you_get/extractors/iqiyi.py @@ -119,10 +119,10 @@ class Iqiyi(VideoExtractor): self.url = url video_page = get_content(url) - videos = set(re.findall(r' Date: Fri, 18 Sep 2020 14:25:38 -0700 Subject: [PATCH 004/235] Fixed tiktok extraction --- src/you_get/extractors/tiktok.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index c2a0eb8d..4843ced8 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -15,16 +15,16 @@ def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): uniqueId = videoData['authorInfos'].get('uniqueId') nickName = videoData['authorInfos'].get('nickName') - for i, url in enumerate(urls): + for i, videoUrl in enumerate(urls): title = '%s [%s]' % (nickName or uniqueId, videoId) if len(urls) > 1: title = '%s [%s]' % (title, i) - mime, ext, size = url_info(url) + mime, ext, size = url_info(videoUrl, headers={'Referer': url}) print_info(site_info, title, mime, size) if not info_only: - download_urls([url], title, ext, size, output_dir=output_dir, merge=merge) + download_urls([videoUrl], title, ext, size, output_dir=output_dir, merge=merge, headers={'Referer': url}) site_info = "TikTok.com" download = tiktok_download From 4abc4bd7cfa1ec787a59a91480df28e458a12fe0 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 19 Sep 2020 22:57:06 +0200 Subject: [PATCH 005/235] [instagram] fix vid extraction for Reels --- src/you_get/extractors/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index 567e0dd7..86905a77 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -8,7 +8,7 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg url = r1(r'([^?]*)', url) html = get_html(url) - vid = r1(r'instagram.com/p/([^/]+)', url) + vid = r1(r'instagram.com/\w+/([^/]+)', url) description = r1(r' Date: Sun, 20 Sep 2020 17:05:47 +0200 Subject: [PATCH 006/235] [imgur] fix --- src/you_get/extractors/imgur.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/you_get/extractors/imgur.py b/src/you_get/extractors/imgur.py index cc5dc9fd..519fa245 100644 --- a/src/you_get/extractors/imgur.py +++ b/src/you_get/extractors/imgur.py @@ -52,20 +52,16 @@ class Imgur(VideoExtractor): else: # gallery image content = get_content(self.url) - image = json.loads(match1(content, r'image\s*:\s*({.*}),')) - ext = image['ext'] + url = match1(content, r'(https?://i.imgur.com/[^"]+)') + _, container, size = url_info(url) self.streams = { 'original': { - 'src': ['http://i.imgur.com/%s%s' % (image['hash'], ext)], - 'size': image['size'], - 'container': ext[1:] - }, - 'thumbnail': { - 'src': ['http://i.imgur.com/%ss%s' % (image['hash'], '.jpg')], - 'container': 'jpg' + 'src': [url], + 'size': size, + 'container': container } } - self.title = image['title'] or image['hash'] + self.title = r1(r'i\.imgur\.com/([^./]*)', url) def extract(self, **kwargs): if 'stream_id' in kwargs and kwargs['stream_id']: From 00e2ce3f48b0cc3a8bcc8fe07cdc1892783b74e1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 20 Sep 2020 17:14:34 +0200 Subject: [PATCH 007/235] [test] remove case for acfun bangumi that is inaccessible from non-China IP --- tests/test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test.py b/tests/test.py index 33503b8d..00bd4cbb 100644 --- a/tests/test.py +++ b/tests/test.py @@ -39,7 +39,6 @@ class YouGetTests(unittest.TestCase): def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) - acfun.download('https://www.acfun.cn/bangumi/aa6002986', info_only=True) def test_bilibil(self): bilibili.download( From 5c9ec6c4f300dda24fcf0ad6b75a45e3cec46536 Mon Sep 17 00:00:00 2001 From: johnsmith2077 Date: Sun, 4 Oct 2020 05:12:56 +0800 Subject: [PATCH 008/235] add format selection for AcFun --- src/you_get/extractors/acfun.py | 343 ++++++++++++++++++-------------- 1 file changed, 194 insertions(+), 149 deletions(-) diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py index 9205b1b8..cd275927 100644 --- a/src/you_get/extractors/acfun.py +++ b/src/you_get/extractors/acfun.py @@ -1,168 +1,213 @@ #!/usr/bin/env python -__all__ = ['acfun_download'] - from ..common import * +from ..extractor import VideoExtractor -from .le import letvcloud_download_by_vu -from .qq import qq_download_by_vid -from .sina import sina_download_by_vid -from .tudou import tudou_download_by_iid -from .youku import youku_download_by_vid +class AcFun(VideoExtractor): + name = "AcFun" -import json -import re -import base64 -import time + stream_types = [ + {'id': '2160P', 'qualityType': '2160p'}, + {'id': '1080P60', 'qualityType': '1080p60'}, + {'id': '720P60', 'qualityType': '720p60'}, + {'id': '1080P+', 'qualityType': '1080p+'}, + {'id': '1080P', 'qualityType': '1080p'}, + {'id': '720P', 'qualityType': '720p'}, + {'id': '540P', 'qualityType': '540p'}, + {'id': '360P', 'qualityType': '360p'} + ] -def get_srt_json(id): - url = 'http://danmu.aixifan.com/V2/%s' % id - return get_content(url) + def prepare(self, **kwargs): + assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', self.url) -def youku_acfun_proxy(vid, sign, ref): - endpoint = 'http://player.acfun.cn/flash_data?vid={}&ct=85&ev=3&sign={}&time={}' - url = endpoint.format(vid, sign, str(int(time.time() * 1000))) - json_data = json.loads(get_content(url, headers=dict(referer=ref)))['data'] - enc_text = base64.b64decode(json_data) - dec_text = rc4(b'8bdc7e1a', enc_text).decode('utf8') - youku_json = json.loads(dec_text) + if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', self.url): + html = get_content(self.url, headers=fake_headers) + json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});") + json_data = json.loads(json_text) + vid = json_data.get('currentVideoInfo').get('id') + up = json_data.get('user').get('name') + self.title = json_data.get('title') + video_list = json_data.get('videoList') + if len(video_list) > 1: + self.title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0] + currentVideoInfo = json_data.get('currentVideoInfo') + + elif re.match("https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", self.url): + html = get_content(self.url, headers=fake_headers) + tag_script = match1(html, r'') + json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1] + json_data = json.loads(json_text) + self.title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title'] + vid = str(json_data['videoId']) + up = "acfun" + currentVideoInfo = json_data.get('currentVideoInfo') - yk_streams = {} - for stream in youku_json['stream']: - tp = stream['stream_type'] - yk_streams[tp] = [], stream['total_size'] - if stream.get('segs'): - for seg in stream['segs']: - yk_streams[tp][0].append(seg['url']) else: - yk_streams[tp] = stream['m3u8'], stream['total_size'] + raise NotImplemented - return yk_streams - -def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False, **kwargs): - """str, str, str, bool, bool ->None - - Download Acfun video by vid. - - Call Acfun API, decide which site to use, and pass the job to its - extractor. - """ - - #first call the main parasing API - info = json.loads(get_content('http://www.acfun.cn/video/getVideo.aspx?id=' + vid, headers=fake_headers)) - - sourceType = info['sourceType'] - - #decide sourceId to know which extractor to use - if 'sourceId' in info: sourceId = info['sourceId'] - # danmakuId = info['danmakuId'] - - #call extractor decided by sourceId - if sourceType == 'sina': - sina_download_by_vid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only) - elif sourceType == 'youku': - youku_download_by_vid(sourceId, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) - elif sourceType == 'tudou': - tudou_download_by_iid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only) - elif sourceType == 'qq': - qq_download_by_vid(sourceId, title, True, output_dir=output_dir, merge=merge, info_only=info_only) - elif sourceType == 'letv': - letvcloud_download_by_vu(sourceId, '2d8c027396', title, output_dir=output_dir, merge=merge, info_only=info_only) - elif sourceType == 'zhuzhan': - #As in Jul.28.2016, Acfun is using embsig to anti hotlink so we need to pass this -#In Mar. 2017 there is a dedicated ``acfun_proxy'' in youku cloud player -#old code removed - url = 'http://www.acfun.cn/v/ac' + vid - yk_streams = youku_acfun_proxy(info['sourceId'], info['encode'], url) - seq = ['mp4hd3', 'mp4hd2', 'mp4hd', 'flvhd'] - for t in seq: - if yk_streams.get(t): - preferred = yk_streams[t] - break -#total_size in the json could be incorrect(F.I. 0) - size = 0 - for url in preferred[0]: - _, _, seg_size = url_info(url) - size += seg_size -#fallback to flvhd is not quite possible - if re.search(r'fid=[0-9A-Z\-]*.flv', preferred[0][0]): - ext = 'flv' - else: - ext = 'mp4' - print_info(site_info, title, ext, size) - if not info_only: - download_urls(preferred[0], title, ext, size, output_dir=output_dir, merge=merge) - else: - raise NotImplementedError(sourceType) - - if not info_only and not dry_run: - if not kwargs['caption']: - print('Skipping danmaku.') - return - try: - title = get_filename(title) - print('Downloading %s ...\n' % (title + '.cmt.json')) - cmt = get_srt_json(vid) - with open(os.path.join(output_dir, title + '.cmt.json'), 'w', encoding='utf-8') as x: - x.write(cmt) - except: - pass - -def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url) - - def getM3u8UrlFromCurrentVideoInfo(currentVideoInfo): - if 'playInfos' in currentVideoInfo: - return currentVideoInfo['playInfos'][0]['playUrls'][0] - elif 'ksPlayJson' in currentVideoInfo: - ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] ) + if 'ksPlayJson' in currentVideoInfo: + durationMillis = currentVideoInfo['durationMillis'] + ksPlayJson = ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] ) representation = ksPlayJson.get('adaptationSet')[0].get('representation') - reps = [] - for one in representation: - reps.append( (one['width']* one['height'], one['url'], one['backupUrl']) ) - return max(reps)[1] + stream_list = representation + + for stream in stream_list: + m3u8_url = stream["url"] + size = durationMillis * stream["avgBitrate"] / 8 + # size = float('inf') + container = 'mp4' + stream_id = stream["qualityLabel"] + quality = stream["qualityType"] + + stream_data = dict(src=m3u8_url, size=size, container=container, quality=quality) + self.streams[stream_id] = stream_data + + assert self.title and m3u8_url + self.title = unescape_html(self.title) + self.title = escape_file_path(self.title) + p_title = r1('active">([^<]+)', html) + self.title = '%s (%s)' % (self.title, up) + if p_title: + self.title = '%s - %s' % (self.title, p_title) - if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url): - html = get_content(url, headers=fake_headers) - json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});") - json_data = json.loads(json_text) - vid = json_data.get('currentVideoInfo').get('id') - up = json_data.get('user').get('name') - title = json_data.get('title') - video_list = json_data.get('videoList') - if len(video_list) > 1: - title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0] - currentVideoInfo = json_data.get('currentVideoInfo') - m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo) - elif re.match("https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", url): - html = get_content(url, headers=fake_headers) - tag_script = match1(html, r'') - json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1] - json_data = json.loads(json_text) - title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title'] - vid = str(json_data['videoId']) - up = "acfun" + def download(self, **kwargs): + if 'json_output' in kwargs and kwargs['json_output']: + json_output.output(self) + elif 'info_only' in kwargs and kwargs['info_only']: + if 'stream_id' in kwargs and kwargs['stream_id']: + # Display the stream + stream_id = kwargs['stream_id'] + if 'index' not in kwargs: + self.p(stream_id) + else: + self.p_i(stream_id) + else: + # Display all available streams + if 'index' not in kwargs: + self.p([]) + else: + stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag'] + self.p_i(stream_id) - currentVideoInfo = json_data.get('currentVideoInfo') - m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo) + else: + if 'stream_id' in kwargs and kwargs['stream_id']: + # Download the stream + stream_id = kwargs['stream_id'] + else: + stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag'] - else: - raise NotImplemented - - assert title and m3u8_url - title = unescape_html(title) - title = escape_file_path(title) - p_title = r1('active">([^<]+)', html) - title = '%s (%s)' % (title, up) - if p_title: - title = '%s - %s' % (title, p_title) - - print_info(site_info, title, 'm3u8', float('inf')) - if not info_only: - download_url_ffmpeg(m3u8_url, title, 'mp4', output_dir=output_dir, merge=merge) + if 'index' not in kwargs: + self.p(stream_id) + else: + self.p_i(stream_id) + if stream_id in self.streams: + url = self.streams[stream_id]['src'] + ext = self.streams[stream_id]['container'] + total_size = self.streams[stream_id]['size'] + if ext == 'm3u8' or ext == 'm4a': + ext = 'mp4' + + if not url: + log.wtf('[Failed] Cannot extract video source.') + # For legacy main() + headers = {} + if self.ua is not None: + headers['User-Agent'] = self.ua + if self.referer is not None: + headers['Referer'] = self.referer + + download_url_ffmpeg(url, self.title, ext, output_dir=kwargs['output_dir'], merge=kwargs['merge']) + + if 'caption' not in kwargs or not kwargs['caption']: + print('Skipping captions or danmaku.') + return + + for lang in self.caption_tracks: + filename = '%s.%s.srt' % (get_filename(self.title), lang) + print('Saving %s ... ' % filename, end="", flush=True) + srt = self.caption_tracks[lang] + with open(os.path.join(kwargs['output_dir'], filename), + 'w', encoding='utf-8') as x: + x.write(srt) + print('Done.') + + if self.danmaku is not None and not dry_run: + filename = '{}.cmt.xml'.format(get_filename(self.title)) + print('Downloading {} ...\n'.format(filename)) + with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp: + fp.write(self.danmaku) + + if self.lyrics is not None and not dry_run: + filename = '{}.lrc'.format(get_filename(self.title)) + print('Downloading {} ...\n'.format(filename)) + with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp: + fp.write(self.lyrics) + + # For main_dev() + #download_urls(urls, self.title, self.streams[stream_id]['container'], self.streams[stream_id]['size']) + keep_obj = kwargs.get('keep_obj', False) + if not keep_obj: + self.__init__() + + + def acfun_download(self, url, output_dir='.', merge=True, info_only=False, **kwargs): + assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url) + + def getM3u8UrlFromCurrentVideoInfo(currentVideoInfo): + if 'playInfos' in currentVideoInfo: + return currentVideoInfo['playInfos'][0]['playUrls'][0] + elif 'ksPlayJson' in currentVideoInfo: + ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] ) + representation = ksPlayJson.get('adaptationSet')[0].get('representation') + reps = [] + for one in representation: + reps.append( (one['width']* one['height'], one['url'], one['backupUrl']) ) + return max(reps)[1] + + + if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url): + html = get_content(url, headers=fake_headers) + json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});") + json_data = json.loads(json_text) + vid = json_data.get('currentVideoInfo').get('id') + up = json_data.get('user').get('name') + title = json_data.get('title') + video_list = json_data.get('videoList') + if len(video_list) > 1: + title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0] + currentVideoInfo = json_data.get('currentVideoInfo') + m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo) + elif re.match("https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", url): + html = get_content(url, headers=fake_headers) + tag_script = match1(html, r'') + json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1] + json_data = json.loads(json_text) + title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title'] + vid = str(json_data['videoId']) + up = "acfun" + + currentVideoInfo = json_data.get('currentVideoInfo') + m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo) + + else: + raise NotImplemented + + assert title and m3u8_url + title = unescape_html(title) + title = escape_file_path(title) + p_title = r1('active">([^<]+)', html) + title = '%s (%s)' % (title, up) + if p_title: + title = '%s - %s' % (title, p_title) + + print_info(site_info, title, 'm3u8', float('inf')) + if not info_only: + download_url_ffmpeg(m3u8_url, title, 'mp4', output_dir=output_dir, merge=merge) + +site = AcFun() site_info = "AcFun.cn" -download = acfun_download +download = site.download_by_url download_playlist = playlist_not_supported('acfun') From faff58a148f323064fcaef3745a9be95e5066bd0 Mon Sep 17 00:00:00 2001 From: cmsxbc Date: Sun, 11 Oct 2020 23:40:00 +0800 Subject: [PATCH 009/235] fix bilibili space videos --- src/you_get/extractors/bilibili.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 7ea626f8..cdcccf20 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -114,7 +114,7 @@ class Bilibili(VideoExtractor): @staticmethod def bilibili_space_video_api(mid, pn=1, ps=100): - return 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid=%s&page=%s&pagesize=%s&order=0&jsonp=jsonp' % (mid, pn, ps) + return "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%s&ps=%s&tid=0&keyword=&order=pubdate&jsonp=jsonp" % (mid, pn, ps) @staticmethod def bilibili_vc_api(video_id): @@ -734,15 +734,15 @@ class Bilibili(VideoExtractor): api_url = self.bilibili_space_video_api(mid) api_content = get_content(api_url, headers=self.bilibili_headers()) videos_info = json.loads(api_content) - pc = videos_info['data']['pages'] + pc = videos_info['data']['page']['count'] // videos_info['data']['page']['ps'] for pn in range(1, pc + 1): api_url = self.bilibili_space_video_api(mid, pn=pn) api_content = get_content(api_url, headers=self.bilibili_headers()) videos_info = json.loads(api_content) - epn, i = len(videos_info['data']['vlist']), 0 - for video in videos_info['data']['vlist']: + epn, i = len(videos_info['data']['list']['vlist']), 0 + for video in videos_info['data']['list']['vlist']: i += 1; log.w('Extracting %s of %s videos ...' % (i, epn)) url = 'https://www.bilibili.com/video/av%s' % video['aid'] self.__class__().download_playlist_by_url(url, **kwargs) From 205470ec116654608ddd97390bd885ba6df100b1 Mon Sep 17 00:00:00 2001 From: zhouyuan1 Date: Mon, 12 Oct 2020 13:22:17 +0800 Subject: [PATCH 010/235] add support for socks5 proxy using username and password config --- src/you_get/common.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 9c56b5c2..51521407 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -1422,12 +1422,27 @@ def load_cookies(cookiefile): def set_socks_proxy(proxy): try: import socks - socks_proxy_addrs = proxy.split(':') - socks.set_default_proxy( - socks.SOCKS5, - socks_proxy_addrs[0], - int(socks_proxy_addrs[1]) - ) + if '@' in proxy: + proxy_info = proxy.split("@") + socks_proxy_addrs = proxy_info[1].split(':') + socks_proxy_auth = proxy_info[0].split(":") + print(socks_proxy_auth[0]+" "+socks_proxy_auth[1]+" "+socks_proxy_addrs[0]+" "+socks_proxy_addrs[1]) + socks.set_default_proxy( + socks.SOCKS5, + socks_proxy_addrs[0], + int(socks_proxy_addrs[1]), + True, + socks_proxy_auth[0], + socks_proxy_auth[1] + ) + else: + socks_proxy_addrs = proxy.split(':') + print(socks_proxy_addrs[0]+" "+socks_proxy_addrs[1]) + socks.set_default_proxy( + socks.SOCKS5, + socks_proxy_addrs[0], + int(socks_proxy_addrs[1]), + ) socket.socket = socks.socksocket def getaddrinfo(*args): @@ -1565,7 +1580,7 @@ def script_main(download, download_playlist, **kwargs): '--no-proxy', action='store_true', help='Never use a proxy' ) proxy_grp.add_argument( - '-s', '--socks-proxy', metavar='HOST:PORT', + '-s', '--socks-proxy', metavar='HOST:PORT or USERNAME:PASSWORD@HOST:PORT', help='Use an SOCKS5 proxy for downloading' ) From c271363585021c9ee86e9c6e08e35d48d9fbf159 Mon Sep 17 00:00:00 2001 From: Felix Yan Date: Wed, 14 Oct 2020 23:11:09 +0800 Subject: [PATCH 011/235] Correct a typo in iqiyi.py --- src/you_get/extractors/iqiyi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/iqiyi.py b/src/you_get/extractors/iqiyi.py index 2a48791c..d138a49f 100644 --- a/src/you_get/extractors/iqiyi.py +++ b/src/you_get/extractors/iqiyi.py @@ -20,7 +20,7 @@ Changelog: use @fffonion 's method in #617. Add trace AVM(asasm) code in Iqiyi's encode function where the salt is put into the encode array and reassemble by RABCDasm(or WinRABCDasm),then use Fiddler to response modified file to replace the src file with its AutoResponder function ,set browser Fiddler proxy and play with !debug version! Flash Player ,finially get result in flashlog.txt(its location can be easily found in search engine). Code Like (without letters after #comment:),it just do the job : trace("{IQIYI_SALT}:"+salt_array.join("")) - ```(Postion After getTimer) + ```(Position After getTimer) findpropstrict QName(PackageNamespace(""), "trace") pushstring "{IQIYI_SALT}:" #comment for you to locate the salt getscopeobject 1 From 5d59f76a41bca9a88495d2b721dc5618a1a568ba Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 25 Oct 2020 17:02:28 +0100 Subject: [PATCH 012/235] [tiktok] fix extraction --- src/you_get/extractors/tiktok.py | 44 ++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index 4843ced8..2ef05226 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -5,26 +5,42 @@ __all__ = ['tiktok_download'] from ..common import * def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - html = get_html(url, faker=True) + referUrl = url.split('?')[0] + headers = fake_headers + # trick or treat + html = get_content(url, headers=headers) data = r1(r'', html) info = json.loads(data) - videoData = info['props']['pageProps']['videoData'] - urls = videoData['itemInfos']['video']['urls'] - videoId = videoData['itemInfos']['id'] - uniqueId = videoData['authorInfos'].get('uniqueId') - nickName = videoData['authorInfos'].get('nickName') + wid = info['props']['initialProps']['$wid'] + cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) - for i, videoUrl in enumerate(urls): - title = '%s [%s]' % (nickName or uniqueId, videoId) - if len(urls) > 1: - title = '%s [%s]' % (title, i) + # here's the cookie + headers['Cookie'] = cookie - mime, ext, size = url_info(videoUrl, headers={'Referer': url}) + # try again + html = get_content(url, headers=headers) + data = r1(r'', html) + info = json.loads(data) + wid = info['props']['initialProps']['$wid'] + cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) - print_info(site_info, title, mime, size) - if not info_only: - download_urls([videoUrl], title, ext, size, output_dir=output_dir, merge=merge, headers={'Referer': url}) + videoData = info['props']['pageProps']['itemInfo']['itemStruct'] + videoId = videoData['id'] + videoUrl = videoData['video']['downloadAddr'] + uniqueId = videoData['author'].get('uniqueId') + nickName = videoData['author'].get('nickname') + + title = '%s [%s]' % (nickName or uniqueId, videoId) + + # we also need the referer + headers['Referer'] = referUrl + + mime, ext, size = url_info(videoUrl, headers=headers) + + print_info(site_info, title, mime, size) + if not info_only: + download_urls([videoUrl], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) site_info = "TikTok.com" download = tiktok_download From 517b8c090b719702518d4ccf5d4f217b9251adcd Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 27 Oct 2020 17:20:28 +0100 Subject: [PATCH 013/235] [youtube] fix playlist extraction --- src/you_get/extractors/youtube.py | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 4f3a947e..e6388e98 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -157,34 +157,22 @@ class YouTube(VideoExtractor): log.wtf('[Failed] Unsupported URL pattern.') video_page = get_content('https://www.youtube.com/playlist?list=%s' % playlist_id) - from html.parser import HTMLParser - videos = sorted([HTMLParser().unescape(video) - for video in re.findall(r' Date: Tue, 27 Oct 2020 18:19:28 +0100 Subject: [PATCH 014/235] version 0.4.1475 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index e404e0c0..b4696519 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1456' +__version__ = '0.4.1475' From d0225b4f469c896363bdb96bbdd7a70675a9f0d1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 28 Oct 2020 02:12:07 +0100 Subject: [PATCH 015/235] [youtube] fix videos whose page ytplayer_config does not contain assets --- src/you_get/extractors/youtube.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index e6388e98..19a72081 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -204,13 +204,22 @@ class YouTube(VideoExtractor): video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) try: ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) - self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] + # Workaround: get_video_info returns bad s. Why? if 'url_encoded_fmt_stream_map' not in ytplayer_config['args']: stream_list = json.loads(ytplayer_config['args']['player_response'])['streamingData']['formats'] else: stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') #stream_list = ytplayer_config['args']['adaptive_fmts'].split(',') + + if 'assets' in ytplayer_config: + self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] + elif re.search('([^"]*/base\.js)"', video_page): + self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) + self.html5player = self.html5player.replace('\/', '/') # unescape URL + else: + self.html5player = None + except: if 'url_encoded_fmt_stream_map' not in video_info: stream_list = json.loads(video_info['player_response'][0])['streamingData']['formats'] From 0e004039ffd452b5635c42e616a8e38adea5de98 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 28 Oct 2020 02:14:05 +0100 Subject: [PATCH 016/235] [youtube] decipher -> s_to_sig --- src/you_get/extractors/youtube.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 19a72081..ac62e57c 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -68,7 +68,7 @@ class YouTube(VideoExtractor): 'audio_encoding': 'AAC', 'audio_bitrate': '24'}, ] - def decipher(js, s): + def s_to_sig(js, s): # Examples: # - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js # - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js @@ -490,13 +490,13 @@ class YouTube(VideoExtractor): if stream['type'].startswith('audio/mp4'): dash_mp4_a_url = stream['url'] if 's' in stream: - sig = self.__class__.decipher(self.js, stream['s']) + sig = self.__class__.s_to_sig(self.js, stream['s']) dash_mp4_a_url += '&sig={}'.format(sig) dash_mp4_a_size = stream['clen'] elif stream['type'].startswith('audio/webm'): dash_webm_a_url = stream['url'] if 's' in stream: - sig = self.__class__.decipher(self.js, stream['s']) + sig = self.__class__.s_to_sig(self.js, stream['s']) dash_webm_a_url += '&sig={}'.format(sig) dash_webm_a_size = stream['clen'] for stream in streams: # video @@ -505,7 +505,7 @@ class YouTube(VideoExtractor): mimeType = 'video/mp4' dash_url = stream['url'] if 's' in stream: - sig = self.__class__.decipher(self.js, stream['s']) + sig = self.__class__.s_to_sig(self.js, stream['s']) dash_url += '&sig={}'.format(sig) dash_size = stream['clen'] itag = stream['itag'] @@ -524,7 +524,7 @@ class YouTube(VideoExtractor): mimeType = 'video/webm' dash_url = stream['url'] if 's' in stream: - sig = self.__class__.decipher(self.js, stream['s']) + sig = self.__class__.s_to_sig(self.js, stream['s']) dash_url += '&sig={}'.format(sig) dash_size = stream['clen'] itag = stream['itag'] @@ -573,7 +573,7 @@ class YouTube(VideoExtractor): if not hasattr(self, 'js'): self.js = get_content(self.html5player) s = self.streams[stream_id]['s'] - sig = self.__class__.decipher(self.js, s) + sig = self.__class__.s_to_sig(self.js, s) src += '&sig={}'.format(sig) self.streams[stream_id]['src'] = [src] From bcbe4e816a7f1de3afdacf49fa4163608c1a1452 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 28 Oct 2020 02:39:33 +0100 Subject: [PATCH 017/235] add a simple GitHub Actions workflow --- .github/workflows/python-package.yml | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..96eefcc5 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,39 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python package + +on: + push: + branches: [ develop ] + pull_request: + branches: [ develop ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.5, 3.6, 3.7, 3.8, pypy3] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with unittest + run: | + make test From 2a7fcdc8e9b28ba34148e131b737f7f2385b1d2d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 28 Oct 2020 03:12:06 +0100 Subject: [PATCH 018/235] migrate From Travis CI to GitHub Actions --- .github/workflows/python-package.yml | 2 +- .travis.yml | 22 ---------------------- README.md | 2 +- 3 files changed, 2 insertions(+), 24 deletions(-) delete mode 100644 .travis.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 96eefcc5..b3d50ff7 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,7 +1,7 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Python package +name: develop on: push: diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index eedbeeb2..00000000 --- a/.travis.yml +++ /dev/null @@ -1,22 +0,0 @@ -# https://travis-ci.org/soimort/you-get -language: python -python: - - "3.4" - - "3.5" - - "3.6" - - "3.7" - - "3.8" - #- "nightly" (flake8 not working in python 3.9 yet, module 'ast' has no attribute 'AugLoad') - - "pypy3" -before_install: - - pip install flake8 -before_script: - - flake8 . --count --select=E9,F63,F72,F82 --show-source --statistics -script: make test -notifications: - webhooks: - urls: - - https://webhooks.gitter.im/e/43cd57826e88ed8f2152 - on_success: change # options: [always|never|change] default: always - on_failure: always # options: [always|never|change] default: always - on_start: never # options: [always|never|change] default: always diff --git a/README.md b/README.md index 3429f9d8..6adeeb35 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # You-Get +[![Build Status](https://github.com/soimort/you-get/workflows/develop/badge.svg)](https://github.com/soimort/you-get/actions) [![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/) -[![Build Status](https://travis-ci.org/soimort/you-get.svg)](https://travis-ci.org/soimort/you-get) [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) **NOTICE: Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.** From 4f599121d2f89b6613084ff8fdaa2986955b69de Mon Sep 17 00:00:00 2001 From: WhoIsSure Date: Mon, 9 Nov 2020 23:24:35 +0800 Subject: [PATCH 019/235] [pptv] fix: download pptv error, code 403 / 405 #2832 --- src/you_get/extractors/pptv.py | 158 +++------------------------------ 1 file changed, 14 insertions(+), 144 deletions(-) diff --git a/src/you_get/extractors/pptv.py b/src/you_get/extractors/pptv.py index dacd78e4..ef25ac44 100644 --- a/src/you_get/extractors/pptv.py +++ b/src/you_get/extractors/pptv.py @@ -174,7 +174,7 @@ def make_url(stream): src = [] for i, seg in enumerate(stream['segs']): url = 'http://{}/{}/{}?key={}&k={}'.format(host, i, rid, key, key_expr) - url += '&fpp.ver=1.3.0.4&type=' + url += '&type=web.fpp' src.append(url) return src @@ -189,17 +189,27 @@ class PPTV(VideoExtractor): ] def prepare(self, **kwargs): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/69.0.3497.100 Safari/537.36" + } + self.vid = match1(self.url, r'https?://sports.pptv.com/vod/(\d+)/*') if self.url and not self.vid: if not re.match(r'https?://v.pptv.com/show/(\w+)\.html', self.url): raise('Unknown url pattern') - page_content = get_content(self.url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}) + page_content = get_content(self.url, headers) + self.vid = match1(page_content, r'webcfg\s*=\s*{"id":\s*(\d+)') + if not self.vid: + request = urllib.request.Request(self.url, headers=headers) + response = urllib.request.urlopen(request) + self.vid = match1(response.url, r'https?://sports.pptv.com/vod/(\d+)/*') if not self.vid: raise('Cannot find id') api_url = 'http://web-play.pptv.com/webplay3-0-{}.xml'.format(self.vid) - api_url += '?appplt=flp&appid=pptv.flashplayer.vod&appver=3.4.2.28&type=&version=4' - dom = parseString(get_content(api_url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"})) + api_url += '?type=web.fpp¶m=type=web.fpp&version=4' + dom = parseString(get_content(api_url, headers)) self.title, m_items, m_streams, m_segs = parse_pptv_xml(dom) xml_streams = merge_meta(m_items, m_streams, m_segs) for stream_id in xml_streams: @@ -212,146 +222,6 @@ class PPTV(VideoExtractor): 'src': src } -''' -def constructKey(arg): - - def str2hex(s): - r="" - for i in s[:8]: - t=hex(ord(i))[2:] - if len(t)==1: - t="0"+t - r+=t - for i in range(16): - r+=hex(int(15*random()))[2:] - return r - - #ABANDONED Because SERVER_KEY is static - def getkey(s): - #returns 1896220160 - l2=[i for i in s] - l4=0 - l3=0 - while l4>> in as3 - if k>=0: - return k>>b - elif k<0: - return (2**32+k)>>b - pass - - def lot(k,b): - return (k<([^<>]+)', xml) - k = r1(r']+>([^<>]+)', xml) - rid = r1(r'rid="([^"]+)"', xml) - title = r1(r'nm="([^"]+)"', xml) - - st=r1(r'([^<>]+)',xml)[:-4] - st=time.mktime(time.strptime(st))*1000-60*1000-time.time()*1000 - st+=time.time()*1000 - st=st/1000 - - key=constructKey(st) - - pieces = re.findall(']+fs="(\d+)"', xml) - numbers, fs = zip(*pieces) - urls=["http://{}/{}/{}?key={}&fpp.ver=1.3.0.4&k={}&type=web.fpp".format(host,i,rid,key,k) for i in range(max(map(int,numbers))+1)] - - total_size = sum(map(int, fs)) - assert rid.endswith('.mp4') - print_info(site_info, title, 'mp4', total_size) - - if not info_only: - try: - download_urls(urls, title, 'mp4', total_size, output_dir = output_dir, merge = merge) - except urllib.error.HTTPError: - #for key expired - pptv_download_by_id(id, output_dir = output_dir, merge = merge, info_only = info_only) - -def pptv_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): - assert re.match(r'http://v.pptv.com/show/(\w+)\.html', url) - html = get_html(url) - id = r1(r'webcfg\s*=\s*{"id":\s*(\d+)', html) - assert id - pptv_download_by_id(id, output_dir = output_dir, merge = merge, info_only = info_only) -''' site = PPTV() #site_info = "PPTV.com" #download = pptv_download From 1b1f1dd1181bb15dabd04f928842891ac635f49c Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 9 Nov 2020 23:32:39 -0500 Subject: [PATCH 020/235] update regex to match vid for xinpianchang --- src/you_get/extractors/xinpianchang.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/xinpianchang.py b/src/you_get/extractors/xinpianchang.py index fac3d01f..1121550c 100644 --- a/src/you_get/extractors/xinpianchang.py +++ b/src/you_get/extractors/xinpianchang.py @@ -20,7 +20,7 @@ class Xinpianchang(VideoExtractor): def prepare(self, **kwargs): # find key page_content = get_content(self.url) - match_rule = r"vid: \"(.+?)\"," + match_rule = r"vid = \"(.+?)\";" key = re.findall(match_rule, page_content)[0] # get videos info From b0aca8438c512586389f4bc29bf39c18dfd5b1ad Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 11 Nov 2020 23:54:42 +0100 Subject: [PATCH 021/235] [test] remove case for tiktok --- tests/test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test.py b/tests/test.py index 00bd4cbb..26a0aa18 100644 --- a/tests/test.py +++ b/tests/test.py @@ -40,7 +40,7 @@ class YouGetTests(unittest.TestCase): def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) - def test_bilibil(self): + def test_bilibili(self): bilibili.download( "https://www.bilibili.com/watchlater/#/BV1PE411q7mZ/p6", info_only=True ) @@ -58,10 +58,10 @@ class YouGetTests(unittest.TestCase): # 'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True #) - def tests_tiktok(self): - tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) - tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) - tiktok.download('https://vt.tiktok.com/UGJR4R/', info_only=True) + #def tests_tiktok(self): + # tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) + # tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) + # tiktok.download('https://vt.tiktok.com/UGJR4R/', info_only=True) if __name__ == '__main__': From c074d8100110c1c2eed27f2f4ffb12f7670d6e74 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 12 Nov 2020 00:00:24 +0100 Subject: [PATCH 022/235] [common] update UA --- src/you_get/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 51521407..79fc74d1 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -143,7 +143,7 @@ fake_headers = { 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', # noqa + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43', # noqa } if sys.stdout.isatty(): From 67e9f7c6c1aa1d07eb2b8dbbf273cfa14d4dd21a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 12 Nov 2020 00:08:17 +0100 Subject: [PATCH 023/235] [test] remove a case for youtube that often fails --- tests/test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 26a0aa18..0f7595b3 100644 --- a/tests/test.py +++ b/tests/test.py @@ -33,9 +33,9 @@ class YouGetTests(unittest.TestCase): 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa info_only=True ) - youtube.download( - 'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True - ) + #youtube.download( + # 'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True + #) def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) From 4a9d2c1e13b8918deba39af515d315b60e545422 Mon Sep 17 00:00:00 2001 From: Kagamia Date: Mon, 16 Nov 2020 00:25:29 +0800 Subject: [PATCH 024/235] add fake header --- src/you_get/extractors/netease.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/netease.py b/src/you_get/extractors/netease.py index c7c0f666..ca1be887 100644 --- a/src/you_get/extractors/netease.py +++ b/src/you_get/extractors/netease.py @@ -123,10 +123,10 @@ def netease_song_download(song, output_dir='.', info_only=False, playlist_prefix output_dir=output_dir, info_only=info_only) def netease_download_common(title, url_best, output_dir, info_only): - songtype, ext, size = url_info(url_best) + songtype, ext, size = url_info(url_best, faker=True) print_info(site_info, title, songtype, size) if not info_only: - download_urls([url_best], title, ext, size, output_dir) + download_urls([url_best], title, ext, size, output_dir, faker=True) def netease_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): From c9001c70e9c0e43b105b6fbf4996c128f1bda5c1 Mon Sep 17 00:00:00 2001 From: widtrizz <54760244+widtrizz@users.noreply.github.com> Date: Tue, 17 Nov 2020 21:08:26 +0800 Subject: [PATCH 025/235] Update ccode 0590 --- src/you_get/extractors/youku.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index d7107eca..7f4be852 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -41,7 +41,6 @@ class Youku(VideoExtractor): mobile_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36' dispatcher_url = 'vali.cp31.ott.cibntv.net' - # Last updated: 2017-10-13 stream_types = [ {'id': 'hd3', 'container': 'flv', 'video_profile': '1080P'}, {'id': 'hd3v2', 'container': 'flv', 'video_profile': '1080P'}, @@ -78,7 +77,7 @@ class Youku(VideoExtractor): self.api_error_code = None self.api_error_msg = None - self.ccode = '0519' + self.ccode = '0590' # Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND' From 6f9cd8a069611e69d26da77eac676d33ca99c735 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 18 Nov 2020 16:35:59 +0100 Subject: [PATCH 026/235] version 0.4.1488 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index b4696519..0e6f1230 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1475' +__version__ = '0.4.1488' From 6e39a594e4b09b1379be649517dcf2d237ef6263 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 5 Dec 2020 16:14:40 +0100 Subject: [PATCH 027/235] [instagram] fix extraction --- src/you_get/extractors/instagram.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index 86905a77..6f6bf173 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -9,7 +9,7 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg html = get_html(url) vid = r1(r'instagram.com/\w+/([^/]+)', url) - description = r1(r'\s([^<]*)', html) title = "{} [{}]".format(description.replace("\n", " "), vid) stream = r1(r'', html) - info = json.loads(data.group(1)) + data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);', html) + post = json.loads(data.group(1)) - if 'edge_sidecar_to_children' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']: - edges = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['edge_sidecar_to_children']['edges'] + if 'edge_sidecar_to_children' in post['graphql']['shortcode_media']: + edges = post['graphql']['shortcode_media']['edge_sidecar_to_children']['edges'] for edge in edges: title = edge['node']['shortcode'] image_url = edge['node']['display_url'] @@ -40,10 +40,10 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg total_size=size, output_dir=output_dir) else: - title = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['shortcode'] - image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url'] - if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']: - image_url =info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url'] + title = post['graphql']['shortcode_media']['shortcode'] + image_url = post['graphql']['shortcode_media']['display_url'] + if 'video_url' in post['graphql']['shortcode_media']: + image_url = post['graphql']['shortcode_media']['video_url'] ext = image_url.split('?')[0].split('.')[-1] size = int(get_head(image_url)['Content-Length']) From 0247b06437a2161ff537169d5f49c2e7c0a95665 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 5 Dec 2020 17:19:27 +0100 Subject: [PATCH 028/235] [instagram] support extraction with cookies --- src/you_get/extractors/instagram.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index 6f6bf173..7c9d6272 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -9,8 +9,10 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg html = get_html(url) vid = r1(r'instagram.com/\w+/([^/]+)', url) - description = r1(r'\s([^<]*)', html) + description = r1(r'\s([^<]*)', html) # with logged-in cookies title = "{} [{}]".format(description.replace("\n", " "), vid) + stream = r1(r'', html) - post = json.loads(data.group(1)) + data = re.search(r'window\._sharedData\s*=\s*(.*);', html) + if data is not None: + info = json.loads(data.group(1)) + post = info['entry_data']['PostPage'][0] + else: + # with logged-in cookies + data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);', html) + if data is not None: + log.e('[Error] Cookies needed.') + post = json.loads(data.group(1)) if 'edge_sidecar_to_children' in post['graphql']['shortcode_media']: edges = post['graphql']['shortcode_media']['edge_sidecar_to_children']['edges'] From dbb1296d548cf13312fa0706cbf212f42f064c94 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 9 Dec 2020 00:46:03 +0100 Subject: [PATCH 029/235] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6adeeb35..ce412afd 100644 --- a/README.md +++ b/README.md @@ -436,7 +436,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 火猫TV | |✓| | | | 阳光宽频网 | |✓| | | | 西瓜视频 | |✓| | | -| 新片场 | |✓| | | +| 新片场 | |✓| | | | 快手 | |✓|✓| | | 抖音 | |✓| | | | TikTok | |✓| | | From 4793e55e1465dbedacd04784974cfaf21bfaffb6 Mon Sep 17 00:00:00 2001 From: zhoudunguang Date: Wed, 9 Dec 2020 16:04:03 +0800 Subject: [PATCH 030/235] update ccode 0532 for Youku --- src/you_get/extractors/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index 7f4be852..92cdafb6 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -77,7 +77,7 @@ class Youku(VideoExtractor): self.api_error_code = None self.api_error_msg = None - self.ccode = '0590' + self.ccode = '0532' # Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND' From b00860e4beed7d72a20f212c5ab9f443b779dc2d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 11 Dec 2020 02:02:44 +0100 Subject: [PATCH 031/235] [instagram] add faker --- src/you_get/extractors/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index 7c9d6272..d5100a78 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -6,7 +6,7 @@ from ..common import * def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwargs): url = r1(r'([^?]*)', url) - html = get_html(url) + html = get_html(url, faker=True) vid = r1(r'instagram.com/\w+/([^/]+)', url) description = r1(r' Date: Tue, 15 Dec 2020 21:41:41 +0800 Subject: [PATCH 032/235] feat: add lrts extractor --- README.md | 1 + src/you_get/common.py | 1 + src/you_get/extractors/lrts.py | 70 ++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 src/you_get/extractors/lrts.py diff --git a/README.md b/README.md index ce412afd..6a23faf8 100644 --- a/README.md +++ b/README.md @@ -414,6 +414,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 酷我音乐 | | | |✓| | 乐视网 | |✓| | | | 荔枝FM | | | |✓| +| 懒人听书 | | | |✓| | 秒拍 | |✓| | | | MioMio弹幕网 | |✓| | | | MissEvan
猫耳FM | | | |✓| diff --git a/src/you_get/common.py b/src/you_get/common.py index 79fc74d1..2b6e05d2 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -76,6 +76,7 @@ SITES = { 'letv' : 'le', 'lizhi' : 'lizhi', 'longzhu' : 'longzhu', + 'lrts' : 'lrts', 'magisto' : 'magisto', 'metacafe' : 'metacafe', 'mgtv' : 'mgtv', diff --git a/src/you_get/extractors/lrts.py b/src/you_get/extractors/lrts.py new file mode 100644 index 00000000..23abab5c --- /dev/null +++ b/src/you_get/extractors/lrts.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +__all__ = ['lrts_download'] + +import logging +from ..common import * + +def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): + html = get_html(url) + matched = re.search(r"/book/(\d+)", url) + if not matched: + raise AssertionError("not found book number: %s" % url) + book_no = matched.group(1) + book_title = book_no + matched = re.search(r"(.*)-(.*)", html) + if matched: + book_title = matched.group(1) + + matched = re.search(r"var totalCount='(\d+)'", html) + if not matched: + raise AssertionError("not found total count in html") + total_count = int(matched.group(1)) + logging.debug('total: %s' % total_count) + page_size = 10 + logging.debug('total page count: %s' % ((total_count // page_size) + 1)) + headers = { + 'Referer': url + } + items = [] + if (total_count > page_size): + for page in range((total_count // page_size) + 1): + page_url = 'http://www.lrts.me/ajax/book/%s/%s/%s' % (book_no, page, page_size) + response_content = json.loads(post_content(page_url, headers)) + if response_content['status'] != 'success': + raise AssertionError("got the page failed: %s" % (page_url)) + data = response_content['data']['data'] + if data: + for i in data: + i['resName'] = parse.unquote(i['resName']) + items.extend(data) + else: + break + + headers = { + 'Referer': 'http://www.lrts.me/playlist' + } + + for item in items: + i_url = 'http://www.lrts.me/ajax/path/4/%s/%s' % (item['fatherResId'], item['resId']) + response_content = json.loads(post_content(i_url, headers)) + # logging.debug(response_content) + if response_content['status'] == 'success' and response_content['data']: + item['ok'] = True + item['url'] = response_content['data'] + + items = list(filter(lambda i: 'ok' in i and i['ok'], items)) + print('Downloading %s: %s count ...' % (book_title, len(items))) + + for item in items: + title = item['resName'] + file_url = item['url'] + # if not file_url: continue + _, _, size = url_info(file_url) + print_info(site_info, title, 'mp3', size) + if not info_only: + download_urls([file_url], title, 'mp3', size, output_dir, merge=merge) + +site_info = "lrts.me" +download = lrts_download +download_playlist = lrts_download From e37836a40bd38feb7f2f616852883578b7153a6c Mon Sep 17 00:00:00 2001 From: Riceball LEE Date: Wed, 16 Dec 2020 10:31:52 +0800 Subject: [PATCH 033/235] feat: add arguments to specify the playlist first, last, page-size options --- src/you_get/common.py | 17 ++++++++++++++++- src/you_get/extractors/lrts.py | 26 ++++++++++++++++++++------ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 2b6e05d2..7fe9d51d 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -1557,6 +1557,21 @@ def script_main(download, download_playlist, **kwargs): '-l', '--playlist', action='store_true', help='Prefer to download a playlist' ) + + playlist_grp = parser.add_argument_group('Playlist optional options') + playlist_grp.add_argument( + '-first', '--first', metavar='FIRST', + help='the first number' + ) + playlist_grp.add_argument( + '-last', '--last', metavar='LAST', + help='the last number' + ) + playlist_grp.add_argument( + '-size', '--page-size', metavar='PAGE_SIZE', + help='the page size number' + ) + download_grp.add_argument( '-a', '--auto-rename', action='store_true', default=False, help='Auto rename same name different files' @@ -1674,7 +1689,7 @@ def script_main(download, download_playlist, **kwargs): socket.setdefaulttimeout(args.timeout) try: - extra = {} + extra = {'args': args} if extractor_proxy: extra['extractor_proxy'] = extractor_proxy if stream_id: diff --git a/src/you_get/extractors/lrts.py b/src/you_get/extractors/lrts.py index 23abab5c..d206491d 100644 --- a/src/you_get/extractors/lrts.py +++ b/src/you_get/extractors/lrts.py @@ -4,15 +4,18 @@ __all__ = ['lrts_download'] import logging from ..common import * +from ..util import log, term def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) + args = kwargs.get('args') + if not args: args = {} matched = re.search(r"/book/(\d+)", url) if not matched: raise AssertionError("not found book number: %s" % url) book_no = matched.group(1) book_title = book_no - matched = re.search(r"(.*)-(.*)", html) + matched = re.search(r"([^-]*)[-](.*)[,](.*)", html) if matched: book_title = matched.group(1) @@ -20,15 +23,25 @@ def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): if not matched: raise AssertionError("not found total count in html") total_count = int(matched.group(1)) - logging.debug('total: %s' % total_count) + log.i('%s total: %s' % (book_title, total_count)) + first_page = 0 + if ('first' in args and args.first!= None): + first_page = int(args.first) + page_size = 10 - logging.debug('total page count: %s' % ((total_count // page_size) + 1)) + if ('page_size' in args and args.page_size != None): + page_size = int(args.page_size) + last_page = (total_count // page_size) + 1 + if ('last' in args and args.last != None): + last_page = int(args.last) + + log.i('page size is %s, page from %s to %s' % (page_size, first_page, last_page)) headers = { 'Referer': url } items = [] if (total_count > page_size): - for page in range((total_count // page_size) + 1): + for page in range(first_page, last_page): page_url = 'http://www.lrts.me/ajax/book/%s/%s/%s' % (book_no, page, page_size) response_content = json.loads(post_content(page_url, headers)) if response_content['status'] != 'success': @@ -48,13 +61,14 @@ def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): for item in items: i_url = 'http://www.lrts.me/ajax/path/4/%s/%s' % (item['fatherResId'], item['resId']) response_content = json.loads(post_content(i_url, headers)) - # logging.debug(response_content) + logging.debug(response_content) if response_content['status'] == 'success' and response_content['data']: item['ok'] = True item['url'] = response_content['data'] + logging.debug('ok') items = list(filter(lambda i: 'ok' in i and i['ok'], items)) - print('Downloading %s: %s count ...' % (book_title, len(items))) + log.i('Downloading %s: %s count ...' % (book_title, len(items))) for item in items: title = item['resName'] From 17c740ae542e4ce2285da8775152cde8cbc782b8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 17 Dec 2020 02:01:23 +0100 Subject: [PATCH 034/235] [twitter] add faker --- src/you_get/extractors/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 602c18f6..23468211 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -34,7 +34,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) **kwargs) return - html = get_html(url, faker=False) # disable faker to prevent 302 infinite redirect + html = get_html(url, faker=True) # now it seems faker must be enabled screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \ r1(r' Date: Fri, 18 Dec 2020 14:53:39 +0800 Subject: [PATCH 035/235] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index ce412afd..137cbab8 100644 --- a/README.md +++ b/README.md @@ -427,7 +427,6 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | **Tudou
土豆** | |✓| | | | 虾米 | |✓| |✓| | 阳光卫视 | |✓| | | -| **音悦Tai** | |✓| | | | **Youku
优酷** | |✓| | | | 战旗TV | |✓| | | | 央视网 | |✓| | | From 5a890eac531546f7bc737a0616a86262e418acdf Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 21 Dec 2020 17:00:22 +0100 Subject: [PATCH 036/235] [soundcloud] fix --- src/you_get/extractors/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/soundcloud.py b/src/you_get/extractors/soundcloud.py index ecd3fc8d..08e9d561 100644 --- a/src/you_get/extractors/soundcloud.py +++ b/src/you_get/extractors/soundcloud.py @@ -19,7 +19,7 @@ def get_sndcd_apikey(): def get_resource_info(resource_url, client_id): cont = get_content(resource_url, decoded=True) - x = re.escape('forEach(function(e){n(e)})}catch(t){}})},') + x = re.escape('forEach(function(e){n(e)})}catch(e){}})},') x = re.search(r'' + x + r'(.*)\);', cont) info = json.loads(x.group(1))[-1]['data'][0] From 6be1d0308ec52c212052b5808661636100ae2bfb Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 26 Dec 2020 18:45:55 +0100 Subject: [PATCH 037/235] [youtube] partial fix for #2857 --- src/you_get/extractors/youtube.py | 43 +++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index ac62e57c..9485b876 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -203,28 +203,43 @@ class YouTube(VideoExtractor): # Parse video page (for DASH) video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) try: - ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) + try: + # Complete ytplayer_config + ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) - # Workaround: get_video_info returns bad s. Why? - if 'url_encoded_fmt_stream_map' not in ytplayer_config['args']: - stream_list = json.loads(ytplayer_config['args']['player_response'])['streamingData']['formats'] - else: - stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') - #stream_list = ytplayer_config['args']['adaptive_fmts'].split(',') + # Workaround: get_video_info returns bad s. Why? + if 'url_encoded_fmt_stream_map' not in ytplayer_config['args']: + stream_list = json.loads(ytplayer_config['args']['player_response'])['streamingData']['formats'] + else: + stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') + #stream_list = ytplayer_config['args']['adaptive_fmts'].split(',') - if 'assets' in ytplayer_config: - self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] - elif re.search('([^"]*/base\.js)"', video_page): - self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) - self.html5player = self.html5player.replace('\/', '/') # unescape URL - else: - self.html5player = None + if 'assets' in ytplayer_config: + self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] + elif re.search('([^"]*/base\.js)"', video_page): + self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) + self.html5player = self.html5player.replace('\/', '/') # unescape URL + else: + self.html5player = None + + except: + # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}} + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + + stream_list = ytInitialPlayerResponse['streamingData']['formats'] + #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] + + if re.search('([^"]*/base\.js)"', video_page): + self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) + else: + self.html5player = None except: if 'url_encoded_fmt_stream_map' not in video_info: stream_list = json.loads(video_info['player_response'][0])['streamingData']['formats'] else: stream_list = video_info['url_encoded_fmt_stream_map'][0].split(',') + if re.search('([^"]*/base\.js)"', video_page): self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) else: From 9a4d9ef94e5a00666118d0c2c845c16266f7dbb5 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 26 Dec 2020 19:02:47 +0100 Subject: [PATCH 038/235] [youtube] fully fix #2857 with DASH stream --- src/you_get/extractors/youtube.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 9485b876..30297c75 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -468,11 +468,16 @@ class YouTube(VideoExtractor): for afmt in video_info['adaptive_fmts'][0].split(',')] else: try: - streams = json.loads(video_info['player_response'][0])['streamingData']['adaptiveFormats'] + try: + streams = json.loads(video_info['player_response'][0])['streamingData']['adaptiveFormats'] + except: + streams = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] except: # no DASH stream at all return + # streams without contentLength got broken urls, just remove them (#2767) streams = [stream for stream in streams if 'contentLength' in stream] + for stream in streams: stream['itag'] = str(stream['itag']) if 'qualityLabel' in stream: From 4f8c73e9ac27198e15694affb20a615fa9ffa48c Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 26 Dec 2020 19:19:28 +0100 Subject: [PATCH 039/235] [yinyuetai] purge (#2855) --- src/you_get/common.py | 1 - src/you_get/extractors/__init__.py | 3 +- src/you_get/extractors/embed.py | 8 ------ src/you_get/extractors/yinyuetai.py | 43 ----------------------------- 4 files changed, 1 insertion(+), 54 deletions(-) delete mode 100644 src/you_get/extractors/yinyuetai.py diff --git a/src/you_get/common.py b/src/you_get/common.py index 79fc74d1..224249b4 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -117,7 +117,6 @@ SITES = { 'xiaojiadianvideo' : 'fc2video', 'ximalaya' : 'ximalaya', 'xinpianchang' : 'xinpianchang', - 'yinyuetai' : 'yinyuetai', 'yizhibo' : 'yizhibo', 'youku' : 'youku', 'youtu' : 'youtube', diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 4280d236..8cf1ea7c 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -80,10 +80,9 @@ from .w56 import * from .wanmen import * from .xiami import * from .xinpianchang import * -from .yinyuetai import * from .yixia import * from .youku import * from .youtube import * from .zhanqi import * from .zhibo import * -from .zhihu import * \ No newline at end of file +from .zhihu import * diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py index ee539747..aedf5137 100644 --- a/src/you_get/extractors/embed.py +++ b/src/you_get/extractors/embed.py @@ -13,7 +13,6 @@ from .qq import qq_download_by_vid from .sina import sina_download_by_vid from .tudou import tudou_download_by_id from .vimeo import vimeo_download_by_id -from .yinyuetai import yinyuetai_download_by_id from .youku import youku_download_by_vid from . import iqiyi from . import bokecc @@ -40,8 +39,6 @@ refer to http://open.tudou.com/wiki/video/info """ tudou_api_patterns = [ ] -yinyuetai_embed_patterns = [ 'player\.yinyuetai\.com/video/swf/(\d+)' ] - iqiyi_embed_patterns = [ 'player\.video\.qiyi\.com/([^/]+)/[^/]+/[^/]+/[^/]+\.swf[^"]+tvId=(\d+)' ] netease_embed_patterns = [ '(http://\w+\.163\.com/movie/[^\'"]+)' ] @@ -82,11 +79,6 @@ def embed_download(url, output_dir = '.', merge = True, info_only = False, **kwa found = True tudou_download_by_id(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) - vids = matchall(content, yinyuetai_embed_patterns) - for vid in vids: - found = True - yinyuetai_download_by_id(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs) - vids = matchall(content, iqiyi_embed_patterns) for vid in vids: found = True diff --git a/src/you_get/extractors/yinyuetai.py b/src/you_get/extractors/yinyuetai.py deleted file mode 100644 index 6c39540f..00000000 --- a/src/you_get/extractors/yinyuetai.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python - -__all__ = ['yinyuetai_download', 'yinyuetai_download_by_id'] - -from ..common import * - -def yinyuetai_download_by_id(vid, title=None, output_dir='.', merge=True, info_only=False): - video_info = json.loads(get_html('http://www.yinyuetai.com/insite/get-video-info?json=true&videoId=%s' % vid)) - url_models = video_info['videoInfo']['coreVideoInfo']['videoUrlModels'] - url_models = sorted(url_models, key=lambda i: i['qualityLevel']) - url = url_models[-1]['videoUrl'] - type = ext = r1(r'\.(flv|mp4)', url) - _, _, size = url_info(url) - - print_info(site_info, title, type, size) - if not info_only: - download_urls([url], title, ext, size, output_dir, merge = merge) - -def yinyuetai_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - id = r1(r'http://\w+.yinyuetai.com/video/(\d+)', url) or \ - r1(r'http://\w+.yinyuetai.com/video/h5/(\d+)', url) - if not id: - yinyuetai_download_playlist(url, output_dir=output_dir, merge=merge, info_only=info_only) - return - - html = get_html(url, 'utf-8') - title = r1(r'', html) or r1(r'(.*)', html) - assert title - title = parse.unquote(title) - title = escape_file_path(title) - yinyuetai_download_by_id(id, title, output_dir, merge=merge, info_only=info_only) - -def yinyuetai_download_playlist(url, output_dir='.', merge=True, info_only=False, **kwargs): - playlist = r1(r'http://\w+.yinyuetai.com/playlist/(\d+)', url) - html = get_html(url) - data_ids = re.findall(r'data-index="\d+"\s*data-id=(\d+)', html) - for data_id in data_ids: - yinyuetai_download('http://v.yinyuetai.com/video/' + data_id, - output_dir=output_dir, merge=merge, info_only=info_only) - -site_info = "YinYueTai.com" -download = yinyuetai_download -download_playlist = yinyuetai_download_playlist From a2155e85f89b6315b3e3145e97c95b93b087951a Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Sat, 26 Dec 2020 19:21:43 +0100 Subject: [PATCH 040/235] version 0.4.1500 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 0e6f1230..95505814 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1488' +__version__ = '0.4.1500' From 9432ce3c71d4f7df1a090a4c8defa76cd9ff06f3 Mon Sep 17 00:00:00 2001 From: Riceball LEE <snowyu.lee@gmail.com> Date: Sun, 27 Dec 2020 15:45:21 +0800 Subject: [PATCH 041/235] fix(lrts): can not download audio for the count less than pagesize --- src/you_get/extractors/lrts.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/src/you_get/extractors/lrts.py b/src/you_get/extractors/lrts.py index d206491d..94d12a25 100644 --- a/src/you_get/extractors/lrts.py +++ b/src/you_get/extractors/lrts.py @@ -40,20 +40,18 @@ def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): 'Referer': url } items = [] - if (total_count > page_size): - for page in range(first_page, last_page): - page_url = 'http://www.lrts.me/ajax/book/%s/%s/%s' % (book_no, page, page_size) - response_content = json.loads(post_content(page_url, headers)) - if response_content['status'] != 'success': - raise AssertionError("got the page failed: %s" % (page_url)) - data = response_content['data']['data'] - if data: - for i in data: - i['resName'] = parse.unquote(i['resName']) - items.extend(data) - else: - break - + for page in range(first_page, last_page): + page_url = 'http://www.lrts.me/ajax/book/%s/%s/%s' % (book_no, page, page_size) + response_content = json.loads(post_content(page_url, headers)) + if response_content['status'] != 'success': + raise AssertionError("got the page failed: %s" % (page_url)) + data = response_content['data']['data'] + if data: + for i in data: + i['resName'] = parse.unquote(i['resName']) + items.extend(data) + else: + break headers = { 'Referer': 'http://www.lrts.me/playlist' } @@ -61,7 +59,6 @@ def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): for item in items: i_url = 'http://www.lrts.me/ajax/path/4/%s/%s' % (item['fatherResId'], item['resId']) response_content = json.loads(post_content(i_url, headers)) - logging.debug(response_content) if response_content['status'] == 'success' and response_content['data']: item['ok'] = True item['url'] = response_content['data'] From 027130a45ab4c69ab3b1ba1bff9dd9b34ada5a4f Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Sun, 27 Dec 2020 17:22:25 +0100 Subject: [PATCH 042/235] [common] use only double-dashed arguments --- src/you_get/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 268448d8..83dfa2ac 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -1559,15 +1559,15 @@ def script_main(download, download_playlist, **kwargs): playlist_grp = parser.add_argument_group('Playlist optional options') playlist_grp.add_argument( - '-first', '--first', metavar='FIRST', + '--first', metavar='FIRST', help='the first number' ) playlist_grp.add_argument( - '-last', '--last', metavar='LAST', + '--last', metavar='LAST', help='the last number' ) playlist_grp.add_argument( - '-size', '--page-size', metavar='PAGE_SIZE', + '--size', '--page-size', metavar='PAGE_SIZE', help='the page size number' ) From 15cc69a120cd297f2a12c5000eb1d9717e3105f4 Mon Sep 17 00:00:00 2001 From: Bochun Bai <bbc@sinofool.net> Date: Wed, 6 Jan 2021 20:59:13 -0800 Subject: [PATCH 043/235] Fix mgtv: adds tk2 and referer --- src/you_get/extractors/mgtv.py | 114 +++++++++++++++++++++++---------- 1 file changed, 79 insertions(+), 35 deletions(-) diff --git a/src/you_get/extractors/mgtv.py b/src/you_get/extractors/mgtv.py index 657167a6..c8a03065 100644 --- a/src/you_get/extractors/mgtv.py +++ b/src/you_get/extractors/mgtv.py @@ -9,19 +9,36 @@ from urllib.parse import urlsplit from os.path import dirname import re +import base64 +import time +import uuid + + class MGTV(VideoExtractor): name = "芒果 (MGTV)" # Last updated: 2016-11-13 stream_types = [ + {'id': 'fhd', 'container': 'ts', 'video_profile': '蓝光'}, {'id': 'hd', 'container': 'ts', 'video_profile': '超清'}, {'id': 'sd', 'container': 'ts', 'video_profile': '高清'}, {'id': 'ld', 'container': 'ts', 'video_profile': '标清'}, ] - - id_dic = {i['video_profile']:(i['id']) for i in stream_types} - - api_endpoint = 'http://pcweb.api.mgtv.com/player/video?video_id={video_id}' + + id_dic = {i['video_profile']: (i['id']) for i in stream_types} + + did = str(uuid.uuid4()) + ver = '0.3.0301' + pno = '1030' + + def tk2(self): + return base64.urlsafe_b64encode(b'did=%s|ver=%s|pno=%s|clit=%d' % ( + self.did.encode(), self.ver.encode(), self.pno.encode(), time.time())).decode('utf-8')[::-1] + + info_endpoint = 'https://pcweb.api.mgtv.com/video/info?vid={video_id}' + player_endpoint = 'https://pcweb.api.mgtv.com/player/video?did={did}&tk2={tk2}&video_id={video_id}' + source_endpoint = 'https://pcweb.api.mgtv.com/player/getSource?tk2={tk2}&pm2={pm2}&video_id={video_id}' + playlist_endpoint = 'https://pcweb.api.mgtv.com/episode/list?video_id={video_id}&page={page}&size=30' @staticmethod def get_vid_from_url(url): @@ -31,71 +48,95 @@ class MGTV(VideoExtractor): if not vid: vid = match1(url, 'https?://www.mgtv.com/hz/bdpz/\d+/(\d+).html') return vid - - #---------------------------------------------------------------------- - @staticmethod - def get_mgtv_real_url(url): + + # ---------------------------------------------------------------------- + def get_mgtv_real_url(self, url): """str->list of str Give you the real URLs.""" content = loads(get_content(url)) m3u_url = content['info'] split = urlsplit(m3u_url) - - base_url = "{scheme}://{netloc}{path}/".format(scheme = split[0], - netloc = split[1], - path = dirname(split[2])) - content = get_content(content['info']) #get the REAL M3U url, maybe to be changed later? + base_url = "{scheme}://{netloc}{path}/".format(scheme=split[0], + netloc=split[1], + path=dirname(split[2])) + + content = get_content(content['info'], + headers={'Referer': self.url}) # get the REAL M3U url, maybe to be changed later? segment_list = [] segments_size = 0 for i in content.split(): - if not i.startswith('#'): #not the best way, better we use the m3u8 package + if not i.startswith('#'): # not the best way, better we use the m3u8 package segment_list.append(base_url + i) # use ext-info for fast size calculate elif i.startswith('#EXT-MGTV-File-SIZE:'): - segments_size += int(i[i.rfind(':')+1:]) + segments_size += int(i[i.rfind(':') + 1:]) return m3u_url, segments_size, segment_list def download_playlist_by_url(self, url, **kwargs): - pass + self.url = url + self.vid = self.get_vid_from_url(self.url) + content_playlist = get_content(self.playlist_endpoint.format(video_id=self.vid, page=1)) + content_playlist = loads(content_playlist) + for ep in content_playlist['data']['list']: + self.download_by_url('https://www.mgtv.com' + ep['url'], **kwargs) + max_page = content_playlist['data']['total_page'] + for page in range(2, max_page + 1): + content_playlist = get_content(self.playlist_endpoint.format(video_id=self.vid, page=page)) + content_playlist = loads(content_playlist) + for ep in content_playlist['data']['list']: + self.download_by_url('https://www.mgtv.com' + ep['url'], **kwargs) def prepare(self, **kwargs): if self.url: self.vid = self.get_vid_from_url(self.url) - content = get_content(self.api_endpoint.format(video_id = self.vid)) - content = loads(content) - self.title = content['data']['info']['title'] - domain = content['data']['stream_domain'][0] - - #stream_available = [i['name'] for i in content['data']['stream']] + content_info = get_content(self.info_endpoint.format(video_id=self.vid)) + log.d(content_info) + content_info = loads(content_info) + self.title = content_info['data']['info']['videoName'] + + content_player = get_content(self.player_endpoint.format(did=self.did, video_id=self.vid, tk2=self.tk2())) + log.d(content_player) + content_player = loads(content_player) + pm2 = content_player['data']['atc']['pm2'] + + content_source = get_content(self.source_endpoint.format(video_id=self.vid, tk2=self.tk2(), pm2=pm2)) + log.d(content_source) + content_source = loads(content_source) + domain = content_source['data']['stream_domain'][0] + + # stream_available = [i['name'] for i in content['data']['stream']] stream_available = {} - for i in content['data']['stream']: + for i in content_source['data']['stream']: stream_available[i['name']] = i['url'] for s in self.stream_types: if s['video_profile'] in stream_available.keys(): quality_id = self.id_dic[s['video_profile']] url = stream_available[s['video_profile']] - url = domain + re.sub( r'(\&arange\=\d+)', '', url) #Un-Hum + if url is None or url == '': + # skip invalid profile with empty url + continue + url = domain + re.sub(r'(\&arange\=\d+)', '', url) # Un-Hum m3u8_url, m3u8_size, segment_list_this = self.get_mgtv_real_url(url) stream_fileid_list = [] for i in segment_list_this: stream_fileid_list.append(os.path.basename(i).split('.')[0]) - #make pieces + # make pieces pieces = [] for i in zip(stream_fileid_list, segment_list_this): - pieces.append({'fileid': i[0], 'segs': i[1],}) + pieces.append({'fileid': i[0], 'segs': i[1], }) self.streams[quality_id] = { - 'container': s['container'], - 'video_profile': s['video_profile'], - 'size': m3u8_size, - 'pieces': pieces, - 'm3u8_url': m3u8_url - } + 'container': s['container'], + 'video_profile': s['video_profile'], + 'size': m3u8_size, + 'pieces': pieces, + 'm3u8_url': m3u8_url + } if not kwargs['info_only']: self.streams[quality_id]['src'] = segment_list_this @@ -132,7 +173,8 @@ class MGTV(VideoExtractor): if 'index' not in kwargs: self.p([]) else: - stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag'] + stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else \ + self.streams_sorted[0]['itag'] self.p_i(stream_id) # default to use the best quality @@ -148,8 +190,10 @@ class MGTV(VideoExtractor): else: download_urls(stream_info['src'], self.title, stream_info['container'], stream_info['size'], output_dir=kwargs['output_dir'], - merge=kwargs.get('merge', True)) - # av=stream_id in self.dash_streams) + merge=kwargs.get('merge', True), + headers={'Referer': self.url}) + # av=stream_id in self.dash_streams) + site = MGTV() download = site.download_by_url From e914a4bbbfefe9db807dd55416a0e5bb7b467536 Mon Sep 17 00:00:00 2001 From: Bochun Bai <bbc@sinofool.net> Date: Wed, 13 Jan 2021 20:55:19 -0800 Subject: [PATCH 044/235] Mgtv: add another format of url Fix indentation causing undefined variable --- src/you_get/extractors/mgtv.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/you_get/extractors/mgtv.py b/src/you_get/extractors/mgtv.py index c8a03065..053212ef 100644 --- a/src/you_get/extractors/mgtv.py +++ b/src/you_get/extractors/mgtv.py @@ -47,6 +47,8 @@ class MGTV(VideoExtractor): vid = match1(url, 'https?://www.mgtv.com/(?:b|l)/\d+/(\d+).html') if not vid: vid = match1(url, 'https?://www.mgtv.com/hz/bdpz/\d+/(\d+).html') + if not vid: + vid = match1(url, 'https?://www.mgtv.com/s/(\d+).html') return vid # ---------------------------------------------------------------------- @@ -125,21 +127,21 @@ class MGTV(VideoExtractor): for i in segment_list_this: stream_fileid_list.append(os.path.basename(i).split('.')[0]) - # make pieces - pieces = [] - for i in zip(stream_fileid_list, segment_list_this): - pieces.append({'fileid': i[0], 'segs': i[1], }) + # make pieces + pieces = [] + for i in zip(stream_fileid_list, segment_list_this): + pieces.append({'fileid': i[0], 'segs': i[1], }) - self.streams[quality_id] = { - 'container': s['container'], - 'video_profile': s['video_profile'], - 'size': m3u8_size, - 'pieces': pieces, - 'm3u8_url': m3u8_url - } + self.streams[quality_id] = { + 'container': s['container'], + 'video_profile': s['video_profile'], + 'size': m3u8_size, + 'pieces': pieces, + 'm3u8_url': m3u8_url + } - if not kwargs['info_only']: - self.streams[quality_id]['src'] = segment_list_this + if not kwargs['info_only']: + self.streams[quality_id]['src'] = segment_list_this def extract(self, **kwargs): if 'stream_id' in kwargs and kwargs['stream_id']: From 81d416d1cdb1e43ceaef2fb5d7c6905c14b4aa32 Mon Sep 17 00:00:00 2001 From: Zhenyao She <dr.tobyshe@gmail.com> Date: Sat, 16 Jan 2021 11:02:41 -0500 Subject: [PATCH 045/235] ignore vim swap files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 99b18775..716c13b3 100644 --- a/.gitignore +++ b/.gitignore @@ -84,6 +84,7 @@ _* *.m4a *.DS_Store *.txt +*.sw[a-p] *.zip From 240b7a81118ac99055d9311a3a0bf3ff8fddc256 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Mon, 18 Jan 2021 18:37:24 +0100 Subject: [PATCH 046/235] [youtube] fix extraction of caption tracks, close #2866 --- src/you_get/extractors/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 30297c75..e954b1f4 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -353,7 +353,10 @@ class YouTube(VideoExtractor): # Prepare caption tracks try: - caption_tracks = json.loads(ytplayer_config['args']['player_response'])['captions']['playerCaptionsTracklistRenderer']['captionTracks'] + try: + caption_tracks = json.loads(ytplayer_config['args']['player_response'])['captions']['playerCaptionsTracklistRenderer']['captionTracks'] + except: + caption_tracks = ytInitialPlayerResponse['captions']['playerCaptionsTracklistRenderer']['captionTracks'] for ct in caption_tracks: ttsurl, lang = ct['baseUrl'], ct['languageCode'] From 8b473e725662bfff520d27c2eb1589d46fcafda5 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Sat, 6 Feb 2021 18:04:09 +0100 Subject: [PATCH 047/235] [xiami] xiami is dead --- README.md | 1 - src/you_get/common.py | 1 - src/you_get/extractors/__init__.py | 1 - src/you_get/extractors/xiami.py | 215 ----------------------------- 4 files changed, 218 deletions(-) delete mode 100644 src/you_get/extractors/xiami.py diff --git a/README.md b/README.md index bd2f02bd..d2fdaa99 100644 --- a/README.md +++ b/README.md @@ -426,7 +426,6 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | Sina<br/>新浪视频<br/>微博秒拍视频 | <http://video.sina.com.cn/><br/><http://video.weibo.com/> |✓| | | | Sohu<br/>搜狐视频 | <http://tv.sohu.com/> |✓| | | | **Tudou<br/>土豆** | <http://www.tudou.com/> |✓| | | -| 虾米 | <http://www.xiami.com/> |✓| |✓| | 阳光卫视 | <http://www.isuntv.com/> |✓| | | | **Youku<br/>优酷** | <http://www.youku.com/> |✓| | | | 战旗TV | <http://www.zhanqi.tv/lives> |✓| | | diff --git a/src/you_get/common.py b/src/you_get/common.py index 83dfa2ac..67ef581b 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -113,7 +113,6 @@ SITES = { 'veoh' : 'veoh', 'vine' : 'vine', 'vk' : 'vk', - 'xiami' : 'xiami', 'xiaokaxiu' : 'yixia', 'xiaojiadianvideo' : 'fc2video', 'ximalaya' : 'ximalaya', diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 8cf1ea7c..8c43a8bc 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -78,7 +78,6 @@ from .vine import * from .vk import * from .w56 import * from .wanmen import * -from .xiami import * from .xinpianchang import * from .yixia import * from .youku import * diff --git a/src/you_get/extractors/xiami.py b/src/you_get/extractors/xiami.py deleted file mode 100644 index 16656adb..00000000 --- a/src/you_get/extractors/xiami.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -__all__ = ['xiami_download'] - -from ..common import * - -from xml.dom.minidom import parseString -from urllib import parse - -def location_dec(str): - head = int(str[0]) - str = str[1:] - rows = head - cols = int(len(str)/rows) + 1 - - out = "" - full_row = len(str) % head - for c in range(cols): - for r in range(rows): - if c == (cols - 1) and r >= full_row: - continue - if r < full_row: - char = str[r*cols+c] - else: - char = str[cols*full_row+(r-full_row)*(cols-1)+c] - out += char - return parse.unquote(out).replace("^", "0") - -def xiami_download_lyric(lrc_url, file_name, output_dir): - lrc = get_content(lrc_url, headers=fake_headers) - filename = get_filename(file_name) - if len(lrc) > 0: - with open(output_dir + "/" + filename + '.lrc', 'w', encoding='utf-8') as x: - x.write(lrc) - -def xiami_download_pic(pic_url, file_name, output_dir): - from ..util.strings import get_filename - pic_url = pic_url.replace('_1', '') - pos = pic_url.rfind('.') - ext = pic_url[pos:] - pic = get_content(pic_url, headers=fake_headers, decoded=False) - if len(pic) > 0: - with open(output_dir + "/" + file_name.replace('/', '-') + ext, 'wb') as x: - x.write(pic) - -def xiami_download_song(sid, output_dir = '.', info_only = False): - xml = get_content('http://www.xiami.com/song/playlist/id/%s/object_name/default/object_id/0' % sid, headers=fake_headers) - doc = parseString(xml) - i = doc.getElementsByTagName("track")[0] - artist = i.getElementsByTagName("artist")[0].firstChild.nodeValue - album_name = i.getElementsByTagName("album_name")[0].firstChild.nodeValue - song_title = i.getElementsByTagName("name")[0].firstChild.nodeValue - url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - try: - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue - except: - pass - type_, ext, size = url_info(url, headers=fake_headers) - if not ext: - ext = 'mp3' - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%s - %s - %s" % (song_title, artist, album_name) - download_urls([url], file_name, ext, size, output_dir, headers=fake_headers) - try: - xiami_download_lyric(lrc_url, file_name, output_dir) - except: - pass - -def xiami_download_showcollect(cid, output_dir = '.', info_only = False): - html = get_content('http://www.xiami.com/song/showcollect/id/' + cid, headers=fake_headers) - collect_name = r1(r'<title>(.*)', html) - - xml = get_content('http://www.xiami.com/song/playlist/id/%s/type/3' % cid, headers=fake_headers) - doc = parseString(xml) - output_dir = output_dir + "/" + "[" + collect_name + "]" - tracks = doc.getElementsByTagName("track") - track_nr = 1 - for i in tracks: - artist=album_name=song_title=url="" - try: - song_id = i.getElementsByTagName("song_id")[0].firstChild.nodeValue - artist = i.getElementsByTagName("artist")[0].firstChild.nodeValue - album_name = i.getElementsByTagName("album_name")[0].firstChild.nodeValue - song_title = i.getElementsByTagName("title")[0].firstChild.nodeValue - url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - except: - log.e("Song %s failed. [Info Missing] artist:%s, album:%s, title:%s, url:%s" % (song_id, artist, album_name, song_title, url)) - continue - try: - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue - except: - pass - type_, ext, size = url_info(url, headers=fake_headers) - if not ext: - ext = 'mp3' - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%02d.%s - %s - %s" % (track_nr, song_title, artist, album_name) - download_urls([url], file_name, ext, size, output_dir, headers=fake_headers) - try: - xiami_download_lyric(lrc_url, file_name, output_dir) - except: - pass - - track_nr += 1 - -def xiami_download_album(aid, output_dir='.', info_only=False): - xml = get_content('http://www.xiami.com/song/playlist/id/%s/type/1' % aid, headers=fake_headers) - album_name = r1(r'', xml) - artist = r1(r'', xml) - doc = parseString(xml) - output_dir = output_dir + "/%s - %s" % (artist, album_name) - track_list = doc.getElementsByTagName('trackList')[0] - tracks = track_list.getElementsByTagName("track") - track_nr = 1 - pic_exist = False - for i in tracks: -#in this xml track tag is used for both "track in a trackList" and track no -#dirty here - if i.firstChild.nodeValue is not None: - continue - song_title = i.getElementsByTagName("songName")[0].firstChild.nodeValue - url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - try: - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue - except: - pass - if not pic_exist: - pic_url = i.getElementsByTagName("pic")[0].firstChild.nodeValue - type_, ext, size = url_info(url, headers=fake_headers) - if not ext: - ext = 'mp3' - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%02d.%s" % (track_nr, song_title) - download_urls([url], file_name, ext, size, output_dir, headers=fake_headers) - try: - xiami_download_lyric(lrc_url, file_name, output_dir) - except: - pass - if not pic_exist: - xiami_download_pic(pic_url, 'cover', output_dir) - pic_exist = True - - track_nr += 1 - -def xiami_download_mv(url, output_dir='.', merge=True, info_only=False): - # FIXME: broken merge - page = get_content(url, headers=fake_headers) - title = re.findall('([^<]+)', page)[0] - vid, uid = re.findall(r'vid:"(\d+)",uid:"(\d+)"', page)[0] - api_url = 'http://cloud.video.taobao.com/videoapi/info.php?vid=%s&uid=%s' % (vid, uid) - result = get_content(api_url, headers=fake_headers) - doc = parseString(result) - video_url = doc.getElementsByTagName("video_url")[-1].firstChild.nodeValue - length = int(doc.getElementsByTagName("length")[-1].firstChild.nodeValue) - - v_urls = [] - k_start = 0 - total_size = 0 - while True: - k_end = k_start + 20000000 - if k_end >= length: k_end = length - 1 - v_url = video_url + '/start_%s/end_%s/1.flv' % (k_start, k_end) - try: - _, ext, size = url_info(v_url) - except: - break - v_urls.append(v_url) - total_size += size - k_start = k_end + 1 - - print_info(site_info, title, ext, total_size) - if not info_only: - download_urls(v_urls, title, ext, total_size, output_dir, merge=merge, headers=fake_headers) - -def xiami_download(url, output_dir='.', merge=True, info_only=False, **kwargs): -#albums - if re.match(r'http://www.xiami.com/album/\d+', url): - id = r1(r'http://www.xiami.com/album/(\d+)', url) - xiami_download_album(id, output_dir, info_only) - elif re.match(r'http://www.xiami.com/album/\w+', url): - page = get_content(url, headers=fake_headers) - album_id = re.search(r'rel="canonical"\s+href="http://www.xiami.com/album/([^"]+)"', page).group(1) - xiami_download_album(album_id, output_dir, info_only) - -#collections - if re.match(r'http://www.xiami.com/collect/\d+', url): - id = r1(r'http://www.xiami.com/collect/(\d+)', url) - xiami_download_showcollect(id, output_dir, info_only) - -#single track - if re.match(r'http://www.xiami.com/song/\d+\b', url): - id = r1(r'http://www.xiami.com/song/(\d+)', url) - xiami_download_song(id, output_dir, info_only) - elif re.match(r'http://www.xiami.com/song/\w+', url): - html = get_content(url, headers=fake_headers) - id = r1(r'rel="canonical" href="http://www.xiami.com/song/([^"]+)"', html) - xiami_download_song(id, output_dir, info_only) - - if re.match('http://www.xiami.com/song/detail/id/\d+', url): - id = r1(r'http://www.xiami.com/song/detail/id/(\d+)', url) - xiami_download_song(id, output_dir, info_only) - - if re.match('http://www.xiami.com/mv', url): - xiami_download_mv(url, output_dir, merge=merge, info_only=info_only) - -site_info = "Xiami.com" -download = xiami_download -download_playlist = playlist_not_supported("xiami") From 92ac149d544a39821528c6882dc63ae08be5147e Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Sun, 7 Feb 2021 14:11:03 +0100 Subject: [PATCH 048/235] [tumblr] fix extraction --- src/you_get/extractors/tumblr.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py index 9a314c7f..1fdfcad0 100644 --- a/src/you_get/extractors/tumblr.py +++ b/src/you_get/extractors/tumblr.py @@ -14,7 +14,7 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): return import ssl - ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1)) + ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)) # server requires TLS v1.2 cookie_handler = request.HTTPCookieProcessor() opener = request.build_opener(ssl_context, cookie_handler) request.install_opener(opener) @@ -45,23 +45,30 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): r1(r'<title>([^<\n]*)', html) urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.jpg)', html) +\ re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.png)', html) +\ - re.findall(r'(https?://[^;"&]+/tumblr_[^";&]+_\d+\.gif)', html) + re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.gif)', html) +\ + re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.jpg)', html) +\ + re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.png)', html) +\ + re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.gif)', html) tuggles = {} for url in urls: if url.endswith('.gif'): hd_url = url elif url.endswith('.jpg'): - hd_url = r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg' # FIXME: decide actual quality + hd_url = url # FIXME: decide actual quality # r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg' elif url.endswith('.png'): - hd_url = r1(r'(.+)_\d+\.png$', url) + '_1280.png' # FIXME: decide actual quality + hd_url = url # FIXME: decide actual quality # r1(r'(.+)_\d+\.png$', url) + '_1280.png' else: continue filename = parse.unquote(hd_url.split('/')[-1]) title = '.'.join(filename.split('.')[:-1]) - tumblr_id = r1(r'^tumblr_(.+)_\d+$', title) - quality = int(r1(r'^tumblr_.+_(\d+)$', title)) + tumblr_id = r1(r'^tumblr_(.+)_\d+$', title) or title + try: + quality = int(r1(r'^tumblr_.+_(\d+)$', title)) + except: + quality = int(r1(r'/s(\d+)x\d+/', hd_url)) ext = filename.split('.')[-1] + try: size = int(get_head(hd_url)['Content-Length']) if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality: From 980ba1bc2e3c41b8c41963cdfd05bbc1b3de9801 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Mon, 8 Feb 2021 17:35:23 +0100 Subject: [PATCH 049/235] [instagram] data not none even when missing cookies --- src/you_get/extractors/instagram.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index d5100a78..0a9da345 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -22,14 +22,15 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg download_urls([stream], title, ext, size, output_dir, merge=merge) else: data = re.search(r'window\._sharedData\s*=\s*(.*);</script>', html) - if data is not None: + try: info = json.loads(data.group(1)) post = info['entry_data']['PostPage'][0] - else: + assert post + except: # with logged-in cookies data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);</script>', html) if data is not None: - log.e('[Error] Cookies needed.') + log.e('[Warning] Cookies needed.') post = json.loads(data.group(1)) if 'edge_sidecar_to_children' in post['graphql']['shortcode_media']: From 3ab931a6a0fc2fc95d18865725fc236c4f53b5e1 Mon Sep 17 00:00:00 2001 From: v4hn <me@v4hn.de> Date: Wed, 17 Feb 2021 14:10:15 +0100 Subject: [PATCH 050/235] [youtube] fix playlist extraction & comprehensive error output --- src/you_get/extractors/youtube.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index e954b1f4..7e4c74f0 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -157,7 +157,12 @@ class YouTube(VideoExtractor): log.wtf('[Failed] Unsupported URL pattern.') video_page = get_content('https://www.youtube.com/playlist?list=%s' % playlist_id) - ytInitialData = json.loads(match1(video_page, r'window\["ytInitialData"\]\s*=\s*(.+);')) + playlist_json_serialized = match1(video_page, r'window\["ytInitialData"\]\s*=\s*(.+);', r'var\s+ytInitialData\s*=\s*([^;]+);') + + if len(playlist_json_serialized) == 0: + log.wtf('[Failed] Unable to extract playlist data') + + ytInitialData = json.loads(playlist_json_serialized[0]) tab0 = ytInitialData['contents']['twoColumnBrowseResultsRenderer']['tabs'][0] itemSection0 = tab0['tabRenderer']['content']['sectionListRenderer']['contents'][0] From 28af439fcadcd37e6171baddda433ffa0fe5a684 Mon Sep 17 00:00:00 2001 From: SHMoney2021 <15300607716@163.com> Date: Mon, 8 Mar 2021 08:52:43 +0800 Subject: [PATCH 051/235] fix page count error in 'space_video' case --- src/you_get/extractors/bilibili.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index cdcccf20..a812d72d 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -4,6 +4,8 @@ from ..common import * from ..extractor import VideoExtractor import hashlib +import math + class Bilibili(VideoExtractor): name = "Bilibili" @@ -734,7 +736,8 @@ class Bilibili(VideoExtractor): api_url = self.bilibili_space_video_api(mid) api_content = get_content(api_url, headers=self.bilibili_headers()) videos_info = json.loads(api_content) - pc = videos_info['data']['page']['count'] // videos_info['data']['page']['ps'] + # pc = videos_info['data']['page']['count'] // videos_info['data']['page']['ps'] + pc = math.ceil(videos_info['data']['page']['count'] / videos_info['data']['page']['ps']) for pn in range(1, pc + 1): api_url = self.bilibili_space_video_api(mid, pn=pn) From 439354e730d8b864de9401536c93220467ccb355 Mon Sep 17 00:00:00 2001 From: lcjh <120989324@qq.com> Date: Wed, 24 Mar 2021 17:48:59 +0000 Subject: [PATCH 052/235] add HDR support for bilibili --- src/you_get/extractors/bilibili.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index a812d72d..a696b398 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -12,6 +12,8 @@ class Bilibili(VideoExtractor): # Bilibili media encoding options, in descending quality order. stream_types = [ + {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280, + 'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'}, {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'}, {'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280, From 25204d8841cdbbcad4f5df357c80853f34286025 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Mon, 29 Mar 2021 16:38:54 +0200 Subject: [PATCH 053/235] [test] remove test_bilibili (videos deleted) --- tests/test.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/test.py b/tests/test.py index 0f7595b3..5a86ee8f 100644 --- a/tests/test.py +++ b/tests/test.py @@ -40,14 +40,6 @@ class YouGetTests(unittest.TestCase): def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) - def test_bilibili(self): - bilibili.download( - "https://www.bilibili.com/watchlater/#/BV1PE411q7mZ/p6", info_only=True - ) - bilibili.download( - "https://www.bilibili.com/watchlater/#/av74906671/p6", info_only=True - ) - def test_soundcloud(self): ## single song soundcloud.download( From 17eff492fe1db0c378f2447b54c5f09ed98b2626 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Mon, 29 Mar 2021 16:44:07 +0200 Subject: [PATCH 054/235] version 0.4.1520 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 95505814..60bff607 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1500' +__version__ = '0.4.1520' From ef9ff72183acd93b1b10b2b836d145447cceb016 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Fri, 2 Apr 2021 01:44:36 +0200 Subject: [PATCH 055/235] [bilibili] fix a bug for non-interative multi-part videos (https://github.com/soimort/you-get/pull/2746#pullrequestreview-626492105) --- src/you_get/extractors/bilibili.py | 32 ++++++++++-------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index a696b398..644c5af4 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -181,7 +181,7 @@ class Bilibili(VideoExtractor): self.download_playlist_by_url(self.url, **kwargs) return - # regular av video + # regular video if sort == 'video': initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME initial_state = json.loads(initial_state_text) @@ -601,13 +601,21 @@ class Bilibili(VideoExtractor): log.e('[Error] Unsupported URL pattern.') exit(1) - # regular av video + # regular video if sort == 'video': initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME initial_state = json.loads(initial_state_text) aid = initial_state['videoData']['aid'] pn = initial_state['videoData']['videos'] - if pn!= len(initial_state['videoData']['pages']):#interaction video 互动视频 + + if pn == len(initial_state['videoData']['pages']): + # non-interative video + for pi in range(1, pn + 1): + purl = 'https://www.bilibili.com/video/av%s?p=%s' % (aid, pi) + self.__class__().download_by_url(purl, **kwargs) + + else: + # interative video search_node_list = [] download_cid_set = set([initial_state['videoData']['cid']]) params = { @@ -658,24 +666,6 @@ class Bilibili(VideoExtractor): self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams] self.extract(**kwargs) self.download(**kwargs) - else: - playinfo_text = match1(html_content, r'__playinfo__=(.*?)</script><script>') # FIXME - playinfo = json.loads(playinfo_text) if playinfo_text else None - - html_content_ = get_content(self.url, headers=self.bilibili_headers(cookie='CURRENT_FNVAL=16')) - playinfo_text_ = match1(html_content_, r'__playinfo__=(.*?)</script><script>') # FIXME - playinfo_ = json.loads(playinfo_text_) if playinfo_text_ else None - p = int(match1(self.url, r'[\?&]p=(\d+)') or match1(self.url, r'/index_(\d+)') or '1')-1 - for pi in range(p,pn): - self.prepare_by_cid(aid,initial_state['videoData']['pages'][pi]['cid'],'%s (P%s. %s)' % (initial_state['videoData']['title'], pi+1, initial_state['videoData']['pages'][pi]['part']),html_content,playinfo,playinfo_,url) - try: - self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams] - except: - self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams] - self.extract(**kwargs) - self.download(**kwargs) - # purl = 'https://www.bilibili.com/video/av%s?p=%s' % (aid, pi+1) - # self.__class__().download_by_url(purl, **kwargs) elif sort == 'bangumi': initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME From 1c841f7e8ce60130572a8f03fb038eda99deff6a Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Sun, 4 Apr 2021 02:59:00 +0200 Subject: [PATCH 056/235] [bilibili] redirect /s/ URLs --- src/you_get/extractors/bilibili.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 644c5af4..edb656c7 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -162,6 +162,11 @@ class Bilibili(VideoExtractor): self.url = 'https://www.bilibili.com/bangumi/play/ep%s' % ep_id html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url)) + # redirect: s + elif re.match(r'https?://(www\.)?bilibili\.com/s/(.+)', self.url): + self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)') + html_content = get_content(self.url, headers=self.bilibili_headers()) + # sort it out if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url): sort = 'audio' From 144886840212d5d0ee059858e6493dd265927376 Mon Sep 17 00:00:00 2001 From: flewsea <w48325832@gmail.com> Date: Fri, 7 May 2021 00:20:30 +0800 Subject: [PATCH 057/235] skip private video --- src/you_get/extractors/iwara.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/iwara.py b/src/you_get/extractors/iwara.py index 67a41d41..37cd712a 100644 --- a/src/you_get/extractors/iwara.py +++ b/src/you_get/extractors/iwara.py @@ -27,6 +27,9 @@ def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs): api_url = video_url + '/api/video/' + video_hash content = get_content(api_url, headers=headers) data = json.loads(content) + if len(data)<1 : + print('Maybe is Private Video?'+'['+title+']') + return True; down_urls = 'https:' + data[0]['uri'] type, ext, size = url_info(down_urls, headers=headers) print_info(site_info, title+data[0]['resolution'], type, size) @@ -35,10 +38,8 @@ def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs): download_urls([down_urls], title, ext, size, output_dir, merge=merge, headers=headers) def download_playlist_by_url( url, **kwargs): - video_page = get_content(url) - # url_first=re.findall(r"(http[s]?://[^/]+)",url) + video_page = get_html(url) url_first=match1(url, r"(http[s]?://[^/]+)") - # print (url_first) videos = set(re.findall(r'<a href="(/videos/[^"]+)"', video_page)) if(len(videos)>0): for video in videos: From 663e53a95f5435afd95a03bb8b16c6d1fac283a5 Mon Sep 17 00:00:00 2001 From: zhouyuan1 <zhouyuan1@staff.weibo.com> Date: Thu, 20 May 2021 13:03:03 +0800 Subject: [PATCH 058/235] add param m3u8 , allow download video via m3u8 url --- src/you_get/common.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 67ef581b..6e619c11 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -1321,7 +1321,13 @@ def download_main(download, download_playlist, urls, playlist, **kwargs): if re.match(r'https?://', url) is None: url = 'http://' + url - if playlist: + if m3u8: + if output_filename: + title = output_filename + else: + title = "m3u8file" + download_url_ffmpeg(url=url, title=title,ext = 'mp4',output_dir = '.') + elif playlist: download_playlist(url, **kwargs) else: download(url, **kwargs) @@ -1425,7 +1431,6 @@ def set_socks_proxy(proxy): proxy_info = proxy.split("@") socks_proxy_addrs = proxy_info[1].split(':') socks_proxy_auth = proxy_info[0].split(":") - print(socks_proxy_auth[0]+" "+socks_proxy_auth[1]+" "+socks_proxy_addrs[0]+" "+socks_proxy_addrs[1]) socks.set_default_proxy( socks.SOCKS5, socks_proxy_addrs[0], @@ -1436,7 +1441,6 @@ def set_socks_proxy(proxy): ) else: socks_proxy_addrs = proxy.split(':') - print(socks_proxy_addrs[0]+" "+socks_proxy_addrs[1]) socks.set_default_proxy( socks.SOCKS5, socks_proxy_addrs[0], @@ -1601,6 +1605,10 @@ def script_main(download, download_playlist, **kwargs): download_grp.add_argument('--stream', help=argparse.SUPPRESS) download_grp.add_argument('--itag', help=argparse.SUPPRESS) + download_grp.add_argument('-m', '--m3u8', action='store_true', default=False, + help = 'download vide using an m3u8 url') + + parser.add_argument('URL', nargs='*', help=argparse.SUPPRESS) args = parser.parse_args() @@ -1626,6 +1634,7 @@ def script_main(download, download_playlist, **kwargs): global output_filename global auto_rename global insecure + global m3u8 output_filename = args.output_filename extractor_proxy = args.extractor_proxy @@ -1647,6 +1656,9 @@ def script_main(download, download_playlist, **kwargs): if args.cookies: load_cookies(args.cookies) + if args.m3u8: + m3u8 = True + caption = True stream_id = args.format or args.stream or args.itag if args.no_caption: From 5104dd3058c342998b32dabdd47230c5f3af75c6 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Mon, 24 May 2021 15:38:02 +0200 Subject: [PATCH 059/235] [youtube] close #2890 --- src/you_get/extractors/youtube.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 7e4c74f0..487869cf 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -195,8 +195,9 @@ class YouTube(VideoExtractor): # Get video info # 'eurl' is a magic parameter that can bypass age restriction # full form: 'eurl=https%3A%2F%2Fyoutube.googleapis.com%2Fv%2F{VIDEO_ID}' - video_info = parse.parse_qs(get_content('https://www.youtube.com/get_video_info?video_id={}&eurl=https%3A%2F%2Fy'.format(self.vid))) - logging.debug('STATUS: %s' % video_info['status'][0]) + #video_info = parse.parse_qs(get_content('https://www.youtube.com/get_video_info?video_id={}&eurl=https%3A%2F%2Fy'.format(self.vid))) + #logging.debug('STATUS: %s' % video_info['status'][0]) + video_info = {'status': ['ok'], 'use_cipher_signature': 'True'} ytplayer_config = None if 'status' not in video_info: @@ -253,11 +254,16 @@ class YouTube(VideoExtractor): else: # Parse video page instead video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) - self.title = json.loads(ytplayer_config["args"]["player_response"])["videoDetails"]["title"] - self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] - stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + + self.title = ytInitialPlayerResponse["videoDetails"]["title"] + if re.search('([^"]*/base\.js)"', video_page): + self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) + else: + self.html5player = None + + stream_list = ytInitialPlayerResponse['streamingData']['formats'] elif video_info['status'] == ['fail']: logging.debug('ERRORCODE: %s' % video_info['errorcode'][0]) From e1db00a8fdc228f695b7c643823303ab7e3fe0f0 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Mon, 24 May 2021 15:38:53 +0200 Subject: [PATCH 060/235] test on Python 3.9 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b3d50ff7..daae6668 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.5, 3.6, 3.7, 3.8, pypy3] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9, pypy3] steps: - uses: actions/checkout@v2 From f9fbe8fae02a6ada9f64e0dcfbb960c856701ad6 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Mon, 24 May 2021 15:43:24 +0200 Subject: [PATCH 061/235] version 0.4.1525 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 60bff607..63b4af56 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1520' +__version__ = '0.4.1525' From 0fcbe3c5a79142f141a55cdd6705f562384b0eed Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Tue, 1 Jun 2021 18:14:56 +0200 Subject: [PATCH 062/235] [youtube] s_to_sig: add prefix to prevent namespace pollution --- src/you_get/extractors/youtube.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 487869cf..58614c5f 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -76,11 +76,13 @@ class YouTube(VideoExtractor): # - https://www.youtube.com/yts/jsbin/player_ias-vfl_RGK2l/en_US/base.js # - https://www.youtube.com/yts/jsbin/player-vflRjqq_w/da_DK/base.js # - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js + # - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js def tr_js(code): code = re.sub(r'function', r'def', code) - code = re.sub(r'(\W)(as|if|in|is|or)\(', r'\1_\2(', code) + # add prefix '_sig_' to prevent namespace pollution + code = re.sub(r'(\W)([$\w][$\w])\(', r'\1_sig_\2(', code) code = re.sub(r'\$', '_dollar', code) - code = re.sub(r'\{', r':\n\t', code) + code = re.sub(r'\{', r': ', code) code = re.sub(r'\}', r'\n', code) code = re.sub(r'var\s+', r'', code) code = re.sub(r'(\w+).join\(""\)', r'"".join(\1)', code) @@ -99,7 +101,7 @@ class YouTube(VideoExtractor): f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \ match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) - f1def = 'function main_%s%s' % (f1, f1def) # prefix to avoid potential namespace conflict + f1def = 'function %s%s' % (f1, f1def) code = tr_js(f1def) f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def)) for f2 in f2s: @@ -112,13 +114,13 @@ class YouTube(VideoExtractor): f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2)) f2 = re.sub(r'(as|if|in|is|or)', r'_\1', f2) f2 = re.sub(r'\$', '_dollar', f2) - code = code + 'global %s\n' % f2 + tr_js(f2def) + code = code + 'global _sig_%s\n' % f2 + tr_js(f2def) f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1) f1 = re.sub(r'\$', '_dollar', f1) - code = code + 'sig=main_%s(s)' % f1 # prefix to avoid potential namespace conflict + code = code + '_sig=_sig_%s(s)' % f1 exec(code, globals(), locals()) - return locals()['sig'] + return locals()['_sig'] def chunk_by_range(url, size): urls = [] From edb358d0b878ba12824b4a39dd5f28cab41a051f Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Tue, 1 Jun 2021 18:33:26 +0200 Subject: [PATCH 063/235] version 0.4.1527 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 63b4af56..70ca2ef5 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1525' +__version__ = '0.4.1527' From d40a15de5c15babe7daa4eb939e5c2f50a90f68c Mon Sep 17 00:00:00 2001 From: nuomi1 <nuomi1@qq.com> Date: Fri, 4 Jun 2021 22:35:20 +0800 Subject: [PATCH 064/235] fix: url --- src/you_get/extractors/missevan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/missevan.py b/src/you_get/extractors/missevan.py index c2b25a90..5018852f 100644 --- a/src/you_get/extractors/missevan.py +++ b/src/you_get/extractors/missevan.py @@ -353,7 +353,7 @@ class MissEvan(VideoExtractor): @staticmethod def url_resource(uri): - return 'https://static.missevan.com/' + uri + return uri if re.match(r'^https?:/{2}\w.+$', uri) else 'https://static.missevan.com/' + uri site = MissEvan() site_info = 'MissEvan.com' From 3881ed3f949ee63d53fb8f7f7b1e28c4f654ab25 Mon Sep 17 00:00:00 2001 From: nuomi1 <nuomi1@qq.com> Date: Fri, 4 Jun 2021 22:35:50 +0800 Subject: [PATCH 065/235] feat: new missevan_stream_types --- src/you_get/extractors/missevan.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/you_get/extractors/missevan.py b/src/you_get/extractors/missevan.py index 5018852f..b7f413f2 100644 --- a/src/you_get/extractors/missevan.py +++ b/src/you_get/extractors/missevan.py @@ -75,17 +75,13 @@ class _Dispatcher(object): raise _NoMatchException() missevan_stream_types = [ - {'id': 'source', 'quality': '源文件', 'url_json_key': 'soundurl', - 'resource_url_fmt': 'sound/{resource_url}'}, - {'id': '320', 'quality': '320 Kbps', 'url_json_key': 'soundurl_64'}, + {'id': 'source', 'quality': '源文件', 'url_json_key': 'soundurl'}, {'id': '128', 'quality': '128 Kbps', 'url_json_key': 'soundurl_128'}, - {'id': '32', 'quality': '32 Kbps', 'url_json_key': 'soundurl_32'}, {'id': 'covers', 'desc': '封面图', 'url_json_key': 'cover_image', 'default_src': 'covers/nocover.png', 'resource_url_fmt': 'covers/{resource_url}'}, - {'id': 'coversmini', 'desc': '封面缩略图', 'url_json_key': 'cover_image', - 'default_src': 'coversmini/nocover.png', - 'resource_url_fmt': 'coversmini/{resource_url}'} + {'id': 'coversmini', 'desc': '封面缩略图', 'url_json_key': 'front_cover', + 'default_src': 'coversmini/nocover.png'} ] def _get_resource_uri(data, stream_type): From 5445f5ecde1823a200ec24ea0b74328b95299b57 Mon Sep 17 00:00:00 2001 From: Philip Xu <pyx@xrefactor.com> Date: Tue, 22 Jun 2021 17:51:17 -0400 Subject: [PATCH 066/235] Update douyin.py The site douyin.com changed to a more user friendly website, this is the updated extractor for that. --- src/you_get/extractors/douyin.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/you_get/extractors/douyin.py b/src/you_get/extractors/douyin.py index e39775f4..5de1fdd9 100644 --- a/src/you_get/extractors/douyin.py +++ b/src/you_get/extractors/douyin.py @@ -1,7 +1,7 @@ # coding=utf-8 import re -import json +from urllib.parse import unquote from ..common import ( url_size, @@ -18,17 +18,17 @@ __all__ = ['douyin_download_by_url'] def douyin_download_by_url(url, **kwargs): page_content = get_content(url, headers=fake_headers) - match_rule = re.compile(r'var data = \[(.*?)\];') - video_info = json.loads(match_rule.findall(page_content)[0]) - video_url = video_info['video']['play_addr']['url_list'][0] - # fix: https://www.douyin.com/share/video/6553248251821165832 - # if there is no title, use desc - cha_list = video_info['cha_list'] - if cha_list: - title = cha_list[0]['cha_name'] - else: - title = video_info['desc'] + # The easiest way to get the title is, obviously, from <title> + title = re.findall(r'<title.*>(.*)', page_content)[0].strip() + # Remove the site name from title + site_name = ' - 抖音' + if title.endswith(site_name): + title = title[:-len(site_name)] video_format = 'mp4' + # The video url is url escaped, as of today, there are 4 working CDN video + # urls for the same video, I chose the shortest one. + cdn_pattern = r'(api\.amemv\.com.*PackSourceEnum_AWEME_DETAIL)' + video_url = 'https://' + unquote(re.findall(cdn_pattern, page_content)[0]) size = url_size(video_url, faker=True) print_info( site_info='douyin.com', title=title, From b9dbae8b40588bc134af6bb7406492ec68fffded Mon Sep 17 00:00:00 2001 From: zhancat200801 Date: Tue, 29 Jun 2021 13:18:09 +0800 Subject: [PATCH 067/235] modify qq.py --- src/you_get/extractors/qq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/you_get/extractors/qq.py b/src/you_get/extractors/qq.py index 6411b195..e38770e9 100644 --- a/src/you_get/extractors/qq.py +++ b/src/you_get/extractors/qq.py @@ -35,6 +35,7 @@ def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False): part_urls= [] total_size = 0 + ext = None for part in range(1, seg_cnt+1): if fc_cnt == 0: # fix json parsing error From 71780ae4aa4a47fd817dc4c8485ce4aac96a8633 Mon Sep 17 00:00:00 2001 From: Philip Xu Date: Tue, 6 Jul 2021 06:24:52 -0400 Subject: [PATCH 068/235] Update douyin.py Updated the extractors for the latest www.douyin.com --- src/you_get/extractors/douyin.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/you_get/extractors/douyin.py b/src/you_get/extractors/douyin.py index 5de1fdd9..8067b1b5 100644 --- a/src/you_get/extractors/douyin.py +++ b/src/you_get/extractors/douyin.py @@ -1,6 +1,7 @@ # coding=utf-8 import re +import json from urllib.parse import unquote from ..common import ( @@ -18,17 +19,17 @@ __all__ = ['douyin_download_by_url'] def douyin_download_by_url(url, **kwargs): page_content = get_content(url, headers=fake_headers) - # The easiest way to get the title is, obviously, from - title = re.findall(r'<title.*>(.*)', page_content)[0].strip() - # Remove the site name from title - site_name = ' - 抖音' - if title.endswith(site_name): - title = title[:-len(site_name)] + # The video player and video source are rendered client-side, the data + # contains in a ', html) + data = re.search(r'window\._sharedData\s*=\s*(.*);', cont) try: info = json.loads(data.group(1)) post = info['entry_data']['PostPage'][0] assert post except: # with logged-in cookies - data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);', html) + data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);', cont) if data is not None: log.e('[Warning] Cookies needed.') post = json.loads(data.group(1)) From ad24e68baa5106522c6d4a37395896e9776ea88f Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 11 Jul 2021 18:46:41 +0200 Subject: [PATCH 072/235] version 0.4.1536 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 70ca2ef5..dd603918 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1527' +__version__ = '0.4.1536' From f4ec55b00e5b8743727b138d06f6ffadadd49579 Mon Sep 17 00:00:00 2001 From: Tim Gates Date: Sat, 31 Jul 2021 09:34:46 +1000 Subject: [PATCH 073/235] docs: Fix a few typos There are small typos in: - src/you_get/extractors/flickr.py - src/you_get/extractors/mtv81.py - src/you_get/extractors/qingting.py Fixes: - Should read `several` rather than `serveral`. - Should read `channel` rather than `chaanel`. - Should read `approach` rather than `approch`. --- src/you_get/extractors/flickr.py | 2 +- src/you_get/extractors/mtv81.py | 2 +- src/you_get/extractors/qingting.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/flickr.py b/src/you_get/extractors/flickr.py index 2535dd1c..79fca4ff 100644 --- a/src/you_get/extractors/flickr.py +++ b/src/you_get/extractors/flickr.py @@ -73,7 +73,7 @@ def get_api_key(page): match = match1(page, pattern_inline_api_key) # this happens only when the url points to a gallery page # that contains no inline api_key(and never makes xhr api calls) - # in fact this might be a better approch for getting a temporary api key + # in fact this might be a better approach for getting a temporary api key # since there's no place for a user to add custom information that may # misguide the regex in the homepage if not match: diff --git a/src/you_get/extractors/mtv81.py b/src/you_get/extractors/mtv81.py index b92f74bc..ef432159 100644 --- a/src/you_get/extractors/mtv81.py +++ b/src/you_get/extractors/mtv81.py @@ -28,7 +28,7 @@ def mtv81_download(url, output_dir='.', merge=True, info_only=False, **kwargs): # # rtmpdump -r 'rtmpe://cp30865.edgefcs.net/ondemand/mtviestor/_!/intlod/MTVInternational/MBUS/GeoLocals/00JP/VIAMTVI/PYC/201304/7122HVAQ4/00JPVIAMTVIPYC7122HVAQ4_640x_360_1200_m30.mp4' -o "title.mp4" --swfVfy http://media.mtvnservices.com/player/prime/mediaplayerprime.1.10.8.swf # - # because rtmpdump is unstable,may try serveral times + # because rtmpdump is unstable,may try several times # if not info_only: # import pdb diff --git a/src/you_get/extractors/qingting.py b/src/you_get/extractors/qingting.py index 9859d4be..8dd1b14f 100644 --- a/src/you_get/extractors/qingting.py +++ b/src/you_get/extractors/qingting.py @@ -10,7 +10,7 @@ __all__ = ['qingting_download_by_url'] class Qingting(VideoExtractor): # every resource is described by its channel id and program id - # so vid is tuple (chaanel_id, program_id) + # so vid is tuple (channel_id, program_id) name = 'Qingting' stream_types = [ From b97e9484430e2344d3c332d39880ba3ae68890e8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 15 Aug 2021 05:54:36 +0200 Subject: [PATCH 074/235] [youtube] tr_js: support 3-char main function names --- src/you_get/extractors/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 58614c5f..81b45ac5 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -77,10 +77,11 @@ class YouTube(VideoExtractor): # - https://www.youtube.com/yts/jsbin/player-vflRjqq_w/da_DK/base.js # - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js # - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js + # - https://www.youtube.com/s/player/50e823fc/player_ias.vflset/sv_SE/base.js def tr_js(code): code = re.sub(r'function', r'def', code) # add prefix '_sig_' to prevent namespace pollution - code = re.sub(r'(\W)([$\w][$\w])\(', r'\1_sig_\2(', code) + code = re.sub(r'(\W)([$\w][$\w][$\w]?)\(', r'\1_sig_\2(', code) code = re.sub(r'\$', '_dollar', code) code = re.sub(r'\{', r': ', code) code = re.sub(r'\}', r'\n', code) From 09cd505311af7dff87d344436a7f4a87b1763cd1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 15 Aug 2021 06:39:48 +0200 Subject: [PATCH 075/235] [test] disable test_soundcloud temporarily --- tests/test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 5a86ee8f..1989455f 100644 --- a/tests/test.py +++ b/tests/test.py @@ -42,9 +42,9 @@ class YouGetTests(unittest.TestCase): def test_soundcloud(self): ## single song - soundcloud.download( - 'https://soundcloud.com/keiny-pham/impure-bird', info_only=True - ) + #soundcloud.download( + # 'https://soundcloud.com/keiny-pham/impure-bird', info_only=True + #) ## playlist #soundcloud.download( # 'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True From e1e1503b08c037ec0eb06c6c951240d004cf342b Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 15 Aug 2021 06:44:15 +0200 Subject: [PATCH 076/235] [test] disable test_soundcloud temporarily --- tests/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test.py b/tests/test.py index 1989455f..4a2a117c 100644 --- a/tests/test.py +++ b/tests/test.py @@ -40,7 +40,7 @@ class YouGetTests(unittest.TestCase): def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) - def test_soundcloud(self): + #def test_soundcloud(self): ## single song #soundcloud.download( # 'https://soundcloud.com/keiny-pham/impure-bird', info_only=True From 5498c377ff3e8c03831ecef9defe18bea8b4937d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 15 Aug 2021 06:47:09 +0200 Subject: [PATCH 077/235] version 0.4.1545 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index dd603918..f7daa7f8 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1536' +__version__ = '0.4.1545' From ea5f712cb0a420e26ea6e883014159deb584c43b Mon Sep 17 00:00:00 2001 From: hong56hk Date: Thu, 26 Aug 2021 00:12:52 +0800 Subject: [PATCH 078/235] fix for using insecure flag does not work --- src/you_get/common.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 597ed45a..5aa74a20 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -343,7 +343,12 @@ def undeflate(data): # DEPRECATED in favor of get_content() def get_response(url, faker=False): logging.debug('get_response: %s' % url) - + ctx = None + if insecure: + # ignore ssl errors + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE # install cookies if cookies: opener = request.build_opener(request.HTTPCookieProcessor(cookies)) @@ -351,10 +356,10 @@ def get_response(url, faker=False): if faker: response = request.urlopen( - request.Request(url, headers=fake_headers), None + request.Request(url, headers=fake_headers), None, context=ctx, ) else: - response = request.urlopen(url) + response = request.urlopen(url, context=ctx) data = response.read() if response.info().get('Content-Encoding') == 'gzip': From 63fd9716a8740fc6862b70a474e398ca6e9f26bd Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 27 Aug 2021 05:14:00 +0200 Subject: [PATCH 079/235] [universal] fix blogger --- src/you_get/extractors/universal.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py index abc69475..fdc7426d 100644 --- a/src/you_get/extractors/universal.py +++ b/src/you_get/extractors/universal.py @@ -70,12 +70,13 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg '[-_][6-9]\d\dx1\d\d\d\.jpe?g', '[-_][6-9]\d\dx[6-9]\d\d\.jpe?g', 's1600/[\w%]+\.jpe?g', # blogger + 'blogger\.googleusercontent\.com/img/a/\w*', # blogger 'img[6-9]\d\d/[\w%]+\.jpe?g' # oricon? ] urls = [] for i in media_exts: - urls += re.findall(r'(https?://[^ ;&"\'\\<>]+' + i + r'[^ ;&"\'\\<>]*)', page) + urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ ;&"\'\\<>]*)', page) p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page) urls += [parse.unquote(url) for url in p_urls] From d896342862b2fc18448fe2b21054af8ba27f8dbc Mon Sep 17 00:00:00 2001 From: Ziding Zhang Date: Mon, 20 Sep 2021 14:41:37 +0100 Subject: [PATCH 080/235] Create SECURITY.md I'd like to report a security issue but cannot find contact instructions on your repository. If not a hassle, might you kindly add a SECURITY.md file with an email, or another contact method? GitHub [recommends](https://docs.github.com/en/code-security/getting-started/adding-a-security-policy-to-your-repository) this best practice to ensure security issues are responsibly disclosed, and it would serve as a simple instruction for security researchers in the future. Thank you for your consideration! --- SECURITY.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..5041b2f2 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,5 @@ +# Security Policy + +## Reporting a Vulnerability + +Please report security issues to From 7c2523f5cc843a6622968f0eed8dcb9a49b62b00 Mon Sep 17 00:00:00 2001 From: Jian Wang Date: Thu, 7 Oct 2021 15:42:37 +0800 Subject: [PATCH 081/235] support channel/series for bilibili --- src/you_get/extractors/bilibili.py | 20 ++++++++++++++++++++ tests/test.py | 3 +++ 2 files changed, 23 insertions(+) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index edb656c7..36de363c 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -112,6 +112,10 @@ class Bilibili(VideoExtractor): def bilibili_space_channel_api(mid, cid, pn=1, ps=100): return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps) + @staticmethod + def bilibili_series_archives_api(mid, sid, pn=1, ps=100): + return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps) + @staticmethod def bilibili_space_favlist_api(fid, pn=1, ps=20): return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps) @@ -596,6 +600,8 @@ class Bilibili(VideoExtractor): sort = 'video' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/detail\?.*cid=(\d+)', self.url): sort = 'space_channel' + elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url): + sort = 'space_channel_series' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url): sort = 'space_favlist' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/video', self.url): @@ -706,6 +712,20 @@ class Bilibili(VideoExtractor): url = 'https://www.bilibili.com/video/av%s' % video['aid'] self.__class__().download_playlist_by_url(url, **kwargs) + elif sort == 'space_channel_series': + m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url) + mid, sid = m.group(1), m.group(2) + api_url = self.bilibili_series_archives_api(mid, sid) + api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) + archives_info = json.loads(api_content) + # TBD: channel of more than 100 videos + + epn, i = len(archives_info['data']['archives']), 0 + for video in archives_info['data']['archives']: + i += 1; log.w('Extracting %s of %s videos ...' % (i, epn)) + url = 'https://www.bilibili.com/video/av%s' % video['aid'] + self.__class__().download_playlist_by_url(url, **kwargs) + elif sort == 'space_favlist': m = re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url) vmid, fid = m.group(1), m.group(2) diff --git a/tests/test.py b/tests/test.py index 4a2a117c..8ae622b2 100644 --- a/tests/test.py +++ b/tests/test.py @@ -40,6 +40,9 @@ class YouGetTests(unittest.TestCase): def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) + def test_bilibili(self): + bilibili.download('https://space.bilibili.com/72270557/channel/seriesdetail?sid=218844', info_only=True) + #def test_soundcloud(self): ## single song #soundcloud.download( From 798ad6d14eb23ee0754ead224494911fcf27cd68 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 20 Oct 2021 18:15:51 +0200 Subject: [PATCH 082/235] [universal] tweak --- src/you_get/extractors/universal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py index fdc7426d..4a3268ab 100644 --- a/src/you_get/extractors/universal.py +++ b/src/you_get/extractors/universal.py @@ -76,7 +76,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg urls = [] for i in media_exts: - urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ ;&"\'\\<>]*)', page) + urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ =?;&"\'\\<>]*)', page) p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page) urls += [parse.unquote(url) for url in p_urls] From c064013b9c19c6225b6edd818ced0f4003d22854 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sun, 24 Oct 2021 07:58:34 +0200 Subject: [PATCH 083/235] Test on Python 3.10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit “3.10” must be quoted or yaml will treat it as 3.1. --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index daae6668..b23455c8 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, pypy3] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10", pypy3] steps: - uses: actions/checkout@v2 From 3c8382d2af3ef9239f64f6524cbe1abeb949729c Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sun, 24 Oct 2021 08:01:32 +0200 Subject: [PATCH 084/235] strategy: fail-fast: false --- .github/workflows/python-package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b23455c8..75231110 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -14,6 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: + fail-fast: false matrix: python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10", pypy3] From 968334acb48c79bc801ca61ee1d0f77d570ccadb Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sun, 24 Oct 2021 08:06:43 +0200 Subject: [PATCH 085/235] "Programming Language :: Python :: 3.10", --- you-get.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/you-get.json b/you-get.json index e98e2e8a..a9ef1dd5 100644 --- a/you-get.json +++ b/you-get.json @@ -25,6 +25,8 @@ "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Topic :: Internet", "Topic :: Internet :: WWW/HTTP", "Topic :: Multimedia", From c4603bbd2a4377ea2ff6f9c16a280e8a01a8583c Mon Sep 17 00:00:00 2001 From: Chuang Zhu Date: Tue, 2 Nov 2021 12:45:41 +0800 Subject: [PATCH 086/235] [bilibili] fix 'NoneType' object is not subscriptable --- src/you_get/extractors/bilibili.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index edb656c7..38ff368d 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -193,10 +193,12 @@ class Bilibili(VideoExtractor): playinfo_text = match1(html_content, r'__playinfo__=(.*?)', cont) From 15393a8218faf96af88f506fa1a1a51a57fd31b3 Mon Sep 17 00:00:00 2001 From: liguangbin Date: Sat, 19 Mar 2022 22:32:57 +0800 Subject: [PATCH 104/235] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E8=A5=BF=E7=93=9C?= =?UTF-8?q?=E8=A7=86=E9=A2=91=E4=B8=8B=E8=BD=BD=E5=A4=B1=E8=B4=A5=E9=97=AE?= =?UTF-8?q?=E9=A2=98;=20fix=20problem:the=20JSON=20object=20must=20be=20st?= =?UTF-8?q?r,=20bytes=20or=20bytearray,=20not=20NoneType?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/you_get/extractors/ixigua.py | 119 ++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 42 deletions(-) diff --git a/src/you_get/extractors/ixigua.py b/src/you_get/extractors/ixigua.py index 2f11e7f9..57119eb6 100644 --- a/src/you_get/extractors/ixigua.py +++ b/src/you_get/extractors/ixigua.py @@ -80,59 +80,94 @@ def get_video_url_from_video_id(video_id): return url -def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs): +def ixigua_download(url, output_dir='.', merge=True, info_only=False, stream_id='', **kwargs): # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422 - resp = urlopen_with_retry(request.Request(url)) + headers['cookie'] = "MONITOR_WEB_ID=7892c49b-296e-4499-8704-e47c1b15123; " \ + "ixigua-a-s=1; ttcid=af99669b6304453480454f1507011d5c234; BD_REF=1; " \ + "__ac_nonce=060d88ff000a75e8d17eb; __ac_signature=_02B4Z6wo100f01kX9ZpgAAIDAKIBBQUIPYT5F2WIAAPG2ad; " \ + "ttwid=1%7CcIsVF_3vqSIk4XErhPB0H2VaTxT0tdsTMRbMjrJOPN8%7C1624806049%7C08ce7dd6f7d20506a41ba0a331ef96a6505d96731e6ad9f6c8c709f53f227ab1" + + resp = urlopen_with_retry(request.Request(url, headers=headers)) html = resp.read().decode('utf-8') _cookies = [] for c in resp.getheader('Set-Cookie').split("httponly,"): _cookies.append(c.strip().split(' ')[0]) - headers['cookie'] = ' '.join(_cookies) + headers['cookie'] += ';'.join(_cookies) - conf = loads(match1(html, r"window\.config = (.+);")) - if not conf: - log.e("Get window.config from url failed, url: {}".format(url)) + match_txt = match1(html, r"', html) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', + 'Accept-Encoding': 'gzip, deflate', + 'Accept': '*/*', + 'Connection': 'keep-alive' # important + } + + html = getHttps(host, url, headers=headers) + data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) info = json.loads(data) - wid = info['props']['initialProps']['$wid'] - cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) + downloadAddr = info['ItemModule'][vid]['video']['downloadAddr'] + author = info['ItemModule'][vid]['author'] # same as uniqueId + nickname = info['UserModule']['users'][author]['nickname'] + title = '%s [%s]' % (nickname or author, vid) - # here's the cookie - headers['Cookie'] = cookie - - # try again - html = get_content(url, headers=headers) - data = r1(r'', html) - info = json.loads(data) - wid = info['props']['initialProps']['$wid'] - cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) - - videoData = info['props']['pageProps']['itemInfo']['itemStruct'] - videoId = videoData['id'] - videoUrl = videoData['video']['downloadAddr'] - uniqueId = videoData['author'].get('uniqueId') - nickName = videoData['author'].get('nickname') - - title = '%s [%s]' % (nickName or uniqueId, videoId) - - # we also need the referer - headers['Referer'] = referUrl - - mime, ext, size = url_info(videoUrl, headers=headers) + mime, ext, size = url_info(downloadAddr, headers=headers) print_info(site_info, title, mime, size) if not info_only: - download_urls([videoUrl], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) + download_urls([downloadAddr], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) site_info = "TikTok.com" download = tiktok_download From 37ca277e7256139484195a699bd160540cddd8d8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 19 Apr 2022 17:55:05 +0200 Subject: [PATCH 115/235] [bilibili] use hdflv2_hdr as id for HDR --- src/you_get/extractors/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index da19eb68..f7cc80dc 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -16,7 +16,7 @@ class Bilibili(VideoExtractor): 'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'}, {'id': 'hdflv2_dolby', 'quality': 126, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '3840p', 'desc': '杜比视界'}, - {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280, + {'id': 'hdflv2_hdr', 'quality': 125, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '2160p', 'desc': '真彩 HDR'}, {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'}, From 408e78b180557b2372249b370549685a50d4a787 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 19 Apr 2022 21:37:48 +0200 Subject: [PATCH 116/235] drop support for python <3.5 --- README.md | 2 +- you-get.json | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 0c3d4099..d4b8cd29 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim The following dependencies are necessary: -* **[Python](https://www.python.org/downloads/)** 3.2 or above +* **[Python](https://www.python.org/downloads/)** 3.5 or above * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/) diff --git a/you-get.json b/you-get.json index a9ef1dd5..1a36b3c0 100644 --- a/you-get.json +++ b/you-get.json @@ -18,9 +18,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", From 43e14887a31704857452166c54c69c065f0b6036 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 19:00:29 +0200 Subject: [PATCH 117/235] [bilibili] support lowercase bv --- src/you_get/extractors/bilibili.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index f7cc80dc..48c91925 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -188,7 +188,7 @@ class Bilibili(VideoExtractor): sort = 'live' elif re.match(r'https?://vc\.bilibili\.com/video/(\d+)', self.url): sort = 'vc' - elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(BV(\S+)))', self.url): + elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(bv(\S+))|(BV(\S+)))', self.url): sort = 'video' elif re.match(r'https?://h\.?bilibili\.com/(\d+)', self.url): sort = 'h' @@ -604,7 +604,7 @@ class Bilibili(VideoExtractor): elif re.match(r'https?://(www\.)?bilibili\.com/bangumi/media/md(\d+)', self.url) or \ re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)', self.url): sort = 'bangumi_md' - elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|BV(\S+))', self.url): + elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|bv(\S+)|BV(\S+))', self.url): sort = 'video' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/detail\?.*cid=(\d+)', self.url): sort = 'space_channel' From 355e22584c8e8e0e5ac544945601f937355844cc Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 19:17:26 +0200 Subject: [PATCH 118/235] [twitter] show the warning message if login required --- src/you_get/extractors/twitter.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 23468211..8c052ed0 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -51,7 +51,12 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) info = json.loads(api_content) - if 'extended_entities' in info['globalObjects']['tweets'][item_id]: + if item_id not in info['globalObjects']['tweets']: + # something wrong here + log.w(info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text']) + return + + elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: # if the tweet contains media, download them media = info['globalObjects']['tweets'][item_id]['extended_entities']['media'] From bbc4df7a89ce32baa76e5d60d7a705354308b4f2 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 19:29:52 +0200 Subject: [PATCH 119/235] [tiktok] fix extraction --- src/you_get/extractors/tiktok.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index d1069fcc..33e1f11e 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -23,7 +23,8 @@ def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): } html = getHttps(host, url, headers=headers) - data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) + data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) or \ + r1(r'', html) info = json.loads(data) downloadAddr = info['ItemModule'][vid]['video']['downloadAddr'] author = info['ItemModule'][vid]['author'] # same as uniqueId From fd2d7fdcbc14384baf45c86588d769300e6bec79 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 23:21:14 +0200 Subject: [PATCH 120/235] [bilibili] support festival videos (fix #2955) --- src/you_get/extractors/bilibili.py | 52 ++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 48c91925..caaa91d6 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -177,6 +177,11 @@ class Bilibili(VideoExtractor): self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)') html_content = get_content(self.url, headers=self.bilibili_headers()) + # redirect: festival + elif re.match(r'https?://(www\.)?bilibili\.com/festival/(.+)', self.url): + self.url = 'https://www.bilibili.com/video/%s' % match1(self.url, r'bvid=([^&]+)') + html_content = get_content(self.url, headers=self.bilibili_headers()) + # sort it out if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url): sort = 'audio' @@ -203,30 +208,43 @@ class Bilibili(VideoExtractor): playinfo_text = match1(html_content, r'__playinfo__=(.*?)', html) info = json.loads(data) From f2ea06473aabd2fcc598cd98aa7ceb93c95d978a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 00:36:42 +0200 Subject: [PATCH 128/235] [tests] test_twitter --- tests/test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test.py b/tests/test.py index 862b829f..a1c6c076 100644 --- a/tests/test.py +++ b/tests/test.py @@ -10,7 +10,8 @@ from you_get.extractors import ( acfun, bilibili, soundcloud, - tiktok + tiktok, + twitter ) @@ -28,7 +29,7 @@ class YouGetTests(unittest.TestCase): youtube.download( 'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True ) - youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) + #youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) #youtube.download( # 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa # info_only=True @@ -57,6 +58,9 @@ class YouGetTests(unittest.TestCase): tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) + def test_twitter(self): + twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True) + if __name__ == '__main__': unittest.main() From d57a0eba3e60cb4341a8bce02259ad8a4dee66c5 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:00:55 +0200 Subject: [PATCH 129/235] [youtube] improve regex --- src/you_get/extractors/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index b5f56fa4..3e1c5cad 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -237,7 +237,7 @@ class YouTube(VideoExtractor): except: # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}} - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) stream_list = ytInitialPlayerResponse['streamingData']['formats'] #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] @@ -262,7 +262,7 @@ class YouTube(VideoExtractor): # Parse video page instead video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) self.title = ytInitialPlayerResponse["videoDetails"]["title"] if re.search('([^"]*/base\.js)"', video_page): From 249afb8b27498a89986d3af3aec2dad0819fa014 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:12:15 +0200 Subject: [PATCH 130/235] [.github/workflows] test pypy3.8 and pypy3.9 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 75231110..05dbc85a 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10", pypy3] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', 'pypy3.8', 'pypy3.9'] steps: - uses: actions/checkout@v2 From ce1f44fb88848c4e99357b51f6ab0b5cf9fe16a4 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:14:47 +0200 Subject: [PATCH 131/235] [.github/workflows] test pypy-3.8 and pypy-3.9 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 05dbc85a..47fb37a5 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', 'pypy3.8', 'pypy3.9'] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] steps: - uses: actions/checkout@v2 From 1aa7ca21fcc6769859c9b072f1a5052d73f546d9 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:41:25 +0200 Subject: [PATCH 132/235] drop support for python <3.7.4 --- .github/workflows/python-package.yml | 2 +- README.md | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 47fb37a5..f90b61ae 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] + python-version: [3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index d4b8cd29..e0cea4dd 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,9 @@ [![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/) [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -**NOTICE: Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.** +**NOTICE (30 May 2022): Support for Python 3.5, 3.6 and 3.7 will eventually be dropped. ([see details here](https://github.com/soimort/you-get/wiki/TLS-1.3-post-handshake-authentication-(PHA)))** + +**NOTICE (8 Mar 2019): Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.** --- @@ -53,9 +55,9 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim ### Prerequisites -The following dependencies are necessary: +The following dependencies are recommended: -* **[Python](https://www.python.org/downloads/)** 3.5 or above +* **[Python](https://www.python.org/downloads/)** 3.8 or above * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/) From a5c726b9d701f81f4cb976242baeb5f00a7c164b Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:44:36 +0200 Subject: [PATCH 133/235] version 0.4.1612 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 8fabc52e..da7d3c33 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1602' +__version__ = '0.4.1612' From 9980b727cbcad548059cd05fbfa0254cc99b8e26 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 14:50:20 +0200 Subject: [PATCH 134/235] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e0cea4dd..44c102cd 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim The following dependencies are recommended: -* **[Python](https://www.python.org/downloads/)** 3.8 or above +* **[Python](https://www.python.org/downloads/)** 3.7.4 or above * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/) From 6268c1173ce183b2548850bf95d1e7587ad22019 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 14:53:21 +0200 Subject: [PATCH 135/235] update README.rst --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 3c23ab5e..376abcf7 100644 --- a/README.rst +++ b/README.rst @@ -52,7 +52,7 @@ source `__ and fork it! .. |PyPI version| image:: https://badge.fury.io/py/you-get.png :target: http://badge.fury.io/py/you-get -.. |Build Status| image:: https://api.travis-ci.org/soimort/you-get.png - :target: https://travis-ci.org/soimort/you-get +.. |Build Status| image:: https://github.com/soimort/you-get/workflows/develop/badge.svg + :target: https://github.com/soimort/you-get/actions .. |Gitter| image:: https://badges.gitter.im/Join%20Chat.svg :target: https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge From bfb35db5a6b6dfd31c55c2c2f3edd56c0c59e8bf Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 15:38:00 +0200 Subject: [PATCH 136/235] update you-get.json --- you-get.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/you-get.json b/you-get.json index 1a36b3c0..bb94ba00 100644 --- a/you-get.json +++ b/you-get.json @@ -18,8 +18,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", From 6ddc3fce89bb496394ab6f51c224b0f9964ee344 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 15:38:47 +0200 Subject: [PATCH 137/235] update Makefile --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c0f9cf0e..fe4a238c 100644 --- a/Makefile +++ b/Makefile @@ -43,5 +43,7 @@ install: $(SETUP) install --user --prefix= release: - zenity --question - $(SETUP) sdist bdist_wheel upload --sign + #zenity --question + $(SETUP) sdist bdist_wheel + echo 'Upload new version to PyPI using:' + echo ' twine upload --sign dist/you-get-VERSION.tar.gz dist/you_get-VERSION-py3-none-any.whl' From c0151a97756990bca525598dc37db476cd6c34d4 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 7 Jun 2022 16:19:14 +0200 Subject: [PATCH 138/235] [youtube] we should extract ytInitialPlayerResponse more reliably --- src/you_get/extractors/youtube.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 3e1c5cad..f820152f 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -237,7 +237,10 @@ class YouTube(VideoExtractor): except: # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}} - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + try: # FIXME: we should extract ytInitialPlayerResponse more reliably + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + except: + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) stream_list = ytInitialPlayerResponse['streamingData']['formats'] #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] @@ -262,7 +265,10 @@ class YouTube(VideoExtractor): # Parse video page instead video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + try: # FIXME: we should extract ytInitialPlayerResponse more reliably + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + except: + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) self.title = ytInitialPlayerResponse["videoDetails"]["title"] if re.search('([^"]*/base\.js)"', video_page): From a47960f6ed7b2a484b6629678b3a6ad8e39497bd Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 20 Jun 2022 23:04:56 +0200 Subject: [PATCH 139/235] [twitter] better warning --- src/you_get/extractors/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 8c052ed0..19b4ce87 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -53,7 +53,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) info = json.loads(api_content) if item_id not in info['globalObjects']['tweets']: # something wrong here - log.w(info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text']) + log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None) return elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: From d661c95480abd61f7ef8877d8dbcb827534aa54d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 1 Jul 2022 22:21:47 +0200 Subject: [PATCH 140/235] [instagram] fix extraction --- src/you_get/extractors/instagram.py | 74 +++++++++++++---------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index 4167b226..604c534c 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -10,60 +10,50 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg vid = r1(r'instagram.com/\w+/([^/]+)', url) description = r1(r'\s([^<]*)', cont) # with logged-in cookies + r1(r'([^<]*)', cont) # with logged-in cookies title = "{} [{}]".format(description.replace("\n", " "), vid) - stream = r1(r'', cont) - try: - info = json.loads(data.group(1)) - post = info['entry_data']['PostPage'][0] - assert post['items'] - except: - # with logged-in cookies - data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);', cont) - if data is not None: - log.e('[Warning] Cookies needed.') - post = json.loads(data.group(1)) + api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id + try: + api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}}) + except: + log.wtf('[Error] Please specify a cookie file.') + post = json.loads(api_cont) - for item in post['items']: - code = item['code'] - carousel_media = item.get('carousel_media') or [item] - for i, media in enumerate(carousel_media): - title = '%s [%s]' % (code, i) - image_url = media['image_versions2']['candidates'][0]['url'] - ext = image_url.split('?')[0].split('.')[-1] - size = int(get_head(image_url)['Content-Length']) + for item in post['items']: + code = item['code'] + carousel_media = item.get('carousel_media') or [item] + for i, media in enumerate(carousel_media): + title = '%s [%s]' % (code, i) + image_url = media['image_versions2']['candidates'][0]['url'] + ext = image_url.split('?')[0].split('.')[-1] + size = int(get_head(image_url)['Content-Length']) + + print_info(site_info, title, ext, size) + if not info_only: + download_urls(urls=[image_url], + title=title, + ext=ext, + total_size=size, + output_dir=output_dir) + + # download videos (if any) + if 'video_versions' in media: + video_url = media['video_versions'][0]['url'] + ext = video_url.split('?')[0].split('.')[-1] + size = int(get_head(video_url)['Content-Length']) print_info(site_info, title, ext, size) if not info_only: - download_urls(urls=[image_url], + download_urls(urls=[video_url], title=title, ext=ext, total_size=size, output_dir=output_dir) - # download videos (if any) - if 'video_versions' in media: - video_url = media['video_versions'][0]['url'] - ext = video_url.split('?')[0].split('.')[-1] - size = int(get_head(video_url)['Content-Length']) - - print_info(site_info, title, ext, size) - if not info_only: - download_urls(urls=[video_url], - title=title, - ext=ext, - total_size=size, - output_dir=output_dir) - site_info = "Instagram.com" download = instagram_download download_playlist = playlist_not_supported('instagram') From 82b376a0c60ff473686d6a79ae6ca5c42dc93950 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 1 Jul 2022 23:26:50 +0200 Subject: [PATCH 141/235] version 0.4.1620 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index da7d3c33..ac2bfc03 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1612' +__version__ = '0.4.1620' From 4119a1493e3c1c46c04914accd677d331c357edb Mon Sep 17 00:00:00 2001 From: owlwang Date: Mon, 25 Jul 2022 12:34:55 +0800 Subject: [PATCH 142/235] fix douyin extractor --- src/you_get/extractors/douyin.py | 51 ++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/src/you_get/extractors/douyin.py b/src/you_get/extractors/douyin.py index 8067b1b5..6a59b160 100644 --- a/src/you_get/extractors/douyin.py +++ b/src/you_get/extractors/douyin.py @@ -1,8 +1,6 @@ # coding=utf-8 -import re import json -from urllib.parse import unquote from ..common import ( url_size, @@ -11,25 +9,52 @@ from ..common import ( fake_headers, download_urls, playlist_not_supported, + match1, + get_location, ) - __all__ = ['douyin_download_by_url'] +def get_value(source: dict, path): + try: + value = source + for key in path: + if type(key) is str: + if key in value.keys(): + value = value[key] + else: + value = None + break + elif type(key) is int: + if len(value) != 0: + value = value[key] + else: + value = None + break + except: + value = None + return value + + def douyin_download_by_url(url, **kwargs): + # if short link, get the real url + if 'v.douyin.com' in url: + url = get_location(url) + aweme_id = match1(url, r'/(\d+)/?') + # get video info + video_info_api = 'https://www.douyin.com/web/api/v2/aweme/iteminfo/?item_ids={}' + url = video_info_api.format(aweme_id) page_content = get_content(url, headers=fake_headers) - # The video player and video source are rendered client-side, the data - # contains in a ', html) From 7b845b34ce18863e519ad3cce8e53431ba41664d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 11 Dec 2022 17:43:07 +0100 Subject: [PATCH 163/235] [tiktok] fix extraction for alternative URLs --- src/you_get/common.py | 15 +++++++++------ src/you_get/extractors/tiktok.py | 12 ++++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 1558baf6..c337a2a2 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -344,21 +344,24 @@ def undeflate(data): # an http.client implementation of get_content() # because urllib does not support "Connection: keep-alive" -def getHttps(host, url, headers, gzip=True, deflate=False, debuglevel=0): +def getHttps(host, url, headers, debuglevel=0): import http.client conn = http.client.HTTPSConnection(host) conn.set_debuglevel(debuglevel) conn.request("GET", url, headers=headers) resp = conn.getresponse() + set_cookie = resp.getheader('set-cookie') data = resp.read() - if gzip: - data = ungzip(data) - if deflate: - data = undeflate(data) + try: + data = ungzip(data) # gzip + data = undeflate(data) # deflate + except: + pass - return str(data, encoding='utf-8'), resp.getheader('set-cookie') + conn.close() + return str(data, encoding='utf-8'), set_cookie # DEPRECATED in favor of get_content() diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index 641e5e97..2c4892f6 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -16,12 +16,12 @@ def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): m = re.match('(https?://)?([^/]+)(/.*)', url) host = m.group(2) if host != 'www.tiktok.com': # non-canonical URL - url = get_location(url, headers=headers) - m = re.match('(https?://)?([^/]+)(/.*)', url) - host = m.group(2) - - url = m.group(3).split('?')[0] - vid = url.split('/')[3] # should be a string of numbers + vid = r1(r'/video/(\d+)', url) + url = 'https://www.tiktok.com/@/video/%s/' % vid + host = 'www.tiktok.com' + else: + url = m.group(3).split('?')[0] + vid = url.split('/')[3] # should be a string of numbers html, set_cookie = getHttps(host, url, headers=headers) tt_chain_token = r1('tt_chain_token=([^;]+);', set_cookie) From 888a9e29f37a52a57a81b28bebdf39bd77aa058f Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 11 Dec 2022 17:44:06 +0100 Subject: [PATCH 164/235] [tests] test "universal" tiktok url --- tests/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test.py b/tests/test.py index 877b6935..c0f3836a 100644 --- a/tests/test.py +++ b/tests/test.py @@ -57,6 +57,7 @@ class YouGetTests(unittest.TestCase): def test_tiktok(self): tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) + tiktok.download('https://www.tiktok.com/@/video/6850796940293164290', info_only=True) tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) def test_twitter(self): From fd7889783419940da9ed460ab420c48be39a2ae4 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 11 Dec 2022 18:09:10 +0100 Subject: [PATCH 165/235] [instagram] show cookie warning --- src/you_get/extractors/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index 604c534c..8e261fe7 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -19,9 +19,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id try: api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}}) + post = json.loads(api_cont) except: log.wtf('[Error] Please specify a cookie file.') - post = json.loads(api_cont) for item in post['items']: code = item['code'] From 0d9c28031010ba44fc69977050d5fe572fdee12b Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 11 Dec 2022 18:15:16 +0100 Subject: [PATCH 166/235] version 0.4.1650 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index ac2bfc03..440488a9 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1620' +__version__ = '0.4.1650' From c0a483dab1a07bce353a8cb8f6cb4111c6348a85 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 18 Dec 2022 14:54:34 +0100 Subject: [PATCH 167/235] [twitter] warn when falling back to deprecated API --- src/you_get/extractors/twitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 7975bdfd..baf4c375 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -55,7 +55,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) info = json.loads(api_content) if item_id not in info['globalObjects']['tweets']: # something wrong here - #log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None) + log.w('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text']) assert False elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: @@ -94,6 +94,8 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) return except: + log.w('[Warning] Falling back to deprecated Twitter API. Extraction may be incomplete.') + authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw' # FIXME: 403 with cookies From 0fc63efa63c88662f363fa89768b1c1f9dd6cc36 Mon Sep 17 00:00:00 2001 From: arix00 <15333224+arix00@users.noreply.github.com> Date: Sun, 1 Jan 2023 20:38:21 -0800 Subject: [PATCH 168/235] Download multipage video collection When there're more than single page videos in a collection, Download all videos as current code will only handle first page. For 'space_channel_series' and 'space_channel_collection' --- src/you_get/extractors/bilibili.py | 38 ++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 6d34c2c4..b082553e 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -747,13 +747,20 @@ class Bilibili(VideoExtractor): elif sort == 'space_channel_series': m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url) mid, sid = m.group(1), m.group(2) - api_url = self.bilibili_series_archives_api(mid, sid) - api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) - archives_info = json.loads(api_content) - # TBD: channel of more than 100 videos + pn = 1 + video_list = [] + while True: + api_url = self.bilibili_series_archives_api(mid, sid, pn) + api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) + archives_info = json.loads(api_content) + video_list.extend(archives_info['data']['archives']) + if len(video_list) < archives_info['data']['page']['total'] and len(archives_info['data']['archives']) > 0: + pn += 1 + else: + break - epn, i = len(archives_info['data']['archives']), 0 - for video in archives_info['data']['archives']: + epn, i = len(video_list), 0 + for video in video_list: i += 1; log.w('Extracting %s of %s videos ...' % (i, epn)) url = 'https://www.bilibili.com/video/av%s' % video['aid'] self.__class__().download_playlist_by_url(url, **kwargs) @@ -761,13 +768,20 @@ class Bilibili(VideoExtractor): elif sort == 'space_channel_collection': m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/collectiondetail\?.*sid=(\d+)', self.url) mid, sid = m.group(1), m.group(2) - api_url = self.bilibili_space_collection_api(mid, sid) - api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) - archives_info = json.loads(api_content) - # TBD: channel of more than 100 videos + pn = 1 + video_list = [] + while True: + api_url = self.bilibili_space_collection_api(mid, sid, pn) + api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) + archives_info = json.loads(api_content) + video_list.extend(archives_info['data']['archives']) + if len(video_list) < archives_info['data']['page']['total'] and len(archives_info['data']['archives']) > 0: + pn += 1 + else: + break - epn, i = len(archives_info['data']['archives']), 0 - for video in archives_info['data']['archives']: + epn, i = len(video_list), 0 + for video in video_list: i += 1; log.w('Extracting %s of %s videos ...' % (i, epn)) url = 'https://www.bilibili.com/video/av%s' % video['aid'] self.__class__().download_playlist_by_url(url, **kwargs) From 25eb89984524acd42a9b704d3d5b0edfa509c95a Mon Sep 17 00:00:00 2001 From: juruoyyx <60863833+juruoyyx@users.noreply.github.com> Date: Fri, 6 Jan 2023 10:55:00 +0800 Subject: [PATCH 169/235] Update ffmpeg.py --- src/you_get/processor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py index 50e2c9fe..efc0a472 100755 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -128,7 +128,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): def ffmpeg_concat_ts_to_mkv(files, output='output.mkv'): print('Merging video parts... ', end="", flush=True) - params = [FFMPEG] + LOGLEVEL + ['-isync', '-y', '-i'] + params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append('concat:') for file in files: if os.path.isfile(file): From a2e411395b9bffa0329c3ea4d80c8fbb218e7bad Mon Sep 17 00:00:00 2001 From: URenko <18209292+URenko@users.noreply.github.com> Date: Wed, 8 Feb 2023 18:51:01 +0800 Subject: [PATCH 170/235] support different codecs for bilibili --- src/you_get/extractors/bilibili.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 6d34c2c4..6ec8bc13 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -42,6 +42,8 @@ class Bilibili(VideoExtractor): {'id': 'jpg', 'quality': 0}, ] + codecids = {7: 'AVC', 12: 'HEVC', 13: 'AV1'} + @staticmethod def height_to_quality(height, qn): if height <= 360 and qn <= 16: @@ -70,7 +72,7 @@ class Bilibili(VideoExtractor): @staticmethod def bilibili_api(avid, cid, qn=0): - return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=16&fourk=1' % (avid, cid, qn) + return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=4048&fourk=1' % (avid, cid, qn) @staticmethod def bilibili_audio_api(sid): @@ -302,11 +304,10 @@ class Bilibili(VideoExtractor): if 'dash' in playinfo['data']: audio_size_cache = {} for video in playinfo['data']['dash']['video']: - # prefer the latter codecs! s = self.stream_qualities[video['id']] - format_id = 'dash-' + s['id'] # prefix + format_id = f"dash-{s['id']}-{self.codecids[video['codecid']]}" # prefix container = 'mp4' # enforce MP4 container - desc = s['desc'] + desc = s['desc'] + ' ' + video['codecs'] audio_quality = s['audio_quality'] baseurl = video['baseUrl'] size = self.url_size(baseurl, headers=self.bilibili_headers(referer=self.url)) From f54669411e5b10b2e79484f0d07f00664b450bc0 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 8 Feb 2023 16:41:03 +0100 Subject: [PATCH 171/235] add param "--prefix" to prefix downloaded files --- src/you_get/common.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index c337a2a2..bdb67bac 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -138,6 +138,7 @@ auto_rename = False insecure = False m3u8 = False postfix = False +prefix = None fake_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa @@ -1014,6 +1015,8 @@ def download_urls( title = tr(get_filename(title)) if postfix and 'vid' in kwargs: title = "%s [%s]" % (title, kwargs['vid']) + if prefix is not None: + title = "[%s] %s" % (prefix, title) output_filename = get_output_filename(urls, title, ext, output_dir, merge) output_filepath = os.path.join(output_dir, output_filename) @@ -1563,9 +1566,13 @@ def script_main(download, download_playlist, **kwargs): help='Do not download captions (subtitles, lyrics, danmaku, ...)' ) download_grp.add_argument( - '--postfix', action='store_true', default=False, + '--post', '--postfix', dest='postfix', action='store_true', default=False, help='Postfix downloaded files with unique identifiers' ) + download_grp.add_argument( + '--pre', '--prefix', dest='prefix', metavar='PREFIX', default=None, + help='Prefix downloaded files with string' + ) download_grp.add_argument( '-f', '--force', action='store_true', default=False, help='Force overwriting existing files' @@ -1689,6 +1696,7 @@ def script_main(download, download_playlist, **kwargs): global insecure global m3u8 global postfix + global prefix output_filename = args.output_filename extractor_proxy = args.extractor_proxy @@ -1726,6 +1734,7 @@ def script_main(download, download_playlist, **kwargs): insecure = True postfix = args.postfix + prefix = args.prefix if args.no_proxy: set_http_proxy('') From 2ba7493f126aed5785893b4cd5c3042998da7b99 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 9 Feb 2023 23:36:16 +0100 Subject: [PATCH 172/235] [bilibili] warn if cookies are not loaded --- src/you_get/extractors/bilibili.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index e59296ee..6335e6dd 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -117,7 +117,7 @@ class Bilibili(VideoExtractor): @staticmethod def bilibili_space_channel_api(mid, cid, pn=1, ps=100): return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps) - + @staticmethod def bilibili_space_collection_api(mid, cid, pn=1, ps=30): return 'https://api.bilibili.com/x/polymer/space/seasons_archives_list?mid=%s&season_id=%s&sort_reverse=false&page_num=%s&page_size=%s' % (mid, cid, pn, ps) @@ -125,7 +125,7 @@ class Bilibili(VideoExtractor): @staticmethod def bilibili_series_archives_api(mid, sid, pn=1, ps=100): return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps) - + @staticmethod def bilibili_space_favlist_api(fid, pn=1, ps=20): return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps) @@ -224,6 +224,10 @@ class Bilibili(VideoExtractor): if 'videoData' in initial_state: # (standard video) + # warn if cookies are not loaded + if cookies is None: + log.w('You will need login cookies for 720p formats or above. (use --cookies to load cookies.txt.)') + # warn if it is a multi-part video pn = initial_state['videoData']['videos'] if pn > 1 and not kwargs.get('playlist'): From 2aaa877a9b8ebda9ed25cb87df1ad760700b55c4 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 15 Feb 2023 16:20:47 +0100 Subject: [PATCH 173/235] [.github/workflows] test python 3.11 --- .github/workflows/python-package.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 37a8f1aa..39793c03 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,5 +1,4 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: develop @@ -16,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9, '3.10', 3.11-dev, pypy-3.8, pypy-3.9] + python-version: [3.7, 3.8, 3.9, '3.10', '3.11', pypy-3.8, pypy-3.9] steps: - uses: actions/checkout@v3 From ad5825a8f644442a3f45e028b7f04f4c6d861aba Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 9 May 2023 15:22:19 +0200 Subject: [PATCH 174/235] [twitter] fix extraction --- src/you_get/extractors/twitter.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index baf4c375..752ef746 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -34,7 +34,18 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) **kwargs) return - html = get_html(url, faker=True) # now it seems faker must be enabled + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0' + } + host = 'www.twitter.com' + + html, set_cookie = getHttps(host, url, headers=headers) + # "Found. Redirecting to..." + guest_id = r1('guest_id=([^;]+);', set_cookie) + headers['Cookie'] = 'guest_id=%s' % guest_id + + html = get_content(url, headers=headers) + screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \ r1(r' Date: Wed, 5 Jul 2023 17:12:15 +0200 Subject: [PATCH 175/235] [twitter] fix extraction --- src/you_get/extractors/twitter.py | 119 +++++++----------------------- 1 file changed, 26 insertions(+), 93 deletions(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 752ef746..43cfa6a4 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -23,7 +23,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) if re.match(r'https?://mobile', url): # normalize mobile URL url = 'https://' + match1(url, r'//mobile\.(.+)') - if re.match(r'https?://twitter\.com/i/moments/', url): # moments + if re.match(r'https?://twitter\.com/i/moments/', url): # FIXME: moments html = get_html(url, faker=True) paths = re.findall(r'data-permalink-path="([^"]+)"', html) for path in paths: @@ -34,114 +34,47 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) **kwargs) return - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0' - } - host = 'www.twitter.com' - - html, set_cookie = getHttps(host, url, headers=headers) - # "Found. Redirecting to..." - guest_id = r1('guest_id=([^;]+);', set_cookie) - headers['Cookie'] = 'guest_id=%s' % guest_id - - html = get_content(url, headers=headers) - - screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \ - r1(r' Date: Tue, 11 Jul 2023 17:20:12 +0200 Subject: [PATCH 176/235] [twitter] minor fix --- src/you_get/extractors/twitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 43cfa6a4..4a439fe8 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -62,6 +62,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) if 'video' in info: for mediaDetail in info['mediaDetails']: + if 'video_info' not in mediaDetail: continue variants = mediaDetail['video_info']['variants'] variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0)) title = item_id + '_' + variants[-1]['url'].split('/')[-1].split('?')[0].split('.')[0] From 9f38d7d76f2df34fa1bd72b826c5248a3aba67d3 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 25 Jul 2023 23:42:57 +0200 Subject: [PATCH 177/235] [common] update UA --- src/you_get/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index bdb67bac..4095dc52 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -141,11 +141,11 @@ postfix = False prefix = None fake_headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43', # noqa + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.183' # Latest Edge } if sys.stdout.isatty(): From 30d6c642f94d19b979cc4ea3461db1fea1901a6b Mon Sep 17 00:00:00 2001 From: OneCloud Date: Sat, 16 Dec 2023 23:17:22 +0800 Subject: [PATCH 178/235] [bilibili] api url update. --- src/you_get/extractors/bilibili.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 6335e6dd..b4e241c5 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -100,7 +100,8 @@ class Bilibili(VideoExtractor): appkey, sec = ''.join([chr(ord(i) + 2) for i in entropy[::-1]]).split(':') params = 'appkey=%s&cid=%s&otype=json&qn=%s&quality=%s&type=' % (appkey, cid, qn, qn) chksum = hashlib.md5(bytes(params + sec, 'utf8')).hexdigest() - return 'https://interface.bilibili.com/v2/playurl?%s&sign=%s' % (params, chksum) + return 'https://api.bilibili.com/x/player/wbi/v2?%s&sign=%s' % (params, chksum) + @staticmethod def bilibili_live_api(cid): From ac01a66b62c15d27b6a3b73257865dd06ad04433 Mon Sep 17 00:00:00 2001 From: Johnny Date: Mon, 18 Dec 2023 17:28:54 +0800 Subject: [PATCH 179/235] [twitter] fix twitter video download --- src/you_get/extractors/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 4a439fe8..d995fc58 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -40,7 +40,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) page_title = "{} [{}]".format(screen_name, item_id) # FIXME: this API won't work for protected or nsfw contents - api_url = 'https://cdn.syndication.twimg.com/tweet-result?id=%s' % item_id + api_url = 'https://cdn.syndication.twimg.com/tweet-result?id=%s&token=!' % item_id content = get_content(api_url) info = json.loads(content) From c7e5a297478f5207af0fa0ba8a3c76ccda33b200 Mon Sep 17 00:00:00 2001 From: ifui Date: Tue, 9 Jan 2024 09:53:41 +0800 Subject: [PATCH 180/235] fix ffmpeg: Unrecognized option 'absf'. --- src/you_get/processor/ffmpeg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py index 50e2c9fe..e8639e89 100755 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -175,7 +175,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): if FFMPEG == 'avconv': params += ['-c', 'copy'] else: - params += ['-c', 'copy', '-absf', 'aac_adtstoasc'] + params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) if subprocess.call(params, stdin=STDIN) == 0: @@ -229,7 +229,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): if FFMPEG == 'avconv': params += ['-c', 'copy'] else: - params += ['-c', 'copy', '-absf', 'aac_adtstoasc'] + params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) subprocess.check_call(params, stdin=STDIN) From 6d577712489d749b9e6401c3ba07c9e263c34dd8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 4 Mar 2024 03:15:47 +0100 Subject: [PATCH 181/235] [imgur] fix extraction --- src/you_get/extractors/imgur.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/imgur.py b/src/you_get/extractors/imgur.py index d612a30a..14fd7f1a 100644 --- a/src/you_get/extractors/imgur.py +++ b/src/you_get/extractors/imgur.py @@ -54,7 +54,7 @@ class Imgur(VideoExtractor): content = get_content(self.url) url = match1(content, r'meta property="og:video"[^>]+(https?://i.imgur.com/[^"?]+)') or \ match1(content, r'meta property="og:image"[^>]+(https?://i.imgur.com/[^"?]+)') - _, container, size = url_info(url) + _, container, size = url_info(url, faker=True) self.streams = { 'original': { 'src': [url], From 0b7a91fc064ebcd497d53a7c9a302ba7e74858fe Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 4 Mar 2024 03:16:43 +0100 Subject: [PATCH 182/235] [imgur] fix extraction --- src/you_get/extractors/imgur.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/imgur.py b/src/you_get/extractors/imgur.py index 14fd7f1a..09395202 100644 --- a/src/you_get/extractors/imgur.py +++ b/src/you_get/extractors/imgur.py @@ -39,7 +39,7 @@ class Imgur(VideoExtractor): elif re.search(r'i\.imgur\.com/', self.url): # direct image - _, container, size = url_info(self.url) + _, container, size = url_info(self.url, faker=True) self.streams = { 'original': { 'src': [self.url], From 01fa32419ea78f9f76dfeb8ced96a1ab3166b0e0 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 18 Apr 2024 22:54:01 +0200 Subject: [PATCH 183/235] [common] update UA --- src/you_get/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 4095dc52..ad3d3278 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -145,7 +145,7 @@ fake_headers = { 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.183' # Latest Edge + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/123.0.2420.97' # Latest Edge } if sys.stdout.isatty(): @@ -352,6 +352,7 @@ def getHttps(host, url, headers, debuglevel=0): conn.set_debuglevel(debuglevel) conn.request("GET", url, headers=headers) resp = conn.getresponse() + logging.debug('getHttps: %s' % resp.getheaders()) set_cookie = resp.getheader('set-cookie') data = resp.read() @@ -362,7 +363,7 @@ def getHttps(host, url, headers, debuglevel=0): pass conn.close() - return str(data, encoding='utf-8'), set_cookie + return str(data, encoding='utf-8'), set_cookie # TODO: support raw data # DEPRECATED in favor of get_content() From 1c1f9828698cecf3421d08adf44ae21c1514dbec Mon Sep 17 00:00:00 2001 From: cerenkov Date: Thu, 9 May 2024 17:18:11 +0800 Subject: [PATCH 184/235] The installation instructions failed for two reasons: 1. python 3.12 has removed the `imp` module and has replaced it by the `importlib` module. [1](https://docs.python.org/3/whatsnew/3.12.html#imp) 2. the use of setup.py as in the command `python setup.py install` is deprecated and should be replaced by the recommended way of `python -m pip install path/to/project`. [2](https://packaging.python.org/en/latest/discussions/setup-py-deprecated/) --- README.md | 10 ++++++---- setup.py | 17 +++++++++++++++-- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 44c102cd..588ce309 100644 --- a/README.md +++ b/README.md @@ -80,16 +80,18 @@ You may either download the [stable](https://github.com/soimort/you-get/archive/ Alternatively, run ``` -$ [sudo] python3 setup.py install +$ cd path/to/you-get +$ [sudo] python -m pip install . ``` Or ``` -$ python3 setup.py install --user +$ cd path/to/you-get +$ python -m pip install . --user ``` -to install `you-get` to a permanent path. +to install `you-get` to a permanent path. (And don't omit the dot `.` representing the current directory) You can also use the [pipenv](https://pipenv.pypa.io/en/latest) to install the `you-get` in the Python virtual environment. @@ -107,7 +109,7 @@ This is the recommended way for all developers, even if you don't often code in $ git clone git://github.com/soimort/you-get.git ``` -Then put the cloned directory into your `PATH`, or run `./setup.py install` to install `you-get` to a permanent path. +Then put the cloned directory into your `PATH`, or run `python -m pip install path/to/you-get` to install `you-get` to a permanent path. ### Option 5: Homebrew (Mac only) diff --git a/setup.py b/setup.py index 24dc9fb2..470c99ed 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,20 @@ PACKAGE_NAME = 'you_get' PROJ_METADATA = '%s.json' % PROJ_NAME -import os, json, imp +import importlib.util +import importlib.machinery + +def load_source(modname, filename): + loader = importlib.machinery.SourceFileLoader(modname, filename) + spec = importlib.util.spec_from_file_location(modname, filename, loader=loader) + module = importlib.util.module_from_spec(spec) + # The module is always executed and not cached in sys.modules. + # Uncomment the following line to cache the module. + # sys.modules[module.__name__] = module + loader.exec_module(module) + return module + +import os, json here = os.path.abspath(os.path.dirname(__file__)) proj_info = json.loads(open(os.path.join(here, PROJ_METADATA), encoding='utf-8').read()) try: @@ -13,7 +26,7 @@ try: except: README = "" CHANGELOG = open(os.path.join(here, 'CHANGELOG.rst'), encoding='utf-8').read() -VERSION = imp.load_source('version', os.path.join(here, 'src/%s/version.py' % PACKAGE_NAME)).__version__ +VERSION = load_source('version', os.path.join(here, 'src/%s/version.py' % PACKAGE_NAME)).__version__ from setuptools import setup, find_packages setup( From f7face20fa0e8b566c70141b123b62777b9a98cf Mon Sep 17 00:00:00 2001 From: wyzypa Date: Fri, 10 May 2024 10:29:44 +0800 Subject: [PATCH 185/235] fix: [extractors.bilibili] add headers when requesting for danmaku --- src/you_get/extractors/bilibili.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index b4e241c5..9860d5d6 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -335,7 +335,7 @@ class Bilibili(VideoExtractor): 'src': [[baseurl]], 'size': size} # get danmaku - self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid) + self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url)) # bangumi elif sort == 'bangumi': @@ -414,7 +414,7 @@ class Bilibili(VideoExtractor): 'src': [[baseurl], [audio_baseurl]], 'size': size} # get danmaku - self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid) + self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url)) # vc video elif sort == 'vc': @@ -596,7 +596,7 @@ class Bilibili(VideoExtractor): 'src': [[baseurl]], 'size': size} # get danmaku - self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid) + self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url)) def extract(self, **kwargs): # set UA and referer for downloading From a4d34ff6b1821ce9c6f1f05423776c54f29dedac Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 17 May 2024 18:19:12 +0200 Subject: [PATCH 186/235] [bilibili] change all http to https --- src/you_get/extractors/bilibili.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 9860d5d6..ea67f92f 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -335,7 +335,7 @@ class Bilibili(VideoExtractor): 'src': [[baseurl]], 'size': size} # get danmaku - self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url)) + self.danmaku = get_content('https://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url)) # bangumi elif sort == 'bangumi': @@ -414,7 +414,7 @@ class Bilibili(VideoExtractor): 'src': [[baseurl], [audio_baseurl]], 'size': size} # get danmaku - self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url)) + self.danmaku = get_content('https://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url)) # vc video elif sort == 'vc': @@ -596,7 +596,7 @@ class Bilibili(VideoExtractor): 'src': [[baseurl]], 'size': size} # get danmaku - self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url)) + self.danmaku = get_content('https://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url)) def extract(self, **kwargs): # set UA and referer for downloading From 57f6502e801c493feddcb5ab84ae155ead515886 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 17 May 2024 20:57:37 +0200 Subject: [PATCH 187/235] [twitter] fix extraction (x.com) --- README.md | 2 +- src/you_get/common.py | 8 ++++++-- src/you_get/extractors/twitter.py | 6 +++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 588ce309..9127e57d 100644 --- a/README.md +++ b/README.md @@ -376,7 +376,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | Site | URL | Videos? | Images? | Audios? | | :--: | :-- | :-----: | :-----: | :-----: | | **YouTube** | |✓| | | -| **Twitter** | |✓|✓| | +| **X (Twitter)** | |✓|✓| | | VK | |✓|✓| | | Vine | |✓| | | | Vimeo | |✓| | | diff --git a/src/you_get/common.py b/src/you_get/common.py index ad3d3278..d14eb980 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -113,6 +113,7 @@ SITES = { 'veoh' : 'veoh', 'vine' : 'vine', 'vk' : 'vk', + 'x' : 'twitter', 'xiaokaxiu' : 'yixia', 'xiaojiadianvideo' : 'fc2video', 'ximalaya' : 'ximalaya', @@ -1856,9 +1857,12 @@ def url_to_module(url): ) else: try: - location = get_location(url) # t.co isn't happy with fake_headers + try: + location = get_location(url) # t.co isn't happy with fake_headers + except: + location = get_location(url, headers=fake_headers) except: - location = get_location(url, headers=fake_headers) + location = get_location(url, headers=fake_headers, get_method='GET') if location and location != url and not location.startswith('/'): return url_to_module(location) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index d995fc58..b0125c67 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -34,9 +34,9 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) **kwargs) return - m = re.match('^https?://(mobile\.)?twitter\.com/([^/]+)/status/(\d+)', url) + m = re.match('^https?://(mobile\.)?(x|twitter)\.com/([^/]+)/status/(\d+)', url) assert m - screen_name, item_id = m.group(2), m.group(3) + screen_name, item_id = m.group(3), m.group(4) page_title = "{} [{}]".format(screen_name, item_id) # FIXME: this API won't work for protected or nsfw contents @@ -77,6 +77,6 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) # TODO: should we deal with quoted tweets? -site_info = "Twitter.com" +site_info = "X.com" download = twitter_download download_playlist = playlist_not_supported('twitter') From 873ffdb61eb461b4c71ab7e12151864b49a23f3a Mon Sep 17 00:00:00 2001 From: cerenkov Date: Sun, 19 May 2024 00:58:08 +0800 Subject: [PATCH 188/235] Update setup.py: compatibility for older python versions --- setup.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index 470c99ed..ea984671 100755 --- a/setup.py +++ b/setup.py @@ -5,18 +5,21 @@ PACKAGE_NAME = 'you_get' PROJ_METADATA = '%s.json' % PROJ_NAME -import importlib.util -import importlib.machinery - -def load_source(modname, filename): - loader = importlib.machinery.SourceFileLoader(modname, filename) - spec = importlib.util.spec_from_file_location(modname, filename, loader=loader) - module = importlib.util.module_from_spec(spec) - # The module is always executed and not cached in sys.modules. - # Uncomment the following line to cache the module. - # sys.modules[module.__name__] = module - loader.exec_module(module) - return module +import sys +if (sys.version_info >= (3, 12)): + import importlib.util + import importlib.machinery + def load_source(modname, filename): + loader = importlib.machinery.SourceFileLoader(modname, filename) + spec = importlib.util.spec_from_file_location(modname, filename, loader=loader) + module = importlib.util.module_from_spec(spec) + # The module is always executed and not cached in sys.modules. + # Uncomment the following line to cache the module. + # sys.modules[module.__name__] = module + loader.exec_module(module) + return module +else: + from imp import load_source import os, json here = os.path.abspath(os.path.dirname(__file__)) From 317cc467e7f738390a9d451ad530736d0e848690 Mon Sep 17 00:00:00 2001 From: cerenkov Date: Sun, 19 May 2024 01:01:09 +0800 Subject: [PATCH 189/235] Update python-package.yml: add python-version 3.12 job --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 39793c03..51d56c91 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9, '3.10', '3.11', pypy-3.8, pypy-3.9] + python-version: [3.7, 3.8, 3.9, '3.10', '3.11', '3.12', pypy-3.8, pypy-3.9] steps: - uses: actions/checkout@v3 From adeaeb896775c5dd6397fdd6eb98a2750aa99717 Mon Sep 17 00:00:00 2001 From: cerenkov Date: Sun, 19 May 2024 01:03:54 +0800 Subject: [PATCH 190/235] Update you-get.json: add info Python :: 3.11 and 3.12 --- you-get.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/you-get.json b/you-get.json index bb94ba00..adf604dc 100644 --- a/you-get.json +++ b/you-get.json @@ -22,6 +22,8 @@ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Internet", "Topic :: Internet :: WWW/HTTP", "Topic :: Multimedia", From b935d3ed02ea029f6c220e7123fd321e489fc7d8 Mon Sep 17 00:00:00 2001 From: cerenkov Date: Sun, 19 May 2024 02:01:31 +0800 Subject: [PATCH 191/235] Update python-package.yml: update setuptools --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 51d56c91..98b6c7de 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -25,7 +25,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip + python -m pip install --upgrade pip setuptools pip install flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 From 19a4f15d6014c66c99d48be4a024b89407fe428a Mon Sep 17 00:00:00 2001 From: cerenkov Date: Sun, 19 May 2024 12:03:54 +0800 Subject: [PATCH 192/235] Revert "Update setup.py: compatibility for older python versions" This reverts commit 873ffdb61eb461b4c71ab7e12151864b49a23f3a. Minimun python version 3.7 already assures importlib availability --- setup.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/setup.py b/setup.py index ea984671..470c99ed 100755 --- a/setup.py +++ b/setup.py @@ -5,21 +5,18 @@ PACKAGE_NAME = 'you_get' PROJ_METADATA = '%s.json' % PROJ_NAME -import sys -if (sys.version_info >= (3, 12)): - import importlib.util - import importlib.machinery - def load_source(modname, filename): - loader = importlib.machinery.SourceFileLoader(modname, filename) - spec = importlib.util.spec_from_file_location(modname, filename, loader=loader) - module = importlib.util.module_from_spec(spec) - # The module is always executed and not cached in sys.modules. - # Uncomment the following line to cache the module. - # sys.modules[module.__name__] = module - loader.exec_module(module) - return module -else: - from imp import load_source +import importlib.util +import importlib.machinery + +def load_source(modname, filename): + loader = importlib.machinery.SourceFileLoader(modname, filename) + spec = importlib.util.spec_from_file_location(modname, filename, loader=loader) + module = importlib.util.module_from_spec(spec) + # The module is always executed and not cached in sys.modules. + # Uncomment the following line to cache the module. + # sys.modules[module.__name__] = module + loader.exec_module(module) + return module import os, json here = os.path.abspath(os.path.dirname(__file__)) From 7f8ebe1c93b1f35545989445369ab4f19dc34af8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 20 May 2024 04:18:10 +0200 Subject: [PATCH 193/235] [tests] disable test_tiktok temporarily --- tests/test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test.py b/tests/test.py index c0f3836a..a8c7023d 100644 --- a/tests/test.py +++ b/tests/test.py @@ -55,10 +55,10 @@ class YouGetTests(unittest.TestCase): # 'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True #) - def test_tiktok(self): - tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) - tiktok.download('https://www.tiktok.com/@/video/6850796940293164290', info_only=True) - tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) + #def test_tiktok(self): + #tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) + #tiktok.download('https://www.tiktok.com/@/video/6850796940293164290', info_only=True) + #tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) def test_twitter(self): twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True) From 0b9fec525152ef1c705864245970925c64a65872 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 20 May 2024 04:56:36 +0200 Subject: [PATCH 194/235] [imgur] fix extraction --- src/you_get/extractors/imgur.py | 8 +++++--- tests/test.py | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/imgur.py b/src/you_get/extractors/imgur.py index 09395202..2726c974 100644 --- a/src/you_get/extractors/imgur.py +++ b/src/you_get/extractors/imgur.py @@ -13,9 +13,11 @@ class Imgur(VideoExtractor): ] def prepare(self, **kwargs): + self.ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/123.0.2420.97' + if re.search(r'imgur\.com/a/', self.url): # album - content = get_content(self.url) + content = get_content(self.url, headers=fake_headers) album = match1(content, r'album\s*:\s*({.*}),') or \ match1(content, r'image\s*:\s*({.*}),') album = json.loads(album) @@ -51,10 +53,10 @@ class Imgur(VideoExtractor): else: # gallery image - content = get_content(self.url) + content = get_content(self.url, headers=fake_headers) url = match1(content, r'meta property="og:video"[^>]+(https?://i.imgur.com/[^"?]+)') or \ match1(content, r'meta property="og:image"[^>]+(https?://i.imgur.com/[^"?]+)') - _, container, size = url_info(url, faker=True) + _, container, size = url_info(url, headers={'User-Agent': fake_headers['User-Agent']}) self.streams = { 'original': { 'src': [url], diff --git a/tests/test.py b/tests/test.py index a8c7023d..d3cd099d 100644 --- a/tests/test.py +++ b/tests/test.py @@ -19,6 +19,7 @@ from you_get.extractors import ( class YouGetTests(unittest.TestCase): def test_imgur(self): imgur.download('http://imgur.com/WVLk5nD', info_only=True) + imgur.download('https://imgur.com/we-should-have-listened-WVLk5nD', info_only=True) def test_magisto(self): magisto.download( From 97bb0a6d4e06ddc679ca4176356e99c0aa532566 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 21 May 2024 01:45:12 +0200 Subject: [PATCH 195/235] python-package.yml: add pypy-3.10 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 98b6c7de..1d9afe9f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9, '3.10', '3.11', '3.12', pypy-3.8, pypy-3.9] + python-version: [3.7, 3.8, 3.9, '3.10', '3.11', '3.12', pypy-3.8, pypy-3.9, pypy-3.10] steps: - uses: actions/checkout@v3 From bf3d82bd4b11e6ef06634c04d2bbb68593025984 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 21 May 2024 23:59:39 +0200 Subject: [PATCH 196/235] [tiktok] fix extraction --- src/you_get/extractors/tiktok.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index 2c4892f6..3cbb59d3 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -27,12 +27,12 @@ def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): tt_chain_token = r1('tt_chain_token=([^;]+);', set_cookie) headers['Cookie'] = 'tt_chain_token=%s' % tt_chain_token - data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) or \ - r1(r'', html) + data = r1(r'', html) info = json.loads(data) - downloadAddr = info['ItemModule'][vid]['video']['downloadAddr'] - author = info['ItemModule'][vid]['author'] # same as uniqueId - nickname = info['UserModule']['users'][author]['nickname'] + itemStruct = info['__DEFAULT_SCOPE__']['webapp.video-detail']['itemInfo']['itemStruct'] + downloadAddr = itemStruct['video']['downloadAddr'] + author = itemStruct['author']['uniqueId'] + nickname = itemStruct['author']['nickname'] title = '%s [%s]' % (nickname or author, vid) mime, ext, size = url_info(downloadAddr, headers=headers) From 5e7a06f01de18fb02992e27ba088bd613a8a5055 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 22 May 2024 00:01:35 +0200 Subject: [PATCH 197/235] [tests] enable test_tiktok, etc. --- tests/test.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test.py b/tests/test.py index d3cd099d..63e10e17 100644 --- a/tests/test.py +++ b/tests/test.py @@ -43,8 +43,8 @@ class YouGetTests(unittest.TestCase): def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) - #def test_bilibili(self): - # bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True) + def test_bilibili(self): + bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True) #def test_soundcloud(self): ## single song @@ -56,10 +56,10 @@ class YouGetTests(unittest.TestCase): # 'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True #) - #def test_tiktok(self): - #tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) - #tiktok.download('https://www.tiktok.com/@/video/6850796940293164290', info_only=True) - #tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) + def test_tiktok(self): + tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) + tiktok.download('https://www.tiktok.com/@/video/6850796940293164290', info_only=True) + tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) def test_twitter(self): twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True) From 290c74569902db2f1e58521e4e3bd7e1d47727a9 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 22 May 2024 00:04:35 +0200 Subject: [PATCH 198/235] [tests] remove test_acfun (404 URL) --- tests/test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 63e10e17..78de6561 100644 --- a/tests/test.py +++ b/tests/test.py @@ -40,9 +40,6 @@ class YouGetTests(unittest.TestCase): # 'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True #) - def test_acfun(self): - acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) - def test_bilibili(self): bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True) From 763f8a4e650b1982c3da09e0ea8b84c32d1a93de Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 22 May 2024 00:05:57 +0200 Subject: [PATCH 199/235] [tests] update test_acfun --- tests/test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test.py b/tests/test.py index 78de6561..720dc839 100644 --- a/tests/test.py +++ b/tests/test.py @@ -40,6 +40,9 @@ class YouGetTests(unittest.TestCase): # 'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True #) + def test_acfun(self): + acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True) + def test_bilibili(self): bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True) From 1899b3e4fad6ea77a98aa1a4acafc8c27e535a9d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 22 May 2024 00:24:51 +0200 Subject: [PATCH 200/235] update README.md (pip3 -> pip) --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9127e57d..c4ea4c00 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,9 @@ The following dependencies are recommended: ### Option 1: Install via pip -The official release of `you-get` is distributed on [PyPI](https://pypi.python.org/pypi/you-get), and can be installed easily from a PyPI mirror via the [pip](https://en.wikipedia.org/wiki/Pip_\(package_manager\)) package manager. Note that you must use the Python 3 version of `pip`: +The official release of `you-get` is distributed on [PyPI](https://pypi.python.org/pypi/you-get), and can be installed easily from a PyPI mirror via the [pip](https://en.wikipedia.org/wiki/Pip_\(package_manager\)) package manager: (Note that you must use the Python 3 version of `pip`) - $ pip3 install you-get + $ pip install you-get ### Option 2: Install via [Antigen](https://github.com/zsh-users/antigen) (for Zsh users) @@ -136,7 +136,7 @@ Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completi Based on which option you chose to install `you-get`, you may upgrade it via: ``` -$ pip3 install --upgrade you-get +$ pip install --upgrade you-get ``` or download the latest release via: @@ -148,7 +148,7 @@ $ you-get https://github.com/soimort/you-get/archive/master.zip In order to get the latest ```develop``` branch without messing up the PIP, you can try: ``` -$ pip3 install --upgrade git+https://github.com/soimort/you-get@develop +$ pip install --upgrade git+https://github.com/soimort/you-get@develop ``` ## Getting Started From 34e4c8651bb2ce752e2320a15262bf1bddad6b0e Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 22 May 2024 01:02:27 +0200 Subject: [PATCH 201/235] [tumblr] print info for each pic --- src/you_get/extractors/tumblr.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py index 1fdfcad0..5b5c22d6 100644 --- a/src/you_get/extractors/tumblr.py +++ b/src/you_get/extractors/tumblr.py @@ -82,16 +82,16 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): except: pass if tuggles: - size = sum([tuggles[t]['size'] for t in tuggles]) - print_info(site_info, page_title, None, size) + #size = sum([tuggles[t]['size'] for t in tuggles]) + #print_info(site_info, page_title, None, size) - if not info_only: - for t in tuggles: - title = tuggles[t]['title'] - ext = tuggles[t]['ext'] - size = tuggles[t]['size'] - url = tuggles[t]['url'] - print_info(site_info, title, ext, size) + for t in tuggles: + title = tuggles[t]['title'] + ext = tuggles[t]['ext'] + size = tuggles[t]['size'] + url = tuggles[t]['url'] + print_info(site_info, title, ext, size) + if not info_only: download_urls([url], title, ext, size, output_dir=output_dir) return From f6fd3fb867e9ce9a9219f9f0c6bf091797074a9d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 22 May 2024 01:04:21 +0200 Subject: [PATCH 202/235] [tumblr] add prefix --- src/you_get/extractors/tumblr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py index 5b5c22d6..b0dc99f6 100644 --- a/src/you_get/extractors/tumblr.py +++ b/src/you_get/extractors/tumblr.py @@ -86,7 +86,7 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): #print_info(site_info, page_title, None, size) for t in tuggles: - title = tuggles[t]['title'] + title = '[tumblr] ' + tuggles[t]['title'] ext = tuggles[t]['ext'] size = tuggles[t]['size'] url = tuggles[t]['url'] From 1a3bcb462aef6addd659c0d0df4f242b61ab6f4c Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 22 May 2024 01:10:56 +0200 Subject: [PATCH 203/235] update README.md (http:// -> https://) --- README.md | 133 ++++++++++++++++++++++++++---------------------------- 1 file changed, 64 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index c4ea4c00..5748775e 100644 --- a/README.md +++ b/README.md @@ -268,25 +268,20 @@ Type: JPEG Image (image/jpeg) Size: 0.06 MiB (66482 Bytes) Downloading rms.jpg ... -100.0% ( 0.1/0.1 MB) ├████████████████████████████████████████┤[1/1] 127 kB/s + 100% ( 0.1/ 0.1MB) ├████████████████████████████████████████┤[1/1] 127 kB/s ``` Otherwise, `you-get` will scrape the web page and try to figure out if there's anything interesting to you: ``` -$ you-get http://kopasas.tumblr.com/post/69361932517 +$ you-get https://kopasas.tumblr.com/post/69361932517 Site: Tumblr.com -Title: kopasas -Type: Unknown type (None) -Size: 0.51 MiB (536583 Bytes) - -Site: Tumblr.com -Title: tumblr_mxhg13jx4n1sftq6do1_1280 +Title: [tumblr] tumblr_mxhg13jx4n1sftq6do1_640 Type: Portable Network Graphics (image/png) -Size: 0.51 MiB (536583 Bytes) +Size: 0.11 MiB (118484 Bytes) -Downloading tumblr_mxhg13jx4n1sftq6do1_1280.png ... -100.0% ( 0.5/0.5 MB) ├████████████████████████████████████████┤[1/1] 22 MB/s +Downloading [tumblr] tumblr_mxhg13jx4n1sftq6do1_640.png ... + 100% ( 0.1/ 0.1MB) ├████████████████████████████████████████┤[1/1] 22 MB/s ``` **Note:** @@ -377,81 +372,81 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | :--: | :-- | :-----: | :-----: | :-----: | | **YouTube** | |✓| | | | **X (Twitter)** | |✓|✓| | -| VK | |✓|✓| | +| VK | |✓|✓| | | Vine | |✓| | | | Vimeo | |✓| | | -| Veoh | |✓| | | +| Veoh | |✓| | | | **Tumblr** | |✓|✓|✓| -| TED | |✓| | | +| TED | |✓| | | | SoundCloud | | | |✓| | SHOWROOM | |✓| | | | Pinterest | | |✓| | -| MTV81 | |✓| | | +| MTV81 | |✓| | | | Mixcloud | | | |✓| -| Metacafe | |✓| | | -| Magisto | |✓| | | +| Metacafe | |✓| | | +| Magisto | |✓| | | | Khan Academy | |✓| | | | Internet Archive | |✓| | | | **Instagram** | |✓|✓| | -| InfoQ | |✓| | | -| Imgur | | |✓| | -| Heavy Music Archive | | | |✓| -| Freesound | | | |✓| +| InfoQ | |✓| | | +| Imgur | | |✓| | +| Heavy Music Archive | | | |✓| +| Freesound | | | |✓| | Flickr | |✓|✓| | -| FC2 Video | |✓| | | +| FC2 Video | |✓| | | | Facebook | |✓| | | -| eHow | |✓| | | -| Dailymotion | |✓| | | -| Coub | |✓| | | -| CBS | |✓| | | -| Bandcamp | | | |✓| -| AliveThai | |✓| | | -| interest.me | |✓| | | -| **755
ナナゴーゴー** | |✓|✓| | -| **niconico
ニコニコ動画** | |✓| | | -| **163
网易视频
网易云音乐** |
|✓| |✓| -| 56网 | |✓| | | -| **AcFun** | |✓| | | -| **Baidu
百度贴吧** | |✓|✓| | -| 爆米花网 | |✓| | | -| **bilibili
哔哩哔哩** | |✓|✓|✓| -| 豆瓣 | |✓| |✓| -| 斗鱼 | |✓| | | -| 凤凰视频 | |✓| | | -| 风行网 | |✓| | | -| iQIYI
爱奇艺 | |✓| | | -| 激动网 | |✓| | | -| 酷6网 | |✓| | | -| 酷狗音乐 | | | |✓| -| 酷我音乐 | | | |✓| -| 乐视网 | |✓| | | -| 荔枝FM | | | |✓| -| 懒人听书 | | | |✓| -| 秒拍 | |✓| | | -| MioMio弹幕网 | |✓| | | -| MissEvan
猫耳FM | | | |✓| +| eHow | |✓| | | +| Dailymotion | |✓| | | +| Coub | |✓| | | +| CBS | |✓| | | +| Bandcamp | | | |✓| +| AliveThai | |✓| | | +| interest.me | |✓| | | +| **755
ナナゴーゴー** | |✓|✓| | +| **niconico
ニコニコ動画** | |✓| | | +| **163
网易视频
网易云音乐** |
|✓| |✓| +| 56网 | |✓| | | +| **AcFun** | |✓| | | +| **Baidu
百度贴吧** | |✓|✓| | +| 爆米花网 | |✓| | | +| **bilibili
哔哩哔哩** | |✓|✓|✓| +| 豆瓣 | |✓| |✓| +| 斗鱼 | |✓| | | +| 凤凰视频 | |✓| | | +| 风行网 | |✓| | | +| iQIYI
爱奇艺 | |✓| | | +| 激动网 | |✓| | | +| 酷6网 | |✓| | | +| 酷狗音乐 | | | |✓| +| 酷我音乐 | | | |✓| +| 乐视网 | |✓| | | +| 荔枝FM | | | |✓| +| 懒人听书 | | | |✓| +| 秒拍 | |✓| | | +| MioMio弹幕网 | |✓| | | +| MissEvan
猫耳FM | | | |✓| | 痞客邦 | |✓| | | -| PPTV聚力 | |✓| | | -| 齐鲁网 | |✓| | | -| QQ
腾讯视频 | |✓| | | -| 企鹅直播 | |✓| | | -| Sina
新浪视频
微博秒拍视频 |
|✓| | | -| Sohu
搜狐视频 | |✓| | | -| **Tudou
土豆** | |✓| | | -| 阳光卫视 | |✓| | | -| **Youku
优酷** | |✓| | | -| 战旗TV | |✓| | | -| 央视网 | |✓| | | -| Naver
네이버 | |✓| | | -| 芒果TV | |✓| | | -| 火猫TV | |✓| | | -| 阳光宽频网 | |✓| | | +| PPTV聚力 | |✓| | | +| 齐鲁网 | |✓| | | +| QQ
腾讯视频 | |✓| | | +| 企鹅直播 | |✓| | | +| Sina
新浪视频
微博秒拍视频 |
|✓| | | +| Sohu
搜狐视频 | |✓| | | +| **Tudou
土豆** | |✓| | | +| 阳光卫视 | |✓| | | +| **Youku
优酷** | |✓| | | +| 战旗TV | |✓| | | +| 央视网 | |✓| | | +| Naver
네이버 | |✓| | | +| 芒果TV | |✓| | | +| 火猫TV | |✓| | | +| 阳光宽频网 | |✓| | | | 西瓜视频 | |✓| | | | 新片场 | |✓| | | | 快手 | |✓|✓| | | 抖音 | |✓| | | | TikTok | |✓| | | -| 中国体育(TV) |
|✓| | | +| 中国体育(TV) |
|✓| | | | 知乎 | |✓| | | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. @@ -464,7 +459,7 @@ Check if it's already a known problem on Date: Wed, 22 May 2024 01:15:49 +0200 Subject: [PATCH 204/235] [vine] remove support (discontinued) --- README.md | 1 - src/you_get/common.py | 1 - src/you_get/extractors/__init__.py | 1 - src/you_get/extractors/tumblr.py | 4 ---- src/you_get/extractors/twitter.py | 1 - src/you_get/extractors/vine.py | 36 ------------------------------ 6 files changed, 44 deletions(-) delete mode 100644 src/you_get/extractors/vine.py diff --git a/README.md b/README.md index 5748775e..0db87791 100644 --- a/README.md +++ b/README.md @@ -373,7 +373,6 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | **YouTube** | |✓| | | | **X (Twitter)** | |✓|✓| | | VK | |✓|✓| | -| Vine | |✓| | | | Vimeo | |✓| | | | Veoh | |✓| | | | **Tumblr** | |✓|✓|✓| diff --git a/src/you_get/common.py b/src/you_get/common.py index d14eb980..752c63f2 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -111,7 +111,6 @@ SITES = { 'wanmen' : 'wanmen', 'weibo' : 'miaopai', 'veoh' : 'veoh', - 'vine' : 'vine', 'vk' : 'vk', 'x' : 'twitter', 'xiaokaxiu' : 'yixia', diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 8c43a8bc..e68cd174 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -74,7 +74,6 @@ from .twitter import * from .ucas import * from .veoh import * from .vimeo import * -from .vine import * from .vk import * from .w56 import * from .wanmen import * diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py index b0dc99f6..08e605e8 100644 --- a/src/you_get/extractors/tumblr.py +++ b/src/you_get/extractors/tumblr.py @@ -6,7 +6,6 @@ from ..common import * from .universal import * from .dailymotion import dailymotion_download from .vimeo import vimeo_download -from .vine import vine_download def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): if re.match(r'https?://\d+\.media\.tumblr\.com/', url): @@ -125,9 +124,6 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): elif re.search(r'dailymotion\.com', iframe_url): dailymotion_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs) return - elif re.search(r'vine\.co', iframe_url): - vine_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs) - return else: iframe_html = get_content(iframe_url) real_url = r1(r'([^<]*)', html) - stream = r1(r'', html) - if not stream: # https://vine.co/v/.../card - stream = r1(r'"videoUrl":"([^"]+)"', html) - if stream: - stream = stream.replace('\\/', '/') - else: - posts_url = 'https://archive.vine.co/posts/' + video_id + '.json' - json_data = json.loads(get_content(posts_url)) - stream = json_data['videoDashUrl'] - title = json_data['description'] - if title == "": - title = json_data['username'].replace(" ", "_") + "_" + video_id - - mime, ext, size = url_info(stream) - - print_info(site_info, title, mime, size) - if not info_only: - download_urls([stream], title, ext, size, output_dir, merge=merge) - - -site_info = "Vine.co" -download = vine_download -download_playlist = playlist_not_supported('vine') From 9aa538a403452901ccacca02deb12977f9cbcda0 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 22 May 2024 01:32:38 +0200 Subject: [PATCH 205/235] update LICENSE.txt --- LICENSE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE.txt b/LICENSE.txt index a193d8e2..fcc26433 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2012-2020 Mort Yao and other contributors +Copyright (c) 2012-2024 Mort Yao and other contributors (https://github.com/soimort/you-get/graphs/contributors) Copyright (c) 2012 Boyu Guo From dbb767f9288b73d852945a1dc965fd12640c73fe Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 22 May 2024 01:58:47 +0200 Subject: [PATCH 206/235] version 0.4.1700 --- SECURITY.md | 2 +- src/you_get/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SECURITY.md b/SECURITY.md index 5041b2f2..d9fb8cf3 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,4 +2,4 @@ ## Reporting a Vulnerability -Please report security issues to +Please report security issues to . diff --git a/src/you_get/version.py b/src/you_get/version.py index 440488a9..14ed6a99 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1650' +__version__ = '0.4.1700' From 29f513821df4b0ec3ed0b514a0897f8c336b51e7 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 22 May 2024 02:53:12 +0200 Subject: [PATCH 207/235] [tests] disable test_bilibili temporarily --- tests/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test.py b/tests/test.py index 720dc839..e8a378f5 100644 --- a/tests/test.py +++ b/tests/test.py @@ -43,8 +43,8 @@ class YouGetTests(unittest.TestCase): def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True) - def test_bilibili(self): - bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True) + #def test_bilibili(self): + #bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True) #def test_soundcloud(self): ## single song From 567d1059fce22fe790e059af9812ab7cd12135db Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 23 Jun 2024 10:17:02 +0200 Subject: [PATCH 208/235] [youtube] fix 403 error and throttling (close #2950) --- setup.py | 3 +- src/you_get/extractors/youtube.py | 176 ++++++++++++++---------------- tests/test.py | 6 +- 3 files changed, 88 insertions(+), 97 deletions(-) diff --git a/setup.py b/setup.py index 470c99ed..0804ae33 100755 --- a/setup.py +++ b/setup.py @@ -56,7 +56,8 @@ setup( entry_points = {'console_scripts': proj_info['console_scripts']}, - extras_require={ + install_requires = ['dukpy'], + extras_require = { 'socks': ['PySocks'], } ) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index ddf12be9..ee30644b 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -3,6 +3,8 @@ from ..common import * from ..extractor import VideoExtractor +import dukpy +from urllib.parse import urlparse, parse_qs, urlencode from xml.dom.minidom import parseString class YouTube(VideoExtractor): @@ -68,45 +70,32 @@ class YouTube(VideoExtractor): 'audio_encoding': 'AAC', 'audio_bitrate': '24'}, ] + def dethrottle(js, url): + def n_to_n(js, n): + # Examples: + # yma - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js + # Xka - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js + f1 = match1(js, r'a\.set\("n",b\),[$\w]+\.length\|\|([$\w]+)\(""\)') + f1def = match1(js, r'\W%s=(function\(\w+\).+?\)});' % re.escape(f1)) + n = dukpy.evaljs('(%s)("%s")' % (f1def, n)) + return n + + u = urlparse(url) + qs = parse_qs(u.query) + n = n_to_n(js, qs['n'][0]) + qs['n'] = [n] + return u._replace(query=urlencode(qs, doseq=True)).geturl() + def s_to_sig(js, s): # Examples: - # - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js - # - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js - # - https://www.youtube.com/yts/jsbin/player-vfls4aurX/da_DK/base.js - # - https://www.youtube.com/yts/jsbin/player_ias-vfl_RGK2l/en_US/base.js - # - https://www.youtube.com/yts/jsbin/player-vflRjqq_w/da_DK/base.js - # - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js - # - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js - # - https://www.youtube.com/s/player/50e823fc/player_ias.vflset/sv_SE/base.js - # - https://www.youtube.com/s/player/3b5d5649/player_ias.vflset/sv_SE/base.js - # - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js - def tr_js(code): - code = re.sub(r'function', r'def', code) - # add prefix '_sig_' to prevent namespace pollution - code = re.sub(r'(\W)([$\w][$\w][$\w]?)\(', r'\1_sig_\2(', code) - code = re.sub(r'\$', '_dollar', code) - code = re.sub(r'\{', r': ', code) - code = re.sub(r'\}', r'\n', code) - code = re.sub(r'var\s+', r'', code) - code = re.sub(r'(\w+).join\(""\)', r'"".join(\1)', code) - code = re.sub(r'(\w+).length', r'len(\1)', code) - code = re.sub(r'(\w+).slice\((\w+)\)', r'\1[\2:]', code) - code = re.sub(r'(\w+).splice\((\w+),(\w+)\)', r'del \1[\2:\2+\3]', code) - code = re.sub(r'(\w+).split\(""\)', r'list(\1)', code) - return code - - js = js.replace('\n', ' ') - f1 = match1(js, r'\.set\(\w+\.sp,encodeURIComponent\(([$\w]+)') or \ - match1(js, r'\.set\(\w+\.sp,\(0,window\.encodeURIComponent\)\(([$\w]+)') or \ - match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \ - match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)') or \ - match1(js, r'=([$\w]+)\(decodeURIComponent\(') - f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \ - match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) - f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) + # BPa - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js + # Xva - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js + js_code = '' + f1 = match1(js, r'=([$\w]+)\(decodeURIComponent\(') + f1def = match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) + f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) # remove . prefix f1def = 'function %s%s' % (f1, f1def) - code = tr_js(f1def) - f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def)) + f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def)) # find all invoked function names for f2 in f2s: f2e = re.escape(f2) f2def = re.search(r'[^$\w]%s:function\((\w+,\w+)\)(\{[^\{\}]+\})' % f2e, js) @@ -115,13 +104,10 @@ class YouTube(VideoExtractor): else: f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js) f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2)) - f2 = re.sub(r'\$', '_dollar', f2) # replace dollar sign - code = code + 'global _sig_%s\n' % f2 + tr_js(f2def) - - f1 = re.sub(r'\$', '_dollar', f1) # replace dollar sign - code = code + '_sig=_sig_%s(s)' % f1 - exec(code, globals(), locals()) - return locals()['_sig'] + js_code += f2def + ';' + js_code += f1def + ';%s("%s")' % (f1, s) + sig = dukpy.evaljs(js_code) + return sig def chunk_by_range(url, size): urls = [] @@ -209,6 +195,7 @@ class YouTube(VideoExtractor): raise elif video_info['status'] == ['ok']: if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']: + # FIXME: this is basically dead code, use_cipher_signature is always true self.title = parse.unquote_plus(json.loads(video_info["player_response"][0])["videoDetails"]["title"]) # Parse video page (for DASH) video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) @@ -259,23 +246,30 @@ class YouTube(VideoExtractor): self.html5player = None else: - # Parse video page instead + # Extract from video page + logging.debug('Extracting from the video page...') video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - try: # FIXME: we should extract ytInitialPlayerResponse more reliably - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + try: + jsUrl = re.search('([^"]*/base\.js)"', video_page).group(1) except: - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + log.wtf('[Failed] Unable to find base.js on the video page') + # FIXME: do we still need this? + jsUrl = jsUrl.replace('\/', '/') # unescape URL (for age-restricted videos) + self.html5player = 'https://www.youtube.com' + jsUrl + logging.debug('Retrieving the player code...') + self.js = get_content(self.html5player).replace('\n', ' ') + logging.debug('Loading ytInitialPlayerResponse...') + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n|)', video_page).group(1)) + + # Get the video title self.title = ytInitialPlayerResponse["videoDetails"]["title"] - if re.search('([^"]*/base\.js)"', video_page): - self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) - else: - self.html5player = None stream_list = ytInitialPlayerResponse['streamingData']['formats'] elif video_info['status'] == ['fail']: + # FIXME: this is basically dead code, status is always ok logging.debug('ERRORCODE: %s' % video_info['errorcode'][0]) if video_info['errorcode'] == ['150']: # FIXME: still relevant? @@ -327,7 +321,7 @@ class YouTube(VideoExtractor): log.wtf('[Failed] Invalid status.', exit_code=None) raise - # YouTube Live + # FIXME: YouTube Live if ytplayer_config and (ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1'): if 'hlsvp' in ytplayer_config['args']: hlsvp = ytplayer_config['args']['hlsvp'] @@ -343,6 +337,7 @@ class YouTube(VideoExtractor): for stream in stream_list: if isinstance(stream, str): + # FIXME: dead code? metadata = parse.parse_qs(stream) stream_itag = metadata['itag'][0] self.streams[stream_itag] = { @@ -357,22 +352,29 @@ class YouTube(VideoExtractor): 'container': mime_to_container(metadata['type'][0].split(';')[0]), } else: - stream_itag = str(stream['itag']) - self.streams[stream_itag] = { + if 'signatureCipher' in stream: + logging.debug('Parsing signatureCipher for itag=%s...' % stream['itag']) + qs = parse_qs(stream['signatureCipher']) + #logging.debug(qs) + sp = qs['sp'][0] + sig = self.__class__.s_to_sig(self.js, qs['s'][0]) + url = qs['url'][0] + '&{}={}'.format(sp, sig) + elif 'url' in stream: + url = stream['url'] + else: + log.wtf('No signatureCipher or url for itag=%s' % stream['itag']) + url = self.__class__.dethrottle(self.js, url) + + self.streams[str(stream['itag'])] = { 'itag': str(stream['itag']), - 'url': stream['url'] if 'url' in stream else None, - 'sig': None, - 's': None, + 'url': url, 'quality': stream['quality'], 'type': stream['mimeType'], 'mime': stream['mimeType'].split(';')[0], 'container': mime_to_container(stream['mimeType'].split(';')[0]), } - if 'signatureCipher' in stream: - self.streams[stream_itag].update(dict([(_.split('=')[0], parse.unquote(_.split('=')[1])) - for _ in stream['signatureCipher'].split('&')])) - # Prepare caption tracks + # FIXME: Prepare caption tracks try: try: caption_tracks = json.loads(ytplayer_config['args']['player_response'])['captions']['playerCaptionsTracklistRenderer']['captionTracks'] @@ -408,6 +410,7 @@ class YouTube(VideoExtractor): # Prepare DASH streams (NOTE: not every video has DASH streams!) try: + # FIXME: dead code? dashmpd = ytplayer_config['args']['dashmpd'] dash_xml = parseString(get_content(dashmpd)) for aset in dash_xml.getElementsByTagName('AdaptationSet'): @@ -473,12 +476,8 @@ class YouTube(VideoExtractor): 'size': int(dash_size) + int(dash_webm_a_size) } except: - # VEVO - if not self.html5player: return - self.html5player = self.html5player.replace('\/', '/') # unescape URL (for age-restricted videos) - self.js = get_content(self.html5player) - try: + # FIXME: dead code? # Video info from video page (not always available) streams = [dict([(i.split('=')[0], parse.unquote(i.split('=')[1])) @@ -486,6 +485,7 @@ class YouTube(VideoExtractor): for afmt in ytplayer_config['args']['adaptive_fmts'].split(',')] except: if 'adaptive_fmts' in video_info: + # FIXME: dead code? streams = [dict([(i.split('=')[0], parse.unquote(i.split('=')[1])) for i in afmt.split('&')]) @@ -493,12 +493,15 @@ class YouTube(VideoExtractor): else: try: try: + # FIXME: dead code? streams = json.loads(video_info['player_response'][0])['streamingData']['adaptiveFormats'] except: streams = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] except: # no DASH stream at all + # FIXME: dead code? return + # FIXME: dead code? # streams without contentLength got broken urls, just remove them (#2767) streams = [stream for stream in streams if 'contentLength' in stream] @@ -523,34 +526,33 @@ class YouTube(VideoExtractor): del stream['contentLength'] del stream['initRange'] del stream['indexRange'] - if 'signatureCipher' in stream: - stream.update(dict([(_.split('=')[0], parse.unquote(_.split('=')[1])) - for _ in stream['signatureCipher'].split('&')])) - del stream['signatureCipher'] - for stream in streams: # get over speed limiting - stream['url'] += '&ratebypass=yes' + if 'signatureCipher' in stream: + logging.debug('Parsing signatureCipher for itag=%s...' % stream['itag']) + qs = parse_qs(stream['signatureCipher']) + #logging.debug(qs) + sp = qs['sp'][0] + sig = self.__class__.s_to_sig(self.js, qs['s'][0]) + url = qs['url'][0] + '&ratebypass=yes&{}={}'.format(sp, sig) + elif 'url' in stream: + url = stream['url'] + else: + log.wtf('No signatureCipher or url for itag=%s' % stream['itag']) + url = self.__class__.dethrottle(self.js, url) + stream['url'] = url + for stream in streams: # audio if stream['type'].startswith('audio/mp4'): dash_mp4_a_url = stream['url'] - if 's' in stream: - sig = self.__class__.s_to_sig(self.js, stream['s']) - dash_mp4_a_url += '&sig={}'.format(sig) dash_mp4_a_size = stream['clen'] elif stream['type'].startswith('audio/webm'): dash_webm_a_url = stream['url'] - if 's' in stream: - sig = self.__class__.s_to_sig(self.js, stream['s']) - dash_webm_a_url += '&sig={}'.format(sig) dash_webm_a_size = stream['clen'] for stream in streams: # video if 'size' in stream: if stream['type'].startswith('video/mp4'): mimeType = 'video/mp4' dash_url = stream['url'] - if 's' in stream: - sig = self.__class__.s_to_sig(self.js, stream['s']) - dash_url += '&sig={}'.format(sig) dash_size = stream['clen'] itag = stream['itag'] dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) @@ -567,9 +569,6 @@ class YouTube(VideoExtractor): elif stream['type'].startswith('video/webm'): mimeType = 'video/webm' dash_url = stream['url'] - if 's' in stream: - sig = self.__class__.s_to_sig(self.js, stream['s']) - dash_url += '&sig={}'.format(sig) dash_size = stream['clen'] itag = stream['itag'] audio_url = None @@ -610,15 +609,6 @@ class YouTube(VideoExtractor): if stream_id in self.streams: src = self.streams[stream_id]['url'] - if self.streams[stream_id]['sig'] is not None: - sig = self.streams[stream_id]['sig'] - src += '&sig={}'.format(sig) - elif self.streams[stream_id]['s'] is not None: - if not hasattr(self, 'js'): - self.js = get_content(self.html5player) - s = self.streams[stream_id]['s'] - sig = self.__class__.s_to_sig(self.js, s) - src += '&sig={}'.format(sig) self.streams[stream_id]['src'] = [src] self.streams[stream_id]['size'] = urls_size(self.streams[stream_id]['src']) diff --git a/tests/test.py b/tests/test.py index e8a378f5..8d348fbc 100644 --- a/tests/test.py +++ b/tests/test.py @@ -36,9 +36,9 @@ class YouGetTests(unittest.TestCase): # 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa # info_only=True #) - #youtube.download( - # 'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True - #) + youtube.download( + 'https://www.youtube.com/watch?v=oRdxUFDoQe0', info_only=True + ) def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True) From b0e6f0cadcfa0598ea5954334b87dab5c76ae238 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 23 Jun 2024 14:43:43 +0200 Subject: [PATCH 209/235] [youtube] remove dead code --- src/you_get/extractors/youtube.py | 395 ++++++------------------------ 1 file changed, 78 insertions(+), 317 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index ee30644b..f5aaf50e 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -182,204 +182,54 @@ class YouTube(VideoExtractor): if re.search('\Wlist=', self.url) and not kwargs.get('playlist'): log.w('This video is from a playlist. (use --playlist to download all videos in the playlist.)') - # Get video info - # 'eurl' is a magic parameter that can bypass age restriction - # full form: 'eurl=https%3A%2F%2Fyoutube.googleapis.com%2Fv%2F{VIDEO_ID}' - #video_info = parse.parse_qs(get_content('https://www.youtube.com/get_video_info?video_id={}&eurl=https%3A%2F%2Fy'.format(self.vid))) - #logging.debug('STATUS: %s' % video_info['status'][0]) - video_info = {'status': ['ok'], 'use_cipher_signature': 'True'} + # Extract from video page + logging.debug('Extracting from the video page...') + video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytplayer_config = None - if 'status' not in video_info: - log.wtf('[Failed] Unknown status.', exit_code=None) - raise - elif video_info['status'] == ['ok']: - if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']: - # FIXME: this is basically dead code, use_cipher_signature is always true - self.title = parse.unquote_plus(json.loads(video_info["player_response"][0])["videoDetails"]["title"]) - # Parse video page (for DASH) - video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - try: - try: - # Complete ytplayer_config - ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) + try: + jsUrl = re.search('([^"]*/base\.js)"', video_page).group(1) + except: + log.wtf('[Failed] Unable to find base.js on the video page') + # FIXME: do we still need this? + jsUrl = jsUrl.replace('\/', '/') # unescape URL (for age-restricted videos) + self.html5player = 'https://www.youtube.com' + jsUrl + logging.debug('Retrieving the player code...') + self.js = get_content(self.html5player).replace('\n', ' ') - # Workaround: get_video_info returns bad s. Why? - if 'url_encoded_fmt_stream_map' not in ytplayer_config['args']: - stream_list = json.loads(ytplayer_config['args']['player_response'])['streamingData']['formats'] - else: - stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') - #stream_list = ytplayer_config['args']['adaptive_fmts'].split(',') + logging.debug('Loading ytInitialPlayerResponse...') + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n|)', video_page).group(1)) - if 'assets' in ytplayer_config: - self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] - elif re.search('([^"]*/base\.js)"', video_page): - self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) - self.html5player = self.html5player.replace('\/', '/') # unescape URL - else: - self.html5player = None + # Get the video title + self.title = ytInitialPlayerResponse["videoDetails"]["title"] - except: - # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}} - try: # FIXME: we should extract ytInitialPlayerResponse more reliably - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) - except: - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) - - stream_list = ytInitialPlayerResponse['streamingData']['formats'] - #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] - - if re.search('([^"]*/base\.js)"', video_page): - self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) - else: - self.html5player = None - - except: - if 'url_encoded_fmt_stream_map' not in video_info: - stream_list = json.loads(video_info['player_response'][0])['streamingData']['formats'] - else: - stream_list = video_info['url_encoded_fmt_stream_map'][0].split(',') - - if re.search('([^"]*/base\.js)"', video_page): - self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) - else: - self.html5player = None - - else: - # Extract from video page - logging.debug('Extracting from the video page...') - video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - - try: - jsUrl = re.search('([^"]*/base\.js)"', video_page).group(1) - except: - log.wtf('[Failed] Unable to find base.js on the video page') - # FIXME: do we still need this? - jsUrl = jsUrl.replace('\/', '/') # unescape URL (for age-restricted videos) - self.html5player = 'https://www.youtube.com' + jsUrl - logging.debug('Retrieving the player code...') - self.js = get_content(self.html5player).replace('\n', ' ') - - logging.debug('Loading ytInitialPlayerResponse...') - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n|)', video_page).group(1)) - - # Get the video title - self.title = ytInitialPlayerResponse["videoDetails"]["title"] - - stream_list = ytInitialPlayerResponse['streamingData']['formats'] - - elif video_info['status'] == ['fail']: - # FIXME: this is basically dead code, status is always ok - logging.debug('ERRORCODE: %s' % video_info['errorcode'][0]) - if video_info['errorcode'] == ['150']: - # FIXME: still relevant? - if cookies: - # Load necessary cookies into headers (for age-restricted videos) - consent, ssid, hsid, sid = 'YES', '', '', '' - for cookie in cookies: - if cookie.domain.endswith('.youtube.com'): - if cookie.name == 'SSID': - ssid = cookie.value - elif cookie.name == 'HSID': - hsid = cookie.value - elif cookie.name == 'SID': - sid = cookie.value - cookie_str = 'CONSENT=%s; SSID=%s; HSID=%s; SID=%s' % (consent, ssid, hsid, sid) - - video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid, - headers={'Cookie': cookie_str}) - else: - video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - - try: - ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+});ytplayer', video_page).group(1)) - except: - msg = re.search('class="message">([^<]+)<', video_page).group(1) - log.wtf('[Failed] Got message "%s". Try to login with --cookies.' % msg.strip()) - - if 'title' in ytplayer_config['args']: - # 150 Restricted from playback on certain sites - # Parse video page instead - self.title = ytplayer_config['args']['title'] - self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] - stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') - else: - log.wtf('[Error] The uploader has not made this video available in your country.', exit_code=None) - raise - #self.title = re.search(' Date: Sun, 23 Jun 2024 17:03:31 +0200 Subject: [PATCH 210/235] [youtube] check playabilityStatus and ask for cookies if needed --- src/you_get/extractors/youtube.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index f5aaf50e..1d370ec2 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -190,8 +190,6 @@ class YouTube(VideoExtractor): jsUrl = re.search('([^"]*/base\.js)"', video_page).group(1) except: log.wtf('[Failed] Unable to find base.js on the video page') - # FIXME: do we still need this? - jsUrl = jsUrl.replace('\/', '/') # unescape URL (for age-restricted videos) self.html5player = 'https://www.youtube.com' + jsUrl logging.debug('Retrieving the player code...') self.js = get_content(self.html5player).replace('\n', ' ') @@ -202,6 +200,14 @@ class YouTube(VideoExtractor): # Get the video title self.title = ytInitialPlayerResponse["videoDetails"]["title"] + # Check the status + playabilityStatus = ytInitialPlayerResponse['playabilityStatus'] + status = playabilityStatus['status'] + logging.debug('status: %s' % status) + if status != 'OK': + # If cookies are loaded, status should be OK + log.wtf('[Failed] %s (use --cookies to load cookies)' % playabilityStatus['reason']) + stream_list = ytInitialPlayerResponse['streamingData']['formats'] for stream in stream_list: From 2c976407e5e84becd2b3cc85eccbd3de04ac7f56 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 23 Jun 2024 17:42:54 +0200 Subject: [PATCH 211/235] [youtube] show subreason if possible --- src/you_get/extractors/youtube.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 1d370ec2..a76e0d2e 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -206,7 +206,13 @@ class YouTube(VideoExtractor): logging.debug('status: %s' % status) if status != 'OK': # If cookies are loaded, status should be OK - log.wtf('[Failed] %s (use --cookies to load cookies)' % playabilityStatus['reason']) + try: + subreason = playabilityStatus['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'][0]['text'] + log.e('[Error] %s (%s)' % (playabilityStatus['reason'], subreason)) + except: + log.e('[Error] %s' % playabilityStatus['reason']) + log.e('View the video from a browser and export the cookies, then use --cookies to load cookies.') + exit(1) stream_list = ytInitialPlayerResponse['streamingData']['formats'] From 68965e0a962b050d901de10605af8238074c3101 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 23 Jun 2024 18:01:53 +0200 Subject: [PATCH 212/235] [youtube] ask for cookies only if status is LOGIN_REQUIRED --- src/you_get/extractors/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index a76e0d2e..82d271f0 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -211,7 +211,8 @@ class YouTube(VideoExtractor): log.e('[Error] %s (%s)' % (playabilityStatus['reason'], subreason)) except: log.e('[Error] %s' % playabilityStatus['reason']) - log.e('View the video from a browser and export the cookies, then use --cookies to load cookies.') + if status == 'LOGIN_REQUIRED': + log.e('View the video from a browser and export the cookies, then use --cookies to load cookies.') exit(1) stream_list = ytInitialPlayerResponse['streamingData']['formats'] From 2569aa03f66abb48e348d89a37d7fbeb31adc0c1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 23 Jun 2024 18:44:47 +0200 Subject: [PATCH 213/235] [youtube] prompt for installing dukpy if ImportError --- src/you_get/extractors/youtube.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 82d271f0..38da668b 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -3,7 +3,12 @@ from ..common import * from ..extractor import VideoExtractor -import dukpy +try: + import dukpy +except ImportError: + log.e('Please install dukpy in order to extract videos from YouTube:') + log.e('$ pip install dukpy') + exit(0) from urllib.parse import urlparse, parse_qs, urlencode from xml.dom.minidom import parseString From 4b9c9510849c48984b165a348aea52dc8176a52e Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 23 Jun 2024 20:30:31 +0200 Subject: [PATCH 214/235] switch from "setup.py test" to "python -m unittest" (pytest has a weird 'I/O operation on closed file' bug though) --- .github/workflows/python-package.yml | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 1d9afe9f..2ee74663 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -26,7 +26,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools - pip install flake8 pytest + pip install flake8 if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | diff --git a/Makefile b/Makefile index fe4a238c..7b433237 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ i: @(cd src/; python3 -i -c 'import you_get; print("You-Get %s\n>>> import you_get" % you_get.version.__version__)') test: - $(SETUP) test + (cd src; python -m unittest discover -s ../tests) clean: zenity --question From 31365ac3686edbfa93cb63f019ffa74cdd64bc7e Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 23 Jun 2024 20:56:10 +0200 Subject: [PATCH 215/235] add requirements.txt --- .gitignore | 3 ++- requirements.txt | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index 716c13b3..57f9412b 100644 --- a/.gitignore +++ b/.gitignore @@ -79,6 +79,7 @@ _* *.ts *.webm *.xml +*.json /.env /.idea *.m4a @@ -88,5 +89,5 @@ _* *.zip +.emacs* .vscode - diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..7af04e46 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +# runtime dependencies +dukpy From 450ce6e521c37a21e82c71f8077c8ccfdbc0b105 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 23 Jun 2024 21:04:52 +0200 Subject: [PATCH 216/235] version 0.4.1710 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 14ed6a99..cbb80d9c 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1700' +__version__ = '0.4.1710' From 08ff14c45547b6b24e50e9643bab7b25b80b492c Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 23 Jun 2024 22:30:16 +0200 Subject: [PATCH 217/235] Makefile: stop using python setup.py commands --- Makefile | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 7b433237..a6222024 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,9 @@ -SETUP = python3 setup.py - -.PHONY: default i test clean all html rst build sdist bdist bdist_egg bdist_wheel install release +.PHONY: default i test clean all html rst build install release default: i i: - @(cd src/; python3 -i -c 'import you_get; print("You-Get %s\n>>> import you_get" % you_get.version.__version__)') + @(cd src; python -i -c 'import you_get; print("You-Get %s\n>>> import you_get" % you_get.version.__version__)') test: (cd src; python -m unittest discover -s ../tests) @@ -16,7 +14,7 @@ clean: find . | grep __pycache__ | xargs rm -fr find . | grep .pyc | xargs rm -f -all: build sdist bdist bdist_egg bdist_wheel +all: build html: pandoc README.md > README.html @@ -25,25 +23,11 @@ rst: pandoc -s -t rst README.md > README.rst build: - $(SETUP) build - -sdist: - $(SETUP) sdist - -bdist: - $(SETUP) bdist - -bdist_egg: - $(SETUP) bdist_egg - -bdist_wheel: - $(SETUP) bdist_wheel + python -m build install: - $(SETUP) install --user --prefix= + python -m pip install . -release: - #zenity --question - $(SETUP) sdist bdist_wheel - echo 'Upload new version to PyPI using:' - echo ' twine upload --sign dist/you-get-VERSION.tar.gz dist/you_get-VERSION-py3-none-any.whl' +release: build + @echo 'Upload new version to PyPI using:' + @echo ' twine upload --sign dist/you_get-VERSION*' From ef6a97301328d1001922d865a9bd830c5da85291 Mon Sep 17 00:00:00 2001 From: Rui Chen Date: Sun, 23 Jun 2024 22:57:59 -0400 Subject: [PATCH 218/235] chore: update `MANIFEST.in` to include `contrib` --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 521b023b..d3164364 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,3 +4,4 @@ include Makefile include README.md include you-get include you-get.json +recursive-include contrib * From edf29ae1ef7906b606b3ae4099582df655c12676 Mon Sep 17 00:00:00 2001 From: Rui Chen Date: Sun, 23 Jun 2024 22:59:18 -0400 Subject: [PATCH 219/235] chore: also include `you-get.plugin.zsh` and `CONTRIBUTING.md` --- MANIFEST.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index d3164364..ed688fde 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,9 @@ include *.rst include *.txt include Makefile +include CONTRIBUTING.md include README.md include you-get include you-get.json +include you-get.plugin.zsh recursive-include contrib * From ed3f48810db3eb8664f9851effe5f9af241e25a1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 25 Jun 2024 03:33:21 +0200 Subject: [PATCH 220/235] [common] update UA --- src/you_get/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 752c63f2..2e2fbebd 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -145,7 +145,7 @@ fake_headers = { 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/123.0.2420.97' # Latest Edge + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.2592.68' # Latest Edge } if sys.stdout.isatty(): From 0c216b3eb40c610784c53561896451775e6a92b1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 25 Jun 2024 05:53:30 +0200 Subject: [PATCH 221/235] [youtube] more detailed logging of stream formats --- src/you_get/extractors/youtube.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 38da668b..cca65f08 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -223,8 +223,9 @@ class YouTube(VideoExtractor): stream_list = ytInitialPlayerResponse['streamingData']['formats'] for stream in stream_list: + logging.debug('Found format: itag=%s' % stream['itag']) if 'signatureCipher' in stream: - logging.debug('Parsing signatureCipher for itag=%s...' % stream['itag']) + logging.debug(' Parsing signatureCipher for itag=%s...' % stream['itag']) qs = parse_qs(stream['signatureCipher']) #logging.debug(qs) sp = qs['sp'][0] @@ -233,7 +234,7 @@ class YouTube(VideoExtractor): elif 'url' in stream: url = stream['url'] else: - log.wtf('No signatureCipher or url for itag=%s' % stream['itag']) + log.wtf(' No signatureCipher or url for itag=%s' % stream['itag']) url = self.__class__.dethrottle(self.js, url) self.streams[str(stream['itag'])] = { @@ -285,15 +286,19 @@ class YouTube(VideoExtractor): streams = [stream for stream in streams if 'contentLength' in stream] for stream in streams: + logging.debug('Found adaptiveFormat: itag=%s' % stream['itag']) stream['itag'] = str(stream['itag']) if 'qualityLabel' in stream: stream['quality_label'] = stream['qualityLabel'] del stream['qualityLabel'] + logging.debug(' quality_label: \t%s' % stream['quality_label']) if 'width' in stream: stream['size'] = '{}x{}'.format(stream['width'], stream['height']) del stream['width'] del stream['height'] + logging.debug(' size: \t%s' % stream['size']) stream['type'] = stream['mimeType'] + logging.debug(' type: \t%s' % stream['type']) stream['clen'] = stream['contentLength'] stream['init'] = '{}-{}'.format( stream['initRange']['start'], @@ -307,7 +312,7 @@ class YouTube(VideoExtractor): del stream['indexRange'] if 'signatureCipher' in stream: - logging.debug('Parsing signatureCipher for itag=%s...' % stream['itag']) + logging.debug(' Parsing signatureCipher for itag=%s...' % stream['itag']) qs = parse_qs(stream['signatureCipher']) #logging.debug(qs) sp = qs['sp'][0] From 882b9c07f80f0b666b3f1c4fac0e7625c5c0399f Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 27 Jun 2024 05:53:56 +0200 Subject: [PATCH 222/235] [youtube] separate files for autogenerated caption tracks --- src/you_get/extractors/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index cca65f08..e36124c8 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -274,7 +274,10 @@ class YouTube(VideoExtractor): srt += '%s --> %s\n' % (start, finish) srt += '%s\n\n' % content - self.caption_tracks[lang] = srt + if 'kind' in ct: + self.caption_tracks[ct['vssId']] = srt # autogenerated + else: + self.caption_tracks[lang] = srt except: pass # Prepare DASH streams From 1a928fac497de1faabe625ea905efc64a1cf7b06 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 14 Jul 2024 08:28:21 +0200 Subject: [PATCH 223/235] [youtube] fix extraction --- src/you_get/extractors/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index e36124c8..f87ea31b 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -80,7 +80,8 @@ class YouTube(VideoExtractor): # Examples: # yma - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js # Xka - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js - f1 = match1(js, r'a\.set\("n",b\),[$\w]+\.length\|\|([$\w]+)\(""\)') + # jma - https://www.youtube.com/s/player/8d9f6215/player_ias.vflset/sv_SE/base.js + f1 = match1(js, r',[$\w]+\.length\|\|([$\w]+)\(""\)\)}};') f1def = match1(js, r'\W%s=(function\(\w+\).+?\)});' % re.escape(f1)) n = dukpy.evaljs('(%s)("%s")' % (f1def, n)) return n From aba2852f84e8b5ea7bdb06f14e95a9ba4e8e0f84 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 14 Jul 2024 08:29:25 +0200 Subject: [PATCH 224/235] version 0.4.1718 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index cbb80d9c..742f2def 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1710' +__version__ = '0.4.1718' From 899e2b6b2b0f85a627c8241384ed5d28e43a49c1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 24 Jul 2024 00:26:54 +0200 Subject: [PATCH 225/235] [youtube] sloppy fix --- src/you_get/extractors/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index f87ea31b..dd06daf8 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -201,7 +201,7 @@ class YouTube(VideoExtractor): self.js = get_content(self.html5player).replace('\n', ' ') logging.debug('Loading ytInitialPlayerResponse...') - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n|)', video_page).group(1)) + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n||var )', video_page).group(1)) # Get the video title self.title = ytInitialPlayerResponse["videoDetails"]["title"] From 4ab02216cdfde52c9828c48644accde08c85e575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wanlin=20Wang=20=E7=8E=8B=E4=B8=87=E9=9C=96?= Date: Thu, 25 Jul 2024 17:20:26 +0800 Subject: [PATCH 226/235] Add ytInitialPlayerResponse checker to let user easily know problem --- src/you_get/extractors/youtube.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index dd06daf8..2e31e646 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -175,6 +175,16 @@ class YouTube(VideoExtractor): pass # FIXME: show DASH stream sizes (by default) for playlist videos + def check_playability_response(self, ytInitialPlayerResponse): + STATUS_OK = "OK" + + playerResponseStatus = ytInitialPlayerResponse["playabilityStatus"]["status"] + if playerResponseStatus != STATUS_OK: + reason = ytInitialPlayerResponse["playabilityStatus"].get("reason", "") + raise AssertionError( + f"Server refused to provide video details. Returned status: {playerResponseStatus}, reason: {reason}." + ) + def prepare(self, **kwargs): assert self.url or self.vid @@ -202,6 +212,7 @@ class YouTube(VideoExtractor): logging.debug('Loading ytInitialPlayerResponse...') ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n||var )', video_page).group(1)) + self.check_playability_response(ytInitialPlayerResponse) # Get the video title self.title = ytInitialPlayerResponse["videoDetails"]["title"] From 7acebdab9d6ff76ecb98f6cdf753e6f20d333f00 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 30 Jul 2024 00:28:13 +0200 Subject: [PATCH 227/235] [instagram] fix extraction --- src/you_get/common.py | 2 +- src/you_get/extractors/instagram.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 2e2fbebd..f6956fad 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -145,7 +145,7 @@ fake_headers = { 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.2592.68' # Latest Edge + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/126.0.2592.113' # Latest Edge } if sys.stdout.isatty(): diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index 8e261fe7..b0fd6f0f 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -5,8 +5,13 @@ __all__ = ['instagram_download'] from ..common import * def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwargs): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.2592.87', + 'sec-fetch-mode': 'navigate' # important + } + url = r1(r'([^?]*)', url) - cont = get_content(url, headers=fake_headers) + cont = get_content(url, headers=headers) vid = r1(r'instagram.com/\w+/([^/]+)', url) description = r1(r' Date: Wed, 31 Jul 2024 13:45:22 +0200 Subject: [PATCH 228/235] Add Flox as an installation option --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 0db87791..a3cb7cea 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,14 @@ You can install `you-get` easily via: # pkg install you-get ``` +### Option 7: Flox (Mac, Linux, and Windows WSL) + +You can install `you-get` easily via: + +``` +$ flox install you-get +``` + ### Shell completion Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](https://github.com/soimort/you-get/tree/develop/contrib/completion). Please consult your shell's manual for how to take advantage of them. From bc0e680ed5cd86f968641639db4b933fe6744b29 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 2 Aug 2024 20:20:53 +0200 Subject: [PATCH 229/235] [tiktok] support short URLs (vt.tiktok.com) --- src/you_get/extractors/tiktok.py | 2 ++ tests/test.py | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index 3cbb59d3..d1d98c41 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -16,6 +16,8 @@ def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): m = re.match('(https?://)?([^/]+)(/.*)', url) host = m.group(2) if host != 'www.tiktok.com': # non-canonical URL + if host == 'vt.tiktok.com': # short URL + url = get_location(url) vid = r1(r'/video/(\d+)', url) url = 'https://www.tiktok.com/@/video/%s/' % vid host = 'www.tiktok.com' diff --git a/tests/test.py b/tests/test.py index 8d348fbc..f04b1fe2 100644 --- a/tests/test.py +++ b/tests/test.py @@ -57,12 +57,14 @@ class YouGetTests(unittest.TestCase): #) def test_tiktok(self): - tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) - tiktok.download('https://www.tiktok.com/@/video/6850796940293164290', info_only=True) - tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) + tiktok.download('https://www.tiktok.com/@zukky_48/video/7398162058153315605', info_only=True) + tiktok.download('https://www.tiktok.com/@/video/7398162058153315605', info_only=True) + tiktok.download('https://t.tiktok.com/i18n/share/video/7398162058153315605/', info_only=True) + tiktok.download('https://vt.tiktok.com/ZSYKjKt6M/', info_only=True) def test_twitter(self): twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True) + twitter.download('https://x.com/elonmusk/status/1530516552084234244', info_only=True) def test_weibo(self): miaopai.download('https://video.weibo.com/show?fid=1034:4825403706245135', info_only=True) From 4cec20ac208b7c1a0c6433d9d9bbfaa6b77e3148 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 4 Aug 2024 19:06:56 +0200 Subject: [PATCH 230/235] [youtube] add self.ua --- src/you_get/extractors/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 2e31e646..ea365a5a 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -177,7 +177,7 @@ class YouTube(VideoExtractor): def check_playability_response(self, ytInitialPlayerResponse): STATUS_OK = "OK" - + playerResponseStatus = ytInitialPlayerResponse["playabilityStatus"]["status"] if playerResponseStatus != STATUS_OK: reason = ytInitialPlayerResponse["playabilityStatus"].get("reason", "") @@ -186,6 +186,8 @@ class YouTube(VideoExtractor): ) def prepare(self, **kwargs): + self.ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.2651.86' + assert self.url or self.vid if not self.vid and self.url: From afbadf23014b2955ce70b043930d795c68127b7e Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 18 Aug 2024 03:36:58 +0200 Subject: [PATCH 231/235] [youtube] update self.ua (fix extraction) --- src/you_get/extractors/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index ea365a5a..0a93d396 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -186,7 +186,7 @@ class YouTube(VideoExtractor): ) def prepare(self, **kwargs): - self.ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.2651.86' + self.ua = 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36' assert self.url or self.vid @@ -202,7 +202,7 @@ class YouTube(VideoExtractor): # Extract from video page logging.debug('Extracting from the video page...') - video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) + video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid, headers={'User-Agent': self.ua}) try: jsUrl = re.search('([^"]*/base\.js)"', video_page).group(1) From 72b1a7bce13179f4678654d65e9f7cd9917dcaeb Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 19 Aug 2024 18:20:21 +0200 Subject: [PATCH 232/235] fix "SyntaxWarning: invalid escape sequence" for Python 3.12 --- src/you_get/common.py | 2 +- src/you_get/extractors/acfun.py | 4 ++-- src/you_get/extractors/baidu.py | 2 +- src/you_get/extractors/coub.py | 2 +- src/you_get/extractors/douban.py | 2 +- src/you_get/extractors/embed.py | 24 ++++++++++++------------ src/you_get/extractors/funshion.py | 2 +- src/you_get/extractors/ku6.py | 2 +- src/you_get/extractors/kugou.py | 10 +++++----- src/you_get/extractors/kuwo.py | 4 ++-- src/you_get/extractors/mgtv.py | 6 +++--- src/you_get/extractors/qq.py | 2 +- src/you_get/extractors/sohu.py | 2 +- src/you_get/extractors/tudou.py | 2 +- src/you_get/extractors/tumblr.py | 2 +- src/you_get/extractors/twitter.py | 2 +- src/you_get/extractors/universal.py | 18 +++++++++--------- src/you_get/extractors/vimeo.py | 2 +- src/you_get/extractors/youku.py | 10 +++++----- src/you_get/extractors/youtube.py | 6 +++--- 20 files changed, 53 insertions(+), 53 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index f6956fad..0b307dde 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -715,7 +715,7 @@ def url_save( bar.done() if not force and auto_rename: path, ext = os.path.basename(filepath).rsplit('.', 1) - finder = re.compile(' \([1-9]\d*?\)$') + finder = re.compile(r' \([1-9]\d*?\)$') if (finder.search(path) is None): thisfile = path + ' (1).' + ext else: diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py index cd275927..5775eb5e 100644 --- a/src/you_get/extractors/acfun.py +++ b/src/you_get/extractors/acfun.py @@ -32,7 +32,7 @@ class AcFun(VideoExtractor): self.title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0] currentVideoInfo = json_data.get('currentVideoInfo') - elif re.match("https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", self.url): + elif re.match(r"https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", self.url): html = get_content(self.url, headers=fake_headers) tag_script = match1(html, r'') json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1] @@ -180,7 +180,7 @@ class AcFun(VideoExtractor): title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0] currentVideoInfo = json_data.get('currentVideoInfo') m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo) - elif re.match("https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", url): + elif re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)', url): html = get_content(url, headers=fake_headers) tag_script = match1(html, r'') json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1] diff --git a/src/you_get/extractors/baidu.py b/src/you_get/extractors/baidu.py index 521d5e99..61b0ad24 100644 --- a/src/you_get/extractors/baidu.py +++ b/src/you_get/extractors/baidu.py @@ -116,7 +116,7 @@ def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only= id = r1(r'https?://music.baidu.com/album/(\d+)', url) baidu_download_album(id, output_dir, merge, info_only) - elif re.match('https?://music.baidu.com/song/\d+', url): + elif re.match(r'https?://music.baidu.com/song/\d+', url): id = r1(r'https?://music.baidu.com/song/(\d+)', url) baidu_download_song(id, output_dir, merge, info_only) diff --git a/src/you_get/extractors/coub.py b/src/you_get/extractors/coub.py index a71cbc18..b7becb45 100644 --- a/src/you_get/extractors/coub.py +++ b/src/you_get/extractors/coub.py @@ -58,7 +58,7 @@ def fix_coub_video_file(file_path): def get_title_and_urls(json_data): - title = legitimize(re.sub('[\s*]', "_", json_data['title'])) + title = legitimize(re.sub(r'[\s*]', "_", json_data['title'])) video_info = json_data['file_versions']['html5']['video'] if 'high' not in video_info: if 'med' not in video_info: diff --git a/src/you_get/extractors/douban.py b/src/you_get/extractors/douban.py index 1a4a67d1..17b7a8c5 100644 --- a/src/you_get/extractors/douban.py +++ b/src/you_get/extractors/douban.py @@ -10,7 +10,7 @@ def douban_download(url, output_dir = '.', merge = True, info_only = False, **kw if re.match(r'https?://movie', url): title = match1(html, 'name="description" content="([^"]+)') - tid = match1(url, 'trailer/(\d+)') + tid = match1(url, r'trailer/(\d+)') real_url = 'https://movie.douban.com/trailer/video_url?tid=%s' % tid type, ext, size = url_info(real_url) diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py index aedf5137..48092df1 100644 --- a/src/you_get/extractors/embed.py +++ b/src/you_get/extractors/embed.py @@ -20,18 +20,18 @@ from . import bokecc """ refer to http://open.youku.com/tools """ -youku_embed_patterns = [ 'youku\.com/v_show/id_([a-zA-Z0-9=]+)', - 'player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf', - 'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)', - 'player\.youku\.com/embed/([a-zA-Z0-9=]+)', - 'YKU.Player\(\'[a-zA-Z0-9]+\',{ client_id: \'[a-zA-Z0-9]+\', vid: \'([a-zA-Z0-9]+)\'' +youku_embed_patterns = [ r'youku\.com/v_show/id_([a-zA-Z0-9=]+)', + r'player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf', + r'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)', + r'player\.youku\.com/embed/([a-zA-Z0-9=]+)', + r'YKU.Player\(\'[a-zA-Z0-9]+\',{ client_id: \'[a-zA-Z0-9]+\', vid: \'([a-zA-Z0-9]+)\'' ] """ http://www.tudou.com/programs/view/html5embed.action?type=0&code=3LS_URGvl54&lcode=&resourceId=0_06_05_99 """ -tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_-]+)\&', - 'www\.tudou\.com/v/([a-zA-Z0-9_-]+)/[^"]*v\.swf' +tudou_embed_patterns = [ r'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_-]+)\&', + r'www\.tudou\.com/v/([a-zA-Z0-9_-]+)/[^"]*v\.swf' ] """ @@ -39,18 +39,18 @@ refer to http://open.tudou.com/wiki/video/info """ tudou_api_patterns = [ ] -iqiyi_embed_patterns = [ 'player\.video\.qiyi\.com/([^/]+)/[^/]+/[^/]+/[^/]+\.swf[^"]+tvId=(\d+)' ] +iqiyi_embed_patterns = [ r'player\.video\.qiyi\.com/([^/]+)/[^/]+/[^/]+/[^/]+\.swf[^"]+tvId=(\d+)' ] -netease_embed_patterns = [ '(http://\w+\.163\.com/movie/[^\'"]+)' ] +netease_embed_patterns = [ r'(http://\w+\.163\.com/movie/[^\'"]+)' ] -vimeo_embed_patters = [ 'player\.vimeo\.com/video/(\d+)' ] +vimeo_embed_patters = [ r'player\.vimeo\.com/video/(\d+)' ] -dailymotion_embed_patterns = [ 'www\.dailymotion\.com/embed/video/(\w+)' ] +dailymotion_embed_patterns = [ r'www\.dailymotion\.com/embed/video/(\w+)' ] """ check the share button on http://www.bilibili.com/video/av5079467/ """ -bilibili_embed_patterns = [ 'static\.hdslb\.com/miniloader\.swf.*aid=(\d+)' ] +bilibili_embed_patterns = [ r'static\.hdslb\.com/miniloader\.swf.*aid=(\d+)' ] ''' diff --git a/src/you_get/extractors/funshion.py b/src/you_get/extractors/funshion.py index 1f7156b5..2bb1aba0 100644 --- a/src/you_get/extractors/funshion.py +++ b/src/you_get/extractors/funshion.py @@ -84,7 +84,7 @@ class Funshion(VideoExtractor): moz_ec_name = search_dict(sym_to_name, 'mozEcName') push = search_dict(sym_to_name, 'push') - patt = '{}\.{}\("(.+?)"\)'.format(moz_ec_name, push) + patt = r'{}\.{}\("(.+?)"\)'.format(moz_ec_name, push) ec_list = re.findall(patt, code) [magic_list.append(sym_to_name[ec]) for ec in ec_list] return magic_list diff --git a/src/you_get/extractors/ku6.py b/src/you_get/extractors/ku6.py index c827eafd..d37d8947 100644 --- a/src/you_get/extractors/ku6.py +++ b/src/you_get/extractors/ku6.py @@ -50,7 +50,7 @@ def ku6_download(url, output_dir = '.', merge = True, info_only = False, **kwarg vid = vid.group(1) else: raise Exception('Unsupported url') - this_meta = re.search('"?'+vid+'"?:\{(.+?)\}', meta) + this_meta = re.search('"?'+vid+r'"?:\{(.+?)\}', meta) if this_meta is not None: this_meta = this_meta.group(1) title = re.search('title:"(.+?)"', this_meta).group(1) diff --git a/src/you_get/extractors/kugou.py b/src/you_get/extractors/kugou.py index 192bd809..18b342bf 100644 --- a/src/you_get/extractors/kugou.py +++ b/src/you_get/extractors/kugou.py @@ -32,8 +32,8 @@ def kugou_download(url, output_dir=".", merge=True, info_only=False, **kwargs): def kugou_download_by_hash(url, output_dir='.', merge=True, info_only=False): # sample # url_sample:http://www.kugou.com/song/#hash=93F7D2FC6E95424739448218B591AEAF&album_id=9019462 - hash_val = match1(url, 'hash=(\w+)') - album_id = match1(url, 'album_id=(\d+)') + hash_val = match1(url, r'hash=(\w+)') + album_id = match1(url, r'album_id=(\d+)') if not album_id: album_id = 123 html = get_html("http://www.kugou.com/yy/index.php?r=play/getdata&hash={}&album_id={}&mid=123".format(hash_val, album_id)) @@ -60,7 +60,7 @@ def kugou_download_playlist(url, output_dir='.', merge=True, info_only=False, ** res = pattern.findall(html) for song in res: res = get_html(song) - pattern_url = re.compile('"hash":"(\w+)".*"album_id":(\d)+') + pattern_url = re.compile(r'"hash":"(\w+)".*"album_id":(\d)+') hash_val, album_id = res = pattern_url.findall(res)[0] if not album_id: album_id = 123 @@ -70,7 +70,7 @@ def kugou_download_playlist(url, output_dir='.', merge=True, info_only=False, ** # album sample: http://www.kugou.com/yy/album/single/1645030.html elif url.lower().find('album') != -1: html = get_html(url) - pattern = re.compile('var data=(\[.*?\]);') + pattern = re.compile(r'var data=(\[.*?\]);') res = pattern.findall(html)[0] for v in json.loads(res): urls.append('http://www.kugou.com/song/#hash=%s&album_id=%s' % (v['hash'], v['album_id'])) @@ -79,7 +79,7 @@ def kugou_download_playlist(url, output_dir='.', merge=True, info_only=False, ** # playlist sample:http://www.kugou.com/yy/special/single/487279.html else: html = get_html(url) - pattern = re.compile('data="(\w+)\|(\d+)"') + pattern = re.compile(r'data="(\w+)\|(\d+)"') for v in pattern.findall(html): urls.append('http://www.kugou.com/song/#hash=%s&album_id=%s' % (v[0], v[1])) print('http://www.kugou.com/song/#hash=%s&album_id=%s' % (v[0], v[1])) diff --git a/src/you_get/extractors/kuwo.py b/src/you_get/extractors/kuwo.py index 54c09235..dceab68f 100644 --- a/src/you_get/extractors/kuwo.py +++ b/src/you_get/extractors/kuwo.py @@ -18,7 +18,7 @@ def kuwo_download_by_rid(rid, output_dir = '.', merge = True, info_only = False) def kuwo_playlist_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): html=get_content(url) - matched=set(re.compile("yinyue/(\d+)").findall(html))#reduce duplicated + matched=set(re.compile(r"yinyue/(\d+)").findall(html))#reduce duplicated for rid in matched: kuwo_download_by_rid(rid,output_dir,merge,info_only) @@ -26,7 +26,7 @@ def kuwo_playlist_download(url, output_dir = '.', merge = True, info_only = Fals def kuwo_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): if "www.kuwo.cn/yinyue" in url: - rid=match1(url,'yinyue/(\d+)') + rid=match1(url, r'yinyue/(\d+)') kuwo_download_by_rid(rid,output_dir, merge, info_only) else: kuwo_playlist_download(url,output_dir,merge,info_only) diff --git a/src/you_get/extractors/mgtv.py b/src/you_get/extractors/mgtv.py index 053212ef..f241ba6f 100644 --- a/src/you_get/extractors/mgtv.py +++ b/src/you_get/extractors/mgtv.py @@ -44,11 +44,11 @@ class MGTV(VideoExtractor): def get_vid_from_url(url): """Extracts video ID from URL. """ - vid = match1(url, 'https?://www.mgtv.com/(?:b|l)/\d+/(\d+).html') + vid = match1(url, r'https?://www.mgtv.com/(?:b|l)/\d+/(\d+).html') if not vid: - vid = match1(url, 'https?://www.mgtv.com/hz/bdpz/\d+/(\d+).html') + vid = match1(url, r'https?://www.mgtv.com/hz/bdpz/\d+/(\d+).html') if not vid: - vid = match1(url, 'https?://www.mgtv.com/s/(\d+).html') + vid = match1(url, r'https?://www.mgtv.com/s/(\d+).html') return vid # ---------------------------------------------------------------------- diff --git a/src/you_get/extractors/qq.py b/src/you_get/extractors/qq.py index e38770e9..6cb76e1d 100644 --- a/src/you_get/extractors/qq.py +++ b/src/you_get/extractors/qq.py @@ -83,7 +83,7 @@ def kg_qq_download_by_shareid(shareid, output_dir='.', info_only=False, caption= playurl = json_data['data']['playurl'] videourl = json_data['data']['playurl_video'] real_url = playurl if playurl else videourl - real_url = real_url.replace('\/', '/') + real_url = real_url.replace(r'\/', '/') ksong_mid = json_data['data']['ksong_mid'] lyric_url = 'http://cgi.kg.qq.com/fcgi-bin/fcg_lyric?jsonpCallback=jsopgetlrcdata&outCharset=utf-8&ksongmid=' + ksong_mid diff --git a/src/you_get/extractors/sohu.py b/src/you_get/extractors/sohu.py index 74374202..e0e4eeb9 100644 --- a/src/you_get/extractors/sohu.py +++ b/src/you_get/extractors/sohu.py @@ -23,7 +23,7 @@ def real_url(fileName, key, ch): def sohu_download(url, output_dir='.', merge=True, info_only=False, extractor_proxy=None, **kwargs): if re.match(r'http://share.vrs.sohu.com', url): - vid = r1('id=(\d+)', url) + vid = r1(r'id=(\d+)', url) else: html = get_html(url) vid = r1(r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?', html) or r1(r'bid:\'(\d+)\',', html) or r1(r'bid=(\d+)', html) diff --git a/src/you_get/extractors/tudou.py b/src/you_get/extractors/tudou.py index b1568dfd..92b8393c 100644 --- a/src/you_get/extractors/tudou.py +++ b/src/you_get/extractors/tudou.py @@ -71,7 +71,7 @@ def tudou_download(url, output_dir = '.', merge = True, info_only = False, **kwa # obsolete? def parse_playlist(url): - aid = r1('http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url) + aid = r1(r'http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url) html = get_decoded_html(url) if not aid: aid = r1(r"aid\s*[:=]\s*'(\d+)'", html) diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py index 08e605e8..236ba182 100644 --- a/src/you_get/extractors/tumblr.py +++ b/src/you_get/extractors/tumblr.py @@ -34,7 +34,7 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url) page = get_html(url, faker=True) - html = parse.unquote(page).replace('\/', '/') + html = parse.unquote(page).replace(r'\/', '/') feed = r1(r'', html) if feed in ['photo', 'photoset', 'entry'] or feed is None: diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index b2c5878a..299dc052 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -33,7 +33,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) **kwargs) return - m = re.match('^https?://(mobile\.)?(x|twitter)\.com/([^/]+)/status/(\d+)', url) + m = re.match(r'^https?://(mobile\.)?(x|twitter)\.com/([^/]+)/status/(\d+)', url) assert m screen_name, item_id = m.group(3), m.group(4) page_title = "{} [{}]".format(screen_name, item_id) diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py index 4a3268ab..03bba35a 100644 --- a/src/you_get/extractors/universal.py +++ b/src/you_get/extractors/universal.py @@ -48,7 +48,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg else: return - hls_urls = re.findall(r'(https?://[^;"\'\\]+' + '\.m3u8?' + + hls_urls = re.findall(r'(https?://[^;"\'\\]+' + r'\.m3u8?' + r'[^;"\'\\]*)', page) if hls_urls: try: @@ -64,14 +64,14 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg return # most common media file extensions on the Internet - media_exts = ['\.flv', '\.mp3', '\.mp4', '\.webm', - '[-_]1\d\d\d\.jpe?g', '[-_][6-9]\d\d\.jpe?g', # tumblr - '[-_]1\d\d\dx[6-9]\d\d\.jpe?g', - '[-_][6-9]\d\dx1\d\d\d\.jpe?g', - '[-_][6-9]\d\dx[6-9]\d\d\.jpe?g', - 's1600/[\w%]+\.jpe?g', # blogger - 'blogger\.googleusercontent\.com/img/a/\w*', # blogger - 'img[6-9]\d\d/[\w%]+\.jpe?g' # oricon? + media_exts = [r'\.flv', r'\.mp3', r'\.mp4', r'\.webm', + r'[-_]1\d\d\d\.jpe?g', r'[-_][6-9]\d\d\.jpe?g', # tumblr + r'[-_]1\d\d\dx[6-9]\d\d\.jpe?g', + r'[-_][6-9]\d\dx1\d\d\d\.jpe?g', + r'[-_][6-9]\d\dx[6-9]\d\d\.jpe?g', + r's1600/[\w%]+\.jpe?g', # blogger + r'blogger\.googleusercontent\.com/img/a/\w*', # blogger + r'img[6-9]\d\d/[\w%]+\.jpe?g' # oricon? ] urls = [] diff --git a/src/you_get/extractors/vimeo.py b/src/you_get/extractors/vimeo.py index c7d7b057..8c9f8ebc 100644 --- a/src/you_get/extractors/vimeo.py +++ b/src/you_get/extractors/vimeo.py @@ -102,7 +102,7 @@ class VimeoExtractor(VideoExtractor): pos = 0 while pos < len(lines): if lines[pos].startswith('#EXT-X-STREAM-INF'): - patt = 'RESOLUTION=(\d+)x(\d+)' + patt = r'RESOLUTION=(\d+)x(\d+)' hit = re.search(patt, lines[pos]) if hit is None: continue diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index ed0743bb..7a6fb2fc 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -242,7 +242,7 @@ class Youku(VideoExtractor): def youku_download_playlist_by_url(url, **kwargs): video_page_pt = 'https?://v.youku.com/v_show/id_([A-Za-z0-9=]+)' - js_cb_pt = '\(({.+})\)' + js_cb_pt = r'\(({.+})\)' if re.match(video_page_pt, url): youku_obj = Youku() youku_obj.url = url @@ -272,14 +272,14 @@ def youku_download_playlist_by_url(url, **kwargs): page = get_content(url) show_id = re.search(r'showid:"(\d+)"', page).group(1) ep = 'http://list.youku.com/show/module?id={}&tab=showInfo&callback=jQuery'.format(show_id) - xhr_page = get_content(ep).replace('\/', '/').replace('\"', '"') + xhr_page = get_content(ep).replace(r'\/', '/').replace(r'\"', '"') video_url = re.search(r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)', xhr_page).group(1) youku_download_playlist_by_url('http://'+video_url, **kwargs) return - elif re.match('https?://list.youku.com/albumlist/show/id_(\d+)\.html', url): + elif re.match(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html', url): # http://list.youku.com/albumlist/show/id_2336634.html # UGC playlist - list_id = re.search('https?://list.youku.com/albumlist/show/id_(\d+)\.html', url).group(1) + list_id = re.search(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html', url).group(1) ep = 'http://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=tuijsonp6' first_u = ep.format(list_id, 1) @@ -294,7 +294,7 @@ def youku_download_playlist_by_url(url, **kwargs): for i in range(2, req_cnt+2): req_u = ep.format(list_id, i) xhr_page = get_content(req_u) - json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1).replace('\/', '/')) + json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1).replace(r'\/', '/')) xhr_html = json_data['html'] page_videos = re.findall(r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)', xhr_html) v_urls.extend(page_videos) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 0a93d396..fe064199 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -197,7 +197,7 @@ class YouTube(VideoExtractor): self.download_playlist_by_url(self.url, **kwargs) exit(0) - if re.search('\Wlist=', self.url) and not kwargs.get('playlist'): + if re.search(r'\Wlist=', self.url) and not kwargs.get('playlist'): log.w('This video is from a playlist. (use --playlist to download all videos in the playlist.)') # Extract from video page @@ -205,7 +205,7 @@ class YouTube(VideoExtractor): video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid, headers={'User-Agent': self.ua}) try: - jsUrl = re.search('([^"]*/base\.js)"', video_page).group(1) + jsUrl = re.search(r'([^"]*/base\.js)"', video_page).group(1) except: log.wtf('[Failed] Unable to find base.js on the video page') self.html5player = 'https://www.youtube.com' + jsUrl @@ -213,7 +213,7 @@ class YouTube(VideoExtractor): self.js = get_content(self.html5player).replace('\n', ' ') logging.debug('Loading ytInitialPlayerResponse...') - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n||var )', video_page).group(1)) + ytInitialPlayerResponse = json.loads(re.search(r'ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n||var )', video_page).group(1)) self.check_playability_response(ytInitialPlayerResponse) # Get the video title From 26db833b86e13f83acc00367533e07a35ab66e49 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 19 Aug 2024 18:41:21 +0200 Subject: [PATCH 233/235] [tests] disable test_youtube temporarily --- tests/test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test.py b/tests/test.py index f04b1fe2..2d220c62 100644 --- a/tests/test.py +++ b/tests/test.py @@ -27,18 +27,18 @@ class YouGetTests(unittest.TestCase): info_only=True ) - def test_youtube(self): - youtube.download( - 'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True - ) + #def test_youtube(self): + #youtube.download( + # 'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True + #) #youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) #youtube.download( # 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa # info_only=True #) - youtube.download( - 'https://www.youtube.com/watch?v=oRdxUFDoQe0', info_only=True - ) + #youtube.download( + # 'https://www.youtube.com/watch?v=oRdxUFDoQe0', info_only=True + #) def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True) From 2c8a0cb5e87692ce3132d284cd5f71cb50006cfe Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 19 Aug 2024 21:48:34 +0200 Subject: [PATCH 234/235] [vimeo] cleanup --- src/you_get/extractors/vimeo.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/src/you_get/extractors/vimeo.py b/src/you_get/extractors/vimeo.py index 8c9f8ebc..4034d0e0 100644 --- a/src/you_get/extractors/vimeo.py +++ b/src/you_get/extractors/vimeo.py @@ -132,34 +132,6 @@ class VimeoExtractor(VideoExtractor): def vimeo_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False, **kwargs): - ''' - try: - # normal Vimeo video - html = get_content('https://vimeo.com/' + id) - cfg_patt = r'clip_page_config\s*=\s*(\{.+?\});' - cfg = json.loads(match1(html, cfg_patt)) - video_page = get_content(cfg['player']['config_url'], headers=fake_headers) - title = cfg['clip']['title'] - info = loads(video_page) - except: - # embedded player - referer may be required - if 'referer' in kwargs: - fake_headers['Referer'] = kwargs['referer'] - - video_page = get_content('http://player.vimeo.com/video/%s' % id, headers=fake_headers) - title = r1(r'([^<]+)', video_page) - info = loads(match1(video_page, r'var t=(\{.+?\});')) - - streams = info['request']['files']['progressive'] - streams = sorted(streams, key=lambda i: i['height']) - url = streams[-1]['url'] - - type, ext, size = url_info(url, faker=True) - - print_info(site_info, title, type, size) - if not info_only: - download_urls([url], title, ext, size, output_dir, merge=merge, faker=True) - ''' site = VimeoExtractor() site.download_by_vid(id, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs) From c4042d0d0bba7b6662bd55a5767e6f1562f43341 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 19 Aug 2024 21:53:12 +0200 Subject: [PATCH 235/235] version 0.4.1730 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 742f2def..e0068208 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1718' +__version__ = '0.4.1730'