From 663e53a95f5435afd95a03bb8b16c6d1fac283a5 Mon Sep 17 00:00:00 2001 From: zhouyuan1 Date: Thu, 20 May 2021 13:03:03 +0800 Subject: [PATCH 01/67] add param m3u8 , allow download video via m3u8 url --- src/you_get/common.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 67ef581b..6e619c11 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -1321,7 +1321,13 @@ def download_main(download, download_playlist, urls, playlist, **kwargs): if re.match(r'https?://', url) is None: url = 'http://' + url - if playlist: + if m3u8: + if output_filename: + title = output_filename + else: + title = "m3u8file" + download_url_ffmpeg(url=url, title=title,ext = 'mp4',output_dir = '.') + elif playlist: download_playlist(url, **kwargs) else: download(url, **kwargs) @@ -1425,7 +1431,6 @@ def set_socks_proxy(proxy): proxy_info = proxy.split("@") socks_proxy_addrs = proxy_info[1].split(':') socks_proxy_auth = proxy_info[0].split(":") - print(socks_proxy_auth[0]+" "+socks_proxy_auth[1]+" "+socks_proxy_addrs[0]+" "+socks_proxy_addrs[1]) socks.set_default_proxy( socks.SOCKS5, socks_proxy_addrs[0], @@ -1436,7 +1441,6 @@ def set_socks_proxy(proxy): ) else: socks_proxy_addrs = proxy.split(':') - print(socks_proxy_addrs[0]+" "+socks_proxy_addrs[1]) socks.set_default_proxy( socks.SOCKS5, socks_proxy_addrs[0], @@ -1601,6 +1605,10 @@ def script_main(download, download_playlist, **kwargs): download_grp.add_argument('--stream', help=argparse.SUPPRESS) download_grp.add_argument('--itag', help=argparse.SUPPRESS) + download_grp.add_argument('-m', '--m3u8', action='store_true', default=False, + help = 'download vide using an m3u8 url') + + parser.add_argument('URL', nargs='*', help=argparse.SUPPRESS) args = parser.parse_args() @@ -1626,6 +1634,7 @@ def script_main(download, download_playlist, **kwargs): global output_filename global auto_rename global insecure + global m3u8 output_filename = args.output_filename extractor_proxy = args.extractor_proxy @@ -1647,6 +1656,9 @@ def script_main(download, download_playlist, **kwargs): if args.cookies: load_cookies(args.cookies) + if args.m3u8: + m3u8 = True + caption = True stream_id = args.format or args.stream or args.itag if args.no_caption: From b9dbae8b40588bc134af6bb7406492ec68fffded Mon Sep 17 00:00:00 2001 From: zhancat200801 Date: Tue, 29 Jun 2021 13:18:09 +0800 Subject: [PATCH 02/67] modify qq.py --- src/you_get/extractors/qq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/you_get/extractors/qq.py b/src/you_get/extractors/qq.py index 6411b195..e38770e9 100644 --- a/src/you_get/extractors/qq.py +++ b/src/you_get/extractors/qq.py @@ -35,6 +35,7 @@ def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False): part_urls= [] total_size = 0 + ext = None for part in range(1, seg_cnt+1): if fc_cnt == 0: # fix json parsing error From ea5f712cb0a420e26ea6e883014159deb584c43b Mon Sep 17 00:00:00 2001 From: hong56hk Date: Thu, 26 Aug 2021 00:12:52 +0800 Subject: [PATCH 03/67] fix for using insecure flag does not work --- src/you_get/common.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 597ed45a..5aa74a20 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -343,7 +343,12 @@ def undeflate(data): # DEPRECATED in favor of get_content() def get_response(url, faker=False): logging.debug('get_response: %s' % url) - + ctx = None + if insecure: + # ignore ssl errors + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE # install cookies if cookies: opener = request.build_opener(request.HTTPCookieProcessor(cookies)) @@ -351,10 +356,10 @@ def get_response(url, faker=False): if faker: response = request.urlopen( - request.Request(url, headers=fake_headers), None + request.Request(url, headers=fake_headers), None, context=ctx, ) else: - response = request.urlopen(url) + response = request.urlopen(url, context=ctx) data = response.read() if response.info().get('Content-Encoding') == 'gzip': From 7c2523f5cc843a6622968f0eed8dcb9a49b62b00 Mon Sep 17 00:00:00 2001 From: Jian Wang Date: Thu, 7 Oct 2021 15:42:37 +0800 Subject: [PATCH 04/67] support channel/series for bilibili --- src/you_get/extractors/bilibili.py | 20 ++++++++++++++++++++ tests/test.py | 3 +++ 2 files changed, 23 insertions(+) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index edb656c7..36de363c 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -112,6 +112,10 @@ class Bilibili(VideoExtractor): def bilibili_space_channel_api(mid, cid, pn=1, ps=100): return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps) + @staticmethod + def bilibili_series_archives_api(mid, sid, pn=1, ps=100): + return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps) + @staticmethod def bilibili_space_favlist_api(fid, pn=1, ps=20): return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps) @@ -596,6 +600,8 @@ class Bilibili(VideoExtractor): sort = 'video' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/detail\?.*cid=(\d+)', self.url): sort = 'space_channel' + elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url): + sort = 'space_channel_series' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url): sort = 'space_favlist' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/video', self.url): @@ -706,6 +712,20 @@ class Bilibili(VideoExtractor): url = 'https://www.bilibili.com/video/av%s' % video['aid'] self.__class__().download_playlist_by_url(url, **kwargs) + elif sort == 'space_channel_series': + m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url) + mid, sid = m.group(1), m.group(2) + api_url = self.bilibili_series_archives_api(mid, sid) + api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) + archives_info = json.loads(api_content) + # TBD: channel of more than 100 videos + + epn, i = len(archives_info['data']['archives']), 0 + for video in archives_info['data']['archives']: + i += 1; log.w('Extracting %s of %s videos ...' % (i, epn)) + url = 'https://www.bilibili.com/video/av%s' % video['aid'] + self.__class__().download_playlist_by_url(url, **kwargs) + elif sort == 'space_favlist': m = re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url) vmid, fid = m.group(1), m.group(2) diff --git a/tests/test.py b/tests/test.py index 4a2a117c..8ae622b2 100644 --- a/tests/test.py +++ b/tests/test.py @@ -40,6 +40,9 @@ class YouGetTests(unittest.TestCase): def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) + def test_bilibili(self): + bilibili.download('https://space.bilibili.com/72270557/channel/seriesdetail?sid=218844', info_only=True) + #def test_soundcloud(self): ## single song #soundcloud.download( From 798ad6d14eb23ee0754ead224494911fcf27cd68 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 20 Oct 2021 18:15:51 +0200 Subject: [PATCH 05/67] [universal] tweak --- src/you_get/extractors/universal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py index fdc7426d..4a3268ab 100644 --- a/src/you_get/extractors/universal.py +++ b/src/you_get/extractors/universal.py @@ -76,7 +76,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg urls = [] for i in media_exts: - urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ ;&"\'\\<>]*)', page) + urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ =?;&"\'\\<>]*)', page) p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page) urls += [parse.unquote(url) for url in p_urls] From c064013b9c19c6225b6edd818ced0f4003d22854 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sun, 24 Oct 2021 07:58:34 +0200 Subject: [PATCH 06/67] Test on Python 3.10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit “3.10” must be quoted or yaml will treat it as 3.1. --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index daae6668..b23455c8 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, pypy3] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10", pypy3] steps: - uses: actions/checkout@v2 From 3c8382d2af3ef9239f64f6524cbe1abeb949729c Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sun, 24 Oct 2021 08:01:32 +0200 Subject: [PATCH 07/67] strategy: fail-fast: false --- .github/workflows/python-package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b23455c8..75231110 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -14,6 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: + fail-fast: false matrix: python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10", pypy3] From 968334acb48c79bc801ca61ee1d0f77d570ccadb Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sun, 24 Oct 2021 08:06:43 +0200 Subject: [PATCH 08/67] "Programming Language :: Python :: 3.10", --- you-get.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/you-get.json b/you-get.json index e98e2e8a..a9ef1dd5 100644 --- a/you-get.json +++ b/you-get.json @@ -25,6 +25,8 @@ "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Topic :: Internet", "Topic :: Internet :: WWW/HTTP", "Topic :: Multimedia", From c4603bbd2a4377ea2ff6f9c16a280e8a01a8583c Mon Sep 17 00:00:00 2001 From: Chuang Zhu Date: Tue, 2 Nov 2021 12:45:41 +0800 Subject: [PATCH 09/67] [bilibili] fix 'NoneType' object is not subscriptable --- src/you_get/extractors/bilibili.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index edb656c7..38ff368d 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -193,10 +193,12 @@ class Bilibili(VideoExtractor): playinfo_text = match1(html_content, r'__playinfo__=(.*?)', cont) From 15393a8218faf96af88f506fa1a1a51a57fd31b3 Mon Sep 17 00:00:00 2001 From: liguangbin Date: Sat, 19 Mar 2022 22:32:57 +0800 Subject: [PATCH 27/67] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E8=A5=BF=E7=93=9C?= =?UTF-8?q?=E8=A7=86=E9=A2=91=E4=B8=8B=E8=BD=BD=E5=A4=B1=E8=B4=A5=E9=97=AE?= =?UTF-8?q?=E9=A2=98;=20fix=20problem:the=20JSON=20object=20must=20be=20st?= =?UTF-8?q?r,=20bytes=20or=20bytearray,=20not=20NoneType?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/you_get/extractors/ixigua.py | 119 ++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 42 deletions(-) diff --git a/src/you_get/extractors/ixigua.py b/src/you_get/extractors/ixigua.py index 2f11e7f9..57119eb6 100644 --- a/src/you_get/extractors/ixigua.py +++ b/src/you_get/extractors/ixigua.py @@ -80,59 +80,94 @@ def get_video_url_from_video_id(video_id): return url -def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs): +def ixigua_download(url, output_dir='.', merge=True, info_only=False, stream_id='', **kwargs): # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422 - resp = urlopen_with_retry(request.Request(url)) + headers['cookie'] = "MONITOR_WEB_ID=7892c49b-296e-4499-8704-e47c1b15123; " \ + "ixigua-a-s=1; ttcid=af99669b6304453480454f1507011d5c234; BD_REF=1; " \ + "__ac_nonce=060d88ff000a75e8d17eb; __ac_signature=_02B4Z6wo100f01kX9ZpgAAIDAKIBBQUIPYT5F2WIAAPG2ad; " \ + "ttwid=1%7CcIsVF_3vqSIk4XErhPB0H2VaTxT0tdsTMRbMjrJOPN8%7C1624806049%7C08ce7dd6f7d20506a41ba0a331ef96a6505d96731e6ad9f6c8c709f53f227ab1" + + resp = urlopen_with_retry(request.Request(url, headers=headers)) html = resp.read().decode('utf-8') _cookies = [] for c in resp.getheader('Set-Cookie').split("httponly,"): _cookies.append(c.strip().split(' ')[0]) - headers['cookie'] = ' '.join(_cookies) + headers['cookie'] += ';'.join(_cookies) - conf = loads(match1(html, r"window\.config = (.+);")) - if not conf: - log.e("Get window.config from url failed, url: {}".format(url)) + match_txt = match1(html, r"', html) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', + 'Accept-Encoding': 'gzip, deflate', + 'Accept': '*/*', + 'Connection': 'keep-alive' # important + } + + html = getHttps(host, url, headers=headers) + data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) info = json.loads(data) - wid = info['props']['initialProps']['$wid'] - cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) + downloadAddr = info['ItemModule'][vid]['video']['downloadAddr'] + author = info['ItemModule'][vid]['author'] # same as uniqueId + nickname = info['UserModule']['users'][author]['nickname'] + title = '%s [%s]' % (nickname or author, vid) - # here's the cookie - headers['Cookie'] = cookie - - # try again - html = get_content(url, headers=headers) - data = r1(r'', html) - info = json.loads(data) - wid = info['props']['initialProps']['$wid'] - cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) - - videoData = info['props']['pageProps']['itemInfo']['itemStruct'] - videoId = videoData['id'] - videoUrl = videoData['video']['downloadAddr'] - uniqueId = videoData['author'].get('uniqueId') - nickName = videoData['author'].get('nickname') - - title = '%s [%s]' % (nickName or uniqueId, videoId) - - # we also need the referer - headers['Referer'] = referUrl - - mime, ext, size = url_info(videoUrl, headers=headers) + mime, ext, size = url_info(downloadAddr, headers=headers) print_info(site_info, title, mime, size) if not info_only: - download_urls([videoUrl], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) + download_urls([downloadAddr], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) site_info = "TikTok.com" download = tiktok_download From 37ca277e7256139484195a699bd160540cddd8d8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 19 Apr 2022 17:55:05 +0200 Subject: [PATCH 38/67] [bilibili] use hdflv2_hdr as id for HDR --- src/you_get/extractors/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index da19eb68..f7cc80dc 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -16,7 +16,7 @@ class Bilibili(VideoExtractor): 'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'}, {'id': 'hdflv2_dolby', 'quality': 126, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '3840p', 'desc': '杜比视界'}, - {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280, + {'id': 'hdflv2_hdr', 'quality': 125, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '2160p', 'desc': '真彩 HDR'}, {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'}, From 408e78b180557b2372249b370549685a50d4a787 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 19 Apr 2022 21:37:48 +0200 Subject: [PATCH 39/67] drop support for python <3.5 --- README.md | 2 +- you-get.json | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 0c3d4099..d4b8cd29 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim The following dependencies are necessary: -* **[Python](https://www.python.org/downloads/)** 3.2 or above +* **[Python](https://www.python.org/downloads/)** 3.5 or above * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/) diff --git a/you-get.json b/you-get.json index a9ef1dd5..1a36b3c0 100644 --- a/you-get.json +++ b/you-get.json @@ -18,9 +18,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", From 43e14887a31704857452166c54c69c065f0b6036 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 19:00:29 +0200 Subject: [PATCH 40/67] [bilibili] support lowercase bv --- src/you_get/extractors/bilibili.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index f7cc80dc..48c91925 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -188,7 +188,7 @@ class Bilibili(VideoExtractor): sort = 'live' elif re.match(r'https?://vc\.bilibili\.com/video/(\d+)', self.url): sort = 'vc' - elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(BV(\S+)))', self.url): + elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(bv(\S+))|(BV(\S+)))', self.url): sort = 'video' elif re.match(r'https?://h\.?bilibili\.com/(\d+)', self.url): sort = 'h' @@ -604,7 +604,7 @@ class Bilibili(VideoExtractor): elif re.match(r'https?://(www\.)?bilibili\.com/bangumi/media/md(\d+)', self.url) or \ re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)', self.url): sort = 'bangumi_md' - elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|BV(\S+))', self.url): + elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|bv(\S+)|BV(\S+))', self.url): sort = 'video' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/detail\?.*cid=(\d+)', self.url): sort = 'space_channel' From 355e22584c8e8e0e5ac544945601f937355844cc Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 19:17:26 +0200 Subject: [PATCH 41/67] [twitter] show the warning message if login required --- src/you_get/extractors/twitter.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 23468211..8c052ed0 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -51,7 +51,12 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) info = json.loads(api_content) - if 'extended_entities' in info['globalObjects']['tweets'][item_id]: + if item_id not in info['globalObjects']['tweets']: + # something wrong here + log.w(info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text']) + return + + elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: # if the tweet contains media, download them media = info['globalObjects']['tweets'][item_id]['extended_entities']['media'] From bbc4df7a89ce32baa76e5d60d7a705354308b4f2 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 19:29:52 +0200 Subject: [PATCH 42/67] [tiktok] fix extraction --- src/you_get/extractors/tiktok.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index d1069fcc..33e1f11e 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -23,7 +23,8 @@ def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): } html = getHttps(host, url, headers=headers) - data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) + data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) or \ + r1(r'', html) info = json.loads(data) downloadAddr = info['ItemModule'][vid]['video']['downloadAddr'] author = info['ItemModule'][vid]['author'] # same as uniqueId From fd2d7fdcbc14384baf45c86588d769300e6bec79 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 23:21:14 +0200 Subject: [PATCH 43/67] [bilibili] support festival videos (fix #2955) --- src/you_get/extractors/bilibili.py | 52 ++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 48c91925..caaa91d6 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -177,6 +177,11 @@ class Bilibili(VideoExtractor): self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)') html_content = get_content(self.url, headers=self.bilibili_headers()) + # redirect: festival + elif re.match(r'https?://(www\.)?bilibili\.com/festival/(.+)', self.url): + self.url = 'https://www.bilibili.com/video/%s' % match1(self.url, r'bvid=([^&]+)') + html_content = get_content(self.url, headers=self.bilibili_headers()) + # sort it out if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url): sort = 'audio' @@ -203,30 +208,43 @@ class Bilibili(VideoExtractor): playinfo_text = match1(html_content, r'__playinfo__=(.*?)', html) info = json.loads(data) From f2ea06473aabd2fcc598cd98aa7ceb93c95d978a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 00:36:42 +0200 Subject: [PATCH 51/67] [tests] test_twitter --- tests/test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test.py b/tests/test.py index 862b829f..a1c6c076 100644 --- a/tests/test.py +++ b/tests/test.py @@ -10,7 +10,8 @@ from you_get.extractors import ( acfun, bilibili, soundcloud, - tiktok + tiktok, + twitter ) @@ -28,7 +29,7 @@ class YouGetTests(unittest.TestCase): youtube.download( 'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True ) - youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) + #youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) #youtube.download( # 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa # info_only=True @@ -57,6 +58,9 @@ class YouGetTests(unittest.TestCase): tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) + def test_twitter(self): + twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True) + if __name__ == '__main__': unittest.main() From d57a0eba3e60cb4341a8bce02259ad8a4dee66c5 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:00:55 +0200 Subject: [PATCH 52/67] [youtube] improve regex --- src/you_get/extractors/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index b5f56fa4..3e1c5cad 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -237,7 +237,7 @@ class YouTube(VideoExtractor): except: # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}} - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) stream_list = ytInitialPlayerResponse['streamingData']['formats'] #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] @@ -262,7 +262,7 @@ class YouTube(VideoExtractor): # Parse video page instead video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) self.title = ytInitialPlayerResponse["videoDetails"]["title"] if re.search('([^"]*/base\.js)"', video_page): From 249afb8b27498a89986d3af3aec2dad0819fa014 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:12:15 +0200 Subject: [PATCH 53/67] [.github/workflows] test pypy3.8 and pypy3.9 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 75231110..05dbc85a 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10", pypy3] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', 'pypy3.8', 'pypy3.9'] steps: - uses: actions/checkout@v2 From ce1f44fb88848c4e99357b51f6ab0b5cf9fe16a4 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:14:47 +0200 Subject: [PATCH 54/67] [.github/workflows] test pypy-3.8 and pypy-3.9 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 05dbc85a..47fb37a5 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', 'pypy3.8', 'pypy3.9'] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] steps: - uses: actions/checkout@v2 From 1aa7ca21fcc6769859c9b072f1a5052d73f546d9 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:41:25 +0200 Subject: [PATCH 55/67] drop support for python <3.7.4 --- .github/workflows/python-package.yml | 2 +- README.md | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 47fb37a5..f90b61ae 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] + python-version: [3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index d4b8cd29..e0cea4dd 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,9 @@ [![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/) [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -**NOTICE: Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.** +**NOTICE (30 May 2022): Support for Python 3.5, 3.6 and 3.7 will eventually be dropped. ([see details here](https://github.com/soimort/you-get/wiki/TLS-1.3-post-handshake-authentication-(PHA)))** + +**NOTICE (8 Mar 2019): Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.** --- @@ -53,9 +55,9 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim ### Prerequisites -The following dependencies are necessary: +The following dependencies are recommended: -* **[Python](https://www.python.org/downloads/)** 3.5 or above +* **[Python](https://www.python.org/downloads/)** 3.8 or above * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/) From a5c726b9d701f81f4cb976242baeb5f00a7c164b Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:44:36 +0200 Subject: [PATCH 56/67] version 0.4.1612 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 8fabc52e..da7d3c33 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1602' +__version__ = '0.4.1612' From 9980b727cbcad548059cd05fbfa0254cc99b8e26 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 14:50:20 +0200 Subject: [PATCH 57/67] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e0cea4dd..44c102cd 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim The following dependencies are recommended: -* **[Python](https://www.python.org/downloads/)** 3.8 or above +* **[Python](https://www.python.org/downloads/)** 3.7.4 or above * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/) From 6268c1173ce183b2548850bf95d1e7587ad22019 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 14:53:21 +0200 Subject: [PATCH 58/67] update README.rst --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 3c23ab5e..376abcf7 100644 --- a/README.rst +++ b/README.rst @@ -52,7 +52,7 @@ source `__ and fork it! .. |PyPI version| image:: https://badge.fury.io/py/you-get.png :target: http://badge.fury.io/py/you-get -.. |Build Status| image:: https://api.travis-ci.org/soimort/you-get.png - :target: https://travis-ci.org/soimort/you-get +.. |Build Status| image:: https://github.com/soimort/you-get/workflows/develop/badge.svg + :target: https://github.com/soimort/you-get/actions .. |Gitter| image:: https://badges.gitter.im/Join%20Chat.svg :target: https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge From bfb35db5a6b6dfd31c55c2c2f3edd56c0c59e8bf Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 15:38:00 +0200 Subject: [PATCH 59/67] update you-get.json --- you-get.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/you-get.json b/you-get.json index 1a36b3c0..bb94ba00 100644 --- a/you-get.json +++ b/you-get.json @@ -18,8 +18,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", From 6ddc3fce89bb496394ab6f51c224b0f9964ee344 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 15:38:47 +0200 Subject: [PATCH 60/67] update Makefile --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c0f9cf0e..fe4a238c 100644 --- a/Makefile +++ b/Makefile @@ -43,5 +43,7 @@ install: $(SETUP) install --user --prefix= release: - zenity --question - $(SETUP) sdist bdist_wheel upload --sign + #zenity --question + $(SETUP) sdist bdist_wheel + echo 'Upload new version to PyPI using:' + echo ' twine upload --sign dist/you-get-VERSION.tar.gz dist/you_get-VERSION-py3-none-any.whl' From c0151a97756990bca525598dc37db476cd6c34d4 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 7 Jun 2022 16:19:14 +0200 Subject: [PATCH 61/67] [youtube] we should extract ytInitialPlayerResponse more reliably --- src/you_get/extractors/youtube.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 3e1c5cad..f820152f 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -237,7 +237,10 @@ class YouTube(VideoExtractor): except: # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}} - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + try: # FIXME: we should extract ytInitialPlayerResponse more reliably + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + except: + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) stream_list = ytInitialPlayerResponse['streamingData']['formats'] #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] @@ -262,7 +265,10 @@ class YouTube(VideoExtractor): # Parse video page instead video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + try: # FIXME: we should extract ytInitialPlayerResponse more reliably + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + except: + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) self.title = ytInitialPlayerResponse["videoDetails"]["title"] if re.search('([^"]*/base\.js)"', video_page): From a47960f6ed7b2a484b6629678b3a6ad8e39497bd Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 20 Jun 2022 23:04:56 +0200 Subject: [PATCH 62/67] [twitter] better warning --- src/you_get/extractors/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 8c052ed0..19b4ce87 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -53,7 +53,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) info = json.loads(api_content) if item_id not in info['globalObjects']['tweets']: # something wrong here - log.w(info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text']) + log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None) return elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: From d661c95480abd61f7ef8877d8dbcb827534aa54d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 1 Jul 2022 22:21:47 +0200 Subject: [PATCH 63/67] [instagram] fix extraction --- src/you_get/extractors/instagram.py | 74 +++++++++++++---------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index 4167b226..604c534c 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -10,60 +10,50 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg vid = r1(r'instagram.com/\w+/([^/]+)', url) description = r1(r'\s([^<]*)', cont) # with logged-in cookies + r1(r'([^<]*)', cont) # with logged-in cookies title = "{} [{}]".format(description.replace("\n", " "), vid) - stream = r1(r'', cont) - try: - info = json.loads(data.group(1)) - post = info['entry_data']['PostPage'][0] - assert post['items'] - except: - # with logged-in cookies - data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);', cont) - if data is not None: - log.e('[Warning] Cookies needed.') - post = json.loads(data.group(1)) + api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id + try: + api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}}) + except: + log.wtf('[Error] Please specify a cookie file.') + post = json.loads(api_cont) - for item in post['items']: - code = item['code'] - carousel_media = item.get('carousel_media') or [item] - for i, media in enumerate(carousel_media): - title = '%s [%s]' % (code, i) - image_url = media['image_versions2']['candidates'][0]['url'] - ext = image_url.split('?')[0].split('.')[-1] - size = int(get_head(image_url)['Content-Length']) + for item in post['items']: + code = item['code'] + carousel_media = item.get('carousel_media') or [item] + for i, media in enumerate(carousel_media): + title = '%s [%s]' % (code, i) + image_url = media['image_versions2']['candidates'][0]['url'] + ext = image_url.split('?')[0].split('.')[-1] + size = int(get_head(image_url)['Content-Length']) + + print_info(site_info, title, ext, size) + if not info_only: + download_urls(urls=[image_url], + title=title, + ext=ext, + total_size=size, + output_dir=output_dir) + + # download videos (if any) + if 'video_versions' in media: + video_url = media['video_versions'][0]['url'] + ext = video_url.split('?')[0].split('.')[-1] + size = int(get_head(video_url)['Content-Length']) print_info(site_info, title, ext, size) if not info_only: - download_urls(urls=[image_url], + download_urls(urls=[video_url], title=title, ext=ext, total_size=size, output_dir=output_dir) - # download videos (if any) - if 'video_versions' in media: - video_url = media['video_versions'][0]['url'] - ext = video_url.split('?')[0].split('.')[-1] - size = int(get_head(video_url)['Content-Length']) - - print_info(site_info, title, ext, size) - if not info_only: - download_urls(urls=[video_url], - title=title, - ext=ext, - total_size=size, - output_dir=output_dir) - site_info = "Instagram.com" download = instagram_download download_playlist = playlist_not_supported('instagram') From 82b376a0c60ff473686d6a79ae6ca5c42dc93950 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 1 Jul 2022 23:26:50 +0200 Subject: [PATCH 64/67] version 0.4.1620 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index da7d3c33..ac2bfc03 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1612' +__version__ = '0.4.1620' From 4119a1493e3c1c46c04914accd677d331c357edb Mon Sep 17 00:00:00 2001 From: owlwang Date: Mon, 25 Jul 2022 12:34:55 +0800 Subject: [PATCH 65/67] fix douyin extractor --- src/you_get/extractors/douyin.py | 51 ++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/src/you_get/extractors/douyin.py b/src/you_get/extractors/douyin.py index 8067b1b5..6a59b160 100644 --- a/src/you_get/extractors/douyin.py +++ b/src/you_get/extractors/douyin.py @@ -1,8 +1,6 @@ # coding=utf-8 -import re import json -from urllib.parse import unquote from ..common import ( url_size, @@ -11,25 +9,52 @@ from ..common import ( fake_headers, download_urls, playlist_not_supported, + match1, + get_location, ) - __all__ = ['douyin_download_by_url'] +def get_value(source: dict, path): + try: + value = source + for key in path: + if type(key) is str: + if key in value.keys(): + value = value[key] + else: + value = None + break + elif type(key) is int: + if len(value) != 0: + value = value[key] + else: + value = None + break + except: + value = None + return value + + def douyin_download_by_url(url, **kwargs): + # if short link, get the real url + if 'v.douyin.com' in url: + url = get_location(url) + aweme_id = match1(url, r'/(\d+)/?') + # get video info + video_info_api = 'https://www.douyin.com/web/api/v2/aweme/iteminfo/?item_ids={}' + url = video_info_api.format(aweme_id) page_content = get_content(url, headers=fake_headers) - # The video player and video source are rendered client-side, the data - # contains in a