From ea5f712cb0a420e26ea6e883014159deb584c43b Mon Sep 17 00:00:00 2001 From: hong56hk Date: Thu, 26 Aug 2021 00:12:52 +0800 Subject: [PATCH 01/77] fix for using insecure flag does not work --- src/you_get/common.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 597ed45a..5aa74a20 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -343,7 +343,12 @@ def undeflate(data): # DEPRECATED in favor of get_content() def get_response(url, faker=False): logging.debug('get_response: %s' % url) - + ctx = None + if insecure: + # ignore ssl errors + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE # install cookies if cookies: opener = request.build_opener(request.HTTPCookieProcessor(cookies)) @@ -351,10 +356,10 @@ def get_response(url, faker=False): if faker: response = request.urlopen( - request.Request(url, headers=fake_headers), None + request.Request(url, headers=fake_headers), None, context=ctx, ) else: - response = request.urlopen(url) + response = request.urlopen(url, context=ctx) data = response.read() if response.info().get('Content-Encoding') == 'gzip': From 68cf21dbee1478ab86bd1608e84ceebc54e7a05f Mon Sep 17 00:00:00 2001 From: Peter Date: Fri, 7 Jan 2022 10:41:05 +0800 Subject: [PATCH 02/77] Add pipenv command usage option --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index d2fdaa99..0c3d4099 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,14 @@ $ python3 setup.py install --user to install `you-get` to a permanent path. +You can also use the [pipenv](https://pipenv.pypa.io/en/latest) to install the `you-get` in the Python virtual environment. + +``` +$ pipenv install -e . +$ pipenv run you-get --version +you-get: version 0.4.1555, a tiny downloader that scrapes the web. +``` + ### Option 4: Git clone This is the recommended way for all developers, even if you don't often code in Python. From ed2c0b32fe430e1d8b7f927263015b86087a3fc2 Mon Sep 17 00:00:00 2001 From: SuperSodaSea Date: Sat, 22 Jan 2022 22:50:21 +0800 Subject: [PATCH 03/77] [bilibili] Add 8K video download support --- src/you_get/extractors/bilibili.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 5cd47e10..00204da2 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -12,6 +12,8 @@ class Bilibili(VideoExtractor): # Bilibili media encoding options, in descending quality order. stream_types = [ + {'id': 'hdflv2_8k', 'quality': 127, 'audio_quality': 30280, + 'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'}, {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'}, {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280, From 5aeae31fa2553eedfdf11d1bbde7892184164f73 Mon Sep 17 00:00:00 2001 From: SuperSodaSea Date: Sat, 29 Jan 2022 02:44:00 +0800 Subject: [PATCH 04/77] [bilibili] Add Dolby Vision video download support --- src/you_get/extractors/bilibili.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 00204da2..4b2c246b 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -14,6 +14,8 @@ class Bilibili(VideoExtractor): stream_types = [ {'id': 'hdflv2_8k', 'quality': 127, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'}, + {'id': 'hdflv2_dolby', 'quality': 126, 'audio_quality': 30280, + 'container': 'FLV', 'video_resolution': '3840p', 'desc': '杜比视界'}, {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'}, {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280, From d6ae98fb1e0b2156e07f01957337fbd39a75f39d Mon Sep 17 00:00:00 2001 From: sdlyyxy Date: Sun, 6 Feb 2022 17:10:15 +0800 Subject: [PATCH 05/77] =?UTF-8?q?Bilibili:=20fix=20"=E7=9C=9F=E5=BD=A9=20H?= =?UTF-8?q?DR"=20video=5Fresolution?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/you_get/extractors/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 5cd47e10..101832e2 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -13,7 +13,7 @@ class Bilibili(VideoExtractor): # Bilibili media encoding options, in descending quality order. stream_types = [ {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280, - 'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'}, + 'container': 'FLV', 'video_resolution': '2160p', 'desc': '真彩 HDR'}, {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'}, {'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280, From 15393a8218faf96af88f506fa1a1a51a57fd31b3 Mon Sep 17 00:00:00 2001 From: liguangbin Date: Sat, 19 Mar 2022 22:32:57 +0800 Subject: [PATCH 06/77] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E8=A5=BF=E7=93=9C?= =?UTF-8?q?=E8=A7=86=E9=A2=91=E4=B8=8B=E8=BD=BD=E5=A4=B1=E8=B4=A5=E9=97=AE?= =?UTF-8?q?=E9=A2=98;=20fix=20problem:the=20JSON=20object=20must=20be=20st?= =?UTF-8?q?r,=20bytes=20or=20bytearray,=20not=20NoneType?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/you_get/extractors/ixigua.py | 119 ++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 42 deletions(-) diff --git a/src/you_get/extractors/ixigua.py b/src/you_get/extractors/ixigua.py index 2f11e7f9..57119eb6 100644 --- a/src/you_get/extractors/ixigua.py +++ b/src/you_get/extractors/ixigua.py @@ -80,59 +80,94 @@ def get_video_url_from_video_id(video_id): return url -def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs): +def ixigua_download(url, output_dir='.', merge=True, info_only=False, stream_id='', **kwargs): # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422 - resp = urlopen_with_retry(request.Request(url)) + headers['cookie'] = "MONITOR_WEB_ID=7892c49b-296e-4499-8704-e47c1b15123; " \ + "ixigua-a-s=1; ttcid=af99669b6304453480454f1507011d5c234; BD_REF=1; " \ + "__ac_nonce=060d88ff000a75e8d17eb; __ac_signature=_02B4Z6wo100f01kX9ZpgAAIDAKIBBQUIPYT5F2WIAAPG2ad; " \ + "ttwid=1%7CcIsVF_3vqSIk4XErhPB0H2VaTxT0tdsTMRbMjrJOPN8%7C1624806049%7C08ce7dd6f7d20506a41ba0a331ef96a6505d96731e6ad9f6c8c709f53f227ab1" + + resp = urlopen_with_retry(request.Request(url, headers=headers)) html = resp.read().decode('utf-8') _cookies = [] for c in resp.getheader('Set-Cookie').split("httponly,"): _cookies.append(c.strip().split(' ')[0]) - headers['cookie'] = ' '.join(_cookies) + headers['cookie'] += ';'.join(_cookies) - conf = loads(match1(html, r"window\.config = (.+);")) - if not conf: - log.e("Get window.config from url failed, url: {}".format(url)) + match_txt = match1(html, r"', html) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', + 'Accept-Encoding': 'gzip, deflate', + 'Accept': '*/*', + 'Connection': 'keep-alive' # important + } + + html = getHttps(host, url, headers=headers) + data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) info = json.loads(data) - wid = info['props']['initialProps']['$wid'] - cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) + downloadAddr = info['ItemModule'][vid]['video']['downloadAddr'] + author = info['ItemModule'][vid]['author'] # same as uniqueId + nickname = info['UserModule']['users'][author]['nickname'] + title = '%s [%s]' % (nickname or author, vid) - # here's the cookie - headers['Cookie'] = cookie - - # try again - html = get_content(url, headers=headers) - data = r1(r'', html) - info = json.loads(data) - wid = info['props']['initialProps']['$wid'] - cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) - - videoData = info['props']['pageProps']['itemInfo']['itemStruct'] - videoId = videoData['id'] - videoUrl = videoData['video']['downloadAddr'] - uniqueId = videoData['author'].get('uniqueId') - nickName = videoData['author'].get('nickname') - - title = '%s [%s]' % (nickName or uniqueId, videoId) - - # we also need the referer - headers['Referer'] = referUrl - - mime, ext, size = url_info(videoUrl, headers=headers) + mime, ext, size = url_info(downloadAddr, headers=headers) print_info(site_info, title, mime, size) if not info_only: - download_urls([videoUrl], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) + download_urls([downloadAddr], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) site_info = "TikTok.com" download = tiktok_download From 37ca277e7256139484195a699bd160540cddd8d8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 19 Apr 2022 17:55:05 +0200 Subject: [PATCH 15/77] [bilibili] use hdflv2_hdr as id for HDR --- src/you_get/extractors/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index da19eb68..f7cc80dc 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -16,7 +16,7 @@ class Bilibili(VideoExtractor): 'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'}, {'id': 'hdflv2_dolby', 'quality': 126, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '3840p', 'desc': '杜比视界'}, - {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280, + {'id': 'hdflv2_hdr', 'quality': 125, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '2160p', 'desc': '真彩 HDR'}, {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'}, From 408e78b180557b2372249b370549685a50d4a787 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 19 Apr 2022 21:37:48 +0200 Subject: [PATCH 16/77] drop support for python <3.5 --- README.md | 2 +- you-get.json | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 0c3d4099..d4b8cd29 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim The following dependencies are necessary: -* **[Python](https://www.python.org/downloads/)** 3.2 or above +* **[Python](https://www.python.org/downloads/)** 3.5 or above * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/) diff --git a/you-get.json b/you-get.json index a9ef1dd5..1a36b3c0 100644 --- a/you-get.json +++ b/you-get.json @@ -18,9 +18,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", From 43e14887a31704857452166c54c69c065f0b6036 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 19:00:29 +0200 Subject: [PATCH 17/77] [bilibili] support lowercase bv --- src/you_get/extractors/bilibili.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index f7cc80dc..48c91925 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -188,7 +188,7 @@ class Bilibili(VideoExtractor): sort = 'live' elif re.match(r'https?://vc\.bilibili\.com/video/(\d+)', self.url): sort = 'vc' - elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(BV(\S+)))', self.url): + elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(bv(\S+))|(BV(\S+)))', self.url): sort = 'video' elif re.match(r'https?://h\.?bilibili\.com/(\d+)', self.url): sort = 'h' @@ -604,7 +604,7 @@ class Bilibili(VideoExtractor): elif re.match(r'https?://(www\.)?bilibili\.com/bangumi/media/md(\d+)', self.url) or \ re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)', self.url): sort = 'bangumi_md' - elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|BV(\S+))', self.url): + elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|bv(\S+)|BV(\S+))', self.url): sort = 'video' elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/detail\?.*cid=(\d+)', self.url): sort = 'space_channel' From 355e22584c8e8e0e5ac544945601f937355844cc Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 19:17:26 +0200 Subject: [PATCH 18/77] [twitter] show the warning message if login required --- src/you_get/extractors/twitter.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 23468211..8c052ed0 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -51,7 +51,12 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) info = json.loads(api_content) - if 'extended_entities' in info['globalObjects']['tweets'][item_id]: + if item_id not in info['globalObjects']['tweets']: + # something wrong here + log.w(info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text']) + return + + elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: # if the tweet contains media, download them media = info['globalObjects']['tweets'][item_id]['extended_entities']['media'] From bbc4df7a89ce32baa76e5d60d7a705354308b4f2 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 19:29:52 +0200 Subject: [PATCH 19/77] [tiktok] fix extraction --- src/you_get/extractors/tiktok.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index d1069fcc..33e1f11e 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -23,7 +23,8 @@ def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): } html = getHttps(host, url, headers=headers) - data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) + data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) or \ + r1(r'', html) info = json.loads(data) downloadAddr = info['ItemModule'][vid]['video']['downloadAddr'] author = info['ItemModule'][vid]['author'] # same as uniqueId From fd2d7fdcbc14384baf45c86588d769300e6bec79 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 22 Apr 2022 23:21:14 +0200 Subject: [PATCH 20/77] [bilibili] support festival videos (fix #2955) --- src/you_get/extractors/bilibili.py | 52 ++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 48c91925..caaa91d6 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -177,6 +177,11 @@ class Bilibili(VideoExtractor): self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)') html_content = get_content(self.url, headers=self.bilibili_headers()) + # redirect: festival + elif re.match(r'https?://(www\.)?bilibili\.com/festival/(.+)', self.url): + self.url = 'https://www.bilibili.com/video/%s' % match1(self.url, r'bvid=([^&]+)') + html_content = get_content(self.url, headers=self.bilibili_headers()) + # sort it out if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url): sort = 'audio' @@ -203,30 +208,43 @@ class Bilibili(VideoExtractor): playinfo_text = match1(html_content, r'__playinfo__=(.*?)', html) info = json.loads(data) From f2ea06473aabd2fcc598cd98aa7ceb93c95d978a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 00:36:42 +0200 Subject: [PATCH 28/77] [tests] test_twitter --- tests/test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test.py b/tests/test.py index 862b829f..a1c6c076 100644 --- a/tests/test.py +++ b/tests/test.py @@ -10,7 +10,8 @@ from you_get.extractors import ( acfun, bilibili, soundcloud, - tiktok + tiktok, + twitter ) @@ -28,7 +29,7 @@ class YouGetTests(unittest.TestCase): youtube.download( 'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True ) - youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) + #youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) #youtube.download( # 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa # info_only=True @@ -57,6 +58,9 @@ class YouGetTests(unittest.TestCase): tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) + def test_twitter(self): + twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True) + if __name__ == '__main__': unittest.main() From d57a0eba3e60cb4341a8bce02259ad8a4dee66c5 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:00:55 +0200 Subject: [PATCH 29/77] [youtube] improve regex --- src/you_get/extractors/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index b5f56fa4..3e1c5cad 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -237,7 +237,7 @@ class YouTube(VideoExtractor): except: # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}} - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) stream_list = ytInitialPlayerResponse['streamingData']['formats'] #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] @@ -262,7 +262,7 @@ class YouTube(VideoExtractor): # Parse video page instead video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) self.title = ytInitialPlayerResponse["videoDetails"]["title"] if re.search('([^"]*/base\.js)"', video_page): From 249afb8b27498a89986d3af3aec2dad0819fa014 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:12:15 +0200 Subject: [PATCH 30/77] [.github/workflows] test pypy3.8 and pypy3.9 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 75231110..05dbc85a 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, "3.10", pypy3] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', 'pypy3.8', 'pypy3.9'] steps: - uses: actions/checkout@v2 From ce1f44fb88848c4e99357b51f6ab0b5cf9fe16a4 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:14:47 +0200 Subject: [PATCH 31/77] [.github/workflows] test pypy-3.8 and pypy-3.9 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 05dbc85a..47fb37a5 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', 'pypy3.8', 'pypy3.9'] + python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] steps: - uses: actions/checkout@v2 From 1aa7ca21fcc6769859c9b072f1a5052d73f546d9 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:41:25 +0200 Subject: [PATCH 32/77] drop support for python <3.7.4 --- .github/workflows/python-package.yml | 2 +- README.md | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 47fb37a5..f90b61ae 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] + python-version: [3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index d4b8cd29..e0cea4dd 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,9 @@ [![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/) [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -**NOTICE: Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.** +**NOTICE (30 May 2022): Support for Python 3.5, 3.6 and 3.7 will eventually be dropped. ([see details here](https://github.com/soimort/you-get/wiki/TLS-1.3-post-handshake-authentication-(PHA)))** + +**NOTICE (8 Mar 2019): Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.** --- @@ -53,9 +55,9 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim ### Prerequisites -The following dependencies are necessary: +The following dependencies are recommended: -* **[Python](https://www.python.org/downloads/)** 3.5 or above +* **[Python](https://www.python.org/downloads/)** 3.8 or above * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/) From a5c726b9d701f81f4cb976242baeb5f00a7c164b Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 01:44:36 +0200 Subject: [PATCH 33/77] version 0.4.1612 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 8fabc52e..da7d3c33 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1602' +__version__ = '0.4.1612' From 9980b727cbcad548059cd05fbfa0254cc99b8e26 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 14:50:20 +0200 Subject: [PATCH 34/77] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e0cea4dd..44c102cd 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim The following dependencies are recommended: -* **[Python](https://www.python.org/downloads/)** 3.8 or above +* **[Python](https://www.python.org/downloads/)** 3.7.4 or above * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/) From 6268c1173ce183b2548850bf95d1e7587ad22019 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 14:53:21 +0200 Subject: [PATCH 35/77] update README.rst --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 3c23ab5e..376abcf7 100644 --- a/README.rst +++ b/README.rst @@ -52,7 +52,7 @@ source `__ and fork it! .. |PyPI version| image:: https://badge.fury.io/py/you-get.png :target: http://badge.fury.io/py/you-get -.. |Build Status| image:: https://api.travis-ci.org/soimort/you-get.png - :target: https://travis-ci.org/soimort/you-get +.. |Build Status| image:: https://github.com/soimort/you-get/workflows/develop/badge.svg + :target: https://github.com/soimort/you-get/actions .. |Gitter| image:: https://badges.gitter.im/Join%20Chat.svg :target: https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge From bfb35db5a6b6dfd31c55c2c2f3edd56c0c59e8bf Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 15:38:00 +0200 Subject: [PATCH 36/77] update you-get.json --- you-get.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/you-get.json b/you-get.json index 1a36b3c0..bb94ba00 100644 --- a/you-get.json +++ b/you-get.json @@ -18,8 +18,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", From 6ddc3fce89bb496394ab6f51c224b0f9964ee344 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 30 May 2022 15:38:47 +0200 Subject: [PATCH 37/77] update Makefile --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c0f9cf0e..fe4a238c 100644 --- a/Makefile +++ b/Makefile @@ -43,5 +43,7 @@ install: $(SETUP) install --user --prefix= release: - zenity --question - $(SETUP) sdist bdist_wheel upload --sign + #zenity --question + $(SETUP) sdist bdist_wheel + echo 'Upload new version to PyPI using:' + echo ' twine upload --sign dist/you-get-VERSION.tar.gz dist/you_get-VERSION-py3-none-any.whl' From c0151a97756990bca525598dc37db476cd6c34d4 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 7 Jun 2022 16:19:14 +0200 Subject: [PATCH 38/77] [youtube] we should extract ytInitialPlayerResponse more reliably --- src/you_get/extractors/youtube.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 3e1c5cad..f820152f 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -237,7 +237,10 @@ class YouTube(VideoExtractor): except: # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}} - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + try: # FIXME: we should extract ytInitialPlayerResponse more reliably + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + except: + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) stream_list = ytInitialPlayerResponse['streamingData']['formats'] #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] @@ -262,7 +265,10 @@ class YouTube(VideoExtractor): # Parse video page instead video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + try: # FIXME: we should extract ytInitialPlayerResponse more reliably + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + except: + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) self.title = ytInitialPlayerResponse["videoDetails"]["title"] if re.search('([^"]*/base\.js)"', video_page): From a47960f6ed7b2a484b6629678b3a6ad8e39497bd Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 20 Jun 2022 23:04:56 +0200 Subject: [PATCH 39/77] [twitter] better warning --- src/you_get/extractors/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 8c052ed0..19b4ce87 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -53,7 +53,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) info = json.loads(api_content) if item_id not in info['globalObjects']['tweets']: # something wrong here - log.w(info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text']) + log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None) return elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: From d661c95480abd61f7ef8877d8dbcb827534aa54d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 1 Jul 2022 22:21:47 +0200 Subject: [PATCH 40/77] [instagram] fix extraction --- src/you_get/extractors/instagram.py | 74 +++++++++++++---------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index 4167b226..604c534c 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -10,60 +10,50 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg vid = r1(r'instagram.com/\w+/([^/]+)', url) description = r1(r'\s([^<]*)', cont) # with logged-in cookies + r1(r'([^<]*)', cont) # with logged-in cookies title = "{} [{}]".format(description.replace("\n", " "), vid) - stream = r1(r'', cont) - try: - info = json.loads(data.group(1)) - post = info['entry_data']['PostPage'][0] - assert post['items'] - except: - # with logged-in cookies - data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);', cont) - if data is not None: - log.e('[Warning] Cookies needed.') - post = json.loads(data.group(1)) + api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id + try: + api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}}) + except: + log.wtf('[Error] Please specify a cookie file.') + post = json.loads(api_cont) - for item in post['items']: - code = item['code'] - carousel_media = item.get('carousel_media') or [item] - for i, media in enumerate(carousel_media): - title = '%s [%s]' % (code, i) - image_url = media['image_versions2']['candidates'][0]['url'] - ext = image_url.split('?')[0].split('.')[-1] - size = int(get_head(image_url)['Content-Length']) + for item in post['items']: + code = item['code'] + carousel_media = item.get('carousel_media') or [item] + for i, media in enumerate(carousel_media): + title = '%s [%s]' % (code, i) + image_url = media['image_versions2']['candidates'][0]['url'] + ext = image_url.split('?')[0].split('.')[-1] + size = int(get_head(image_url)['Content-Length']) + + print_info(site_info, title, ext, size) + if not info_only: + download_urls(urls=[image_url], + title=title, + ext=ext, + total_size=size, + output_dir=output_dir) + + # download videos (if any) + if 'video_versions' in media: + video_url = media['video_versions'][0]['url'] + ext = video_url.split('?')[0].split('.')[-1] + size = int(get_head(video_url)['Content-Length']) print_info(site_info, title, ext, size) if not info_only: - download_urls(urls=[image_url], + download_urls(urls=[video_url], title=title, ext=ext, total_size=size, output_dir=output_dir) - # download videos (if any) - if 'video_versions' in media: - video_url = media['video_versions'][0]['url'] - ext = video_url.split('?')[0].split('.')[-1] - size = int(get_head(video_url)['Content-Length']) - - print_info(site_info, title, ext, size) - if not info_only: - download_urls(urls=[video_url], - title=title, - ext=ext, - total_size=size, - output_dir=output_dir) - site_info = "Instagram.com" download = instagram_download download_playlist = playlist_not_supported('instagram') From 82b376a0c60ff473686d6a79ae6ca5c42dc93950 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 1 Jul 2022 23:26:50 +0200 Subject: [PATCH 41/77] version 0.4.1620 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index da7d3c33..ac2bfc03 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1612' +__version__ = '0.4.1620' From 4119a1493e3c1c46c04914accd677d331c357edb Mon Sep 17 00:00:00 2001 From: owlwang Date: Mon, 25 Jul 2022 12:34:55 +0800 Subject: [PATCH 42/77] fix douyin extractor --- src/you_get/extractors/douyin.py | 51 ++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/src/you_get/extractors/douyin.py b/src/you_get/extractors/douyin.py index 8067b1b5..6a59b160 100644 --- a/src/you_get/extractors/douyin.py +++ b/src/you_get/extractors/douyin.py @@ -1,8 +1,6 @@ # coding=utf-8 -import re import json -from urllib.parse import unquote from ..common import ( url_size, @@ -11,25 +9,52 @@ from ..common import ( fake_headers, download_urls, playlist_not_supported, + match1, + get_location, ) - __all__ = ['douyin_download_by_url'] +def get_value(source: dict, path): + try: + value = source + for key in path: + if type(key) is str: + if key in value.keys(): + value = value[key] + else: + value = None + break + elif type(key) is int: + if len(value) != 0: + value = value[key] + else: + value = None + break + except: + value = None + return value + + def douyin_download_by_url(url, **kwargs): + # if short link, get the real url + if 'v.douyin.com' in url: + url = get_location(url) + aweme_id = match1(url, r'/(\d+)/?') + # get video info + video_info_api = 'https://www.douyin.com/web/api/v2/aweme/iteminfo/?item_ids={}' + url = video_info_api.format(aweme_id) page_content = get_content(url, headers=fake_headers) - # The video player and video source are rendered client-side, the data - # contains in a ', html) From 7b845b34ce18863e519ad3cce8e53431ba41664d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 11 Dec 2022 17:43:07 +0100 Subject: [PATCH 63/77] [tiktok] fix extraction for alternative URLs --- src/you_get/common.py | 15 +++++++++------ src/you_get/extractors/tiktok.py | 12 ++++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 1558baf6..c337a2a2 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -344,21 +344,24 @@ def undeflate(data): # an http.client implementation of get_content() # because urllib does not support "Connection: keep-alive" -def getHttps(host, url, headers, gzip=True, deflate=False, debuglevel=0): +def getHttps(host, url, headers, debuglevel=0): import http.client conn = http.client.HTTPSConnection(host) conn.set_debuglevel(debuglevel) conn.request("GET", url, headers=headers) resp = conn.getresponse() + set_cookie = resp.getheader('set-cookie') data = resp.read() - if gzip: - data = ungzip(data) - if deflate: - data = undeflate(data) + try: + data = ungzip(data) # gzip + data = undeflate(data) # deflate + except: + pass - return str(data, encoding='utf-8'), resp.getheader('set-cookie') + conn.close() + return str(data, encoding='utf-8'), set_cookie # DEPRECATED in favor of get_content() diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index 641e5e97..2c4892f6 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -16,12 +16,12 @@ def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): m = re.match('(https?://)?([^/]+)(/.*)', url) host = m.group(2) if host != 'www.tiktok.com': # non-canonical URL - url = get_location(url, headers=headers) - m = re.match('(https?://)?([^/]+)(/.*)', url) - host = m.group(2) - - url = m.group(3).split('?')[0] - vid = url.split('/')[3] # should be a string of numbers + vid = r1(r'/video/(\d+)', url) + url = 'https://www.tiktok.com/@/video/%s/' % vid + host = 'www.tiktok.com' + else: + url = m.group(3).split('?')[0] + vid = url.split('/')[3] # should be a string of numbers html, set_cookie = getHttps(host, url, headers=headers) tt_chain_token = r1('tt_chain_token=([^;]+);', set_cookie) From 888a9e29f37a52a57a81b28bebdf39bd77aa058f Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 11 Dec 2022 17:44:06 +0100 Subject: [PATCH 64/77] [tests] test "universal" tiktok url --- tests/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test.py b/tests/test.py index 877b6935..c0f3836a 100644 --- a/tests/test.py +++ b/tests/test.py @@ -57,6 +57,7 @@ class YouGetTests(unittest.TestCase): def test_tiktok(self): tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) + tiktok.download('https://www.tiktok.com/@/video/6850796940293164290', info_only=True) tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) def test_twitter(self): From fd7889783419940da9ed460ab420c48be39a2ae4 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 11 Dec 2022 18:09:10 +0100 Subject: [PATCH 65/77] [instagram] show cookie warning --- src/you_get/extractors/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py index 604c534c..8e261fe7 100755 --- a/src/you_get/extractors/instagram.py +++ b/src/you_get/extractors/instagram.py @@ -19,9 +19,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id try: api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}}) + post = json.loads(api_cont) except: log.wtf('[Error] Please specify a cookie file.') - post = json.loads(api_cont) for item in post['items']: code = item['code'] From 0d9c28031010ba44fc69977050d5fe572fdee12b Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 11 Dec 2022 18:15:16 +0100 Subject: [PATCH 66/77] version 0.4.1650 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index ac2bfc03..440488a9 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1620' +__version__ = '0.4.1650' From c0a483dab1a07bce353a8cb8f6cb4111c6348a85 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 18 Dec 2022 14:54:34 +0100 Subject: [PATCH 67/77] [twitter] warn when falling back to deprecated API --- src/you_get/extractors/twitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 7975bdfd..baf4c375 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -55,7 +55,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) info = json.loads(api_content) if item_id not in info['globalObjects']['tweets']: # something wrong here - #log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None) + log.w('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text']) assert False elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: @@ -94,6 +94,8 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) return except: + log.w('[Warning] Falling back to deprecated Twitter API. Extraction may be incomplete.') + authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw' # FIXME: 403 with cookies From 0fc63efa63c88662f363fa89768b1c1f9dd6cc36 Mon Sep 17 00:00:00 2001 From: arix00 <15333224+arix00@users.noreply.github.com> Date: Sun, 1 Jan 2023 20:38:21 -0800 Subject: [PATCH 68/77] Download multipage video collection When there're more than single page videos in a collection, Download all videos as current code will only handle first page. For 'space_channel_series' and 'space_channel_collection' --- src/you_get/extractors/bilibili.py | 38 ++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 6d34c2c4..b082553e 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -747,13 +747,20 @@ class Bilibili(VideoExtractor): elif sort == 'space_channel_series': m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url) mid, sid = m.group(1), m.group(2) - api_url = self.bilibili_series_archives_api(mid, sid) - api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) - archives_info = json.loads(api_content) - # TBD: channel of more than 100 videos + pn = 1 + video_list = [] + while True: + api_url = self.bilibili_series_archives_api(mid, sid, pn) + api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) + archives_info = json.loads(api_content) + video_list.extend(archives_info['data']['archives']) + if len(video_list) < archives_info['data']['page']['total'] and len(archives_info['data']['archives']) > 0: + pn += 1 + else: + break - epn, i = len(archives_info['data']['archives']), 0 - for video in archives_info['data']['archives']: + epn, i = len(video_list), 0 + for video in video_list: i += 1; log.w('Extracting %s of %s videos ...' % (i, epn)) url = 'https://www.bilibili.com/video/av%s' % video['aid'] self.__class__().download_playlist_by_url(url, **kwargs) @@ -761,13 +768,20 @@ class Bilibili(VideoExtractor): elif sort == 'space_channel_collection': m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/collectiondetail\?.*sid=(\d+)', self.url) mid, sid = m.group(1), m.group(2) - api_url = self.bilibili_space_collection_api(mid, sid) - api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) - archives_info = json.loads(api_content) - # TBD: channel of more than 100 videos + pn = 1 + video_list = [] + while True: + api_url = self.bilibili_space_collection_api(mid, sid, pn) + api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url)) + archives_info = json.loads(api_content) + video_list.extend(archives_info['data']['archives']) + if len(video_list) < archives_info['data']['page']['total'] and len(archives_info['data']['archives']) > 0: + pn += 1 + else: + break - epn, i = len(archives_info['data']['archives']), 0 - for video in archives_info['data']['archives']: + epn, i = len(video_list), 0 + for video in video_list: i += 1; log.w('Extracting %s of %s videos ...' % (i, epn)) url = 'https://www.bilibili.com/video/av%s' % video['aid'] self.__class__().download_playlist_by_url(url, **kwargs) From 25eb89984524acd42a9b704d3d5b0edfa509c95a Mon Sep 17 00:00:00 2001 From: juruoyyx <60863833+juruoyyx@users.noreply.github.com> Date: Fri, 6 Jan 2023 10:55:00 +0800 Subject: [PATCH 69/77] Update ffmpeg.py --- src/you_get/processor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py index 50e2c9fe..efc0a472 100755 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -128,7 +128,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): def ffmpeg_concat_ts_to_mkv(files, output='output.mkv'): print('Merging video parts... ', end="", flush=True) - params = [FFMPEG] + LOGLEVEL + ['-isync', '-y', '-i'] + params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append('concat:') for file in files: if os.path.isfile(file): From a2e411395b9bffa0329c3ea4d80c8fbb218e7bad Mon Sep 17 00:00:00 2001 From: URenko <18209292+URenko@users.noreply.github.com> Date: Wed, 8 Feb 2023 18:51:01 +0800 Subject: [PATCH 70/77] support different codecs for bilibili --- src/you_get/extractors/bilibili.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 6d34c2c4..6ec8bc13 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -42,6 +42,8 @@ class Bilibili(VideoExtractor): {'id': 'jpg', 'quality': 0}, ] + codecids = {7: 'AVC', 12: 'HEVC', 13: 'AV1'} + @staticmethod def height_to_quality(height, qn): if height <= 360 and qn <= 16: @@ -70,7 +72,7 @@ class Bilibili(VideoExtractor): @staticmethod def bilibili_api(avid, cid, qn=0): - return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=16&fourk=1' % (avid, cid, qn) + return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=4048&fourk=1' % (avid, cid, qn) @staticmethod def bilibili_audio_api(sid): @@ -302,11 +304,10 @@ class Bilibili(VideoExtractor): if 'dash' in playinfo['data']: audio_size_cache = {} for video in playinfo['data']['dash']['video']: - # prefer the latter codecs! s = self.stream_qualities[video['id']] - format_id = 'dash-' + s['id'] # prefix + format_id = f"dash-{s['id']}-{self.codecids[video['codecid']]}" # prefix container = 'mp4' # enforce MP4 container - desc = s['desc'] + desc = s['desc'] + ' ' + video['codecs'] audio_quality = s['audio_quality'] baseurl = video['baseUrl'] size = self.url_size(baseurl, headers=self.bilibili_headers(referer=self.url)) From f54669411e5b10b2e79484f0d07f00664b450bc0 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 8 Feb 2023 16:41:03 +0100 Subject: [PATCH 71/77] add param "--prefix" to prefix downloaded files --- src/you_get/common.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index c337a2a2..bdb67bac 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -138,6 +138,7 @@ auto_rename = False insecure = False m3u8 = False postfix = False +prefix = None fake_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa @@ -1014,6 +1015,8 @@ def download_urls( title = tr(get_filename(title)) if postfix and 'vid' in kwargs: title = "%s [%s]" % (title, kwargs['vid']) + if prefix is not None: + title = "[%s] %s" % (prefix, title) output_filename = get_output_filename(urls, title, ext, output_dir, merge) output_filepath = os.path.join(output_dir, output_filename) @@ -1563,9 +1566,13 @@ def script_main(download, download_playlist, **kwargs): help='Do not download captions (subtitles, lyrics, danmaku, ...)' ) download_grp.add_argument( - '--postfix', action='store_true', default=False, + '--post', '--postfix', dest='postfix', action='store_true', default=False, help='Postfix downloaded files with unique identifiers' ) + download_grp.add_argument( + '--pre', '--prefix', dest='prefix', metavar='PREFIX', default=None, + help='Prefix downloaded files with string' + ) download_grp.add_argument( '-f', '--force', action='store_true', default=False, help='Force overwriting existing files' @@ -1689,6 +1696,7 @@ def script_main(download, download_playlist, **kwargs): global insecure global m3u8 global postfix + global prefix output_filename = args.output_filename extractor_proxy = args.extractor_proxy @@ -1726,6 +1734,7 @@ def script_main(download, download_playlist, **kwargs): insecure = True postfix = args.postfix + prefix = args.prefix if args.no_proxy: set_http_proxy('') From 2ba7493f126aed5785893b4cd5c3042998da7b99 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 9 Feb 2023 23:36:16 +0100 Subject: [PATCH 72/77] [bilibili] warn if cookies are not loaded --- src/you_get/extractors/bilibili.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index e59296ee..6335e6dd 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -117,7 +117,7 @@ class Bilibili(VideoExtractor): @staticmethod def bilibili_space_channel_api(mid, cid, pn=1, ps=100): return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps) - + @staticmethod def bilibili_space_collection_api(mid, cid, pn=1, ps=30): return 'https://api.bilibili.com/x/polymer/space/seasons_archives_list?mid=%s&season_id=%s&sort_reverse=false&page_num=%s&page_size=%s' % (mid, cid, pn, ps) @@ -125,7 +125,7 @@ class Bilibili(VideoExtractor): @staticmethod def bilibili_series_archives_api(mid, sid, pn=1, ps=100): return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps) - + @staticmethod def bilibili_space_favlist_api(fid, pn=1, ps=20): return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps) @@ -224,6 +224,10 @@ class Bilibili(VideoExtractor): if 'videoData' in initial_state: # (standard video) + # warn if cookies are not loaded + if cookies is None: + log.w('You will need login cookies for 720p formats or above. (use --cookies to load cookies.txt.)') + # warn if it is a multi-part video pn = initial_state['videoData']['videos'] if pn > 1 and not kwargs.get('playlist'): From 2aaa877a9b8ebda9ed25cb87df1ad760700b55c4 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 15 Feb 2023 16:20:47 +0100 Subject: [PATCH 73/77] [.github/workflows] test python 3.11 --- .github/workflows/python-package.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 37a8f1aa..39793c03 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,5 +1,4 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: develop @@ -16,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9, '3.10', 3.11-dev, pypy-3.8, pypy-3.9] + python-version: [3.7, 3.8, 3.9, '3.10', '3.11', pypy-3.8, pypy-3.9] steps: - uses: actions/checkout@v3 From ad5825a8f644442a3f45e028b7f04f4c6d861aba Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 9 May 2023 15:22:19 +0200 Subject: [PATCH 74/77] [twitter] fix extraction --- src/you_get/extractors/twitter.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index baf4c375..752ef746 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -34,7 +34,18 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) **kwargs) return - html = get_html(url, faker=True) # now it seems faker must be enabled + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0' + } + host = 'www.twitter.com' + + html, set_cookie = getHttps(host, url, headers=headers) + # "Found. Redirecting to..." + guest_id = r1('guest_id=([^;]+);', set_cookie) + headers['Cookie'] = 'guest_id=%s' % guest_id + + html = get_content(url, headers=headers) + screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \ r1(r' Date: Wed, 5 Jul 2023 17:12:15 +0200 Subject: [PATCH 75/77] [twitter] fix extraction --- src/you_get/extractors/twitter.py | 119 +++++++----------------------- 1 file changed, 26 insertions(+), 93 deletions(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 752ef746..43cfa6a4 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -23,7 +23,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) if re.match(r'https?://mobile', url): # normalize mobile URL url = 'https://' + match1(url, r'//mobile\.(.+)') - if re.match(r'https?://twitter\.com/i/moments/', url): # moments + if re.match(r'https?://twitter\.com/i/moments/', url): # FIXME: moments html = get_html(url, faker=True) paths = re.findall(r'data-permalink-path="([^"]+)"', html) for path in paths: @@ -34,114 +34,47 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) **kwargs) return - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0' - } - host = 'www.twitter.com' - - html, set_cookie = getHttps(host, url, headers=headers) - # "Found. Redirecting to..." - guest_id = r1('guest_id=([^;]+);', set_cookie) - headers['Cookie'] = 'guest_id=%s' % guest_id - - html = get_content(url, headers=headers) - - screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \ - r1(r' Date: Tue, 11 Jul 2023 17:20:12 +0200 Subject: [PATCH 76/77] [twitter] minor fix --- src/you_get/extractors/twitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 43cfa6a4..4a439fe8 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -62,6 +62,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) if 'video' in info: for mediaDetail in info['mediaDetails']: + if 'video_info' not in mediaDetail: continue variants = mediaDetail['video_info']['variants'] variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0)) title = item_id + '_' + variants[-1]['url'].split('/')[-1].split('?')[0].split('.')[0] From 9f38d7d76f2df34fa1bd72b826c5248a3aba67d3 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 25 Jul 2023 23:42:57 +0200 Subject: [PATCH 77/77] [common] update UA --- src/you_get/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index bdb67bac..4095dc52 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -141,11 +141,11 @@ postfix = False prefix = None fake_headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43', # noqa + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.183' # Latest Edge } if sys.stdout.isatty():