diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index daae6668..f90b61ae 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -14,8 +14,9 @@ jobs: runs-on: ubuntu-latest strategy: + fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9, pypy3] + python-version: [3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9] steps: - uses: actions/checkout@v2 diff --git a/Makefile b/Makefile index c0f9cf0e..fe4a238c 100644 --- a/Makefile +++ b/Makefile @@ -43,5 +43,7 @@ install: $(SETUP) install --user --prefix= release: - zenity --question - $(SETUP) sdist bdist_wheel upload --sign + #zenity --question + $(SETUP) sdist bdist_wheel + echo 'Upload new version to PyPI using:' + echo ' twine upload --sign dist/you-get-VERSION.tar.gz dist/you_get-VERSION-py3-none-any.whl' diff --git a/README.md b/README.md index d2fdaa99..44c102cd 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,9 @@ [![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/) [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -**NOTICE: Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.** +**NOTICE (30 May 2022): Support for Python 3.5, 3.6 and 3.7 will eventually be dropped. ([see details here](https://github.com/soimort/you-get/wiki/TLS-1.3-post-handshake-authentication-(PHA)))** + +**NOTICE (8 Mar 2019): Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.** --- @@ -53,9 +55,9 @@ Are you a Python programmer? 
Then check out [the source](https://github.com/soim ### Prerequisites -The following dependencies are necessary: +The following dependencies are recommended: -* **[Python](https://www.python.org/downloads/)** 3.2 or above +* **[Python](https://www.python.org/downloads/)** 3.7.4 or above * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/) @@ -89,6 +91,14 @@ $ python3 setup.py install --user to install `you-get` to a permanent path. +You can also use [pipenv](https://pipenv.pypa.io/en/latest) to install `you-get` in a Python virtual environment. + +``` +$ pipenv install -e . +$ pipenv run you-get --version +you-get: version 0.4.1555, a tiny downloader that scrapes the web. +``` + ### Option 4: Git clone This is the recommended way for all developers, even if you don't often code in Python. diff --git a/README.rst b/README.rst index 3c23ab5e..376abcf7 100644 --- a/README.rst +++ b/README.rst @@ -52,7 +52,7 @@ source `__ and fork it! .. |PyPI version| image:: https://badge.fury.io/py/you-get.png :target: http://badge.fury.io/py/you-get -.. |Build Status| image:: https://api.travis-ci.org/soimort/you-get.png - :target: https://travis-ci.org/soimort/you-get +.. |Build Status| image:: https://github.com/soimort/you-get/workflows/develop/badge.svg + :target: https://github.com/soimort/you-get/actions ..
|Gitter| image:: https://badges.gitter.im/Join%20Chat.svg :target: https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge diff --git a/src/you_get/common.py b/src/you_get/common.py index 597ed45a..c5c19d01 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -136,6 +136,8 @@ cookies = None output_filename = None auto_rename = False insecure = False +m3u8 = False +postfix = False fake_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa @@ -340,10 +342,34 @@ def undeflate(data): return decompressobj.decompress(data)+decompressobj.flush() +# an http.client implementation of get_content() +# because urllib does not support "Connection: keep-alive" +def getHttps(host, url, headers, gzip=True, deflate=False, debuglevel=0): + import http.client + + conn = http.client.HTTPSConnection(host) + conn.set_debuglevel(debuglevel) + conn.request("GET", url, headers=headers) + resp = conn.getresponse() + + data = resp.read() + if gzip: + data = ungzip(data) + if deflate: + data = undeflate(data) + + return str(data, encoding='utf-8') + + # DEPRECATED in favor of get_content() def get_response(url, faker=False): logging.debug('get_response: %s' % url) - + ctx = None + if insecure: + # ignore ssl errors + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE # install cookies if cookies: opener = request.build_opener(request.HTTPCookieProcessor(cookies)) @@ -351,10 +377,10 @@ def get_response(url, faker=False): if faker: response = request.urlopen( - request.Request(url, headers=fake_headers), None + request.Request(url, headers=fake_headers), None, context=ctx, ) else: - response = request.urlopen(url) + response = request.urlopen(url, context=ctx) data = response.read() if response.info().get('Content-Encoding') == 'gzip': @@ -983,6 +1009,8 @@ def download_urls( pass title = tr(get_filename(title)) + if postfix and 'vid' in 
kwargs: + title = "%s [%s]" % (title, kwargs['vid']) output_filename = get_output_filename(urls, title, ext, output_dir, merge) output_filepath = os.path.join(output_dir, output_filename) @@ -1339,7 +1367,13 @@ def download_main(download, download_playlist, urls, playlist, **kwargs): if re.match(r'https?://', url) is None: url = 'http://' + url - if playlist: + if m3u8: + if output_filename: + title = output_filename + else: + title = "m3u8file" + download_url_ffmpeg(url=url, title=title,ext = 'mp4',output_dir = '.') + elif playlist: download_playlist(url, **kwargs) else: download(url, **kwargs) @@ -1443,7 +1477,6 @@ def set_socks_proxy(proxy): proxy_info = proxy.split("@") socks_proxy_addrs = proxy_info[1].split(':') socks_proxy_auth = proxy_info[0].split(":") - print(socks_proxy_auth[0]+" "+socks_proxy_auth[1]+" "+socks_proxy_addrs[0]+" "+socks_proxy_addrs[1]) socks.set_default_proxy( socks.SOCKS5, socks_proxy_addrs[0], @@ -1454,7 +1487,6 @@ def set_socks_proxy(proxy): ) else: socks_proxy_addrs = proxy.split(':') - print(socks_proxy_addrs[0]+" "+socks_proxy_addrs[1]) socks.set_default_proxy( socks.SOCKS5, socks_proxy_addrs[0], @@ -1527,6 +1559,10 @@ def script_main(download, download_playlist, **kwargs): '--no-caption', action='store_true', help='Do not download captions (subtitles, lyrics, danmaku, ...)' ) + download_grp.add_argument( + '--postfix', action='store_true', default=False, + help='Postfix downloaded files with unique identifiers' + ) download_grp.add_argument( '-f', '--force', action='store_true', default=False, help='Force overwriting existing files' @@ -1619,6 +1655,10 @@ def script_main(download, download_playlist, **kwargs): download_grp.add_argument('--stream', help=argparse.SUPPRESS) download_grp.add_argument('--itag', help=argparse.SUPPRESS) + download_grp.add_argument('-m', '--m3u8', action='store_true', default=False, + help = 'download video using an m3u8 url') + + parser.add_argument('URL', nargs='*', help=argparse.SUPPRESS) args = 
parser.parse_args() @@ -1644,6 +1684,8 @@ def script_main(download, download_playlist, **kwargs): global output_filename global auto_rename global insecure + global m3u8 + global postfix output_filename = args.output_filename extractor_proxy = args.extractor_proxy @@ -1665,6 +1707,9 @@ def script_main(download, download_playlist, **kwargs): if args.cookies: load_cookies(args.cookies) + if args.m3u8: + m3u8 = True + caption = True stream_id = args.format or args.stream or args.itag if args.no_caption: @@ -1677,6 +1722,7 @@ def script_main(download, download_playlist, **kwargs): # ignore ssl insecure = True + postfix = args.postfix if args.no_proxy: set_http_proxy('') @@ -1763,20 +1809,10 @@ def google_search(url): url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords) page = get_content(url, headers=fake_headers) videos = re.findall( - r'

([^<]+)<', page + r'(https://www\.youtube\.com/watch\?v=[\w-]+)', page ) - vdurs = re.findall(r'([^<]+)<', page) - durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs] - print('Google Videos search:') - for v in zip(videos, durs): - print('- video: {} [{}]'.format( - unescape_html(v[0][1]), - v[1] if v[1] else '?' - )) - print('# you-get %s' % log.sprint(v[0][0], log.UNDERLINE)) - print() print('Best matched result:') - return(videos[0][0]) + return(videos[0]) def url_to_module(url): diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py index c4315935..bd71717e 100644 --- a/src/you_get/extractor.py +++ b/src/you_get/extractor.py @@ -238,7 +238,8 @@ class VideoExtractor(): download_urls(urls, self.title, ext, total_size, headers=headers, output_dir=kwargs['output_dir'], merge=kwargs['merge'], - av=stream_id in self.dash_streams) + av=stream_id in self.dash_streams, + vid=self.vid) if 'caption' not in kwargs or not kwargs['caption']: print('Skipping captions or danmaku.') diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index edb656c7..1a13b61c 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -12,8 +12,12 @@ class Bilibili(VideoExtractor): # Bilibili media encoding options, in descending quality order. 
stream_types = [ - {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280, - 'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'}, + {'id': 'hdflv2_8k', 'quality': 127, 'audio_quality': 30280, + 'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'}, + {'id': 'hdflv2_dolby', 'quality': 126, 'audio_quality': 30280, + 'container': 'FLV', 'video_resolution': '3840p', 'desc': '杜比视界'}, + {'id': 'hdflv2_hdr', 'quality': 125, 'audio_quality': 30280, + 'container': 'FLV', 'video_resolution': '2160p', 'desc': '真彩 HDR'}, {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280, 'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'}, {'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280, @@ -112,12 +116,16 @@ class Bilibili(VideoExtractor): def bilibili_space_channel_api(mid, cid, pn=1, ps=100): return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps) + @staticmethod + def bilibili_series_archives_api(mid, sid, pn=1, ps=100): + return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps) + @staticmethod def bilibili_space_favlist_api(fid, pn=1, ps=20): return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps) @staticmethod - def bilibili_space_video_api(mid, pn=1, ps=100): + def bilibili_space_video_api(mid, pn=1, ps=50): return "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%s&ps=%s&tid=0&keyword=&order=pubdate&jsonp=jsonp" % (mid, pn, ps) @staticmethod @@ -137,6 +145,8 @@ class Bilibili(VideoExtractor): def prepare(self, **kwargs): self.stream_qualities = {s['quality']: s for s in self.stream_types} + self.streams.clear() + self.dash_streams.clear() try: html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url)) @@ -167,6 +177,11 @@ class Bilibili(VideoExtractor): self.url = 
'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)') html_content = get_content(self.url, headers=self.bilibili_headers()) + # redirect: festival + elif re.match(r'https?://(www\.)?bilibili\.com/festival/(.+)', self.url): + self.url = 'https://www.bilibili.com/video/%s' % match1(self.url, r'bvid=([^&]+)') + html_content = get_content(self.url, headers=self.bilibili_headers()) + # sort it out if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url): sort = 'audio' @@ -178,7 +193,7 @@ class Bilibili(VideoExtractor): sort = 'live' elif re.match(r'https?://vc\.bilibili\.com/video/(\d+)', self.url): sort = 'vc' - elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(BV(\S+)))', self.url): + elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(bv(\S+))|(BV(\S+)))', self.url): sort = 'video' elif re.match(r'https?://h\.?bilibili\.com/(\d+)', self.url): sort = 'h' @@ -193,28 +208,43 @@ class Bilibili(VideoExtractor): playinfo_text = match1(html_content, r'__playinfo__=(.*?)', cont) - try: - info = json.loads(data.group(1)) - post = info['entry_data']['PostPage'][0] - assert post - except: - # with logged-in cookies - data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);', cont) - if data is not None: - log.e('[Warning] Cookies needed.') - post = json.loads(data.group(1)) + api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id + try: + api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}}) + except: + log.wtf('[Error] Please specify a cookie file.') + post = json.loads(api_cont) - if 'edge_sidecar_to_children' in post['graphql']['shortcode_media']: - edges = post['graphql']['shortcode_media']['edge_sidecar_to_children']['edges'] - for edge in edges: - title = edge['node']['shortcode'] - image_url = edge['node']['display_url'] - if 'video_url' in edge['node']: - image_url = edge['node']['video_url'] - ext = image_url.split('?')[0].split('.')[-1] - size = 
int(get_head(image_url)['Content-Length']) - - print_info(site_info, title, ext, size) - if not info_only: - download_urls(urls=[image_url], - title=title, - ext=ext, - total_size=size, - output_dir=output_dir) - else: - title = post['graphql']['shortcode_media']['shortcode'] - image_url = post['graphql']['shortcode_media']['display_url'] - if 'video_url' in post['graphql']['shortcode_media']: - image_url = post['graphql']['shortcode_media']['video_url'] + for item in post['items']: + code = item['code'] + carousel_media = item.get('carousel_media') or [item] + for i, media in enumerate(carousel_media): + title = '%s [%s]' % (code, i) + image_url = media['image_versions2']['candidates'][0]['url'] ext = image_url.split('?')[0].split('.')[-1] size = int(get_head(image_url)['Content-Length']) @@ -66,6 +40,20 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg total_size=size, output_dir=output_dir) + # download videos (if any) + if 'video_versions' in media: + video_url = media['video_versions'][0]['url'] + ext = video_url.split('?')[0].split('.')[-1] + size = int(get_head(video_url)['Content-Length']) + + print_info(site_info, title, ext, size) + if not info_only: + download_urls(urls=[video_url], + title=title, + ext=ext, + total_size=size, + output_dir=output_dir) + site_info = "Instagram.com" download = instagram_download download_playlist = playlist_not_supported('instagram') diff --git a/src/you_get/extractors/ixigua.py b/src/you_get/extractors/ixigua.py index 2f11e7f9..b368b380 100644 --- a/src/you_get/extractors/ixigua.py +++ b/src/you_get/extractors/ixigua.py @@ -18,121 +18,95 @@ headers = { } -def int_overflow(val): - maxint = 2147483647 - if not -maxint - 1 <= val <= maxint: - val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1 - return val - - -def unsigned_right_shitf(n, i): - if n < 0: - n = ctypes.c_uint32(n).value - if i < 0: - return -int_overflow(n << abs(i)) - return int_overflow(n >> i) - - -def 
get_video_url_from_video_id(video_id): - """Splicing URLs according to video ID to get video details""" - # from js - data = [""] * 256 - for index, _ in enumerate(data): - t = index - for i in range(8): - t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1) - data[index] = t - - def tmp(): - rand_num = random.random() - path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id, - random_num=str(rand_num)[2:]) - e = o = r = -1 - i, a = 0, len(path) - while i < a: - e = ord(path[i]) - i += 1 - if e < 128: - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)] - else: - if e < 2048: - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))] - else: - if 55296 <= e < 57344: - e = (1023 & e) + 64 - i += 1 - o = 1023 & t.url(i) - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))] - else: - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))] - - return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0)) - - while 1: - url = tmp() - if url.split("=")[-1][0] != "-": # 参数s不能为负数 - return url - - -def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs): +def ixigua_download(url, output_dir='.', merge=True, info_only=False, stream_id='', **kwargs): # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422 - resp = urlopen_with_retry(request.Request(url)) + headers['cookie'] = "MONITOR_WEB_ID=7892c49b-296e-4499-8704-e47c1b15123; " \ + "ixigua-a-s=1; 
ttcid=af99669b6304453480454f1507011d5c234; BD_REF=1; " \ + "__ac_nonce=060d88ff000a75e8d17eb; __ac_signature=_02B4Z6wo100f01kX9ZpgAAIDAKIBBQUIPYT5F2WIAAPG2ad; " \ + "ttwid=1%7CcIsVF_3vqSIk4XErhPB0H2VaTxT0tdsTMRbMjrJOPN8%7C1624806049%7C08ce7dd6f7d20506a41ba0a331ef96a6505d96731e6ad9f6c8c709f53f227ab1; " + + resp = urlopen_with_retry(request.Request(url, headers=headers)) html = resp.read().decode('utf-8') _cookies = [] for c in resp.getheader('Set-Cookie').split("httponly,"): _cookies.append(c.strip().split(' ')[0]) - headers['cookie'] = ' '.join(_cookies) + headers['cookie'] += ' '.join(_cookies) - conf = loads(match1(html, r"window\.config = (.+);")) - if not conf: - log.e("Get window.config from url failed, url: {}".format(url)) + match_txt = match1(html, r"', html) + m = re.match('(https?://)?([^/]+)(/.*)', url) + host = m.group(2) + if host != 'www.tiktok.com': # non-canonical URL + url = get_location(url, headers=headers) + m = re.match('(https?://)?([^/]+)(/.*)', url) + host = m.group(2) + + url = m.group(3).split('?')[0] + vid = url.split('/')[3] # should be a string of numbers + + html = getHttps(host, url, headers=headers) + + data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) or \ + r1(r'', html) info = json.loads(data) - wid = info['props']['initialProps']['$wid'] - cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) + downloadAddr = info['ItemModule'][vid]['video']['downloadAddr'] + author = info['ItemModule'][vid]['author'] # same as uniqueId + nickname = info['UserModule']['users'][author]['nickname'] + title = '%s [%s]' % (nickname or author, vid) - # here's the cookie - headers['Cookie'] = cookie - - # try again - html = get_content(url, headers=headers) - data = r1(r'', html) - info = json.loads(data) - wid = info['props']['initialProps']['$wid'] - cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid) - - videoData = info['props']['pageProps']['itemInfo']['itemStruct'] - videoId = videoData['id'] - videoUrl = 
videoData['video']['downloadAddr'] - uniqueId = videoData['author'].get('uniqueId') - nickName = videoData['author'].get('nickname') - - title = '%s [%s]' % (nickName or uniqueId, videoId) - - # we also need the referer - headers['Referer'] = referUrl - - mime, ext, size = url_info(videoUrl, headers=headers) + mime, ext, size = url_info(downloadAddr, headers=headers) print_info(site_info, title, mime, size) if not info_only: - download_urls([videoUrl], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) + download_urls([downloadAddr], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) site_info = "TikTok.com" download = tiktok_download diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 23468211..19b4ce87 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -51,7 +51,12 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) info = json.loads(api_content) - if 'extended_entities' in info['globalObjects']['tweets'][item_id]: + if item_id not in info['globalObjects']['tweets']: + # something wrong here + log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None) + return + + elif 'extended_entities' in info['globalObjects']['tweets'][item_id]: # if the tweet contains media, download them media = info['globalObjects']['tweets'][item_id]['extended_entities']['media'] diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py index fdc7426d..4a3268ab 100644 --- a/src/you_get/extractors/universal.py +++ b/src/you_get/extractors/universal.py @@ -76,7 +76,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg urls = [] for i in media_exts: - urls += 
re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ ;&"\'\\<>]*)', page) + urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ =?;&"\'\\<>]*)', page) p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page) urls += [parse.unquote(url) for url in p_urls] diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 81b45ac5..ddf12be9 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -78,6 +78,8 @@ class YouTube(VideoExtractor): # - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js # - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js # - https://www.youtube.com/s/player/50e823fc/player_ias.vflset/sv_SE/base.js + # - https://www.youtube.com/s/player/3b5d5649/player_ias.vflset/sv_SE/base.js + # - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js def tr_js(code): code = re.sub(r'function', r'def', code) # add prefix '_sig_' to prevent namespace pollution @@ -113,12 +115,10 @@ class YouTube(VideoExtractor): else: f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js) f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2)) - f2 = re.sub(r'(as|if|in|is|or)', r'_\1', f2) - f2 = re.sub(r'\$', '_dollar', f2) + f2 = re.sub(r'\$', '_dollar', f2) # replace dollar sign code = code + 'global _sig_%s\n' % f2 + tr_js(f2def) - f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1) - f1 = re.sub(r'\$', '_dollar', f1) + f1 = re.sub(r'\$', '_dollar', f1) # replace dollar sign code = code + '_sig=_sig_%s(s)' % f1 exec(code, globals(), locals()) return locals()['_sig'] @@ -141,6 +141,7 @@ class YouTube(VideoExtractor): """ return match1(url, r'youtu\.be/([^?/]+)') or \ match1(url, r'youtube\.com/embed/([^/?]+)') or \ + match1(url, r'youtube\.com/shorts/([^/?]+)') or \ match1(url, r'youtube\.com/v/([^/?]+)') or \ match1(url, r'youtube\.com/watch/([^/?]+)') or \ parse_query_param(url, 'v') or \ @@ -233,7 +234,10 
@@ class YouTube(VideoExtractor): except: # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}} - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + try: # FIXME: we should extract ytInitialPlayerResponse more reliably + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + except: + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) stream_list = ytInitialPlayerResponse['streamingData']['formats'] #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] @@ -258,7 +262,10 @@ class YouTube(VideoExtractor): # Parse video page instead video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + try: # FIXME: we should extract ytInitialPlayerResponse more reliably + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) + except: + ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1)) self.title = ytInitialPlayerResponse["videoDetails"]["title"] if re.search('([^"]*/base\.js)"', video_page): diff --git a/src/you_get/extractors/zhihu.py b/src/you_get/extractors/zhihu.py index 64f81423..1dceef53 100644 --- a/src/you_get/extractors/zhihu.py +++ b/src/you_get/extractors/zhihu.py @@ -31,8 +31,8 @@ def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs): play_list = video_info["playlist"] # first High Definition - # second Second Standard Definition - # third ld. What is ld ? 
+ # second Standard Definition + # third Low Definition # finally continue data = play_list.get("hd", play_list.get("sd", play_list.get("ld", None))) if not data: diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py index 11126c27..50e2c9fe 100755 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -93,7 +93,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) - params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy'] params.extend(['--', output]) if subprocess.call(params, stdin=STDIN) == 0: @@ -149,7 +149,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) - params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) @@ -203,7 +203,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) - params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) diff --git a/src/you_get/version.py b/src/you_get/version.py index f7daa7f8..ac2bfc03 100644 --- 
a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1545' +__version__ = '0.4.1620' diff --git a/tests/test.py b/tests/test.py index 4a2a117c..a1c6c076 100644 --- a/tests/test.py +++ b/tests/test.py @@ -10,7 +10,8 @@ from you_get.extractors import ( acfun, bilibili, soundcloud, - tiktok + tiktok, + twitter ) @@ -28,11 +29,11 @@ class YouGetTests(unittest.TestCase): youtube.download( 'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True ) - youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) - youtube.download( - 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa - info_only=True - ) + #youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) + #youtube.download( + # 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa + # info_only=True + #) #youtube.download( # 'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True #) @@ -40,6 +41,9 @@ class YouGetTests(unittest.TestCase): def test_acfun(self): acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True) + #def test_bilibili(self): + # bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True) + #def test_soundcloud(self): ## single song #soundcloud.download( @@ -50,10 +54,12 @@ class YouGetTests(unittest.TestCase): # 'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True #) - #def tests_tiktok(self): - # tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) - # tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) - # tiktok.download('https://vt.tiktok.com/UGJR4R/', info_only=True) + def test_tiktok(self): + tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) + tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', 
info_only=True) + + def test_twitter(self): + twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True) if __name__ == '__main__': diff --git a/you-get.json b/you-get.json index e98e2e8a..bb94ba00 100644 --- a/you-get.json +++ b/you-get.json @@ -18,13 +18,10 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Topic :: Internet", "Topic :: Internet :: WWW/HTTP", "Topic :: Multimedia",