Merge branch 'soimort:develop' into develop

commit f894232ba3
gitreposk authored 2022-09-06 16:46:36 +08:00, committed by GitHub
GPG Key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)

21 changed files with 361 additions and 265 deletions

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml

@@ -14,8 +14,9 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
-        python-version: [3.5, 3.6, 3.7, 3.8, 3.9, pypy3]
+        python-version: [3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9]
     steps:
     - uses: actions/checkout@v2

diff --git a/Makefile b/Makefile

@@ -43,5 +43,7 @@ install:
	$(SETUP) install --user --prefix=

release:
-	zenity --question
-	$(SETUP) sdist bdist_wheel upload --sign
+	#zenity --question
+	$(SETUP) sdist bdist_wheel
+	echo 'Upload new version to PyPI using:'
+	echo ' twine upload --sign dist/you-get-VERSION.tar.gz dist/you_get-VERSION-py3-none-any.whl'

diff --git a/README.md b/README.md

@@ -4,7 +4,9 @@
 [![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/)
 [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 
-**NOTICE: Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.**
+**NOTICE (30 May 2022): Support for Python 3.5, 3.6 and 3.7 will eventually be dropped. ([see details here](https://github.com/soimort/you-get/wiki/TLS-1.3-post-handshake-authentication-(PHA)))**
+
+**NOTICE (8 Mar 2019): Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.**
 
 ---
@@ -53,9 +55,9 @@ Are you a Python programmer? Then check out [the source](https://github.com/soimort/you-get) and fork it!
 
 ### Prerequisites
 
-The following dependencies are necessary:
+The following dependencies are recommended:
 
-* **[Python](https://www.python.org/downloads/)** 3.2 or above
+* **[Python](https://www.python.org/downloads/)** 3.7.4 or above
 * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above
 * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/)
@@ -89,6 +91,14 @@ $ python3 setup.py install --user
 
 to install `you-get` to a permanent path.
 
+You can also use [pipenv](https://pipenv.pypa.io/en/latest) to install `you-get` in a Python virtual environment.
+
+```
+$ pipenv install -e .
+$ pipenv run you-get --version
+you-get: version 0.4.1555, a tiny downloader that scrapes the web.
+```
+
 ### Option 4: Git clone
 
 This is the recommended way for all developers, even if you don't often code in Python.

diff --git a/README.rst b/README.rst

@@ -52,7 +52,7 @@ source <https://github.com/soimort/you-get>`__ and fork it!
 .. |PyPI version| image:: https://badge.fury.io/py/you-get.png
    :target: http://badge.fury.io/py/you-get
-.. |Build Status| image:: https://api.travis-ci.org/soimort/you-get.png
-   :target: https://travis-ci.org/soimort/you-get
+.. |Build Status| image:: https://github.com/soimort/you-get/workflows/develop/badge.svg
+   :target: https://github.com/soimort/you-get/actions
 .. |Gitter| image:: https://badges.gitter.im/Join%20Chat.svg
    :target: https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge

diff --git a/src/you_get/common.py b/src/you_get/common.py

@@ -136,6 +136,8 @@ cookies = None
 output_filename = None
 auto_rename = False
 insecure = False
+m3u8 = False
+postfix = False
 
 fake_headers = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',  # noqa
@@ -340,10 +342,34 @@ def undeflate(data):
     return decompressobj.decompress(data)+decompressobj.flush()
 
 
+# an http.client implementation of get_content()
+# because urllib does not support "Connection: keep-alive"
+def getHttps(host, url, headers, gzip=True, deflate=False, debuglevel=0):
+    import http.client
+
+    conn = http.client.HTTPSConnection(host)
+    conn.set_debuglevel(debuglevel)
+    conn.request("GET", url, headers=headers)
+    resp = conn.getresponse()
+
+    data = resp.read()
+    if gzip:
+        data = ungzip(data)
+    if deflate:
+        data = undeflate(data)
+
+    return str(data, encoding='utf-8')
+
+
 # DEPRECATED in favor of get_content()
 def get_response(url, faker=False):
     logging.debug('get_response: %s' % url)
+    ctx = None
+    if insecure:
+        # ignore ssl errors
+        ctx = ssl.create_default_context()
+        ctx.check_hostname = False
+        ctx.verify_mode = ssl.CERT_NONE
 
     # install cookies
     if cookies:
         opener = request.build_opener(request.HTTPCookieProcessor(cookies))
@@ -351,10 +377,10 @@ def get_response(url, faker=False):
 
     if faker:
         response = request.urlopen(
-            request.Request(url, headers=fake_headers), None
+            request.Request(url, headers=fake_headers), None, context=ctx,
         )
     else:
-        response = request.urlopen(url)
+        response = request.urlopen(url, context=ctx)
 
     data = response.read()
     if response.info().get('Content-Encoding') == 'gzip':
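The new `getHttps()` helper is consumed by the rewritten tiktok extractor later in this commit. A minimal standalone sketch of a call (the hostname, path, and headers here are illustrative, not from the diff; since the default `gzip=True` unconditionally un-gzips the body, the caller is expected to send `Accept-Encoding: gzip` so the response actually is gzipped):

```
from you_get.common import getHttps  # assumes the you-get source tree is on PYTHONPATH

html = getHttps(
    'www.tiktok.com',                  # bare hostname, no scheme
    '/@someuser/video/1234567890',     # absolute path on that host (hypothetical)
    headers={
        'User-Agent': 'Mozilla/5.0',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',    # the reason this helper exists
    },
)
print(html[:100])
```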
@@ -983,6 +1009,8 @@ def download_urls(
             pass
 
     title = tr(get_filename(title))
+    if postfix and 'vid' in kwargs:
+        title = "%s [%s]" % (title, kwargs['vid'])
    output_filename = get_output_filename(urls, title, ext, output_dir, merge)
    output_filepath = os.path.join(output_dir, output_filename)
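In plain terms: when the new `--postfix` flag is set and an extractor forwards its video id (as `extractor.py` now does via `vid=self.vid`), the id is appended to the filename stem. A toy illustration with a made-up id:

```
title = 'Some Video'
kwargs = {'vid': 'BV1xx411c7mD'}   # hypothetical id forwarded by the extractor
postfix = True                     # set from the new --postfix flag

if postfix and 'vid' in kwargs:
    title = "%s [%s]" % (title, kwargs['vid'])
print(title)                       # Some Video [BV1xx411c7mD]
```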
@@ -1339,7 +1367,13 @@ def download_main(download, download_playlist, urls, playlist, **kwargs):
         if re.match(r'https?://', url) is None:
             url = 'http://' + url
 
-        if playlist:
+        if m3u8:
+            if output_filename:
+                title = output_filename
+            else:
+                title = "m3u8file"
+            download_url_ffmpeg(url=url, title=title, ext='mp4', output_dir='.')
+        elif playlist:
             download_playlist(url, **kwargs)
         else:
             download(url, **kwargs)
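The new `-m`/`--m3u8` switch bypasses the extractors entirely and hands the URL straight to FFmpeg. Roughly, `you-get -m https://example.com/index.m3u8 -O myvideo` (hypothetical URL; requires `ffmpeg` on PATH) reduces to a direct call to the existing helper:

```
from you_get.common import download_url_ffmpeg

# equivalent of: you-get -m https://example.com/index.m3u8 -O myvideo
download_url_ffmpeg(url='https://example.com/index.m3u8',
                    title='myvideo', ext='mp4', output_dir='.')
```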
@@ -1443,7 +1477,6 @@ def set_socks_proxy(proxy):
             proxy_info = proxy.split("@")
             socks_proxy_addrs = proxy_info[1].split(':')
             socks_proxy_auth = proxy_info[0].split(":")
-            print(socks_proxy_auth[0]+" "+socks_proxy_auth[1]+" "+socks_proxy_addrs[0]+" "+socks_proxy_addrs[1])
             socks.set_default_proxy(
                 socks.SOCKS5,
                 socks_proxy_addrs[0],
@@ -1454,7 +1487,6 @@ def set_socks_proxy(proxy):
             )
         else:
             socks_proxy_addrs = proxy.split(':')
-            print(socks_proxy_addrs[0]+" "+socks_proxy_addrs[1])
             socks.set_default_proxy(
                 socks.SOCKS5,
                 socks_proxy_addrs[0],
@@ -1527,6 +1559,10 @@ def script_main(download, download_playlist, **kwargs):
         '--no-caption', action='store_true',
         help='Do not download captions (subtitles, lyrics, danmaku, ...)'
     )
+    download_grp.add_argument(
+        '--postfix', action='store_true', default=False,
+        help='Postfix downloaded files with unique identifiers'
+    )
     download_grp.add_argument(
         '-f', '--force', action='store_true', default=False,
         help='Force overwriting existing files'
     )
@@ -1619,6 +1655,10 @@ def script_main(download, download_playlist, **kwargs):
     download_grp.add_argument('--stream', help=argparse.SUPPRESS)
     download_grp.add_argument('--itag', help=argparse.SUPPRESS)
 
+    download_grp.add_argument('-m', '--m3u8', action='store_true', default=False,
+                              help='download video using an m3u8 url')
+
     parser.add_argument('URL', nargs='*', help=argparse.SUPPRESS)
 
     args = parser.parse_args()
@@ -1644,6 +1684,8 @@ def script_main(download, download_playlist, **kwargs):
     global output_filename
     global auto_rename
     global insecure
+    global m3u8
+    global postfix
     output_filename = args.output_filename
     extractor_proxy = args.extractor_proxy
@@ -1665,6 +1707,9 @@ def script_main(download, download_playlist, **kwargs):
     if args.cookies:
         load_cookies(args.cookies)
 
+    if args.m3u8:
+        m3u8 = True
+
     caption = True
     stream_id = args.format or args.stream or args.itag
     if args.no_caption:
@@ -1677,6 +1722,7 @@ def script_main(download, download_playlist, **kwargs):
         # ignore ssl
         insecure = True
 
+    postfix = args.postfix
 
     if args.no_proxy:
         set_http_proxy('')
@@ -1763,20 +1809,10 @@ def google_search(url):
     url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
     page = get_content(url, headers=fake_headers)
     videos = re.findall(
-        r'<a href="(https?://[^"]+)" onmousedown="[^"]+"><h3 class="[^"]*">([^<]+)<', page
+        r'(https://www\.youtube\.com/watch\?v=[\w-]+)', page
     )
-    vdurs = re.findall(r'<span class="vdur[^"]*">([^<]+)<', page)
-    durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
-    print('Google Videos search:')
-    for v in zip(videos, durs):
-        print('- video: {} [{}]'.format(
-            unescape_html(v[0][1]),
-            v[1] if v[1] else '?'
-        ))
-        print('# you-get %s' % log.sprint(v[0][0], log.UNDERLINE))
-        print()
     print('Best matched result:')
-    return(videos[0][0])
+    return(videos[0])
 
 
 def url_to_module(url):
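Why the return value changed from `videos[0][0]` to `videos[0]`: the old pattern had two capture groups, so `re.findall()` returned tuples, while the new pattern has a single group and yields plain strings. A quick self-contained check (the page snippet is fabricated, not real search output):

```
import re

page = '... <a href="https://www.youtube.com/watch?v=dQw4w9WgXcQ"> ...'
videos = re.findall(r'(https://www\.youtube\.com/watch\?v=[\w-]+)', page)
assert videos[0] == 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'
```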

diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py

@@ -238,7 +238,8 @@ class VideoExtractor():
             download_urls(urls, self.title, ext, total_size, headers=headers,
                           output_dir=kwargs['output_dir'],
                           merge=kwargs['merge'],
-                          av=stream_id in self.dash_streams)
+                          av=stream_id in self.dash_streams,
+                          vid=self.vid)
 
         if 'caption' not in kwargs or not kwargs['caption']:
             print('Skipping captions or danmaku.')

diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py

@@ -12,8 +12,12 @@ class Bilibili(VideoExtractor):
     # Bilibili media encoding options, in descending quality order.
     stream_types = [
-        {'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280,
-         'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'},
+        {'id': 'hdflv2_8k', 'quality': 127, 'audio_quality': 30280,
+         'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'},
+        {'id': 'hdflv2_dolby', 'quality': 126, 'audio_quality': 30280,
+         'container': 'FLV', 'video_resolution': '3840p', 'desc': '杜比视界'},
+        {'id': 'hdflv2_hdr', 'quality': 125, 'audio_quality': 30280,
+         'container': 'FLV', 'video_resolution': '2160p', 'desc': '真彩 HDR'},
         {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280,
          'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'},
         {'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280,
@@ -112,12 +116,16 @@ class Bilibili(VideoExtractor):
     def bilibili_space_channel_api(mid, cid, pn=1, ps=100):
         return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps)
 
+    @staticmethod
+    def bilibili_series_archives_api(mid, sid, pn=1, ps=100):
+        return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps)
+
     @staticmethod
     def bilibili_space_favlist_api(fid, pn=1, ps=20):
         return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps)
 
     @staticmethod
-    def bilibili_space_video_api(mid, pn=1, ps=100):
+    def bilibili_space_video_api(mid, pn=1, ps=50):
         return "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%s&ps=%s&tid=0&keyword=&order=pubdate&jsonp=jsonp" % (mid, pn, ps)
 
     @staticmethod
@@ -137,6 +145,8 @@ class Bilibili(VideoExtractor):
 
     def prepare(self, **kwargs):
         self.stream_qualities = {s['quality']: s for s in self.stream_types}
+        self.streams.clear()
+        self.dash_streams.clear()
 
         try:
             html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url))
@@ -167,6 +177,11 @@ class Bilibili(VideoExtractor):
             self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)')
             html_content = get_content(self.url, headers=self.bilibili_headers())
 
+        # redirect: festival
+        elif re.match(r'https?://(www\.)?bilibili\.com/festival/(.+)', self.url):
+            self.url = 'https://www.bilibili.com/video/%s' % match1(self.url, r'bvid=([^&]+)')
+            html_content = get_content(self.url, headers=self.bilibili_headers())
+
         # sort it out
         if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url):
             sort = 'audio'
@@ -178,7 +193,7 @@ class Bilibili(VideoExtractor):
             sort = 'live'
         elif re.match(r'https?://vc\.bilibili\.com/video/(\d+)', self.url):
             sort = 'vc'
-        elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(BV(\S+)))', self.url):
+        elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(bv(\S+))|(BV(\S+)))', self.url):
             sort = 'video'
         elif re.match(r'https?://h\.?bilibili\.com/(\d+)', self.url):
             sort = 'h'
@@ -193,28 +208,43 @@ class Bilibili(VideoExtractor):
             playinfo_text = match1(html_content, r'__playinfo__=(.*?)</script><script>')  # FIXME
             playinfo = json.loads(playinfo_text) if playinfo_text else None
+            playinfo = playinfo if playinfo and playinfo.get('code') == 0 else None
 
             html_content_ = get_content(self.url, headers=self.bilibili_headers(cookie='CURRENT_FNVAL=16'))
             playinfo_text_ = match1(html_content_, r'__playinfo__=(.*?)</script><script>')  # FIXME
             playinfo_ = json.loads(playinfo_text_) if playinfo_text_ else None
+            playinfo_ = playinfo_ if playinfo_ and playinfo_.get('code') == 0 else None
 
-            # warn if it is a multi-part video
-            pn = initial_state['videoData']['videos']
-            if pn > 1 and not kwargs.get('playlist'):
-                log.w('This is a multipart video. (use --playlist to download all parts.)')
-
-            # set video title
-            self.title = initial_state['videoData']['title']
-            # refine title for a specific part, if it is a multi-part video
-            p = int(match1(self.url, r'[\?&]p=(\d+)') or match1(self.url, r'/index_(\d+)') or
-                    '1')  # use URL to decide p-number, not initial_state['p']
-            if pn > 1:
-                part = initial_state['videoData']['pages'][p - 1]['part']
-                self.title = '%s (P%s. %s)' % (self.title, p, part)
-
-            # construct playinfos
-            avid = initial_state['aid']
-            cid = initial_state['videoData']['pages'][p - 1]['cid']  # use p-number, not initial_state['videoData']['cid']
+            if 'videoData' in initial_state:
+                # (standard video)
+
+                # warn if it is a multi-part video
+                pn = initial_state['videoData']['videos']
+                if pn > 1 and not kwargs.get('playlist'):
+                    log.w('This is a multipart video. (use --playlist to download all parts.)')
+
+                # set video title
+                self.title = initial_state['videoData']['title']
+                # refine title for a specific part, if it is a multi-part video
+                p = int(match1(self.url, r'[\?&]p=(\d+)') or match1(self.url, r'/index_(\d+)') or
+                        '1')  # use URL to decide p-number, not initial_state['p']
+                if pn > 1:
+                    part = initial_state['videoData']['pages'][p - 1]['part']
+                    self.title = '%s (P%s. %s)' % (self.title, p, part)
+
+                # construct playinfos
+                avid = initial_state['aid']
+                cid = initial_state['videoData']['pages'][p - 1]['cid']  # use p-number, not initial_state['videoData']['cid']
+            else:
+                # (festival video)
+
+                # set video title
+                self.title = initial_state['videoInfo']['title']
+
+                # construct playinfos
+                avid = initial_state['videoInfo']['aid']
+                cid = initial_state['videoInfo']['cid']
 
             current_quality, best_quality = None, None
             if playinfo is not None:
                 current_quality = playinfo['data']['quality'] or None  # 0 indicates an error, fallback to None
@@ -592,10 +622,12 @@ class Bilibili(VideoExtractor):
         elif re.match(r'https?://(www\.)?bilibili\.com/bangumi/media/md(\d+)', self.url) or \
                 re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)', self.url):
             sort = 'bangumi_md'
-        elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|BV(\S+))', self.url):
+        elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|bv(\S+)|BV(\S+))', self.url):
             sort = 'video'
         elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/detail\?.*cid=(\d+)', self.url):
             sort = 'space_channel'
+        elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url):
+            sort = 'space_channel_series'
         elif re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url):
             sort = 'space_favlist'
         elif re.match(r'https?://space\.?bilibili\.com/(\d+)/video', self.url):
@@ -706,6 +738,20 @@ class Bilibili(VideoExtractor):
                 url = 'https://www.bilibili.com/video/av%s' % video['aid']
                 self.__class__().download_playlist_by_url(url, **kwargs)
 
+        elif sort == 'space_channel_series':
+            m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url)
+            mid, sid = m.group(1), m.group(2)
+            api_url = self.bilibili_series_archives_api(mid, sid)
+            api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url))
+            archives_info = json.loads(api_content)
+            # TBD: channel of more than 100 videos
+
+            epn, i = len(archives_info['data']['archives']), 0
+            for video in archives_info['data']['archives']:
+                i += 1; log.w('Extracting %s of %s videos ...' % (i, epn))
+                url = 'https://www.bilibili.com/video/av%s' % video['aid']
+                self.__class__().download_playlist_by_url(url, **kwargs)
+
         elif sort == 'space_favlist':
             m = re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url)
             vmid, fid = m.group(1), m.group(2)
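The new `space_channel_series` branch relies on the `bilibili_series_archives_api()` helper added above. Since it is a static method, the endpoint it builds can be inspected directly (the `mid`/`sid` values below are made up):

```
from you_get.extractors.bilibili import Bilibili  # assumes you-get is importable

print(Bilibili.bilibili_series_archives_api(mid='123456', sid='789'))
# https://api.bilibili.com/x/series/archives?mid=123456&series_id=789&pn=1&ps=100&only_normal=true&sort=asc&jsonp=jsonp
```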

diff --git a/src/you_get/extractors/douyin.py b/src/you_get/extractors/douyin.py

@@ -1,8 +1,6 @@
 # coding=utf-8
 
-import re
 import json
-from urllib.parse import unquote
 
 from ..common import (
     url_size,
@@ -11,25 +9,52 @@ from ..common import (
     fake_headers,
     download_urls,
     playlist_not_supported,
+    match1,
+    get_location,
 )
 
 __all__ = ['douyin_download_by_url']
 
 
+def get_value(source: dict, path):
+    try:
+        value = source
+        for key in path:
+            if type(key) is str:
+                if key in value.keys():
+                    value = value[key]
+                else:
+                    value = None
+                    break
+            elif type(key) is int:
+                if len(value) != 0:
+                    value = value[key]
+                else:
+                    value = None
+                    break
+    except:
+        value = None
+    return value
+
+
 def douyin_download_by_url(url, **kwargs):
+    # if short link, get the real url
+    if 'v.douyin.com' in url:
+        url = get_location(url)
+    aweme_id = match1(url, r'/(\d+)/?')
+
+    # get video info
+    video_info_api = 'https://www.douyin.com/web/api/v2/aweme/iteminfo/?item_ids={}'
+    url = video_info_api.format(aweme_id)
     page_content = get_content(url, headers=fake_headers)
-    # The video player and video source are rendered client-side, the data
-    # contains in a <script id="RENDER_DATA" type="application/json"> tag
-    # quoted, unquote the whole page content then search using regex with
-    # regular string.
-    page_content = unquote(page_content)
-    title = re.findall(r'"desc":"([^"]*)"', page_content)[0].strip()
+    video_info = json.loads(page_content)
+
+    # get video id and title
+    video_id = get_value(video_info, ['item_list', 0, 'video', 'vid'])
+    title = get_value(video_info, ['item_list', 0, 'desc'])
+
+    # get video play url
+    video_url = "https://aweme.snssdk.com/aweme/v1/playwm/?ratio=720p&line=0&video_id={}".format(video_id)
     video_format = 'mp4'
-    # video URLs are in this pattern {"src":"THE_URL"}, in json format
-    urls_pattern = r'"playAddr":(\[.*?\])'
-    urls = json.loads(re.findall(urls_pattern, page_content)[0])
-    video_url = 'https:' + urls[0]['src']
     size = url_size(video_url, faker=True)
     print_info(
         site_info='douyin.com', title=title,
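The `get_value()` helper walks a path of dict keys and list indices into the API response and returns `None` instead of raising when anything along the path is missing. A small self-check with fabricated data:

```
from you_get.extractors.douyin import get_value  # assumes you-get is importable

sample = {'item_list': [{'video': {'vid': 'v0200f'}, 'desc': 'hello'}]}

assert get_value(sample, ['item_list', 0, 'video', 'vid']) == 'v0200f'
assert get_value(sample, ['item_list', 0, 'desc']) == 'hello'
assert get_value(sample, ['item_list', 5, 'desc']) is None  # index out of range
assert get_value(sample, ['nope']) is None                  # missing key
```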

diff --git a/src/you_get/extractors/imgur.py b/src/you_get/extractors/imgur.py

@@ -52,7 +52,8 @@ class Imgur(VideoExtractor):
             else:
                 # gallery image
                 content = get_content(self.url)
-                url = match1(content, r'(https?://i.imgur.com/[^"]+)')
+                url = match1(content, r'meta property="og:video"[^>]+(https?://i.imgur.com/[^"?]+)') or \
+                    match1(content, r'meta property="og:image"[^>]+(https?://i.imgur.com/[^"?]+)')
                 _, container, size = url_info(url)
                 self.streams = {
                     'original': {

diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py

@@ -10,51 +10,25 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
     vid = r1(r'instagram.com/\w+/([^/]+)', url)
     description = r1(r'<meta property="og:title" content="([^"]*)"', cont) or \
-        r1(r'<title>\s([^<]*)</title>', cont)  # with logged-in cookies
+        r1(r'<title>([^<]*)</title>', cont)  # with logged-in cookies
     title = "{} [{}]".format(description.replace("\n", " "), vid)
 
-    stream = r1(r'<meta property="og:video" content="([^"]*)"', cont)
-    if stream:
-        _, ext, size = url_info(stream)
-
-        print_info(site_info, title, ext, size)
-        if not info_only:
-            download_urls([stream], title, ext, size, output_dir, merge=merge)
-    else:
-        data = re.search(r'window\._sharedData\s*=\s*(.*);</script>', cont)
-        try:
-            info = json.loads(data.group(1))
-            post = info['entry_data']['PostPage'][0]
-            assert post
-        except:
-            # with logged-in cookies
-            data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);</script>', cont)
-            if data is not None:
-                log.e('[Warning] Cookies needed.')
-                post = json.loads(data.group(1))
-
-        if 'edge_sidecar_to_children' in post['graphql']['shortcode_media']:
-            edges = post['graphql']['shortcode_media']['edge_sidecar_to_children']['edges']
-            for edge in edges:
-                title = edge['node']['shortcode']
-                image_url = edge['node']['display_url']
-                if 'video_url' in edge['node']:
-                    image_url = edge['node']['video_url']
-                ext = image_url.split('?')[0].split('.')[-1]
-                size = int(get_head(image_url)['Content-Length'])
-
-                print_info(site_info, title, ext, size)
-                if not info_only:
-                    download_urls(urls=[image_url],
-                                  title=title,
-                                  ext=ext,
-                                  total_size=size,
-                                  output_dir=output_dir)
-        else:
-            title = post['graphql']['shortcode_media']['shortcode']
-            image_url = post['graphql']['shortcode_media']['display_url']
-            if 'video_url' in post['graphql']['shortcode_media']:
-                image_url = post['graphql']['shortcode_media']['video_url']
+    appId = r1(r'"appId":"(\d+)"', cont)
+    media_id = r1(r'"media_id":"(\d+)"', cont)
+
+    api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id
+    try:
+        api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}})
+    except:
+        log.wtf('[Error] Please specify a cookie file.')
+    post = json.loads(api_cont)
+
+    for item in post['items']:
+        code = item['code']
+        carousel_media = item.get('carousel_media') or [item]
+        for i, media in enumerate(carousel_media):
+            title = '%s [%s]' % (code, i)
+            image_url = media['image_versions2']['candidates'][0]['url']
             ext = image_url.split('?')[0].split('.')[-1]
             size = int(get_head(image_url)['Content-Length'])
@@ -66,6 +40,20 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
                               total_size=size,
                               output_dir=output_dir)
 
+            # download videos (if any)
+            if 'video_versions' in media:
+                video_url = media['video_versions'][0]['url']
+                ext = video_url.split('?')[0].split('.')[-1]
+                size = int(get_head(video_url)['Content-Length'])
+
+                print_info(site_info, title, ext, size)
+                if not info_only:
+                    download_urls(urls=[video_url],
+                                  title=title,
+                                  ext=ext,
+                                  total_size=size,
+                                  output_dir=output_dir)
+
 site_info = "Instagram.com"
 download = instagram_download
 download_playlist = playlist_not_supported('instagram')
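The rewritten loop treats every post as a carousel: multi-image posts carry a `carousel_media` list, single posts fall back to a one-item list of the post itself, and the media index is appended to the shortcode. A sketch with a fabricated API item:

```
item = {
    'code': 'CxYzAbC',  # fabricated shortcode
    'image_versions2': {'candidates': [{'url': 'https://i.example/a.jpg'}]},
}

carousel_media = item.get('carousel_media') or [item]
for i, media in enumerate(carousel_media):
    title = '%s [%s]' % (item['code'], i)
    print(title, media['image_versions2']['candidates'][0]['url'])
# CxYzAbC [0] https://i.example/a.jpg
```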

diff --git a/src/you_get/extractors/ixigua.py b/src/you_get/extractors/ixigua.py

@@ -18,121 +18,95 @@ headers = {
 }
 
 
-def int_overflow(val):
-    maxint = 2147483647
-    if not -maxint - 1 <= val <= maxint:
-        val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
-    return val
-
-
-def unsigned_right_shitf(n, i):
-    if n < 0:
-        n = ctypes.c_uint32(n).value
-    if i < 0:
-        return -int_overflow(n << abs(i))
-    return int_overflow(n >> i)
-
-
-def get_video_url_from_video_id(video_id):
-    """Splicing URLs according to video ID to get video details"""
-    # from js
-    data = [""] * 256
-    for index, _ in enumerate(data):
-        t = index
-        for i in range(8):
-            t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1)
-        data[index] = t
-
-    def tmp():
-        rand_num = random.random()
-        path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id,
-                                                                              random_num=str(rand_num)[2:])
-        e = o = r = -1
-        i, a = 0, len(path)
-        while i < a:
-            e = ord(path[i])
-            i += 1
-            if e < 128:
-                r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)]
-            else:
-                if e < 2048:
-                    r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))]
-                    r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
-                else:
-                    if 55296 <= e < 57344:
-                        e = (1023 & e) + 64
-                        i += 1
-                        o = 1023 & t.url(i)
-                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))]
-                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))]
-                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))]
-                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))]
-                    else:
-                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))]
-                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))]
-                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
-        return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0))
-
-    while 1:
-        url = tmp()
-        if url.split("=")[-1][0] != "-":  # parameter s must not be negative (参数s不能为负数)
-            return url
-
-
-def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+def ixigua_download(url, output_dir='.', merge=True, info_only=False, stream_id='', **kwargs):
     # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422
-    resp = urlopen_with_retry(request.Request(url))
+    headers['cookie'] = "MONITOR_WEB_ID=7892c49b-296e-4499-8704-e47c1b15123; " \
+                        "ixigua-a-s=1; ttcid=af99669b6304453480454f1507011d5c234; BD_REF=1; " \
+                        "__ac_nonce=060d88ff000a75e8d17eb; __ac_signature=_02B4Z6wo100f01kX9ZpgAAIDAKIBBQUIPYT5F2WIAAPG2ad; " \
+                        "ttwid=1%7CcIsVF_3vqSIk4XErhPB0H2VaTxT0tdsTMRbMjrJOPN8%7C1624806049%7C08ce7dd6f7d20506a41ba0a331ef96a6505d96731e6ad9f6c8c709f53f227ab1; "
+
+    resp = urlopen_with_retry(request.Request(url, headers=headers))
     html = resp.read().decode('utf-8')
 
     _cookies = []
     for c in resp.getheader('Set-Cookie').split("httponly,"):
         _cookies.append(c.strip().split(' ')[0])
-    headers['cookie'] = ' '.join(_cookies)
+    headers['cookie'] += ' '.join(_cookies)
 
-    conf = loads(match1(html, r"window\.config = (.+);"))
-    if not conf:
-        log.e("Get window.config from url failed, url: {}".format(url))
+    match_txt = match1(html, r"<script id=\"SSR_HYDRATED_DATA\">window._SSR_HYDRATED_DATA=(.*?)<\/script>")
+    if not match_txt:
+        log.e("Get video info from url failed, url: {}".format(url))
         return
 
-    verify_url = conf['prefix'] + conf['url'] + '?key=' + conf['key'] + '&psm=' + conf['psm'] \
-        + '&_signature=' + ''.join(random.sample(string.ascii_letters + string.digits, 31))
-    try:
-        ok = get_content(verify_url)
-    except Exception as e:
-        ok = e.msg
-    if ok != 'OK':
-        log.e("Verify failed, verify_url: {}, result: {}".format(verify_url, ok))
-        return
-    html = get_content(url, headers=headers)
-
-    video_id = match1(html, r"\"vid\":\"([^\"]+)")
-    title = match1(html, r"\"player__videoTitle\">.*?<h1.*?>(.*)<\/h1><\/div>")
-    if not video_id:
-        log.e("video_id not found, url:{}".format(url))
-        return
-    video_info_url = get_video_url_from_video_id(video_id)
-    video_info = loads(get_content(video_info_url))
-    if video_info.get("code", 1) != 0:
-        log.e("Get video info from {} error: server return code {}".format(video_info_url, video_info.get("code", 1)))
-        return
-    if not video_info.get("data", None):
-        log.e("Get video info from {} error: The server returns JSON value"
-              " without data or data is empty".format(video_info_url))
-        return
-    if not video_info["data"].get("video_list", None):
-        log.e("Get video info from {} error: The server returns JSON value"
-              " without data.video_list or data.video_list is empty".format(video_info_url))
-        return
-    if not video_info["data"]["video_list"].get("video_1", None):
-        log.e("Get video info from {} error: The server returns JSON value"
-              " without data.video_list.video_1 or data.video_list.video_1 is empty".format(video_info_url))
-        return
-    bestQualityVideo = list(video_info["data"]["video_list"].keys())[-1]  # There is not only video_1, there might be video_2
-    size = int(video_info["data"]["video_list"][bestQualityVideo]["size"])
-    print_info(site_info=site_info, title=title, type="mp4", size=size)  # this site only serves mp4 files (该网站只有mp4类型文件)
-    if not info_only:
-        video_url = base64.b64decode(video_info["data"]["video_list"][bestQualityVideo]["main_url"].encode("utf-8"))
-        download_urls([video_url.decode("utf-8")], title, "mp4", size, output_dir, merge=merge, headers=headers, **kwargs)
+    video_info = loads(match_txt.replace('":undefined', '":null'))
+    if not video_info:
+        log.e("video_info not found, url:{}".format(url))
+        return
+
+    title = video_info['anyVideo']['gidInformation']['packerData']['video']['title']
+    video_resource = video_info['anyVideo']['gidInformation']['packerData']['video']['videoResource']
+    if video_resource.get('dash', None):
+        video_list = video_resource['dash']
+    elif video_resource.get('dash_120fps', None):
+        video_list = video_resource['dash_120fps']
+    elif video_resource.get('normal', None):
+        video_list = video_resource['normal']
+    else:
+        log.e("video_list not found, url:{}".format(url))
+        return
+
+    streams = [
+        # {'file_id': 'fc1b9bf8e8e04a849d90a5172d3f6919', 'quality': "normal", 'size': 0,
+        #  'definition': '720p', 'video_url': '', 'audio_url': '', 'v_type': 'dash'},
+    ]
+    # prefer merging the watermark-free video and audio streams; fall back to the watermarked mp4
+    # (先用无水印的视频与音频合成,没有的话再直接用有水印的mp4)
+    if video_list.get('dynamic_video', None):
+        audio_url = base64.b64decode(
+            video_list['dynamic_video']['dynamic_audio_list'][0]['main_url'].encode("utf-8")).decode("utf-8")
+        dynamic_video_list = video_list['dynamic_video']['dynamic_video_list']
+        streams = convertStreams(dynamic_video_list, audio_url)
+    elif video_list.get('video_list', None):
+        dynamic_video_list = video_list['video_list']
+        streams = convertStreams(dynamic_video_list, "")
+
+    print("title:    %s" % title)
+    for stream in streams:
+        if stream_id != "" and stream_id != stream['definition']:
+            continue
+
+        print("    - format:    %s" % stream['definition'])
+        print("      size:      %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
+        print("      quality:   %s " % stream['quality'])
+        print("      v_type:    %s " % stream['v_type'])
+        # print("      video_url: %s " % stream['video_url'])
+        # print("      audio_url: %s " % stream['audio_url'])
+        print()
+
+        # unless we only want the info, download the first matching stream
+        # (不是只看信息的话,就下载第一个)
+        if not info_only:
+            urls = [stream['video_url']]
+            if stream['audio_url'] != "":
+                urls.append(stream['audio_url'])
+                kwargs['av'] = 'av'  # this merges the audio and video streams (这将会合并音视频)
+            download_urls(urls, title, "mp4", stream['size'], output_dir, merge=merge, headers=headers,
+                          **kwargs)
+            return
+
+
+def convertStreams(video_list, audio_url):
+    streams = []
+    for dynamic_video in video_list:
+        streams.append({
+            'file_id': dynamic_video['file_hash'],
+            'quality': dynamic_video['quality'],
+            'size': dynamic_video['size'],
+            'definition': dynamic_video['definition'],
+            'video_url': base64.b64decode(dynamic_video['main_url'].encode("utf-8")).decode("utf-8"),
+            'audio_url': audio_url,
+            'v_type': dynamic_video['vtype'],
+        })
+
+    return streams
 
 
 def ixigua_download_playlist_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):

diff --git a/src/you_get/extractors/qq.py b/src/you_get/extractors/qq.py

@@ -35,6 +35,7 @@ def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False):
 
     part_urls= []
     total_size = 0
+    ext = None
     for part in range(1, seg_cnt+1):
         if fc_cnt == 0:
             # fix json parsing error

diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py

@@ -5,42 +5,38 @@ __all__ = ['tiktok_download']
 
 from ..common import *
 
 
 def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    referUrl = url.split('?')[0]
-    headers = fake_headers
-
-    # trick or treat
-    html = get_content(url, headers=headers)
-    data = r1(r'<script id="__NEXT_DATA__".*?>(.*?)</script>', html)
-    info = json.loads(data)
-    wid = info['props']['initialProps']['$wid']
-    cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid)
-
-    # here's the cookie
-    headers['Cookie'] = cookie
-
-    # try again
-    html = get_content(url, headers=headers)
-    data = r1(r'<script id="__NEXT_DATA__".*?>(.*?)</script>', html)
-    info = json.loads(data)
-    wid = info['props']['initialProps']['$wid']
-    cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid)
-
-    videoData = info['props']['pageProps']['itemInfo']['itemStruct']
-    videoId = videoData['id']
-    videoUrl = videoData['video']['downloadAddr']
-    uniqueId = videoData['author'].get('uniqueId')
-    nickName = videoData['author'].get('nickname')
-    title = '%s [%s]' % (nickName or uniqueId, videoId)
-
-    # we also need the referer
-    headers['Referer'] = referUrl
-
-    mime, ext, size = url_info(videoUrl, headers=headers)
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
+        'Accept-Encoding': 'gzip, deflate',
+        'Accept': '*/*',
+        'Connection': 'keep-alive'  # important
+    }
+
+    m = re.match('(https?://)?([^/]+)(/.*)', url)
+    host = m.group(2)
+    if host != 'www.tiktok.com':  # non-canonical URL
+        url = get_location(url, headers=headers)
+        m = re.match('(https?://)?([^/]+)(/.*)', url)
+        host = m.group(2)
+    url = m.group(3).split('?')[0]
+    vid = url.split('/')[3]  # should be a string of numbers
+
+    html = getHttps(host, url, headers=headers)
+    data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) or \
+        r1(r'<script id="SIGI_STATE" type="application/json">(.*?)</script>', html)
+    info = json.loads(data)
+    downloadAddr = info['ItemModule'][vid]['video']['downloadAddr']
+    author = info['ItemModule'][vid]['author']  # same as uniqueId
+    nickname = info['UserModule']['users'][author]['nickname']
+    title = '%s [%s]' % (nickname or author, vid)
+
+    mime, ext, size = url_info(downloadAddr, headers=headers)
     print_info(site_info, title, mime, size)
     if not info_only:
-        download_urls([videoUrl], title, ext, size, output_dir=output_dir, merge=merge, headers=headers)
+        download_urls([downloadAddr], title, ext, size, output_dir=output_dir, merge=merge, headers=headers)
 
 
 site_info = "TikTok.com"
 download = tiktok_download

diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py

@@ -51,7 +51,12 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
     api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token})
 
     info = json.loads(api_content)
-    if 'extended_entities' in info['globalObjects']['tweets'][item_id]:
+    if item_id not in info['globalObjects']['tweets']:
+        # something wrong here
+        log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None)
+        return
+
+    elif 'extended_entities' in info['globalObjects']['tweets'][item_id]:
         # if the tweet contains media, download them
         media = info['globalObjects']['tweets'][item_id]['extended_entities']['media']

diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py

@@ -76,7 +76,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
             urls = []
             for i in media_exts:
-                urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ ;&"\'\\<>]*)', page)
+                urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ =?;&"\'\\<>]*)', page)
 
                 p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page)
                 urls += [parse.unquote(url) for url in p_urls]

diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py

@@ -78,6 +78,8 @@ class YouTube(VideoExtractor):
         # - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js
         # - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js
         # - https://www.youtube.com/s/player/50e823fc/player_ias.vflset/sv_SE/base.js
+        # - https://www.youtube.com/s/player/3b5d5649/player_ias.vflset/sv_SE/base.js
+        # - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js
         def tr_js(code):
             code = re.sub(r'function', r'def', code)
             # add prefix '_sig_' to prevent namespace pollution
@@ -113,12 +115,10 @@ class YouTube(VideoExtractor):
             else:
                 f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js)
                 f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2))
-            f2 = re.sub(r'(as|if|in|is|or)', r'_\1', f2)
-            f2 = re.sub(r'\$', '_dollar', f2)
+            f2 = re.sub(r'\$', '_dollar', f2)  # replace dollar sign
             code = code + 'global _sig_%s\n' % f2 + tr_js(f2def)
 
-        f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1)
-        f1 = re.sub(r'\$', '_dollar', f1)
+        f1 = re.sub(r'\$', '_dollar', f1)  # replace dollar sign
         code = code + '_sig=_sig_%s(s)' % f1
         exec(code, globals(), locals())
         return locals()['_sig']
@@ -141,6 +141,7 @@ class YouTube(VideoExtractor):
         """
         return match1(url, r'youtu\.be/([^?/]+)') or \
             match1(url, r'youtube\.com/embed/([^/?]+)') or \
+            match1(url, r'youtube\.com/shorts/([^/?]+)') or \
             match1(url, r'youtube\.com/v/([^/?]+)') or \
             match1(url, r'youtube\.com/watch/([^/?]+)') or \
             parse_query_param(url, 'v') or \
@@ -233,7 +234,10 @@ class YouTube(VideoExtractor):
         except:
             # ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}}
-            ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1))
+            try:  # FIXME: we should extract ytInitialPlayerResponse more reliably
+                ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});</script>', video_page).group(1))
+            except:
+                ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1))
 
             stream_list = ytInitialPlayerResponse['streamingData']['formats']
             #stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats']
@@ -258,7 +262,10 @@ class YouTube(VideoExtractor):
             # Parse video page instead
             video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid)
-            ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1))
+            try:  # FIXME: we should extract ytInitialPlayerResponse more reliably
+                ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});</script>', video_page).group(1))
+            except:
+                ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1))
 
             self.title = ytInitialPlayerResponse["videoDetails"]["title"]
             if re.search('([^"]*/base\.js)"', video_page):

diff --git a/src/you_get/extractors/zhihu.py b/src/you_get/extractors/zhihu.py

@@ -31,8 +31,8 @@ def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
 
     play_list = video_info["playlist"]
     # first High Definition
-    # second Second Standard Definition
-    # third ld. What is ld ?
+    # second Standard Definition
+    # third Low Definition
     # finally continue
     data = play_list.get("hd", play_list.get("sd", play_list.get("ld", None)))
     if not data:

diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py

@@ -93,7 +93,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'):
     # Use concat demuxer on FFmpeg >= 1.1
     if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)):
         concat_list = generate_concat_list(files, output)
-        params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1',
+        params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0',
                   '-i', concat_list, '-c', 'copy']
         params.extend(['--', output])
         if subprocess.call(params, stdin=STDIN) == 0:
@@ -149,7 +149,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'):
     # Use concat demuxer on FFmpeg >= 1.1
     if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)):
         concat_list = generate_concat_list(files, output)
-        params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1',
+        params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0',
                   '-i', concat_list, '-c', 'copy',
                   '-bsf:a', 'aac_adtstoasc']
         params.extend(['--', output])
@@ -203,7 +203,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'):
     # Use concat demuxer on FFmpeg >= 1.1
     if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)):
         concat_list = generate_concat_list(files, output)
-        params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1',
+        params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0',
                   '-i', concat_list, '-c', 'copy',
                   '-bsf:a', 'aac_adtstoasc']
         params.extend(['--', output])
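All three concat invocations switch `-safe` from `-1` to `0`: per FFmpeg's documentation, `-safe 0` accepts any file name in the concat list, while a negative value relies on older behavior that some newer FFmpeg builds appear to reject. A sketch of the resulting command line (file names are illustrative):

```
FFMPEG, LOGLEVEL = 'ffmpeg', ['-loglevel', 'quiet']
concat_list, output = 'output.mp4.txt', 'output.mp4'   # illustrative names

params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0',
          '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc']
params.extend(['--', output])
print(' '.join(params))
# ffmpeg -loglevel quiet -y -f concat -safe 0 -i output.mp4.txt -c copy -bsf:a aac_adtstoasc -- output.mp4
```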

diff --git a/src/you_get/version.py b/src/you_get/version.py

@@ -1,4 +1,4 @@
 #!/usr/bin/env python
 
 script_name = 'you-get'
-__version__ = '0.4.1545'
+__version__ = '0.4.1620'

diff --git a/tests/test.py b/tests/test.py

@@ -10,7 +10,8 @@ from you_get.extractors import (
     acfun,
     bilibili,
     soundcloud,
-    tiktok
+    tiktok,
+    twitter
 )
 
@@ -28,11 +29,11 @@ class YouGetTests(unittest.TestCase):
         youtube.download(
             'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True
         )
-        youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True)
-        youtube.download(
-            'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare',  # noqa
-            info_only=True
-        )
+        #youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True)
+        #youtube.download(
+        #    'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare',  # noqa
+        #    info_only=True
+        #)
         #youtube.download(
         #    'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True
         #)
@@ -40,6 +41,9 @@ class YouGetTests(unittest.TestCase):
     def test_acfun(self):
         acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True)
 
+    #def test_bilibili(self):
+    #    bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True)
+
     #def test_soundcloud(self):
     ## single song
     #soundcloud.download(
@@ -50,10 +54,12 @@ class YouGetTests(unittest.TestCase):
     #    'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True
     #)
 
-    #def tests_tiktok(self):
-    #    tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True)
-    #    tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True)
-    #    tiktok.download('https://vt.tiktok.com/UGJR4R/', info_only=True)
+    def test_tiktok(self):
+        tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True)
+        tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True)
+
+    def test_twitter(self):
+        twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True)
 
 
 if __name__ == '__main__':

diff --git a/you-get.json b/you-get.json

@@ -18,13 +18,10 @@
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.2",
-    "Programming Language :: Python :: 3.3",
-    "Programming Language :: Python :: 3.4",
-    "Programming Language :: Python :: 3.5",
-    "Programming Language :: Python :: 3.6",
     "Programming Language :: Python :: 3.7",
     "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
     "Topic :: Internet",
     "Topic :: Internet :: WWW/HTTP",
     "Topic :: Multimedia",