Merge branch 'soimort:develop' into develop

Commit f894232ba3, authored by gitreposk on 2022-09-06 16:46:36 +08:00, committed by GitHub.
21 changed files with 361 additions and 265 deletions

View File

@@ -14,8 +14,9 @@ jobs:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.5, 3.6, 3.7, 3.8, 3.9, pypy3]
python-version: [3.7, 3.8, 3.9, '3.10', pypy-3.8, pypy-3.9]
steps:
- uses: actions/checkout@v2

View File

@@ -43,5 +43,7 @@ install:
$(SETUP) install --user --prefix=
release:
zenity --question
$(SETUP) sdist bdist_wheel upload --sign
#zenity --question
$(SETUP) sdist bdist_wheel
echo 'Upload new version to PyPI using:'
echo ' twine upload --sign dist/you-get-VERSION.tar.gz dist/you_get-VERSION-py3-none-any.whl'

View File

@@ -4,7 +4,9 @@
[![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/)
[![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
**NOTICE: Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.**
**NOTICE (30 May 2022): Support for Python 3.5, 3.6 and 3.7 will eventually be dropped. ([see details here](https://github.com/soimort/you-get/wiki/TLS-1.3-post-handshake-authentication-(PHA)))**
**NOTICE (8 Mar 2019): Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.**
---
@@ -53,9 +55,9 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim
### Prerequisites
The following dependencies are necessary:
The following dependencies are recommended:
* **[Python](https://www.python.org/downloads/)** 3.2 or above
* **[Python](https://www.python.org/downloads/)** 3.7.4 or above
* **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above
* (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/)
@@ -89,6 +91,14 @@ $ python3 setup.py install --user
to install `you-get` to a permanent path.
You can also use [pipenv](https://pipenv.pypa.io/en/latest) to install `you-get` in a Python virtual environment.
```
$ pipenv install -e .
$ pipenv run you-get --version
you-get: version 0.4.1555, a tiny downloader that scrapes the web.
```
### Option 4: Git clone
This is the recommended way for all developers, even if you don't often code in Python.

View File

@@ -52,7 +52,7 @@ source <https://github.com/soimort/you-get>`__ and fork it!
.. |PyPI version| image:: https://badge.fury.io/py/you-get.png
:target: http://badge.fury.io/py/you-get
.. |Build Status| image:: https://api.travis-ci.org/soimort/you-get.png
:target: https://travis-ci.org/soimort/you-get
.. |Build Status| image:: https://github.com/soimort/you-get/workflows/develop/badge.svg
:target: https://github.com/soimort/you-get/actions
.. |Gitter| image:: https://badges.gitter.im/Join%20Chat.svg
:target: https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge

View File

@@ -136,6 +136,8 @@ cookies = None
output_filename = None
auto_rename = False
insecure = False
m3u8 = False
postfix = False
fake_headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa
@@ -340,10 +342,34 @@ def undeflate(data):
return decompressobj.decompress(data)+decompressobj.flush()
# an http.client implementation of get_content()
# because urllib does not support "Connection: keep-alive"
def getHttps(host, url, headers, gzip=True, deflate=False, debuglevel=0):
import http.client
conn = http.client.HTTPSConnection(host)
conn.set_debuglevel(debuglevel)
conn.request("GET", url, headers=headers)
resp = conn.getresponse()
data = resp.read()
if gzip:
data = ungzip(data)
if deflate:
data = undeflate(data)
return str(data, encoding='utf-8')
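For reference, a minimal usage sketch of the new helper; the host and path here are placeholders, not from this commit:
```
# hedged sketch: fetch a page over HTTPS with a keep-alive connection
# (host/path are made up; gzip=False because no compression is requested)
headers = {'User-Agent': 'Mozilla/5.0', 'Connection': 'keep-alive'}
html = getHttps('www.example.com', '/index.html', headers=headers, gzip=False)
print(html[:80])
```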
# DEPRECATED in favor of get_content()
def get_response(url, faker=False):
logging.debug('get_response: %s' % url)
ctx = None
if insecure:
# ignore ssl errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# install cookies
if cookies:
opener = request.build_opener(request.HTTPCookieProcessor(cookies))
@@ -351,10 +377,10 @@ def get_response(url, faker=False):
if faker:
response = request.urlopen(
request.Request(url, headers=fake_headers), None
request.Request(url, headers=fake_headers), None, context=ctx,
)
else:
response = request.urlopen(url)
response = request.urlopen(url, context=ctx)
data = response.read()
if response.info().get('Content-Encoding') == 'gzip':
@@ -983,6 +1009,8 @@ def download_urls(
pass
title = tr(get_filename(title))
if postfix and 'vid' in kwargs:
title = "%s [%s]" % (title, kwargs['vid'])
output_filename = get_output_filename(urls, title, ext, output_dir, merge)
output_filepath = os.path.join(output_dir, output_filename)
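The `--postfix` effect on a title is easiest to see with concrete values (both are hypothetical):
```
# hypothetical title/vid pair, showing the postfixed filename stem
title, vid = 'Some Video', 'BV1xx411c7xx'
print("%s [%s]" % (title, vid))  # Some Video [BV1xx411c7xx]
```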
@@ -1339,7 +1367,13 @@ def download_main(download, download_playlist, urls, playlist, **kwargs):
if re.match(r'https?://', url) is None:
url = 'http://' + url
if playlist:
if m3u8:
if output_filename:
title = output_filename
else:
title = "m3u8file"
download_url_ffmpeg(url=url, title=title, ext='mp4', output_dir='.')
elif playlist:
download_playlist(url, **kwargs)
else:
download(url, **kwargs)
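In effect, the new `--m3u8` branch hands the raw URL straight to FFmpeg; a hedged sketch of the equivalent direct call (the URL is a placeholder):
```
# what `you-get -m URL` boils down to (placeholder URL)
download_url_ffmpeg(url='https://example.com/stream/index.m3u8',
                    title='m3u8file', ext='mp4', output_dir='.')
```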
@@ -1443,7 +1477,6 @@ def set_socks_proxy(proxy):
proxy_info = proxy.split("@")
socks_proxy_addrs = proxy_info[1].split(':')
socks_proxy_auth = proxy_info[0].split(":")
print(socks_proxy_auth[0]+" "+socks_proxy_auth[1]+" "+socks_proxy_addrs[0]+" "+socks_proxy_addrs[1])
socks.set_default_proxy(
socks.SOCKS5,
socks_proxy_addrs[0],
@@ -1454,7 +1487,6 @@ def set_socks_proxy(proxy):
)
else:
socks_proxy_addrs = proxy.split(':')
print(socks_proxy_addrs[0]+" "+socks_proxy_addrs[1])
socks.set_default_proxy(
socks.SOCKS5,
socks_proxy_addrs[0],
@@ -1527,6 +1559,10 @@ def script_main(download, download_playlist, **kwargs):
'--no-caption', action='store_true',
help='Do not download captions (subtitles, lyrics, danmaku, ...)'
)
download_grp.add_argument(
'--postfix', action='store_true', default=False,
help='Postfix downloaded files with unique identifiers'
)
download_grp.add_argument(
'-f', '--force', action='store_true', default=False,
help='Force overwriting existing files'
@@ -1619,6 +1655,10 @@ def script_main(download, download_playlist, **kwargs):
download_grp.add_argument('--stream', help=argparse.SUPPRESS)
download_grp.add_argument('--itag', help=argparse.SUPPRESS)
download_grp.add_argument('-m', '--m3u8', action='store_true', default=False,
help='Download video using an m3u8 URL')
parser.add_argument('URL', nargs='*', help=argparse.SUPPRESS)
args = parser.parse_args()
@@ -1644,6 +1684,8 @@ def script_main(download, download_playlist, **kwargs):
global output_filename
global auto_rename
global insecure
global m3u8
global postfix
output_filename = args.output_filename
extractor_proxy = args.extractor_proxy
@@ -1665,6 +1707,9 @@ def script_main(download, download_playlist, **kwargs):
if args.cookies:
load_cookies(args.cookies)
if args.m3u8:
m3u8 = True
caption = True
stream_id = args.format or args.stream or args.itag
if args.no_caption:
@@ -1677,6 +1722,7 @@ def script_main(download, download_playlist, **kwargs):
# ignore ssl
insecure = True
postfix = args.postfix
if args.no_proxy:
set_http_proxy('')
@@ -1763,20 +1809,10 @@ def google_search(url):
url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
page = get_content(url, headers=fake_headers)
videos = re.findall(
r'<a href="(https?://[^"]+)" onmousedown="[^"]+"><h3 class="[^"]*">([^<]+)<', page
r'(https://www\.youtube\.com/watch\?v=[\w-]+)', page
)
vdurs = re.findall(r'<span class="vdur[^"]*">([^<]+)<', page)
durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
print('Google Videos search:')
for v in zip(videos, durs):
print('- video: {} [{}]'.format(
unescape_html(v[0][1]),
v[1] if v[1] else '?'
))
print('# you-get %s' % log.sprint(v[0][0], log.UNDERLINE))
print()
print('Best matched result:')
return(videos[0][0])
return(videos[0])
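The simplified google_search() now just returns the first YouTube watch URL found in the result page, e.g. (the page fragment is made up):
```
import re
# made-up fragment standing in for Google's result HTML
page = '... <a href="https://www.youtube.com/watch?v=abc-DEF_123"> ...'
videos = re.findall(r'(https://www\.youtube\.com/watch\?v=[\w-]+)', page)
print(videos[0])  # https://www.youtube.com/watch?v=abc-DEF_123
```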
def url_to_module(url):

View File

@@ -238,7 +238,8 @@ class VideoExtractor():
download_urls(urls, self.title, ext, total_size, headers=headers,
output_dir=kwargs['output_dir'],
merge=kwargs['merge'],
av=stream_id in self.dash_streams)
av=stream_id in self.dash_streams,
vid=self.vid)
if 'caption' not in kwargs or not kwargs['caption']:
print('Skipping captions or danmaku.')

View File

@@ -12,8 +12,12 @@ class Bilibili(VideoExtractor):
# Bilibili media encoding options, in descending quality order.
stream_types = [
{'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'},
{'id': 'hdflv2_8k', 'quality': 127, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'},
{'id': 'hdflv2_dolby', 'quality': 126, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '3840p', 'desc': '杜比视界'},
{'id': 'hdflv2_hdr', 'quality': 125, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '2160p', 'desc': '真彩 HDR'},
{'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'},
{'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280,
@@ -112,12 +116,16 @@ class Bilibili(VideoExtractor):
def bilibili_space_channel_api(mid, cid, pn=1, ps=100):
return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps)
@staticmethod
def bilibili_series_archives_api(mid, sid, pn=1, ps=100):
return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps)
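For reference, the new series-archives endpoint expands as follows (the mid/sid values are hypothetical):
```
# hypothetical IDs, showing the URL the new helper builds
Bilibili.bilibili_series_archives_api('123', '456')
# -> 'https://api.bilibili.com/x/series/archives?mid=123&series_id=456&pn=1&ps=100&only_normal=true&sort=asc&jsonp=jsonp'
```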
@staticmethod
def bilibili_space_favlist_api(fid, pn=1, ps=20):
return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps)
@staticmethod
def bilibili_space_video_api(mid, pn=1, ps=100):
def bilibili_space_video_api(mid, pn=1, ps=50):
return "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%s&ps=%s&tid=0&keyword=&order=pubdate&jsonp=jsonp" % (mid, pn, ps)
@staticmethod
@@ -137,6 +145,8 @@ class Bilibili(VideoExtractor):
def prepare(self, **kwargs):
self.stream_qualities = {s['quality']: s for s in self.stream_types}
self.streams.clear()
self.dash_streams.clear()
try:
html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url))
@@ -167,6 +177,11 @@ class Bilibili(VideoExtractor):
self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)')
html_content = get_content(self.url, headers=self.bilibili_headers())
# redirect: festival
elif re.match(r'https?://(www\.)?bilibili\.com/festival/(.+)', self.url):
self.url = 'https://www.bilibili.com/video/%s' % match1(self.url, r'bvid=([^&]+)')
html_content = get_content(self.url, headers=self.bilibili_headers())
# sort it out
if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url):
sort = 'audio'
@@ -178,7 +193,7 @@ class Bilibili(VideoExtractor):
sort = 'live'
elif re.match(r'https?://vc\.bilibili\.com/video/(\d+)', self.url):
sort = 'vc'
elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(BV(\S+)))', self.url):
elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(bv(\S+))|(BV(\S+)))', self.url):
sort = 'video'
elif re.match(r'https?://h\.?bilibili\.com/(\d+)', self.url):
sort = 'h'
@@ -193,28 +208,43 @@ class Bilibili(VideoExtractor):
playinfo_text = match1(html_content, r'__playinfo__=(.*?)</script><script>') # FIXME
playinfo = json.loads(playinfo_text) if playinfo_text else None
playinfo = playinfo if playinfo and playinfo.get('code') == 0 else None
html_content_ = get_content(self.url, headers=self.bilibili_headers(cookie='CURRENT_FNVAL=16'))
playinfo_text_ = match1(html_content_, r'__playinfo__=(.*?)</script><script>') # FIXME
playinfo_ = json.loads(playinfo_text_) if playinfo_text_ else None
playinfo_ = playinfo_ if playinfo_ and playinfo_.get('code') == 0 else None
# warn if it is a multi-part video
pn = initial_state['videoData']['videos']
if pn > 1 and not kwargs.get('playlist'):
log.w('This is a multipart video. (use --playlist to download all parts.)')
if 'videoData' in initial_state:
# (standard video)
# set video title
self.title = initial_state['videoData']['title']
# refine title for a specific part, if it is a multi-part video
p = int(match1(self.url, r'[\?&]p=(\d+)') or match1(self.url, r'/index_(\d+)') or
'1') # use URL to decide p-number, not initial_state['p']
if pn > 1:
part = initial_state['videoData']['pages'][p - 1]['part']
self.title = '%s (P%s. %s)' % (self.title, p, part)
# warn if it is a multi-part video
pn = initial_state['videoData']['videos']
if pn > 1 and not kwargs.get('playlist'):
log.w('This is a multipart video. (use --playlist to download all parts.)')
# set video title
self.title = initial_state['videoData']['title']
# refine title for a specific part, if it is a multi-part video
p = int(match1(self.url, r'[\?&]p=(\d+)') or match1(self.url, r'/index_(\d+)') or
'1') # use URL to decide p-number, not initial_state['p']
if pn > 1:
part = initial_state['videoData']['pages'][p - 1]['part']
self.title = '%s (P%s. %s)' % (self.title, p, part)
# construct playinfos
avid = initial_state['aid']
cid = initial_state['videoData']['pages'][p - 1]['cid'] # use p-number, not initial_state['videoData']['cid']
else:
# (festival video)
# set video title
self.title = initial_state['videoInfo']['title']
# construct playinfos
avid = initial_state['videoInfo']['aid']
cid = initial_state['videoInfo']['cid']
# construct playinfos
avid = initial_state['aid']
cid = initial_state['videoData']['pages'][p - 1]['cid'] # use p-number, not initial_state['videoData']['cid']
current_quality, best_quality = None, None
if playinfo is not None:
current_quality = playinfo['data']['quality'] or None # 0 indicates an error, fallback to None
@@ -592,10 +622,12 @@ class Bilibili(VideoExtractor):
elif re.match(r'https?://(www\.)?bilibili\.com/bangumi/media/md(\d+)', self.url) or \
re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)', self.url):
sort = 'bangumi_md'
elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|BV(\S+))', self.url):
elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|bv(\S+)|BV(\S+))', self.url):
sort = 'video'
elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/detail\?.*cid=(\d+)', self.url):
sort = 'space_channel'
elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url):
sort = 'space_channel_series'
elif re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url):
sort = 'space_favlist'
elif re.match(r'https?://space\.?bilibili\.com/(\d+)/video', self.url):
@@ -706,6 +738,20 @@ class Bilibili(VideoExtractor):
url = 'https://www.bilibili.com/video/av%s' % video['aid']
self.__class__().download_playlist_by_url(url, **kwargs)
elif sort == 'space_channel_series':
m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url)
mid, sid = m.group(1), m.group(2)
api_url = self.bilibili_series_archives_api(mid, sid)
api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url))
archives_info = json.loads(api_content)
# TBD: channel of more than 100 videos
epn, i = len(archives_info['data']['archives']), 0
for video in archives_info['data']['archives']:
i += 1; log.w('Extracting %s of %s videos ...' % (i, epn))
url = 'https://www.bilibili.com/video/av%s' % video['aid']
self.__class__().download_playlist_by_url(url, **kwargs)
elif sort == 'space_favlist':
m = re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url)
vmid, fid = m.group(1), m.group(2)

View File

@@ -1,8 +1,6 @@
# coding=utf-8
import re
import json
from urllib.parse import unquote
from ..common import (
url_size,
@@ -11,25 +9,52 @@ from ..common import (
fake_headers,
download_urls,
playlist_not_supported,
match1,
get_location,
)
__all__ = ['douyin_download_by_url']
def get_value(source: dict, path):
try:
value = source
for key in path:
if type(key) is str:
if key in value.keys():
value = value[key]
else:
value = None
break
elif type(key) is int:
if len(value) != 0:
value = value[key]
else:
value = None
break
except:
value = None
return value
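get_value() is a null-safe nested lookup: string keys index dicts, integer keys index lists, and any miss returns None instead of raising. A quick sketch (the data is made up):
```
# made-up structure mirroring the iteminfo response shape
video_info = {'item_list': [{'video': {'vid': 'v0200fabc'}, 'desc': 'some title'}]}
get_value(video_info, ['item_list', 0, 'video', 'vid'])  # -> 'v0200fabc'
get_value(video_info, ['item_list', 0, 'missing_key'])   # -> None
```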
def douyin_download_by_url(url, **kwargs):
# if short link, get the real url
if 'v.douyin.com' in url:
url = get_location(url)
aweme_id = match1(url, r'/(\d+)/?')
# get video info
video_info_api = 'https://www.douyin.com/web/api/v2/aweme/iteminfo/?item_ids={}'
url = video_info_api.format(aweme_id)
page_content = get_content(url, headers=fake_headers)
# The video player and video source are rendered client-side; the data
# is contained, URL-quoted, in a <script id="RENDER_DATA" type="application/json"> tag.
# Unquote the whole page content, then search it with plain-string regexes.
page_content = unquote(page_content)
title = re.findall(r'"desc":"([^"]*)"', page_content)[0].strip()
video_info = json.loads(page_content)
# get video id and title
video_id = get_value(video_info, ['item_list', 0, 'video', 'vid'])
title = get_value(video_info, ['item_list', 0, 'desc'])
# get video play url
video_url = "https://aweme.snssdk.com/aweme/v1/playwm/?ratio=720p&line=0&video_id={}".format(video_id)
video_format = 'mp4'
# video URLs are in this pattern {"src":"THE_URL"}, in json format
urls_pattern = r'"playAddr":(\[.*?\])'
urls = json.loads(re.findall(urls_pattern, page_content)[0])
video_url = 'https:' + urls[0]['src']
size = url_size(video_url, faker=True)
print_info(
site_info='douyin.com', title=title,

View File

@@ -52,7 +52,8 @@ class Imgur(VideoExtractor):
else:
# gallery image
content = get_content(self.url)
url = match1(content, r'(https?://i.imgur.com/[^"]+)')
url = match1(content, r'meta property="og:video"[^>]+(https?://i.imgur.com/[^"?]+)') or \
match1(content, r'meta property="og:image"[^>]+(https?://i.imgur.com/[^"?]+)')
_, container, size = url_info(url)
self.streams = {
'original': {
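The tightened pattern prefers the og:video meta tag, falls back to og:image, and stops before any query string; a quick check against a made-up snippet:
```
import re
# made-up HTML fragment; real pages carry more attributes before content=
content = '<meta property="og:video" content="https://i.imgur.com/AbCdEfG.mp4?play">'
m = re.search(r'meta property="og:video"[^>]+(https?://i.imgur.com/[^"?]+)', content)
print(m.group(1))  # https://i.imgur.com/AbCdEfG.mp4
```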

View File

@@ -10,51 +10,25 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
vid = r1(r'instagram.com/\w+/([^/]+)', url)
description = r1(r'<meta property="og:title" content="([^"]*)"', cont) or \
r1(r'<title>\s([^<]*)</title>', cont) # with logged-in cookies
r1(r'<title>([^<]*)</title>', cont) # with logged-in cookies
title = "{} [{}]".format(description.replace("\n", " "), vid)
stream = r1(r'<meta property="og:video" content="([^"]*)"', cont)
if stream:
_, ext, size = url_info(stream)
appId = r1(r'"appId":"(\d+)"', cont)
media_id = r1(r'"media_id":"(\d+)"', cont)
print_info(site_info, title, ext, size)
if not info_only:
download_urls([stream], title, ext, size, output_dir, merge=merge)
else:
data = re.search(r'window\._sharedData\s*=\s*(.*);</script>', cont)
try:
info = json.loads(data.group(1))
post = info['entry_data']['PostPage'][0]
assert post
except:
# with logged-in cookies
data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);</script>', cont)
if data is not None:
log.e('[Warning] Cookies needed.')
post = json.loads(data.group(1))
api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id
try:
api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}})
except:
log.wtf('[Error] Please specify a cookie file.')
post = json.loads(api_cont)
if 'edge_sidecar_to_children' in post['graphql']['shortcode_media']:
edges = post['graphql']['shortcode_media']['edge_sidecar_to_children']['edges']
for edge in edges:
title = edge['node']['shortcode']
image_url = edge['node']['display_url']
if 'video_url' in edge['node']:
image_url = edge['node']['video_url']
ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])
print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls=[image_url],
title=title,
ext=ext,
total_size=size,
output_dir=output_dir)
else:
title = post['graphql']['shortcode_media']['shortcode']
image_url = post['graphql']['shortcode_media']['display_url']
if 'video_url' in post['graphql']['shortcode_media']:
image_url = post['graphql']['shortcode_media']['video_url']
for item in post['items']:
code = item['code']
carousel_media = item.get('carousel_media') or [item]
for i, media in enumerate(carousel_media):
title = '%s [%s]' % (code, i)
image_url = media['image_versions2']['candidates'][0]['url']
ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])
@@ -66,6 +40,20 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
total_size=size,
output_dir=output_dir)
# download videos (if any)
if 'video_versions' in media:
video_url = media['video_versions'][0]['url']
ext = video_url.split('?')[0].split('.')[-1]
size = int(get_head(video_url)['Content-Length'])
print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls=[video_url],
title=title,
ext=ext,
total_size=size,
output_dir=output_dir)
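The rewrite keys everything off the i.instagram.com media-info API; its one-element-carousel fallback is the core trick. A hedged illustration with made-up items:
```
# made-up items: a two-image carousel post and a single-media post
item_a = {'code': 'Cxyz', 'carousel_media': [{'pk': 1}, {'pk': 2}]}
item_b = {'code': 'Cabc'}
for item in (item_a, item_b):
    media_list = item.get('carousel_media') or [item]
    print(item['code'], len(media_list))  # Cxyz 2, then Cabc 1
```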
site_info = "Instagram.com"
download = instagram_download
download_playlist = playlist_not_supported('instagram')

View File

@@ -18,121 +18,95 @@ headers = {
}
def int_overflow(val):
maxint = 2147483647
if not -maxint - 1 <= val <= maxint:
val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
return val
def unsigned_right_shitf(n, i):
if n < 0:
n = ctypes.c_uint32(n).value
if i < 0:
return -int_overflow(n << abs(i))
return int_overflow(n >> i)
def get_video_url_from_video_id(video_id):
"""Splicing URLs according to video ID to get video details"""
# from js
data = [""] * 256
for index, _ in enumerate(data):
t = index
for i in range(8):
t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1)
data[index] = t
def tmp():
rand_num = random.random()
path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id,
random_num=str(rand_num)[2:])
e = o = r = -1
i, a = 0, len(path)
while i < a:
e = ord(path[i])
i += 1
if e < 128:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)]
else:
if e < 2048:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
else:
if 55296 <= e < 57344:
e = (1023 & e) + 64
i += 1
o = 1023 & t.url(i)
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))]
else:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0))
while 1:
url = tmp()
if url.split("=")[-1][0] != "-": # 参数s不能为负数
return url
def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
def ixigua_download(url, output_dir='.', merge=True, info_only=False, stream_id='', **kwargs):
# example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422
resp = urlopen_with_retry(request.Request(url))
headers['cookie'] = "MONITOR_WEB_ID=7892c49b-296e-4499-8704-e47c1b15123; " \
"ixigua-a-s=1; ttcid=af99669b6304453480454f1507011d5c234; BD_REF=1; " \
"__ac_nonce=060d88ff000a75e8d17eb; __ac_signature=_02B4Z6wo100f01kX9ZpgAAIDAKIBBQUIPYT5F2WIAAPG2ad; " \
"ttwid=1%7CcIsVF_3vqSIk4XErhPB0H2VaTxT0tdsTMRbMjrJOPN8%7C1624806049%7C08ce7dd6f7d20506a41ba0a331ef96a6505d96731e6ad9f6c8c709f53f227ab1; "
resp = urlopen_with_retry(request.Request(url, headers=headers))
html = resp.read().decode('utf-8')
_cookies = []
for c in resp.getheader('Set-Cookie').split("httponly,"):
_cookies.append(c.strip().split(' ')[0])
headers['cookie'] = ' '.join(_cookies)
headers['cookie'] += ' '.join(_cookies)
conf = loads(match1(html, r"window\.config = (.+);"))
if not conf:
log.e("Get window.config from url failed, url: {}".format(url))
match_txt = match1(html, r"<script id=\"SSR_HYDRATED_DATA\">window._SSR_HYDRATED_DATA=(.*?)<\/script>")
if not match_txt:
log.e("Get video info from url failed, url: {}".format(url))
return
verify_url = conf['prefix'] + conf['url'] + '?key=' + conf['key'] + '&psm=' + conf['psm'] \
+ '&_signature=' + ''.join(random.sample(string.ascii_letters + string.digits, 31))
try:
ok = get_content(verify_url)
except Exception as e:
ok = e.msg
if ok != 'OK':
log.e("Verify failed, verify_url: {}, result: {}".format(verify_url, ok))
video_info = loads(match_txt.replace('":undefined', '":null'))
if not video_info:
log.e("video_info not found, url:{}".format(url))
return
html = get_content(url, headers=headers)
video_id = match1(html, r"\"vid\":\"([^\"]+)")
title = match1(html, r"\"player__videoTitle\">.*?<h1.*?>(.*)<\/h1><\/div>")
if not video_id:
log.e("video_id not found, url:{}".format(url))
title = video_info['anyVideo']['gidInformation']['packerData']['video']['title']
video_resource = video_info['anyVideo']['gidInformation']['packerData']['video']['videoResource']
if video_resource.get('dash', None):
video_list = video_resource['dash']
elif video_resource.get('dash_120fps', None):
video_list = video_resource['dash_120fps']
elif video_resource.get('normal', None):
video_list = video_resource['normal']
else:
log.e("video_list not found, url:{}".format(url))
return
video_info_url = get_video_url_from_video_id(video_id)
video_info = loads(get_content(video_info_url))
if video_info.get("code", 1) != 0:
log.e("Get video info from {} error: server return code {}".format(video_info_url, video_info.get("code", 1)))
return
if not video_info.get("data", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data or data is empty".format(video_info_url))
return
if not video_info["data"].get("video_list", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data.video_list or data.video_list is empty".format(video_info_url))
return
if not video_info["data"]["video_list"].get("video_1", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data.video_list.video_1 or data.video_list.video_1 is empty".format(video_info_url))
return
bestQualityVideo = list(video_info["data"]["video_list"].keys())[-1] #There is not only video_1, there might be video_2
size = int(video_info["data"]["video_list"][bestQualityVideo]["size"])
print_info(site_info=site_info, title=title, type="mp4", size=size) # 该网站只有mp4类型文件
if not info_only:
video_url = base64.b64decode(video_info["data"]["video_list"][bestQualityVideo]["main_url"].encode("utf-8"))
download_urls([video_url.decode("utf-8")], title, "mp4", size, output_dir, merge=merge, headers=headers, **kwargs)
streams = [
# {'file_id': 'fc1b9bf8e8e04a849d90a5172d3f6919', 'quality': "normal", 'size': 0,
# 'definition': '720p', 'video_url': '','audio_url':'','v_type':'dash'},
]
# prefer merging the watermark-free video and audio streams; if unavailable, fall back to the watermarked mp4
if video_list.get('dynamic_video', None):
audio_url = base64.b64decode(
video_list['dynamic_video']['dynamic_audio_list'][0]['main_url'].encode("utf-8")).decode("utf-8")
dynamic_video_list = video_list['dynamic_video']['dynamic_video_list']
streams = convertStreams(dynamic_video_list, audio_url)
elif video_list.get('video_list', None):
dynamic_video_list = video_list['video_list']
streams = convertStreams(dynamic_video_list, "")
print("title: %s" % title)
for stream in streams:
if stream_id != "" and stream_id != stream['definition']:
continue
print(" - format: %s" % stream['definition'])
print(" size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
print(" quality: %s " % stream['quality'])
print(" v_type: %s " % stream['v_type'])
# print(" video_url: %s " % stream['video_url'])
# print(" audio_url: %s " % stream['audio_url'])
print()
# if not info-only, download the first matching stream
if not info_only:
urls = [stream['video_url']]
if stream['audio_url'] != "":
urls.append(stream['audio_url'])
kwargs['av'] = 'av'  # this triggers audio/video merging
download_urls(urls, title, "mp4", stream['size'], output_dir, merge=merge, headers=headers,
**kwargs)
return
def convertStreams(video_list, audio_url):
streams = []
for dynamic_video in video_list:
streams.append({
'file_id': dynamic_video['file_hash'],
'quality': dynamic_video['quality'],
'size': dynamic_video['size'],
'definition': dynamic_video['definition'],
'video_url': base64.b64decode(dynamic_video['main_url'].encode("utf-8")).decode("utf-8"),
'audio_url': audio_url,
'v_type': dynamic_video['vtype'],
})
return streams
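Xigua's API base64-encodes every media URL; convertStreams() reverses that as sketched here (the URL is made up):
```
import base64
# made-up URL, demonstrating the decode step used in convertStreams()
encoded = base64.b64encode(b'https://v3.example.com/video.mp4').decode()
print(base64.b64decode(encoded.encode('utf-8')).decode('utf-8'))
# -> https://v3.example.com/video.mp4
```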
def ixigua_download_playlist_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):

View File

@@ -35,6 +35,7 @@ def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False):
part_urls= []
total_size = 0
ext = None
for part in range(1, seg_cnt+1):
if fc_cnt == 0:
# fix json parsing error

View File

@@ -5,42 +5,38 @@ __all__ = ['tiktok_download']
from ..common import *
def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
referUrl = url.split('?')[0]
headers = fake_headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
'Accept-Encoding': 'gzip, deflate',
'Accept': '*/*',
'Connection': 'keep-alive' # important
}
# trick or treat
html = get_content(url, headers=headers)
data = r1(r'<script id="__NEXT_DATA__".*?>(.*?)</script>', html)
m = re.match('(https?://)?([^/]+)(/.*)', url)
host = m.group(2)
if host != 'www.tiktok.com': # non-canonical URL
url = get_location(url, headers=headers)
m = re.match('(https?://)?([^/]+)(/.*)', url)
host = m.group(2)
url = m.group(3).split('?')[0]
vid = url.split('/')[3] # should be a string of numbers
html = getHttps(host, url, headers=headers)
data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) or \
r1(r'<script id="SIGI_STATE" type="application/json">(.*?)</script>', html)
info = json.loads(data)
wid = info['props']['initialProps']['$wid']
cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid)
downloadAddr = info['ItemModule'][vid]['video']['downloadAddr']
author = info['ItemModule'][vid]['author'] # same as uniqueId
nickname = info['UserModule']['users'][author]['nickname']
title = '%s [%s]' % (nickname or author, vid)
# here's the cookie
headers['Cookie'] = cookie
# try again
html = get_content(url, headers=headers)
data = r1(r'<script id="__NEXT_DATA__".*?>(.*?)</script>', html)
info = json.loads(data)
wid = info['props']['initialProps']['$wid']
cookie = 'tt_webid=%s; tt_webid_v2=%s' % (wid, wid)
videoData = info['props']['pageProps']['itemInfo']['itemStruct']
videoId = videoData['id']
videoUrl = videoData['video']['downloadAddr']
uniqueId = videoData['author'].get('uniqueId')
nickName = videoData['author'].get('nickname')
title = '%s [%s]' % (nickName or uniqueId, videoId)
# we also need the referer
headers['Referer'] = referUrl
mime, ext, size = url_info(videoUrl, headers=headers)
mime, ext, size = url_info(downloadAddr, headers=headers)
print_info(site_info, title, mime, size)
if not info_only:
download_urls([videoUrl], title, ext, size, output_dir=output_dir, merge=merge, headers=headers)
download_urls([downloadAddr], title, ext, size, output_dir=output_dir, merge=merge, headers=headers)
site_info = "TikTok.com"
download = tiktok_download
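The host/path split above drives both the canonical-URL redirect and the video-id extraction; a trace with a sample URL (the id is made up):
```
import re
# made-up video id, tracing the parsing used above
m = re.match('(https?://)?([^/]+)(/.*)', 'https://www.tiktok.com/@user/video/7000000000000000000')
host = m.group(2)                # 'www.tiktok.com'
path = m.group(3).split('?')[0]  # '/@user/video/7000000000000000000'
vid = path.split('/')[3]         # '7000000000000000000'
```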

View File

@@ -51,7 +51,12 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs)
api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token})
info = json.loads(api_content)
if 'extended_entities' in info['globalObjects']['tweets'][item_id]:
if item_id not in info['globalObjects']['tweets']:
# something wrong here
log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None)
return
elif 'extended_entities' in info['globalObjects']['tweets'][item_id]:
# if the tweet contains media, download them
media = info['globalObjects']['tweets'][item_id]['extended_entities']['media']

View File

@@ -76,7 +76,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
urls = []
for i in media_exts:
urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ ;&"\'\\<>]*)', page)
urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ =?;&"\'\\<>]*)', page)
p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page)
urls += [parse.unquote(url) for url in p_urls]
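Adding `=` and `?` to the excluded tail characters keeps query strings out of the captured media URL; a quick check (the page line is made up):
```
import re
i = 'mp4'  # one of media_exts
page = 'src="https://cdn.example.com/clip.mp4?token=abc"'
print(re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ =?;&"\'\\<>]*)', page))
# -> ['https://cdn.example.com/clip.mp4']
```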

View File

@@ -78,6 +78,8 @@ class YouTube(VideoExtractor):
# - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js
# - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js
# - https://www.youtube.com/s/player/50e823fc/player_ias.vflset/sv_SE/base.js
# - https://www.youtube.com/s/player/3b5d5649/player_ias.vflset/sv_SE/base.js
# - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js
def tr_js(code):
code = re.sub(r'function', r'def', code)
# add prefix '_sig_' to prevent namespace pollution
@@ -113,12 +115,10 @@ class YouTube(VideoExtractor):
else:
f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js)
f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2))
f2 = re.sub(r'(as|if|in|is|or)', r'_\1', f2)
f2 = re.sub(r'\$', '_dollar', f2)
f2 = re.sub(r'\$', '_dollar', f2) # replace dollar sign
code = code + 'global _sig_%s\n' % f2 + tr_js(f2def)
f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1)
f1 = re.sub(r'\$', '_dollar', f1)
f1 = re.sub(r'\$', '_dollar', f1) # replace dollar sign
code = code + '_sig=_sig_%s(s)' % f1
exec(code, globals(), locals())
return locals()['_sig']
@@ -141,6 +141,7 @@ class YouTube(VideoExtractor):
"""
return match1(url, r'youtu\.be/([^?/]+)') or \
match1(url, r'youtube\.com/embed/([^/?]+)') or \
match1(url, r'youtube\.com/shorts/([^/?]+)') or \
match1(url, r'youtube\.com/v/([^/?]+)') or \
match1(url, r'youtube\.com/watch/([^/?]+)') or \
parse_query_param(url, 'v') or \
@@ -233,7 +234,10 @@
except:
# ytplayer_config = {args:{raw_player_response:ytInitialPlayerResponse}}
ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1))
try: # FIXME: we should extract ytInitialPlayerResponse more reliably
ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});</script>', video_page).group(1))
except:
ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1))
stream_list = ytInitialPlayerResponse['streamingData']['formats']
#stream_list = ytInitialPlayerResponse['streamingData']['adaptiveFormats']
@@ -258,7 +262,10 @@
# Parse video page instead
video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid)
ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1))
try: # FIXME: we should extract ytInitialPlayerResponse more reliably
ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});</script>', video_page).group(1))
except:
ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1))
self.title = ytInitialPlayerResponse["videoDetails"]["title"]
if re.search('([^"]*/base\.js)"', video_page):

View File

@@ -31,8 +31,8 @@ def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
play_list = video_info["playlist"]
# first High Definition
# second Second Standard Definition
# third ld. What is ld ?
# second Standard Definition
# third Low Definition
# finally continue
data = play_list.get("hd", play_list.get("sd", play_list.get("ld", None)))
if not data:

View File

@@ -93,7 +93,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'):
# Use concat demuxer on FFmpeg >= 1.1
if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)):
concat_list = generate_concat_list(files, output)
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1',
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0',
'-i', concat_list, '-c', 'copy']
params.extend(['--', output])
if subprocess.call(params, stdin=STDIN) == 0:
@@ -149,7 +149,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'):
# Use concat demuxer on FFmpeg >= 1.1
if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)):
concat_list = generate_concat_list(files, output)
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1',
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0',
'-i', concat_list, '-c', 'copy',
'-bsf:a', 'aac_adtstoasc']
params.extend(['--', output])
@@ -203,7 +203,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'):
# Use concat demuxer on FFmpeg >= 1.1
if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)):
concat_list = generate_concat_list(files, output)
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1',
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0',
'-i', concat_list, '-c', 'copy',
'-bsf:a', 'aac_adtstoasc']
params.extend(['--', output])
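All three concat call sites now pass `-safe 0`, which keeps the demuxer's unsafe-filename check disabled in a form current FFmpeg accepts (newer builds reject `-1` as out of range). A standalone sketch of the generated command (file names are placeholders):
```
import subprocess
# placeholder inputs; generate_concat_list() writes a list in this format
files = ['part1.mp4', 'part2.mp4']
with open('list.txt', 'w') as f:
    for name in files:
        f.write("file '%s'\n" % name)
subprocess.call(['ffmpeg', '-y', '-f', 'concat', '-safe', '0',
                 '-i', 'list.txt', '-c', 'copy', '--', 'output.mp4'])
```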

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env python
script_name = 'you-get'
__version__ = '0.4.1545'
__version__ = '0.4.1620'

View File

@@ -10,7 +10,8 @@ from you_get.extractors import (
acfun,
bilibili,
soundcloud,
tiktok
tiktok,
twitter
)
@@ -28,11 +29,11 @@ class YouGetTests(unittest.TestCase):
youtube.download(
'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True
)
youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True)
youtube.download(
'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa
info_only=True
)
#youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True)
#youtube.download(
# 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa
# info_only=True
#)
#youtube.download(
# 'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True
#)
@ -40,6 +41,9 @@ class YouGetTests(unittest.TestCase):
def test_acfun(self):
acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True)
#def test_bilibili(self):
# bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True)
#def test_soundcloud(self):
## single song
#soundcloud.download(
@ -50,10 +54,12 @@ class YouGetTests(unittest.TestCase):
# 'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True
#)
#def tests_tiktok(self):
# tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True)
# tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True)
# tiktok.download('https://vt.tiktok.com/UGJR4R/', info_only=True)
def test_tiktok(self):
tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True)
tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True)
def test_twitter(self):
twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True)
if __name__ == '__main__':

View File

@@ -18,13 +18,10 @@
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.2",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Internet",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Multimedia",