Merge branch 'develop' into develop

liqiworks 2021-10-15 10:45:19 +08:00 committed by GitHub
commit 0a7ece66e3
16 changed files with 151 additions and 75 deletions

View File

@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.5, 3.6, 3.7, 3.8, pypy3]
python-version: [3.5, 3.6, 3.7, 3.8, 3.9, pypy3]
steps:
- uses: actions/checkout@v2

View File

@@ -434,8 +434,17 @@ def get_content(url, headers={}, decoded=True):
req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
# NOTE: Do not use cookies.add_cookie_header(req)
# #HttpOnly_ cookies were not supported properly by CookieJar and MozillaCookieJar until Python 3.10
# See also:
# - https://github.com/python/cpython/pull/17471
# - https://bugs.python.org/issue2190
# Here we add cookies to the request headers manually
cookie_strings = []
for cookie in list(cookies):
cookie_strings.append(cookie.name + '=' + cookie.value)
cookie_headers = {'Cookie': '; '.join(cookie_strings)}
req.headers.update(cookie_headers)
response = urlopen_with_retry(req)
data = response.read()
@@ -478,8 +487,17 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
# NOTE: Do not use cookies.add_cookie_header(req)
# #HttpOnly_ cookies were not supported properly by CookieJar and MozillaCookieJar until Python 3.10
# See also:
# - https://github.com/python/cpython/pull/17471
# - https://bugs.python.org/issue2190
# Here we add cookies to the request headers manually
cookie_strings = []
for cookie in list(cookies):
cookie_strings.append(cookie.name + '=' + cookie.value)
cookie_headers = {'Cookie': '; '.join(cookie_strings)}
req.headers.update(cookie_headers)
if kwargs.get('post_data_raw'):
post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
else:
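
Both hunks apply the same manual-Cookie workaround. As a minimal standalone sketch of the technique, assuming a Netscape-format cookies.txt already loaded into a jar (the file name and URL below are illustrative, not from this commit):

from http.cookiejar import MozillaCookieJar
from urllib import request

jar = MozillaCookieJar('cookies.txt')  # illustrative path
jar.load(ignore_discard=True, ignore_expires=True)

req = request.Request('https://example.com/')  # illustrative URL
# Join name=value pairs by hand instead of calling jar.add_cookie_header(req)
req.add_header('Cookie', '; '.join('%s=%s' % (c.name, c.value) for c in jar))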

View File

@@ -12,6 +12,8 @@ class Bilibili(VideoExtractor):
# Bilibili media encoding options, in descending quality order.
stream_types = [
{'id': 'hdflv2', 'quality': 125, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '3840p', 'desc': '真彩 HDR'},
{'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'},
{'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280,
@@ -160,6 +162,11 @@ class Bilibili(VideoExtractor):
self.url = 'https://www.bilibili.com/bangumi/play/ep%s' % ep_id
html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url))
# redirect: s
elif re.match(r'https?://(www\.)?bilibili\.com/s/(.+)', self.url):
self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)')
html_content = get_content(self.url, headers=self.bilibili_headers())
# sort it out
if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url):
sort = 'audio'
@@ -179,7 +186,7 @@
self.download_playlist_by_url(self.url, **kwargs)
return
# regular av video
# regular video
if sort == 'video':
initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME
initial_state = json.loads(initial_state_text)
@@ -599,13 +606,21 @@ class Bilibili(VideoExtractor):
log.e('[Error] Unsupported URL pattern.')
exit(1)
# regular av video
# regular video
if sort == 'video':
initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME
initial_state = json.loads(initial_state_text)
aid = initial_state['videoData']['aid']
pn = initial_state['videoData']['videos']
if pn!= len(initial_state['videoData']['pages']):#interaction video 互动视频
if pn == len(initial_state['videoData']['pages']):
# non-interactive video
for pi in range(1, pn + 1):
purl = 'https://www.bilibili.com/video/av%s?p=%s' % (aid, pi)
self.__class__().download_by_url(purl, **kwargs)
else:
# interactive video
search_node_list = []
download_cid_set = set([initial_state['videoData']['cid']])
params = {
@@ -656,24 +671,6 @@
self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams]
self.extract(**kwargs)
self.download(**kwargs)
else:
playinfo_text = match1(html_content, r'__playinfo__=(.*?)</script><script>') # FIXME
playinfo = json.loads(playinfo_text) if playinfo_text else None
html_content_ = get_content(self.url, headers=self.bilibili_headers(cookie='CURRENT_FNVAL=16'))
playinfo_text_ = match1(html_content_, r'__playinfo__=(.*?)</script><script>') # FIXME
playinfo_ = json.loads(playinfo_text_) if playinfo_text_ else None
p = int(match1(self.url, r'[\?&]p=(\d+)') or match1(self.url, r'/index_(\d+)') or '1')-1
for pi in range(p,pn):
self.prepare_by_cid(aid,initial_state['videoData']['pages'][pi]['cid'],'%s (P%s. %s)' % (initial_state['videoData']['title'], pi+1, initial_state['videoData']['pages'][pi]['part']),html_content,playinfo,playinfo_,url)
try:
self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams]
except:
self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams]
self.extract(**kwargs)
self.download(**kwargs)
# purl = 'https://www.bilibili.com/video/av%s?p=%s' % (aid, pi+1)
# self.__class__().download_by_url(purl, **kwargs)
elif sort == 'bangumi':
initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME

View File

@@ -2,6 +2,7 @@
import re
import json
from urllib.parse import unquote
from ..common import (
url_size,
@@ -18,17 +19,17 @@ __all__ = ['douyin_download_by_url']
def douyin_download_by_url(url, **kwargs):
page_content = get_content(url, headers=fake_headers)
match_rule = re.compile(r'var data = \[(.*?)\];')
video_info = json.loads(match_rule.findall(page_content)[0])
video_url = video_info['video']['play_addr']['url_list'][0]
# fix: https://www.douyin.com/share/video/6553248251821165832
# if there is no title, use desc
cha_list = video_info['cha_list']
if cha_list:
title = cha_list[0]['cha_name']
else:
title = video_info['desc']
# The video player and video source are rendered client-side; the data is
# contained in a <script id="RENDER_DATA" type="application/json"> tag,
# URL-quoted. Unquote the whole page content, then search it with plain
# regular expressions.
page_content = unquote(page_content)
title = re.findall(r'"desc":"([^"]*)"', page_content)[0].strip()
video_format = 'mp4'
# video URLs appear as JSON in the pattern {"src":"THE_URL"}
urls_pattern = r'"playAddr":(\[.*?\])'
urls = json.loads(re.findall(urls_pattern, page_content)[0])
video_url = 'https:' + urls[0]['src']
size = url_size(video_url, faker=True)
print_info(
site_info='douyin.com', title=title,
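
To see the unquote-then-regex approach in isolation, a hedged sketch on an invented, URL-quoted RENDER_DATA-style payload (not a real Douyin response):

from urllib.parse import unquote
import json
import re

page = unquote('%7B%22desc%22%3A%22hello%22%2C%22playAddr%22%3A%5B%7B%22src%22%3A%22//example.com/v.mp4%22%7D%5D%7D')
title = re.findall(r'"desc":"([^"]*)"', page)[0]                 # 'hello'
urls = json.loads(re.findall(r'"playAddr":(\[.*?\])', page)[0])
video_url = 'https:' + urls[0]['src']                            # 'https://example.com/v.mp4'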

View File

@@ -73,7 +73,7 @@ def get_api_key(page):
match = match1(page, pattern_inline_api_key)
# this happens only when the url points to a gallery page
# that contains no inline api_key(and never makes xhr api calls)
# in fact this might be a better approch for getting a temporary api key
# in fact this might be a better approach for getting a temporary api key
# since there's no place for a user to add custom information that may
# misguide the regex in the homepage
if not match:

View File

@@ -6,14 +6,14 @@ from ..common import *
def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
url = r1(r'([^?]*)', url)
html = get_html(url, faker=True)
cont = get_content(url, headers=fake_headers)
vid = r1(r'instagram.com/\w+/([^/]+)', url)
description = r1(r'<meta property="og:title" content="([^"]*)"', html) or \
r1(r'<title>\s([^<]*)</title>', html) # with logged-in cookies
description = r1(r'<meta property="og:title" content="([^"]*)"', cont) or \
r1(r'<title>\s([^<]*)</title>', cont) # with logged-in cookies
title = "{} [{}]".format(description.replace("\n", " "), vid)
stream = r1(r'<meta property="og:video" content="([^"]*)"', html)
stream = r1(r'<meta property="og:video" content="([^"]*)"', cont)
if stream:
_, ext, size = url_info(stream)
@@ -21,14 +21,14 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
if not info_only:
download_urls([stream], title, ext, size, output_dir, merge=merge)
else:
data = re.search(r'window\._sharedData\s*=\s*(.*);</script>', html)
data = re.search(r'window\._sharedData\s*=\s*(.*);</script>', cont)
try:
info = json.loads(data.group(1))
post = info['entry_data']['PostPage'][0]
assert post
except:
# with logged-in cookies
data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);</script>', html)
data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);</script>', cont)
if data is not None:
log.e('[Warning] Cookies needed.')
post = json.loads(data.group(1))

View File

@@ -27,6 +27,9 @@ def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
api_url = video_url + '/api/video/' + video_hash
content = get_content(api_url, headers=headers)
data = json.loads(content)
if len(data) < 1:
print('Maybe a private video? [' + title + ']')
return True
down_urls = 'https:' + data[0]['uri']
type, ext, size = url_info(down_urls, headers=headers)
print_info(site_info, title+data[0]['resolution'], type, size)
@@ -35,10 +38,8 @@ def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
download_urls([down_urls], title, ext, size, output_dir, merge=merge, headers=headers)
def download_playlist_by_url( url, **kwargs):
video_page = get_content(url)
# url_first=re.findall(r"(http[s]?://[^/]+)",url)
video_page = get_html(url)
url_first=match1(url, r"(http[s]?://[^/]+)")
# print (url_first)
videos = set(re.findall(r'<a href="(/videos/[^"]+)"', video_page))
if(len(videos)>0):
for video in videos:

View File

@@ -19,7 +19,7 @@ fake_headers_mobile = {
def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = False, **kwargs):
'''Source: Android mobile'''
page_url = 'http://video.weibo.com/show?fid=' + fid + '&type=mp4'
page_url = 'https://video.weibo.com/show?fid=' + fid + '&type=mp4'
mobile_page = get_content(page_url, headers=fake_headers_mobile)
url = match1(mobile_page, r'<video id=.*?src=[\'"](.*?)[\'"]\W')
@@ -78,6 +78,51 @@ def miaopai_download_story(url, output_dir='.', merge=False, info_only=False, **
download_urls([stream_url], fs.legitimize(title), ext, total_size=None, headers=fake_headers_mobile, **kwargs)
def miaopai_download_h5api(url, output_dir='.', merge=False, info_only=False, **kwargs):
oid = match1(url, r'/show/(\d{4}:\w+)')
page = "/show/%s" % oid
data_url = 'https://h5.video.weibo.com/api/component?%s' % parse.urlencode({
'page': page
})
headers = {}
headers.update(fake_headers_mobile)
headers['origin'] = 'https://h5.video.weibo.com'
headers['page-referer'] = page
headers['referer'] = 'https://h5.video.weibo.com/show/%s' % oid
post_data = {
"data": json.dumps({
"Component_Play_Playinfo": {"oid": oid}
})
}
data_content = post_content(data_url, headers=headers, post_data=post_data)
data = json.loads(data_content)
if data['msg'] != 'succ':
raise Exception('Weibo API returned non-success: ({}) {}'.format(data['code'], data['msg']))
play_info = data['data']['Component_Play_Playinfo']
title = play_info['title']
# get video formats and sort by size desc
video_formats = []
for fmt, relative_uri in play_info['urls'].items():
url = "https:%s" % relative_uri
type, ext, size = url_info(url, headers=headers)
video_formats.append({
'fmt': fmt,
'url': url,
'type': type,
'ext': ext,
'size': size,
})
video_formats.sort(key=lambda v:v['size'], reverse=True)
selected_video = video_formats[0]
video_url, ext, size = selected_video['url'], selected_video['ext'], selected_video['size']
print_info(site_info, title, ext, size)
if not info_only:
download_urls([video_url], fs.legitimize(title), ext, total_size=size, headers=headers, **kwargs)
def miaopai_download_direct(url, output_dir='.', merge=False, info_only=False, **kwargs):
mobile_page = get_content(url, headers=fake_headers_mobile)
try:
@@ -108,12 +153,16 @@ def miaopai_download(url, output_dir='.', merge=False, info_only=False, **kwargs
if re.match(r'^http[s]://.*\.weibo\.com/tv/v/(\w+)', url):
return miaopai_download_direct(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
if re.match(r'^http[s]://(.+\.)?weibo\.com/(tv/)?show/(\d{4}:\w+)', url):
return miaopai_download_h5api(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
fid = match1(url, r'\?fid=(\d{4}:\w+)')
if fid is not None:
miaopai_download_by_fid(fid, output_dir, merge, info_only)
elif '/p/230444' in url:
fid = match1(url, r'/p/230444(\w+)')
miaopai_download_by_fid('1034:'+fid, output_dir, merge, info_only)
pass
else:
mobile_page = get_content(url, headers = fake_headers_mobile)
hit = re.search(r'"page_url"\s*:\s*"([^"]+)"', mobile_page)

View File

@@ -75,17 +75,13 @@ class _Dispatcher(object):
raise _NoMatchException()
missevan_stream_types = [
{'id': 'source', 'quality': '源文件', 'url_json_key': 'soundurl',
'resource_url_fmt': 'sound/{resource_url}'},
{'id': '320', 'quality': '320 Kbps', 'url_json_key': 'soundurl_64'},
{'id': 'source', 'quality': '源文件', 'url_json_key': 'soundurl'},
{'id': '128', 'quality': '128 Kbps', 'url_json_key': 'soundurl_128'},
{'id': '32', 'quality': '32 Kbps', 'url_json_key': 'soundurl_32'},
{'id': 'covers', 'desc': '封面图', 'url_json_key': 'cover_image',
'default_src': 'covers/nocover.png',
'resource_url_fmt': 'covers/{resource_url}'},
{'id': 'coversmini', 'desc': '封面缩略图', 'url_json_key': 'cover_image',
'default_src': 'coversmini/nocover.png',
'resource_url_fmt': 'coversmini/{resource_url}'}
{'id': 'coversmini', 'desc': '封面缩略图', 'url_json_key': 'front_cover',
'default_src': 'coversmini/nocover.png'}
]
def _get_resource_uri(data, stream_type):
@@ -353,7 +349,7 @@ class MissEvan(VideoExtractor):
@staticmethod
def url_resource(uri):
return 'https://static.missevan.com/' + uri
return uri if re.match(r'^https?:/{2}\w.+$', uri) else 'https://static.missevan.com/' + uri
site = MissEvan()
site_info = 'MissEvan.com'

View File

@@ -28,7 +28,7 @@ def mtv81_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
#
# rtmpdump -r 'rtmpe://cp30865.edgefcs.net/ondemand/mtviestor/_!/intlod/MTVInternational/MBUS/GeoLocals/00JP/VIAMTVI/PYC/201304/7122HVAQ4/00JPVIAMTVIPYC7122HVAQ4_640x_360_1200_m30.mp4' -o "title.mp4" --swfVfy http://media.mtvnservices.com/player/prime/mediaplayerprime.1.10.8.swf
#
# because rtmpdump is unstable,may try serveral times
# because rtmpdump is unstable,may try several times
#
if not info_only:
# import pdb

View File

@@ -10,7 +10,7 @@ __all__ = ['qingting_download_by_url']
class Qingting(VideoExtractor):
# every resource is described by its channel id and program id
# so vid is tuple (chaanel_id, program_id)
# so vid is tuple (channel_id, program_id)
name = 'Qingting'
stream_types = [

View File

@@ -35,6 +35,7 @@ def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False):
part_urls= []
total_size = 0
ext = None
for part in range(1, seg_cnt+1):
if fc_cnt == 0:
# fix json parsing error

View File

@@ -70,12 +70,13 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
'[-_][6-9]\d\dx1\d\d\d\.jpe?g',
'[-_][6-9]\d\dx[6-9]\d\d\.jpe?g',
's1600/[\w%]+\.jpe?g', # blogger
'blogger\.googleusercontent\.com/img/a/\w*', # blogger
'img[6-9]\d\d/[\w%]+\.jpe?g' # oricon?
]
urls = []
for i in media_exts:
urls += re.findall(r'(https?://[^ ;&"\'\\<>]+' + i + r'[^ ;&"\'\\<>]*)', page)
urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ ;&"\'\\<>]*)', page)
p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page)
urls += [parse.unquote(url) for url in p_urls]
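
The change from '+' to '*' matters when the matched fragment begins immediately after the scheme, as in the new blogger.googleusercontent.com pattern. A self-contained demonstration (the URL is invented):

import re

page = 'src="https://blogger.googleusercontent.com/img/a/AVvXsEg123"'
ext = r'blogger\.googleusercontent\.com/img/a/\w*'
old = re.findall(r'(https?://[^ ;&"\'\\<>]+' + ext + r'[^ ;&"\'\\<>]*)', page)
new = re.findall(r'(https?://[^ ;&"\'\\<>]*' + ext + r'[^ ;&"\'\\<>]*)', page)
# old == []  -- '+' demands at least one character between '://' and the pattern
# new == ['https://blogger.googleusercontent.com/img/a/AVvXsEg123']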

View File

@@ -76,11 +76,14 @@ class YouTube(VideoExtractor):
# - https://www.youtube.com/yts/jsbin/player_ias-vfl_RGK2l/en_US/base.js
# - https://www.youtube.com/yts/jsbin/player-vflRjqq_w/da_DK/base.js
# - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js
# - https://www.youtube.com/s/player/0b643cd1/player_ias.vflset/sv_SE/base.js
# - https://www.youtube.com/s/player/50e823fc/player_ias.vflset/sv_SE/base.js
def tr_js(code):
code = re.sub(r'function', r'def', code)
code = re.sub(r'(\W)(as|if|in|is|or)\(', r'\1_\2(', code)
# add prefix '_sig_' to prevent namespace pollution
code = re.sub(r'(\W)([$\w][$\w][$\w]?)\(', r'\1_sig_\2(', code)
code = re.sub(r'\$', '_dollar', code)
code = re.sub(r'\{', r':\n\t', code)
code = re.sub(r'\{', r': ', code)
code = re.sub(r'\}', r'\n', code)
code = re.sub(r'var\s+', r'', code)
code = re.sub(r'(\w+).join\(""\)', r'"".join(\1)', code)
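
For intuition about tr_js, a partial, hedged demo of just the structural substitutions on an invented base.js-style cipher function; the full tr_js above also prefixes names with '_sig_' and escapes Python keywords:

import re

js = 'var xy=function(a){a=a.split("");a.reverse();return a.join("")}'
code = re.sub(r'function', r'def', js)
code = re.sub(r'\{', r': ', code)
code = re.sub(r'\}', r'\n', code)
code = re.sub(r'var\s+', r'', code)
code = re.sub(r'(\w+)\.join\(""\)', r'"".join(\1)', code)
print(code)  # xy=def(a): a=a.split("");a.reverse();return "".join(a)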
@@ -99,7 +102,7 @@ class YouTube(VideoExtractor):
f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \
match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def)
f1def = 'function main_%s%s' % (f1, f1def) # prefix to avoid potential namespace conflict
f1def = 'function %s%s' % (f1, f1def)
code = tr_js(f1def)
f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def))
for f2 in f2s:
@@ -112,13 +115,13 @@ class YouTube(VideoExtractor):
f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2))
f2 = re.sub(r'(as|if|in|is|or)', r'_\1', f2)
f2 = re.sub(r'\$', '_dollar', f2)
code = code + 'global %s\n' % f2 + tr_js(f2def)
code = code + 'global _sig_%s\n' % f2 + tr_js(f2def)
f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1)
f1 = re.sub(r'\$', '_dollar', f1)
code = code + 'sig=main_%s(s)' % f1 # prefix to avoid potential namespace conflict
code = code + '_sig=_sig_%s(s)' % f1
exec(code, globals(), locals())
return locals()['sig']
return locals()['_sig']
def chunk_by_range(url, size):
urls = []
@@ -195,8 +198,9 @@
# Get video info
# 'eurl' is a magic parameter that can bypass age restriction
# full form: 'eurl=https%3A%2F%2Fyoutube.googleapis.com%2Fv%2F{VIDEO_ID}'
video_info = parse.parse_qs(get_content('https://www.youtube.com/get_video_info?video_id={}&eurl=https%3A%2F%2Fy'.format(self.vid)))
logging.debug('STATUS: %s' % video_info['status'][0])
#video_info = parse.parse_qs(get_content('https://www.youtube.com/get_video_info?video_id={}&eurl=https%3A%2F%2Fy'.format(self.vid)))
#logging.debug('STATUS: %s' % video_info['status'][0])
video_info = {'status': ['ok'], 'use_cipher_signature': 'True'}
ytplayer_config = None
if 'status' not in video_info:
@@ -253,11 +257,16 @@
else:
# Parse video page instead
video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid)
ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1))
self.title = json.loads(ytplayer_config["args"]["player_response"])["videoDetails"]["title"]
self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js']
stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
ytInitialPlayerResponse = json.loads(re.search('ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page).group(1))
self.title = ytInitialPlayerResponse["videoDetails"]["title"]
if re.search('([^"]*/base\.js)"', video_page):
self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1)
else:
self.html5player = None
stream_list = ytInitialPlayerResponse['streamingData']['formats']
elif video_info['status'] == ['fail']:
logging.debug('ERRORCODE: %s' % video_info['errorcode'][0])
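
A self-contained sketch of the new ytInitialPlayerResponse parsing path (the page fragment is fabricated; real watch pages embed far larger JSON):

import json
import re

video_page = 'var ytInitialPlayerResponse = {"videoDetails":{"title":"demo"},"streamingData":{"formats":[]}};\n'
m = re.search(r'ytInitialPlayerResponse\s*=\s*([^\n]+?});', video_page)
player_response = json.loads(m.group(1))
title = player_response['videoDetails']['title']       # 'demo'
formats = player_response['streamingData']['formats']  # []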

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env python
script_name = 'you-get'
__version__ = '0.4.1500'
__version__ = '0.4.1545'

View File

@@ -41,6 +41,7 @@ class YouGetTests(unittest.TestCase):
#def test_acfun(self):
# acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True)
#def test_bilibili(self):
# bilibili.download(
# "https://www.bilibili.com/watchlater/#/BV1PE411q7mZ/p6", info_only=True
@@ -49,11 +50,13 @@ class YouGetTests(unittest.TestCase):
# "https://www.bilibili.com/watchlater/#/av74906671/p6", info_only=True
# )
# def test_soundcloud(self):
#def test_soundcloud(self):
## single song
# soundcloud.download(
# 'https://soundcloud.com/keiny-pham/impure-bird', info_only=True
# )
#soundcloud.download(
# 'https://soundcloud.com/keiny-pham/impure-bird', info_only=True
#)
## playlist
#soundcloud.download(
# 'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True