Merge pull request #5 from soimort/develop

This commit is contained in:
杨朝雄 2018-12-24 14:13:49 +08:00 committed by GitHub
commit 894e17f108
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
30 changed files with 754 additions and 215 deletions

View File

@ -6,11 +6,13 @@ python:
- "3.4"
- "3.5"
- "3.6"
- "3.7-dev"
- "nightly"
- "pypy3"
before_install: pip install flake8
before_install:
- if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then pip install flake8; fi
before_script:
- if [[ $TRAVIS_PYTHON_VERSION != '3.2'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
- if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
script: make test
sudo: false
notifications:

View File

@ -113,6 +113,14 @@ You can install `you-get` easily via:
$ brew install you-get
```
### Option 8: pkg (FreeBSD only)
You can install `you-get` easily via:
```
# pkg install you-get
```
### Shell completion
Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](https://github.com/soimort/you-get/tree/develop/contrib/completion). Please consult your shell's manual for how to take advantage of them.
@ -416,7 +424,9 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| 西瓜视频 | <https://www.ixigua.com/> |✓| | |
| 快手 | <https://www.kuaishou.com/> |✓|✓| |
| 抖音 | <https://www.douyin.com/> |✓| | |
| TikTok | <https://www.tiktok.com/> |✓| | |
| 中国体育(TV) | <http://v.zhibo.tv/> <br/><http://video.zhibo.tv/> |✓| | |
| 知乎 | <https://www.zhihu.com/> |✓| | |
For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.

View File

@ -102,6 +102,7 @@ SITES = {
'soundcloud' : 'soundcloud',
'ted' : 'ted',
'theplatform' : 'theplatform',
'tiktok' : 'tiktok',
'tucao' : 'tucao',
'tudou' : 'tudou',
'tumblr' : 'tumblr',
@ -127,6 +128,7 @@ SITES = {
'youtube' : 'youtube',
'zhanqi' : 'zhanqi',
'zhibo' : 'zhibo',
'zhihu' : 'zhihu',
}
dry_run = False
@ -429,7 +431,7 @@ def get_content(url, headers={}, decoded=True):
# Decode the response body
if decoded:
charset = match1(
response.getheader('Content-Type'), r'charset=([\w-]+)'
response.getheader('Content-Type', ''), r'charset=([\w-]+)'
)
if charset is not None:
data = data.decode(charset)
@ -439,7 +441,7 @@ def get_content(url, headers={}, decoded=True):
return data
def post_content(url, headers={}, post_data={}, decoded=True):
def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
"""Post the content of a URL via sending a HTTP POST request.
Args:
@ -450,14 +452,19 @@ def post_content(url, headers={}, post_data={}, decoded=True):
Returns:
The content as a string.
"""
logging.debug('post_content: %s \n post_data: %s' % (url, post_data))
if kwargs.get('post_data_raw'):
logging.debug('post_content: %s\npost_data_raw: %s' % (url, kwargs['post_data_raw']))
else:
logging.debug('post_content: %s\npost_data: %s' % (url, post_data))
req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
if kwargs.get('post_data_raw'):
post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
else:
post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
response = urlopen_with_retry(req, data=post_data_enc)
data = response.read()
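
The added `post_data_raw` keyword lets a caller send a pre-built request body (for example JSON) instead of a urlencoded form; the Tumblr GDPR-consent change later in this commit relies on it. A minimal usage sketch, with an illustrative endpoint and payload:

```
# Hedged sketch: POST a raw JSON body via post_content(); the URL and payload are made up
body = post_content(
    'https://example.com/api/consent',
    headers={'Content-Type': 'application/json'},
    post_data_raw='{"accepted": true}',  # sent as-is (UTF-8 encoded), not urlencoded
)
```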
@ -602,7 +609,12 @@ def url_save(
# the key must be 'Referer' for the hack here
if refer is not None:
tmp_headers['Referer'] = refer
file_size = url_size(url, faker=faker, headers=tmp_headers)
if type(url) is list:
file_size = urls_size(url, faker=faker, headers=tmp_headers)
is_chunked, urls = True, url
else:
file_size = url_size(url, faker=faker, headers=tmp_headers)
is_chunked, urls = False, [url]
continue_renameing = True
while continue_renameing:
@ -612,7 +624,7 @@ def url_save(
if not is_part:
if bar:
bar.done()
print(
log.w(
'Skipping {}: file already exists'.format(
tr(os.path.basename(filepath))
)
@ -638,7 +650,10 @@ def url_save(
print('Changing name to %s' % tr(os.path.basename(filepath)), '...')
continue_renameing = True
continue
print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
if log.yes_or_no('File with this name already exists. Overwrite?'):
log.w('Overwriting %s ...' % tr(os.path.basename(filepath)))
else:
return
elif not os.path.exists(os.path.dirname(filepath)):
os.mkdir(os.path.dirname(filepath))
@ -655,70 +670,78 @@ def url_save(
else:
open_mode = 'wb'
if received < file_size:
if faker:
tmp_headers = fake_headers
'''
if parameter headers passed in, we have it copied as tmp_header
elif headers:
headers = headers
else:
headers = {}
'''
if received:
tmp_headers['Range'] = 'bytes=' + str(received) + '-'
if refer:
tmp_headers['Referer'] = refer
for url in urls:
received_chunk = 0
if received < file_size:
if faker:
tmp_headers = fake_headers
'''
if parameter headers passed in, we have it copied as tmp_header
elif headers:
headers = headers
else:
headers = {}
'''
if received and not is_chunked: # only request a range when not chunked
tmp_headers['Range'] = 'bytes=' + str(received) + '-'
if refer:
tmp_headers['Referer'] = refer
if timeout:
response = urlopen_with_retry(
request.Request(url, headers=tmp_headers), timeout=timeout
)
else:
response = urlopen_with_retry(
request.Request(url, headers=tmp_headers)
)
try:
range_start = int(
response.headers[
'content-range'
][6:].split('/')[0].split('-')[0]
)
end_length = int(
response.headers['content-range'][6:].split('/')[1]
)
range_length = end_length - range_start
except:
content_length = response.headers['content-length']
range_length = int(content_length) if content_length is not None \
else float('inf')
if timeout:
response = urlopen_with_retry(
request.Request(url, headers=tmp_headers), timeout=timeout
)
else:
response = urlopen_with_retry(
request.Request(url, headers=tmp_headers)
)
try:
range_start = int(
response.headers[
'content-range'
][6:].split('/')[0].split('-')[0]
)
end_length = int(
response.headers['content-range'][6:].split('/')[1]
)
range_length = end_length - range_start
except:
content_length = response.headers['content-length']
range_length = int(content_length) if content_length is not None \
else float('inf')
if file_size != received + range_length:
received = 0
if bar:
bar.received = 0
open_mode = 'wb'
with open(temp_filepath, open_mode) as output:
while True:
buffer = None
try:
buffer = response.read(1024 * 256)
except socket.timeout:
pass
if not buffer:
if received == file_size: # Download finished
break
# Unexpected termination. Retry request
tmp_headers['Range'] = 'bytes=' + str(received) + '-'
response = urlopen_with_retry(
request.Request(url, headers=tmp_headers)
)
continue
output.write(buffer)
received += len(buffer)
if is_chunked: # always append if chunked
open_mode = 'ab'
elif file_size != received + range_length: # is it ever necessary?
received = 0
if bar:
bar.update_received(len(buffer))
bar.received = 0
open_mode = 'wb'
with open(temp_filepath, open_mode) as output:
while True:
buffer = None
try:
buffer = response.read(1024 * 256)
except socket.timeout:
pass
if not buffer:
if is_chunked and received_chunk == range_length:
break
elif not is_chunked and received == file_size: # Download finished
break
# Unexpected termination. Retry request
if not is_chunked: # only re-request a byte range when the download is not chunked
tmp_headers['Range'] = 'bytes=' + str(received) + '-'
response = urlopen_with_retry(
request.Request(url, headers=tmp_headers)
)
continue
output.write(buffer)
received += len(buffer)
received_chunk += len(buffer)
if bar:
bar.update_received(len(buffer))
assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (
received, os.path.getsize(temp_filepath), temp_filepath
@ -907,7 +930,7 @@ def download_urls(
if total_size:
if not force and os.path.exists(output_filepath) and not auto_rename\
and os.path.getsize(output_filepath) >= total_size * 0.9:
print('Skipping %s: file already exists' % output_filepath)
log.w('Skipping %s: file already exists' % output_filepath)
print()
return
bar = SimpleProgressBar(total_size, len(urls))
@ -1554,9 +1577,9 @@ def google_search(url):
url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
page = get_content(url, headers=fake_headers)
videos = re.findall(
r'<a href="(https?://[^"]+)" onmousedown="[^"]+">([^<]+)<', page
r'<a href="(https?://[^"]+)" onmousedown="[^"]+"><h3 class="[^"]*">([^<]+)<', page
)
vdurs = re.findall(r'<span class="vdur _dwc">([^<]+)<', page)
vdurs = re.findall(r'<span class="vdur[^"]*">([^<]+)<', page)
durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
print('Google Videos search:')
for v in zip(videos, durs):

View File

@ -211,7 +211,7 @@ class VideoExtractor():
ext = self.dash_streams[stream_id]['container']
total_size = self.dash_streams[stream_id]['size']
if ext == 'm3u8':
if ext == 'm3u8' or ext == 'm4a':
ext = 'mp4'
if not urls:

View File

@ -67,6 +67,7 @@ from .sohu import *
from .soundcloud import *
from .suntv import *
from .theplatform import *
from .tiktok import *
from .tucao import *
from .tudou import *
from .tumblr import *
@ -88,4 +89,5 @@ from .ted import *
from .khan import *
from .zhanqi import *
from .kuaishou import *
from .zhibo import *
from .zhibo import *
from .zhihu import *

View File

@ -85,9 +85,13 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
_, _, seg_size = url_info(url)
size += seg_size
#fallback to flvhd is not quite possible
print_info(site_info, title, 'mp4', size)
if re.search(r'fid=[0-9A-Z\-]*.flv', preferred[0][0]):
ext = 'flv'
else:
ext = 'mp4'
print_info(site_info, title, ext, size)
if not info_only:
download_urls(preferred[0], title, 'mp4', size, output_dir=output_dir, merge=merge)
download_urls(preferred[0], title, ext, size, output_dir=output_dir, merge=merge)
else:
raise NotImplementedError(sourceType)
@ -105,27 +109,42 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
pass
def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url)
html = get_content(url)
assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url)
title = r1(r'data-title="([^"]+)"', html)
if re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url):
html = get_content(url)
title = r1(r'data-title="([^"]+)"', html)
if match1(url, r'_(\d+)$'): # current P
title = title + " " + r1(r'active">([^<]*)', html)
vid = r1('data-vid="(\d+)"', html)
up = r1('data-name="([^"]+)"', html)
# bangumi
elif re.match("http://[^\.]*\.*acfun\.[^\.]+/bangumi/ab(\d+)", url):
html = get_content(url)
title = match1(html, r'"newTitle"\s*:\s*"([^"]+)"')
if match1(url, r'_(\d+)$'): # current P
title = title + " " + r1(r'active">([^<]*)', html)
vid = match1(html, r'videoId="(\d+)"')
up = "acfun"
else:
raise NotImplementedError  # NotImplemented is not an exception class
assert title and vid
title = unescape_html(title)
title = escape_file_path(title)
assert title
if match1(url, r'_(\d+)$'): # current P
title = title + " " + r1(r'active">([^<]*)', html)
vid = r1('data-vid="(\d+)"', html)
up = r1('data-name="([^"]+)"', html)
p_title = r1('active">([^<]+)', html)
title = '%s (%s)' % (title, up)
if p_title: title = '%s - %s' % (title, p_title)
if p_title:
title = '%s - %s' % (title, p_title)
acfun_download_by_vid(vid, title,
output_dir=output_dir,
merge=merge,
info_only=info_only,
**kwargs)
site_info = "AcFun.tv"
download = acfun_download
download_playlist = playlist_not_supported('acfun')

View File

@ -129,8 +129,9 @@ def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=
html = get_html(url)
title = r1(r'title:"([^"]+)"', html)
vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+)"', html) or re.findall(r'vhsrc="([^"]+)"', html)
if vhsrc is not None:
vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+\.mp4)"', html) or \
re.findall(r'vhsrc="([^"]+)"', html)
if len(vhsrc) > 0:
ext = 'mp4'
size = url_size(vhsrc[0])
print_info(site_info, title, ext, size)

View File

@ -22,7 +22,7 @@ from .youku import youku_download_by_vid
class Bilibili(VideoExtractor):
name = 'Bilibili'
live_api = 'http://live.bilibili.com/api/playurl?cid={}&otype=json'
live_api = 'https://api.live.bilibili.com/room/v1/Room/playUrl?cid={}&quality=0&platform=web'
api_url = 'http://interface.bilibili.com/v2/playurl?'
bangumi_api_url = 'http://bangumi.bilibili.com/player/web_api/playurl?'
live_room_init_api_url = 'https://api.live.bilibili.com/room/v1/Room/room_init?id={}'
@ -115,7 +115,7 @@ class Bilibili(VideoExtractor):
self.url = 'http://www.bilibili.com/video/av{}/'.format(aid)
self.ua = fake_headers['User-Agent']
self.url = url_locations([self.url])[0]
self.url = url_locations([self.url], faker=True)[0]
frag = urllib.parse.urlparse(self.url).fragment
# http://www.bilibili.com/video/av3141144/index_2.html#page=3
if frag:
@ -125,30 +125,31 @@ class Bilibili(VideoExtractor):
aid = re.search(r'av(\d+)', self.url).group(1)
self.url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, page)
self.referer = self.url
self.page = get_content(self.url)
self.page = get_content(self.url, headers=fake_headers)
m = re.search(r'<h1.*?>(.*?)</h1>', self.page) or re.search(r'<h1 title="([^"]+)">', self.page)
if m is not None:
self.title = m.group(1)
s = re.search(r'<span>([^<]+)</span>', m.group(1))
s = re.search(r'<span.*?>([^<]+)</span>', m.group(1))
if s:
self.title = unescape_html(s.group(1))
if self.title is None:
m = re.search(r'property="og:title" content="([^"]+)"', self.page)
if m is not None:
self.title = m.group(1)
if 'subtitle' in kwargs:
subtitle = kwargs['subtitle']
self.title = '{} {}'.format(self.title, subtitle)
else:
playinfo = re.search(r'__INITIAL_STATE__=(.*?);\(function\(\)', self.page)
if playinfo is not None:
pages = json.loads(playinfo.group(1))['videoData']['pages']
if len(pages) > 1:
qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query))
page = pages[int(qs.get('p', 1)) - 1]
self.title = '{} #{}. {}'.format(self.title, page['page'], page['part'])
jsonPlayinfo = json.loads(playinfo.group(1))
if 'videoData' in jsonPlayinfo:
pages = jsonPlayinfo['videoData']['pages']
if len(pages) > 1:
qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query))
page = pages[int(qs.get('p', 1)) - 1]
self.title = '{} #{}. {}'.format(self.title, page['page'], page['part'])
if 'bangumi.bilibili.com/movie' in self.url:
self.movie_entry(**kwargs)
@ -160,6 +161,8 @@ class Bilibili(VideoExtractor):
self.live_entry(**kwargs)
elif 'vc.bilibili.com' in self.url:
self.vc_entry(**kwargs)
elif 'audio/au' in self.url:
self.audio_entry(**kwargs)
else:
self.entry(**kwargs)
@ -171,6 +174,30 @@ class Bilibili(VideoExtractor):
self.title = page_list[0]['pagename']
self.download_by_vid(page_list[0]['cid'], True, bangumi_movie=True, **kwargs)
def audio_entry(self, **kwargs):
assert re.match(r'https?://www.bilibili.com/audio/au\d+', self.url)
patt = r"(\d+)"
audio_id = re.search(patt, self.url).group(1)
audio_info_url = \
'https://www.bilibili.com/audio/music-service-c/web/song/info?sid={}'.format(audio_id)
audio_info_response = json.loads(get_content(audio_info_url))
if audio_info_response['msg'] != 'success':
log.wtf('Failed to fetch audio information!')
sys.exit(2)
self.title = audio_info_response['data']['title']
# TODO:there is no quality option for now
audio_download_url = \
'https://www.bilibili.com/audio/music-service-c/web/url?sid={}&privilege=2&quality=2'.format(audio_id)
audio_download_response = json.loads(get_content(audio_download_url))
if audio_download_response['msg'] != 'success':
log.wtf('Failed to fetch the audio resource!')
sys.exit(2)
self.streams['mp4'] = {}
self.streams['mp4']['src'] = [audio_download_response['data']['cdns'][0]]
self.streams['mp4']['container'] = 'm4a'
self.streams['mp4']['size'] = audio_download_response['data']['size']
def entry(self, **kwargs):
# tencent player
tc_flashvars = re.search(r'"bili-cid=\d+&bili-aid=\d+&vid=([^"]+)"', self.page)
@ -190,7 +217,12 @@ class Bilibili(VideoExtractor):
index_id = int(re.search(r'index_(\d+)', self.url).group(1))
cid = page_list[index_id-1]['cid'] # change cid match rule
except:
cid = re.search(r'"cid":(\d+)', self.page).group(1)
page = re.search(r'p=(\d+)', self.url)
if page is None:
p = 1
else:
p = int(page.group(1))
cid = re.search(r'"cid":(\d+),"page":%s' % p, self.page).group(1)
if cid is not None:
self.download_by_vid(cid, re.search('bangumi', self.url) is not None, **kwargs)
else:
@ -226,7 +258,7 @@ class Bilibili(VideoExtractor):
api_url = self.live_api.format(self.room_id)
json_data = json.loads(get_content(api_url))
urls = [json_data['durl'][0]['url']]
urls = [json_data['data']['durl'][0]['url']]
self.streams['live'] = {}
self.streams['live']['src'] = urls
@ -252,28 +284,9 @@ class Bilibili(VideoExtractor):
self.streams['vc']['size'] = int(item['video_size'])
def bangumi_entry(self, **kwargs):
bangumi_id = re.search(r'(\d+)', self.url).group(1)
frag = urllib.parse.urlparse(self.url).fragment
if frag:
episode_id = frag
else:
episode_id = re.search(r'first_ep_id\s*=\s*"(\d+)"', self.page) or re.search(r'\/ep(\d+)', self.url).group(1)
# cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data=dict(episode_id=episode_id))
# cid = json.loads(cont)['result']['cid']
cont = get_content('http://bangumi.bilibili.com/web_api/episode/{}.json'.format(episode_id))
ep_info = json.loads(cont)['result']['currentEpisode']
bangumi_data = get_bangumi_info(str(ep_info['seasonId']))
bangumi_payment = bangumi_data.get('payment')
if bangumi_payment and bangumi_payment['price'] != '0':
log.w("It's a paid item")
# ep_ids = collect_bangumi_epids(bangumi_data)
index_title = ep_info['indexTitle']
long_title = ep_info['longTitle'].strip()
cid = ep_info['danmaku']
self.title = '{} [{} {}]'.format(self.title, index_title, long_title)
data = json.loads(re.search(r'__INITIAL_STATE__=(.+);\(function', self.page).group(1))
cid = data['epInfo']['cid']
# index_title = data['epInfo']['index_title']
self.download_by_vid(cid, bangumi=True, **kwargs)
@ -376,10 +389,82 @@ def download_video_from_favlist(url, **kwargs):
else:
log.wtf("Fail to parse the fav title" + url, "")
def download_music_from_favlist(url, page, **kwargs):
m = re.search(r'https?://www.bilibili.com/audio/mycollection/(\d+)', url)
if m is not None:
sid = m.group(1)
json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-coll?"
"sid={}&pn={}&ps=100".format(sid, page)))
if json_result['msg'] == 'success':
music_list = json_result['data']['data']
music_count = len(music_list)
for i in range(music_count):
audio_id = music_list[i]['id']
audio_title = music_list[i]['title']
audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id)
print("Start downloading music ", audio_title)
Bilibili().download_by_url(audio_url, **kwargs)
if page < json_result['data']['pageCount']:
page += 1
download_music_from_favlist(url, page, **kwargs)
else:
log.wtf("Fail to get music list of page " + json_result)
sys.exit(2)
else:
log.wtf("Fail to parse the sid from " + url, "")
def download_video_from_totallist(url, page, **kwargs):
# the url has format: https://space.bilibili.com/64169458/#/video
m = re.search(r'space\.bilibili\.com/(\d+)/.*?video', url)
mid = ""
if m is not None:
mid = m.group(1)
jsonresult = json.loads(get_content("https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=100&tid=0&page={}&keyword=&order=pubdate&jsonp=jsonp".format(mid, page)))
if jsonresult['status']:
videos = jsonresult['data']['vlist']
videocount = len(videos)
for i in range(videocount):
videoid = videos[i]["aid"]
videotitle = videos[i]["title"]
videourl = "https://www.bilibili.com/video/av{}".format(videoid)
print("Start downloading ", videotitle, " video ", videotitle)
Bilibili().download_by_url(videourl, subtitle=videotitle, **kwargs)
if page < jsonresult['data']['pages']:
page += 1
download_video_from_totallist(url, page, **kwargs)
else:
log.wtf("Fail to get the files of page " + jsonresult)
sys.exit(2)
else:
log.wtf("Fail to parse the video title" + url, "")
def download_music_from_totallist(url, page, **kwargs):
m = re.search(r'https?://www.bilibili.com/audio/am(\d+)\?type=\d', url)
if m is not None:
sid = m.group(1)
json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-menu?"
"sid={}&pn={}&ps=100".format(sid, page)))
if json_result['msg'] == 'success':
music_list = json_result['data']['data']
music_count = len(music_list)
for i in range(music_count):
audio_id = music_list[i]['id']
audio_title = music_list[i]['title']
audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id)
print("Start downloading music ",audio_title)
Bilibili().download_by_url(audio_url, **kwargs)
if page < json_result['data']['pageCount']:
page += 1
download_music_from_totallist(url, page, **kwargs)
else:
log.wtf("Fail to get music list of page " + json_result)
sys.exit(2)
else:
log.wtf("Fail to parse the sid from " + url, "")
def bilibili_download_playlist_by_url(url, **kwargs):
url = url_locations([url])[0]
url = url_locations([url], faker=True)[0]
kwargs['playlist'] = True
# a bangumi here? possible?
if 'live.bilibili' in url:
@ -396,6 +481,12 @@ def bilibili_download_playlist_by_url(url, **kwargs):
elif 'favlist' in url:
# this a fav list folder
download_video_from_favlist(url, **kwargs)
elif re.match(r'https?://space.bilibili.com/\d+/#/video', url):
download_video_from_totallist(url, 1, **kwargs)
elif re.match(r'https://www.bilibili.com/audio/mycollection/\d+', url):
download_music_from_favlist(url, 1, **kwargs)
elif re.match(r'https?://www.bilibili.com/audio/am\d+\?type=\d', url):
download_music_from_totallist(url, 1, **kwargs)
else:
aid = re.search(r'av(\d+)', url).group(1)
page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid)))

View File

@ -29,9 +29,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
image_url = edge['node']['display_url']
if 'video_url' in edge['node']:
image_url = edge['node']['video_url']
image_url = image_url.split('?')[0]
ext = image_url.split('.')[-1]
ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])
print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls=[image_url],
@ -44,9 +44,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
image_url =info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
image_url = image_url.split('?')[0]
ext = image_url.split('.')[-1]
ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])
print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls=[image_url],

View File

@ -17,20 +17,20 @@ headers = {
def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
global headers
video_hash=match1(url, r'http://\w+.iwara.tv/videos/(\w+)')
video_url=match1(url, r'(http://\w+.iwara.tv)/videos/\w+')
html = get_content(url,headers=headers)
video_hash = match1(url, r'https?://\w+.iwara.tv/videos/(\w+)')
video_url = match1(url, r'(https?://\w+.iwara.tv)/videos/\w+')
html = get_content(url, headers=headers)
title = r1(r'<title>(.*)</title>', html)
api_url=video_url+'/api/video/'+video_hash
content=get_content(api_url,headers=headers)
data=json.loads(content)
type,ext,size=url_info(data[0]['uri'], headers=headers)
down_urls=data[0]['uri']
print_info(down_urls,title+data[0]['resolution'],type,size)
api_url = video_url + '/api/video/' + video_hash
content = get_content(api_url, headers=headers)
data = json.loads(content)
down_urls = 'https:' + data[0]['uri']
type, ext, size = url_info(down_urls, headers=headers)
print_info(site_info, title+data[0]['resolution'], type, size)
if not info_only:
download_urls([down_urls], title, ext, size, output_dir, merge = merge,headers=headers)
download_urls([down_urls], title, ext, size, output_dir, merge=merge, headers=headers)
site_info = "iwara"
site_info = "Iwara"
download = iwara_download
download_playlist = playlist_not_supported('iwara')

View File

@ -1,14 +1,132 @@
#!/usr/bin/env python
__all__ = ['ixigua_download']
import base64
from .toutiao import download as toutiao_download
from .toutiao import download_playlist as toutiao_download_playlist
import binascii
from ..common import *
import random
import ctypes
from json import loads
__all__ = ['ixigua_download', 'ixigua_download_playlist_by_url']
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 "
"Safari/537.36",
}
def int_overflow(val):
maxint = 2147483647
if not -maxint - 1 <= val <= maxint:
val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
return val
def unsigned_right_shitf(n, i):
if n < 0:
n = ctypes.c_uint32(n).value
if i < 0:
return -int_overflow(n << abs(i))
return int_overflow(n >> i)
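
These two helpers emulate JavaScript's 32-bit integer semantics in Python: `int_overflow` wraps a value into the signed 32-bit range, and `unsigned_right_shitf` mimics the `>>>` operator. A couple of sanity checks (run in the same module):

```
# JS: (-2) >>> 1 === 2147483647
assert unsigned_right_shitf(-2, 1) == 2147483647
# JS bitwise results wrap around to signed 32-bit
assert int_overflow(2147483648) == -2147483648
```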
def get_video_url_from_video_id(video_id):
"""Splicing URLs according to video ID to get video details"""
# from js
data = [""] * 256
for index, _ in enumerate(data):
t = index
for i in range(8):
t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1)
data[index] = t
def tmp():
rand_num = random.random()
path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id,
random_num=str(rand_num)[2:])
e = o = r = -1
i, a = 0, len(path)
while i < a:
e = ord(path[i])
i += 1
if e < 128:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)]
else:
if e < 2048:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
else:
if 55296 <= e < 57344:
e = (1023 & e) + 64
i += 1
o = 1023 & t.url(i)
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))]
else:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0))
while 1:
url = tmp()
if url.split("=")[-1][0] != "-": # 参数s不能为负数
return url
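
The helper above signs the request path with a checksum derived from the table built earlier, retrying until the `s` parameter is non-negative. Called with the video id from the example URL in `ixigua_download` below, it yields a URL of this shape (the query values vary per call):

```
# Illustrative: sign the video-info request for the id used in the example URL below
info_url = get_video_url_from_video_id('6631065141750268420')
# -> https://ib.365yg.com/video/urls/v/1/toutiao/mp4/6631065141750268420?r=<random digits>&s=<checksum>
```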
def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
return toutiao_download(url.replace('ixigua', '365yg'))
# example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422
html = get_html(url, faker=True)
video_id = match1(html, r"videoId\s*:\s*'([^']+)'")
title = match1(html, r"title: '(\S+)',")
if not video_id:
log.e("video_id not found, url:{}".format(url))
return
video_info_url = get_video_url_from_video_id(video_id)
video_info = loads(get_content(video_info_url))
if video_info.get("code", 1) != 0:
log.e("Get video info from {} error: server return code {}".format(video_info_url, video_info.get("code", 1)))
return
if not video_info.get("data", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data or data is empty".format(video_info_url))
return
if not video_info["data"].get("video_list", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data.video_list or data.video_list is empty".format(video_info_url))
return
if not video_info["data"]["video_list"].get("video_1", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data.video_list.video_1 or data.video_list.video_1 is empty".format(video_info_url))
return
size = int(video_info["data"]["video_list"]["video_1"]["size"])
print_info(site_info=site_info, title=title, type="mp4", size=size) # this site only serves mp4 files
if not info_only:
video_url = base64.b64decode(video_info["data"]["video_list"]["video_1"]["main_url"].encode("utf-8"))
download_urls([video_url.decode("utf-8")], title, "mp4", size, output_dir, merge=merge, headers=headers, **kwargs)
def ixigua_download_playlist_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):
assert "user" in url, "Only support users to publish video list,Please provide a similar url:" \
"https://www.ixigua.com/c/user/6907091136/"
user_id = url.split("/")[-2] if url[-1] == "/" else url.split("/")[-1]
params = {"max_behot_time": "0", "max_repin_time": "0", "count": "20", "page_type": "0", "user_id": user_id}
while 1:
url = "https://www.ixigua.com/c/user/article/?" + "&".join(["{}={}".format(k, v) for k, v in params.items()])
video_list = loads(get_content(url, headers=headers))
params["max_behot_time"] = video_list["next"]["max_behot_time"]
for video in video_list["data"]:
ixigua_download("https://www.ixigua.com/i{}/".format(video["item_id"]), output_dir, merge, info_only,
**kwargs)
if video_list["next"]["max_behot_time"] == 0:
break
site_info = "ixigua.com"
download = ixigua_download
download_playlist = toutiao_download_playlist
download_playlist = ixigua_download_playlist_by_url

View File

@ -2,8 +2,17 @@
__all__ = ['lizhi_download']
import json
import datetime
from ..common import *
#
# Worked well but not perfect.
# TODO: add option --format={sd|hd}
#
def get_url(ep):
readable = datetime.datetime.fromtimestamp(int(ep['create_time']) / 1000).strftime('%Y/%m/%d')
return 'http://cdn5.lizhi.fm/audio/{}/{}_hd.mp3'.format(readable, ep['id'])
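
`get_url` builds the CDN path from the episode's `create_time`, a millisecond timestamp, so the directory is the episode's creation date. A sketch with an illustrative episode dict (note that `fromtimestamp` uses the local timezone, so the date can shift by a day depending on locale):

```
# 1545632029000 ms is 2018-12-24 14:13:49 in UTC+8; the id is made up
ep = {'id': 12345678, 'create_time': 1545632029000}
get_url(ep)  # -> 'http://cdn5.lizhi.fm/audio/2018/12/24/12345678_hd.mp3' in a UTC+8 locale
```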
# radio_id: e.g. 549759 from http://www.lizhi.fm/549759/
#
# Returns a list of tuples (audio_id, title, url) for each episode
@ -23,7 +32,7 @@ def lizhi_extract_playlist_info(radio_id):
# (au_cnt), then handle pagination properly.
api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band=%s' % radio_id
api_response = json.loads(get_content(api_url))
return [(ep['id'], ep['name'], ep['url']) for ep in api_response]
return [(ep['id'], ep['name'], get_url(ep)) for ep in api_response]
def lizhi_download_audio(audio_id, title, url, output_dir='.', info_only=False):
filetype, ext, size = url_info(url)

View File

@ -2,9 +2,12 @@
__all__ = ['miaopai_download']
import string
import random
from ..common import *
import urllib.error
import urllib.parse
from ..util import fs
fake_headers_mobile = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@ -20,6 +23,10 @@ def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = Fa
mobile_page = get_content(page_url, headers=fake_headers_mobile)
url = match1(mobile_page, r'<video id=.*?src=[\'"](.*?)[\'"]\W')
if url is None:
wb_mp = re.search(r'<script src=([\'"])(.+?wb_mp\.js)\1>', mobile_page).group(2)
return miaopai_download_by_wbmp(wb_mp, fid, output_dir=output_dir, merge=merge,
info_only=info_only, total_size=None, **kwargs)
title = match1(mobile_page, r'<title>((.|\n)+?)</title>')
if not title:
title = fid
@ -29,14 +36,62 @@ def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = Fa
if not info_only:
download_urls([url], title, ext, total_size=None, output_dir=output_dir, merge=merge)
#----------------------------------------------------------------------
def miaopai_download_by_wbmp(wbmp_url, fid, info_only=False, **kwargs):
headers = {}
headers.update(fake_headers_mobile)
headers['Host'] = 'imgaliyuncdn.miaopai.com'
wbmp = get_content(wbmp_url, headers=headers)
appid = re.search(r'appid:\s*?([^,]+?),', wbmp).group(1)
jsonp = re.search(r'jsonp:\s*?([\'"])(\w+?)\1', wbmp).group(2)
population = [i for i in string.ascii_lowercase] + [i for i in string.digits]
info_url = '{}?{}'.format('http://p.weibo.com/aj_media/info', parse.urlencode({
'appid': appid.strip(),
'fid': fid,
jsonp.strip(): '_jsonp' + ''.join(random.sample(population, 11))
}))
headers['Host'] = 'p.weibo.com'
jsonp_text = get_content(info_url, headers=headers)
jsonp_dict = json.loads(match1(jsonp_text, r'\(({.+})\)'))
if jsonp_dict['code'] != 200:
log.wtf('[Failed] "%s"' % jsonp_dict['msg'])
video_url = jsonp_dict['data']['meta_data'][0]['play_urls']['l']
title = jsonp_dict['data']['description']
title = title.replace('\n', '_')
ext = 'mp4'
headers['Host'] = 'f.us.sinaimg.cn'
print_info(site_info, title, ext, url_info(video_url, headers=headers)[2])
if not info_only:
download_urls([video_url], fs.legitimize(title), ext, headers=headers, **kwargs)
def miaopai_download_direct(url, info_only, **kwargs):
mobile_page = get_content(url, headers=fake_headers_mobile)
try:
title = re.search(r'([\'"])title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
except:
title = re.search(r'([\'"])status_title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
title = title.replace('\n', '_')
stream_url = re.search(r'([\'"])stream_url\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
ext = 'mp4'
print_info(site_info, title, ext, url_info(stream_url, headers=fake_headers_mobile)[2])
if not info_only:
download_urls([stream_url], fs.legitimize(title), ext, total_size=None, headers=fake_headers_mobile, **kwargs)
# ----------------------------------------------------------------------
def miaopai_download(url, output_dir = '.', merge = False, info_only = False, **kwargs):
if match1(url, r'weibo\.com/tv/v/(\w+)'):
return miaopai_download_direct(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
fid = match1(url, r'\?fid=(\d{4}:\w+)')
if fid is not None:
miaopai_download_by_fid(fid, output_dir, merge, info_only)
elif '/p/230444' in url:
fid = match1(url, r'/p/230444(\w+)')
miaopai_download_by_fid('1034:'+fid, output_dir, merge, info_only)
elif re.match(r'^http[s]://weibo\.com/\d+/.+', url):
miaopai_download_direct(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
else:
mobile_page = get_content(url, headers = fake_headers_mobile)
hit = re.search(r'"page_url"\s*:\s*"([^"]+)"', mobile_page)
@ -46,6 +101,7 @@ def miaopai_download(url, output_dir = '.', merge = False, info_only = False, **
escaped_url = hit.group(1)
miaopai_download(urllib.parse.unquote(escaped_url), output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
site_info = "miaopai"
download = miaopai_download
download_playlist = playlist_not_supported('miaopai')

View File

@ -7,31 +7,40 @@ import re
from ..util import log
from ..common import get_content, download_urls, print_info, playlist_not_supported, url_size
from .universal import *
__all__ = ['naver_download_by_url']
def naver_download_by_url(url, info_only=False, **kwargs):
def naver_download_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):
ep = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'
page = get_content(url)
og_video_url = re.search(r"<meta\s+property=\"og:video:url\"\s+content='(.+?)'>", page).group(1)
params_dict = urllib.parse.parse_qs(urllib.parse.urlparse(og_video_url).query)
vid = params_dict['vid'][0]
key = params_dict['outKey'][0]
meta_str = get_content(ep.format(vid, key))
meta_json = json.loads(meta_str)
if 'errorCode' in meta_json:
log.wtf(meta_json['errorCode'])
title = meta_json['meta']['subject']
videos = meta_json['videos']['list']
video_list = sorted(videos, key=lambda video: video['encodingOption']['width'])
video_url = video_list[-1]['source']
# size = video_list[-1]['size']
# result wrong size
size = url_size(video_url)
print_info(site_info, title, 'mp4', size)
if not info_only:
download_urls([video_url], title, 'mp4', size, **kwargs)
try:
temp = re.search(r"<meta\s+property=\"og:video:url\"\s+content='(.+?)'>", page)
if temp is not None:
og_video_url = temp.group(1)
params_dict = urllib.parse.parse_qs(urllib.parse.urlparse(og_video_url).query)
vid = params_dict['vid'][0]
key = params_dict['outKey'][0]
else:
vid = re.search(r"\"videoId\"\s*:\s*\"(.+?)\"", page).group(1)
key = re.search(r"\"inKey\"\s*:\s*\"(.+?)\"", page).group(1)
meta_str = get_content(ep.format(vid, key))
meta_json = json.loads(meta_str)
if 'errorCode' in meta_json:
log.wtf(meta_json['errorCode'])
title = meta_json['meta']['subject']
videos = meta_json['videos']['list']
video_list = sorted(videos, key=lambda video: video['encodingOption']['width'])
video_url = video_list[-1]['source']
# size = video_list[-1]['size']
# result wrong size
size = url_size(video_url)
print_info(site_info, title, 'mp4', size)
if not info_only:
download_urls([video_url], title, 'mp4', size, **kwargs)
except:
universal_download(url, output_dir, merge=merge, info_only=info_only, **kwargs)
site_info = "naver.com"
download = naver_download_by_url

View File

@ -192,14 +192,14 @@ class PPTV(VideoExtractor):
if self.url and not self.vid:
if not re.match(r'http://v.pptv.com/show/(\w+)\.html', self.url):
raise('Unknown url pattern')
page_content = get_content(self.url)
page_content = get_content(self.url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"})
self.vid = match1(page_content, r'webcfg\s*=\s*{"id":\s*(\d+)')
if not self.vid:
raise('Cannot find id')
api_url = 'http://web-play.pptv.com/webplay3-0-{}.xml'.format(self.vid)
api_url += '?appplt=flp&appid=pptv.flashplayer.vod&appver=3.4.2.28&type=&version=4'
dom = parseString(get_content(api_url))
dom = parseString(get_content(api_url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}))
self.title, m_items, m_streams, m_segs = parse_pptv_xml(dom)
xml_streams = merge_meta(m_items, m_streams, m_segs)
for stream_id in xml_streams:

View File

@ -15,9 +15,9 @@ Changelog:
new api
'''
def real_url(host,vid,tvid,new,clipURL,ck):
url = 'http://'+host+'/?prot=9&prod=flash&pt=1&file='+clipURL+'&new='+new +'&key='+ ck+'&vid='+str(vid)+'&uid='+str(int(time.time()*1000))+'&t='+str(random())+'&rb=1'
return json.loads(get_html(url))['url']
def real_url(fileName, key, ch):
url = "https://data.vod.itc.cn/ip?new=" + fileName + "&num=1&key=" + key + "&ch=" + ch + "&pt=1&pg=2&prod=h5n"
return json.loads(get_html(url))['servers'][0]['url']
def sohu_download(url, output_dir = '.', merge = True, info_only = False, extractor_proxy=None, **kwargs):
if re.match(r'http://share.vrs.sohu.com', url):
@ -51,9 +51,8 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False, extrac
title = data['tvName']
size = sum(data['clipsBytes'])
assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])
for new,clip,ck, in zip(data['su'], data['clipsURL'], data['ck']):
clipURL = urlparse(clip).path
urls.append(real_url(host,hqvid,tvid,new,clipURL,ck))
for fileName, key in zip(data['su'], data['ck']):
urls.append(real_url(fileName, key, data['ch']))
# assert data['clipsURL'][0].endswith('.mp4')
else:
@ -66,9 +65,8 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False, extrac
title = data['tvName']
size = sum(map(int,data['clipsBytes']))
assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])
for new,clip,ck, in zip(data['su'], data['clipsURL'], data['ck']):
clipURL = urlparse(clip).path
urls.append(real_url(host,vid,tvid,new,clipURL,ck))
for fileName, key in zip(data['su'], data['ck']):
urls.append(real_url(fileName, key, data['ch']))
print_info(site_info, title, 'mp4', size)
if not info_only:

View File

@ -0,0 +1,23 @@
#!/usr/bin/env python
__all__ = ['tiktok_download']
from ..common import *
def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
html = get_html(url)
title = r1(r'<title>(.*?)</title>', html)
video_id = r1(r'/video/(\d+)', url) or r1(r'musical\?id=(\d+)', html)
title = '%s [%s]' % (title, video_id)
dataText = r1(r'var data = \[(.*)\] ', html) or r1(r'var data = (\{.*\})', html)
data = json.loads(dataText)
source = 'http:' + data['video']['play_addr']['url_list'][0]
mime, ext, size = url_info(source)
print_info(site_info, title, mime, size)
if not info_only:
download_urls([source], title, ext, size, output_dir, merge=merge)
site_info = "TikTok.com"
download = tiktok_download
download_playlist = playlist_not_supported('tiktok')

View File

@ -13,7 +13,29 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
universal_download(url, output_dir, merge=merge, info_only=info_only)
return
html = parse.unquote(get_html(url)).replace('\/', '/')
import ssl
ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
cookie_handler = request.HTTPCookieProcessor()
opener = request.build_opener(ssl_context, cookie_handler)
request.install_opener(opener)
page = get_html(url)
form_key = match1(page, r'id="tumblr_form_key" content="([^"]+)"')
if form_key is not None:
# bypass GDPR consent page
referer = 'https://www.tumblr.com/privacy/consent?redirect=%s' % parse.quote_plus(url)
post_content('https://www.tumblr.com/svc/privacy/consent',
headers={
'Content-Type': 'application/json',
'User-Agent': fake_headers['User-Agent'],
'Referer': referer,
'X-tumblr-form-key': form_key,
'X-Requested-With': 'XMLHttpRequest'
},
post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url)
page = get_html(url, faker=True)
html = parse.unquote(page).replace('\/', '/')
feed = r1(r'<meta property="og:type" content="tumblr-feed:(\w+)" />', html)
if feed in ['photo', 'photoset', 'entry'] or feed is None:
@ -21,23 +43,24 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
page_title = r1(r'<meta name="description" content="([^"\n]+)', html) or \
r1(r'<meta property="og:description" content="([^"\n]+)', html) or \
r1(r'<title>([^<\n]*)', html)
urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.jpg)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.png)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^";]+_\d+\.gif)', html)
urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.jpg)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.png)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^";&]+_\d+\.gif)', html)
tuggles = {}
for url in urls:
filename = parse.unquote(url.split('/')[-1])
hd_url = r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg' # FIXME: decide actual quality
filename = parse.unquote(hd_url.split('/')[-1])
title = '.'.join(filename.split('.')[:-1])
tumblr_id = r1(r'^tumblr_(.+)_\d+$', title)
quality = int(r1(r'^tumblr_.+_(\d+)$', title))
ext = filename.split('.')[-1]
try:
size = int(get_head(url)['Content-Length'])
size = int(get_head(hd_url)['Content-Length'])
if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality:
tuggles[tumblr_id] = {
'title': title,
'url': url,
'url': hd_url,
'quality': quality,
'ext': ext,
'size': size,
@ -99,11 +122,15 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
r1(r'<meta property="og:description" content="([^"]*)" />', html) or
r1(r'<title>([^<\n]*)', html) or url.split("/")[4]).replace('\n', '')
type, ext, size = url_info(real_url)
# this is better
vcode = r1(r'tumblr_(\w+)', real_url)
real_url = 'https://vt.media.tumblr.com/tumblr_%s.mp4' % vcode
type, ext, size = url_info(real_url, faker=True)
print_info(site_info, title, type, size)
if not info_only:
download_urls([real_url], title, ext, size, output_dir, merge = merge)
download_urls([real_url], title, ext, size, output_dir, merge=merge)
site_info = "Tumblr.com"
download = tumblr_download

View File

@ -30,9 +30,9 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs)
return
html = get_html(url, faker=True)
screen_name = r1(r'data-screen-name="([^"]*)"', html) or \
screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \
r1(r'<meta name="twitter:title" content="([^"]*)"', html)
item_id = r1(r'data-item-id="([^"]*)"', html) or \
item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', url) or r1(r'data-item-id="([^"]*)"', html) or \
r1(r'<meta name="twitter:site:id" content="([^"]*)"', html)
page_title = "{} [{}]".format(screen_name, item_id)

View File

@ -67,9 +67,9 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
urls = []
for i in media_exts:
urls += re.findall(r'(https?://[^ ;"\'\\]+' + i + r'[^ ;"\'\\]*)', page)
urls += re.findall(r'(https?://[^ ;&"\'\\]+' + i + r'[^ ;&"\'\\]*)', page)
p_urls = re.findall(r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page)
urls += [parse.unquote(url) for url in p_urls]
q_urls = re.findall(r'(https?:\\\\/\\\\/[^ ;"\']+' + i + r'[^ ;"\']*)', page)
@ -106,6 +106,9 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
title = '%s' % i
i += 1
if r1(r'(https://pinterest.com/pin/)', url):
continue
candies.append({'url': url,
'title': title})

View File

@ -7,6 +7,24 @@ from urllib.parse import urlparse
from json import loads
import re
#----------------------------------------------------------------------
def miaopai_download_by_smid(smid, output_dir = '.', merge = True, info_only = False):
""""""
api_endpoint = 'https://n.miaopai.com/api/aj_media/info.json?smid={smid}'.format(smid = smid)
html = get_content(api_endpoint)
api_content = loads(html)
video_url = api_content['data']['meta_data'][0]['play_urls']['l']
title = api_content['data']['description']
type, ext, size = url_info(video_url)
print_info(site_info, title, type, size)
if not info_only:
download_urls([video_url], title, ext, size, output_dir, merge=merge)
#----------------------------------------------------------------------
def yixia_miaopai_download_by_scid(scid, output_dir = '.', merge = True, info_only = False):
""""""
@ -47,7 +65,11 @@ def yixia_xiaokaxiu_download_by_scid(scid, output_dir = '.', merge = True, info_
def yixia_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
"""wrapper"""
hostname = urlparse(url).hostname
if 'miaopai.com' in hostname: #Miaopai
if 'n.miaopai.com' == hostname:
smid = match1(url, r'n\.miaopai\.com/media/([^.]+)')
miaopai_download_by_smid(smid, output_dir, merge, info_only)
return
elif 'miaopai.com' in hostname: #Miaopai
yixia_download_by_scid = yixia_miaopai_download_by_scid
site_info = "Yixia Miaopai"

View File

@ -78,7 +78,7 @@ class Youku(VideoExtractor):
self.api_error_code = None
self.api_error_msg = None
self.ccode = '0508'
self.ccode = '0590'
# Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js
# grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js
self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND'

View File

@ -37,6 +37,10 @@ class YouTube(VideoExtractor):
]
def decipher(js, s):
# Examples:
# - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js
# - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js
# - https://www.youtube.com/yts/jsbin/player-vfls4aurX/da_DK/base.js
def tr_js(code):
code = re.sub(r'function', r'def', code)
code = re.sub(r'(\W)(as|if|in|is|or)\(', r'\1_\2(', code)
@ -52,11 +56,13 @@ class YouTube(VideoExtractor):
return code
js = js.replace('\n', ' ')
f1 = match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)')
f1 = match1(js, r'\.set\(\w+\.sp,\(0,window\.encodeURIComponent\)\(([$\w]+)') or \
match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \
match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)')
f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \
match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def)
f1def = 'function %s%s' % (f1, f1def)
f1def = 'function main_%s%s' % (f1, f1def) # prefix to avoid potential namespace conflict
code = tr_js(f1def)
f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def))
for f2 in f2s:
@ -73,10 +79,20 @@ class YouTube(VideoExtractor):
f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1)
f1 = re.sub(r'\$', '_dollar', f1)
code = code + 'sig=%s(s)' % f1
code = code + 'sig=main_%s(s)' % f1 # prefix to avoid potential namespace conflict
exec(code, globals(), locals())
return locals()['sig']
def chunk_by_range(url, size):
urls = []
chunk_size = 10485760
start, end = 0, chunk_size - 1
urls.append('%s&range=%s-%s' % (url, start, end))
while end + 1 < size: # processed size < expected size
start, end = end + 1, end + chunk_size
urls.append('%s&range=%s-%s' % (url, start, end))
return urls
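
`chunk_by_range` splits a DASH stream URL into 10 MiB `range=` chunks so each piece can be fetched separately (and written sequentially by the chunked path added to `url_save` in this commit). For a ~25 MB stream it produces three URLs; a sketch with an illustrative base URL:

```
# Illustrative: ranges generated for a 25,000,000-byte stream
urls = YouTube.chunk_by_range('https://example.com/videoplayback?itag=137', 25000000)
# urls[0] ends with '&range=0-10485759'
# urls[1] ends with '&range=10485760-20971519'
# urls[2] ends with '&range=20971520-31457279' (past EOF; the server just returns the remainder)
```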
def get_url_from_vid(vid):
return 'https://youtu.be/{}'.format(vid)
@ -128,7 +144,10 @@ class YouTube(VideoExtractor):
for video in videos:
vid = parse_query_param(video, 'v')
index = parse_query_param(video, 'index')
self.__class__().download_by_url(self.__class__.get_url_from_vid(vid), index=index, **kwargs)
try:
self.__class__().download_by_url(self.__class__.get_url_from_vid(vid), index=index, **kwargs)
except:
pass
def prepare(self, **kwargs):
assert self.url or self.vid
@ -144,7 +163,8 @@ class YouTube(VideoExtractor):
ytplayer_config = None
if 'status' not in video_info:
log.wtf('[Failed] Unknown status.')
log.wtf('[Failed] Unknown status.', exit_code=None)
raise
elif video_info['status'] == ['ok']:
if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']:
self.title = parse.unquote_plus(video_info['title'][0])
@ -176,7 +196,8 @@ class YouTube(VideoExtractor):
ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+});ytplayer', video_page).group(1))
except:
msg = re.search('class="message">([^<]+)<', video_page).group(1)
log.wtf('[Failed] "%s"' % msg.strip())
log.wtf('[Failed] "%s"' % msg.strip(), exit_code=None)
raise
if 'title' in ytplayer_config['args']:
# 150 Restricted from playback on certain sites
@ -185,18 +206,22 @@ class YouTube(VideoExtractor):
self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js']
stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
else:
log.wtf('[Error] The uploader has not made this video available in your country.')
log.wtf('[Error] The uploader has not made this video available in your country.', exit_code=None)
raise
#self.title = re.search('<meta name="title" content="([^"]+)"', video_page).group(1)
#stream_list = []
elif video_info['errorcode'] == ['100']:
log.wtf('[Failed] This video does not exist.', exit_code=int(video_info['errorcode'][0]))
log.wtf('[Failed] This video does not exist.', exit_code=None) #int(video_info['errorcode'][0])
raise
else:
log.wtf('[Failed] %s' % video_info['reason'][0], exit_code=int(video_info['errorcode'][0]))
log.wtf('[Failed] %s' % video_info['reason'][0], exit_code=None) #int(video_info['errorcode'][0])
raise
else:
log.wtf('[Failed] Invalid status.')
log.wtf('[Failed] Invalid status.', exit_code=None)
raise
# YouTube Live
if ytplayer_config and (ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1'):
@ -286,13 +311,15 @@ class YouTube(VideoExtractor):
if not dash_size:
try: dash_size = url_size(dash_url)
except: continue
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size))
self.dash_streams[itag] = {
'quality': '%sx%s' % (w, h),
'itag': itag,
'type': mimeType,
'mime': mimeType,
'container': 'mp4',
'src': [dash_url, dash_mp4_a_url],
'src': [dash_urls, dash_mp4_a_urls],
'size': int(dash_size) + int(dash_mp4_a_size)
}
elif mimeType == 'video/webm':
@ -306,13 +333,15 @@ class YouTube(VideoExtractor):
if not dash_size:
try: dash_size = url_size(dash_url)
except: continue
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
dash_webm_a_urls = self.__class__.chunk_by_range(dash_webm_a_url, int(dash_webm_a_size))
self.dash_streams[itag] = {
'quality': '%sx%s' % (w, h),
'itag': itag,
'type': mimeType,
'mime': mimeType,
'container': 'webm',
'src': [dash_url, dash_webm_a_url],
'src': [dash_urls, dash_webm_a_urls],
'size': int(dash_size) + int(dash_webm_a_size)
}
except:
@ -349,13 +378,15 @@ class YouTube(VideoExtractor):
dash_url += '&signature={}'.format(sig)
dash_size = stream['clen']
itag = stream['itag']
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size))
self.dash_streams[itag] = {
'quality': stream['size'],
'itag': itag,
'type': mimeType,
'mime': mimeType,
'container': 'mp4',
'src': [dash_url, dash_mp4_a_url],
'src': [dash_urls, dash_mp4_a_urls],
'size': int(dash_size) + int(dash_mp4_a_size)
}
elif stream['type'].startswith('video/webm'):
@ -374,13 +405,15 @@ class YouTube(VideoExtractor):
except UnboundLocalError as e:
audio_url = dash_mp4_a_url
audio_size = int(dash_mp4_a_size)
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
audio_urls = self.__class__.chunk_by_range(audio_url, int(audio_size))
self.dash_streams[itag] = {
'quality': stream['size'],
'itag': itag,
'type': mimeType,
'mime': mimeType,
'container': 'webm',
'src': [dash_url, audio_url],
'src': [dash_urls, audio_urls],
'size': int(dash_size) + int(audio_size)
}

View File

@ -37,11 +37,14 @@ def zhibo_download(url, output_dir = '.', merge = True, info_only = False, **kwa
if is_live is not "1":
raise ValueError("The live stream is not online! (Errno:%s)" % is_live)
ourStreamName = r1(r"window.ourStreamName=\'([s\S'\s\.]*)\'\;[\s\S]*window.rtmpDefaultSource", html)
rtmpPollUrl = r1(r"window.rtmpPollUrl=\'([s\S'\s\.]*)\'\;[\s\S]*window.hlsDefaultSource", html)
#real_url = 'rtmp://220.194.213.56/live.zhibo.tv/8live/' + ourStreamName
real_url = rtmpPollUrl + ourStreamName
match = re.search(r"""
ourStreamName .*?
'(.*?)' .*?
rtmpHighSource .*?
'(.*?)' .*?
'(.*?)'
""", html, re.S | re.X)
real_url = match.group(3) + match.group(1) + match.group(2)
print_info(site_info, title, 'flv', float('inf'))
if not info_only:

View File

@ -0,0 +1,79 @@
#!/usr/bin/env python
__all__ = ['zhihu_download', 'zhihu_download_playlist']
from ..common import *
import json
def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
paths = url.split("/")
# question or column
if len(paths) < 3 and len(paths) < 6:
raise TypeError("URL does not conform to specifications, Support column and question only."
"Example URL: https://zhuanlan.zhihu.com/p/51669862 or "
"https://www.zhihu.com/question/267782048/answer/490720324")
if ("question" not in paths or "answer" not in paths) and "zhuanlan.zhihu.com" not in paths:
raise TypeError("URL does not conform to specifications, Support column and question only."
"Example URL: https://zhuanlan.zhihu.com/p/51669862 or "
"https://www.zhihu.com/question/267782048/answer/490720324")
html = get_html(url, faker=True)
title = match1(html, r'data-react-helmet="true">(.*?)</title>')
for index, video_id in enumerate(matchall(html, [r'<a class="video-box" href="\S+video/(\d+)"'])):
try:
video_info = json.loads(
get_content(r"https://lens.zhihu.com/api/videos/{}".format(video_id), headers=fake_headers))
except json.decoder.JSONDecodeError:
log.w("Video id not found:{}".format(video_id))
continue
play_list = video_info["playlist"]
# quality preference: hd, then sd, then ld (low definition);
# if none of them is available, skip this video
data = play_list.get("hd", play_list.get("sd", play_list.get("ld", None)))
if not data:
log.w("Video id No play address:{}".format(video_id))
continue
print_info(site_info, title, data["format"], data["size"])
if not info_only:
ext = "_{}.{}".format(index, data["format"])
if kwargs.get("zhihu_offset"):
ext = "_{}".format(kwargs["zhihu_offset"]) + ext
download_urls([data["play_url"]], title, ext, data["size"],
output_dir=output_dir, merge=merge, **kwargs)
def zhihu_download_playlist(url, output_dir='.', merge=True, info_only=False, **kwargs):
if "question" not in url or "answer" in url: # question page
raise TypeError("URL does not conform to specifications, Support question only."
" Example URL: https://www.zhihu.com/question/267782048")
url = url.split("?")[0]
if url[-1] == "/":
question_id = url.split("/")[-2]
else:
question_id = url.split("/")[-1]
videos_url = r"https://www.zhihu.com/api/v4/questions/{}/answers".format(question_id)
try:
questions = json.loads(get_content(videos_url))
except json.decoder.JSONDecodeError:
raise TypeError("Check whether the problem URL exists.Example URL: https://www.zhihu.com/question/267782048")
count = 0
while 1:
for data in questions["data"]:
kwargs["zhihu_offset"] = count
zhihu_download("https://www.zhihu.com/question/{}/answer/{}".format(question_id, data["id"]),
output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
count += 1
if questions["paging"]["is_end"]:
return
questions = json.loads(get_content(questions["paging"]["next"], headers=fake_headers))
site_info = "zhihu.com"
download = zhihu_download
download_playlist = zhihu_download_playlist

View File

@ -13,6 +13,7 @@ def legitimize(text, os=detect_os()):
ord('|'): '-',
})
# FIXME: do some filesystem detection
if os == 'windows' or os == 'cygwin' or os == 'wsl':
# Windows (non-POSIX namespace)
text = text.translate({
@ -28,6 +29,7 @@ def legitimize(text, os=detect_os()):
ord('>'): '-',
ord('['): '(',
ord(']'): ')',
ord('\t'): ' ',
})
else:
# *nix

View File

@ -96,3 +96,9 @@ def wtf(message, exit_code=1):
print_log(message, RED, BOLD)
if exit_code is not None:
sys.exit(exit_code)
def yes_or_no(message):
ans = str(input('%s (y/N) ' % message)).lower().strip()
if ans == 'y':
return True
return False
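
The new prompt helper treats anything other than an explicit 'y' as a no, which is how the overwrite prompt added to `common.py` uses it. A minimal usage sketch (the message text is illustrative):

```
# Returns True only when the user answers 'y'; Enter or anything else means no
if yes_or_no('Overwrite the existing file?'):
    print('overwriting')
else:
    print('keeping the existing file')
```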

View File

@ -19,9 +19,11 @@ def detect_os():
elif 'linux' in syst:
os = 'linux'
# detect WSL https://github.com/Microsoft/BashOnWindows/issues/423
with open('/proc/version', 'r') as f:
if 'microsoft' in f.read().lower():
os = 'wsl'
try:
with open('/proc/version', 'r') as f:
if 'microsoft' in f.read().lower():
os = 'wsl'
except: pass
elif 'windows' in syst:
os = 'windows'
elif 'bsd' in syst:

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python
script_name = 'you-get'
__version__ = '0.4.1128'
__version__ = '0.4.1193'

View File

@ -25,6 +25,7 @@
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Topic :: Internet",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Multimedia",