Mirror of https://github.com/soimort/you-get.git (synced 2025-02-10 12:12:26 +03:00)
Commit 894e17f108

.travis.yml

@@ -6,11 +6,13 @@ python:
- "3.4"
- "3.5"
- "3.6"
- "3.7-dev"
- "nightly"
- "pypy3"
before_install: pip install flake8
before_install:
- if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then pip install flake8; fi
before_script:
- if [[ $TRAVIS_PYTHON_VERSION != '3.2'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
- if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
script: make test
sudo: false
notifications:

README.md

@@ -113,6 +113,14 @@ You can install `you-get` easily via:
$ brew install you-get
```

### Option 8: pkg (FreeBSD only)

You can install `you-get` easily via:

```
# pkg install you-get
```

### Shell completion

Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](https://github.com/soimort/you-get/tree/develop/contrib/completion). Please consult your shell's manual for how to take advantage of them.

@@ -416,7 +424,9 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| 西瓜视频 | <https://www.ixigua.com/> |✓| | |
| 快手 | <https://www.kuaishou.com/> |✓|✓| |
| 抖音 | <https://www.douyin.com/> |✓| | |
| TikTok | <https://www.tiktok.com/> |✓| | |
| 中国体育(TV) | <http://v.zhibo.tv/> </br><http://video.zhibo.tv/> |✓| | |
| 知乎 | <https://www.zhihu.com/> |✓| | |

For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.

src/you_get/common.py

@@ -102,6 +102,7 @@ SITES = {
'soundcloud' : 'soundcloud',
'ted' : 'ted',
'theplatform' : 'theplatform',
'tiktok' : 'tiktok',
'tucao' : 'tucao',
'tudou' : 'tudou',
'tumblr' : 'tumblr',

@@ -127,6 +128,7 @@ SITES = {
'youtube' : 'youtube',
'zhanqi' : 'zhanqi',
'zhibo' : 'zhibo',
'zhihu' : 'zhihu',
}

dry_run = False

@@ -429,7 +431,7 @@ def get_content(url, headers={}, decoded=True):
# Decode the response body
if decoded:
charset = match1(
response.getheader('Content-Type'), r'charset=([\w-]+)'
response.getheader('Content-Type', ''), r'charset=([\w-]+)'
)
if charset is not None:
data = data.decode(charset)

@@ -439,7 +441,7 @@ def get_content(url, headers={}, decoded=True):
return data


def post_content(url, headers={}, post_data={}, decoded=True):
def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
"""Post the content of a URL via sending a HTTP POST request.

Args:

@@ -450,13 +452,18 @@ def post_content(url, headers={}, post_data={}, decoded=True):
Returns:
The content as a string.
"""

logging.debug('post_content: %s \n post_data: %s' % (url, post_data))
if kwargs.get('post_data_raw'):
logging.debug('post_content: %s\npost_data_raw: %s' % (url, kwargs['post_data_raw']))
else:
logging.debug('post_content: %s\npost_data: %s' % (url, post_data))

req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
if kwargs.get('post_data_raw'):
post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
else:
post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
response = urlopen_with_retry(req, data=post_data_enc)
data = response.read()
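
Note: the `post_data_raw` keyword added above lets a caller send a pre-serialized body (for example a JSON document) instead of a URL-encoded form dict. A minimal sketch of the two call styles, assuming `post_content` is imported from `you_get.common`; the endpoint URL is a placeholder:

```python
# Sketch only: the two calling conventions of post_content().
from you_get.common import post_content

# Existing style: post_data is a dict and gets URL-encoded.
html = post_content('https://example.com/api',
                    post_data={'q': 'hello', 'page': 1})

# New style: post_data_raw is sent as-is (UTF-8 encoded), so the caller
# controls the exact payload -- e.g. the JSON body the Tumblr extractor
# posts for its GDPR-consent request further down in this commit.
html = post_content('https://example.com/api',
                    headers={'Content-Type': 'application/json'},
                    post_data_raw='{"q": "hello", "page": 1}')
```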

@@ -602,7 +609,12 @@ def url_save(
# the key must be 'Referer' for the hack here
if refer is not None:
tmp_headers['Referer'] = refer
if type(url) is list:
file_size = urls_size(url, faker=faker, headers=tmp_headers)
is_chunked, urls = True, url
else:
file_size = url_size(url, faker=faker, headers=tmp_headers)
is_chunked, urls = False, [url]

continue_renameing = True
while continue_renameing:

@@ -612,7 +624,7 @@ def url_save(
if not is_part:
if bar:
bar.done()
print(
log.w(
'Skipping {}: file already exists'.format(
tr(os.path.basename(filepath))
)

@@ -638,7 +650,10 @@ def url_save(
print('Changing name to %s' % tr(os.path.basename(filepath)), '...')
continue_renameing = True
continue
print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
if log.yes_or_no('File with this name already exists. Overwrite?'):
log.w('Overwriting %s ...' % tr(os.path.basename(filepath)))
else:
return
elif not os.path.exists(os.path.dirname(filepath)):
os.mkdir(os.path.dirname(filepath))

@@ -655,6 +670,8 @@ def url_save(
else:
open_mode = 'wb'

for url in urls:
received_chunk = 0
if received < file_size:
if faker:
tmp_headers = fake_headers

@@ -665,7 +682,7 @@ def url_save(
else:
headers = {}
'''
if received:
if received and not is_chunked: # only request a range when not chunked
tmp_headers['Range'] = 'bytes=' + str(received) + '-'
if refer:
tmp_headers['Referer'] = refer

@@ -693,7 +710,9 @@ def url_save(
range_length = int(content_length) if content_length is not None \
else float('inf')

if file_size != received + range_length:
if is_chunked: # always append if chunked
open_mode = 'ab'
elif file_size != received + range_length: # is it ever necessary?
received = 0
if bar:
bar.received = 0

@@ -707,9 +726,12 @@ def url_save(
except socket.timeout:
pass
if not buffer:
if received == file_size: # Download finished
if is_chunked and received_chunk == range_length:
break
elif not is_chunked and received == file_size: # Download finished
break
# Unexpected termination. Retry request
if not is_chunked: # when
tmp_headers['Range'] = 'bytes=' + str(received) + '-'
response = urlopen_with_retry(
request.Request(url, headers=tmp_headers)

@@ -717,6 +739,7 @@ def url_save(
continue
output.write(buffer)
received += len(buffer)
received_chunk += len(buffer)
if bar:
bar.update_received(len(buffer))
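
Note: `url_save` now accepts either a single URL or a list of chunk URLs. For a list, the total size comes from `urls_size`, the `Range` header is skipped, and every chunk is appended to the same output file. The following is a minimal, self-contained sketch of that append-the-chunks idea; it is illustrative only, not you-get's actual implementation:

```python
# Each URL in the list is fetched in order and appended to one output file.
from urllib import request

def save_chunks(urls, filepath):
    received = 0
    with open(filepath, 'wb') as output:
        for url in urls:
            with request.urlopen(request.Request(url)) as response:
                while True:
                    buffer = response.read(256 * 1024)
                    if not buffer:          # this chunk is finished, move on
                        break
                    output.write(buffer)
                    received += len(buffer)
    return received
```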

@@ -907,7 +930,7 @@ def download_urls(
if total_size:
if not force and os.path.exists(output_filepath) and not auto_rename\
and os.path.getsize(output_filepath) >= total_size * 0.9:
print('Skipping %s: file already exists' % output_filepath)
log.w('Skipping %s: file already exists' % output_filepath)
print()
return
bar = SimpleProgressBar(total_size, len(urls))

@@ -1554,9 +1577,9 @@ def google_search(url):
url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
page = get_content(url, headers=fake_headers)
videos = re.findall(
r'<a href="(https?://[^"]+)" onmousedown="[^"]+">([^<]+)<', page
r'<a href="(https?://[^"]+)" onmousedown="[^"]+"><h3 class="[^"]*">([^<]+)<', page
)
vdurs = re.findall(r'<span class="vdur _dwc">([^<]+)<', page)
vdurs = re.findall(r'<span class="vdur[^"]*">([^<]+)<', page)
durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
print('Google Videos search:')
for v in zip(videos, durs):

src/you_get/extractor.py

@@ -211,7 +211,7 @@ class VideoExtractor():
ext = self.dash_streams[stream_id]['container']
total_size = self.dash_streams[stream_id]['size']

if ext == 'm3u8':
if ext == 'm3u8' or ext == 'm4a':
ext = 'mp4'

if not urls:

src/you_get/extractors/__init__.py

@@ -67,6 +67,7 @@ from .sohu import *
from .soundcloud import *
from .suntv import *
from .theplatform import *
from .tiktok import *
from .tucao import *
from .tudou import *
from .tumblr import *

@@ -89,3 +90,4 @@ from .khan import *
from .zhanqi import *
from .kuaishou import *
from .zhibo import *
from .zhihu import *

src/you_get/extractors/acfun.py

@@ -85,9 +85,13 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
_, _, seg_size = url_info(url)
size += seg_size
#fallback to flvhd is not quite possible
print_info(site_info, title, 'mp4', size)
if re.search(r'fid=[0-9A-Z\-]*.flv', preferred[0][0]):
ext = 'flv'
else:
ext = 'mp4'
print_info(site_info, title, ext, size)
if not info_only:
download_urls(preferred[0], title, 'mp4', size, output_dir=output_dir, merge=merge)
download_urls(preferred[0], title, ext, size, output_dir=output_dir, merge=merge)
else:
raise NotImplementedError(sourceType)

@@ -105,27 +109,42 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
pass

def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url)
html = get_content(url)
assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url)

if re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url):
html = get_content(url)
title = r1(r'data-title="([^"]+)"', html)
title = unescape_html(title)
title = escape_file_path(title)
assert title
if match1(url, r'_(\d+)$'): # current P
title = title + " " + r1(r'active">([^<]*)', html)

vid = r1('data-vid="(\d+)"', html)
up = r1('data-name="([^"]+)"', html)
# bangumi
elif re.match("http://[^\.]*\.*acfun\.[^\.]+/bangumi/ab(\d+)", url):
html = get_content(url)
title = match1(html, r'"newTitle"\s*:\s*"([^"]+)"')
if match1(url, r'_(\d+)$'): # current P
title = title + " " + r1(r'active">([^<]*)', html)
vid = match1(html, r'videoId="(\d+)"')
up = "acfun"
else:
raise NotImplemented

assert title and vid
title = unescape_html(title)
title = escape_file_path(title)
p_title = r1('active">([^<]+)', html)
title = '%s (%s)' % (title, up)
if p_title: title = '%s - %s' % (title, p_title)
if p_title:
title = '%s - %s' % (title, p_title)

acfun_download_by_vid(vid, title,
output_dir=output_dir,
merge=merge,
info_only=info_only,
**kwargs)

site_info = "AcFun.tv"
download = acfun_download
download_playlist = playlist_not_supported('acfun')

src/you_get/extractors/baidu.py

@@ -129,8 +129,9 @@ def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=
html = get_html(url)
title = r1(r'title:"([^"]+)"', html)

vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+)"', html) or re.findall(r'vhsrc="([^"]+)"', html)
if vhsrc is not None:
vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+\.mp4)"', html) or \
re.findall(r'vhsrc="([^"]+)"', html)
if len(vhsrc) > 0:
ext = 'mp4'
size = url_size(vhsrc[0])
print_info(site_info, title, ext, size)

src/you_get/extractors/bilibili.py

@@ -22,7 +22,7 @@ from .youku import youku_download_by_vid

class Bilibili(VideoExtractor):
name = 'Bilibili'
live_api = 'http://live.bilibili.com/api/playurl?cid={}&otype=json'
live_api = 'https://api.live.bilibili.com/room/v1/Room/playUrl?cid={}&quality=0&platform=web'
api_url = 'http://interface.bilibili.com/v2/playurl?'
bangumi_api_url = 'http://bangumi.bilibili.com/player/web_api/playurl?'
live_room_init_api_url = 'https://api.live.bilibili.com/room/v1/Room/room_init?id={}'

@@ -115,7 +115,7 @@ class Bilibili(VideoExtractor):
self.url = 'http://www.bilibili.com/video/av{}/'.format(aid)

self.ua = fake_headers['User-Agent']
self.url = url_locations([self.url])[0]
self.url = url_locations([self.url], faker=True)[0]
frag = urllib.parse.urlparse(self.url).fragment
# http://www.bilibili.com/video/av3141144/index_2.html#page=3
if frag:

@@ -125,26 +125,27 @@ class Bilibili(VideoExtractor):
aid = re.search(r'av(\d+)', self.url).group(1)
self.url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, page)
self.referer = self.url
self.page = get_content(self.url)
self.page = get_content(self.url, headers=fake_headers)

m = re.search(r'<h1.*?>(.*?)</h1>', self.page) or re.search(r'<h1 title="([^"]+)">', self.page)
if m is not None:
self.title = m.group(1)
s = re.search(r'<span>([^<]+)</span>', m.group(1))
s = re.search(r'<span.*?>([^<]+)</span>', m.group(1))
if s:
self.title = unescape_html(s.group(1))
if self.title is None:
m = re.search(r'property="og:title" content="([^"]+)"', self.page)
if m is not None:
self.title = m.group(1)

if 'subtitle' in kwargs:
subtitle = kwargs['subtitle']
self.title = '{} {}'.format(self.title, subtitle)
else:
playinfo = re.search(r'__INITIAL_STATE__=(.*?);\(function\(\)', self.page)
if playinfo is not None:
pages = json.loads(playinfo.group(1))['videoData']['pages']
jsonPlayinfo = json.loads(playinfo.group(1))
if 'videoData' in jsonPlayinfo:
pages = jsonPlayinfo['videoData']['pages']
if len(pages) > 1:
qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query))
page = pages[int(qs.get('p', 1)) - 1]

@@ -160,6 +161,8 @@ class Bilibili(VideoExtractor):
self.live_entry(**kwargs)
elif 'vc.bilibili.com' in self.url:
self.vc_entry(**kwargs)
elif 'audio/au' in self.url:
self.audio_entry(**kwargs)
else:
self.entry(**kwargs)

@@ -171,6 +174,30 @@ class Bilibili(VideoExtractor):
self.title = page_list[0]['pagename']
self.download_by_vid(page_list[0]['cid'], True, bangumi_movie=True, **kwargs)

def audio_entry(self, **kwargs):
assert re.match(r'https?://www.bilibili.com/audio/au\d+', self.url)
patt = r"(\d+)"
audio_id = re.search(patt, self.url).group(1)
audio_info_url = \
'https://www.bilibili.com/audio/music-service-c/web/song/info?sid={}'.format(audio_id)
audio_info_response = json.loads(get_content(audio_info_url))
if audio_info_response['msg'] != 'success':
log.wtf('fetch audio information failed!')
sys.exit(2)
self.title = audio_info_response['data']['title']
# TODO:there is no quality option for now
audio_download_url = \
'https://www.bilibili.com/audio/music-service-c/web/url?sid={}&privilege=2&quality=2'.format(audio_id)
audio_download_response = json.loads(get_content(audio_download_url))
if audio_download_response['msg'] != 'success':
log.wtf('fetch audio resource failed!')
sys.exit(2)
self.streams['mp4'] = {}
self.streams['mp4']['src'] = [audio_download_response['data']['cdns'][0]]
self.streams['mp4']['container'] = 'm4a'
self.streams['mp4']['size'] = audio_download_response['data']['size']
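
Note: the new `audio_entry` above resolves a bilibili audio page in two requests -- the song-info endpoint for the title, then the url endpoint for a CDN address and size. A stripped-down, standalone sketch of that flow using the same endpoints (error handling omitted; illustrative only):

```python
import json
from urllib.request import urlopen

def fetch_bilibili_audio(audio_id):
    # Step 1: song metadata (title); step 2: playable CDN URL and size.
    info_url = ('https://www.bilibili.com/audio/music-service-c/web/song/info?sid={}'
                .format(audio_id))
    play_url = ('https://www.bilibili.com/audio/music-service-c/web/url'
                '?sid={}&privilege=2&quality=2'.format(audio_id))
    info = json.loads(urlopen(info_url).read().decode('utf-8'))
    play = json.loads(urlopen(play_url).read().decode('utf-8'))
    return {
        'title': info['data']['title'],
        'src': play['data']['cdns'][0],   # first CDN mirror, as audio_entry uses
        'size': play['data']['size'],
    }
```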

def entry(self, **kwargs):
# tencent player
tc_flashvars = re.search(r'"bili-cid=\d+&bili-aid=\d+&vid=([^"]+)"', self.page)

@@ -190,7 +217,12 @@ class Bilibili(VideoExtractor):
index_id = int(re.search(r'index_(\d+)', self.url).group(1))
cid = page_list[index_id-1]['cid'] # change cid match rule
except:
cid = re.search(r'"cid":(\d+)', self.page).group(1)
page = re.search(r'p=(\d+)', self.url)
if page is None:
p = 1
else:
p = int(page.group(1))
cid = re.search(r'"cid":(\d+),"page":%s' % p, self.page).group(1)
if cid is not None:
self.download_by_vid(cid, re.search('bangumi', self.url) is not None, **kwargs)
else:

@@ -226,7 +258,7 @@ class Bilibili(VideoExtractor):

api_url = self.live_api.format(self.room_id)
json_data = json.loads(get_content(api_url))
urls = [json_data['durl'][0]['url']]
urls = [json_data['data']['durl'][0]['url']]

self.streams['live'] = {}
self.streams['live']['src'] = urls

@@ -252,28 +284,9 @@ class Bilibili(VideoExtractor):
self.streams['vc']['size'] = int(item['video_size'])

def bangumi_entry(self, **kwargs):
bangumi_id = re.search(r'(\d+)', self.url).group(1)
frag = urllib.parse.urlparse(self.url).fragment
if frag:
episode_id = frag
else:
episode_id = re.search(r'first_ep_id\s*=\s*"(\d+)"', self.page) or re.search(r'\/ep(\d+)', self.url).group(1)
# cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data=dict(episode_id=episode_id))
# cid = json.loads(cont)['result']['cid']
cont = get_content('http://bangumi.bilibili.com/web_api/episode/{}.json'.format(episode_id))
ep_info = json.loads(cont)['result']['currentEpisode']

bangumi_data = get_bangumi_info(str(ep_info['seasonId']))
bangumi_payment = bangumi_data.get('payment')
if bangumi_payment and bangumi_payment['price'] != '0':
log.w("It's a paid item")
# ep_ids = collect_bangumi_epids(bangumi_data)

index_title = ep_info['indexTitle']
long_title = ep_info['longTitle'].strip()
cid = ep_info['danmaku']

self.title = '{} [{} {}]'.format(self.title, index_title, long_title)
data = json.loads(re.search(r'__INITIAL_STATE__=(.+);\(function', self.page).group(1))
cid = data['epInfo']['cid']
# index_title = data['epInfo']['index_title']
self.download_by_vid(cid, bangumi=True, **kwargs)

@@ -376,10 +389,82 @@ def download_video_from_favlist(url, **kwargs):

else:
log.wtf("Fail to parse the fav title" + url, "")
def download_music_from_favlist(url, page, **kwargs):
m = re.search(r'https?://www.bilibili.com/audio/mycollection/(\d+)', url)
if m is not None:
sid = m.group(1)
json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-coll?"
"sid={}&pn={}&ps=100".format(sid, page)))
if json_result['msg'] == 'success':
music_list = json_result['data']['data']
music_count = len(music_list)
for i in range(music_count):
audio_id = music_list[i]['id']
audio_title = music_list[i]['title']
audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id)
print("Start downloading music ", audio_title)
Bilibili().download_by_url(audio_url, **kwargs)
if page < json_result['data']['pageCount']:
page += 1
download_music_from_favlist(url, page, **kwargs)
else:
log.wtf("Fail to get music list of page " + json_result)
sys.exit(2)
else:
log.wtf("Fail to parse the sid from " + url, "")

def download_video_from_totallist(url, page, **kwargs):
# the url has format: https://space.bilibili.com/64169458/#/video
m = re.search(r'space\.bilibili\.com/(\d+)/.*?video', url)
mid = ""
if m is not None:
mid = m.group(1)
jsonresult = json.loads(get_content("https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=100&tid=0&page={}&keyword=&order=pubdate&jsonp=jsonp".format(mid, page)))
if jsonresult['status']:
videos = jsonresult['data']['vlist']
videocount = len(videos)
for i in range(videocount):
videoid = videos[i]["aid"]
videotitle = videos[i]["title"]
videourl = "https://www.bilibili.com/video/av{}".format(videoid)
print("Start downloading ", videotitle, " video ", videotitle)
Bilibili().download_by_url(videourl, subtitle=videotitle, **kwargs)
if page < jsonresult['data']['pages']:
page += 1
download_video_from_totallist(url, page, **kwargs)
else:
log.wtf("Fail to get the files of page " + jsonresult)
sys.exit(2)

else:
log.wtf("Fail to parse the video title" + url, "")

def download_music_from_totallist(url, page, **kwargs):
m = re.search(r'https?://www.bilibili.com/audio/am(\d+)\?type=\d', url)
if m is not None:
sid = m.group(1)
json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-menu?"
"sid={}&pn={}&ps=100".format(sid, page)))
if json_result['msg'] == 'success':
music_list = json_result['data']['data']
music_count = len(music_list)
for i in range(music_count):
audio_id = music_list[i]['id']
audio_title = music_list[i]['title']
audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id)
print("Start downloading music ",audio_title)
Bilibili().download_by_url(audio_url, **kwargs)
if page < json_result['data']['pageCount']:
page += 1
download_music_from_totallist(url, page, **kwargs)
else:
log.wtf("Fail to get music list of page " + json_result)
sys.exit(2)
else:
log.wtf("Fail to parse the sid from " + url, "")

def bilibili_download_playlist_by_url(url, **kwargs):
url = url_locations([url])[0]
url = url_locations([url], faker=True)[0]
kwargs['playlist'] = True
# a bangumi here? possible?
if 'live.bilibili' in url:

@@ -396,6 +481,12 @@ def bilibili_download_playlist_by_url(url, **kwargs):
elif 'favlist' in url:
# this a fav list folder
download_video_from_favlist(url, **kwargs)
elif re.match(r'https?://space.bilibili.com/\d+/#/video', url):
download_video_from_totallist(url, 1, **kwargs)
elif re.match(r'https://www.bilibili.com/audio/mycollection/\d+', url):
download_music_from_favlist(url, 1, **kwargs)
elif re.match(r'https?://www.bilibili.com/audio/am\d+\?type=\d', url):
download_music_from_totallist(url, 1, **kwargs)
else:
aid = re.search(r'av(\d+)', url).group(1)
page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid)))

src/you_get/extractors/instagram.py

@@ -29,9 +29,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
image_url = edge['node']['display_url']
if 'video_url' in edge['node']:
image_url = edge['node']['video_url']
image_url = image_url.split('?')[0]
ext = image_url.split('.')[-1]
ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])

print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls=[image_url],

@@ -44,9 +44,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
image_url =info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
image_url = image_url.split('?')[0]
ext = image_url.split('.')[-1]
ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])

print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls=[image_url],

src/you_get/extractors/iwara.py

@@ -17,20 +17,20 @@ headers = {

def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
global headers
video_hash=match1(url, r'http://\w+.iwara.tv/videos/(\w+)')
video_url=match1(url, r'(http://\w+.iwara.tv)/videos/\w+')
html = get_content(url,headers=headers)
video_hash = match1(url, r'https?://\w+.iwara.tv/videos/(\w+)')
video_url = match1(url, r'(https?://\w+.iwara.tv)/videos/\w+')
html = get_content(url, headers=headers)
title = r1(r'<title>(.*)</title>', html)
api_url=video_url+'/api/video/'+video_hash
content=get_content(api_url,headers=headers)
data=json.loads(content)
type,ext,size=url_info(data[0]['uri'], headers=headers)
down_urls=data[0]['uri']
print_info(down_urls,title+data[0]['resolution'],type,size)
api_url = video_url + '/api/video/' + video_hash
content = get_content(api_url, headers=headers)
data = json.loads(content)
down_urls = 'https:' + data[0]['uri']
type, ext, size = url_info(down_urls, headers=headers)
print_info(site_info, title+data[0]['resolution'], type, size)

if not info_only:
download_urls([down_urls], title, ext, size, output_dir, merge = merge,headers=headers)
download_urls([down_urls], title, ext, size, output_dir, merge=merge, headers=headers)

site_info = "iwara"
site_info = "Iwara"
download = iwara_download
download_playlist = playlist_not_supported('iwara')

src/you_get/extractors/ixigua.py

@@ -1,14 +1,132 @@
#!/usr/bin/env python
__all__ = ['ixigua_download']
import base64

from .toutiao import download as toutiao_download
from .toutiao import download_playlist as toutiao_download_playlist
import binascii

from ..common import *
import random
import ctypes
from json import loads

__all__ = ['ixigua_download', 'ixigua_download_playlist_by_url']

headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 "
"Safari/537.36",
}


def int_overflow(val):
maxint = 2147483647
if not -maxint - 1 <= val <= maxint:
val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
return val


def unsigned_right_shitf(n, i):
if n < 0:
n = ctypes.c_uint32(n).value
if i < 0:
return -int_overflow(n << abs(i))
return int_overflow(n >> i)


def get_video_url_from_video_id(video_id):
"""Splicing URLs according to video ID to get video details"""
# from js
data = [""] * 256
for index, _ in enumerate(data):
t = index
for i in range(8):
t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1)
data[index] = t

def tmp():
rand_num = random.random()
path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id,
random_num=str(rand_num)[2:])
e = o = r = -1
i, a = 0, len(path)
while i < a:
e = ord(path[i])
i += 1
if e < 128:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)]
else:
if e < 2048:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
else:
if 55296 <= e < 57344:
e = (1023 & e) + 64
i += 1
o = 1023 & t.url(i)
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))]
else:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]

return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0))

while 1:
url = tmp()
if url.split("=")[-1][0] != "-": # 参数s不能为负数
return url
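
Note: `int_overflow` and `unsigned_right_shitf` (the misspelling is the actual identifier) emulate JavaScript's 32-bit signed arithmetic and its `>>>` operator, so that `get_video_url_from_video_id` can reproduce the page's own checksum for the `s` query parameter. Because the final shift-by-zero is folded back into signed range (unlike a real `>>>`), the signature can come out negative, which appears to be why the loop above retries until it is not (the Chinese comment, 参数s不能为负数, means "the s parameter must not be negative"). A small standalone check of the emulation, repeating the two helpers so the snippet runs on its own:

```python
import ctypes

def int_overflow(val):
    # fold into the signed 32-bit range, like JavaScript bitwise results
    maxint = 2147483647
    if not -maxint - 1 <= val <= maxint:
        val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
    return val

def unsigned_right_shitf(n, i):
    # emulate JS "n >>> i" by viewing n as an unsigned 32-bit value first
    if n < 0:
        n = ctypes.c_uint32(n).value
    if i < 0:
        return -int_overflow(n << abs(i))
    return int_overflow(n >> i)

print(unsigned_right_shitf(-306674912, 1))  # 1994146192, same as JS -306674912 >>> 1
print(unsigned_right_shitf(-1, 0))          # -1 here, whereas JS -1 >>> 0 is 4294967295
```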

def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
return toutiao_download(url.replace('ixigua', '365yg'))
# example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422
html = get_html(url, faker=True)
video_id = match1(html, r"videoId\s*:\s*'([^']+)'")
title = match1(html, r"title: '(\S+)',")
if not video_id:
log.e("video_id not found, url:{}".format(url))
return
video_info_url = get_video_url_from_video_id(video_id)
video_info = loads(get_content(video_info_url))
if video_info.get("code", 1) != 0:
log.e("Get video info from {} error: server return code {}".format(video_info_url, video_info.get("code", 1)))
return
if not video_info.get("data", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data or data is empty".format(video_info_url))
return
if not video_info["data"].get("video_list", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data.video_list or data.video_list is empty".format(video_info_url))
return
if not video_info["data"]["video_list"].get("video_1", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data.video_list.video_1 or data.video_list.video_1 is empty".format(video_info_url))
return
size = int(video_info["data"]["video_list"]["video_1"]["size"])
print_info(site_info=site_info, title=title, type="mp4", size=size) # 该网站只有mp4类型文件
if not info_only:
video_url = base64.b64decode(video_info["data"]["video_list"]["video_1"]["main_url"].encode("utf-8"))
download_urls([video_url.decode("utf-8")], title, "mp4", size, output_dir, merge=merge, headers=headers, **kwargs)


def ixigua_download_playlist_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):
assert "user" in url, "Only support users to publish video list,Please provide a similar url:" \
"https://www.ixigua.com/c/user/6907091136/"

user_id = url.split("/")[-2] if url[-1] == "/" else url.split("/")[-1]
params = {"max_behot_time": "0", "max_repin_time": "0", "count": "20", "page_type": "0", "user_id": user_id}
while 1:
url = "https://www.ixigua.com/c/user/article/?" + "&".join(["{}={}".format(k, v) for k, v in params.items()])
video_list = loads(get_content(url, headers=headers))
params["max_behot_time"] = video_list["next"]["max_behot_time"]
for video in video_list["data"]:
ixigua_download("https://www.ixigua.com/i{}/".format(video["item_id"]), output_dir, merge, info_only,
**kwargs)
if video_list["next"]["max_behot_time"] == 0:
break


site_info = "ixigua.com"
download = ixigua_download
download_playlist = toutiao_download_playlist
download_playlist = ixigua_download_playlist_by_url

src/you_get/extractors/lizhi.py

@@ -2,8 +2,17 @@

__all__ = ['lizhi_download']
import json
import datetime
from ..common import *

#
# Worked well but not perfect.
# TODO: add option --format={sd|hd}
#
def get_url(ep):
readable = datetime.datetime.fromtimestamp(int(ep['create_time']) / 1000).strftime('%Y/%m/%d')
return 'http://cdn5.lizhi.fm/audio/{}/{}_hd.mp3'.format(readable, ep['id'])

# radio_id: e.g. 549759 from http://www.lizhi.fm/549759/
#
# Returns a list of tuples (audio_id, title, url) for each episode

@@ -23,7 +32,7 @@ def lizhi_extract_playlist_info(radio_id):
# (au_cnt), then handle pagination properly.
api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band=%s' % radio_id
api_response = json.loads(get_content(api_url))
return [(ep['id'], ep['name'], ep['url']) for ep in api_response]
return [(ep['id'], ep['name'], get_url(ep)) for ep in api_response]

def lizhi_download_audio(audio_id, title, url, output_dir='.', info_only=False):
filetype, ext, size = url_info(url)
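
Note: the new `get_url` helper above rebuilds each episode's CDN URL from its `create_time` (a millisecond timestamp) and `id` instead of trusting the API's `url` field. A small illustration; the episode record below is hypothetical and the printed date depends on the local timezone:

```python
import datetime

def get_url(ep):
    # create_time is in milliseconds; the CDN path embeds the creation date
    readable = datetime.datetime.fromtimestamp(int(ep['create_time']) / 1000).strftime('%Y/%m/%d')
    return 'http://cdn5.lizhi.fm/audio/{}/{}_hd.mp3'.format(readable, ep['id'])

ep = {'create_time': '1540000000000', 'id': 2758952290348550150}  # made-up values
print(get_url(ep))  # e.g. http://cdn5.lizhi.fm/audio/2018/10/20/2758952290348550150_hd.mp3
```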

src/you_get/extractors/miaopai.py

@@ -2,9 +2,12 @@

__all__ = ['miaopai_download']

import string
import random
from ..common import *
import urllib.error
import urllib.parse
from ..util import fs

fake_headers_mobile = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

@@ -20,6 +23,10 @@ def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = Fa

mobile_page = get_content(page_url, headers=fake_headers_mobile)
url = match1(mobile_page, r'<video id=.*?src=[\'"](.*?)[\'"]\W')
if url is None:
wb_mp = re.search(r'<script src=([\'"])(.+?wb_mp\.js)\1>', mobile_page).group(2)
return miaopai_download_by_wbmp(wb_mp, fid, output_dir=output_dir, merge=merge,
info_only=info_only, total_size=None, **kwargs)
title = match1(mobile_page, r'<title>((.|\n)+?)</title>')
if not title:
title = fid

@@ -29,14 +36,62 @@ def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = Fa
if not info_only:
download_urls([url], title, ext, total_size=None, output_dir=output_dir, merge=merge)

#----------------------------------------------------------------------

def miaopai_download_by_wbmp(wbmp_url, fid, info_only=False, **kwargs):
headers = {}
headers.update(fake_headers_mobile)
headers['Host'] = 'imgaliyuncdn.miaopai.com'
wbmp = get_content(wbmp_url, headers=headers)
appid = re.search(r'appid:\s*?([^,]+?),', wbmp).group(1)
jsonp = re.search(r'jsonp:\s*?([\'"])(\w+?)\1', wbmp).group(2)
population = [i for i in string.ascii_lowercase] + [i for i in string.digits]
info_url = '{}?{}'.format('http://p.weibo.com/aj_media/info', parse.urlencode({
'appid': appid.strip(),
'fid': fid,
jsonp.strip(): '_jsonp' + ''.join(random.sample(population, 11))
}))
headers['Host'] = 'p.weibo.com'
jsonp_text = get_content(info_url, headers=headers)
jsonp_dict = json.loads(match1(jsonp_text, r'\(({.+})\)'))
if jsonp_dict['code'] != 200:
log.wtf('[Failed] "%s"' % jsonp_dict['msg'])
video_url = jsonp_dict['data']['meta_data'][0]['play_urls']['l']
title = jsonp_dict['data']['description']
title = title.replace('\n', '_')
ext = 'mp4'
headers['Host'] = 'f.us.sinaimg.cn'
print_info(site_info, title, ext, url_info(video_url, headers=headers)[2])
if not info_only:
download_urls([video_url], fs.legitimize(title), ext, headers=headers, **kwargs)


def miaopai_download_direct(url, info_only, **kwargs):
mobile_page = get_content(url, headers=fake_headers_mobile)
try:
title = re.search(r'([\'"])title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
except:
title = re.search(r'([\'"])status_title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
title = title.replace('\n', '_')
stream_url = re.search(r'([\'"])stream_url\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
ext = 'mp4'
print_info(site_info, title, ext, url_info(stream_url, headers=fake_headers_mobile)[2])
if not info_only:
download_urls([stream_url], fs.legitimize(title), ext, total_size=None, headers=fake_headers_mobile, **kwargs)


# ----------------------------------------------------------------------
def miaopai_download(url, output_dir = '.', merge = False, info_only = False, **kwargs):
if match1(url, r'weibo\.com/tv/v/(\w+)'):
return miaopai_download_direct(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)

fid = match1(url, r'\?fid=(\d{4}:\w+)')
if fid is not None:
miaopai_download_by_fid(fid, output_dir, merge, info_only)
elif '/p/230444' in url:
fid = match1(url, r'/p/230444(\w+)')
miaopai_download_by_fid('1034:'+fid, output_dir, merge, info_only)
elif re.match(r'^http[s]://weibo\.com/\d+/.+', url):
miaopai_download_direct(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
else:
mobile_page = get_content(url, headers = fake_headers_mobile)
hit = re.search(r'"page_url"\s*:\s*"([^"]+)"', mobile_page)

@@ -46,6 +101,7 @@ def miaopai_download(url, output_dir = '.', merge = False, info_only = False, **
escaped_url = hit.group(1)
miaopai_download(urllib.parse.unquote(escaped_url), output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)

site_info = "miaopai"
download = miaopai_download
download_playlist = playlist_not_supported('miaopai')

src/you_get/extractors/naver.py

@@ -7,17 +7,24 @@ import re

from ..util import log
from ..common import get_content, download_urls, print_info, playlist_not_supported, url_size
from .universal import *

__all__ = ['naver_download_by_url']


def naver_download_by_url(url, info_only=False, **kwargs):
def naver_download_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):
ep = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'
page = get_content(url)
og_video_url = re.search(r"<meta\s+property=\"og:video:url\"\s+content='(.+?)'>", page).group(1)
try:
temp = re.search(r"<meta\s+property=\"og:video:url\"\s+content='(.+?)'>", page)
if temp is not None:
og_video_url = temp.group(1)
params_dict = urllib.parse.parse_qs(urllib.parse.urlparse(og_video_url).query)
vid = params_dict['vid'][0]
key = params_dict['outKey'][0]
else:
vid = re.search(r"\"videoId\"\s*:\s*\"(.+?)\"", page).group(1)
key = re.search(r"\"inKey\"\s*:\s*\"(.+?)\"", page).group(1)
meta_str = get_content(ep.format(vid, key))
meta_json = json.loads(meta_str)
if 'errorCode' in meta_json:

@@ -32,6 +39,8 @@ def naver_download_by_url(url, info_only=False, **kwargs):
print_info(site_info, title, 'mp4', size)
if not info_only:
download_urls([video_url], title, 'mp4', size, **kwargs)
except:
universal_download(url, output_dir, merge=merge, info_only=info_only, **kwargs)

site_info = "naver.com"
download = naver_download_by_url

src/you_get/extractors/pptv.py

@@ -192,14 +192,14 @@ class PPTV(VideoExtractor):
if self.url and not self.vid:
if not re.match(r'http://v.pptv.com/show/(\w+)\.html', self.url):
raise('Unknown url pattern')
page_content = get_content(self.url)
page_content = get_content(self.url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"})
self.vid = match1(page_content, r'webcfg\s*=\s*{"id":\s*(\d+)')

if not self.vid:
raise('Cannot find id')
api_url = 'http://web-play.pptv.com/webplay3-0-{}.xml'.format(self.vid)
api_url += '?appplt=flp&appid=pptv.flashplayer.vod&appver=3.4.2.28&type=&version=4'
dom = parseString(get_content(api_url))
dom = parseString(get_content(api_url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}))
self.title, m_items, m_streams, m_segs = parse_pptv_xml(dom)
xml_streams = merge_meta(m_items, m_streams, m_segs)
for stream_id in xml_streams:

src/you_get/extractors/sohu.py

@@ -15,9 +15,9 @@ Changelog:
new api
'''

def real_url(host,vid,tvid,new,clipURL,ck):
url = 'http://'+host+'/?prot=9&prod=flash&pt=1&file='+clipURL+'&new='+new +'&key='+ ck+'&vid='+str(vid)+'&uid='+str(int(time.time()*1000))+'&t='+str(random())+'&rb=1'
return json.loads(get_html(url))['url']
def real_url(fileName, key, ch):
url = "https://data.vod.itc.cn/ip?new=" + fileName + "&num=1&key=" + key + "&ch=" + ch + "&pt=1&pg=2&prod=h5n"
return json.loads(get_html(url))['servers'][0]['url']

def sohu_download(url, output_dir = '.', merge = True, info_only = False, extractor_proxy=None, **kwargs):
if re.match(r'http://share.vrs.sohu.com', url):

@@ -51,9 +51,8 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False, extrac
title = data['tvName']
size = sum(data['clipsBytes'])
assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])
for new,clip,ck, in zip(data['su'], data['clipsURL'], data['ck']):
clipURL = urlparse(clip).path
urls.append(real_url(host,hqvid,tvid,new,clipURL,ck))
for fileName, key in zip(data['su'], data['ck']):
urls.append(real_url(fileName, key, data['ch']))
# assert data['clipsURL'][0].endswith('.mp4')

else:

@@ -66,9 +65,8 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False, extrac
title = data['tvName']
size = sum(map(int,data['clipsBytes']))
assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])
for new,clip,ck, in zip(data['su'], data['clipsURL'], data['ck']):
clipURL = urlparse(clip).path
urls.append(real_url(host,vid,tvid,new,clipURL,ck))
for fileName, key in zip(data['su'], data['ck']):
urls.append(real_url(fileName, key, data['ch']))

print_info(site_info, title, 'mp4', size)
if not info_only:

src/you_get/extractors/tiktok.py (new file)

@@ -0,0 +1,23 @@
#!/usr/bin/env python

__all__ = ['tiktok_download']

from ..common import *

def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
html = get_html(url)
title = r1(r'<title>(.*?)</title>', html)
video_id = r1(r'/video/(\d+)', url) or r1(r'musical\?id=(\d+)', html)
title = '%s [%s]' % (title, video_id)
dataText = r1(r'var data = \[(.*)\] ', html) or r1(r'var data = (\{.*\})', html)
data = json.loads(dataText)
source = 'http:' + data['video']['play_addr']['url_list'][0]
mime, ext, size = url_info(source)

print_info(site_info, title, mime, size)
if not info_only:
download_urls([source], title, ext, size, output_dir, merge=merge)

site_info = "TikTok.com"
download = tiktok_download
download_playlist = playlist_not_supported('tiktok')

src/you_get/extractors/tumblr.py

@@ -13,7 +13,29 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
universal_download(url, output_dir, merge=merge, info_only=info_only)
return

html = parse.unquote(get_html(url)).replace('\/', '/')
import ssl
ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
cookie_handler = request.HTTPCookieProcessor()
opener = request.build_opener(ssl_context, cookie_handler)
request.install_opener(opener)

page = get_html(url)
form_key = match1(page, r'id="tumblr_form_key" content="([^"]+)"')
if form_key is not None:
# bypass GDPR consent page
referer = 'https://www.tumblr.com/privacy/consent?redirect=%s' % parse.quote_plus(url)
post_content('https://www.tumblr.com/svc/privacy/consent',
headers={
'Content-Type': 'application/json',
'User-Agent': fake_headers['User-Agent'],
'Referer': referer,
'X-tumblr-form-key': form_key,
'X-Requested-With': 'XMLHttpRequest'
},
post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url)
page = get_html(url, faker=True)

html = parse.unquote(page).replace('\/', '/')
feed = r1(r'<meta property="og:type" content="tumblr-feed:(\w+)" />', html)

if feed in ['photo', 'photoset', 'entry'] or feed is None:

@@ -21,23 +43,24 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
page_title = r1(r'<meta name="description" content="([^"\n]+)', html) or \
r1(r'<meta property="og:description" content="([^"\n]+)', html) or \
r1(r'<title>([^<\n]*)', html)
urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.jpg)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.png)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^";]+_\d+\.gif)', html)
urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.jpg)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.png)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^";&]+_\d+\.gif)', html)

tuggles = {}
for url in urls:
filename = parse.unquote(url.split('/')[-1])
hd_url = r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg' # FIXME: decide actual quality
filename = parse.unquote(hd_url.split('/')[-1])
title = '.'.join(filename.split('.')[:-1])
tumblr_id = r1(r'^tumblr_(.+)_\d+$', title)
quality = int(r1(r'^tumblr_.+_(\d+)$', title))
ext = filename.split('.')[-1]
try:
size = int(get_head(url)['Content-Length'])
size = int(get_head(hd_url)['Content-Length'])
if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality:
tuggles[tumblr_id] = {
'title': title,
'url': url,
'url': hd_url,
'quality': quality,
'ext': ext,
'size': size,

@@ -99,11 +122,15 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
r1(r'<meta property="og:description" content="([^"]*)" />', html) or
r1(r'<title>([^<\n]*)', html) or url.split("/")[4]).replace('\n', '')

type, ext, size = url_info(real_url)
# this is better
vcode = r1(r'tumblr_(\w+)', real_url)
real_url = 'https://vt.media.tumblr.com/tumblr_%s.mp4' % vcode

type, ext, size = url_info(real_url, faker=True)

print_info(site_info, title, type, size)
if not info_only:
download_urls([real_url], title, ext, size, output_dir, merge = merge)
download_urls([real_url], title, ext, size, output_dir, merge=merge)

site_info = "Tumblr.com"
download = tumblr_download

src/you_get/extractors/twitter.py

@@ -30,9 +30,9 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs)
return

html = get_html(url, faker=True)
screen_name = r1(r'data-screen-name="([^"]*)"', html) or \
screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \
r1(r'<meta name="twitter:title" content="([^"]*)"', html)
item_id = r1(r'data-item-id="([^"]*)"', html) or \
item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', url) or r1(r'data-item-id="([^"]*)"', html) or \
r1(r'<meta name="twitter:site:id" content="([^"]*)"', html)
page_title = "{} [{}]".format(screen_name, item_id)

src/you_get/extractors/universal.py

@@ -67,9 +67,9 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg

urls = []
for i in media_exts:
urls += re.findall(r'(https?://[^ ;"\'\\]+' + i + r'[^ ;"\'\\]*)', page)
urls += re.findall(r'(https?://[^ ;&"\'\\]+' + i + r'[^ ;&"\'\\]*)', page)

p_urls = re.findall(r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page)
urls += [parse.unquote(url) for url in p_urls]

q_urls = re.findall(r'(https?:\\\\/\\\\/[^ ;"\']+' + i + r'[^ ;"\']*)', page)

@@ -106,6 +106,9 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
title = '%s' % i
i += 1

if r1(r'(https://pinterest.com/pin/)', url):
continue

candies.append({'url': url,
'title': title})

src/you_get/extractors/yixia.py

@@ -7,6 +7,24 @@ from urllib.parse import urlparse
from json import loads
import re

#----------------------------------------------------------------------
def miaopai_download_by_smid(smid, output_dir = '.', merge = True, info_only = False):
""""""
api_endpoint = 'https://n.miaopai.com/api/aj_media/info.json?smid={smid}'.format(smid = smid)

html = get_content(api_endpoint)

api_content = loads(html)

video_url = api_content['data']['meta_data'][0]['play_urls']['l']
title = api_content['data']['description']

type, ext, size = url_info(video_url)

print_info(site_info, title, type, size)
if not info_only:
download_urls([video_url], title, ext, size, output_dir, merge=merge)

#----------------------------------------------------------------------
def yixia_miaopai_download_by_scid(scid, output_dir = '.', merge = True, info_only = False):
""""""

@@ -47,7 +65,11 @@ def yixia_xiaokaxiu_download_by_scid(scid, output_dir = '.', merge = True, info_
def yixia_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
"""wrapper"""
hostname = urlparse(url).hostname
if 'miaopai.com' in hostname: #Miaopai
if 'n.miaopai.com' == hostname:
smid = match1(url, r'n\.miaopai\.com/media/([^.]+)')
miaopai_download_by_smid(smid, output_dir, merge, info_only)
return
elif 'miaopai.com' in hostname: #Miaopai
yixia_download_by_scid = yixia_miaopai_download_by_scid
site_info = "Yixia Miaopai"

src/you_get/extractors/youku.py

@@ -78,7 +78,7 @@ class Youku(VideoExtractor):
self.api_error_code = None
self.api_error_msg = None

self.ccode = '0508'
self.ccode = '0590'
# Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js
# grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js
self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND'

src/you_get/extractors/youtube.py

@@ -37,6 +37,10 @@ class YouTube(VideoExtractor):
]

def decipher(js, s):
# Examples:
# - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js
# - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js
# - https://www.youtube.com/yts/jsbin/player-vfls4aurX/da_DK/base.js
def tr_js(code):
code = re.sub(r'function', r'def', code)
code = re.sub(r'(\W)(as|if|in|is|or)\(', r'\1_\2(', code)

@@ -52,11 +56,13 @@ class YouTube(VideoExtractor):
return code

js = js.replace('\n', ' ')
f1 = match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)')
f1 = match1(js, r'\.set\(\w+\.sp,\(0,window\.encodeURIComponent\)\(([$\w]+)') or \
match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \
match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)')
f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \
match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def)
f1def = 'function %s%s' % (f1, f1def)
f1def = 'function main_%s%s' % (f1, f1def) # prefix to avoid potential namespace conflict
code = tr_js(f1def)
f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def))
for f2 in f2s:

@@ -73,10 +79,20 @@ class YouTube(VideoExtractor):

f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1)
f1 = re.sub(r'\$', '_dollar', f1)
code = code + 'sig=%s(s)' % f1
code = code + 'sig=main_%s(s)' % f1 # prefix to avoid potential namespace conflict
exec(code, globals(), locals())
return locals()['sig']

def chunk_by_range(url, size):
urls = []
chunk_size = 10485760
start, end = 0, chunk_size - 1
urls.append('%s&range=%s-%s' % (url, start, end))
while end + 1 < size: # processed size < expected size
start, end = end + 1, end + chunk_size
urls.append('%s&range=%s-%s' % (url, start, end))
return urls

def get_url_from_vid(vid):
return 'https://youtu.be/{}'.format(vid)
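
Note: `chunk_by_range` above slices one googlevideo URL into roughly 10 MiB `range=` requests; together with the chunked branch added to `url_save` in common.py, each slice is then appended to the same output file. A quick illustration of the slicing, repeating the helper as a plain function and using a placeholder URL:

```python
def chunk_by_range(url, size):
    # 10 MiB slices with inclusive byte ranges, mirroring the classmethod above
    urls = []
    chunk_size = 10485760
    start, end = 0, chunk_size - 1
    urls.append('%s&range=%s-%s' % (url, start, end))
    while end + 1 < size:  # processed size < expected size
        start, end = end + 1, end + chunk_size
        urls.append('%s&range=%s-%s' % (url, start, end))
    return urls

for u in chunk_by_range('https://example.com/videoplayback?itag=137', 26214400):
    print(u)
# ...?itag=137&range=0-10485759
# ...?itag=137&range=10485760-20971519
# ...?itag=137&range=20971520-31457279
```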
@ -128,7 +144,10 @@ class YouTube(VideoExtractor):
|
||||
for video in videos:
|
||||
vid = parse_query_param(video, 'v')
|
||||
index = parse_query_param(video, 'index')
|
||||
try:
|
||||
self.__class__().download_by_url(self.__class__.get_url_from_vid(vid), index=index, **kwargs)
|
||||
except:
|
||||
pass
|
||||
|
||||
def prepare(self, **kwargs):
|
||||
assert self.url or self.vid
|
||||
@ -144,7 +163,8 @@ class YouTube(VideoExtractor):
|
||||
|
||||
ytplayer_config = None
|
||||
if 'status' not in video_info:
|
||||
log.wtf('[Failed] Unknown status.')
|
||||
log.wtf('[Failed] Unknown status.', exit_code=None)
|
||||
raise
|
||||
elif video_info['status'] == ['ok']:
|
||||
if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']:
|
||||
self.title = parse.unquote_plus(video_info['title'][0])
|
||||
@ -176,7 +196,8 @@ class YouTube(VideoExtractor):
|
||||
ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+});ytplayer', video_page).group(1))
|
||||
except:
|
||||
msg = re.search('class="message">([^<]+)<', video_page).group(1)
|
||||
log.wtf('[Failed] "%s"' % msg.strip())
|
||||
log.wtf('[Failed] "%s"' % msg.strip(), exit_code=None)
|
||||
raise
|
||||
|
||||
if 'title' in ytplayer_config['args']:
|
||||
# 150 Restricted from playback on certain sites
|
||||
@ -185,18 +206,22 @@ class YouTube(VideoExtractor):
|
||||
self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js']
|
||||
stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
|
||||
else:
|
||||
log.wtf('[Error] The uploader has not made this video available in your country.')
|
||||
log.wtf('[Error] The uploader has not made this video available in your country.', exit_code=None)
|
||||
raise
|
||||
#self.title = re.search('<meta name="title" content="([^"]+)"', video_page).group(1)
|
||||
#stream_list = []
|
||||
|
||||
elif video_info['errorcode'] == ['100']:
|
||||
log.wtf('[Failed] This video does not exist.', exit_code=int(video_info['errorcode'][0]))
|
||||
log.wtf('[Failed] This video does not exist.', exit_code=None) #int(video_info['errorcode'][0])
|
||||
raise
|
||||
|
||||
else:
|
||||
log.wtf('[Failed] %s' % video_info['reason'][0], exit_code=int(video_info['errorcode'][0]))
|
||||
log.wtf('[Failed] %s' % video_info['reason'][0], exit_code=None) #int(video_info['errorcode'][0])
|
||||
raise
|
||||
|
||||
else:
|
||||
log.wtf('[Failed] Invalid status.')
|
||||
log.wtf('[Failed] Invalid status.', exit_code=None)
|
||||
raise
|
||||
|
||||
# YouTube Live
|
||||
if ytplayer_config and (ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1'):
|
||||
@ -286,13 +311,15 @@ class YouTube(VideoExtractor):
|
||||
if not dash_size:
|
||||
try: dash_size = url_size(dash_url)
|
||||
except: continue
|
||||
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
|
||||
dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size))
|
||||
self.dash_streams[itag] = {
|
||||
'quality': '%sx%s' % (w, h),
|
||||
'itag': itag,
|
||||
'type': mimeType,
|
||||
'mime': mimeType,
|
||||
'container': 'mp4',
|
||||
'src': [dash_url, dash_mp4_a_url],
|
||||
'src': [dash_urls, dash_mp4_a_urls],
|
||||
'size': int(dash_size) + int(dash_mp4_a_size)
|
||||
}
|
||||
elif mimeType == 'video/webm':
|
||||
@ -306,13 +333,15 @@ class YouTube(VideoExtractor):
if not dash_size:
try: dash_size = url_size(dash_url)
except: continue
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
dash_webm_a_urls = self.__class__.chunk_by_range(dash_webm_a_url, int(dash_webm_a_size))
self.dash_streams[itag] = {
'quality': '%sx%s' % (w, h),
'itag': itag,
'type': mimeType,
'mime': mimeType,
'container': 'webm',
'src': [dash_url, dash_webm_a_url],
'src': [dash_urls, dash_webm_a_urls],
'size': int(dash_size) + int(dash_webm_a_size)
}
except:
@ -349,13 +378,15 @@ class YouTube(VideoExtractor):
dash_url += '&signature={}'.format(sig)
dash_size = stream['clen']
itag = stream['itag']
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size))
self.dash_streams[itag] = {
'quality': stream['size'],
'itag': itag,
'type': mimeType,
'mime': mimeType,
'container': 'mp4',
'src': [dash_url, dash_mp4_a_url],
'src': [dash_urls, dash_mp4_a_urls],
'size': int(dash_size) + int(dash_mp4_a_size)
}
elif stream['type'].startswith('video/webm'):
@ -374,13 +405,15 @@ class YouTube(VideoExtractor):
except UnboundLocalError as e:
audio_url = dash_mp4_a_url
audio_size = int(dash_mp4_a_size)
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
audio_urls = self.__class__.chunk_by_range(audio_url, int(audio_size))
self.dash_streams[itag] = {
'quality': stream['size'],
'itag': itag,
'type': mimeType,
'mime': mimeType,
'container': 'webm',
'src': [dash_url, audio_url],
'src': [dash_urls, audio_urls],
'size': int(dash_size) + int(audio_size)
}
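The recurring change in these four hunks is that each `'src'` entry now holds lists of chunked URLs (`dash_urls`, `dash_mp4_a_urls`, and so on) instead of one monolithic DASH URL, produced by the new `chunk_by_range` helper (presumably the method whose `return urls` tail appears at the top of this section). The sketch below shows the kind of splitting assumed here; the 10 MB chunk size and the `range=start-end` parameter format are assumptions, not a copy of the committed helper.

```python
# Assumed behaviour of chunk_by_range: split one googlevideo DASH URL into
# several URLs that each request roughly chunk_size bytes via 'range='.
def chunk_by_range(url, size, chunk_size=10 * 1024 * 1024):
    urls = []
    start, end = 0, chunk_size - 1
    urls.append('%s&range=%s-%s' % (url, start, end))
    while end + 1 < size:
        start, end = end + 1, end + chunk_size
        urls.append('%s&range=%s-%s' % (url, start, end))
    return urls

# chunk_by_range('https://example.invalid/videoplayback?itag=137', 25000000)
# -> ['...&range=0-10485759', '...&range=10485760-20971519', '...&range=20971520-31457279']
```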
@ -37,11 +37,14 @@ def zhibo_download(url, output_dir = '.', merge = True, info_only = False, **kwa
if is_live is not "1":
raise ValueError("The live stream is not online! (Errno:%s)" % is_live)

ourStreamName = r1(r"window.ourStreamName=\'([s\S'\s\.]*)\'\;[\s\S]*window.rtmpDefaultSource", html)
rtmpPollUrl = r1(r"window.rtmpPollUrl=\'([s\S'\s\.]*)\'\;[\s\S]*window.hlsDefaultSource", html)

#real_url = 'rtmp://220.194.213.56/live.zhibo.tv/8live/' + ourStreamName
real_url = rtmpPollUrl + ourStreamName
match = re.search(r"""
ourStreamName .*?
'(.*?)' .*?
rtmpHighSource .*?
'(.*?)' .*?
'(.*?)'
""", html, re.S | re.X)
real_url = match.group(3) + match.group(1) + match.group(2)

print_info(site_info, title, 'flv', float('inf'))
if not info_only:
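Instead of two separate `r1()` lookups for `ourStreamName` and `rtmpPollUrl`, the live-stream URL is now assembled from a single verbose (`re.X`), dotall (`re.S`) pattern that captures the stream name, the quoted value following `rtmpHighSource`, and the poll URL in one pass. A self-contained illustration against a made-up page fragment (the real inline JS on v.zhibo.tv may be laid out differently):

```python
import re

# Made-up fragment of the page's inline JS; variable layout is an assumption.
html = """
window.ourStreamName='8live_12345';
window.rtmpHighSource='_high';
window.rtmpPollUrl='rtmp://example.invalid/live.zhibo.tv/8live/';
"""

match = re.search(r"""
ourStreamName .*?
'(.*?)' .*?
rtmpHighSource .*?
'(.*?)' .*?
'(.*?)'
""", html, re.S | re.X)

# group(3): poll URL, group(1): stream name, group(2): value after rtmpHighSource
real_url = match.group(3) + match.group(1) + match.group(2)
print(real_url)  # -> rtmp://example.invalid/live.zhibo.tv/8live/8live_12345_high
```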
src/you_get/extractors/zhihu.py (new file, 79 lines)
@ -0,0 +1,79 @@
#!/usr/bin/env python

__all__ = ['zhihu_download', 'zhihu_download_playlist']

from ..common import *
import json


def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    paths = url.split("/")
    # question or column
    if len(paths) < 3 and len(paths) < 6:
        raise TypeError("URL does not conform to specifications, Support column and question only."
                        "Example URL: https://zhuanlan.zhihu.com/p/51669862 or "
                        "https://www.zhihu.com/question/267782048/answer/490720324")

    if ("question" not in paths or "answer" not in paths) and "zhuanlan.zhihu.com" not in paths:
        raise TypeError("URL does not conform to specifications, Support column and question only."
                        "Example URL: https://zhuanlan.zhihu.com/p/51669862 or "
                        "https://www.zhihu.com/question/267782048/answer/490720324")

    html = get_html(url, faker=True)
    title = match1(html, r'data-react-helmet="true">(.*?)</title>')
    for index, video_id in enumerate(matchall(html, [r'<a class="video-box" href="\S+video/(\d+)"'])):
        try:
            video_info = json.loads(
                get_content(r"https://lens.zhihu.com/api/videos/{}".format(video_id), headers=fake_headers))
        except json.decoder.JSONDecodeError:
            log.w("Video id not found:{}".format(video_id))
            continue

        play_list = video_info["playlist"]
        # first High Definition
        # second Second Standard Definition
        # third ld. What is ld ?
        # finally continue
        data = play_list.get("hd", play_list.get("sd", play_list.get("ld", None)))
        if not data:
            log.w("Video id No play address:{}".format(video_id))
            continue
        print_info(site_info, title, data["format"], data["size"])
        if not info_only:
            ext = "_{}.{}".format(index, data["format"])
            if kwargs.get("zhihu_offset"):
                ext = "_{}".format(kwargs["zhihu_offset"]) + ext
            download_urls([data["play_url"]], title, ext, data["size"],
                          output_dir=output_dir, merge=merge, **kwargs)


def zhihu_download_playlist(url, output_dir='.', merge=True, info_only=False, **kwargs):
    if "question" not in url or "answer" in url: # question page
        raise TypeError("URL does not conform to specifications, Support question only."
                        " Example URL: https://www.zhihu.com/question/267782048")
    url = url.split("?")[0]
    if url[-1] == "/":
        question_id = url.split("/")[-2]
    else:
        question_id = url.split("/")[-1]
    videos_url = r"https://www.zhihu.com/api/v4/questions/{}/answers".format(question_id)
    try:
        questions = json.loads(get_content(videos_url))
    except json.decoder.JSONDecodeError:
        raise TypeError("Check whether the problem URL exists.Example URL: https://www.zhihu.com/question/267782048")

    count = 0
    while 1:
        for data in questions["data"]:
            kwargs["zhihu_offset"] = count
            zhihu_download("https://www.zhihu.com/question/{}/answer/{}".format(question_id, data["id"]),
                           output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
            count += 1
        if questions["paging"]["is_end"]:
            return
        questions = json.loads(get_content(questions["paging"]["next"], headers=fake_headers))


site_info = "zhihu.com"
download = zhihu_download
download_playlist = zhihu_download_playlist
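A brief sketch of how the new module would be driven directly; normally you-get's URL dispatcher picks it based on the host, and the calls below simply reuse the example URLs from the error messages above, assuming the package is importable as `you_get`.

```python
from you_get.extractors.zhihu import zhihu_download, zhihu_download_playlist

# One column article or one answer:
zhihu_download('https://zhuanlan.zhihu.com/p/51669862', info_only=True)
zhihu_download('https://www.zhihu.com/question/267782048/answer/490720324', info_only=True)

# Every answer under a question, paged through /api/v4/questions/{id}/answers:
zhihu_download_playlist('https://www.zhihu.com/question/267782048', info_only=True)
```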
@ -13,6 +13,7 @@ def legitimize(text, os=detect_os()):
ord('|'): '-',
})

# FIXME: do some filesystem detection
if os == 'windows' or os == 'cygwin' or os == 'wsl':
# Windows (non-POSIX namespace)
text = text.translate({

@ -28,6 +29,7 @@ def legitimize(text, os=detect_os()):
ord('>'): '-',
ord('['): '(',
ord(']'): ')',
ord('\t'): ' ',
})
else:
# *nix
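These two hunks extend `legitimize()`'s Windows character table: the first widens the branch to cover `wsl`, the second adds a tab-to-space entry. As a quick reminder of how such an `ord()`-keyed table behaves with `str.translate` (the table below is a small subset, not the full mapping in the module):

```python
# Subset of the Windows mapping, including the newly added tab entry.
table = {
    ord('<'): '-',
    ord('>'): '-',
    ord('['): '(',
    ord(']'): ')',
    ord('\t'): ' ',
}
print('clip<1>\t[draft]'.translate(table))  # -> clip-1- (draft)
```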
@ -96,3 +96,9 @@ def wtf(message, exit_code=1):
print_log(message, RED, BOLD)
if exit_code is not None:
sys.exit(exit_code)

def yes_or_no(message):
ans = str(input('%s (y/N) ' % message)).lower().strip()
if ans == 'y':
return True
return False
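This is the change that the `log.wtf(..., exit_code=None)` call sites in the YouTube hunks above rely on: with `exit_code=None`, `wtf()` only prints and returns, so the caller can `raise` and let an outer handler (for example the bare `except: pass` in the playlist loop) skip a broken entry instead of terminating the whole run. A simplified, self-contained sketch of that control flow; names and messages are illustrative, not copied from the extractor.

```python
import sys

def wtf(message, exit_code=1):
    print(message, file=sys.stderr)
    if exit_code is not None:
        sys.exit(exit_code)      # old default: hard exit

def download_one(vid):
    if vid == 'broken':
        wtf('[Failed] Unknown status.', exit_code=None)  # log only
        raise ValueError(vid)    # propagate instead of exiting
    print('downloaded', vid)

for vid in ['ok-1', 'broken', 'ok-2']:
    try:
        download_one(vid)
    except Exception:
        pass                     # playlist keeps going past the broken entry
```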
@ -19,9 +19,11 @@ def detect_os():
elif 'linux' in syst:
os = 'linux'
# detect WSL https://github.com/Microsoft/BashOnWindows/issues/423
try:
with open('/proc/version', 'r') as f:
if 'microsoft' in f.read().lower():
os = 'wsl'
except: pass
elif 'windows' in syst:
os = 'windows'
elif 'bsd' in syst:
@ -1,4 +1,4 @@
#!/usr/bin/env python

script_name = 'you-get'
__version__ = '0.4.1128'
__version__ = '0.4.1193'
@ -25,6 +25,7 @@
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Topic :: Internet",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Multimedia",