Merge pull request #5 from soimort/develop

杨朝雄 2018-12-24 14:13:49 +08:00 committed by GitHub
commit 894e17f108
30 changed files with 754 additions and 215 deletions


@@ -6,11 +6,13 @@ python:
   - "3.4"
   - "3.5"
   - "3.6"
+  - "3.7-dev"
   - "nightly"
   - "pypy3"
-before_install: pip install flake8
+before_install:
+  - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then pip install flake8; fi
 before_script:
-  - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
+  - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
 script: make test
 sudo: false
 notifications:


@@ -113,6 +113,14 @@ You can install `you-get` easily via:
 $ brew install you-get
 ```
 
+### Option 8: pkg (FreeBSD only)
+
+You can install `you-get` easily via:
+
+```
+# pkg install you-get
+```
+
 ### Shell completion
 
 Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](https://github.com/soimort/you-get/tree/develop/contrib/completion). Please consult your shell's manual for how to take advantage of them.
@@ -416,7 +424,9 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
 | 西瓜视频 | <https://www.ixigua.com/> |✓| | |
 | 快手 | <https://www.kuaishou.com/> |✓|✓| |
 | 抖音 | <https://www.douyin.com/> |✓| | |
+| TikTok | <https://www.tiktok.com/> |✓| | |
 | 中国体育(TV) | <http://v.zhibo.tv/> </br><http://video.zhibo.tv/> |✓| | |
+| 知乎 | <https://www.zhihu.com/> |✓| | |
 
 For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.


@@ -102,6 +102,7 @@ SITES = {
     'soundcloud'       : 'soundcloud',
     'ted'              : 'ted',
     'theplatform'      : 'theplatform',
+    'tiktok'           : 'tiktok',
     'tucao'            : 'tucao',
     'tudou'            : 'tudou',
     'tumblr'           : 'tumblr',
@@ -127,6 +128,7 @@ SITES = {
     'youtube'          : 'youtube',
     'zhanqi'           : 'zhanqi',
     'zhibo'            : 'zhibo',
+    'zhihu'            : 'zhihu',
 }
 
 dry_run = False
@@ -429,7 +431,7 @@ def get_content(url, headers={}, decoded=True):
     # Decode the response body
     if decoded:
         charset = match1(
-            response.getheader('Content-Type'), r'charset=([\w-]+)'
+            response.getheader('Content-Type', ''), r'charset=([\w-]+)'
         )
         if charset is not None:
             data = data.decode(charset)
@@ -439,7 +441,7 @@ def get_content(url, headers={}, decoded=True):
 
     return data
 
-def post_content(url, headers={}, post_data={}, decoded=True):
+def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
     """Post the content of a URL via sending a HTTP POST request.
 
     Args:
@@ -450,14 +452,19 @@ def post_content(url, headers={}, post_data={}, decoded=True):
     Returns:
         The content as a string.
     """
-    logging.debug('post_content: %s \n post_data: %s' % (url, post_data))
+    if kwargs.get('post_data_raw'):
+        logging.debug('post_content: %s\npost_data_raw: %s' % (url, kwargs['post_data_raw']))
+    else:
+        logging.debug('post_content: %s\npost_data: %s' % (url, post_data))
 
     req = request.Request(url, headers=headers)
     if cookies:
         cookies.add_cookie_header(req)
         req.headers.update(req.unredirected_hdrs)
-    post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
+    if kwargs.get('post_data_raw'):
+        post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
+    else:
+        post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
     response = urlopen_with_retry(req, data=post_data_enc)
     data = response.read()
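
The new `post_data_raw` keyword makes `post_content` send a pre-serialized body verbatim instead of urlencoding a dict; the Tumblr GDPR-consent change later in this commit relies on it. A minimal sketch of the call shape, with a hypothetical endpoint and payload:

```python
# Sketch only: the endpoint and payload below are placeholders, not a real API.
import json
from you_get.common import post_content

payload = json.dumps({'consent': True, 'redirect_to': 'https://example.com/'})
resp = post_content(
    'https://example.com/svc/consent',             # hypothetical endpoint
    headers={'Content-Type': 'application/json'},
    post_data_raw=payload,                         # sent as-is, not urlencoded
)
```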
@@ -602,7 +609,12 @@ def url_save(
     # the key must be 'Referer' for the hack here
     if refer is not None:
         tmp_headers['Referer'] = refer
-    file_size = url_size(url, faker=faker, headers=tmp_headers)
+    if type(url) is list:
+        file_size = urls_size(url, faker=faker, headers=tmp_headers)
+        is_chunked, urls = True, url
+    else:
+        file_size = url_size(url, faker=faker, headers=tmp_headers)
+        is_chunked, urls = False, [url]
 
     continue_renameing = True
     while continue_renameing:
@@ -612,7 +624,7 @@ def url_save(
                 if not is_part:
                     if bar:
                         bar.done()
-                    print(
+                    log.w(
                         'Skipping {}: file already exists'.format(
                             tr(os.path.basename(filepath))
                         )
@@ -638,7 +650,10 @@ def url_save(
                         print('Changing name to %s' % tr(os.path.basename(filepath)), '...')
                         continue_renameing = True
                         continue
-            print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
+            if log.yes_or_no('File with this name already exists. Overwrite?'):
+                log.w('Overwriting %s ...' % tr(os.path.basename(filepath)))
+            else:
+                return
     elif not os.path.exists(os.path.dirname(filepath)):
         os.mkdir(os.path.dirname(filepath))
@@ -655,70 +670,78 @@ def url_save(
     else:
         open_mode = 'wb'
 
-    if received < file_size:
-        if faker:
-            tmp_headers = fake_headers
-        '''
-        if parameter headers passed in, we have it copied as tmp_header
-        elif headers:
-            headers = headers
-        else:
-            headers = {}
-        '''
-        if received:
-            tmp_headers['Range'] = 'bytes=' + str(received) + '-'
-        if refer:
-            tmp_headers['Referer'] = refer
-
-        if timeout:
-            response = urlopen_with_retry(
-                request.Request(url, headers=tmp_headers), timeout=timeout
-            )
-        else:
-            response = urlopen_with_retry(
-                request.Request(url, headers=tmp_headers)
-            )
-        try:
-            range_start = int(
-                response.headers[
-                    'content-range'
-                ][6:].split('/')[0].split('-')[0]
-            )
-            end_length = int(
-                response.headers['content-range'][6:].split('/')[1]
-            )
-            range_length = end_length - range_start
-        except:
-            content_length = response.headers['content-length']
-            range_length = int(content_length) if content_length is not None \
-                else float('inf')
-
-        if file_size != received + range_length:
-            received = 0
-            if bar:
-                bar.received = 0
-            open_mode = 'wb'
-
-        with open(temp_filepath, open_mode) as output:
-            while True:
-                buffer = None
-                try:
-                    buffer = response.read(1024 * 256)
-                except socket.timeout:
-                    pass
-                if not buffer:
-                    if received == file_size:  # Download finished
-                        break
-                    # Unexpected termination. Retry request
-                    tmp_headers['Range'] = 'bytes=' + str(received) + '-'
-                    response = urlopen_with_retry(
-                        request.Request(url, headers=tmp_headers)
-                    )
-                    continue
-                output.write(buffer)
-                received += len(buffer)
-                if bar:
-                    bar.update_received(len(buffer))
+    for url in urls:
+        received_chunk = 0
+        if received < file_size:
+            if faker:
+                tmp_headers = fake_headers
+            '''
+            if parameter headers passed in, we have it copied as tmp_header
+            elif headers:
+                headers = headers
+            else:
+                headers = {}
+            '''
+            if received and not is_chunked:  # only request a range when not chunked
+                tmp_headers['Range'] = 'bytes=' + str(received) + '-'
+            if refer:
+                tmp_headers['Referer'] = refer
+
+            if timeout:
+                response = urlopen_with_retry(
+                    request.Request(url, headers=tmp_headers), timeout=timeout
+                )
+            else:
+                response = urlopen_with_retry(
+                    request.Request(url, headers=tmp_headers)
+                )
+            try:
+                range_start = int(
+                    response.headers[
+                        'content-range'
+                    ][6:].split('/')[0].split('-')[0]
+                )
+                end_length = int(
+                    response.headers['content-range'][6:].split('/')[1]
+                )
+                range_length = end_length - range_start
+            except:
+                content_length = response.headers['content-length']
+                range_length = int(content_length) if content_length is not None \
+                    else float('inf')
+
+            if is_chunked:  # always append if chunked
+                open_mode = 'ab'
+            elif file_size != received + range_length:  # is it ever necessary?
+                received = 0
+                if bar:
+                    bar.received = 0
+                open_mode = 'wb'
+
+            with open(temp_filepath, open_mode) as output:
+                while True:
+                    buffer = None
+                    try:
+                        buffer = response.read(1024 * 256)
+                    except socket.timeout:
+                        pass
+                    if not buffer:
+                        if is_chunked and received_chunk == range_length:
+                            break
+                        elif not is_chunked and received == file_size:  # Download finished
+                            break
+                        # Unexpected termination. Retry request
+                        if not is_chunked:  # when
+                            tmp_headers['Range'] = 'bytes=' + str(received) + '-'
+                        response = urlopen_with_retry(
+                            request.Request(url, headers=tmp_headers)
+                        )
+                        continue
+                    output.write(buffer)
+                    received += len(buffer)
+                    received_chunk += len(buffer)
+                    if bar:
+                        bar.update_received(len(buffer))
 
     assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (
         received, os.path.getsize(temp_filepath), temp_filepath
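
With this change `url_save` accepts either a single URL or a list of chunk URLs: a list switches it into chunked mode, where the total size comes from `urls_size`, every chunk is appended to the same temporary file (`'ab'`), no `Range` header is requested, and `received_chunk` decides when one chunk ends and the next begins. The append idea, reduced to a standalone sketch (the helper below is illustrative and not you-get's API):

```python
# Illustrative sketch of sequential chunk appending; the real code also handles
# retries, Range requests for the non-chunked case, fake headers and progress bars.
from urllib.request import urlopen

def save_chunks(chunk_urls, filepath):
    received = 0
    with open(filepath, 'wb') as output:
        for url in chunk_urls:              # chunks are fetched and written in order
            response = urlopen(url)
            while True:
                buffer = response.read(1024 * 256)
                if not buffer:
                    break                   # this chunk is finished; move to the next URL
                output.write(buffer)
                received += len(buffer)
    return received
```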
@@ -907,7 +930,7 @@ def download_urls(
     if total_size:
         if not force and os.path.exists(output_filepath) and not auto_rename\
                 and os.path.getsize(output_filepath) >= total_size * 0.9:
-            print('Skipping %s: file already exists' % output_filepath)
+            log.w('Skipping %s: file already exists' % output_filepath)
             print()
             return
         bar = SimpleProgressBar(total_size, len(urls))
@@ -1554,9 +1577,9 @@ def google_search(url):
     url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
     page = get_content(url, headers=fake_headers)
     videos = re.findall(
-        r'<a href="(https?://[^"]+)" onmousedown="[^"]+">([^<]+)<', page
+        r'<a href="(https?://[^"]+)" onmousedown="[^"]+"><h3 class="[^"]*">([^<]+)<', page
     )
-    vdurs = re.findall(r'<span class="vdur _dwc">([^<]+)<', page)
+    vdurs = re.findall(r'<span class="vdur[^"]*">([^<]+)<', page)
     durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
     print('Google Videos search:')
     for v in zip(videos, durs):


@@ -211,7 +211,7 @@ class VideoExtractor():
             ext = self.dash_streams[stream_id]['container']
             total_size = self.dash_streams[stream_id]['size']
 
-        if ext == 'm3u8':
+        if ext == 'm3u8' or ext == 'm4a':
             ext = 'mp4'
 
         if not urls:


@@ -67,6 +67,7 @@ from .sohu import *
 from .soundcloud import *
 from .suntv import *
 from .theplatform import *
+from .tiktok import *
 from .tucao import *
 from .tudou import *
 from .tumblr import *
@@ -88,4 +89,5 @@ from .ted import *
 from .khan import *
 from .zhanqi import *
 from .kuaishou import *
 from .zhibo import *
+from .zhihu import *


@@ -85,9 +85,13 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
             _, _, seg_size = url_info(url)
             size += seg_size
         #fallback to flvhd is not quite possible
-        print_info(site_info, title, 'mp4', size)
+        if re.search(r'fid=[0-9A-Z\-]*.flv', preferred[0][0]):
+            ext = 'flv'
+        else:
+            ext = 'mp4'
+        print_info(site_info, title, ext, size)
         if not info_only:
-            download_urls(preferred[0], title, 'mp4', size, output_dir=output_dir, merge=merge)
+            download_urls(preferred[0], title, ext, size, output_dir=output_dir, merge=merge)
     else:
         raise NotImplementedError(sourceType)
@@ -105,27 +109,42 @@
         pass
 
 def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url)
-    html = get_content(url)
+    assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url)
 
-    title = r1(r'data-title="([^"]+)"', html)
+    if re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url):
+        html = get_content(url)
+        title = r1(r'data-title="([^"]+)"', html)
+        if match1(url, r'_(\d+)$'):  # current P
+            title = title + " " + r1(r'active">([^<]*)', html)
+        vid = r1('data-vid="(\d+)"', html)
+        up = r1('data-name="([^"]+)"', html)
+    # bangumi
+    elif re.match("http://[^\.]*\.*acfun\.[^\.]+/bangumi/ab(\d+)", url):
+        html = get_content(url)
+        title = match1(html, r'"newTitle"\s*:\s*"([^"]+)"')
+        if match1(url, r'_(\d+)$'):  # current P
+            title = title + " " + r1(r'active">([^<]*)', html)
+        vid = match1(html, r'videoId="(\d+)"')
+        up = "acfun"
+    else:
+        raise NotImplemented
+
+    assert title and vid
     title = unescape_html(title)
     title = escape_file_path(title)
-    assert title
-    if match1(url, r'_(\d+)$'):  # current P
-        title = title + " " + r1(r'active">([^<]*)', html)
-
-    vid = r1('data-vid="(\d+)"', html)
-    up = r1('data-name="([^"]+)"', html)
     p_title = r1('active">([^<]+)', html)
     title = '%s (%s)' % (title, up)
-    if p_title: title = '%s - %s' % (title, p_title)
+    if p_title:
+        title = '%s - %s' % (title, p_title)
+
     acfun_download_by_vid(vid, title,
                           output_dir=output_dir,
                           merge=merge,
                           info_only=info_only,
                           **kwargs)
 
 site_info = "AcFun.tv"
 download = acfun_download
 download_playlist = playlist_not_supported('acfun')


@@ -129,8 +129,9 @@ def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=
         html = get_html(url)
         title = r1(r'title:"([^"]+)"', html)
 
-        vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+)"', html) or re.findall(r'vhsrc="([^"]+)"', html)
-        if vhsrc is not None:
+        vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+\.mp4)"', html) or \
+            re.findall(r'vhsrc="([^"]+)"', html)
+        if len(vhsrc) > 0:
             ext = 'mp4'
             size = url_size(vhsrc[0])
             print_info(site_info, title, ext, size)


@@ -22,7 +22,7 @@ from .youku import youku_download_by_vid
 
 class Bilibili(VideoExtractor):
     name = 'Bilibili'
-    live_api = 'http://live.bilibili.com/api/playurl?cid={}&otype=json'
+    live_api = 'https://api.live.bilibili.com/room/v1/Room/playUrl?cid={}&quality=0&platform=web'
     api_url = 'http://interface.bilibili.com/v2/playurl?'
     bangumi_api_url = 'http://bangumi.bilibili.com/player/web_api/playurl?'
     live_room_init_api_url = 'https://api.live.bilibili.com/room/v1/Room/room_init?id={}'
@@ -115,7 +115,7 @@ class Bilibili(VideoExtractor):
             self.url = 'http://www.bilibili.com/video/av{}/'.format(aid)
             self.ua = fake_headers['User-Agent']
 
-        self.url = url_locations([self.url])[0]
+        self.url = url_locations([self.url], faker=True)[0]
         frag = urllib.parse.urlparse(self.url).fragment
         # http://www.bilibili.com/video/av3141144/index_2.html#page=3
         if frag:
@@ -125,30 +125,31 @@ class Bilibili(VideoExtractor):
             aid = re.search(r'av(\d+)', self.url).group(1)
             self.url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, page)
         self.referer = self.url
-        self.page = get_content(self.url)
+        self.page = get_content(self.url, headers=fake_headers)
 
         m = re.search(r'<h1.*?>(.*?)</h1>', self.page) or re.search(r'<h1 title="([^"]+)">', self.page)
         if m is not None:
             self.title = m.group(1)
-            s = re.search(r'<span>([^<]+)</span>', m.group(1))
+            s = re.search(r'<span.*?>([^<]+)</span>', m.group(1))
             if s:
                 self.title = unescape_html(s.group(1))
         if self.title is None:
             m = re.search(r'property="og:title" content="([^"]+)"', self.page)
             if m is not None:
                 self.title = m.group(1)
 
         if 'subtitle' in kwargs:
             subtitle = kwargs['subtitle']
             self.title = '{} {}'.format(self.title, subtitle)
         else:
             playinfo = re.search(r'__INITIAL_STATE__=(.*?);\(function\(\)', self.page)
             if playinfo is not None:
-                pages = json.loads(playinfo.group(1))['videoData']['pages']
-                if len(pages) > 1:
-                    qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query))
-                    page = pages[int(qs.get('p', 1)) - 1]
-                    self.title = '{} #{}. {}'.format(self.title, page['page'], page['part'])
+                jsonPlayinfo = json.loads(playinfo.group(1))
+                if 'videoData' in jsonPlayinfo:
+                    pages = jsonPlayinfo['videoData']['pages']
+                    if len(pages) > 1:
+                        qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query))
+                        page = pages[int(qs.get('p', 1)) - 1]
+                        self.title = '{} #{}. {}'.format(self.title, page['page'], page['part'])
 
         if 'bangumi.bilibili.com/movie' in self.url:
             self.movie_entry(**kwargs)
@@ -160,6 +161,8 @@ class Bilibili(VideoExtractor):
             self.live_entry(**kwargs)
         elif 'vc.bilibili.com' in self.url:
             self.vc_entry(**kwargs)
+        elif 'audio/au' in self.url:
+            self.audio_entry(**kwargs)
         else:
             self.entry(**kwargs)
@@ -171,6 +174,30 @@ class Bilibili(VideoExtractor):
         self.title = page_list[0]['pagename']
         self.download_by_vid(page_list[0]['cid'], True, bangumi_movie=True, **kwargs)
 
+    def audio_entry(self, **kwargs):
+        assert re.match(r'https?://www.bilibili.com/audio/au\d+', self.url)
+        patt = r"(\d+)"
+        audio_id = re.search(patt, self.url).group(1)
+        audio_info_url = \
+            'https://www.bilibili.com/audio/music-service-c/web/song/info?sid={}'.format(audio_id)
+        audio_info_response = json.loads(get_content(audio_info_url))
+        if audio_info_response['msg'] != 'success':
+            log.wtf('fetch audio information failed!')
+            sys.exit(2)
+        self.title = audio_info_response['data']['title']
+        # TODO:there is no quality option for now
+        audio_download_url = \
+            'https://www.bilibili.com/audio/music-service-c/web/url?sid={}&privilege=2&quality=2'.format(audio_id)
+        audio_download_response = json.loads(get_content(audio_download_url))
+        if audio_download_response['msg'] != 'success':
+            log.wtf('fetch audio resource failed!')
+            sys.exit(2)
+        self.streams['mp4'] = {}
+        self.streams['mp4']['src'] = [audio_download_response['data']['cdns'][0]]
+        self.streams['mp4']['container'] = 'm4a'
+        self.streams['mp4']['size'] = audio_download_response['data']['size']
+
     def entry(self, **kwargs):
         # tencent player
         tc_flashvars = re.search(r'"bili-cid=\d+&bili-aid=\d+&vid=([^"]+)"', self.page)
@@ -190,7 +217,12 @@ class Bilibili(VideoExtractor):
             index_id = int(re.search(r'index_(\d+)', self.url).group(1))
             cid = page_list[index_id-1]['cid']  # change cid match rule
         except:
-            cid = re.search(r'"cid":(\d+)', self.page).group(1)
+            page = re.search(r'p=(\d+)', self.url)
+            if page is None:
+                p = 1
+            else:
+                p = int(page.group(1))
+            cid = re.search(r'"cid":(\d+),"page":%s' % p, self.page).group(1)
         if cid is not None:
             self.download_by_vid(cid, re.search('bangumi', self.url) is not None, **kwargs)
         else:
@@ -226,7 +258,7 @@ class Bilibili(VideoExtractor):
 
         api_url = self.live_api.format(self.room_id)
         json_data = json.loads(get_content(api_url))
-        urls = [json_data['durl'][0]['url']]
+        urls = [json_data['data']['durl'][0]['url']]
 
         self.streams['live'] = {}
         self.streams['live']['src'] = urls
@@ -252,28 +284,9 @@ class Bilibili(VideoExtractor):
             self.streams['vc']['size'] = int(item['video_size'])
 
     def bangumi_entry(self, **kwargs):
-        bangumi_id = re.search(r'(\d+)', self.url).group(1)
-        frag = urllib.parse.urlparse(self.url).fragment
-        if frag:
-            episode_id = frag
-        else:
-            episode_id = re.search(r'first_ep_id\s*=\s*"(\d+)"', self.page) or re.search(r'\/ep(\d+)', self.url).group(1)
-        # cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data=dict(episode_id=episode_id))
-        # cid = json.loads(cont)['result']['cid']
-        cont = get_content('http://bangumi.bilibili.com/web_api/episode/{}.json'.format(episode_id))
-        ep_info = json.loads(cont)['result']['currentEpisode']
-
-        bangumi_data = get_bangumi_info(str(ep_info['seasonId']))
-        bangumi_payment = bangumi_data.get('payment')
-        if bangumi_payment and bangumi_payment['price'] != '0':
-            log.w("It's a paid item")
-        # ep_ids = collect_bangumi_epids(bangumi_data)
-
-        index_title = ep_info['indexTitle']
-        long_title = ep_info['longTitle'].strip()
-        cid = ep_info['danmaku']
-
-        self.title = '{} [{} {}]'.format(self.title, index_title, long_title)
+        data = json.loads(re.search(r'__INITIAL_STATE__=(.+);\(function', self.page).group(1))
+        cid = data['epInfo']['cid']
+        # index_title = data['epInfo']['index_title']
         self.download_by_vid(cid, bangumi=True, **kwargs)
@@ -376,10 +389,82 @@ def download_video_from_favlist(url, **kwargs):
     else:
         log.wtf("Fail to parse the fav title" + url, "")
 
+
+def download_music_from_favlist(url, page, **kwargs):
+    m = re.search(r'https?://www.bilibili.com/audio/mycollection/(\d+)', url)
+    if m is not None:
+        sid = m.group(1)
+        json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-coll?"
+                                             "sid={}&pn={}&ps=100".format(sid, page)))
+        if json_result['msg'] == 'success':
+            music_list = json_result['data']['data']
+            music_count = len(music_list)
+            for i in range(music_count):
+                audio_id = music_list[i]['id']
+                audio_title = music_list[i]['title']
+                audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id)
+                print("Start downloading music ", audio_title)
+                Bilibili().download_by_url(audio_url, **kwargs)
+            if page < json_result['data']['pageCount']:
+                page += 1
+                download_music_from_favlist(url, page, **kwargs)
+        else:
+            log.wtf("Fail to get music list of page " + json_result)
+            sys.exit(2)
+    else:
+        log.wtf("Fail to parse the sid from " + url, "")
+
+
+def download_video_from_totallist(url, page, **kwargs):
+    # the url has format: https://space.bilibili.com/64169458/#/video
+    m = re.search(r'space\.bilibili\.com/(\d+)/.*?video', url)
+    mid = ""
+    if m is not None:
+        mid = m.group(1)
+        jsonresult = json.loads(get_content("https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=100&tid=0&page={}&keyword=&order=pubdate&jsonp=jsonp".format(mid, page)))
+        if jsonresult['status']:
+            videos = jsonresult['data']['vlist']
+            videocount = len(videos)
+            for i in range(videocount):
+                videoid = videos[i]["aid"]
+                videotitle = videos[i]["title"]
+                videourl = "https://www.bilibili.com/video/av{}".format(videoid)
+                print("Start downloading ", videotitle, " video ", videotitle)
+                Bilibili().download_by_url(videourl, subtitle=videotitle, **kwargs)
+            if page < jsonresult['data']['pages']:
+                page += 1
+                download_video_from_totallist(url, page, **kwargs)
+        else:
+            log.wtf("Fail to get the files of page " + jsonresult)
+            sys.exit(2)
+    else:
+        log.wtf("Fail to parse the video title" + url, "")
+
+
+def download_music_from_totallist(url, page, **kwargs):
+    m = re.search(r'https?://www.bilibili.com/audio/am(\d+)\?type=\d', url)
+    if m is not None:
+        sid = m.group(1)
+        json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-menu?"
+                                             "sid={}&pn={}&ps=100".format(sid, page)))
+        if json_result['msg'] == 'success':
+            music_list = json_result['data']['data']
+            music_count = len(music_list)
+            for i in range(music_count):
+                audio_id = music_list[i]['id']
+                audio_title = music_list[i]['title']
+                audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id)
+                print("Start downloading music ",audio_title)
+                Bilibili().download_by_url(audio_url, **kwargs)
+            if page < json_result['data']['pageCount']:
+                page += 1
+                download_music_from_totallist(url, page, **kwargs)
+        else:
+            log.wtf("Fail to get music list of page " + json_result)
+            sys.exit(2)
+    else:
+        log.wtf("Fail to parse the sid from " + url, "")
+
+
 def bilibili_download_playlist_by_url(url, **kwargs):
-    url = url_locations([url])[0]
+    url = url_locations([url], faker=True)[0]
     kwargs['playlist'] = True
     # a bangumi here? possible?
     if 'live.bilibili' in url:
@@ -396,6 +481,12 @@ def bilibili_download_playlist_by_url(url, **kwargs):
     elif 'favlist' in url:
         # this a fav list folder
         download_video_from_favlist(url, **kwargs)
+    elif re.match(r'https?://space.bilibili.com/\d+/#/video', url):
+        download_video_from_totallist(url, 1, **kwargs)
+    elif re.match(r'https://www.bilibili.com/audio/mycollection/\d+', url):
+        download_music_from_favlist(url, 1, **kwargs)
+    elif re.match(r'https?://www.bilibili.com/audio/am\d+\?type=\d', url):
+        download_music_from_totallist(url, 1, **kwargs)
     else:
         aid = re.search(r'av(\d+)', url).group(1)
         page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid)))


@@ -29,9 +29,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
             image_url = edge['node']['display_url']
             if 'video_url' in edge['node']:
                 image_url = edge['node']['video_url']
-            image_url = image_url.split('?')[0]
-            ext = image_url.split('.')[-1]
+            ext = image_url.split('?')[0].split('.')[-1]
             size = int(get_head(image_url)['Content-Length'])
             print_info(site_info, title, ext, size)
             if not info_only:
                 download_urls(urls=[image_url],
@@ -44,9 +44,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
         image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
         if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
             image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
-        image_url = image_url.split('?')[0]
-        ext = image_url.split('.')[-1]
+        ext = image_url.split('?')[0].split('.')[-1]
         size = int(get_head(image_url)['Content-Length'])
         print_info(site_info, title, ext, size)
         if not info_only:
             download_urls(urls=[image_url],


@@ -17,20 +17,20 @@ headers = {
 
 def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
     global headers
-    video_hash=match1(url, r'http://\w+.iwara.tv/videos/(\w+)')
-    video_url=match1(url, r'(http://\w+.iwara.tv)/videos/\w+')
-    html = get_content(url,headers=headers)
+    video_hash = match1(url, r'https?://\w+.iwara.tv/videos/(\w+)')
+    video_url = match1(url, r'(https?://\w+.iwara.tv)/videos/\w+')
+    html = get_content(url, headers=headers)
     title = r1(r'<title>(.*)</title>', html)
-    api_url=video_url+'/api/video/'+video_hash
-    content=get_content(api_url,headers=headers)
-    data=json.loads(content)
-    type,ext,size=url_info(data[0]['uri'], headers=headers)
-    down_urls=data[0]['uri']
-    print_info(down_urls,title+data[0]['resolution'],type,size)
+    api_url = video_url + '/api/video/' + video_hash
+    content = get_content(api_url, headers=headers)
+    data = json.loads(content)
+    down_urls = 'https:' + data[0]['uri']
+    type, ext, size = url_info(down_urls, headers=headers)
+    print_info(site_info, title+data[0]['resolution'], type, size)
     if not info_only:
-        download_urls([down_urls], title, ext, size, output_dir, merge = merge,headers=headers)
+        download_urls([down_urls], title, ext, size, output_dir, merge=merge, headers=headers)
 
-site_info = "iwara"
+site_info = "Iwara"
 download = iwara_download
 download_playlist = playlist_not_supported('iwara')


@@ -1,14 +1,132 @@
 #!/usr/bin/env python
 
-__all__ = ['ixigua_download']
-
-from .toutiao import download as toutiao_download
-from .toutiao import download_playlist as toutiao_download_playlist
+import base64
+import binascii
+
+from ..common import *
+import random
+import ctypes
+from json import loads
+
+__all__ = ['ixigua_download', 'ixigua_download_playlist_by_url']
+
+headers = {
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 "
+                  "Safari/537.36",
+}
+
+
+def int_overflow(val):
+    maxint = 2147483647
+    if not -maxint - 1 <= val <= maxint:
+        val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
+    return val
+
+
+def unsigned_right_shitf(n, i):
+    if n < 0:
+        n = ctypes.c_uint32(n).value
+    if i < 0:
+        return -int_overflow(n << abs(i))
+    return int_overflow(n >> i)
+
+
+def get_video_url_from_video_id(video_id):
+    """Splicing URLs according to video ID to get video details"""
+    # from js
+    data = [""] * 256
+    for index, _ in enumerate(data):
+        t = index
+        for i in range(8):
+            t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1)
+        data[index] = t
+
+    def tmp():
+        rand_num = random.random()
+        path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id,
+                                                                              random_num=str(rand_num)[2:])
+        e = o = r = -1
+        i, a = 0, len(path)
+        while i < a:
+            e = ord(path[i])
+            i += 1
+            if e < 128:
+                r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)]
+            else:
+                if e < 2048:
+                    r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))]
+                    r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
+                else:
+                    if 55296 <= e < 57344:
+                        e = (1023 & e) + 64
+                        i += 1
+                        o = 1023 & t.url(i)
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))]
+                    else:
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
+        return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0))
+
+    while 1:
+        url = tmp()
+        if url.split("=")[-1][0] != "-":  # 参数s不能为负数
+            return url
+
+
 def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    return toutiao_download(url.replace('ixigua', '365yg'))
+    # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422
+    html = get_html(url, faker=True)
+    video_id = match1(html, r"videoId\s*:\s*'([^']+)'")
+    title = match1(html, r"title: '(\S+)',")
+    if not video_id:
+        log.e("video_id not found, url:{}".format(url))
+        return
+    video_info_url = get_video_url_from_video_id(video_id)
+    video_info = loads(get_content(video_info_url))
+    if video_info.get("code", 1) != 0:
+        log.e("Get video info from {} error: server return code {}".format(video_info_url, video_info.get("code", 1)))
+        return
+    if not video_info.get("data", None):
+        log.e("Get video info from {} error: The server returns JSON value"
+              " without data or data is empty".format(video_info_url))
+        return
+    if not video_info["data"].get("video_list", None):
+        log.e("Get video info from {} error: The server returns JSON value"
+              " without data.video_list or data.video_list is empty".format(video_info_url))
+        return
+    if not video_info["data"]["video_list"].get("video_1", None):
+        log.e("Get video info from {} error: The server returns JSON value"
+              " without data.video_list.video_1 or data.video_list.video_1 is empty".format(video_info_url))
+        return
+    size = int(video_info["data"]["video_list"]["video_1"]["size"])
+    print_info(site_info=site_info, title=title, type="mp4", size=size)  # 该网站只有mp4类型文件
+    if not info_only:
+        video_url = base64.b64decode(video_info["data"]["video_list"]["video_1"]["main_url"].encode("utf-8"))
+        download_urls([video_url.decode("utf-8")], title, "mp4", size, output_dir, merge=merge, headers=headers, **kwargs)
+
+
+def ixigua_download_playlist_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    assert "user" in url, "Only support users to publish video list,Please provide a similar url:" \
+                          "https://www.ixigua.com/c/user/6907091136/"
+
+    user_id = url.split("/")[-2] if url[-1] == "/" else url.split("/")[-1]
+    params = {"max_behot_time": "0", "max_repin_time": "0", "count": "20", "page_type": "0", "user_id": user_id}
+    while 1:
+        url = "https://www.ixigua.com/c/user/article/?" + "&".join(["{}={}".format(k, v) for k, v in params.items()])
+        video_list = loads(get_content(url, headers=headers))
+        params["max_behot_time"] = video_list["next"]["max_behot_time"]
+        for video in video_list["data"]:
+            ixigua_download("https://www.ixigua.com/i{}/".format(video["item_id"]), output_dir, merge, info_only,
+                            **kwargs)
+        if video_list["next"]["max_behot_time"] == 0:
+            break
+
+
 site_info = "ixigua.com"
 download = ixigua_download
-download_playlist = toutiao_download_playlist
+download_playlist = ixigua_download_playlist_by_url


@@ -2,8 +2,17 @@
 __all__ = ['lizhi_download']
 import json
+import datetime
 from ..common import *
 
+#
+# Worked well but not perfect.
+# TODO: add option --format={sd|hd}
+#
+def get_url(ep):
+    readable = datetime.datetime.fromtimestamp(int(ep['create_time']) / 1000).strftime('%Y/%m/%d')
+    return 'http://cdn5.lizhi.fm/audio/{}/{}_hd.mp3'.format(readable, ep['id'])
+
 # radio_id: e.g. 549759 from http://www.lizhi.fm/549759/
 #
 # Returns a list of tuples (audio_id, title, url) for each episode
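
`get_url` derives the CDN path from an episode's `create_time`, which is in milliseconds since the epoch. A worked example with a hypothetical episode record (the printed date depends on the local timezone):

```python
import datetime

ep = {'id': 123456, 'create_time': 1545638400000}  # hypothetical episode, time in milliseconds
readable = datetime.datetime.fromtimestamp(int(ep['create_time']) / 1000).strftime('%Y/%m/%d')
print('http://cdn5.lizhi.fm/audio/{}/{}_hd.mp3'.format(readable, ep['id']))
# -> http://cdn5.lizhi.fm/audio/2018/12/24/123456_hd.mp3 (exact date depends on local timezone)
```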
@@ -23,7 +32,7 @@ def lizhi_extract_playlist_info(radio_id):
     # (au_cnt), then handle pagination properly.
     api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band=%s' % radio_id
     api_response = json.loads(get_content(api_url))
-    return [(ep['id'], ep['name'], ep['url']) for ep in api_response]
+    return [(ep['id'], ep['name'], get_url(ep)) for ep in api_response]
 
 def lizhi_download_audio(audio_id, title, url, output_dir='.', info_only=False):
     filetype, ext, size = url_info(url)


@@ -2,9 +2,12 @@
 __all__ = ['miaopai_download']
 
+import string
+import random
 from ..common import *
 import urllib.error
 import urllib.parse
+from ..util import fs
 
 fake_headers_mobile = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -20,6 +23,10 @@ def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = Fa
     mobile_page = get_content(page_url, headers=fake_headers_mobile)
 
     url = match1(mobile_page, r'<video id=.*?src=[\'"](.*?)[\'"]\W')
+    if url is None:
+        wb_mp = re.search(r'<script src=([\'"])(.+?wb_mp\.js)\1>', mobile_page).group(2)
+        return miaopai_download_by_wbmp(wb_mp, fid, output_dir=output_dir, merge=merge,
+                                        info_only=info_only, total_size=None, **kwargs)
     title = match1(mobile_page, r'<title>((.|\n)+?)</title>')
     if not title:
         title = fid
@@ -29,14 +36,62 @@ def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = Fa
     if not info_only:
         download_urls([url], title, ext, total_size=None, output_dir=output_dir, merge=merge)
 
+
+#----------------------------------------------------------------------
+def miaopai_download_by_wbmp(wbmp_url, fid, info_only=False, **kwargs):
+    headers = {}
+    headers.update(fake_headers_mobile)
+    headers['Host'] = 'imgaliyuncdn.miaopai.com'
+    wbmp = get_content(wbmp_url, headers=headers)
+    appid = re.search(r'appid:\s*?([^,]+?),', wbmp).group(1)
+    jsonp = re.search(r'jsonp:\s*?([\'"])(\w+?)\1', wbmp).group(2)
+    population = [i for i in string.ascii_lowercase] + [i for i in string.digits]
+    info_url = '{}?{}'.format('http://p.weibo.com/aj_media/info', parse.urlencode({
+        'appid': appid.strip(),
+        'fid': fid,
+        jsonp.strip(): '_jsonp' + ''.join(random.sample(population, 11))
+    }))
+    headers['Host'] = 'p.weibo.com'
+    jsonp_text = get_content(info_url, headers=headers)
+    jsonp_dict = json.loads(match1(jsonp_text, r'\(({.+})\)'))
+    if jsonp_dict['code'] != 200:
+        log.wtf('[Failed] "%s"' % jsonp_dict['msg'])
+    video_url = jsonp_dict['data']['meta_data'][0]['play_urls']['l']
+    title = jsonp_dict['data']['description']
+    title = title.replace('\n', '_')
+    ext = 'mp4'
+    headers['Host'] = 'f.us.sinaimg.cn'
+    print_info(site_info, title, ext, url_info(video_url, headers=headers)[2])
+    if not info_only:
+        download_urls([video_url], fs.legitimize(title), ext, headers=headers, **kwargs)
+
+
+def miaopai_download_direct(url, info_only, **kwargs):
+    mobile_page = get_content(url, headers=fake_headers_mobile)
+    try:
+        title = re.search(r'([\'"])title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
+    except:
+        title = re.search(r'([\'"])status_title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
+    title = title.replace('\n', '_')
+    stream_url = re.search(r'([\'"])stream_url\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
+    ext = 'mp4'
+    print_info(site_info, title, ext, url_info(stream_url, headers=fake_headers_mobile)[2])
+    if not info_only:
+        download_urls([stream_url], fs.legitimize(title), ext, total_size=None, headers=fake_headers_mobile, **kwargs)
+
+
 # ----------------------------------------------------------------------
 def miaopai_download(url, output_dir = '.', merge = False, info_only = False, **kwargs):
+    if match1(url, r'weibo\.com/tv/v/(\w+)'):
+        return miaopai_download_direct(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
+
     fid = match1(url, r'\?fid=(\d{4}:\w+)')
     if fid is not None:
         miaopai_download_by_fid(fid, output_dir, merge, info_only)
     elif '/p/230444' in url:
         fid = match1(url, r'/p/230444(\w+)')
         miaopai_download_by_fid('1034:'+fid, output_dir, merge, info_only)
+    elif re.match(r'^http[s]://weibo\.com/\d+/.+', url):
+        miaopai_download_direct(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
     else:
         mobile_page = get_content(url, headers = fake_headers_mobile)
         hit = re.search(r'"page_url"\s*:\s*"([^"]+)"', mobile_page)
@@ -46,6 +101,7 @@ def miaopai_download(url, output_dir = '.', merge = False, info_only = False, **
         escaped_url = hit.group(1)
         miaopai_download(urllib.parse.unquote(escaped_url), output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
 
 site_info = "miaopai"
 download = miaopai_download
 download_playlist = playlist_not_supported('miaopai')


@@ -7,31 +7,40 @@ import re
 
 from ..util import log
 from ..common import get_content, download_urls, print_info, playlist_not_supported, url_size
+from .universal import *
 
 __all__ = ['naver_download_by_url']
 
-def naver_download_by_url(url, info_only=False, **kwargs):
+def naver_download_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):
     ep = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'
     page = get_content(url)
-    og_video_url = re.search(r"<meta\s+property=\"og:video:url\"\s+content='(.+?)'>", page).group(1)
-    params_dict = urllib.parse.parse_qs(urllib.parse.urlparse(og_video_url).query)
-    vid = params_dict['vid'][0]
-    key = params_dict['outKey'][0]
-    meta_str = get_content(ep.format(vid, key))
-    meta_json = json.loads(meta_str)
-    if 'errorCode' in meta_json:
-        log.wtf(meta_json['errorCode'])
-    title = meta_json['meta']['subject']
-    videos = meta_json['videos']['list']
-    video_list = sorted(videos, key=lambda video: video['encodingOption']['width'])
-    video_url = video_list[-1]['source']
-    # size = video_list[-1]['size']
-    # result wrong size
-    size = url_size(video_url)
-    print_info(site_info, title, 'mp4', size)
-    if not info_only:
-        download_urls([video_url], title, 'mp4', size, **kwargs)
+    try:
+        temp = re.search(r"<meta\s+property=\"og:video:url\"\s+content='(.+?)'>", page)
+        if temp is not None:
+            og_video_url = temp.group(1)
+            params_dict = urllib.parse.parse_qs(urllib.parse.urlparse(og_video_url).query)
+            vid = params_dict['vid'][0]
+            key = params_dict['outKey'][0]
+        else:
+            vid = re.search(r"\"videoId\"\s*:\s*\"(.+?)\"", page).group(1)
+            key = re.search(r"\"inKey\"\s*:\s*\"(.+?)\"", page).group(1)
+        meta_str = get_content(ep.format(vid, key))
+        meta_json = json.loads(meta_str)
+        if 'errorCode' in meta_json:
+            log.wtf(meta_json['errorCode'])
+        title = meta_json['meta']['subject']
+        videos = meta_json['videos']['list']
+        video_list = sorted(videos, key=lambda video: video['encodingOption']['width'])
+        video_url = video_list[-1]['source']
+        # size = video_list[-1]['size']
+        # result wrong size
+        size = url_size(video_url)
+        print_info(site_info, title, 'mp4', size)
+        if not info_only:
+            download_urls([video_url], title, 'mp4', size, **kwargs)
+    except:
+        universal_download(url, output_dir, merge=merge, info_only=info_only, **kwargs)
 
 site_info = "naver.com"
 download = naver_download_by_url


@@ -192,14 +192,14 @@ class PPTV(VideoExtractor):
         if self.url and not self.vid:
             if not re.match(r'http://v.pptv.com/show/(\w+)\.html', self.url):
                 raise('Unknown url pattern')
-            page_content = get_content(self.url)
+            page_content = get_content(self.url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"})
 
             self.vid = match1(page_content, r'webcfg\s*=\s*{"id":\s*(\d+)')
 
         if not self.vid:
             raise('Cannot find id')
         api_url = 'http://web-play.pptv.com/webplay3-0-{}.xml'.format(self.vid)
         api_url += '?appplt=flp&appid=pptv.flashplayer.vod&appver=3.4.2.28&type=&version=4'
-        dom = parseString(get_content(api_url))
+        dom = parseString(get_content(api_url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}))
         self.title, m_items, m_streams, m_segs = parse_pptv_xml(dom)
         xml_streams = merge_meta(m_items, m_streams, m_segs)
         for stream_id in xml_streams:


@@ -15,9 +15,9 @@ Changelog:
     new api
 '''
 
-def real_url(host,vid,tvid,new,clipURL,ck):
-    url = 'http://'+host+'/?prot=9&prod=flash&pt=1&file='+clipURL+'&new='+new +'&key='+ ck+'&vid='+str(vid)+'&uid='+str(int(time.time()*1000))+'&t='+str(random())+'&rb=1'
-    return json.loads(get_html(url))['url']
+def real_url(fileName, key, ch):
+    url = "https://data.vod.itc.cn/ip?new=" + fileName + "&num=1&key=" + key + "&ch=" + ch + "&pt=1&pg=2&prod=h5n"
+    return json.loads(get_html(url))['servers'][0]['url']
 
 def sohu_download(url, output_dir = '.', merge = True, info_only = False, extractor_proxy=None, **kwargs):
     if re.match(r'http://share.vrs.sohu.com', url):
@@ -51,9 +51,8 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False, extrac
                 title = data['tvName']
                 size = sum(data['clipsBytes'])
                 assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])
-                for new,clip,ck, in zip(data['su'], data['clipsURL'], data['ck']):
-                    clipURL = urlparse(clip).path
-                    urls.append(real_url(host,hqvid,tvid,new,clipURL,ck))
+                for fileName, key in zip(data['su'], data['ck']):
+                    urls.append(real_url(fileName, key, data['ch']))
                 # assert data['clipsURL'][0].endswith('.mp4')
 
         else:
@@ -66,9 +65,8 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False, extrac
             title = data['tvName']
             size = sum(map(int,data['clipsBytes']))
             assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])
-            for new,clip,ck, in zip(data['su'], data['clipsURL'], data['ck']):
-                clipURL = urlparse(clip).path
-                urls.append(real_url(host,vid,tvid,new,clipURL,ck))
+            for fileName, key in zip(data['su'], data['ck']):
+                urls.append(real_url(fileName, key, data['ch']))
 
     print_info(site_info, title, 'mp4', size)
     if not info_only:


@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+__all__ = ['tiktok_download']
+
+from ..common import *
+
+def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    html = get_html(url)
+    title = r1(r'<title>(.*?)</title>', html)
+    video_id = r1(r'/video/(\d+)', url) or r1(r'musical\?id=(\d+)', html)
+    title = '%s [%s]' % (title, video_id)
+
+    dataText = r1(r'var data = \[(.*)\] ', html) or r1(r'var data = (\{.*\})', html)
+    data = json.loads(dataText)
+    source = 'http:' + data['video']['play_addr']['url_list'][0]
+    mime, ext, size = url_info(source)
+
+    print_info(site_info, title, mime, size)
+    if not info_only:
+        download_urls([source], title, ext, size, output_dir, merge=merge)
+
+site_info = "TikTok.com"
+download = tiktok_download
+download_playlist = playlist_not_supported('tiktok')


@@ -13,7 +13,29 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
         universal_download(url, output_dir, merge=merge, info_only=info_only)
         return
 
-    html = parse.unquote(get_html(url)).replace('\/', '/')
+    import ssl
+    ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
+    cookie_handler = request.HTTPCookieProcessor()
+    opener = request.build_opener(ssl_context, cookie_handler)
+    request.install_opener(opener)
+
+    page = get_html(url)
+    form_key = match1(page, r'id="tumblr_form_key" content="([^"]+)"')
+    if form_key is not None:
+        # bypass GDPR consent page
+        referer = 'https://www.tumblr.com/privacy/consent?redirect=%s' % parse.quote_plus(url)
+        post_content('https://www.tumblr.com/svc/privacy/consent',
+                     headers={
+                         'Content-Type': 'application/json',
+                         'User-Agent': fake_headers['User-Agent'],
+                         'Referer': referer,
+                         'X-tumblr-form-key': form_key,
+                         'X-Requested-With': 'XMLHttpRequest'
+                     },
+                     post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url)
+        page = get_html(url, faker=True)
+
+    html = parse.unquote(page).replace('\/', '/')
     feed = r1(r'<meta property="og:type" content="tumblr-feed:(\w+)" />', html)
 
     if feed in ['photo', 'photoset', 'entry'] or feed is None:
@@ -21,23 +43,24 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
         page_title = r1(r'<meta name="description" content="([^"\n]+)', html) or \
             r1(r'<meta property="og:description" content="([^"\n]+)', html) or \
             r1(r'<title>([^<\n]*)', html)
-        urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.jpg)', html) +\
-               re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.png)', html) +\
-               re.findall(r'(https?://[^;"&]+/tumblr_[^";]+_\d+\.gif)', html)
+        urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.jpg)', html) +\
+               re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.png)', html) +\
+               re.findall(r'(https?://[^;"&]+/tumblr_[^";&]+_\d+\.gif)', html)
 
         tuggles = {}
         for url in urls:
-            filename = parse.unquote(url.split('/')[-1])
+            hd_url = r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg'  # FIXME: decide actual quality
+            filename = parse.unquote(hd_url.split('/')[-1])
             title = '.'.join(filename.split('.')[:-1])
             tumblr_id = r1(r'^tumblr_(.+)_\d+$', title)
             quality = int(r1(r'^tumblr_.+_(\d+)$', title))
             ext = filename.split('.')[-1]
             try:
-                size = int(get_head(url)['Content-Length'])
+                size = int(get_head(hd_url)['Content-Length'])
                 if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality:
                     tuggles[tumblr_id] = {
                         'title': title,
-                        'url': url,
+                        'url': hd_url,
                         'quality': quality,
                         'ext': ext,
                         'size': size,
@@ -99,11 +122,15 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
                  r1(r'<meta property="og:description" content="([^"]*)" />', html) or
                  r1(r'<title>([^<\n]*)', html) or url.split("/")[4]).replace('\n', '')
 
-    type, ext, size = url_info(real_url)
+    # this is better
+    vcode = r1(r'tumblr_(\w+)', real_url)
+    real_url = 'https://vt.media.tumblr.com/tumblr_%s.mp4' % vcode
+
+    type, ext, size = url_info(real_url, faker=True)
 
     print_info(site_info, title, type, size)
     if not info_only:
-        download_urls([real_url], title, ext, size, output_dir, merge = merge)
+        download_urls([real_url], title, ext, size, output_dir, merge=merge)
 
 site_info = "Tumblr.com"
 download = tumblr_download


@@ -30,9 +30,9 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs)
         return
 
     html = get_html(url, faker=True)
-    screen_name = r1(r'data-screen-name="([^"]*)"', html) or \
+    screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \
         r1(r'<meta name="twitter:title" content="([^"]*)"', html)
-    item_id = r1(r'data-item-id="([^"]*)"', html) or \
+    item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', url) or r1(r'data-item-id="([^"]*)"', html) or \
         r1(r'<meta name="twitter:site:id" content="([^"]*)"', html)
     page_title = "{} [{}]".format(screen_name, item_id)


@@ -67,9 +67,9 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
 
         urls = []
         for i in media_exts:
-            urls += re.findall(r'(https?://[^ ;"\'\\]+' + i + r'[^ ;"\'\\]*)', page)
+            urls += re.findall(r'(https?://[^ ;&"\'\\]+' + i + r'[^ ;&"\'\\]*)', page)
 
-            p_urls = re.findall(r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
+            p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page)
             urls += [parse.unquote(url) for url in p_urls]
 
             q_urls = re.findall(r'(https?:\\\\/\\\\/[^ ;"\']+' + i + r'[^ ;"\']*)', page)
@@ -106,6 +106,9 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
                 title = '%s' % i
                 i += 1
 
+            if r1(r'(https://pinterest.com/pin/)', url):
+                continue
+
             candies.append({'url': url,
                             'title': title})


@@ -7,6 +7,24 @@ from urllib.parse import urlparse
 from json import loads
 import re

+#----------------------------------------------------------------------
+def miaopai_download_by_smid(smid, output_dir = '.', merge = True, info_only = False):
+    """"""
+    api_endpoint = 'https://n.miaopai.com/api/aj_media/info.json?smid={smid}'.format(smid = smid)
+
+    html = get_content(api_endpoint)
+
+    api_content = loads(html)
+
+    video_url = api_content['data']['meta_data'][0]['play_urls']['l']
+    title = api_content['data']['description']
+
+    type, ext, size = url_info(video_url)
+    print_info(site_info, title, type, size)
+    if not info_only:
+        download_urls([video_url], title, ext, size, output_dir, merge=merge)
+
 #----------------------------------------------------------------------
 def yixia_miaopai_download_by_scid(scid, output_dir = '.', merge = True, info_only = False):
     """"""
@@ -47,7 +65,11 @@ def yixia_xiaokaxiu_download_by_scid(scid, output_dir = '.', merge = True, info_only = False):
 def yixia_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
     """wrapper"""
     hostname = urlparse(url).hostname
-    if 'miaopai.com' in hostname: #Miaopai
+    if 'n.miaopai.com' == hostname:
+        smid = match1(url, r'n\.miaopai\.com/media/([^.]+)')
+        miaopai_download_by_smid(smid, output_dir, merge, info_only)
+        return
+    elif 'miaopai.com' in hostname: #Miaopai
         yixia_download_by_scid = yixia_miaopai_download_by_scid
         site_info = "Yixia Miaopai"


@@ -78,7 +78,7 @@ class Youku(VideoExtractor):
         self.api_error_code = None
         self.api_error_msg = None

-        self.ccode = '0508'
+        self.ccode = '0590'
         # Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js
         # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js
         self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND'


@@ -37,6 +37,10 @@ class YouTube(VideoExtractor):
     ]

     def decipher(js, s):
+        # Examples:
+        # - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js
+        # - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js
+        # - https://www.youtube.com/yts/jsbin/player-vfls4aurX/da_DK/base.js
         def tr_js(code):
             code = re.sub(r'function', r'def', code)
             code = re.sub(r'(\W)(as|if|in|is|or)\(', r'\1_\2(', code)
@@ -52,11 +56,13 @@ class YouTube(VideoExtractor):
             return code

         js = js.replace('\n', ' ')
-        f1 = match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)')
+        f1 = match1(js, r'\.set\(\w+\.sp,\(0,window\.encodeURIComponent\)\(([$\w]+)') or \
+             match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \
+             match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)')
         f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \
                 match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
         f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def)
-        f1def = 'function %s%s' % (f1, f1def)
+        f1def = 'function main_%s%s' % (f1, f1def)  # prefix to avoid potential namespace conflict
         code = tr_js(f1def)
         f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def))
         for f2 in f2s:
@@ -73,10 +79,20 @@ class YouTube(VideoExtractor):
             f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1)
             f1 = re.sub(r'\$', '_dollar', f1)
-        code = code + 'sig=%s(s)' % f1
+        code = code + 'sig=main_%s(s)' % f1  # prefix to avoid potential namespace conflict
         exec(code, globals(), locals())
         return locals()['sig']

+    def chunk_by_range(url, size):
+        urls = []
+        chunk_size = 10485760
+        start, end = 0, chunk_size - 1
+        urls.append('%s&range=%s-%s' % (url, start, end))
+        while end + 1 < size:  # processed size < expected size
+            start, end = end + 1, end + chunk_size
+            urls.append('%s&range=%s-%s' % (url, start, end))
+        return urls
+
     def get_url_from_vid(vid):
         return 'https://youtu.be/{}'.format(vid)
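
`chunk_by_range` turns one stream URL into a list of `&range=` slices of 10 MiB each, so every request stays small; the last slice may simply overshoot the total. A standalone check of the arithmetic (the URL and size are made up):

```
def chunk_by_range(url, size):
    # copy of the helper added above, for a quick offline check
    urls = []
    chunk_size = 10485760                 # 10 MiB per request
    start, end = 0, chunk_size - 1
    urls.append('%s&range=%s-%s' % (url, start, end))
    while end + 1 < size:                 # processed size < expected size
        start, end = end + 1, end + chunk_size
        urls.append('%s&range=%s-%s' % (url, start, end))
    return urls

# illustrative: a 25 MiB stream becomes three range-limited URLs
for u in chunk_by_range('https://example.invalid/videoplayback?itag=137', 26214400):
    print(u)
# ...&range=0-10485759
# ...&range=10485760-20971519
# ...&range=20971520-31457279
```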
@@ -128,7 +144,10 @@ class YouTube(VideoExtractor):
             for video in videos:
                 vid = parse_query_param(video, 'v')
                 index = parse_query_param(video, 'index')
-                self.__class__().download_by_url(self.__class__.get_url_from_vid(vid), index=index, **kwargs)
+                try:
+                    self.__class__().download_by_url(self.__class__.get_url_from_vid(vid), index=index, **kwargs)
+                except:
+                    pass

     def prepare(self, **kwargs):
         assert self.url or self.vid
@@ -144,7 +163,8 @@ class YouTube(VideoExtractor):
         ytplayer_config = None
         if 'status' not in video_info:
-            log.wtf('[Failed] Unknown status.')
+            log.wtf('[Failed] Unknown status.', exit_code=None)
+            raise
         elif video_info['status'] == ['ok']:
             if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']:
                 self.title = parse.unquote_plus(video_info['title'][0])
@@ -176,7 +196,8 @@ class YouTube(VideoExtractor):
                     ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+});ytplayer', video_page).group(1))
                 except:
                     msg = re.search('class="message">([^<]+)<', video_page).group(1)
-                    log.wtf('[Failed] "%s"' % msg.strip())
+                    log.wtf('[Failed] "%s"' % msg.strip(), exit_code=None)
+                    raise

                 if 'title' in ytplayer_config['args']:
                     # 150 Restricted from playback on certain sites
@@ -185,18 +206,22 @@ class YouTube(VideoExtractor):
                     self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js']
                     stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
                 else:
-                    log.wtf('[Error] The uploader has not made this video available in your country.')
+                    log.wtf('[Error] The uploader has not made this video available in your country.', exit_code=None)
+                    raise
                     #self.title = re.search('<meta name="title" content="([^"]+)"', video_page).group(1)
                     #stream_list = []
             elif video_info['errorcode'] == ['100']:
-                log.wtf('[Failed] This video does not exist.', exit_code=int(video_info['errorcode'][0]))
+                log.wtf('[Failed] This video does not exist.', exit_code=None) #int(video_info['errorcode'][0])
+                raise
             else:
-                log.wtf('[Failed] %s' % video_info['reason'][0], exit_code=int(video_info['errorcode'][0]))
+                log.wtf('[Failed] %s' % video_info['reason'][0], exit_code=None) #int(video_info['errorcode'][0])
+                raise
         else:
-            log.wtf('[Failed] Invalid status.')
+            log.wtf('[Failed] Invalid status.', exit_code=None)
+            raise

         # YouTube Live
         if ytplayer_config and (ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1'):
@@ -286,13 +311,15 @@ class YouTube(VideoExtractor):
                     if not dash_size:
                         try: dash_size = url_size(dash_url)
                         except: continue
+                    dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
+                    dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size))
                     self.dash_streams[itag] = {
                         'quality': '%sx%s' % (w, h),
                         'itag': itag,
                         'type': mimeType,
                         'mime': mimeType,
                         'container': 'mp4',
-                        'src': [dash_url, dash_mp4_a_url],
+                        'src': [dash_urls, dash_mp4_a_urls],
                         'size': int(dash_size) + int(dash_mp4_a_size)
                     }
                 elif mimeType == 'video/webm':
@@ -306,13 +333,15 @@ class YouTube(VideoExtractor):
                     if not dash_size:
                         try: dash_size = url_size(dash_url)
                         except: continue
+                    dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
+                    dash_webm_a_urls = self.__class__.chunk_by_range(dash_webm_a_url, int(dash_webm_a_size))
                     self.dash_streams[itag] = {
                         'quality': '%sx%s' % (w, h),
                         'itag': itag,
                         'type': mimeType,
                         'mime': mimeType,
                         'container': 'webm',
-                        'src': [dash_url, dash_webm_a_url],
+                        'src': [dash_urls, dash_webm_a_urls],
                         'size': int(dash_size) + int(dash_webm_a_size)
                     }
             except:
@@ -349,13 +378,15 @@ class YouTube(VideoExtractor):
                         dash_url += '&signature={}'.format(sig)
                     dash_size = stream['clen']
                     itag = stream['itag']
+                    dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
+                    dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size))
                     self.dash_streams[itag] = {
                         'quality': stream['size'],
                         'itag': itag,
                         'type': mimeType,
                         'mime': mimeType,
                         'container': 'mp4',
-                        'src': [dash_url, dash_mp4_a_url],
+                        'src': [dash_urls, dash_mp4_a_urls],
                         'size': int(dash_size) + int(dash_mp4_a_size)
                     }
                 elif stream['type'].startswith('video/webm'):
@@ -374,13 +405,15 @@ class YouTube(VideoExtractor):
                     except UnboundLocalError as e:
                         audio_url = dash_mp4_a_url
                         audio_size = int(dash_mp4_a_size)
+                    dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
+                    audio_urls = self.__class__.chunk_by_range(audio_url, int(audio_size))
                     self.dash_streams[itag] = {
                         'quality': stream['size'],
                         'itag': itag,
                         'type': mimeType,
                         'mime': mimeType,
                         'container': 'webm',
-                        'src': [dash_url, audio_url],
+                        'src': [dash_urls, audio_urls],
                         'size': int(dash_size) + int(audio_size)
                     }


@@ -37,11 +37,14 @@ def zhibo_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
     if is_live is not "1":
         raise ValueError("The live stream is not online! (Errno:%s)" % is_live)

-    ourStreamName = r1(r"window.ourStreamName=\'([s\S'\s\.]*)\'\;[\s\S]*window.rtmpDefaultSource", html)
-    rtmpPollUrl = r1(r"window.rtmpPollUrl=\'([s\S'\s\.]*)\'\;[\s\S]*window.hlsDefaultSource", html)
-
-    #real_url = 'rtmp://220.194.213.56/live.zhibo.tv/8live/' + ourStreamName
-    real_url = rtmpPollUrl + ourStreamName
+    match = re.search(r"""
+        ourStreamName .*?
+        '(.*?)' .*?
+        rtmpHighSource .*?
+        '(.*?)' .*?
+        '(.*?)'
+    """, html, re.S | re.X)
+    real_url = match.group(3) + match.group(1) + match.group(2)

     print_info(site_info, title, 'flv', float('inf'))
     if not info_only:
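
The rewritten zhibo extractor relies on one verbose (`re.X`) pattern to pick the stream name and the two RTMP source fragments out of inline JavaScript and then concatenates them as poll URL + stream name + high-source suffix. A toy reproduction; the HTML snippet is invented and only mimics the `window.*` assignments on the real page:

```
import re

html = """
window.ourStreamName='8live/demo_stream';
window.rtmpHighSource='?key=abc';
window.rtmpPollUrl='rtmp://live.example.invalid/';
"""  # illustrative snippet

match = re.search(r"""
    ourStreamName .*?
    '(.*?)' .*?
    rtmpHighSource .*?
    '(.*?)' .*?
    '(.*?)'
""", html, re.S | re.X)

real_url = match.group(3) + match.group(1) + match.group(2)
print(real_url)  # rtmp://live.example.invalid/8live/demo_stream?key=abc
```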


@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+
+__all__ = ['zhihu_download', 'zhihu_download_playlist']
+
+from ..common import *
+import json
+
+
+def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    paths = url.split("/")
+    # question or column
+    if len(paths) < 3 and len(paths) < 6:
+        raise TypeError("URL does not conform to specifications, Support column and question only."
+                        "Example URL: https://zhuanlan.zhihu.com/p/51669862 or "
+                        "https://www.zhihu.com/question/267782048/answer/490720324")
+
+    if ("question" not in paths or "answer" not in paths) and "zhuanlan.zhihu.com" not in paths:
+        raise TypeError("URL does not conform to specifications, Support column and question only."
+                        "Example URL: https://zhuanlan.zhihu.com/p/51669862 or "
+                        "https://www.zhihu.com/question/267782048/answer/490720324")
+
+    html = get_html(url, faker=True)
+    title = match1(html, r'data-react-helmet="true">(.*?)</title>')
+    for index, video_id in enumerate(matchall(html, [r'<a class="video-box" href="\S+video/(\d+)"'])):
+        try:
+            video_info = json.loads(
+                get_content(r"https://lens.zhihu.com/api/videos/{}".format(video_id), headers=fake_headers))
+        except json.decoder.JSONDecodeError:
+            log.w("Video id not found:{}".format(video_id))
+            continue
+
+        play_list = video_info["playlist"]
+        # first High Definition
+        # second Second Standard Definition
+        # third ld. What is ld ?
+        # finally continue
+        data = play_list.get("hd", play_list.get("sd", play_list.get("ld", None)))
+        if not data:
+            log.w("Video id No play address:{}".format(video_id))
+            continue
+        print_info(site_info, title, data["format"], data["size"])
+        if not info_only:
+            ext = "_{}.{}".format(index, data["format"])
+            if kwargs.get("zhihu_offset"):
+                ext = "_{}".format(kwargs["zhihu_offset"]) + ext
+            download_urls([data["play_url"]], title, ext, data["size"],
+                          output_dir=output_dir, merge=merge, **kwargs)
+
+
+def zhihu_download_playlist(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    if "question" not in url or "answer" in url:  # question page
+        raise TypeError("URL does not conform to specifications, Support question only."
+                        " Example URL: https://www.zhihu.com/question/267782048")
+    url = url.split("?")[0]
+    if url[-1] == "/":
+        question_id = url.split("/")[-2]
+    else:
+        question_id = url.split("/")[-1]
+    videos_url = r"https://www.zhihu.com/api/v4/questions/{}/answers".format(question_id)
+    try:
+        questions = json.loads(get_content(videos_url))
+    except json.decoder.JSONDecodeError:
+        raise TypeError("Check whether the problem URL exists.Example URL: https://www.zhihu.com/question/267782048")
+
+    count = 0
+    while 1:
+        for data in questions["data"]:
+            kwargs["zhihu_offset"] = count
+            zhihu_download("https://www.zhihu.com/question/{}/answer/{}".format(question_id, data["id"]),
+                           output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
+            count += 1
+        if questions["paging"]["is_end"]:
+            return
+        questions = json.loads(get_content(questions["paging"]["next"], headers=fake_headers))
+
+
+site_info = "zhihu.com"
+download = zhihu_download
+download_playlist = zhihu_download_playlist
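
`zhihu_download_playlist` walks a question's answers through the v4 API and follows `paging.next` until `paging.is_end`. The question-id parsing at the top can be checked offline; the URL is one of the examples already quoted in the error messages above:

```
url = 'https://www.zhihu.com/question/267782048?sort=created'

url = url.split("?")[0]
if url[-1] == "/":
    question_id = url.split("/")[-2]
else:
    question_id = url.split("/")[-1]

videos_url = r"https://www.zhihu.com/api/v4/questions/{}/answers".format(question_id)
print(question_id)  # 267782048
print(videos_url)   # https://www.zhihu.com/api/v4/questions/267782048/answers
```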


@@ -13,6 +13,7 @@ def legitimize(text, os=detect_os()):
         ord('|'): '-',
     })

+    # FIXME: do some filesystem detection
     if os == 'windows' or os == 'cygwin' or os == 'wsl':
         # Windows (non-POSIX namespace)
         text = text.translate({
@@ -28,6 +29,7 @@ def legitimize(text, os=detect_os()):
             ord('>'): '-',
             ord('['): '(',
             ord(']'): ')',
+            ord('\t'): ' ',
         })
     else:
         # *nix
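
The new `ord('\t'): ' '` entry joins the translate table that already strips characters Windows filesystems reject. A reduced sketch of how that mapping behaves (only a subset of the real table):

```
def legitimize_windows(text):
    # reduced subset of the mapping used by fs.legitimize() on Windows
    return text.translate({
        ord(':'): '-',
        ord('?'): '-',
        ord('|'): '-',
        ord('\t'): ' ',   # the newly added entry
    })

print(legitimize_windows('Clip: part 1?\tfinal|cut'))
# Clip- part 1- final-cut
```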


@@ -96,3 +96,9 @@ def wtf(message, exit_code=1):
     print_log(message, RED, BOLD)
     if exit_code is not None:
         sys.exit(exit_code)
+
+def yes_or_no(message):
+    ans = str(input('%s (y/N) ' % message)).lower().strip()
+    if ans == 'y':
+        return True
+    return False
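
`yes_or_no` is a small interactive helper: only a literal `y` (case-insensitive, surrounding whitespace ignored) counts as yes. A hypothetical call site, just to show the intended shape; where you-get actually prompts is up to its callers:

```
def yes_or_no(message):
    ans = str(input('%s (y/N) ' % message)).lower().strip()
    if ans == 'y':
        return True
    return False

if yes_or_no('Overwrite existing file?'):   # hypothetical prompt
    print('overwriting')
else:
    print('skipping')
```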


@@ -19,9 +19,11 @@ def detect_os():
     elif 'linux' in syst:
         os = 'linux'
         # detect WSL https://github.com/Microsoft/BashOnWindows/issues/423
-        with open('/proc/version', 'r') as f:
-            if 'microsoft' in f.read().lower():
-                os = 'wsl'
+        try:
+            with open('/proc/version', 'r') as f:
+                if 'microsoft' in f.read().lower():
+                    os = 'wsl'
+        except: pass
     elif 'windows' in syst:
         os = 'windows'
     elif 'bsd' in syst:
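
Wrapping the `/proc/version` probe in `try/except` keeps `detect_os()` from crashing on systems that report `linux` but have no readable procfs. A standalone sketch of just that probe:

```
def looks_like_wsl():
    # True only when /proc/version exists and mentions Microsoft,
    # mirroring the detection added above; quietly False otherwise
    try:
        with open('/proc/version', 'r') as f:
            return 'microsoft' in f.read().lower()
    except OSError:
        return False

print('wsl' if looks_like_wsl() else 'not wsl')
```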


@@ -1,4 +1,4 @@
 #!/usr/bin/env python

 script_name = 'you-get'
-__version__ = '0.4.1128'
+__version__ = '0.4.1193'


@@ -25,6 +25,7 @@
     "Programming Language :: Python :: 3.4",
     "Programming Language :: Python :: 3.5",
     "Programming Language :: Python :: 3.6",
+    "Programming Language :: Python :: 3.7",
     "Topic :: Internet",
     "Topic :: Internet :: WWW/HTTP",
     "Topic :: Multimedia",