diff --git a/.travis.yml b/.travis.yml
index 9df327b0..c11cbe34 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,11 +6,13 @@ python:
- "3.4"
- "3.5"
- "3.6"
+ - "3.7-dev"
- "nightly"
- "pypy3"
-before_install: pip install flake8
+before_install:
+ - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then pip install flake8; fi
before_script:
- - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
+ - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
script: make test
sudo: false
notifications:
diff --git a/README.md b/README.md
index f6f8efdc..f524c60d 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,14 @@ You can install `you-get` easily via:
$ brew install you-get
```
+### Option 8: pkg (FreeBSD only)
+
+You can install `you-get` easily via:
+
+```
+# pkg install you-get
+```
+
### Shell completion
Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](https://github.com/soimort/you-get/tree/develop/contrib/completion). Please consult your shell's manual for how to take advantage of them.
@@ -416,7 +424,9 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| 西瓜视频 | |✓| | |
| 快手 | |✓|✓| |
| 抖音 | |✓| | |
+| TikTok | |✓| | |
| 中国体育(TV) | |✓| | |
+| 知乎 | |✓| | |
For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.
diff --git a/src/you_get/common.py b/src/you_get/common.py
index b19d602f..78182163 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -102,6 +102,7 @@ SITES = {
'soundcloud' : 'soundcloud',
'ted' : 'ted',
'theplatform' : 'theplatform',
+ 'tiktok' : 'tiktok',
'tucao' : 'tucao',
'tudou' : 'tudou',
'tumblr' : 'tumblr',
@@ -127,6 +128,7 @@ SITES = {
'youtube' : 'youtube',
'zhanqi' : 'zhanqi',
'zhibo' : 'zhibo',
+ 'zhihu' : 'zhihu',
}
dry_run = False
@@ -429,7 +431,7 @@ def get_content(url, headers={}, decoded=True):
# Decode the response body
if decoded:
charset = match1(
- response.getheader('Content-Type'), r'charset=([\w-]+)'
+ response.getheader('Content-Type', ''), r'charset=([\w-]+)'
)
if charset is not None:
data = data.decode(charset)
@@ -439,7 +441,7 @@ def get_content(url, headers={}, decoded=True):
return data
-def post_content(url, headers={}, post_data={}, decoded=True):
+def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
"""Post the content of a URL via sending a HTTP POST request.
Args:
@@ -450,14 +452,19 @@ def post_content(url, headers={}, post_data={}, decoded=True):
Returns:
The content as a string.
"""
-
- logging.debug('post_content: %s \n post_data: %s' % (url, post_data))
+ if kwargs.get('post_data_raw'):
+ logging.debug('post_content: %s\npost_data_raw: %s' % (url, kwargs['post_data_raw']))
+ else:
+ logging.debug('post_content: %s\npost_data: %s' % (url, post_data))
req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
- post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
+ if kwargs.get('post_data_raw'):
+ post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
+ else:
+ post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
response = urlopen_with_retry(req, data=post_data_enc)
data = response.read()
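A hedged usage sketch for the new `post_data_raw` keyword above: an extractor can now submit a raw (e.g. JSON) request body instead of a urlencoded form. The endpoint and payload below are placeholders, not part of this diff:

```python
# Assumed usage of post_content() with the new post_data_raw kwarg;
# the URL and payload are hypothetical.
content = post_content(
    'https://api.example.com/v1/play',
    headers={'Content-Type': 'application/json'},
    post_data_raw='{"video_id": 12345}',  # sent as UTF-8 bytes, not urlencoded
)
```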
@@ -602,7 +609,12 @@ def url_save(
# the key must be 'Referer' for the hack here
if refer is not None:
tmp_headers['Referer'] = refer
- file_size = url_size(url, faker=faker, headers=tmp_headers)
+ if type(url) is list:
+ file_size = urls_size(url, faker=faker, headers=tmp_headers)
+ is_chunked, urls = True, url
+ else:
+ file_size = url_size(url, faker=faker, headers=tmp_headers)
+ is_chunked, urls = False, [url]
continue_renameing = True
while continue_renameing:
@@ -612,7 +624,7 @@ def url_save(
if not is_part:
if bar:
bar.done()
- print(
+ log.w(
'Skipping {}: file already exists'.format(
tr(os.path.basename(filepath))
)
@@ -638,7 +650,10 @@ def url_save(
print('Changing name to %s' % tr(os.path.basename(filepath)), '...')
continue_renameing = True
continue
- print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
+ if log.yes_or_no('File with this name already exists. Overwrite?'):
+ log.w('Overwriting %s ...' % tr(os.path.basename(filepath)))
+ else:
+ return
elif not os.path.exists(os.path.dirname(filepath)):
os.mkdir(os.path.dirname(filepath))
@@ -655,70 +670,78 @@ def url_save(
else:
open_mode = 'wb'
- if received < file_size:
- if faker:
- tmp_headers = fake_headers
- '''
- if parameter headers passed in, we have it copied as tmp_header
- elif headers:
- headers = headers
- else:
- headers = {}
- '''
- if received:
- tmp_headers['Range'] = 'bytes=' + str(received) + '-'
- if refer:
- tmp_headers['Referer'] = refer
+ for url in urls:
+ received_chunk = 0
+ if received < file_size:
+ if faker:
+ tmp_headers = fake_headers
+ '''
+ if parameter headers passed in, we have it copied as tmp_header
+ elif headers:
+ headers = headers
+ else:
+ headers = {}
+ '''
+ if received and not is_chunked: # only request a range when not chunked
+ tmp_headers['Range'] = 'bytes=' + str(received) + '-'
+ if refer:
+ tmp_headers['Referer'] = refer
- if timeout:
- response = urlopen_with_retry(
- request.Request(url, headers=tmp_headers), timeout=timeout
- )
- else:
- response = urlopen_with_retry(
- request.Request(url, headers=tmp_headers)
- )
- try:
- range_start = int(
- response.headers[
- 'content-range'
- ][6:].split('/')[0].split('-')[0]
- )
- end_length = int(
- response.headers['content-range'][6:].split('/')[1]
- )
- range_length = end_length - range_start
- except:
- content_length = response.headers['content-length']
- range_length = int(content_length) if content_length is not None \
- else float('inf')
+ if timeout:
+ response = urlopen_with_retry(
+ request.Request(url, headers=tmp_headers), timeout=timeout
+ )
+ else:
+ response = urlopen_with_retry(
+ request.Request(url, headers=tmp_headers)
+ )
+ try:
+ range_start = int(
+ response.headers[
+ 'content-range'
+ ][6:].split('/')[0].split('-')[0]
+ )
+ end_length = int(
+ response.headers['content-range'][6:].split('/')[1]
+ )
+ range_length = end_length - range_start
+ except:
+ content_length = response.headers['content-length']
+ range_length = int(content_length) if content_length is not None \
+ else float('inf')
- if file_size != received + range_length:
- received = 0
- if bar:
- bar.received = 0
- open_mode = 'wb'
-
- with open(temp_filepath, open_mode) as output:
- while True:
- buffer = None
- try:
- buffer = response.read(1024 * 256)
- except socket.timeout:
- pass
- if not buffer:
- if received == file_size: # Download finished
- break
- # Unexpected termination. Retry request
- tmp_headers['Range'] = 'bytes=' + str(received) + '-'
- response = urlopen_with_retry(
- request.Request(url, headers=tmp_headers)
- )
- continue
- output.write(buffer)
- received += len(buffer)
+ if is_chunked: # always append if chunked
+ open_mode = 'ab'
+ elif file_size != received + range_length: # is it ever necessary?
+ received = 0
if bar:
- bar.update_received(len(buffer))
+ bar.received = 0
+ open_mode = 'wb'
+
+ with open(temp_filepath, open_mode) as output:
+ while True:
+ buffer = None
+ try:
+ buffer = response.read(1024 * 256)
+ except socket.timeout:
+ pass
+ if not buffer:
+ if is_chunked and received_chunk == range_length:
+ break
+ elif not is_chunked and received == file_size: # Download finished
+ break
+ # Unexpected termination. Retry request
+ if not is_chunked: # when not chunked, request a new Range to resume
+ tmp_headers['Range'] = 'bytes=' + str(received) + '-'
+ response = urlopen_with_retry(
+ request.Request(url, headers=tmp_headers)
+ )
+ continue
+ output.write(buffer)
+ received += len(buffer)
+ received_chunk += len(buffer)
+ if bar:
+ bar.update_received(len(buffer))
assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (
received, os.path.getsize(temp_filepath), temp_filepath
@@ -907,7 +930,7 @@ def download_urls(
if total_size:
if not force and os.path.exists(output_filepath) and not auto_rename\
and os.path.getsize(output_filepath) >= total_size * 0.9:
- print('Skipping %s: file already exists' % output_filepath)
+ log.w('Skipping %s: file already exists' % output_filepath)
print()
return
bar = SimpleProgressBar(total_size, len(urls))
@@ -1554,9 +1577,9 @@ def google_search(url):
url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
page = get_content(url, headers=fake_headers)
videos = re.findall(
- r'([^<]+)<', page
+ r'([^<]+)<', page
)
- vdurs = re.findall(r'([^<]+)<', page)
+ vdurs = re.findall(r'([^<]+)<', page)
durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
print('Google Videos search:')
for v in zip(videos, durs):
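The `url_save` changes above allow a list of URLs that together form a single file: the total size comes from `urls_size`, every chunk is fetched in order and appended (`'ab'`), and no `Range` header is sent in chunked mode. A minimal sketch of that idea, stripped of retries, renaming and the progress bar:

```python
# Simplified sketch of the chunked branch of url_save(); not the full implementation.
from urllib import request

def save_chunks(urls, filepath):
    # Each URL is one piece of the final file, fetched in order and appended;
    # chunked downloads never send a Range header.
    open(filepath, 'wb').close()  # truncate once
    for url in urls:
        response = request.urlopen(url)
        with open(filepath, 'ab') as output:  # always append when chunked
            while True:
                buffer = response.read(1024 * 256)
                if not buffer:
                    break
                output.write(buffer)
```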
diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py
index 4c9ccaa5..8aeed560 100644
--- a/src/you_get/extractor.py
+++ b/src/you_get/extractor.py
@@ -211,7 +211,7 @@ class VideoExtractor():
ext = self.dash_streams[stream_id]['container']
total_size = self.dash_streams[stream_id]['size']
- if ext == 'm3u8':
+ if ext == 'm3u8' or ext == 'm4a':
ext = 'mp4'
if not urls:
diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py
index 649a911f..d2c4c7b7 100755
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@@ -67,6 +67,7 @@ from .sohu import *
from .soundcloud import *
from .suntv import *
from .theplatform import *
+from .tiktok import *
from .tucao import *
from .tudou import *
from .tumblr import *
@@ -88,4 +89,5 @@ from .ted import *
from .khan import *
from .zhanqi import *
from .kuaishou import *
-from .zhibo import *
\ No newline at end of file
+from .zhibo import *
+from .zhihu import *
diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py
index 4b45c5e9..200a3f54 100644
--- a/src/you_get/extractors/acfun.py
+++ b/src/you_get/extractors/acfun.py
@@ -85,9 +85,13 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
_, _, seg_size = url_info(url)
size += seg_size
#fallback to flvhd is not quite possible
- print_info(site_info, title, 'mp4', size)
+ if re.search(r'fid=[0-9A-Z\-]*\.flv', preferred[0][0]):
+ ext = 'flv'
+ else:
+ ext = 'mp4'
+ print_info(site_info, title, ext, size)
if not info_only:
- download_urls(preferred[0], title, 'mp4', size, output_dir=output_dir, merge=merge)
+ download_urls(preferred[0], title, ext, size, output_dir=output_dir, merge=merge)
else:
raise NotImplementedError(sourceType)
@@ -105,27 +109,42 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
pass
def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
- assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url)
- html = get_content(url)
+ assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url)
- title = r1(r'data-title="([^"]+)"', html)
+ if re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url):
+ html = get_content(url)
+ title = r1(r'data-title="([^"]+)"', html)
+ if match1(url, r'_(\d+)$'): # current P
+ title = title + " " + r1(r'active">([^<]*)', html)
+ vid = r1('data-vid="(\d+)"', html)
+ up = r1('data-name="([^"]+)"', html)
+ # bangumi
+ elif re.match("http://[^\.]*\.*acfun\.[^\.]+/bangumi/ab(\d+)", url):
+ html = get_content(url)
+ title = match1(html, r'"newTitle"\s*:\s*"([^"]+)"')
+ if match1(url, r'_(\d+)$'): # current P
+ title = title + " " + r1(r'active">([^<]*)', html)
+ vid = match1(html, r'videoId="(\d+)"')
+ up = "acfun"
+ else:
+ raise NotImplementedError
+
+ assert title and vid
title = unescape_html(title)
title = escape_file_path(title)
- assert title
- if match1(url, r'_(\d+)$'): # current P
- title = title + " " + r1(r'active">([^<]*)', html)
-
- vid = r1('data-vid="(\d+)"', html)
- up = r1('data-name="([^"]+)"', html)
p_title = r1('active">([^<]+)', html)
title = '%s (%s)' % (title, up)
- if p_title: title = '%s - %s' % (title, p_title)
+ if p_title:
+ title = '%s - %s' % (title, p_title)
+
acfun_download_by_vid(vid, title,
output_dir=output_dir,
merge=merge,
info_only=info_only,
**kwargs)
+
site_info = "AcFun.tv"
download = acfun_download
download_playlist = playlist_not_supported('acfun')
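For context, the new extension sniffing in `acfun_download_by_vid` keys off the CDN URL; a quick illustration with a made-up URL:

```python
import re

# Hypothetical CDN URL, shaped like the fid=...flv pattern the diff matches.
url = 'http://cdn.aixifan.com/acfun/video?fid=ABC123-XYZ.flv&range=0'
ext = 'flv' if re.search(r'fid=[0-9A-Z\-]*\.flv', url) else 'mp4'  # -> 'flv'
```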
diff --git a/src/you_get/extractors/baidu.py b/src/you_get/extractors/baidu.py
index b30c9d86..a8cb3d5d 100644
--- a/src/you_get/extractors/baidu.py
+++ b/src/you_get/extractors/baidu.py
@@ -129,8 +129,9 @@ def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=
html = get_html(url)
title = r1(r'title:"([^"]+)"', html)
- vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+)"', html) or re.findall(r'vhsrc="([^"]+)"', html)
- if vhsrc is not None:
+ vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+\.mp4)"', html) or \
+ re.findall(r'vhsrc="([^"]+)"', html)
+ if len(vhsrc) > 0:
ext = 'mp4'
size = url_size(vhsrc[0])
print_info(site_info, title, ext, size)
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index 96fc60c8..5ed7f28d 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -22,7 +22,7 @@ from .youku import youku_download_by_vid
class Bilibili(VideoExtractor):
name = 'Bilibili'
- live_api = 'http://live.bilibili.com/api/playurl?cid={}&otype=json'
+ live_api = 'https://api.live.bilibili.com/room/v1/Room/playUrl?cid={}&quality=0&platform=web'
api_url = 'http://interface.bilibili.com/v2/playurl?'
bangumi_api_url = 'http://bangumi.bilibili.com/player/web_api/playurl?'
live_room_init_api_url = 'https://api.live.bilibili.com/room/v1/Room/room_init?id={}'
@@ -115,7 +115,7 @@ class Bilibili(VideoExtractor):
self.url = 'http://www.bilibili.com/video/av{}/'.format(aid)
self.ua = fake_headers['User-Agent']
- self.url = url_locations([self.url])[0]
+ self.url = url_locations([self.url], faker=True)[0]
frag = urllib.parse.urlparse(self.url).fragment
# http://www.bilibili.com/video/av3141144/index_2.html#page=3
if frag:
@@ -125,30 +125,31 @@ class Bilibili(VideoExtractor):
aid = re.search(r'av(\d+)', self.url).group(1)
self.url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, page)
self.referer = self.url
- self.page = get_content(self.url)
+ self.page = get_content(self.url, headers=fake_headers)
m = re.search(r'(.*?)', self.page) or re.search(r'', self.page)
if m is not None:
self.title = m.group(1)
- s = re.search(r'([^<]+)', m.group(1))
+ s = re.search(r'([^<]+)', m.group(1))
if s:
self.title = unescape_html(s.group(1))
if self.title is None:
m = re.search(r'property="og:title" content="([^"]+)"', self.page)
if m is not None:
self.title = m.group(1)
-
if 'subtitle' in kwargs:
subtitle = kwargs['subtitle']
self.title = '{} {}'.format(self.title, subtitle)
else:
playinfo = re.search(r'__INITIAL_STATE__=(.*?);\(function\(\)', self.page)
if playinfo is not None:
- pages = json.loads(playinfo.group(1))['videoData']['pages']
- if len(pages) > 1:
- qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query))
- page = pages[int(qs.get('p', 1)) - 1]
- self.title = '{} #{}. {}'.format(self.title, page['page'], page['part'])
+ jsonPlayinfo = json.loads(playinfo.group(1))
+ if 'videoData' in jsonPlayinfo:
+ pages = jsonPlayinfo['videoData']['pages']
+ if len(pages) > 1:
+ qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query))
+ page = pages[int(qs.get('p', 1)) - 1]
+ self.title = '{} #{}. {}'.format(self.title, page['page'], page['part'])
if 'bangumi.bilibili.com/movie' in self.url:
self.movie_entry(**kwargs)
@@ -160,6 +161,8 @@ class Bilibili(VideoExtractor):
self.live_entry(**kwargs)
elif 'vc.bilibili.com' in self.url:
self.vc_entry(**kwargs)
+ elif 'audio/au' in self.url:
+ self.audio_entry(**kwargs)
else:
self.entry(**kwargs)
@@ -171,6 +174,30 @@ class Bilibili(VideoExtractor):
self.title = page_list[0]['pagename']
self.download_by_vid(page_list[0]['cid'], True, bangumi_movie=True, **kwargs)
+ def audio_entry(self, **kwargs):
+ assert re.match(r'https?://www.bilibili.com/audio/au\d+', self.url)
+ patt = r"(\d+)"
+ audio_id = re.search(patt, self.url).group(1)
+ audio_info_url = \
+ 'https://www.bilibili.com/audio/music-service-c/web/song/info?sid={}'.format(audio_id)
+ audio_info_response = json.loads(get_content(audio_info_url))
+ if audio_info_response['msg'] != 'success':
+ log.wtf('fetch audio information failed!')
+ sys.exit(2)
+ self.title = audio_info_response['data']['title']
+ # TODO:there is no quality option for now
+ audio_download_url = \
+ 'https://www.bilibili.com/audio/music-service-c/web/url?sid={}&privilege=2&quality=2'.format(audio_id)
+ audio_download_response = json.loads(get_content(audio_download_url))
+ if audio_download_response['msg'] != 'success':
+ log.wtf('fetch audio resource failed!')
+ sys.exit(2)
+ self.streams['mp4'] = {}
+ self.streams['mp4']['src'] = [audio_download_response['data']['cdns'][0]]
+ self.streams['mp4']['container'] = 'm4a'
+ self.streams['mp4']['size'] = audio_download_response['data']['size']
+
+
def entry(self, **kwargs):
# tencent player
tc_flashvars = re.search(r'"bili-cid=\d+&bili-aid=\d+&vid=([^"]+)"', self.page)
@@ -190,7 +217,12 @@ class Bilibili(VideoExtractor):
index_id = int(re.search(r'index_(\d+)', self.url).group(1))
cid = page_list[index_id-1]['cid'] # change cid match rule
except:
- cid = re.search(r'"cid":(\d+)', self.page).group(1)
+ page = re.search(r'p=(\d+)', self.url)
+ if page is None:
+ p = 1
+ else:
+ p = int(page.group(1))
+ cid = re.search(r'"cid":(\d+),"page":%s' % p, self.page).group(1)
if cid is not None:
self.download_by_vid(cid, re.search('bangumi', self.url) is not None, **kwargs)
else:
@@ -226,7 +258,7 @@ class Bilibili(VideoExtractor):
api_url = self.live_api.format(self.room_id)
json_data = json.loads(get_content(api_url))
- urls = [json_data['durl'][0]['url']]
+ urls = [json_data['data']['durl'][0]['url']]
self.streams['live'] = {}
self.streams['live']['src'] = urls
@@ -252,28 +284,9 @@ class Bilibili(VideoExtractor):
self.streams['vc']['size'] = int(item['video_size'])
def bangumi_entry(self, **kwargs):
- bangumi_id = re.search(r'(\d+)', self.url).group(1)
- frag = urllib.parse.urlparse(self.url).fragment
- if frag:
- episode_id = frag
- else:
- episode_id = re.search(r'first_ep_id\s*=\s*"(\d+)"', self.page) or re.search(r'\/ep(\d+)', self.url).group(1)
- # cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data=dict(episode_id=episode_id))
- # cid = json.loads(cont)['result']['cid']
- cont = get_content('http://bangumi.bilibili.com/web_api/episode/{}.json'.format(episode_id))
- ep_info = json.loads(cont)['result']['currentEpisode']
-
- bangumi_data = get_bangumi_info(str(ep_info['seasonId']))
- bangumi_payment = bangumi_data.get('payment')
- if bangumi_payment and bangumi_payment['price'] != '0':
- log.w("It's a paid item")
- # ep_ids = collect_bangumi_epids(bangumi_data)
-
- index_title = ep_info['indexTitle']
- long_title = ep_info['longTitle'].strip()
- cid = ep_info['danmaku']
-
- self.title = '{} [{} {}]'.format(self.title, index_title, long_title)
+ data = json.loads(re.search(r'__INITIAL_STATE__=(.+);\(function', self.page).group(1))
+ cid = data['epInfo']['cid']
+ # index_title = data['epInfo']['index_title']
self.download_by_vid(cid, bangumi=True, **kwargs)
@@ -376,10 +389,82 @@ def download_video_from_favlist(url, **kwargs):
else:
log.wtf("Fail to parse the fav title" + url, "")
+def download_music_from_favlist(url, page, **kwargs):
+ m = re.search(r'https?://www.bilibili.com/audio/mycollection/(\d+)', url)
+ if m is not None:
+ sid = m.group(1)
+ json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-coll?"
+ "sid={}&pn={}&ps=100".format(sid, page)))
+ if json_result['msg'] == 'success':
+ music_list = json_result['data']['data']
+ music_count = len(music_list)
+ for i in range(music_count):
+ audio_id = music_list[i]['id']
+ audio_title = music_list[i]['title']
+ audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id)
+ print("Start downloading music ", audio_title)
+ Bilibili().download_by_url(audio_url, **kwargs)
+ if page < json_result['data']['pageCount']:
+ page += 1
+ download_music_from_favlist(url, page, **kwargs)
+ else:
+ log.wtf("Fail to get music list of page " + json_result)
+ sys.exit(2)
+ else:
+ log.wtf("Fail to parse the sid from " + url, "")
+def download_video_from_totallist(url, page, **kwargs):
+ # the url has format: https://space.bilibili.com/64169458/#/video
+ m = re.search(r'space\.bilibili\.com/(\d+)/.*?video', url)
+ mid = ""
+ if m is not None:
+ mid = m.group(1)
+ jsonresult = json.loads(get_content("https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=100&tid=0&page={}&keyword=&order=pubdate&jsonp=jsonp".format(mid, page)))
+ if jsonresult['status']:
+ videos = jsonresult['data']['vlist']
+ videocount = len(videos)
+ for i in range(videocount):
+ videoid = videos[i]["aid"]
+ videotitle = videos[i]["title"]
+ videourl = "https://www.bilibili.com/video/av{}".format(videoid)
+ print("Start downloading ", videotitle, " video ", videotitle)
+ Bilibili().download_by_url(videourl, subtitle=videotitle, **kwargs)
+ if page < jsonresult['data']['pages']:
+ page += 1
+ download_video_from_totallist(url, page, **kwargs)
+ else:
+ log.wtf("Fail to get the files of page " + jsonresult)
+ sys.exit(2)
+
+ else:
+ log.wtf("Fail to parse the video title" + url, "")
+
+def download_music_from_totallist(url, page, **kwargs):
+ m = re.search(r'https?://www.bilibili.com/audio/am(\d+)\?type=\d', url)
+ if m is not None:
+ sid = m.group(1)
+ json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-menu?"
+ "sid={}&pn={}&ps=100".format(sid, page)))
+ if json_result['msg'] == 'success':
+ music_list = json_result['data']['data']
+ music_count = len(music_list)
+ for i in range(music_count):
+ audio_id = music_list[i]['id']
+ audio_title = music_list[i]['title']
+ audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id)
+ print("Start downloading music ",audio_title)
+ Bilibili().download_by_url(audio_url, **kwargs)
+ if page < json_result['data']['pageCount']:
+ page += 1
+ download_music_from_totallist(url, page, **kwargs)
+ else:
+ log.wtf("Fail to get music list of page " + json_result)
+ sys.exit(2)
+ else:
+ log.wtf("Fail to parse the sid from " + url, "")
def bilibili_download_playlist_by_url(url, **kwargs):
- url = url_locations([url])[0]
+ url = url_locations([url], faker=True)[0]
kwargs['playlist'] = True
# a bangumi here? possible?
if 'live.bilibili' in url:
@@ -396,6 +481,12 @@ def bilibili_download_playlist_by_url(url, **kwargs):
elif 'favlist' in url:
# this a fav list folder
download_video_from_favlist(url, **kwargs)
+ elif re.match(r'https?://space.bilibili.com/\d+/#/video', url):
+ download_video_from_totallist(url, 1, **kwargs)
+ elif re.match(r'https://www.bilibili.com/audio/mycollection/\d+', url):
+ download_music_from_favlist(url, 1, **kwargs)
+ elif re.match(r'https?://www.bilibili.com/audio/am\d+\?type=\d', url):
+ download_music_from_totallist(url, 1, **kwargs)
else:
aid = re.search(r'av(\d+)', url).group(1)
page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid)))
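For reference, a condensed sketch of the flow `audio_entry` implements, hitting the same two endpoints that appear in this diff (song info, then CDN URL); the `sid` value is a made-up example and the endpoints may require extra headers in practice:

```python
# Condensed version of the audio_entry() flow above; sid is hypothetical.
import json
from urllib.request import urlopen

sid = 15664  # e.g. from https://www.bilibili.com/audio/au15664
info = json.loads(urlopen(
    'https://www.bilibili.com/audio/music-service-c/web/song/info?sid={}'.format(sid)
).read().decode('utf-8'))
assert info['msg'] == 'success'
title = info['data']['title']

cdn = json.loads(urlopen(
    'https://www.bilibili.com/audio/music-service-c/web/url?sid={}&privilege=2&quality=2'.format(sid)
).read().decode('utf-8'))
src = cdn['data']['cdns'][0]   # first CDN mirror; the container is m4a
size = cdn['data']['size']
```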
diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py
index 65fc01f5..567e0dd7 100755
--- a/src/you_get/extractors/instagram.py
+++ b/src/you_get/extractors/instagram.py
@@ -29,9 +29,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
image_url = edge['node']['display_url']
if 'video_url' in edge['node']:
image_url = edge['node']['video_url']
- image_url = image_url.split('?')[0]
- ext = image_url.split('.')[-1]
+ ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])
+
print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls=[image_url],
@@ -44,9 +44,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
image_url =info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
- image_url = image_url.split('?')[0]
- ext = image_url.split('.')[-1]
+ ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])
+
print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls=[image_url],
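The Instagram change keeps the full (often signed) URL for the download and derives the extension from the path component alone. A quick illustration with a hypothetical URL:

```python
# Why the split order matters; the URL below is made up.
image_url = 'https://scontent.cdninstagram.com/v/t50/12345.mp4?_nc_sid=abc&oh=def.ghi'
image_url.split('.')[-1]                # 'ghi' -- wrong: query strings can contain dots
image_url.split('?')[0].split('.')[-1]  # 'mp4' -- extension from the path only
# image_url itself, query string intact, is what gets downloaded
```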
diff --git a/src/you_get/extractors/iwara.py b/src/you_get/extractors/iwara.py
index 50d14fb8..a30159d7 100644
--- a/src/you_get/extractors/iwara.py
+++ b/src/you_get/extractors/iwara.py
@@ -17,20 +17,20 @@ headers = {
def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
global headers
- video_hash=match1(url, r'http://\w+.iwara.tv/videos/(\w+)')
- video_url=match1(url, r'(http://\w+.iwara.tv)/videos/\w+')
- html = get_content(url,headers=headers)
+ video_hash = match1(url, r'https?://\w+\.iwara\.tv/videos/(\w+)')
+ video_url = match1(url, r'(https?://\w+\.iwara\.tv)/videos/\w+')
+ html = get_content(url, headers=headers)
title = r1(r'(.*)', html)
- api_url=video_url+'/api/video/'+video_hash
- content=get_content(api_url,headers=headers)
- data=json.loads(content)
- type,ext,size=url_info(data[0]['uri'], headers=headers)
- down_urls=data[0]['uri']
- print_info(down_urls,title+data[0]['resolution'],type,size)
+ api_url = video_url + '/api/video/' + video_hash
+ content = get_content(api_url, headers=headers)
+ data = json.loads(content)
+ down_urls = 'https:' + data[0]['uri']
+ type, ext, size = url_info(down_urls, headers=headers)
+ print_info(site_info, title+data[0]['resolution'], type, size)
if not info_only:
- download_urls([down_urls], title, ext, size, output_dir, merge = merge,headers=headers)
+ download_urls([down_urls], title, ext, size, output_dir, merge=merge, headers=headers)
-site_info = "iwara"
+site_info = "Iwara"
download = iwara_download
download_playlist = playlist_not_supported('iwara')
diff --git a/src/you_get/extractors/ixigua.py b/src/you_get/extractors/ixigua.py
index 59133442..3cf07b09 100644
--- a/src/you_get/extractors/ixigua.py
+++ b/src/you_get/extractors/ixigua.py
@@ -1,14 +1,132 @@
#!/usr/bin/env python
-__all__ = ['ixigua_download']
+import base64
-from .toutiao import download as toutiao_download
-from .toutiao import download_playlist as toutiao_download_playlist
+import binascii
+
+from ..common import *
+import random
+import ctypes
+from json import loads
+
+__all__ = ['ixigua_download', 'ixigua_download_playlist_by_url']
+
+headers = {
+ "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 "
+ "Safari/537.36",
+}
+
+
+def int_overflow(val):
+ maxint = 2147483647
+ if not -maxint - 1 <= val <= maxint:
+ val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
+ return val
+
+
+def unsigned_right_shift(n, i):
+ if n < 0:
+ n = ctypes.c_uint32(n).value
+ if i < 0:
+ return -int_overflow(n << abs(i))
+ return int_overflow(n >> i)
+
+
+def get_video_url_from_video_id(video_id):
+ """Splicing URLs according to video ID to get video details"""
+ # from js
+ data = [""] * 256
+ for index, _ in enumerate(data):
+ t = index
+ for i in range(8):
+ t = -306674912 ^ unsigned_right_shift(t, 1) if 1 & t else unsigned_right_shift(t, 1)
+ data[index] = t
+
+ def tmp():
+ rand_num = random.random()
+ path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id,
+ random_num=str(rand_num)[2:])
+ e = o = r = -1
+ i, a = 0, len(path)
+ while i < a:
+ e = ord(path[i])
+ i += 1
+ if e < 128:
+ r = unsigned_right_shift(r, 8) ^ data[255 & (r ^ e)]
+ else:
+ if e < 2048:
+ r = unsigned_right_shift(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))]
+ r = unsigned_right_shift(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
+ else:
+ if 55296 <= e < 57344:
+ e = (1023 & e) + 64
+ i += 1
+ o = 1023 & ord(path[i]) # next code unit (assumed low surrogate)
+ r = unsigned_right_shift(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))]
+ r = unsigned_right_shift(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))]
+ r = unsigned_right_shift(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))]
+ r = unsigned_right_shift(r, 8) ^ data[255 & (r ^ (128 | 63 & o))]
+ else:
+ r = unsigned_right_shift(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))]
+ r = unsigned_right_shift(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))]
+ r = unsigned_right_shift(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
+
+ return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0))
+
+ while 1:
+ url = tmp()
+ if url.split("=")[-1][0] != "-": # 参数s不能为负数
+ return url
def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
- return toutiao_download(url.replace('ixigua', '365yg'))
+ # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422
+ html = get_html(url, faker=True)
+ video_id = match1(html, r"videoId\s*:\s*'([^']+)'")
+ title = match1(html, r"title: '(\S+)',")
+ if not video_id:
+ log.e("video_id not found, url:{}".format(url))
+ return
+ video_info_url = get_video_url_from_video_id(video_id)
+ video_info = loads(get_content(video_info_url))
+ if video_info.get("code", 1) != 0:
+ log.e("Get video info from {} error: server return code {}".format(video_info_url, video_info.get("code", 1)))
+ return
+ if not video_info.get("data", None):
+ log.e("Get video info from {} error: The server returns JSON value"
+ " without data or data is empty".format(video_info_url))
+ return
+ if not video_info["data"].get("video_list", None):
+ log.e("Get video info from {} error: The server returns JSON value"
+ " without data.video_list or data.video_list is empty".format(video_info_url))
+ return
+ if not video_info["data"]["video_list"].get("video_1", None):
+ log.e("Get video info from {} error: The server returns JSON value"
+ " without data.video_list.video_1 or data.video_list.video_1 is empty".format(video_info_url))
+ return
+ size = int(video_info["data"]["video_list"]["video_1"]["size"])
+ print_info(site_info=site_info, title=title, type="mp4", size=size) # the site only serves mp4 files
+ if not info_only:
+ video_url = base64.b64decode(video_info["data"]["video_list"]["video_1"]["main_url"].encode("utf-8"))
+ download_urls([video_url.decode("utf-8")], title, "mp4", size, output_dir, merge=merge, headers=headers, **kwargs)
+
+
+def ixigua_download_playlist_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):
+ assert "user" in url, "Only support users to publish video list,Please provide a similar url:" \
+ "https://www.ixigua.com/c/user/6907091136/"
+
+ user_id = url.split("/")[-2] if url[-1] == "/" else url.split("/")[-1]
+ params = {"max_behot_time": "0", "max_repin_time": "0", "count": "20", "page_type": "0", "user_id": user_id}
+ while 1:
+ url = "https://www.ixigua.com/c/user/article/?" + "&".join(["{}={}".format(k, v) for k, v in params.items()])
+ video_list = loads(get_content(url, headers=headers))
+ params["max_behot_time"] = video_list["next"]["max_behot_time"]
+ for video in video_list["data"]:
+ ixigua_download("https://www.ixigua.com/i{}/".format(video["item_id"]), output_dir, merge, info_only,
+ **kwargs)
+ if video_list["next"]["max_behot_time"] == 0:
+ break
site_info = "ixigua.com"
download = ixigua_download
-download_playlist = toutiao_download_playlist
+download_playlist = ixigua_download_playlist_by_url
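A note on `get_video_url_from_video_id` above: the table constant `-306674912` is `0xEDB88320` read as a signed 32-bit integer, i.e. the reflected CRC-32 polynomial, and the final `r ^ -1` with an unsigned shift is the usual CRC-32 finalization. If that reading is correct (an assumption, not verified against the site), the whole signing routine reduces to `binascii.crc32`, which would also explain the otherwise unused `import binascii`:

```python
# Hedged equivalent of get_video_url_from_video_id(), assuming the loop is plain CRC-32.
import binascii
import random

def sign_video_path(video_id):
    path = '/video/urls/v/1/toutiao/mp4/{}?r={}'.format(video_id, str(random.random())[2:])
    s = binascii.crc32(path.encode('utf-8'))  # unsigned in Python 3, so never negative
    return 'https://ib.365yg.com{}&s={}'.format(path, s)
```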
diff --git a/src/you_get/extractors/lizhi.py b/src/you_get/extractors/lizhi.py
index 65988a9f..4991df31 100644
--- a/src/you_get/extractors/lizhi.py
+++ b/src/you_get/extractors/lizhi.py
@@ -2,8 +2,17 @@
__all__ = ['lizhi_download']
import json
+import datetime
from ..common import *
+#
+# Works well, but not perfect.
+# TODO: add option --format={sd|hd}
+#
+def get_url(ep):
+ readable = datetime.datetime.fromtimestamp(int(ep['create_time']) / 1000).strftime('%Y/%m/%d')
+ return 'http://cdn5.lizhi.fm/audio/{}/{}_hd.mp3'.format(readable, ep['id'])
+
# radio_id: e.g. 549759 from http://www.lizhi.fm/549759/
#
# Returns a list of tuples (audio_id, title, url) for each episode
@@ -23,7 +32,7 @@ def lizhi_extract_playlist_info(radio_id):
# (au_cnt), then handle pagination properly.
api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band=%s' % radio_id
api_response = json.loads(get_content(api_url))
- return [(ep['id'], ep['name'], ep['url']) for ep in api_response]
+ return [(ep['id'], ep['name'], get_url(ep)) for ep in api_response]
def lizhi_download_audio(audio_id, title, url, output_dir='.', info_only=False):
filetype, ext, size = url_info(url)
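A worked example of the URL scheme `get_url` builds (episode values hypothetical; the rendered date depends on the local timezone):

```python
# get_url() from the diff, applied to a made-up episode dict.
import datetime

ep = {'create_time': '1467964800000', 'id': '2920123'}  # hypothetical episode
readable = datetime.datetime.fromtimestamp(
    int(ep['create_time']) / 1000).strftime('%Y/%m/%d')  # e.g. '2016/07/08'
url = 'http://cdn5.lizhi.fm/audio/{}/{}_hd.mp3'.format(readable, ep['id'])
# -> 'http://cdn5.lizhi.fm/audio/2016/07/08/2920123_hd.mp3'
```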
diff --git a/src/you_get/extractors/miaopai.py b/src/you_get/extractors/miaopai.py
index f37d45b0..304ac176 100644
--- a/src/you_get/extractors/miaopai.py
+++ b/src/you_get/extractors/miaopai.py
@@ -2,9 +2,12 @@
__all__ = ['miaopai_download']
+import string
+import random
from ..common import *
import urllib.error
import urllib.parse
+from ..util import fs
fake_headers_mobile = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -20,6 +23,10 @@ def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = Fa
mobile_page = get_content(page_url, headers=fake_headers_mobile)
url = match1(mobile_page, r'