[zhihu] Add zhihu.com support

perror 2018-12-07 21:38:39 +08:00
parent 7dbfece21f
commit 8454f878d2
4 changed files with 82 additions and 0 deletions

README.md

@@ -426,6 +426,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the page.
| 抖音 | <https://www.douyin.com/> |✓| | |
| TikTok | <https://www.tiktok.com/> |✓| | |
| 中国体育(TV) | <http://v.zhibo.tv/> </br><http://video.zhibo.tv/> |✓| | |
| 知乎 | <https://www.zhihu.com/> |✓| | |
For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.

src/you_get/common.py

@@ -128,6 +128,7 @@ SITES = {
    'youtube' : 'youtube',
    'zhanqi' : 'zhanqi',
    'zhibo' : 'zhibo',
    'zhihu' : 'zhihu',
}
dry_run = False
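For context, the SITES table above maps a keyword from the host name to the extractor module that handles it, so the new 'zhihu' entry is what routes zhihu.com (and zhuanlan.zhihu.com) URLs to extractors/zhihu.py. A minimal, hypothetical sketch of that kind of lookup, simplified from you-get's actual URL dispatch:

```python
from urllib.parse import urlparse

SITES = {'zhibo': 'zhibo', 'zhihu': 'zhihu'}  # excerpt of the table above


def extractor_name(url):
    # Look up the second-level domain ("zhihu" in both www.zhihu.com and
    # zhuanlan.zhihu.com); None means the universal extractor handles the page.
    host = urlparse(url).netloc
    return SITES.get(host.split('.')[-2])


assert extractor_name("https://www.zhihu.com/question/267782048") == "zhihu"
assert extractor_name("https://zhuanlan.zhihu.com/p/51669862") == "zhihu"
```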

src/you_get/extractors/__init__.py

@@ -90,3 +90,4 @@ from .khan import *
from .zhanqi import *
from .kuaishou import *
from .zhibo import *
from .zhihu import *

src/you_get/extractors/zhihu.py

@@ -0,0 +1,79 @@
#!/usr/bin/env python

__all__ = ['zhihu_download', 'zhihu_download_playlist']

from ..common import *
import json


def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    paths = url.split("/")
    # Only a column article or a question/answer page is accepted.
    if len(paths) < 3 and len(paths) < 6:
        raise TypeError("URL does not conform to specification; only column and question pages are supported. "
                        "Example URLs: https://zhuanlan.zhihu.com/p/51669862 or "
                        "https://www.zhihu.com/question/267782048/answer/490720324")

    if ("question" not in paths or "answer" not in paths) and "zhuanlan.zhihu.com" not in paths:
        raise TypeError("URL does not conform to specification; only column and question pages are supported. "
                        "Example URLs: https://zhuanlan.zhihu.com/p/51669862 or "
                        "https://www.zhihu.com/question/267782048/answer/490720324")

    html = get_html(url, faker=True)
    title = match1(html, r'data-react-helmet="true">(.*?)</title>')
    # Each embedded video is referenced by a numeric id; its play addresses are
    # fetched from the lens.zhihu.com video API.
    for index, video_id in enumerate(matchall(html, [r'<a class="video-box" href="\S+video/(\d+)"'])):
        try:
            video_info = json.loads(
                get_content(r"https://lens.zhihu.com/api/videos/{}".format(video_id), headers=fake_headers))
        except json.decoder.JSONDecodeError:
            log.w("Video id not found: {}".format(video_id))
            continue

        play_list = video_info["playlist"]
        # Prefer HD, fall back to SD, then LD; skip the video if none is available.
        data = play_list.get("hd", play_list.get("sd", play_list.get("ld", None)))
        if not data:
            log.w("No play address for video id: {}".format(video_id))
            continue
        print_info(site_info, title, data["format"], data["size"])
        if not info_only:
            ext = "_{}.{}".format(index, data["format"])
            if kwargs.get("zhihu_offset"):
                ext = "_{}".format(kwargs["zhihu_offset"]) + ext
            download_urls([data["play_url"]], title, ext, data["size"],
                          output_dir=output_dir, merge=merge, **kwargs)
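
# Illustrative shape of the lens.zhihu.com response consumed above, reconstructed
# only from the fields this extractor reads; it is not an official or complete schema:
#
#     {
#         "playlist": {
#             "hd": {"format": "mp4", "size": 12345678, "play_url": "https://..."},
#             "sd": {"format": "mp4", "size": 2345678, "play_url": "https://..."},
#             "ld": {"format": "mp4", "size": 345678, "play_url": "https://..."}
#         }
#     }
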
def zhihu_download_playlist(url, output_dir='.', merge=True, info_only=False, **kwargs):
    if "question" not in url or "answer" in url:  # only a bare question page is accepted
        raise TypeError("URL does not conform to specification; only question pages are supported. "
                        "Example URL: https://www.zhihu.com/question/267782048")

    url = url.split("?")[0]
    if url[-1] == "/":
        question_id = url.split("/")[-2]
    else:
        question_id = url.split("/")[-1]
    videos_url = r"https://www.zhihu.com/api/v4/questions/{}/answers".format(question_id)
    try:
        questions = json.loads(get_content(videos_url))
    except json.decoder.JSONDecodeError:
        raise TypeError("Check whether the question URL exists. "
                        "Example URL: https://www.zhihu.com/question/267782048")

    # Walk the paginated answer list: download each answer in turn, then follow
    # paging["next"] until paging["is_end"] marks the last page.
    count = 0
    while True:
        for data in questions["data"]:
            kwargs["zhihu_offset"] = count
            zhihu_download("https://www.zhihu.com/question/{}/answer/{}".format(question_id, data["id"]),
                           output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
            count += 1
        if questions["paging"]["is_end"]:
            return

        questions = json.loads(get_content(questions["paging"]["next"], headers=fake_headers))


site_info = "zhihu.com"
download = zhihu_download
download_playlist = zhihu_download_playlist
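
For reference, here is a hedged usage sketch of the two new entry points called directly from Python. The import path follows you-get's package layout and the calls mirror the signatures added above; the snippet itself is illustrative and not part of the commit.

```python
# Illustrative only; assumes the you_get package is importable
# (e.g. a source checkout on PYTHONPATH).
from you_get.extractors.zhihu import zhihu_download, zhihu_download_playlist

# Print format and size of the videos in a single answer without downloading them.
zhihu_download("https://www.zhihu.com/question/267782048/answer/490720324",
               info_only=True)

# Download every video answer under a question into ./zhihu; files are suffixed
# with the answer offset and the per-answer video index, as the code above does.
zhihu_download_playlist("https://www.zhihu.com/question/267782048",
                        output_dir="./zhihu")
```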