diff --git a/README.md b/README.md index 856f6c80..f524c60d 100644 --- a/README.md +++ b/README.md @@ -426,6 +426,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 抖音 | |✓| | | | TikTok | |✓| | | | 中国体育(TV) |
|✓| | | +| 知乎 | |✓| | | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. diff --git a/src/you_get/common.py b/src/you_get/common.py index 47893910..78182163 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -128,6 +128,7 @@ SITES = { 'youtube' : 'youtube', 'zhanqi' : 'zhanqi', 'zhibo' : 'zhibo', + 'zhihu' : 'zhihu', } dry_run = False diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 302433c0..d2c4c7b7 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -90,3 +90,4 @@ from .khan import * from .zhanqi import * from .kuaishou import * from .zhibo import * +from .zhihu import * diff --git a/src/you_get/extractors/zhihu.py b/src/you_get/extractors/zhihu.py new file mode 100644 index 00000000..64f81423 --- /dev/null +++ b/src/you_get/extractors/zhihu.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python + +__all__ = ['zhihu_download', 'zhihu_download_playlist'] + +from ..common import * +import json + + +def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs): + paths = url.split("/") + # question or column + if len(paths) < 3 and len(paths) < 6: + raise TypeError("URL does not conform to specifications, Support column and question only." + "Example URL: https://zhuanlan.zhihu.com/p/51669862 or " + "https://www.zhihu.com/question/267782048/answer/490720324") + + if ("question" not in paths or "answer" not in paths) and "zhuanlan.zhihu.com" not in paths: + raise TypeError("URL does not conform to specifications, Support column and question only." + "Example URL: https://zhuanlan.zhihu.com/p/51669862 or " + "https://www.zhihu.com/question/267782048/answer/490720324") + + html = get_html(url, faker=True) + title = match1(html, r'data-react-helmet="true">(.*?)') + for index, video_id in enumerate(matchall(html, [r'