Merge remote-tracking branch 'upstream/develop' into develop

# Conflicts:
#	src/you_get/extractors/bilibili.py
ed 2024-09-03 09:42:34 +08:00
commit 59e1b4d6ef
59 changed files with 1443 additions and 1666 deletions

.github/workflows/python-package.yml

@ -0,0 +1,39 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
name: develop
on:
push:
branches: [ develop ]
pull_request:
branches: [ develop ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, '3.10', '3.11', '3.12', pypy-3.8, pypy-3.9, pypy-3.10]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools
pip install flake8
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with unittest
run: |
make test
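
The same checks this workflow runs on every push and pull request can be reproduced locally, roughly as follows (a sketch only; it assumes a Python 3 environment in which the project's test suite can run):

```
$ python -m pip install --upgrade pip setuptools flake8
$ pip install -r requirements.txt
$ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
$ make test
```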

.gitignore

@ -79,13 +79,15 @@ _*
*.ts
*.webm
*.xml
*.json
/.env
/.idea
*.m4a
*.DS_Store
*.txt
*.sw[a-p]
*.zip
.emacs*
.vscode

.travis.yml

@ -1,22 +0,0 @@
# https://travis-ci.org/soimort/you-get
language: python
python:
- "3.4"
- "3.5"
- "3.6"
- "3.7"
- "3.8"
#- "nightly" (flake8 not working in python 3.9 yet, module 'ast' has no attribute 'AugLoad')
- "pypy3"
before_install:
- pip install flake8
before_script:
- flake8 . --count --select=E9,F63,F72,F82 --show-source --statistics
script: make test
notifications:
webhooks:
urls:
- https://webhooks.gitter.im/e/43cd57826e88ed8f2152
on_success: change # options: [always|never|change] default: always
on_failure: always # options: [always|never|change] default: always
on_start: never # options: [always|never|change] default: always


@ -1,6 +1,6 @@
MIT License
Copyright (c) 2012-2020 Mort Yao <mort.yao@gmail.com> and other contributors
Copyright (c) 2012-2024 Mort Yao <mort.yao@gmail.com> and other contributors
(https://github.com/soimort/you-get/graphs/contributors)
Copyright (c) 2012 Boyu Guo <iambus@gmail.com>

MANIFEST.in

@ -1,6 +1,9 @@
include *.rst
include *.txt
include Makefile
include CONTRIBUTING.md
include README.md
include you-get
include you-get.json
include you-get.plugin.zsh
recursive-include contrib *

Makefile

@ -1,14 +1,12 @@
SETUP = python3 setup.py
.PHONY: default i test clean all html rst build sdist bdist bdist_egg bdist_wheel install release
.PHONY: default i test clean all html rst build install release
default: i
i:
@(cd src/; python3 -i -c 'import you_get; print("You-Get %s\n>>> import you_get" % you_get.version.__version__)')
@(cd src; python -i -c 'import you_get; print("You-Get %s\n>>> import you_get" % you_get.version.__version__)')
test:
$(SETUP) test
(cd src; python -m unittest discover -s ../tests)
clean:
zenity --question
@ -16,7 +14,7 @@ clean:
find . | grep __pycache__ | xargs rm -fr
find . | grep .pyc | xargs rm -f
all: build sdist bdist bdist_egg bdist_wheel
all: build
html:
pandoc README.md > README.html
@ -25,23 +23,11 @@ rst:
pandoc -s -t rst README.md > README.rst
build:
$(SETUP) build
sdist:
$(SETUP) sdist
bdist:
$(SETUP) bdist
bdist_egg:
$(SETUP) bdist_egg
bdist_wheel:
$(SETUP) bdist_wheel
python -m build
install:
$(SETUP) install --user --prefix=
python -m pip install .
release:
zenity --question
$(SETUP) sdist bdist_wheel upload --sign
release: build
@echo 'Upload new version to PyPI using:'
@echo ' twine upload --sign dist/you_get-VERSION*'
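
With the `setup.py` subcommands removed, packaging now goes through `python -m build` and `twine`. A release would look roughly like this (a sketch; it assumes the `build` and `twine` packages are installed, with the actual version in place of `VERSION`):

```
$ make build    # runs: python -m build
$ twine upload --sign dist/you_get-VERSION*
```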

README.md

@ -1,10 +1,12 @@
# You-Get
[![Build Status](https://github.com/soimort/you-get/workflows/develop/badge.svg)](https://github.com/soimort/you-get/actions)
[![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/)
[![Build Status](https://travis-ci.org/soimort/you-get.svg)](https://travis-ci.org/soimort/you-get)
[![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
**NOTICE: Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.**
**NOTICE (30 May 2022): Support for Python 3.5, 3.6 and 3.7 will eventually be dropped. ([see details here](https://github.com/soimort/you-get/wiki/TLS-1.3-post-handshake-authentication-(PHA)))**
**NOTICE (8 Mar 2019): Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.**
---
@ -53,17 +55,17 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim
### Prerequisites
The following dependencies are necessary:
The following dependencies are recommended:
* **[Python](https://www.python.org/downloads/)** 3.2 or above
* **[Python](https://www.python.org/downloads/)** 3.7.4 or above
* **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above
* (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/)
### Option 1: Install via pip
The official release of `you-get` is distributed on [PyPI](https://pypi.python.org/pypi/you-get), and can be installed easily from a PyPI mirror via the [pip](https://en.wikipedia.org/wiki/Pip_\(package_manager\)) package manager. Note that you must use the Python 3 version of `pip`:
The official release of `you-get` is distributed on [PyPI](https://pypi.python.org/pypi/you-get), and can be installed easily from a PyPI mirror via the [pip](https://en.wikipedia.org/wiki/Pip_\(package_manager\)) package manager: (Note that you must use the Python 3 version of `pip`)
$ pip3 install you-get
$ pip install you-get
### Option 2: Install via [Antigen](https://github.com/zsh-users/antigen) (for Zsh users)
@ -78,16 +80,26 @@ You may either download the [stable](https://github.com/soimort/you-get/archive/
Alternatively, run
```
$ [sudo] python3 setup.py install
$ cd path/to/you-get
$ [sudo] python -m pip install .
```
Or
```
$ python3 setup.py install --user
$ cd path/to/you-get
$ python -m pip install . --user
```
to install `you-get` to a permanent path.
to install `you-get` to a permanent path. (And don't omit the dot `.` representing the current directory)
You can also use [pipenv](https://pipenv.pypa.io/en/latest) to install `you-get` in a Python virtual environment.
```
$ pipenv install -e .
$ pipenv run you-get --version
you-get: version 0.4.1555, a tiny downloader that scrapes the web.
```
### Option 4: Git clone
@ -97,7 +109,7 @@ This is the recommended way for all developers, even if you don't often code in
$ git clone git://github.com/soimort/you-get.git
```
Then put the cloned directory into your `PATH`, or run `./setup.py install` to install `you-get` to a permanent path.
Then put the cloned directory into your `PATH`, or run `python -m pip install path/to/you-get` to install `you-get` to a permanent path.
### Option 5: Homebrew (Mac only)
@ -115,6 +127,14 @@ You can install `you-get` easily via:
# pkg install you-get
```
### Option 7: Flox (Mac, Linux, and Windows WSL)
You can install `you-get` easily via:
```
$ flox install you-get
```
### Shell completion
Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](https://github.com/soimort/you-get/tree/develop/contrib/completion). Please consult your shell's manual for how to take advantage of them.
@ -124,7 +144,7 @@ Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completi
Based on which option you chose to install `you-get`, you may upgrade it via:
```
$ pip3 install --upgrade you-get
$ pip install --upgrade you-get
```
or download the latest release via:
@ -136,7 +156,7 @@ $ you-get https://github.com/soimort/you-get/archive/master.zip
In order to get the latest ```develop``` branch without messing up the PIP, you can try:
```
$ pip3 install --upgrade git+https://github.com/soimort/you-get@develop
$ pip install --upgrade git+https://github.com/soimort/you-get@develop
```
## Getting Started
@ -256,25 +276,20 @@ Type: JPEG Image (image/jpeg)
Size: 0.06 MiB (66482 Bytes)
Downloading rms.jpg ...
100.0% ( 0.1/0.1 MB) ├████████████████████████████████████████┤[1/1] 127 kB/s
100% ( 0.1/ 0.1MB) ├████████████████████████████████████████┤[1/1] 127 kB/s
```
Otherwise, `you-get` will scrape the web page and try to figure out if there's anything interesting to you:
```
$ you-get http://kopasas.tumblr.com/post/69361932517
$ you-get https://kopasas.tumblr.com/post/69361932517
Site: Tumblr.com
Title: kopasas
Type: Unknown type (None)
Size: 0.51 MiB (536583 Bytes)
Site: Tumblr.com
Title: tumblr_mxhg13jx4n1sftq6do1_1280
Title: [tumblr] tumblr_mxhg13jx4n1sftq6do1_640
Type: Portable Network Graphics (image/png)
Size: 0.51 MiB (536583 Bytes)
Size: 0.11 MiB (118484 Bytes)
Downloading tumblr_mxhg13jx4n1sftq6do1_1280.png ...
100.0% ( 0.5/0.5 MB) ├████████████████████████████████████████┤[1/1] 22 MB/s
Downloading [tumblr] tumblr_mxhg13jx4n1sftq6do1_640.png ...
100% ( 0.1/ 0.1MB) ├████████████████████████████████████████┤[1/1] 22 MB/s
```
**Note:**
@ -364,83 +379,81 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| Site | URL | Videos? | Images? | Audios? |
| :--: | :-- | :-----: | :-----: | :-----: |
| **YouTube** | <https://www.youtube.com/> |✓| | |
| **Twitter** | <https://twitter.com/> |✓|✓| |
| VK | <http://vk.com/> |✓|✓| |
| Vine | <https://vine.co/> |✓| | |
| **X (Twitter)** | <https://x.com/> |✓|✓| |
| VK | <https://vk.com/> |✓|✓| |
| Vimeo | <https://vimeo.com/> |✓| | |
| Veoh | <http://www.veoh.com/> |✓| | |
| Veoh | <https://www.veoh.com/> |✓| | |
| **Tumblr** | <https://www.tumblr.com/> |✓|✓|✓|
| TED | <http://www.ted.com/> |✓| | |
| TED | <https://www.ted.com/> |✓| | |
| SoundCloud | <https://soundcloud.com/> | | |✓|
| SHOWROOM | <https://www.showroom-live.com/> |✓| | |
| Pinterest | <https://www.pinterest.com/> | |✓| |
| MTV81 | <http://www.mtv81.com/> |✓| | |
| MTV81 | <https://www.mtv81.com/> |✓| | |
| Mixcloud | <https://www.mixcloud.com/> | | |✓|
| Metacafe | <http://www.metacafe.com/> |✓| | |
| Magisto | <http://www.magisto.com/> |✓| | |
| Metacafe | <https://www.metacafe.com/> |✓| | |
| Magisto | <https://www.magisto.com/> |✓| | |
| Khan Academy | <https://www.khanacademy.org/> |✓| | |
| Internet Archive | <https://archive.org/> |✓| | |
| **Instagram** | <https://instagram.com/> |✓|✓| |
| InfoQ | <http://www.infoq.com/presentations/> |✓| | |
| Imgur | <http://imgur.com/> | |✓| |
| Heavy Music Archive | <http://www.heavy-music.ru/> | | |✓|
| Freesound | <http://www.freesound.org/> | | |✓|
| InfoQ | <https://www.infoq.com/presentations/> |✓| | |
| Imgur | <https://imgur.com/> | |✓| |
| Heavy Music Archive | <https://www.heavy-music.ru/> | | |✓|
| Freesound | <https://www.freesound.org/> | | |✓|
| Flickr | <https://www.flickr.com/> |✓|✓| |
| FC2 Video | <http://video.fc2.com/> |✓| | |
| FC2 Video | <https://video.fc2.com/> |✓| | |
| Facebook | <https://www.facebook.com/> |✓| | |
| eHow | <http://www.ehow.com/> |✓| | |
| Dailymotion | <http://www.dailymotion.com/> |✓| | |
| Coub | <http://coub.com/> |✓| | |
| CBS | <http://www.cbs.com/> |✓| | |
| Bandcamp | <http://bandcamp.com/> | | |✓|
| AliveThai | <http://alive.in.th/> |✓| | |
| interest.me | <http://ch.interest.me/tvn> |✓| | |
| **755<br/>ナナゴーゴー** | <http://7gogo.jp/> |✓|✓| |
| **niconico<br/>ニコニコ動画** | <http://www.nicovideo.jp/> |✓| | |
| **163<br/>网易视频<br/>网易云音乐** | <http://v.163.com/><br/><http://music.163.com/> |✓| |✓|
| 56网 | <http://www.56.com/> |✓| | |
| **AcFun** | <http://www.acfun.cn/> |✓| | |
| **Baidu<br/>百度贴吧** | <http://tieba.baidu.com/> |✓|✓| |
| 爆米花网 | <http://www.baomihua.com/> |✓| | |
| **bilibili<br/>哔哩哔哩** | <http://www.bilibili.com/> |✓|✓|✓|
| 豆瓣 | <http://www.douban.com/> |✓| |✓|
| 斗鱼 | <http://www.douyutv.com/> |✓| | |
| 凤凰视频 | <http://v.ifeng.com/> |✓| | |
| 风行网 | <http://www.fun.tv/> |✓| | |
| iQIYI<br/>爱奇艺 | <http://www.iqiyi.com/> |✓| | |
| 激动网 | <http://www.joy.cn/> |✓| | |
| 酷6网 | <http://www.ku6.com/> |✓| | |
| 酷狗音乐 | <http://www.kugou.com/> | | |✓|
| 酷我音乐 | <http://www.kuwo.cn/> | | |✓|
| 乐视网 | <http://www.le.com/> |✓| | |
| 荔枝FM | <http://www.lizhi.fm/> | | |✓|
| 秒拍 | <http://www.miaopai.com/> |✓| | |
| MioMio弹幕网 | <http://www.miomio.tv/> |✓| | |
| MissEvan<br/>猫耳FM | <http://www.missevan.com/> | | |✓|
| eHow | <https://www.ehow.com/> |✓| | |
| Dailymotion | <https://www.dailymotion.com/> |✓| | |
| Coub | <https://coub.com/> |✓| | |
| CBS | <https://www.cbs.com/> |✓| | |
| Bandcamp | <https://bandcamp.com/> | | |✓|
| AliveThai | <https://alive.in.th/> |✓| | |
| interest.me | <https://ch.interest.me/tvn> |✓| | |
| **755<br/>ナナゴーゴー** | <https://7gogo.jp/> |✓|✓| |
| **niconico<br/>ニコニコ動画** | <https://www.nicovideo.jp/> |✓| | |
| **163<br/>网易视频<br/>网易云音乐** | <https://v.163.com/><br/><https://music.163.com/> |✓| |✓|
| 56网 | <https://www.56.com/> |✓| | |
| **AcFun** | <https://www.acfun.cn/> |✓| | |
| **Baidu<br/>百度贴吧** | <https://tieba.baidu.com/> |✓|✓| |
| 爆米花网 | <https://www.baomihua.com/> |✓| | |
| **bilibili<br/>哔哩哔哩** | <https://www.bilibili.com/> |✓|✓|✓|
| 豆瓣 | <https://www.douban.com/> |✓| |✓|
| 斗鱼 | <https://www.douyutv.com/> |✓| | |
| 凤凰视频 | <https://v.ifeng.com/> |✓| | |
| 风行网 | <https://www.fun.tv/> |✓| | |
| iQIYI<br/>爱奇艺 | <https://www.iqiyi.com/> |✓| | |
| 激动网 | <https://www.joy.cn/> |✓| | |
| 酷6网 | <https://www.ku6.com/> |✓| | |
| 酷狗音乐 | <https://www.kugou.com/> | | |✓|
| 酷我音乐 | <https://www.kuwo.cn/> | | |✓|
| 乐视网 | <https://www.le.com/> |✓| | |
| 荔枝FM | <https://www.lizhi.fm/> | | |✓|
| 懒人听书 | <https://www.lrts.me/> | | |✓|
| 秒拍 | <https://www.miaopai.com/> |✓| | |
| MioMio弹幕网 | <https://www.miomio.tv/> |✓| | |
| MissEvan<br/>猫耳FM | <https://www.missevan.com/> | | |✓|
| 痞客邦 | <https://www.pixnet.net/> |✓| | |
| PPTV聚力 | <http://www.pptv.com/> |✓| | |
| 齐鲁网 | <http://v.iqilu.com/> |✓| | |
| QQ<br/>腾讯视频 | <http://v.qq.com/> |✓| | |
| 企鹅直播 | <http://live.qq.com/> |✓| | |
| Sina<br/>新浪视频<br/>微博秒拍视频 | <http://video.sina.com.cn/><br/><http://video.weibo.com/> |✓| | |
| Sohu<br/>搜狐视频 | <http://tv.sohu.com/> |✓| | |
| **Tudou<br/>土豆** | <http://www.tudou.com/> |✓| | |
| 虾米 | <http://www.xiami.com/> |✓| |✓|
| 阳光卫视 | <http://www.isuntv.com/> |✓| | |
| **音悦Tai** | <http://www.yinyuetai.com/> |✓| | |
| **Youku<br/>优酷** | <http://www.youku.com/> |✓| | |
| 战旗TV | <http://www.zhanqi.tv/lives> |✓| | |
| 央视网 | <http://www.cntv.cn/> |✓| | |
| Naver<br/>네이버 | <http://tvcast.naver.com/> |✓| | |
| 芒果TV | <http://www.mgtv.com/> |✓| | |
| 火猫TV | <http://www.huomao.com/> |✓| | |
| 阳光宽频网 | <http://www.365yg.com/> |✓| | |
| PPTV聚力 | <https://www.pptv.com/> |✓| | |
| 齐鲁网 | <https://v.iqilu.com/> |✓| | |
| QQ<br/>腾讯视频 | <https://v.qq.com/> |✓| | |
| 企鹅直播 | <https://live.qq.com/> |✓| | |
| Sina<br/>新浪视频<br/>微博秒拍视频 | <https://video.sina.com.cn/><br/><https://video.weibo.com/> |✓| | |
| Sohu<br/>搜狐视频 | <https://tv.sohu.com/> |✓| | |
| **Tudou<br/>土豆** | <https://www.tudou.com/> |✓| | |
| 阳光卫视 | <https://www.isuntv.com/> |✓| | |
| **Youku<br/>优酷** | <https://www.youku.com/> |✓| | |
| 战旗TV | <https://www.zhanqi.tv/lives> |✓| | |
| 央视网 | <https://www.cntv.cn/> |✓| | |
| Naver<br/>네이버 | <https://tvcast.naver.com/> |✓| | |
| 芒果TV | <https://www.mgtv.com/> |✓| | |
| 火猫TV | <https://www.huomao.com/> |✓| | |
| 阳光宽频网 | <https://www.365yg.com/> |✓| | |
| 西瓜视频 | <https://www.ixigua.com/> |✓| | |
| 新片场 | <https://www.xinpianchang.com//> |✓| | |
| 新片场 | <https://www.xinpianchang.com/> |✓| | |
| 快手 | <https://www.kuaishou.com/> |✓|✓| |
| 抖音 | <https://www.douyin.com/> |✓| | |
| TikTok | <https://www.tiktok.com/> |✓| | |
| 中国体育(TV) | <http://v.zhibo.tv/> </br><http://video.zhibo.tv/> |✓| | |
| 中国体育(TV) | <https://v.zhibo.tv/> </br><https://video.zhibo.tv/> |✓| | |
| 知乎 | <https://www.zhihu.com/> |✓| | |
For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.
@ -453,7 +466,7 @@ Check if it's already a known problem on <https://github.com/soimort/you-get/wik
## Getting Involved
You can reach us on the Gitter channel [#soimort/you-get](https://gitter.im/soimort/you-get) (here's how you [set up your IRC client](http://irc.gitter.im) for Gitter). If you have a quick question regarding `you-get`, ask it there.
You can reach us on the Gitter channel [#soimort/you-get](https://gitter.im/soimort/you-get) (here's how you [set up your IRC client](https://irc.gitter.im) for Gitter). If you have a quick question regarding `you-get`, ask it there.
If you are seeking to report an issue or contribute, please make sure to read [the guidelines](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) first.

README.rst

@ -52,7 +52,7 @@ source <https://github.com/soimort/you-get>`__ and fork it!
.. |PyPI version| image:: https://badge.fury.io/py/you-get.png
:target: http://badge.fury.io/py/you-get
.. |Build Status| image:: https://api.travis-ci.org/soimort/you-get.png
:target: https://travis-ci.org/soimort/you-get
.. |Build Status| image:: https://github.com/soimort/you-get/workflows/develop/badge.svg
:target: https://github.com/soimort/you-get/actions
.. |Gitter| image:: https://badges.gitter.im/Join%20Chat.svg
:target: https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge

SECURITY.md

@ -0,0 +1,5 @@
# Security Policy
## Reporting a Vulnerability
Please report security issues to <mort.yao+you-get@gmail.com>.

requirements.txt

@ -0,0 +1,2 @@
# runtime dependencies
dukpy

setup.py

@ -5,7 +5,20 @@ PACKAGE_NAME = 'you_get'
PROJ_METADATA = '%s.json' % PROJ_NAME
import os, json, imp
import importlib.util
import importlib.machinery
def load_source(modname, filename):
loader = importlib.machinery.SourceFileLoader(modname, filename)
spec = importlib.util.spec_from_file_location(modname, filename, loader=loader)
module = importlib.util.module_from_spec(spec)
# The module is always executed and not cached in sys.modules.
# Uncomment the following line to cache the module.
# sys.modules[module.__name__] = module
loader.exec_module(module)
return module
import os, json
here = os.path.abspath(os.path.dirname(__file__))
proj_info = json.loads(open(os.path.join(here, PROJ_METADATA), encoding='utf-8').read())
try:
@ -13,7 +26,7 @@ try:
except:
README = ""
CHANGELOG = open(os.path.join(here, 'CHANGELOG.rst'), encoding='utf-8').read()
VERSION = imp.load_source('version', os.path.join(here, 'src/%s/version.py' % PACKAGE_NAME)).__version__
VERSION = load_source('version', os.path.join(here, 'src/%s/version.py' % PACKAGE_NAME)).__version__
from setuptools import setup, find_packages
setup(
@ -43,7 +56,8 @@ setup(
entry_points = {'console_scripts': proj_info['console_scripts']},
extras_require={
install_requires = ['dukpy'],
extras_require = {
'socks': ['PySocks'],
}
)

src/you_get/common.py

@ -76,6 +76,7 @@ SITES = {
'letv' : 'le',
'lizhi' : 'lizhi',
'longzhu' : 'longzhu',
'lrts' : 'lrts',
'magisto' : 'magisto',
'metacafe' : 'metacafe',
'mgtv' : 'mgtv',
@ -110,14 +111,12 @@ SITES = {
'wanmen' : 'wanmen',
'weibo' : 'miaopai',
'veoh' : 'veoh',
'vine' : 'vine',
'vk' : 'vk',
'xiami' : 'xiami',
'x' : 'twitter',
'xiaokaxiu' : 'yixia',
'xiaojiadianvideo' : 'fc2video',
'ximalaya' : 'ximalaya',
'xinpianchang' : 'xinpianchang',
'yinyuetai' : 'yinyuetai',
'yizhibo' : 'yizhibo',
'youku' : 'youku',
'youtu' : 'youtube',
@ -137,13 +136,16 @@ cookies = None
output_filename = None
auto_rename = False
insecure = False
m3u8 = False
postfix = False
prefix = None
fake_headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # noqa
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'UTF-8,*;q=0.5',
'Accept-Encoding': 'gzip,deflate,sdch',
'Accept-Language': 'en-US,en;q=0.8',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0', # noqa
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/126.0.2592.113' # Latest Edge
}
if sys.stdout.isatty():
@ -341,10 +343,38 @@ def undeflate(data):
return decompressobj.decompress(data)+decompressobj.flush()
# an http.client implementation of get_content()
# because urllib does not support "Connection: keep-alive"
def getHttps(host, url, headers, debuglevel=0):
import http.client
conn = http.client.HTTPSConnection(host)
conn.set_debuglevel(debuglevel)
conn.request("GET", url, headers=headers)
resp = conn.getresponse()
logging.debug('getHttps: %s' % resp.getheaders())
set_cookie = resp.getheader('set-cookie')
data = resp.read()
try:
data = ungzip(data) # gzip
data = undeflate(data) # deflate
except:
pass
conn.close()
return str(data, encoding='utf-8'), set_cookie # TODO: support raw data
# DEPRECATED in favor of get_content()
def get_response(url, faker=False):
logging.debug('get_response: %s' % url)
ctx = None
if insecure:
# ignore ssl errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# install cookies
if cookies:
opener = request.build_opener(request.HTTPCookieProcessor(cookies))
@ -352,10 +382,10 @@ def get_response(url, faker=False):
if faker:
response = request.urlopen(
request.Request(url, headers=fake_headers), None
request.Request(url, headers=fake_headers), None, context=ctx,
)
else:
response = request.urlopen(url)
response = request.urlopen(url, context=ctx)
data = response.read()
if response.info().get('Content-Encoding') == 'gzip':
@ -434,8 +464,17 @@ def get_content(url, headers={}, decoded=True):
req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
# NOTE: Do not use cookies.add_cookie_header(req)
# #HttpOnly_ cookies were not supported by CookieJar and MozillaCookieJar properly until python 3.10
# See also:
# - https://github.com/python/cpython/pull/17471
# - https://bugs.python.org/issue2190
# Here we add cookies to the request headers manually
cookie_strings = []
for cookie in list(cookies):
cookie_strings.append(cookie.name + '=' + cookie.value)
cookie_headers = {'Cookie': '; '.join(cookie_strings)}
req.headers.update(cookie_headers)
response = urlopen_with_retry(req)
data = response.read()
@ -478,8 +517,17 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
# NOTE: Do not use cookies.add_cookie_header(req)
# #HttpOnly_ cookies were not supported by CookieJar and MozillaCookieJar properly until python 3.10
# See also:
# - https://github.com/python/cpython/pull/17471
# - https://bugs.python.org/issue2190
# Here we add cookies to the request headers manually
cookie_strings = []
for cookie in list(cookies):
cookie_strings.append(cookie.name + '=' + cookie.value)
cookie_headers = {'Cookie': '; '.join(cookie_strings)}
req.headers.update(cookie_headers)
if kwargs.get('post_data_raw'):
post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
else:
@ -667,7 +715,7 @@ def url_save(
bar.done()
if not force and auto_rename:
path, ext = os.path.basename(filepath).rsplit('.', 1)
finder = re.compile(' \([1-9]\d*?\)$')
finder = re.compile(r' \([1-9]\d*?\)$')
if (finder.search(path) is None):
thisfile = path + ' (1).' + ext
else:
@ -966,6 +1014,10 @@ def download_urls(
pass
title = tr(get_filename(title))
if postfix and 'vid' in kwargs:
title = "%s [%s]" % (title, kwargs['vid'])
if prefix is not None:
title = "[%s] %s" % (prefix, title)
output_filename = get_output_filename(urls, title, ext, output_dir, merge)
output_filepath = os.path.join(output_dir, output_filename)
@ -1322,7 +1374,13 @@ def download_main(download, download_playlist, urls, playlist, **kwargs):
if re.match(r'https?://', url) is None:
url = 'http://' + url
if playlist:
if m3u8:
if output_filename:
title = output_filename
else:
title = "m3u8file"
download_url_ffmpeg(url=url, title=title,ext = 'mp4',output_dir = '.')
elif playlist:
download_playlist(url, **kwargs)
else:
download(url, **kwargs)
@ -1422,12 +1480,25 @@ def load_cookies(cookiefile):
def set_socks_proxy(proxy):
try:
import socks
socks_proxy_addrs = proxy.split(':')
socks.set_default_proxy(
socks.SOCKS5,
socks_proxy_addrs[0],
int(socks_proxy_addrs[1])
)
if '@' in proxy:
proxy_info = proxy.split("@")
socks_proxy_addrs = proxy_info[1].split(':')
socks_proxy_auth = proxy_info[0].split(":")
socks.set_default_proxy(
socks.SOCKS5,
socks_proxy_addrs[0],
int(socks_proxy_addrs[1]),
True,
socks_proxy_auth[0],
socks_proxy_auth[1]
)
else:
socks_proxy_addrs = proxy.split(':')
socks.set_default_proxy(
socks.SOCKS5,
socks_proxy_addrs[0],
int(socks_proxy_addrs[1]),
)
socket.socket = socks.socksocket
def getaddrinfo(*args):
@ -1495,6 +1566,14 @@ def script_main(download, download_playlist, **kwargs):
'--no-caption', action='store_true',
help='Do not download captions (subtitles, lyrics, danmaku, ...)'
)
download_grp.add_argument(
'--post', '--postfix', dest='postfix', action='store_true', default=False,
help='Postfix downloaded files with unique identifiers'
)
download_grp.add_argument(
'--pre', '--prefix', dest='prefix', metavar='PREFIX', default=None,
help='Prefix downloaded files with string'
)
download_grp.add_argument(
'-f', '--force', action='store_true', default=False,
help='Force overwriting existing files'
@ -1541,6 +1620,21 @@ def script_main(download, download_playlist, **kwargs):
'-l', '--playlist', action='store_true',
help='Prefer to download a playlist'
)
playlist_grp = parser.add_argument_group('Playlist optional options')
playlist_grp.add_argument(
'--first', metavar='FIRST',
help='the first number'
)
playlist_grp.add_argument(
'--last', metavar='LAST',
help='the last number'
)
playlist_grp.add_argument(
'--size', '--page-size', metavar='PAGE_SIZE',
help='the page size number'
)
download_grp.add_argument(
'-a', '--auto-rename', action='store_true', default=False,
help='Auto rename same name different files'
@ -1565,13 +1659,17 @@ def script_main(download, download_playlist, **kwargs):
'--no-proxy', action='store_true', help='Never use a proxy'
)
proxy_grp.add_argument(
'-s', '--socks-proxy', metavar='HOST:PORT',
'-s', '--socks-proxy', metavar='HOST:PORT or USERNAME:PASSWORD@HOST:PORT',
help='Use an SOCKS5 proxy for downloading'
)
download_grp.add_argument('--stream', help=argparse.SUPPRESS)
download_grp.add_argument('--itag', help=argparse.SUPPRESS)
download_grp.add_argument('-m', '--m3u8', action='store_true', default=False,
help = 'download video using an m3u8 url')
parser.add_argument('URL', nargs='*', help=argparse.SUPPRESS)
args = parser.parse_args()
@ -1597,6 +1695,9 @@ def script_main(download, download_playlist, **kwargs):
global output_filename
global auto_rename
global insecure
global m3u8
global postfix
global prefix
output_filename = args.output_filename
extractor_proxy = args.extractor_proxy
@ -1618,6 +1719,9 @@ def script_main(download, download_playlist, **kwargs):
if args.cookies:
load_cookies(args.cookies)
if args.m3u8:
m3u8 = True
caption = True
stream_id = args.format or args.stream or args.itag
if args.no_caption:
@ -1630,6 +1734,8 @@ def script_main(download, download_playlist, **kwargs):
# ignore ssl
insecure = True
postfix = args.postfix
prefix = args.prefix
if args.no_proxy:
set_http_proxy('')
@ -1658,7 +1764,7 @@ def script_main(download, download_playlist, **kwargs):
socket.setdefaulttimeout(args.timeout)
try:
extra = {}
extra = {'args': args}
if extractor_proxy:
extra['extractor_proxy'] = extractor_proxy
if stream_id:
@ -1716,20 +1822,10 @@ def google_search(url):
url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
page = get_content(url, headers=fake_headers)
videos = re.findall(
r'<a href="(https?://[^"]+)" onmousedown="[^"]+"><h3 class="[^"]*">([^<]+)<', page
r'(https://www\.youtube\.com/watch\?v=[\w-]+)', page
)
vdurs = re.findall(r'<span class="vdur[^"]*">([^<]+)<', page)
durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
print('Google Videos search:')
for v in zip(videos, durs):
print('- video: {} [{}]'.format(
unescape_html(v[0][1]),
v[1] if v[1] else '?'
))
print('# you-get %s' % log.sprint(v[0][0], log.UNDERLINE))
print()
print('Best matched result:')
return(videos[0][0])
return(videos[0])
def url_to_module(url):
@ -1760,9 +1856,12 @@ def url_to_module(url):
)
else:
try:
location = get_location(url) # t.co isn't happy with fake_headers
try:
location = get_location(url) # t.co isn't happy with fake_headers
except:
location = get_location(url, headers=fake_headers)
except:
location = get_location(url, headers=fake_headers)
location = get_location(url, headers=fake_headers, get_method='GET')
if location and location != url and not location.startswith('/'):
return url_to_module(location)
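
The new command-line options added above can be exercised roughly as follows (illustrative invocations only; the URLs, proxy address and credentials are placeholders):

```
$ you-get -m https://example.com/stream/index.m3u8              # -m/--m3u8: treat the URL as an m3u8 stream
$ you-get --prefix 'MyTag' --postfix https://example.com/video  # prefix the filename, postfix it with the video id
$ you-get -s user:pass@127.0.0.1:1080 https://example.com/video # SOCKS5 proxy with authentication
```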

src/you_get/extractor.py

@ -238,7 +238,8 @@ class VideoExtractor():
download_urls(urls, self.title, ext, total_size, headers=headers,
output_dir=kwargs['output_dir'],
merge=kwargs['merge'],
av=stream_id in self.dash_streams)
av=stream_id in self.dash_streams,
vid=self.vid)
if 'caption' not in kwargs or not kwargs['caption']:
print('Skipping captions or danmaku.')

src/you_get/extractors/__init__.py

@ -74,16 +74,13 @@ from .twitter import *
from .ucas import *
from .veoh import *
from .vimeo import *
from .vine import *
from .vk import *
from .w56 import *
from .wanmen import *
from .xiami import *
from .xinpianchang import *
from .yinyuetai import *
from .yixia import *
from .youku import *
from .youtube import *
from .zhanqi import *
from .zhibo import *
from .zhihu import *
from .zhihu import *

src/you_get/extractors/acfun.py

@ -1,175 +1,213 @@
#!/usr/bin/env python
__all__ = ['acfun_download']
from ..common import *
from ..extractor import VideoExtractor
from .le import letvcloud_download_by_vu
from .qq import qq_download_by_vid
from .sina import sina_download_by_vid
from .tudou import tudou_download_by_iid
from .youku import youku_download_by_vid
class AcFun(VideoExtractor):
name = "AcFun"
import json
import re
import base64
import time
stream_types = [
{'id': '2160P', 'qualityType': '2160p'},
{'id': '1080P60', 'qualityType': '1080p60'},
{'id': '720P60', 'qualityType': '720p60'},
{'id': '1080P+', 'qualityType': '1080p+'},
{'id': '1080P', 'qualityType': '1080p'},
{'id': '720P', 'qualityType': '720p'},
{'id': '540P', 'qualityType': '540p'},
{'id': '360P', 'qualityType': '360p'}
]
def get_srt_json(id):
url = 'http://danmu.aixifan.com/V2/%s' % id
return get_content(url)
def prepare(self, **kwargs):
assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', self.url)
def youku_acfun_proxy(vid, sign, ref):
endpoint = 'http://player.acfun.cn/flash_data?vid={}&ct=85&ev=3&sign={}&time={}'
url = endpoint.format(vid, sign, str(int(time.time() * 1000)))
json_data = json.loads(get_content(url, headers=dict(referer=ref)))['data']
enc_text = base64.b64decode(json_data)
dec_text = rc4(b'8bdc7e1a', enc_text).decode('utf8')
youku_json = json.loads(dec_text)
if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', self.url):
html = get_content(self.url, headers=fake_headers)
json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});")
json_data = json.loads(json_text)
vid = json_data.get('currentVideoInfo').get('id')
up = json_data.get('user').get('name')
self.title = json_data.get('title')
video_list = json_data.get('videoList')
if len(video_list) > 1:
self.title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0]
currentVideoInfo = json_data.get('currentVideoInfo')
elif re.match(r"https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", self.url):
html = get_content(self.url, headers=fake_headers)
tag_script = match1(html, r'<script>\s*window\.pageInfo([^<]+)</script>')
json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1]
json_data = json.loads(json_text)
self.title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title']
vid = str(json_data['videoId'])
up = "acfun"
currentVideoInfo = json_data.get('currentVideoInfo')
yk_streams = {}
for stream in youku_json['stream']:
tp = stream['stream_type']
yk_streams[tp] = [], stream['total_size']
if stream.get('segs'):
for seg in stream['segs']:
yk_streams[tp][0].append(seg['url'])
else:
yk_streams[tp] = stream['m3u8'], stream['total_size']
raise NotImplemented
return yk_streams
if 'ksPlayJson' in currentVideoInfo:
durationMillis = currentVideoInfo['durationMillis']
ksPlayJson = json.loads(currentVideoInfo['ksPlayJson'])
representation = ksPlayJson.get('adaptationSet')[0].get('representation')
stream_list = representation
def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False, **kwargs):
"""str, str, str, bool, bool ->None
for stream in stream_list:
m3u8_url = stream["url"]
size = durationMillis * stream["avgBitrate"] / 8
# size = float('inf')
container = 'mp4'
stream_id = stream["qualityLabel"]
quality = stream["qualityType"]
stream_data = dict(src=m3u8_url, size=size, container=container, quality=quality)
self.streams[stream_id] = stream_data
Download Acfun video by vid.
assert self.title and m3u8_url
self.title = unescape_html(self.title)
self.title = escape_file_path(self.title)
p_title = r1('active">([^<]+)', html)
self.title = '%s (%s)' % (self.title, up)
if p_title:
self.title = '%s - %s' % (self.title, p_title)
Call Acfun API, decide which site to use, and pass the job to its
extractor.
"""
#first call the main parasing API
info = json.loads(get_content('http://www.acfun.cn/video/getVideo.aspx?id=' + vid, headers=fake_headers))
def download(self, **kwargs):
if 'json_output' in kwargs and kwargs['json_output']:
json_output.output(self)
elif 'info_only' in kwargs and kwargs['info_only']:
if 'stream_id' in kwargs and kwargs['stream_id']:
# Display the stream
stream_id = kwargs['stream_id']
if 'index' not in kwargs:
self.p(stream_id)
else:
self.p_i(stream_id)
else:
# Display all available streams
if 'index' not in kwargs:
self.p([])
else:
stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag']
self.p_i(stream_id)
sourceType = info['sourceType']
#decide sourceId to know which extractor to use
if 'sourceId' in info: sourceId = info['sourceId']
# danmakuId = info['danmakuId']
#call extractor decided by sourceId
if sourceType == 'sina':
sina_download_by_vid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only)
elif sourceType == 'youku':
youku_download_by_vid(sourceId, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
elif sourceType == 'tudou':
tudou_download_by_iid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only)
elif sourceType == 'qq':
qq_download_by_vid(sourceId, title, True, output_dir=output_dir, merge=merge, info_only=info_only)
elif sourceType == 'letv':
letvcloud_download_by_vu(sourceId, '2d8c027396', title, output_dir=output_dir, merge=merge, info_only=info_only)
elif sourceType == 'zhuzhan':
#As in Jul.28.2016, Acfun is using embsig to anti hotlink so we need to pass this
#In Mar. 2017 there is a dedicated ``acfun_proxy'' in youku cloud player
#old code removed
url = 'http://www.acfun.cn/v/ac' + vid
yk_streams = youku_acfun_proxy(info['sourceId'], info['encode'], url)
seq = ['mp4hd3', 'mp4hd2', 'mp4hd', 'flvhd']
for t in seq:
if yk_streams.get(t):
preferred = yk_streams[t]
break
#total_size in the json could be incorrect(F.I. 0)
size = 0
for url in preferred[0]:
_, _, seg_size = url_info(url)
size += seg_size
#fallback to flvhd is not quite possible
if re.search(r'fid=[0-9A-Z\-]*.flv', preferred[0][0]):
ext = 'flv'
else:
ext = 'mp4'
print_info(site_info, title, ext, size)
if 'stream_id' in kwargs and kwargs['stream_id']:
# Download the stream
stream_id = kwargs['stream_id']
else:
stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag']
if 'index' not in kwargs:
self.p(stream_id)
else:
self.p_i(stream_id)
if stream_id in self.streams:
url = self.streams[stream_id]['src']
ext = self.streams[stream_id]['container']
total_size = self.streams[stream_id]['size']
if ext == 'm3u8' or ext == 'm4a':
ext = 'mp4'
if not url:
log.wtf('[Failed] Cannot extract video source.')
# For legacy main()
headers = {}
if self.ua is not None:
headers['User-Agent'] = self.ua
if self.referer is not None:
headers['Referer'] = self.referer
download_url_ffmpeg(url, self.title, ext, output_dir=kwargs['output_dir'], merge=kwargs['merge'])
if 'caption' not in kwargs or not kwargs['caption']:
print('Skipping captions or danmaku.')
return
for lang in self.caption_tracks:
filename = '%s.%s.srt' % (get_filename(self.title), lang)
print('Saving %s ... ' % filename, end="", flush=True)
srt = self.caption_tracks[lang]
with open(os.path.join(kwargs['output_dir'], filename),
'w', encoding='utf-8') as x:
x.write(srt)
print('Done.')
if self.danmaku is not None and not dry_run:
filename = '{}.cmt.xml'.format(get_filename(self.title))
print('Downloading {} ...\n'.format(filename))
with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp:
fp.write(self.danmaku)
if self.lyrics is not None and not dry_run:
filename = '{}.lrc'.format(get_filename(self.title))
print('Downloading {} ...\n'.format(filename))
with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp:
fp.write(self.lyrics)
# For main_dev()
#download_urls(urls, self.title, self.streams[stream_id]['container'], self.streams[stream_id]['size'])
keep_obj = kwargs.get('keep_obj', False)
if not keep_obj:
self.__init__()
def acfun_download(self, url, output_dir='.', merge=True, info_only=False, **kwargs):
assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url)
def getM3u8UrlFromCurrentVideoInfo(currentVideoInfo):
if 'playInfos' in currentVideoInfo:
return currentVideoInfo['playInfos'][0]['playUrls'][0]
elif 'ksPlayJson' in currentVideoInfo:
ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] )
representation = ksPlayJson.get('adaptationSet')[0].get('representation')
reps = []
for one in representation:
reps.append( (one['width']* one['height'], one['url'], one['backupUrl']) )
return max(reps)[1]
if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url):
html = get_content(url, headers=fake_headers)
json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});")
json_data = json.loads(json_text)
vid = json_data.get('currentVideoInfo').get('id')
up = json_data.get('user').get('name')
title = json_data.get('title')
video_list = json_data.get('videoList')
if len(video_list) > 1:
title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0]
currentVideoInfo = json_data.get('currentVideoInfo')
m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo)
elif re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)', url):
html = get_content(url, headers=fake_headers)
tag_script = match1(html, r'<script>\s*window\.pageInfo([^<]+)</script>')
json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1]
json_data = json.loads(json_text)
title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title']
vid = str(json_data['videoId'])
up = "acfun"
currentVideoInfo = json_data.get('currentVideoInfo')
m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo)
else:
raise NotImplemented
assert title and m3u8_url
title = unescape_html(title)
title = escape_file_path(title)
p_title = r1('active">([^<]+)', html)
title = '%s (%s)' % (title, up)
if p_title:
title = '%s - %s' % (title, p_title)
print_info(site_info, title, 'm3u8', float('inf'))
if not info_only:
download_urls(preferred[0], title, ext, size, output_dir=output_dir, merge=merge)
else:
raise NotImplementedError(sourceType)
if not info_only and not dry_run:
if not kwargs['caption']:
print('Skipping danmaku.')
return
try:
title = get_filename(title)
print('Downloading %s ...\n' % (title + '.cmt.json'))
cmt = get_srt_json(vid)
with open(os.path.join(output_dir, title + '.cmt.json'), 'w', encoding='utf-8') as x:
x.write(cmt)
except:
pass
def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url)
if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url):
html = get_content(url, headers=fake_headers)
json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});")
json_data = json.loads(json_text)
vid = json_data.get('currentVideoInfo').get('id')
up = json_data.get('user').get('name')
title = json_data.get('title')
video_list = json_data.get('videoList')
if len(video_list) > 1:
title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0]
currentVideoInfo = json_data.get('currentVideoInfo')
if 'playInfos' in currentVideoInfo:
m3u8_url = currentVideoInfo['playInfos'][0]['playUrls'][0]
elif 'ksPlayJson' in currentVideoInfo:
ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] )
representation = ksPlayJson.get('adaptationSet').get('representation')
reps = []
for one in representation:
reps.append( (one['width']* one['height'], one['url'], one['backupUrl']) )
m3u8_url = max(reps)[1]
elif re.match("https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", url):
html = get_content(url, headers=fake_headers)
tag_script = match1(html, r'<script>window\.pageInfo([^<]+)</script>')
json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1]
json_data = json.loads(json_text)
title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title']
vid = str(json_data['videoId'])
up = "acfun"
play_info = get_content("https://www.acfun.cn/rest/pc-direct/play/playInfo/m3u8Auto?videoId=" + vid, headers=fake_headers)
play_url = json.loads(play_info)['playInfo']['streams'][0]['playUrls'][0]
m3u8_all_qualities_file = get_content(play_url)
m3u8_all_qualities_lines = m3u8_all_qualities_file.split('#EXT-X-STREAM-INF:')[1:]
highest_quality_line = m3u8_all_qualities_lines[0]
for line in m3u8_all_qualities_lines:
bandwith = int(match1(line, r'BANDWIDTH=(\d+)'))
if bandwith > int(match1(highest_quality_line, r'BANDWIDTH=(\d+)')):
highest_quality_line = line
#TODO: 应由用户指定清晰度
m3u8_url = match1(highest_quality_line, r'\n([^#\n]+)$')
m3u8_url = play_url[:play_url.rfind("/")+1] + m3u8_url
else:
raise NotImplemented
assert title and m3u8_url
title = unescape_html(title)
title = escape_file_path(title)
p_title = r1('active">([^<]+)', html)
title = '%s (%s)' % (title, up)
if p_title:
title = '%s - %s' % (title, p_title)
print_info(site_info, title, 'm3u8', float('inf'))
if not info_only:
download_url_ffmpeg(m3u8_url, title, 'mp4', output_dir=output_dir, merge=merge)
download_url_ffmpeg(m3u8_url, title, 'mp4', output_dir=output_dir, merge=merge)
site = AcFun()
site_info = "AcFun.cn"
download = acfun_download
download = site.download_by_url
download_playlist = playlist_not_supported('acfun')
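
Since the extractor now goes through `VideoExtractor` with named stream types, available qualities can be inspected before downloading (illustrative only; `<id>` stands for a real AcFun video id, and stream ids follow the `stream_types` table above):

```
$ you-get -i https://www.acfun.cn/v/ac<id>             # list available streams
$ you-get --format=720P https://www.acfun.cn/v/ac<id>  # pick a specific quality
```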

src/you_get/extractors/baidu.py

@ -116,7 +116,7 @@ def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=
id = r1(r'https?://music.baidu.com/album/(\d+)', url)
baidu_download_album(id, output_dir, merge, info_only)
elif re.match('https?://music.baidu.com/song/\d+', url):
elif re.match(r'https?://music.baidu.com/song/\d+', url):
id = r1(r'https?://music.baidu.com/song/(\d+)', url)
baidu_download_song(id, output_dir, merge, info_only)

src/you_get/extractors/bilibili.py

@ -1,16 +1,23 @@
#!/usr/bin/env python
import math
from ..common import *
from ..extractor import VideoExtractor
import hashlib
import math
class Bilibili(VideoExtractor):
name = "Bilibili"
# Bilibili media encoding options, in descending quality order.
stream_types = [
{'id': 'hdflv2_8k', 'quality': 127, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'},
{'id': 'hdflv2_dolby', 'quality': 126, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '3840p', 'desc': '杜比视界'},
{'id': 'hdflv2_hdr', 'quality': 125, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '2160p', 'desc': '真彩 HDR'},
{'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280,
'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'},
{'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280,
@ -35,6 +42,8 @@ class Bilibili(VideoExtractor):
{'id': 'jpg', 'quality': 0},
]
codecids = {7: 'AVC', 12: 'HEVC', 13: 'AV1'}
@staticmethod
def height_to_quality(height, qn):
if height <= 360 and qn <= 16:
@ -63,7 +72,7 @@ class Bilibili(VideoExtractor):
@staticmethod
def bilibili_api(avid, cid, qn=0):
return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=16' % (avid, cid, qn)
return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=4048&fourk=1' % (avid, cid, qn)
@staticmethod
def bilibili_audio_api(sid):
@ -91,7 +100,8 @@ class Bilibili(VideoExtractor):
appkey, sec = ''.join([chr(ord(i) + 2) for i in entropy[::-1]]).split(':')
params = 'appkey=%s&cid=%s&otype=json&qn=%s&quality=%s&type=' % (appkey, cid, qn, qn)
chksum = hashlib.md5(bytes(params + sec, 'utf8')).hexdigest()
return 'https://interface.bilibili.com/v2/playurl?%s&sign=%s' % (params, chksum)
return 'https://api.bilibili.com/x/player/wbi/v2?%s&sign=%s' % (params, chksum)
@staticmethod
def bilibili_live_api(cid):
@ -109,13 +119,21 @@ class Bilibili(VideoExtractor):
def bilibili_space_channel_api(mid, cid, pn=1, ps=100):
return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps)
@staticmethod
def bilibili_space_collection_api(mid, cid, pn=1, ps=30):
return 'https://api.bilibili.com/x/polymer/space/seasons_archives_list?mid=%s&season_id=%s&sort_reverse=false&page_num=%s&page_size=%s' % (mid, cid, pn, ps)
@staticmethod
def bilibili_series_archives_api(mid, sid, pn=1, ps=100):
return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps)
@staticmethod
def bilibili_space_favlist_api(fid, pn=1, ps=20):
return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps)
@staticmethod
def bilibili_space_video_api(mid, pn=1, ps=100):
return 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid=%s&page=%s&pagesize=%s&order=0&jsonp=jsonp' % (mid, pn, ps)
def bilibili_space_video_api(mid, pn=1, ps=50):
return "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%s&ps=%s&tid=0&keyword=&order=pubdate&jsonp=jsonp" % (mid, pn, ps)
@staticmethod
def bilibili_vc_api(video_id):
@ -132,10 +150,10 @@ class Bilibili(VideoExtractor):
except:
return err_value
# https://api.bilibili.com/x/player.so?id=cid%3A162260003&aid=95051759&bvid=BV1zE411T7nb&buvid=FB2BB46F-B1F3-4BDA-A589-33348940411A155830infoc
def prepare(self, **kwargs):
self.stream_qualities = {s['quality']: s for s in self.stream_types}
self.streams.clear()
self.dash_streams.clear()
try:
html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url))
@ -154,13 +172,23 @@ class Bilibili(VideoExtractor):
# redirect: bangumi/play/ss -> bangumi/play/ep
# redirect: bangumi.bilibili.com/anime -> bangumi/play/ep
elif re.match(r'https?://(www\.)?bilibili\.com/bangumi/play/ss(\d+)', self.url) or \
re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)/play', self.url):
re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)/play', self.url):
initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME
initial_state = json.loads(initial_state_text)
ep_id = initial_state['epList'][0]['id']
self.url = 'https://www.bilibili.com/bangumi/play/ep%s' % ep_id
html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url))
# redirect: s
elif re.match(r'https?://(www\.)?bilibili\.com/s/(.+)', self.url):
self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)')
html_content = get_content(self.url, headers=self.bilibili_headers())
# redirect: festival
elif re.match(r'https?://(www\.)?bilibili\.com/festival/(.+)', self.url):
self.url = 'https://www.bilibili.com/video/%s' % match1(self.url, r'bvid=([^&]+)')
html_content = get_content(self.url, headers=self.bilibili_headers())
# sort it out
if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url):
sort = 'audio'
@ -172,7 +200,7 @@ class Bilibili(VideoExtractor):
sort = 'live'
elif re.match(r'https?://vc\.bilibili\.com/video/(\d+)', self.url):
sort = 'vc'
elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(BV(\S+)))', self.url):
elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(bv(\S+))|(BV(\S+)))', self.url):
sort = 'video'
elif re.match(r'https?://h\.?bilibili\.com/(\d+)', self.url):
sort = 'h'
@ -180,35 +208,54 @@ class Bilibili(VideoExtractor):
self.download_playlist_by_url(self.url, **kwargs)
return
# regular av video
# regular video
if sort == 'video':
initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME
initial_state = json.loads(initial_state_text)
playinfo_text = match1(html_content, r'__playinfo__=(.*?)</script><script>') # FIXME
playinfo = json.loads(playinfo_text) if playinfo_text else None
playinfo = playinfo if playinfo and playinfo.get('code') == 0 else None
html_content_ = get_content(self.url, headers=self.bilibili_headers(cookie='CURRENT_FNVAL=16'))
playinfo_text_ = match1(html_content_, r'__playinfo__=(.*?)</script><script>') # FIXME
playinfo_ = json.loads(playinfo_text_) if playinfo_text_ else None
playinfo_ = playinfo_ if playinfo_ and playinfo_.get('code') == 0 else None
# warn if it is a multi-part video
pn = initial_state['videoData']['videos']
if pn > 1 and not kwargs.get('playlist'):
log.w('This is a multipart video. (use --playlist to download all parts.)')
if 'videoData' in initial_state:
# (standard video)
# set video title
self.title = initial_state['videoData']['title']
# refine title for a specific part, if it is a multi-part video
p = int(match1(self.url, r'[\?&]p=(\d+)') or match1(self.url, r'/index_(\d+)') or
'1') # use URL to decide p-number, not initial_state['p']
if pn > 1:
part = initial_state['videoData']['pages'][p - 1]['part']
self.title = '%s (P%s. %s)' % (self.title, p, part)
# warn if cookies are not loaded
if cookies is None:
log.w('You will need login cookies for 720p formats or above. (use --cookies to load cookies.txt.)')
# warn if it is a multi-part video
pn = initial_state['videoData']['videos']
if pn > 1 and not kwargs.get('playlist'):
log.w('This is a multipart video. (use --playlist to download all parts.)')
# set video title
self.title = initial_state['videoData']['title']
# refine title for a specific part, if it is a multi-part video
p = int(match1(self.url, r'[\?&]p=(\d+)') or match1(self.url, r'/index_(\d+)') or
'1') # use URL to decide p-number, not initial_state['p']
if pn > 1:
part = initial_state['videoData']['pages'][p - 1]['part']
self.title = '%s (P%s. %s)' % (self.title, p, part)
# construct playinfos
avid = initial_state['aid']
cid = initial_state['videoData']['pages'][p - 1]['cid'] # use p-number, not initial_state['videoData']['cid']
else:
# (festival video)
# set video title
self.title = initial_state['videoInfo']['title']
# construct playinfos
avid = initial_state['videoInfo']['aid']
cid = initial_state['videoInfo']['cid']
# construct playinfos
avid = initial_state['aid']
cid = initial_state['videoData']['pages'][p - 1]['cid'] # use p-number, not initial_state['videoData']['cid']
current_quality, best_quality = None, None
if playinfo is not None:
current_quality = playinfo['data']['quality'] or None # 0 indicates an error, fallback to None
@ -262,11 +309,10 @@ class Bilibili(VideoExtractor):
if 'dash' in playinfo['data']:
audio_size_cache = {}
for video in playinfo['data']['dash']['video']:
# prefer the latter codecs!
s = self.stream_qualities[video['id']]
format_id = 'dash-' + s['id'] # prefix
format_id = f"dash-{s['id']}-{self.codecids[video['codecid']]}" # prefix
container = 'mp4' # enforce MP4 container
desc = s['desc']
desc = s['desc'] + ' ' + video['codecs']
audio_quality = s['audio_quality']
baseurl = video['baseUrl']
size = self.url_size(baseurl, headers=self.bilibili_headers(referer=self.url))
@ -289,7 +335,7 @@ class Bilibili(VideoExtractor):
'src': [[baseurl]], 'size': size}
# get danmaku
self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid)
self.danmaku = get_content('https://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url))
# bangumi
elif sort == 'bangumi':
@ -368,7 +414,7 @@ class Bilibili(VideoExtractor):
'src': [[baseurl], [audio_baseurl]], 'size': size}
# get danmaku
self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid)
self.danmaku = get_content('https://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url))
# vc video
elif sort == 'vc':
@ -550,7 +596,7 @@ class Bilibili(VideoExtractor):
'src': [[baseurl]], 'size': size}
# get danmaku
self.danmaku = get_content('http://comment.bilibili.com/%s.xml' % cid)
self.danmaku = get_content('https://comment.bilibili.com/%s.xml' % cid, headers=self.bilibili_headers(referer=self.url))
def extract(self, **kwargs):
# set UA and referer for downloading
@ -572,21 +618,6 @@ class Bilibili(VideoExtractor):
# extract stream with the best quality
stream_id = self.streams_sorted[0]['id']
def formattime(t):
if t/10 == 0:
return '0'+str(t)
else:
return str(t)
def ms2time(t):
m = t/60000
t = t%60000
s = t/1000
t = t%1000
minsec = formattime(m)+':'+formattime(s)+'.'+str(t)
return minsec
def download_playlist_by_url(self, url, **kwargs):
self.url = url
kwargs['playlist'] = True
@ -599,12 +630,16 @@ class Bilibili(VideoExtractor):
elif match1(html_content, r'<meta property="og:url" content="(https://www.bilibili.com/bangumi/play/[^"]+)"'):
sort = 'bangumi'
elif re.match(r'https?://(www\.)?bilibili\.com/bangumi/media/md(\d+)', self.url) or \
re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)', self.url):
re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)', self.url):
sort = 'bangumi_md'
elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|BV(\S+))', self.url):
elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|bv(\S+)|BV(\S+))', self.url):
sort = 'video'
elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/detail\?.*cid=(\d+)', self.url):
sort = 'space_channel'
elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url):
sort = 'space_channel_series'
elif re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/collectiondetail\?.*sid=(\d+)', self.url):
sort = 'space_channel_collection'
elif re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url):
sort = 'space_favlist'
elif re.match(r'https?://space\.?bilibili\.com/(\d+)/video', self.url):
@ -615,18 +650,26 @@ class Bilibili(VideoExtractor):
log.e('[Error] Unsupported URL pattern.')
exit(1)
# regular av video
# regular video
if sort == 'video':
initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME
initial_state = json.loads(initial_state_text)
aid = initial_state['videoData']['aid']
pn = initial_state['videoData']['videos']
if pn!= len(initial_state['videoData']['pages']):#interaction video 互动视频
if pn == len(initial_state['videoData']['pages']):
# non-interactive video
for pi in range(1, pn + 1):
purl = 'https://www.bilibili.com/video/av%s?p=%s' % (aid, pi)
self.__class__().download_by_url(purl, **kwargs)
else:
# interactive video
search_node_list = []
download_cid_set = set([initial_state['videoData']['cid']])
params = {
'id': 'cid:{}'.format(initial_state['videoData']['cid']),
'aid': str(aid)
'id': 'cid:{}'.format(initial_state['videoData']['cid']),
'aid': str(aid)
}
urlcontent = get_content('https://api.bilibili.com/x/player.so?'+parse.urlencode(params), headers=self.bilibili_headers(referer='https://www.bilibili.com/video/av{}'.format(aid)))
graph_version = json.loads(urlcontent[urlcontent.find('<interaction>')+13:urlcontent.find('</interaction>')])['graph_version']
@ -672,63 +715,6 @@ class Bilibili(VideoExtractor):
self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams]
self.extract(**kwargs)
self.download(**kwargs)
else:
playinfo_text = match1(html_content, r'__playinfo__=(.*?)</script><script>') # FIXME
playinfo = json.loads(playinfo_text) if playinfo_text else None
html_content_ = get_content(self.url, headers=self.bilibili_headers(cookie='CURRENT_FNVAL=16'))
playinfo_text_ = match1(html_content_, r'__playinfo__=(.*?)</script><script>') # FIXME
playinfo_ = json.loads(playinfo_text_) if playinfo_text_ else None
p = int(match1(self.url, r'[\?&]p=(\d+)') or match1(self.url, r'/index_(\d+)') or '1')-1
for pi in range(p,pn):
self.prepare_by_cid(aid,initial_state['videoData']['pages'][pi]['cid'],'%s (P%s. %s)' % (initial_state['videoData']['title'], pi+1, initial_state['videoData']['pages'][pi]['part']),html_content,playinfo,playinfo_,url)
tttt = self.title
try:
self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams]
except:
self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams]
self.extract(**kwargs)
self.download(**kwargs)
lrcurl = "https://api.bilibili.com/x/player.so?id=cid%3A" + str(initial_state['videoData']['pages'][pi]['cid']) + "&aid=" + str(aid) + "&bvid=" +initial_state['videoData']["bvid"]+"&buvid=FB2BB46F-B1F3-4BDA-A589-33348940411A155830infoc"
print("lrc url", lrcurl)
# -H 'Referer: https://www.bilibili.com/video/BV1zE411T7nb'
h = dict()
jsonOfLrc = get_content(lrcurl, headers={"Referer": "https://www.bilibili.com/video/" + initial_state['videoData']["bvid"]})
# Example line:
# <subtitle>{"allow_submit":false,"lan":"","lan_doc":"","subtitles":[{"id":23916631605379079,"lan":"zh-CN","lan_doc":"中文(中国)","is_lock":false,"subtitle_url":"//i0.hdslb.com/bfs/subtitle/dfb81041cf92b5c2ebce2540cd14c9e49674f460.json"}]}</subtitle>
subtitleMeta = match1(jsonOfLrc, r'<subtitle>(.*?)</subtitle>')
subtitlejson = json.loads(subtitleMeta)
print(subtitlejson)
if len(subtitlejson["subtitles"]) > 0:
suburl = subtitlejson["subtitles"][0]["subtitle_url"]
subjson = get_content("https:" + suburl)
file = ''
datas = json.loads(subjson)
i = 1
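# Convert each JSON subtitle entry into an SRT cue: index line, zero-padded time range (centisecond precision), then the subtitle text.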
for data in datas['body']:
start = data['from'] # cue start time (seconds)
stop = data['to'] # cue end time (seconds)
content = data['content'] # subtitle text
file += '{}\n'.format(i) # cue index
hour = math.floor(start) // 3600
minute = (math.floor(start) - hour * 3600) // 60
sec = math.floor(start) - hour * 3600 - minute * 60
minisec = int(math.modf(start)[0] * 100) # fractional part of the start time, in centiseconds
file += str(hour).zfill(2) + ':' + str(minute).zfill(2) + ':' + str(sec).zfill(2) + ',' + str(minisec).zfill(2) # zero-pad and write in SRT time format
file += ' --> '
hour = math.floor(stop) // 3600
minute = (math.floor(stop) - hour * 3600) // 60
sec = math.floor(stop) - hour * 3600 - minute * 60
minisec = abs(int(math.modf(stop)[0] * 100 - 1)) # subtract 1 so two consecutive cues never overlap
file += str(hour).zfill(2) + ':' + str(minute).zfill(2) + ':' + str(sec).zfill(2) + ',' + str(minisec).zfill(2)
file += '\n' + content + '\n\n' # append the subtitle text
i += 1
srtfilename = '%s.srt' % get_filename(tttt)
with open(os.path.join(".", srtfilename), 'w', encoding='utf-8') as f:
f.write(file) # write the SRT data to the file
# purl = 'https://www.bilibili.com/video/av%s?p=%s' % (aid, pi+1)
# self.__class__().download_by_url(purl, **kwargs)
elif sort == 'bangumi':
initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)') # FIXME
@ -764,6 +750,48 @@ class Bilibili(VideoExtractor):
url = 'https://www.bilibili.com/video/av%s' % video['aid']
self.__class__().download_playlist_by_url(url, **kwargs)
elif sort == 'space_channel_series':
m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/seriesdetail\?.*sid=(\d+)', self.url)
mid, sid = m.group(1), m.group(2)
pn = 1
video_list = []
while True:
api_url = self.bilibili_series_archives_api(mid, sid, pn)
api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url))
archives_info = json.loads(api_content)
video_list.extend(archives_info['data']['archives'])
if len(video_list) < archives_info['data']['page']['total'] and len(archives_info['data']['archives']) > 0:
pn += 1
else:
break
epn, i = len(video_list), 0
for video in video_list:
i += 1; log.w('Extracting %s of %s videos ...' % (i, epn))
url = 'https://www.bilibili.com/video/av%s' % video['aid']
self.__class__().download_playlist_by_url(url, **kwargs)
elif sort == 'space_channel_collection':
m = re.match(r'https?://space\.?bilibili\.com/(\d+)/channel/collectiondetail\?.*sid=(\d+)', self.url)
mid, sid = m.group(1), m.group(2)
pn = 1
video_list = []
while True:
api_url = self.bilibili_space_collection_api(mid, sid, pn)
api_content = get_content(api_url, headers=self.bilibili_headers(referer=self.url))
archives_info = json.loads(api_content)
video_list.extend(archives_info['data']['archives'])
if len(video_list) < archives_info['data']['page']['total'] and len(archives_info['data']['archives']) > 0:
pn += 1
else:
break
epn, i = len(video_list), 0
for video in video_list:
i += 1; log.w('Extracting %s of %s videos ...' % (i, epn))
url = 'https://www.bilibili.com/video/av%s' % video['aid']
self.__class__().download_playlist_by_url(url, **kwargs)
elif sort == 'space_favlist':
m = re.match(r'https?://space\.?bilibili\.com/(\d+)/favlist\?.*fid=(\d+)', self.url)
vmid, fid = m.group(1), m.group(2)
@ -791,15 +819,16 @@ class Bilibili(VideoExtractor):
api_url = self.bilibili_space_video_api(mid)
api_content = get_content(api_url, headers=self.bilibili_headers())
videos_info = json.loads(api_content)
pc = videos_info['data']['pages']
# pc = videos_info['data']['page']['count'] // videos_info['data']['page']['ps']
pc = math.ceil(videos_info['data']['page']['count'] / videos_info['data']['page']['ps'])
for pn in range(1, pc + 1):
api_url = self.bilibili_space_video_api(mid, pn=pn)
api_content = get_content(api_url, headers=self.bilibili_headers())
videos_info = json.loads(api_content)
epn, i = len(videos_info['data']['vlist']), 0
for video in videos_info['data']['vlist']:
epn, i = len(videos_info['data']['list']['vlist']), 0
for video in videos_info['data']['list']['vlist']:
i += 1; log.w('Extracting %s of %s videos ...' % (i, epn))
url = 'https://www.bilibili.com/video/av%s' % video['aid']
self.__class__().download_playlist_by_url(url, **kwargs)

View File

@ -58,7 +58,7 @@ def fix_coub_video_file(file_path):
def get_title_and_urls(json_data):
title = legitimize(re.sub('[\s*]', "_", json_data['title']))
title = legitimize(re.sub(r'[\s*]', "_", json_data['title']))
video_info = json_data['file_versions']['html5']['video']
if 'high' not in video_info:
if 'med' not in video_info:

View File

@ -10,7 +10,7 @@ def douban_download(url, output_dir = '.', merge = True, info_only = False, **kw
if re.match(r'https?://movie', url):
title = match1(html, 'name="description" content="([^"]+)')
tid = match1(url, 'trailer/(\d+)')
tid = match1(url, r'trailer/(\d+)')
real_url = 'https://movie.douban.com/trailer/video_url?tid=%s' % tid
type, ext, size = url_info(real_url)

View File

@ -1,6 +1,5 @@
# coding=utf-8
import re
import json
from ..common import (
@ -10,24 +9,51 @@ from ..common import (
fake_headers,
download_urls,
playlist_not_supported,
match1,
get_location,
)
__all__ = ['douyin_download_by_url']
def get_value(source: dict, path):
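# Safely walk a nested dict/list along `path` (a sequence of str keys / int indices); returns None if any step is missing.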
try:
value = source
for key in path:
if type(key) is str:
if key in value.keys():
value = value[key]
else:
value = None
break
elif type(key) is int:
if len(value) != 0:
value = value[key]
else:
value = None
break
except:
value = None
return value
def douyin_download_by_url(url, **kwargs):
# if short link, get the real url
if 'v.douyin.com' in url:
url = get_location(url)
aweme_id = match1(url, r'/(\d+)/?')
# get video info
video_info_api = 'https://www.douyin.com/web/api/v2/aweme/iteminfo/?item_ids={}'
url = video_info_api.format(aweme_id)
page_content = get_content(url, headers=fake_headers)
match_rule = re.compile(r'var data = \[(.*?)\];')
video_info = json.loads(match_rule.findall(page_content)[0])
video_url = video_info['video']['play_addr']['url_list'][0]
# fix: https://www.douyin.com/share/video/6553248251821165832
# if there is no title, use desc
cha_list = video_info['cha_list']
if cha_list:
title = cha_list[0]['cha_name']
else:
title = video_info['desc']
video_info = json.loads(page_content)
# get video id and title
video_id = get_value(video_info, ['item_list', 0, 'video', 'vid'])
title = get_value(video_info, ['item_list', 0, 'desc'])
# get video play url
video_url = "https://aweme.snssdk.com/aweme/v1/play/?ratio=720p&line=0&video_id={}".format(video_id)
video_format = 'mp4'
size = url_size(video_url, faker=True)
print_info(

View File

@ -13,7 +13,6 @@ from .qq import qq_download_by_vid
from .sina import sina_download_by_vid
from .tudou import tudou_download_by_id
from .vimeo import vimeo_download_by_id
from .yinyuetai import yinyuetai_download_by_id
from .youku import youku_download_by_vid
from . import iqiyi
from . import bokecc
@ -21,18 +20,18 @@ from . import bokecc
"""
refer to http://open.youku.com/tools
"""
youku_embed_patterns = [ 'youku\.com/v_show/id_([a-zA-Z0-9=]+)',
'player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf',
'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)',
'player\.youku\.com/embed/([a-zA-Z0-9=]+)',
'YKU.Player\(\'[a-zA-Z0-9]+\',{ client_id: \'[a-zA-Z0-9]+\', vid: \'([a-zA-Z0-9]+)\''
youku_embed_patterns = [ r'youku\.com/v_show/id_([a-zA-Z0-9=]+)',
r'player\.youku\.com/player\.php/sid/([a-zA-Z0-9=]+)/v\.swf',
r'loader\.swf\?VideoIDS=([a-zA-Z0-9=]+)',
r'player\.youku\.com/embed/([a-zA-Z0-9=]+)',
r'YKU.Player\(\'[a-zA-Z0-9]+\',{ client_id: \'[a-zA-Z0-9]+\', vid: \'([a-zA-Z0-9]+)\''
]
"""
http://www.tudou.com/programs/view/html5embed.action?type=0&amp;code=3LS_URGvl54&amp;lcode=&amp;resourceId=0_06_05_99
"""
tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_-]+)\&',
'www\.tudou\.com/v/([a-zA-Z0-9_-]+)/[^"]*v\.swf'
tudou_embed_patterns = [ r'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_-]+)\&',
r'www\.tudou\.com/v/([a-zA-Z0-9_-]+)/[^"]*v\.swf'
]
"""
@ -40,20 +39,18 @@ refer to http://open.tudou.com/wiki/video/info
"""
tudou_api_patterns = [ ]
yinyuetai_embed_patterns = [ 'player\.yinyuetai\.com/video/swf/(\d+)' ]
iqiyi_embed_patterns = [ r'player\.video\.qiyi\.com/([^/]+)/[^/]+/[^/]+/[^/]+\.swf[^"]+tvId=(\d+)' ]
iqiyi_embed_patterns = [ 'player\.video\.qiyi\.com/([^/]+)/[^/]+/[^/]+/[^/]+\.swf[^"]+tvId=(\d+)' ]
netease_embed_patterns = [ r'(http://\w+\.163\.com/movie/[^\'"]+)' ]
netease_embed_patterns = [ '(http://\w+\.163\.com/movie/[^\'"]+)' ]
vimeo_embed_patters = [ r'player\.vimeo\.com/video/(\d+)' ]
vimeo_embed_patters = [ 'player\.vimeo\.com/video/(\d+)' ]
dailymotion_embed_patterns = [ 'www\.dailymotion\.com/embed/video/(\w+)' ]
dailymotion_embed_patterns = [ r'www\.dailymotion\.com/embed/video/(\w+)' ]
"""
check the share button on http://www.bilibili.com/video/av5079467/
"""
bilibili_embed_patterns = [ 'static\.hdslb\.com/miniloader\.swf.*aid=(\d+)' ]
bilibili_embed_patterns = [ r'static\.hdslb\.com/miniloader\.swf.*aid=(\d+)' ]
'''
@ -82,11 +79,6 @@ def embed_download(url, output_dir = '.', merge = True, info_only = False, **kwa
found = True
tudou_download_by_id(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
vids = matchall(content, yinyuetai_embed_patterns)
for vid in vids:
found = True
yinyuetai_download_by_id(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
vids = matchall(content, iqiyi_embed_patterns)
for vid in vids:
found = True

View File

@ -73,7 +73,7 @@ def get_api_key(page):
match = match1(page, pattern_inline_api_key)
# this happens only when the url points to a gallery page
# that contains no inline api_key(and never makes xhr api calls)
# in fact this might be a better approch for getting a temporary api key
# in fact this might be a better approach for getting a temporary api key
# since there's no place for a user to add custom information that may
# misguide the regex in the homepage
if not match:

View File

@ -84,7 +84,7 @@ class Funshion(VideoExtractor):
moz_ec_name = search_dict(sym_to_name, 'mozEcName')
push = search_dict(sym_to_name, 'push')
patt = '{}\.{}\("(.+?)"\)'.format(moz_ec_name, push)
patt = r'{}\.{}\("(.+?)"\)'.format(moz_ec_name, push)
ec_list = re.findall(patt, code)
[magic_list.append(sym_to_name[ec]) for ec in ec_list]
return magic_list

View File

@ -13,9 +13,11 @@ class Imgur(VideoExtractor):
]
def prepare(self, **kwargs):
self.ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/123.0.2420.97'
if re.search(r'imgur\.com/a/', self.url):
# album
content = get_content(self.url)
content = get_content(self.url, headers=fake_headers)
album = match1(content, r'album\s*:\s*({.*}),') or \
match1(content, r'image\s*:\s*({.*}),')
album = json.loads(album)
@ -39,7 +41,7 @@ class Imgur(VideoExtractor):
elif re.search(r'i\.imgur\.com/', self.url):
# direct image
_, container, size = url_info(self.url)
_, container, size = url_info(self.url, faker=True)
self.streams = {
'original': {
'src': [self.url],
@ -51,21 +53,18 @@ class Imgur(VideoExtractor):
else:
# gallery image
content = get_content(self.url)
image = json.loads(match1(content, r'image\s*:\s*({.*}),'))
ext = image['ext']
content = get_content(self.url, headers=fake_headers)
url = match1(content, r'meta property="og:video"[^>]+(https?://i.imgur.com/[^"?]+)') or \
match1(content, r'meta property="og:image"[^>]+(https?://i.imgur.com/[^"?]+)')
_, container, size = url_info(url, headers={'User-Agent': fake_headers['User-Agent']})
self.streams = {
'original': {
'src': ['http://i.imgur.com/%s%s' % (image['hash'], ext)],
'size': image['size'],
'container': ext[1:]
},
'thumbnail': {
'src': ['http://i.imgur.com/%ss%s' % (image['hash'], '.jpg')],
'container': 'jpg'
'src': [url],
'size': size,
'container': container
}
}
self.title = image['title'] or image['hash']
self.title = r1(r'i\.imgur\.com/([^./]*)', url)
def extract(self, **kwargs):
if 'stream_id' in kwargs and kwargs['stream_id']:

View File

@ -5,45 +5,37 @@ __all__ = ['instagram_download']
from ..common import *
def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.2592.87',
'sec-fetch-mode': 'navigate' # important
}
url = r1(r'([^?]*)', url)
html = get_html(url)
cont = get_content(url, headers=headers)
vid = r1(r'instagram.com/p/([^/]+)', url)
description = r1(r'<meta property="og:title" content="([^"]*)"', html)
vid = r1(r'instagram.com/\w+/([^/]+)', url)
description = r1(r'<meta property="og:title" content="([^"]*)"', cont) or \
r1(r'<title>([^<]*)</title>', cont) # with logged-in cookies
title = "{} [{}]".format(description.replace("\n", " "), vid)
stream = r1(r'<meta property="og:video" content="([^"]*)"', html)
if stream:
_, ext, size = url_info(stream)
print_info(site_info, title, ext, size)
if not info_only:
download_urls([stream], title, ext, size, output_dir, merge=merge)
else:
data = re.search(r'window\._sharedData\s*=\s*(.*);</script>', html)
info = json.loads(data.group(1))
appId = r1(r'"appId":"(\d+)"', cont)
media_id = r1(r'"media_id":"(\d+)"', cont)
logging.debug('appId: %s' % appId)
logging.debug('media_id: %s' % media_id)
if 'edge_sidecar_to_children' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
edges = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['edge_sidecar_to_children']['edges']
for edge in edges:
title = edge['node']['shortcode']
image_url = edge['node']['display_url']
if 'video_url' in edge['node']:
image_url = edge['node']['video_url']
ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])
api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id
try:
api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}})
post = json.loads(api_cont)
except:
log.wtf('[Error] Please specify a cookie file.')
print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls=[image_url],
title=title,
ext=ext,
total_size=size,
output_dir=output_dir)
else:
title = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['shortcode']
image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
image_url =info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
for item in post['items']:
code = item['code']
carousel_media = item.get('carousel_media') or [item]
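# a single-media post is treated as a one-item carousel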
for i, media in enumerate(carousel_media):
title = '%s [%s]' % (code, i)
image_url = media['image_versions2']['candidates'][0]['url']
ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])
@ -55,6 +47,20 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
total_size=size,
output_dir=output_dir)
# download videos (if any)
if 'video_versions' in media:
video_url = media['video_versions'][0]['url']
ext = video_url.split('?')[0].split('.')[-1]
size = int(get_head(video_url)['Content-Length'])
print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls=[video_url],
title=title,
ext=ext,
total_size=size,
output_dir=output_dir)
site_info = "Instagram.com"
download = instagram_download
download_playlist = playlist_not_supported('instagram')

View File

@ -20,7 +20,7 @@ Changelog:
use @fffonion 's method in #617.
Add trace AVM (asasm) code in Iqiyi's encode function where the salt is put into the encode array, reassemble it with RABCDasm (or WinRABCDasm), then use Fiddler's AutoResponder to serve the modified file in place of the original, set the browser to use Fiddler as a proxy, and play with the debug version of Flash Player to finally get the result in flashlog.txt (its location can easily be found with a search engine).
Code like the following (ignore the text after "#comment"); it just does the job: trace("{IQIYI_SALT}:"+salt_array.join(""))
```(Postion After getTimer)
```(Position After getTimer)
findpropstrict QName(PackageNamespace(""), "trace")
pushstring "{IQIYI_SALT}:" #comment for you to locate the salt
getscopeobject 1
@ -119,10 +119,10 @@ class Iqiyi(VideoExtractor):
self.url = url
video_page = get_content(url)
videos = set(re.findall(r'<a href="(http://www\.iqiyi\.com/v_[^"]+)"', video_page))
videos = set(re.findall(r'<a href="(?=https?:)?(//www\.iqiyi\.com/v_[^"]+)"', video_page))
for video in videos:
self.__class__().download_by_url(video, **kwargs)
self.__class__().download_by_url('https:' + video, **kwargs)
def prepare(self, **kwargs):
assert self.url or self.vid
@ -131,10 +131,10 @@ class Iqiyi(VideoExtractor):
html = get_html(self.url)
tvid = r1(r'#curid=(.+)_', self.url) or \
r1(r'tvid=([^&]+)', self.url) or \
r1(r'data-player-tvid="([^"]+)"', html) or r1(r'tv(?:i|I)d=(.+?)\&', html) or r1(r'param\[\'tvid\'\]\s*=\s*"(.+?)"', html)
r1(r'data-player-tvid="([^"]+)"', html) or r1(r'tv(?:i|I)d=(\w+?)\&', html) or r1(r'param\[\'tvid\'\]\s*=\s*"(.+?)"', html)
videoid = r1(r'#curid=.+_(.*)$', self.url) or \
r1(r'vid=([^&]+)', self.url) or \
r1(r'data-player-videoid="([^"]+)"', html) or r1(r'vid=(.+?)\&', html) or r1(r'param\[\'vid\'\]\s*=\s*"(.+?)"', html)
r1(r'data-player-videoid="([^"]+)"', html) or r1(r'vid=(\w+?)\&', html) or r1(r'param\[\'vid\'\]\s*=\s*"(.+?)"', html)
self.vid = (tvid, videoid)
info_u = 'http://pcw-api.iqiyi.com/video/video/playervideoinfo?tvid=' + tvid
json_res = get_content(info_u)
@ -153,7 +153,7 @@ class Iqiyi(VideoExtractor):
except Exception as e:
log.i("vd: {} is not handled".format(stream['vd']))
log.i("info is {}".format(stream))
def download(self, **kwargs):
"""Override the original one
@ -201,10 +201,15 @@ class Iqiyi(VideoExtractor):
if not urls:
log.wtf('[Failed] Cannot extract video source.')
# For legacy main()
#Here's the change!!
download_url_ffmpeg(urls[0], self.title, 'mp4', output_dir=kwargs['output_dir'], merge=kwargs['merge'], stream=False)
#Here's the change!!
# ffmpeg fails to parse.
# download_url_ffmpeg(urls[0], self.title, 'mp4', output_dir=kwargs['output_dir'], merge=kwargs['merge'], stream=False)
# Here's the approach that works:
urls = general_m3u8_extractor(urls[0])
# ffmpeg fails to convert the output video with an mkv extension, due to some sort of timestamp problem
download_urls(urls, self.title, 'mp4', 0, **kwargs)
if not kwargs['caption']:
print('Skipping captions.')
return
@ -215,7 +220,7 @@ class Iqiyi(VideoExtractor):
with open(os.path.join(kwargs['output_dir'], filename),
'w', encoding='utf-8') as x:
x.write(srt)
print('Done.')
print('Done.')
'''
if info["code"] != "A000000":

View File

@ -27,6 +27,9 @@ def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
api_url = video_url + '/api/video/' + video_hash
content = get_content(api_url, headers=headers)
data = json.loads(content)
if len(data) < 1:
print('Maybe a private video? [' + title + ']')
return True
down_urls = 'https:' + data[0]['uri']
type, ext, size = url_info(down_urls, headers=headers)
print_info(site_info, title+data[0]['resolution'], type, size)
@ -35,10 +38,8 @@ def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
download_urls([down_urls], title, ext, size, output_dir, merge=merge, headers=headers)
def download_playlist_by_url( url, **kwargs):
video_page = get_content(url)
# url_first=re.findall(r"(http[s]?://[^/]+)",url)
video_page = get_html(url)
url_first=match1(url, r"(http[s]?://[^/]+)")
# print (url_first)
videos = set(re.findall(r'<a href="(/videos/[^"]+)"', video_page))
if len(videos) > 0:
for video in videos:

View File

@ -18,121 +18,97 @@ headers = {
}
def int_overflow(val):
maxint = 2147483647
if not -maxint - 1 <= val <= maxint:
val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
return val
def unsigned_right_shitf(n, i):
if n < 0:
n = ctypes.c_uint32(n).value
if i < 0:
return -int_overflow(n << abs(i))
return int_overflow(n >> i)
def get_video_url_from_video_id(video_id):
"""Splicing URLs according to video ID to get video details"""
# from js
data = [""] * 256
for index, _ in enumerate(data):
t = index
for i in range(8):
t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1)
data[index] = t
def tmp():
rand_num = random.random()
path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id,
random_num=str(rand_num)[2:])
e = o = r = -1
i, a = 0, len(path)
while i < a:
e = ord(path[i])
i += 1
if e < 128:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)]
else:
if e < 2048:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
else:
if 55296 <= e < 57344:
e = (1023 & e) + 64
i += 1
o = 1023 & t.url(i)
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))]
else:
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))]
r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0))
while 1:
url = tmp()
if url.split("=")[-1][0] != "-": # 参数s不能为负数
return url
def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
def ixigua_download(url, output_dir='.', merge=True, info_only=False, stream_id='', **kwargs):
# example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422
resp = urlopen_with_retry(request.Request(url))
headers['cookie'] = "MONITOR_WEB_ID=7892c49b-296e-4499-8704-e47c1b15123; " \
"ixigua-a-s=1; ttcid=af99669b6304453480454f1507011d5c234; BD_REF=1; " \
"__ac_nonce=060d88ff000a75e8d17eb; __ac_signature=_02B4Z6wo100f01kX9ZpgAAIDAKIBBQUIPYT5F2WIAAPG2ad; " \
"ttwid=1%7CcIsVF_3vqSIk4XErhPB0H2VaTxT0tdsTMRbMjrJOPN8%7C1624806049%7C08ce7dd6f7d20506a41ba0a331ef96a6505d96731e6ad9f6c8c709f53f227ab1; "
resp = urlopen_with_retry(request.Request(url, headers=headers))
html = resp.read().decode('utf-8')
_cookies = []
for c in resp.getheader('Set-Cookie').split("httponly,"):
_cookies.append(c.strip().split(' ')[0])
headers['cookie'] = ' '.join(_cookies)
headers['cookie'] += ' '.join(_cookies)
conf = loads(match1(html, r"window\.config = (.+);"))
if not conf:
log.e("Get window.config from url failed, url: {}".format(url))
match_txt = match1(html, r"<script id=\"SSR_HYDRATED_DATA\">window._SSR_HYDRATED_DATA=(.*?)<\/script>")
if not match_txt:
log.e("Get video info from url failed, url: {}".format(url))
return
verify_url = conf['prefix'] + conf['url'] + '?key=' + conf['key'] + '&psm=' + conf['psm'] \
+ '&_signature=' + ''.join(random.sample(string.ascii_letters + string.digits, 31))
try:
ok = get_content(verify_url)
except Exception as e:
ok = e.msg
if ok != 'OK':
log.e("Verify failed, verify_url: {}, result: {}".format(verify_url, ok))
video_info = loads(match_txt.replace('":undefined', '":null'))
if not video_info:
log.e("video_info not found, url:{}".format(url))
return
html = get_content(url, headers=headers)
video_id = match1(html, r"\"vid\":\"([^\"]+)")
title = match1(html, r"\"player__videoTitle\">.*?<h1.*?>(.*)<\/h1><\/div>")
if not video_id:
log.e("video_id not found, url:{}".format(url))
title = video_info['anyVideo']['gidInformation']['packerData']['video']['title']
video_resource = video_info['anyVideo']['gidInformation']['packerData']['video']['videoResource']
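# Prefer the 'dash' stream list, then 'dash_120fps', and fall back to the plain 'normal' list.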
if video_resource.get('dash', None):
video_list = video_resource['dash']
elif video_resource.get('dash_120fps', None):
video_list = video_resource['dash_120fps']
elif video_resource.get('normal', None):
video_list = video_resource['normal']
else:
log.e("video_list not found, url:{}".format(url))
return
video_info_url = get_video_url_from_video_id(video_id)
video_info = loads(get_content(video_info_url))
if video_info.get("code", 1) != 0:
log.e("Get video info from {} error: server return code {}".format(video_info_url, video_info.get("code", 1)))
return
if not video_info.get("data", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data or data is empty".format(video_info_url))
return
if not video_info["data"].get("video_list", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data.video_list or data.video_list is empty".format(video_info_url))
return
if not video_info["data"]["video_list"].get("video_1", None):
log.e("Get video info from {} error: The server returns JSON value"
" without data.video_list.video_1 or data.video_list.video_1 is empty".format(video_info_url))
return
bestQualityVideo = list(video_info["data"]["video_list"].keys())[-1] #There is not only video_1, there might be video_2
size = int(video_info["data"]["video_list"][bestQualityVideo]["size"])
print_info(site_info=site_info, title=title, type="mp4", size=size) # this site only serves mp4 files
if not info_only:
video_url = base64.b64decode(video_info["data"]["video_list"][bestQualityVideo]["main_url"].encode("utf-8"))
download_urls([video_url.decode("utf-8")], title, "mp4", size, output_dir, merge=merge, headers=headers, **kwargs)
streams = [
# {'file_id': 'fc1b9bf8e8e04a849d90a5172d3f6919', 'quality': "normal", 'size': 0,
# 'definition': '720p', 'video_url': '','audio_url':'','v_type':'dash'},
]
# prefer merging the watermark-free video and audio streams; if unavailable, fall back to the watermarked mp4
if video_list.get('dynamic_video', None):
audio_url = base64.b64decode(
video_list['dynamic_video']['dynamic_audio_list'][0]['main_url'].encode("utf-8")).decode("utf-8")
dynamic_video_list = video_list['dynamic_video']['dynamic_video_list']
streams = convertStreams(dynamic_video_list, audio_url)
elif video_list.get('video_list', None):
dynamic_video_list = video_list['video_list']
streams = convertStreams(dynamic_video_list, "")
print("title: %s" % title)
for stream in streams:
if stream_id != "" and stream_id != stream['definition']:
continue
print(" - format: %s" % stream['definition'])
print(" size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
print(" quality: %s " % stream['quality'])
print(" v_type: %s " % stream['v_type'])
# print(" video_url: %s " % stream['video_url'])
# print(" audio_url: %s " % stream['audio_url'])
print()
# if not info-only, download the first matching stream
if not info_only:
urls = [stream['video_url']]
if stream['audio_url'] != "":
urls.append(stream['audio_url'])
kwargs['av'] = 'av' # this makes download_urls merge the audio and video streams
download_urls(urls, title, "mp4", stream['size'], output_dir, merge=merge, headers=headers,
**kwargs)
return
def convertStreams(video_list, audio_url):
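# Normalize raw stream entries into uniform dicts; main_url is base64-encoded in the page data, so decode it here.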
streams = []
if type(video_list) == dict:
video_list = video_list.values()
for dynamic_video in video_list:
streams.append({
'file_id': dynamic_video['file_hash'],
'quality': dynamic_video['quality'],
'size': dynamic_video['size'],
'definition': dynamic_video['definition'],
'video_url': base64.b64decode(dynamic_video['main_url'].encode("utf-8")).decode("utf-8"),
'audio_url': audio_url,
'v_type': dynamic_video['vtype'],
})
return streams
def ixigua_download_playlist_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):

View File

@ -50,7 +50,7 @@ def ku6_download(url, output_dir = '.', merge = True, info_only = False, **kwarg
vid = vid.group(1)
else:
raise Exception('Unsupported url')
this_meta = re.search('"?'+vid+'"?:\{(.+?)\}', meta)
this_meta = re.search('"?'+vid+r'"?:\{(.+?)\}', meta)
if this_meta is not None:
this_meta = this_meta.group(1)
title = re.search('title:"(.+?)"', this_meta).group(1)

View File

@ -32,8 +32,8 @@ def kugou_download(url, output_dir=".", merge=True, info_only=False, **kwargs):
def kugou_download_by_hash(url, output_dir='.', merge=True, info_only=False):
# sample
# url_sample:http://www.kugou.com/song/#hash=93F7D2FC6E95424739448218B591AEAF&album_id=9019462
hash_val = match1(url, 'hash=(\w+)')
album_id = match1(url, 'album_id=(\d+)')
hash_val = match1(url, r'hash=(\w+)')
album_id = match1(url, r'album_id=(\d+)')
if not album_id:
album_id = 123
html = get_html("http://www.kugou.com/yy/index.php?r=play/getdata&hash={}&album_id={}&mid=123".format(hash_val, album_id))
@ -60,7 +60,7 @@ def kugou_download_playlist(url, output_dir='.', merge=True, info_only=False, **
res = pattern.findall(html)
for song in res:
res = get_html(song)
pattern_url = re.compile('"hash":"(\w+)".*"album_id":(\d)+')
pattern_url = re.compile(r'"hash":"(\w+)".*"album_id":(\d)+')
hash_val, album_id = res = pattern_url.findall(res)[0]
if not album_id:
album_id = 123
@ -70,7 +70,7 @@ def kugou_download_playlist(url, output_dir='.', merge=True, info_only=False, **
# album sample: http://www.kugou.com/yy/album/single/1645030.html
elif url.lower().find('album') != -1:
html = get_html(url)
pattern = re.compile('var data=(\[.*?\]);')
pattern = re.compile(r'var data=(\[.*?\]);')
res = pattern.findall(html)[0]
for v in json.loads(res):
urls.append('http://www.kugou.com/song/#hash=%s&album_id=%s' % (v['hash'], v['album_id']))
@ -79,7 +79,7 @@ def kugou_download_playlist(url, output_dir='.', merge=True, info_only=False, **
# playlist sample:http://www.kugou.com/yy/special/single/487279.html
else:
html = get_html(url)
pattern = re.compile('data="(\w+)\|(\d+)"')
pattern = re.compile(r'data="(\w+)\|(\d+)"')
for v in pattern.findall(html):
urls.append('http://www.kugou.com/song/#hash=%s&album_id=%s' % (v[0], v[1]))
print('http://www.kugou.com/song/#hash=%s&album_id=%s' % (v[0], v[1]))

View File

@ -18,7 +18,7 @@ def kuwo_download_by_rid(rid, output_dir = '.', merge = True, info_only = False)
def kuwo_playlist_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
html=get_content(url)
matched=set(re.compile("yinyue/(\d+)").findall(html))#reduce duplicated
matched=set(re.compile(r"yinyue/(\d+)").findall(html))#reduce duplicated
for rid in matched:
kuwo_download_by_rid(rid,output_dir,merge,info_only)
@ -26,7 +26,7 @@ def kuwo_playlist_download(url, output_dir = '.', merge = True, info_only = Fals
def kuwo_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
if "www.kuwo.cn/yinyue" in url:
rid=match1(url,'yinyue/(\d+)')
rid=match1(url, r'yinyue/(\d+)')
kuwo_download_by_rid(rid,output_dir, merge, info_only)
else:
kuwo_playlist_download(url,output_dir,merge,info_only)

View File

@ -0,0 +1,81 @@
#!/usr/bin/env python
__all__ = ['lrts_download']
import logging
from ..common import *
from ..util import log, term
def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
html = get_html(url)
args = kwargs.get('args')
if not args: args = {}
matched = re.search(r"/book/(\d+)", url)
if not matched:
raise AssertionError("not found book number: %s" % url)
book_no = matched.group(1)
book_title = book_no
matched = re.search(r"<title>([^-]*)[-](.*)[,](.*)</title>", html)
if matched:
book_title = matched.group(1)
matched = re.search(r"var totalCount='(\d+)'", html)
if not matched:
raise AssertionError("not found total count in html")
total_count = int(matched.group(1))
log.i('%s total: %s' % (book_title, total_count))
first_page = 0
if ('first' in args and args.first != None):
first_page = int(args.first)
page_size = 10
if ('page_size' in args and args.page_size != None):
page_size = int(args.page_size)
last_page = (total_count // page_size) + 1
if ('last' in args and args.last != None):
last_page = int(args.last)
log.i('page size is %s, page from %s to %s' % (page_size, first_page, last_page))
headers = {
'Referer': url
}
items = []
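# Fetch the track list page by page from the AJAX endpoint; stop at the first empty page.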
for page in range(first_page, last_page):
page_url = 'http://www.lrts.me/ajax/book/%s/%s/%s' % (book_no, page, page_size)
response_content = json.loads(post_content(page_url, headers))
if response_content['status'] != 'success':
raise AssertionError("got the page failed: %s" % (page_url))
data = response_content['data']['data']
if data:
for i in data:
i['resName'] = parse.unquote(i['resName'])
items.extend(data)
else:
break
headers = {
'Referer': 'http://www.lrts.me/playlist'
}
for item in items:
i_url = 'http://www.lrts.me/ajax/path/4/%s/%s' % (item['fatherResId'], item['resId'])
response_content = json.loads(post_content(i_url, headers))
if response_content['status'] == 'success' and response_content['data']:
item['ok'] = True
item['url'] = response_content['data']
logging.debug('ok')
items = list(filter(lambda i: 'ok' in i and i['ok'], items))
log.i('Downloading %s: %s count ...' % (book_title, len(items)))
for item in items:
title = item['resName']
file_url = item['url']
# if not file_url: continue
_, _, size = url_info(file_url)
print_info(site_info, title, 'mp3', size)
if not info_only:
download_urls([file_url], title, 'mp3', size, output_dir, merge=merge)
site_info = "lrts.me"
download = lrts_download
download_playlist = lrts_download

View File

@ -9,87 +9,130 @@ from urllib.parse import urlsplit
from os.path import dirname
import re
import base64
import time
import uuid
class MGTV(VideoExtractor):
name = "芒果 (MGTV)"
# Last updated: 2016-11-13
stream_types = [
{'id': 'fhd', 'container': 'ts', 'video_profile': '蓝光'},
{'id': 'hd', 'container': 'ts', 'video_profile': '超清'},
{'id': 'sd', 'container': 'ts', 'video_profile': '高清'},
{'id': 'ld', 'container': 'ts', 'video_profile': '标清'},
]
id_dic = {i['video_profile']:(i['id']) for i in stream_types}
api_endpoint = 'http://pcweb.api.mgtv.com/player/video?video_id={video_id}'
id_dic = {i['video_profile']: (i['id']) for i in stream_types}
did = str(uuid.uuid4())
ver = '0.3.0301'
pno = '1030'
def tk2(self):
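# Build the tk2 token: urlsafe base64 of "did=...|ver=...|pno=...|clit=<timestamp>", then reversed.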
return base64.urlsafe_b64encode(b'did=%s|ver=%s|pno=%s|clit=%d' % (
self.did.encode(), self.ver.encode(), self.pno.encode(), time.time())).decode('utf-8')[::-1]
info_endpoint = 'https://pcweb.api.mgtv.com/video/info?vid={video_id}'
player_endpoint = 'https://pcweb.api.mgtv.com/player/video?did={did}&tk2={tk2}&video_id={video_id}'
source_endpoint = 'https://pcweb.api.mgtv.com/player/getSource?tk2={tk2}&pm2={pm2}&video_id={video_id}'
playlist_endpoint = 'https://pcweb.api.mgtv.com/episode/list?video_id={video_id}&page={page}&size=30'
@staticmethod
def get_vid_from_url(url):
"""Extracts video ID from URL.
"""
vid = match1(url, 'https?://www.mgtv.com/(?:b|l)/\d+/(\d+).html')
vid = match1(url, r'https?://www.mgtv.com/(?:b|l)/\d+/(\d+).html')
if not vid:
vid = match1(url, 'https?://www.mgtv.com/hz/bdpz/\d+/(\d+).html')
vid = match1(url, r'https?://www.mgtv.com/hz/bdpz/\d+/(\d+).html')
if not vid:
vid = match1(url, r'https?://www.mgtv.com/s/(\d+).html')
return vid
#----------------------------------------------------------------------
@staticmethod
def get_mgtv_real_url(url):
# ----------------------------------------------------------------------
def get_mgtv_real_url(self, url):
"""str->list of str
Give you the real URLs."""
content = loads(get_content(url))
m3u_url = content['info']
split = urlsplit(m3u_url)
base_url = "{scheme}://{netloc}{path}/".format(scheme = split[0],
netloc = split[1],
path = dirname(split[2]))
content = get_content(content['info']) #get the REAL M3U url, maybe to be changed later?
base_url = "{scheme}://{netloc}{path}/".format(scheme=split[0],
netloc=split[1],
path=dirname(split[2]))
content = get_content(content['info'],
headers={'Referer': self.url}) # get the REAL M3U url, maybe to be changed later?
segment_list = []
segments_size = 0
for i in content.split():
if not i.startswith('#'): #not the best way, better we use the m3u8 package
if not i.startswith('#'): # not the best way, better we use the m3u8 package
segment_list.append(base_url + i)
# use ext-info for fast size calculation
elif i.startswith('#EXT-MGTV-File-SIZE:'):
segments_size += int(i[i.rfind(':')+1:])
segments_size += int(i[i.rfind(':') + 1:])
return m3u_url, segments_size, segment_list
def download_playlist_by_url(self, url, **kwargs):
pass
self.url = url
self.vid = self.get_vid_from_url(self.url)
content_playlist = get_content(self.playlist_endpoint.format(video_id=self.vid, page=1))
content_playlist = loads(content_playlist)
for ep in content_playlist['data']['list']:
self.download_by_url('https://www.mgtv.com' + ep['url'], **kwargs)
max_page = content_playlist['data']['total_page']
for page in range(2, max_page + 1):
content_playlist = get_content(self.playlist_endpoint.format(video_id=self.vid, page=page))
content_playlist = loads(content_playlist)
for ep in content_playlist['data']['list']:
self.download_by_url('https://www.mgtv.com' + ep['url'], **kwargs)
def prepare(self, **kwargs):
if self.url:
self.vid = self.get_vid_from_url(self.url)
content = get_content(self.api_endpoint.format(video_id = self.vid))
content = loads(content)
self.title = content['data']['info']['title']
domain = content['data']['stream_domain'][0]
#stream_available = [i['name'] for i in content['data']['stream']]
content_info = get_content(self.info_endpoint.format(video_id=self.vid))
log.d(content_info)
content_info = loads(content_info)
self.title = content_info['data']['info']['videoName']
content_player = get_content(self.player_endpoint.format(did=self.did, video_id=self.vid, tk2=self.tk2()))
log.d(content_player)
content_player = loads(content_player)
pm2 = content_player['data']['atc']['pm2']
content_source = get_content(self.source_endpoint.format(video_id=self.vid, tk2=self.tk2(), pm2=pm2))
log.d(content_source)
content_source = loads(content_source)
domain = content_source['data']['stream_domain'][0]
# stream_available = [i['name'] for i in content['data']['stream']]
stream_available = {}
for i in content['data']['stream']:
for i in content_source['data']['stream']:
stream_available[i['name']] = i['url']
for s in self.stream_types:
if s['video_profile'] in stream_available.keys():
quality_id = self.id_dic[s['video_profile']]
url = stream_available[s['video_profile']]
url = domain + re.sub( r'(\&arange\=\d+)', '', url) #Un-Hum
if url is None or url == '':
# skip invalid profile with empty url
continue
url = domain + re.sub(r'(\&arange\=\d+)', '', url) # Un-Hum
m3u8_url, m3u8_size, segment_list_this = self.get_mgtv_real_url(url)
stream_fileid_list = []
for i in segment_list_this:
stream_fileid_list.append(os.path.basename(i).split('.')[0])
#make pieces
pieces = []
for i in zip(stream_fileid_list, segment_list_this):
pieces.append({'fileid': i[0], 'segs': i[1],})
# make pieces
pieces = []
for i in zip(stream_fileid_list, segment_list_this):
pieces.append({'fileid': i[0], 'segs': i[1], })
self.streams[quality_id] = {
self.streams[quality_id] = {
'container': s['container'],
'video_profile': s['video_profile'],
'size': m3u8_size,
@ -97,8 +140,8 @@ class MGTV(VideoExtractor):
'm3u8_url': m3u8_url
}
if not kwargs['info_only']:
self.streams[quality_id]['src'] = segment_list_this
if not kwargs['info_only']:
self.streams[quality_id]['src'] = segment_list_this
def extract(self, **kwargs):
if 'stream_id' in kwargs and kwargs['stream_id']:
@ -132,7 +175,8 @@ class MGTV(VideoExtractor):
if 'index' not in kwargs:
self.p([])
else:
stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag']
stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else \
self.streams_sorted[0]['itag']
self.p_i(stream_id)
# default to use the best quality
@ -148,8 +192,10 @@ class MGTV(VideoExtractor):
else:
download_urls(stream_info['src'], self.title, stream_info['container'], stream_info['size'],
output_dir=kwargs['output_dir'],
merge=kwargs.get('merge', True))
# av=stream_id in self.dash_streams)
merge=kwargs.get('merge', True),
headers={'Referer': self.url})
# av=stream_id in self.dash_streams)
site = MGTV()
download = site.download_by_url

View File

@ -19,7 +19,7 @@ fake_headers_mobile = {
def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = False, **kwargs):
'''Source: Android mobile'''
page_url = 'http://video.weibo.com/show?fid=' + fid + '&type=mp4'
page_url = 'https://video.weibo.com/show?fid=' + fid + '&type=mp4'
mobile_page = get_content(page_url, headers=fake_headers_mobile)
url = match1(mobile_page, r'<video id=.*?src=[\'"](.*?)[\'"]\W')
@ -78,6 +78,53 @@ def miaopai_download_story(url, output_dir='.', merge=False, info_only=False, **
download_urls([stream_url], fs.legitimize(title), ext, total_size=None, headers=fake_headers_mobile, **kwargs)
def miaopai_download_h5api(url, output_dir='.', merge=False, info_only=False, **kwargs):
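# Resolve the video via Weibo's H5 component API (Component_Play_Playinfo), which returns one play URL per format.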
oid = match1(url, r'/show/(\d{4}:\w+)')
if oid is None:
oid = match1(url, r'\?fid=(\d{4}:\w+)')
page = "/show/%s" % oid
data_url = 'https://h5.video.weibo.com/api/component?%s' % parse.urlencode({
'page': page
})
headers = {}
headers.update(fake_headers_mobile)
headers['origin'] = 'https://h5.video.weibo.com'
headers['page-referer'] = page
headers['referer'] = 'https://h5.video.weibo.com/show/%s' % oid
post_data = {
"data": json.dumps({
"Component_Play_Playinfo": {"oid": oid}
})
}
data_content = post_content(data_url, headers=headers, post_data=post_data)
data = json.loads(data_content)
if data['msg'] != 'succ':
raise Exception('Weibo API returned non-success: ({}){}'.format(data['code'], data['msg']))
play_info = data['data']['Component_Play_Playinfo']
title = play_info['title']
# get video formats and sort by size desc
video_formats = []
for fmt, relative_uri in play_info['urls'].items():
url = "https:%s" % relative_uri
type, ext, size = url_info(url, headers=headers)
video_formats.append({
'fmt': fmt,
'url': url,
'type': type,
'ext': ext,
'size': size,
})
video_formats.sort(key=lambda v:v['size'], reverse=True)
selected_video = video_formats[0]
video_url, ext, size = selected_video['url'], selected_video['ext'], selected_video['size']
print_info(site_info, title, ext, size)
if not info_only:
download_urls([video_url], fs.legitimize(title), ext, total_size=size, headers=headers, **kwargs)
def miaopai_download_direct(url, output_dir='.', merge=False, info_only=False, **kwargs):
mobile_page = get_content(url, headers=fake_headers_mobile)
try:
@ -108,12 +155,19 @@ def miaopai_download(url, output_dir='.', merge=False, info_only=False, **kwargs
if re.match(r'^http[s]://.*\.weibo\.com/tv/v/(\w+)', url):
return miaopai_download_direct(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
if re.match(r'^http[s]://(.+\.)?weibo\.com/(tv/)?show/(\d{4}:\w+)', url):
return miaopai_download_h5api(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
if re.match(r'^http[s]://(.+\.)?weibo\.com/show\?fid=(\d{4}:\w+)', url):
return miaopai_download_h5api(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
fid = match1(url, r'\?fid=(\d{4}:\w+)')
if fid is not None:
miaopai_download_by_fid(fid, output_dir, merge, info_only)
elif '/p/230444' in url:
fid = match1(url, r'/p/230444(\w+)')
miaopai_download_by_fid('1034:'+fid, output_dir, merge, info_only)
pass
else:
mobile_page = get_content(url, headers = fake_headers_mobile)
hit = re.search(r'"page_url"\s*:\s*"([^"]+)"', mobile_page)

View File

@ -25,6 +25,7 @@ SOFTWARE.
import json
import os
import re
import urllib.parse
from ..common import get_content, urls_size, log, player, dry_run
from ..extractor import VideoExtractor
@ -75,17 +76,13 @@ class _Dispatcher(object):
raise _NoMatchException()
missevan_stream_types = [
{'id': 'source', 'quality': '源文件', 'url_json_key': 'soundurl',
'resource_url_fmt': 'sound/{resource_url}'},
{'id': '320', 'quality': '320 Kbps', 'url_json_key': 'soundurl_64'},
{'id': 'source', 'quality': '源文件', 'url_json_key': 'soundurl'},
{'id': '128', 'quality': '128 Kbps', 'url_json_key': 'soundurl_128'},
{'id': '32', 'quality': '32 Kbps', 'url_json_key': 'soundurl_32'},
{'id': 'covers', 'desc': '封面图', 'url_json_key': 'cover_image',
'default_src': 'covers/nocover.png',
'resource_url_fmt': 'covers/{resource_url}'},
{'id': 'coversmini', 'desc': '封面缩略图', 'url_json_key': 'cover_image',
'default_src': 'coversmini/nocover.png',
'resource_url_fmt': 'coversmini/{resource_url}'}
{'id': 'coversmini', 'desc': '封面缩略图', 'url_json_key': 'front_cover',
'default_src': 'coversmini/nocover.png'}
]
def _get_resource_uri(data, stream_type):
@ -103,7 +100,8 @@ def is_covers_stream(stream):
return stream.lower() in ('covers', 'coversmini')
def get_file_extension(file_path, default=''):
_, suffix = os.path.splitext(file_path)
url_parse_result = urllib.parse.urlparse(file_path)
_, suffix = os.path.splitext(url_parse_result.path)
if suffix:
# remove dot
suffix = suffix[1:]
@ -314,7 +312,7 @@ class MissEvan(VideoExtractor):
or kwargs.get('json_output'):
for _, stream in self.streams.items():
stream['size'] = urls_size(stream['src'])
stream['size'] = urls_size(stream['src'], faker=True)
return
# fetch size of the selected stream only
@ -323,7 +321,7 @@ class MissEvan(VideoExtractor):
stream = self.streams[stream_id]
if 'size' not in stream:
stream['size'] = urls_size(stream['src'])
stream['size'] = urls_size(stream['src'], faker=True)
def _get_content(self, url):
return get_content(url, headers=self.__headers)
@ -353,7 +351,7 @@ class MissEvan(VideoExtractor):
@staticmethod
def url_resource(uri):
return 'https://static.missevan.com/' + uri
return uri if re.match(r'^https?:/{2}\w.+$', uri) else 'https://static.missevan.com/' + uri
site = MissEvan()
site_info = 'MissEvan.com'

View File

@ -28,7 +28,7 @@ def mtv81_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
#
# rtmpdump -r 'rtmpe://cp30865.edgefcs.net/ondemand/mtviestor/_!/intlod/MTVInternational/MBUS/GeoLocals/00JP/VIAMTVI/PYC/201304/7122HVAQ4/00JPVIAMTVIPYC7122HVAQ4_640x_360_1200_m30.mp4' -o "title.mp4" --swfVfy http://media.mtvnservices.com/player/prime/mediaplayerprime.1.10.8.swf
#
# because rtmpdump is unstable,may try serveral times
# because rtmpdump is unstable,may try several times
#
if not info_only:
# import pdb

View File

@ -79,9 +79,14 @@ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=Fals
netease_song_download(j["program"]["mainSong"], output_dir=output_dir, info_only=info_only)
elif "radio" in url:
j = loads(get_content("http://music.163.com/api/dj/program/byradio/?radioId=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"}))
for i in j['programs']:
netease_song_download(i["mainSong"],output_dir=output_dir, info_only=info_only)
offset = 0
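# Page through all radio programs, advancing the offset until the API reports no more.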
while True:
j = loads(get_content("http://music.163.com/api/dj/program/byradio/?radioId=%s&ids=[%s]&csrf_token=&offset=%d" % (rid, rid, offset), headers={"Referer": "http://music.163.com/"}))
for i in j['programs']:
netease_song_download(i["mainSong"], output_dir=output_dir, info_only=info_only)
if not j['more']:
break
offset += len(j['programs'])
elif "mv" in url:
j = loads(get_content("http://music.163.com/api/mv/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"}))
@ -123,10 +128,10 @@ def netease_song_download(song, output_dir='.', info_only=False, playlist_prefix
output_dir=output_dir, info_only=info_only)
def netease_download_common(title, url_best, output_dir, info_only):
songtype, ext, size = url_info(url_best)
songtype, ext, size = url_info(url_best, faker=True)
print_info(site_info, title, songtype, size)
if not info_only:
download_urls([url_best], title, ext, size, output_dir)
download_urls([url_best], title, ext, size, output_dir, faker=True)
def netease_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):

View File

@ -174,7 +174,7 @@ def make_url(stream):
src = []
for i, seg in enumerate(stream['segs']):
url = 'http://{}/{}/{}?key={}&k={}'.format(host, i, rid, key, key_expr)
url += '&fpp.ver=1.3.0.4&type='
url += '&type=web.fpp'
src.append(url)
return src
@ -189,17 +189,27 @@ class PPTV(VideoExtractor):
]
def prepare(self, **kwargs):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/69.0.3497.100 Safari/537.36"
}
self.vid = match1(self.url, r'https?://sports.pptv.com/vod/(\d+)/*')
if self.url and not self.vid:
if not re.match(r'https?://v.pptv.com/show/(\w+)\.html', self.url):
raise Exception('Unknown url pattern')
page_content = get_content(self.url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"})
page_content = get_content(self.url, headers)
self.vid = match1(page_content, r'webcfg\s*=\s*{"id":\s*(\d+)')
if not self.vid:
request = urllib.request.Request(self.url, headers=headers)
response = urllib.request.urlopen(request)
self.vid = match1(response.url, r'https?://sports.pptv.com/vod/(\d+)/*')
if not self.vid:
raise Exception('Cannot find id')
api_url = 'http://web-play.pptv.com/webplay3-0-{}.xml'.format(self.vid)
api_url += '?appplt=flp&appid=pptv.flashplayer.vod&appver=3.4.2.28&type=&version=4'
dom = parseString(get_content(api_url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}))
api_url += '?type=web.fpp&param=type=web.fpp&version=4'
dom = parseString(get_content(api_url, headers))
self.title, m_items, m_streams, m_segs = parse_pptv_xml(dom)
xml_streams = merge_meta(m_items, m_streams, m_segs)
for stream_id in xml_streams:
@ -212,146 +222,6 @@ class PPTV(VideoExtractor):
'src': src
}
'''
def constructKey(arg):
def str2hex(s):
r=""
for i in s[:8]:
t=hex(ord(i))[2:]
if len(t)==1:
t="0"+t
r+=t
for i in range(16):
r+=hex(int(15*random()))[2:]
return r
#ABANDONED Because SERVER_KEY is static
def getkey(s):
#returns 1896220160
l2=[i for i in s]
l4=0
l3=0
while l4<len(l2):
l5=l2[l4]
l6=ord(l5)
l7=l6<<((l4%4)*8)
l3=l3^l7
l4+=1
return l3
pass
def rot(k,b): ##>>> in as3
if k>=0:
return k>>b
elif k<0:
return (2**32+k)>>b
pass
def lot(k,b):
return (k<<b)%(2**32)
#WTF?
def encrypt(arg1,arg2):
delta=2654435769
l3=16;
l4=getkey(arg2) #1896220160
l8=[i for i in arg1]
l10=l4;
l9=[i for i in arg2]
l5=lot(l10,8)|rot(l10,24)#101056625
# assert l5==101056625
l6=lot(l10,16)|rot(l10,16)#100692230
# assert 100692230==l6
l7=lot(l10,24)|rot(l10,8)
# assert 7407110==l7
l11=""
l12=0
l13=ord(l8[l12])<<0
l14=ord(l8[l12+1])<<8
l15=ord(l8[l12+2])<<16
l16=ord(l8[l12+3])<<24
l17=ord(l8[l12+4])<<0
l18=ord(l8[l12+5])<<8
l19=ord(l8[l12+6])<<16
l20=ord(l8[l12+7])<<24
l21=(((0|l13)|l14)|l15)|l16
l22=(((0|l17)|l18)|l19)|l20
l23=0
l24=0
while l24<32:
l23=(l23+delta)%(2**32)
l33=(lot(l22,4)+l4)%(2**32)
l34=(l22+l23)%(2**32)
l35=(rot(l22,5)+l5)%(2**32)
l36=(l33^l34)^l35
l21=(l21+l36)%(2**32)
l37=(lot(l21,4)+l6)%(2**32)
l38=(l21+l23)%(2**32)
l39=(rot(l21,5))%(2**32)
l40=(l39+l7)%(2**32)
l41=((l37^l38)%(2**32)^l40)%(2**32)
l22=(l22+l41)%(2**32)
l24+=1
l11+=chr(rot(l21,0)&0xff)
l11+=chr(rot(l21,8)&0xff)
l11+=chr(rot(l21,16)&0xff)
l11+=chr(rot(l21,24)&0xff)
l11+=chr(rot(l22,0)&0xff)
l11+=chr(rot(l22,8)&0xff)
l11+=chr(rot(l22,16)&0xff)
l11+=chr(rot(l22,24)&0xff)
return l11
loc1=hex(int(arg))[2:]+(16-len(hex(int(arg))[2:]))*"\x00"
SERVER_KEY="qqqqqww"+"\x00"*9
res=encrypt(loc1,SERVER_KEY)
return str2hex(res)
def pptv_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
xml = get_html('http://web-play.pptv.com/webplay3-0-%s.xml?type=web.fpp' % id)
#vt=3 means vod mode vt=5 means live mode
host = r1(r'<sh>([^<>]+)</sh>', xml)
k = r1(r'<key expire=[^<>]+>([^<>]+)</key>', xml)
rid = r1(r'rid="([^"]+)"', xml)
title = r1(r'nm="([^"]+)"', xml)
st=r1(r'<st>([^<>]+)</st>',xml)[:-4]
st=time.mktime(time.strptime(st))*1000-60*1000-time.time()*1000
st+=time.time()*1000
st=st/1000
key=constructKey(st)
pieces = re.findall('<sgm no="(\d+)"[^<>]+fs="(\d+)"', xml)
numbers, fs = zip(*pieces)
urls=["http://{}/{}/{}?key={}&fpp.ver=1.3.0.4&k={}&type=web.fpp".format(host,i,rid,key,k) for i in range(max(map(int,numbers))+1)]
total_size = sum(map(int, fs))
assert rid.endswith('.mp4')
print_info(site_info, title, 'mp4', total_size)
if not info_only:
try:
download_urls(urls, title, 'mp4', total_size, output_dir = output_dir, merge = merge)
except urllib.error.HTTPError:
#for key expired
pptv_download_by_id(id, output_dir = output_dir, merge = merge, info_only = info_only)
def pptv_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
assert re.match(r'http://v.pptv.com/show/(\w+)\.html', url)
html = get_html(url)
id = r1(r'webcfg\s*=\s*{"id":\s*(\d+)', html)
assert id
pptv_download_by_id(id, output_dir = output_dir, merge = merge, info_only = info_only)
'''
site = PPTV()
#site_info = "PPTV.com"
#download = pptv_download

View File

@ -10,7 +10,7 @@ __all__ = ['qingting_download_by_url']
class Qingting(VideoExtractor):
# every resource is described by its channel id and program id
# so vid is tuple (chaanel_id, program_id)
# so vid is tuple (channel_id, program_id)
name = 'Qingting'
stream_types = [

View File

@ -35,6 +35,7 @@ def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False):
part_urls= []
total_size = 0
ext = None
for part in range(1, seg_cnt+1):
if fc_cnt == 0:
# fix json parsing error
@ -82,7 +83,7 @@ def kg_qq_download_by_shareid(shareid, output_dir='.', info_only=False, caption=
playurl = json_data['data']['playurl']
videourl = json_data['data']['playurl_video']
real_url = playurl if playurl else videourl
real_url = real_url.replace('\/', '/')
real_url = real_url.replace(r'\/', '/')
ksong_mid = json_data['data']['ksong_mid']
lyric_url = 'http://cgi.kg.qq.com/fcgi-bin/fcg_lyric?jsonpCallback=jsopgetlrcdata&outCharset=utf-8&ksongmid=' + ksong_mid

View File

@ -23,7 +23,7 @@ def real_url(fileName, key, ch):
def sohu_download(url, output_dir='.', merge=True, info_only=False, extractor_proxy=None, **kwargs):
if re.match(r'http://share.vrs.sohu.com', url):
vid = r1('id=(\d+)', url)
vid = r1(r'id=(\d+)', url)
else:
html = get_html(url)
vid = r1(r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?', html) or r1(r'bid:\'(\d+)\',', html) or r1(r'bid=(\d+)', html)

View File

@ -19,7 +19,7 @@ def get_sndcd_apikey():
def get_resource_info(resource_url, client_id):
cont = get_content(resource_url, decoded=True)
x = re.escape('forEach(function(e){n(e)})}catch(t){}})},')
x = re.escape('forEach(function(e){n(e)})}catch(e){}})},')
x = re.search(r'' + x + r'(.*)\);</script>', cont)
info = json.loads(x.group(1))[-1]['data'][0]

View File

@ -5,26 +5,43 @@ __all__ = ['tiktok_download']
from ..common import *
def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
html = get_html(url, faker=True)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
'Accept-Encoding': 'gzip, deflate',
'Accept': '*/*',
'Referer': 'https://www.tiktok.com/',
'Connection': 'keep-alive' # important
}
data = r1(r'<script id="__NEXT_DATA__".*?>(.*?)</script>', html)
m = re.match('(https?://)?([^/]+)(/.*)', url)
host = m.group(2)
if host != 'www.tiktok.com': # non-canonical URL
if host == 'vt.tiktok.com': # short URL
url = get_location(url)
vid = r1(r'/video/(\d+)', url)
url = 'https://www.tiktok.com/@/video/%s/' % vid
host = 'www.tiktok.com'
else:
url = m.group(3).split('?')[0]
vid = url.split('/')[3] # should be a string of numbers
html, set_cookie = getHttps(host, url, headers=headers)
tt_chain_token = r1('tt_chain_token=([^;]+);', set_cookie)
headers['Cookie'] = 'tt_chain_token=%s' % tt_chain_token
data = r1(r'<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" type="application/json">(.*?)</script>', html)
info = json.loads(data)
videoData = info['props']['pageProps']['videoData']
urls = videoData['itemInfos']['video']['urls']
videoId = videoData['itemInfos']['id']
uniqueId = videoData['authorInfos'].get('uniqueId')
nickName = videoData['authorInfos'].get('nickName')
itemStruct = info['__DEFAULT_SCOPE__']['webapp.video-detail']['itemInfo']['itemStruct']
downloadAddr = itemStruct['video']['downloadAddr']
author = itemStruct['author']['uniqueId']
nickname = itemStruct['author']['nickname']
title = '%s [%s]' % (nickname or author, vid)
for i, url in enumerate(urls):
title = '%s [%s]' % (nickName or uniqueId, videoId)
if len(urls) > 1:
title = '%s [%s]' % (title, i)
mime, ext, size = url_info(downloadAddr, headers=headers)
mime, ext, size = url_info(url)
print_info(site_info, title, mime, size)
if not info_only:
download_urls([url], title, ext, size, output_dir=output_dir, merge=merge)
print_info(site_info, title, mime, size)
if not info_only:
download_urls([downloadAddr], title, ext, size, output_dir=output_dir, merge=merge, headers=headers)
site_info = "TikTok.com"
download = tiktok_download
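In short, the rewritten extractor above no longer reads __NEXT_DATA__: it canonicalizes the URL, keeps the tt_chain_token cookie from the first response, and takes downloadAddr from the __UNIVERSAL_DATA_FOR_REHYDRATION__ JSON. A minimal standalone sketch of that flow, assuming a placeholder video id and using urllib instead of you-get's own getHttps helper:
import json, re, urllib.request

def tiktok_download_addr(vid='0000000000000000000'):
    # canonical watch URL, as built in the extractor above
    url = 'https://www.tiktok.com/@/video/%s/' % vid
    req = urllib.request.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
        'Referer': 'https://www.tiktok.com/',
    })
    with urllib.request.urlopen(req) as resp:
        set_cookie = resp.headers.get('Set-Cookie') or ''
        html = resp.read().decode('utf-8', 'replace')
    # the same cookie has to be sent back later when fetching downloadAddr
    token = re.search(r'tt_chain_token=([^;]+)', set_cookie)
    data = re.search(r'<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
                     r'type="application/json">(.*?)</script>', html)
    item = json.loads(data.group(1))['__DEFAULT_SCOPE__']['webapp.video-detail']['itemInfo']['itemStruct']
    title = '%s [%s]' % (item['author']['nickname'] or item['author']['uniqueId'], vid)
    return title, item['video']['downloadAddr'], token.group(1) if token else None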

View File

@ -71,7 +71,7 @@ def tudou_download(url, output_dir = '.', merge = True, info_only = False, **kwa
# obsolete?
def parse_playlist(url):
aid = r1('http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url)
aid = r1(r'http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url)
html = get_decoded_html(url)
if not aid:
aid = r1(r"aid\s*[:=]\s*'(\d+)'", html)

View File

@ -6,7 +6,6 @@ from ..common import *
from .universal import *
from .dailymotion import dailymotion_download
from .vimeo import vimeo_download
from .vine import vine_download
def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
if re.match(r'https?://\d+\.media\.tumblr\.com/', url):
@ -14,7 +13,7 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
return
import ssl
ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)) # server requires TLS v1.2
cookie_handler = request.HTTPCookieProcessor()
opener = request.build_opener(ssl_context, cookie_handler)
request.install_opener(opener)
@ -35,7 +34,7 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url)
page = get_html(url, faker=True)
html = parse.unquote(page).replace('\/', '/')
html = parse.unquote(page).replace(r'\/', '/')
feed = r1(r'<meta property="og:type" content="tumblr-feed:(\w+)" />', html)
if feed in ['photo', 'photoset', 'entry'] or feed is None:
@ -45,23 +44,30 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
r1(r'<title>([^<\n]*)', html)
urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.jpg)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.png)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^";&]+_\d+\.gif)', html)
re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.gif)', html) +\
re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.jpg)', html) +\
re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.png)', html) +\
re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.gif)', html)
tuggles = {}
for url in urls:
if url.endswith('.gif'):
hd_url = url
elif url.endswith('.jpg'):
hd_url = r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg' # FIXME: decide actual quality
hd_url = url # FIXME: decide actual quality # r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg'
elif url.endswith('.png'):
hd_url = r1(r'(.+)_\d+\.png$', url) + '_1280.png' # FIXME: decide actual quality
hd_url = url # FIXME: decide actual quality # r1(r'(.+)_\d+\.png$', url) + '_1280.png'
else:
continue
filename = parse.unquote(hd_url.split('/')[-1])
title = '.'.join(filename.split('.')[:-1])
tumblr_id = r1(r'^tumblr_(.+)_\d+$', title)
quality = int(r1(r'^tumblr_.+_(\d+)$', title))
tumblr_id = r1(r'^tumblr_(.+)_\d+$', title) or title
try:
quality = int(r1(r'^tumblr_.+_(\d+)$', title))
except:
quality = int(r1(r'/s(\d+)x\d+/', hd_url))
ext = filename.split('.')[-1]
try:
size = int(get_head(hd_url)['Content-Length'])
if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality:
@ -75,16 +81,16 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
except: pass
if tuggles:
size = sum([tuggles[t]['size'] for t in tuggles])
print_info(site_info, page_title, None, size)
#size = sum([tuggles[t]['size'] for t in tuggles])
#print_info(site_info, page_title, None, size)
if not info_only:
for t in tuggles:
title = tuggles[t]['title']
ext = tuggles[t]['ext']
size = tuggles[t]['size']
url = tuggles[t]['url']
print_info(site_info, title, ext, size)
for t in tuggles:
title = '[tumblr] ' + tuggles[t]['title']
ext = tuggles[t]['ext']
size = tuggles[t]['size']
url = tuggles[t]['url']
print_info(site_info, title, ext, size)
if not info_only:
download_urls([url], title, ext, size,
output_dir=output_dir)
return
@ -118,9 +124,6 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
elif re.search(r'dailymotion\.com', iframe_url):
dailymotion_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs)
return
elif re.search(r'vine\.co', iframe_url):
vine_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs)
return
else:
iframe_html = get_content(iframe_url)
real_url = r1(r'<source src="([^"]*)"', iframe_html)
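The quality fallback added above exists because the newer NN.media.tumblr.com/.../sWxH/... image URLs carry the width in the path rather than in a tumblr_<id>_NNNN file name. A small sketch of that logic, with made-up example URLs:
import re

def tumblr_quality(hd_url):
    # classic names: .../tumblr_<id>_1280.jpg  -> width in the file name
    # newer names:   .../s640x960/<hash>.jpg   -> width in the path segment
    title = hd_url.split('/')[-1].rsplit('.', 1)[0]
    m = re.match(r'^tumblr_(.+)_(\d+)$', title)
    if m:
        return int(m.group(2))
    return int(re.search(r'/s(\d+)x\d+/', hd_url).group(1))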

View File

@ -4,7 +4,6 @@ __all__ = ['twitter_download']
from ..common import *
from .universal import *
from .vine import vine_download
def extract_m3u(source):
r1 = get_content(source)
@ -23,7 +22,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs)
if re.match(r'https?://mobile', url): # normalize mobile URL
url = 'https://' + match1(url, r'//mobile\.(.+)')
if re.match(r'https?://twitter\.com/i/moments/', url): # moments
if re.match(r'https?://twitter\.com/i/moments/', url): # FIXME: moments
html = get_html(url, faker=True)
paths = re.findall(r'data-permalink-path="([^"]+)"', html)
for path in paths:
@ -34,71 +33,49 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs)
**kwargs)
return
html = get_html(url, faker=False) # disable faker to prevent 302 infinite redirect
screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \
r1(r'<meta name="twitter:title" content="([^"]*)"', html)
item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', url) or r1(r'data-item-id="([^"]*)"', html) or \
r1(r'<meta name="twitter:site:id" content="([^"]*)"', html)
m = re.match(r'^https?://(mobile\.)?(x|twitter)\.com/([^/]+)/status/(\d+)', url)
assert m
screen_name, item_id = m.group(3), m.group(4)
page_title = "{} [{}]".format(screen_name, item_id)
authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
# FIXME: this API won't work for protected or nsfw contents
api_url = 'https://cdn.syndication.twimg.com/tweet-result?id=%s&token=!' % item_id
content = get_content(api_url)
info = json.loads(content)
ga_url = 'https://api.twitter.com/1.1/guest/activate.json'
ga_content = post_content(ga_url, headers={'authorization': authorization})
guest_token = json.loads(ga_content)['guest_token']
author = info['user']['name']
url = 'https://twitter.com/%s/status/%s' % (info['user']['screen_name'], item_id)
full_text = info['text']
api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id
api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token})
info = json.loads(api_content)
if 'extended_entities' in info['globalObjects']['tweets'][item_id]:
# if the tweet contains media, download them
media = info['globalObjects']['tweets'][item_id]['extended_entities']['media']
elif info['globalObjects']['tweets'][item_id].get('is_quote_status') == True:
# if the tweet does not contain media, but it quotes a tweet
# and the quoted tweet contains media, download them
item_id = info['globalObjects']['tweets'][item_id]['quoted_status_id_str']
api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id
api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token})
info = json.loads(api_content)
if 'extended_entities' in info['globalObjects']['tweets'][item_id]:
media = info['globalObjects']['tweets'][item_id]['extended_entities']['media']
else:
# quoted tweet has no media
return
else:
# no media, no quoted tweet
return
for medium in media:
if 'video_info' in medium:
# FIXME: we're assuming one tweet only contains one video here
variants = medium['video_info']['variants']
variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0))
urls = [ variants[-1]['url'] ]
if 'photos' in info:
for photo in info['photos']:
photo_url = photo['url']
title = item_id + '_' + photo_url.split('.')[-2].split('/')[-1]
urls = [ photo_url + ':orig' ]
size = urls_size(urls)
mime, ext = variants[-1]['content_type'], 'mp4'
print_info(site_info, page_title, mime, size)
if not info_only:
download_urls(urls, page_title, ext, size, output_dir, merge=merge)
else:
title = item_id + '_' + medium['media_url_https'].split('.')[-2].split('/')[-1]
urls = [ medium['media_url_https'] + ':orig' ]
size = urls_size(urls)
ext = medium['media_url_https'].split('.')[-1]
ext = photo_url.split('.')[-1]
print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls, title, ext, size, output_dir, merge=merge)
if 'video' in info:
for mediaDetail in info['mediaDetails']:
if 'video_info' not in mediaDetail: continue
variants = mediaDetail['video_info']['variants']
variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0))
title = item_id + '_' + variants[-1]['url'].split('/')[-1].split('?')[0].split('.')[0]
urls = [ variants[-1]['url'] ]
size = urls_size(urls)
mime, ext = variants[-1]['content_type'], 'mp4'
site_info = "Twitter.com"
print_info(site_info, title, ext, size)
if not info_only:
download_urls(urls, title, ext, size, output_dir, merge=merge)
# TODO: should we deal with quoted tweets?
site_info = "X.com"
download = twitter_download
download_playlist = playlist_not_supported('twitter')
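The rewrite above drops the guest-token timeline API in favour of the public syndication endpoint. A minimal sketch of that flow follows; the tweet id is a placeholder, and as the FIXME notes, protected and NSFW tweets are not served by this endpoint:
import json, urllib.request

def tweet_media_urls(item_id='0000000000000000000'):
    api_url = 'https://cdn.syndication.twimg.com/tweet-result?id=%s&token=!' % item_id
    info = json.loads(urllib.request.urlopen(api_url).read().decode('utf-8'))
    urls = []
    for photo in info.get('photos', []):            # still images, original size
        urls.append(photo['url'] + ':orig')
    if 'video' in info:                             # highest-bitrate variant per video
        for detail in info.get('mediaDetails', []):
            variants = detail.get('video_info', {}).get('variants', [])
            variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0))
            if variants:
                urls.append(variants[-1]['url'])
    return urls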

View File

@ -48,7 +48,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
else:
return
hls_urls = re.findall(r'(https?://[^;"\'\\]+' + '\.m3u8?' +
hls_urls = re.findall(r'(https?://[^;"\'\\]+' + r'\.m3u8?' +
r'[^;"\'\\]*)', page)
if hls_urls:
try:
@ -64,18 +64,19 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
return
# most common media file extensions on the Internet
media_exts = ['\.flv', '\.mp3', '\.mp4', '\.webm',
'[-_]1\d\d\d\.jpe?g', '[-_][6-9]\d\d\.jpe?g', # tumblr
'[-_]1\d\d\dx[6-9]\d\d\.jpe?g',
'[-_][6-9]\d\dx1\d\d\d\.jpe?g',
'[-_][6-9]\d\dx[6-9]\d\d\.jpe?g',
's1600/[\w%]+\.jpe?g', # blogger
'img[6-9]\d\d/[\w%]+\.jpe?g' # oricon?
media_exts = [r'\.flv', r'\.mp3', r'\.mp4', r'\.webm',
r'[-_]1\d\d\d\.jpe?g', r'[-_][6-9]\d\d\.jpe?g', # tumblr
r'[-_]1\d\d\dx[6-9]\d\d\.jpe?g',
r'[-_][6-9]\d\dx1\d\d\d\.jpe?g',
r'[-_][6-9]\d\dx[6-9]\d\d\.jpe?g',
r's1600/[\w%]+\.jpe?g', # blogger
r'blogger\.googleusercontent\.com/img/a/\w*', # blogger
r'img[6-9]\d\d/[\w%]+\.jpe?g' # oricon?
]
urls = []
for i in media_exts:
urls += re.findall(r'(https?://[^ ;&"\'\\<>]+' + i + r'[^ ;&"\'\\<>]*)', page)
urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ =?;&"\'\\<>]*)', page)
p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page)
urls += [parse.unquote(url) for url in p_urls]

View File

@ -102,7 +102,7 @@ class VimeoExtractor(VideoExtractor):
pos = 0
while pos < len(lines):
if lines[pos].startswith('#EXT-X-STREAM-INF'):
patt = 'RESOLUTION=(\d+)x(\d+)'
patt = r'RESOLUTION=(\d+)x(\d+)'
hit = re.search(patt, lines[pos])
if hit is None:
continue
@ -132,34 +132,6 @@ class VimeoExtractor(VideoExtractor):
def vimeo_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False, **kwargs):
'''
try:
# normal Vimeo video
html = get_content('https://vimeo.com/' + id)
cfg_patt = r'clip_page_config\s*=\s*(\{.+?\});'
cfg = json.loads(match1(html, cfg_patt))
video_page = get_content(cfg['player']['config_url'], headers=fake_headers)
title = cfg['clip']['title']
info = loads(video_page)
except:
# embedded player - referer may be required
if 'referer' in kwargs:
fake_headers['Referer'] = kwargs['referer']
video_page = get_content('http://player.vimeo.com/video/%s' % id, headers=fake_headers)
title = r1(r'<title>([^<]+)</title>', video_page)
info = loads(match1(video_page, r'var t=(\{.+?\});'))
streams = info['request']['files']['progressive']
streams = sorted(streams, key=lambda i: i['height'])
url = streams[-1]['url']
type, ext, size = url_info(url, faker=True)
print_info(site_info, title, type, size)
if not info_only:
download_urls([url], title, ext, size, output_dir, merge=merge, faker=True)
'''
site = VimeoExtractor()
site.download_by_vid(id, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)

View File

@ -1,36 +0,0 @@
#!/usr/bin/env python
__all__ = ['vine_download']
from ..common import *
import json
def vine_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
html = get_content(url)
video_id = r1(r'vine.co/v/([^/]+)', url)
title = r1(r'<title>([^<]*)</title>', html)
stream = r1(r'<meta property="twitter:player:stream" content="([^"]*)">', html)
if not stream: # https://vine.co/v/.../card
stream = r1(r'"videoUrl":"([^"]+)"', html)
if stream:
stream = stream.replace('\\/', '/')
else:
posts_url = 'https://archive.vine.co/posts/' + video_id + '.json'
json_data = json.loads(get_content(posts_url))
stream = json_data['videoDashUrl']
title = json_data['description']
if title == "":
title = json_data['username'].replace(" ", "_") + "_" + video_id
mime, ext, size = url_info(stream)
print_info(site_info, title, mime, size)
if not info_only:
download_urls([stream], title, ext, size, output_dir, merge=merge)
site_info = "Vine.co"
download = vine_download
download_playlist = playlist_not_supported('vine')

View File

@ -1,215 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__all__ = ['xiami_download']
from ..common import *
from xml.dom.minidom import parseString
from urllib import parse
def location_dec(str):
head = int(str[0])
str = str[1:]
rows = head
cols = int(len(str)/rows) + 1
out = ""
full_row = len(str) % head
for c in range(cols):
for r in range(rows):
if c == (cols - 1) and r >= full_row:
continue
if r < full_row:
char = str[r*cols+c]
else:
char = str[cols*full_row+(r-full_row)*(cols-1)+c]
out += char
return parse.unquote(out).replace("^", "0")
def xiami_download_lyric(lrc_url, file_name, output_dir):
lrc = get_content(lrc_url, headers=fake_headers)
filename = get_filename(file_name)
if len(lrc) > 0:
with open(output_dir + "/" + filename + '.lrc', 'w', encoding='utf-8') as x:
x.write(lrc)
def xiami_download_pic(pic_url, file_name, output_dir):
from ..util.strings import get_filename
pic_url = pic_url.replace('_1', '')
pos = pic_url.rfind('.')
ext = pic_url[pos:]
pic = get_content(pic_url, headers=fake_headers, decoded=False)
if len(pic) > 0:
with open(output_dir + "/" + file_name.replace('/', '-') + ext, 'wb') as x:
x.write(pic)
def xiami_download_song(sid, output_dir = '.', info_only = False):
xml = get_content('http://www.xiami.com/song/playlist/id/%s/object_name/default/object_id/0' % sid, headers=fake_headers)
doc = parseString(xml)
i = doc.getElementsByTagName("track")[0]
artist = i.getElementsByTagName("artist")[0].firstChild.nodeValue
album_name = i.getElementsByTagName("album_name")[0].firstChild.nodeValue
song_title = i.getElementsByTagName("name")[0].firstChild.nodeValue
url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue)
try:
lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue
except:
pass
type_, ext, size = url_info(url, headers=fake_headers)
if not ext:
ext = 'mp3'
print_info(site_info, song_title, ext, size)
if not info_only:
file_name = "%s - %s - %s" % (song_title, artist, album_name)
download_urls([url], file_name, ext, size, output_dir, headers=fake_headers)
try:
xiami_download_lyric(lrc_url, file_name, output_dir)
except:
pass
def xiami_download_showcollect(cid, output_dir = '.', info_only = False):
html = get_content('http://www.xiami.com/song/showcollect/id/' + cid, headers=fake_headers)
collect_name = r1(r'<title>(.*)</title>', html)
xml = get_content('http://www.xiami.com/song/playlist/id/%s/type/3' % cid, headers=fake_headers)
doc = parseString(xml)
output_dir = output_dir + "/" + "[" + collect_name + "]"
tracks = doc.getElementsByTagName("track")
track_nr = 1
for i in tracks:
artist=album_name=song_title=url=""
try:
song_id = i.getElementsByTagName("song_id")[0].firstChild.nodeValue
artist = i.getElementsByTagName("artist")[0].firstChild.nodeValue
album_name = i.getElementsByTagName("album_name")[0].firstChild.nodeValue
song_title = i.getElementsByTagName("title")[0].firstChild.nodeValue
url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue)
except:
log.e("Song %s failed. [Info Missing] artist:%s, album:%s, title:%s, url:%s" % (song_id, artist, album_name, song_title, url))
continue
try:
lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue
except:
pass
type_, ext, size = url_info(url, headers=fake_headers)
if not ext:
ext = 'mp3'
print_info(site_info, song_title, ext, size)
if not info_only:
file_name = "%02d.%s - %s - %s" % (track_nr, song_title, artist, album_name)
download_urls([url], file_name, ext, size, output_dir, headers=fake_headers)
try:
xiami_download_lyric(lrc_url, file_name, output_dir)
except:
pass
track_nr += 1
def xiami_download_album(aid, output_dir='.', info_only=False):
xml = get_content('http://www.xiami.com/song/playlist/id/%s/type/1' % aid, headers=fake_headers)
album_name = r1(r'<album_name><!\[CDATA\[(.*)\]\]>', xml)
artist = r1(r'<artist><!\[CDATA\[(.*)\]\]>', xml)
doc = parseString(xml)
output_dir = output_dir + "/%s - %s" % (artist, album_name)
track_list = doc.getElementsByTagName('trackList')[0]
tracks = track_list.getElementsByTagName("track")
track_nr = 1
pic_exist = False
for i in tracks:
#in this xml track tag is used for both "track in a trackList" and track no
#dirty here
if i.firstChild.nodeValue is not None:
continue
song_title = i.getElementsByTagName("songName")[0].firstChild.nodeValue
url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue)
try:
lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue
except:
pass
if not pic_exist:
pic_url = i.getElementsByTagName("pic")[0].firstChild.nodeValue
type_, ext, size = url_info(url, headers=fake_headers)
if not ext:
ext = 'mp3'
print_info(site_info, song_title, ext, size)
if not info_only:
file_name = "%02d.%s" % (track_nr, song_title)
download_urls([url], file_name, ext, size, output_dir, headers=fake_headers)
try:
xiami_download_lyric(lrc_url, file_name, output_dir)
except:
pass
if not pic_exist:
xiami_download_pic(pic_url, 'cover', output_dir)
pic_exist = True
track_nr += 1
def xiami_download_mv(url, output_dir='.', merge=True, info_only=False):
# FIXME: broken merge
page = get_content(url, headers=fake_headers)
title = re.findall('<title>([^<]+)', page)[0]
vid, uid = re.findall(r'vid:"(\d+)",uid:"(\d+)"', page)[0]
api_url = 'http://cloud.video.taobao.com/videoapi/info.php?vid=%s&uid=%s' % (vid, uid)
result = get_content(api_url, headers=fake_headers)
doc = parseString(result)
video_url = doc.getElementsByTagName("video_url")[-1].firstChild.nodeValue
length = int(doc.getElementsByTagName("length")[-1].firstChild.nodeValue)
v_urls = []
k_start = 0
total_size = 0
while True:
k_end = k_start + 20000000
if k_end >= length: k_end = length - 1
v_url = video_url + '/start_%s/end_%s/1.flv' % (k_start, k_end)
try:
_, ext, size = url_info(v_url)
except:
break
v_urls.append(v_url)
total_size += size
k_start = k_end + 1
print_info(site_info, title, ext, total_size)
if not info_only:
download_urls(v_urls, title, ext, total_size, output_dir, merge=merge, headers=fake_headers)
def xiami_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
#albums
if re.match(r'http://www.xiami.com/album/\d+', url):
id = r1(r'http://www.xiami.com/album/(\d+)', url)
xiami_download_album(id, output_dir, info_only)
elif re.match(r'http://www.xiami.com/album/\w+', url):
page = get_content(url, headers=fake_headers)
album_id = re.search(r'rel="canonical"\s+href="http://www.xiami.com/album/([^"]+)"', page).group(1)
xiami_download_album(album_id, output_dir, info_only)
#collections
if re.match(r'http://www.xiami.com/collect/\d+', url):
id = r1(r'http://www.xiami.com/collect/(\d+)', url)
xiami_download_showcollect(id, output_dir, info_only)
#single track
if re.match(r'http://www.xiami.com/song/\d+\b', url):
id = r1(r'http://www.xiami.com/song/(\d+)', url)
xiami_download_song(id, output_dir, info_only)
elif re.match(r'http://www.xiami.com/song/\w+', url):
html = get_content(url, headers=fake_headers)
id = r1(r'rel="canonical" href="http://www.xiami.com/song/([^"]+)"', html)
xiami_download_song(id, output_dir, info_only)
if re.match('http://www.xiami.com/song/detail/id/\d+', url):
id = r1(r'http://www.xiami.com/song/detail/id/(\d+)', url)
xiami_download_song(id, output_dir, info_only)
if re.match('http://www.xiami.com/mv', url):
xiami_download_mv(url, output_dir, merge=merge, info_only=info_only)
site_info = "Xiami.com"
download = xiami_download
download_playlist = playlist_not_supported("xiami")

View File

@ -20,7 +20,7 @@ class Xinpianchang(VideoExtractor):
def prepare(self, **kwargs):
# find key
page_content = get_content(self.url)
match_rule = r"vid: \"(.+?)\","
match_rule = r"vid = \"(.+?)\";"
key = re.findall(match_rule, page_content)[0]
# get videos info

View File

@ -1,43 +0,0 @@
#!/usr/bin/env python
__all__ = ['yinyuetai_download', 'yinyuetai_download_by_id']
from ..common import *
def yinyuetai_download_by_id(vid, title=None, output_dir='.', merge=True, info_only=False):
video_info = json.loads(get_html('http://www.yinyuetai.com/insite/get-video-info?json=true&videoId=%s' % vid))
url_models = video_info['videoInfo']['coreVideoInfo']['videoUrlModels']
url_models = sorted(url_models, key=lambda i: i['qualityLevel'])
url = url_models[-1]['videoUrl']
type = ext = r1(r'\.(flv|mp4)', url)
_, _, size = url_info(url)
print_info(site_info, title, type, size)
if not info_only:
download_urls([url], title, ext, size, output_dir, merge = merge)
def yinyuetai_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
id = r1(r'http://\w+.yinyuetai.com/video/(\d+)', url) or \
r1(r'http://\w+.yinyuetai.com/video/h5/(\d+)', url)
if not id:
yinyuetai_download_playlist(url, output_dir=output_dir, merge=merge, info_only=info_only)
return
html = get_html(url, 'utf-8')
title = r1(r'<meta property="og:title"\s+content="([^"]+)"/>', html) or r1(r'<title>(.*)', html)
assert title
title = parse.unquote(title)
title = escape_file_path(title)
yinyuetai_download_by_id(id, title, output_dir, merge=merge, info_only=info_only)
def yinyuetai_download_playlist(url, output_dir='.', merge=True, info_only=False, **kwargs):
playlist = r1(r'http://\w+.yinyuetai.com/playlist/(\d+)', url)
html = get_html(url)
data_ids = re.findall(r'data-index="\d+"\s*data-id=(\d+)', html)
for data_id in data_ids:
yinyuetai_download('http://v.yinyuetai.com/video/' + data_id,
output_dir=output_dir, merge=merge, info_only=info_only)
site_info = "YinYueTai.com"
download = yinyuetai_download
download_playlist = yinyuetai_download_playlist

View File

@ -41,7 +41,6 @@ class Youku(VideoExtractor):
mobile_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
dispatcher_url = 'vali.cp31.ott.cibntv.net'
# Last updated: 2017-10-13
stream_types = [
{'id': 'hd3', 'container': 'flv', 'video_profile': '1080P'},
{'id': 'hd3v2', 'container': 'flv', 'video_profile': '1080P'},
@ -78,7 +77,7 @@ class Youku(VideoExtractor):
self.api_error_code = None
self.api_error_msg = None
self.ccode = '0519'
self.ccode = '0564'
# Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js
# grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js
self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND'
@ -243,7 +242,7 @@ class Youku(VideoExtractor):
def youku_download_playlist_by_url(url, **kwargs):
video_page_pt = 'https?://v.youku.com/v_show/id_([A-Za-z0-9=]+)'
js_cb_pt = '\(({.+})\)'
js_cb_pt = r'\(({.+})\)'
if re.match(video_page_pt, url):
youku_obj = Youku()
youku_obj.url = url
@ -273,14 +272,14 @@ def youku_download_playlist_by_url(url, **kwargs):
page = get_content(url)
show_id = re.search(r'showid:"(\d+)"', page).group(1)
ep = 'http://list.youku.com/show/module?id={}&tab=showInfo&callback=jQuery'.format(show_id)
xhr_page = get_content(ep).replace('\/', '/').replace('\"', '"')
xhr_page = get_content(ep).replace(r'\/', '/').replace(r'\"', '"')
video_url = re.search(r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)', xhr_page).group(1)
youku_download_playlist_by_url('http://'+video_url, **kwargs)
return
elif re.match('https?://list.youku.com/albumlist/show/id_(\d+)\.html', url):
elif re.match(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html', url):
# http://list.youku.com/albumlist/show/id_2336634.html
# UGC playlist
list_id = re.search('https?://list.youku.com/albumlist/show/id_(\d+)\.html', url).group(1)
list_id = re.search(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html', url).group(1)
ep = 'http://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=tuijsonp6'
first_u = ep.format(list_id, 1)
@ -295,7 +294,7 @@ def youku_download_playlist_by_url(url, **kwargs):
for i in range(2, req_cnt+2):
req_u = ep.format(list_id, i)
xhr_page = get_content(req_u)
json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1).replace('\/', '/'))
json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1).replace(r'\/', '/'))
xhr_html = json_data['html']
page_videos = re.findall(r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)', xhr_html)
v_urls.extend(page_videos)

View File

@ -3,6 +3,13 @@
from ..common import *
from ..extractor import VideoExtractor
try:
import dukpy
except ImportError:
log.e('Please install dukpy in order to extract videos from YouTube:')
log.e('$ pip install dukpy')
exit(0)
from urllib.parse import urlparse, parse_qs, urlencode
from xml.dom.minidom import parseString
class YouTube(VideoExtractor):
@ -68,40 +75,33 @@ class YouTube(VideoExtractor):
'audio_encoding': 'AAC', 'audio_bitrate': '24'},
]
def decipher(js, s):
# Examples:
# - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js
# - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js
# - https://www.youtube.com/yts/jsbin/player-vfls4aurX/da_DK/base.js
# - https://www.youtube.com/yts/jsbin/player_ias-vfl_RGK2l/en_US/base.js
# - https://www.youtube.com/yts/jsbin/player-vflRjqq_w/da_DK/base.js
# - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js
def tr_js(code):
code = re.sub(r'function', r'def', code)
code = re.sub(r'(\W)(as|if|in|is|or)\(', r'\1_\2(', code)
code = re.sub(r'\$', '_dollar', code)
code = re.sub(r'\{', r':\n\t', code)
code = re.sub(r'\}', r'\n', code)
code = re.sub(r'var\s+', r'', code)
code = re.sub(r'(\w+).join\(""\)', r'"".join(\1)', code)
code = re.sub(r'(\w+).length', r'len(\1)', code)
code = re.sub(r'(\w+).slice\((\w+)\)', r'\1[\2:]', code)
code = re.sub(r'(\w+).splice\((\w+),(\w+)\)', r'del \1[\2:\2+\3]', code)
code = re.sub(r'(\w+).split\(""\)', r'list(\1)', code)
return code
def dethrottle(js, url):
def n_to_n(js, n):
# Examples:
# yma - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js
# Xka - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js
# jma - https://www.youtube.com/s/player/8d9f6215/player_ias.vflset/sv_SE/base.js
f1 = match1(js, r',[$\w]+\.length\|\|([$\w]+)\(""\)\)}};')
f1def = match1(js, r'\W%s=(function\(\w+\).+?\)});' % re.escape(f1))
n = dukpy.evaljs('(%s)("%s")' % (f1def, n))
return n
js = js.replace('\n', ' ')
f1 = match1(js, r'\.set\(\w+\.sp,encodeURIComponent\(([$\w]+)') or \
match1(js, r'\.set\(\w+\.sp,\(0,window\.encodeURIComponent\)\(([$\w]+)') or \
match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \
match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)') or \
match1(js, r'=([$\w]+)\(decodeURIComponent\(')
f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \
match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def)
f1def = 'function main_%s%s' % (f1, f1def) # prefix to avoid potential namespace conflict
code = tr_js(f1def)
f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def))
u = urlparse(url)
qs = parse_qs(u.query)
n = n_to_n(js, qs['n'][0])
qs['n'] = [n]
return u._replace(query=urlencode(qs, doseq=True)).geturl()
def s_to_sig(js, s):
# Examples:
# BPa - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js
# Xva - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js
js_code = ''
f1 = match1(js, r'=([$\w]+)\(decodeURIComponent\(')
f1def = match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) # remove . prefix
f1def = 'function %s%s' % (f1, f1def)
f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def)) # find all invoked function names
for f2 in f2s:
f2e = re.escape(f2)
f2def = re.search(r'[^$\w]%s:function\((\w+,\w+)\)(\{[^\{\}]+\})' % f2e, js)
@ -110,15 +110,10 @@ class YouTube(VideoExtractor):
else:
f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js)
f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2))
f2 = re.sub(r'(as|if|in|is|or)', r'_\1', f2)
f2 = re.sub(r'\$', '_dollar', f2)
code = code + 'global %s\n' % f2 + tr_js(f2def)
f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1)
f1 = re.sub(r'\$', '_dollar', f1)
code = code + 'sig=main_%s(s)' % f1 # prefix to avoid potential namespace conflict
exec(code, globals(), locals())
return locals()['sig']
js_code += f2def + ';'
js_code += f1def + ';%s("%s")' % (f1, s)
sig = dukpy.evaljs(js_code)
return sig
def chunk_by_range(url, size):
urls = []
@ -138,6 +133,7 @@ class YouTube(VideoExtractor):
"""
return match1(url, r'youtu\.be/([^?/]+)') or \
match1(url, r'youtube\.com/embed/([^/?]+)') or \
match1(url, r'youtube\.com/shorts/([^/?]+)') or \
match1(url, r'youtube\.com/v/([^/?]+)') or \
match1(url, r'youtube\.com/watch/([^/?]+)') or \
parse_query_param(url, 'v') or \
@ -157,36 +153,41 @@ class YouTube(VideoExtractor):
log.wtf('[Failed] Unsupported URL pattern.')
video_page = get_content('https://www.youtube.com/playlist?list=%s' % playlist_id)
from html.parser import HTMLParser
videos = sorted([HTMLParser().unescape(video)
for video in re.findall(r'<a href="(/watch\?[^"]+)"', video_page)
if parse_query_param(video, 'index')],
key=lambda video: parse_query_param(video, 'index'))
playlist_json_serialized = match1(video_page, r'window\["ytInitialData"\]\s*=\s*(.+);', r'var\s+ytInitialData\s*=\s*([^;]+);')
# Parse browse_ajax page for more videos to load
load_more_href = match1(video_page, r'data-uix-load-more-href="([^"]+)"')
while load_more_href:
browse_ajax = get_content('https://www.youtube.com/%s' % load_more_href)
browse_data = json.loads(browse_ajax)
load_more_widget_html = browse_data['load_more_widget_html']
content_html = browse_data['content_html']
vs = set(re.findall(r'href="(/watch\?[^"]+)"', content_html))
videos += sorted([HTMLParser().unescape(video)
for video in list(vs)
if parse_query_param(video, 'index')])
load_more_href = match1(load_more_widget_html, r'data-uix-load-more-href="([^"]+)"')
if len(playlist_json_serialized) == 0:
log.wtf('[Failed] Unable to extract playlist data')
ytInitialData = json.loads(playlist_json_serialized[0])
tab0 = ytInitialData['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]
itemSection0 = tab0['tabRenderer']['content']['sectionListRenderer']['contents'][0]
playlistVideoList0 = itemSection0['itemSectionRenderer']['contents'][0]
videos = playlistVideoList0['playlistVideoListRenderer']['contents']
self.title = re.search(r'<meta name="title" content="([^"]+)"', video_page).group(1)
self.p_playlist()
for video in videos:
vid = parse_query_param(video, 'v')
index = parse_query_param(video, 'index')
for index, video in enumerate(videos, 1):
vid = video['playlistVideoRenderer']['videoId']
try:
self.__class__().download_by_url(self.__class__.get_url_from_vid(vid), index=index, **kwargs)
except:
pass
# FIXME: show DASH stream sizes (by default) for playlist videos
def check_playability_response(self, ytInitialPlayerResponse):
STATUS_OK = "OK"
playerResponseStatus = ytInitialPlayerResponse["playabilityStatus"]["status"]
if playerResponseStatus != STATUS_OK:
reason = ytInitialPlayerResponse["playabilityStatus"].get("reason", "")
raise AssertionError(
f"Server refused to provide video details. Returned status: {playerResponseStatus}, reason: {reason}."
)
def prepare(self, **kwargs):
self.ua = 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36'
assert self.url or self.vid
if not self.vid and self.url:
@ -196,152 +197,72 @@ class YouTube(VideoExtractor):
self.download_playlist_by_url(self.url, **kwargs)
exit(0)
if re.search('\Wlist=', self.url) and not kwargs.get('playlist'):
if re.search(r'\Wlist=', self.url) and not kwargs.get('playlist'):
log.w('This video is from a playlist. (use --playlist to download all videos in the playlist.)')
# Get video info
# 'eurl' is a magic parameter that can bypass age restriction
# full form: 'eurl=https%3A%2F%2Fyoutube.googleapis.com%2Fv%2F{VIDEO_ID}'
video_info = parse.parse_qs(get_content('https://www.youtube.com/get_video_info?video_id={}&eurl=https%3A%2F%2Fy'.format(self.vid)))
logging.debug('STATUS: %s' % video_info['status'][0])
# Extract from video page
logging.debug('Extracting from the video page...')
video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid, headers={'User-Agent': self.ua})
ytplayer_config = None
if 'status' not in video_info:
log.wtf('[Failed] Unknown status.', exit_code=None)
raise
elif video_info['status'] == ['ok']:
if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']:
self.title = parse.unquote_plus(json.loads(video_info["player_response"][0])["videoDetails"]["title"])
# Parse video page (for DASH)
video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid)
try:
ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1))
self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js']
# Workaround: get_video_info returns bad s. Why?
if 'url_encoded_fmt_stream_map' not in ytplayer_config['args']:
stream_list = json.loads(ytplayer_config['args']['player_response'])['streamingData']['formats']
else:
stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
#stream_list = ytplayer_config['args']['adaptive_fmts'].split(',')
except:
if 'url_encoded_fmt_stream_map' not in video_info:
stream_list = json.loads(video_info['player_response'][0])['streamingData']['formats']
else:
stream_list = video_info['url_encoded_fmt_stream_map'][0].split(',')
if re.search('([^"]*/base\.js)"', video_page):
self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1)
else:
self.html5player = None
try:
jsUrl = re.search(r'([^"]*/base\.js)"', video_page).group(1)
except:
log.wtf('[Failed] Unable to find base.js on the video page')
self.html5player = 'https://www.youtube.com' + jsUrl
logging.debug('Retrieving the player code...')
self.js = get_content(self.html5player).replace('\n', ' ')
else:
# Parse video page instead
video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid)
ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1))
logging.debug('Loading ytInitialPlayerResponse...')
ytInitialPlayerResponse = json.loads(re.search(r'ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n|</script>|var )', video_page).group(1))
self.check_playability_response(ytInitialPlayerResponse)
self.title = json.loads(ytplayer_config["args"]["player_response"])["videoDetails"]["title"]
self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js']
stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
# Get the video title
self.title = ytInitialPlayerResponse["videoDetails"]["title"]
elif video_info['status'] == ['fail']:
logging.debug('ERRORCODE: %s' % video_info['errorcode'][0])
if video_info['errorcode'] == ['150']:
# FIXME: still relevant?
if cookies:
# Load necessary cookies into headers (for age-restricted videos)
consent, ssid, hsid, sid = 'YES', '', '', ''
for cookie in cookies:
if cookie.domain.endswith('.youtube.com'):
if cookie.name == 'SSID':
ssid = cookie.value
elif cookie.name == 'HSID':
hsid = cookie.value
elif cookie.name == 'SID':
sid = cookie.value
cookie_str = 'CONSENT=%s; SSID=%s; HSID=%s; SID=%s' % (consent, ssid, hsid, sid)
# Check the status
playabilityStatus = ytInitialPlayerResponse['playabilityStatus']
status = playabilityStatus['status']
logging.debug('status: %s' % status)
if status != 'OK':
# If cookies are loaded, status should be OK
try:
subreason = playabilityStatus['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'][0]['text']
log.e('[Error] %s (%s)' % (playabilityStatus['reason'], subreason))
except:
log.e('[Error] %s' % playabilityStatus['reason'])
if status == 'LOGIN_REQUIRED':
log.e('View the video from a browser and export the cookies, then use --cookies to load cookies.')
exit(1)
video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid,
headers={'Cookie': cookie_str})
else:
video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid)
try:
ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+});ytplayer', video_page).group(1))
except:
msg = re.search('class="message">([^<]+)<', video_page).group(1)
log.wtf('[Failed] Got message "%s". Try to login with --cookies.' % msg.strip())
if 'title' in ytplayer_config['args']:
# 150 Restricted from playback on certain sites
# Parse video page instead
self.title = ytplayer_config['args']['title']
self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js']
stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
else:
log.wtf('[Error] The uploader has not made this video available in your country.', exit_code=None)
raise
#self.title = re.search('<meta name="title" content="([^"]+)"', video_page).group(1)
#stream_list = []
elif video_info['errorcode'] == ['100']:
log.wtf('[Failed] This video does not exist.', exit_code=None) #int(video_info['errorcode'][0])
raise
else:
log.wtf('[Failed] %s' % video_info['reason'][0], exit_code=None) #int(video_info['errorcode'][0])
raise
else:
log.wtf('[Failed] Invalid status.', exit_code=None)
raise
# YouTube Live
if ytplayer_config and (ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1'):
if 'hlsvp' in ytplayer_config['args']:
hlsvp = ytplayer_config['args']['hlsvp']
else:
player_response= json.loads(ytplayer_config['args']['player_response'])
log.e('[Failed] %s' % player_response['playabilityStatus']['reason'], exit_code=1)
if 'info_only' in kwargs and kwargs['info_only']:
return
else:
download_url_ffmpeg(hlsvp, self.title, 'mp4')
exit(0)
stream_list = ytInitialPlayerResponse['streamingData']['formats']
for stream in stream_list:
if isinstance(stream, str):
metadata = parse.parse_qs(stream)
stream_itag = metadata['itag'][0]
self.streams[stream_itag] = {
'itag': metadata['itag'][0],
'url': metadata['url'][0],
'sig': metadata['sig'][0] if 'sig' in metadata else None,
's': metadata['s'][0] if 's' in metadata else None,
'quality': metadata['quality'][0] if 'quality' in metadata else None,
#'quality': metadata['quality_label'][0] if 'quality_label' in metadata else None,
'type': metadata['type'][0],
'mime': metadata['type'][0].split(';')[0],
'container': mime_to_container(metadata['type'][0].split(';')[0]),
}
logging.debug('Found format: itag=%s' % stream['itag'])
if 'signatureCipher' in stream:
logging.debug(' Parsing signatureCipher for itag=%s...' % stream['itag'])
qs = parse_qs(stream['signatureCipher'])
#logging.debug(qs)
sp = qs['sp'][0]
sig = self.__class__.s_to_sig(self.js, qs['s'][0])
url = qs['url'][0] + '&{}={}'.format(sp, sig)
elif 'url' in stream:
url = stream['url']
else:
stream_itag = str(stream['itag'])
self.streams[stream_itag] = {
'itag': str(stream['itag']),
'url': stream['url'] if 'url' in stream else None,
'sig': None,
's': None,
'quality': stream['quality'],
'type': stream['mimeType'],
'mime': stream['mimeType'].split(';')[0],
'container': mime_to_container(stream['mimeType'].split(';')[0]),
}
if 'signatureCipher' in stream:
self.streams[stream_itag].update(dict([(_.split('=')[0], parse.unquote(_.split('=')[1]))
for _ in stream['signatureCipher'].split('&')]))
log.wtf(' No signatureCipher or url for itag=%s' % stream['itag'])
url = self.__class__.dethrottle(self.js, url)
# Prepare caption tracks
self.streams[str(stream['itag'])] = {
'itag': str(stream['itag']),
'url': url,
'quality': stream['quality'],
'type': stream['mimeType'],
'mime': stream['mimeType'].split(';')[0],
'container': mime_to_container(stream['mimeType'].split(';')[0]),
}
# FIXME: Prepare caption tracks
try:
caption_tracks = json.loads(ytplayer_config['args']['player_response'])['captions']['playerCaptionsTracklistRenderer']['captionTracks']
caption_tracks = ytInitialPlayerResponse['captions']['playerCaptionsTracklistRenderer']['captionTracks']
for ct in caption_tracks:
ttsurl, lang = ct['baseUrl'], ct['languageCode']
@ -367,149 +288,72 @@ class YouTube(VideoExtractor):
srt += '%s --> %s\n' % (start, finish)
srt += '%s\n\n' % content
self.caption_tracks[lang] = srt
if 'kind' in ct:
self.caption_tracks[ct['vssId']] = srt # autogenerated
else:
self.caption_tracks[lang] = srt
except: pass
# Prepare DASH streams (NOTE: not every video has DASH streams!)
try:
dashmpd = ytplayer_config['args']['dashmpd']
dash_xml = parseString(get_content(dashmpd))
for aset in dash_xml.getElementsByTagName('AdaptationSet'):
mimeType = aset.getAttribute('mimeType')
if mimeType == 'audio/mp4':
rep = aset.getElementsByTagName('Representation')[-1]
burls = rep.getElementsByTagName('BaseURL')
dash_mp4_a_url = burls[0].firstChild.nodeValue
dash_mp4_a_size = burls[0].getAttribute('yt:contentLength')
if not dash_mp4_a_size:
try: dash_mp4_a_size = url_size(dash_mp4_a_url)
except: continue
elif mimeType == 'audio/webm':
rep = aset.getElementsByTagName('Representation')[-1]
burls = rep.getElementsByTagName('BaseURL')
dash_webm_a_url = burls[0].firstChild.nodeValue
dash_webm_a_size = burls[0].getAttribute('yt:contentLength')
if not dash_webm_a_size:
try: dash_webm_a_size = url_size(dash_webm_a_url)
except: continue
elif mimeType == 'video/mp4':
for rep in aset.getElementsByTagName('Representation'):
w = int(rep.getAttribute('width'))
h = int(rep.getAttribute('height'))
itag = rep.getAttribute('id')
burls = rep.getElementsByTagName('BaseURL')
dash_url = burls[0].firstChild.nodeValue
dash_size = burls[0].getAttribute('yt:contentLength')
if not dash_size:
try: dash_size = url_size(dash_url)
except: continue
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size))
self.dash_streams[itag] = {
'quality': '%sx%s' % (w, h),
'itag': itag,
'type': mimeType,
'mime': mimeType,
'container': 'mp4',
'src': [dash_urls, dash_mp4_a_urls],
'size': int(dash_size) + int(dash_mp4_a_size)
}
elif mimeType == 'video/webm':
for rep in aset.getElementsByTagName('Representation'):
w = int(rep.getAttribute('width'))
h = int(rep.getAttribute('height'))
itag = rep.getAttribute('id')
burls = rep.getElementsByTagName('BaseURL')
dash_url = burls[0].firstChild.nodeValue
dash_size = burls[0].getAttribute('yt:contentLength')
if not dash_size:
try: dash_size = url_size(dash_url)
except: continue
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
dash_webm_a_urls = self.__class__.chunk_by_range(dash_webm_a_url, int(dash_webm_a_size))
self.dash_streams[itag] = {
'quality': '%sx%s' % (w, h),
'itag': itag,
'type': mimeType,
'mime': mimeType,
'container': 'webm',
'src': [dash_urls, dash_webm_a_urls],
'size': int(dash_size) + int(dash_webm_a_size)
}
except:
# VEVO
if not self.html5player: return
self.html5player = self.html5player.replace('\/', '/') # unescape URL (for age-restricted videos)
self.js = get_content(self.html5player)
# Prepare DASH streams
if 'adaptiveFormats' in ytInitialPlayerResponse['streamingData']:
streams = ytInitialPlayerResponse['streamingData']['adaptiveFormats']
try:
# Video info from video page (not always available)
streams = [dict([(i.split('=')[0],
parse.unquote(i.split('=')[1]))
for i in afmt.split('&')])
for afmt in ytplayer_config['args']['adaptive_fmts'].split(',')]
except:
if 'adaptive_fmts' in video_info:
streams = [dict([(i.split('=')[0],
parse.unquote(i.split('=')[1]))
for i in afmt.split('&')])
for afmt in video_info['adaptive_fmts'][0].split(',')]
# FIXME: dead code?
# streams without contentLength got broken urls, just remove them (#2767)
streams = [stream for stream in streams if 'contentLength' in stream]
for stream in streams:
logging.debug('Found adaptiveFormat: itag=%s' % stream['itag'])
stream['itag'] = str(stream['itag'])
if 'qualityLabel' in stream:
stream['quality_label'] = stream['qualityLabel']
del stream['qualityLabel']
logging.debug(' quality_label: \t%s' % stream['quality_label'])
if 'width' in stream:
stream['size'] = '{}x{}'.format(stream['width'], stream['height'])
del stream['width']
del stream['height']
logging.debug(' size: \t%s' % stream['size'])
stream['type'] = stream['mimeType']
logging.debug(' type: \t%s' % stream['type'])
stream['clen'] = stream['contentLength']
stream['init'] = '{}-{}'.format(
stream['initRange']['start'],
stream['initRange']['end'])
stream['index'] = '{}-{}'.format(
stream['indexRange']['start'],
stream['indexRange']['end'])
del stream['mimeType']
del stream['contentLength']
del stream['initRange']
del stream['indexRange']
if 'signatureCipher' in stream:
logging.debug(' Parsing signatureCipher for itag=%s...' % stream['itag'])
qs = parse_qs(stream['signatureCipher'])
#logging.debug(qs)
sp = qs['sp'][0]
sig = self.__class__.s_to_sig(self.js, qs['s'][0])
url = qs['url'][0] + '&ratebypass=yes&{}={}'.format(sp, sig)
elif 'url' in stream:
url = stream['url']
else:
try:
streams = json.loads(video_info['player_response'][0])['streamingData']['adaptiveFormats']
except: # no DASH stream at all
return
# streams without contentLength got broken urls, just remove them (#2767)
streams = [stream for stream in streams if 'contentLength' in stream]
for stream in streams:
stream['itag'] = str(stream['itag'])
if 'qualityLabel' in stream:
stream['quality_label'] = stream['qualityLabel']
del stream['qualityLabel']
if 'width' in stream:
stream['size'] = '{}x{}'.format(stream['width'], stream['height'])
del stream['width']
del stream['height']
stream['type'] = stream['mimeType']
stream['clen'] = stream['contentLength']
stream['init'] = '{}-{}'.format(
stream['initRange']['start'],
stream['initRange']['end'])
stream['index'] = '{}-{}'.format(
stream['indexRange']['start'],
stream['indexRange']['end'])
del stream['mimeType']
del stream['contentLength']
del stream['initRange']
del stream['indexRange']
if 'signatureCipher' in stream:
stream.update(dict([(_.split('=')[0], parse.unquote(_.split('=')[1]))
for _ in stream['signatureCipher'].split('&')]))
del stream['signatureCipher']
log.wtf('No signatureCipher or url for itag=%s' % stream['itag'])
url = self.__class__.dethrottle(self.js, url)
stream['url'] = url
for stream in streams: # get over speed limiting
stream['url'] += '&ratebypass=yes'
for stream in streams: # audio
if stream['type'].startswith('audio/mp4'):
dash_mp4_a_url = stream['url']
if 's' in stream:
sig = self.__class__.decipher(self.js, stream['s'])
dash_mp4_a_url += '&sig={}'.format(sig)
dash_mp4_a_size = stream['clen']
elif stream['type'].startswith('audio/webm'):
dash_webm_a_url = stream['url']
if 's' in stream:
sig = self.__class__.decipher(self.js, stream['s'])
dash_webm_a_url += '&sig={}'.format(sig)
dash_webm_a_size = stream['clen']
for stream in streams: # video
if 'size' in stream:
if stream['type'].startswith('video/mp4'):
mimeType = 'video/mp4'
dash_url = stream['url']
if 's' in stream:
sig = self.__class__.decipher(self.js, stream['s'])
dash_url += '&sig={}'.format(sig)
dash_size = stream['clen']
itag = stream['itag']
dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size))
@ -526,9 +370,6 @@ class YouTube(VideoExtractor):
elif stream['type'].startswith('video/webm'):
mimeType = 'video/webm'
dash_url = stream['url']
if 's' in stream:
sig = self.__class__.decipher(self.js, stream['s'])
dash_url += '&sig={}'.format(sig)
dash_size = stream['clen']
itag = stream['itag']
audio_url = None
@ -569,15 +410,6 @@ class YouTube(VideoExtractor):
if stream_id in self.streams:
src = self.streams[stream_id]['url']
if self.streams[stream_id]['sig'] is not None:
sig = self.streams[stream_id]['sig']
src += '&sig={}'.format(sig)
elif self.streams[stream_id]['s'] is not None:
if not hasattr(self, 'js'):
self.js = get_content(self.html5player)
s = self.streams[stream_id]['s']
sig = self.__class__.decipher(self.js, s)
src += '&sig={}'.format(sig)
self.streams[stream_id]['src'] = [src]
self.streams[stream_id]['size'] = urls_size(self.streams[stream_id]['src'])

View File

@ -31,8 +31,8 @@ def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
play_list = video_info["playlist"]
# first High Definition
# second Second Standard Definition
# third ld. What is ld ?
# second Standard Definition
# third Low Definition
# finally continue
data = play_list.get("hd", play_list.get("sd", play_list.get("ld", None)))
if not data:

View File

@ -93,7 +93,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'):
# Use concat demuxer on FFmpeg >= 1.1
if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)):
concat_list = generate_concat_list(files, output)
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1',
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0',
'-i', concat_list, '-c', 'copy']
params.extend(['--', output])
if subprocess.call(params, stdin=STDIN) == 0:
@ -128,7 +128,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'):
def ffmpeg_concat_ts_to_mkv(files, output='output.mkv'):
print('Merging video parts... ', end="", flush=True)
params = [FFMPEG] + LOGLEVEL + ['-isync', '-y', '-i']
params = [FFMPEG] + LOGLEVEL + ['-y', '-i']
params.append('concat:')
for file in files:
if os.path.isfile(file):
@ -149,7 +149,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'):
# Use concat demuxer on FFmpeg >= 1.1
if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)):
concat_list = generate_concat_list(files, output)
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1',
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0',
'-i', concat_list, '-c', 'copy',
'-bsf:a', 'aac_adtstoasc']
params.extend(['--', output])
@ -175,7 +175,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'):
if FFMPEG == 'avconv':
params += ['-c', 'copy']
else:
params += ['-c', 'copy', '-absf', 'aac_adtstoasc']
params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc']
params.extend(['--', output])
if subprocess.call(params, stdin=STDIN) == 0:
@ -203,7 +203,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'):
# Use concat demuxer on FFmpeg >= 1.1
if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)):
concat_list = generate_concat_list(files, output)
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1',
params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0',
'-i', concat_list, '-c', 'copy',
'-bsf:a', 'aac_adtstoasc']
params.extend(['--', output])
@ -229,7 +229,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'):
if FFMPEG == 'avconv':
params += ['-c', 'copy']
else:
params += ['-c', 'copy', '-absf', 'aac_adtstoasc']
params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc']
params.extend(['--', output])
subprocess.check_call(params, stdin=STDIN)
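Taken together, the changes above ('-safe 0' in place of '-safe -1', and '-bsf:a aac_adtstoasc' in place of the deprecated '-absf') give an invocation along these lines; the file names are placeholders:
import subprocess

files = ['part1.mp4', 'part2.mp4']
with open('concat.txt', 'w') as f:
    for name in files:
        f.write("file '%s'\n" % name)   # concat demuxer list entry

# -safe 0 lets the concat demuxer accept ordinary relative/absolute paths
subprocess.call(['ffmpeg', '-y', '-f', 'concat', '-safe', '0',
                 '-i', 'concat.txt', '-c', 'copy',
                 '-bsf:a', 'aac_adtstoasc', '--', 'output.mp4'])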

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python
script_name = 'you-get'
__version__ = '0.4.1456'
__version__ = '0.4.1730'

View File

@ -10,13 +10,16 @@ from you_get.extractors import (
acfun,
bilibili,
soundcloud,
tiktok
tiktok,
twitter,
miaopai
)
class YouGetTests(unittest.TestCase):
def test_imgur(self):
imgur.download('http://imgur.com/WVLk5nD', info_only=True)
imgur.download('https://imgur.com/we-should-have-listened-WVLk5nD', info_only=True)
def test_magisto(self):
magisto.download(
@ -24,45 +27,47 @@ class YouGetTests(unittest.TestCase):
info_only=True
)
def test_youtube(self):
youtube.download(
'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True
)
youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True)
youtube.download(
'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa
info_only=True
)
youtube.download(
'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True
)
#def test_youtube(self):
#youtube.download(
# 'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True
#)
#youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True)
#youtube.download(
# 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa
# info_only=True
#)
#youtube.download(
# 'https://www.youtube.com/watch?v=oRdxUFDoQe0', info_only=True
#)
def test_acfun(self):
acfun.download('https://www.acfun.cn/v/ac11701912', info_only=True)
acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True)
def test_bilibil(self):
bilibili.download(
"https://www.bilibili.com/watchlater/#/BV1PE411q7mZ/p6", info_only=True
)
bilibili.download(
"https://www.bilibili.com/watchlater/#/av74906671/p6", info_only=True
)
#def test_bilibili(self):
#bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True)
def test_soundcloud(self):
#def test_soundcloud(self):
## single song
soundcloud.download(
'https://soundcloud.com/keiny-pham/impure-bird', info_only=True
)
#soundcloud.download(
# 'https://soundcloud.com/keiny-pham/impure-bird', info_only=True
#)
## playlist
#soundcloud.download(
# 'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True
#)
def tests_tiktok(self):
tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True)
tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True)
tiktok.download('https://vt.tiktok.com/UGJR4R/', info_only=True)
def test_tiktok(self):
tiktok.download('https://www.tiktok.com/@zukky_48/video/7398162058153315605', info_only=True)
tiktok.download('https://www.tiktok.com/@/video/7398162058153315605', info_only=True)
tiktok.download('https://t.tiktok.com/i18n/share/video/7398162058153315605/', info_only=True)
tiktok.download('https://vt.tiktok.com/ZSYKjKt6M/', info_only=True)
def test_twitter(self):
twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True)
twitter.download('https://x.com/elonmusk/status/1530516552084234244', info_only=True)
def test_weibo(self):
miaopai.download('https://video.weibo.com/show?fid=1034:4825403706245135', info_only=True)
if __name__ == '__main__':
unittest.main()

View File

@ -18,13 +18,12 @@
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.2",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Internet",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Multimedia",