diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 00000000..2ee74663
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,39 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+
+name: develop
+
+on:
+  push:
+    branches: [ develop ]
+  pull_request:
+    branches: [ develop ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.7, 3.8, 3.9, '3.10', '3.11', '3.12', pypy-3.8, pypy-3.9, pypy-3.10]
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip setuptools
+        pip install flake8
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with unittest
+      run: |
+        make test
diff --git a/.gitignore b/.gitignore
index 99b18775..57f9412b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,13 +79,15 @@ _*
 *.ts
 *.webm
 *.xml
+*.json
 /.env
 /.idea
 *.m4a
 *.DS_Store
 *.txt
+*.sw[a-p]
 *.zip
+.emacs*
 .vscode
-
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index eedbeeb2..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-# https://travis-ci.org/soimort/you-get
-language: python
-python:
-    - "3.4"
-    - "3.5"
-    - "3.6"
-    - "3.7"
-    - "3.8"
-    #- "nightly" (flake8 not working in python 3.9 yet, module 'ast' has no attribute 'AugLoad')
-    - "pypy3"
-before_install:
-    - pip install flake8
-before_script:
-    - flake8 . --count --select=E9,F63,F72,F82 --show-source --statistics
-script: make test
-notifications:
-  webhooks:
-    urls:
-      - https://webhooks.gitter.im/e/43cd57826e88ed8f2152
-    on_success: change  # options: [always|never|change] default: always
-    on_failure: always  # options: [always|never|change] default: always
-    on_start: never     # options: [always|never|change] default: always
diff --git a/LICENSE.txt b/LICENSE.txt
index a193d8e2..fcc26433 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2012-2020 Mort Yao and other contributors
+Copyright (c) 2012-2024 Mort Yao and other contributors
                         (https://github.com/soimort/you-get/graphs/contributors)
 Copyright (c) 2012      Boyu Guo
diff --git a/MANIFEST.in b/MANIFEST.in
index 521b023b..ed688fde 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,9 @@
 include *.rst
 include *.txt
 include Makefile
+include CONTRIBUTING.md
 include README.md
 include you-get
 include you-get.json
+include you-get.plugin.zsh
+recursive-include contrib *
diff --git a/Makefile b/Makefile
index c0f9cf0e..a6222024 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,12 @@
-SETUP = python3 setup.py
-
-.PHONY: default i test clean all html rst build sdist bdist bdist_egg bdist_wheel install release
+.PHONY: default i test clean all html rst build install release
 
 default: i
 
 i:
-	@(cd src/; python3 -i -c 'import you_get; print("You-Get %s\n>>> import you_get" % you_get.version.__version__)')
+	@(cd src; python -i -c 'import you_get; print("You-Get %s\n>>> import you_get" % you_get.version.__version__)')
 
 test:
-	$(SETUP) test
+	(cd src; python -m unittest discover -s ../tests)
 
 clean:
 	zenity --question
@@ -16,7 +14,7 @@ clean:
 	find . | grep __pycache__ | xargs rm -fr
 	find . | grep .pyc | xargs rm -f
 
-all: build sdist bdist bdist_egg bdist_wheel
+all: build
 
 html:
 	pandoc README.md > README.html
@@ -25,23 +23,11 @@ rst:
 	pandoc -s -t rst README.md > README.rst
 
 build:
-	$(SETUP) build
-
-sdist:
-	$(SETUP) sdist
-
-bdist:
-	$(SETUP) bdist
-
-bdist_egg:
-	$(SETUP) bdist_egg
-
-bdist_wheel:
-	$(SETUP) bdist_wheel
+	python -m build
 
 install:
-	$(SETUP) install --user --prefix=
+	python -m pip install .
 
-release:
-	zenity --question
-	$(SETUP) sdist bdist_wheel upload --sign
+release: build
+	@echo 'Upload new version to PyPI using:'
+	@echo '  twine upload --sign dist/you_get-VERSION*'
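The new `test` target replaces the removed `setup.py test` command with standard-library test discovery. As a rough illustration (not part of the patch), the Makefile's `(cd src; python -m unittest discover -s ../tests)` is equivalent to the following sketch run from the `src` directory:

```python
# A minimal sketch of what the new `make test` target does: discover and run
# the unittest suite with the stdlib API instead of `setup.py test`.
# The '../tests' path mirrors the Makefile's invocation from within src/.
import unittest

loader = unittest.TestLoader()
suite = loader.discover(start_dir='../tests')  # collects test_*.py files
unittest.TextTestRunner(verbosity=2).run(suite)
```

Dropping the `setup.py`-based targets in favor of `python -m build` and `pip install .` matches the deprecation of direct `setup.py` invocation in modern packaging.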
diff --git a/README.md b/README.md
index 3429f9d8..a3cb7cea 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,12 @@
 # You-Get
 
+[![Build Status](https://github.com/soimort/you-get/workflows/develop/badge.svg)](https://github.com/soimort/you-get/actions)
 [![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/)
-[![Build Status](https://travis-ci.org/soimort/you-get.svg)](https://travis-ci.org/soimort/you-get)
 [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 
-**NOTICE: Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.**
+**NOTICE (30 May 2022): Support for Python 3.5, 3.6 and 3.7 will eventually be dropped. ([see details here](https://github.com/soimort/you-get/wiki/TLS-1.3-post-handshake-authentication-(PHA)))**
+
+**NOTICE (8 Mar 2019): Read [this](https://github.com/soimort/you-get/blob/develop/CONTRIBUTING.md) if you are looking for the conventional "Issues" tab.**
 
 ---
 
@@ -53,17 +55,17 @@ Are you a Python programmer? Then check out [the source](https://github.com/soim
 
 ### Prerequisites
 
-The following dependencies are necessary:
+The following dependencies are recommended:
 
-* **[Python](https://www.python.org/downloads/)** 3.2 or above
+* **[Python](https://www.python.org/downloads/)** 3.7.4 or above
 * **[FFmpeg](https://www.ffmpeg.org/)** 1.0 or above
 * (Optional) [RTMPDump](https://rtmpdump.mplayerhq.hu/)
 
 ### Option 1: Install via pip
 
-The official release of `you-get` is distributed on [PyPI](https://pypi.python.org/pypi/you-get), and can be installed easily from a PyPI mirror via the [pip](https://en.wikipedia.org/wiki/Pip_\(package_manager\)) package manager. Note that you must use the Python 3 version of `pip`:
+The official release of `you-get` is distributed on [PyPI](https://pypi.python.org/pypi/you-get), and can be installed easily from a PyPI mirror via the [pip](https://en.wikipedia.org/wiki/Pip_\(package_manager\)) package manager: (Note that you must use the Python 3 version of `pip`)
 
-    $ pip3 install you-get
+    $ pip install you-get
 
 ### Option 2: Install via [Antigen](https://github.com/zsh-users/antigen) (for Zsh users)
 
@@ -78,16 +80,26 @@ You may either download the [stable](https://github.com/soimort/you-get/archive/
 Alternatively, run
 
 ```
-$ [sudo] python3 setup.py install
+$ cd path/to/you-get
+$ [sudo] python -m pip install .
 ```
 
 Or
 
 ```
-$ python3 setup.py install --user
+$ cd path/to/you-get
+$ python -m pip install . --user
 ```
 
-to install `you-get` to a permanent path.
+to install `you-get` to a permanent path. (And don't omit the dot `.` representing the current directory)
+
+You can also use [pipenv](https://pipenv.pypa.io/en/latest) to install `you-get` in a Python virtual environment.
+
+```
+$ pipenv install -e .
+$ pipenv run you-get --version
+you-get: version 0.4.1555, a tiny downloader that scrapes the web.
+```
 
 ### Option 4: Git clone
 
@@ -97,7 +109,7 @@ This is the recommended way for all developers, even if you don't often code in
 ```
 $ git clone git://github.com/soimort/you-get.git
 ```
 
-Then put the cloned directory into your `PATH`, or run `./setup.py install` to install `you-get` to a permanent path.
+Then put the cloned directory into your `PATH`, or run `python -m pip install path/to/you-get` to install `you-get` to a permanent path.
 
 ### Option 5: Homebrew (Mac only)
 
@@ -115,6 +127,14 @@ You can install `you-get` easily via:
 # pkg install you-get
 ```
 
+### Option 7: Flox (Mac, Linux, and Windows WSL)
+
+You can install `you-get` easily via:
+
+```
+$ flox install you-get
+```
+
 ### Shell completion
 
@@ -124,7 +144,7 @@ Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](https://github.com/soimort/you-get/tree/develop/contrib/completion). Please consult your shell's manual for how to take advantage of them.
 
 Based on which option you chose to install `you-get`, you may upgrade it via:
 
 ```
-$ pip3 install --upgrade you-get
+$ pip install --upgrade you-get
 ```
 
 or download the latest release via:
 
 ```
 $ you-get https://github.com/soimort/you-get/archive/master.zip
 ```
 
@@ -136,7 +156,7 @@ $ you-get https://github.com/soimort/you-get/archive/master.zip
 In order to get the latest ```develop``` branch without messing up the PIP, you can try:
 
 ```
-$ pip3 install --upgrade git+https://github.com/soimort/you-get@develop
+$ pip install --upgrade git+https://github.com/soimort/you-get@develop
 ```
 
 ## Getting Started
 
@@ -256,25 +276,20 @@ Type: JPEG Image (image/jpeg)
 Size:       0.06 MiB (66482 Bytes)
 
 Downloading rms.jpg ...
-100.0% (  0.1/0.1 MB) ├████████████████████████████████████████┤[1/1]  127 kB/s
+ 100% (  0.1/  0.1MB) ├████████████████████████████████████████┤[1/1]  127 kB/s
 ```
 
 Otherwise, `you-get` will scrape the web page and try to figure out if there's anything interesting to you:
 
 ```
-$ you-get http://kopasas.tumblr.com/post/69361932517
+$ you-get https://kopasas.tumblr.com/post/69361932517
 Site:       Tumblr.com
-Title:      kopasas
-Type:       Unknown type (None)
-Size:       0.51 MiB (536583 Bytes)
-
-Site:       Tumblr.com
-Title:      tumblr_mxhg13jx4n1sftq6do1_1280
+Title:      [tumblr] tumblr_mxhg13jx4n1sftq6do1_640
 Type:       Portable Network Graphics (image/png)
-Size:       0.51 MiB (536583 Bytes)
+Size:       0.11 MiB (118484 Bytes)
 
-Downloading tumblr_mxhg13jx4n1sftq6do1_1280.png ...
-100.0% (  0.5/0.5 MB) ├████████████████████████████████████████┤[1/1]   22 MB/s
+Downloading [tumblr] tumblr_mxhg13jx4n1sftq6do1_640.png ...
+ 100% (  0.1/  0.1MB) ├████████████████████████████████████████┤[1/1]   22 MB/s
 ```
 
 **Note:**
@@ -364,83 +379,81 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
 | Site | URL | Videos? | Images? | Audios? |
 | :--: | :-- | :-----: | :-----: | :-----: |
 | **YouTube** | |✓| | |
-| **Twitter** | |✓|✓| |
-| VK | |✓|✓| |
-| Vine | |✓| | |
+| **X (Twitter)** | |✓|✓| |
+| VK | |✓|✓| |
 | Vimeo | |✓| | |
-| Veoh | |✓| | |
+| Veoh | |✓| | |
 | **Tumblr** | |✓|✓|✓|
-| TED | |✓| | |
+| TED | |✓| | |
 | SoundCloud | | | |✓|
 | SHOWROOM | |✓| | |
 | Pinterest | | |✓| |
-| MTV81 | |✓| | |
+| MTV81 | |✓| | |
 | Mixcloud | | | |✓|
-| Metacafe | |✓| | |
-| Magisto | |✓| | |
+| Metacafe | |✓| | |
+| Magisto | |✓| | |
 | Khan Academy | |✓| | |
 | Internet Archive | |✓| | |
 | **Instagram** | |✓|✓| |
-| InfoQ | |✓| | |
-| Imgur | | |✓| |
-| Heavy Music Archive | | | |✓|
-| Freesound | | | |✓|
+| InfoQ | |✓| | |
+| Imgur | | |✓| |
+| Heavy Music Archive | | | |✓|
+| Freesound | | | |✓|
 | Flickr | |✓|✓| |
-| FC2 Video | |✓| | |
+| FC2 Video | |✓| | |
 | Facebook | |✓| | |
-| eHow | |✓| | |
-| Dailymotion | |✓| | |
-| Coub | |✓| | |
-| CBS | |✓| | |
-| Bandcamp | | | |✓|
-| AliveThai | |✓| | |
-| interest.me | |✓| | |
-| **755<br/>ナナゴーゴー** | |✓|✓| |
-| **niconico<br/>ニコニコ動画** | |✓| | |
-| **163<br/>网易视频<br/>网易云音乐** | <br/> |✓| |✓|
-| 56网 | |✓| | |
-| **AcFun** | |✓| | |
-| **Baidu<br/>百度贴吧** | |✓|✓| |
-| 爆米花网 | |✓| | |
-| **bilibili<br/>哔哩哔哩** | |✓|✓|✓|
-| 豆瓣 | |✓| |✓|
-| 斗鱼 | |✓| | |
-| 凤凰视频 | |✓| | |
-| 风行网 | |✓| | |
-| iQIYI<br/>爱奇艺 | |✓| | |
-| 激动网 | |✓| | |
-| 酷6网 | |✓| | |
-| 酷狗音乐 | | | |✓|
-| 酷我音乐 | | | |✓|
-| 乐视网 | |✓| | |
-| 荔枝FM | | | |✓|
-| 秒拍 | |✓| | |
-| MioMio弹幕网 | |✓| | |
-| MissEvan<br/>猫耳FM | | | |✓|
+| eHow | |✓| | |
+| Dailymotion | |✓| | |
+| Coub | |✓| | |
+| CBS | |✓| | |
+| Bandcamp | | | |✓|
+| AliveThai | |✓| | |
+| interest.me | |✓| | |
+| **755<br/>ナナゴーゴー** | |✓|✓| |
+| **niconico<br/>ニコニコ動画** | |✓| | |
+| **163<br/>网易视频<br/>网易云音乐** | <br/> |✓| |✓|
+| 56网 | |✓| | |
+| **AcFun** | |✓| | |
+| **Baidu<br/>百度贴吧** | |✓|✓| |
+| 爆米花网 | |✓| | |
+| **bilibili<br/>哔哩哔哩** | |✓|✓|✓|
+| 豆瓣 | |✓| |✓|
+| 斗鱼 | |✓| | |
+| 凤凰视频 | |✓| | |
+| 风行网 | |✓| | |
+| iQIYI<br/>爱奇艺 | |✓| | |
+| 激动网 | |✓| | |
+| 酷6网 | |✓| | |
+| 酷狗音乐 | | | |✓|
+| 酷我音乐 | | | |✓|
+| 乐视网 | |✓| | |
+| 荔枝FM | | | |✓|
+| 懒人听书 | | | |✓|
+| 秒拍 | |✓| | |
+| MioMio弹幕网 | |✓| | |
+| MissEvan<br/>猫耳FM | | | |✓|
 | 痞客邦 | |✓| | |
-| PPTV聚力 | |✓| | |
-| 齐鲁网 | |✓| | |
-| QQ<br/>腾讯视频 | |✓| | |
-| 企鹅直播 | |✓| | |
-| Sina<br/>新浪视频<br/>微博秒拍视频 | <br/> |✓| | |
-| Sohu<br/>搜狐视频 | |✓| | |
-| **Tudou<br/>土豆** | |✓| | |
-| 虾米 | |✓| |✓|
-| 阳光卫视 | |✓| | |
-| **音悦Tai** | |✓| | |
-| **Youku<br/>优酷** | |✓| | |
-| 战旗TV | |✓| | |
-| 央视网 | |✓| | |
-| Naver<br/>네이버 | |✓| | |
-| 芒果TV | |✓| | |
-| 火猫TV | |✓| | |
-| 阳光宽频网 | |✓| | |
+| PPTV聚力 | |✓| | |
+| 齐鲁网 | |✓| | |
+| QQ<br/>腾讯视频 | |✓| | |
+| 企鹅直播 | |✓| | |
+| Sina<br/>新浪视频<br/>微博秒拍视频 | <br/> |✓| | |
+| Sohu<br/>搜狐视频 | |✓| | |
+| **Tudou<br/>土豆** | |✓| | |
+| 阳光卫视 | |✓| | |
+| **Youku<br/>优酷** | |✓| | |
+| 战旗TV | |✓| | |
+| 央视网 | |✓| | |
+| Naver<br/>네이버 | |✓| | |
+| 芒果TV | |✓| | |
+| 火猫TV | |✓| | |
+| 阳光宽频网 | |✓| | |
 | 西瓜视频 | |✓| | |
-| 新片场 | |✓| | |
+| 新片场 | |✓| | |
 | 快手 | |✓|✓| |
 | 抖音 | |✓| | |
 | TikTok | |✓| | |
-| 中国体育(TV) | <br/> |✓| | |
+| 中国体育(TV) | <br/> |✓| | |
 | 知乎 | |✓| | |
 
 For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.
 
@@ -453,7 +466,7 @@ Check if it's already a known problem on
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ `__ and fork it!
 
 .. |PyPI version| image:: https://badge.fury.io/py/you-get.png
    :target: http://badge.fury.io/py/you-get
-.. |Build Status| image:: https://api.travis-ci.org/soimort/you-get.png
-   :target: https://travis-ci.org/soimort/you-get
+.. |Build Status| image:: https://github.com/soimort/you-get/workflows/develop/badge.svg
+   :target: https://github.com/soimort/you-get/actions
 .. |Gitter| image:: https://badges.gitter.im/Join%20Chat.svg
    :target: https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 00000000..d9fb8cf3
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,5 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+Please report security issues to .
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..7af04e46
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+# runtime dependencies
+dukpy
diff --git a/setup.py b/setup.py
index 24dc9fb2..0804ae33 100755
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,20 @@ PACKAGE_NAME = 'you_get'
 
 PROJ_METADATA = '%s.json' % PROJ_NAME
 
-import os, json, imp
+import importlib.util
+import importlib.machinery
+
+def load_source(modname, filename):
+    loader = importlib.machinery.SourceFileLoader(modname, filename)
+    spec = importlib.util.spec_from_file_location(modname, filename, loader=loader)
+    module = importlib.util.module_from_spec(spec)
+    # The module is always executed and not cached in sys.modules.
+    # Uncomment the following line to cache the module.
+    # sys.modules[module.__name__] = module
+    loader.exec_module(module)
+    return module
+
+import os, json
 here = os.path.abspath(os.path.dirname(__file__))
 proj_info = json.loads(open(os.path.join(here, PROJ_METADATA), encoding='utf-8').read())
 try:
@@ -13,7 +26,7 @@ try:
 except:
     README = ""
 CHANGELOG = open(os.path.join(here, 'CHANGELOG.rst'), encoding='utf-8').read()
-VERSION = imp.load_source('version', os.path.join(here, 'src/%s/version.py' % PACKAGE_NAME)).__version__
+VERSION = load_source('version', os.path.join(here, 'src/%s/version.py' % PACKAGE_NAME)).__version__
 
 from setuptools import setup, find_packages
 setup(
@@ -43,7 +56,8 @@ setup(
 
     entry_points = {'console_scripts': proj_info['console_scripts']},
 
-    extras_require={
+    install_requires = ['dukpy'],
+    extras_require = {
         'socks': ['PySocks'],
     }
 )
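For illustration, the `load_source()` helper defined above (which replaces the removed `imp.load_source`) can be exercised like this — a hedged sketch assuming the repository layout that setup.py itself relies on:

```python
# Load version.py by file path and read its __version__ attribute, exactly
# as setup.py now computes VERSION without the deprecated imp module.
import importlib.machinery
import importlib.util

def load_source(modname, filename):
    loader = importlib.machinery.SourceFileLoader(modname, filename)
    spec = importlib.util.spec_from_file_location(modname, filename, loader=loader)
    module = importlib.util.module_from_spec(spec)
    loader.exec_module(module)  # executed on every call; not cached in sys.modules
    return module

print(load_source('version', 'src/you_get/version.py').__version__)
```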
diff --git a/src/you_get/common.py b/src/you_get/common.py
index 9c56b5c2..0b307dde 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -76,6 +76,7 @@ SITES = {
     'letv'             : 'le',
     'lizhi'            : 'lizhi',
     'longzhu'          : 'longzhu',
+    'lrts'             : 'lrts',
     'magisto'          : 'magisto',
     'metacafe'         : 'metacafe',
     'mgtv'             : 'mgtv',
@@ -110,14 +111,12 @@ SITES = {
     'wanmen'           : 'wanmen',
     'weibo'            : 'miaopai',
     'veoh'             : 'veoh',
-    'vine'             : 'vine',
     'vk'               : 'vk',
-    'xiami'            : 'xiami',
+    'x'                : 'twitter',
     'xiaokaxiu'        : 'yixia',
     'xiaojiadianvideo' : 'fc2video',
     'ximalaya'         : 'ximalaya',
     'xinpianchang'     : 'xinpianchang',
-    'yinyuetai'        : 'yinyuetai',
     'yizhibo'          : 'yizhibo',
     'youku'            : 'youku',
     'youtu'            : 'youtube',
@@ -137,13 +136,16 @@ cookies = None
 output_filename = None
 auto_rename = False
 insecure = False
+m3u8 = False
+postfix = False
+prefix = None
 
 fake_headers = {
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',  # noqa
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Charset': 'UTF-8,*;q=0.5',
     'Accept-Encoding': 'gzip,deflate,sdch',
     'Accept-Language': 'en-US,en;q=0.8',
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',  # noqa
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/126.0.2592.113'  # Latest Edge
 }
 
 if sys.stdout.isatty():
@@ -341,10 +343,38 @@ def undeflate(data):
     return decompressobj.decompress(data)+decompressobj.flush()
 
 
+# an http.client implementation of get_content()
+# because urllib does not support "Connection: keep-alive"
+def getHttps(host, url, headers, debuglevel=0):
+    import http.client
+
+    conn = http.client.HTTPSConnection(host)
+    conn.set_debuglevel(debuglevel)
+    conn.request("GET", url, headers=headers)
+    resp = conn.getresponse()
+    logging.debug('getHttps: %s' % resp.getheaders())
+    set_cookie = resp.getheader('set-cookie')
+
+    data = resp.read()
+    try:
+        data = ungzip(data)  # gzip
+        data = undeflate(data)  # deflate
+    except:
+        pass
+
+    conn.close()
+    return str(data, encoding='utf-8'), set_cookie  # TODO: support raw data
+
+
 # DEPRECATED in favor of get_content()
 def get_response(url, faker=False):
     logging.debug('get_response: %s' % url)
-
+    ctx = None
+    if insecure:
+        # ignore ssl errors
+        ctx = ssl.create_default_context()
+        ctx.check_hostname = False
+        ctx.verify_mode = ssl.CERT_NONE
     # install cookies
     if cookies:
         opener = request.build_opener(request.HTTPCookieProcessor(cookies))
@@ -352,10 +382,10 @@ def get_response(url, faker=False):
 
     if faker:
         response = request.urlopen(
-            request.Request(url, headers=fake_headers), None
+            request.Request(url, headers=fake_headers), None, context=ctx,
         )
     else:
-        response = request.urlopen(url)
+        response = request.urlopen(url, context=ctx)
 
     data = response.read()
     if response.info().get('Content-Encoding') == 'gzip':
@@ -434,8 +464,17 @@ def get_content(url, headers={}, decoded=True):
 
     req = request.Request(url, headers=headers)
     if cookies:
-        cookies.add_cookie_header(req)
-        req.headers.update(req.unredirected_hdrs)
+        # NOTE: Do not use cookies.add_cookie_header(req)
+        # #HttpOnly_ cookies were not supported by CookieJar and MozillaCookieJar properly until python 3.10
+        # See also:
+        # - https://github.com/python/cpython/pull/17471
+        # - https://bugs.python.org/issue2190
+        # Here we add cookies to the request headers manually
+        cookie_strings = []
+        for cookie in list(cookies):
+            cookie_strings.append(cookie.name + '=' + cookie.value)
+        cookie_headers = {'Cookie': '; '.join(cookie_strings)}
+        req.headers.update(cookie_headers)
 
     response = urlopen_with_retry(req)
     data = response.read()
@@ -478,8 +517,17 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
 
     req = request.Request(url, headers=headers)
     if cookies:
-        cookies.add_cookie_header(req)
-        req.headers.update(req.unredirected_hdrs)
+        # NOTE: Do not use cookies.add_cookie_header(req)
+        # #HttpOnly_ cookies were not supported by CookieJar and MozillaCookieJar properly until python 3.10
+        # See also:
+        # - https://github.com/python/cpython/pull/17471
+        # - https://bugs.python.org/issue2190
+        # Here we add cookies to the request headers manually
+        cookie_strings = []
+        for cookie in list(cookies):
+            cookie_strings.append(cookie.name + '=' + cookie.value)
+        cookie_headers = {'Cookie': '; '.join(cookie_strings)}
+        req.headers.update(cookie_headers)
     if kwargs.get('post_data_raw'):
         post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
     else:
@@ -667,7 +715,7 @@ def url_save(
                 bar.done()
                 if not force and auto_rename:
                     path, ext = os.path.basename(filepath).rsplit('.', 1)
-                    finder = re.compile(' \([1-9]\d*?\)$')
+                    finder = re.compile(r' \([1-9]\d*?\)$')
                     if (finder.search(path) is None):
                         thisfile = path + ' (1).' + ext
                     else:
@@ -966,6 +1014,10 @@ def download_urls(
             pass
 
     title = tr(get_filename(title))
+    if postfix and 'vid' in kwargs:
+        title = "%s [%s]" % (title, kwargs['vid'])
+    if prefix is not None:
+        title = "[%s] %s" % (prefix, title)
     output_filename = get_output_filename(urls, title, ext, output_dir, merge)
     output_filepath = os.path.join(output_dir, output_filename)
 
@@ -1322,7 +1374,13 @@ def download_main(download, download_playlist, urls, playlist, **kwargs):
         if re.match(r'https?://', url) is None:
             url = 'http://' + url
 
-        if playlist:
+        if m3u8:
+            if output_filename:
+                title = output_filename
+            else:
+                title = "m3u8file"
+            download_url_ffmpeg(url=url, title=title, ext='mp4', output_dir='.')
+        elif playlist:
             download_playlist(url, **kwargs)
         else:
             download(url, **kwargs)
@@ -1422,12 +1480,25 @@ def load_cookies(cookiefile):
 def set_socks_proxy(proxy):
     try:
         import socks
-        socks_proxy_addrs = proxy.split(':')
-        socks.set_default_proxy(
-            socks.SOCKS5,
-            socks_proxy_addrs[0],
-            int(socks_proxy_addrs[1])
-        )
+        if '@' in proxy:
+            proxy_info = proxy.split("@")
+            socks_proxy_addrs = proxy_info[1].split(':')
+            socks_proxy_auth = proxy_info[0].split(":")
+            socks.set_default_proxy(
+                socks.SOCKS5,
+                socks_proxy_addrs[0],
+                int(socks_proxy_addrs[1]),
+                True,
+                socks_proxy_auth[0],
+                socks_proxy_auth[1]
+            )
+        else:
+            socks_proxy_addrs = proxy.split(':')
+            socks.set_default_proxy(
+                socks.SOCKS5,
+                socks_proxy_addrs[0],
+                int(socks_proxy_addrs[1]),
+            )
         socket.socket = socks.socksocket
 
         def getaddrinfo(*args):
@@ -1495,6 +1566,14 @@ def script_main(download, download_playlist, **kwargs):
         '--no-caption', action='store_true',
         help='Do not download captions (subtitles, lyrics, danmaku, ...)'
     )
+    download_grp.add_argument(
+        '--post', '--postfix', dest='postfix', action='store_true', default=False,
+        help='Postfix downloaded files with unique identifiers'
+    )
+    download_grp.add_argument(
+        '--pre', '--prefix', dest='prefix', metavar='PREFIX', default=None,
+        help='Prefix downloaded files with string'
+    )
     download_grp.add_argument(
         '-f', '--force', action='store_true', default=False,
         help='Force overwriting existing files'
@@ -1541,6 +1620,21 @@ def script_main(download, download_playlist, **kwargs):
         '-l', '--playlist', action='store_true',
         help='Prefer to download a playlist'
     )
+
+    playlist_grp = parser.add_argument_group('Playlist optional options')
+    playlist_grp.add_argument(
+        '--first', metavar='FIRST',
+        help='the first number'
+    )
+    playlist_grp.add_argument(
+        '--last', metavar='LAST',
+        help='the last number'
+    )
+    playlist_grp.add_argument(
+        '--size', '--page-size', metavar='PAGE_SIZE',
+        help='the page size number'
+    )
+
     download_grp.add_argument(
         '-a', '--auto-rename', action='store_true', default=False,
         help='Auto rename same name different files'
@@ -1565,13 +1659,17 @@ def script_main(download, download_playlist, **kwargs):
         '--no-proxy', action='store_true', help='Never use a proxy'
     )
     proxy_grp.add_argument(
-        '-s', '--socks-proxy', metavar='HOST:PORT',
+        '-s', '--socks-proxy', metavar='HOST:PORT or USERNAME:PASSWORD@HOST:PORT',
         help='Use an SOCKS5 proxy for downloading'
    )
 
     download_grp.add_argument('--stream', help=argparse.SUPPRESS)
     download_grp.add_argument('--itag', help=argparse.SUPPRESS)
 
+    download_grp.add_argument('-m', '--m3u8', action='store_true', default=False,
+                              help='download video using an m3u8 url')
+
     parser.add_argument('URL', nargs='*', help=argparse.SUPPRESS)
 
     args = parser.parse_args()
@@ -1597,6 +1695,9 @@ def script_main(download, download_playlist, **kwargs):
     global output_filename
     global auto_rename
     global insecure
+    global m3u8
+    global postfix
+    global prefix
     output_filename = args.output_filename
     extractor_proxy = args.extractor_proxy
 
@@ -1618,6 +1719,9 @@ def script_main(download, download_playlist, **kwargs):
     if args.cookies:
         load_cookies(args.cookies)
 
+    if args.m3u8:
+        m3u8 = True
+
     caption = True
     stream_id = args.format or args.stream or args.itag
     if args.no_caption:
@@ -1630,6 +1734,8 @@ def script_main(download, download_playlist, **kwargs):
         # ignore ssl
         insecure = True
 
+    postfix = args.postfix
+    prefix = args.prefix
 
     if args.no_proxy:
         set_http_proxy('')
@@ -1658,7 +1764,7 @@ def script_main(download, download_playlist, **kwargs):
     socket.setdefaulttimeout(args.timeout)
 
     try:
-        extra = {}
+        extra = {'args': args}
         if extractor_proxy:
             extra['extractor_proxy'] = extractor_proxy
         if stream_id:
@@ -1716,20 +1822,10 @@ def google_search(url):
     url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
     page = get_content(url, headers=fake_headers)
     videos = re.findall(
-        r'([^<]+)<', page
+        r'(https://www\.youtube\.com/watch\?v=[\w-]+)', page
     )
-    vdurs = re.findall(r'([^<]+)<', page)
-    durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
-    print('Google Videos search:')
-    for v in zip(videos, durs):
-        print('- video:  {} [{}]'.format(
-            unescape_html(v[0][1]),
-            v[1] if v[1] else '?'
-        ))
-        print('# you-get %s' % log.sprint(v[0][0], log.UNDERLINE))
-        print()
     print('Best matched result:')
-    return(videos[0][0])
+    return(videos[0])
 
 
 def url_to_module(url):
@@ -1760,9 +1856,12 @@ def url_to_module(url):
             )
         else:
             try:
-                location = get_location(url)  # t.co isn't happy with fake_headers
+                try:
+                    location = get_location(url)  # t.co isn't happy with fake_headers
+                except:
+                    location = get_location(url, headers=fake_headers)
             except:
-                location = get_location(url, headers=fake_headers)
+                location = get_location(url, headers=fake_headers, get_method='GET')
 
         if location and location != url and not location.startswith('/'):
             return url_to_module(location)
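The cookie change in `get_content()`/`post_content()` above builds the `Cookie` header by hand instead of calling `CookieJar.add_cookie_header()`, which dropped `#HttpOnly_` entries before Python 3.10. A minimal standalone sketch of the same idea, assuming an illustrative `cookies.txt` path:

```python
# Serialize a Netscape-format cookie jar into a single Cookie header,
# mirroring the workaround adopted in common.py above.
from http.cookiejar import MozillaCookieJar
from urllib import request

cookies = MozillaCookieJar('cookies.txt')  # path is illustrative
cookies.load(ignore_discard=True, ignore_expires=True)

req = request.Request('https://example.com/')
cookie_strings = [c.name + '=' + c.value for c in list(cookies)]
req.headers.update({'Cookie': '; '.join(cookie_strings)})
```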
diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py
index 83742f0f..fa2518db 100644
--- a/src/you_get/extractor.py
+++ b/src/you_get/extractor.py
@@ -238,7 +238,8 @@ class VideoExtractor():
             download_urls(urls, self.title, ext, total_size, headers=headers,
                           output_dir=kwargs['output_dir'],
                           merge=kwargs['merge'],
-                          av=stream_id in self.dash_streams)
+                          av=stream_id in self.dash_streams,
+                          vid=self.vid)
 
         if 'caption' not in kwargs or not kwargs['caption']:
             print('Skipping captions or danmaku.')
diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py
index 4280d236..e68cd174 100755
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@@ -74,16 +74,13 @@ from .twitter import *
 from .ucas import *
 from .veoh import *
 from .vimeo import *
-from .vine import *
 from .vk import *
 from .w56 import *
 from .wanmen import *
-from .xiami import *
 from .xinpianchang import *
-from .yinyuetai import *
 from .yixia import *
 from .youku import *
 from .youtube import *
 from .zhanqi import *
 from .zhibo import *
-from .zhihu import *
\ No newline at end of file
+from .zhihu import *
diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py
index b83c2859..5775eb5e 100644
--- a/src/you_get/extractors/acfun.py
+++ b/src/you_get/extractors/acfun.py
@@ -1,175 +1,213 @@
 #!/usr/bin/env python
 
-__all__ = ['acfun_download']
-
 from ..common import *
+from ..extractor import VideoExtractor
 
-from .le import letvcloud_download_by_vu
-from .qq import qq_download_by_vid
-from .sina import sina_download_by_vid
-from .tudou import tudou_download_by_iid
-from .youku import youku_download_by_vid
+class AcFun(VideoExtractor):
+    name = "AcFun"
 
-import json
-import re
-import base64
-import time
+    stream_types = [
+        {'id': '2160P', 'qualityType': '2160p'},
+        {'id': '1080P60', 'qualityType': '1080p60'},
+        {'id': '720P60', 'qualityType': '720p60'},
+        {'id': '1080P+', 'qualityType': '1080p+'},
+        {'id': '1080P', 'qualityType': '1080p'},
+        {'id': '720P', 'qualityType': '720p'},
+        {'id': '540P', 'qualityType': '540p'},
+        {'id': '360P', 'qualityType': '360p'}
+    ]
 
-def get_srt_json(id):
-    url = 'http://danmu.aixifan.com/V2/%s' % id
-    return get_content(url)
+    def prepare(self, **kwargs):
+        assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', self.url)
 
-def youku_acfun_proxy(vid, sign, ref):
-    endpoint = 'http://player.acfun.cn/flash_data?vid={}&ct=85&ev=3&sign={}&time={}'
-    url = endpoint.format(vid, sign, str(int(time.time() * 1000)))
-    json_data = json.loads(get_content(url, headers=dict(referer=ref)))['data']
-    enc_text = base64.b64decode(json_data)
-    dec_text = rc4(b'8bdc7e1a', enc_text).decode('utf8')
-    youku_json = json.loads(dec_text)
+        if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', self.url):
+            html = get_content(self.url, headers=fake_headers)
+            json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});")
+            json_data = json.loads(json_text)
+            vid = json_data.get('currentVideoInfo').get('id')
+            up = json_data.get('user').get('name')
+            self.title = json_data.get('title')
+            video_list = json_data.get('videoList')
+            if len(video_list) > 1:
+                self.title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0]
+            currentVideoInfo = json_data.get('currentVideoInfo')
+
+        elif re.match(r"https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", self.url):
+            html = get_content(self.url, headers=fake_headers)
+            tag_script = match1(html, r'')
+            json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1]
+            json_data = json.loads(json_text)
+            self.title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title']
+            vid = str(json_data['videoId'])
+            up = "acfun"
+            currentVideoInfo = json_data.get('currentVideoInfo')
 
-    yk_streams = {}
-    for stream in youku_json['stream']:
-        tp = stream['stream_type']
-        yk_streams[tp] = [], stream['total_size']
-        if stream.get('segs'):
-            for seg in stream['segs']:
-                yk_streams[tp][0].append(seg['url'])
         else:
-            yk_streams[tp] = stream['m3u8'], stream['total_size']
+            raise NotImplementedError
 
-    return yk_streams
+        if 'ksPlayJson' in currentVideoInfo:
+            durationMillis = currentVideoInfo['durationMillis']
+            ksPlayJson = json.loads(currentVideoInfo['ksPlayJson'])
+            representation = ksPlayJson.get('adaptationSet')[0].get('representation')
+            stream_list = representation
 
-def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False, **kwargs):
-    """str, str, str, bool, bool ->None
+            for stream in stream_list:
+                m3u8_url = stream["url"]
+                size = durationMillis * stream["avgBitrate"] / 8
+                # size = float('inf')
+                container = 'mp4'
+                stream_id = stream["qualityLabel"]
+                quality = stream["qualityType"]
+
+                stream_data = dict(src=m3u8_url, size=size, container=container, quality=quality)
+                self.streams[stream_id] = stream_data
 
-    Download Acfun video by vid.
+        assert self.title and m3u8_url
+        self.title = unescape_html(self.title)
+        self.title = escape_file_path(self.title)
+        p_title = r1('active">([^<]+)', html)
+        self.title = '%s (%s)' % (self.title, up)
+        if p_title:
+            self.title = '%s - %s' % (self.title, p_title)
 
-    Call Acfun API, decide which site to use, and pass the job to its
-    extractor.
-    """
-    #first call the main parasing API
-    info = json.loads(get_content('http://www.acfun.cn/video/getVideo.aspx?id=' + vid, headers=fake_headers))
+    def download(self, **kwargs):
+        if 'json_output' in kwargs and kwargs['json_output']:
+            json_output.output(self)
+        elif 'info_only' in kwargs and kwargs['info_only']:
+            if 'stream_id' in kwargs and kwargs['stream_id']:
+                # Display the stream
+                stream_id = kwargs['stream_id']
+                if 'index' not in kwargs:
+                    self.p(stream_id)
+                else:
+                    self.p_i(stream_id)
+            else:
+                # Display all available streams
+                if 'index' not in kwargs:
+                    self.p([])
+                else:
+                    stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag']
+                    self.p_i(stream_id)
 
-    sourceType = info['sourceType']
-
-    #decide sourceId to know which extractor to use
-    if 'sourceId' in info: sourceId = info['sourceId']
-    # danmakuId = info['danmakuId']
-
-    #call extractor decided by sourceId
-    if sourceType == 'sina':
-        sina_download_by_vid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only)
-    elif sourceType == 'youku':
-        youku_download_by_vid(sourceId, title=title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
-    elif sourceType == 'tudou':
-        tudou_download_by_iid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only)
-    elif sourceType == 'qq':
-        qq_download_by_vid(sourceId, title, True, output_dir=output_dir, merge=merge, info_only=info_only)
-    elif sourceType == 'letv':
-        letvcloud_download_by_vu(sourceId, '2d8c027396', title, output_dir=output_dir, merge=merge, info_only=info_only)
-    elif sourceType == 'zhuzhan':
-        #As in Jul.28.2016, Acfun is using embsig to anti hotlink so we need to pass this
-#In Mar. 2017 there is a dedicated ``acfun_proxy'' in youku cloud player
-#old code removed
-        url = 'http://www.acfun.cn/v/ac' + vid
-        yk_streams = youku_acfun_proxy(info['sourceId'], info['encode'], url)
-        seq = ['mp4hd3', 'mp4hd2', 'mp4hd', 'flvhd']
-        for t in seq:
-            if yk_streams.get(t):
-                preferred = yk_streams[t]
-                break
-#total_size in the json could be incorrect(F.I. 0)
-        size = 0
-        for url in preferred[0]:
-            _, _, seg_size = url_info(url)
-            size += seg_size
-#fallback to flvhd is not quite possible
-        if re.search(r'fid=[0-9A-Z\-]*.flv', preferred[0][0]):
-            ext = 'flv'
-        else:
-            ext = 'mp4'
-        print_info(site_info, title, ext, size)
+        else:
+            if 'stream_id' in kwargs and kwargs['stream_id']:
+                # Download the stream
+                stream_id = kwargs['stream_id']
+            else:
+                stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag']
+
+            if 'index' not in kwargs:
+                self.p(stream_id)
+            else:
+                self.p_i(stream_id)
+            if stream_id in self.streams:
+                url = self.streams[stream_id]['src']
+                ext = self.streams[stream_id]['container']
+                total_size = self.streams[stream_id]['size']
+
+            if ext == 'm3u8' or ext == 'm4a':
+                ext = 'mp4'
+
+            if not url:
+                log.wtf('[Failed] Cannot extract video source.')
+            # For legacy main()
+            headers = {}
+            if self.ua is not None:
+                headers['User-Agent'] = self.ua
+            if self.referer is not None:
+                headers['Referer'] = self.referer
+
+            download_url_ffmpeg(url, self.title, ext, output_dir=kwargs['output_dir'], merge=kwargs['merge'])
+
+            if 'caption' not in kwargs or not kwargs['caption']:
+                print('Skipping captions or danmaku.')
+                return
+
+            for lang in self.caption_tracks:
+                filename = '%s.%s.srt' % (get_filename(self.title), lang)
+                print('Saving %s ... ' % filename, end="", flush=True)
+                srt = self.caption_tracks[lang]
+                with open(os.path.join(kwargs['output_dir'], filename),
+                          'w', encoding='utf-8') as x:
+                    x.write(srt)
+                print('Done.')
+
+            if self.danmaku is not None and not dry_run:
+                filename = '{}.cmt.xml'.format(get_filename(self.title))
+                print('Downloading {} ...\n'.format(filename))
+                with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp:
+                    fp.write(self.danmaku)
+
+            if self.lyrics is not None and not dry_run:
+                filename = '{}.lrc'.format(get_filename(self.title))
+                print('Downloading {} ...\n'.format(filename))
+                with open(os.path.join(kwargs['output_dir'], filename), 'w', encoding='utf8') as fp:
+                    fp.write(self.lyrics)
+
+            # For main_dev()
+            #download_urls(urls, self.title, self.streams[stream_id]['container'], self.streams[stream_id]['size'])
+        keep_obj = kwargs.get('keep_obj', False)
+        if not keep_obj:
+            self.__init__()
+
+
+    def acfun_download(self, url, output_dir='.', merge=True, info_only=False, **kwargs):
+        assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url)
+
+        def getM3u8UrlFromCurrentVideoInfo(currentVideoInfo):
+            if 'playInfos' in currentVideoInfo:
+                return currentVideoInfo['playInfos'][0]['playUrls'][0]
+            elif 'ksPlayJson' in currentVideoInfo:
+                ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] )
+                representation = ksPlayJson.get('adaptationSet')[0].get('representation')
+                reps = []
+                for one in representation:
+                    reps.append( (one['width']* one['height'], one['url'], one['backupUrl']) )
+                return max(reps)[1]
+
+        if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url):
+            html = get_content(url, headers=fake_headers)
+            json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});")
+            json_data = json.loads(json_text)
+            vid = json_data.get('currentVideoInfo').get('id')
+            up = json_data.get('user').get('name')
+            title = json_data.get('title')
+            video_list = json_data.get('videoList')
+            if len(video_list) > 1:
+                title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0]
+            currentVideoInfo = json_data.get('currentVideoInfo')
+            m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo)
+        elif re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)', url):
+            html = get_content(url, headers=fake_headers)
+            tag_script = match1(html, r'')
+            json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1]
+            json_data = json.loads(json_text)
+            title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title']
+            vid = str(json_data['videoId'])
+            up = "acfun"
+
+            currentVideoInfo = json_data.get('currentVideoInfo')
+            m3u8_url = getM3u8UrlFromCurrentVideoInfo(currentVideoInfo)
+
+        else:
+            raise NotImplementedError
+
+        assert title and m3u8_url
+        title = unescape_html(title)
+        title = escape_file_path(title)
+        p_title = r1('active">([^<]+)', html)
+        title = '%s (%s)' % (title, up)
+        if p_title:
+            title = '%s - %s' % (title, p_title)
+
+        print_info(site_info, title, 'm3u8', float('inf'))
         if not info_only:
-            download_urls(preferred[0], title, ext, size, output_dir=output_dir, merge=merge)
-    else:
-        raise NotImplementedError(sourceType)
-
-    if not info_only and not dry_run:
-        if not kwargs['caption']:
-            print('Skipping danmaku.')
-            return
-        try:
-            title = get_filename(title)
-            print('Downloading %s ...\n' % (title + '.cmt.json'))
-            cmt = get_srt_json(vid)
-            with open(os.path.join(output_dir, title + '.cmt.json'), 'w', encoding='utf-8') as x:
-                x.write(cmt)
-        except:
-            pass
-
-def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    assert re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url)
-
-    if re.match(r'https?://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url):
-        html = get_content(url, headers=fake_headers)
-        json_text = match1(html, r"(?s)videoInfo\s*=\s*(\{.*?\});")
-        json_data = json.loads(json_text)
-        vid = json_data.get('currentVideoInfo').get('id')
-        up = json_data.get('user').get('name')
-        title = json_data.get('title')
-        video_list = json_data.get('videoList')
-        if len(video_list) > 1:
-            title += " - " + [p.get('title') for p in video_list if p.get('id') == vid][0]
-        currentVideoInfo = json_data.get('currentVideoInfo')
-        if 'playInfos' in currentVideoInfo:
-            m3u8_url = currentVideoInfo['playInfos'][0]['playUrls'][0]
-        elif 'ksPlayJson' in currentVideoInfo:
-            ksPlayJson = json.loads( currentVideoInfo['ksPlayJson'] )
-            representation = ksPlayJson.get('adaptationSet').get('representation')
-            reps = []
-            for one in representation:
-                reps.append( (one['width']* one['height'], one['url'], one['backupUrl']) )
-            m3u8_url = max(reps)[1]
-
-    elif re.match("https?://[^\.]*\.*acfun\.[^\.]+/bangumi/aa(\d+)", url):
-        html = get_content(url, headers=fake_headers)
-        tag_script = match1(html, r'')
-        json_text = tag_script[tag_script.find('{') : tag_script.find('};') + 1]
-        json_data = json.loads(json_text)
-        title = json_data['bangumiTitle'] + " " + json_data['episodeName'] + " " + json_data['title']
-        vid = str(json_data['videoId'])
-        up = "acfun"
-
-        play_info = get_content("https://www.acfun.cn/rest/pc-direct/play/playInfo/m3u8Auto?videoId=" + vid, headers=fake_headers)
-        play_url = json.loads(play_info)['playInfo']['streams'][0]['playUrls'][0]
-        m3u8_all_qualities_file = get_content(play_url)
-        m3u8_all_qualities_lines = m3u8_all_qualities_file.split('#EXT-X-STREAM-INF:')[1:]
-        highest_quality_line = m3u8_all_qualities_lines[0]
-        for line in m3u8_all_qualities_lines:
-            bandwith = int(match1(line, r'BANDWIDTH=(\d+)'))
-            if bandwith > int(match1(highest_quality_line, r'BANDWIDTH=(\d+)')):
-                highest_quality_line = line
-        #TODO: 应由用户指定清晰度
-        m3u8_url = match1(highest_quality_line, r'\n([^#\n]+)$')
-        m3u8_url = play_url[:play_url.rfind("/")+1] + m3u8_url
-
-    else:
-        raise NotImplemented
-
-    assert title and m3u8_url
-    title = unescape_html(title)
-    title = escape_file_path(title)
-    p_title = r1('active">([^<]+)', html)
-    title = '%s (%s)' % (title, up)
-    if p_title:
-        title = '%s - %s' % (title, p_title)
-
-    print_info(site_info, title, 'm3u8', float('inf'))
-    if not info_only:
-        download_url_ffmpeg(m3u8_url, title, 'mp4', output_dir=output_dir, merge=merge)
-
+            download_url_ffmpeg(m3u8_url, title, 'mp4', output_dir=output_dir, merge=merge)
+
+site = AcFun()
 site_info = "AcFun.cn"
-download = acfun_download
+download = site.download_by_url
 download_playlist = playlist_not_supported('acfun')
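The `getM3u8UrlFromCurrentVideoInfo()` helper above picks a stream by pixel area. A tiny self-contained illustration of that selection rule, with made-up representation data:

```python
# Among the ksPlayJson representations, take the URL of the entry with the
# largest width*height. Tuples compare element-wise, so max() on (area, url,
# backupUrl) tuples selects the highest resolution.
representation = [
    {'width': 1280, 'height': 720, 'url': 'https://example.com/720.m3u8', 'backupUrl': []},
    {'width': 1920, 'height': 1080, 'url': 'https://example.com/1080.m3u8', 'backupUrl': []},
]
reps = [(one['width'] * one['height'], one['url'], one['backupUrl']) for one in representation]
print(max(reps)[1])  # -> https://example.com/1080.m3u8
```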
diff --git a/src/you_get/extractors/baidu.py b/src/you_get/extractors/baidu.py
index 521d5e99..61b0ad24 100644
--- a/src/you_get/extractors/baidu.py
+++ b/src/you_get/extractors/baidu.py
@@ -116,7 +116,7 @@ def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=
         id = r1(r'https?://music.baidu.com/album/(\d+)', url)
         baidu_download_album(id, output_dir, merge, info_only)
 
-    elif re.match('https?://music.baidu.com/song/\d+', url):
+    elif re.match(r'https?://music.baidu.com/song/\d+', url):
         id = r1(r'https?://music.baidu.com/song/(\d+)', url)
         baidu_download_song(id, output_dir, merge, info_only)
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index 620b0ff5..fb544447 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -1,16 +1,23 @@
 #!/usr/bin/env python
 
-import math
 from ..common import *
 from ..extractor import VideoExtractor
 
 import hashlib
+import math
+
 
 class Bilibili(VideoExtractor):
     name = "Bilibili"
 
     # Bilibili media encoding options, in descending quality order.
     stream_types = [
+        {'id': 'hdflv2_8k', 'quality': 127, 'audio_quality': 30280,
+         'container': 'FLV', 'video_resolution': '4320p', 'desc': '超高清 8K'},
+        {'id': 'hdflv2_dolby', 'quality': 126, 'audio_quality': 30280,
+         'container': 'FLV', 'video_resolution': '3840p', 'desc': '杜比视界'},
+        {'id': 'hdflv2_hdr', 'quality': 125, 'audio_quality': 30280,
+         'container': 'FLV', 'video_resolution': '2160p', 'desc': '真彩 HDR'},
         {'id': 'hdflv2_4k', 'quality': 120, 'audio_quality': 30280,
          'container': 'FLV', 'video_resolution': '2160p', 'desc': '超清 4K'},
         {'id': 'flv_p60', 'quality': 116, 'audio_quality': 30280,
@@ -35,6 +42,8 @@ class Bilibili(VideoExtractor):
         {'id': 'jpg', 'quality': 0},
     ]
 
+    codecids = {7: 'AVC', 12: 'HEVC', 13: 'AV1'}
+
     @staticmethod
     def height_to_quality(height, qn):
         if height <= 360 and qn <= 16:
@@ -63,7 +72,7 @@ class Bilibili(VideoExtractor):
 
     @staticmethod
     def bilibili_api(avid, cid, qn=0):
-        return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=16' % (avid, cid, qn)
+        return 'https://api.bilibili.com/x/player/playurl?avid=%s&cid=%s&qn=%s&type=&otype=json&fnver=0&fnval=4048&fourk=1' % (avid, cid, qn)
 
     @staticmethod
     def bilibili_audio_api(sid):
@@ -91,7 +100,8 @@ class Bilibili(VideoExtractor):
         appkey, sec = ''.join([chr(ord(i) + 2) for i in entropy[::-1]]).split(':')
         params = 'appkey=%s&cid=%s&otype=json&qn=%s&quality=%s&type=' % (appkey, cid, qn, qn)
         chksum = hashlib.md5(bytes(params + sec, 'utf8')).hexdigest()
-        return 'https://interface.bilibili.com/v2/playurl?%s&sign=%s' % (params, chksum)
+        return 'https://api.bilibili.com/x/player/wbi/v2?%s&sign=%s' % (params, chksum)
+
 
     @staticmethod
     def bilibili_live_api(cid):
@@ -109,13 +119,21 @@ class Bilibili(VideoExtractor):
     def bilibili_space_channel_api(mid, cid, pn=1, ps=100):
         return 'https://api.bilibili.com/x/space/channel/video?mid=%s&cid=%s&pn=%s&ps=%s&order=0&jsonp=jsonp' % (mid, cid, pn, ps)
 
+    @staticmethod
+    def bilibili_space_collection_api(mid, cid, pn=1, ps=30):
+        return 'https://api.bilibili.com/x/polymer/space/seasons_archives_list?mid=%s&season_id=%s&sort_reverse=false&page_num=%s&page_size=%s' % (mid, cid, pn, ps)
+
+    @staticmethod
+    def bilibili_series_archives_api(mid, sid, pn=1, ps=100):
+        return 'https://api.bilibili.com/x/series/archives?mid=%s&series_id=%s&pn=%s&ps=%s&only_normal=true&sort=asc&jsonp=jsonp' % (mid, sid, pn, ps)
+
     @staticmethod
     def bilibili_space_favlist_api(fid, pn=1, ps=20):
         return 'https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%s&ps=%s&order=mtime&type=0&tid=0&jsonp=jsonp' % (fid, pn, ps)
 
     @staticmethod
-    def bilibili_space_video_api(mid, pn=1, ps=100):
-        return 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid=%s&page=%s&pagesize=%s&order=0&jsonp=jsonp' % (mid, pn, ps)
+    def bilibili_space_video_api(mid, pn=1, ps=50):
+        return "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%s&ps=%s&tid=0&keyword=&order=pubdate&jsonp=jsonp" % (mid, pn, ps)
 
     @staticmethod
     def bilibili_vc_api(video_id):
@@ -132,10 +150,10 @@ class Bilibili(VideoExtractor):
         except:
             return err_value
 
-    # https://api.bilibili.com/x/player.so?id=cid%3A162260003&aid=95051759&bvid=BV1zE411T7nb&buvid=FB2BB46F-B1F3-4BDA-A589-33348940411A155830infoc
-
     def prepare(self, **kwargs):
         self.stream_qualities = {s['quality']: s for s in self.stream_types}
+        self.streams.clear()
+        self.dash_streams.clear()
 
         try:
             html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url))
@@ -154,13 +172,23 @@ class Bilibili(VideoExtractor):
         # redirect: bangumi/play/ss -> bangumi/play/ep
         # redirect: bangumi.bilibili.com/anime -> bangumi/play/ep
         elif re.match(r'https?://(www\.)?bilibili\.com/bangumi/play/ss(\d+)', self.url) or \
-                re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)/play', self.url):
+             re.match(r'https?://bangumi\.bilibili\.com/anime/(\d+)/play', self.url):
             initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)')  # FIXME
             initial_state = json.loads(initial_state_text)
             ep_id = initial_state['epList'][0]['id']
             self.url = 'https://www.bilibili.com/bangumi/play/ep%s' % ep_id
             html_content = get_content(self.url, headers=self.bilibili_headers(referer=self.url))
 
+        # redirect: s
+        elif re.match(r'https?://(www\.)?bilibili\.com/s/(.+)', self.url):
+            self.url = 'https://www.bilibili.com/%s' % match1(self.url, r'/s/(.+)')
+            html_content = get_content(self.url, headers=self.bilibili_headers())
+
+        # redirect: festival
+        elif re.match(r'https?://(www\.)?bilibili\.com/festival/(.+)', self.url):
+            self.url = 'https://www.bilibili.com/video/%s' % match1(self.url, r'bvid=([^&]+)')
+            html_content = get_content(self.url, headers=self.bilibili_headers())
+
         # sort it out
         if re.match(r'https?://(www\.)?bilibili\.com/audio/au(\d+)', self.url):
             sort = 'audio'
@@ -172,7 +200,7 @@ class Bilibili(VideoExtractor):
             sort = 'live'
         elif re.match(r'https?://vc\.bilibili\.com/video/(\d+)', self.url):
             sort = 'vc'
-        elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(BV(\S+)))', self.url):
+        elif re.match(r'https?://(www\.)?bilibili\.com/video/(av(\d+)|(bv(\S+))|(BV(\S+)))', self.url):
             sort = 'video'
         elif re.match(r'https?://h\.?bilibili\.com/(\d+)', self.url):
             sort = 'h'
@@ -180,35 +208,54 @@ class Bilibili(VideoExtractor):
             self.download_playlist_by_url(self.url, **kwargs)
             return
 
-        # regular av video
+        # regular video
         if sort == 'video':
             initial_state_text = match1(html_content, r'__INITIAL_STATE__=(.*?);\(function\(\)')  # FIXME
             initial_state = json.loads(initial_state_text)
             playinfo_text = match1(html_content, r'__playinfo__=(.*?)', html)
diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py
+    appId = r1(r'"appId":"(\d+)"', cont)
+    media_id = r1(r'"media_id":"(\d+)"', cont)
+    logging.debug('appId: %s' % appId)
+    logging.debug('media_id: %s' % media_id)
 
-    if 'edge_sidecar_to_children' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
-        edges = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['edge_sidecar_to_children']['edges']
-        for edge in edges:
-            title = edge['node']['shortcode']
-            image_url = edge['node']['display_url']
-            if 'video_url' in edge['node']:
-                image_url = edge['node']['video_url']
-            ext = image_url.split('?')[0].split('.')[-1]
-            size = int(get_head(image_url)['Content-Length'])
+    api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id
+    try:
+        api_cont = get_content(api_url, headers={**fake_headers, **{'x-ig-app-id': appId}})
+        post = json.loads(api_cont)
+    except:
+        log.wtf('[Error] Please specify a cookie file.')
 
-            print_info(site_info, title, ext, size)
-            if not info_only:
-                download_urls(urls=[image_url],
-                              title=title,
-                              ext=ext,
-                              total_size=size,
-                              output_dir=output_dir)
-    else:
-        title = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['shortcode']
-        image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
-        if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
-            image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
+    for item in post['items']:
+        code = item['code']
+        carousel_media = item.get('carousel_media') or [item]
+        for i, media in enumerate(carousel_media):
+            title = '%s [%s]' % (code, i)
+            image_url = media['image_versions2']['candidates'][0]['url']
             ext = image_url.split('?')[0].split('.')[-1]
             size = int(get_head(image_url)['Content-Length'])
 
@@ -55,6 +47,20 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
                           total_size=size,
                           output_dir=output_dir)
 
+            # download videos (if any)
+            if 'video_versions' in media:
+                video_url = media['video_versions'][0]['url']
+                ext = video_url.split('?')[0].split('.')[-1]
+                size = int(get_head(video_url)['Content-Length'])
+
+                print_info(site_info, title, ext, size)
+                if not info_only:
+                    download_urls(urls=[video_url],
+                                  title=title,
+                                  ext=ext,
+                                  total_size=size,
+                                  output_dir=output_dir)
+
 site_info = "Instagram.com"
 download = instagram_download
 download_playlist = playlist_not_supported('instagram')
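The rewritten Instagram extractor above scrapes the numeric `media_id` and the page's `appId` from the post HTML, then queries the media-info endpoint with the `x-ig-app-id` header set. A hedged sketch of just that request flow (the function name and all concrete values are illustrative, not you-get API):

```python
# Fetch the JSON item list for a post, mirroring the api_url construction in
# the diff above. Cookies from a logged-in session are typically required.
import json
from urllib import request

def fetch_media_info(media_id, app_id):  # values are illustrative
    api_url = 'https://i.instagram.com/api/v1/media/%s/info/' % media_id
    req = request.Request(api_url, headers={'x-ig-app-id': app_id})
    return json.loads(request.urlopen(req).read().decode('utf-8'))

# post = fetch_media_info('1234567890123456789', '9360000000000000')
# for item in post['items']: print(item['code'])
```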
Code Like (without letters after #comment:),it just do the job : trace("{IQIYI_SALT}:"+salt_array.join("")) - ```(Postion After getTimer) + ```(Position After getTimer) findpropstrict QName(PackageNamespace(""), "trace") pushstring "{IQIYI_SALT}:" #comment for you to locate the salt getscopeobject 1 @@ -119,10 +119,10 @@ class Iqiyi(VideoExtractor): self.url = url video_page = get_content(url) - videos = set(re.findall(r'0): for video in videos: diff --git a/src/you_get/extractors/ixigua.py b/src/you_get/extractors/ixigua.py index 2f11e7f9..f2fd953e 100644 --- a/src/you_get/extractors/ixigua.py +++ b/src/you_get/extractors/ixigua.py @@ -18,121 +18,97 @@ headers = { } -def int_overflow(val): - maxint = 2147483647 - if not -maxint - 1 <= val <= maxint: - val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1 - return val - - -def unsigned_right_shitf(n, i): - if n < 0: - n = ctypes.c_uint32(n).value - if i < 0: - return -int_overflow(n << abs(i)) - return int_overflow(n >> i) - - -def get_video_url_from_video_id(video_id): - """Splicing URLs according to video ID to get video details""" - # from js - data = [""] * 256 - for index, _ in enumerate(data): - t = index - for i in range(8): - t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1) - data[index] = t - - def tmp(): - rand_num = random.random() - path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id, - random_num=str(rand_num)[2:]) - e = o = r = -1 - i, a = 0, len(path) - while i < a: - e = ord(path[i]) - i += 1 - if e < 128: - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)] - else: - if e < 2048: - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))] - else: - if 55296 <= e < 57344: - e = (1023 & e) + 64 - i += 1 - o = 1023 & t.url(i) - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))] - else: - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))] - r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))] - - return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0)) - - while 1: - url = tmp() - if url.split("=")[-1][0] != "-": # 参数s不能为负数 - return url - - -def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs): +def ixigua_download(url, output_dir='.', merge=True, info_only=False, stream_id='', **kwargs): # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422 - resp = urlopen_with_retry(request.Request(url)) + headers['cookie'] = "MONITOR_WEB_ID=7892c49b-296e-4499-8704-e47c1b15123; " \ + "ixigua-a-s=1; ttcid=af99669b6304453480454f1507011d5c234; BD_REF=1; " \ + "__ac_nonce=060d88ff000a75e8d17eb; __ac_signature=_02B4Z6wo100f01kX9ZpgAAIDAKIBBQUIPYT5F2WIAAPG2ad; " \ + "ttwid=1%7CcIsVF_3vqSIk4XErhPB0H2VaTxT0tdsTMRbMjrJOPN8%7C1624806049%7C08ce7dd6f7d20506a41ba0a331ef96a6505d96731e6ad9f6c8c709f53f227ab1; " + + resp = urlopen_with_retry(request.Request(url, headers=headers)) html = resp.read().decode('utf-8') _cookies = [] for c in resp.getheader('Set-Cookie').split("httponly,"): _cookies.append(c.strip().split(' ')[0]) - headers['cookie'] = ' 
'.join(_cookies) + headers['cookie'] += ' '.join(_cookies) - conf = loads(match1(html, r"window\.config = (.+);")) - if not conf: - log.e("Get window.config from url failed, url: {}".format(url)) + match_txt = match1(html, r"', cont) info = json.loads(x.group(1))[-1]['data'][0] diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index c2a0eb8d..d1d98c41 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -5,26 +5,43 @@ __all__ = ['tiktok_download'] from ..common import * def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - html = get_html(url, faker=True) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', + 'Accept-Encoding': 'gzip, deflate', + 'Accept': '*/*', + 'Referer': 'https://www.tiktok.com/', + 'Connection': 'keep-alive' # important + } - data = r1(r'', html) + m = re.match('(https?://)?([^/]+)(/.*)', url) + host = m.group(2) + if host != 'www.tiktok.com': # non-canonical URL + if host == 'vt.tiktok.com': # short URL + url = get_location(url) + vid = r1(r'/video/(\d+)', url) + url = 'https://www.tiktok.com/@/video/%s/' % vid + host = 'www.tiktok.com' + else: + url = m.group(3).split('?')[0] + vid = url.split('/')[3] # should be a string of numbers + + html, set_cookie = getHttps(host, url, headers=headers) + tt_chain_token = r1('tt_chain_token=([^;]+);', set_cookie) + headers['Cookie'] = 'tt_chain_token=%s' % tt_chain_token + + data = r1(r'', html) info = json.loads(data) - videoData = info['props']['pageProps']['videoData'] - urls = videoData['itemInfos']['video']['urls'] - videoId = videoData['itemInfos']['id'] - uniqueId = videoData['authorInfos'].get('uniqueId') - nickName = videoData['authorInfos'].get('nickName') + itemStruct = info['__DEFAULT_SCOPE__']['webapp.video-detail']['itemInfo']['itemStruct'] + downloadAddr = itemStruct['video']['downloadAddr'] + author = itemStruct['author']['uniqueId'] + nickname = itemStruct['author']['nickname'] + title = '%s [%s]' % (nickname or author, vid) - for i, url in enumerate(urls): - title = '%s [%s]' % (nickName or uniqueId, videoId) - if len(urls) > 1: - title = '%s [%s]' % (title, i) + mime, ext, size = url_info(downloadAddr, headers=headers) - mime, ext, size = url_info(url) - - print_info(site_info, title, mime, size) - if not info_only: - download_urls([url], title, ext, size, output_dir=output_dir, merge=merge) + print_info(site_info, title, mime, size) + if not info_only: + download_urls([downloadAddr], title, ext, size, output_dir=output_dir, merge=merge, headers=headers) site_info = "TikTok.com" download = tiktok_download diff --git a/src/you_get/extractors/tudou.py b/src/you_get/extractors/tudou.py index b1568dfd..92b8393c 100644 --- a/src/you_get/extractors/tudou.py +++ b/src/you_get/extractors/tudou.py @@ -71,7 +71,7 @@ def tudou_download(url, output_dir = '.', merge = True, info_only = False, **kwa # obsolete? 
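The TikTok extractor above goes through `getHttps()` from `common.py` because it needs both the page body and the `Set-Cookie` header (for `tt_chain_token`) from a single keep-alive request, which `http.client` exposes directly while `urllib` does not. A bare sketch of that pattern, with an illustrative path:

```python
# One keep-alive GET that yields both the HTML and the Set-Cookie header,
# the same idea as common.getHttps() used by tiktok.py above.
import http.client

conn = http.client.HTTPSConnection('www.tiktok.com')
conn.request('GET', '/@someone/video/1234567890/',  # path is illustrative
             headers={'Connection': 'keep-alive', 'Accept': '*/*'})
resp = conn.getresponse()
set_cookie = resp.getheader('set-cookie')  # carries tt_chain_token
html = resp.read()
conn.close()
```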
def parse_playlist(url):
-    aid = r1('http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url)
+    aid = r1(r'http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url)
     html = get_decoded_html(url)
     if not aid:
         aid = r1(r"aid\s*[:=]\s*'(\d+)'", html)
diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py
index 9a314c7f..236ba182 100644
--- a/src/you_get/extractors/tumblr.py
+++ b/src/you_get/extractors/tumblr.py
@@ -6,7 +6,6 @@ from ..common import *
 from .universal import *
 from .dailymotion import dailymotion_download
 from .vimeo import vimeo_download
-from .vine import vine_download
 def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
     if re.match(r'https?://\d+\.media\.tumblr\.com/', url):
         return
     import ssl
-    ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
+    ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1_2))  # server requires TLS v1.2
     cookie_handler = request.HTTPCookieProcessor()
     opener = request.build_opener(ssl_context, cookie_handler)
     request.install_opener(opener)
@@ -35,7 +34,7 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
         post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url)
     page = get_html(url, faker=True)
-    html = parse.unquote(page).replace('\/', '/')
+    html = parse.unquote(page).replace(r'\/', '/')
     feed = r1(r'<meta property="og:type" content="tumblr-feed:(\w+)" />', html)
     if feed in ['photo', 'photoset', 'entry'] or feed is None:
@@ -45,23 +44,30 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
                      r1(r'([^<\n]*)', html)
         urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.jpg)', html) +\
                re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.png)', html) +\
-               re.findall(r'(https?://[^;"&]+/tumblr_[^";&]+_\d+\.gif)', html)
+               re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.gif)', html) +\
+               re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.jpg)', html) +\
+               re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.png)', html) +\
+               re.findall(r'(https?://\d+\.media\.tumblr\.com/[^;"&]+/s\d+x\d+/[^;"&]+\.gif)', html)
         tuggles = {}
         for url in urls:
             if url.endswith('.gif'):
                 hd_url = url
             elif url.endswith('.jpg'):
-                hd_url = r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg' # FIXME: decide actual quality
+                hd_url = url # FIXME: decide actual quality # r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg'
             elif url.endswith('.png'):
-                hd_url = r1(r'(.+)_\d+\.png$', url) + '_1280.png' # FIXME: decide actual quality
+                hd_url = url # FIXME: decide actual quality # r1(r'(.+)_\d+\.png$', url) + '_1280.png'
             else:
                 continue
             filename = parse.unquote(hd_url.split('/')[-1])
             title = '.'.join(filename.split('.')[:-1])
-            tumblr_id = r1(r'^tumblr_(.+)_\d+$', title)
-            quality = int(r1(r'^tumblr_.+_(\d+)$', title))
+            tumblr_id = r1(r'^tumblr_(.+)_\d+$', title) or title
+            try:
+                quality = int(r1(r'^tumblr_.+_(\d+)$', title))
+            except:
+                quality = int(r1(r'/s(\d+)x\d+/', hd_url))
             ext = filename.split('.')[-1]
+            try:
                 size = int(get_head(hd_url)['Content-Length'])
                 if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality:
@@ -75,16 +81,16 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
             except:
                 pass
         if tuggles:
-            size =
sum([tuggles[t]['size'] for t in tuggles]) - print_info(site_info, page_title, None, size) + #size = sum([tuggles[t]['size'] for t in tuggles]) + #print_info(site_info, page_title, None, size) - if not info_only: - for t in tuggles: - title = tuggles[t]['title'] - ext = tuggles[t]['ext'] - size = tuggles[t]['size'] - url = tuggles[t]['url'] - print_info(site_info, title, ext, size) + for t in tuggles: + title = '[tumblr] ' + tuggles[t]['title'] + ext = tuggles[t]['ext'] + size = tuggles[t]['size'] + url = tuggles[t]['url'] + print_info(site_info, title, ext, size) + if not info_only: download_urls([url], title, ext, size, output_dir=output_dir) return @@ -118,9 +124,6 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): elif re.search(r'dailymotion\.com', iframe_url): dailymotion_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs) return - elif re.search(r'vine\.co', iframe_url): - vine_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs) - return else: iframe_html = get_content(iframe_url) real_url = r1(r'<source src="([^"]*)"', iframe_html) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 602c18f6..299dc052 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -4,7 +4,6 @@ __all__ = ['twitter_download'] from ..common import * from .universal import * -from .vine import vine_download def extract_m3u(source): r1 = get_content(source) @@ -23,7 +22,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) if re.match(r'https?://mobile', url): # normalize mobile URL url = 'https://' + match1(url, r'//mobile\.(.+)') - if re.match(r'https?://twitter\.com/i/moments/', url): # moments + if re.match(r'https?://twitter\.com/i/moments/', url): # FIXME: moments html = get_html(url, faker=True) paths = re.findall(r'data-permalink-path="([^"]+)"', html) for path in paths: @@ -34,71 +33,49 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) **kwargs) return - html = get_html(url, faker=False) # disable faker to prevent 302 infinite redirect - screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \ - r1(r'<meta name="twitter:title" content="([^"]*)"', html) - item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', url) or r1(r'data-item-id="([^"]*)"', html) or \ - r1(r'<meta name="twitter:site:id" content="([^"]*)"', html) + m = re.match(r'^https?://(mobile\.)?(x|twitter)\.com/([^/]+)/status/(\d+)', url) + assert m + screen_name, item_id = m.group(3), m.group(4) page_title = "{} [{}]".format(screen_name, item_id) - authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' + # FIXME: this API won't work for protected or nsfw contents + api_url = 'https://cdn.syndication.twimg.com/tweet-result?id=%s&token=!' 
% item_id + content = get_content(api_url) + info = json.loads(content) - ga_url = 'https://api.twitter.com/1.1/guest/activate.json' - ga_content = post_content(ga_url, headers={'authorization': authorization}) - guest_token = json.loads(ga_content)['guest_token'] + author = info['user']['name'] + url = 'https://twitter.com/%s/status/%s' % (info['user']['screen_name'], item_id) + full_text = info['text'] - api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id - api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) - - info = json.loads(api_content) - if 'extended_entities' in info['globalObjects']['tweets'][item_id]: - # if the tweet contains media, download them - media = info['globalObjects']['tweets'][item_id]['extended_entities']['media'] - - elif info['globalObjects']['tweets'][item_id].get('is_quote_status') == True: - # if the tweet does not contain media, but it quotes a tweet - # and the quoted tweet contains media, download them - item_id = info['globalObjects']['tweets'][item_id]['quoted_status_id_str'] - - api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id - api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) - - info = json.loads(api_content) - - if 'extended_entities' in info['globalObjects']['tweets'][item_id]: - media = info['globalObjects']['tweets'][item_id]['extended_entities']['media'] - else: - # quoted tweet has no media - return - - else: - # no media, no quoted tweet - return - - for medium in media: - if 'video_info' in medium: - # FIXME: we're assuming one tweet only contains one video here - variants = medium['video_info']['variants'] - variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0)) - urls = [ variants[-1]['url'] ] + if 'photos' in info: + for photo in info['photos']: + photo_url = photo['url'] + title = item_id + '_' + photo_url.split('.')[-2].split('/')[-1] + urls = [ photo_url + ':orig' ] size = urls_size(urls) - mime, ext = variants[-1]['content_type'], 'mp4' - - print_info(site_info, page_title, mime, size) - if not info_only: - download_urls(urls, page_title, ext, size, output_dir, merge=merge) - - else: - title = item_id + '_' + medium['media_url_https'].split('.')[-2].split('/')[-1] - urls = [ medium['media_url_https'] + ':orig' ] - size = urls_size(urls) - ext = medium['media_url_https'].split('.')[-1] + ext = photo_url.split('.')[-1] print_info(site_info, title, ext, size) if not info_only: download_urls(urls, title, ext, size, output_dir, merge=merge) + if 'video' in info: + for mediaDetail in info['mediaDetails']: + if 'video_info' not in mediaDetail: continue + variants = mediaDetail['video_info']['variants'] + variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0)) + title = item_id + '_' + variants[-1]['url'].split('/')[-1].split('?')[0].split('.')[0] + urls = [ variants[-1]['url'] ] + size = urls_size(urls) + mime, ext = variants[-1]['content_type'], 'mp4' -site_info = "Twitter.com" + print_info(site_info, title, ext, size) + if not info_only: + download_urls(urls, title, ext, size, output_dir, merge=merge) + + # TODO: should we deal with quoted tweets? 
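The rewrite above replaces the deleted guest-token dance (`guest/activate.json` plus the `/2/timeline/conversation` API) with the public syndication endpoint that powers embedded tweets: one unauthenticated GET returns the tweet as JSON, photos are upgraded to full resolution with the `:orig` suffix, and the highest-bitrate MP4 variant wins for videos. A condensed sketch of the same selection logic, standard library only (per the FIXME above, protected and NSFW tweets will not resolve this way, and the endpoint may additionally want a browser User-Agent):

```python
import json
import urllib.request

def tweet_media_urls(item_id: str) -> list:
    """List downloadable media URLs for one public tweet."""
    api_url = ('https://cdn.syndication.twimg.com/tweet-result?id=%s&token=!'
               % item_id)
    req = urllib.request.Request(api_url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as resp:
        info = json.loads(resp.read().decode('utf-8'))

    # Photos: ':orig' asks the image host for the original resolution.
    urls = [photo['url'] + ':orig' for photo in info.get('photos', [])]
    # Videos: variants sorted ascending by bitrate, so the last one is best.
    for detail in info.get('mediaDetails', []):
        if 'video_info' in detail:
            variants = sorted(detail['video_info']['variants'],
                              key=lambda v: v.get('bitrate', 0))
            urls.append(variants[-1]['url'])
    return urls
```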
+ + +site_info = "X.com" download = twitter_download download_playlist = playlist_not_supported('twitter') diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py index abc69475..03bba35a 100644 --- a/src/you_get/extractors/universal.py +++ b/src/you_get/extractors/universal.py @@ -48,7 +48,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg else: return - hls_urls = re.findall(r'(https?://[^;"\'\\]+' + '\.m3u8?' + + hls_urls = re.findall(r'(https?://[^;"\'\\]+' + r'\.m3u8?' + r'[^;"\'\\]*)', page) if hls_urls: try: @@ -64,18 +64,19 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg return # most common media file extensions on the Internet - media_exts = ['\.flv', '\.mp3', '\.mp4', '\.webm', - '[-_]1\d\d\d\.jpe?g', '[-_][6-9]\d\d\.jpe?g', # tumblr - '[-_]1\d\d\dx[6-9]\d\d\.jpe?g', - '[-_][6-9]\d\dx1\d\d\d\.jpe?g', - '[-_][6-9]\d\dx[6-9]\d\d\.jpe?g', - 's1600/[\w%]+\.jpe?g', # blogger - 'img[6-9]\d\d/[\w%]+\.jpe?g' # oricon? + media_exts = [r'\.flv', r'\.mp3', r'\.mp4', r'\.webm', + r'[-_]1\d\d\d\.jpe?g', r'[-_][6-9]\d\d\.jpe?g', # tumblr + r'[-_]1\d\d\dx[6-9]\d\d\.jpe?g', + r'[-_][6-9]\d\dx1\d\d\d\.jpe?g', + r'[-_][6-9]\d\dx[6-9]\d\d\.jpe?g', + r's1600/[\w%]+\.jpe?g', # blogger + r'blogger\.googleusercontent\.com/img/a/\w*', # blogger + r'img[6-9]\d\d/[\w%]+\.jpe?g' # oricon? ] urls = [] for i in media_exts: - urls += re.findall(r'(https?://[^ ;&"\'\\<>]+' + i + r'[^ ;&"\'\\<>]*)', page) + urls += re.findall(r'(https?://[^ ;&"\'\\<>]*' + i + r'[^ =?;&"\'\\<>]*)', page) p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page) urls += [parse.unquote(url) for url in p_urls] diff --git a/src/you_get/extractors/vimeo.py b/src/you_get/extractors/vimeo.py index c7d7b057..4034d0e0 100644 --- a/src/you_get/extractors/vimeo.py +++ b/src/you_get/extractors/vimeo.py @@ -102,7 +102,7 @@ class VimeoExtractor(VideoExtractor): pos = 0 while pos < len(lines): if lines[pos].startswith('#EXT-X-STREAM-INF'): - patt = 'RESOLUTION=(\d+)x(\d+)' + patt = r'RESOLUTION=(\d+)x(\d+)' hit = re.search(patt, lines[pos]) if hit is None: continue @@ -132,34 +132,6 @@ class VimeoExtractor(VideoExtractor): def vimeo_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False, **kwargs): - ''' - try: - # normal Vimeo video - html = get_content('https://vimeo.com/' + id) - cfg_patt = r'clip_page_config\s*=\s*(\{.+?\});' - cfg = json.loads(match1(html, cfg_patt)) - video_page = get_content(cfg['player']['config_url'], headers=fake_headers) - title = cfg['clip']['title'] - info = loads(video_page) - except: - # embedded player - referer may be required - if 'referer' in kwargs: - fake_headers['Referer'] = kwargs['referer'] - - video_page = get_content('http://player.vimeo.com/video/%s' % id, headers=fake_headers) - title = r1(r'<title>([^<]+)', video_page) - info = loads(match1(video_page, r'var t=(\{.+?\});')) - - streams = info['request']['files']['progressive'] - streams = sorted(streams, key=lambda i: i['height']) - url = streams[-1]['url'] - - type, ext, size = url_info(url, faker=True) - - print_info(site_info, title, type, size) - if not info_only: - download_urls([url], title, ext, size, output_dir, merge=merge, faker=True) - ''' site = VimeoExtractor() site.download_by_vid(id, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs) diff --git a/src/you_get/extractors/vine.py b/src/you_get/extractors/vine.py deleted file mode 100644 index d75454cf..00000000 --- 
a/src/you_get/extractors/vine.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python - -__all__ = ['vine_download'] - -from ..common import * -import json - - -def vine_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - html = get_content(url) - - video_id = r1(r'vine.co/v/([^/]+)', url) - title = r1(r'([^<]*)', html) - stream = r1(r'', html) - if not stream: # https://vine.co/v/.../card - stream = r1(r'"videoUrl":"([^"]+)"', html) - if stream: - stream = stream.replace('\\/', '/') - else: - posts_url = 'https://archive.vine.co/posts/' + video_id + '.json' - json_data = json.loads(get_content(posts_url)) - stream = json_data['videoDashUrl'] - title = json_data['description'] - if title == "": - title = json_data['username'].replace(" ", "_") + "_" + video_id - - mime, ext, size = url_info(stream) - - print_info(site_info, title, mime, size) - if not info_only: - download_urls([stream], title, ext, size, output_dir, merge=merge) - - -site_info = "Vine.co" -download = vine_download -download_playlist = playlist_not_supported('vine') diff --git a/src/you_get/extractors/xiami.py b/src/you_get/extractors/xiami.py deleted file mode 100644 index 16656adb..00000000 --- a/src/you_get/extractors/xiami.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -__all__ = ['xiami_download'] - -from ..common import * - -from xml.dom.minidom import parseString -from urllib import parse - -def location_dec(str): - head = int(str[0]) - str = str[1:] - rows = head - cols = int(len(str)/rows) + 1 - - out = "" - full_row = len(str) % head - for c in range(cols): - for r in range(rows): - if c == (cols - 1) and r >= full_row: - continue - if r < full_row: - char = str[r*cols+c] - else: - char = str[cols*full_row+(r-full_row)*(cols-1)+c] - out += char - return parse.unquote(out).replace("^", "0") - -def xiami_download_lyric(lrc_url, file_name, output_dir): - lrc = get_content(lrc_url, headers=fake_headers) - filename = get_filename(file_name) - if len(lrc) > 0: - with open(output_dir + "/" + filename + '.lrc', 'w', encoding='utf-8') as x: - x.write(lrc) - -def xiami_download_pic(pic_url, file_name, output_dir): - from ..util.strings import get_filename - pic_url = pic_url.replace('_1', '') - pos = pic_url.rfind('.') - ext = pic_url[pos:] - pic = get_content(pic_url, headers=fake_headers, decoded=False) - if len(pic) > 0: - with open(output_dir + "/" + file_name.replace('/', '-') + ext, 'wb') as x: - x.write(pic) - -def xiami_download_song(sid, output_dir = '.', info_only = False): - xml = get_content('http://www.xiami.com/song/playlist/id/%s/object_name/default/object_id/0' % sid, headers=fake_headers) - doc = parseString(xml) - i = doc.getElementsByTagName("track")[0] - artist = i.getElementsByTagName("artist")[0].firstChild.nodeValue - album_name = i.getElementsByTagName("album_name")[0].firstChild.nodeValue - song_title = i.getElementsByTagName("name")[0].firstChild.nodeValue - url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - try: - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue - except: - pass - type_, ext, size = url_info(url, headers=fake_headers) - if not ext: - ext = 'mp3' - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%s - %s - %s" % (song_title, artist, album_name) - download_urls([url], file_name, ext, size, output_dir, headers=fake_headers) - try: - xiami_download_lyric(lrc_url, file_name, output_dir) - except: - pass - -def xiami_download_showcollect(cid, 
output_dir = '.', info_only = False): - html = get_content('http://www.xiami.com/song/showcollect/id/' + cid, headers=fake_headers) - collect_name = r1(r'(.*)', html) - - xml = get_content('http://www.xiami.com/song/playlist/id/%s/type/3' % cid, headers=fake_headers) - doc = parseString(xml) - output_dir = output_dir + "/" + "[" + collect_name + "]" - tracks = doc.getElementsByTagName("track") - track_nr = 1 - for i in tracks: - artist=album_name=song_title=url="" - try: - song_id = i.getElementsByTagName("song_id")[0].firstChild.nodeValue - artist = i.getElementsByTagName("artist")[0].firstChild.nodeValue - album_name = i.getElementsByTagName("album_name")[0].firstChild.nodeValue - song_title = i.getElementsByTagName("title")[0].firstChild.nodeValue - url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - except: - log.e("Song %s failed. [Info Missing] artist:%s, album:%s, title:%s, url:%s" % (song_id, artist, album_name, song_title, url)) - continue - try: - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue - except: - pass - type_, ext, size = url_info(url, headers=fake_headers) - if not ext: - ext = 'mp3' - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%02d.%s - %s - %s" % (track_nr, song_title, artist, album_name) - download_urls([url], file_name, ext, size, output_dir, headers=fake_headers) - try: - xiami_download_lyric(lrc_url, file_name, output_dir) - except: - pass - - track_nr += 1 - -def xiami_download_album(aid, output_dir='.', info_only=False): - xml = get_content('http://www.xiami.com/song/playlist/id/%s/type/1' % aid, headers=fake_headers) - album_name = r1(r'', xml) - artist = r1(r'', xml) - doc = parseString(xml) - output_dir = output_dir + "/%s - %s" % (artist, album_name) - track_list = doc.getElementsByTagName('trackList')[0] - tracks = track_list.getElementsByTagName("track") - track_nr = 1 - pic_exist = False - for i in tracks: -#in this xml track tag is used for both "track in a trackList" and track no -#dirty here - if i.firstChild.nodeValue is not None: - continue - song_title = i.getElementsByTagName("songName")[0].firstChild.nodeValue - url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - try: - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue - except: - pass - if not pic_exist: - pic_url = i.getElementsByTagName("pic")[0].firstChild.nodeValue - type_, ext, size = url_info(url, headers=fake_headers) - if not ext: - ext = 'mp3' - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%02d.%s" % (track_nr, song_title) - download_urls([url], file_name, ext, size, output_dir, headers=fake_headers) - try: - xiami_download_lyric(lrc_url, file_name, output_dir) - except: - pass - if not pic_exist: - xiami_download_pic(pic_url, 'cover', output_dir) - pic_exist = True - - track_nr += 1 - -def xiami_download_mv(url, output_dir='.', merge=True, info_only=False): - # FIXME: broken merge - page = get_content(url, headers=fake_headers) - title = re.findall('([^<]+)', page)[0] - vid, uid = re.findall(r'vid:"(\d+)",uid:"(\d+)"', page)[0] - api_url = 'http://cloud.video.taobao.com/videoapi/info.php?vid=%s&uid=%s' % (vid, uid) - result = get_content(api_url, headers=fake_headers) - doc = parseString(result) - video_url = doc.getElementsByTagName("video_url")[-1].firstChild.nodeValue - length = int(doc.getElementsByTagName("length")[-1].firstChild.nodeValue) - - v_urls = [] - k_start = 0 - total_size = 0 - while True: - k_end = 
k_start + 20000000 - if k_end >= length: k_end = length - 1 - v_url = video_url + '/start_%s/end_%s/1.flv' % (k_start, k_end) - try: - _, ext, size = url_info(v_url) - except: - break - v_urls.append(v_url) - total_size += size - k_start = k_end + 1 - - print_info(site_info, title, ext, total_size) - if not info_only: - download_urls(v_urls, title, ext, total_size, output_dir, merge=merge, headers=fake_headers) - -def xiami_download(url, output_dir='.', merge=True, info_only=False, **kwargs): -#albums - if re.match(r'http://www.xiami.com/album/\d+', url): - id = r1(r'http://www.xiami.com/album/(\d+)', url) - xiami_download_album(id, output_dir, info_only) - elif re.match(r'http://www.xiami.com/album/\w+', url): - page = get_content(url, headers=fake_headers) - album_id = re.search(r'rel="canonical"\s+href="http://www.xiami.com/album/([^"]+)"', page).group(1) - xiami_download_album(album_id, output_dir, info_only) - -#collections - if re.match(r'http://www.xiami.com/collect/\d+', url): - id = r1(r'http://www.xiami.com/collect/(\d+)', url) - xiami_download_showcollect(id, output_dir, info_only) - -#single track - if re.match(r'http://www.xiami.com/song/\d+\b', url): - id = r1(r'http://www.xiami.com/song/(\d+)', url) - xiami_download_song(id, output_dir, info_only) - elif re.match(r'http://www.xiami.com/song/\w+', url): - html = get_content(url, headers=fake_headers) - id = r1(r'rel="canonical" href="http://www.xiami.com/song/([^"]+)"', html) - xiami_download_song(id, output_dir, info_only) - - if re.match('http://www.xiami.com/song/detail/id/\d+', url): - id = r1(r'http://www.xiami.com/song/detail/id/(\d+)', url) - xiami_download_song(id, output_dir, info_only) - - if re.match('http://www.xiami.com/mv', url): - xiami_download_mv(url, output_dir, merge=merge, info_only=info_only) - -site_info = "Xiami.com" -download = xiami_download -download_playlist = playlist_not_supported("xiami") diff --git a/src/you_get/extractors/xinpianchang.py b/src/you_get/extractors/xinpianchang.py index fac3d01f..1121550c 100644 --- a/src/you_get/extractors/xinpianchang.py +++ b/src/you_get/extractors/xinpianchang.py @@ -20,7 +20,7 @@ class Xinpianchang(VideoExtractor): def prepare(self, **kwargs): # find key page_content = get_content(self.url) - match_rule = r"vid: \"(.+?)\"," + match_rule = r"vid = \"(.+?)\";" key = re.findall(match_rule, page_content)[0] # get videos info diff --git a/src/you_get/extractors/yinyuetai.py b/src/you_get/extractors/yinyuetai.py deleted file mode 100644 index 6c39540f..00000000 --- a/src/you_get/extractors/yinyuetai.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python - -__all__ = ['yinyuetai_download', 'yinyuetai_download_by_id'] - -from ..common import * - -def yinyuetai_download_by_id(vid, title=None, output_dir='.', merge=True, info_only=False): - video_info = json.loads(get_html('http://www.yinyuetai.com/insite/get-video-info?json=true&videoId=%s' % vid)) - url_models = video_info['videoInfo']['coreVideoInfo']['videoUrlModels'] - url_models = sorted(url_models, key=lambda i: i['qualityLevel']) - url = url_models[-1]['videoUrl'] - type = ext = r1(r'\.(flv|mp4)', url) - _, _, size = url_info(url) - - print_info(site_info, title, type, size) - if not info_only: - download_urls([url], title, ext, size, output_dir, merge = merge) - -def yinyuetai_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - id = r1(r'http://\w+.yinyuetai.com/video/(\d+)', url) or \ - r1(r'http://\w+.yinyuetai.com/video/h5/(\d+)', url) - if not id: - yinyuetai_download_playlist(url, 
output_dir=output_dir, merge=merge, info_only=info_only) - return - - html = get_html(url, 'utf-8') - title = r1(r'<meta property="og:title"\s+content="([^"]+)"/>', html) or r1(r'<title>(.*)', html) - assert title - title = parse.unquote(title) - title = escape_file_path(title) - yinyuetai_download_by_id(id, title, output_dir, merge=merge, info_only=info_only) - -def yinyuetai_download_playlist(url, output_dir='.', merge=True, info_only=False, **kwargs): - playlist = r1(r'http://\w+.yinyuetai.com/playlist/(\d+)', url) - html = get_html(url) - data_ids = re.findall(r'data-index="\d+"\s*data-id=(\d+)', html) - for data_id in data_ids: - yinyuetai_download('http://v.yinyuetai.com/video/' + data_id, - output_dir=output_dir, merge=merge, info_only=info_only) - -site_info = "YinYueTai.com" -download = yinyuetai_download -download_playlist = yinyuetai_download_playlist diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index d7107eca..7a6fb2fc 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -41,7 +41,6 @@ class Youku(VideoExtractor): mobile_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36' dispatcher_url = 'vali.cp31.ott.cibntv.net' - # Last updated: 2017-10-13 stream_types = [ {'id': 'hd3', 'container': 'flv', 'video_profile': '1080P'}, {'id': 'hd3v2', 'container': 'flv', 'video_profile': '1080P'}, @@ -78,7 +77,7 @@ class Youku(VideoExtractor): self.api_error_code = None self.api_error_msg = None - self.ccode = '0519' + self.ccode = '0564' # Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND' @@ -243,7 +242,7 @@ class Youku(VideoExtractor): def youku_download_playlist_by_url(url, **kwargs): video_page_pt = 'https?://v.youku.com/v_show/id_([A-Za-z0-9=]+)' - js_cb_pt = '\(({.+})\)' + js_cb_pt = r'\(({.+})\)' if re.match(video_page_pt, url): youku_obj = Youku() youku_obj.url = url @@ -273,14 +272,14 @@ def youku_download_playlist_by_url(url, **kwargs): page = get_content(url) show_id = re.search(r'showid:"(\d+)"', page).group(1) ep = 'http://list.youku.com/show/module?id={}&tab=showInfo&callback=jQuery'.format(show_id) - xhr_page = get_content(ep).replace('\/', '/').replace('\"', '"') + xhr_page = get_content(ep).replace(r'\/', '/').replace(r'\"', '"') video_url = re.search(r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)', xhr_page).group(1) youku_download_playlist_by_url('http://'+video_url, **kwargs) return - elif re.match('https?://list.youku.com/albumlist/show/id_(\d+)\.html', url): + elif re.match(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html', url): # http://list.youku.com/albumlist/show/id_2336634.html # UGC playlist - list_id = re.search('https?://list.youku.com/albumlist/show/id_(\d+)\.html', url).group(1) + list_id = re.search(r'https?://list.youku.com/albumlist/show/id_(\d+)\.html', url).group(1) ep = 'http://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=tuijsonp6' first_u = ep.format(list_id, 1) @@ -295,7 +294,7 @@ def youku_download_playlist_by_url(url, **kwargs): for i in range(2, req_cnt+2): req_u = ep.format(list_id, i) xhr_page = get_content(req_u) - json_data 
= json.loads(re.search(js_cb_pt, xhr_page).group(1).replace('\/', '/')) + json_data = json.loads(re.search(js_cb_pt, xhr_page).group(1).replace(r'\/', '/')) xhr_html = json_data['html'] page_videos = re.findall(r'(v.youku.com/v_show/id_(?:[A-Za-z0-9=]+)\.html)', xhr_html) v_urls.extend(page_videos) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 4f3a947e..fe064199 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -3,6 +3,13 @@ from ..common import * from ..extractor import VideoExtractor +try: + import dukpy +except ImportError: + log.e('Please install dukpy in order to extract videos from YouTube:') + log.e('$ pip install dukpy') + exit(0) +from urllib.parse import urlparse, parse_qs, urlencode from xml.dom.minidom import parseString class YouTube(VideoExtractor): @@ -68,40 +75,33 @@ class YouTube(VideoExtractor): 'audio_encoding': 'AAC', 'audio_bitrate': '24'}, ] - def decipher(js, s): - # Examples: - # - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js - # - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js - # - https://www.youtube.com/yts/jsbin/player-vfls4aurX/da_DK/base.js - # - https://www.youtube.com/yts/jsbin/player_ias-vfl_RGK2l/en_US/base.js - # - https://www.youtube.com/yts/jsbin/player-vflRjqq_w/da_DK/base.js - # - https://www.youtube.com/yts/jsbin/player_ias-vfl-jbnrr/da_DK/base.js - def tr_js(code): - code = re.sub(r'function', r'def', code) - code = re.sub(r'(\W)(as|if|in|is|or)\(', r'\1_\2(', code) - code = re.sub(r'\$', '_dollar', code) - code = re.sub(r'\{', r':\n\t', code) - code = re.sub(r'\}', r'\n', code) - code = re.sub(r'var\s+', r'', code) - code = re.sub(r'(\w+).join\(""\)', r'"".join(\1)', code) - code = re.sub(r'(\w+).length', r'len(\1)', code) - code = re.sub(r'(\w+).slice\((\w+)\)', r'\1[\2:]', code) - code = re.sub(r'(\w+).splice\((\w+),(\w+)\)', r'del \1[\2:\2+\3]', code) - code = re.sub(r'(\w+).split\(""\)', r'list(\1)', code) - return code + def dethrottle(js, url): + def n_to_n(js, n): + # Examples: + # yma - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js + # Xka - https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js + # jma - https://www.youtube.com/s/player/8d9f6215/player_ias.vflset/sv_SE/base.js + f1 = match1(js, r',[$\w]+\.length\|\|([$\w]+)\(""\)\)}};') + f1def = match1(js, r'\W%s=(function\(\w+\).+?\)});' % re.escape(f1)) + n = dukpy.evaljs('(%s)("%s")' % (f1def, n)) + return n - js = js.replace('\n', ' ') - f1 = match1(js, r'\.set\(\w+\.sp,encodeURIComponent\(([$\w]+)') or \ - match1(js, r'\.set\(\w+\.sp,\(0,window\.encodeURIComponent\)\(([$\w]+)') or \ - match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \ - match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)') or \ - match1(js, r'=([$\w]+)\(decodeURIComponent\(') - f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \ - match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) - f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) - f1def = 'function main_%s%s' % (f1, f1def) # prefix to avoid potential namespace conflict - code = tr_js(f1def) - f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def)) + u = urlparse(url) + qs = parse_qs(u.query) + n = n_to_n(js, qs['n'][0]) + qs['n'] = [n] + return u._replace(query=urlencode(qs, doseq=True)).geturl() + + def s_to_sig(js, s): + # Examples: + # BPa - https://www.youtube.com/s/player/84314bef/player_ias.vflset/en_US/base.js + # Xva - 
https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/sv_SE/base.js + js_code = '' + f1 = match1(js, r'=([$\w]+)\(decodeURIComponent\(') + f1def = match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) + f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) # remove . prefix + f1def = 'function %s%s' % (f1, f1def) + f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def)) # find all invoked function names for f2 in f2s: f2e = re.escape(f2) f2def = re.search(r'[^$\w]%s:function\((\w+,\w+)\)(\{[^\{\}]+\})' % f2e, js) @@ -110,15 +110,10 @@ class YouTube(VideoExtractor): else: f2def = re.search(r'[^$\w]%s:function\((\w+)\)(\{[^\{\}]+\})' % f2e, js) f2def = 'function {}({},b){}'.format(f2e, f2def.group(1), f2def.group(2)) - f2 = re.sub(r'(as|if|in|is|or)', r'_\1', f2) - f2 = re.sub(r'\$', '_dollar', f2) - code = code + 'global %s\n' % f2 + tr_js(f2def) - - f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1) - f1 = re.sub(r'\$', '_dollar', f1) - code = code + 'sig=main_%s(s)' % f1 # prefix to avoid potential namespace conflict - exec(code, globals(), locals()) - return locals()['sig'] + js_code += f2def + ';' + js_code += f1def + ';%s("%s")' % (f1, s) + sig = dukpy.evaljs(js_code) + return sig def chunk_by_range(url, size): urls = [] @@ -138,6 +133,7 @@ class YouTube(VideoExtractor): """ return match1(url, r'youtu\.be/([^?/]+)') or \ match1(url, r'youtube\.com/embed/([^/?]+)') or \ + match1(url, r'youtube\.com/shorts/([^/?]+)') or \ match1(url, r'youtube\.com/v/([^/?]+)') or \ match1(url, r'youtube\.com/watch/([^/?]+)') or \ parse_query_param(url, 'v') or \ @@ -157,36 +153,41 @@ class YouTube(VideoExtractor): log.wtf('[Failed] Unsupported URL pattern.') video_page = get_content('https://www.youtube.com/playlist?list=%s' % playlist_id) - from html.parser import HTMLParser - videos = sorted([HTMLParser().unescape(video) - for video in re.findall(r'<a href="(/watch\?[^"]+)"', video_page) - if parse_query_param(video, 'index')], - key=lambda video: parse_query_param(video, 'index')) + playlist_json_serialized = match1(video_page, r'window\["ytInitialData"\]\s*=\s*(.+);', r'var\s+ytInitialData\s*=\s*([^;]+);') - # Parse browse_ajax page for more videos to load - load_more_href = match1(video_page, r'data-uix-load-more-href="([^"]+)"') - while load_more_href: - browse_ajax = get_content('https://www.youtube.com/%s' % load_more_href) - browse_data = json.loads(browse_ajax) - load_more_widget_html = browse_data['load_more_widget_html'] - content_html = browse_data['content_html'] - vs = set(re.findall(r'href="(/watch\?[^"]+)"', content_html)) - videos += sorted([HTMLParser().unescape(video) - for video in list(vs) - if parse_query_param(video, 'index')]) - load_more_href = match1(load_more_widget_html, r'data-uix-load-more-href="([^"]+)"') + if len(playlist_json_serialized) == 0: + log.wtf('[Failed] Unable to extract playlist data') + + ytInitialData = json.loads(playlist_json_serialized[0]) + + tab0 = ytInitialData['contents']['twoColumnBrowseResultsRenderer']['tabs'][0] + itemSection0 = tab0['tabRenderer']['content']['sectionListRenderer']['contents'][0] + playlistVideoList0 = itemSection0['itemSectionRenderer']['contents'][0] + videos = playlistVideoList0['playlistVideoListRenderer']['contents'] self.title = re.search(r'<meta name="title" content="([^"]+)"', video_page).group(1) self.p_playlist() - for video in videos: - vid = parse_query_param(video, 'v') - index = parse_query_param(video, 'index') + for index, video in enumerate(videos, 1): + vid = 
video['playlistVideoRenderer']['videoId'] try: self.__class__().download_by_url(self.__class__.get_url_from_vid(vid), index=index, **kwargs) except: pass + # FIXME: show DASH stream sizes (by default) for playlist videos + + def check_playability_response(self, ytInitialPlayerResponse): + STATUS_OK = "OK" + + playerResponseStatus = ytInitialPlayerResponse["playabilityStatus"]["status"] + if playerResponseStatus != STATUS_OK: + reason = ytInitialPlayerResponse["playabilityStatus"].get("reason", "") + raise AssertionError( + f"Server refused to provide video details. Returned status: {playerResponseStatus}, reason: {reason}." + ) def prepare(self, **kwargs): + self.ua = 'Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.103 Mobile Safari/537.36' + assert self.url or self.vid if not self.vid and self.url: @@ -196,152 +197,72 @@ class YouTube(VideoExtractor): self.download_playlist_by_url(self.url, **kwargs) exit(0) - if re.search('\Wlist=', self.url) and not kwargs.get('playlist'): + if re.search(r'\Wlist=', self.url) and not kwargs.get('playlist'): log.w('This video is from a playlist. (use --playlist to download all videos in the playlist.)') - # Get video info - # 'eurl' is a magic parameter that can bypass age restriction - # full form: 'eurl=https%3A%2F%2Fyoutube.googleapis.com%2Fv%2F{VIDEO_ID}' - video_info = parse.parse_qs(get_content('https://www.youtube.com/get_video_info?video_id={}&eurl=https%3A%2F%2Fy'.format(self.vid))) - logging.debug('STATUS: %s' % video_info['status'][0]) + # Extract from video page + logging.debug('Extracting from the video page...') + video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid, headers={'User-Agent': self.ua}) - ytplayer_config = None - if 'status' not in video_info: - log.wtf('[Failed] Unknown status.', exit_code=None) - raise - elif video_info['status'] == ['ok']: - if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']: - self.title = parse.unquote_plus(json.loads(video_info["player_response"][0])["videoDetails"]["title"]) - # Parse video page (for DASH) - video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - try: - ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) - self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] - # Workaround: get_video_info returns bad s. Why? 
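The key change in this extractor is visible in `dethrottle` and `s_to_sig` above: instead of translating the player's obfuscated JavaScript into Python line by line (the deleted `tr_js`/`decipher` approach), the relevant function definitions are now cut out of `base.js` with regexes, concatenated with a single call expression, and handed to dukpy's embedded JavaScript interpreter. The pattern in isolation, with a made-up stand-in for the extracted player code:

```python
import dukpy  # the new hard dependency introduced at the top of this file

# Pretend these two definitions were pulled out of base.js by match1().
helper = ('var Xq={reverse:function(a){a.reverse()},'
          'swap:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c}};')
main = 'function sig(a){a=a.split("");Xq.reverse(a);Xq.swap(a,3);return a.join("")};'

# Concatenate the definitions plus one invocation and let Duktape run it;
# the value of the last expression comes back to Python.
print(dukpy.evaljs(helper + main + 'sig("abcdef")'))  # -> 'cedfba'
```

The same one-shot evaluation backs both the signature solver (`s_to_sig`) and the throttling parameter (`n_to_n` inside `dethrottle`), which avoids the brittle JS-to-Python regex transpilation entirely.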
- if 'url_encoded_fmt_stream_map' not in ytplayer_config['args']: - stream_list = json.loads(ytplayer_config['args']['player_response'])['streamingData']['formats'] - else: - stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') - #stream_list = ytplayer_config['args']['adaptive_fmts'].split(',') - except: - if 'url_encoded_fmt_stream_map' not in video_info: - stream_list = json.loads(video_info['player_response'][0])['streamingData']['formats'] - else: - stream_list = video_info['url_encoded_fmt_stream_map'][0].split(',') - if re.search('([^"]*/base\.js)"', video_page): - self.html5player = 'https://www.youtube.com' + re.search('([^"]*/base\.js)"', video_page).group(1) - else: - self.html5player = None + try: + jsUrl = re.search(r'([^"]*/base\.js)"', video_page).group(1) + except: + log.wtf('[Failed] Unable to find base.js on the video page') + self.html5player = 'https://www.youtube.com' + jsUrl + logging.debug('Retrieving the player code...') + self.js = get_content(self.html5player).replace('\n', ' ') - else: - # Parse video page instead - video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) + logging.debug('Loading ytInitialPlayerResponse...') + ytInitialPlayerResponse = json.loads(re.search(r'ytInitialPlayerResponse\s*=\s*([^\n]+?});(\n|</script>|var )', video_page).group(1)) + self.check_playability_response(ytInitialPlayerResponse) - self.title = json.loads(ytplayer_config["args"]["player_response"])["videoDetails"]["title"] - self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] - stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') + # Get the video title + self.title = ytInitialPlayerResponse["videoDetails"]["title"] - elif video_info['status'] == ['fail']: - logging.debug('ERRORCODE: %s' % video_info['errorcode'][0]) - if video_info['errorcode'] == ['150']: - # FIXME: still relevant? - if cookies: - # Load necessary cookies into headers (for age-restricted videos) - consent, ssid, hsid, sid = 'YES', '', '', '' - for cookie in cookies: - if cookie.domain.endswith('.youtube.com'): - if cookie.name == 'SSID': - ssid = cookie.value - elif cookie.name == 'HSID': - hsid = cookie.value - elif cookie.name == 'SID': - sid = cookie.value - cookie_str = 'CONSENT=%s; SSID=%s; HSID=%s; SID=%s' % (consent, ssid, hsid, sid) + # Check the status + playabilityStatus = ytInitialPlayerResponse['playabilityStatus'] + status = playabilityStatus['status'] + logging.debug('status: %s' % status) + if status != 'OK': + # If cookies are loaded, status should be OK + try: + subreason = playabilityStatus['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'][0]['text'] + log.e('[Error] %s (%s)' % (playabilityStatus['reason'], subreason)) + except: + log.e('[Error] %s' % playabilityStatus['reason']) + if status == 'LOGIN_REQUIRED': + log.e('View the video from a browser and export the cookies, then use --cookies to load cookies.') + exit(1) - video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid, - headers={'Cookie': cookie_str}) - else: - video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) - - try: - ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+});ytplayer', video_page).group(1)) - except: - msg = re.search('class="message">([^<]+)<', video_page).group(1) - log.wtf('[Failed] Got message "%s". Try to login with --cookies.' 
% msg.strip()) - - if 'title' in ytplayer_config['args']: - # 150 Restricted from playback on certain sites - # Parse video page instead - self.title = ytplayer_config['args']['title'] - self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] - stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') - else: - log.wtf('[Error] The uploader has not made this video available in your country.', exit_code=None) - raise - #self.title = re.search('<meta name="title" content="([^"]+)"', video_page).group(1) - #stream_list = [] - - elif video_info['errorcode'] == ['100']: - log.wtf('[Failed] This video does not exist.', exit_code=None) #int(video_info['errorcode'][0]) - raise - - else: - log.wtf('[Failed] %s' % video_info['reason'][0], exit_code=None) #int(video_info['errorcode'][0]) - raise - - else: - log.wtf('[Failed] Invalid status.', exit_code=None) - raise - - # YouTube Live - if ytplayer_config and (ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1'): - if 'hlsvp' in ytplayer_config['args']: - hlsvp = ytplayer_config['args']['hlsvp'] - else: - player_response= json.loads(ytplayer_config['args']['player_response']) - log.e('[Failed] %s' % player_response['playabilityStatus']['reason'], exit_code=1) - - if 'info_only' in kwargs and kwargs['info_only']: - return - else: - download_url_ffmpeg(hlsvp, self.title, 'mp4') - exit(0) + stream_list = ytInitialPlayerResponse['streamingData']['formats'] for stream in stream_list: - if isinstance(stream, str): - metadata = parse.parse_qs(stream) - stream_itag = metadata['itag'][0] - self.streams[stream_itag] = { - 'itag': metadata['itag'][0], - 'url': metadata['url'][0], - 'sig': metadata['sig'][0] if 'sig' in metadata else None, - 's': metadata['s'][0] if 's' in metadata else None, - 'quality': metadata['quality'][0] if 'quality' in metadata else None, - #'quality': metadata['quality_label'][0] if 'quality_label' in metadata else None, - 'type': metadata['type'][0], - 'mime': metadata['type'][0].split(';')[0], - 'container': mime_to_container(metadata['type'][0].split(';')[0]), - } + logging.debug('Found format: itag=%s' % stream['itag']) + if 'signatureCipher' in stream: + logging.debug(' Parsing signatureCipher for itag=%s...' 
% stream['itag']) + qs = parse_qs(stream['signatureCipher']) + #logging.debug(qs) + sp = qs['sp'][0] + sig = self.__class__.s_to_sig(self.js, qs['s'][0]) + url = qs['url'][0] + '&{}={}'.format(sp, sig) + elif 'url' in stream: + url = stream['url'] else: - stream_itag = str(stream['itag']) - self.streams[stream_itag] = { - 'itag': str(stream['itag']), - 'url': stream['url'] if 'url' in stream else None, - 'sig': None, - 's': None, - 'quality': stream['quality'], - 'type': stream['mimeType'], - 'mime': stream['mimeType'].split(';')[0], - 'container': mime_to_container(stream['mimeType'].split(';')[0]), - } - if 'signatureCipher' in stream: - self.streams[stream_itag].update(dict([(_.split('=')[0], parse.unquote(_.split('=')[1])) - for _ in stream['signatureCipher'].split('&')])) + log.wtf(' No signatureCipher or url for itag=%s' % stream['itag']) + url = self.__class__.dethrottle(self.js, url) - # Prepare caption tracks + self.streams[str(stream['itag'])] = { + 'itag': str(stream['itag']), + 'url': url, + 'quality': stream['quality'], + 'type': stream['mimeType'], + 'mime': stream['mimeType'].split(';')[0], + 'container': mime_to_container(stream['mimeType'].split(';')[0]), + } + + # FIXME: Prepare caption tracks try: - caption_tracks = json.loads(ytplayer_config['args']['player_response'])['captions']['playerCaptionsTracklistRenderer']['captionTracks'] + caption_tracks = ytInitialPlayerResponse['captions']['playerCaptionsTracklistRenderer']['captionTracks'] for ct in caption_tracks: ttsurl, lang = ct['baseUrl'], ct['languageCode'] @@ -367,149 +288,72 @@ class YouTube(VideoExtractor): srt += '%s --> %s\n' % (start, finish) srt += '%s\n\n' % content - self.caption_tracks[lang] = srt + if 'kind' in ct: + self.caption_tracks[ct['vssId']] = srt # autogenerated + else: + self.caption_tracks[lang] = srt except: pass - # Prepare DASH streams (NOTE: not every video has DASH streams!) 
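For restricted formats, `signatureCipher` is itself a URL-encoded query string with three fields: `s` (the scrambled signature), `sp` (the parameter name the deciphered value must be sent back under, typically `sig`), and `url` (the bare stream URL). The decoding step used twice in this file, shown standalone (`solver` stands in for the dukpy-backed `s_to_sig(js, s)`):

```python
from urllib.parse import parse_qs

def decode_signature_cipher(signature_cipher: str, solver) -> str:
    """Rebuild a playable stream URL from a signatureCipher blob."""
    qs = parse_qs(signature_cipher)  # parse_qs also percent-decodes values
    sp = qs['sp'][0]                 # e.g. 'sig'
    sig = solver(qs['s'][0])         # run the player's unscramble function
    return '%s&%s=%s' % (qs['url'][0], sp, sig)
```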
- try: - dashmpd = ytplayer_config['args']['dashmpd'] - dash_xml = parseString(get_content(dashmpd)) - for aset in dash_xml.getElementsByTagName('AdaptationSet'): - mimeType = aset.getAttribute('mimeType') - if mimeType == 'audio/mp4': - rep = aset.getElementsByTagName('Representation')[-1] - burls = rep.getElementsByTagName('BaseURL') - dash_mp4_a_url = burls[0].firstChild.nodeValue - dash_mp4_a_size = burls[0].getAttribute('yt:contentLength') - if not dash_mp4_a_size: - try: dash_mp4_a_size = url_size(dash_mp4_a_url) - except: continue - elif mimeType == 'audio/webm': - rep = aset.getElementsByTagName('Representation')[-1] - burls = rep.getElementsByTagName('BaseURL') - dash_webm_a_url = burls[0].firstChild.nodeValue - dash_webm_a_size = burls[0].getAttribute('yt:contentLength') - if not dash_webm_a_size: - try: dash_webm_a_size = url_size(dash_webm_a_url) - except: continue - elif mimeType == 'video/mp4': - for rep in aset.getElementsByTagName('Representation'): - w = int(rep.getAttribute('width')) - h = int(rep.getAttribute('height')) - itag = rep.getAttribute('id') - burls = rep.getElementsByTagName('BaseURL') - dash_url = burls[0].firstChild.nodeValue - dash_size = burls[0].getAttribute('yt:contentLength') - if not dash_size: - try: dash_size = url_size(dash_url) - except: continue - dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) - dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size)) - self.dash_streams[itag] = { - 'quality': '%sx%s' % (w, h), - 'itag': itag, - 'type': mimeType, - 'mime': mimeType, - 'container': 'mp4', - 'src': [dash_urls, dash_mp4_a_urls], - 'size': int(dash_size) + int(dash_mp4_a_size) - } - elif mimeType == 'video/webm': - for rep in aset.getElementsByTagName('Representation'): - w = int(rep.getAttribute('width')) - h = int(rep.getAttribute('height')) - itag = rep.getAttribute('id') - burls = rep.getElementsByTagName('BaseURL') - dash_url = burls[0].firstChild.nodeValue - dash_size = burls[0].getAttribute('yt:contentLength') - if not dash_size: - try: dash_size = url_size(dash_url) - except: continue - dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) - dash_webm_a_urls = self.__class__.chunk_by_range(dash_webm_a_url, int(dash_webm_a_size)) - self.dash_streams[itag] = { - 'quality': '%sx%s' % (w, h), - 'itag': itag, - 'type': mimeType, - 'mime': mimeType, - 'container': 'webm', - 'src': [dash_urls, dash_webm_a_urls], - 'size': int(dash_size) + int(dash_webm_a_size) - } - except: - # VEVO - if not self.html5player: return - self.html5player = self.html5player.replace('\/', '/') # unescape URL (for age-restricted videos) - self.js = get_content(self.html5player) + # Prepare DASH streams + if 'adaptiveFormats' in ytInitialPlayerResponse['streamingData']: + streams = ytInitialPlayerResponse['streamingData']['adaptiveFormats'] - try: - # Video info from video page (not always available) - streams = [dict([(i.split('=')[0], - parse.unquote(i.split('=')[1])) - for i in afmt.split('&')]) - for afmt in ytplayer_config['args']['adaptive_fmts'].split(',')] - except: - if 'adaptive_fmts' in video_info: - streams = [dict([(i.split('=')[0], - parse.unquote(i.split('=')[1])) - for i in afmt.split('&')]) - for afmt in video_info['adaptive_fmts'][0].split(',')] + # FIXME: dead code? 
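Both the deleted MPD parser above and the new `adaptiveFormats` code below pass each stream URL through `chunk_by_range` before downloading, so one large DASH file becomes a list of byte-range requests that are small and individually retryable. The helper is defined earlier in this file and not shown in this hunk; a plausible reading of what it does, with an illustrative chunk size:

```python
def chunk_by_range(url: str, size: int, chunk: int = 10 * 1024 * 1024) -> list:
    """Split one stream URL into '&range=start-end' URLs covering [0, size)."""
    urls = []
    for start in range(0, size, chunk):
        end = min(start + chunk, size) - 1  # byte ranges are inclusive
        urls.append('%s&range=%d-%d' % (url, start, end))
    return urls

# chunk_by_range('https://example.invalid/videoplayback?itag=137', 25_000_000)
# -> ['...&range=0-10485759', '...&range=10485760-20971519',
#     '...&range=20971520-24999999']
```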
+ # streams without contentLength got broken urls, just remove them (#2767) + streams = [stream for stream in streams if 'contentLength' in stream] + + for stream in streams: + logging.debug('Found adaptiveFormat: itag=%s' % stream['itag']) + stream['itag'] = str(stream['itag']) + if 'qualityLabel' in stream: + stream['quality_label'] = stream['qualityLabel'] + del stream['qualityLabel'] + logging.debug(' quality_label: \t%s' % stream['quality_label']) + if 'width' in stream: + stream['size'] = '{}x{}'.format(stream['width'], stream['height']) + del stream['width'] + del stream['height'] + logging.debug(' size: \t%s' % stream['size']) + stream['type'] = stream['mimeType'] + logging.debug(' type: \t%s' % stream['type']) + stream['clen'] = stream['contentLength'] + stream['init'] = '{}-{}'.format( + stream['initRange']['start'], + stream['initRange']['end']) + stream['index'] = '{}-{}'.format( + stream['indexRange']['start'], + stream['indexRange']['end']) + del stream['mimeType'] + del stream['contentLength'] + del stream['initRange'] + del stream['indexRange'] + + if 'signatureCipher' in stream: + logging.debug(' Parsing signatureCipher for itag=%s...' % stream['itag']) + qs = parse_qs(stream['signatureCipher']) + #logging.debug(qs) + sp = qs['sp'][0] + sig = self.__class__.s_to_sig(self.js, qs['s'][0]) + url = qs['url'][0] + '&ratebypass=yes&{}={}'.format(sp, sig) + elif 'url' in stream: + url = stream['url'] else: - try: - streams = json.loads(video_info['player_response'][0])['streamingData']['adaptiveFormats'] - except: # no DASH stream at all - return - # streams without contentLength got broken urls, just remove them (#2767) - streams = [stream for stream in streams if 'contentLength' in stream] - for stream in streams: - stream['itag'] = str(stream['itag']) - if 'qualityLabel' in stream: - stream['quality_label'] = stream['qualityLabel'] - del stream['qualityLabel'] - if 'width' in stream: - stream['size'] = '{}x{}'.format(stream['width'], stream['height']) - del stream['width'] - del stream['height'] - stream['type'] = stream['mimeType'] - stream['clen'] = stream['contentLength'] - stream['init'] = '{}-{}'.format( - stream['initRange']['start'], - stream['initRange']['end']) - stream['index'] = '{}-{}'.format( - stream['indexRange']['start'], - stream['indexRange']['end']) - del stream['mimeType'] - del stream['contentLength'] - del stream['initRange'] - del stream['indexRange'] - if 'signatureCipher' in stream: - stream.update(dict([(_.split('=')[0], parse.unquote(_.split('=')[1])) - for _ in stream['signatureCipher'].split('&')])) - del stream['signatureCipher'] + log.wtf('No signatureCipher or url for itag=%s' % stream['itag']) + url = self.__class__.dethrottle(self.js, url) + stream['url'] = url - for stream in streams: # get over speed limiting - stream['url'] += '&ratebypass=yes' for stream in streams: # audio if stream['type'].startswith('audio/mp4'): dash_mp4_a_url = stream['url'] - if 's' in stream: - sig = self.__class__.decipher(self.js, stream['s']) - dash_mp4_a_url += '&sig={}'.format(sig) dash_mp4_a_size = stream['clen'] elif stream['type'].startswith('audio/webm'): dash_webm_a_url = stream['url'] - if 's' in stream: - sig = self.__class__.decipher(self.js, stream['s']) - dash_webm_a_url += '&sig={}'.format(sig) dash_webm_a_size = stream['clen'] for stream in streams: # video if 'size' in stream: if stream['type'].startswith('video/mp4'): mimeType = 'video/mp4' dash_url = stream['url'] - if 's' in stream: - sig = self.__class__.decipher(self.js, stream['s']) - 
dash_url += '&sig={}'.format(sig) dash_size = stream['clen'] itag = stream['itag'] dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) @@ -526,9 +370,6 @@ class YouTube(VideoExtractor): elif stream['type'].startswith('video/webm'): mimeType = 'video/webm' dash_url = stream['url'] - if 's' in stream: - sig = self.__class__.decipher(self.js, stream['s']) - dash_url += '&sig={}'.format(sig) dash_size = stream['clen'] itag = stream['itag'] audio_url = None @@ -569,15 +410,6 @@ class YouTube(VideoExtractor): if stream_id in self.streams: src = self.streams[stream_id]['url'] - if self.streams[stream_id]['sig'] is not None: - sig = self.streams[stream_id]['sig'] - src += '&sig={}'.format(sig) - elif self.streams[stream_id]['s'] is not None: - if not hasattr(self, 'js'): - self.js = get_content(self.html5player) - s = self.streams[stream_id]['s'] - sig = self.__class__.decipher(self.js, s) - src += '&sig={}'.format(sig) self.streams[stream_id]['src'] = [src] self.streams[stream_id]['size'] = urls_size(self.streams[stream_id]['src']) diff --git a/src/you_get/extractors/zhihu.py b/src/you_get/extractors/zhihu.py index 64f81423..1dceef53 100644 --- a/src/you_get/extractors/zhihu.py +++ b/src/you_get/extractors/zhihu.py @@ -31,8 +31,8 @@ def zhihu_download(url, output_dir='.', merge=True, info_only=False, **kwargs): play_list = video_info["playlist"] # first High Definition - # second Second Standard Definition - # third ld. What is ld ? + # second Standard Definition + # third Low Definition # finally continue data = play_list.get("hd", play_list.get("sd", play_list.get("ld", None))) if not data: diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py index 11126c27..4bbbd177 100755 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -93,7 +93,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) - params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy'] params.extend(['--', output]) if subprocess.call(params, stdin=STDIN) == 0: @@ -128,7 +128,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): def ffmpeg_concat_ts_to_mkv(files, output='output.mkv'): print('Merging video parts... 
', end="", flush=True) - params = [FFMPEG] + LOGLEVEL + ['-isync', '-y', '-i'] + params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append('concat:') for file in files: if os.path.isfile(file): @@ -149,7 +149,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) - params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) @@ -175,7 +175,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): if FFMPEG == 'avconv': params += ['-c', 'copy'] else: - params += ['-c', 'copy', '-absf', 'aac_adtstoasc'] + params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) if subprocess.call(params, stdin=STDIN) == 0: @@ -203,7 +203,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): concat_list = generate_concat_list(files, output) - params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '0', '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) @@ -229,7 +229,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): if FFMPEG == 'avconv': params += ['-c', 'copy'] else: - params += ['-c', 'copy', '-absf', 'aac_adtstoasc'] + params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc'] params.extend(['--', output]) subprocess.check_call(params, stdin=STDIN) diff --git a/src/you_get/version.py b/src/you_get/version.py index e404e0c0..e0068208 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1456' +__version__ = '0.4.1730' diff --git a/tests/test.py b/tests/test.py index 00bd4cbb..2d220c62 100644 --- a/tests/test.py +++ b/tests/test.py @@ -10,13 +10,16 @@ from you_get.extractors import ( acfun, bilibili, soundcloud, - tiktok + tiktok, + twitter, + miaopai ) class YouGetTests(unittest.TestCase): def test_imgur(self): imgur.download('http://imgur.com/WVLk5nD', info_only=True) + imgur.download('https://imgur.com/we-should-have-listened-WVLk5nD', info_only=True) def test_magisto(self): magisto.download( @@ -24,45 +27,47 @@ class YouGetTests(unittest.TestCase): info_only=True ) - def test_youtube(self): - youtube.download( - 'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True - ) - youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) - youtube.download( - 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa - info_only=True - ) - youtube.download( - 'https://www.youtube.com/watch?v=Fpr4fQSh1cc', info_only=True - ) + #def test_youtube(self): + #youtube.download( + # 'http://www.youtube.com/watch?v=pzKerr0JIPA', info_only=True + #) + #youtube.download('http://youtu.be/pzKerr0JIPA', info_only=True) + #youtube.download( + # 'http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare', # noqa + # info_only=True + #) + #youtube.download( + # 'https://www.youtube.com/watch?v=oRdxUFDoQe0', info_only=True + #) def test_acfun(self): - acfun.download('https://www.acfun.cn/v/ac11701912', 
info_only=True) + acfun.download('https://www.acfun.cn/v/ac44560432', info_only=True) - def test_bilibil(self): - bilibili.download( - "https://www.bilibili.com/watchlater/#/BV1PE411q7mZ/p6", info_only=True - ) - bilibili.download( - "https://www.bilibili.com/watchlater/#/av74906671/p6", info_only=True - ) + #def test_bilibili(self): + #bilibili.download('https://www.bilibili.com/video/BV1sL4y177sC', info_only=True) - def test_soundcloud(self): + #def test_soundcloud(self): ## single song - soundcloud.download( - 'https://soundcloud.com/keiny-pham/impure-bird', info_only=True - ) + #soundcloud.download( + # 'https://soundcloud.com/keiny-pham/impure-bird', info_only=True + #) ## playlist #soundcloud.download( # 'https://soundcloud.com/anthony-flieger/sets/cytus', info_only=True #) - def tests_tiktok(self): - tiktok.download('https://www.tiktok.com/@nmb48_official/video/6850796940293164290', info_only=True) - tiktok.download('https://t.tiktok.com/i18n/share/video/6850796940293164290/', info_only=True) - tiktok.download('https://vt.tiktok.com/UGJR4R/', info_only=True) + def test_tiktok(self): + tiktok.download('https://www.tiktok.com/@zukky_48/video/7398162058153315605', info_only=True) + tiktok.download('https://www.tiktok.com/@/video/7398162058153315605', info_only=True) + tiktok.download('https://t.tiktok.com/i18n/share/video/7398162058153315605/', info_only=True) + tiktok.download('https://vt.tiktok.com/ZSYKjKt6M/', info_only=True) + def test_twitter(self): + twitter.download('https://twitter.com/elonmusk/status/1530516552084234244', info_only=True) + twitter.download('https://x.com/elonmusk/status/1530516552084234244', info_only=True) + + def test_weibo(self): + miaopai.download('https://video.weibo.com/show?fid=1034:4825403706245135', info_only=True) if __name__ == '__main__': unittest.main() diff --git a/you-get.json b/you-get.json index e98e2e8a..adf604dc 100644 --- a/you-get.json +++ b/you-get.json @@ -18,13 +18,12 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Internet", "Topic :: Internet :: WWW/HTTP", "Topic :: Multimedia",