diff --git a/.gitignore b/.gitignore index 0a5d13ab..1d987ed9 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ _*/ +*.bak *.download *.cmt.* *.3gp diff --git a/.travis.yml b/.travis.yml index c2a812c1..5ac5b86a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,4 +3,5 @@ language: python python: - "3.2" - "3.3" + - "3.4" script: make test diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 67cbb1fb..95b21973 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,6 +1,111 @@ Changelog ========= +0.3.29 +------ + +*Date: 2014-05-29* + +* Bug fix release + +0.3.28.3 +-------- + +*Date: 2014-05-18* + +* New site support: + - CBS.com + +0.3.28.2 +-------- + +*Date: 2014-04-13* + +* Bug fix release + +0.3.28.1 +-------- + +*Date: 2014-02-28* + +* Bug fix release + +0.3.28 +------ + +*Date: 2014-02-21* + +* New site support: + - Magisto.com + - VK.com + +0.3.27 +------ + +*Date: 2014-02-14* + +* Bug fix release + +0.3.26 +------ + +*Date: 2014-02-08* + +* New features: + - Play video in players (#286) + - LeTV support (#289) + - Youku 1080P support +* Bug fixes: + - YouTube (#282, #292) + - Sina (#246, #280) + - Mixcloud + - NetEase + - QQ + - Vine + +0.3.25 +------ + +*Date: 2013-12-20* + +* Bug fix release + +0.3.24 +------ + +*Date: 2013-10-30* + +* Experimental: Sogou proxy server +* Fix issues for: + - Vimeo + +0.3.23 +------ + +*Date: 2013-10-23* + +* Support YouTube playlists +* Support general short URLs +* Fix issues for: + - Sina + +0.3.22 +------ + +*Date: 2013-10-18* + +* Fix issues for: + - Baidu + - Bilibili + - JPopsuki TV + - Niconico + - PPTV + - TED + - Tumblr + - YinYueTai + - YouTube + - ... + 0.3.21 ------ diff --git a/README.md b/README.md index 52d5d30c..0509b98d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # You-Get -[![Build Status](https://api.travis-ci.org/soimort/you-get.png)](https://travis-ci.org/soimort/you-get) +[![Build Status](https://api.travis-ci.org/soimort/you-get.png)](https://travis-ci.org/soimort/you-get) [![PyPI version](https://badge.fury.io/py/you-get.png)](http://badge.fury.io/py/you-get) [You-Get](https://github.com/soimort/you-get) is a video downloader runs on Python 3. It aims at easing the download of videos on [YouTube](http://www.youtube.com), [Youku](http://www.youku.com)/[Tudou](http://www.tudou.com) (biggest online video providers in China), [ Niconico](http://www.nicovideo.jp), etc., in one script. @@ -8,6 +8,8 @@ See the project homepage for further documentat Fork me on GitHub: +__中文说明__已移至[wiki](https://github.com/soimort/you-get/wiki/%E4%B8%AD%E6%96%87%E8%AF%B4%E6%98%8E)。 + ## Features ### Supported Sites (As of Now) @@ -16,6 +18,7 @@ Fork me on GitHub: * Vimeo * Coursera * Blip +* CBS * Dailymotion * eHow * Facebook @@ -26,22 +29,25 @@ Fork me on GitHub: * Tumblr * Vine * Instagram +* Magisto * SoundCloud * Mixcloud * Freesound +* JPopsuki * VID48 * Niconico (ニコニコ動画) * Youku (优酷) * Tudou (土豆) * YinYueTai (音悦台) -* AcFun -* bilibili +* AcFun +* bilibili * CNTV (中国网络电视台) * Douban (豆瓣) * ifeng (凤凰视频) * iQIYI (爱奇艺) * Joy.cn (激动网) * Ku6 (酷6网) +* LeTV (乐视网) * MioMio * NetEase (网易视频) * PPTV @@ -55,87 +61,74 @@ Fork me on GitHub: * Baidu Wangpan (百度网盘) * SongTaste * Alive.in.th +* VK ## Dependencies * [Python 3](http://www.python.org/download/releases/) -* __(Optional)__ [FFmpeg](http://ffmpeg.org) - * Used for converting and joining video files. +* __(Optional)__ [FFmpeg](http://ffmpeg.org) / [Libav](http://libav.org/) + * For converting and joining video files. +* __(Optional)__ [RTMPDump](http://rtmpdump.mplayerhq.hu/) + * For processing RTMP streams. ## Installation -### 1. Install via [Pip](http://www.pip-installer.org/): +### 1. Install via Pip: + + $ [sudo] pip install you-get - $ pip install you-get - Check if the installation was successful: - + $ you-get -V -### 2. Install via [EasyInstall](http://pypi.python.org/pypi/setuptools): - - $ easy_install you-get - - Check if the installation was successful: - - $ you-get -V - -### 3. Install from Git: +### 2. Install from Git: $ git clone git://github.com/soimort/you-get.git - + Use the raw script without installation: - + $ cd you-get/ $ ./you-get -V - + To install the package into the system path, execute: - + $ make install - + Check if the installation was successful: - + $ you-get -V -### 4. Direct download (from ): - +### 3. Direct download (from ): + $ wget -O you-get.zip https://github.com/soimort/you-get/zipball/master $ unzip you-get.zip - + Use the raw script without installation: - + $ cd soimort-you-get-*/ $ ./you-get -V - + To install the package into the system path, execute: - + $ make install - + Check if the installation was successful: - + $ you-get -V -### 5. Install from [AUR (Arch User Repository)](http://aur.archlinux.org/): +### 4. Install from your distro's repo: - Click [here](https://aur.archlinux.org/packages.php\?ID=62576). +* __AUR (Arch)__: -### Upgrading: +* __Overlay (Gentoo)__: + +## Upgrading Using Pip: - $ pip install --upgrade you-get + $ [sudo] pip install --upgrade you-get -### FAQ (For Windows Users): - -* Q: I don't know how to install it on Windows. - -* A: Then don't do it. Just put your `you-get` folder into system `%PATH%`. - -* Q: I got something like `UnicodeDecodeError: 'gbk' codec can't decode byte 0xb0 in position 1012: illegal multibyte sequence`. - -* A: Run `set PYTHONIOENCODING=utf-8`. - -## Examples (For End-Users) +## Examples Display the information of the video without downloading: @@ -172,31 +165,25 @@ By default, Python will apply the system proxy settings (i.e. environment variab For a complete list of all available options, see: $ you-get --help + Usage: you-get [OPTION]... [URL]... -## Examples (For Developers) + Startup options: + -V | --version Display the version and exit. + -h | --help Print this help and exit. -In Python 3 (interactive): - - >>> from you_get.downloader import * - >>> youtube.download("http://www.youtube.com/watch?v=8bQlxQJEzLk", info_only = True) - Video Site: YouTube.com - Title: If you're good at something, never do it for free! - Type: WebM video (video/webm) - Size: 0.13 MB (133176 Bytes) - - >>> import you_get - >>> you_get.any_download("http://www.youtube.com/watch?v=sGwy8DsUJ4M") - Video Site: YouTube.com - Title: Mort from Madagascar LIKES - Type: WebM video (video/webm) - Size: 1.78 MB (1867072 Bytes) - - Downloading Mort from Madagascar LIKES.webm ... - 100.0% ( 1.8/1.8 MB) [========================================] 1/1 - -## API Reference - -See source code. + Download options (use with URLs): + -f | --force Force overwriting existed files. + -i | --info Display the information of videos without downloading. + -u | --url Display the real URLs of videos without downloading. + -n | --no-merge Don't merge video parts. + -c | --cookies Load NetScape's cookies.txt file. + -o | --output-dir Set the output directory for downloaded videos. + -p | --player Directly play the video with PLAYER like vlc/smplayer. + -x | --http-proxy Use specific HTTP proxy for downloading. + --no-proxy Don't use any proxy. (ignore $http_proxy) + -S | --sogou Use a Sogou proxy server for downloading. + --sogou-proxy Run a standalone Sogou proxy server. + --debug Show traceback on KeyboardInterrupt. ## License @@ -205,227 +192,3 @@ You-Get is licensed under the [MIT license](https://raw.github.com/soimort/you-g ## Contributing Please see [CONTRIBUTING.md](https://github.com/soimort/you-get/blob/master/CONTRIBUTING.md). - - - -*** - - - -# You-Get - 中文说明 - -[You-Get](https://github.com/soimort/you-get)是一个基于Python 3的视频下载工具。之所以写它的主要原因是,我找不到一个现成的下载工具能够同时支持[YouTube](http://www.youtube.com/)和[优酷](http://www.youku.com/);而且,几乎所有以前的视频下载程序都是基于Python 2的。 - -项目主页: - -GitHub地址: - -## 特点 - -### 说明 - -You-Get基于优酷下载脚本[iambus/youku-lixian](https://github.com/iambus/youku-lixian)用Python 3改写而成,增加了以下功能: - -* 支持YouTube、Vimeo等国外视频网站 -* 支持断点续传 -* 可设置HTTP代理 - -### 支持的站点(截至目前) - -已实现对以下站点的支持,以后会陆续增加(・∀・) - -* YouTube -* Vimeo -* Coursera -* Blip -* Dailymotion -* eHow -* Facebook -* Google+ -* Google Drive -* Khan Academy -* TED -* Tumblr -* Vine -* Instagram -* SoundCloud -* Mixcloud -* Freesound -* VID48 -* NICONICO动画 -* 优酷 -* 土豆 -* 音悦台 -* AcFun -* bilibili -* CNTV -* 豆瓣 -* 凤凰视频 -* 爱奇艺 -* 激动网 -* 酷6网 -* MioMio -* 网易视频 -* PPTV -* 腾讯视频 -* 新浪视频 -* 搜狐视频 -* 56网 -* 虾米 -* 5sing -* 百度音乐 -* 百度网盘 -* SongTaste -* Alive.in.th - -## 依赖 - -* [Python 3](http://www.python.org/download/releases/) -* __(可选)__ [FFmpeg](http://ffmpeg.org) - * 用于转换与合并视频文件。 - -## 安装说明 - -(以下命令格式均以Linux shell为例) - -### 1. 通过[Pip](http://www.pip-installer.org/)安装: - - $ pip install you-get - - 检查安装是否成功: - - $ you-get -V - -### 2. 通过[EasyInstall](http://pypi.python.org/pypi/setuptools)安装: - - $ easy_install you-get - - 检查安装是否成功: - - $ you-get -V - -### 3. 从Git安装: - - $ git clone git://github.com/soimort/you-get.git - - 在不安装的情况下直接使用脚本: - - $ cd you-get/ - $ ./you-get -V - - 若要将Python package安装到系统默认路径,执行: - - $ make install - - 检查安装是否成功: - - $ you-get -V - -### 4. 直接下载(从): - - $ wget -O you-get.zip https://github.com/soimort/you-get/zipball/master - $ unzip you-get.zip - - 在不安装的情况下直接使用脚本: - - $ cd soimort-you-get-*/ - $ ./you-get -V - - 若要将Python package安装到系统默认路径,执行: - - $ make install - - 检查安装是否成功: - - $ you-get -V - -### 5. 从[AUR (Arch User Repository)](http://aur.archlinux.org/)安装: - - 点击[这里](https://aur.archlinux.org/packages.php\?ID=62576)。 - -### 升级: - -使用Pip: - - $ pip install --upgrade you-get - -### FAQ(针对Windows用户): - -* Q:我不知道该如何在Windows下安装。 - -* A:不需要安装。直接把`you-get`目录放到系统`%PATH%`中。 - -* Q:出现错误提示`UnicodeDecodeError: 'gbk' codec can't decode byte 0xb0 in position 1012: illegal multibyte sequence`。 - -* A:执行`set PYTHONIOENCODING=utf-8`。 - -## 使用方法示例 - -### 如何下载视频 - -显示视频信息,但不进行下载(`-i`或`--info`选项): - - $ you-get -i http://www.yinyuetai.com/video/463772 - -下载视频: - - $ you-get http://www.yinyuetai.com/video/463772 - -下载多个视频: - - $ you-get http://www.yinyuetai.com/video/463772 http://www.yinyuetai.com/video/471500 - -若当前目录下已有与视频标题同名的文件,下载时会自动跳过。若有同名的`.download`临时文件,程序会从上次中断处开始下载。 -如要强制重新下载该视频,可使用`-f`(`--force`)选项: - - $ you-get -f http://www.yinyuetai.com/video/463772 - -`-l`(`--playlist`)选项用于下载播放列表(只对某些网站适用): - - $ you-get -l http://www.youku.com/playlist_show/id_5344313.html - -__注:从0.1.3以后的版本起,`-l`选项不再必须。You-Get可以自动识别并处理播放列表的下载。__ - -指定视频文件的下载目录: - - $ you-get -o ~/Downloads http://www.yinyuetai.com/video/463772 - -显示详细帮助: - - $ you-get -h - -### 如何设置代理 - -默认情况下,Python自动使用系统的代理配置。可以通过环境变量`http_proxy`来设置系统的HTTP代理。 - -`-x`(`--http-proxy`)选项用于手动指定You-Get所使用的HTTP代理。例如:GoAgent的代理服务器是`http://127.0.0.1:8087`,则通过该代理下载某YouTube视频的命令是: - - $ you-get -x 127.0.0.1:8087 http://www.youtube.com/watch?v=KbtO_Ayjw0M - -Windows下的自由门等翻墙软件会自动设置系统全局代理,因此无需指定HTTP代理即可下载YouTube视频: - - $ you-get http://www.youtube.com/watch?v=KbtO_Ayjw0M - -如果不希望程序在下载过程中使用任何代理(包括系统的代理配置),可以显式地指定`--no-proxy`选项: - - $ you-get --no-proxy http://v.youku.com/v_show/id_XMjI0ODc1NTc2.html - -### 断点续传 - -下载未完成时被中止(因为`Ctrl+C`终止程序或者网络中断等原因),在目标路径中会有一个扩展名为`.download`的临时文件。 - -下次运行只要在目标路径中找到相应的`.download`临时文件,程序会自动从中断处继续下载。(除非指定了`-f`选项) - -## 使用Python 2? - -优酷等国内视频网站的下载,请移步:[iambus/youku-lixian](https://github.com/iambus/youku-lixian) - -YouTube等国外视频网站的下载,请移步:[rg3/youtube-dl](https://github.com/rg3/youtube-dl) - -## 许可证 - -You-Get在[MIT License](https://raw.github.com/soimort/you-get/master/LICENSE.txt)下发布。 - -## 如何参与贡献 / 报告issue - -请阅读 [CONTRIBUTING.md](https://github.com/soimort/you-get/blob/master/CONTRIBUTING.md)。 diff --git a/README.txt b/README.txt index 7bd8a1ee..b2195ae2 100644 --- a/README.txt +++ b/README.txt @@ -3,6 +3,8 @@ You-Get .. image:: https://api.travis-ci.org/soimort/you-get.png +.. image:: https://badge.fury.io/py/you-get.png + `You-Get `_ is a video downloader runs on Python 3. It aims at easing the download of videos on `YouTube `_, `Youku `_/`Tudou `_ (biggest online video providers in China), `Niconico `_, etc., in one script. See the project homepage http://www.soimort.org/you-get for further documentation. @@ -19,6 +21,7 @@ Supported Sites (As of Now) * Vimeo http://vimeo.com * Coursera https://www.coursera.org * Blip http://blip.tv +* CBS http://www.cbs.com * Dailymotion http://dailymotion.com * eHow http://www.ehow.com * Facebook http://facebook.com @@ -29,22 +32,25 @@ Supported Sites (As of Now) * Tumblr http://www.tumblr.com * Vine http://vine.co * Instagram http://instagram.com +* Magisto http://www.magisto.com * SoundCloud http://soundcloud.com * Mixcloud http://www.mixcloud.com * Freesound http://www.freesound.org +* JPopsuki http://jpopsuki.tv * VID48 http://vid48.com * Niconico (ニコニコ動画) http://www.nicovideo.jp * Youku (优酷) http://www.youku.com * Tudou (土豆) http://www.tudou.com * YinYueTai (音悦台) http://www.yinyuetai.com -* AcFun http://www.acfun.tv -* bilibili http://www.bilibili.tv +* AcFun http://www.acfun.com +* bilibili http://www.bilibili.com * CNTV (中国网络电视台) http://www.cntv.cn * Douban (豆瓣) http://douban.com * ifeng (凤凰视频) http://v.ifeng.com * iQIYI (爱奇艺) http://www.iqiyi.com * Joy.cn (激动网) http://www.joy.cn * Ku6 (酷6网) http://www.ku6.com +* LeTV (乐视网) http://www.letv.com * MioMio http://www.miomio.tv * NetEase (网易视频) http://v.163.com * PPTV http://www.pptv.com @@ -58,74 +64,78 @@ Supported Sites (As of Now) * Baidu Wangpan (百度网盘) http://pan.baidu.com * SongTaste http://www.songtaste.com * Alive.in.th http://alive.in.th +* VK http://vk.com Dependencies ------------ * `Python 3 `_ -* (Optional) `FFmpeg `_ - * Used for converting and joining video files. +* (Optional) `FFmpeg `_ / `Libav `_ + * For converting and joining video files. +* (Optional) `RTMPDump `_ + * For processing RTMP streams. Installation ------------ -#) Install via `Pip `_:: +#) Install via Pip:: + + $ [sudo] pip install you-get - $ pip install you-get - Check if the installation was successful:: - - $ you-get -V -#) Install via `EasyInstall `_:: - - $ easy_install you-get - - Check if the installation was successful:: - $ you-get -V #) Install from Git:: $ git clone git://github.com/soimort/you-get.git - + Use the raw script without installation:: - + $ cd you-get/ $ ./you-get -V - + To install the package into the system path, execute:: - + $ make install - + Check if the installation was successful:: - + $ you-get -V #) Direct download:: - + $ wget -O you-get.zip https://github.com/soimort/you-get/zipball/master $ unzip you-get.zip - + Use the raw script without installation:: - + $ cd soimort-you-get-*/ $ ./you-get -V - + To install the package into the system path, execute:: - + $ make install - + Check if the installation was successful:: - + $ you-get -V -#) Install from `AUR (Arch User Repository) `_: +#) Install from your distro's repo: - Click `here `_. +* `AUR (Arch) `_ -Examples (For End-Users) ------------------------- +* `Overlay (Gentoo) `_ + +Upgrading +--------- + +Using Pip:: + + $ [sudo] pip install --upgrade you-get + +Examples +-------- Display the information of the video without downloading:: @@ -163,33 +173,25 @@ Command-Line Options For a complete list of all available options, see:: $ you-get --help + Usage: you-get [OPTION]... [URL]... -Examples (For Developers) -------------------------- + Startup options: + -V | --version Display the version and exit. + -h | --help Print this help and exit. -In Python 3 (interactive):: - - >>> from you_get.downloader import * - >>> youtube.download("http://www.youtube.com/watch?v=8bQlxQJEzLk", info_only = True) - Video Site: YouTube.com - Title: If you're good at something, never do it for free! - Type: WebM video (video/webm) - Size: 0.13 MB (133176 Bytes) - - >>> import you_get - >>> you_get.any_download("http://www.youtube.com/watch?v=sGwy8DsUJ4M") - Video Site: YouTube.com - Title: Mort from Madagascar LIKES - Type: WebM video (video/webm) - Size: 1.78 MB (1867072 Bytes) - - Downloading Mort from Madagascar LIKES.webm ... - 100.0% ( 1.8/1.8 MB) [========================================] 1/1 - -API Reference -------------- - -See source code. + Download options (use with URLs): + -f | --force Force overwriting existed files. + -i | --info Display the information of videos without downloading. + -u | --url Display the real URLs of videos without downloading. + -n | --no-merge Don't merge video parts. + -c | --cookies Load NetScape's cookies.txt file. + -o | --output-dir Set the output directory for downloaded videos. + -p | --player Directly play the video with PLAYER like vlc/smplayer. + -x | --http-proxy Use specific HTTP proxy for downloading. + --no-proxy Don't use any proxy. (ignore $http_proxy) + -S | --sogou Use a Sogou proxy server for downloading. + --sogou-proxy Run a standalone Sogou proxy server. + --debug Show traceback on KeyboardInterrupt. License ------- diff --git a/setup.py b/setup.py index 6564d33d..d4f1be39 100755 --- a/setup.py +++ b/setup.py @@ -7,36 +7,36 @@ PROJ_METADATA = '%s.json' % PROJ_NAME import os, json, imp here = os.path.abspath(os.path.dirname(__file__)) -proj_info = json.loads(open(os.path.join(here, PROJ_METADATA)).read()) -README = open(os.path.join(here, 'README.txt')).read() -CHANGELOG = open(os.path.join(here, 'CHANGELOG.txt')).read() +proj_info = json.loads(open(os.path.join(here, PROJ_METADATA), encoding='utf-8').read()) +README = open(os.path.join(here, 'README.txt'), encoding='utf-8').read() +CHANGELOG = open(os.path.join(here, 'CHANGELOG.txt'), encoding='utf-8').read() VERSION = imp.load_source('version', os.path.join(here, 'src/%s/version.py' % PACKAGE_NAME)).__version__ from setuptools import setup, find_packages setup( name = proj_info['name'], version = VERSION, - + author = proj_info['author'], author_email = proj_info['author_email'], url = proj_info['url'], license = proj_info['license'], - + description = proj_info['description'], keywords = proj_info['keywords'], - + long_description = README + '\n\n' + CHANGELOG, - + packages = find_packages('src'), package_dir = {'' : 'src'}, - + test_suite = 'tests', - + platforms = 'any', zip_safe = False, include_package_data = True, - + classifiers = proj_info['classifiers'], - + entry_points = {'console_scripts': proj_info['console_scripts']} ) diff --git a/src/you_get/__init__.py b/src/you_get/__init__.py index ecca35d2..f8ee6011 100644 --- a/src/you_get/__init__.py +++ b/src/you_get/__init__.py @@ -3,7 +3,5 @@ from .common import * from .version import * -# Easy import -#from .cli_wrapper.converter import * -#from .cli_wrapper.player import * -from .downloader import * +from .cli_wrapper import * +from .extractor import * diff --git a/src/you_get/cli_wrapper/__init__.py b/src/you_get/cli_wrapper/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/cli_wrapper/downloader/__init__.py b/src/you_get/cli_wrapper/downloader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/cli_wrapper/openssl/__init__.py b/src/you_get/cli_wrapper/openssl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/cli_wrapper/player/__init__.py b/src/you_get/cli_wrapper/player/__init__.py new file mode 100644 index 00000000..2f7636de --- /dev/null +++ b/src/you_get/cli_wrapper/player/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python + +from .mplayer import * diff --git a/src/you_get/cli_wrapper/player/__main__.py b/src/you_get/cli_wrapper/player/__main__.py new file mode 100644 index 00000000..8d4958b9 --- /dev/null +++ b/src/you_get/cli_wrapper/player/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python + +def main(): + script_main('you-get', any_download, any_download_playlist) + +if __name__ == "__main__": + main() diff --git a/src/you_get/cli_wrapper/player/dragonplayer.py b/src/you_get/cli_wrapper/player/dragonplayer.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/cli_wrapper/player/gnome_mplayer.py b/src/you_get/cli_wrapper/player/gnome_mplayer.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/cli_wrapper/player/mplayer.py b/src/you_get/cli_wrapper/player/mplayer.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/cli_wrapper/player/vlc.py b/src/you_get/cli_wrapper/player/vlc.py new file mode 100644 index 00000000..4265cc3e --- /dev/null +++ b/src/you_get/cli_wrapper/player/vlc.py @@ -0,0 +1 @@ +#!/usr/bin/env python diff --git a/src/you_get/cli_wrapper/player/wmp.py b/src/you_get/cli_wrapper/player/wmp.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/cli_wrapper/transcoder/__init__.py b/src/you_get/cli_wrapper/transcoder/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/cli_wrapper/transcoder/ffmpeg.py b/src/you_get/cli_wrapper/transcoder/ffmpeg.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/cli_wrapper/transcoder/libav.py b/src/you_get/cli_wrapper/transcoder/libav.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/cli_wrapper/transcoder/mencoder.py b/src/you_get/cli_wrapper/transcoder/mencoder.py new file mode 100644 index 00000000..e69de29b diff --git a/src/you_get/common.py b/src/you_get/common.py index 8faf907a..d4a9d562 100644 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -8,11 +8,18 @@ import re import sys from urllib import request, parse import platform +import threading from .version import __version__ +from .util import log, sogou_proxy_server, get_filename, unescape_html dry_run = False force = False +player = None +extractor_proxy = None +sogou_proxy = None +sogou_env = None +cookies_txt = None fake_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', @@ -49,16 +56,16 @@ def r1_of(patterns, text): def match1(text, *patterns): """Scans through a string for substrings matched some patterns (first-subgroups only). - + Args: text: A string to be scanned. patterns: Arbitrary number of regex patterns. - + Returns: When only one pattern is given, returns a string (None if no match found). When more than one pattern are given, returns a list of strings ([] if no match found). """ - + if len(patterns) == 1: pattern = patterns[0] match = re.search(pattern, text) @@ -74,23 +81,31 @@ def match1(text, *patterns): ret.append(match.group(1)) return ret +def launch_player(player, urls): + import subprocess + import shlex + subprocess.call(shlex.split(player) + list(urls)) + def parse_query_param(url, param): """Parses the query string of a URL and returns the value of a parameter. - + Args: url: A URL. param: A string representing the name of the parameter. - + Returns: The value of the parameter. """ - - return parse.parse_qs(parse.urlparse(url).query)[param][0] + + try: + return parse.parse_qs(parse.urlparse(url).query)[param][0] + except: + return None def unicodize(text): return re.sub(r'\\u([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])', lambda x: chr(int(x.group(0)[2:], 16)), text) -# DEPRECATED in favor of filenameable() +# DEPRECATED in favor of util.legitimize() def escape_file_path(path): path = path.replace('/', '-') path = path.replace('\\', '-') @@ -98,6 +113,7 @@ def escape_file_path(path): path = path.replace('?', '-') return path +# DEPRECATED in favor of util.legitimize() def filenameable(text): """Converts a string to a legal filename through various OSes. """ @@ -106,11 +122,7 @@ def filenameable(text): 0: None, ord('/'): '-', }) - if platform.system() == 'Darwin': # For Mac OS - text = text.translate({ - ord(':'): '-', - }) - elif platform.system() == 'Windows': # For Windows + if platform.system() == 'Windows': # For Windows text = text.translate({ ord(':'): '-', ord('*'): '-', @@ -124,14 +136,15 @@ def filenameable(text): ord('['): '(', ord(']'): ')', }) + else: + if text.startswith("."): + text = text[1:] + if platform.system() == 'Darwin': # For Mac OS + text = text.translate({ + ord(':'): '-', + }) return text -def unescape_html(html): - from html import parser - html = parser.HTMLParser().unescape(html) - html = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), html) - return html - def ungzip(data): """Decompresses data for Content-Encoding: gzip. """ @@ -146,7 +159,8 @@ def undeflate(data): (the zlib compression is used.) """ import zlib - return zlib.decompress(data, -zlib.MAX_WBITS) + decompressobj = zlib.decompressobj(-zlib.MAX_WBITS) + return decompressobj.decompress(data)+decompressobj.flush() # DEPRECATED in favor of get_content() def get_response(url, faker = False): @@ -154,7 +168,7 @@ def get_response(url, faker = False): response = request.urlopen(request.Request(url, headers = fake_headers), None) else: response = request.urlopen(url) - + data = response.read() if response.info().get('Content-Encoding') == 'gzip': data = ungzip(data) @@ -174,32 +188,36 @@ def get_decoded_html(url, faker = False): data = response.data charset = r1(r'charset=([\w-]+)', response.headers['content-type']) if charset: - return data.decode(charset) + return data.decode(charset, 'ignore') else: return data def get_content(url, headers={}, decoded=True): """Gets the content of a URL via sending a HTTP GET request. - + Args: url: A URL. headers: Request headers used by the client. decoded: Whether decode the response body using UTF-8 or the charset specified in Content-Type. - + Returns: The content as a string. """ - - response = request.urlopen(request.Request(url, headers=headers)) + + req = request.Request(url, headers=headers) + if cookies_txt: + cookies_txt.add_cookie_header(req) + req.headers.update(req.unredirected_hdrs) + response = request.urlopen(req) data = response.read() - + # Handle HTTP compression for gzip and deflate (zlib) content_encoding = response.getheader('Content-Encoding') if content_encoding == 'gzip': data = ungzip(data) elif content_encoding == 'deflate': data = undeflate(data) - + # Decode the response body if decoded: charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)') @@ -207,7 +225,7 @@ def get_content(url, headers={}, decoded=True): data = data.decode(charset) else: data = data.decode('utf-8') - + return data def url_size(url, faker = False): @@ -215,7 +233,7 @@ def url_size(url, faker = False): response = request.urlopen(request.Request(url, headers = fake_headers), None) else: response = request.urlopen(url) - + size = int(response.headers['content-length']) return size @@ -227,9 +245,9 @@ def url_info(url, faker = False): response = request.urlopen(request.Request(url, headers = fake_headers), None) else: response = request.urlopen(request.Request(url)) - + headers = response.headers - + type = headers['content-type'] mapping = { 'video/3gpp': '3gp', @@ -257,12 +275,12 @@ def url_info(url, faker = False): ext = None else: ext = None - + if headers['transfer-encoding'] != 'chunked': size = int(headers['content-length']) else: size = None - + return type, ext, size def url_locations(urls, faker = False): @@ -272,13 +290,13 @@ def url_locations(urls, faker = False): response = request.urlopen(request.Request(url, headers = fake_headers), None) else: response = request.urlopen(request.Request(url)) - + locations.append(response.url) return locations def url_save(url, filepath, bar, refer = None, is_part = False, faker = False): file_size = url_size(url, faker = faker) - + if os.path.exists(filepath): if not force and file_size == os.path.getsize(filepath): if not is_part: @@ -296,19 +314,19 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False): print('Overwriting %s' % tr(os.path.basename(filepath)), '...') elif not os.path.exists(os.path.dirname(filepath)): os.mkdir(os.path.dirname(filepath)) - + temp_filepath = filepath + '.download' received = 0 if not force: open_mode = 'ab' - + if os.path.exists(temp_filepath): received += os.path.getsize(temp_filepath) if bar: bar.update_received(os.path.getsize(temp_filepath)) else: open_mode = 'wb' - + if received < file_size: if faker: headers = fake_headers @@ -318,7 +336,7 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False): headers['Range'] = 'bytes=' + str(received) + '-' if refer: headers['Referer'] = refer - + response = request.urlopen(request.Request(url, headers = headers), None) try: range_start = int(response.headers['content-range'][6:].split('/')[0].split('-')[0]) @@ -326,13 +344,13 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False): range_length = end_length - range_start except: range_length = int(response.headers['content-length']) - + if file_size != received + range_length: received = 0 if bar: bar.received = 0 open_mode = 'wb' - + with open(temp_filepath, open_mode) as output: while True: buffer = response.read(1024 * 256) @@ -346,9 +364,9 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False): received += len(buffer) if bar: bar.update_received(len(buffer)) - + assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (received, os.path.getsize(temp_filepath), temp_filepath) - + if os.access(filepath, os.W_OK): os.remove(filepath) # on Windows rename could fail if destination filepath exists os.rename(temp_filepath, filepath) @@ -371,19 +389,19 @@ def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = print('Overwriting %s' % tr(os.path.basename(filepath)), '...') elif not os.path.exists(os.path.dirname(filepath)): os.mkdir(os.path.dirname(filepath)) - + temp_filepath = filepath + '.download' received = 0 if not force: open_mode = 'ab' - + if os.path.exists(temp_filepath): received += os.path.getsize(temp_filepath) if bar: bar.update_received(os.path.getsize(temp_filepath)) else: open_mode = 'wb' - + if faker: headers = fake_headers else: @@ -392,9 +410,9 @@ def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = headers['Range'] = 'bytes=' + str(received) + '-' if refer: headers['Referer'] = refer - + response = request.urlopen(request.Request(url, headers = headers), None) - + with open(temp_filepath, open_mode) as output: while True: buffer = response.read(1024 * 256) @@ -404,9 +422,9 @@ def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = received += len(buffer) if bar: bar.update_received(len(buffer)) - + assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (received, os.path.getsize(temp_filepath)) - + if os.access(filepath, os.W_OK): os.remove(filepath) # on Windows rename could fail if destination filepath exists os.rename(temp_filepath, filepath) @@ -418,7 +436,7 @@ class SimpleProgressBar: self.total_pieces = total_pieces self.current_piece = 1 self.received = 0 - + def update(self): self.displayed = True bar_size = 40 @@ -437,14 +455,14 @@ class SimpleProgressBar: bar = '{0:>5}% ({1:>5}/{2:<5}MB) [{3:<40}] {4}/{5}'.format(percent, round(self.received / 1048576, 1), round(self.total_size / 1048576, 1), bar, self.current_piece, self.total_pieces) sys.stdout.write('\r' + bar) sys.stdout.flush() - + def update_received(self, n): self.received += n self.update() - + def update_piece(self, n): self.current_piece = n - + def done(self): if self.displayed: print() @@ -457,20 +475,20 @@ class PiecesProgressBar: self.total_pieces = total_pieces self.current_piece = 1 self.received = 0 - + def update(self): self.displayed = True bar = '{0:>5}%[{1:<40}] {2}/{3}'.format('?', '?' * 40, self.current_piece, self.total_pieces) sys.stdout.write('\r' + bar) sys.stdout.flush() - + def update_received(self, n): self.received += n self.update() - + def update_piece(self, n): self.current_piece = n - + def done(self): if self.displayed: print() @@ -486,12 +504,16 @@ class DummyProgressBar: def done(self): pass -def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, merge = True, faker = False): +def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False): assert urls if dry_run: - print('Real URLs:\n', urls, '\n') + print('Real URLs:\n%s\n' % urls) return - + + if player: + launch_player(player, urls) + return + if not total_size: try: total_size = urls_size(urls) @@ -500,9 +522,9 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, import sys traceback.print_exc(file = sys.stdout) pass - - title = filenameable(title) - + + title = get_filename(title) + filename = '%s.%s' % (title, ext) filepath = os.path.join(output_dir, filename) if total_size: @@ -513,7 +535,7 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, bar = SimpleProgressBar(total_size, len(urls)) else: bar = PiecesProgressBar(total_size, len(urls)) - + if len(urls) == 1: url = urls[0] print('Downloading %s ...' % tr(filename)) @@ -530,7 +552,7 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, bar.update_piece(i + 1) url_save(url, filepath, bar, refer = refer, is_part = True, faker = faker) bar.done() - + if not merge: print() return @@ -548,7 +570,7 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, else: for part in parts: os.remove(part) - + elif ext == 'mp4': try: from .processor.ffmpeg import has_ffmpeg_installed @@ -563,22 +585,26 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, else: for part in parts: os.remove(part) - + else: print("Can't merge %s files" % ext) - + print() -def download_urls_chunked(urls, title, ext, total_size, output_dir = '.', refer = None, merge = True, faker = False): +def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False): assert urls if dry_run: - print('Real URLs:\n', urls, '\n') + print('Real URLs:\n%s\n' % urls) return - + + if player: + launch_player(player, urls) + return + assert ext in ('ts') - - title = filenameable(title) - + + title = get_filename(title) + filename = '%s.%s' % (title, 'ts') filepath = os.path.join(output_dir, filename) if total_size: @@ -589,7 +615,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir = '.', refer bar = SimpleProgressBar(total_size, len(urls)) else: bar = PiecesProgressBar(total_size, len(urls)) - + if len(urls) == 1: parts = [] url = urls[0] @@ -598,7 +624,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir = '.', refer parts.append(filepath) url_save_chunked(url, filepath, bar, refer = refer, faker = faker) bar.done() - + if not merge: print() return @@ -626,7 +652,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir = '.', refer bar.update_piece(i + 1) url_save_chunked(url, filepath, bar, refer = refer, is_part = True, faker = faker) bar.done() - + if not merge: print() return @@ -643,9 +669,25 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir = '.', refer print('No ffmpeg is found. Merging aborted.') else: print("Can't merge %s files" % ext) - + print() +def download_rtmp_url(url, playpath, title, ext, total_size=0, output_dir='.', refer=None, merge=True, faker=False): + assert url + if dry_run: + print('Real URL:\n%s\n' % [url]) + print('Real Playpath:\n%s\n' % [playpath]) + return + + if player: + from .processor.rtmpdump import play_rtmpdump_stream + play_rtmpdump_stream(player, url, playpath) + return + + from .processor.rtmpdump import has_rtmpdump_installed, download_rtmpdump_stream + assert has_rtmpdump_installed(), "RTMPDump not installed." + download_rtmpdump_stream(url, playpath, title, ext, output_dir) + def playlist_not_supported(name): def f(*args, **kwargs): raise NotImplementedError('Playlist is not supported for ' + name) @@ -672,7 +714,7 @@ def print_info(site_info, title, type, size): type = 'video/MP2T' elif type in ['webm']: type = 'video/webm' - + if type in ['video/3gpp']: type_info = "3GPP multimedia file (%s)" % type elif type in ['video/x-flv', 'video/f4v']: @@ -699,13 +741,42 @@ def print_info(site_info, title, type, size): type_info = "MP3 (%s)" % type else: type_info = "Unknown type (%s)" % type - + print("Video Site:", site_info) - print("Title: ", tr(title)) + print("Title: ", unescape_html(tr(title))) print("Type: ", type_info) - print("Size: ", round(size / 1048576, 2), "MB (" + str(size) + " Bytes)") + print("Size: ", round(size / 1048576, 2), "MiB (" + str(size) + " Bytes)") print() +def parse_host(host): + """Parses host name and port number from a string. + """ + if re.match(r'^(\d+)$', host) is not None: + return ("0.0.0.0", int(host)) + if re.match(r'^(\w+)://', host) is None: + host = "//" + host + o = parse.urlparse(host) + hostname = o.hostname or "0.0.0.0" + port = o.port or 0 + return (hostname, port) + +def get_sogou_proxy(): + return sogou_proxy + +def set_proxy(proxy): + proxy_handler = request.ProxyHandler({ + 'http': '%s:%s' % proxy, + 'https': '%s:%s' % proxy, + }) + opener = request.build_opener(proxy_handler) + request.install_opener(opener) + +def unset_proxy(): + proxy_handler = request.ProxyHandler({}) + opener = request.build_opener(proxy_handler) + request.install_opener(opener) + +# DEPRECATED in favor of set_proxy() and unset_proxy() def set_http_proxy(proxy): if proxy == None: # Use system default setting proxy_support = request.ProxyHandler() @@ -722,7 +793,7 @@ def download_main(download, download_playlist, urls, playlist, output_dir, merge url = url[8:] if not url.startswith('http://'): url = 'http://' + url - + if playlist: download_playlist(url, output_dir = output_dir, merge = merge, info_only = info_only) else: @@ -732,7 +803,7 @@ def get_version(): try: import subprocess real_dir = os.path.dirname(os.path.realpath(__file__)) - git_hash = subprocess.Popen(['git', 'rev-parse', '--short', 'HEAD'], cwd=real_dir, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).stdout.read().decode('utf-8').strip() + git_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=real_dir, stderr=subprocess.DEVNULL).decode('utf-8').strip() assert git_hash return '%s-%s' % (__version__, git_hash) except: @@ -749,31 +820,46 @@ def script_main(script_name, download, download_playlist = None): -f | --force Force overwriting existed files. -i | --info Display the information of videos without downloading. -u | --url Display the real URLs of videos without downloading. + -c | --cookies Load NetScape's cookies.txt file. -n | --no-merge Don't merge video parts. -o | --output-dir Set the output directory for downloaded videos. - -x | --http-proxy Use specific HTTP proxy for downloading. + -p | --player Directly play the video with PLAYER like vlc/smplayer. + -x | --http-proxy Use specific HTTP proxy for downloading. + -y | --extractor-proxy Use specific HTTP proxy for extracting stream data. --no-proxy Don't use any proxy. (ignore $http_proxy) + -S | --sogou Use a Sogou proxy server for downloading. + --sogou-proxy Run a standalone Sogou proxy server. --debug Show traceback on KeyboardInterrupt. ''' - - short_opts = 'Vhfiuno:x:' - opts = ['version', 'help', 'force', 'info', 'url', 'no-merge', 'no-proxy', 'debug', 'output-dir=', 'http-proxy='] + + short_opts = 'Vhfiuc:nSo:p:x:y:' + opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-merge', 'no-proxy', 'debug', 'sogou', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'sogou-proxy=', 'sogou-env='] if download_playlist: short_opts = 'l' + short_opts opts = ['playlist'] + opts - + try: opts, args = getopt.getopt(sys.argv[1:], short_opts, opts) except getopt.GetoptError as err: - print(err) - print(help) + log.e(err) + log.e("try 'you-get --help' for more options") sys.exit(2) - + + global force + global dry_run + global player + global extractor_proxy + global sogou_proxy + global sogou_env + global cookies_txt + cookies_txt = None + info_only = False playlist = False merge = True output_dir = '.' proxy = None + extractor_proxy = None traceback = False for o, a in opts: if o in ('-V', '--version'): @@ -784,38 +870,175 @@ def script_main(script_name, download, download_playlist = None): print(help) sys.exit() elif o in ('-f', '--force'): - global force force = True elif o in ('-i', '--info'): info_only = True elif o in ('-u', '--url'): - global dry_run dry_run = True + elif o in ('-c', '--cookies'): + from http import cookiejar + cookies_txt = cookiejar.MozillaCookieJar(a) + cookies_txt.load() elif o in ('-l', '--playlist'): playlist = True elif o in ('-n', '--no-merge'): merge = False - elif o in ('--no-proxy'): + elif o in ('--no-proxy',): proxy = '' - elif o in ('--debug'): + elif o in ('--debug',): traceback = True elif o in ('-o', '--output-dir'): output_dir = a + elif o in ('-p', '--player'): + player = a elif o in ('-x', '--http-proxy'): proxy = a + elif o in ('-y', '--extractor-proxy'): + extractor_proxy = a + elif o in ('-S', '--sogou'): + sogou_proxy = ("0.0.0.0", 0) + elif o in ('--sogou-proxy',): + sogou_proxy = parse_host(a) + elif o in ('--sogou-env',): + sogou_env = a + else: + log.e("try 'you-get --help' for more options") + sys.exit(2) + if not args: + if sogou_proxy is not None: + try: + if sogou_env is not None: + server = sogou_proxy_server(sogou_proxy, network_env=sogou_env) + else: + server = sogou_proxy_server(sogou_proxy) + server.serve_forever() + except KeyboardInterrupt: + if traceback: + raise + else: + sys.exit() else: print(help) - sys.exit(1) - if not args: - print(help) - sys.exit() - + sys.exit() + set_http_proxy(proxy) - - if traceback: + + try: download_main(download, download_playlist, args, playlist, output_dir, merge, info_only) - else: - try: - download_main(download, download_playlist, args, playlist, output_dir, merge, info_only) - except KeyboardInterrupt: + except KeyboardInterrupt: + if traceback: + raise + else: sys.exit(1) + + + +class VideoExtractor(): + def __init__(self, *args): + self.url = None + self.title = None + self.vid = None + self.streams = {} + self.streams_sorted = [] + + if args: + self.url = args[0] + + def download_by_url(self, url, **kwargs): + self.url = url + + self.prepare(**kwargs) + + self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams] + + global extractor_proxy + if extractor_proxy: + set_proxy(parse_host(extractor_proxy)) + self.extract(**kwargs) + if extractor_proxy: + unset_proxy() + + self.download(**kwargs) + + def download_by_vid(self, vid, **kwargs): + self.vid = vid + + self.prepare(**kwargs) + + self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams] + + global extractor_proxy + if extractor_proxy: + set_proxy(parse_host(extractor_proxy)) + self.extract(**kwargs) + if extractor_proxy: + unset_proxy() + + self.download(**kwargs) + + def prepare(self, **kwargs): + pass + #raise NotImplementedError() + + def extract(self, **kwargs): + pass + #raise NotImplementedError() + + def p_stream(self, stream_id): + stream = self.streams[stream_id] + print(" - id: \033[7m%s\033[0m" % stream_id) + print(" container: %s" % stream['container']) + print(" video-profile: %s" % stream['video_profile']) + print(" size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size'])) + #print(" # download-with: \033[4myou-get --stream=%s\033[0m" % stream_id) + print() + + def p(self, stream_id=None): + print("site: %s" % self.__class__.name) + print("title: %s" % self.title) + if stream_id: + # Print the stream + print("stream:") + self.p_stream(stream_id) + + elif stream_id is None: + # Print stream with best quality + print("stream: # Best quality") + stream_id = self.streams_sorted[0]['id'] + self.p_stream(stream_id) + + elif stream_id == []: + # Print all available streams + print("streams: # Available quality and codecs") + for stream in self.streams_sorted: + self.p_stream(stream['id']) + + def download(self, **kwargs): + if 'info_only' in kwargs and kwargs['info_only']: + if 'stream_id' in kwargs and kwargs['stream_id']: + # Display the stream + stream_id = kwargs['stream_id'] + self.p(stream_id) + else: + # Display all available streams + self.p([]) + + else: + if 'stream_id' in kwargs and kwargs['stream_id']: + # Download the stream + stream_id = kwargs['stream_id'] + else: + # Download stream with the best quality + stream_id = self.streams_sorted[0]['id'] + + self.p(None) + + urls = self.streams[stream_id]['src'] + if not urls: + log.e('[Failed] Cannot extract video source.') + log.e('This is most likely because the video has not been made available in your country.') + log.e('You may try to use a proxy via \'-y\' for extracting stream data.') + exit(1) + download_urls(urls, self.title, self.streams[stream_id]['container'], self.streams[stream_id]['size'], output_dir=kwargs['output_dir'], merge=kwargs['merge']) + + self.__init__() diff --git a/src/you_get/downloader/acfun.py b/src/you_get/downloader/acfun.py deleted file mode 100644 index 88e1a7d0..00000000 --- a/src/you_get/downloader/acfun.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python - -__all__ = ['acfun_download'] - -from ..common import * - -from .qq import qq_download_by_id -from .sina import sina_download_by_vid -from .tudou import tudou_download_by_iid -from .youku import youku_download_by_id - -import json, re - -def get_srt_json(id): - url = 'http://comment.acfun.tv/%s.json' % id - return get_html(url) - -def acfun_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False): - info = json.loads(get_html('http://wenzhou.acfun.tv/api/getVideoByID.aspx?vid=' + id)) - t = info['vtype'] - vid = info['vid'] - if t == 'sina': - sina_download_by_vid(vid, title, output_dir = output_dir, merge = merge, info_only = info_only) - elif t == 'youku': - youku_download_by_id(vid, title, output_dir = output_dir, merge = merge, info_only = info_only) - elif t == 'tudou': - tudou_download_by_iid(vid, title, output_dir = output_dir, merge = merge, info_only = info_only) - elif t == 'qq': - qq_download_by_id(vid, title, output_dir = output_dir, merge = merge, info_only = info_only) - else: - raise NotImplementedError(t) - - if not info_only: - print('Downloading %s ...' % (title + '.cmt.json')) - cmt = get_srt_json(vid) - with open(os.path.join(output_dir, title + '.cmt.json'), 'w') as x: - x.write(cmt) - -def acfun_download(url, output_dir = '.', merge = True, info_only = False): - assert re.match(r'http://[^\.]+.acfun.tv/v/ac(\d+)', url) - html = get_html(url) - - title = r1(r'

]*>([^<>]+)<', html) - assert title - title = unescape_html(title) - title = escape_file_path(title) - title = title.replace(' - AcFun.tv', '') - - id = r1(r"\[Video\](\d+)\[/Video\]", html) or r1(r"\[video\](\d+)\[/video\]", html) - if not id: - id = r1(r"src=\"/newflvplayer/player.*id=(\d+)", html) - sina_download_by_vid(id, title, output_dir = output_dir, merge = merge, info_only = info_only) - else: - acfun_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only) - -site_info = "AcFun.tv" -download = acfun_download -download_playlist = playlist_not_supported('acfun') diff --git a/src/you_get/downloader/douban.py b/src/you_get/downloader/douban.py deleted file mode 100644 index e27a3518..00000000 --- a/src/you_get/downloader/douban.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python - -__all__ = ['douban_download'] - -from ..common import * - -def douban_download(url, output_dir = '.', merge = True, info_only = False): - html = get_html(url) - - titles = re.findall(r'"name":"([^"]*)"', html) - real_urls = [re.sub('\\\\/', '/', i) for i in re.findall(r'"rawUrl":"([^"]*)"', html)] - - for i in range(len(titles)): - title = titles[i] - real_url = real_urls[i] - - type, ext, size = url_info(real_url) - - print_info(site_info, title, type, size) - if not info_only: - download_urls([real_url], title, ext, size, output_dir, merge = merge) - -site_info = "Douban.com" -download = douban_download -download_playlist = playlist_not_supported('douban') diff --git a/src/you_get/downloader/qq.py b/src/you_get/downloader/qq.py deleted file mode 100644 index b59c68bc..00000000 --- a/src/you_get/downloader/qq.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python - -__all__ = ['qq_download'] - -from ..common import * - -def qq_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False): - url = 'http://vsrc.store.qq.com/%s.flv' % id - - _, _, size = url_info(url) - - print_info(site_info, title, 'flv', size) - if not info_only: - download_urls([url], title, 'flv', size, output_dir = output_dir, merge = merge) - -def qq_download(url, output_dir = '.', merge = True, info_only = False): - if re.match(r'http://v.qq.com/([^\?]+)\?vid', url): - aid = r1(r'(.*)\.html', url) - vid = r1(r'http://v.qq.com/[^\?]+\?vid=(\w+)', url) - url = "%s/%s.html" % (aid, vid) - - if re.match(r'http://y.qq.com/([^\?]+)\?vid', url): - vid = r1(r'http://y.qq.com/[^\?]+\?vid=(\w+)', url) - - url = "http://v.qq.com/page/%s.html" % vid - - r_url = r1(r'") > -1: - title = line.replace("", "").replace("", "").replace("\t", "") - title = title[:title.find(' | ')] - if line.find("no-flash-video-download") > -1: - url = line.replace('[^<]', r'([^<>]*)'], page) - else: - title = r1_of([r'
[^<]', r'([^<>]*)', page) - if subtitle: - subtitle = subtitle.group(1).strip() - if subtitle == title: - subtitle = None - if subtitle: - title += '-' + subtitle - return title - -def parse_playlist_title(url, page): - if re.search(r'v_playlist', url): - # if we are playing a video from play list, the meta title might be incorrect - title = re.search(r'([^<>]*)', page).group(1) - else: - title = re.search(r'> 16 - c = source.pop(index) - mixed += c - - ids = info['data'][0]['streamfileids'][stream_type].split('*')[:-1] - vid = ''.join(mixed[int(i)] for i in ids) - - sid = '%s%s%s' % (int(time() * 1000), randint(1000, 1999), randint(1000, 9999)) - - urls = [] - for s in segs[stream_type]: - no = '%02x' % int(s['no']) - url = 'http://f.youku.com/player/getFlvPath/sid/%s_%s/st/%s/fileid/%s%s%s?K=%s&ts=%s' % (sid, no, file_type, vid[:8], no.upper(), vid[10:], s['k'], s['seconds']) - urls.append((url, int(s['size']))) - return urls - -def file_type_of_url(url): - return str(re.search(r'/st/([^/]+)/', url).group(1)) - -def youku_download_by_id(id, title, output_dir = '.', stream_type = None, merge = True, info_only = False): - info = get_info(id) - urls, sizes = zip(*find_video(info, stream_type)) - ext = file_type_of_url(urls[0]) - total_size = sum(sizes) - - print_info(site_info, title, ext, total_size) - if not info_only: - download_urls(urls, title, ext, total_size, output_dir, merge = merge) - -def parse_playlist_videos(html): - return re.findall(r'id="A_(\w+)"', html) - -def parse_playlist_pages(html): - m = re.search(r'
    .*?
', html, flags = re.S) - if m: - urls = re.findall(r'href="([^"]+)"', m.group()) - x1, x2, x3 = re.match(r'^(.*page_)(\d+)(_.*)$', urls[-1]).groups() - return ['http://v.youku.com%s%s%s?__rt=1&__ro=listShow' % (x1, i, x3) for i in range(2, int(x2) + 1)] - else: - return [] - -def parse_playlist(url): - html = get_html(url) - video_id = re.search(r"var\s+videoId\s*=\s*'(\d+)'", html).group(1) - show_id = re.search(r'var\s+showid\s*=\s*"(\d+)"', html).group(1) - list_url = 'http://v.youku.com/v_vpofficiallist/page_1_showid_%s_id_%s.html?__rt=1&__ro=listShow' % (show_id, video_id) - html = get_html(list_url) - ids = parse_playlist_videos(html) - for url in parse_playlist_pages(html): - ids.extend(parse_playlist_videos(get_html(url))) - return ids - -def parse_vplaylist(url): - id = r1_of([r'^http://www.youku.com/playlist_show/id_(\d+)(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html', - r'^http://v.youku.com/v_playlist/f(\d+)o[01]p\d+.html', - r'^http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html'], - url) - assert id, 'not valid vplaylist url: ' + url - url = 'http://www.youku.com/playlist_show/id_%s.html' % id - n = int(re.search(r'(\d+)', get_html(url)).group(1)) - return ['http://v.youku.com/v_playlist/f%so0p%s.html' % (id, i) for i in range(n)] - -def youku_download_playlist(url, output_dir='.', merge=True, info_only=False): - """Downloads a Youku playlist. - """ - - if re.match(r'http://www.youku.com/playlist_show/id_\d+(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html', url): - ids = parse_vplaylist(url) - elif re.match(r'http://v.youku.com/v_playlist/f\d+o[01]p\d+.html', url): - ids = parse_vplaylist(url) - elif re.match(r'http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html', url): - ids = parse_vplaylist(url) - elif re.match(r'http://www.youku.com/show_page/id_\w+.html', url): - url = find_video_id_from_show_page(url) - assert re.match(r'http://v.youku.com/v_show/id_([\w=]+).html', url), 'URL not supported as playlist' - ids = parse_playlist(url) - else: - ids = [] - assert ids != [] - - title = parse_playlist_title(url, get_html(url)) - title = filenameable(title) - output_dir = os.path.join(output_dir, title) - - for i, id in enumerate(ids): - print('Processing %s of %s videos...' % (i + 1, len(ids))) - try: - id, title = parse_page(youku_url(id)) - youku_download_by_id(id, title, output_dir=output_dir, merge=merge, info_only=info_only) - except: - continue - -def youku_download(url, output_dir='.', merge=True, info_only=False): - """Downloads Youku videos by URL. - """ - - try: - youku_download_playlist(url, output_dir=output_dir, merge=merge, info_only=info_only) - except: - id, title = parse_page(url) - youku_download_by_id(id, title=title, output_dir=output_dir, merge=merge, info_only=info_only) - -site_info = "Youku.com" -download = youku_download -download_playlist = youku_download_playlist diff --git a/src/you_get/downloader/__init__.py b/src/you_get/extractor/__init__.py similarity index 87% rename from src/you_get/downloader/__init__.py rename to src/you_get/extractor/__init__.py index 99e331f4..f128640b 100644 --- a/src/you_get/downloader/__init__.py +++ b/src/you_get/extractor/__init__.py @@ -5,6 +5,7 @@ from .alive import * from .baidu import * from .bilibili import * from .blip import * +from .cbs import * from .cntv import * from .coursera import * from .dailymotion import * @@ -18,7 +19,10 @@ from .ifeng import * from .instagram import * from .iqiyi import * from .joy import * +from .jpopsuki import * from .ku6 import * +from .letv import * +from .magisto import * from .miomio import * from .mixcloud import * from .netease import * @@ -29,11 +33,13 @@ from .sina import * from .sohu import * from .songtaste import * from .soundcloud import * +from .theplatform import * from .tudou import * from .tumblr import * from .vid48 import * from .vimeo import * from .vine import * +from .vk import * from .w56 import * from .xiami import * from .yinyuetai import * diff --git a/src/you_get/downloader/__main__.py b/src/you_get/extractor/__main__.py similarity index 56% rename from src/you_get/downloader/__main__.py rename to src/you_get/extractor/__main__.py index ed07f702..bbe15a33 100644 --- a/src/you_get/downloader/__main__.py +++ b/src/you_get/extractor/__main__.py @@ -1,20 +1,19 @@ #!/usr/bin/env python __all__ = ['main', 'any_download', 'any_download_playlist'] -from ..downloader import * +from ..extractor import * from ..common import * def url_to_module(url): - site = r1(r'http://([^/]+)/', url) - assert site, 'invalid url: ' + url - - if site.endswith('.com.cn'): - site = site[:-3] - domain = r1(r'(\.[^.]+\.[^.]+)$', site) - if not domain: - domain = site + video_host = r1(r'https?://([^/]+)/', url) + video_url = r1(r'https?://[^/]+(.*)', url) + assert video_host and video_url, 'invalid url: ' + url + + if video_host.endswith('.com.cn'): + video_host = video_host[:-3] + domain = r1(r'(\.[^.]+\.[^.]+)$', video_host) or video_host assert domain, 'unsupported url: ' + url - + k = r1(r'([^.]+)', domain) downloads = { '163': netease, @@ -25,6 +24,7 @@ def url_to_module(url): 'bilibili': bilibili, 'blip': blip, 'cntv': cntv, + 'cbs': cbs, 'coursera': coursera, 'dailymotion': dailymotion, 'douban': douban, @@ -38,8 +38,11 @@ def url_to_module(url): 'instagram': instagram, 'iqiyi': iqiyi, 'joy': joy, + 'jpopsuki': jpopsuki, 'kankanews': bilibili, 'ku6': ku6, + 'letv': letv, + 'magisto': magisto, 'miomio': miomio, 'mixcloud': mixcloud, 'nicovideo': nicovideo, @@ -51,11 +54,13 @@ def url_to_module(url): 'songtaste':songtaste, 'soundcloud': soundcloud, 'ted': ted, + 'theplatform': theplatform, 'tudou': tudou, 'tumblr': tumblr, 'vid48': vid48, 'vimeo': vimeo, 'vine': vine, + 'vk': vk, 'xiami': xiami, 'yinyuetai': yinyuetai, 'youku': youku, @@ -65,17 +70,25 @@ def url_to_module(url): #TODO } if k in downloads: - return downloads[k] + return downloads[k], url else: - raise NotImplementedError(url) + import http.client + conn = http.client.HTTPConnection(video_host) + conn.request("HEAD", video_url) + res = conn.getresponse() + location = res.getheader('location') + if location is None: + raise NotImplementedError(url) + else: + return url_to_module(location) -def any_download(url, output_dir = '.', merge = True, info_only = False): - m = url_to_module(url) - m.download(url, output_dir = output_dir, merge = merge, info_only = info_only) +def any_download(url, output_dir='.', merge=True, info_only=False): + m, url = url_to_module(url) + m.download(url, output_dir=output_dir, merge=merge, info_only=info_only) -def any_download_playlist(url, output_dir = '.', merge = True, info_only = False): - m = url_to_module(url) - m.download_playlist(url, output_dir = output_dir, merge = merge, info_only = info_only) +def any_download_playlist(url, output_dir='.', merge=True, info_only=False): + m, url = url_to_module(url) + m.download_playlist(url, output_dir=output_dir, merge=merge, info_only=info_only) def main(): script_main('you-get', any_download, any_download_playlist) diff --git a/src/you_get/extractor/acfun.py b/src/you_get/extractor/acfun.py new file mode 100644 index 00000000..00a2d21b --- /dev/null +++ b/src/you_get/extractor/acfun.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python + +__all__ = ['acfun_download'] + +from ..common import * + +from .qq import qq_download_by_id +from .sina import sina_download_by_vid +from .tudou import tudou_download_by_iid +from .youku import youku_download_by_vid + +import json, re + +def get_srt_json(id): + url = 'http://comment.acfun.com/%s.json' % id + return get_html(url) + +def get_srt_lock_json(id): + url = 'http://comment.acfun.com/%s_lock.json' % id + return get_html(url) + +def acfun_download_by_vid(vid, title=None, output_dir='.', merge=True, info_only=False): + info = json.loads(get_html('http://www.acfun.com/video/getVideo.aspx?id=' + vid)) + sourceType = info['sourceType'] + sourceId = info['sourceId'] + danmakuId = info['danmakuId'] + if sourceType == 'sina': + sina_download_by_vid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only) + elif sourceType == 'youku': + youku_download_by_vid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only) + elif sourceType == 'tudou': + tudou_download_by_iid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only) + elif sourceType == 'qq': + qq_download_by_id(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only) + else: + raise NotImplementedError(sourceType) + + if not info_only: + title = get_filename(title) + try: + print('Downloading %s ...\n' % (title + '.cmt.json')) + cmt = get_srt_json(danmakuId) + with open(os.path.join(output_dir, title + '.cmt.json'), 'w') as x: + x.write(cmt) + print('Downloading %s ...\n' % (title + '.cmt_lock.json')) + cmt = get_srt_lock_json(danmakuId) + with open(os.path.join(output_dir, title + '.cmt_lock.json'), 'w') as x: + x.write(cmt) + except: + pass + +def acfun_download(url, output_dir = '.', merge = True, info_only = False): + assert re.match(r'http://[^\.]+.acfun.[^\.]+/v/ac(\d+)', url) + html = get_html(url) + + title = r1(r'

([^<>]+)<', html) + title = unescape_html(title) + title = escape_file_path(title) + assert title + + videos = re.findall("data-vid=\"(\d+)\" href=\"[^\"]+\" title=\"([^\"]+)\"", html) + if videos is not None: + for video in videos: + p_vid = video[0] + p_title = title + " - " + video[1] + acfun_download_by_vid(p_vid, p_title, output_dir=output_dir, merge=merge, info_only=info_only) + else: + # Useless - to be removed? + id = r1(r"src=\"/newflvplayer/player.*id=(\d+)", html) + sina_download_by_vid(id, title, output_dir=output_dir, merge=merge, info_only=info_only) + +site_info = "AcFun.com" +download = acfun_download +download_playlist = playlist_not_supported('acfun') diff --git a/src/you_get/downloader/alive.py b/src/you_get/extractor/alive.py similarity index 100% rename from src/you_get/downloader/alive.py rename to src/you_get/extractor/alive.py diff --git a/src/you_get/downloader/baidu.py b/src/you_get/extractor/baidu.py similarity index 53% rename from src/you_get/downloader/baidu.py rename to src/you_get/extractor/baidu.py index 79d7053d..c671fa74 100755 --- a/src/you_get/downloader/baidu.py +++ b/src/you_get/extractor/baidu.py @@ -8,45 +8,59 @@ from .. import common from urllib import parse -def baidu_get_song_html(sid): - return get_html('http://music.baidu.com/song/%s/download?__o=%%2Fsong%%2F%s' % (sid, sid), faker = True) +def baidu_get_song_data(sid): + data = json.loads(get_html('http://music.baidu.com/data/music/fmlink?songIds=%s' % sid, faker = True))['data'] -def baidu_get_song_url(html): - return r1(r'downlink="/data/music/file\?link=(.+?)"', html) + if data['xcode'] != '': + # inside china mainland + return data['songList'][0] + else: + # outside china mainland + return None -def baidu_get_song_artist(html): - return r1(r'singer_name:"(.+?)"', html) +def baidu_get_song_url(data): + return data['songLink'] -def baidu_get_song_album(html): - return r1(r'ablum_name:"(.+?)"', html) +def baidu_get_song_artist(data): + return data['artistName'] -def baidu_get_song_title(html): - return r1(r'song_title:"(.+?)"', html) +def baidu_get_song_album(data): + return data['albumName'] -def baidu_download_lyric(sid, file_name, output_dir): - if common.dry_run: - return +def baidu_get_song_title(data): + return data['songName'] - html = get_html('http://music.baidu.com/song/' + sid) - href = r1(r'', html) - if href: - lrc = get_html('http://music.baidu.com' + href) - if len(lrc) > 0: - with open(output_dir + "/" + file_name.replace('/', '-') + '.lrc', 'w') as x: - x.write(lrc) +def baidu_get_song_lyric(data): + lrc = data['lrcLink'] + return None if lrc is '' else "http://music.baidu.com%s" % lrc -def baidu_download_song(sid, output_dir = '.', merge = True, info_only = False): - html = baidu_get_song_html(sid) - url = baidu_get_song_url(html) - title = baidu_get_song_title(html) - artist = baidu_get_song_artist(html) - album = baidu_get_song_album(html) - type, ext, size = url_info(url, faker = True) +def baidu_download_song(sid, output_dir='.', merge=True, info_only=False): + data = baidu_get_song_data(sid) + if data is not None: + url = baidu_get_song_url(data) + title = baidu_get_song_title(data) + artist = baidu_get_song_artist(data) + album = baidu_get_song_album(data) + lrc = baidu_get_song_lyric(data) + file_name = "%s - %s - %s" % (title, album, artist) + else: + html = get_html("http://music.baidu.com/song/%s" % sid) + url = r1(r'data_url="([^"]+)"', html) + title = r1(r'data_name="([^"]+)"', html) + file_name = title + + type, ext, size = url_info(url, faker=True) print_info(site_info, title, type, size) if not info_only: - file_name = "%s - %s - %s" % (title, album, artist) - download_urls([url], file_name, ext, size, output_dir, merge = merge, faker = True) - baidu_download_lyric(sid, file_name, output_dir) + download_urls([url], file_name, ext, size, output_dir, merge=merge, faker=True) + + try: + type, ext, size = url_info(lrc, faker=True) + print_info(site_info, title, type, size) + if not info_only: + download_urls([lrc], file_name, ext, size, output_dir, faker=True) + except: + pass def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False): html = get_html('http://music.baidu.com/album/%s' % aid, faker = True) @@ -56,32 +70,40 @@ def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False) ids = json.loads(r1(r'', html).replace('"', '').replace(';', '"'))['ids'] track_nr = 1 for id in ids: - song_html = baidu_get_song_html(id) - song_url = baidu_get_song_url(song_html) - song_title = baidu_get_song_title(song_html) + song_data = baidu_get_song_data(id) + song_url = baidu_get_song_url(song_data) + song_title = baidu_get_song_title(song_data) + song_lrc = baidu_get_song_lyric(song_data) file_name = '%02d.%s' % (track_nr, song_title) + type, ext, size = url_info(song_url, faker = True) print_info(site_info, song_title, type, size) if not info_only: download_urls([song_url], file_name, ext, size, output_dir, merge = merge, faker = True) - baidu_download_lyric(id, file_name, output_dir) + + if song_lrc: + type, ext, size = url_info(song_lrc, faker = True) + print_info(site_info, song_title, type, size) + if not info_only: + download_urls([song_lrc], file_name, ext, size, output_dir, faker = True) + track_nr += 1 def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False): if re.match(r'http://pan.baidu.com', url): html = get_html(url) - + title = r1(r'server_filename="([^"]+)"', html) if len(title.split('.')) > 1: title = ".".join(title.split('.')[:-1]) - + real_url = r1(r'\\"dlink\\":\\"([^"]*)\\"', html).replace('\\\\/', '/') type, ext, size = url_info(real_url, faker = True) - + print_info(site_info, title, ext, size) if not info_only: download_urls([real_url], title, ext, size, output_dir, merge = merge) - + elif re.match(r'http://music.baidu.com/album/\d+', url): id = r1(r'http://music.baidu.com/album/(\d+)', url) baidu_download_album(id, output_dir, merge, info_only) diff --git a/src/you_get/downloader/bilibili.py b/src/you_get/extractor/bilibili.py similarity index 82% rename from src/you_get/downloader/bilibili.py rename to src/you_get/extractor/bilibili.py index 8512d362..934afdd6 100644 --- a/src/you_get/downloader/bilibili.py +++ b/src/you_get/extractor/bilibili.py @@ -6,12 +6,12 @@ from ..common import * from .sina import sina_download_by_vid from .tudou import tudou_download_by_id -from .youku import youku_download_by_id +from .youku import youku_download_by_vid import re def get_srt_xml(id): - url = 'http://comment.bilibili.tv/%s.xml' % id + url = 'http://comment.bilibili.com/%s.xml' % id return get_html(url) def parse_srt_p(p): @@ -19,7 +19,7 @@ def parse_srt_p(p): assert len(fields) == 8, fields time, mode, font_size, font_color, pub_time, pool, user_id, history = fields time = float(time) - + mode = int(mode) assert 1 <= mode <= 8 # mode 1~3: scrolling @@ -28,17 +28,17 @@ def parse_srt_p(p): # mode 6: reverse? # mode 7: position # mode 8: advanced - + pool = int(pool) assert 0 <= pool <= 2 # pool 0: normal # pool 1: srt # pool 2: special? - + font_size = int(font_size) - + font_color = '#%06x' % int(font_color) - + return pool, mode, font_size, font_color def parse_srt_xml(xml): @@ -54,9 +54,9 @@ def parse_cid_playurl(xml): return urls def bilibili_download_by_cid(id, title, output_dir = '.', merge = True, info_only = False): - url = 'http://interface.bilibili.tv/playurl?cid=' + id + url = 'http://interface.bilibili.com/playurl?cid=' + id urls = [i if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i) else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i) for i in parse_cid_playurl(get_html(url, 'utf-8'))] # dirty fix for QQ - + if re.search(r'\.(flv|hlv)\b', urls[0]): type = 'flv' elif re.search(r'/flv/', urls[0]): @@ -65,25 +65,24 @@ def bilibili_download_by_cid(id, title, output_dir = '.', merge = True, info_onl type = 'mp4' else: type = 'flv' - + size = 0 for url in urls: _, _, temp = url_info(url) size += temp - + print_info(site_info, title, type, size) if not info_only: download_urls(urls, title, type, total_size = None, output_dir = output_dir, merge = merge) def bilibili_download(url, output_dir = '.', merge = True, info_only = False): - assert re.match(r'http://(www.bilibili.tv|bilibili.kankanews.com|bilibili.smgbb.cn)/video/av(\d+)', url) html = get_html(url) - - title = r1(r'

([^<>]+)

', html) + + title = r1(r']*>([^<>]+)

', html) title = unescape_html(title) title = escape_file_path(title) - - flashvars = r1_of([r'player_params=\'(cid=\d+)', r'flashvars="([^"]+)"', r'"https://secure.bilibili.tv/secure,(cid=\d+)(?:&aid=\d+)?"'], html) + + flashvars = r1_of([r'player_params=\'(cid=\d+)', r'flashvars="([^"]+)"', r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) assert flashvars t, id = flashvars.split('=', 1) id = id.split('&')[0] @@ -92,18 +91,19 @@ def bilibili_download(url, output_dir = '.', merge = True, info_only = False): elif t == 'vid': sina_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only) elif t == 'ykid': - youku_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only) + youku_download_by_vid(id, title, output_dir = output_dir, merge = merge, info_only = info_only) elif t == 'uid': tudou_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only) else: raise NotImplementedError(flashvars) - + if not info_only: - print('Downloading %s ...' % (title + '.cmt.xml')) + title = get_filename(title) + print('Downloading %s ...\n' % (title + '.cmt.xml')) xml = get_srt_xml(id) - with open(os.path.join(output_dir, title + '.cmt.xml'), 'w') as x: + with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x: x.write(xml) -site_info = "bilibili.tv" +site_info = "bilibili.com" download = bilibili_download download_playlist = playlist_not_supported('bilibili') diff --git a/src/you_get/downloader/blip.py b/src/you_get/extractor/blip.py similarity index 100% rename from src/you_get/downloader/blip.py rename to src/you_get/extractor/blip.py diff --git a/src/you_get/extractor/cbs.py b/src/you_get/extractor/cbs.py new file mode 100644 index 00000000..8c9d4a7b --- /dev/null +++ b/src/you_get/extractor/cbs.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python + +__all__ = ['cbs_download'] + +from ..common import * + +from .theplatform import theplatform_download_by_pid + +def cbs_download(url, output_dir='.', merge=True, info_only=False): + """Downloads CBS videos by URL. + """ + + html = get_content(url) + pid = match1(html, r'video\.settings\.pid\s*=\s*\'([^\']+)\'') + title = match1(html, r'video\.settings\.title\s*=\s*\"([^\"]+)\"') + + theplatform_download_by_pid(pid, title, output_dir=output_dir, merge=merge, info_only=info_only) + +site_info = "CBS.com" +download = cbs_download +download_playlist = playlist_not_supported('cbs') diff --git a/src/you_get/downloader/cntv.py b/src/you_get/extractor/cntv.py similarity index 100% rename from src/you_get/downloader/cntv.py rename to src/you_get/extractor/cntv.py diff --git a/src/you_get/downloader/coursera.py b/src/you_get/extractor/coursera.py similarity index 100% rename from src/you_get/downloader/coursera.py rename to src/you_get/extractor/coursera.py diff --git a/src/you_get/downloader/dailymotion.py b/src/you_get/extractor/dailymotion.py similarity index 91% rename from src/you_get/downloader/dailymotion.py rename to src/you_get/extractor/dailymotion.py index 99d586c8..8e8851aa 100644 --- a/src/you_get/downloader/dailymotion.py +++ b/src/you_get/extractor/dailymotion.py @@ -7,22 +7,22 @@ from ..common import * def dailymotion_download(url, output_dir = '.', merge = True, info_only = False): """Downloads Dailymotion videos by URL. """ - - id = match1(url, r'/video/([^\?]+)') + + id = match1(url, r'/video/([^\?]+)') or match1(url, r'video=([^\?]+)') embed_url = 'http://www.dailymotion.com/embed/video/%s' % id html = get_content(embed_url) - + info = json.loads(match1(html, r'var\s*info\s*=\s*({.+}),\n')) - + title = info['title'] - + for quality in ['stream_h264_hd1080_url', 'stream_h264_hd_url', 'stream_h264_hq_url', 'stream_h264_url', 'stream_h264_ld_url']: real_url = info[quality] if real_url: break - + type, ext, size = url_info(real_url) - + print_info(site_info, title, type, size) if not info_only: download_urls([real_url], title, ext, size, output_dir, merge = merge) diff --git a/src/you_get/extractor/douban.py b/src/you_get/extractor/douban.py new file mode 100644 index 00000000..8a52275f --- /dev/null +++ b/src/you_get/extractor/douban.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +__all__ = ['douban_download'] + +import urllib.request, urllib.parse +from ..common import * + +def douban_download(url, output_dir = '.', merge = True, info_only = False): + html = get_html(url) + if 'subject' in url: + titles = re.findall(r'data-title="([^"]*)">', html) + song_id = re.findall(r'
  • ([^<\n]+)', html) else: title = None - + html = get_html(url) real_urls = re.findall(r'\[(\d+),\d+,\d+,"([^"]+)"\]', html) real_url = unicodize(sorted(real_urls, key = lambda x : fmt_level[x[0]])[0][1]) - + if title is None: post_url = r1(r'"(https://plus.google.com/\d+/posts/[^"]*)"', html) post_html = get_html(post_url) - title = r1(r'([^<\n]+)', post_html) - + title = r1(r'<title[^>]*>([^<\n]+)', post_html) + if title is None: response = request.urlopen(request.Request(real_url)) if response.headers['content-disposition']: filename = parse.unquote(r1(r'filename="?(.+)"?', response.headers['content-disposition'])).split('.') title = ''.join(filename[:-1]) - + type, ext, size = url_info(real_url) if ext is None: ext = 'mp4' - + elif service in ['docs', 'drive'] : # Google Docs - + html = get_html(url) - + title = r1(r'"title":"([^"]*)"', html) or r1(r'<meta itemprop="name" content="([^"]*)"', html) if len(title.split('.')) > 1: title = ".".join(title.split('.')[:-1]) - + docid = r1(r'"docid":"([^"]*)"', html) - + request.install_opener(request.build_opener(request.HTTPCookieProcessor())) - + request.urlopen(request.Request("https://docs.google.com/uc?id=%s&export=download" % docid)) real_url ="https://docs.google.com/uc?export=download&confirm=no_antivirus&id=%s" % docid - + type, ext, size = url_info(real_url) - + print_info(site_info, title, ext, size) if not info_only: download_urls([real_url], title, ext, size, output_dir, merge = merge) diff --git a/src/you_get/downloader/ifeng.py b/src/you_get/extractor/ifeng.py similarity index 100% rename from src/you_get/downloader/ifeng.py rename to src/you_get/extractor/ifeng.py diff --git a/src/you_get/downloader/instagram.py b/src/you_get/extractor/instagram.py similarity index 86% rename from src/you_get/downloader/instagram.py rename to src/you_get/extractor/instagram.py index 6071dfd0..0605a6c3 100644 --- a/src/you_get/downloader/instagram.py +++ b/src/you_get/extractor/instagram.py @@ -6,13 +6,13 @@ from ..common import * def instagram_download(url, output_dir = '.', merge = True, info_only = False): html = get_html(url) - - id = r1(r'instagram.com/p/([^/]+)/', html) + + vid = r1(r'instagram.com/p/([^/]+)/', html) description = r1(r'<meta property="og:description" content="([^"]*)"', html) - title = description + " [" + id + "]" + title = description + " [" + vid + "]" url = r1(r'<meta property="og:video" content="([^"]*)"', html) type, ext, size = url_info(url) - + print_info(site_info, title, type, size) if not info_only: download_urls([url], title, ext, size, output_dir, merge = merge) diff --git a/src/you_get/downloader/iqiyi.py b/src/you_get/extractor/iqiyi.py similarity index 79% rename from src/you_get/downloader/iqiyi.py rename to src/you_get/extractor/iqiyi.py index 5c951d1d..0bfec350 100644 --- a/src/you_get/downloader/iqiyi.py +++ b/src/you_get/extractor/iqiyi.py @@ -6,20 +6,23 @@ from ..common import * def iqiyi_download(url, output_dir = '.', merge = True, info_only = False): html = get_html(url) - - videoId = r1(r'data-player-videoid="([^"]+)"', html) - assert videoId - - info_url = 'http://cache.video.qiyi.com/v/%s' % videoId - info_xml = get_html(info_url) - + + tvid = r1(r'data-player-tvid="([^"]+)"', html) + videoid = r1(r'data-player-videoid="([^"]+)"', html) + assert tvid + assert videoid + + info_url = 'http://cache.video.qiyi.com/vj/%s/%s/' % (tvid, videoid) + info = get_html(info_url) + raise NotImplementedError('iqiyi') + from xml.dom.minidom import parseString doc = parseString(info_xml) title = doc.getElementsByTagName('title')[0].firstChild.nodeValue size = int(doc.getElementsByTagName('totalBytes')[0].firstChild.nodeValue) urls = [n.firstChild.nodeValue for n in doc.getElementsByTagName('file')] assert urls[0].endswith('.f4v'), urls[0] - + for i in range(len(urls)): temp_url = "http://data.video.qiyi.com/%s" % urls[i].split("/")[-1].split(".")[0] + ".ts" try: @@ -28,7 +31,7 @@ def iqiyi_download(url, output_dir = '.', merge = True, info_only = False): key = r1(r'key=(.*)', e.geturl()) assert key urls[i] += "?key=%s" % key - + print_info(site_info, title, 'flv', size) if not info_only: download_urls(urls, title, 'flv', size, output_dir = output_dir, merge = merge) diff --git a/src/you_get/downloader/joy.py b/src/you_get/extractor/joy.py similarity index 100% rename from src/you_get/downloader/joy.py rename to src/you_get/extractor/joy.py diff --git a/src/you_get/extractor/jpopsuki.py b/src/you_get/extractor/jpopsuki.py new file mode 100644 index 00000000..cf4ec052 --- /dev/null +++ b/src/you_get/extractor/jpopsuki.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python + +__all__ = ['jpopsuki_download'] + +from ..common import * + +def jpopsuki_download(url, output_dir='.', merge=True, info_only=False): + html = get_html(url, faker=True) + + title = r1(r'<meta name="title" content="([^"]*)"', html) + if title.endswith(' - JPopsuki TV'): + title = title[:-14] + + url = "http://jpopsuki.tv%s" % r1(r'<source src="([^"]*)"', html) + type, ext, size = url_info(url, faker=True) + + print_info(site_info, title, type, size) + if not info_only: + download_urls([url], title, ext, size, output_dir, merge=merge, faker=True) + +site_info = "JPopsuki.tv" +download = jpopsuki_download +download_playlist = playlist_not_supported('jpopsuki') diff --git a/src/you_get/downloader/khan.py b/src/you_get/extractor/khan.py similarity index 100% rename from src/you_get/downloader/khan.py rename to src/you_get/extractor/khan.py diff --git a/src/you_get/downloader/ku6.py b/src/you_get/extractor/ku6.py similarity index 100% rename from src/you_get/downloader/ku6.py rename to src/you_get/extractor/ku6.py diff --git a/src/you_get/extractor/letv.py b/src/you_get/extractor/letv.py new file mode 100644 index 00000000..54aa28b2 --- /dev/null +++ b/src/you_get/extractor/letv.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +__all__ = ['letv_download'] + +import json +import random +import xml.etree.ElementTree as ET +from ..common import * + +def get_timestamp(): + tn = random.random() + url = 'http://api.letv.com/time?tn={}'.format(tn) + result = get_content(url) + return json.loads(result)['stime'] + +def get_key(t): + for s in range(0, 8): + e = 1 & t + t >>= 1 + e <<= 31 + t += e + return t ^ 185025305 + +def video_info(vid): + tn = get_timestamp() + key = get_key(tn) + url = 'http://api.letv.com/mms/out/video/play?id={}&platid=1&splatid=101&format=1&tkey={}&domain=http%3A%2F%2Fwww.letv.com'.format(vid, key) + r = get_content(url, decoded=False) + xml_obj = ET.fromstring(r) + info = json.loads(xml_obj.find("playurl").text) + title = info.get('title') + urls = info.get('dispatch') + for k in urls.keys(): + url = urls[k][0] + break + url += '&termid=1&format=0&hwtype=un&ostype=Windows7&tag=letv&sign=letv&expect=1&pay=0&rateid={}'.format(k) + return url, title + +def letv_download_by_vid(vid, output_dir='.', merge=True, info_only=False): + url, title = video_info(vid) + _, _, size = url_info(url) + ext = 'flv' + print_info(site_info, title, ext, size) + if not info_only: + download_urls([url], title, ext, size, output_dir=output_dir, merge=merge) + +def letv_download(url, output_dir='.', merge=True, info_only=False): + if re.match(r'http://www.letv.com/ptv/vplay/(\d+).html', url): + vid = match1(url, r'http://www.letv.com/ptv/vplay/(\d+).html') + else: + html = get_content(url) + vid = match1(html, r'vid="(\d+)"') + letv_download_by_vid(vid, output_dir=output_dir, merge=merge, info_only=info_only) + + +site_info = "letv.com" +download = letv_download +download_playlist = playlist_not_supported('letv') diff --git a/src/you_get/extractor/magisto.py b/src/you_get/extractor/magisto.py new file mode 100644 index 00000000..77032518 --- /dev/null +++ b/src/you_get/extractor/magisto.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python + +__all__ = ['magisto_download'] + +from ..common import * + +def magisto_download(url, output_dir='.', merge=True, info_only=False): + html = get_html(url) + + title1 = r1(r'<meta name="twitter:title" content="([^"]*)"', html) + title2 = r1(r'<meta name="twitter:description" content="([^"]*)"', html) + video_hash = r1(r'http://www.magisto.com/video/([^/]+)', url) + title = "%s %s - %s" % (title1, title2, video_hash) + url = r1(r'<source type="[^"]+" src="([^"]*)"', html) + type, ext, size = url_info(url) + + print_info(site_info, title, type, size) + if not info_only: + download_urls([url], title, ext, size, output_dir, merge=merge) + +site_info = "Magisto.com" +download = magisto_download +download_playlist = playlist_not_supported('magisto') diff --git a/src/you_get/downloader/miomio.py b/src/you_get/extractor/miomio.py similarity index 61% rename from src/you_get/downloader/miomio.py rename to src/you_get/extractor/miomio.py index cbdad765..4c23c929 100644 --- a/src/you_get/downloader/miomio.py +++ b/src/you_get/extractor/miomio.py @@ -4,21 +4,24 @@ __all__ = ['miomio_download'] from ..common import * +from .sina import sina_download_by_vid from .tudou import tudou_download_by_id -from .youku import youku_download_by_id +from .youku import youku_download_by_vid def miomio_download(url, output_dir = '.', merge = True, info_only = False): html = get_html(url) - + title = r1(r'<meta name="description" content="([^"]*)"', html) flashvars = r1(r'flashvars="(type=[^"]*)"', html) - + t = r1(r'type=(\w+)', flashvars) id = r1(r'vid=([^"]+)', flashvars) if t == 'youku': - youku_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only) + youku_download_by_vid(id, title, output_dir=output_dir, merge=merge, info_only=info_only) elif t == 'tudou': - tudou_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only) + tudou_download_by_id(id, title, output_dir=output_dir, merge=merge, info_only=info_only) + elif t == 'sina': + sina_download_by_vid(id, title, output_dir=output_dir, merge=merge, info_only=info_only) else: raise NotImplementedError(flashvars) diff --git a/src/you_get/downloader/mixcloud.py b/src/you_get/extractor/mixcloud.py similarity index 84% rename from src/you_get/downloader/mixcloud.py rename to src/you_get/extractor/mixcloud.py index 0261f081..d6159e47 100644 --- a/src/you_get/downloader/mixcloud.py +++ b/src/you_get/extractor/mixcloud.py @@ -7,9 +7,9 @@ from ..common import * def mixcloud_download(url, output_dir = '.', merge = True, info_only = False): html = get_html(url) title = r1(r'<meta property="og:title" content="([^"]*)"', html) - preview_url = r1("data-preview-url=\"([^\"]+)\"", html) + preview_url = r1("m-preview=\"([^\"]+)\"", html) - url = re.sub(r'previews', r'cloudcasts/originals', preview_url) + url = re.sub(r'previews', r'c/originals', preview_url) for i in range(10, 30): url = re.sub(r'stream[^.]*', r'stream' + str(i), url) @@ -22,7 +22,7 @@ def mixcloud_download(url, output_dir = '.', merge = True, info_only = False): try: type except: - url = re.sub('cloudcasts/originals', r'cloudcasts/m4a/64', url) + url = re.sub('c/originals', r'c/m4a/64', url) url = re.sub('.mp3', '.m4a', url) for i in range(10, 30): url = re.sub(r'stream[^.]*', r'stream' + str(i), url) diff --git a/src/you_get/downloader/netease.py b/src/you_get/extractor/netease.py similarity index 91% rename from src/you_get/downloader/netease.py rename to src/you_get/extractor/netease.py index 863689f3..1321ba0f 100644 --- a/src/you_get/downloader/netease.py +++ b/src/you_get/extractor/netease.py @@ -6,8 +6,9 @@ from ..common import * def netease_download(url, output_dir = '.', merge = True, info_only = False): html = get_decoded_html(url) - + title = r1('movieDescription=\'([^\']+)\'', html) or r1('<title>(.+)', html) + if title[0] == ' ': title = title[1:] @@ -27,7 +28,7 @@ def netease_download(url, output_dir = '.', merge = True, info_only = False): ext = 'flv' else: - url = r1(r'["\'](.+)-list.m3u8["\']', html) + ".mp4" + url = (r1(r'["\'](.+)-list.m3u8["\']', html) or r1(r'["\'](.+).m3u8["\']', html)) + ".mp4" _, _, size = url_info(url) ext = 'mp4' diff --git a/src/you_get/downloader/nicovideo.py b/src/you_get/extractor/nicovideo.py similarity index 73% rename from src/you_get/downloader/nicovideo.py rename to src/you_get/extractor/nicovideo.py index 7d384f31..f99a54b8 100644 --- a/src/you_get/downloader/nicovideo.py +++ b/src/you_get/extractor/nicovideo.py @@ -6,12 +6,17 @@ from ..common import * def nicovideo_login(user, password): data = "current_form=login&mail=" + user +"&password=" + password + "&login_submit=Log+In" - response = request.urlopen(request.Request("https://secure.nicovideo.jp/secure/login?site=niconico", headers = fake_headers, data = data.encode('utf-8'))) + response = request.urlopen(request.Request("https://secure.nicovideo.jp/secure/login?site=niconico", headers=fake_headers, data=data.encode('utf-8'))) return response.headers -def nicovideo_download(url, output_dir = '.', merge = True, info_only = False): - request.install_opener(request.build_opener(request.HTTPCookieProcessor())) - +def nicovideo_download(url, output_dir='.', merge=True, info_only=False): + import ssl + ssl_context = request.HTTPSHandler( +context=ssl.SSLContext(ssl.PROTOCOL_TLSv1)) + cookie_handler = request.HTTPCookieProcessor() + opener = request.build_opener(ssl_context, cookie_handler) + request.install_opener(opener) + import netrc, getpass info = netrc.netrc().authenticators('nicovideo') if info is None: @@ -21,15 +26,15 @@ def nicovideo_download(url, output_dir = '.', merge = True, info_only = False): user, password = info[0], info[2] print("Logging in...") nicovideo_login(user, password) - + html = get_html(url) # necessary! title = unicodize(r1(r'([^<]+)', html)) - + api_html = get_html('http://www.nicovideo.jp/api/getflv?v=%s' % url.split('/')[-1]) real_url = parse.unquote(r1(r'url=([^&]+)&', api_html)) - + type, ext, size = url_info(real_url) - + print_info(site_info, title, type, size) if not info_only: download_urls([real_url], title, ext, size, output_dir, merge = merge) diff --git a/src/you_get/downloader/pptv.py b/src/you_get/extractor/pptv.py similarity index 95% rename from src/you_get/downloader/pptv.py rename to src/you_get/extractor/pptv.py index 4fd88e5b..b1d45edf 100644 --- a/src/you_get/downloader/pptv.py +++ b/src/you_get/extractor/pptv.py @@ -14,7 +14,7 @@ def pptv_download_by_id(id, title = None, output_dir = '.', merge = True, info_o key = r1(r']+>([^<>]+)', xml) rid = r1(r'rid="([^"]+)"', xml) title = r1(r'nm="([^"]+)"', xml) - pieces = re.findall(']+fs="(\d+)"', xml) numbers, fs = zip(*pieces) urls = ['http://%s/%s/%s?k=%s' % (host, i, rid, key) for i in numbers] total_size = sum(map(int, fs)) diff --git a/src/you_get/extractor/qq.py b/src/you_get/extractor/qq.py new file mode 100644 index 00000000..b91f0f95 --- /dev/null +++ b/src/you_get/extractor/qq.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +__all__ = ['qq_download'] + +from ..common import * + +def qq_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False): + xml = get_html('http://www.acfun.com/getinfo?vids=%s' % id) + from xml.dom.minidom import parseString + doc = parseString(xml) + doc_root = doc.getElementsByTagName('root')[0] + doc_vl = doc_root.getElementsByTagName('vl')[0] + doc_vi = doc_vl.getElementsByTagName('vi')[0] + fn = doc_vi.getElementsByTagName('fn')[0].firstChild.data + fclip = doc_vi.getElementsByTagName('fclip')[0].firstChild.data + if int(fclip) > 0: + fn = fn[:-4] + "." + fclip + fn[-4:] + fvkey = doc_vi.getElementsByTagName('fvkey')[0].firstChild.data + doc_ul = doc_vi.getElementsByTagName('ul') + url = doc_ul[0].getElementsByTagName('url')[0].firstChild.data + url = url + fn + '?vkey=' + fvkey + + _, ext, size = url_info(url) + + print_info(site_info, title, ext, size) + if not info_only: + download_urls([url], title, ext, size, output_dir=output_dir, merge=merge) + +def qq_download(url, output_dir = '.', merge = True, info_only = False): + if re.match(r'http://v.qq.com/([^\?]+)\?vid', url): + aid = r1(r'(.*)\.html', url) + vid = r1(r'http://v.qq.com/[^\?]+\?vid=(\w+)', url) + url = 'http://sns.video.qq.com/tvideo/fcgi-bin/video?vid=%s' % vid + + if re.match(r'http://y.qq.com/([^\?]+)\?vid', url): + vid = r1(r'http://y.qq.com/[^\?]+\?vid=(\w+)', url) + + url = "http://v.qq.com/page/%s.html" % vid + + r_url = r1(r'(.+?)', r'title:"([^"]+)"')[0].strip() + assert title + title = unescape_html(title) + title = escape_file_path(title) + + try: + id = vid + except: + id = r1(r'vid:"([^"]+)"', html) + + qq_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only) + +site_info = "QQ.com" +download = qq_download +download_playlist = playlist_not_supported('qq') diff --git a/src/you_get/downloader/sina.py b/src/you_get/extractor/sina.py similarity index 75% rename from src/you_get/downloader/sina.py rename to src/you_get/extractor/sina.py index af030a9e..8ab4931c 100644 --- a/src/you_get/downloader/sina.py +++ b/src/you_get/extractor/sina.py @@ -4,8 +4,19 @@ __all__ = ['sina_download', 'sina_download_by_vid', 'sina_download_by_vkey'] from ..common import * -def video_info(id): - xml = get_content('http://v.iask.com/v_play.php?vid=%s' % id, decoded=True) +from hashlib import md5 +from random import randint +from time import time + +def get_k(vid, rand): + t = str(int('{0:b}'.format(int(time()))[:-6], 2)) + return md5((vid + 'Z6prk18aWxP278cVAH' + t + rand).encode('utf-8')).hexdigest()[:16] + t + +def video_info(vid): + rand = "0.{0}{1}".format(randint(10000, 10000000), randint(10000, 10000000)) + url = 'http://v.iask.com/v_play.php?vid={0}&ran={1}&p=i&k={2}'.format(vid, rand, get_k(vid, rand)) + xml = get_content(url, headers=fake_headers, decoded=True) + urls = re.findall(r'(?:)?', xml) name = match1(xml, r'(?:)?') vstr = match1(xml, r'(?:)?') @@ -15,7 +26,7 @@ def sina_download_by_vid(vid, title=None, output_dir='.', merge=True, info_only= """Downloads a Sina video by its unique vid. http://video.sina.com.cn/ """ - + urls, name, vstr = video_info(vid) title = title or name assert title @@ -23,7 +34,7 @@ def sina_download_by_vid(vid, title=None, output_dir='.', merge=True, info_only= for url in urls: _, _, temp = url_info(url) size += temp - + print_info(site_info, title, 'flv', size) if not info_only: download_urls(urls, title, 'flv', size, output_dir = output_dir, merge = merge) @@ -32,10 +43,10 @@ def sina_download_by_vkey(vkey, title=None, output_dir='.', merge=True, info_onl """Downloads a Sina video by its unique vkey. http://video.sina.com/ """ - + url = 'http://video.sina.com/v/flvideo/%s_0.flv' % vkey type, ext, size = url_info(url) - + print_info(site_info, title, 'flv', size) if not info_only: download_urls([url], title, 'flv', size, output_dir = output_dir, merge = merge) @@ -43,7 +54,7 @@ def sina_download_by_vkey(vkey, title=None, output_dir='.', merge=True, info_onl def sina_download(url, output_dir='.', merge=True, info_only=False): """Downloads Sina videos by URL. """ - + vid = match1(url, r'vid=(\d+)') if vid is None: video_page = get_content(url) @@ -51,9 +62,10 @@ def sina_download(url, output_dir='.', merge=True, info_only=False): if hd_vid == '0': vids = match1(video_page, r'[^\w]vid\s*:\s*\'([^\']+)\'').split('|') vid = vids[-1] - + if vid: - sina_download_by_vid(vid, output_dir=output_dir, merge=merge, info_only=info_only) + title = match1(video_page, r'title\s*:\s*\'([^\']+)\'') + sina_download_by_vid(vid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) else: vkey = match1(video_page, r'vkey\s*:\s*"([^"]+)"') title = match1(video_page, r'title\s*:\s*"([^"]+)"') diff --git a/src/you_get/downloader/sohu.py b/src/you_get/extractor/sohu.py similarity index 71% rename from src/you_get/downloader/sohu.py rename to src/you_get/extractor/sohu.py index 4400836a..9a1e109b 100644 --- a/src/you_get/downloader/sohu.py +++ b/src/you_get/extractor/sohu.py @@ -12,9 +12,22 @@ def real_url(host, prot, file, new): return '%s%s?key=%s' % (start[:-1], new, key) def sohu_download(url, output_dir = '.', merge = True, info_only = False): - vid = r1('vid\s*=\s*"(\d+)"', get_html(url)) - - if vid: + if re.match(r'http://share.vrs.sohu.com', url): + vid = r1('id=(\d+)', url) + else: + html = get_html(url) + vid = r1(r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?', html) + assert vid + + # Open Sogou proxy if required + if get_sogou_proxy() is not None: + server = sogou_proxy_server(get_sogou_proxy(), ostream=open(os.devnull, 'w')) + server_thread = threading.Thread(target=server.serve_forever) + server_thread.daemon = True + server_thread.start() + set_proxy(server.server_address) + + if re.match(r'http://tv.sohu.com/', url): data = json.loads(get_decoded_html('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % vid)) for qtyp in ["oriVid","superVid","highVid" ,"norVid","relativeId"]: hqvid = data['data'][qtyp] @@ -31,10 +44,9 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False): for file, new in zip(data['clipsURL'], data['su']): urls.append(real_url(host, prot, file, new)) assert data['clipsURL'][0].endswith('.mp4') - + else: - vid = r1('vid\s*=\s*\'(\d+)\'', get_html(url)) - data = json.loads(get_decoded_html('http://my.tv.sohu.com/videinfo.jhtml?m=viewnew&vid=%s' % vid)) + data = json.loads(get_decoded_html('http://my.tv.sohu.com/play/videonew.do?vid=%s&referer=http://my.tv.sohu.com' % vid)) host = data['allot'] prot = data['prot'] urls = [] @@ -45,7 +57,12 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False): for file, new in zip(data['clipsURL'], data['su']): urls.append(real_url(host, prot, file, new)) assert data['clipsURL'][0].endswith('.mp4') - + + # Close Sogou proxy if required + if get_sogou_proxy() is not None: + server.shutdown() + unset_proxy() + print_info(site_info, title, 'mp4', size) if not info_only: download_urls(urls, title, 'mp4', size, output_dir, refer = url, merge = merge) diff --git a/src/you_get/downloader/songtaste.py b/src/you_get/extractor/songtaste.py similarity index 100% rename from src/you_get/downloader/songtaste.py rename to src/you_get/extractor/songtaste.py diff --git a/src/you_get/downloader/soundcloud.py b/src/you_get/extractor/soundcloud.py similarity index 100% rename from src/you_get/downloader/soundcloud.py rename to src/you_get/extractor/soundcloud.py diff --git a/src/you_get/extractor/ted.py b/src/you_get/extractor/ted.py new file mode 100644 index 00000000..0c2d2c83 --- /dev/null +++ b/src/you_get/extractor/ted.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +__all__ = ['ted_download'] + +from ..common import * +import json + +def ted_download(url, output_dir='.', merge=True, info_only=False): + html = get_html(url) + metadata = json.loads(match1(html, r'({"talks"(.*)})\)')) + title = metadata['talks'][0]['title'] + nativeDownloads = metadata['talks'][0]['nativeDownloads'] + for quality in ['high', 'medium', 'low']: + if quality in nativeDownloads: + url = nativeDownloads[quality] + type, ext, size = url_info(url) + print_info(site_info, title, type, size) + if not info_only: + download_urls([url], title, ext, size, output_dir, merge=merge) + break + +site_info = "TED.com" +download = ted_download +download_playlist = playlist_not_supported('ted') diff --git a/src/you_get/extractor/theplatform.py b/src/you_get/extractor/theplatform.py new file mode 100644 index 00000000..2938c459 --- /dev/null +++ b/src/you_get/extractor/theplatform.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +from ..common import * + +def theplatform_download_by_pid(pid, title, output_dir='.', merge=True, info_only=False): + smil_url = "http://link.theplatform.com/s/dJ5BDC/%s/meta.smil?format=smil&mbr=true" % pid + smil = get_content(smil_url) + smil_base = unescape_html(match1(smil, r' 0: vids.append({"k": data[k][0]["k"], "size": data[k][0]["size"]}) temp = max(vids, key=lambda x:x["size"]) @@ -25,10 +25,10 @@ def tudou_download_by_iid(iid, title, output_dir = '.', merge = True, info_only if not info_only: download_urls([url], title, ext, size, output_dir = output_dir, merge = merge) -def tudou_download_by_id(id, title, output_dir = '.', merge = True, info_only = False): +def tudou_download_by_id(id, title, output_dir = '.', merge = True, info_only = False): html = get_html('http://www.tudou.com/programs/view/%s/' % id) - - iid = r1(r'iid\s*[:=]\s*(\S+)', html) + + iid = r1(r'iid\s*[:=]\s*(\S+)', html) title = r1(r'kw\s*[:=]\s*[\'\"]([^\']+?)[\'\"]', html) tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only) @@ -37,22 +37,22 @@ def tudou_download(url, output_dir = '.', merge = True, info_only = False): id = r1(r'http://www.tudou.com/v/([^/]+)/', url) if id: return tudou_download_by_id(id, title="", info_only=info_only) - + html = get_decoded_html(url) - + title = r1(r'kw\s*[:=]\s*[\'\"]([^\']+?)[\'\"]', html) assert title title = unescape_html(title) - + vcode = r1(r'vcode\s*[:=]\s*\'([^\']+)\'', html) if vcode: - from .youku import youku_download_by_id - return youku_download_by_id(vcode, title, output_dir = output_dir, merge = merge, info_only = info_only) - + from .youku import youku_download_by_vid + return youku_download_by_vid(vcode, title, output_dir = output_dir, merge = merge, info_only = info_only) + iid = r1(r'iid\s*[:=]\s*(\d+)', html) if not iid: return tudou_download_playlist(url, output_dir, merge, info_only) - + tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only) def parse_playlist(url): @@ -81,4 +81,4 @@ def tudou_download_playlist(url, output_dir = '.', merge = True, info_only = Fal site_info = "Tudou.com" download = tudou_download -download_playlist = tudou_download_playlist \ No newline at end of file +download_playlist = tudou_download_playlist diff --git a/src/you_get/downloader/tumblr.py b/src/you_get/extractor/tumblr.py similarity index 100% rename from src/you_get/downloader/tumblr.py rename to src/you_get/extractor/tumblr.py diff --git a/src/you_get/downloader/vid48.py b/src/you_get/extractor/vid48.py similarity index 100% rename from src/you_get/downloader/vid48.py rename to src/you_get/extractor/vid48.py diff --git a/src/you_get/downloader/vimeo.py b/src/you_get/extractor/vimeo.py similarity index 59% rename from src/you_get/downloader/vimeo.py rename to src/you_get/extractor/vimeo.py index 1a18dfbb..60611f74 100644 --- a/src/you_get/downloader/vimeo.py +++ b/src/you_get/extractor/vimeo.py @@ -5,19 +5,16 @@ __all__ = ['vimeo_download', 'vimeo_download_by_id'] from ..common import * def vimeo_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False): - html = get_html('http://vimeo.com/%s' % id, faker = True) + video_page = get_content('http://player.vimeo.com/video/%s' % id, headers=fake_headers) + title = r1(r'([^<]+)', video_page) + info = dict(re.findall(r'"([^"]+)":\{[^{]+"url":"([^"]+)"', video_page)) + for quality in ['hd', 'sd', 'mobile']: + if quality in info: + url = info[quality] + break + assert url - signature = r1(r'"signature":"([^"]+)"', html) - timestamp = r1(r'"timestamp":([^,]+)', html) - hd = r1(r',"hd":(\d+),', html) - - title = r1(r'"title":"([^"]+)"', html) - title = escape_file_path(title) - - url = 'http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s' % (id, signature, timestamp) - if hd == "1": - url += '&quality=hd' - type, ext, size = url_info(url, faker = True) + type, ext, size = url_info(url, faker=True) print_info(site_info, title, type, size) if not info_only: diff --git a/src/you_get/extractor/vine.py b/src/you_get/extractor/vine.py new file mode 100644 index 00000000..5bcc23b5 --- /dev/null +++ b/src/you_get/extractor/vine.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +__all__ = ['vine_download'] + +from ..common import * + +def vine_download(url, output_dir='.', merge=True, info_only=False): + html = get_html(url) + + vid = r1(r'vine.co/v/([^/]+)/', html) + title1 = r1(r' 0: - with open(output_dir + "/" + file_name.replace('/', '-').replace('?', '-') + '.lrc', 'w', encoding='utf-8') as x: + with open(output_dir + "/" + filename + '.lrc', 'w', encoding='utf-8') as x: x.write(lrc) def xiami_download_pic(pic_url, file_name, output_dir): @@ -50,7 +51,10 @@ def xiami_download_song(sid, output_dir = '.', merge = True, info_only = False): album_name = i.getElementsByTagName("album_name")[0].firstChild.nodeValue song_title = i.getElementsByTagName("title")[0].firstChild.nodeValue url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue + try: + lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue + except: + pass type, ext, size = url_info(url, faker = True) if not ext: ext = 'mp3' @@ -78,7 +82,10 @@ def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = album_name = i.getElementsByTagName("album_name")[0].firstChild.nodeValue song_title = i.getElementsByTagName("title")[0].firstChild.nodeValue url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue + try: + lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue + except: + pass type, ext, size = url_info(url, faker = True) if not ext: ext = 'mp3' @@ -107,7 +114,10 @@ def xiami_download_album(aid, output_dir = '.', merge = True, info_only = False) for i in tracks: song_title = i.getElementsByTagName("title")[0].firstChild.nodeValue url = location_dec(i.getElementsByTagName("location")[0].firstChild.nodeValue) - lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue + try: + lrc_url = i.getElementsByTagName("lyric")[0].firstChild.nodeValue + except: + pass if not pic_exist: pic_url = i.getElementsByTagName("pic")[0].firstChild.nodeValue type, ext, size = url_info(url, faker = True) diff --git a/src/you_get/downloader/yinyuetai.py b/src/you_get/extractor/yinyuetai.py similarity index 100% rename from src/you_get/downloader/yinyuetai.py rename to src/you_get/extractor/yinyuetai.py diff --git a/src/you_get/extractor/youku.py b/src/you_get/extractor/youku.py new file mode 100644 index 00000000..7ae92e73 --- /dev/null +++ b/src/you_get/extractor/youku.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from ..common import * + +class Youku(VideoExtractor): + name = "优酷 (Youku)" + + stream_types = [ + {'id': 'hd3', 'container': 'flv', 'video_profile': '1080P'}, + {'id': 'hd2', 'container': 'flv', 'video_profile': '超清'}, + {'id': 'mp4', 'container': 'mp4', 'video_profile': '高清'}, + {'id': 'flv', 'container': 'flv', 'video_profile': '标清'}, + {'id': '3gphd', 'container': '3gp', 'video_profile': '高清(3GP)'}, + ] + + def __init__(self, *args): + super().__init__(args) + + def get_vid_from_url(url): + """Extracts video ID from URL. + """ + patterns = [ + 'youku.com/v_show/id_([\w=]+)', + 'player.youku.com/player.php/sid/([\w=]+)/v.swf', + 'loader\.swf\?VideoIDS=([\w=]+)', + ] + matches = match1(url, *patterns) + if matches: + return matches[0] + else: + return None + + def parse_m3u8(m3u8): + return re.findall(r'(http://[^?]+)\?ts_start=0', m3u8) + + def prepare(self, **kwargs): + assert self.url or self.vid + if self.url and not self.vid: + self.vid = __class__.get_vid_from_url(self.url) + + meta = json.loads(get_html('http://v.youku.com/player/getPlayList/VideoIDS/%s' % self.vid)) + metadata0 = meta['data'][0] + + self.title = metadata0['title'] + + for stream_type in self.stream_types: + if stream_type['id'] in metadata0['streamsizes']: + stream_id = stream_type['id'] + stream_size = int(metadata0['streamsizes'][stream_id]) + self.streams[stream_id] = {'container': stream_type['container'], 'video_profile': stream_type['video_profile'], 'size': stream_size} + + def extract(self, **kwargs): + if 'stream_id' in kwargs and kwargs['stream_id']: + # Extract the stream + stream_id = kwargs['stream_id'] + else: + # Extract stream with the best quality + stream_id = self.streams_sorted[0]['id'] + + m3u8_url = "http://v.youku.com/player/getM3U8/vid/{vid}/type/{stream_id}/video.m3u8".format(vid=self.vid, stream_id=stream_id) + m3u8 = get_html(m3u8_url) + if not m3u8: + log.w('[Warning] This video can only be streamed within Mainland China!') + log.w('Use \'-y\' to specify a proxy server for extracting stream data.\n') + + self.streams[stream_id]['src'] = __class__.parse_m3u8(m3u8) + +site = Youku() +download = site.download_by_url +download_playlist = playlist_not_supported('youku') + +youku_download_by_vid = site.download_by_vid +# Used by: acfun.py bilibili.py miomio.py tudou.py diff --git a/src/you_get/downloader/youtube.py b/src/you_get/extractor/youtube.py similarity index 62% rename from src/you_get/downloader/youtube.py rename to src/you_get/extractor/youtube.py index 51fbb07f..cdf3e512 100644 --- a/src/you_get/downloader/youtube.py +++ b/src/you_get/extractor/youtube.py @@ -5,36 +5,37 @@ __all__ = ['youtube_download', 'youtube_download_by_id'] from ..common import * # YouTube media encoding options, in descending quality order. -# taken from http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs, 3/22/2013. +# taken from http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs, 2/14/2014. yt_codecs = [ {'itag': 38, 'container': 'MP4', 'video_resolution': '3072p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3.5-5', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, + #{'itag': 85, 'container': 'MP4', 'video_resolution': '1080p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '3-4', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, {'itag': 46, 'container': 'WebM', 'video_resolution': '1080p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, {'itag': 37, 'container': 'MP4', 'video_resolution': '1080p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3-4.3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, - {'itag': 102, 'container': '', 'video_resolution': '', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '2', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, - {'itag': 45, 'container': 'WebM', 'video_resolution': '720p', 'video_encoding': '', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': '', 'audio_bitrate': ''}, - {'itag': 22, 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, - {'itag': 84, 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '152'}, - {'itag': 120, 'container': 'FLV', 'video_resolution': '720p', 'video_encoding': 'AVC', 'video_profile': 'Main@L3.1', 'video_bitrate': '2', 'audio_encoding': 'AAC', 'audio_bitrate': '128'}, - {'itag': 85, 'container': 'MP4', 'video_resolution': '520p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '152'}, + #{'itag': 102, 'container': 'WebM', 'video_resolution': '720p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, + {'itag': 45, 'container': 'WebM', 'video_resolution': '720p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '2', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, + #{'itag': 84, 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '2-3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, + {'itag': 22, 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '2-3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, + {'itag': 120, 'container': 'FLV', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': 'Main@L3.1', 'video_bitrate': '2', 'audio_encoding': 'AAC', 'audio_bitrate': '128'}, {'itag': 44, 'container': 'WebM', 'video_resolution': '480p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '1', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'}, {'itag': 35, 'container': 'FLV', 'video_resolution': '480p', 'video_encoding': 'H.264', 'video_profile': 'Main', 'video_bitrate': '0.8-1', 'audio_encoding': 'AAC', 'audio_bitrate': '128'}, - {'itag': 101, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, - {'itag': 100, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'}, + #{'itag': 101, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, + #{'itag': 100, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'}, {'itag': 43, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '0.5', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'}, {'itag': 34, 'container': 'FLV', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': 'Main', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '128'}, - {'itag': 82, 'container': 'MP4', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, + #{'itag': 82, 'container': 'MP4', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, {'itag': 18, 'container': 'MP4', 'video_resolution': '270p/360p', 'video_encoding': 'H.264', 'video_profile': 'Baseline', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, {'itag': 6, 'container': 'FLV', 'video_resolution': '270p', 'video_encoding': 'Sorenson H.263', 'video_profile': '', 'video_bitrate': '0.8', 'audio_encoding': 'MP3', 'audio_bitrate': '64'}, - {'itag': 83, 'container': 'MP4', 'video_resolution': '240p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, + #{'itag': 83, 'container': 'MP4', 'video_resolution': '240p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'}, {'itag': 13, 'container': '3GP', 'video_resolution': '', 'video_encoding': 'MPEG-4 Visual', 'video_profile': '', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': ''}, {'itag': 5, 'container': 'FLV', 'video_resolution': '240p', 'video_encoding': 'Sorenson H.263', 'video_profile': '', 'video_bitrate': '0.25', 'audio_encoding': 'MP3', 'audio_bitrate': '64'}, - {'itag': 36, 'container': '3GP', 'video_resolution': '240p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.17', 'audio_encoding': 'AAC', 'audio_bitrate': '38'}, + {'itag': 36, 'container': '3GP', 'video_resolution': '240p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.175', 'audio_encoding': 'AAC', 'audio_bitrate': '36'}, {'itag': 17, 'container': '3GP', 'video_resolution': '144p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.05', 'audio_encoding': 'AAC', 'audio_bitrate': '24'}, ] def decipher(js, s): def tr_js(code): code = re.sub(r'function', r'def', code) + code = re.sub(r'\$', '_dollar', code) code = re.sub(r'\{', r':\n\t', code) code = re.sub(r'\}', r'\n', code) code = re.sub(r'var\s+', r'', code) @@ -44,73 +45,96 @@ def decipher(js, s): code = re.sub(r'(\w+).slice\((\d+)\)', r'\1[\2:]', code) code = re.sub(r'(\w+).split\(""\)', r'list(\1)', code) return code - - f1 = match1(js, r'g.sig\|\|(\w+)\(g.s\)') - f1def = match1(js, r'(function %s\(\w+\)\{[^\{]+\})' % f1) + + f1 = match1(js, r'\w+\.sig\|\|([$\w]+)\(\w+\.\w+\)') + f1def = match1(js, r'(function %s\(\w+\)\{[^\{]+\})' % re.escape(f1)) code = tr_js(f1def) - f2 = match1(f1def, r'(\w+)\(\w+,\d+\)') + f2 = match1(f1def, r'([$\w]+)\(\w+,\d+\)') if f2 is not None: - f2def = match1(js, r'(function %s\(\w+,\w+\)\{[^\{]+\})' % f2) + f2e = re.escape(f2) + f2def = match1(js, r'(function %s\(\w+,\w+\)\{[^\{]+\})' % f2e) + f2 = re.sub(r'\$', '_dollar', f2) code = code + 'global %s\n' % f2 + tr_js(f2def) - - code = code + 'sig=%s(s)' % f1 + + code = code + 'sig=%s(s)' % re.sub(r'\$', '_dollar', f1) exec(code, globals(), locals()) return locals()['sig'] def youtube_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False): """Downloads a YouTube video by its unique id. """ - + raw_video_info = get_content('http://www.youtube.com/get_video_info?video_id=%s' % id) video_info = parse.parse_qs(raw_video_info) - + if video_info['status'] == ['ok'] and ('use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']): title = parse.unquote_plus(video_info['title'][0]) stream_list = parse.parse_qs(raw_video_info)['url_encoded_fmt_stream_map'][0].split(',') - + else: # Parse video page when video_info is not usable. video_page = get_content('http://www.youtube.com/watch?v=%s' % id) - ytplayer_config = json.loads(match1(video_page, r'ytplayer.config\s*=\s*([^\n]+);')) - + ytplayer_config = json.loads(match1(video_page, r'ytplayer.config\s*=\s*([^\n]+});')) + title = ytplayer_config['args']['title'] stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') - + html5player = ytplayer_config['assets']['js'] - + if html5player[0:2] == '//': + html5player = 'http:' + html5player + streams = { parse.parse_qs(stream)['itag'][0] : parse.parse_qs(stream) for stream in stream_list } - + for codec in yt_codecs: itag = str(codec['itag']) if itag in streams: download_stream = streams[itag] break - + url = download_stream['url'][0] if 'sig' in download_stream: sig = download_stream['sig'][0] - else: + url = '%s&signature=%s' % (url, sig) + elif 's' in download_stream: js = get_content(html5player) sig = decipher(js, download_stream['s'][0]) - url = '%s&signature=%s' % (url, sig) - + url = '%s&signature=%s' % (url, sig) + type, ext, size = url_info(url) - + print_info(site_info, title, type, size) if not info_only: download_urls([url], title, ext, size, output_dir, merge = merge) +def youtube_list_download_by_id(list_id, title=None, output_dir='.', merge=True, info_only=False): + """Downloads a YouTube video list by its unique id. + """ + + video_page = get_content('http://www.youtube.com/playlist?list=%s' % list_id) + ids = set(re.findall(r''): '-', + ord('['): '(', + ord(']'): ')', + }) + else: + # *nix + if os == 'Darwin': + # Mac OS HFS+ + text = text.translate({ + ord(':'): '-', + }) + + # Remove leading . + if text.startswith("."): + text = text[1:] + + text = text[:82] # Trim to 82 Unicode characters long + return text diff --git a/src/you_get/util/log.py b/src/you_get/util/log.py new file mode 100644 index 00000000..356dea76 --- /dev/null +++ b/src/you_get/util/log.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python + +from ..version import __name__ + +import os, sys, subprocess + +# Is terminal ANSI/VT100 compatible +if os.getenv('TERM') in ( + 'xterm', + 'vt100', + 'linux', + 'eterm-color', + 'screen', + ): + has_colors = True +else: + try: + # Eshell + ppid = os.getppid() + has_colors = (subprocess.getoutput('ps -p %d -ocomm=' % ppid) + == 'emacs') + except: + has_colors = False + +# ANSI/VT100 escape code +# http://en.wikipedia.org/wiki/ANSI_escape_code +colors = { + 'none': '', + 'reset': '\033[0m', + + 'black': '\033[30m', + 'bold-black': '\033[30;1m', + 'dark-gray': '\033[90m', + 'bold-dark-gray': '\033[90;1m', + + 'red': '\033[31m', + 'bold-red': '\033[31;1m', + 'light-red': '\033[91m', + 'bold-light-red': '\033[91;1m', + + 'green': '\033[32m', + 'bold-green': '\033[32;1m', + 'light-green': '\033[92m', + 'bold-light-green': '\033[92;1m', + + 'yellow': '\033[33m', + 'bold-yellow': '\033[33;1m', + 'light-yellow': '\033[93m', + 'bold-light-yellow': '\033[93;1m', + + 'blue': '\033[34m', + 'bold-blue': '\033[34;1m', + 'light-blue': '\033[94m', + 'bold-light-blue': '\033[94;1m', + + 'magenta': '\033[35m', + 'bold-magenta': '\033[35;1m', + 'light-magenta': '\033[95m', + 'bold-light-magenta': '\033[95;1m', + + 'cyan': '\033[36m', + 'bold-cyan': '\033[36;1m', + 'light-cyan': '\033[96m', + 'bold-light-cyan': '\033[96;1m', + + 'light-gray': '\033[37m', + 'bold-light-gray': '\033[37;1m', + 'white': '\033[97m', + 'bold-white': '\033[97;1m', +} + +def underlined(text): + """Returns an underlined text. + """ + return "\33[4m%s\33[24m" % text if has_colors else text + +def println(text, color=None, ostream=sys.stdout): + """Prints a text line to stream. + """ + if has_colors and color in colors: + ostream.write("{0}{1}{2}\n".format(colors[color], text, colors['reset'])) + else: + ostream.write("{0}\n".format(text)) + +def printlog(message, color=None, ostream=sys.stderr): + """Prints a log message to stream. + """ + if has_colors and color in colors: + ostream.write("{0}{1}: {2}{3}\n".format(colors[color], __name__, message, colors['reset'])) + else: + ostream.write("{0}: {1}\n".format(__name__, message)) + +def i(message, ostream=sys.stderr): + """Sends an info log message. + """ + printlog(message, + None, + ostream=ostream) + +def d(message, ostream=sys.stderr): + """Sends a debug log message. + """ + printlog(message, + 'blue' if has_colors else None, + ostream=ostream) + +def w(message, ostream=sys.stderr): + """Sends a warning log message. + """ + printlog(message, + 'yellow' if has_colors else None, + ostream=ostream) + +def e(message, ostream=sys.stderr): + """Sends an error log message. + """ + printlog(message, + 'bold-yellow' if has_colors else None, + ostream=ostream) + +def wtf(message, ostream=sys.stderr): + """What a Terrible Failure. + """ + printlog(message, + 'bold-red' if has_colors else None, + ostream=ostream) diff --git a/src/you_get/util/sogou_proxy.py b/src/you_get/util/sogou_proxy.py new file mode 100644 index 00000000..01ffb572 --- /dev/null +++ b/src/you_get/util/sogou_proxy.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +# Original code from: +# http://xiaoxia.org/2011/03/26/using-python-to-write-a-local-sogou-proxy-server-procedures/ + +from . import log + +from http.client import HTTPResponse +from http.server import BaseHTTPRequestHandler, HTTPServer +from socketserver import ThreadingMixIn +from threading import Thread +import random, socket, struct, sys, time + +def sogou_proxy_server( + host=("0.0.0.0", 0), + network_env='CERNET', + ostream=sys.stderr): + """Returns a Sogou proxy server object. + """ + + x_sogou_auth = '9CD285F1E7ADB0BD403C22AD1D545F40/30/853edc6d49ba4e27' + proxy_host = 'h0.cnc.bj.ie.sogou.com' + proxy_port = 80 + + def sogou_hash(t, host): + s = (t + host + 'SogouExplorerProxy').encode('ascii') + code = len(s) + dwords = int(len(s) / 4) + rest = len(s) % 4 + v = struct.unpack(str(dwords) + 'i' + str(rest) + 's', s) + for vv in v: + if type(vv) != bytes: + a = (vv & 0xFFFF) + b = (vv >> 16) + code += a + code = code ^ (((code << 5) ^ b) << 0xb) + # To avoid overflows + code &= 0xffffffff + code += code >> 0xb + if rest == 3: + code += s[len(s) - 2] * 256 + s[len(s) - 3] + code = code ^ ((code ^ (s[len(s) - 1]) * 4) << 0x10) + code &= 0xffffffff + code += code >> 0xb + elif rest == 2: + code += (s[len(s) - 1]) * 256 + (s[len(s) - 2]) + code ^= code << 0xb + code &= 0xffffffff + code += code >> 0x11 + elif rest == 1: + code += s[len(s) - 1] + code ^= code << 0xa + code &= 0xffffffff + code += code >> 0x1 + code ^= code * 8 + code &= 0xffffffff + code += code >> 5 + code ^= code << 4 + code = code & 0xffffffff + code += code >> 0x11 + code ^= code << 0x19 + code = code & 0xffffffff + code += code >> 6 + code = code & 0xffffffff + return hex(code)[2:].rstrip('L').zfill(8) + + class Handler(BaseHTTPRequestHandler): + _socket = None + def do_proxy(self): + try: + if self._socket is None: + self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self._socket.connect((proxy_host, proxy_port)) + self._socket.send(self.requestline.encode('ascii') + b'\r\n') + log.d(self.requestline, ostream) + + # Add Sogou Verification Tags + self.headers['X-Sogou-Auth'] = x_sogou_auth + t = hex(int(time.time()))[2:].rstrip('L').zfill(8) + self.headers['X-Sogou-Tag'] = sogou_hash(t, self.headers['Host']) + self.headers['X-Sogou-Timestamp'] = t + self._socket.send(str(self.headers).encode('ascii') + b'\r\n') + + # Send POST data + if self.command == 'POST': + self._socket.send(self.rfile.read(int(self.headers['Content-Length']))) + response = HTTPResponse(self._socket, method=self.command) + response.begin() + + # Response + status = 'HTTP/1.1 %s %s' % (response.status, response.reason) + self.wfile.write(status.encode('ascii') + b'\r\n') + h = '' + for hh, vv in response.getheaders(): + if hh.upper() != 'TRANSFER-ENCODING': + h += hh + ': ' + vv + '\r\n' + self.wfile.write(h.encode('ascii') + b'\r\n') + while True: + response_data = response.read(8192) + if len(response_data) == 0: + break + self.wfile.write(response_data) + + except socket.error: + log.e('Socket error for ' + self.requestline, ostream) + + def do_POST(self): + self.do_proxy() + + def do_GET(self): + self.do_proxy() + + class ThreadingHTTPServer(ThreadingMixIn, HTTPServer): + pass + + # Server starts + log.printlog('Sogou Proxy Mini-Server', color='bold-green', ostream=ostream) + + try: + server = ThreadingHTTPServer(host, Handler) + except Exception as ex: + log.wtf("Socket error: %s" % ex, ostream) + exit(1) + host = server.server_address + + if network_env.upper() == 'CERNET': + proxy_host = 'h%s.edu.bj.ie.sogou.com' % random.randint(0, 10) + elif network_env.upper() == 'CTCNET': + proxy_host = 'h%s.ctc.bj.ie.sogou.com' % random.randint(0, 3) + elif network_env.upper() == 'CNCNET': + proxy_host = 'h%s.cnc.bj.ie.sogou.com' % random.randint(0, 3) + elif network_env.upper() == 'DXT': + proxy_host = 'h%s.dxt.bj.ie.sogou.com' % random.randint(0, 10) + else: + proxy_host = 'h%s.edu.bj.ie.sogou.com' % random.randint(0, 10) + + log.i('Remote host: %s' % log.underlined(proxy_host), ostream) + log.i('Proxy server running on %s' % + log.underlined("%s:%s" % host), ostream) + + return server diff --git a/src/you_get/util/strings.py b/src/you_get/util/strings.py new file mode 100644 index 00000000..7e74f35e --- /dev/null +++ b/src/you_get/util/strings.py @@ -0,0 +1,25 @@ +try: + # py 3.4 + from html import unescape as unescape_html +except ImportError: + import re + from html.entities import entitydefs + + def unescape_html(string): + '''HTML entity decode''' + string = re.sub(r'&#[^;]+;', _sharp2uni, string) + string = re.sub(r'&[^;]+;', lambda m: entitydefs[m.group(0)[1:-1]], string) + return string + + def _sharp2uni(m): + '''&#...; ==> unicode''' + s = m.group(0)[2:].rstrip(';;') + if s.startswith('x'): + return chr(int('0'+s, 16)) + else: + return chr(int(s)) + +from .fs import legitimize + +def get_filename(htmlstring): + return legitimize(unescape_html(htmlstring)) diff --git a/src/you_get/version.py b/src/you_get/version.py index 43c2747b..b60d6e85 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,5 +1,6 @@ #!/usr/bin/env python __all__ = ['__version__', '__date__'] -__version__ = '0.3.21' -__date__ = '2013-08-17' +__name__ = 'you-get' +__version__ = '0.3.30dev' +__date__ = '2014-06-24' diff --git a/tests/test.py b/tests/test.py index 641878ef..0d51f86e 100644 --- a/tests/test.py +++ b/tests/test.py @@ -4,37 +4,44 @@ import unittest from you_get import * -from you_get.downloader.__main__ import url_to_module +from you_get.extractor.__main__ import url_to_module def test_urls(urls): for url in urls: - url_to_module(url).download(url, info_only = True) + url_to_module(url)[0].download(url, info_only = True) class YouGetTests(unittest.TestCase): - + def test_freesound(self): test_urls([ "http://www.freesound.org/people/Corsica_S/sounds/184419/", ]) - + + def test_magisto(self): + test_urls([ + "http://www.magisto.com/album/video/f3x9AAQORAkfDnIFDA", + ]) + def test_mixcloud(self): test_urls([ "http://www.mixcloud.com/beatbopz/beat-bopz-disco-mix/", "http://www.mixcloud.com/DJVadim/north-america-are-you-ready/", ]) - + + def test_ted(self): + test_urls([ + "http://www.ted.com/talks/jennifer_lin_improvs_piano_magic.html", + "http://www.ted.com/talks/derek_paravicini_and_adam_ockelford_in_the_key_of_genius.html", + ]) + def test_vimeo(self): test_urls([ "http://vimeo.com/56810854", ]) - - def test_xiami(self): - test_urls([ - "http://www.xiami.com/song/1769835121", - ]) - + def test_youtube(self): test_urls([ "http://www.youtube.com/watch?v=pzKerr0JIPA", "http://youtu.be/pzKerr0JIPA", + "http://www.youtube.com/attribution_link?u=/watch?v%3DldAKIzq7bvs%26feature%3Dshare" ]) diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 00000000..0b7b0231 --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python + +import unittest + +from you_get.util import * + +class TestUtil(unittest.TestCase): + def test_legitimize(self): + self.assertEqual(legitimize("1*2", os="Linux"), "1*2") + self.assertEqual(legitimize("1*2", os="Darwin"), "1*2") + self.assertEqual(legitimize("1*2", os="Windows"), "1-2") diff --git a/you-get b/you-get index 86b44109..8bdc77c7 100755 --- a/you-get +++ b/you-get @@ -4,7 +4,7 @@ import os, sys __path__ = os.path.dirname(os.path.realpath(__file__)) __srcdir__ = 'src' sys.path.insert(1, os.path.join(__path__, __srcdir__)) -from you_get.downloader import main +from you_get.extractor import main if __name__ == '__main__': main() diff --git a/you-get.json b/you-get.json index 92114cff..dc988868 100644 --- a/you-get.json +++ b/you-get.json @@ -31,6 +31,6 @@ ], "console_scripts": [ - "you-get = you_get.downloader.__main__:main" + "you-get = you_get.extractor.__main__:main" ] }