diff --git a/README.md b/README.md index abdf39e5..40a26803 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # You-Get -[![PyPI version](https://badge.fury.io/py/you-get.png)](http://badge.fury.io/py/you-get) -[![Build Status](https://api.travis-ci.org/soimort/you-get.png)](https://travis-ci.org/soimort/you-get) +[![PyPI version](https://img.shields.io/pypi/v/you-get.svg)](https://pypi.python.org/pypi/you-get/) +[![Build Status](https://travis-ci.org/soimort/you-get.svg)](https://travis-ci.org/soimort/you-get) [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/soimort/you-get?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [You-Get](https://you-get.org/) is a tiny command-line utility to download media contents (videos, audios, images) from the Web, in case there is no other handy way to do it. @@ -37,13 +37,13 @@ Interested? [Install it](#installation) now and [get started by examples](#getti Are you a Python programmer? Then check out [the source](https://github.com/soimort/you-get) and fork it! -![](http://i.imgur.com/GfthFAz.png) +![](https://i.imgur.com/GfthFAz.png) ## Installation ### Prerequisites -The following dependencies are required and must be installed separately, unless you are using a pre-built package on Windows: +The following dependencies are required and must be installed separately, unless you are using a pre-built package or chocolatey on Windows: * **[Python 3](https://www.python.org/downloads/)** * **[FFmpeg](https://www.ffmpeg.org/)** (strongly recommended) or [Libav](https://libav.org/) @@ -93,6 +93,24 @@ $ git clone git://github.com/soimort/you-get.git Then put the cloned directory into your `PATH`, or run `./setup.py install` to install `you-get` to a permanent path. +### Option 6: Using [Chocolatey](https://chocolatey.org/) (Windows only) + +``` +> choco install you-get +``` + +### Option 7: Homebrew (Mac only) + +You can install `you-get` easily via: + +``` +$ brew install you-get +``` + +### Shell completion + +Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](contrib/completion). Please consult your shell's manual for how to take advantage of them. + ## Upgrading Based on which option you chose to install `you-get`, you may upgrade it via: @@ -107,6 +125,18 @@ or download the latest release via: $ you-get https://github.com/soimort/you-get/archive/master.zip ``` +or use [chocolatey package manager](https://chocolatey.org): + +``` +> choco upgrade you-get +``` + +In order to get the latest ```develop``` branch without messing up the PIP, you can try: + +``` +$ pip3 install --upgrade git+https://github.com/soimort/you-get@develop +``` + ## Getting Started ### Download a video @@ -300,7 +330,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | :--: | :-- | :-----: | :-----: | :-----: | | **YouTube** | |✓| | | | **Twitter** | |✓|✓| | -| VK | |✓| | | +| VK | |✓|✓| | | Vine | |✓| | | | Vimeo | |✓| | | | Vidto | |✓| | | @@ -309,6 +339,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | **Tumblr** | |✓|✓|✓| | TED | |✓| | | | SoundCloud | | | |✓| +| SHOWROOM | |✓| | | | Pinterest | | |✓| | | MusicPlayOn | |✓| | | | MTV81 | |✓| | | @@ -342,8 +373,9 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 爆米花网 | |✓| | | | **bilibili
哔哩哔哩** | |✓| | |
 | Dilidili | |✓| | |
-| 豆瓣 | | | |✓|
+| 豆瓣 | |✓| |✓|
 | 斗鱼 | |✓| | |
+| Panda<br/>熊猫 | |✓| | |
 | 凤凰视频 | |✓| | |
 | 风行网 | |✓| | |
 | iQIYI<br/>爱奇艺 | |✓| | |
@@ -359,6 +391,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
 | PPTV聚力 | |✓| | |
 | 齐鲁网 | |✓| | |
 | QQ<br/>腾讯视频 | |✓| | |
+| 企鹅直播 | |✓| | |
 | 阡陌视频 | |✓| | |
 | THVideo | |✓| | |
 | Sina
新浪视频<br/>微博秒拍视频 |
|✓| | | @@ -372,6 +405,9 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 战旗TV | |✓| | | | 央视网 | |✓| | | | 花瓣 | | |✓| | +| Naver
네이버 | |✓| | | +| 芒果TV | |✓| | | +| 火猫TV | |✓| | | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. diff --git a/contrib/completion/_you-get b/contrib/completion/_you-get new file mode 100644 index 00000000..696aad89 --- /dev/null +++ b/contrib/completion/_you-get @@ -0,0 +1,29 @@ +#compdef you-get + +# Zsh completion definition for soimort/you-get. + +setopt localoptions noshwordsplit noksharrays +local -a args + +args=( + '(- : *)'{-V,--version}'[print version and exit]' + '(- : *)'{-h,--help}'[print help and exit]' + '(-i --info)'{-i,--info}'[print extracted information]' + '(-u --url)'{-u,--url}'[print extracted information with URLs]' + '(--json)--json[print extracted URLs in JSON format]' + '(-n --no-merge)'{-n,--no-merge}'[do not merge video parts]' + '(--no-caption)--no-caption[do not download captions]' + '(-f --force)'{-f,--force}'[force overwrite existing files]' + '(-F --format)'{-F,--format}'[set video format to the specified stream id]:stream id' + '(-O --output-filename)'{-O,--output-filename}'[set output filename]:filename:_files' + '(-o --output-dir)'{-o,--output-dir}'[set output directory]:directory:_files -/' + '(-p --player)'{-p,--player}'[stream extracted URL to the specified player]:player and options' + '(-c --cookies)'{-c,--cookies}'[load cookies.txt or cookies.sqlite]:cookies file:_files' + '(-x --http-proxy)'{-x,--http-proxy}'[use the specified HTTP proxy for downloading]:host\:port:' + '(-y --extractor-proxy)'{-y,--extractor-proxy}'[use the specified HTTP proxy for extraction only]:host\:port' + '(--no-proxy)--no-proxy[do not use a proxy]' + '(-t --timeout)'{-t,--timeout}'[set socket timeout]:seconds' + '(-d --debug)'{-d,--debug}'[show traceback and other debug info]' + '*: :_guard "^-*" url' +) +_arguments -S -s $args diff --git a/contrib/completion/you-get-completion.bash b/contrib/completion/you-get-completion.bash new file mode 100755 index 00000000..9c6480ec --- /dev/null +++ b/contrib/completion/you-get-completion.bash @@ -0,0 +1,31 @@ +# Bash completion definition for you-get. + +_you-get () { + COMPREPLY=() + local IFS=$' \n' + local cur=$2 prev=$3 + local -a opts_without_arg opts_with_arg + opts_without_arg=( + -V --version -h --help -i --info -u --url --json -n --no-merge + --no-caption -f --force --no-proxy -d --debug + ) + opts_with_arg=( + -F --format -O --output-filename -o --output-dir -p --player + -c --cookies -x --http-proxy -y --extractor-proxy -t --timeout + ) + + # Do not complete non option names + [[ $cur == -* ]] || return 1 + + # Do not complete when the previous arg is an option expecting an argument + for opt in "${opts_with_arg[@]}"; do + [[ $opt == $prev ]] && return 1 + done + + # Complete option names + COMPREPLY=( $(compgen -W "${opts_without_arg[*]} ${opts_with_arg[*]}" \ + -- "$cur") ) + return 0 +} + +complete -F _you-get you-get diff --git a/contrib/completion/you-get.fish b/contrib/completion/you-get.fish new file mode 100644 index 00000000..6917c422 --- /dev/null +++ b/contrib/completion/you-get.fish @@ -0,0 +1,23 @@ +# Fish completion definition for you-get. 
+ +complete -c you-get -s V -l version -d 'print version and exit' +complete -c you-get -s h -l help -d 'print help and exit' +complete -c you-get -s i -l info -d 'print extracted information' +complete -c you-get -s u -l url -d 'print extracted information' +complete -c you-get -l json -d 'print extracted URLs in JSON format' +complete -c you-get -s n -l no-merge -d 'do not merge video parts' +complete -c you-get -l no-caption -d 'do not download captions' +complete -c you-get -s f -l force -d 'force overwrite existing files' +complete -c you-get -s F -l format -x -d 'set video format to the specified stream id' +complete -c you-get -s O -l output-filename -d 'set output filename' \ + -x -a '(__fish_complete_path (commandline -ct) "output filename")' +complete -c you-get -s o -l output-dir -d 'set output directory' \ + -x -a '(__fish_complete_directories (commandline -ct) "output directory")' +complete -c you-get -s p -l player -x -d 'stream extracted URL to the specified player' +complete -c you-get -s c -l cookies -d 'load cookies.txt or cookies.sqlite' \ + -x -a '(__fish_complete_path (commandline -ct) "cookies.txt or cookies.sqlite")' +complete -c you-get -s x -l http-proxy -x -d 'use the specified HTTP proxy for downloading' +complete -c you-get -s y -l extractor-proxy -x -d 'use the specified HTTP proxy for extraction only' +complete -c you-get -l no-proxy -d 'do not use a proxy' +complete -c you-get -s t -l timeout -x -d 'set socket timeout' +complete -c you-get -s d -l debug -d 'show traceback and other debug info' diff --git a/src/you_get/common.py b/src/you_get/common.py index e20be32b..b6f2d399 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -8,7 +8,9 @@ SITES = { 'baidu' : 'baidu', 'bandcamp' : 'bandcamp', 'baomihua' : 'baomihua', + 'bigthink' : 'bigthink', 'bilibili' : 'bilibili', + 'cctv' : 'cntv', 'cntv' : 'cntv', 'cbs' : 'cbs', 'dailymotion' : 'dailymotion', @@ -25,7 +27,9 @@ SITES = { 'google' : 'google', 'heavy-music' : 'heavymusic', 'huaban' : 'huaban', + 'huomao' : 'huomaotv', 'iask' : 'sina', + 'icourses' : 'icourses', 'ifeng' : 'ifeng', 'imgur' : 'imgur', 'in' : 'alive', @@ -47,17 +51,21 @@ SITES = { 'lizhi' : 'lizhi', 'magisto' : 'magisto', 'metacafe' : 'metacafe', + 'mgtv' : 'mgtv', 'miomio' : 'miomio', 'mixcloud' : 'mixcloud', 'mtv81' : 'mtv81', 'musicplayon' : 'musicplayon', + 'naver' : 'naver', '7gogo' : 'nanagogo', 'nicovideo' : 'nicovideo', + 'panda' : 'panda', 'pinterest' : 'pinterest', 'pixnet' : 'pixnet', 'pptv' : 'pptv', 'qianmo' : 'qianmo', 'qq' : 'qq', + 'showroom-live' : 'showroom', 'sina' : 'sina', 'smgbb' : 'bilibili', 'sohu' : 'sohu', @@ -73,6 +81,7 @@ SITES = { 'videomega' : 'videomega', 'vidto' : 'vidto', 'vimeo' : 'vimeo', + 'wanmen' : 'wanmen', 'weibo' : 'miaopai', 'veoh' : 'veoh', 'vine' : 'vine', @@ -95,6 +104,7 @@ import logging import os import platform import re +import socket import sys import time from urllib import request, parse, error @@ -305,7 +315,53 @@ def get_content(url, headers={}, decoded=True): if cookies: cookies.add_cookie_header(req) req.headers.update(req.unredirected_hdrs) - response = request.urlopen(req) + + for i in range(10): + try: + response = request.urlopen(req) + break + except socket.timeout: + logging.debug('request attempt %s timeout' % str(i + 1)) + + data = response.read() + + # Handle HTTP compression for gzip and deflate (zlib) + content_encoding = response.getheader('Content-Encoding') + if content_encoding == 'gzip': + data = ungzip(data) + elif content_encoding == 'deflate': + data = 
undeflate(data) + + # Decode the response body + if decoded: + charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)') + if charset is not None: + data = data.decode(charset) + else: + data = data.decode('utf-8') + + return data + +def post_content(url, headers={}, post_data={}, decoded=True): + """Post the content of a URL via sending a HTTP POST request. + + Args: + url: A URL. + headers: Request headers used by the client. + decoded: Whether decode the response body using UTF-8 or the charset specified in Content-Type. + + Returns: + The content as a string. + """ + + logging.debug('post_content: %s \n post_data: %s' % (url, post_data)) + + req = request.Request(url, headers=headers) + if cookies: + cookies.add_cookie_header(req) + req.headers.update(req.unredirected_hdrs) + post_data_enc = bytes(parse.urlencode(post_data), 'utf-8') + response = request.urlopen(req, data = post_data_enc) data = response.read() # Handle HTTP compression for gzip and deflate (zlib) @@ -492,7 +548,11 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, h os.remove(filepath) # on Windows rename could fail if destination filepath exists os.rename(temp_filepath, filepath) -def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = False, headers = {}): +def url_save_chunked(url, filepath, bar, dyn_callback=None, chunk_size=0, ignore_range=False, refer=None, is_part=False, faker=False, headers={}): + def dyn_update_url(received): + if callable(dyn_callback): + logging.debug('Calling callback %s for new URL from %s' % (dyn_callback.__name__, received)) + return dyn_callback(received) if os.path.exists(filepath): if not force: if not is_part: @@ -530,19 +590,26 @@ def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = else: headers = {} if received: - headers['Range'] = 'bytes=' + str(received) + '-' + url = dyn_update_url(received) + if not ignore_range: + headers['Range'] = 'bytes=' + str(received) + '-' if refer: headers['Referer'] = refer - response = request.urlopen(request.Request(url, headers = headers), None) + response = request.urlopen(request.Request(url, headers=headers), None) with open(temp_filepath, open_mode) as output: + this_chunk = received while True: buffer = response.read(1024 * 256) if not buffer: break output.write(buffer) received += len(buffer) + if chunk_size and (received - this_chunk) >= chunk_size: + url = dyn_callback(received) + this_chunk = received + response = request.urlopen(request.Request(url, headers=headers), None) if bar: bar.update_received(len(buffer)) @@ -734,7 +801,7 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg if has_ffmpeg_installed(): from .processor.ffmpeg import ffmpeg_concat_av ret = ffmpeg_concat_av(parts, output_filepath, ext) - print('Done.') + print('Merged into %s' % output_filename) if ret == 0: for part in parts: os.remove(part) @@ -747,7 +814,7 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg else: from .processor.join_flv import concat_flv concat_flv(parts, output_filepath) - print('Done.') + print('Merged into %s' % output_filename) except: raise else: @@ -763,7 +830,7 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg else: from .processor.join_mp4 import concat_mp4 concat_mp4(parts, output_filepath) - print('Done.') + print('Merged into %s' % output_filename) except: raise else: @@ -779,7 +846,7 @@ def download_urls(urls, title, ext, total_size, 
output_dir='.', refer=None, merg else: from .processor.join_ts import concat_ts concat_ts(parts, output_filepath) - print('Done.') + print('Merged into %s' % output_filename) except: raise else: @@ -791,7 +858,7 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg print() -def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}): +def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}, **kwargs): assert urls if dry_run: print('Real URLs:\n%s\n' % urls) @@ -805,7 +872,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No filename = '%s.%s' % (title, ext) filepath = os.path.join(output_dir, filename) - if total_size and ext in ('ts'): + if total_size: if not force and os.path.exists(filepath[:-3] + '.mkv'): print('Skipping %s: file already exists' % filepath[:-3] + '.mkv') print() @@ -820,7 +887,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No print('Downloading %s ...' % tr(filename)) filepath = os.path.join(output_dir, filename) parts.append(filepath) - url_save_chunked(url, filepath, bar, refer = refer, faker = faker, headers = headers) + url_save_chunked(url, filepath, bar, refer = refer, faker = faker, headers = headers, **kwargs) bar.done() if not merge: @@ -887,6 +954,22 @@ def download_rtmp_url(url,title, ext,params={}, total_size=0, output_dir='.', re assert has_rtmpdump_installed(), "RTMPDump not installed." download_rtmpdump_stream(url, title, ext,params, output_dir) +def download_url_ffmpeg(url,title, ext,params={}, total_size=0, output_dir='.', refer=None, merge=True, faker=False): + assert url + if dry_run: + print('Real URL:\n%s\n' % [url]) + if params.get("-y",False): #None or unset ->False + print('Real Playpath:\n%s\n' % [params.get("-y")]) + return + + if player: + launch_player(player, [url]) + return + + from .processor.ffmpeg import has_ffmpeg_installed, ffmpeg_download_stream + assert has_ffmpeg_installed(), "FFmpeg not installed." + ffmpeg_download_stream(url, title, ext, params, output_dir) + def playlist_not_supported(name): def f(*args, **kwargs): raise NotImplementedError('Playlist is not supported for ' + name) @@ -1015,6 +1098,22 @@ def set_http_proxy(proxy): opener = request.build_opener(proxy_support) request.install_opener(opener) +def print_more_compatible(*args, **kwargs): + import builtins as __builtin__ + """Overload default print function as py (<3.3) does not support 'flush' keyword. + Although the function name can be same as print to get itself overloaded automatically, + I'd rather leave it with a different name and only overload it when importing to make less confusion. """ + # nothing happens on py3.3 and later + if sys.version_info[:2] >= (3, 3): + return __builtin__.print(*args, **kwargs) + + # in lower pyver (e.g. 3.2.x), remove 'flush' keyword and flush it as requested + doFlush = kwargs.pop('flush', False) + ret = __builtin__.print(*args, **kwargs) + if doFlush: + kwargs.get('file', sys.stdout).flush() + return ret + def download_main(download, download_playlist, urls, playlist, **kwargs): @@ -1060,11 +1159,13 @@ def script_main(script_name, download, download_playlist, **kwargs): -x | --http-proxy Use an HTTP proxy for downloading. -y | --extractor-proxy Use an HTTP proxy for extracting only. --no-proxy Never use a proxy. + -s | --socks-proxy Use an SOCKS5 proxy for downloading. 
+ -t | --timeout Set socket timeout. -d | --debug Show traceback and other debug info. ''' - short_opts = 'Vhfiuc:ndF:O:o:p:x:y:' - opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang='] + short_opts = 'Vhfiuc:ndF:O:o:p:x:y:s:t:' + opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'socks-proxy=', 'extractor-proxy=', 'lang=', 'timeout='] if download_playlist: short_opts = 'l' + short_opts opts = ['playlist'] + opts @@ -1092,8 +1193,10 @@ def script_main(script_name, download, download_playlist, **kwargs): lang = None output_dir = '.' proxy = None + socks_proxy = None extractor_proxy = None traceback = False + timeout = 600 for o, a in opts: if o in ('-V', '--version'): version() @@ -1163,10 +1266,14 @@ def script_main(script_name, download, download_playlist, **kwargs): caption = False elif o in ('-x', '--http-proxy'): proxy = a + elif o in ('-s', '--socks-proxy'): + socks_proxy = a elif o in ('-y', '--extractor-proxy'): extractor_proxy = a elif o in ('--lang',): lang = a + elif o in ('-t', '--timeout'): + timeout = int(a) else: log.e("try 'you-get --help' for more options") sys.exit(2) @@ -1174,7 +1281,26 @@ def script_main(script_name, download, download_playlist, **kwargs): print(help) sys.exit() - set_http_proxy(proxy) + if (socks_proxy): + try: + import socket + import socks + socks_proxy_addrs = socks_proxy.split(':') + socks.set_default_proxy(socks.SOCKS5, + socks_proxy_addrs[0], + int(socks_proxy_addrs[1])) + socket.socket = socks.socksocket + def getaddrinfo(*args): + return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))] + socket.getaddrinfo = getaddrinfo + except ImportError: + log.w('Error importing PySocks library, socks proxy ignored.' + 'In order to use use socks proxy, please install PySocks.') + else: + import socket + set_http_proxy(proxy) + + socket.setdefaulttimeout(timeout) try: if stream_id: diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py index 3cc78289..594b908e 100644 --- a/src/you_get/extractor.py +++ b/src/you_get/extractor.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from .common import match1, maybe_print, download_urls, get_filename, parse_host, set_proxy, unset_proxy +from .common import print_more_compatible as print from .util import log from . 
import json_output import os diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 5af9cdd3..61b6a0d1 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -5,7 +5,9 @@ from .alive import * from .archive import * from .baidu import * from .bandcamp import * +from .bigthink import * from .bilibili import * +from .bokecc import * from .cbs import * from .ckplayer import * from .cntv import * @@ -22,6 +24,7 @@ from .funshion import * from .google import * from .heavymusic import * from .huaban import * +from .icourses import * from .ifeng import * from .imgur import * from .infoq import * @@ -38,19 +41,24 @@ from .le import * from .lizhi import * from .magisto import * from .metacafe import * +from .mgtv import * from .miaopai import * from .miomio import * from .mixcloud import * from .mtv81 import * from .musicplayon import * from .nanagogo import * +from .naver import * from .netease import * from .nicovideo import * +from .panda import * from .pinterest import * from .pixnet import * from .pptv import * from .qianmo import * +from .qie import * from .qq import * +from .showroom import * from .sina import * from .sohu import * from .soundcloud import * @@ -67,6 +75,7 @@ from .vimeo import * from .vine import * from .vk import * from .w56 import * +from .wanmen import * from .xiami import * from .yinyuetai import * from .yixia import * @@ -74,3 +83,4 @@ from .youku import * from .youtube import * from .ted import * from .khan import * +from .zhanqi import * diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py index e78d3636..87e005fb 100644 --- a/src/you_get/extractors/acfun.py +++ b/src/you_get/extractors/acfun.py @@ -8,7 +8,7 @@ from .le import letvcloud_download_by_vu from .qq import qq_download_by_vid from .sina import sina_download_by_vid from .tudou import tudou_download_by_iid -from .youku import youku_download_by_vid +from .youku import youku_download_by_vid, youku_open_download_by_vid import json, re @@ -17,10 +17,24 @@ def get_srt_json(id): return get_html(url) def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False, **kwargs): + """str, str, str, bool, bool ->None + + Download Acfun video by vid. + + Call Acfun API, decide which site to use, and pass the job to its + extractor. 
+ """ + + #first call the main parasing API info = json.loads(get_html('http://www.acfun.tv/video/getVideo.aspx?id=' + vid)) + sourceType = info['sourceType'] + + #decide sourceId to know which extractor to use if 'sourceId' in info: sourceId = info['sourceId'] # danmakuId = info['danmakuId'] + + #call extractor decided by sourceId if sourceType == 'sina': sina_download_by_vid(sourceId, title, output_dir=output_dir, merge=merge, info_only=info_only) elif sourceType == 'youku': @@ -32,14 +46,13 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals elif sourceType == 'letv': letvcloud_download_by_vu(sourceId, '2d8c027396', title, output_dir=output_dir, merge=merge, info_only=info_only) elif sourceType == 'zhuzhan': - a = 'http://api.aixifan.com/plays/%s/realSource' % vid - s = json.loads(get_content(a, headers={'deviceType': '1'})) - urls = s['data']['files'][-1]['url'] - size = urls_size(urls) - print_info(site_info, title, 'mp4', size) - if not info_only: - download_urls(urls, title, 'mp4', size, - output_dir=output_dir, merge=merge) + #As in Jul.28.2016, Acfun is using embsig to anti hotlink so we need to pass this + embsig = info['encode'] + a = 'http://api.aixifan.com/plays/%s' % vid + s = json.loads(get_content(a, headers={'deviceType': '2'})) + if s['data']['source'] == "zhuzhan-youku": + sourceId = s['data']['sourceId'] + youku_open_download_by_vid(client_id='908a519d032263f8', vid=sourceId, title=title, output_dir=output_dir,merge=merge, info_only=info_only, embsig = embsig, **kwargs) else: raise NotImplementedError(sourceType) @@ -60,20 +73,19 @@ def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs): assert re.match(r'http://[^\.]+.acfun.[^\.]+/\D/\D\D(\d+)', url) html = get_html(url) - title = r1(r'

([^<>]+)<', html) + title = r1(r'data-title="([^"]+)"', html) title = unescape_html(title) title = escape_file_path(title) assert title - videos = re.findall("data-vid=\"(\d+)\".*href=\"[^\"]+\".*title=\"([^\"]+)\"", html) - for video in videos: - p_vid = video[0] - p_title = title + " - " + video[1] if video[1] != '删除标签' else title - acfun_download_by_vid(p_vid, p_title, - output_dir=output_dir, - merge=merge, - info_only=info_only, - **kwargs) + vid = r1('data-vid="(\d+)"', html) + up = r1('data-name="([^"]+)"', html) + title = title + ' - ' + up + acfun_download_by_vid(vid, title, + output_dir=output_dir, + merge=merge, + info_only=info_only, + **kwargs) site_info = "AcFun.tv" download = acfun_download diff --git a/src/you_get/extractors/baidu.py b/src/you_get/extractors/baidu.py old mode 100755 new mode 100644 index aa9caa0c..d5efaf0b --- a/src/you_get/extractors/baidu.py +++ b/src/you_get/extractors/baidu.py @@ -7,8 +7,10 @@ from ..common import * from .embed import * from .universal import * + def baidu_get_song_data(sid): - data = json.loads(get_html('http://music.baidu.com/data/music/fmlink?songIds=%s' % sid, faker = True))['data'] + data = json.loads(get_html( + 'http://music.baidu.com/data/music/fmlink?songIds=%s' % sid, faker=True))['data'] if data['xcode'] != '': # inside china mainland @@ -17,22 +19,28 @@ def baidu_get_song_data(sid): # outside china mainland return None + def baidu_get_song_url(data): return data['songLink'] + def baidu_get_song_artist(data): return data['artistName'] + def baidu_get_song_album(data): return data['albumName'] + def baidu_get_song_title(data): return data['songName'] + def baidu_get_song_lyric(data): lrc = data['lrcLink'] return None if lrc is '' else "http://music.baidu.com%s" % lrc + def baidu_download_song(sid, output_dir='.', merge=True, info_only=False): data = baidu_get_song_data(sid) if data is not None: @@ -51,7 +59,8 @@ def baidu_download_song(sid, output_dir='.', merge=True, info_only=False): type, ext, size = url_info(url, faker=True) print_info(site_info, title, type, size) if not info_only: - download_urls([url], file_name, ext, size, output_dir, merge=merge, faker=True) + download_urls([url], file_name, ext, size, + output_dir, merge=merge, faker=True) try: type, ext, size = url_info(lrc, faker=True) @@ -61,12 +70,14 @@ def baidu_download_song(sid, output_dir='.', merge=True, info_only=False): except: pass -def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False): - html = get_html('http://music.baidu.com/album/%s' % aid, faker = True) + +def baidu_download_album(aid, output_dir='.', merge=True, info_only=False): + html = get_html('http://music.baidu.com/album/%s' % aid, faker=True) album_name = r1(r'
<h2 class="album-name">
(.+?)<\/h2>', html) artist = r1(r'', html) output_dir = '%s/%s - %s' % (output_dir, artist, album_name) - ids = json.loads(r1(r'', html).replace('"', '').replace(';', '"'))['ids'] + ids = json.loads(r1(r'', + html).replace('"', '').replace(';', '"'))['ids'] track_nr = 1 for id in ids: song_data = baidu_get_song_data(id) @@ -75,38 +86,29 @@ def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False) song_lrc = baidu_get_song_lyric(song_data) file_name = '%02d.%s' % (track_nr, song_title) - type, ext, size = url_info(song_url, faker = True) + type, ext, size = url_info(song_url, faker=True) print_info(site_info, song_title, type, size) if not info_only: - download_urls([song_url], file_name, ext, size, output_dir, merge = merge, faker = True) + download_urls([song_url], file_name, ext, size, + output_dir, merge=merge, faker=True) if song_lrc: - type, ext, size = url_info(song_lrc, faker = True) + type, ext, size = url_info(song_lrc, faker=True) print_info(site_info, song_title, type, size) if not info_only: - download_urls([song_lrc], file_name, ext, size, output_dir, faker = True) + download_urls([song_lrc], file_name, ext, + size, output_dir, faker=True) track_nr += 1 -def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False, **kwargs): - if re.match(r'http://imgsrc.baidu.com', url): - universal_download(url, output_dir, merge=merge, info_only=info_only) - return - elif re.match(r'http://pan.baidu.com', url): - html = get_html(url) +def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=False, **kwargs): - title = r1(r'server_filename="([^"]+)"', html) - if len(title.split('.')) > 1: - title = ".".join(title.split('.')[:-1]) - - real_url = r1(r'\\"dlink\\":\\"([^"]*)\\"', html).replace('\\\\/', '/') - type, ext, size = url_info(real_url, faker = True) - - print_info(site_info, title, ext, size) + if re.match(r'http://pan.baidu.com', url): + real_url, title, ext, size = baidu_pan_download(url) if not info_only: - download_urls([real_url], title, ext, size, output_dir, merge = merge) - + download_urls([real_url], title, ext, size, + output_dir, url, merge=merge, faker=True) elif re.match(r'http://music.baidu.com/album/\d+', url): id = r1(r'http://music.baidu.com/album/(\d+)', url) baidu_download_album(id, output_dir, merge, info_only) @@ -124,17 +126,20 @@ def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info html = get_html(url) title = r1(r'title:"([^"]+)"', html) - items = re.findall(r'//imgsrc.baidu.com/forum/w[^"]+/([^/"]+)', html) + items = re.findall( + r'//imgsrc.baidu.com/forum/w[^"]+/([^/"]+)', html) urls = ['http://imgsrc.baidu.com/forum/pic/item/' + i for i in set(items)] # handle albums kw = r1(r'kw=([^&]+)', html) or r1(r"kw:'([^']+)'", html) tid = r1(r'tid=(\d+)', html) or r1(r"tid:'([^']+)'", html) - album_url = 'http://tieba.baidu.com/photo/g/bw/picture/list?kw=%s&tid=%s' % (kw, tid) + album_url = 'http://tieba.baidu.com/photo/g/bw/picture/list?kw=%s&tid=%s' % ( + kw, tid) album_info = json.loads(get_content(album_url)) for i in album_info['data']['pic_list']: - urls.append('http://imgsrc.baidu.com/forum/pic/item/' + i['pic_id'] + '.jpg') + urls.append( + 'http://imgsrc.baidu.com/forum/pic/item/' + i['pic_id'] + '.jpg') ext = 'jpg' size = float('Inf') @@ -144,6 +149,170 @@ def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info download_urls(urls, title, ext, size, output_dir=output_dir, merge=False) + +def baidu_pan_download(url): + 
errno_patt = r'errno":([^"]+),' + refer_url = "" + fake_headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Charset': 'UTF-8,*;q=0.5', + 'Accept-Encoding': 'gzip,deflate,sdch', + 'Accept-Language': 'en-US,en;q=0.8', + 'Host': 'pan.baidu.com', + 'Origin': 'http://pan.baidu.com', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2500.0 Safari/537.36', + 'Referer': refer_url + } + if cookies: + print('Use user specified cookies') + else: + print('Generating cookies...') + fake_headers['Cookie'] = baidu_pan_gen_cookies(url) + refer_url = "http://pan.baidu.com" + html = get_content(url, fake_headers, decoded=True) + isprotected = False + sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse( + html) + if sign == None: + if re.findall(r'\baccess-code\b', html): + isprotected = True + sign, timestamp, bdstoken, appid, primary_id, fs_id, uk, fake_headers, psk = baidu_pan_protected_share( + url) + # raise NotImplementedError("Password required!") + if isprotected != True: + raise AssertionError("Share not found or canceled: %s" % url) + if bdstoken == None: + bdstoken = "" + if isprotected != True: + sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse( + html) + request_url = "http://pan.baidu.com/api/sharedownload?sign=%s×tamp=%s&bdstoken=%s&channel=chunlei&clienttype=0&web=1&app_id=%s" % ( + sign, timestamp, bdstoken, appid) + refer_url = url + post_data = { + 'encrypt': 0, + 'product': 'share', + 'uk': uk, + 'primaryid': primary_id, + 'fid_list': '[' + fs_id + ']' + } + if isprotected == True: + post_data['sekey'] = psk + response_content = post_content(request_url, fake_headers, post_data, True) + errno = match1(response_content, errno_patt) + if errno != "0": + raise AssertionError( + "Server refused to provide download link! 
(Errno:%s)" % errno) + real_url = r1(r'dlink":"([^"]+)"', response_content).replace('\\/', '/') + title = r1(r'server_filename":"([^"]+)"', response_content) + assert real_url + type, ext, size = url_info(real_url, faker=True) + title_wrapped = json.loads('{"wrapper":"%s"}' % title) + title = title_wrapped['wrapper'] + logging.debug(real_url) + print_info(site_info, title, ext, size) + print('Hold on...') + time.sleep(5) + return real_url, title, ext, size + + +def baidu_pan_parse(html): + sign_patt = r'sign":"([^"]+)"' + timestamp_patt = r'timestamp":([^"]+),' + appid_patt = r'app_id":"([^"]+)"' + bdstoken_patt = r'bdstoken":"([^"]+)"' + fs_id_patt = r'fs_id":([^"]+),' + uk_patt = r'uk":([^"]+),' + errno_patt = r'errno":([^"]+),' + primary_id_patt = r'shareid":([^"]+),' + sign = match1(html, sign_patt) + timestamp = match1(html, timestamp_patt) + appid = match1(html, appid_patt) + bdstoken = match1(html, bdstoken_patt) + fs_id = match1(html, fs_id_patt) + uk = match1(html, uk_patt) + primary_id = match1(html, primary_id_patt) + return sign, timestamp, bdstoken, appid, primary_id, fs_id, uk + + +def baidu_pan_gen_cookies(url, post_data=None): + from http import cookiejar + cookiejar = cookiejar.CookieJar() + opener = request.build_opener(request.HTTPCookieProcessor(cookiejar)) + resp = opener.open('http://pan.baidu.com') + if post_data != None: + resp = opener.open(url, bytes(parse.urlencode(post_data), 'utf-8')) + return cookjar2hdr(cookiejar) + + +def baidu_pan_protected_share(url): + print('This share is protected by password!') + inpwd = input('Please provide unlock password: ') + inpwd = inpwd.replace(' ', '').replace('\t', '') + print('Please wait...') + post_pwd = { + 'pwd': inpwd, + 'vcode': None, + 'vstr': None + } + from http import cookiejar + import time + cookiejar = cookiejar.CookieJar() + opener = request.build_opener(request.HTTPCookieProcessor(cookiejar)) + resp = opener.open('http://pan.baidu.com') + resp = opener.open(url) + init_url = resp.geturl() + verify_url = 'http://pan.baidu.com/share/verify?%s&t=%s&channel=chunlei&clienttype=0&web=1' % ( + init_url.split('?', 1)[1], int(time.time())) + refer_url = init_url + fake_headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Charset': 'UTF-8,*;q=0.5', + 'Accept-Encoding': 'gzip,deflate,sdch', + 'Accept-Language': 'en-US,en;q=0.8', + 'Host': 'pan.baidu.com', + 'Origin': 'http://pan.baidu.com', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2500.0 Safari/537.36', + 'Referer': refer_url + } + opener.addheaders = dict2triplet(fake_headers) + pwd_resp = opener.open(verify_url, bytes( + parse.urlencode(post_pwd), 'utf-8')) + pwd_resp_str = ungzip(pwd_resp.read()).decode('utf-8') + pwd_res = json.loads(pwd_resp_str) + if pwd_res['errno'] != 0: + raise AssertionError( + 'Server returned an error: %s (Incorrect password?)' % pwd_res['errno']) + pg_resp = opener.open('http://pan.baidu.com/share/link?%s' % + init_url.split('?', 1)[1]) + content = ungzip(pg_resp.read()).decode('utf-8') + sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse( + content) + psk = query_cookiejar(cookiejar, 'BDCLND') + psk = parse.unquote(psk) + fake_headers['Cookie'] = cookjar2hdr(cookiejar) + return sign, timestamp, bdstoken, appid, primary_id, fs_id, uk, fake_headers, psk + + +def cookjar2hdr(cookiejar): + cookie_str = '' + for i in cookiejar: + cookie_str = cookie_str + i.name + '=' + i.value + ';' + return cookie_str[:-1] 
+ + +def query_cookiejar(cookiejar, name): + for i in cookiejar: + if i.name == name: + return i.value + + +def dict2triplet(dictin): + out_triplet = [] + for i in dictin: + out_triplet.append((i, dictin[i])) + return out_triplet + site_info = "Baidu.com" download = baidu_download download_playlist = playlist_not_supported("baidu") diff --git a/src/you_get/extractors/bandcamp.py b/src/you_get/extractors/bandcamp.py index de21a590..c24e4f3d 100644 --- a/src/you_get/extractors/bandcamp.py +++ b/src/you_get/extractors/bandcamp.py @@ -6,7 +6,7 @@ from ..common import * def bandcamp_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) - trackinfo = json.loads(r1(r'(\[{"video_poster_url".*}\]),', html)) + trackinfo = json.loads(r1(r'(\[{"(video_poster_url|video_caption)".*}\]),', html)) for track in trackinfo: track_num = track['track_num'] title = '%s. %s' % (track_num, track['title']) diff --git a/src/you_get/extractors/baomihua.py b/src/you_get/extractors/baomihua.py old mode 100755 new mode 100644 index f8be6fa9..4c4febb7 --- a/src/you_get/extractors/baomihua.py +++ b/src/you_get/extractors/baomihua.py @@ -7,7 +7,7 @@ from ..common import * import urllib def baomihua_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False, **kwargs): - html = get_html('http://play.baomihua.com/getvideourl.aspx?flvid=%s' % id) + html = get_html('http://play.baomihua.com/getvideourl.aspx?flvid=%s&devicetype=phone_app' % id) host = r1(r'host=([^&]*)', html) assert host type = r1(r'videofiletype=([^&]*)', html) diff --git a/src/you_get/extractors/bigthink.py b/src/you_get/extractors/bigthink.py new file mode 100644 index 00000000..1dd196d5 --- /dev/null +++ b/src/you_get/extractors/bigthink.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +from ..common import * +from ..extractor import VideoExtractor + +import json + +class Bigthink(VideoExtractor): + name = "Bigthink" + + stream_types = [ #this is just a sample. Will make it in prepare() + # {'id': '1080'}, + # {'id': '720'}, + # {'id': '360'}, + # {'id': '288'}, + # {'id': '190'}, + # {'id': '180'}, + + ] + + @staticmethod + def get_streams_by_id(account_number, video_id): + """ + int, int->list + + Get the height of the videos. + + Since brightcove is using 3 kinds of links: rtmp, http and https, + we will be using the HTTPS one to make it secure. + + If somehow akamaihd.net is blocked by the Great Fucking Wall, + change the "startswith https" to http. + """ + endpoint = 'https://edge.api.brightcove.com/playback/v1/accounts/{account_number}/videos/{video_id}'.format(account_number = account_number, video_id = video_id) + fake_header_id = fake_headers + #is this somehow related to the time? Magic.... 
+ fake_header_id['Accept'] ='application/json;pk=BCpkADawqM1cc6wmJQC2tvoXZt4mrB7bFfi6zGt9QnOzprPZcGLE9OMGJwspQwKfuFYuCjAAJ53JdjI8zGFx1ll4rxhYJ255AXH1BQ10rnm34weknpfG-sippyQ' + + html = get_content(endpoint, headers= fake_header_id) + html_json = json.loads(html) + + link_list = [] + + for i in html_json['sources']: + if 'src' in i: #to avoid KeyError + if i['src'].startswith('https'): + link_list.append((str(i['height']), i['src'])) + + return link_list + + def prepare(self, **kwargs): + + html = get_content(self.url) + + self.title = match1(html, r'(.*)', xml) for x, y in d: p = parse_srt_p(x) raise NotImplementedError() + def parse_cid_playurl(xml): from xml.dom.minidom import parseString try: @@ -59,14 +63,16 @@ def parse_cid_playurl(xml): except: return [] + def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only=False): urls = [] for cid in cids: - url = 'http://interface.bilibili.com/playurl?appkey=' + appkey + '&cid=' + cid + sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest() + url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this urls += [i - if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i) - else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i) - for i in parse_cid_playurl(get_content(url))] + if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i) + else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i) + for i in parse_cid_playurl(get_content(url))] type_ = '' size = 0 @@ -78,8 +84,10 @@ def bilibili_download_by_cids(cids, title, output_dir='.', merge=True, info_only if not info_only: download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge) + def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False): - url = 'http://interface.bilibili.com/playurl?appkey=' + appkey + '&cid=' + cid + sign_this = hashlib.md5(bytes('cid={cid}&from=miniplay&player=1{SECRETKEY_MINILOADER}'.format(cid = cid, SECRETKEY_MINILOADER = SECRETKEY_MINILOADER), 'utf-8')).hexdigest() + url = 'http://interface.bilibili.com/playurl?&cid=' + cid + '&from=miniplay&player=1' + '&sign=' + sign_this urls = [i if not re.match(r'.*\.qqvideo\.tc\.qq\.com', i) else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', i) @@ -87,17 +95,15 @@ def bilibili_download_by_cid(cid, title, output_dir='.', merge=True, info_only=F type_ = '' size = 0 - try: - for url in urls: - _, type_, temp = url_info(url) - size += temp or 0 - except error.URLError: - log.wtf('[Failed] DNS not resolved. Please change your DNS server settings.') + for url in urls: + _, type_, temp = url_info(url) + size += temp or 0 print_info(site_info, title, type_, size) if not info_only: download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge) + def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_only=False): api_url = 'http://live.bilibili.com/api/playurl?cid=' + cid urls = parse_cid_playurl(get_content(api_url)) @@ -109,59 +115,74 @@ def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_o if not info_only: download_urls([url], title, type_, total_size=None, output_dir=output_dir, merge=merge) + def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_content(url) - title = r1_of([r'', - r']*>([^<>]+)

'], html) + title = r1_of([r'', + r']*>\s*([^<>]+)\s*'], html) if title: title = unescape_html(title) title = escape_file_path(title) - flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"', r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) - assert flashvars - flashvars = flashvars.replace(': ','=') - t, cid = flashvars.split('=', 1) - cid = cid.split('&')[0] - if t == 'cid': - if re.match(r'https?://live\.bilibili\.com/', url): - title = r1(r'([^<>]+)', html) - bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) + if re.match(r'https?://bangumi\.bilibili\.com/', url): + # quick hack for bangumi URLs + episode_id = r1(r'data-current-episode-id="(\d+)"', html) + cont = post_content('http://bangumi.bilibili.com/web_api/get_source', + post_data={'episode_id': episode_id}) + cid = json.loads(cont)['result']['cid'] + bilibili_download_by_cid(str(cid), title, output_dir=output_dir, merge=merge, info_only=info_only) - else: - # multi-P - cids = [] - pages = re.findall('', html) - for page in pages: - html = get_html("http://www.bilibili.com%s" % page) - flashvars = r1_of([r'(cid=\d+)', - r'flashvars="([^"]+)"', - r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) - if flashvars: - t, cid = flashvars.split('=', 1) - cids.append(cid.split('&')[0]) - - # no multi-P - if not pages: - cids = [cid] - titles = [r1(r'', html) or title] - - for i in range(len(cids)): - bilibili_download_by_cid(cids[i], - titles[i], - output_dir=output_dir, - merge=merge, - info_only=info_only) - - elif t == 'vid': - sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) - elif t == 'ykid': - youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) - elif t == 'uid': - tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) else: - raise NotImplementedError(flashvars) + flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"', + r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) + assert flashvars + flashvars = flashvars.replace(': ', '=') + t, cid = flashvars.split('=', 1) + cid = cid.split('&')[0] + if t == 'cid': + if re.match(r'https?://live\.bilibili\.com/', url): + title = r1(r'\s*([^<>]+)\s*', html) + bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) + + else: + # multi-P + cids = [] + pages = re.findall('', html) + for i, page in enumerate(pages): + html = get_html("http://www.bilibili.com%s" % page) + flashvars = r1_of([r'(cid=\d+)', + r'flashvars="([^"]+)"', + r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) + if flashvars: + t, cid = flashvars.split('=', 1) + cids.append(cid.split('&')[0]) + if url.endswith(page): + cids = [cid.split('&')[0]] + titles = [titles[i]] + break + + # no multi-P + if not pages: + cids = [cid] + titles = [r1(r'', html) or title] + + for i in range(len(cids)): + bilibili_download_by_cid(cids[i], + titles[i], + output_dir=output_dir, + merge=merge, + info_only=info_only) + + elif t == 'vid': + sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + elif t == 'ykid': + youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + elif t == 'uid': + tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) + else: + raise NotImplementedError(flashvars) 
if not info_only and not dry_run: if not kwargs['caption']: @@ -173,6 +194,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs with open(os.path.join(output_dir, title + '.cmt.xml'), 'w', encoding='utf-8') as x: x.write(xml) + site_info = "bilibili.com" download = bilibili_download download_playlist = bilibili_download diff --git a/src/you_get/extractors/bokecc.py b/src/you_get/extractors/bokecc.py new file mode 100644 index 00000000..8566e828 --- /dev/null +++ b/src/you_get/extractors/bokecc.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +from ..common import * +from ..extractor import VideoExtractor +import xml.etree.ElementTree as ET + +class BokeCC(VideoExtractor): + name = "BokeCC" + + stream_types = [ # we do now know for now, as we have to check the + # output from the API + ] + + API_ENDPOINT = 'http://p.bokecc.com/' + + + def download_by_id(self, vid = '', title = None, output_dir='.', merge=True, info_only=False,**kwargs): + """self, str->None + + Keyword arguments: + self: self + vid: The video ID for BokeCC cloud, something like + FE3BB999594978049C33DC5901307461 + + Calls the prepare() to download the video. + + If no title is provided, this method shall try to find a proper title + with the information providin within the + returned content of the API.""" + + assert vid + + self.prepare(vid = vid, title = title, **kwargs) + + self.extract(**kwargs) + + self.download(output_dir = output_dir, + merge = merge, + info_only = info_only, **kwargs) + + def prepare(self, vid = '', title = None, **kwargs): + assert vid + + api_url = self.API_ENDPOINT + \ + 'servlet/playinfo?vid={vid}&m=0'.format(vid = vid) #return XML + + html = get_content(api_url) + self.tree = ET.ElementTree(ET.fromstring(html)) + + if self.tree.find('result').text != '1': + log.wtf('API result says failed!') + raise + + if title is None: + self.title = '_'.join([i.text for i in tree.iterfind('video/videomarks/videomark/markdesc')]) + else: + self.title = title + + for i in self.tree.iterfind('video/quality'): + quality = i.attrib ['value'] + url = i[0].attrib['playurl'] + self.stream_types.append({'id': quality, + 'video_profile': i.attrib ['desp']}) + self.streams[quality] = {'url': url, + 'video_profile': i.attrib ['desp']} + self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams] + + + def extract(self, **kwargs): + for i in self.streams: + s = self.streams[i] + _, s['container'], s['size'] = url_info(s['url']) + s['src'] = [s['url']] + if 'stream_id' in kwargs and kwargs['stream_id']: + # Extract the stream + stream_id = kwargs['stream_id'] + + if stream_id not in self.streams: + log.e('[Error] Invalid video format.') + log.e('Run \'-i\' command with no specific video format to view all available formats.') + exit(2) + else: + # Extract stream with the best quality + stream_id = self.streams_sorted[0]['id'] + _, s['container'], s['size'] = url_info(s['url']) + s['src'] = [s['url']] + +site = BokeCC() + +# I don't know how to call the player directly so I just put it here +# just in case anyone touchs it -- Beining@Aug.24.2016 +#download = site.download_by_url +#download_playlist = site.download_by_url + +bokecc_download_by_id = site.download_by_id diff --git a/src/you_get/extractors/cntv.py b/src/you_get/extractors/cntv.py index cfd96e59..e25fa961 100644 --- a/src/you_get/extractors/cntv.py +++ b/src/you_get/extractors/cntv.py @@ -7,6 +7,7 @@ from 
..common import * import json import re + def cntv_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False): assert id info = json.loads(get_html('http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid=' + id)) @@ -31,7 +32,11 @@ def cntv_download_by_id(id, title = None, output_dir = '.', merge = True, info_o def cntv_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): if re.match(r'http://tv\.cntv\.cn/video/(\w+)/(\w+)', url): id = match1(url, r'http://tv\.cntv\.cn/video/\w+/(\w+)') - elif re.match(r'http://\w+\.cntv\.cn/(\w+/\w+/(classpage/video/)?)?\d+/\d+\.shtml', url) or re.match(r'http://\w+.cntv.cn/(\w+/)*VIDE\d+.shtml', url): + elif re.match(r'http://\w+\.cntv\.cn/(\w+/\w+/(classpage/video/)?)?\d+/\d+\.shtml', url) or \ + re.match(r'http://\w+.cntv.cn/(\w+/)*VIDE\d+.shtml', url) or \ + re.match(r'http://(\w+).cntv.cn/(\w+)/classpage/video/(\d+)/(\d+).shtml', url) or \ + re.match(r'http://\w+.cctv.com/\d+/\d+/\d+/\w+.shtml', url) or \ + re.match(r'http://\w+.cntv.cn/\d+/\d+/\d+/\w+.shtml', url): id = r1(r'videoCenterId","(\w+)"', get_html(url)) elif re.match(r'http://xiyou.cntv.cn/v-[\w-]+\.html', url): id = r1(r'http://xiyou.cntv.cn/v-([\w-]+)\.html', url) diff --git a/src/you_get/extractors/dailymotion.py b/src/you_get/extractors/dailymotion.py index 8b701cd1..2e96c160 100644 --- a/src/you_get/extractors/dailymotion.py +++ b/src/you_get/extractors/dailymotion.py @@ -4,6 +4,11 @@ __all__ = ['dailymotion_download'] from ..common import * +def extract_m3u(url): + content = get_content(url) + m3u_url = re.findall(r'http://.*', content)[0] + return match1(m3u_url, r'([^#]+)') + def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): """Downloads Dailymotion videos by URL. """ @@ -13,7 +18,7 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, title = match1(html, r'"video_title"\s*:\s*"([^"]+)"') or \ match1(html, r'"title"\s*:\s*"([^"]+)"') - for quality in ['720','480','380','240','auto']: + for quality in ['1080','720','480','380','240','auto']: try: real_url = info[quality][0]["url"] if real_url: @@ -21,11 +26,12 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, except KeyError: pass - type, ext, size = url_info(real_url) + m3u_url = extract_m3u(real_url) + mime, ext, size = 'video/mp4', 'mp4', 0 - print_info(site_info, title, type, size) + print_info(site_info, title, mime, size) if not info_only: - download_urls([real_url], title, ext, size, output_dir, merge = merge) + download_url_ffmpeg(m3u_url, title, ext, output_dir=output_dir, merge=merge) site_info = "Dailymotion.com" download = dailymotion_download diff --git a/src/you_get/extractors/dilidili.py b/src/you_get/extractors/dilidili.py old mode 100755 new mode 100644 index 615f9861..082f84e1 --- a/src/you_get/extractors/dilidili.py +++ b/src/you_get/extractors/dilidili.py @@ -35,16 +35,16 @@ def dilidili_parser_data_to_stream_types(typ ,vid ,hd2 ,sign, tmsign, ulk): #---------------------------------------------------------------------- def dilidili_download(url, output_dir = '.', merge = False, info_only = False, **kwargs): - if re.match(r'http://www.dilidili.com/watch/\w+', url): + if re.match(r'http://www.dilidili.com/watch\S+', url): html = get_content(url) title = match1(html, r'(.+)丨(.+)') #title # player loaded via internal iframe - frame_url = re.search(r'