Merge pull request #11 from soimort/master

update
zhuhuotui 2013-08-28 05:38:34 -07:00
commit 6ec99038e0
37 changed files with 708 additions and 287 deletions

.gitignore vendored

@ -11,6 +11,7 @@ _*/
*.3gp
*.asf
*.flv
*.lrc
*.mkv
*.mp3
*.mp4

CHANGELOG.txt

@ -1,6 +1,93 @@
Changelog
=========
0.3.21
------
*Date: 2013-08-17*
* Fix issues for:
- YouTube
- YinYueTai
- pan.baidu.com
0.3.20
------
*Date: 2013-08-16*
* Add support for:
- eHow
- Khan Academy
- TED
- 5sing
* Fix issues for:
- Tudou
0.3.18
------
*Date: 2013-07-19*
* Fix issues for:
- Dailymotion
- Youku
- Sina
- AcFun
- bilibili
0.3.17
------
*Date: 2013-07-12*
* Fix issues for:
- YouTube
- 163
- bilibili
* Code cleanup.
0.3.16
------
*Date: 2013-06-28*
* Fix issues for:
- YouTube
- Sohu
- Google+ (enable HTTPS proxy)
0.3.15
------
*Date: 2013-06-21*
* Add support for:
- Instagram
0.3.14
------
*Date: 2013-06-14*
* Add support for:
- Alive.in.th
* Remove support of:
- JPopsuki
* Fix issues for:
- AcFun
- iQIYI
0.3.13
------
*Date: 2013-06-07*
* Add support for:
- Baidu Wangpan (video only)
* Fix issue for:
- Google+
0.3.12
------
@ -86,7 +173,7 @@ Changelog
* Add support for:
- Douban
- MioMio
* Fix issue for:
* Fix issues for:
- Tudou
- Vimeo

README.md

@ -17,15 +17,18 @@ Fork me on GitHub: <https://github.com/soimort/you-get>
* Coursera <https://www.coursera.org>
* Blip <http://blip.tv>
* Dailymotion <http://dailymotion.com>
* eHow <http://www.ehow.com>
* Facebook <http://facebook.com>
* Google+ <http://plus.google.com>
* Google Drive <http://docs.google.com>
* Khan Academy <http://www.khanacademy.org>
* TED <http://www.ted.com>
* Tumblr <http://www.tumblr.com>
* Vine <http://vine.co>
* Instagram <http://instagram.com>
* SoundCloud <http://soundcloud.com>
* Mixcloud <http://www.mixcloud.com>
* Freesound <http://www.freesound.org>
* JPopsuki <http://jpopsuki.tv>
* VID48 <http://vid48.com>
* Niconico (ニコニコ動画) <http://www.nicovideo.jp>
* Youku (优酷) <http://www.youku.com>
@ -47,8 +50,11 @@ Fork me on GitHub: <https://github.com/soimort/you-get>
* Sohu (搜狐视频) <http://tv.sohu.com>
* 56 (56网) <http://www.56.com>
* Xiami (虾米) <http://www.xiami.com>
* Baidu (百度音乐) <http://music.baidu.com>
* 5sing <http://www.5sing.com>
* Baidu Music (百度音乐) <http://music.baidu.com>
* Baidu Wangpan (百度网盘) <http://pan.baidu.com>
* SongTaste <http://www.songtaste.com>
* Alive.in.th <http://alive.in.th>
## Dependencies
@ -233,15 +239,18 @@ You-Get基于优酷下载脚本[iambus/youku-lixian](https://github.com/iambus/y
* Coursera <https://www.coursera.org>
* Blip <http://blip.tv>
* Dailymotion <http://dailymotion.com>
* eHow <http://www.ehow.com>
* Facebook <http://facebook.com>
* Google+ <http://plus.google.com>
* Google Drive <http://docs.google.com>
* Khan Academy <http://www.khanacademy.org>
* TED <http://www.ted.com>
* Tumblr <http://www.tumblr.com>
* Vine <http://vine.co>
* Instagram <http://instagram.com>
* SoundCloud <http://soundcloud.com>
* Mixcloud <http://www.mixcloud.com>
* Freesound <http://www.freesound.org>
* JPopsuki <http://jpopsuki.tv>
* VID48 <http://vid48.com>
* NICONICO动画 <http://www.nicovideo.jp>
* 优酷 <http://www.youku.com>
@ -263,8 +272,11 @@ You-Get基于优酷下载脚本[iambus/youku-lixian](https://github.com/iambus/y
* 搜狐视频 <http://tv.sohu.com>
* 56网 <http://www.56.com>
* 虾米 <http://www.xiami.com>
* 5sing <http://www.5sing.com>
* 百度音乐 <http://music.baidu.com>
* 百度网盘 <http://pan.baidu.com>
* SongTaste <http://www.songtaste.com>
* Alive.in.th <http://alive.in.th>
## 依赖

README.txt

@ -20,15 +20,18 @@ Supported Sites (As of Now)
* Coursera https://www.coursera.org
* Blip http://blip.tv
* Dailymotion http://dailymotion.com
* eHow http://www.ehow.com
* Facebook http://facebook.com
* Google+ http://plus.google.com
* Google Drive http://docs.google.com
* Khan Academy http://www.khanacademy.org
* TED http://www.ted.com
* Tumblr http://www.tumblr.com
* Vine http://vine.co
* Instagram http://instagram.com
* SoundCloud http://soundcloud.com
* Mixcloud http://www.mixcloud.com
* Freesound http://www.freesound.org
* JPopsuki http://jpopsuki.tv
* VID48 http://vid48.com
* Niconico (ニコニコ動画) http://www.nicovideo.jp
* Youku (优酷) http://www.youku.com
@ -50,8 +53,11 @@ Supported Sites (As of Now)
* Sohu (搜狐视频) http://tv.sohu.com
* 56 (56网) http://www.56.com
* Xiami (虾米) http://www.xiami.com
* Baidu (百度音乐) http://music.baidu.com
* 5sing http://www.5sing.com
* Baidu Music (百度音乐) http://music.baidu.com
* Baidu Wangpan (百度网盘) http://pan.baidu.com
* SongTaste http://www.songtaste.com
* Alive.in.th http://alive.in.th
Dependencies
------------

src/you_get/__init__.py

@ -1,9 +1,9 @@
#!/usr/bin/env python
from .processor import *
from .downloader import *
from .version import *
from .common import *
from .__main__ import *
from .version import *
# Easy import
#from .cli_wrapper.converter import *
#from .cli_wrapper.player import *
from .downloader import *

src/you_get/common.py

@ -7,6 +7,7 @@ import os
import re
import sys
from urllib import request, parse
import platform
from .version import __version__
@ -33,20 +34,63 @@ def tr(s):
except:
return str(s.encode('utf-8'))[2:-1]
# DEPRECATED in favor of match1()
def r1(pattern, text):
m = re.search(pattern, text)
if m:
return m.group(1)
# DEPRECATED in favor of match1()
def r1_of(patterns, text):
for p in patterns:
x = r1(p, text)
if x:
return x
def match1(text, *patterns):
"""Scans through a string for substrings matched some patterns (first-subgroups only).
Args:
text: A string to be scanned.
patterns: Arbitrary number of regex patterns.
Returns:
When only one pattern is given, returns a string (None if no match found).
When more than one pattern is given, returns a list of strings ([] if no match found).
"""
if len(patterns) == 1:
pattern = patterns[0]
match = re.search(pattern, text)
if match:
return match.group(1)
else:
return None
else:
ret = []
for pattern in patterns:
match = re.search(pattern, text)
if match:
ret.append(match.group(1))
return ret
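Note: a minimal usage sketch of match1, with illustrative inputs:
match1('http://youtu.be/abc123', r'youtu.be/([^/]+)')              # -> 'abc123'
match1('http://youtu.be/abc123', r'youtu.be/([^/]+)', r'(youtu)')  # -> ['abc123', 'youtu']
match1('no match here', r'(\d+)')                                  # -> None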
def parse_query_param(url, param):
"""Parses the query string of a URL and returns the value of a parameter.
Args:
url: A URL.
param: A string representing the name of the parameter.
Returns:
The value of the parameter.
"""
return parse.parse_qs(parse.urlparse(url).query)[param][0]
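Note: an illustrative call (hypothetical URL); the helper raises KeyError if the parameter is absent:
parse_query_param('http://www.youtube.com/watch?v=abc123&t=10s', 'v')  # -> 'abc123'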
def unicodize(text):
return re.sub(r'\\u([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])', lambda x: chr(int(x.group(0)[2:], 16)), text)
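Note: for example (illustrative input), unicodize decodes literal \uXXXX escapes left in scraped JavaScript:
unicodize(r'\u4f60\u597d')  # -> '你好'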
# DEPRECATED in favor of filenameable()
def escape_file_path(path):
path = path.replace('/', '-')
path = path.replace('\\', '-')
@ -54,23 +98,57 @@ def escape_file_path(path):
path = path.replace('?', '-')
return path
def filenameable(text):
"""Converts a string to a legal filename through various OSes.
"""
# All POSIX systems
text = text.translate({
0: None,
ord('/'): '-',
})
if platform.system() == 'Darwin': # For Mac OS
text = text.translate({
ord(':'): '-',
})
elif platform.system() == 'Windows': # For Windows
text = text.translate({
ord(':'): '-',
ord('*'): '-',
ord('?'): '-',
ord('\\'): '-',
ord('\"'): '\'',
ord('<'): '-',
ord('>'): '-',
ord('|'): '-',
ord('+'): '-',
ord('['): '(',
ord(']'): ')',
})
return text
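Note: a quick sketch of the per-OS behavior (illustrative title):
filenameable('AC/DC: Live')  # -> 'AC-DC: Live' on Linux, 'AC-DC- Live' on Mac OS and Windows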
def unescape_html(html):
from html import parser
html = parser.HTMLParser().unescape(html)
html = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), html)
return html
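Note: for example (illustrative input):
unescape_html('&lt;b&gt;hi&lt;/b&gt; &#20320;&#22909;')  # -> '<b>hi</b> 你好'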
def ungzip(s):
def ungzip(data):
"""Decompresses data for Content-Encoding: gzip.
"""
from io import BytesIO
import gzip
buffer = BytesIO(s)
f = gzip.GzipFile(fileobj = buffer)
buffer = BytesIO(data)
f = gzip.GzipFile(fileobj=buffer)
return f.read()
def undeflate(s):
def undeflate(data):
"""Decompresses data for Content-Encoding: deflate.
(The zlib compression is used.)
"""
import zlib
return zlib.decompress(s, -zlib.MAX_WBITS)
return zlib.decompress(data, -zlib.MAX_WBITS)
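Note: a standard-library round-trip sketch of both helpers:
import gzip, zlib
assert ungzip(gzip.compress(b'hello')) == b'hello'
co = zlib.compressobj(9, zlib.DEFLATED, -zlib.MAX_WBITS)  # raw deflate, as used for HTTP
assert undeflate(co.compress(b'hello') + co.flush()) == b'hello'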
# DEPRECATED in favor of get_content()
def get_response(url, faker = False):
if faker:
response = request.urlopen(request.Request(url, headers = fake_headers), None)
@ -85,10 +163,12 @@ def get_response(url, faker = False):
response.data = data
return response
# DEPRECATED in favor of get_content()
def get_html(url, encoding = None, faker = False):
content = get_response(url, faker).data
return str(content, 'utf-8', 'ignore')
# DEPRECATED in favor of get_content()
def get_decoded_html(url, faker = False):
response = get_response(url, faker)
data = response.data
@ -98,6 +178,38 @@ def get_decoded_html(url, faker = False):
else:
return data
def get_content(url, headers={}, decoded=True):
"""Gets the content of a URL via sending a HTTP GET request.
Args:
url: A URL.
headers: Request headers used by the client.
decoded: Whether to decode the response body using UTF-8 or the charset specified in Content-Type.
Returns:
The content as a string.
"""
response = request.urlopen(request.Request(url, headers=headers))
data = response.read()
# Handle HTTP compression for gzip and deflate (zlib)
content_encoding = response.getheader('Content-Encoding')
if content_encoding == 'gzip':
data = ungzip(data)
elif content_encoding == 'deflate':
data = undeflate(data)
# Decode the response body
if decoded:
charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)')
if charset is not None:
data = data.decode(charset)
else:
data = data.decode('utf-8')
return data
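Note: a minimal usage sketch (hypothetical URL and header):
html = get_content('http://example.com/video/123', headers={'User-Agent': 'Mozilla/5.0'})
raw = get_content('http://example.com/video/123', decoded=False)  # undecoded bytes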
def url_size(url, faker = False):
if faker:
response = request.urlopen(request.Request(url, headers = fake_headers), None)
@ -136,7 +248,7 @@ def url_info(url, faker = False):
type = None
if headers['content-disposition']:
try:
filename = parse.unquote(r1(r'filename="?(.+)"?', headers['content-disposition']))
filename = parse.unquote(r1(r'filename="?([^"]+)"?', headers['content-disposition']))
if len(filename.split('.')) > 1:
ext = filename.split('.')[-1]
else:
@ -388,7 +500,9 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None,
import sys
traceback.print_exc(file = sys.stdout)
pass
title = escape_file_path(title)
title = filenameable(title)
filename = '%s.%s' % (title, ext)
filepath = os.path.join(output_dir, filename)
if total_size:
@ -437,19 +551,18 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None,
elif ext == 'mp4':
try:
from .processor.join_mp4 import concat_mp4
concat_mp4(parts, os.path.join(output_dir, title + '.mp4'))
for part in parts:
os.remove(part)
except:
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_mp4_to_mp4
ffmpeg_concat_mp4_to_mp4(parts, os.path.join(output_dir, title + '.mp4'))
for part in parts:
os.remove(part)
else:
print('No ffmpeg is found. Merging aborted.')
from .processor.join_mp4 import concat_mp4
concat_mp4(parts, os.path.join(output_dir, title + '.mp4'))
except:
raise
else:
for part in parts:
os.remove(part)
else:
print("Can't merge %s files" % ext)
@ -463,7 +576,9 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir = '.', refer
return
assert ext in ('ts')
title = escape_file_path(title)
title = filenameable(title)
filename = '%s.%s' % (title, 'ts')
filepath = os.path.join(output_dir, filename)
if total_size:
@ -597,9 +712,7 @@ def set_http_proxy(proxy):
elif proxy == '': # Don't use any proxy
proxy_support = request.ProxyHandler({})
else: # Use proxy
if not proxy.startswith('http://'):
proxy = 'http://' + proxy
proxy_support = request.ProxyHandler({'http': '%s' % proxy})
proxy_support = request.ProxyHandler({'http': '%s' % proxy, 'https': '%s' % proxy})
opener = request.build_opener(proxy_support)
request.install_opener(opener)
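Note: usage sketch (hypothetical proxy address); after this change the same proxy covers both HTTP and HTTPS requests:
set_http_proxy('127.0.0.1:8087')  # route urllib traffic through a local proxy
set_http_proxy('')                # force a direct connection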
@ -615,8 +728,18 @@ def download_main(download, download_playlist, urls, playlist, output_dir, merge
else:
download(url, output_dir = output_dir, merge = merge, info_only = info_only)
def get_version():
try:
import subprocess
real_dir = os.path.dirname(os.path.realpath(__file__))
git_hash = subprocess.Popen(['git', 'rev-parse', '--short', 'HEAD'], cwd=real_dir, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).stdout.read().decode('utf-8').strip()
assert git_hash
return '%s-%s' % (__version__, git_hash)
except:
return __version__
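Note: with this helper, a git checkout reports the release version plus the short commit hash:
print(get_version())  # e.g. '0.3.21-6ec9903' from a checkout, '0.3.21' otherwise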
def script_main(script_name, download, download_playlist = None):
version = 'You-Get %s, a video downloader.' % __version__
version = 'You-Get %s, a video downloader.' % get_version()
help = 'Usage: %s [OPTION]... [URL]...\n' % script_name
help += '''\nStartup options:
-V | --version Display the version and exit.

src/you_get/downloader/__init__.py

@ -1,6 +1,7 @@
#!/usr/bin/env python
from .acfun import *
from .alive import *
from .baidu import *
from .bilibili import *
from .blip import *
@ -8,13 +9,15 @@ from .cntv import *
from .coursera import *
from .dailymotion import *
from .douban import *
from .ehow import *
from .facebook import *
from .fivesing import *
from .freesound import *
from .google import *
from .ifeng import *
from .instagram import *
from .iqiyi import *
from .joy import *
from .jpopsuki import *
from .ku6 import *
from .miomio import *
from .mixcloud import *
@ -36,3 +39,7 @@ from .xiami import *
from .yinyuetai import *
from .youku import *
from .youtube import *
from .ted import *
from .khan import *
from .__main__ import *

src/you_get/downloader/__main__.py

@ -1,9 +1,8 @@
#!/usr/bin/env python
__all__ = ['main', 'any_download', 'any_download_playlist']
from .downloader import *
from .common import *
from ..downloader import *
from ..common import *
def url_to_module(url):
site = r1(r'http://([^/]+)/', url)
@ -20,6 +19,7 @@ def url_to_module(url):
downloads = {
'163': netease,
'56': w56,
'5sing': fivesing,
'acfun': acfun,
'baidu': baidu,
'bilibili': bilibili,
@ -28,14 +28,16 @@ def url_to_module(url):
'coursera': coursera,
'dailymotion': dailymotion,
'douban': douban,
'ehow': ehow,
'facebook': facebook,
'freesound': freesound,
'google': google,
'iask': sina,
'ifeng': ifeng,
'in': alive,
'instagram': instagram,
'iqiyi': iqiyi,
'joy': joy,
'jpopsuki': jpopsuki,
'kankanews': bilibili,
'ku6': ku6,
'miomio': miomio,
@ -48,6 +50,7 @@ def url_to_module(url):
'sohu': sohu,
'songtaste':songtaste,
'soundcloud': soundcloud,
'ted': ted,
'tudou': tudou,
'tumblr': tumblr,
'vid48': vid48,
@ -58,6 +61,7 @@ def url_to_module(url):
'youku': youku,
'youtu': youtube,
'youtube': youtube,
'khanacademy': khan,
#TODO
}
if k in downloads:

src/you_get/downloader/acfun.py

@ -5,7 +5,7 @@ __all__ = ['acfun_download']
from ..common import *
from .qq import qq_download_by_id
from .sina import sina_download_by_id
from .sina import sina_download_by_vid
from .tudou import tudou_download_by_iid
from .youku import youku_download_by_id
@ -16,11 +16,11 @@ def get_srt_json(id):
return get_html(url)
def acfun_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
info = json.loads(get_html('http://www.acfun.tv/api/getVideoByID.aspx?vid=' + id))
info = json.loads(get_html('http://wenzhou.acfun.tv/api/getVideoByID.aspx?vid=' + id))
t = info['vtype']
vid = info['vid']
if t == 'sina':
sina_download_by_id(vid, title, output_dir = output_dir, merge = merge, info_only = info_only)
sina_download_by_vid(vid, title, output_dir = output_dir, merge = merge, info_only = info_only)
elif t == 'youku':
youku_download_by_id(vid, title, output_dir = output_dir, merge = merge, info_only = info_only)
elif t == 'tudou':
@ -37,7 +37,7 @@ def acfun_download_by_id(id, title = None, output_dir = '.', merge = True, info_
x.write(cmt)
def acfun_download(url, output_dir = '.', merge = True, info_only = False):
assert re.match(r'http://www.acfun.tv/v/ac(\d+)', url)
assert re.match(r'http://[^\.]+.acfun.tv/v/ac(\d+)', url)
html = get_html(url)
title = r1(r'<h1 id="title-article" class="title"[^<>]*>([^<>]+)<', html)
@ -49,7 +49,7 @@ def acfun_download(url, output_dir = '.', merge = True, info_only = False):
id = r1(r"\[Video\](\d+)\[/Video\]", html) or r1(r"\[video\](\d+)\[/video\]", html)
if not id:
id = r1(r"src=\"/newflvplayer/player.*id=(\d+)", html)
sina_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only)
sina_download_by_vid(id, title, output_dir = output_dir, merge = merge, info_only = info_only)
else:
acfun_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only)

src/you_get/downloader/alive.py

@ -0,0 +1,21 @@
#!/usr/bin/env python
__all__ = ['alive_download']
from ..common import *
def alive_download(url, output_dir = '.', merge = True, info_only = False):
html = get_html(url)
title = r1(r'<meta property="og:title" content="([^"]+)"', html)
url = r1(r'file: "(http://alive[^"]+)"', html)
type, ext, size = url_info(url)
print_info(site_info, title, type, size)
if not info_only:
download_urls([url], title, ext, size, output_dir, merge = merge)
site_info = "Alive.in.th"
download = alive_download
download_playlist = playlist_not_supported('alive')

src/you_get/downloader/baidu.py

@ -68,12 +68,25 @@ def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False)
track_nr += 1
def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False):
if re.match(r'http://music.baidu.com/album/\d+', url):
if re.match(r'http://pan.baidu.com', url):
html = get_html(url)
title = r1(r'server_filename="([^"]+)"', html)
if len(title.split('.')) > 1:
title = ".".join(title.split('.')[:-1])
real_url = r1(r'\\"dlink\\":\\"([^"]*)\\"', html).replace('\\\\/', '/')
type, ext, size = url_info(real_url, faker = True)
print_info(site_info, title, ext, size)
if not info_only:
download_urls([real_url], title, ext, size, output_dir, merge = merge)
elif re.match(r'http://music.baidu.com/album/\d+', url):
id = r1(r'http://music.baidu.com/album/(\d+)', url)
baidu_download_album(id, output_dir, merge, info_only)
if re.match('http://music.baidu.com/song/\d+', url):
elif re.match('http://music.baidu.com/song/\d+', url):
id = r1(r'http://music.baidu.com/song/(\d+)', url)
baidu_download_song(id, output_dir, merge, info_only)

src/you_get/downloader/bilibili.py

@ -4,7 +4,7 @@ __all__ = ['bilibili_download']
from ..common import *
from .sina import sina_download_by_id
from .sina import sina_download_by_vid
from .tudou import tudou_download_by_id
from .youku import youku_download_by_id
@ -64,7 +64,7 @@ def bilibili_download_by_cid(id, title, output_dir = '.', merge = True, info_onl
elif re.search(r'/mp4/', urls[0]):
type = 'mp4'
else:
raise NotImplementedError(urls[0])
type = 'flv'
size = 0
for url in urls:
@ -83,7 +83,7 @@ def bilibili_download(url, output_dir = '.', merge = True, info_only = False):
title = unescape_html(title)
title = escape_file_path(title)
flashvars = r1_of([r'flashvars="([^"]+)"', r'"https://secure.bilibili.tv/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
flashvars = r1_of([r'player_params=\'(cid=\d+)', r'flashvars="([^"]+)"', r'"https://secure.bilibili.tv/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
assert flashvars
t, id = flashvars.split('=', 1)
id = id.split('&')[0]

src/you_get/downloader/dailymotion.py

@ -5,16 +5,22 @@ __all__ = ['dailymotion_download']
from ..common import *
def dailymotion_download(url, output_dir = '.', merge = True, info_only = False):
html = get_html(url)
html = parse.unquote(html).replace('\/', '/')
"""Downloads Dailymotion videos by URL.
"""
title = r1(r'meta property="og:title" content="([^"]+)"', html)
title = escape_file_path(title)
id = match1(url, r'/video/([^\?]+)')
embed_url = 'http://www.dailymotion.com/embed/video/%s' % id
html = get_content(embed_url)
for quality in ['hd720URL', 'hqURL', 'sdURL']:
real_url = r1(r',\"' + quality + '\"\:\"([^\"]+?)\",', html)
info = json.loads(match1(html, r'var\s*info\s*=\s*({.+}),\n'))
title = info['title']
for quality in ['stream_h264_hd1080_url', 'stream_h264_hd_url', 'stream_h264_hq_url', 'stream_h264_url', 'stream_h264_ld_url']:
real_url = info[quality]
if real_url:
break
type, ext, size = url_info(real_url)
print_info(site_info, title, type, size)

src/you_get/downloader/ehow.py

@ -0,0 +1,38 @@
#!/usr/bin/env python
__all__ = ['ehow_download']
from ..common import *
def ehow_download(url, output_dir = '.', merge = True, info_only = False):
assert re.search(r'http://www.ehow.com/video_', url), "URL you entered is not supported"
html = get_html(url)
contentid = r1(r'<meta name="contentid" scheme="DMINSTR2" content="([^"]+)" />', html)
vid = r1(r'"demand_ehow_videoid":"([^"]+)"', html)
assert vid
xml = get_html('http://www.ehow.com/services/video/series.xml?demand_ehow_videoid=%s' % vid)
from xml.dom.minidom import parseString
doc = parseString(xml)
tab = doc.getElementsByTagName('related')[0].firstChild
for video in tab.childNodes:
if re.search(contentid, video.attributes['link'].value):
url = video.attributes['flv'].value
break
title = video.attributes['title'].value
assert title
type, ext, size = url_info(url)
print_info(site_info, title, type, size)
if not info_only:
download_urls([url], title, ext, size, output_dir, merge = merge)
site_info = "ehow.com"
download = ehow_download
download_playlist = playlist_not_supported('ehow')

src/you_get/downloader/fivesing.py

@ -0,0 +1,18 @@
#!/usr/bin/env python
__all__ = ['fivesing_download']
from ..common import *
def fivesing_download(url, output_dir=".", merge=True, info_only=False):
html = get_html(url)
title = r1(r'var SongName = "(.*)";', html)
url = r1(r'file: "(\S*)"', html)
songtype, ext, size = url_info(url)
print_info(site_info, title, songtype, size)
if not info_only:
download_urls([url], title, ext, size, output_dir)
site_info = "5sing.com"
download = fivesing_download
download_playlist = playlist_not_supported("5sing")

src/you_get/downloader/google.py

@ -6,6 +6,40 @@ from ..common import *
import re
# YouTube media encoding options, in descending quality order.
# taken from http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs, 3/22/2013.
youtube_codecs = [
{'itag': 38, 'container': 'MP4', 'video_resolution': '3072p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3.5-5', 'audio_encoding': 'AAC', 'audio_bitrate': '192'},
{'itag': 46, 'container': 'WebM', 'video_resolution': '1080p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'},
{'itag': 37, 'container': 'MP4', 'video_resolution': '1080p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3-4.3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'},
{'itag': 102, 'container': '', 'video_resolution': '', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '2', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'},
{'itag': 45, 'container': 'WebM', 'video_resolution': '720p', 'video_encoding': '', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': '', 'audio_bitrate': ''},
{'itag': 22, 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '192'},
{'itag': 84, 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '152'},
{'itag': 120, 'container': 'FLV', 'video_resolution': '720p', 'video_encoding': 'AVC', 'video_profile': 'Main@L3.1', 'video_bitrate': '2', 'audio_encoding': 'AAC', 'audio_bitrate': '128'},
{'itag': 85, 'container': 'MP4', 'video_resolution': '520p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '152'},
{'itag': 44, 'container': 'WebM', 'video_resolution': '480p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '1', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'},
{'itag': 35, 'container': 'FLV', 'video_resolution': '480p', 'video_encoding': 'H.264', 'video_profile': 'Main', 'video_bitrate': '0.8-1', 'audio_encoding': 'AAC', 'audio_bitrate': '128'},
{'itag': 101, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'},
{'itag': 100, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'},
{'itag': 43, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '0.5', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'},
{'itag': 34, 'container': 'FLV', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': 'Main', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '128'},
{'itag': 82, 'container': 'MP4', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'},
{'itag': 18, 'container': 'MP4', 'video_resolution': '270p/360p', 'video_encoding': 'H.264', 'video_profile': 'Baseline', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'},
{'itag': 6, 'container': 'FLV', 'video_resolution': '270p', 'video_encoding': 'Sorenson H.263', 'video_profile': '', 'video_bitrate': '0.8', 'audio_encoding': 'MP3', 'audio_bitrate': '64'},
{'itag': 83, 'container': 'MP4', 'video_resolution': '240p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'},
{'itag': 13, 'container': '3GP', 'video_resolution': '', 'video_encoding': 'MPEG-4 Visual', 'video_profile': '', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': ''},
{'itag': 5, 'container': 'FLV', 'video_resolution': '240p', 'video_encoding': 'Sorenson H.263', 'video_profile': '', 'video_bitrate': '0.25', 'audio_encoding': 'MP3', 'audio_bitrate': '64'},
{'itag': 36, 'container': '3GP', 'video_resolution': '240p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.17', 'audio_encoding': 'AAC', 'audio_bitrate': '38'},
{'itag': 17, 'container': '3GP', 'video_resolution': '144p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.05', 'audio_encoding': 'AAC', 'audio_bitrate': '24'},
]
fmt_level = dict(
zip(
[str(codec['itag'])
for codec in
youtube_codecs],
range(len(youtube_codecs))))
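Note: fmt_level ranks itags by quality, mirroring the table above (0 is best):
fmt_level['38']  # -> 0, the highest-quality MP4 stream
fmt_level['17']  # -> 22, the lowest-quality 3GP stream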
def google_download(url, output_dir = '.', merge = True, info_only = False):
# Percent-encoding Unicode URL
url = parse.quote(url, safe = ':/+%')
@ -14,25 +48,22 @@ def google_download(url, output_dir = '.', merge = True, info_only = False):
if service == 'plus': # Google Plus
if re.search(r'plus.google.com/photos/\d+/albums/\d+/\d+', url):
oid = r1(r'plus.google.com/photos/(\d+)/albums/\d+/\d+', url)
pid = r1(r'plus.google.com/photos/\d+/albums/\d+/(\d+)', url)
elif re.search(r'plus.google.com/photos/\d+/albums/posts/\d+', url):
oid = r1(r'plus.google.com/photos/(\d+)/albums/posts/\d+', url)
pid = r1(r'plus.google.com/photos/\d+/albums/posts/(\d+)', url)
else:
if not re.search(r'plus.google.com/photos/[^/]*/albums/\d+/\d+', url):
html = get_html(url)
oid = r1(r'"https://plus.google.com/photos/(\d+)/albums/\d+/\d+', html)
pid = r1(r'"https://plus.google.com/photos/\d+/albums/\d+/(\d+)', html)
url = "http://plus.google.com/photos/%s/albums/posts/%s?oid=%s&pid=%s" % (oid, pid, oid, pid)
url = r1(r'"(https://plus.google.com/photos/\d+/albums/\d+/\d+)', html)
title = r1(r'<title>([^<\n]+)', html)
else:
title = None
html = get_html(url)
real_url = unicodize(r1(r'"(https://video.googleusercontent.com/[^"]*)",\d\]', html).replace('\/', '/'))
real_urls = re.findall(r'\[(\d+),\d+,\d+,"([^"]+)"\]', html)
real_url = unicodize(sorted(real_urls, key = lambda x : fmt_level[x[0]])[0][1])
if title is None:
post_url = r1(r'"(https://plus.google.com/\d+/posts/[^"]*)"', html)
post_html = get_html(post_url)
title = r1(r'<title>([^<\n]+)', post_html)
title = r1(r"\"([^\"]+)\",\"%s\"" % pid, html)
if title is None:
response = request.urlopen(request.Request(real_url))
if response.headers['content-disposition']:

src/you_get/downloader/instagram.py

@ -0,0 +1,22 @@
#!/usr/bin/env python
__all__ = ['instagram_download']
from ..common import *
def instagram_download(url, output_dir = '.', merge = True, info_only = False):
html = get_html(url)
id = r1(r'instagram.com/p/([^/]+)/', html)
description = r1(r'<meta property="og:description" content="([^"]*)"', html)
title = description + " [" + id + "]"
url = r1(r'<meta property="og:video" content="([^"]*)"', html)
type, ext, size = url_info(url)
print_info(site_info, title, type, size)
if not info_only:
download_urls([url], title, ext, size, output_dir, merge = merge)
site_info = "Instagram.com"
download = instagram_download
download_playlist = playlist_not_supported('instagram')

src/you_get/downloader/iqiyi.py

@ -6,13 +6,8 @@ from ..common import *
def iqiyi_download(url, output_dir = '.', merge = True, info_only = False):
html = get_html(url)
#title = r1(r'title\s*:\s*"([^"]+)"', html)
#title = unescape_html(title).decode('utf-8')
#videoId = r1(r'videoId\s*:\s*"([^"]+)"', html)
#pid = r1(r'pid\s*:\s*"([^"]+)"', html)
#ptype = r1(r'ptype\s*:\s*"([^"]+)"', html)
#info_url = 'http://cache.video.qiyi.com/v/%s/%s/%s/' % (videoId, pid, ptype)
videoId = r1(r'''["']videoId["'][:=]["']([^"']+)["']''', html)
videoId = r1(r'data-player-videoid="([^"]+)"', html)
assert videoId
info_url = 'http://cache.video.qiyi.com/v/%s' % videoId

src/you_get/downloader/jpopsuki.py

@ -1,23 +0,0 @@
#!/usr/bin/env python
__all__ = ['jpopsuki_download']
from ..common import *
def jpopsuki_download(url, output_dir = '.', merge = True, info_only = False):
html = get_html(url)
title = r1(r'<meta name="title" content="([^"]*)"', html)
if title.endswith(' - JPopsuki TV'):
title = title[:-14]
url = "http://jpopsuki.tv%s" % r1(r'<source src="([^"]*)"', html)
type, ext, size = url_info(url)
print_info(site_info, title, type, size)
if not info_only:
download_urls([url], title, ext, size, output_dir, merge = merge)
site_info = "JPopsuki.tv"
download = jpopsuki_download
download_playlist = playlist_not_supported('jpopsuki')

src/you_get/downloader/khan.py Executable file

@ -0,0 +1,15 @@
#!/usr/bin/env python
__all__ = ['khan_download']
from ..common import *
from .youtube import youtube_download_by_id
def khan_download(url, output_dir = '.', merge = True, info_only = False):
page = get_html(url)
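# The YouTube video id sits in the iframe src, between the embed prefix and the query string.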
id = page[page.find('src="https://www.youtube.com/embed/') + len('src="https://www.youtube.com/embed/') :page.find('?enablejsapi=1&wmode=transparent&modestbranding=1&rel=0&fs=1&showinfo=0')]
youtube_download_by_id(id, output_dir=output_dir, merge=merge, info_only=info_only)
site_info = "khanacademy.org"
download = khan_download
download_playlist = playlist_not_supported('khan')

src/you_get/downloader/netease.py

@ -7,10 +7,13 @@ from ..common import *
def netease_download(url, output_dir = '.', merge = True, info_only = False):
html = get_decoded_html(url)
src = r1(r'<source src="([^"]+)"', html)
title = r1('movieDescription=\'([^\']+)\'', html)
title = r1('movieDescription=\'([^\']+)\'', html) or r1('<title>(.+)</title>', html)
if title[0] == ' ':
title = title[1:]
if title:
src = r1(r'<source src="([^"]+)"', html) or r1(r'<source type="[^"]+" src="([^"]+)"', html)
if src:
sd_url = r1(r'(.+)-mobile.mp4', src) + ".flv"
_, _, sd_size = url_info(sd_url)
@ -24,10 +27,7 @@ def netease_download(url, output_dir = '.', merge = True, info_only = False):
ext = 'flv'
else:
title = r1('<title>(.+)</title>', html)
if title[0] == ' ':
title = title[1:]
url = r1(r'(.+)-list.m3u8', src) + ".mp4"
url = r1(r'["\'](.+)-list.m3u8["\']', html) + ".mp4"
_, _, size = url_info(url)
ext = 'mp4'

src/you_get/downloader/nicovideo.py

@ -23,7 +23,7 @@ def nicovideo_download(url, output_dir = '.', merge = True, info_only = False):
nicovideo_login(user, password)
html = get_html(url) # necessary!
title = unicodize(r1(r'title:\s*\'(.*)\',', html))
title = unicodize(r1(r'<span class="videoHeaderTitle">([^<]+)</span>', html))
api_html = get_html('http://www.nicovideo.jp/api/getflv?v=%s' % url.split('/')[-1])
real_url = parse.unquote(r1(r'url=([^&]+)&', api_html))

src/you_get/downloader/pptv.py

@ -9,18 +9,14 @@ import urllib
import hashlib
def pptv_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
xml = get_html('http://web-play.pptv.com/webplay3-151-%s.xml' % id)
xml = get_html('http://web-play.pptv.com/webplay3-0-%s.xml?type=web.fpp' % id)
host = r1(r'<sh>([^<>]+)</sh>', xml)
port = 8080
st = r1(r'<st>([^<>]+)</st>', xml).encode('utf-8')
key = hashlib.md5(st).hexdigest() # FIXME: incorrect key
rids = re.findall(r'rid="([^"]+)"', xml)
key = r1(r'<key expire=[^<>]+>([^<>]+)</key>', xml)
rid = r1(r'rid="([^"]+)"', xml)
title = r1(r'nm="([^"]+)"', xml)
pieces = re.findall('<sgm no="(\d+)".*fs="(\d+)"', xml)
numbers, fs = zip(*pieces)
urls = ['http://%s:%s/%s/%s?key=%s' % (host, port, i, rid, key) for i in numbers]
urls = ['http://pptv.vod.lxdns.com/%s/%s?key=%s' % (i, rid, key) for i in numbers]
urls = ['http://%s/%s/%s?k=%s' % (host, i, rid, key) for i in numbers]
total_size = sum(map(int, fs))
assert rid.endswith('.mp4')

src/you_get/downloader/sina.py

@ -1,20 +1,22 @@
#!/usr/bin/env python
__all__ = ['sina_download', 'sina_download_by_id']
__all__ = ['sina_download', 'sina_download_by_vid', 'sina_download_by_vkey']
from ..common import *
import re
def video_info(id):
xml = get_decoded_html('http://v.iask.com/v_play.php?vid=%s' % id)
xml = get_content('http://v.iask.com/v_play.php?vid=%s' % id, decoded=True)
urls = re.findall(r'<url>(?:<!\[CDATA\[)?(.*?)(?:\]\]>)?</url>', xml)
name = r1(r'<vname>(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?</vname>', xml)
vstr = r1(r'<vstr>(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?</vstr>', xml)
name = match1(xml, r'<vname>(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?</vname>')
vstr = match1(xml, r'<vstr>(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?</vstr>')
return urls, name, vstr
def sina_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
urls, name, vstr = video_info(id)
def sina_download_by_vid(vid, title=None, output_dir='.', merge=True, info_only=False):
"""Downloads a Sina video by its unique vid.
http://video.sina.com.cn/
"""
urls, name, vstr = video_info(vid)
title = title or name
assert title
size = 0
@ -26,11 +28,36 @@ def sina_download_by_id(id, title = None, output_dir = '.', merge = True, info_o
if not info_only:
download_urls(urls, title, 'flv', size, output_dir = output_dir, merge = merge)
def sina_download(url, output_dir = '.', merge = True, info_only = False):
id = r1(r'[^_]vid\s*:\s*\'([^\']+)\',', get_html(url)).split('|')[-1]
assert id
def sina_download_by_vkey(vkey, title=None, output_dir='.', merge=True, info_only=False):
"""Downloads a Sina video by its unique vkey.
http://video.sina.com/
"""
sina_download_by_id(id, output_dir = output_dir, merge = merge, info_only = info_only)
url = 'http://video.sina.com/v/flvideo/%s_0.flv' % vkey
type, ext, size = url_info(url)
print_info(site_info, title, 'flv', size)
if not info_only:
download_urls([url], title, 'flv', size, output_dir = output_dir, merge = merge)
def sina_download(url, output_dir='.', merge=True, info_only=False):
"""Downloads Sina videos by URL.
"""
vid = match1(url, r'vid=(\d+)')
if vid is None:
video_page = get_content(url)
vid = hd_vid = match1(video_page, r'hd_vid\s*:\s*\'([^\']+)\'')
if hd_vid == '0':
vids = match1(video_page, r'[^\w]vid\s*:\s*\'([^\']+)\'').split('|')
vid = vids[-1]
if vid:
sina_download_by_vid(vid, output_dir=output_dir, merge=merge, info_only=info_only)
else:
vkey = match1(video_page, r'vkey\s*:\s*"([^"]+)"')
title = match1(video_page, r'title\s*:\s*"([^"]+)"')
sina_download_by_vkey(vkey, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
site_info = "Sina.com"
download = sina_download

src/you_get/downloader/sohu.py

@ -8,7 +8,7 @@ import json
def real_url(host, prot, file, new):
url = 'http://%s/?prot=%s&file=%s&new=%s' % (host, prot, file, new)
start, _, host, key, _, _ = get_html(url).split('|')
start, _, host, key = get_html(url).split('|')[:4]
return '%s%s?key=%s' % (start[:-1], new, key)
def sohu_download(url, output_dir = '.', merge = True, info_only = False):

src/you_get/downloader/ted.py

@ -0,0 +1,24 @@
#!/usr/bin/env python
__all__ = ['ted_download']
from ..common import *
def ted_download(url, output_dir = '.', merge = True, info_only = False):
page = get_html(url).split("\n")
for line in page:
if line.find("<title>") > -1:
title = line.replace("<title>", "").replace("</title>", "").replace("\t", "")
title = title[:title.find(' | ')]
if line.find("no-flash-video-download") > -1:
url = line.replace('<a id="no-flash-video-download" href="', "").replace(" ", "").replace("\t", "").replace(".mp4", "-480p-en.mp4")
url = url[:url.find('"')]
type, ext, size = url_info(url)
print_info(site_info, title, type, size)
if not info_only:
download_urls([url], title, ext, size, output_dir, merge=merge)
break
site_info = "ted.com"
download = ted_download
download_playlist = playlist_not_supported('ted')

src/you_get/downloader/tudou.py

@ -5,26 +5,31 @@ __all__ = ['tudou_download', 'tudou_download_playlist', 'tudou_download_by_id',
from ..common import *
def tudou_download_by_iid(iid, title, output_dir = '.', merge = True, info_only = False):
xml = get_html('http://v2.tudou.com/v?it=' + iid + '&st=1,2,3,4,99')
data = json.loads(get_decoded_html('http://www.tudou.com/outplay/goto/getItemSegs.action?iid=%s' % iid))
vids = []
for k in data:
if len(data[k]) == 1:
vids.append({"k": data[k][0]["k"], "size": data[k][0]["size"]})
temp = max(vids, key=lambda x:x["size"])
vid, size = temp["k"], temp["size"]
xml = get_html('http://ct.v2.tudou.com/f?id=%s' % vid)
from xml.dom.minidom import parseString
doc = parseString(xml)
title = title or doc.firstChild.getAttribute('tt') or doc.firstChild.getAttribute('title')
urls = [(int(n.getAttribute('brt')), n.firstChild.nodeValue.strip()) for n in doc.getElementsByTagName('f')]
url = max(urls, key = lambda x:x[0])[1]
assert 'f4v' in url
type, ext, size = url_info(url)
print_info(site_info, title, type, size)
if not info_only:
#url_save(url, filepath, bar):
download_urls([url], title, ext, total_size = None, output_dir = output_dir, merge = merge)
url = [n.firstChild.nodeValue.strip() for n in doc.getElementsByTagName('f')][0]
def tudou_download_by_id(id, title, output_dir = '.', merge = True, info_only = False):
ext = r1(r'http://[\w.]*/(\w+)/[\w.]*', url)
print_info(site_info, title, ext, size)
if not info_only:
download_urls([url], title, ext, size, output_dir = output_dir, merge = merge)
def tudou_download_by_id(id, title, output_dir = '.', merge = True, info_only = False):
html = get_html('http://www.tudou.com/programs/view/%s/' % id)
iid = r1(r'iid\s*[:=]\s*(\S+)', html)
iid = r1(r'iid\s*[:=]\s*(\S+)', html)
title = r1(r'kw\s*[:=]\s*[\'\"]([^\']+?)[\'\"]', html)
tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)
def tudou_download(url, output_dir = '.', merge = True, info_only = False):
@ -76,4 +81,4 @@ def tudou_download_playlist(url, output_dir = '.', merge = True, info_only = Fal
site_info = "Tudou.com"
download = tudou_download
download_playlist = tudou_download_playlist
download_playlist = tudou_download_playlist

src/you_get/downloader/tumblr.py

@ -10,7 +10,9 @@ def tumblr_download(url, output_dir = '.', merge = True, info_only = False):
html = get_html(url)
html = parse.unquote(html).replace('\/', '/')
title = unescape_html(r1(r'<meta property="og:title" content="([^"]*)" />', html))
title = unescape_html(r1(r'<meta property="og:title" content="([^"]*)" />', html) or
r1(r'<meta property="og:description" content="([^"]*)" />', html) or
r1(r'<title>([^<\n]*)', html)).replace('\n', '')
real_url = r1(r'source src=\\x22([^\\]+)\\', html)
if not real_url:
real_url = r1(r'audio_file=([^&]+)&', html) + '?plead=please-dont-download-this-or-our-lawyers-wont-let-us-host-audio'

src/you_get/downloader/xiami.py

@ -55,11 +55,14 @@ def xiami_download_song(sid, output_dir = '.', merge = True, info_only = False):
if not ext:
ext = 'mp3'
print_info(site_info, song_title, type, size)
print_info(site_info, song_title, ext, size)
if not info_only:
file_name = "%s - %s - %s" % (song_title, album_name, artist)
download_urls([url], file_name, ext, size, output_dir, merge = merge, faker = True)
xiami_download_lyric(lrc_url, file_name, output_dir)
try:
xiami_download_lyric(lrc_url, file_name, output_dir)
except:
pass
def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = False):
html = get_html('http://www.xiami.com/song/showcollect/id/' + cid, faker = True)
@ -84,7 +87,10 @@ def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only =
if not info_only:
file_name = "%02d.%s - %s - %s" % (track_nr, song_title, artist, album_name)
download_urls([url], file_name, ext, size, output_dir, merge = merge, faker = True)
xiami_download_lyric(lrc_url, file_name, output_dir)
try:
xiami_download_lyric(lrc_url, file_name, output_dir)
except:
pass
track_nr += 1
@ -112,7 +118,10 @@ def xiami_download_album(aid, output_dir = '.', merge = True, info_only = False)
if not info_only:
file_name = "%02d.%s" % (track_nr, song_title)
download_urls([url], file_name, ext, size, output_dir, merge = merge, faker = True)
xiami_download_lyric(lrc_url, file_name, output_dir)
try:
xiami_download_lyric(lrc_url, file_name, output_dir)
except:
pass
if not pic_exist:
xiami_download_pic(pic_url, 'cover', output_dir)
pic_exist = True
@ -131,6 +140,10 @@ def xiami_download(url, output_dir = '.', stream_type = None, merge = True, info
if re.match('http://www.xiami.com/song/\d+', url):
id = r1(r'http://www.xiami.com/song/(\d+)', url)
xiami_download_song(id, output_dir, merge, info_only)
if re.match('http://www.xiami.com/song/detail/id/\d+', url):
id = r1(r'http://www.xiami.com/song/detail/id/(\d+)', url)
xiami_download_song(id, output_dir, merge, info_only)
site_info = "Xiami.com"
download = xiami_download

src/you_get/downloader/yinyuetai.py

@ -20,10 +20,10 @@ def yinyuetai_download_by_id(id, title = None, output_dir = '.', merge = True, i
download_urls([url], title, ext, size, output_dir, merge = merge)
def yinyuetai_download(url, output_dir = '.', merge = True, info_only = False):
id = r1(r'http://www.yinyuetai.com/video/(\d+)$', url)
id = r1(r'http://\w+.yinyuetai.com/video/(\d+)$', url)
assert id
html = get_html(url, 'utf-8')
title = r1(r'<meta property="og:title" content="([^"]+)"/>', html)
title = r1(r'<meta property="og:title"\s+content="([^"]+)"/>', html)
assert title
title = parse.unquote(title)
title = escape_file_path(title)

src/you_get/downloader/youku.py

@ -25,7 +25,7 @@ def find_video_id_from_url(url):
return r1_of(patterns, url)
def find_video_id_from_show_page(url):
return re.search(r'<div class="btnplay">.*href="([^"]+)"', get_html(url)).group(1)
return re.search(r'<a class="btnShow btnplay.*href="([^"]+)"', get_html(url)).group(1)
def youku_url(url):
id = find_video_id_from_url(url)
@ -61,7 +61,7 @@ def parse_video_title(url, page):
def parse_playlist_title(url, page):
if re.search(r'v_playlist', url):
# if we are playing a viedo from play list, the meta title might be incorrect
# if we are playing a video from play list, the meta title might be incorrect
title = re.search(r'<title>([^<>]*)</title>', page).group(1)
else:
title = re.search(r'<meta name="title" content="([^"]*)"', page).group(1)
@ -80,8 +80,8 @@ def parse_page(url):
return id2, title
def get_info(videoId2):
return json.loads(get_html('http://v.youku.com/player/getPlayList/VideoIDS/' + videoId2))
return json.loads(get_html('http://v.youku.com/player/getPlayList/VideoIDS/' + videoId2 + '/timezone/+08/version/5/source/out/Sc/2'))
def find_video(info, stream_type = None):
#key = '%s%x' % (info['data'][0]['key2'], int(info['data'][0]['key1'], 16) ^ 0xA55AA5A5)
segs = info['data'][0]['segs']
@ -120,28 +120,16 @@ def find_video(info, stream_type = None):
def file_type_of_url(url):
return str(re.search(r'/st/([^/]+)/', url).group(1))
def youku_download_by_id(id2, title, output_dir = '.', stream_type = None, merge = True, info_only = False):
info = get_info(id2)
def youku_download_by_id(id, title, output_dir = '.', stream_type = None, merge = True, info_only = False):
info = get_info(id)
urls, sizes = zip(*find_video(info, stream_type))
ext = file_type_of_url(urls[0])
total_size = sum(sizes)
urls = url_locations(urls) # Use real (redirected) URLs for resuming of downloads
print_info(site_info, title, ext, total_size)
if not info_only:
download_urls(urls, title, ext, total_size, output_dir, merge = merge)
def youku_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False):
if not youku_url(url):
youku_download_playlist(url, output_dir, merge, info_only)
return
id2, title = parse_page(url)
title = title.replace('?', '-')
youku_download_by_id(id2, title, output_dir, merge = merge, info_only = info_only)
def parse_playlist_videos(html):
return re.findall(r'id="A_(\w+)"', html)
@ -175,9 +163,9 @@ def parse_vplaylist(url):
n = int(re.search(r'<span class="num">(\d+)</span>', get_html(url)).group(1))
return ['http://v.youku.com/v_playlist/f%so0p%s.html' % (id, i) for i in range(n)]
def youku_download_playlist(url, output_dir = '.', merge = True, info_only = False):
if re.match(r'http://www.youku.com/show_page/id_\w+.html', url):
url = find_video_id_from_show_page(url)
def youku_download_playlist(url, output_dir='.', merge=True, info_only=False):
"""Downloads a Youku playlist.
"""
if re.match(r'http://www.youku.com/playlist_show/id_\d+(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html', url):
ids = parse_vplaylist(url)
@ -185,21 +173,36 @@ def youku_download_playlist(url, output_dir = '.', merge = True, info_only = Fal
ids = parse_vplaylist(url)
elif re.match(r'http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html', url):
ids = parse_vplaylist(url)
else:
elif re.match(r'http://www.youku.com/show_page/id_\w+.html', url):
url = find_video_id_from_show_page(url)
assert re.match(r'http://v.youku.com/v_show/id_([\w=]+).html', url), 'URL not supported as playlist'
ids = parse_playlist(url)
else:
ids = []
assert ids != []
title = parse_playlist_title(url, get_html(url))
title = title.replace('?', '-')
title = filenameable(title)
output_dir = os.path.join(output_dir, title)
for i, id in enumerate(ids):
print('Processing %s of %s videos...' % (i + 1, len(ids)))
try:
print('Processing %s of %s videos...' % (i + 1, len(ids)))
youku_download(id, output_dir, merge = merge, info_only = info_only)
id, title = parse_page(youku_url(id))
youku_download_by_id(id, title, output_dir=output_dir, merge=merge, info_only=info_only)
except:
continue
def youku_download(url, output_dir='.', merge=True, info_only=False):
"""Downloads Youku videos by URL.
"""
try:
youku_download_playlist(url, output_dir=output_dir, merge=merge, info_only=info_only)
except:
id, title = parse_page(url)
youku_download_by_id(id, title=title, output_dir=output_dir, merge=merge, info_only=info_only)
site_info = "Youku.com"
download = youku_download
download_playlist = youku_download_playlist

src/you_get/downloader/youtube.py

@ -6,7 +6,7 @@ from ..common import *
# YouTube media encoding options, in descending quality order.
# taken from http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs, 3/22/2013.
youtube_codecs = [
yt_codecs = [
{'itag': 38, 'container': 'MP4', 'video_resolution': '3072p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3.5-5', 'audio_encoding': 'AAC', 'audio_bitrate': '192'},
{'itag': 46, 'container': 'WebM', 'video_resolution': '1080p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'},
{'itag': 37, 'container': 'MP4', 'video_resolution': '1080p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3-4.3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'},
@ -32,102 +32,70 @@ youtube_codecs = [
{'itag': 17, 'container': '3GP', 'video_resolution': '144p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.05', 'audio_encoding': 'AAC', 'audio_bitrate': '24'},
]
def parse_video_info(raw_info):
"""Parser for YouTube's get_video_info data.
Returns a dict, where 'url_encoded_fmt_stream_map' maps to a sorted list.
def decipher(js, s):
def tr_js(code):
code = re.sub(r'function', r'def', code)
code = re.sub(r'\{', r':\n\t', code)
code = re.sub(r'\}', r'\n', code)
code = re.sub(r'var\s+', r'', code)
code = re.sub(r'(\w+).join\(""\)', r'"".join(\1)', code)
code = re.sub(r'(\w+).length', r'len(\1)', code)
code = re.sub(r'(\w+).reverse\(\)', r'\1[::-1]', code)
code = re.sub(r'(\w+).slice\((\d+)\)', r'\1[\2:]', code)
code = re.sub(r'(\w+).split\(""\)', r'list(\1)', code)
return code
f1 = match1(js, r'g.sig\|\|(\w+)\(g.s\)')
f1def = match1(js, r'(function %s\(\w+\)\{[^\{]+\})' % f1)
code = tr_js(f1def)
f2 = match1(f1def, r'(\w+)\(\w+,\d+\)')
if f2 is not None:
f2def = match1(js, r'(function %s\(\w+,\w+\)\{[^\{]+\})' % f2)
code = code + 'global %s\n' % f2 + tr_js(f2def)
code = code + 'sig=%s(s)' % f1
exec(code, globals(), locals())
return locals()['sig']
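Note: for intuition, a toy example of the JS-to-Python translation tr_js performs (hypothetical player code):
# JS:     function xy(a){a=a.split("");a=a.reverse();return a.join("")}
# Python: def xy(a):
#             a=list(a);a=a[::-1];return "".join(a)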
def youtube_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False):
"""Downloads a YouTube video by its unique id.
"""
# Percent-encoding reserved characters, used as separators.
sepr = {
'&': '%26',
',': '%2C',
'=': '%3D',
raw_video_info = get_content('http://www.youtube.com/get_video_info?video_id=%s' % id)
video_info = parse.parse_qs(raw_video_info)
if video_info['status'] == ['ok'] and ('use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']):
title = parse.unquote_plus(video_info['title'][0])
stream_list = parse.parse_qs(raw_video_info)['url_encoded_fmt_stream_map'][0].split(',')
else:
# Parse video page when video_info is not usable.
video_page = get_content('http://www.youtube.com/watch?v=%s' % id)
ytplayer_config = json.loads(match1(video_page, r'ytplayer.config\s*=\s*([^\n]+);'))
title = ytplayer_config['args']['title']
stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
html5player = ytplayer_config['assets']['js']
streams = {
parse.parse_qs(stream)['itag'][0] : parse.parse_qs(stream)
for stream in stream_list
}
# fmt_level = {'itag': level, ...}
# itag of a higher quality maps to a lower level number.
# The highest quality has level number 0.
fmt_level = dict(
zip(
[str(codec['itag'])
for codec in
youtube_codecs],
range(len(youtube_codecs))))
for codec in yt_codecs:
itag = str(codec['itag'])
if itag in streams:
download_stream = streams[itag]
break
# {key1: value1, key2: value2, ...,
# 'url_encoded_fmt_stream_map': [{'itag': '38', ...}, ...]
# }
return dict(
[(lambda metadata:
['url_encoded_fmt_stream_map', (
lambda stream_map:
sorted(
[dict(
[subitem.split(sepr['='])
for subitem in
item.split(sepr['&'])])
for item in
stream_map.split(sepr[','])],
key =
lambda stream:
fmt_level[stream['itag']]))
(metadata[1])]
if metadata[0] == 'url_encoded_fmt_stream_map'
else metadata)
(item.split('='))
for item in
raw_info.split('&')])
def youtube_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
raw_info = request.urlopen('http://www.youtube.com/get_video_info?video_id=%s' % id).read().decode('utf-8')
video_info = parse_video_info(raw_info)
if video_info['status'] == 'ok': # use get_video_info data
title = parse.unquote(video_info['title'].replace('+', ' '))
signature = video_info['url_encoded_fmt_stream_map'][0]['sig']
url = parse.unquote(parse.unquote(video_info['url_encoded_fmt_stream_map'][0]['url'])) + "&signature=%s" % signature
else: # parse video page when "embedding disabled by request"
import json
html = request.urlopen('http://www.youtube.com/watch?v=' + id).read().decode('utf-8')
html = unescape_html(html)
yt_player_config = json.loads(r1(r'ytplayer.config = ([^\n]+);', html))
title = yt_player_config['args']['title']
title = unicodize(title)
title = parse.unquote(title)
title = escape_file_path(title)
for itag in [
'38',
'46', '37',
'102', '45', '22',
'84',
'120',
'85',
'44', '35',
'101', '100', '43', '34', '82', '18',
'6', '83', '13', '5', '36', '17',
]:
fmt = r1(r'([^,\"]*itag=' + itag + "[^,\"]*)", html)
if fmt:
url = r1(r'url=([^\\]+)', fmt)
url = unicodize(url)
url = parse.unquote(url)
sig = r1(r'sig=([^\\]+)', fmt)
url = url + '&signature=' + sig
break
try:
url
except NameError:
url = r1(r'ytdns.ping\("([^"]+)"[^;]*;</script>', html)
url = unicodize(url)
url = re.sub(r'\\/', '/', url)
url = re.sub(r'generate_204', 'videoplayback', url)
url = download_stream['url'][0]
if 'sig' in download_stream:
sig = download_stream['sig'][0]
else:
js = get_content(html5player)
sig = decipher(js, download_stream['s'][0])
url = '%s&signature=%s' % (url, sig)
type, ext, size = url_info(url)
@ -135,13 +103,14 @@ def youtube_download_by_id(id, title = None, output_dir = '.', merge = True, inf
if not info_only:
download_urls([url], title, ext, size, output_dir, merge = merge)
def youtube_download(url, output_dir = '.', merge = True, info_only = False):
id = r1(r'youtu.be/(.*)', url)
if not id:
id = parse.parse_qs(parse.urlparse(url).query)['v'][0]
def youtube_download(url, output_dir='.', merge=True, info_only=False):
"""Downloads YouTube videos by URL.
"""
id = match1(url, r'youtu.be/([^/]+)') or parse_query_param(url, 'v')
assert id
youtube_download_by_id(id, None, output_dir, merge = merge, info_only = info_only)
youtube_download_by_id(id, title=None, output_dir=output_dir, merge=merge, info_only=info_only)
site_info = "YouTube.com"
download = youtube_download

src/you_get/version.py

@ -1,6 +1,5 @@
#!/usr/bin/env python
__all__ = ['__version__', '__date__']
__version__ = '0.3.12'
__date__ = '2013-05-19'
__version__ = '0.3.21'
__date__ = '2013-08-17'

tests/test.py

@ -4,7 +4,7 @@
import unittest
from you_get import *
from you_get.__main__ import url_to_module
from you_get.downloader.__main__ import url_to_module
def test_urls(urls):
for url in urls:
@ -17,11 +17,6 @@ class YouGetTests(unittest.TestCase):
"http://www.freesound.org/people/Corsica_S/sounds/184419/",
])
def test_jpopsuki(self):
test_urls([
#"http://jpopsuki.tv/video/Dragon-Ash---Run-to-the-Sun/8ad7aec604badd0b0798cd999b63ae17",
])
def test_mixcloud(self):
test_urls([
"http://www.mixcloud.com/beatbopz/beat-bopz-disco-mix/",

tests/test_common.py Normal file

@ -0,0 +1,11 @@
#!/usr/bin/env python
import unittest
from you_get import *
class TestCommon(unittest.TestCase):
def test_match1(self):
self.assertEqual(match1('http://youtu.be/1234567890A', r'youtu.be/([^/]+)'), '1234567890A')
self.assertEqual(match1('http://youtu.be/1234567890A', r'youtu.be/([^/]+)', r'youtu.(\w+)'), ['1234567890A', 'be'])

you-get

@ -1,9 +1,10 @@
#!/usr/bin/env python3
import os, sys
sys.path.insert(0, os.path.join((os.path.dirname(os.path.realpath(__file__))), "src"))
__path__ = os.path.dirname(os.path.realpath(__file__))
__srcdir__ = 'src'
sys.path.insert(1, os.path.join(__path__, __srcdir__))
from you_get.downloader import main
from you_get import *
if __name__ == "__main__":
if __name__ == '__main__':
main()

you-get.json

@ -31,6 +31,6 @@
],
"console_scripts": [
"you-get = you_get.__main__:main"
"you-get = you_get.downloader.__main__:main"
]
}