diff --git a/.gitignore b/.gitignore
index 63b93fe6..0a5d13ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ _*/
 *.3gp
 *.asf
 *.flv
+*.lrc
 *.mkv
 *.mp3
 *.mp4
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 264971d6..67cbb1fb 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -1,6 +1,93 @@
 Changelog
 =========
 
+0.3.21
+------
+
+*Date: 2013-08-17*
+
+* Fix issues for:
+  - YouTube
+  - YinYueTai
+  - pan.baidu.com
+
+0.3.20
+------
+
+*Date: 2013-08-16*
+
+* Add support for:
+  - eHow
+  - Khan Academy
+  - TED
+  - 5sing
+* Fix issues for:
+  - Tudou
+
+0.3.18
+------
+
+*Date: 2013-07-19*
+
+* Fix issues for:
+  - Dailymotion
+  - Youku
+  - Sina
+  - AcFun
+  - bilibili
+
+0.3.17
+------
+
+*Date: 2013-07-12*
+
+* Fix issues for:
+  - YouTube
+  - 163
+  - bilibili
+* Code cleanup.
+
+0.3.16
+------
+
+*Date: 2013-06-28*
+
+* Fix issues for:
+  - YouTube
+  - Sohu
+  - Google+ (enable HTTPS proxy)
+
+0.3.15
+------
+
+*Date: 2013-06-21*
+
+* Add support for:
+  - Instagram
+
+0.3.14
+------
+
+*Date: 2013-06-14*
+
+* Add support for:
+  - Alive.in.th
+* Remove support for:
+  - JPopsuki
+* Fix issues for:
+  - AcFun
+  - iQIYI
+
+0.3.13
+------
+
+*Date: 2013-06-07*
+
+* Add support for:
+  - Baidu Wangpan (video only)
+* Fix issue for:
+  - Google+
+
 0.3.12
 ------
 
@@ -86,7 +173,7 @@ Changelog
 * Add support for:
   - Douban
   - MioMio
-* Fix issue for:
+* Fix issues for:
   - Tudou
   - Vimeo
 
diff --git a/README.md b/README.md
index 7579f81a..52d5d30c 100644
--- a/README.md
+++ b/README.md
@@ -17,15 +17,18 @@ Fork me on GitHub:
 * Coursera
 * Blip
 * Dailymotion
+* eHow
 * Facebook
 * Google+
 * Google Drive
+* Khan Academy
+* TED
 * Tumblr
 * Vine
+* Instagram
 * SoundCloud
 * Mixcloud
 * Freesound
-* JPopsuki
 * VID48
 * Niconico (ニコニコ動画)
 * Youku (优酷)
@@ -47,8 +50,11 @@ Fork me on GitHub:
 * Sohu (搜狐视频)
 * 56 (56网)
 * Xiami (虾米)
-* Baidu (百度音乐)
+* 5sing
+* Baidu Music (百度音乐)
+* Baidu Wangpan (百度网盘)
 * SongTaste
+* Alive.in.th
 
 ## Dependencies
 
@@ -233,15 +239,18 @@ You-Get基于优酷下载脚本[iambus/youku-lixian](https://github.com/iambus/y
 * Coursera
 * Blip
 * Dailymotion
+* eHow
 * Facebook
 * Google+
 * Google Drive
+* Khan Academy
+* TED
 * Tumblr
 * Vine
+* Instagram
 * SoundCloud
 * Mixcloud
 * Freesound
-* JPopsuki
 * VID48
 * NICONICO动画
 * 优酷
@@ -263,8 +272,11 @@ You-Get基于优酷下载脚本[iambus/youku-lixian](https://github.com/iambus/y
 * 搜狐视频
 * 56网
 * 虾米
+* 5sing
 * 百度音乐
+* 百度网盘
 * SongTaste
+* Alive.in.th
 
 ## 依赖
 
diff --git a/README.txt b/README.txt
index d90bd5d1..7bd8a1ee 100644
--- a/README.txt
+++ b/README.txt
@@ -20,15 +20,18 @@ Supported Sites (As of Now)
 * Coursera     https://www.coursera.org
 * Blip         http://blip.tv
 * Dailymotion  http://dailymotion.com
+* eHow         http://www.ehow.com
 * Facebook     http://facebook.com
 * Google+      http://plus.google.com
 * Google Drive http://docs.google.com
+* Khan Academy http://www.khanacademy.org
+* TED          http://www.ted.com
 * Tumblr       http://www.tumblr.com
 * Vine         http://vine.co
+* Instagram    http://instagram.com
 * SoundCloud   http://soundcloud.com
 * Mixcloud     http://www.mixcloud.com
 * Freesound    http://www.freesound.org
-* JPopsuki     http://jpopsuki.tv
 * VID48        http://vid48.com
 * Niconico (ニコニコ動画) http://www.nicovideo.jp
 * Youku (优酷) http://www.youku.com
@@ -50,8 +53,11 @@ Supported Sites (As of Now)
 * Sohu (搜狐视频) http://tv.sohu.com
 * 56 (56网)    http://www.56.com
 * Xiami (虾米) http://www.xiami.com
-* Baidu (百度音乐) http://music.baidu.com
+* 5sing        http://www.5sing.com
+* Baidu Music (百度音乐) http://music.baidu.com
+* Baidu Wangpan (百度网盘) http://pan.baidu.com
 * SongTaste    http://www.songtaste.com
+* Alive.in.th  http://alive.in.th
 
 Dependencies
 ------------
 
diff --git a/src/you_get/__init__.py b/src/you_get/__init__.py
index 0256fcfe..ecca35d2 100644
--- a/src/you_get/__init__.py
+++ b/src/you_get/__init__.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python
 
-from .processor import *
-
-from .downloader import *
-
-from .version import *
 from .common import *
-from .__main__ import *
+from .version import *
+
+# Easy import
+#from .cli_wrapper.converter import *
+#from .cli_wrapper.player import *
+from .downloader import *
diff --git a/src/you_get/common.py b/src/you_get/common.py
index 7f708511..8faf907a 100644
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -7,6 +7,7 @@ import os
 import re
 import sys
 from urllib import request, parse
+import platform
 
 from .version import __version__
 
@@ -33,20 +34,63 @@ def tr(s):
     except:
         return str(s.encode('utf-8'))[2:-1]
 
+# DEPRECATED in favor of match1()
 def r1(pattern, text):
     m = re.search(pattern, text)
     if m:
         return m.group(1)
 
+# DEPRECATED in favor of match1()
 def r1_of(patterns, text):
     for p in patterns:
         x = r1(p, text)
         if x:
             return x
 
+def match1(text, *patterns):
+    """Scans through a string for substrings matching some patterns (first subgroups only).
+
+    Args:
+        text: A string to be scanned.
+        patterns: Arbitrary number of regex patterns.
+
+    Returns:
+        When only one pattern is given, returns a string (None if no match found).
+        When more than one pattern is given, returns a list of strings ([] if no match found).
+    """
+
+    if len(patterns) == 1:
+        pattern = patterns[0]
+        match = re.search(pattern, text)
+        if match:
+            return match.group(1)
+        else:
+            return None
+    else:
+        ret = []
+        for pattern in patterns:
+            match = re.search(pattern, text)
+            if match:
+                ret.append(match.group(1))
+        return ret
+
+def parse_query_param(url, param):
+    """Parses the query string of a URL and returns the value of a parameter.
+
+    Args:
+        url: A URL.
+        param: A string representing the name of the parameter.
+
+    Returns:
+        The value of the parameter.
+    """
+
+    return parse.parse_qs(parse.urlparse(url).query)[param][0]
+
 def unicodize(text):
     return re.sub(r'\\u([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])', lambda x: chr(int(x.group(0)[2:], 16)), text)
 
+# DEPRECATED in favor of filenameable()
 def escape_file_path(path):
     path = path.replace('/', '-')
     path = path.replace('\\', '-')
@@ -54,23 +98,57 @@ def escape_file_path(path):
     path = path.replace('?', '-')
     return path
 
+def filenameable(text):
+    """Converts a string to a legal filename across various OSes.
+    """
+    # All POSIX systems
+    text = text.translate({
+        0: None,
+        ord('/'): '-',
+    })
+    if platform.system() == 'Darwin': # For Mac OS
+        text = text.translate({
+            ord(':'): '-',
+        })
+    elif platform.system() == 'Windows': # For Windows
+        text = text.translate({
+            ord(':'): '-',
+            ord('*'): '-',
+            ord('?'): '-',
+            ord('\\'): '-',
+            ord('\"'): '\'',
+            ord('<'): '-',
+            ord('>'): '-',
+            ord('|'): '-',
+            ord('+'): '-',
+            ord('['): '(',
+            ord(']'): ')',
+        })
+    return text
+
 def unescape_html(html):
     from html import parser
     html = parser.HTMLParser().unescape(html)
     html = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), html)
     return html
 
-def ungzip(s):
+def ungzip(data):
+    """Decompresses data for Content-Encoding: gzip.
+    """
     from io import BytesIO
     import gzip
-    buffer = BytesIO(s)
-    f = gzip.GzipFile(fileobj = buffer)
+    buffer = BytesIO(data)
+    f = gzip.GzipFile(fileobj=buffer)
     return f.read()
 
-def undeflate(s):
+def undeflate(data):
+    """Decompresses data for Content-Encoding: deflate.
+    (The zlib compression is used.)
+    """
    import zlib
-    return zlib.decompress(s, -zlib.MAX_WBITS)
+    return zlib.decompress(data, -zlib.MAX_WBITS)
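
A quick tour of the new helpers, as exercised by tests/test_common.py later in this patch -- a minimal sketch (sample strings and URLs fabricated), assuming the package's src directory is importable:

    from you_get.common import match1, parse_query_param, filenameable

    match1('watch?v=abc123&t=42s', r'v=(\w+)')              # -> 'abc123'
    match1('watch?v=abc123&t=42s', r'v=(\w+)', r't=(\w+)')  # -> ['abc123', '42s']
    parse_query_param('http://example.com/watch?v=abc123', 'v')   # -> 'abc123'
    filenameable('a/b: c?')  # '/' -> '-' everywhere; ':' and '?' only on Mac OS/Windows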
+ """ import zlib - return zlib.decompress(s, -zlib.MAX_WBITS) + return zlib.decompress(data, -zlib.MAX_WBITS) +# DEPRECATED in favor of get_content() def get_response(url, faker = False): if faker: response = request.urlopen(request.Request(url, headers = fake_headers), None) @@ -85,10 +163,12 @@ def get_response(url, faker = False): response.data = data return response +# DEPRECATED in favor of get_content() def get_html(url, encoding = None, faker = False): content = get_response(url, faker).data return str(content, 'utf-8', 'ignore') +# DEPRECATED in favor of get_content() def get_decoded_html(url, faker = False): response = get_response(url, faker) data = response.data @@ -98,6 +178,38 @@ def get_decoded_html(url, faker = False): else: return data +def get_content(url, headers={}, decoded=True): + """Gets the content of a URL via sending a HTTP GET request. + + Args: + url: A URL. + headers: Request headers used by the client. + decoded: Whether decode the response body using UTF-8 or the charset specified in Content-Type. + + Returns: + The content as a string. + """ + + response = request.urlopen(request.Request(url, headers=headers)) + data = response.read() + + # Handle HTTP compression for gzip and deflate (zlib) + content_encoding = response.getheader('Content-Encoding') + if content_encoding == 'gzip': + data = ungzip(data) + elif content_encoding == 'deflate': + data = undeflate(data) + + # Decode the response body + if decoded: + charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)') + if charset is not None: + data = data.decode(charset) + else: + data = data.decode('utf-8') + + return data + def url_size(url, faker = False): if faker: response = request.urlopen(request.Request(url, headers = fake_headers), None) @@ -136,7 +248,7 @@ def url_info(url, faker = False): type = None if headers['content-disposition']: try: - filename = parse.unquote(r1(r'filename="?(.+)"?', headers['content-disposition'])) + filename = parse.unquote(r1(r'filename="?([^"]+)"?', headers['content-disposition'])) if len(filename.split('.')) > 1: ext = filename.split('.')[-1] else: @@ -388,7 +500,9 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, import sys traceback.print_exc(file = sys.stdout) pass - title = escape_file_path(title) + + title = filenameable(title) + filename = '%s.%s' % (title, ext) filepath = os.path.join(output_dir, filename) if total_size: @@ -437,19 +551,18 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, elif ext == 'mp4': try: - from .processor.join_mp4 import concat_mp4 - concat_mp4(parts, os.path.join(output_dir, title + '.mp4')) - for part in parts: - os.remove(part) - except: from .processor.ffmpeg import has_ffmpeg_installed if has_ffmpeg_installed(): from .processor.ffmpeg import ffmpeg_concat_mp4_to_mp4 ffmpeg_concat_mp4_to_mp4(parts, os.path.join(output_dir, title + '.mp4')) - for part in parts: - os.remove(part) else: - print('No ffmpeg is found. 
@@ -463,7 +576,9 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir = '.', refer
         return
 
     assert ext in ('ts')
-    title = escape_file_path(title)
+
+    title = filenameable(title)
+
     filename = '%s.%s' % (title, 'ts')
     filepath = os.path.join(output_dir, filename)
     if total_size:
@@ -597,9 +712,7 @@ def set_http_proxy(proxy):
     elif proxy == '': # Don't use any proxy
         proxy_support = request.ProxyHandler({})
     else: # Use proxy
-        if not proxy.startswith('http://'):
-            proxy = 'http://' + proxy
-        proxy_support = request.ProxyHandler({'http': '%s' % proxy})
+        proxy_support = request.ProxyHandler({'http': '%s' % proxy, 'https': '%s' % proxy})
    opener = request.build_opener(proxy_support)
    request.install_opener(opener)
 
@@ -615,8 +728,18 @@ def download_main(download, download_playlist, urls, playlist, output_dir, merge
     else:
         download(url, output_dir = output_dir, merge = merge, info_only = info_only)
 
+def get_version():
+    try:
+        import subprocess
+        real_dir = os.path.dirname(os.path.realpath(__file__))
+        git_hash = subprocess.Popen(['git', 'rev-parse', '--short', 'HEAD'], cwd=real_dir, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL).stdout.read().decode('utf-8').strip()
+        assert git_hash
+        return '%s-%s' % (__version__, git_hash)
+    except:
+        return __version__
+
 def script_main(script_name, download, download_playlist = None):
-    version = 'You-Get %s, a video downloader.' % __version__
+    version = 'You-Get %s, a video downloader.' % get_version()
     help = 'Usage: %s [OPTION]... [URL]...\n' % script_name
     help += '''\nStartup options:
     -V | --version                           Display the version and exit.
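
Why set_http_proxy() now registers both schemes: a ProxyHandler built with only an 'http' mapping leaves https:// requests (e.g. Google+ pages) unproxied. A minimal sketch with a hypothetical proxy address:

    from urllib import request

    proxy = 'proxy.example.com:8080'
    opener = request.build_opener(request.ProxyHandler({'http': proxy,
                                                        'https': proxy}))
    request.install_opener(opener)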
diff --git a/src/you_get/downloader/__init__.py b/src/you_get/downloader/__init__.py
index 0b85ad8a..99e331f4 100644
--- a/src/you_get/downloader/__init__.py
+++ b/src/you_get/downloader/__init__.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 from .acfun import *
+from .alive import *
 from .baidu import *
 from .bilibili import *
 from .blip import *
@@ -8,13 +9,15 @@ from .cntv import *
 from .coursera import *
 from .dailymotion import *
 from .douban import *
+from .ehow import *
 from .facebook import *
+from .fivesing import *
 from .freesound import *
 from .google import *
 from .ifeng import *
+from .instagram import *
 from .iqiyi import *
 from .joy import *
-from .jpopsuki import *
 from .ku6 import *
 from .miomio import *
 from .mixcloud import *
@@ -36,3 +39,7 @@ from .xiami import *
 from .yinyuetai import *
 from .youku import *
 from .youtube import *
+from .ted import *
+from .khan import *
+
+from .__main__ import *
diff --git a/src/you_get/__main__.py b/src/you_get/downloader/__main__.py
similarity index 91%
rename from src/you_get/__main__.py
rename to src/you_get/downloader/__main__.py
index 568082f5..ed07f702 100644
--- a/src/you_get/__main__.py
+++ b/src/you_get/downloader/__main__.py
@@ -1,9 +1,8 @@
 #!/usr/bin/env python
-
 __all__ = ['main', 'any_download', 'any_download_playlist']
 
-from .downloader import *
-from .common import *
+from ..downloader import *
+from ..common import *
 
 def url_to_module(url):
     site = r1(r'http://([^/]+)/', url)
@@ -20,6 +19,7 @@ def url_to_module(url):
     downloads = {
         '163': netease,
         '56': w56,
+        '5sing': fivesing,
         'acfun': acfun,
         'baidu': baidu,
         'bilibili': bilibili,
@@ -28,14 +28,16 @@ def url_to_module(url):
         'coursera': coursera,
         'dailymotion': dailymotion,
         'douban': douban,
+        'ehow': ehow,
         'facebook': facebook,
         'freesound': freesound,
         'google': google,
         'iask': sina,
         'ifeng': ifeng,
+        'in': alive,
+        'instagram': instagram,
         'iqiyi': iqiyi,
         'joy': joy,
-        'jpopsuki': jpopsuki,
         'kankanews': bilibili,
         'ku6': ku6,
         'miomio': miomio,
@@ -48,6 +50,7 @@ def url_to_module(url):
         'sohu': sohu,
         'songtaste':songtaste,
         'soundcloud': soundcloud,
+        'ted': ted,
         'tudou': tudou,
         'tumblr': tumblr,
         'vid48': vid48,
@@ -58,6 +61,7 @@ def url_to_module(url):
         'youku': youku,
         'youtu': youtube,
         'youtube': youtube,
+        'khanacademy': khan, #TODO
     }
     if k in downloads:
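
The gist of the table lookup in url_to_module() -- a simplified sketch, not the exact implementation: the second-to-last dotted component of the host picks the downloader, which is why Alive.in.th registers under the key 'in':

    import re

    def site_key(url):
        host = re.search(r'http://([^/]+)/', url).group(1)
        return host.split('.')[-2]   # 'www.ted.com' -> 'ted', 'alive.in.th' -> 'in'

    assert site_key('http://www.khanacademy.org/video') == 'khanacademy'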
diff --git a/src/you_get/downloader/acfun.py b/src/you_get/downloader/acfun.py
index aa880bee..88e1a7d0 100644
--- a/src/you_get/downloader/acfun.py
+++ b/src/you_get/downloader/acfun.py
@@ -5,7 +5,7 @@ __all__ = ['acfun_download']
 
 from ..common import *
 from .qq import qq_download_by_id
-from .sina import sina_download_by_id
+from .sina import sina_download_by_vid
 from .tudou import tudou_download_by_iid
 from .youku import youku_download_by_id
 
@@ -16,11 +16,11 @@ def get_srt_json(id):
     return get_html(url)
 
 def acfun_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
-    info = json.loads(get_html('http://www.acfun.tv/api/getVideoByID.aspx?vid=' + id))
+    info = json.loads(get_html('http://wenzhou.acfun.tv/api/getVideoByID.aspx?vid=' + id))
     t = info['vtype']
     vid = info['vid']
     if t == 'sina':
-        sina_download_by_id(vid, title, output_dir = output_dir, merge = merge, info_only = info_only)
+        sina_download_by_vid(vid, title, output_dir = output_dir, merge = merge, info_only = info_only)
     elif t == 'youku':
         youku_download_by_id(vid, title, output_dir = output_dir, merge = merge, info_only = info_only)
     elif t == 'tudou':
@@ -37,7 +37,7 @@ def acfun_download_by_id(id, title = None, output_dir = '.', merge = True, info_
         x.write(cmt)
 
 def acfun_download(url, output_dir = '.', merge = True, info_only = False):
-    assert re.match(r'http://www.acfun.tv/v/ac(\d+)', url)
+    assert re.match(r'http://[^\.]+.acfun.tv/v/ac(\d+)', url)
     html = get_html(url)
]*>([^<>]+)<', html) @@ -49,7 +49,7 @@ def acfun_download(url, output_dir = '.', merge = True, info_only = False): id = r1(r"\[Video\](\d+)\[/Video\]", html) or r1(r"\[video\](\d+)\[/video\]", html) if not id: id = r1(r"src=\"/newflvplayer/player.*id=(\d+)", html) - sina_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only) + sina_download_by_vid(id, title, output_dir = output_dir, merge = merge, info_only = info_only) else: acfun_download_by_id(id, title, output_dir = output_dir, merge = merge, info_only = info_only) diff --git a/src/you_get/downloader/alive.py b/src/you_get/downloader/alive.py new file mode 100644 index 00000000..33764c72 --- /dev/null +++ b/src/you_get/downloader/alive.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python + +__all__ = ['alive_download'] + +from ..common import * + +def alive_download(url, output_dir = '.', merge = True, info_only = False): + html = get_html(url) + + title = r1(r' 1: + title = ".".join(title.split('.')[:-1]) + + real_url = r1(r'\\"dlink\\":\\"([^"]*)\\"', html).replace('\\\\/', '/') + type, ext, size = url_info(real_url, faker = True) + + print_info(site_info, title, ext, size) + if not info_only: + download_urls([real_url], title, ext, size, output_dir, merge = merge) + + elif re.match(r'http://music.baidu.com/album/\d+', url): id = r1(r'http://music.baidu.com/album/(\d+)', url) baidu_download_album(id, output_dir, merge, info_only) - if re.match('http://music.baidu.com/song/\d+', url): + elif re.match('http://music.baidu.com/song/\d+', url): id = r1(r'http://music.baidu.com/song/(\d+)', url) baidu_download_song(id, output_dir, merge, info_only) diff --git a/src/you_get/downloader/bilibili.py b/src/you_get/downloader/bilibili.py index 20e3c467..8512d362 100644 --- a/src/you_get/downloader/bilibili.py +++ b/src/you_get/downloader/bilibili.py @@ -4,7 +4,7 @@ __all__ = ['bilibili_download'] from ..common import * -from .sina import sina_download_by_id +from .sina import sina_download_by_vid from .tudou import tudou_download_by_id from .youku import youku_download_by_id @@ -64,7 +64,7 @@ def bilibili_download_by_cid(id, title, output_dir = '.', merge = True, info_onl elif re.search(r'/mp4/', urls[0]): type = 'mp4' else: - raise NotImplementedError(urls[0]) + type = 'flv' size = 0 for url in urls: @@ -83,7 +83,7 @@ def bilibili_download(url, output_dir = '.', merge = True, info_only = False): title = unescape_html(title) title = escape_file_path(title) - flashvars = r1_of([r'flashvars="([^"]+)"', r'"https://secure.bilibili.tv/secure,(cid=\d+)(?:&aid=\d+)?"'], html) + flashvars = r1_of([r'player_params=\'(cid=\d+)', r'flashvars="([^"]+)"', r'"https://secure.bilibili.tv/secure,(cid=\d+)(?:&aid=\d+)?"'], html) assert flashvars t, id = flashvars.split('=', 1) id = id.split('&')[0] diff --git a/src/you_get/downloader/dailymotion.py b/src/you_get/downloader/dailymotion.py index 5d42cbb9..99d586c8 100644 --- a/src/you_get/downloader/dailymotion.py +++ b/src/you_get/downloader/dailymotion.py @@ -5,16 +5,22 @@ __all__ = ['dailymotion_download'] from ..common import * def dailymotion_download(url, output_dir = '.', merge = True, info_only = False): - html = get_html(url) - html = parse.unquote(html).replace('\/', '/') + """Downloads Dailymotion videos by URL. 
+ """ - title = r1(r'meta property="og:title" content="([^"]+)"', html) - title = escape_file_path(title) + id = match1(url, r'/video/([^\?]+)') + embed_url = 'http://www.dailymotion.com/embed/video/%s' % id + html = get_content(embed_url) - for quality in ['hd720URL', 'hqURL', 'sdURL']: - real_url = r1(r',\"' + quality + '\"\:\"([^\"]+?)\",', html) + info = json.loads(match1(html, r'var\s*info\s*=\s*({.+}),\n')) + + title = info['title'] + + for quality in ['stream_h264_hd1080_url', 'stream_h264_hd_url', 'stream_h264_hq_url', 'stream_h264_url', 'stream_h264_ld_url']: + real_url = info[quality] if real_url: break + type, ext, size = url_info(real_url) print_info(site_info, title, type, size) diff --git a/src/you_get/downloader/ehow.py b/src/you_get/downloader/ehow.py new file mode 100644 index 00000000..adee6bfc --- /dev/null +++ b/src/you_get/downloader/ehow.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +__all__ = ['ehow_download'] + +from ..common import * + +def ehow_download(url, output_dir = '.', merge = True, info_only = False): + + assert re.search(r'http://www.ehow.com/video_', url), "URL you entered is not supported" + + html = get_html(url) + contentid = r1(r'', html) + vid = r1(r'"demand_ehow_videoid":"([^"]+)"', html) + assert vid + + xml = get_html('http://www.ehow.com/services/video/series.xml?demand_ehow_videoid=%s' % vid) + + from xml.dom.minidom import parseString + doc = parseString(xml) + tab = doc.getElementsByTagName('related')[0].firstChild + + for video in tab.childNodes: + if re.search(contentid, video.attributes['link'].value): + url = video.attributes['flv'].value + break + + title = video.attributes['title'].value + assert title + + type, ext, size = url_info(url) + print_info(site_info, title, type, size) + + if not info_only: + download_urls([url], title, ext, size, output_dir, merge = merge) + +site_info = "ehow.com" +download = ehow_download +download_playlist = playlist_not_supported('ehow') \ No newline at end of file diff --git a/src/you_get/downloader/fivesing.py b/src/you_get/downloader/fivesing.py new file mode 100644 index 00000000..2d948af5 --- /dev/null +++ b/src/you_get/downloader/fivesing.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +__all__ = ['fivesing_download'] + +from ..common import * + +def fivesing_download(url, output_dir=".", merge=True, info_only=False): + html = get_html(url) + title = r1(r'var SongName = "(.*)";', html) + url = r1(r'file: "(\S*)"', html) + songtype, ext, size = url_info(url) + print_info(site_info, title, songtype, size) + if not info_only: + download_urls([url], title, ext, size, output_dir) + +site_info = "5sing.com" +download = fivesing_download +download_playlist = playlist_not_supported("5sing") diff --git a/src/you_get/downloader/google.py b/src/you_get/downloader/google.py index cd02697f..0193db2f 100644 --- a/src/you_get/downloader/google.py +++ b/src/you_get/downloader/google.py @@ -6,6 +6,40 @@ from ..common import * import re +# YouTube media encoding options, in descending quality order. +# taken from http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs, 3/22/2013. 
diff --git a/src/you_get/downloader/google.py b/src/you_get/downloader/google.py
index cd02697f..0193db2f 100644
--- a/src/you_get/downloader/google.py
+++ b/src/you_get/downloader/google.py
@@ -6,6 +6,40 @@ from ..common import *
 
 import re
 
+# YouTube media encoding options, in descending quality order.
+# taken from http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs, 3/22/2013.
+youtube_codecs = [
+    {'itag': 38, 'container': 'MP4', 'video_resolution': '3072p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3.5-5', 'audio_encoding': 'AAC', 'audio_bitrate': '192'},
+    {'itag': 46, 'container': 'WebM', 'video_resolution': '1080p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'},
+    {'itag': 37, 'container': 'MP4', 'video_resolution': '1080p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3-4.3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'},
+    {'itag': 102, 'container': '', 'video_resolution': '', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '2', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'},
+    {'itag': 45, 'container': 'WebM', 'video_resolution': '720p', 'video_encoding': '', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': '', 'audio_bitrate': ''},
+    {'itag': 22, 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '192'},
+    {'itag': 84, 'container': 'MP4', 'video_resolution': '720p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '152'},
+    {'itag': 120, 'container': 'FLV', 'video_resolution': '720p', 'video_encoding': 'AVC', 'video_profile': 'Main@L3.1', 'video_bitrate': '2', 'audio_encoding': 'AAC', 'audio_bitrate': '128'},
+    {'itag': 85, 'container': 'MP4', 'video_resolution': '520p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '2-2.9', 'audio_encoding': 'AAC', 'audio_bitrate': '152'},
+    {'itag': 44, 'container': 'WebM', 'video_resolution': '480p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '1', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'},
+    {'itag': 35, 'container': 'FLV', 'video_resolution': '480p', 'video_encoding': 'H.264', 'video_profile': 'Main', 'video_bitrate': '0.8-1', 'audio_encoding': 'AAC', 'audio_bitrate': '128'},
+    {'itag': 101, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'},
+    {'itag': 100, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '3D', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'},
+    {'itag': 43, 'container': 'WebM', 'video_resolution': '360p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '0.5', 'audio_encoding': 'Vorbis', 'audio_bitrate': '128'},
+    {'itag': 34, 'container': 'FLV', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': 'Main', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '128'},
+    {'itag': 82, 'container': 'MP4', 'video_resolution': '360p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'},
+    {'itag': 18, 'container': 'MP4', 'video_resolution': '270p/360p', 'video_encoding': 'H.264', 'video_profile': 'Baseline', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'},
+    {'itag': 6, 'container': 'FLV', 'video_resolution': '270p', 'video_encoding': 'Sorenson H.263', 'video_profile': '', 'video_bitrate': '0.8', 'audio_encoding': 'MP3', 'audio_bitrate': '64'},
+    {'itag': 83, 'container': 'MP4', 'video_resolution': '240p', 'video_encoding': 'H.264', 'video_profile': '3D', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': '96'},
+    {'itag': 13, 'container': '3GP', 'video_resolution': '', 'video_encoding': 'MPEG-4 Visual', 'video_profile': '', 'video_bitrate': '0.5', 'audio_encoding': 'AAC', 'audio_bitrate': ''},
+    {'itag': 5, 'container': 'FLV', 'video_resolution': '240p', 'video_encoding': 'Sorenson H.263', 'video_profile': '', 'video_bitrate': '0.25', 'audio_encoding': 'MP3', 'audio_bitrate': '64'},
+    {'itag': 36, 'container': '3GP', 'video_resolution': '240p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.17', 'audio_encoding': 'AAC', 'audio_bitrate': '38'},
+    {'itag': 17, 'container': '3GP', 'video_resolution': '144p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.05', 'audio_encoding': 'AAC', 'audio_bitrate': '24'},
+]
+fmt_level = dict(
+    zip(
+        [str(codec['itag'])
+         for codec in
+         youtube_codecs],
+        range(len(youtube_codecs))))
+
 def google_download(url, output_dir = '.', merge = True, info_only = False):
     # Percent-encoding Unicode URL
     url = parse.quote(url, safe = ':/+%')
@@ -14,25 +48,22 @@ def google_download(url, output_dir = '.', merge = True, info_only = False):
 
     service = url.split('/')[2].split('.')[0]
 
     if service == 'plus': # Google Plus
 
-        if re.search(r'plus.google.com/photos/\d+/albums/\d+/\d+', url):
-            oid = r1(r'plus.google.com/photos/(\d+)/albums/\d+/\d+', url)
-            pid = r1(r'plus.google.com/photos/\d+/albums/\d+/(\d+)', url)
-
-        elif re.search(r'plus.google.com/photos/\d+/albums/posts/\d+', url):
-            oid = r1(r'plus.google.com/photos/(\d+)/albums/posts/\d+', url)
-            pid = r1(r'plus.google.com/photos/\d+/albums/posts/(\d+)', url)
-
-        else:
+        if not re.search(r'plus.google.com/photos/[^/]*/albums/\d+/\d+', url):
             html = get_html(url)
-            oid = r1(r'"https://plus.google.com/photos/(\d+)/albums/\d+/\d+', html)
-            pid = r1(r'"https://plus.google.com/photos/\d+/albums/\d+/(\d+)', html)
-
-        url = "http://plus.google.com/photos/%s/albums/posts/%s?oid=%s&pid=%s" % (oid, pid, oid, pid)
+            url = r1(r'"(https://plus.google.com/photos/\d+/albums/\d+/\d+)', html)
+            title = r1(r'<title>([^<\n]+)', html)
+        else:
+            title = None
 
         html = get_html(url)
 
-        real_url = unicodize(r1(r'"(https://video.googleusercontent.com/[^"]*)",\d\]', html).replace('\/', '/'))
+        real_urls = re.findall(r'\[(\d+),\d+,\d+,"([^"]+)"\]', html)
+        real_url = unicodize(sorted(real_urls, key = lambda x : fmt_level[x[0]])[0][1])
+
+        if title is None:
+            post_url = r1(r'"(https://plus.google.com/\d+/posts/[^"]*)"', html)
+            post_html = get_html(post_url)
+            title = r1(r'<title>([^<\n]+)', post_html)
 
-        title = r1(r"\"([^\"]+)\",\"%s\"" % pid, html)
         if title is None:
             response = request.urlopen(request.Request(real_url))
             if response.headers['content-disposition']:
diff --git a/src/you_get/downloader/instagram.py b/src/you_get/downloader/instagram.py
new file mode 100644
index 00000000..6071dfd0
--- /dev/null
+++ b/src/you_get/downloader/instagram.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+
+__all__ = ['instagram_download']
+
+from ..common import *
+
+def instagram_download(url, output_dir = '.', merge = True, info_only = False):
+    html = get_html(url)
+
+    id = r1(r'instagram.com/p/([^/]+)/', html)
+    description = r1(r'<meta property="og:description" content="([^"]*)"', html)
+    title = description + " [" + id + "]"
+    url = r1(r'<meta property="og:video" content="([^"]*)"', html)
+    type, ext, size = url_info(url)
+
+    print_info(site_info, title, type, size)
+    if not info_only:
+        download_urls([url], title, ext, size, output_dir, merge = merge)
+
+site_info = "Instagram.com"
+download = instagram_download
+download_playlist = playlist_not_supported('instagram')
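
fmt_level, built from the codec table added to google.py above, maps itags to rank numbers (0 = best), so picking the best stream is a plain sort. Abridged, with fabricated candidate URLs:

    fmt_level = {'38': 0, '37': 2, '22': 5, '18': 16}   # excerpt of the real table
    candidates = [('22', 'http://example.com/720p.mp4'),
                  ('38', 'http://example.com/3072p.mp4')]
    best_url = sorted(candidates, key=lambda x: fmt_level[x[0]])[0][1]
    assert best_url == 'http://example.com/3072p.mp4'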
diff --git a/src/you_get/downloader/iqiyi.py b/src/you_get/downloader/iqiyi.py
index c8693dbf..5c951d1d 100644
--- a/src/you_get/downloader/iqiyi.py
+++ b/src/you_get/downloader/iqiyi.py
@@ -6,13 +6,8 @@ from ..common import *
 
 def iqiyi_download(url, output_dir = '.', merge = True, info_only = False):
     html = get_html(url)
-    #title = r1(r'title\s*:\s*"([^"]+)"', html)
-    #title = unescape_html(title).decode('utf-8')
-    #videoId = r1(r'videoId\s*:\s*"([^"]+)"', html)
-    #pid = r1(r'pid\s*:\s*"([^"]+)"', html)
-    #ptype = r1(r'ptype\s*:\s*"([^"]+)"', html)
-    #info_url = 'http://cache.video.qiyi.com/v/%s/%s/%s/' % (videoId, pid, ptype)
-    videoId = r1(r'''["']videoId["'][:=]["']([^"']+)["']''', html)
+
+    videoId = r1(r'data-player-videoid="([^"]+)"', html)
     assert videoId
 
     info_url = 'http://cache.video.qiyi.com/v/%s' % videoId
diff --git a/src/you_get/downloader/jpopsuki.py b/src/you_get/downloader/jpopsuki.py
deleted file mode 100644
index a88b23c2..00000000
--- a/src/you_get/downloader/jpopsuki.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/env python
-
-__all__ = ['jpopsuki_download']
-
-from ..common import *
-
-def jpopsuki_download(url, output_dir = '.', merge = True, info_only = False):
-    html = get_html(url)
-
-    title = r1(r'<meta name="title" content="([^"]*)"', html)
-    if title.endswith(' - JPopsuki TV'):
-        title = title[:-14]
-
-    url = "http://jpopsuki.tv%s" % r1(r'<source src="([^"]*)"', html)
-    type, ext, size = url_info(url)
-
-    print_info(site_info, title, type, size)
-    if not info_only:
-        download_urls([url], title, ext, size, output_dir, merge = merge)
-
-site_info = "JPopsuki.tv"
-download = jpopsuki_download
-download_playlist = playlist_not_supported('jpopsuki')
diff --git a/src/you_get/downloader/khan.py b/src/you_get/downloader/khan.py
new file mode 100755
index 00000000..617eec47
--- /dev/null
+++ b/src/you_get/downloader/khan.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+
+__all__ = ['khan_download']
+
+from ..common import *
+from .youtube import youtube_download_by_id
+
+def khan_download(url, output_dir = '.', merge = True, info_only = False):
+    page = get_html(url)
+    id = page[page.find('src="https://www.youtube.com/embed/') + len('src="https://www.youtube.com/embed/') :page.find('?enablejsapi=1&wmode=transparent&modestbranding=1&rel=0&fs=1&showinfo=0')]
+    youtube_download_by_id(id, output_dir=output_dir, merge=merge, info_only=info_only)
+
+site_info = "khanacademy.org"
+download = khan_download
+download_playlist = playlist_not_supported('khan')
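
khan.py above slices the embed URL out with str.find(); an equivalent and arguably sturdier form using the new match1() helper would be (page fragment fabricated):

    from you_get.common import match1

    page = '<iframe src="https://www.youtube.com/embed/abc123?enablejsapi=1">'
    id = match1(page, r'src="https://www\.youtube\.com/embed/([^"?]+)')
    assert id == 'abc123'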
diff --git a/src/you_get/downloader/netease.py b/src/you_get/downloader/netease.py
index 964c192d..863689f3 100644
--- a/src/you_get/downloader/netease.py
+++ b/src/you_get/downloader/netease.py
@@ -7,10 +7,13 @@ from ..common import *
 def netease_download(url, output_dir = '.', merge = True, info_only = False):
     html = get_decoded_html(url)
 
-    src = r1(r'<source src="([^"]+)"', html)
-    title = r1('movieDescription=\'([^\']+)\'', html)
+    title = r1('movieDescription=\'([^\']+)\'', html) or r1('<title>(.+)</title>', html)
+    if title[0] == ' ':
+        title = title[1:]
 
-    if title:
+    src = r1(r'<source src="([^"]+)"', html)
-        if title[0] == ' ':
-            title = title[1:]
-        url = r1(r'(.+)-list.m3u8', src) + ".mp4"
+    url = r1(r'["\'](.+)-list.m3u8["\']', html) + ".mp4"
     _, _, size = url_info(url)
     ext = 'mp4'
diff --git a/src/you_get/downloader/nicovideo.py b/src/you_get/downloader/nicovideo.py
index 144c02a1..7d384f31 100644
--- a/src/you_get/downloader/nicovideo.py
+++ b/src/you_get/downloader/nicovideo.py
@@ -23,7 +23,7 @@ def nicovideo_download(url, output_dir = '.', merge = True, info_only = False):
     nicovideo_login(user, password)
 
     html = get_html(url) # necessary!
-    title = unicodize(r1(r'title:\s*\'(.*)\',', html))
+    title = unicodize(r1(r'<span class="videoHeaderTitle">([^<]+)</span>', html))
 
     api_html = get_html('http://www.nicovideo.jp/api/getflv?v=%s' % url.split('/')[-1])
     real_url = parse.unquote(r1(r'url=([^&]+)&', api_html))
diff --git a/src/you_get/downloader/pptv.py b/src/you_get/downloader/pptv.py
index 53c8e508..4fd88e5b 100644
--- a/src/you_get/downloader/pptv.py
+++ b/src/you_get/downloader/pptv.py
@@ -9,18 +9,14 @@ import urllib
 import hashlib
 
 def pptv_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
-    xml = get_html('http://web-play.pptv.com/webplay3-151-%s.xml' % id)
+    xml = get_html('http://web-play.pptv.com/webplay3-0-%s.xml?type=web.fpp' % id)
     host = r1(r'<sh>([^<>]+)</sh>', xml)
-    port = 8080
-    st = r1(r'<st>([^<>]+)</st>', xml).encode('utf-8')
-    key = hashlib.md5(st).hexdigest() # FIXME: incorrect key
-    rids = re.findall(r'rid="([^"]+)"', xml)
+    key = r1(r'<key expire=[^<>]+>([^<>]+)</key>', xml)
     rid = r1(r'rid="([^"]+)"', xml)
     title = r1(r'nm="([^"]+)"', xml)
     pieces = re.findall('(?:)?', xml)
diff --git a/src/you_get/downloader/sina.py b/src/you_get/downloader/sina.py
--- a/src/you_get/downloader/sina.py
+++ b/src/you_get/downloader/sina.py
-    name = r1(r'<vname>(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?</vname>', xml)
-    vstr = r1(r'<vstr>(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?</vstr>', xml)
+    name = match1(xml, r'<vname>(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?</vname>')
+    vstr = match1(xml, r'<vstr>(?:<!\[CDATA\[)?(.+?)(?:\]\]>)?</vstr>')
     return urls, name, vstr
 
-def sina_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
-    urls, name, vstr = video_info(id)
+def sina_download_by_vid(vid, title=None, output_dir='.', merge=True, info_only=False):
+    """Downloads a Sina video by its unique vid.
+    http://video.sina.com.cn/
+    """
+
+    urls, name, vstr = video_info(vid)
     title = title or name
     assert title
     size = 0
@@ -26,11 +28,36 @@ def sina_download_by_vid(vid, title=None, output_dir='.', merge=True, info_only=
     if not info_only:
         download_urls(urls, title, 'flv', size, output_dir = output_dir, merge = merge)
 
-def sina_download(url, output_dir = '.', merge = True, info_only = False):
-    id = r1(r'[^_]vid\s*:\s*\'([^\']+)\',', get_html(url)).split('|')[-1]
-    assert id
+def sina_download_by_vkey(vkey, title=None, output_dir='.', merge=True, info_only=False):
+    """Downloads a Sina video by its unique vkey.
+    http://video.sina.com/
+    """
 
-    sina_download_by_id(id, output_dir = output_dir, merge = merge, info_only = info_only)
+    url = 'http://video.sina.com/v/flvideo/%s_0.flv' % vkey
+    type, ext, size = url_info(url)
+
+    print_info(site_info, title, 'flv', size)
+    if not info_only:
+        download_urls([url], title, 'flv', size, output_dir = output_dir, merge = merge)
+
+def sina_download(url, output_dir='.', merge=True, info_only=False):
+    """Downloads Sina videos by URL.
+ """ + + vid = match1(url, r'vid=(\d+)') + if vid is None: + video_page = get_content(url) + vid = hd_vid = match1(video_page, r'hd_vid\s*:\s*\'([^\']+)\'') + if hd_vid == '0': + vids = match1(video_page, r'[^\w]vid\s*:\s*\'([^\']+)\'').split('|') + vid = vids[-1] + + if vid: + sina_download_by_vid(vid, output_dir=output_dir, merge=merge, info_only=info_only) + else: + vkey = match1(video_page, r'vkey\s*:\s*"([^"]+)"') + title = match1(video_page, r'title\s*:\s*"([^"]+)"') + sina_download_by_vkey(vkey, title=title, output_dir=output_dir, merge=merge, info_only=info_only) site_info = "Sina.com" download = sina_download diff --git a/src/you_get/downloader/sohu.py b/src/you_get/downloader/sohu.py index 5e6df793..4400836a 100644 --- a/src/you_get/downloader/sohu.py +++ b/src/you_get/downloader/sohu.py @@ -8,7 +8,7 @@ import json def real_url(host, prot, file, new): url = 'http://%s/?prot=%s&file=%s&new=%s' % (host, prot, file, new) - start, _, host, key, _, _ = get_html(url).split('|') + start, _, host, key = get_html(url).split('|')[:4] return '%s%s?key=%s' % (start[:-1], new, key) def sohu_download(url, output_dir = '.', merge = True, info_only = False): diff --git a/src/you_get/downloader/ted.py b/src/you_get/downloader/ted.py new file mode 100644 index 00000000..167da2a8 --- /dev/null +++ b/src/you_get/downloader/ted.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +__all__ = ['ted_download'] + +from ..common import * + +def ted_download(url, output_dir = '.', merge = True, info_only = False): + page = get_html(url).split("\n") + for line in page: + if line.find("") > -1: + title = line.replace("<title>", "").replace("", "").replace("\t", "") + title = title[:title.find(' | ')] + if line.find("no-flash-video-download") > -1: + url = line.replace('', html)) + title = unescape_html(r1(r'', html) or + r1(r'', html) or + r1(r'([^<\n]*)', html)).replace('\n', '') real_url = r1(r'source src=\\x22([^\\]+)\\', html) if not real_url: real_url = r1(r'audio_file=([^&]+)&', html) + '?plead=please-dont-download-this-or-our-lawyers-wont-let-us-host-audio' diff --git a/src/you_get/downloader/xiami.py b/src/you_get/downloader/xiami.py index 127da043..81b0744e 100644 --- a/src/you_get/downloader/xiami.py +++ b/src/you_get/downloader/xiami.py @@ -55,11 +55,14 @@ def xiami_download_song(sid, output_dir = '.', merge = True, info_only = False): if not ext: ext = 'mp3' - print_info(site_info, song_title, type, size) + print_info(site_info, song_title, ext, size) if not info_only: file_name = "%s - %s - %s" % (song_title, album_name, artist) download_urls([url], file_name, ext, size, output_dir, merge = merge, faker = True) - xiami_download_lyric(lrc_url, file_name, output_dir) + try: + xiami_download_lyric(lrc_url, file_name, output_dir) + except: + pass def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = False): html = get_html('http://www.xiami.com/song/showcollect/id/' + cid, faker = True) @@ -84,7 +87,10 @@ def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = if not info_only: file_name = "%02d.%s - %s - %s" % (track_nr, song_title, artist, album_name) download_urls([url], file_name, ext, size, output_dir, merge = merge, faker = True) - xiami_download_lyric(lrc_url, file_name, output_dir) + try: + xiami_download_lyric(lrc_url, file_name, output_dir) + except: + pass track_nr += 1 @@ -112,7 +118,10 @@ def xiami_download_album(aid, output_dir = '.', merge = True, info_only = False) if not info_only: file_name = "%02d.%s" % (track_nr, song_title) 
diff --git a/src/you_get/downloader/ted.py b/src/you_get/downloader/ted.py
new file mode 100644
index 00000000..167da2a8
--- /dev/null
+++ b/src/you_get/downloader/ted.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+__all__ = ['ted_download']
+
+from ..common import *
+
+def ted_download(url, output_dir = '.', merge = True, info_only = False):
+    page = get_html(url).split("\n")
+    for line in page:
+        if line.find("<title>") > -1:
+            title = line.replace("<title>", "").replace("</title>", "").replace("\t", "")
+            title = title[:title.find(' | ')]
+        if line.find("no-flash-video-download") > -1:
+            url = line.replace('<a id="no-flash-video-download" href="', '')
+            type, ext, size = url_info(url)
+            print_info(site_info, title, type, size)
+            if not info_only:
+                download_urls([url], title, ext, size, output_dir, merge = merge)
+
+site_info = "ted.com"
+download = ted_download
+download_playlist = playlist_not_supported('ted')
diff --git a/src/you_get/downloader/tumblr.py b/src/you_get/downloader/tumblr.py
--- a/src/you_get/downloader/tumblr.py
+++ b/src/you_get/downloader/tumblr.py
-    title = unescape_html(r1(r'<title>([^<\n]*)', html))
+    title = unescape_html(r1(r'<meta property="og:title" content="([^"]*)"', html) or
+                          r1(r'<img src="[^"]+" alt="([^"]*)"', html) or
+                          r1(r'<title>([^<\n]*)', html)).replace('\n', '')
     real_url = r1(r'source src=\\x22([^\\]+)\\', html)
     if not real_url:
         real_url = r1(r'audio_file=([^&]+)&', html) + '?plead=please-dont-download-this-or-our-lawyers-wont-let-us-host-audio'
diff --git a/src/you_get/downloader/xiami.py b/src/you_get/downloader/xiami.py
index 127da043..81b0744e 100644
--- a/src/you_get/downloader/xiami.py
+++ b/src/you_get/downloader/xiami.py
@@ -55,11 +55,14 @@ def xiami_download_song(sid, output_dir = '.', merge = True, info_only = False):
     if not ext:
         ext = 'mp3'
 
-    print_info(site_info, song_title, type, size)
+    print_info(site_info, song_title, ext, size)
     if not info_only:
         file_name = "%s - %s - %s" % (song_title, album_name, artist)
         download_urls([url], file_name, ext, size, output_dir, merge = merge, faker = True)
-        xiami_download_lyric(lrc_url, file_name, output_dir)
+        try:
+            xiami_download_lyric(lrc_url, file_name, output_dir)
+        except:
+            pass
 
 def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = False):
     html = get_html('http://www.xiami.com/song/showcollect/id/' + cid, faker = True)
@@ -84,7 +87,10 @@ def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only =
         if not info_only:
             file_name = "%02d.%s - %s - %s" % (track_nr, song_title, artist, album_name)
             download_urls([url], file_name, ext, size, output_dir, merge = merge, faker = True)
-            xiami_download_lyric(lrc_url, file_name, output_dir)
+            try:
+                xiami_download_lyric(lrc_url, file_name, output_dir)
+            except:
+                pass
 
         track_nr += 1
 
@@ -112,7 +118,10 @@ def xiami_download_album(aid, output_dir = '.', merge = True, info_only = False)
         if not info_only:
             file_name = "%02d.%s" % (track_nr, song_title)
             download_urls([url], file_name, ext, size, output_dir, merge = merge, faker = True)
-            xiami_download_lyric(lrc_url, file_name, output_dir)
+            try:
+                xiami_download_lyric(lrc_url, file_name, output_dir)
+            except:
+                pass
         if not pic_exist:
             xiami_download_pic(pic_url, 'cover', output_dir)
             pic_exist = True
@@ -131,6 +140,10 @@ def xiami_download(url, output_dir = '.', stream_type = None, merge = True, info
     if re.match('http://www.xiami.com/song/\d+', url):
         id = r1(r'http://www.xiami.com/song/(\d+)', url)
         xiami_download_song(id, output_dir, merge, info_only)
+
+    if re.match('http://www.xiami.com/song/detail/id/\d+', url):
+        id = r1(r'http://www.xiami.com/song/detail/id/(\d+)', url)
+        xiami_download_song(id, output_dir, merge, info_only)
 
 site_info = "Xiami.com"
 download = xiami_download
diff --git a/src/you_get/downloader/yinyuetai.py b/src/you_get/downloader/yinyuetai.py
index e243eb3f..1249845c 100644
--- a/src/you_get/downloader/yinyuetai.py
+++ b/src/you_get/downloader/yinyuetai.py
@@ -20,10 +20,10 @@ def yinyuetai_download_by_id(id, title = None, output_dir = '.', merge = True, i
     download_urls([url], title, ext, size, output_dir, merge = merge)
 
 def yinyuetai_download(url, output_dir = '.', merge = True, info_only = False):
-    id = r1(r'http://www.yinyuetai.com/video/(\d+)$', url)
+    id = r1(r'http://\w+.yinyuetai.com/video/(\d+)$', url)
     assert id
     html = get_html(url, 'utf-8')
-    title = r1(r'<meta property="og:title" content="([^"]+)"/>', html)
+    title = r1(r'<meta property="og:title"\s+content="([^"]+)"/>', html)
     assert title
     title = parse.unquote(title)
     title = escape_file_path(title)
+ """ if re.match(r'http://www.youku.com/playlist_show/id_\d+(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html', url): ids = parse_vplaylist(url) @@ -185,21 +173,36 @@ def youku_download_playlist(url, output_dir = '.', merge = True, info_only = Fal ids = parse_vplaylist(url) elif re.match(r'http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html', url): ids = parse_vplaylist(url) - else: + elif re.match(r'http://www.youku.com/show_page/id_\w+.html', url): + url = find_video_id_from_show_page(url) assert re.match(r'http://v.youku.com/v_show/id_([\w=]+).html', url), 'URL not supported as playlist' ids = parse_playlist(url) + else: + ids = [] + assert ids != [] title = parse_playlist_title(url, get_html(url)) - title = title.replace('?', '-') + title = filenameable(title) output_dir = os.path.join(output_dir, title) for i, id in enumerate(ids): + print('Processing %s of %s videos...' % (i + 1, len(ids))) try: - print('Processing %s of %s videos...' % (i + 1, len(ids))) - youku_download(id, output_dir, merge = merge, info_only = info_only) + id, title = parse_page(youku_url(id)) + youku_download_by_id(id, title, output_dir=output_dir, merge=merge, info_only=info_only) except: continue +def youku_download(url, output_dir='.', merge=True, info_only=False): + """Downloads Youku videos by URL. + """ + + try: + youku_download_playlist(url, output_dir=output_dir, merge=merge, info_only=info_only) + except: + id, title = parse_page(url) + youku_download_by_id(id, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + site_info = "Youku.com" download = youku_download download_playlist = youku_download_playlist diff --git a/src/you_get/downloader/youtube.py b/src/you_get/downloader/youtube.py index 6ec39ccf..51fbb07f 100644 --- a/src/you_get/downloader/youtube.py +++ b/src/you_get/downloader/youtube.py @@ -6,7 +6,7 @@ from ..common import * # YouTube media encoding options, in descending quality order. # taken from http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs, 3/22/2013. -youtube_codecs = [ +yt_codecs = [ {'itag': 38, 'container': 'MP4', 'video_resolution': '3072p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3.5-5', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, {'itag': 46, 'container': 'WebM', 'video_resolution': '1080p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'}, {'itag': 37, 'container': 'MP4', 'video_resolution': '1080p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3-4.3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'}, @@ -32,102 +32,70 @@ youtube_codecs = [ {'itag': 17, 'container': '3GP', 'video_resolution': '144p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.05', 'audio_encoding': 'AAC', 'audio_bitrate': '24'}, ] -def parse_video_info(raw_info): - """Parser for YouTube's get_video_info data. - Returns a dict, where 'url_encoded_fmt_stream_map' maps to a sorted list. 
diff --git a/src/you_get/downloader/youtube.py b/src/you_get/downloader/youtube.py
index 6ec39ccf..51fbb07f 100644
--- a/src/you_get/downloader/youtube.py
+++ b/src/you_get/downloader/youtube.py
@@ -6,7 +6,7 @@ from ..common import *
 
 # YouTube media encoding options, in descending quality order.
 # taken from http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs, 3/22/2013.
-youtube_codecs = [
+yt_codecs = [
     {'itag': 38, 'container': 'MP4', 'video_resolution': '3072p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3.5-5', 'audio_encoding': 'AAC', 'audio_bitrate': '192'},
     {'itag': 46, 'container': 'WebM', 'video_resolution': '1080p', 'video_encoding': 'VP8', 'video_profile': '', 'video_bitrate': '', 'audio_encoding': 'Vorbis', 'audio_bitrate': '192'},
     {'itag': 37, 'container': 'MP4', 'video_resolution': '1080p', 'video_encoding': 'H.264', 'video_profile': 'High', 'video_bitrate': '3-4.3', 'audio_encoding': 'AAC', 'audio_bitrate': '192'},
@@ -32,102 +32,70 @@ youtube_codecs = [
     {'itag': 17, 'container': '3GP', 'video_resolution': '144p', 'video_encoding': 'MPEG-4 Visual', 'video_profile': 'Simple', 'video_bitrate': '0.05', 'audio_encoding': 'AAC', 'audio_bitrate': '24'},
 ]
 
-def parse_video_info(raw_info):
-    """Parser for YouTube's get_video_info data.
-    Returns a dict, where 'url_encoded_fmt_stream_map' maps to a sorted list.
-    """
+def decipher(js, s):
+    def tr_js(code):
+        code = re.sub(r'function', r'def', code)
+        code = re.sub(r'\{', r':\n\t', code)
+        code = re.sub(r'\}', r'\n', code)
+        code = re.sub(r'var\s+', r'', code)
+        code = re.sub(r'(\w+).join\(""\)', r'"".join(\1)', code)
+        code = re.sub(r'(\w+).length', r'len(\1)', code)
+        code = re.sub(r'(\w+).reverse\(\)', r'\1[::-1]', code)
+        code = re.sub(r'(\w+).slice\((\d+)\)', r'\1[\2:]', code)
+        code = re.sub(r'(\w+).split\(""\)', r'list(\1)', code)
+        return code
+
+    f1 = match1(js, r'g.sig\|\|(\w+)\(g.s\)')
+    f1def = match1(js, r'(function %s\(\w+\)\{[^\{]+\})' % f1)
+    code = tr_js(f1def)
+    f2 = match1(f1def, r'(\w+)\(\w+,\d+\)')
+    if f2 is not None:
+        f2def = match1(js, r'(function %s\(\w+,\w+\)\{[^\{]+\})' % f2)
+        code = code + 'global %s\n' % f2 + tr_js(f2def)
+
+    code = code + 'sig=%s(s)' % f1
+    exec(code, globals(), locals())
+    return locals()['sig']
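
decipher() translates the player's JavaScript scrambler into Python source, exec()s it, and reads the signature back out of the local namespace. The same trick in miniature, on hand-written Python standing in for tr_js() output:

    code = 'def xy(a):\n\ta=list(a)\n\ta=a[::-1]\n\treturn "".join(a)\n'
    code += 'sig=xy(s)'
    env = {'s': 'abc123'}
    exec(code, globals(), env)
    assert env['sig'] == '321cba'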
+
+def youtube_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False):
+    """Downloads a YouTube video by its unique id.
+    """
 
-    # Percent-encoding reserved characters, used as separators.
-    sepr = {
-        '&': '%26',
-        ',': '%2C',
-        '=': '%3D',
-    }
+    raw_video_info = get_content('http://www.youtube.com/get_video_info?video_id=%s' % id)
+    video_info = parse.parse_qs(raw_video_info)
+
+    if video_info['status'] == ['ok'] and ('use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']):
+        title = parse.unquote_plus(video_info['title'][0])
+        stream_list = parse.parse_qs(raw_video_info)['url_encoded_fmt_stream_map'][0].split(',')
+
+    else:
+        # Parse video page when video_info is not usable.
+        video_page = get_content('http://www.youtube.com/watch?v=%s' % id)
+        ytplayer_config = json.loads(match1(video_page, r'ytplayer.config\s*=\s*([^\n]+);'))
+
+        title = ytplayer_config['args']['title']
+        stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
+
+        html5player = ytplayer_config['assets']['js']
+
+    streams = {
+        parse.parse_qs(stream)['itag'][0] : parse.parse_qs(stream)
+        for stream in stream_list
+    }
 
-    # fmt_level = {'itag': level, ...}
-    # itag of a higher quality maps to a lower level number.
-    # The highest quality has level number 0.
-    fmt_level = dict(
-        zip(
-            [str(codec['itag'])
-                for codec in
-                    youtube_codecs],
-            range(len(youtube_codecs))))
+    for codec in yt_codecs:
+        itag = str(codec['itag'])
+        if itag in streams:
+            download_stream = streams[itag]
+            break
 
-    # {key1: value1, key2: value2, ...,
-    #  'url_encoded_fmt_stream_map': [{'itag': '38', ...}, ...]
-    # }
-    return dict(
-        [(lambda metadata:
-            ['url_encoded_fmt_stream_map', (
-                lambda stream_map:
-                    sorted(
-                        [dict(
-                            [subitem.split(sepr['='])
-                                for subitem in
-                                    item.split(sepr['&'])])
-                            for item in
-                                stream_map.split(sepr[','])],
-                        key =
-                            lambda stream:
-                                fmt_level[stream['itag']]))
-                (metadata[1])]
-            if metadata[0] == 'url_encoded_fmt_stream_map'
-            else metadata)
-        (item.split('='))
-            for item in
-                raw_info.split('&')])
-
-def youtube_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
-
-    raw_info = request.urlopen('http://www.youtube.com/get_video_info?video_id=%s' % id).read().decode('utf-8')
-
-    video_info = parse_video_info(raw_info)
-
-    if video_info['status'] == 'ok': # use get_video_info data
-
-        title = parse.unquote(video_info['title'].replace('+', ' '))
-
-        signature = video_info['url_encoded_fmt_stream_map'][0]['sig']
-        url = parse.unquote(parse.unquote(video_info['url_encoded_fmt_stream_map'][0]['url'])) + "&signature=%s" % signature
-
-    else: # parse video page when "embedding disabled by request"
-
-        import json
-        html = request.urlopen('http://www.youtube.com/watch?v=' + id).read().decode('utf-8')
-        html = unescape_html(html)
-        yt_player_config = json.loads(r1(r'ytplayer.config = ([^\n]+);', html))
-        title = yt_player_config['args']['title']
-        title = unicodize(title)
-        title = parse.unquote(title)
-        title = escape_file_path(title)
-
-        for itag in [
-            '38',
-            '46', '37',
-            '102', '45', '22',
-            '84',
-            '120',
-            '85',
-            '44', '35',
-            '101', '100', '43', '34', '82', '18',
-            '6', '83', '13', '5', '36', '17',
-            ]:
-            fmt = r1(r'([^,\"]*itag=' + itag + "[^,\"]*)", html)
-            if fmt:
-                url = r1(r'url=([^\\]+)', fmt)
-                url = unicodize(url)
-                url = parse.unquote(url)
-                sig = r1(r'sig=([^\\]+)', fmt)
-                url = url + '&signature=' + sig
-                break
-        try:
-            url
-        except NameError:
-            url = r1(r'ytdns.ping\("([^"]+)"[^;]*;', html)
-            url = unicodize(url)
-            url = re.sub(r'\\/', '/', url)
-            url = re.sub(r'generate_204', 'videoplayback', url)
+    url = download_stream['url'][0]
+    if 'sig' in download_stream:
+        sig = download_stream['sig'][0]
+    else:
+        js = get_content(html5player)
+        sig = decipher(js, download_stream['s'][0])
+    url = '%s&signature=%s' % (url, sig)
 
     type, ext, size = url_info(url)
 
@@ -135,13 +103,14 @@ def youtube_download_by_id(id, title = None, output_dir = '.', merge = True, inf
     if not info_only:
         download_urls([url], title, ext, size, output_dir, merge = merge)
 
-def youtube_download(url, output_dir = '.', merge = True, info_only = False):
-    id = r1(r'youtu.be/(.*)', url)
-    if not id:
-        id = parse.parse_qs(parse.urlparse(url).query)['v'][0]
+def youtube_download(url, output_dir='.', merge=True, info_only=False):
+    """Downloads YouTube videos by URL.
+ """ + + id = match1(url, r'youtu.be/([^/]+)') or parse_query_param(url, 'v') assert id - youtube_download_by_id(id, None, output_dir, merge = merge, info_only = info_only) + youtube_download_by_id(id, title=None, output_dir=output_dir, merge=merge, info_only=info_only) site_info = "YouTube.com" download = youtube_download diff --git a/src/you_get/version.py b/src/you_get/version.py index 8b7c8a81..43c2747b 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,6 +1,5 @@ #!/usr/bin/env python - __all__ = ['__version__', '__date__'] -__version__ = '0.3.12' -__date__ = '2013-05-19' +__version__ = '0.3.21' +__date__ = '2013-08-17' diff --git a/tests/test.py b/tests/test.py index 75f6f7ac..641878ef 100644 --- a/tests/test.py +++ b/tests/test.py @@ -4,7 +4,7 @@ import unittest from you_get import * -from you_get.__main__ import url_to_module +from you_get.downloader.__main__ import url_to_module def test_urls(urls): for url in urls: @@ -17,11 +17,6 @@ class YouGetTests(unittest.TestCase): "http://www.freesound.org/people/Corsica_S/sounds/184419/", ]) - def test_jpopsuki(self): - test_urls([ - #"http://jpopsuki.tv/video/Dragon-Ash---Run-to-the-Sun/8ad7aec604badd0b0798cd999b63ae17", - ]) - def test_mixcloud(self): test_urls([ "http://www.mixcloud.com/beatbopz/beat-bopz-disco-mix/", diff --git a/tests/test_common.py b/tests/test_common.py new file mode 100644 index 00000000..5e97b77b --- /dev/null +++ b/tests/test_common.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python + +import unittest + +from you_get import * + +class TestCommon(unittest.TestCase): + + def test_match1(self): + self.assertEqual(match1('http://youtu.be/1234567890A', r'youtu.be/([^/]+)'), '1234567890A') + self.assertEqual(match1('http://youtu.be/1234567890A', r'youtu.be/([^/]+)', r'youtu.(\w+)'), ['1234567890A', 'be']) diff --git a/you-get b/you-get index 8f88af13..86b44109 100755 --- a/you-get +++ b/you-get @@ -1,9 +1,10 @@ #!/usr/bin/env python3 import os, sys -sys.path.insert(0, os.path.join((os.path.dirname(os.path.realpath(__file__))), "src")) +__path__ = os.path.dirname(os.path.realpath(__file__)) +__srcdir__ = 'src' +sys.path.insert(1, os.path.join(__path__, __srcdir__)) +from you_get.downloader import main -from you_get import * - -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/you-get.json b/you-get.json index e7619b96..92114cff 100644 --- a/you-get.json +++ b/you-get.json @@ -31,6 +31,6 @@ ], "console_scripts": [ - "you-get = you_get.__main__:main" + "you-get = you_get.downloader.__main__:main" ] }