initial commit

This commit is contained in:
Mort Yao 2012-08-20 23:54:03 +08:00
commit 146bae2f97
13 changed files with 2228 additions and 0 deletions

6
.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
_*
*.py[cod]
*.flv
*.mp4
*.webm

23
LICENSE Normal file
View File

@ -0,0 +1,23 @@
==============================================
This is a copy of the MIT license.
==============================================
Copyright (C) 2012 Mort Yao <mort.yao@gmail.com>
Copyright (C) 2012 Boyu Guo <iambus@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

98
README.md Normal file
View File

@ -0,0 +1,98 @@
# You-Get
一个Python 3的YouTube/优酷视频下载脚本。
### Python版本
Python 3.x
### 说明
基于优酷下载脚本[iambus/youku-lixian](https://github.com/iambus/youku-lixian)用Python 3改写而成增加了以下功能
* 支持YouTube
* 支持断点续传
* 可设置HTTP代理
### 支持的站点(持续更新中)
目前根据本人需求,仅实现了对有限几个视频站点的支持,以后会继续增加(・∀・)
* YouTube <http://www.youtube.com>
* 音悦台 <http://www.yinyuetai.com>
* 优酷 <http://www.youku.com>
* 土豆 <http://www.tudou.com>
### 输出视频格式
* WebM (*.webm)
* MP4 (*.mp4)
* FLV (*.flv)
* 3GP (*.3gp)
对于YouTube程序将下载画质最高的[编码格式](http://en.wikipedia.org/wiki/Youtube#Quality_and_codecs)。
### 如何下载视频
以下命令均以Linux shell为例……Windows用户请自行脑补正确的命令格式
显示视频信息,但不进行下载(`-i`或`--info`选项):
$ ./you-get -i http://www.yinyuetai.com/video/463772
下载视频:
$ ./you-get http://www.yinyuetai.com/video/463772
下载多个视频:
$ ./you-get http://www.yinyuetai.com/video/463772 http://www.yinyuetai.com/video/471500
若当前目录下已有与视频标题同名的文件,下载时会自动跳过。若有同名的`.download`临时文件,程序会从上次中断处开始下载。
如要强制重新下载该视频，可使用`-f`（`--force`）选项：
$ ./you-get -f http://www.yinyuetai.com/video/463772
`-l``--playlist`)选项用于下载播放列表(只对某些网站适用):
$ ./you-get -l http://www.youku.com/playlist_show/id_5344313.html
指定视频文件的下载目录:
$ ./you-get -o ~/Downloads http://www.yinyuetai.com/video/463772
显示详细帮助:
$ ./you-get -h
### 如何设置代理
默认情况下Python自动使用系统的代理配置。可以通过环境变量`http_proxy`来设置系统的HTTP代理。
`-x``--http-proxy`选项用于手动指定You-Get所使用的HTTP代理。例如GoAgent的代理服务器是`http://127.0.0.1:8087`则使用该代理下载某YouTube视频的命令是
$ ./you-get -x 127.0.0.1:8087 http://www.youtube.com/watch?v=KbtO_Ayjw0M
Windows下的自由门等翻墙软件会自动设置系统全局代理因此无需指定HTTP代理即可下载YouTube视频
$ ./you-get http://www.youtube.com/watch?v=KbtO_Ayjw0M
如果不希望程序在下载过程中使用任何代理(包括系统的代理配置),可以显式地指定`--no-proxy`选项:
$ ./you-get --no-proxy http://v.youku.com/v_show/id_XMjI0ODc1NTc2.html
### 断点续传
下载未完成时意外中止(因为网络中断或程序被强行终止等),在目标路径中会有一个扩展名为`.download`的临时文件。
下次运行只要在目标路径中找到相应的`.download`临时文件,程序会自动从中断处继续下载。(除非指定了`-f`选项)
### 使用Python 2
优酷等国内视频网站的下载,请移步:[iambus/youku-lixian](https://github.com/iambus/youku-lixian)
YouTube等国外视频网站的下载请移步[rg3/youtube-dl](https://github.com/rg3/youtube-dl)
### 许可证
源码在MIT License下发布。

430
common.py Normal file
View File

@ -0,0 +1,430 @@
#!/usr/bin/env python3

import getopt
import json
import locale  # BUG FIX: used below but was never imported (NameError when stdout is piped)
import os
import re
import sys
from urllib import request, parse

# Project metadata; fall back to an empty version string when
# you-get.json is not present next to the script.
try:
    proj_info = json.loads(open('you-get.json').read())
except:
    proj_info = {'version': ''}

# Global flag set by the -f/--force command-line option.
force = False

# Console encoding used by tr() when printing titles/paths.
if sys.stdout.isatty():
    default_encoding = sys.stdout.encoding.lower()
else:
    default_encoding = locale.getpreferredencoding().lower()
def tr(s):
    """Return s unchanged on UTF-8/GBK-family consoles, otherwise its
    UTF-8 encoded bytes so printing cannot raise an encoding error."""
    passthrough = default_encoding.startswith('utf') or \
        default_encoding in ['cp936', '936', 'ms936', 'gbk']
    return s if passthrough else s.encode('utf-8')
def r1(pattern, text):
    """Return group 1 of the first match of pattern in text, or None."""
    match = re.search(pattern, text)
    return match.group(1) if match else None
def r1_of(patterns, text):
    """Return the first non-empty group-1 match among patterns, or None."""
    for pattern in patterns:
        found = r1(pattern, text)
        if found:
            return found
def escape_file_path(path):
    """Replace characters that are unsafe in file names with '-'."""
    for unsafe in '/\\*?':
        path = path.replace(unsafe, '-')
    return path
def unescape_html(html):
    """Decode HTML character entities (named and numeric) in *html*.

    Uses html.unescape(), which handles both &amp;-style named entities
    and &#NN; numeric references. The previous implementation relied on
    the undocumented HTMLParser.unescape(), removed in Python 3.9.
    """
    import html as html_module
    return html_module.unescape(html)
def ungzip(s):
    """Decompress gzip-encoded bytes and return the payload."""
    import gzip
    from io import BytesIO
    with gzip.GzipFile(fileobj=BytesIO(s)) as stream:
        return stream.read()
def undeflate(s):
    """Decompress raw-deflate bytes (no zlib header, as some HTTP
    servers send for Content-Encoding: deflate)."""
    import zlib
    decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
    return decompressor.decompress(s) + decompressor.flush()
def get_response(url):
    """Fetch url and return the response object with its (decompressed)
    body attached as response.data."""
    response = request.urlopen(url)
    data = response.read()
    # Transparently undo transfer compression announced by the server.
    if response.info().get('Content-Encoding') == 'gzip':
        data = ungzip(data)
    elif response.info().get('Content-Encoding') == 'deflate':
        data = undeflate(data)
    response.data = data
    return response
def get_html(url, encoding = None):
    """Fetch url and decode its body to str.

    encoding: charset to decode with (defaults to UTF-8). Undecodable
    bytes are ignored rather than raising.

    BUG FIX: the encoding parameter was accepted but previously ignored;
    the body was always decoded as UTF-8.
    """
    content = get_response(url).data
    return str(content, encoding or 'utf-8', 'ignore')
def get_decoded_html(url):
    """Fetch url and decode the body using the charset from the
    Content-Type header; returns raw bytes when no charset is given."""
    response = get_response(url)
    data = response.data
    charset = r1(r'charset=([\w-]+)', response.headers['content-type'])
    if charset:
        return data.decode(charset)
    else:
        # NOTE(review): callers receive bytes in this branch, str in the
        # other — they must cope with both.
        return data
def url_size(url):
    """Return the Content-Length of url, in bytes (issues a GET)."""
    size = int(request.urlopen(url).headers['content-length'])
    return size
def urls_size(urls):
    """Total Content-Length, in bytes, summed across all URLs."""
    return sum(url_size(u) for u in urls)
def url_info(url):
    """Probe url and return (mime_type, file_extension, size_in_bytes).

    Raises AssertionError for content types outside the known video set.
    """
    response = request.urlopen(request.Request(url))
    headers = response.headers
    type = headers['content-type']
    # MIME type -> output file extension.
    mapping = {
        'video/3gpp': '3gp',
        'video/f4v': 'flv',
        'video/mp4': 'mp4',
        'video/webm': 'webm',
        'video/x-flv': 'flv'
    }
    assert type in mapping, type
    ext = mapping[type]
    size = int(headers['content-length'])
    return type, ext, size
def url_save(url, filepath, bar, refer = None, is_part = False):
    """Download url to filepath with resume support.

    Partial data is kept in '<filepath>.download' and resumed via an
    HTTP Range request; on completion the temp file is renamed into place.

    bar:     progress-bar object updated as bytes arrive (may be None).
    refer:   optional Referer header value.
    is_part: True when this file is one segment of a multi-part video;
             suppresses the per-file skip/overwrite messages.
    """
    file_size = url_size(url)
    if os.path.exists(filepath):
        if not force and file_size == os.path.getsize(filepath):
            # Already fully downloaded and -f not given: skip.
            if not is_part:
                if bar:
                    bar.done()
                print('Skipping %s: file already exists' % tr(os.path.basename(filepath)))
            else:
                if bar:
                    bar.update_received(file_size)
            return
        else:
            if not is_part:
                if bar:
                    bar.done()
                print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
    elif not os.path.exists(os.path.dirname(filepath)):
        os.mkdir(os.path.dirname(filepath))
    temp_filepath = filepath + '.download'
    received = 0
    if not force:
        open_mode = 'ab'  # resume: append to any existing temp file
        if os.path.exists(temp_filepath):
            received += os.path.getsize(temp_filepath)
            if bar:
                bar.update_received(os.path.getsize(temp_filepath))
    else:
        open_mode = 'wb'  # forced: restart from scratch
    if received < file_size:
        headers = {}
        if received:
            # Resume from where the previous run stopped.
            headers['Range'] = 'bytes=' + str(received) + '-'
        if refer:
            headers['Referer'] = refer
        response = request.urlopen(request.Request(url, headers = headers), None)
        assert file_size == received + int(response.headers['content-length'])
        with open(temp_filepath, open_mode) as output:
            while True:
                buffer = response.read(1024 * 256)
                if not buffer:
                    if received == file_size: # Download finished
                        break
                    else: # Unexpected termination. Retry request
                        headers['Range'] = 'bytes=' + str(received) + '-'
                        response = request.urlopen(request.Request(url, headers = headers), None)
                output.write(buffer)
                received += len(buffer)
                if bar:
                    bar.update_received(len(buffer))
    # BUG FIX: the message had three '%s' placeholders but only two
    # arguments, which raised TypeError instead of the intended message.
    assert received == os.path.getsize(temp_filepath), '%s == %s' % (received, os.path.getsize(temp_filepath))
    if os.access(filepath, os.W_OK):
        os.remove(filepath) # on Windows rename could fail if destination filepath exists
    os.rename(temp_filepath, filepath)
class SimpleProgressBar:
    # Textual progress bar for downloads whose total size is known.
    def __init__(self, total_size, total_pieces = 1):
        self.displayed = False           # a bar line is currently on screen
        self.total_size = total_size     # expected total bytes
        self.total_pieces = total_pieces # number of video parts
        self.current_piece = 1
        self.received = 0                # bytes downloaded so far
    def update(self):
        # Redraw the bar in place (carriage return, no newline).
        self.displayed = True
        bar_size = 40
        percent = round(self.received * 100 / self.total_size, 1)
        if percent > 100:
            percent = 100
        dots = bar_size * int(percent) // 100
        # NOTE(review): 'plus' is an int here, so the > 0.8 / > 0.4 tests
        # behave as integer thresholds; the partial-cell glyph logic looks
        # as if it was written for a fractional value — confirm intent.
        plus = int(percent) - dots // bar_size * 100
        if plus > 0.8:
            plus = '='
        elif plus > 0.4:
            plus = '>'
        else:
            plus = ''
        bar = '=' * dots + plus
        bar = '{0:>5}% ({1:>5}/{2:<5}MB) [{3:<40}] {4}/{5}'.format(percent, round(self.received / 1048576, 1), round(self.total_size / 1048576, 1), bar, self.current_piece, self.total_pieces)
        sys.stdout.write('\r' + bar)
        sys.stdout.flush()
    def update_received(self, n):
        # Account for n newly received bytes and redraw.
        self.received += n
        self.update()
    def update_piece(self, n):
        # Switch the piece counter to part n (1-based).
        self.current_piece = n
    def done(self):
        # Terminate the bar line with a newline, once.
        if self.displayed:
            print()
            self.displayed = False
class PiecesProgressBar:
    """Progress bar used when the total size is unknown: it can only
    show which piece of how many is being downloaded."""
    def __init__(self, total_size, total_pieces=1):
        self.displayed = False
        self.total_size = total_size
        self.total_pieces = total_pieces
        self.current_piece = 1
        self.received = 0
    def update(self):
        # Redraw in place; percentage and fill are unknown, hence '?'.
        self.displayed = True
        line = '{0:>5}%[{1:<40}] {2}/{3}'.format(
            '?', '?' * 40, self.current_piece, self.total_pieces)
        sys.stdout.write('\r' + line)
        sys.stdout.flush()
    def update_received(self, n):
        self.received += n
        self.update()
    def update_piece(self, n):
        self.current_piece = n
    def done(self):
        if not self.displayed:
            return
        print()
        self.displayed = False
class DummyProgressBar:
    # Same interface as the real progress bars but displays nothing;
    # used when no progress output is wanted.
    def __init__(self, *args):
        pass
    def update_received(self, n):
        pass
    def update_piece(self, n):
        pass
    def done(self):
        pass
def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, merge = True):
    """Download one or more video-part URLs and optionally merge them.

    urls:       direct media URLs; one entry means a single output file.
    title:      video title, sanitized and used as the file name.
    ext:        container extension; must be 3gp/flv/mp4/webm.
    total_size: expected byte total; if falsy it is probed per URL
                (probe failures are printed and ignored).
    refer:      optional Referer header for the media requests.
    merge:      when True and there are multiple parts, flv/mp4 parts
                are concatenated and the part files removed.
    """
    assert urls
    assert ext in ('3gp', 'flv', 'mp4', 'webm')
    if not total_size:
        try:
            total_size = urls_size(urls)
        except:
            # Best effort: fall back to a pieces-only bar below.
            import traceback
            import sys
            traceback.print_exc(file = sys.stdout)
            pass
    title = escape_file_path(title)
    filename = '%s.%s' % (title, ext)
    filepath = os.path.join(output_dir, filename)
    if total_size:
        # A file at >= 90% of the expected size counts as already done
        # (why 0.9 exactly is not documented here — presumably reported
        # sizes are approximate).
        if not force and os.path.exists(filepath) and os.path.getsize(filepath) >= total_size * 0.9:
            print('Skipping %s: file already exists' % tr(filepath))
            return
        bar = SimpleProgressBar(total_size, len(urls))
    else:
        bar = PiecesProgressBar(total_size, len(urls))
    if len(urls) == 1:
        url = urls[0]
        print('Downloading %s ...' % tr(filename))
        url_save(url, filepath, bar, refer = refer)
        bar.done()
    else:
        flvs = []
        print('Downloading %s.%s ...' % (tr(title), ext))
        for i, url in enumerate(urls):
            # Each part gets a [NN] suffix before the extension.
            filename = '%s[%02d].%s' % (title, i, ext)
            filepath = os.path.join(output_dir, filename)
            flvs.append(filepath)
            #print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls))
            bar.update_piece(i + 1)
            url_save(url, filepath, bar, refer = refer, is_part = True)
        bar.done()
        if not merge:
            return
        if ext == 'flv':
            from merge_flv import concat_flvs
            concat_flvs(flvs, os.path.join(output_dir, title + '.flv'))
            for flv in flvs:
                os.remove(flv)
        elif ext == 'mp4':
            from merge_mp4 import concat_mp4s
            concat_mp4s(flvs, os.path.join(output_dir, title + '.mp4'))
            for flv in flvs:
                os.remove(flv)
        else:
            print("Can't merge %s files" % ext)
def playlist_not_supported(name):
    """Return a stub download_playlist callable that always raises
    NotImplementedError mentioning the site *name*."""
    def stub(*args, **kwargs):
        raise NotImplementedError('Playlist is not supported for ' + name)
    return stub
def print_info(site_info, title, type, size):
    """Print a human-readable summary: site, title, MIME type and size
    (in MB and bytes)."""
    # Normalize bare extensions to their MIME types first.
    ext_to_mime = {
        '3gp': 'video/3gpp',
        'flv': 'video/x-flv',
        'f4v': 'video/x-flv',
        'mp4': 'video/mp4',
        'webm': 'video/webm',
    }
    type = ext_to_mime.get(type, type)
    mime_to_desc = {
        'video/3gpp': '3GPP multimedia file',
        'video/x-flv': 'Flash video',
        'video/f4v': 'Flash video',
        'video/mp4': 'MPEG-4 video',
        'video/x-m4v': 'MPEG-4 video',
        'video/webm': 'WebM video',
        #'video/ogg': 'Ogg video',
        #'video/quicktime': 'QuickTime video',
        #'video/x-matroska': 'Matroska video',
        #'video/x-ms-wmv': 'Windows Media video',
        #'video/mpeg': 'MPEG video',
    }
    type_info = "%s (%s)" % (mime_to_desc.get(type, 'Unknown type'), type)
    print("Video Site:", site_info)
    print("Title: ", tr(title))
    print("Type: ", type_info)
    print("Size: ", round(size / 1048576, 2), "MB (" + str(size) + " Bytes)")
def set_http_proxy(proxy):
    """Install a global urllib opener according to *proxy*.

    None  -> use the system proxy configuration;
    ''    -> use no proxy at all;
    other -> use that HTTP proxy ('http://' prefixed if missing).
    """
    if proxy is None:
        handler = request.ProxyHandler()    # system default settings
    elif proxy == '':
        handler = request.ProxyHandler({})  # explicitly proxy-less
    else:
        if not proxy.startswith('http://'):
            proxy = 'http://' + proxy
        handler = request.ProxyHandler({'http': '%s' % proxy})
    request.install_opener(request.build_opener(handler))
def main(script_name, download, download_playlist = None):
    """Shared command-line entry point used by every site script.

    script_name:       name shown in the usage string.
    download:          callable(url, output_dir=, merge=, info_only=).
    download_playlist: same signature; enables -l/--playlist when given.
    """
    version = 'You-Get %s, a video downloader.' % proj_info['version']
    help = 'Usage: [python3] %s [OPTION]... [URL]...\n' % script_name
    help += '''\nStartup options:
    -V | --version                           Display the version and exit.
    -h | --help                              Print this help and exit.
    '''
    help += '''\nDownload options (use with URLs):
    -f | --force                             Force overwriting existed files.
    -i | --info                              Display the information of videos without downloading.
    -l | --playlist                          Download playlists. (only available for some sites)
    -n | --no-merge                          Don't merge video parts.
    -o | --output-dir <PATH>                 Set the output directory for downloaded videos.
    -x | --http-proxy <PROXY-SERVER-IP:PORT> Use specific HTTP proxy for downloading.
         --no-proxy                          Don't use any proxy. (ignore $http_proxy)
    '''
    short_opts = 'Vhfino:x:'
    opts = ['version', 'help', 'force', 'info', 'no-merge', 'no-proxy', 'output-dir=', 'http-proxy=']
    if download_playlist:
        # -l/--playlist is only offered when the site supports playlists.
        short_opts = 'l' + short_opts
        opts = ['playlist'] + opts
    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, opts)
    except getopt.GetoptError as err:
        print(err)
        print(help)
        sys.exit(2)
    info_only = False
    playlist = False
    merge = True
    output_dir = '.'
    proxy = None
    for o, a in opts:
        if o in ('-V', '--version'):
            print(version)
            sys.exit()
        elif o in ('-h', '--help'):
            print(version)
            print(help)
            sys.exit()
        elif o in ('-f', '--force'):
            # force is module-global: url_save/download_urls consult it.
            global force
            force = True
        elif o in ('-i', '--info'):
            info_only = True
        elif o in ('-l', '--playlist'):
            playlist = True
        elif o in ('-n', '--no-merge'):
            merge = False
        elif o in ('--no-proxy'):
            # NOTE(review): ('--no-proxy') is a string, not a tuple; the
            # 'in' test still matches this exact flag via substring match.
            proxy = ''
        elif o in ('-o', '--output-dir'):
            output_dir = a
        elif o in ('-x', '--http-proxy'):
            proxy = a
        else:
            print(help)
            sys.exit(1)
    if not args:
        print(help)
        sys.exit(1)
    set_http_proxy(proxy)
    for url in args:
        # Be forgiving about a missing scheme on the command line.
        if not url.startswith('http://'):
            url = 'http://' + url
        if playlist:
            download_playlist(url, output_dir = output_dir, merge = merge, info_only = info_only)
        else:
            download(url, output_dir = output_dir, merge = merge, info_only = info_only)

51
get.py Executable file
View File

@ -0,0 +1,51 @@
#!/usr/bin/env python3
from common import *
import get_tudou
import get_yinyuetai
import get_youku
import get_youtube
def url_to_module(url):
    """Map a video page URL to its site-specific downloader module.

    Raises NotImplementedError for sites without a handler yet.
    """
    site = r1(r'http://([^/]+)/', url)
    assert site, 'invalid url: ' + url
    # Fold *.com.cn hosts into *.com so the domain extraction works.
    if site.endswith('.com.cn'):
        site = site[:-3]
    domain = r1(r'(\.[^.]+\.[^.]+)$', site)
    assert domain, 'unsupported url: ' + url
    key = r1(r'([^.]+)', domain)
    downloads = {
        'youtube': get_youtube,
        'youku': get_youku,
        'yinyuetai': get_yinyuetai,
        'tudou': get_tudou,
        #TODO:
        # 'acfun': get_acfun,
        # 'bilibili': get_bilibili,
        # 'kankanews': get_bilibili,
        # 'iask': get_iask,
        # 'sina': get_iask,
        # 'ku6': get_ku6,
        # 'pptv': get_pptv,
        # 'iqiyi': get_iqiyi,
        # 'sohu': get_sohu,
        # '56': get_w56,
        # 'cntv': get_cntv,
    }
    if key not in downloads:
        raise NotImplementedError(url)
    return downloads[key]
def any_download(url, output_dir = '.', merge = True, info_only = False):
    """Download a single video, dispatching on the URL's site."""
    handler = url_to_module(url)
    handler.download(url, output_dir = output_dir, merge = merge, info_only = info_only)
def any_download_playlist(url, output_dir = '.', merge = True, info_only = False):
    """Download a playlist, dispatching on the URL's site."""
    handler = url_to_module(url)
    handler.download_playlist(url, output_dir = output_dir, merge = merge, info_only = info_only)
# Standalone entry point: generic downloader with playlist support.
if __name__ == '__main__':
    main('get.py', any_download, any_download_playlist)

78
get_tudou.py Executable file
View File

@ -0,0 +1,78 @@
#!/usr/bin/env python3
__all__ = ['tudou_download', 'tudou_download_playlist', 'tudou_download_by_id', 'tudou_download_by_iid']
from common import *
def tudou_download_by_iid(iid, title, output_dir = '.', merge = True, info_only = False):
    """Download a Tudou video by its internal item id (iid).

    Queries the v2.tudou.com XML API, picks the variant with the highest
    'brt' attribute (presumably bitrate — confirm), and downloads it.
    """
    xml = get_html('http://v2.tudou.com/v?it=' + iid + '&st=1,2,3,4,99')
    from xml.dom.minidom import parseString
    doc = parseString(xml)
    # Prefer the caller-supplied title; fall back to the XML root's
    # 'tt'/'title' attributes.
    title = title or doc.firstChild.getAttribute('tt') or doc.firstChild.getAttribute('title')
    urls = [(int(n.getAttribute('brt')), n.firstChild.nodeValue.strip()) for n in doc.getElementsByTagName('f')]
    url = max(urls, key = lambda x:x[0])[1]
    assert 'f4v' in url
    type, ext, size = url_info(url)
    print_info(site_info, title, type, size)
    if not info_only:
        #url_save(url, filepath, bar):
        # total_size=None: download_urls re-probes the size itself.
        download_urls([url], title, ext, total_size = None, output_dir = output_dir, merge = merge)
def tudou_download_by_id(id, title, output_dir = '.', merge = True, info_only = False):
    """Resolve a tudou.com/programs/view/<id>/ page to its iid and download.

    CONSISTENCY FIX: the info_only flag (present on every sibling helper)
    was missing here, so -i/--info could not be honored through this
    path. The new parameter defaults to False, keeping old calls valid.
    """
    html = get_html('http://www.tudou.com/programs/view/%s/' % id)
    iid = r1(r'iid\s*=\s*(\S+)', html)
    tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)
def tudou_download(url, output_dir = '.', merge = True, info_only = False):
    """Download from a Tudou page URL by scraping its iid and title."""
    html = get_decoded_html(url)
    iid = r1(r'iid\s*[:=]\s*(\d+)', html)
    assert iid
    # 'kw' holds the page's keyword/title string.
    title = r1(r'kw\s*[:=]\s*"([^"]+)"', html)
    assert title
    title = unescape_html(title)
    tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)
def parse_playlist(url):
    # NOTE(review): dead code — this definition is immediately shadowed
    # by the second `parse_playlist` below and can never be called.
    #if r1('http://www.tudou.com/playlist/p/a(\d+)\.html', url):
    #    html = get_html(url)
    #    print re.search(r'<script>var.*?</script>', html, flags=re.S).group()
    #else:
    #    raise NotImplementedError(url)
    raise NotImplementedError()
def parse_playlist(url):
    """Resolve a Tudou playlist/album URL to [(title, itemId), ...].

    Handles both /playlist/p/aNNN(.html) and /albumcover/ pages; the
    album id (aid) is taken from the URL when present, otherwise scraped
    from the page source.
    """
    aid = r1('http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url)
    html = get_decoded_html(url)
    if not aid:
        aid = r1(r"aid\s*[:=]\s*'(\d+)'", html)
    if re.match(r'http://www.tudou.com/albumcover/', url):
        atitle = r1(r"title\s*:\s*'([^']+)'", html)
    elif re.match(r'http://www.tudou.com/playlist/p/', url):
        atitle = r1(r'atitle\s*=\s*"([^"]+)"', html)
    else:
        raise NotImplementedError(url)
    assert aid
    assert atitle
    import json
    #url = 'http://www.tudou.com/playlist/service/getZyAlbumItems.html?aid='+aid
    url = 'http://www.tudou.com/playlist/service/getAlbumItems.html?aid='+aid
    # Each entry is titled '<album title>-<item title>'.
    return [(atitle + '-' + x['title'], str(x['itemId'])) for x in json.loads(get_html(url))['message']]
def tudou_download_playlist(url, create_dir = False, output_dir = '.', merge = True, info_only = False):
    """Download every video of a Tudou playlist/album.

    BUG FIX: common.main() invokes download_playlist with an info_only
    keyword argument, which this function did not accept — every
    `-l` run would die with a TypeError. The parameter is added with a
    backward-compatible default and forwarded to tudou_download_by_iid.
    """
    if create_dir:
        raise NotImplementedError('please report a bug so I can implement this')
    videos = parse_playlist(url)
    for i, (title, id) in enumerate(videos):
        print('Downloading %s of %s videos...' % (i + 1, len(videos)))
        tudou_download_by_iid(id, title, output_dir = output_dir, merge = merge, info_only = info_only)
# Module-level hooks consumed by the generic dispatcher in get.py.
site_info = "Tudou.com"
download = tudou_download
download_playlist = tudou_download_playlist

# Standalone usage: python3 get_tudou.py <url>...
if __name__ == '__main__':
    main('tudou', tudou_download, tudou_download_playlist)

36
get_yinyuetai.py Executable file
View File

@ -0,0 +1,36 @@
#!/usr/bin/env python3
__all__ = ['yinyuetai_download', 'yinyuetai_download_by_id']
from common import *
def yinyuetai_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
    """Download a YinYueTai MV by its numeric video id; a title must be
    supplied by the caller (used for the output file name)."""
    assert title
    html = get_html('http://www.yinyuetai.com/insite/get-video-info?flex=true&videoId=' + id)
    #TODO: run a fully parse instead of text search
    # url = r1(r'(http://flv.yinyuetai.com/uploads/videos/common/\w+\.flv\?t=[a-f0-9]{16})', html)
    # url = r1(r'http://hc.yinyuetai.com/uploads/videos/common/[A-F0-9]{32}\.mp4\?v=\d{12}', html)
    # Matches both the flv (?t=...) and mp4 (?v=...) media URL forms.
    url = r1(r'(http://\w+\.yinyuetai\.com/uploads/videos/common/\w+\.(?:flv|mp4)\?(?:t=[a-f0-9]{16}|v=\d{12}))', html)
    assert url
    type, ext, size = url_info(url)
    print_info(site_info, title, type, size)
    if not info_only:
        download_urls([url], title, ext, size, output_dir, merge = merge)
def yinyuetai_download(url, output_dir = '.', merge = True, info_only = False):
    """Download from a yinyuetai.com/video/<id> page URL."""
    id = r1(r'http://www.yinyuetai.com/video/(\d+)$', url)
    assert id
    html = get_html(url, 'utf-8')
    # The og:title meta tag carries the MV title.
    title = r1(r'<meta property="og:title" content="([^"]+)"/>', html)
    assert title
    title = parse.unquote(title)
    title = escape_file_path(title)
    yinyuetai_download_by_id(id, title, output_dir, merge = merge, info_only = info_only)
# Module-level hooks consumed by the generic dispatcher in get.py.
site_info = "YinYueTai.com"
download = yinyuetai_download
download_playlist = playlist_not_supported('yinyuetai')

# Standalone usage: python3 get_yinyuetai.py <url>...
if __name__ == '__main__':
    main('get_yinyuetai.py', yinyuetai_download)

173
get_youku.py Executable file
View File

@ -0,0 +1,173 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__all__ = ['youku_download', 'youku_download_playlist', 'youku_download_by_id']
from common import *
import json
from random import randint
from time import time
import re
import sys
def trim_title(title):
    """Strip Youku site-name boilerplate suffixes from a page title."""
    for suffix in (' - 视频 - 优酷视频 - 在线观看', ' - 专辑 - 优酷视频'):
        title = title.replace(suffix, '')
    # Third form wraps site text around dashes; drop it entirely.
    return re.sub(r'—([^—]+)—优酷网,视频高清在线观看', '', title)
def find_video_id_from_url(url):
    """Extract the Youku video id from any known URL shape: v_show
    page, player swf, loader parameter, or a bare id string."""
    candidates = [r'^http://v.youku.com/v_show/id_([\w=]+).html',
                  r'^http://player.youku.com/player.php/sid/([\w=]+)/v.swf',
                  r'^loader\.swf\?VideoIDS=([\w=]+)',
                  r'^([\w=]+)$']
    return r1_of(candidates, url)
def find_video_id_from_show_page(url):
    # Follow the "play" button anchor on a show page to the video URL.
    return re.search(r'<div class="btnplay">.*href="([^"]+)"', get_html(url)).group(1)
def youku_url(url):
    """Canonicalize any accepted Youku URL form to a playable page URL.

    Raises Exception for URLs that match none of the known shapes.
    """
    id = find_video_id_from_url(url)
    if id:
        return 'http://v.youku.com/v_show/id_%s.html' % id
    if re.match(r'http://www.youku.com/show_page/id_\w+.html', url):
        # Show pages only link to the real video page; fetch and follow.
        return find_video_id_from_show_page(url)
    if re.match(r'http://v.youku.com/v_playlist/\w+.html', url):
        return url
    raise Exception('Invalid Youku URL: '+url)
def parse_page(url):
    """Fetch a Youku video page and return (videoId2, title, subtitle).

    subtitle is None when absent or identical to the title.
    """
    url = youku_url(url)
    page = get_html(url)
    id2 = re.search(r"var\s+videoId2\s*=\s*'(\S+)'", page).group(1)
    if re.search(r'v_playlist', url):
        # if we are playing a video from playlist, the meta title might be incorrect
        title = re.search(r'<title>([^<>]*)</title>', page).group(1)
    else:
        title = re.search(r'<meta name="title" content="([^"]*)">', page).group(1)
    title = trim_title(title)
    if re.search(r'v_playlist', url) and re.search(r'-.*\S+', title):
        title = re.sub(r'^[^-]+-\s*', '', title) # remove the special name from title for playlist video
    title = unescape_html(title)
    subtitle = re.search(r'<span class="subtitle" id="subtitle">([^<>]*)</span>', page)
    if subtitle:
        subtitle = subtitle.group(1).strip()
        if subtitle == title:
            # Redundant subtitle adds nothing; drop it.
            subtitle = None
    return id2, title, subtitle
def get_info(videoId2):
    # Fetch and decode the getPlayList JSON blob for a videoId2.
    return json.loads(get_html('http://v.youku.com/player/getPlayList/VideoIDS/' + videoId2))
def find_video(info, stream_type = None):
    """From a getPlayList JSON blob, build [(url, size), ...] covering
    every segment of the chosen stream type.

    stream_type defaults to the best available of hd2 > mp4 > flv.
    """
    #key = '%s%x' % (info['data'][0]['key2'], int(info['data'][0]['key1'], 16) ^ 0xA55AA5A5)
    segs = info['data'][0]['segs']
    types = segs.keys()
    if not stream_type:
        for x in ['hd2', 'mp4', 'flv']:
            if x in types:
                stream_type = x
                break
        else:
            raise NotImplementedError()
    assert stream_type in ('hd2', 'mp4', 'flv')
    # hd2 streams are delivered in flv containers.
    file_type = {'hd2': 'flv', 'mp4': 'mp4', 'flv': 'flv'}[stream_type]
    # De-scramble the file id: a keyed shuffle of this alphabet, driven
    # by the server-provided 'seed', yields the substitution table.
    seed = info['data'][0]['seed']
    source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
    mixed = ''
    while source:
        seed = (seed * 211 + 30031) & 0xFFFF
        index = seed * len(source) >> 16
        c = source.pop(index)
        mixed += c
    # streamfileids are '*'-separated indexes into the shuffled alphabet.
    ids = info['data'][0]['streamfileids'][stream_type].split('*')[:-1]
    vid = ''.join(mixed[int(i)] for i in ids)
    # Session id: millisecond timestamp plus two random blocks.
    sid = '%s%s%s' % (int(time() * 1000), randint(1000, 1999), randint(1000, 9999))
    urls = []
    for s in segs[stream_type]:
        no = '%02x' % int(s['no'])
        url = 'http://f.youku.com/player/getFlvPath/sid/%s_%s/st/%s/fileid/%s%s%s?K=%s&ts=%s' % (sid, no, file_type, vid[:8], no.upper(), vid[10:], s['k'], s['seconds'])
        urls.append((url, int(s['size'])))
    return urls
def file_type_of_url(url):
    """Extract the '/st/<type>/' path segment (flv/mp4/...) from a
    Youku media URL."""
    match = re.search(r'/st/([^/]+)/', url)
    return str(match.group(1))
def youku_download_by_id(id2, title, output_dir = '.', stream_type = None, merge = True, info_only = False):
    """Download all segments of a Youku video given its videoId2."""
    info = get_info(id2)
    # find_video yields (url, size) pairs; split into parallel tuples.
    urls, sizes = zip(*find_video(info, stream_type))
    total_size = sum(sizes)
    print_info(site_info, title, file_type_of_url(urls[0]), total_size)
    if not info_only:
        download_urls(urls, title, file_type_of_url(urls[0]), total_size, output_dir, merge = merge)
def youku_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False):
    """Download a single Youku video from any accepted URL form.

    BUG FIX: the stream_type argument was accepted but silently dropped;
    it is now forwarded to youku_download_by_id, which already takes it.
    """
    id2, title, subtitle = parse_page(url)
    if subtitle:
        title += '-' + subtitle
    youku_download_by_id(id2, title, output_dir, stream_type = stream_type, merge = merge, info_only = info_only)
def parse_playlist_videos(html):
    """Return the video ids referenced by id="A_<id>" anchors in a
    playlist page, in document order."""
    anchor_id = re.compile(r'id="A_(\w+)"')
    return anchor_id.findall(html)
def parse_playlist_pages(html):
    """Return the URLs of playlist pages 2..N found in the pager <ul>.

    Page 1 is the page already in hand; a missing pager means a
    single-page playlist and yields [].
    """
    pager = re.search(r'<ul class="pages">.*?</ul>', html, flags = re.S)
    if not pager:
        return []
    links = re.findall(r'href="([^"]+)"', pager.group())
    # The last pager link encodes the highest page number.
    prefix, last_page, suffix = re.match(r'^(.*page_)(\d+)(_.*)$', links[-1]).groups()
    return ['http://v.youku.com%s%s%s?__rt=1&__ro=listShow' % (prefix, page, suffix)
            for page in range(2, int(last_page) + 1)]
def parse_playlist(url):
    """Collect the video ids of a Youku official playlist, walking every
    pager page of the list view."""
    html = get_html(url)
    video_id = re.search(r"var\s+videoId\s*=\s*'(\d+)'", html).group(1)
    show_id = re.search(r'var\s+showid\s*=\s*"(\d+)"', html).group(1)
    list_url = 'http://v.youku.com/v_vpofficiallist/page_1_showid_%s_id_%s.html?__rt=1&__ro=listShow' % (show_id, video_id)
    html = get_html(list_url)
    ids = parse_playlist_videos(html)
    # Append ids from pages 2..N, if any.
    for url in parse_playlist_pages(html):
        ids.extend(parse_playlist_videos(get_html(url)))
    return ids
def parse_vplaylist(url):
    """Expand a Youku v_playlist-style URL into one playlist-page URL
    per contained video."""
    id = r1_of([r'^http://www.youku.com/playlist_show/id_(\d+)(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html',
                r'^http://v.youku.com/v_playlist/f(\d+)o[01]p\d+.html',
                r'^http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html'],
               url)
    assert id, 'not valid vplaylist url: ' + url
    url = 'http://www.youku.com/playlist_show/id_%s.html' % id
    # <span class="num"> on the playlist page holds the video count.
    n = int(re.search(r'<span class="num">(\d+)</span>', get_html(url)).group(1))
    return ['http://v.youku.com/v_playlist/f%so0p%s.html' % (id, i) for i in range(n)]
def youku_download_playlist(url, output_dir = '.', merge = True, info_only = False):
    """Download every video of a Youku playlist, whichever of the
    supported playlist URL shapes is given."""
    if re.match(r'http://www.youku.com/show_page/id_\w+.html', url):
        # Show pages redirect to the underlying video/playlist page.
        url = find_video_id_from_show_page(url)
    if re.match(r'http://www.youku.com/playlist_show/id_\d+(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html', url):
        ids = parse_vplaylist(url)
    elif re.match(r'http://v.youku.com/v_playlist/f\d+o[01]p\d+.html', url):
        ids = parse_vplaylist(url)
    elif re.match(r'http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html', url):
        ids = parse_vplaylist(url)
    else:
        assert re.match(r'http://v.youku.com/v_show/id_([\w=]+).html', url), 'URL not supported as playlist'
        ids = parse_playlist(url)
    for i, id in enumerate(ids):
        print('Processing %s of %s videos...' % (i + 1, len(ids)))
        youku_download(id, output_dir, merge = merge, info_only = info_only)
# Module-level hooks consumed by the generic dispatcher in get.py.
site_info = "Youku.com"
download = youku_download
download_playlist = youku_download_playlist

# Standalone usage: python3 get_youku.py <url>...
if __name__ == '__main__':
    main('get_youku.py', youku_download, youku_download_playlist)

36
get_youtube.py Executable file
View File

@ -0,0 +1,36 @@
#!/usr/bin/env python3
__all__ = ['youtube_download', 'youtube_download_by_id']
from common import *
def youtube_download_by_id(id, title = None, output_dir = '.', merge = True, info_only = False):
    """Download a YouTube video by its watch id.

    Takes the first entry of url_encoded_fmt_stream_map, trying the
    get_video_info endpoint first and falling back to the watch page.
    The [4:] slice drops the first four characters of the entry —
    presumably a leading 'url=' key; confirm against the API response.
    """
    try:
        url = parse.parse_qs(parse.unquote(request.urlopen('http://www.youtube.com/get_video_info?&video_id=' + id).read().decode('utf-8')))['url_encoded_fmt_stream_map'][0][4:]
    except:
        # get_video_info fails for some videos; scrape the page instead.
        url = parse.parse_qs(parse.unquote(request.urlopen('http://www.youtube.com/watch?v=' + id).read().decode('utf-8')))['url_encoded_fmt_stream_map'][0][4:]
    type, ext, size = url_info(url)
    print_info(site_info, title, type, size)
    if not info_only:
        download_urls([url], title, ext, size, output_dir, merge = merge)
def youtube_download(url, output_dir = '.', merge = True, info_only = False):
    """Download from a YouTube watch URL (the ?v= query parameter)."""
    id = parse.parse_qs(parse.urlparse(url).query)['v'][0]
    assert id
    try:
        title = parse.parse_qs(parse.unquote(request.urlopen('http://www.youtube.com/get_video_info?&video_id=' + id).read().decode('utf-8')))['title'][0]
    except:
        # Fallback: scrape the watch page's JSON title field.
        html = get_html(url, 'utf-8')
        title = r1(r'"title": "([^"]+)"', html)
    assert title
    title = parse.unquote(title)
    title = escape_file_path(title)
    youtube_download_by_id(id, title, output_dir, merge = merge, info_only = info_only)
# Module-level hooks consumed by the generic dispatcher in get.py.
site_info = "YouTube.com"
download = youtube_download
download_playlist = playlist_not_supported('youtube')

# Standalone usage: python3 get_youtube.py <url>...
if __name__ == '__main__':
    main('get_youtube.py', youtube_download)

365
merge_flv.py Executable file
View File

@ -0,0 +1,365 @@
#!/usr/bin/env python3
import struct
from io import BytesIO
TAG_TYPE_METADATA = 18
##################################################
# AMF0
##################################################
AMF_TYPE_NUMBER = 0x00
AMF_TYPE_BOOLEAN = 0x01
AMF_TYPE_STRING = 0x02
AMF_TYPE_OBJECT = 0x03
AMF_TYPE_MOVIECLIP = 0x04
AMF_TYPE_NULL = 0x05
AMF_TYPE_UNDEFINED = 0x06
AMF_TYPE_REFERENCE = 0x07
AMF_TYPE_MIXED_ARRAY = 0x08
AMF_TYPE_END_OF_OBJECT = 0x09
AMF_TYPE_ARRAY = 0x0A
AMF_TYPE_DATE = 0x0B
AMF_TYPE_LONG_STRING = 0x0C
AMF_TYPE_UNSUPPORTED = 0x0D
AMF_TYPE_RECORDSET = 0x0E
AMF_TYPE_XML = 0x0F
AMF_TYPE_CLASS_OBJECT = 0x10
AMF_TYPE_AMF3_OBJECT = 0x11
class ECMAObject:
    """An AMF0 ECMA ("mixed") array as found in FLV metadata tags:
    insertion-ordered (key, value) pairs plus a dict for fast lookup."""
    def __init__(self, max_number):
        self.max_number = max_number  # declared element count from the stream
        self.data = []                # ordered (key, value) pairs
        self.map = {}                 # key -> value mirror for O(1) get()
    def put(self, k, v):
        """Append a new key/value pair."""
        self.data.append((k, v))
        self.map[k] = v
    def get(self, k):
        """Return the value for key k; raises KeyError if absent."""
        return self.map[k]
    def set(self, k, v):
        """Replace the value of an existing key k; raises KeyError if absent."""
        for index, (key, _) in enumerate(self.data):
            if key == k:
                self.data[index] = (k, v)
                break
        else:
            raise KeyError(k)
        self.map[k] = v
    def keys(self):
        return self.map.keys()
    def __str__(self):
        return 'ECMAObject<' + repr(self.map) + '>'
    def __eq__(self, other):
        # Equality compares declared size and the ordered pairs.
        return self.max_number == other.max_number and self.data == other.data
def read_amf_number(stream):
    """Read an AMF0 number payload: a big-endian IEEE-754 double."""
    (value,) = struct.unpack('>d', stream.read(8))
    return value
def read_amf_boolean(stream):
    """Read an AMF0 boolean payload (one byte, required to be 0 or 1)."""
    flag = read_byte(stream)
    assert flag in (0, 1)
    return flag == 1
def read_amf_string(stream):
    """Read an AMF0 string payload: big-endian u16 length, then UTF-8.

    Returns None at a truncated end of stream (dirty fix kept for
    invalid Qiyi flv files).
    """
    length_bytes = stream.read(2)
    if length_bytes == b'':
        return None
    (length,) = struct.unpack('>H', length_bytes)
    payload = stream.read(length)
    assert len(payload) == length
    return payload.decode('utf-8')
def read_amf_object(stream):
    """Read an AMF0 object: (string key, value) pairs until the empty
    key followed by the end-of-object marker."""
    obj = {}
    while True:
        k = read_amf_string(stream)
        if not k:
            # Empty key terminates the object; consume the end marker.
            assert read_byte(stream) == AMF_TYPE_END_OF_OBJECT
            break
        v = read_amf(stream)
        obj[k] = v
    return obj
def read_amf_mixed_array(stream):
    """Read an AMF0 ECMA array: a u32 declared count, then key/value
    pairs terminated like an object; returns an ECMAObject."""
    max_number = read_uint(stream)
    mixed_results = ECMAObject(max_number)
    while True:
        k = read_amf_string(stream)
        if k is None:
            # dirty fix for the invalid Qiyi flv
            break
        if not k:
            # Empty key terminates the array; consume the end marker.
            assert read_byte(stream) == AMF_TYPE_END_OF_OBJECT
            break
        v = read_amf(stream)
        mixed_results.put(k, v)
    assert len(mixed_results.data) == max_number
    return mixed_results
def read_amf_array(stream):
    """Read an AMF0 strict array: a u32 count, then that many values."""
    count = read_uint(stream)
    return [read_amf(stream) for _ in range(count)]
# Dispatch table: AMF0 type tag -> reader for that type's payload.
amf_readers = {
    AMF_TYPE_NUMBER: read_amf_number,
    AMF_TYPE_BOOLEAN: read_amf_boolean,
    AMF_TYPE_STRING: read_amf_string,
    AMF_TYPE_OBJECT: read_amf_object,
    AMF_TYPE_MIXED_ARRAY: read_amf_mixed_array,
    AMF_TYPE_ARRAY: read_amf_array,
}

def read_amf(stream):
    # One AMF0 value = a type-tag byte followed by its payload.
    return amf_readers[read_byte(stream)](stream)
def write_amf_number(stream, v):
    """Write an AMF0 number payload: a big-endian IEEE-754 double."""
    encoded = struct.pack('>d', v)
    stream.write(encoded)
def write_amf_boolean(stream, v):
    """Write an AMF0 boolean payload: one byte, 1 for truthy v else 0."""
    stream.write(b'\x01' if v else b'\x00')
def write_amf_string(stream, s):
    """Write an AMF0 string payload: u16 byte length, then UTF-8 bytes."""
    encoded = s.encode('utf-8')
    stream.write(struct.pack('>H', len(encoded)) + encoded)
def write_amf_object(stream, o):
    """Write dict o as an AMF0 object, terminated by the empty key plus
    the end-of-object marker."""
    for k in o:
        write_amf_string(stream, k)
        write_amf(stream, o[k])
    write_amf_string(stream, '')
    write_byte(stream, AMF_TYPE_END_OF_OBJECT)
def write_amf_mixed_array(stream, o):
    """Write an ECMAObject o as an AMF0 ECMA array: u32 declared count,
    each pair in insertion order, then the object terminator."""
    write_uint(stream, o.max_number)
    for k, v in o.data:
        write_amf_string(stream, k)
        write_amf(stream, v)
    write_amf_string(stream, '')
    write_byte(stream, AMF_TYPE_END_OF_OBJECT)
def write_amf_array(stream, o):
    """Write an AMF0 strict array payload: u32 count, then each value."""
    write_uint(stream, len(o))
    for item in o:
        write_amf(stream, item)
# Python type -> AMF0 type tag used when serializing a value.
amf_writers_tags = {
    float: AMF_TYPE_NUMBER,
    bool: AMF_TYPE_BOOLEAN,
    str: AMF_TYPE_STRING,
    dict: AMF_TYPE_OBJECT,
    ECMAObject: AMF_TYPE_MIXED_ARRAY,
    list: AMF_TYPE_ARRAY,
}

# AMF0 type tag -> writer for that type's payload.
amf_writers = {
    AMF_TYPE_NUMBER: write_amf_number,
    AMF_TYPE_BOOLEAN: write_amf_boolean,
    AMF_TYPE_STRING: write_amf_string,
    AMF_TYPE_OBJECT: write_amf_object,
    AMF_TYPE_MIXED_ARRAY: write_amf_mixed_array,
    AMF_TYPE_ARRAY: write_amf_array,
}
def write_amf(stream, v):
    """Write one AMF0 value: its type-tag byte, then the payload.

    ECMAObject is special-cased via isinstance so subclasses are still
    serialized as mixed arrays; other types are looked up exactly.
    """
    if isinstance(v, ECMAObject):
        tag = amf_writers_tags[ECMAObject]
    else:
        tag = amf_writers_tags[type(v)]
    write_byte(stream, tag)
    amf_writers[tag](stream, v)
##################################################
# FLV
##################################################
def read_int(stream):
    """Read a big-endian signed 32-bit integer."""
    (value,) = struct.unpack('>i', stream.read(4))
    return value
def read_uint(stream):
    """Read a big-endian unsigned 32-bit integer."""
    (value,) = struct.unpack('>I', stream.read(4))
    return value
def write_uint(stream, n):
    """Write n as a big-endian unsigned 32-bit integer."""
    stream.write(n.to_bytes(4, 'big'))
def read_byte(stream):
    """Read one byte and return it as an int in 0-255."""
    return stream.read(1)[0]
def write_byte(stream, b):
    """Write a single byte given as an int in 0-255."""
    stream.write(struct.pack('B', b))
def read_unsigned_medium_int(stream):
    """Read a big-endian unsigned 24-bit integer."""
    hi, mid, lo = struct.unpack('BBB', stream.read(3))
    return (hi << 16) | (mid << 8) | lo
def read_tag(stream):
    """Read one FLV tag.

    Header layout (15 bytes total): previous tag size (uint32), tag type
    (byte), body size (uint24), timestamp (uint24 + 1 extended high byte),
    stream id (3 zero bytes); the body follows.

    Returns (data_type, timestamp, body_size, body, previous_tag_size), or
    None at end of stream. Fix: an FLV normally ends with a trailing 4-byte
    previous-tag-size, but the original only treated a short read of
    exactly 4 bytes as EOF and crashed in struct.unpack on any other
    truncation; any read shorter than 15 bytes now means EOF.
    """
    header = stream.read(15)
    if len(header) < 15:
        # normally exactly 4 bytes remain (the file's final previous-tag-size)
        return None
    x = struct.unpack('>IBBBBBBBBBBB', header)
    previous_tag_size = x[0]
    data_type = x[1]
    body_size = (x[2] << 16) | (x[3] << 8) | x[4]
    assert body_size < 1024 * 1024 * 128, 'tag body size too big (> 128MB)'
    timestamp = (x[5] << 16) | (x[6] << 8) | x[7]
    # the extended timestamp byte is the high-order byte
    timestamp += x[8] << 24
    # stream id must be zero
    assert x[9:] == (0, 0, 0)
    body = stream.read(body_size)
    return (data_type, timestamp, body_size, body, previous_tag_size)
def write_tag(stream, tag):
    """Write one FLV tag in the layout read_tag expects."""
    data_type, timestamp, body_size, body, previous_tag_size = tag
    write_uint(stream, previous_tag_size)
    write_byte(stream, data_type)
    # 24-bit body size, most significant byte first
    for shift in (16, 8, 0):
        write_byte(stream, (body_size >> shift) & 0xff)
    # 24-bit timestamp, then the extended high-order byte
    for shift in (16, 8, 0):
        write_byte(stream, (timestamp >> shift) & 0xff)
    write_byte(stream, (timestamp >> 24) & 0xff)
    # stream id is always zero
    stream.write(b'\0\0\0')
    stream.write(body)
def read_flv_header(stream):
    """Consume and validate the fixed 9-byte FLV file header."""
    assert stream.read(3) == b'FLV'
    assert read_byte(stream) == 1   # FLV version
    assert read_byte(stream) == 5   # type flags: audio + video present
    assert read_uint(stream) == 9   # data offset is fixed at 9
def write_flv_header(stream):
    """Write the fixed 9-byte FLV header (version 1, audio + video)."""
    # signature, version=1, flags=5, data offset=9 — all constant
    stream.write(b'FLV\x01\x05\x00\x00\x00\x09')
def read_meta_data(stream):
    """Read a script-data body: the event name AMF value, then the payload."""
    event = read_amf(stream)
    payload = read_amf(stream)
    return event, payload
def read_meta_tag(tag):
    """Validate a metadata tag tuple and parse its AMF body."""
    data_type, timestamp, _body_size, body, previous_tag_size = tag
    assert data_type == TAG_TYPE_METADATA
    assert timestamp == 0
    assert previous_tag_size == 0
    return read_meta_data(BytesIO(body))
#def write_meta_data(stream, meta_type, meta_data):
#    assert isinstance(meta_type, basestring)
#    write_amf(meta_type)
#    write_amf(meta_data)
def write_meta_tag(stream, meta_type, meta_data):
    """Serialize meta_type/meta_data to AMF and emit them as a metadata tag."""
    payload = BytesIO()
    write_amf(payload, meta_type)
    write_amf(payload, meta_data)
    body = payload.getvalue()
    write_tag(stream, (TAG_TYPE_METADATA, 0, len(body), body, 0))
##################################################
# main
##################################################
def guess_output(inputs):
    """Guess an output name from the longest common prefix of the inputs.

    Returns '<common prefix>.flv', or 'output.flv' when the basenames share
    no prefix. Bug fix: the original kept `inputs` as a map() iterator;
    min(map(len, inputs)) exhausted it, so the prefix scan always saw an
    empty sequence and `inputs[0]` raised TypeError under Python 3.
    Materialize the basenames as a list first.
    """
    import os.path
    names = [os.path.basename(p) for p in inputs]
    shortest = min(len(name) for name in names)
    for i in reversed(range(1, shortest)):
        if len(set(name[:i] for name in names)) == 1:
            return names[0][:i] + '.flv'
    return 'output.flv'
def concat_flvs(flvs, output = None):
    """Concatenate FLV part files into a single FLV.

    flvs: non-empty sequence of input file paths.
    output: target path, or a directory (name is guessed), or None (guessed).
    Returns the path of the merged file.
    """
    assert flvs, 'no flv file found'
    import os.path
    if not output:
        output = guess_output(flvs)
    elif os.path.isdir(output):
        output = os.path.join(output, guess_output(flvs))
    print('Merging video parts...')
    # NOTE(review): input and output file objects are never closed here;
    # this relies on interpreter cleanup.
    ins = [open(flv, 'rb') for flv in flvs]
    for stream in ins:
        read_flv_header(stream)
    # the first tag of each part must be its metadata tag
    meta_tags = map(read_tag, ins)
    metas = list(map(read_meta_tag, meta_tags))
    meta_types, metas = zip(*metas)
    assert len(set(meta_types)) == 1
    meta_type = meta_types[0]
    # must merge fields: duration
    # TODO: check other meta info, update other meta info
    total_duration = sum(meta.get('duration') for meta in metas)
    meta_data = metas[0]
    meta_data.set('duration', total_duration)
    out = open(output, 'wb')
    write_flv_header(out)
    write_meta_tag(out, meta_type, meta_data)
    timestamp_start = 0
    for stream in ins:
        while True:
            tag = read_tag(stream)
            if tag:
                data_type, timestamp, body_size, body, previous_tag_size = tag
                # shift this part's timestamps so they continue from the
                # previous part's last timestamp
                timestamp += timestamp_start
                tag = data_type, timestamp, body_size, body, previous_tag_size
                write_tag(out, tag)
            else:
                break
        # next part resumes at the last timestamp written for this part
        timestamp_start = timestamp
    # trailing previous-tag-size for the final tag
    write_uint(out, previous_tag_size)
    return output
def usage():
    """Print command-line usage for merge_flv."""
    print('Usage: [python3] merge_flv.py --output TARGET.flv flv...')
def main():
    """Parse command-line options and merge the given FLV files."""
    import sys, getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:", ["help", "output="])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    output = None
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-o", "--output"):
            output = arg
        else:
            # getopt should never hand back an unknown option, but be safe
            usage()
            sys.exit(1)
    if not args:
        usage()
        sys.exit(1)
    concat_flvs(args, output)
# Allow running this module as a standalone command-line script.
if __name__ == '__main__':
    main()

907
merge_mp4.py Executable file
View File

@ -0,0 +1,907 @@
#!/usr/bin/env python3
# reference: c041828_ISO_IEC_14496-12_2005(E).pdf
##################################################
# reader and writer
##################################################
import struct
from io import BytesIO
def skip(stream, n):
    """Advance the stream position by n bytes without returning any data."""
    position = stream.tell()
    stream.seek(position + n)
def skip_zeros(stream, n):
    """Consume n bytes and assert every one of them is zero."""
    chunk = stream.read(n)
    assert chunk == b'\x00' * n
def read_int(stream):
    """Read a big-endian signed 32-bit integer."""
    (value,) = struct.unpack('>i', stream.read(4))
    return value
def read_uint(stream):
    """Read a big-endian unsigned 32-bit integer."""
    (value,) = struct.unpack('>I', stream.read(4))
    return value
def write_uint(stream, n):
    """Write n as a big-endian unsigned 32-bit integer."""
    stream.write(n.to_bytes(4, 'big'))
def read_ushort(stream):
    """Read a big-endian unsigned 16-bit integer."""
    (value,) = struct.unpack('>H', stream.read(2))
    return value
def read_ulong(stream):
    """Read a big-endian unsigned 64-bit integer."""
    (value,) = struct.unpack('>Q', stream.read(8))
    return value
def read_byte(stream):
    """Read one byte and return it as an int in 0-255."""
    return stream.read(1)[0]
def copy_stream(source, target, n):
    """Copy exactly n bytes from source to target in 1 MiB chunks."""
    chunk_size = 1024 * 1024
    remaining = n
    while remaining > 0:
        step = min(chunk_size, remaining)
        data = source.read(step)
        assert len(data) == step, 'no enough data'
        target.write(data)
        remaining -= step
class Atom:
    """An MP4 box: 32-bit size, 4-byte type code, raw payload bytes.

    `size` always includes the 8-byte header, so for raw atoms
    size == 8 + len(body). Subclasses override `body`'s meaning.
    """
    def __init__(self, type, size, body):
        assert len(type) == 4
        self.type = type    # 4-byte box type code, e.g. b'moov'
        self.size = size    # total box size including the 8-byte header
        self.body = body    # raw payload bytes (subclasses store other shapes)
    def __str__(self):
        #return '<Atom(%s):%s>' % (self.type, repr(self.body))
        return '<Atom(%s):%s>' % (self.type, '')
    def __repr__(self):
        return str(self)
    def write1(self, stream):
        # emit the common 8-byte header: size then type
        write_uint(stream, self.size)
        stream.write(self.type)
    def write(self, stream):
        # raw atoms: header plus cached body; size must match exactly
        assert type(self.body) == bytes, '%s: %s' % (self.type, type(self.body))
        assert self.size == 8 + len(self.body)
        self.write1(stream)
        stream.write(self.body)
    def calsize(self):
        # raw atoms never change size; container subclasses recompute
        return self.size
class CompositeAtom(Atom):
    """A container box whose body is a list of child atoms."""
    def __init__(self, type, size, body):
        assert isinstance(body, list)
        Atom.__init__(self, type, size, body)
    def write(self, stream):
        # header, then each child in order
        assert type(self.body) == list
        self.write1(stream)
        for atom in self.body:
            atom.write(stream)
    def calsize(self):
        # recompute recursively: header plus the sum of child sizes
        self.size = 8 + sum(atom.calsize() for atom in self.body)
        return self.size
    def get1(self, k):
        """Return the first direct child with type code k.

        Bug fix: k is a bytes type code (callers pass b'trak' etc.); the
        original raised TypeError from 'atom not found: ' + k (str + bytes)
        instead of the intended Exception. Format the message instead.
        """
        for atom in self.body:
            if atom.type == k:
                return atom
        raise Exception('atom not found: %s' % k)
    def get(self, *keys):
        """Descend through nested containers following the type codes in keys."""
        atom = self
        for k in keys:
            atom = atom.get1(k)
        return atom
    def get_all(self, k):
        """Return all direct children with type code k."""
        return [a for a in self.body if a.type == k]
class VariableAtom(Atom):
    """An atom cached as raw bytes plus named uint32 fields patched on write.

    variables: list of (name, byte_offset, value) tuples; on write, each
    field's 4 bytes within the cached body are replaced with the current
    value. Offsets are assumed sorted and non-overlapping.
    """
    def __init__(self, type, size, body, variables):
        assert isinstance(body, bytes)
        Atom.__init__(self, type, size, body)
        self.variables = variables
    def write(self, stream):
        self.write1(stream)
        i = 0  # position in the cached body we have copied up to
        n = 0  # bytes written so far, for the final sanity check
        # interleave raw body slices with the patched uint32 field values
        for name, offset, value in self.variables:
            stream.write(self.body[i:offset])
            write_uint(stream, value)
            n += offset - i + 4
            i = offset + 4
        stream.write(self.body[i:])
        n += len(self.body) - i
        # exactly len(body) bytes must follow the header
        assert n == len(self.body)
    def get(self, k):
        # current value of field k
        for v in self.variables:
            if v[0] == k:
                return v[2]
        else:
            raise Exception('field not found: ' + k)
    def set(self, k, v):
        # replace field k's value (tuples are immutable, so rebuild the entry)
        for i in range(len(self.variables)):
            variable = self.variables[i]
            if variable[0] == k:
                self.variables[i] = (k, variable[1], v)
                break
        else:
            raise Exception('field not found: '+k)
def read_raw(stream, size, left, type):
    """Slurp the remaining body bytes and keep them as an opaque Atom."""
    assert size == left + 8
    return Atom(type, size, stream.read(left))
def read_body_stream(stream, left):
    """Read `left` bytes and return (bytes, in-memory stream over them)."""
    data = stream.read(left)
    assert len(data) == left
    return data, BytesIO(data)
def read_full_atom(stream):
    """Read a 'full box' version/flags word; only version 0 is supported."""
    value = read_uint(stream)
    assert value >> 24 == 0  # version byte
    return value
def read_mvhd(stream, size, left, type):
    """Parse the movie header box, exposing 'duration' as a patchable field.

    Duration sits at byte offset 16 of the body (after version/flags,
    creation/modification times and time scale), hence the VariableAtom.
    """
    body, stream = read_body_stream(stream, left)
    value = read_full_atom(stream)
    left -= 4
    # new Date(movieTime * 1000 - 2082850791998L);
    creation_time = read_uint(stream)
    modification_time = read_uint(stream)
    time_scale = read_uint(stream)
    duration = read_uint(stream)
    left -= 16
    qt_preferred_fate = read_uint(stream)  # NOTE(review): presumably "rate"
    qt_preferred_volume = read_ushort(stream)
    assert stream.read(10) == b'\x00' * 10
    # 3x3 transformation matrix
    qt_matrixA = read_uint(stream)
    qt_matrixB = read_uint(stream)
    qt_matrixU = read_uint(stream)
    qt_matrixC = read_uint(stream)
    qt_matrixD = read_uint(stream)
    qt_matrixV = read_uint(stream)
    qt_matrixX = read_uint(stream)
    qt_matrixY = read_uint(stream)
    qt_matrixW = read_uint(stream)
    qt_previewTime = read_uint(stream)
    qt_previewDuration = read_uint(stream)
    qt_posterTime = read_uint(stream)
    qt_selectionTime = read_uint(stream)
    qt_selectionDuration = read_uint(stream)
    qt_currentTime = read_uint(stream)
    nextTrackID = read_uint(stream)
    left -= 80
    assert left == 0
    return VariableAtom(b'mvhd', size, body, [('duration', 16, duration)])
def read_tkhd(stream, size, left, type):
    """Parse the track header box, exposing 'duration' as a patchable field.

    Duration sits at byte offset 20 of the body (after version/flags,
    creation/modification times, track id and 4 reserved bytes).
    """
    body, stream = read_body_stream(stream, left)
    value = read_full_atom(stream)
    left -= 4
    # new Date(movieTime * 1000 - 2082850791998L);
    creation_time = read_uint(stream)
    modification_time = read_uint(stream)
    track_id = read_uint(stream)
    assert stream.read(4) == b'\x00' * 4
    duration = read_uint(stream)
    left -= 20
    assert stream.read(8) == b'\x00' * 8
    qt_layer = read_ushort(stream)
    qt_alternate_group = read_ushort(stream)
    qt_volume = read_ushort(stream)
    assert stream.read(2) == b'\x00\x00'
    # 3x3 transformation matrix
    qt_matrixA = read_uint(stream)
    qt_matrixB = read_uint(stream)
    qt_matrixU = read_uint(stream)
    qt_matrixC = read_uint(stream)
    qt_matrixD = read_uint(stream)
    qt_matrixV = read_uint(stream)
    qt_matrixX = read_uint(stream)
    qt_matrixY = read_uint(stream)
    qt_matrixW = read_uint(stream)
    # track width/height are 16.16 fixed point; keep the integer part
    qt_track_width = read_uint(stream)
    width = qt_track_width >> 16
    qt_track_height = read_uint(stream)
    height = qt_track_height >> 16
    left -= 60
    assert left == 0
    return VariableAtom(b'tkhd', size, body, [('duration', 20, duration)])
def read_mdhd(stream, size, left, type):
    """Parse the media header box, exposing 'duration' as a patchable field.

    Duration sits at byte offset 16 of the body, same layout as mvhd.
    """
    body, stream = read_body_stream(stream, left)
    value = read_full_atom(stream)
    left -= 4
    # new Date(movieTime * 1000 - 2082850791998L);
    creation_time = read_uint(stream)
    modification_time = read_uint(stream)
    time_scale = read_uint(stream)
    duration = read_uint(stream)
    left -= 16
    packed_language = read_ushort(stream)
    qt_quality = read_ushort(stream)
    left -= 4
    assert left == 0
    return VariableAtom(b'mdhd', size, body, [('duration', 16, duration)])
def read_hdlr(stream, size, left, type):
    """Parse the handler box; the body is validated but kept as raw bytes."""
    body, stream = read_body_stream(stream, left)
    value = read_full_atom(stream)
    left -= 4
    qt_component_type = read_uint(stream)
    handler_type = read_uint(stream)
    qt_component_manufacturer = read_uint(stream)
    qt_component_flags = read_uint(stream)
    qt_component_flags_mask = read_uint(stream)
    left -= 20
    # the rest is the NUL-terminated handler name
    track_name = stream.read(left - 1)
    assert stream.read(1) == b'\x00'
    return Atom(b'hdlr', size, body)
def read_vmhd(stream, size, left, type):
    """Parse the video media header box; body is validated, kept raw."""
    body, stream = read_body_stream(stream, left)
    value = read_full_atom(stream)
    left -= 4
    assert left == 8
    graphic_mode = read_ushort(stream)
    op_color_read = read_ushort(stream)  # NOTE(review): presumably "red"
    op_color_green = read_ushort(stream)
    op_color_blue = read_ushort(stream)
    return Atom(b'vmhd', size, body)
def read_stsd(stream, size, left, type):
    """Parse the sample description box: version/flags, an entry count, then
    that many codec-specific child atoms (e.g. avc1, mp4a)."""
    value = read_full_atom(stream)
    left -= 4
    entry_count = read_uint(stream)
    left -= 4
    children = []
    for i in range(entry_count):
        atom = read_atom(stream)
        children.append(atom)
        left -= atom.size
    assert left == 0
    #return Atom('stsd', size, children)
    class stsd_atom(Atom):
        # body is (version/flags word, list of child atoms)
        def __init__(self, type, size, body):
            Atom.__init__(self, type, size, body)
        def write(self, stream):
            self.write1(stream)
            write_uint(stream, self.body[0])
            write_uint(stream, len(self.body[1]))
            for atom in self.body[1]:
                atom.write(stream)
        def calsize(self):
            oldsize = self.size # TODO: remove
            self.size = 8 + 4 + 4 + sum([atom.calsize() for atom in self.body[1]])
            assert oldsize == self.size, '%s: %d, %d' % (self.type, oldsize, self.size) # TODO: remove
            return self.size
    return stsd_atom(b'stsd', size, (value, children))
def read_avc1(stream, size, left, type):
    """Parse an AVC sample entry; fields are validated, body kept raw."""
    body, stream = read_body_stream(stream, left)
    skip_zeros(stream, 6)
    data_reference_index = read_ushort(stream)
    skip_zeros(stream, 2)
    skip_zeros(stream, 2)
    skip_zeros(stream, 12)
    width = read_ushort(stream)
    height = read_ushort(stream)
    # resolutions are 16.16 fixed point; keep the integer part
    horizontal_rez = read_uint(stream) >> 16
    vertical_rez = read_uint(stream) >> 16
    assert stream.read(4) == b'\x00' * 4
    frame_count = read_ushort(stream)
    # compressor name: Pascal-style length byte plus 31 bytes of storage
    string_len = read_byte(stream)
    compressor_name = stream.read(31)
    depth = read_ushort(stream)
    assert stream.read(2) == b'\xff\xff'
    left -= 78
    child = read_atom(stream)
    assert child.type in (b'avcC', b'pasp'), 'if the sub atom is not avcC or pasp (actual %s), you should not cache raw body' % child.type
    left -= child.size
    stream.read(left) # XXX
    return Atom(b'avc1', size, body)
def read_avcC(stream, size, left, type):
    """Discard the AVC decoder configuration; keep only the box shell."""
    stream.read(left)
    return Atom(b'avcC', size, None)
def read_stts(stream, size, left, type):
    """Parse the time-to-sample box as (count, duration) entries.

    Only single-entry tables are supported (constant frame duration),
    which is what the merge step relies on.
    """
    value = read_full_atom(stream)
    left -= 4
    entry_count = read_uint(stream)
    assert entry_count == 1
    left -= 4
    samples = []
    for i in range(entry_count):
        sample_count = read_uint(stream)
        sample_duration = read_uint(stream)
        samples.append((sample_count, sample_duration))
        left -= 8
    assert left == 0
    #return Atom('stts', size, None)
    class stts_atom(Atom):
        # body is (version/flags word, [(sample_count, sample_duration), ...])
        def __init__(self, type, size, body):
            Atom.__init__(self, type, size, body)
        def write(self, stream):
            self.write1(stream)
            write_uint(stream, self.body[0])
            write_uint(stream, len(self.body[1]))
            for sample_count, sample_duration in self.body[1]:
                write_uint(stream, sample_count)
                write_uint(stream, sample_duration)
        def calsize(self):
            oldsize = self.size # TODO: remove
            self.size = 8 + 4 + 4 + len(self.body[1]) * 8
            assert oldsize == self.size, '%s: %d, %d' % (self.type, oldsize, self.size) # TODO: remove
            return self.size
    return stts_atom(b'stts', size, (value, samples))
def read_stss(stream, size, left, type):
    """Parse the sync-sample box: the list of random-access sample numbers."""
    value = read_full_atom(stream)
    left -= 4
    entry_count = read_uint(stream)
    left -= 4
    samples = []
    for i in range(entry_count):
        sample = read_uint(stream)
        samples.append(sample)
        left -= 4
    assert left == 0
    #return Atom('stss', size, None)
    class stss_atom(Atom):
        # body is (version/flags word, [sample_number, ...])
        def __init__(self, type, size, body):
            Atom.__init__(self, type, size, body)
        def write(self, stream):
            self.write1(stream)
            write_uint(stream, self.body[0])
            write_uint(stream, len(self.body[1]))
            for sample in self.body[1]:
                write_uint(stream, sample)
        def calsize(self):
            self.size = 8 + 4 + 4 + len(self.body[1]) * 4
            return self.size
    return stss_atom(b'stss', size, (value, samples))
def read_stsc(stream, size, left, type):
    """Parse the sample-to-chunk box as (first_chunk, samples_per_chunk,
    sample_description_index) entries."""
    value = read_full_atom(stream)
    left -= 4
    entry_count = read_uint(stream)
    left -= 4
    chunks = []
    for i in range(entry_count):
        first_chunk = read_uint(stream)
        samples_per_chunk = read_uint(stream)
        sample_description_index = read_uint(stream)
        assert sample_description_index == 1 # what is it?
        chunks.append((first_chunk, samples_per_chunk, sample_description_index))
        left -= 12
    #chunks, samples = zip(*chunks)
    #total = 0
    #for c, s in zip(chunks[1:], samples):
    #	total += c*s
    #print 'total', total
    assert left == 0
    #return Atom('stsc', size, None)
    class stsc_atom(Atom):
        # body is (version/flags word, [(first_chunk, samples, desc_index), ...])
        def __init__(self, type, size, body):
            Atom.__init__(self, type, size, body)
        def write(self, stream):
            self.write1(stream)
            write_uint(stream, self.body[0])
            write_uint(stream, len(self.body[1]))
            for first_chunk, samples_per_chunk, sample_description_index in self.body[1]:
                write_uint(stream, first_chunk)
                write_uint(stream, samples_per_chunk)
                write_uint(stream, sample_description_index)
        def calsize(self):
            self.size = 8 + 4 + 4 + len(self.body[1]) * 12
            return self.size
    return stsc_atom(b'stsc', size, (value, chunks))
def read_stsz(stream, size, left, type):
    """Parse the sample-size box; only per-sample size tables are supported
    (uniform sample_size == 0 is asserted, so the entry list is always read)."""
    value = read_full_atom(stream)
    left -= 4
    sample_size = read_uint(stream)
    sample_count = read_uint(stream)
    left -= 8
    assert sample_size == 0
    total = 0
    sizes = []
    if sample_size == 0:
        for i in range(sample_count):
            entry_size = read_uint(stream)
            sizes.append(entry_size)
            total += entry_size
            left -= 4
    assert left == 0
    #return Atom('stsz', size, None)
    class stsz_atom(Atom):
        # body is (version/flags word, sample_size, sample_count, [entry_size, ...])
        def __init__(self, type, size, body):
            Atom.__init__(self, type, size, body)
        def write(self, stream):
            self.write1(stream)
            write_uint(stream, self.body[0])
            write_uint(stream, self.body[1])
            write_uint(stream, self.body[2])
            for entry_size in self.body[3]:
                write_uint(stream, entry_size)
        def calsize(self):
            self.size = 8 + 4 + 8 + len(self.body[3]) * 4
            return self.size
    return stsz_atom(b'stsz', size, (value, sample_size, sample_count, sizes))
def read_stco(stream, size, left, type):
    """Parse the chunk-offset box: 32-bit absolute file offsets per chunk."""
    value = read_full_atom(stream)
    left -= 4
    entry_count = read_uint(stream)
    left -= 4
    offsets = []
    for i in range(entry_count):
        chunk_offset = read_uint(stream)
        offsets.append(chunk_offset)
        left -= 4
    assert left == 0
    #return Atom('stco', size, None)
    class stco_atom(Atom):
        # body is (version/flags word, [chunk_offset, ...])
        def __init__(self, type, size, body):
            Atom.__init__(self, type, size, body)
        def write(self, stream):
            self.write1(stream)
            write_uint(stream, self.body[0])
            write_uint(stream, len(self.body[1]))
            for chunk_offset in self.body[1]:
                write_uint(stream, chunk_offset)
        def calsize(self):
            self.size = 8 + 4 + 4 + len(self.body[1]) * 4
            return self.size
    return stco_atom(b'stco', size, (value, offsets))
def read_ctts(stream, size, left, type):
    """Parse the composition-time-to-sample box as (count, offset) entries."""
    value = read_full_atom(stream)
    left -= 4
    entry_count = read_uint(stream)
    left -= 4
    samples = []
    for i in range(entry_count):
        sample_count = read_uint(stream)
        sample_offset = read_uint(stream)
        samples.append((sample_count, sample_offset))
        left -= 8
    assert left == 0
    class ctts_atom(Atom):
        # body is (version/flags word, [(sample_count, sample_offset), ...])
        def __init__(self, type, size, body):
            Atom.__init__(self, type, size, body)
        def write(self, stream):
            self.write1(stream)
            write_uint(stream, self.body[0])
            write_uint(stream, len(self.body[1]))
            for sample_count, sample_offset in self.body[1]:
                write_uint(stream, sample_count)
                write_uint(stream, sample_offset)
        def calsize(self):
            self.size = 8 + 4 + 4 + len(self.body[1]) * 8
            return self.size
    return ctts_atom(b'ctts', size, (value, samples))
def read_smhd(stream, size, left, type):
    """Parse the sound media header box; body is validated, kept raw."""
    body, stream = read_body_stream(stream, left)
    value = read_full_atom(stream)
    left -= 4
    balance = read_ushort(stream)
    assert stream.read(2) == b'\x00\x00'
    left -= 4
    assert left == 0
    return Atom(b'smhd', size, body)
def read_mp4a(stream, size, left, type):
    """Parse an AAC sample entry (including its esds child); body kept raw."""
    body, stream = read_body_stream(stream, left)
    assert stream.read(6) == b'\x00' * 6
    data_reference_index = read_ushort(stream)
    assert stream.read(8) == b'\x00' * 8
    channel_count = read_ushort(stream)
    sample_size = read_ushort(stream)
    assert stream.read(4) == b'\x00' * 4
    time_scale = read_ushort(stream)
    assert stream.read(2) == b'\x00' * 2
    left -= 28
    # the single child must be the elementary stream descriptor
    atom = read_atom(stream)
    assert atom.type == b'esds'
    left -= atom.size
    assert left == 0
    return Atom(b'mp4a', size, body)
def read_descriptor(stream):
    """MPEG-4 descriptor parsing is not implemented; the tag byte is consumed."""
    read_byte(stream)
    raise NotImplementedError()
def read_esds(stream, size, left, type):
    """Skip the elementary stream descriptor; the payload is not retained."""
    value = read_uint(stream)
    assert value >> 24 == 0  # version
    left -= 4
    stream.read(left)  # descriptor bytes, ignored
    return Atom(b'esds', size, None)
def read_composite_atom(stream, size, left, type):
    """Parse a pure container box by reading children until the body is consumed."""
    children = []
    remaining = left
    while remaining > 0:
        child = read_atom(stream)
        children.append(child)
        remaining -= child.size
    assert remaining == 0, remaining
    return CompositeAtom(type, size, children)
def read_mdat(stream, size, left, type):
    """Skip the media payload, remembering (stream, offset, size) so the
    bytes can be copied straight from the source file at write time.

    The returned atom therefore keeps the source stream alive: it must
    still be open when write()/write2() is called.
    """
    source_start = stream.tell()
    source_size = left
    skip(stream, left)
    #return Atom(type, size, None)
    #raise NotImplementedError()
    class mdat_atom(Atom):
        # body is (source stream, payload start offset, payload size)
        def __init__(self, type, size, body):
            Atom.__init__(self, type, size, body)
        def write(self, stream):
            self.write1(stream)
            self.write2(stream)
        def write2(self, stream):
            source, source_start, source_size = self.body
            # NOTE(review): the original position is saved but never restored
            original = source.tell()
            source.seek(source_start)
            copy_stream(source, stream, source_size)
        def calsize(self):
            return self.size
    return mdat_atom(b'mdat', size, (stream, source_start, source_size))
# Dispatch table: box type code -> parser. The comments note what the merge
# step needs from each box.
atom_readers = {
    b'mvhd': read_mvhd, # merge duration
    b'tkhd': read_tkhd, # merge duration
    b'mdhd': read_mdhd, # merge duration
    b'hdlr': read_hdlr, # nothing
    b'vmhd': read_vmhd, # nothing
    b'stsd': read_stsd, # nothing
    b'avc1': read_avc1, # nothing
    b'avcC': read_avcC, # nothing
    b'stts': read_stts, # sample_count, sample_duration
    b'stss': read_stss, # join indexes
    b'stsc': read_stsc, # merge # sample numbers
    b'stsz': read_stsz, # merge # samples
    b'stco': read_stco, # merge # chunk offsets
    b'ctts': read_ctts, # merge
    b'smhd': read_smhd, # nothing
    b'mp4a': read_mp4a, # nothing
    b'esds': read_esds, # nothing
    b'ftyp': read_raw,
    b'yqoo': read_raw,
    b'moov': read_composite_atom,
    b'trak': read_composite_atom,
    b'mdia': read_composite_atom,
    b'minf': read_composite_atom,
    b'dinf': read_composite_atom,
    b'stbl': read_composite_atom,
    b'iods': read_raw,
    b'dref': read_raw,
    b'free': read_raw,
    b'edts': read_raw,
    b'pasp': read_raw,
    b'mdat': read_mdat,
}
#stsd sample descriptions (codec types, initialization etc.)
#stts (decoding) time-to-sample
#ctts (composition) time to sample
#stsc sample-to-chunk, partial data-offset information
#stsz sample sizes (framing)
#stz2 compact sample sizes (framing)
#stco chunk offset, partial data-offset information
#co64 64-bit chunk offset
#stss sync sample table (random access points)
#stsh shadow sync sample table
#padb sample padding bits
#stdp sample degradation priority
#sdtp independent and disposable samples
#sbgp sample-to-group
#sgpd sample group description
#subs sub-sample information
def read_atom(stream):
    """Read one atom: 8-byte header (uint32 size + 4-byte type), then a
    type-specific body via atom_readers.

    Returns None at end of stream. A size of 1 means the real 64-bit size
    follows the header; 'uuid' boxes and unknown types are rejected.
    """
    header = stream.read(8)
    if not header:
        return
    assert len(header) == 8
    n = 0
    size = struct.unpack('>I', header[:4])[0]
    assert size > 0
    n += 4
    type = header[4:8]
    n += 4
    assert type != b'uuid'
    if size == 1:
        # largesize: the actual 64-bit box size follows
        size = read_ulong(stream)
        n += 8
    left = size - n  # body bytes still to be consumed by the reader
    if type in atom_readers:
        return atom_readers[type](stream, size, left, type)
    raise NotImplementedError('%s: %d' % (type, left))
def write_atom(stream, atom):
    """Delegate serialization to the atom's own write method."""
    atom.write(stream)
def parse_atoms(stream):
    """Read atoms until end of stream and return them as a list."""
    atoms = []
    while True:
        atom = read_atom(stream)
        if atom is None:
            break
        atoms.append(atom)
    return atoms
def read_mp4(stream):
    """Parse a whole MP4 file; return (all atoms, the moov atom, the mdat atom).

    Exactly one moov and one mdat are required.
    """
    atoms = parse_atoms(stream)
    moovs = [a for a in atoms if a.type == b'moov']
    mdats = [a for a in atoms if a.type == b'mdat']
    assert len(moovs) == 1
    assert len(mdats) == 1
    return atoms, moovs[0], mdats[0]
##################################################
# merge
##################################################
def merge_stts(samples_list):
    """Merge single-entry stts tables: counts add up, durations must match."""
    entries = []
    for samples in samples_list:
        assert len(samples) == 1
        entries.append(samples[0])
    counts, durations = zip(*entries)
    assert len(set(durations)) == 1, 'not all durations equal'
    return [(sum(counts), durations[0])]
def merge_stss(samples, sample_number_list):
    """Concatenate sync-sample tables, offsetting each part's sample numbers
    by the cumulative sample count of the parts before it."""
    results = []
    base = 0
    for part, count in zip(samples, sample_number_list):
        results.extend(base + s for s in part)
        base += count
    return results
def merge_stsc(chunks_list, total_chunk_number_list):
    """Merge sample-to-chunk tables, renumbering chunk indices globally.

    Each entry's chunk span is inferred from the next entry's first_chunk
    (or from the part's total chunk count for the last entry).
    """
    results = []
    next_chunk = 1
    for chunks, total in zip(chunks_list, total_chunk_number_list):
        for i, (first_chunk, sample_number, description) in enumerate(chunks):
            if i + 1 < len(chunks):
                span = chunks[i + 1][0] - first_chunk
            else:
                # the last entry covers all remaining chunks of this part
                span = total + 1 - first_chunk
            results.append((next_chunk, sample_number, description))
            next_chunk += span
    return results
def merge_stco(offsets_list, mdats):
    """Rebase chunk offsets as if all mdat payloads were laid end to end,
    starting at zero. mdat.body[1] is that part's payload start offset."""
    base = 0
    results = []
    for offsets, mdat in zip(offsets_list, mdats):
        mdat_start = mdat.body[1]
        results.extend(base + off - mdat_start for off in offsets)
        base += mdat.size - 8  # payload length of this part's mdat
    return results
def merge_stsz(sizes_list):
    """Concatenate per-part sample-size tables into one flat list."""
    merged = []
    for sizes in sizes_list:
        merged.extend(sizes)
    return merged
def merge_mdats(mdats):
    """Build a virtual mdat atom that streams every source mdat's payload in turn."""
    total_size = 8 + sum(mdat.size - 8 for mdat in mdats)
    class multi_mdat_atom(Atom):
        def __init__(self, type, size, body):
            Atom.__init__(self, type, size, body)
        def write(self, stream):
            self.write1(stream)
            self.write2(stream)
        def write2(self, stream):
            # copy each source payload in order
            for mdat in self.body:
                mdat.write2(stream)
        def calsize(self):
            return self.size
    return multi_mdat_atom(b'mdat', total_size, mdats)
def merge_moov(moovs, mdats):
    """Merge all parts' moov boxes into the first one, in place, and return it.

    Durations are summed; the stts/stss/stsc/stco/stsz/ctts tables of both
    tracks are concatenated/rebased; finally all chunk offsets are shifted
    by the moov's size change so they still point into the merged mdat.

    Bug fix: the original rebuilt stco bodies as
    `stco_atom.body = stss_atom.body[0], stco...` (copy-paste), pairing the
    chunk-offset table with the *stss* box's version/flags word (in four
    places); each stco atom's own version/flags word is used instead.
    """
    mvhd_duration = 0
    for x in moovs:
        mvhd_duration += x.get(b'mvhd').get('duration')
    tkhd_durations = [0, 0]
    mdhd_durations = [0, 0]
    for x in moovs:
        traks = x.get_all(b'trak')
        assert len(traks) == 2
        tkhd_durations[0] += traks[0].get(b'tkhd').get('duration')
        tkhd_durations[1] += traks[1].get(b'tkhd').get('duration')
        mdhd_durations[0] += traks[0].get(b'mdia', b'mdhd').get('duration')
        mdhd_durations[1] += traks[1].get(b'mdia', b'mdhd').get('duration')
    #mvhd_duration = min(mvhd_duration, tkhd_durations)
    trak0s = [x.get_all(b'trak')[0] for x in moovs]
    trak1s = [x.get_all(b'trak')[1] for x in moovs]
    # merge the per-track sample tables
    stts0 = merge_stts(x.get(b'mdia', b'minf', b'stbl', b'stts').body[1] for x in trak0s)
    stts1 = merge_stts(x.get(b'mdia', b'minf', b'stbl', b'stts').body[1] for x in trak1s)
    stss = merge_stss((x.get(b'mdia', b'minf', b'stbl', b'stss').body[1] for x in trak0s), (len(x.get(b'mdia', b'minf', b'stbl', b'stsz').body[3]) for x in trak0s))
    stsc0 = merge_stsc((x.get(b'mdia', b'minf', b'stbl', b'stsc').body[1] for x in trak0s), (len(x.get(b'mdia', b'minf', b'stbl', b'stco').body[1]) for x in trak0s))
    stsc1 = merge_stsc((x.get(b'mdia', b'minf', b'stbl', b'stsc').body[1] for x in trak1s), (len(x.get(b'mdia', b'minf', b'stbl', b'stco').body[1]) for x in trak1s))
    stco0 = merge_stco((x.get(b'mdia', b'minf', b'stbl', b'stco').body[1] for x in trak0s), mdats)
    stco1 = merge_stco((x.get(b'mdia', b'minf', b'stbl', b'stco').body[1] for x in trak1s), mdats)
    stsz0 = merge_stsz((x.get(b'mdia', b'minf', b'stbl', b'stsz').body[3] for x in trak0s))
    stsz1 = merge_stsz((x.get(b'mdia', b'minf', b'stbl', b'stsz').body[3] for x in trak1s))
    ctts = sum((x.get(b'mdia', b'minf', b'stbl', b'ctts').body[1] for x in trak0s), [])
    # write the merged values back into the first moov
    moov = moovs[0]
    moov.get(b'mvhd').set('duration', mvhd_duration)
    trak0 = moov.get_all(b'trak')[0]
    trak1 = moov.get_all(b'trak')[1]
    trak0.get(b'tkhd').set('duration', tkhd_durations[0])
    trak1.get(b'tkhd').set('duration', tkhd_durations[1])
    trak0.get(b'mdia', b'mdhd').set('duration', mdhd_durations[0])
    trak1.get(b'mdia', b'mdhd').set('duration', mdhd_durations[1])
    stts_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stts')
    stts_atom.body = stts_atom.body[0], stts0
    stts_atom = trak1.get(b'mdia', b'minf', b'stbl', b'stts')
    stts_atom.body = stts_atom.body[0], stts1
    stss_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stss')
    stss_atom.body = stss_atom.body[0], stss
    stsc_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stsc')
    stsc_atom.body = stsc_atom.body[0], stsc0
    stsc_atom = trak1.get(b'mdia', b'minf', b'stbl', b'stsc')
    stsc_atom.body = stsc_atom.body[0], stsc1
    stco_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stco')
    stco_atom.body = stco_atom.body[0], stco0
    stco_atom = trak1.get(b'mdia', b'minf', b'stbl', b'stco')
    stco_atom.body = stco_atom.body[0], stco1
    stsz_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stsz')
    stsz_atom.body = stsz_atom.body[0], stsz_atom.body[1], len(stsz0), stsz0
    stsz_atom = trak1.get(b'mdia', b'minf', b'stbl', b'stsz')
    stsz_atom.body = stsz_atom.body[0], stsz_atom.body[1], len(stsz1), stsz1
    ctts_atom = trak0.get(b'mdia', b'minf', b'stbl', b'ctts')
    ctts_atom.body = ctts_atom.body[0], ctts
    # the rewritten tables change the moov size, which moves the mdat; shift
    # every chunk offset by the growth so they still point at the payload
    old_moov_size = moov.size
    new_moov_size = moov.calsize()
    new_mdat_start = mdats[0].body[1] + new_moov_size - old_moov_size
    stco0 = [x + new_mdat_start for x in stco0]
    stco1 = [x + new_mdat_start for x in stco1]
    stco_atom = trak0.get(b'mdia', b'minf', b'stbl', b'stco')
    stco_atom.body = stco_atom.body[0], stco0
    stco_atom = trak1.get(b'mdia', b'minf', b'stbl', b'stco')
    stco_atom.body = stco_atom.body[0], stco1
    return moov
def merge_mp4s(files, output):
    """Merge MP4 part files into the file at path `output`.

    The input files must stay open until the merged mdat has been written,
    because the merged mdat atom streams its payload from them. Fixes: the
    original never closed the input files or the output file (and shadowed
    the `output` parameter with the output file object); everything is now
    closed deterministically.
    """
    assert files
    ins = [open(mp4, 'rb') for mp4 in files]
    try:
        mp4s = list(map(read_mp4, ins))
        moovs = [x[1] for x in mp4s]
        mdats = [x[2] for x in mp4s]
        moov = merge_moov(moovs, mdats)
        mdat = merge_mdats(mdats)
        with open(output, 'wb') as out:
            # keep the first file's atom order, substituting the merged boxes
            for atom in mp4s[0][0]:
                if atom.type == b'moov':
                    moov.write(out)
                elif atom.type == b'mdat':
                    mdat.write(out)
                else:
                    atom.write(out)
    finally:
        for f in ins:
            f.close()
##################################################
# main
##################################################
# TODO: FIXME: duplicate of merge_flv
def guess_output(inputs):
    """Guess an output name from the longest common prefix of the inputs.

    Returns '<common prefix>.mp4', or 'output.mp4' when the basenames share
    no prefix. Bug fix (same as merge_flv): the original kept `inputs` as a
    map() iterator that min() exhausted, so the prefix scan saw an empty
    sequence and `inputs[0]` raised TypeError under Python 3.
    """
    import os.path
    names = [os.path.basename(p) for p in inputs]
    shortest = min(len(name) for name in names)
    for i in reversed(range(1, shortest)):
        if len(set(name[:i] for name in names)) == 1:
            return names[0][:i] + '.mp4'
    return 'output.mp4'
def concat_mp4s(mp4s, output = None):
    """Merge MP4 parts into one file; guess the output name when not given.

    Returns the output path.
    """
    assert mp4s, 'no mp4 file found'
    import os.path
    target = output
    if not target:
        target = guess_output(mp4s)
    elif os.path.isdir(target):
        # a directory was given: place the guessed name inside it
        target = os.path.join(target, guess_output(mp4s))
    print('Merging video parts...')
    merge_mp4s(mp4s, target)
    return target
def usage():
    """Print command-line usage for merge_mp4."""
    print('Usage: [python3] merge_mp4.py --output TARGET.mp4 mp4...')
def main():
    """Parse command-line options and merge the given MP4 files."""
    import sys, getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:", ["help", "output="])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    output = None
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-o", "--output"):
            output = arg
        else:
            # getopt should never hand back an unknown option, but be safe
            usage()
            sys.exit(1)
    if not args:
        usage()
        sys.exit(1)
    concat_mp4s(args, output)
# Allow running this module as a standalone command-line script.
if __name__ == '__main__':
    main()

6
you-get Executable file
View File

@ -0,0 +1,6 @@
#!/usr/bin/env python3
# Thin launcher script: pull the downloader API from the `get` module and
# dispatch to its generic entry point.
from get import *
if __name__ == '__main__':
    main('you-get', any_download, any_download_playlist)

19
you-get.json Normal file
View File

@ -0,0 +1,19 @@
{
"version": "0.0.1",
"date": "2012-08-20",
"author": "Mort Yao <mort.yao@gmail.com>",
"file_list": [
"LICENSE",
"README.md",
"common.py",
"get.py",
"get_tudou.py",
"get_yinyuetai.py",
"get_youku.py",
"get_youtube.py",
"merge_flv.py",
"merge_mp4.py",
"you-get",
"you-get.json"
]
}