you-get/src/you_get/common.py

#!/usr/bin/env python
import getopt
import json
import locale
import os
import platform
import re
import sys
from urllib import request, parse

from .version import __version__
from .util import log
from .util.strings import get_filename, unescape_html

dry_run = False
force = False
player = None
extractor_proxy = None
cookies_txt = None

fake_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'UTF-8,*;q=0.5',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0'
}

if sys.stdout.isatty():
    default_encoding = sys.stdout.encoding.lower()
else:
    default_encoding = locale.getpreferredencoding().lower()

def tr(s):
    # Transliterates a string for the terminal encoding. Currently a no-op;
    # the old re-encoding fallback is kept below for reference.
    if default_encoding == 'utf-8':
        return s
    else:
        return s
        #return str(s.encode('utf-8'))[2:-1]

# DEPRECATED in favor of match1()
def r1(pattern, text):
    m = re.search(pattern, text)
    if m:
        return m.group(1)

# DEPRECATED in favor of match1()
def r1_of(patterns, text):
    for p in patterns:
        x = r1(p, text)
        if x:
            return x

def match1(text, *patterns):
    """Scans through a string for substrings matched by some patterns (first-subgroups only).

    Args:
        text: A string to be scanned.
        patterns: Arbitrary number of regex patterns.

    Returns:
        When only one pattern is given, returns a string (None if no match found).
        When more than one pattern is given, returns a list of strings ([] if no match found).
    """

    if len(patterns) == 1:
        pattern = patterns[0]
        match = re.search(pattern, text)
        if match:
            return match.group(1)
        else:
            return None
    else:
        ret = []
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                ret.append(match.group(1))
        return ret
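
# A minimal usage sketch (illustrative inputs, not part of the module):
#   match1('/video/12345.html', r'/video/(\d+)')      # -> '12345'
#   match1('id=42&t=7', r'id=(\d+)', r't=(\d+)')      # -> ['42', '7']
#   match1('no digits here', r'(\d+)')                # -> None
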
def matchall(text, patterns):
    """Scans through a string for substrings matched by some patterns.

    Args:
        text: A string to be scanned.
        patterns: A list of regex patterns.

    Returns:
        A list of all matches, in pattern order; empty if none are found.
    """
    ret = []
    for pattern in patterns:
        match = re.findall(pattern, text)
        ret += match
    return ret
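
# A minimal usage sketch (illustrative inputs, not part of the module):
#   matchall('a=1 b=2 a=3', [r'a=(\d)', r'b=(\d)'])   # -> ['1', '3', '2']
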
def launch_player(player, urls):
    import subprocess
    import shlex
    subprocess.call(shlex.split(player) + list(urls))

def parse_query_param(url, param):
    """Parses the query string of a URL and returns the value of a parameter.

    Args:
        url: A URL.
        param: A string representing the name of the parameter.

    Returns:
        The value of the parameter, or None if the parameter is absent.
    """

    try:
        return parse.parse_qs(parse.urlparse(url).query)[param][0]
    except:
        return None
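
# A minimal usage sketch (illustrative inputs, not part of the module):
#   parse_query_param('http://example.com/watch?v=abc&t=10', 'v')    # -> 'abc'
#   parse_query_param('http://example.com/watch?v=abc', 'missing')   # -> None
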
def unicodize(text):
    return re.sub(r'\\u([0-9A-Fa-f]{4})',
                  lambda x: chr(int(x.group(1), 16)), text)
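
# A minimal usage sketch (illustrative input, not part of the module):
#   unicodize(r'"title": "\u4f60\u597d"')   # -> '"title": "你好"'
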
# DEPRECATED in favor of util.legitimize()
def escape_file_path(path):
    path = path.replace('/', '-')
    path = path.replace('\\', '-')
    path = path.replace('*', '-')
    path = path.replace('?', '-')
    return path

def ungzip(data):
    """Decompresses data for Content-Encoding: gzip.
    """
    from io import BytesIO
    import gzip
    buffer = BytesIO(data)
    f = gzip.GzipFile(fileobj=buffer)
    return f.read()

def undeflate(data):
    """Decompresses data for Content-Encoding: deflate.
    (zlib compression is used.)
    """
    import zlib
    decompressobj = zlib.decompressobj(-zlib.MAX_WBITS)
    return decompressobj.decompress(data) + decompressobj.flush()

# DEPRECATED in favor of get_content()
def get_response(url, faker=False):
    if faker:
        response = request.urlopen(request.Request(url, headers=fake_headers), None)
    else:
        response = request.urlopen(url)

    data = response.read()
    if response.info().get('Content-Encoding') == 'gzip':
        data = ungzip(data)
    elif response.info().get('Content-Encoding') == 'deflate':
        data = undeflate(data)
    response.data = data
    return response

# DEPRECATED in favor of get_content()
def get_html(url, encoding=None, faker=False):
    content = get_response(url, faker).data
    return str(content, 'utf-8', 'ignore')

# DEPRECATED in favor of get_content()
def get_decoded_html(url, faker=False):
    response = get_response(url, faker)
    data = response.data
    charset = r1(r'charset=([\w-]+)', response.headers['content-type'])
    if charset:
        return data.decode(charset, 'ignore')
    else:
        return data

def get_location(url):
    response = request.urlopen(url)
    # urllib will follow redirections and it's too much code to tell urllib
    # not to do that
    return response.geturl()

def get_content(url, headers={}, decoded=True):
    """Gets the content of a URL by sending an HTTP GET request.

    Args:
        url: A URL.
        headers: Request headers used by the client.
        decoded: Whether to decode the response body using UTF-8 or the charset specified in Content-Type.

    Returns:
        The content as a string.
    """

    req = request.Request(url, headers=headers)
    if cookies_txt:
        cookies_txt.add_cookie_header(req)
        req.headers.update(req.unredirected_hdrs)
    response = request.urlopen(req)
    data = response.read()

    # Handle HTTP compression for gzip and deflate (zlib)
    content_encoding = response.getheader('Content-Encoding')
    if content_encoding == 'gzip':
        data = ungzip(data)
    elif content_encoding == 'deflate':
        data = undeflate(data)

    # Decode the response body
    if decoded:
        charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)')
        if charset is not None:
            data = data.decode(charset)
        else:
            data = data.decode('utf-8')

    return data
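
# A minimal usage sketch (illustrative URL, not part of the module):
#   html = get_content('http://example.com/', headers=fake_headers)
#   title = match1(html, r'<title>([^<]+)</title>')
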
def url_size(url, faker=False):
    if faker:
        response = request.urlopen(request.Request(url, headers=fake_headers), None)
    else:
        response = request.urlopen(url)

    size = response.headers['content-length']
    return int(size) if size is not None else float('inf')

# TO BE DEPRECATED
# urls_size() does not have a faker
# also it takes too long
def urls_size(urls):
    return sum(map(url_size, urls))

def url_info(url, faker=False):
    if faker:
        response = request.urlopen(request.Request(url, headers=fake_headers), None)
    else:
        response = request.urlopen(request.Request(url))

    headers = response.headers

    type = headers['content-type']
    mapping = {
        'video/3gpp': '3gp',
        'video/f4v': 'flv',
        'video/mp4': 'mp4',
        'video/MP2T': 'ts',
        'video/quicktime': 'mov',
        'video/webm': 'webm',
        'video/x-flv': 'flv',
        'video/x-ms-asf': 'asf',
        'audio/mp4': 'mp4',
        'audio/mpeg': 'mp3'
    }
    if type in mapping:
        ext = mapping[type]
    else:
        type = None
        if headers['content-disposition']:
            try:
                filename = parse.unquote(r1(r'filename="?([^"]+)"?', headers['content-disposition']))
                if len(filename.split('.')) > 1:
                    ext = filename.split('.')[-1]
                else:
                    ext = None
            except:
                ext = None
        else:
            ext = None

    if headers['transfer-encoding'] != 'chunked':
        size = headers['content-length'] and int(headers['content-length'])
    else:
        size = None

    return type, ext, size
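
# A minimal usage sketch (illustrative; actual values depend on the response headers):
#   mime, ext, size = url_info('http://example.com/video.mp4')
#   # e.g. ('video/mp4', 'mp4', 1048576)
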
def url_locations(urls, faker=False):
    locations = []
    for url in urls:
        if faker:
            response = request.urlopen(request.Request(url, headers=fake_headers), None)
        else:
            response = request.urlopen(request.Request(url))

        locations.append(response.url)
    return locations

def url_save(url, filepath, bar, refer=None, is_part=False, faker=False):
    file_size = url_size(url, faker=faker)

    if os.path.exists(filepath):
        if not force and file_size == os.path.getsize(filepath):
            if not is_part:
                if bar:
                    bar.done()
                print('Skipping %s: file already exists' % tr(os.path.basename(filepath)))
            else:
                if bar:
                    bar.update_received(file_size)
            return
        else:
            if not is_part:
                if bar:
                    bar.done()
                print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
    elif not os.path.exists(os.path.dirname(filepath)):
        os.mkdir(os.path.dirname(filepath))

    temp_filepath = filepath + '.download' if file_size != float('inf') else filepath
    received = 0
    if not force:
        open_mode = 'ab'

        if os.path.exists(temp_filepath):
            received += os.path.getsize(temp_filepath)
            if bar:
                bar.update_received(os.path.getsize(temp_filepath))
    else:
        open_mode = 'wb'

    if received < file_size:
        if faker:
            headers = fake_headers
        else:
            headers = {}
        if received:
            headers['Range'] = 'bytes=' + str(received) + '-'
        if refer:
            headers['Referer'] = refer

        response = request.urlopen(request.Request(url, headers=headers), None)
        try:
            range_start = int(response.headers['content-range'][6:].split('/')[0].split('-')[0])
            end_length = int(response.headers['content-range'][6:].split('/')[1])
            range_length = end_length - range_start
        except:
            content_length = response.headers['content-length']
            range_length = int(content_length) if content_length is not None else float('inf')

        if file_size != received + range_length:
            received = 0
            if bar:
                bar.received = 0
            open_mode = 'wb'

        with open(temp_filepath, open_mode) as output:
            while True:
                buffer = response.read(1024 * 256)
                if not buffer:
                    if received == file_size:  # Download finished
                        break
                    else:  # Unexpected termination. Retry request
                        headers['Range'] = 'bytes=' + str(received) + '-'
                        response = request.urlopen(request.Request(url, headers=headers), None)
                output.write(buffer)
                received += len(buffer)
                if bar:
                    bar.update_received(len(buffer))

    assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (received, os.path.getsize(temp_filepath), temp_filepath)

    if os.access(filepath, os.W_OK):
        os.remove(filepath)  # on Windows rename could fail if destination filepath exists
    os.rename(temp_filepath, filepath)

def url_save_chunked(url, filepath, bar, refer=None, is_part=False, faker=False):
    if os.path.exists(filepath):
        if not force:
            if not is_part:
                if bar:
                    bar.done()
                print('Skipping %s: file already exists' % tr(os.path.basename(filepath)))
            else:
                if bar:
                    bar.update_received(os.path.getsize(filepath))
            return
        else:
            if not is_part:
                if bar:
                    bar.done()
                print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
    elif not os.path.exists(os.path.dirname(filepath)):
        os.mkdir(os.path.dirname(filepath))

    temp_filepath = filepath + '.download'
    received = 0
    if not force:
        open_mode = 'ab'

        if os.path.exists(temp_filepath):
            received += os.path.getsize(temp_filepath)
            if bar:
                bar.update_received(os.path.getsize(temp_filepath))
    else:
        open_mode = 'wb'

    if faker:
        headers = fake_headers
    else:
        headers = {}
    if received:
        headers['Range'] = 'bytes=' + str(received) + '-'
    if refer:
        headers['Referer'] = refer

    response = request.urlopen(request.Request(url, headers=headers), None)

    with open(temp_filepath, open_mode) as output:
        while True:
            buffer = response.read(1024 * 256)
            if not buffer:
                break
            output.write(buffer)
            received += len(buffer)
            if bar:
                bar.update_received(len(buffer))

    assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (received, os.path.getsize(temp_filepath), temp_filepath)

    if os.access(filepath, os.W_OK):
        os.remove(filepath)  # on Windows rename could fail if destination filepath exists
    os.rename(temp_filepath, filepath)

class SimpleProgressBar:
    def __init__(self, total_size, total_pieces=1):
        self.displayed = False
        self.total_size = total_size
        self.total_pieces = total_pieces
        self.current_piece = 1
        self.received = 0

    def update(self):
        self.displayed = True
        bar_size = 40
        percent = round(self.received * 100 / self.total_size, 1)
        if percent > 100:
            percent = 100
        dots = bar_size * int(percent) // 100
        plus = percent * bar_size / 100 - dots  # fractional fill of the last bar cell
        if plus > 0.8:
            plus = '='
        elif plus > 0.4:
            plus = '>'
        else:
            plus = ''
        bar = '=' * dots + plus
        bar = '{0:>5}% ({1:>5}/{2:<5}MB) [{3:<40}] {4}/{5}'.format(percent, round(self.received / 1048576, 1), round(self.total_size / 1048576, 1), bar, self.current_piece, self.total_pieces)
        sys.stdout.write('\r' + bar)
        sys.stdout.flush()

    def update_received(self, n):
        self.received += n
        self.update()

    def update_piece(self, n):
        self.current_piece = n

    def done(self):
        if self.displayed:
            print()
            self.displayed = False

class PiecesProgressBar:
    def __init__(self, total_size, total_pieces=1):
        self.displayed = False
        self.total_size = total_size
        self.total_pieces = total_pieces
        self.current_piece = 1
        self.received = 0

    def update(self):
        self.displayed = True
        bar = '{0:>5}%[{1:<40}] {2}/{3}'.format('?', '?' * 40, self.current_piece, self.total_pieces)
        sys.stdout.write('\r' + bar)
        sys.stdout.flush()

    def update_received(self, n):
        self.received += n
        self.update()

    def update_piece(self, n):
        self.current_piece = n

    def done(self):
        if self.displayed:
            print()
            self.displayed = False

class DummyProgressBar:
    def __init__(self, *args):
        pass

    def update_received(self, n):
        pass

    def update_piece(self, n):
        pass

    def done(self):
        pass

def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False):
    assert urls
    if dry_run:
        print('Real URLs:\n%s' % '\n'.join(urls))
        return

    if player:
        launch_player(player, urls)
        return

    if not total_size:
        try:
            total_size = urls_size(urls)
        except:
            import traceback
            import sys
            traceback.print_exc(file=sys.stdout)
            pass

    title = tr(get_filename(title))

    filename = '%s.%s' % (title, ext)
    filepath = os.path.join(output_dir, filename)
    if total_size:
        if not force and os.path.exists(filepath) and os.path.getsize(filepath) >= total_size * 0.9:
            print('Skipping %s: file already exists' % filepath)
            print()
            return
        bar = SimpleProgressBar(total_size, len(urls))
    else:
        bar = PiecesProgressBar(total_size, len(urls))

    if len(urls) == 1:
        url = urls[0]
        print('Downloading %s ...' % tr(filename))
        url_save(url, filepath, bar, refer=refer, faker=faker)
        bar.done()
    else:
        parts = []
        print('Downloading %s.%s ...' % (tr(title), ext))
        for i, url in enumerate(urls):
            filename = '%s[%02d].%s' % (title, i, ext)
            filepath = os.path.join(output_dir, filename)
            parts.append(filepath)
            #print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls))
            bar.update_piece(i + 1)
            url_save(url, filepath, bar, refer=refer, is_part=True, faker=faker)
        bar.done()

        if not merge:
            print()
            return
        if ext in ['flv', 'f4v']:
            try:
                from .processor.ffmpeg import has_ffmpeg_installed
                if has_ffmpeg_installed():
                    from .processor.ffmpeg import ffmpeg_concat_flv_to_mp4
                    ffmpeg_concat_flv_to_mp4(parts, os.path.join(output_dir, title + '.mp4'))
                else:
                    from .processor.join_flv import concat_flv
                    concat_flv(parts, os.path.join(output_dir, title + '.flv'))
            except:
                raise
            else:
                for part in parts:
                    os.remove(part)

        elif ext == 'mp4':
            try:
                from .processor.ffmpeg import has_ffmpeg_installed
                if has_ffmpeg_installed():
                    from .processor.ffmpeg import ffmpeg_concat_mp4_to_mp4
                    ffmpeg_concat_mp4_to_mp4(parts, os.path.join(output_dir, title + '.mp4'))
                else:
                    from .processor.join_mp4 import concat_mp4
                    concat_mp4(parts, os.path.join(output_dir, title + '.mp4'))
            except:
                raise
            else:
                for part in parts:
                    os.remove(part)

        elif ext == 'ts':
            try:
                from .processor.ffmpeg import has_ffmpeg_installed
                if has_ffmpeg_installed():
                    from .processor.ffmpeg import ffmpeg_concat_ts_to_mkv
                    ffmpeg_concat_ts_to_mkv(parts, os.path.join(output_dir, title + '.mkv'))
                else:
                    from .processor.join_ts import concat_ts
                    concat_ts(parts, os.path.join(output_dir, title + '.ts'))
            except:
                raise
            else:
                for part in parts:
                    os.remove(part)

        else:
            print("Can't merge %s files" % ext)

    print()

def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False):
    assert urls
    if dry_run:
        print('Real URLs:\n%s\n' % urls)
        return

    if player:
        launch_player(player, urls)
        return

    assert ext in ('ts',)

    title = tr(get_filename(title))

    filename = '%s.%s' % (title, 'ts')
    filepath = os.path.join(output_dir, filename)
    if total_size:
        if not force and os.path.exists(filepath[:-3] + '.mkv'):
            print('Skipping %s: file already exists' % (filepath[:-3] + '.mkv'))
            print()
            return
        bar = SimpleProgressBar(total_size, len(urls))
    else:
        bar = PiecesProgressBar(total_size, len(urls))

    if len(urls) == 1:
        parts = []
        url = urls[0]
        print('Downloading %s ...' % tr(filename))
        filepath = os.path.join(output_dir, filename)
        parts.append(filepath)
        url_save_chunked(url, filepath, bar, refer=refer, faker=faker)
        bar.done()

        if not merge:
            print()
            return
        if ext == 'ts':
            from .processor.ffmpeg import has_ffmpeg_installed
            if has_ffmpeg_installed():
                from .processor.ffmpeg import ffmpeg_convert_ts_to_mkv
                if ffmpeg_convert_ts_to_mkv(parts, os.path.join(output_dir, title + '.mkv')):
                    for part in parts:
                        os.remove(part)
                else:
                    os.remove(os.path.join(output_dir, title + '.mkv'))
            else:
                print('No ffmpeg is found. Conversion aborted.')
        else:
            print("Can't convert %s files" % ext)
    else:
        parts = []
        print('Downloading %s.%s ...' % (tr(title), ext))
        for i, url in enumerate(urls):
            filename = '%s[%02d].%s' % (title, i, ext)
            filepath = os.path.join(output_dir, filename)
            parts.append(filepath)
            #print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls))
            bar.update_piece(i + 1)
            url_save_chunked(url, filepath, bar, refer=refer, is_part=True, faker=faker)
        bar.done()

        if not merge:
            print()
            return
        if ext == 'ts':
            from .processor.ffmpeg import has_ffmpeg_installed
            if has_ffmpeg_installed():
                from .processor.ffmpeg import ffmpeg_concat_ts_to_mkv
                if ffmpeg_concat_ts_to_mkv(parts, os.path.join(output_dir, title + '.mkv')):
                    for part in parts:
                        os.remove(part)
                else:
                    os.remove(os.path.join(output_dir, title + '.mkv'))
            else:
                print('No ffmpeg is found. Merging aborted.')
        else:
            print("Can't merge %s files" % ext)

    print()

def download_rtmp_url(url, title, ext, params={}, total_size=0, output_dir='.', refer=None, merge=True, faker=False):
    assert url
    if dry_run:
        print('Real URL:\n%s\n' % [url])
        if params.get("-y", False):  # None or unset -> False
            print('Real Playpath:\n%s\n' % [params.get("-y")])
        return

    if player:
        from .processor.rtmpdump import play_rtmpdump_stream
        play_rtmpdump_stream(player, url, params)
        return

    from .processor.rtmpdump import has_rtmpdump_installed, download_rtmpdump_stream
    assert has_rtmpdump_installed(), "RTMPDump not installed."
    download_rtmpdump_stream(url, title, ext, params, output_dir)

def playlist_not_supported(name):
    def f(*args, **kwargs):
        raise NotImplementedError('Playlist is not supported for ' + name)
    return f

def print_info(site_info, title, type, size):
    if type:
        type = type.lower()
    if type in ['3gp']:
        type = 'video/3gpp'
    elif type in ['asf', 'wmv']:
        type = 'video/x-ms-asf'
    elif type in ['flv', 'f4v']:
        type = 'video/x-flv'
    elif type in ['mkv']:
        type = 'video/x-matroska'
    elif type in ['mp3']:
        type = 'audio/mpeg'
    elif type in ['mp4']:
        type = 'video/mp4'
    elif type in ['mov']:
        type = 'video/quicktime'
    elif type in ['ts']:
        type = 'video/MP2T'
    elif type in ['webm']:
        type = 'video/webm'

    if type in ['video/3gpp']:
        type_info = "3GPP multimedia file (%s)" % type
    elif type in ['video/x-flv', 'video/f4v']:
        type_info = "Flash video (%s)" % type
    elif type in ['video/mp4', 'video/x-m4v']:
        type_info = "MPEG-4 video (%s)" % type
    elif type in ['video/MP2T']:
        type_info = "MPEG-2 transport stream (%s)" % type
    elif type in ['video/webm']:
        type_info = "WebM video (%s)" % type
    #elif type in ['video/ogg']:
    #    type_info = "Ogg video (%s)" % type
    elif type in ['video/quicktime']:
        type_info = "QuickTime video (%s)" % type
    elif type in ['video/x-matroska']:
        type_info = "Matroska video (%s)" % type
    #elif type in ['video/x-ms-wmv']:
    #    type_info = "Windows Media video (%s)" % type
    elif type in ['video/x-ms-asf']:
        type_info = "Advanced Systems Format (%s)" % type
    #elif type in ['video/mpeg']:
    #    type_info = "MPEG video (%s)" % type
    elif type in ['audio/mp4']:
        type_info = "MPEG-4 audio (%s)" % type
    elif type in ['audio/mpeg']:
        type_info = "MP3 (%s)" % type
    else:
        type_info = "Unknown type (%s)" % type

    print("Video Site:", site_info)
    print("Title:     ", unescape_html(tr(title)))
    print("Type:      ", type_info)
    print("Size:      ", round(size / 1048576, 2), "MiB (" + str(size) + " Bytes)")
    print()

def mime_to_container(mime):
    mapping = {
        'video/3gpp': '3gp',
        'video/mp4': 'mp4',
        'video/webm': 'webm',
        'video/x-flv': 'flv',
    }
    if mime in mapping:
        return mapping[mime]
    else:
        return mime.split('/')[1]

def parse_host(host):
    """Parses host name and port number from a string.
    """
    if re.match(r'^(\d+)$', host) is not None:
        return ("0.0.0.0", int(host))
    if re.match(r'^(\w+)://', host) is None:
        host = "//" + host
    o = parse.urlparse(host)
    hostname = o.hostname or "0.0.0.0"
    port = o.port or 0
    return (hostname, port)
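
# A minimal usage sketch (illustrative inputs, not part of the module):
#   parse_host('8080')             # -> ('0.0.0.0', 8080)
#   parse_host('127.0.0.1:1080')   # -> ('127.0.0.1', 1080)
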
def set_proxy(proxy):
    proxy_handler = request.ProxyHandler({
        'http': '%s:%s' % proxy,
        'https': '%s:%s' % proxy,
    })
    opener = request.build_opener(proxy_handler)
    request.install_opener(opener)

def unset_proxy():
    proxy_handler = request.ProxyHandler({})
    opener = request.build_opener(proxy_handler)
    request.install_opener(opener)
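
# A minimal usage sketch: set_proxy() expects a (host, port) tuple,
# e.g. as returned by parse_host() (illustrative values):
#   set_proxy(('127.0.0.1', 8087))
#   unset_proxy()
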
# DEPRECATED in favor of set_proxy() and unset_proxy()
def set_http_proxy(proxy):
    if proxy is None:  # Use system default setting
        proxy_support = request.ProxyHandler()
    elif proxy == '':  # Don't use any proxy
        proxy_support = request.ProxyHandler({})
    else:  # Use proxy
        proxy_support = request.ProxyHandler({'http': '%s' % proxy, 'https': '%s' % proxy})
    opener = request.build_opener(proxy_support)
    request.install_opener(opener)

def download_main(download, download_playlist, urls, playlist, **kwargs):
    for url in urls:
        if url.startswith('https://'):
            url = url[8:]
        if not url.startswith('http://'):
            url = 'http://' + url

        if playlist:
            download_playlist(url, **kwargs)
        else:
            download(url, **kwargs)

def script_main(script_name, download, download_playlist=None):
    version = 'You-Get %s, a video downloader.' % __version__
    help = 'Usage: %s [OPTION]... [URL]...\n' % script_name
    help += '''\nStartup options:
    -V | --version                             Display the version and exit.
    -h | --help                                Print this help and exit.
    '''
    help += '''\nDownload options (use with URLs):
    -f | --force                               Force overwriting existing files.
    -i | --info                                Display the information of videos without downloading.
    -u | --url                                 Display the real URLs of videos without downloading.
    -c | --cookies <PATH>                      Load Netscape's cookies.txt file.
    -n | --no-merge                            Don't merge video parts.
    -F | --format <STREAM_ID>                  Video format code.
    -o | --output-dir <PATH>                   Set the output directory for downloaded videos.
    -p | --player <PLAYER [options]>           Directly play the video with PLAYER like vlc/smplayer.
    -x | --http-proxy <HOST:PORT>              Use specific HTTP proxy for downloading.
    -y | --extractor-proxy <HOST:PORT>         Use specific HTTP proxy for extracting stream data.
         --no-proxy                            Don't use any proxy. (ignore $http_proxy)
         --debug                               Show traceback on KeyboardInterrupt.
    '''

    short_opts = 'Vhfiuc:nF:o:p:x:y:'
    opts = ['version', 'help', 'force', 'info', 'url', 'cookies=', 'no-merge', 'no-proxy', 'debug', 'format=', 'stream=', 'itag=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang=']
    if download_playlist:
        short_opts = 'l' + short_opts
        opts = ['playlist'] + opts

    try:
        opts, args = getopt.getopt(sys.argv[1:], short_opts, opts)
    except getopt.GetoptError as err:
        log.e(err)
        log.e("try 'you-get --help' for more options")
        sys.exit(2)

    global force
    global dry_run
    global player
    global extractor_proxy
    global cookies_txt
    cookies_txt = None

    info_only = False
    playlist = False
    merge = True
    stream_id = None
    lang = None
    output_dir = '.'
    proxy = None
    extractor_proxy = None
    traceback = False

    for o, a in opts:
        if o in ('-V', '--version'):
            print(version)
            sys.exit()
        elif o in ('-h', '--help'):
            print(version)
            print(help)
            sys.exit()
        elif o in ('-f', '--force'):
            force = True
        elif o in ('-i', '--info'):
            info_only = True
        elif o in ('-u', '--url'):
            dry_run = True
        elif o in ('-c', '--cookies'):
            from http import cookiejar
            cookies_txt = cookiejar.MozillaCookieJar(a)
            cookies_txt.load()
        elif o in ('-l', '--playlist'):
            playlist = True
        elif o in ('-n', '--no-merge'):
            merge = False
        elif o in ('--no-proxy',):
            proxy = ''
        elif o in ('--debug',):
            traceback = True
        elif o in ('-F', '--format', '--stream', '--itag'):
            stream_id = a
        elif o in ('-o', '--output-dir'):
            output_dir = a
        elif o in ('-p', '--player'):
            player = a
        elif o in ('-x', '--http-proxy'):
            proxy = a
        elif o in ('-y', '--extractor-proxy'):
            extractor_proxy = a
        elif o in ('--lang',):
            lang = a
        else:
            log.e("try 'you-get --help' for more options")
            sys.exit(2)

    if not args:
        print(help)
        sys.exit()

    set_http_proxy(proxy)

    try:
        if stream_id:
            if not extractor_proxy:
                download_main(download, download_playlist, args, playlist, stream_id=stream_id, output_dir=output_dir, merge=merge, info_only=info_only)
            else:
                download_main(download, download_playlist, args, playlist, stream_id=stream_id, extractor_proxy=extractor_proxy, output_dir=output_dir, merge=merge, info_only=info_only)
        else:
            if not extractor_proxy:
                download_main(download, download_playlist, args, playlist, output_dir=output_dir, merge=merge, info_only=info_only)
            else:
                download_main(download, download_playlist, args, playlist, extractor_proxy=extractor_proxy, output_dir=output_dir, merge=merge, info_only=info_only)
    except KeyboardInterrupt:
        if traceback:
            raise
        else:
            sys.exit(1)

def url_to_module(url):
    from .extractors import netease, w56, acfun, baidu, baomihua, bilibili, blip, catfun, cntv, cbs, coursera, dailymotion, dongting, douban, douyutv, ehow, facebook, freesound, funshion, google, sina, ifeng, alive, instagram, iqilu, iqiyi, joy, jpopsuki, khan, ku6, kugou, kuwo, letv, lizhi, magisto, metacafe, miaopai, miomio, mixcloud, mtv81, nicovideo, pptv, qianmo, qq, sohu, songtaste, soundcloud, ted, theplatform, tudou, tucao, tumblr, twitter, vid48, videobam, vidto, vimeo, vine, vk, xiami, yinyuetai, youku, youtube, zhanqi

    video_host = r1(r'https?://([^/]+)/', url)
    video_url = r1(r'https?://[^/]+(.*)', url)
    assert video_host and video_url, 'invalid url: ' + url

    if video_host.endswith('.com.cn'):
        video_host = video_host[:-3]
    domain = r1(r'(\.[^.]+\.[^.]+)$', video_host) or video_host
    assert domain, 'unsupported url: ' + url

    k = r1(r'([^.]+)', domain)
    downloads = {
        '163': netease,
        '56': w56,
        'acfun': acfun,
        'baidu': baidu,
        'baomihua': baomihua,
        'bilibili': bilibili,
        'blip': blip,
        'catfun': catfun,
        'cntv': cntv,
        'cbs': cbs,
        'coursera': coursera,
        'dailymotion': dailymotion,
        'dongting': dongting,
        'douban': douban,
        'douyutv': douyutv,
        'ehow': ehow,
        'facebook': facebook,
        'freesound': freesound,
        'fun': funshion,
        'google': google,
        'iask': sina,
        'ifeng': ifeng,
        'in': alive,
        'instagram': instagram,
        'iqilu': iqilu,
        'iqiyi': iqiyi,
        'joy': joy,
        'jpopsuki': jpopsuki,
        'kankanews': bilibili,
        'khanacademy': khan,
        'ku6': ku6,
        'kugou': kugou,
        'kuwo': kuwo,
        'letv': letv,
        'lizhi': lizhi,
        'magisto': magisto,
        'metacafe': metacafe,
        'miomio': miomio,
        'mixcloud': mixcloud,
        'mtv81': mtv81,
        'nicovideo': nicovideo,
        'pptv': pptv,
        'qianmo': qianmo,
        'qq': qq,
        'sina': sina,
        'smgbb': bilibili,
        'sohu': sohu,
        'songtaste': songtaste,
        'soundcloud': soundcloud,
        'ted': ted,
        'theplatform': theplatform,
        'tucao': tucao,
        'tudou': tudou,
        'tumblr': tumblr,
        'twitter': twitter,
        'vid48': vid48,
        'videobam': videobam,
        'vidto': vidto,
        'vimeo': vimeo,
        'weibo': miaopai,
        'vine': vine,
        'vk': vk,
        'xiami': xiami,
        'yinyuetai': yinyuetai,
        'youku': youku,
        'youtu': youtube,
        'youtube': youtube,
        'zhanqi': zhanqi,
    }
    if k in downloads:
        return downloads[k], url
    else:
        # Unknown host: follow an HTTP redirect if there is one,
        # otherwise fall back to the generic embed extractor.
        import http.client
        conn = http.client.HTTPConnection(video_host)
        conn.request("HEAD", video_url)
        res = conn.getresponse()
        location = res.getheader('location')
        if location is None:
            from .extractors import embed
            return embed, url
        else:
            return url_to_module(location)
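
# A minimal usage sketch (illustrative URL, not part of the module):
#   module, real_url = url_to_module('http://www.youtube.com/watch?v=abc')
#   # module is the youtube extractor; module.download(real_url) starts a download
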
def any_download(url, **kwargs):
    m, url = url_to_module(url)
    m.download(url, **kwargs)

def any_download_playlist(url, **kwargs):
    m, url = url_to_module(url)
    m.download_playlist(url, **kwargs)

def main():
    script_main('you-get', any_download, any_download_playlist)