you-get/src/you_get/common.py

#!/usr/bin/env python
import getopt
import json
import locale
import os
import re
import sys
from urllib import request, parse
import platform
import threading
from .version import __version__
from .util import log
from .util.strings import get_filename, unescape_html
dry_run = False
force = False
player = None
extractor_proxy = None
cookies_txt = None
fake_headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'UTF-8,*;q=0.5',
'Accept-Encoding': 'gzip,deflate,sdch',
'Accept-Language': 'en-US,en;q=0.8',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0'
}
if sys.stdout.isatty():
default_encoding = sys.stdout.encoding.lower()
else:
default_encoding = locale.getpreferredencoding().lower()
def tr(s):
try:
s.encode(default_encoding)
return s
except:
return str(s.encode('utf-8'))[2:-1]
# DEPRECATED in favor of match1()
def r1(pattern, text):
m = re.search(pattern, text)
if m:
return m.group(1)
# DEPRECATED in favor of match1()
def r1_of(patterns, text):
for p in patterns:
x = r1(p, text)
if x:
return x
def match1(text, *patterns):
"""Scans through a string for substrings matched some patterns (first-subgroups only).
Args:
text: A string to be scanned.
patterns: Arbitrary number of regex patterns.
Returns:
When only one pattern is given, returns a string (None if no match found).
        When more than one pattern is given, returns a list of strings ([] if no match found).
"""
if len(patterns) == 1:
pattern = patterns[0]
match = re.search(pattern, text)
if match:
return match.group(1)
else:
return None
else:
ret = []
for pattern in patterns:
match = re.search(pattern, text)
if match:
ret.append(match.group(1))
return ret
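
# Illustrative usage of match1() (doctest-style comments; input values are
# made up for the example):
#
#   >>> match1('hello_world', r'hello_(\w+)')
#   'world'
#   >>> match1('hello_world', r'hello_(\w+)', r'(\w+)_world')
#   ['world', 'hello']
#   >>> match1('hello_world', r'bye_(\w+)') is None
#   True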
def launch_player(player, urls):
import subprocess
import shlex
subprocess.call(shlex.split(player) + list(urls))
def parse_query_param(url, param):
"""Parses the query string of a URL and returns the value of a parameter.
Args:
url: A URL.
param: A string representing the name of the parameter.
Returns:
The value of the parameter.
"""
try:
return parse.parse_qs(parse.urlparse(url).query)[param][0]
except:
return None
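
# Illustrative usage (the URL is a placeholder):
#
#   >>> parse_query_param('http://example.com/watch?v=abc123&t=42', 'v')
#   'abc123'
#   >>> parse_query_param('http://example.com/watch?v=abc123', 'missing') is None
#   True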
def unicodize(text):
    return re.sub(r'\\u([0-9A-Fa-f]{4})', lambda x: chr(int(x.group(1), 16)), text)
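
# Illustrative usage: turns literal '\uXXXX' escape sequences (as often found
# in JSON-embedded text) into the characters they denote:
#
#   >>> unicodize('caf\\u00e9')
#   'café'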
# DEPRECATED in favor of util.legitimize()
def escape_file_path(path):
path = path.replace('/', '-')
path = path.replace('\\', '-')
path = path.replace('*', '-')
path = path.replace('?', '-')
return path
# DEPRECATED in favor of util.legitimize()
def filenameable(text):
"""Converts a string to a legal filename through various OSes.
"""
# All POSIX systems
text = text.translate({
0: None,
ord('/'): '-',
})
if platform.system() == 'Windows': # For Windows
text = text.translate({
ord(':'): '-',
ord('*'): '-',
ord('?'): '-',
ord('\\'): '-',
ord('\"'): '\'',
ord('<'): '-',
ord('>'): '-',
ord('|'): '-',
ord('+'): '-',
ord('['): '(',
ord(']'): ')',
})
else:
if text.startswith("."):
text = text[1:]
if platform.system() == 'Darwin': # For Mac OS
text = text.translate({
ord(':'): '-',
})
return text
def ungzip(data):
"""Decompresses data for Content-Encoding: gzip.
"""
from io import BytesIO
import gzip
buffer = BytesIO(data)
f = gzip.GzipFile(fileobj=buffer)
return f.read()
def undeflate(data):
"""Decompresses data for Content-Encoding: deflate.
    (a raw deflate stream, without the zlib header, is assumed.)
"""
import zlib
decompressobj = zlib.decompressobj(-zlib.MAX_WBITS)
    return decompressobj.decompress(data) + decompressobj.flush()
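
# Illustrative round trips for the two decompressors above (gzip/zlib are
# standard library; the raw-deflate flags mirror undeflate()'s -MAX_WBITS):
#
#   >>> import gzip, zlib
#   >>> ungzip(gzip.compress(b'hello'))
#   b'hello'
#   >>> co = zlib.compressobj(9, zlib.DEFLATED, -zlib.MAX_WBITS)
#   >>> undeflate(co.compress(b'hello') + co.flush())
#   b'hello'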
# DEPRECATED in favor of get_content()
def get_response(url, faker = False):
if faker:
response = request.urlopen(request.Request(url, headers = fake_headers), None)
else:
response = request.urlopen(url)
data = response.read()
if response.info().get('Content-Encoding') == 'gzip':
data = ungzip(data)
elif response.info().get('Content-Encoding') == 'deflate':
data = undeflate(data)
response.data = data
return response
# DEPRECATED in favor of get_content()
def get_html(url, encoding = None, faker = False):
content = get_response(url, faker).data
return str(content, 'utf-8', 'ignore')
# DEPRECATED in favor of get_content()
def get_decoded_html(url, faker = False):
response = get_response(url, faker)
data = response.data
charset = r1(r'charset=([\w-]+)', response.headers['content-type'])
if charset:
return data.decode(charset, 'ignore')
else:
return data
def get_content(url, headers={}, decoded=True):
"""Gets the content of a URL via sending a HTTP GET request.
Args:
url: A URL.
headers: Request headers used by the client.
        decoded: Whether to decode the response body, using the charset specified in Content-Type (falling back to UTF-8).
Returns:
The content as a string.
"""
req = request.Request(url, headers=headers)
if cookies_txt:
cookies_txt.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
response = request.urlopen(req)
data = response.read()
# Handle HTTP compression for gzip and deflate (zlib)
content_encoding = response.getheader('Content-Encoding')
if content_encoding == 'gzip':
data = ungzip(data)
elif content_encoding == 'deflate':
data = undeflate(data)
# Decode the response body
if decoded:
charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)')
if charset is not None:
data = data.decode(charset)
else:
data = data.decode('utf-8')
return data
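
# Illustrative usage (URL and pattern are placeholders), combining
# get_content() with match1() defined above:
#
#   >>> html = get_content('http://example.com/', headers=fake_headers)
#   >>> title = match1(html, r'<title>([^<]+)</title>')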
def url_size(url, faker = False):
if faker:
response = request.urlopen(request.Request(url, headers = fake_headers), None)
else:
response = request.urlopen(url)
size = int(response.headers['content-length'])
return size
# TO BE DEPRECATED
# urls_size() does not have a faker
# and it can take a long time for many URLs
def urls_size(urls):
return sum(map(url_size, urls))
def url_info(url, faker = False):
if faker:
response = request.urlopen(request.Request(url, headers = fake_headers), None)
else:
response = request.urlopen(request.Request(url))
headers = response.headers
type = headers['content-type']
mapping = {
'video/3gpp': '3gp',
'video/f4v': 'flv',
'video/mp4': 'mp4',
'video/MP2T': 'ts',
'video/quicktime': 'mov',
'video/webm': 'webm',
'video/x-flv': 'flv',
'video/x-ms-asf': 'asf',
'audio/mpeg': 'mp3'
}
if type in mapping:
ext = mapping[type]
else:
type = None
if headers['content-disposition']:
try:
filename = parse.unquote(r1(r'filename="?([^"]+)"?', headers['content-disposition']))
if len(filename.split('.')) > 1:
ext = filename.split('.')[-1]
else:
ext = None
except:
ext = None
else:
ext = None
if headers['transfer-encoding'] != 'chunked':
size = int(headers['content-length'])
else:
size = None
return type, ext, size
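
# Illustrative usage (URL is a placeholder): url_info() probes a media URL and
# reports its MIME type, a guessed file extension, and the byte size:
#
#   >>> mime, ext, size = url_info('http://example.com/clip.mp4')
#   >>> # e.g. ('video/mp4', 'mp4', 1048576) for a 1 MiB MP4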
def url_locations(urls, faker = False):
locations = []
for url in urls:
if faker:
response = request.urlopen(request.Request(url, headers = fake_headers), None)
else:
response = request.urlopen(request.Request(url))
locations.append(response.url)
return locations
def url_save(url, filepath, bar, refer = None, is_part = False, faker = False):
file_size = url_size(url, faker = faker)
if os.path.exists(filepath):
if not force and file_size == os.path.getsize(filepath):
if not is_part:
if bar:
bar.done()
print('Skipping %s: file already exists' % tr(os.path.basename(filepath)))
else:
if bar:
bar.update_received(file_size)
return
else:
if not is_part:
if bar:
bar.done()
print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
elif not os.path.exists(os.path.dirname(filepath)):
os.mkdir(os.path.dirname(filepath))
temp_filepath = filepath + '.download'
received = 0
if not force:
open_mode = 'ab'
if os.path.exists(temp_filepath):
received += os.path.getsize(temp_filepath)
if bar:
bar.update_received(os.path.getsize(temp_filepath))
else:
open_mode = 'wb'
if received < file_size:
if faker:
headers = fake_headers
else:
headers = {}
if received:
headers['Range'] = 'bytes=' + str(received) + '-'
if refer:
headers['Referer'] = refer
response = request.urlopen(request.Request(url, headers = headers), None)
try:
range_start = int(response.headers['content-range'][6:].split('/')[0].split('-')[0])
            end_length = int(response.headers['content-range'][6:].split('/')[1])
range_length = end_length - range_start
except:
range_length = int(response.headers['content-length'])
if file_size != received + range_length:
received = 0
if bar:
bar.received = 0
open_mode = 'wb'
with open(temp_filepath, open_mode) as output:
while True:
buffer = response.read(1024 * 256)
if not buffer:
if received == file_size: # Download finished
break
else: # Unexpected termination. Retry request
headers['Range'] = 'bytes=' + str(received) + '-'
response = request.urlopen(request.Request(url, headers = headers), None)
output.write(buffer)
received += len(buffer)
if bar:
bar.update_received(len(buffer))
assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (received, os.path.getsize(temp_filepath), temp_filepath)
if os.access(filepath, os.W_OK):
os.remove(filepath) # on Windows rename could fail if destination filepath exists
os.rename(temp_filepath, filepath)
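
# Note on resuming (url_save above): a partial download lives in
# '<filepath>.download'. On restart (without --force) the bytes already on
# disk are counted and the request carries 'Range: bytes=<received>-'; if the
# server's Content-Range total does not add up to the expected file size, the
# partial data is discarded and the download starts over from scratch.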
def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = False):
if os.path.exists(filepath):
if not force:
if not is_part:
if bar:
bar.done()
print('Skipping %s: file already exists' % tr(os.path.basename(filepath)))
else:
if bar:
bar.update_received(os.path.getsize(filepath))
return
else:
if not is_part:
if bar:
bar.done()
print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
elif not os.path.exists(os.path.dirname(filepath)):
os.mkdir(os.path.dirname(filepath))
temp_filepath = filepath + '.download'
received = 0
if not force:
open_mode = 'ab'
if os.path.exists(temp_filepath):
received += os.path.getsize(temp_filepath)
if bar:
bar.update_received(os.path.getsize(temp_filepath))
else:
open_mode = 'wb'
if faker:
headers = fake_headers
else:
headers = {}
if received:
headers['Range'] = 'bytes=' + str(received) + '-'
if refer:
headers['Referer'] = refer
response = request.urlopen(request.Request(url, headers = headers), None)
with open(temp_filepath, open_mode) as output:
while True:
buffer = response.read(1024 * 256)
if not buffer:
break
output.write(buffer)
received += len(buffer)
if bar:
bar.update_received(len(buffer))
    assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (received, os.path.getsize(temp_filepath), temp_filepath)
if os.access(filepath, os.W_OK):
os.remove(filepath) # on Windows rename could fail if destination filepath exists
os.rename(temp_filepath, filepath)
class SimpleProgressBar:
def __init__(self, total_size, total_pieces = 1):
self.displayed = False
self.total_size = total_size
self.total_pieces = total_pieces
self.current_piece = 1
self.received = 0
def update(self):
self.displayed = True
bar_size = 40
percent = round(self.received * 100 / self.total_size, 1)
if percent > 100:
percent = 100
dots = bar_size * int(percent) // 100
plus = int(percent) - dots // bar_size * 100
if plus > 0.8:
plus = '='
elif plus > 0.4:
plus = '>'
else:
plus = ''
bar = '=' * dots + plus
bar = '{0:>5}% ({1:>5}/{2:<5}MB) [{3:<40}] {4}/{5}'.format(percent, round(self.received / 1048576, 1), round(self.total_size / 1048576, 1), bar, self.current_piece, self.total_pieces)
sys.stdout.write('\r' + bar)
sys.stdout.flush()
def update_received(self, n):
self.received += n
self.update()
def update_piece(self, n):
self.current_piece = n
def done(self):
if self.displayed:
print()
self.displayed = False
class PiecesProgressBar:
def __init__(self, total_size, total_pieces = 1):
self.displayed = False
self.total_size = total_size
self.total_pieces = total_pieces
self.current_piece = 1
self.received = 0
def update(self):
self.displayed = True
bar = '{0:>5}%[{1:<40}] {2}/{3}'.format('?', '?' * 40, self.current_piece, self.total_pieces)
sys.stdout.write('\r' + bar)
sys.stdout.flush()
def update_received(self, n):
self.received += n
self.update()
def update_piece(self, n):
self.current_piece = n
def done(self):
if self.displayed:
print()
self.displayed = False
class DummyProgressBar:
def __init__(self, *args):
pass
def update_received(self, n):
pass
def update_piece(self, n):
pass
def done(self):
pass
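
# Illustrative wiring of the progress bars with url_save() (URL and paths are
# placeholders):
#
#   >>> size = url_size('http://example.com/clip.flv')
#   >>> bar = SimpleProgressBar(size)
#   >>> url_save('http://example.com/clip.flv', '/tmp/clip.flv', bar)
#   >>> bar.done()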
def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False):
assert urls
if dry_run:
print('Real URLs:\n%s\n' % urls)
return
if player:
launch_player(player, urls)
return
if not total_size:
try:
total_size = urls_size(urls)
except:
            import traceback
            traceback.print_exc(file=sys.stdout)
title = get_filename(title)
filename = '%s.%s' % (title, ext)
filepath = os.path.join(output_dir, filename)
if total_size:
if not force and os.path.exists(filepath) and os.path.getsize(filepath) >= total_size * 0.9:
print('Skipping %s: file already exists' % tr(filepath))
2012-09-01 14:42:57 +04:00
print()
2012-08-20 19:54:03 +04:00
return
bar = SimpleProgressBar(total_size, len(urls))
else:
bar = PiecesProgressBar(total_size, len(urls))
if len(urls) == 1:
url = urls[0]
print('Downloading %s ...' % tr(filename))
url_save(url, filepath, bar, refer = refer, faker = faker)
bar.done()
else:
parts = []
print('Downloading %s.%s ...' % (tr(title), ext))
for i, url in enumerate(urls):
filename = '%s[%02d].%s' % (title, i, ext)
filepath = os.path.join(output_dir, filename)
parts.append(filepath)
#print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls))
bar.update_piece(i + 1)
url_save(url, filepath, bar, refer = refer, is_part = True, faker = faker)
bar.done()
if not merge:
print()
return
if ext == 'flv':
try:
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_flv_to_mp4
ffmpeg_concat_flv_to_mp4(parts, os.path.join(output_dir, title + '.mp4'))
else:
from .processor.join_flv import concat_flv
concat_flv(parts, os.path.join(output_dir, title + '.flv'))
except:
raise
else:
for part in parts:
os.remove(part)
elif ext == 'mp4':
try:
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_mp4_to_mp4
ffmpeg_concat_mp4_to_mp4(parts, os.path.join(output_dir, title + '.mp4'))
else:
from .processor.join_mp4 import concat_mp4
concat_mp4(parts, os.path.join(output_dir, title + '.mp4'))
except:
raise
else:
for part in parts:
os.remove(part)
else:
print("Can't merge %s files" % ext)
print()
def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False):
assert urls
if dry_run:
print('Real URLs:\n%s\n' % urls)
return
if player:
launch_player(player, urls)
return
    assert ext in ('ts',)
title = get_filename(title)
filename = '%s.%s' % (title, 'ts')
filepath = os.path.join(output_dir, filename)
if total_size:
if not force and os.path.exists(filepath[:-3] + '.mkv'):
print('Skipping %s: file already exists' % tr(filepath[:-3] + '.mkv'))
print()
return
bar = SimpleProgressBar(total_size, len(urls))
else:
bar = PiecesProgressBar(total_size, len(urls))
if len(urls) == 1:
parts = []
url = urls[0]
print('Downloading %s ...' % tr(filename))
filepath = os.path.join(output_dir, filename)
parts.append(filepath)
url_save_chunked(url, filepath, bar, refer = refer, faker = faker)
bar.done()
if not merge:
print()
return
if ext == 'ts':
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_convert_ts_to_mkv
if ffmpeg_convert_ts_to_mkv(parts, os.path.join(output_dir, title + '.mkv')):
for part in parts:
os.remove(part)
else:
os.remove(os.path.join(output_dir, title + '.mkv'))
else:
print('No ffmpeg is found. Conversion aborted.')
else:
print("Can't convert %s files" % ext)
else:
parts = []
print('Downloading %s.%s ...' % (tr(title), ext))
for i, url in enumerate(urls):
filename = '%s[%02d].%s' % (title, i, ext)
filepath = os.path.join(output_dir, filename)
parts.append(filepath)
#print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls))
bar.update_piece(i + 1)
url_save_chunked(url, filepath, bar, refer = refer, is_part = True, faker = faker)
bar.done()
if not merge:
print()
return
if ext == 'ts':
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_ts_to_mkv
if ffmpeg_concat_ts_to_mkv(parts, os.path.join(output_dir, title + '.mkv')):
for part in parts:
os.remove(part)
else:
os.remove(os.path.join(output_dir, title + '.mkv'))
else:
print('No ffmpeg is found. Merging aborted.')
else:
print("Can't merge %s files" % ext)
print()
def download_rtmp_url(url, title, ext, params={}, total_size=0, output_dir='.', refer=None, merge=True, faker=False):
assert url
if dry_run:
print('Real URL:\n%s\n' % [url])
        if params.get("-y", False):  # None or unset -> False
print('Real Playpath:\n%s\n' % [params.get("-y")])
return
if player:
from .processor.rtmpdump import play_rtmpdump_stream
play_rtmpdump_stream(player, url, params)
return
from .processor.rtmpdump import has_rtmpdump_installed, download_rtmpdump_stream
assert has_rtmpdump_installed(), "RTMPDump not installed."
    download_rtmpdump_stream(url, title, ext, params, output_dir)
def playlist_not_supported(name):
def f(*args, **kwargs):
raise NotImplementedError('Playlist is not supported for ' + name)
return f
def print_info(site_info, title, type, size):
if type:
type = type.lower()
if type in ['3gp']:
type = 'video/3gpp'
elif type in ['asf', 'wmv']:
type = 'video/x-ms-asf'
elif type in ['flv', 'f4v']:
type = 'video/x-flv'
elif type in ['mkv']:
type = 'video/x-matroska'
elif type in ['mp3']:
type = 'audio/mpeg'
elif type in ['mp4']:
type = 'video/mp4'
elif type in ['mov']:
type = 'video/quicktime'
elif type in ['ts']:
type = 'video/MP2T'
elif type in ['webm']:
type = 'video/webm'
if type in ['video/3gpp']:
type_info = "3GPP multimedia file (%s)" % type
elif type in ['video/x-flv', 'video/f4v']:
type_info = "Flash video (%s)" % type
elif type in ['video/mp4', 'video/x-m4v']:
type_info = "MPEG-4 video (%s)" % type
elif type in ['video/MP2T']:
type_info = "MPEG-2 transport stream (%s)" % type
elif type in ['video/webm']:
type_info = "WebM video (%s)" % type
#elif type in ['video/ogg']:
# type_info = "Ogg video (%s)" % type
elif type in ['video/quicktime']:
type_info = "QuickTime video (%s)" % type
elif type in ['video/x-matroska']:
type_info = "Matroska video (%s)" % type
#elif type in ['video/x-ms-wmv']:
# type_info = "Windows Media video (%s)" % type
elif type in ['video/x-ms-asf']:
type_info = "Advanced Systems Format (%s)" % type
#elif type in ['video/mpeg']:
# type_info = "MPEG video (%s)" % type
elif type in ['audio/mpeg']:
type_info = "MP3 (%s)" % type
else:
type_info = "Unknown type (%s)" % type
print("Video Site:", site_info)
print("Title: ", unescape_html(tr(title)))
print("Type: ", type_info)
print("Size: ", round(size / 1048576, 2), "MiB (" + str(size) + " Bytes)")
print()
def parse_host(host):
"""Parses host name and port number from a string.
"""
if re.match(r'^(\d+)$', host) is not None:
return ("0.0.0.0", int(host))
if re.match(r'^(\w+)://', host) is None:
host = "//" + host
o = parse.urlparse(host)
hostname = o.hostname or "0.0.0.0"
port = o.port or 0
return (hostname, port)
def set_proxy(proxy):
proxy_handler = request.ProxyHandler({
'http': '%s:%s' % proxy,
'https': '%s:%s' % proxy,
})
opener = request.build_opener(proxy_handler)
request.install_opener(opener)
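
# Illustrative usage: parse_host() produces the (hostname, port) tuple that
# set_proxy() expects (the proxy address is a placeholder):
#
#   >>> parse_host('8080')
#   ('0.0.0.0', 8080)
#   >>> parse_host('proxy.example.com:3128')
#   ('proxy.example.com', 3128)
#   >>> set_proxy(parse_host('proxy.example.com:3128'))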
def unset_proxy():
proxy_handler = request.ProxyHandler({})
opener = request.build_opener(proxy_handler)
request.install_opener(opener)
# DEPRECATED in favor of set_proxy() and unset_proxy()
def set_http_proxy(proxy):
if proxy == None: # Use system default setting
proxy_support = request.ProxyHandler()
elif proxy == '': # Don't use any proxy
proxy_support = request.ProxyHandler({})
else: # Use proxy
proxy_support = request.ProxyHandler({'http': '%s' % proxy, 'https': '%s' % proxy})
opener = request.build_opener(proxy_support)
request.install_opener(opener)
def download_main(download, download_playlist, urls, playlist, **kwargs):
for url in urls:
if url.startswith('https://'):
url = url[8:]
if not url.startswith('http://'):
url = 'http://' + url
if playlist:
download_playlist(url, **kwargs)
else:
download(url, **kwargs)
def script_main(script_name, download, download_playlist = None):
version = 'You-Get %s, a video downloader.' % __version__
help = 'Usage: %s [OPTION]... [URL]...\n' % script_name
help += '''\nStartup options:
-V | --version Display the version and exit.
-h | --help Print this help and exit.
'''
help += '''\nDownload options (use with URLs):
    -f | --force Force overwriting existing files.
-i | --info Display the information of videos without downloading.
-u | --url Display the real URLs of videos without downloading.
    -c | --cookies Load Netscape's cookies.txt file.
-n | --no-merge Don't merge video parts.
-F | --format <STREAM_ID> Video format code.
-o | --output-dir <PATH> Set the output directory for downloaded videos.
-p | --player <PLAYER [options]> Directly play the video with PLAYER like vlc/smplayer.
-x | --http-proxy <HOST:PORT> Use specific HTTP proxy for downloading.
-y | --extractor-proxy <HOST:PORT> Use specific HTTP proxy for extracting stream data.
--no-proxy Don't use any proxy. (ignore $http_proxy)
--debug Show traceback on KeyboardInterrupt.
'''
short_opts = 'Vhfiuc:nF:o:p:x:y:'
opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-merge', 'no-proxy', 'debug', 'format=', 'stream=', 'itag=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang=']
if download_playlist:
short_opts = 'l' + short_opts
opts = ['playlist'] + opts
try:
opts, args = getopt.getopt(sys.argv[1:], short_opts, opts)
except getopt.GetoptError as err:
log.e(err)
log.e("try 'you-get --help' for more options")
sys.exit(2)
global force
global dry_run
global player
global extractor_proxy
global cookies_txt
cookies_txt = None
info_only = False
playlist = False
merge = True
stream_id = None
lang = None
output_dir = '.'
proxy = None
extractor_proxy = None
traceback = False
for o, a in opts:
if o in ('-V', '--version'):
print(version)
sys.exit()
elif o in ('-h', '--help'):
print(version)
print(help)
sys.exit()
elif o in ('-f', '--force'):
force = True
elif o in ('-i', '--info'):
info_only = True
elif o in ('-u', '--url'):
dry_run = True
elif o in ('-c', '--cookies'):
from http import cookiejar
cookies_txt = cookiejar.MozillaCookieJar(a)
cookies_txt.load()
elif o in ('-l', '--playlist'):
playlist = True
elif o in ('-n', '--no-merge'):
merge = False
elif o in ('--no-proxy',):
proxy = ''
elif o in ('--debug',):
traceback = True
elif o in ('-F', '--format', '--stream', '--itag'):
stream_id = a
elif o in ('-o', '--output-dir'):
output_dir = a
elif o in ('-p', '--player'):
player = a
elif o in ('-x', '--http-proxy'):
proxy = a
elif o in ('-y', '--extractor-proxy'):
extractor_proxy = a
elif o in ('--lang',):
lang = a
else:
log.e("try 'you-get --help' for more options")
sys.exit(2)
if not args:
print(help)
sys.exit()
set_http_proxy(proxy)
try:
if stream_id:
download_main(download, download_playlist, args, playlist, stream_id=stream_id, lang=lang, output_dir=output_dir, merge=merge, info_only=info_only)
else:
download_main(download, download_playlist, args, playlist, lang=lang, output_dir=output_dir, merge=merge, info_only=info_only)
except KeyboardInterrupt:
if traceback:
raise
else:
sys.exit(1)
def mime_to_container(mime):
mapping = {
'video/3gpp': '3gp',
'video/mp4': 'mp4',
'video/webm': 'webm',
'video/x-flv': 'flv',
}
if mime in mapping:
return mapping[mime]
else:
return mime.split('/')[1]
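
# Illustrative usage:
#
#   >>> mime_to_container('video/x-flv')
#   'flv'
#   >>> mime_to_container('audio/mp4')  # unmapped types fall back to the subtype
#   'mp4'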
class VideoExtractor():
def __init__(self, *args):
self.url = None
self.title = None
self.vid = None
self.streams = {}
self.streams_sorted = []
self.audiolang = None
if args:
self.url = args[0]
def download_by_url(self, url, **kwargs):
self.url = url
global extractor_proxy
if extractor_proxy:
set_proxy(parse_host(extractor_proxy))
self.prepare(**kwargs)
try:
self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams]
except:
self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams]
self.extract(**kwargs)
if extractor_proxy:
unset_proxy()
self.download(**kwargs)
def download_by_vid(self, vid, **kwargs):
self.vid = vid
global extractor_proxy
if extractor_proxy:
set_proxy(parse_host(extractor_proxy))
self.prepare(**kwargs)
try:
self.streams_sorted = [dict([('id', stream_type['id'])] + list(self.streams[stream_type['id']].items())) for stream_type in self.__class__.stream_types if stream_type['id'] in self.streams]
except:
self.streams_sorted = [dict([('itag', stream_type['itag'])] + list(self.streams[stream_type['itag']].items())) for stream_type in self.__class__.stream_types if stream_type['itag'] in self.streams]
self.extract(**kwargs)
if extractor_proxy:
unset_proxy()
self.download(**kwargs)
def prepare(self, **kwargs):
pass
#raise NotImplementedError()
def extract(self, **kwargs):
pass
#raise NotImplementedError()
def p_stream(self, stream_id):
stream = self.streams[stream_id]
if 'itag' in stream:
print(" - itag: \033[7m%s\033[0m" % stream_id)
else:
print(" - format: \033[7m%s\033[0m" % stream_id)
if 'container' in stream:
print(" container: %s" % stream['container'])
if 'video_profile' in stream:
print(" video-profile: %s" % stream['video_profile'])
if 'quality' in stream:
print(" quality: %s" % stream['quality'])
if 'size' in stream:
print(" size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
if 'itag' in stream:
print(" # download-with: \033[4myou-get --itag=%s [URL]\033[0m" % stream_id)
else:
print(" # download-with: \033[4myou-get --format=%s [URL]\033[0m" % stream_id)
print()
def p_i(self, stream_id):
stream = self.streams[stream_id]
print(" - title: %s" % self.title)
print(" size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size']))
print(" url: %s" % self.url)
print()
def p(self, stream_id=None):
print("site: %s" % self.__class__.name)
print("title: %s" % self.title)
if stream_id:
# Print the stream
print("stream:")
self.p_stream(stream_id)
elif stream_id is None:
# Print stream with best quality
print("stream: # Best quality")
stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag']
self.p_stream(stream_id)
elif stream_id == []:
# Print all available streams
print("streams: # Available quality and codecs")
for stream in self.streams_sorted:
self.p_stream(stream['id'] if 'id' in stream else stream['itag'])
if self.audiolang:
print("audio-languages:")
for i in self.audiolang:
print(" - lang: {}".format(i['lang']))
print(" download-url: {}\n".format(i['url']))
def p_playlist(self, stream_id=None):
print("site: %s" % self.__class__.name)
print("playlist: %s" % self.title)
print("videos:")
def download(self, **kwargs):
if 'info_only' in kwargs and kwargs['info_only']:
if 'stream_id' in kwargs and kwargs['stream_id']:
# Display the stream
stream_id = kwargs['stream_id']
if 'index' not in kwargs:
self.p(stream_id)
else:
self.p_i(stream_id)
else:
# Display all available streams
if 'index' not in kwargs:
self.p([])
else:
stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag']
self.p_i(stream_id)
else:
if 'stream_id' in kwargs and kwargs['stream_id']:
# Download the stream
stream_id = kwargs['stream_id']
else:
# Download stream with the best quality
stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag']
if 'index' not in kwargs:
self.p(None)
else:
self.p_i(stream_id)
urls = self.streams[stream_id]['src']
if not urls:
log.e('[Failed] Cannot extract video source.')
log.e('This is most likely because the video has not been made available in your country.')
log.e('You may try to use a proxy via \'-y\' for extracting stream data.')
exit(1)
download_urls(urls, self.title, self.streams[stream_id]['container'], self.streams[stream_id]['size'], output_dir=kwargs['output_dir'], merge=kwargs['merge'])
self.__init__()
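
# A minimal, hypothetical extractor sketch showing the contract that
# VideoExtractor subclasses are expected to fulfil: prepare() fills in
# self.title and self.streams (keyed by the ids declared in stream_types),
# and download() then picks a stream and hands its 'src' URLs to
# download_urls(). 'ExampleExtractor' and the URLs are placeholders, not a
# real site module.
#
#   class ExampleExtractor(VideoExtractor):
#       name = "Example"
#       stream_types = [{'id': 'normal'}]
#
#       def prepare(self, **kwargs):
#           self.title = 'sample video'
#           self.streams['normal'] = {
#               'container': 'mp4',
#               'src': ['http://example.com/sample.mp4'],
#               'size': 0,
#           }
#
#   site = ExampleExtractor()
#   site.download_by_url('http://example.com/watch?v=1',
#                        output_dir='.', merge=True, info_only=True)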