you-get/src/you_get/common.py

#!/usr/bin/env python
SITES = {
'163' : 'netease',
'56' : 'w56',
'acfun' : 'acfun',
'archive' : 'archive',
'baidu' : 'baidu',
'bandcamp' : 'bandcamp',
'baomihua' : 'baomihua',
'bigthink' : 'bigthink',
'bilibili' : 'bilibili',
'cctv' : 'cntv',
'cntv' : 'cntv',
'cbs' : 'cbs',
'dailymotion' : 'dailymotion',
'dilidili' : 'dilidili',
'douban' : 'douban',
'douyu' : 'douyutv',
'ehow' : 'ehow',
'facebook' : 'facebook',
'fantasy' : 'fantasy',
'fc2' : 'fc2video',
'flickr' : 'flickr',
'freesound' : 'freesound',
'fun' : 'funshion',
'google' : 'google',
'giphy' : 'giphy',
'heavy-music' : 'heavymusic',
'huaban' : 'huaban',
'huomao' : 'huomaotv',
'iask' : 'sina',
'icourses' : 'icourses',
'ifeng' : 'ifeng',
'imgur' : 'imgur',
'in' : 'alive',
'infoq' : 'infoq',
'instagram' : 'instagram',
'interest' : 'interest',
'iqilu' : 'iqilu',
'iqiyi' : 'iqiyi',
'isuntv' : 'suntv',
'joy' : 'joy',
'kankanews' : 'bilibili',
'khanacademy' : 'khan',
'ku6' : 'ku6',
'kugou' : 'kugou',
'kuwo' : 'kuwo',
'le' : 'le',
'letv' : 'le',
'lizhi' : 'lizhi',
'magisto' : 'magisto',
'metacafe' : 'metacafe',
'mgtv' : 'mgtv',
'miomio' : 'miomio',
'mixcloud' : 'mixcloud',
'mtv81' : 'mtv81',
'musicplayon' : 'musicplayon',
'naver' : 'naver',
'7gogo' : 'nanagogo',
'nicovideo' : 'nicovideo',
'panda' : 'panda',
'pinterest' : 'pinterest',
'pixnet' : 'pixnet',
'pptv' : 'pptv',
'qingting' : 'qingting',
'qq' : 'qq',
'quanmin' : 'quanmin',
'showroom-live' : 'showroom',
'sina' : 'sina',
'smgbb' : 'bilibili',
'sohu' : 'sohu',
'soundcloud' : 'soundcloud',
'ted' : 'ted',
'theplatform' : 'theplatform',
'tucao' : 'tucao',
'tudou' : 'tudou',
'tumblr' : 'tumblr',
'twimg' : 'twitter',
'twitter' : 'twitter',
'ucas' : 'ucas',
'videomega' : 'videomega',
'vidto' : 'vidto',
'vimeo' : 'vimeo',
'wanmen' : 'wanmen',
'weibo' : 'miaopai',
'veoh' : 'veoh',
'vine' : 'vine',
'vk' : 'vk',
'xiami' : 'xiami',
'xiaokaxiu' : 'yixia',
'xiaojiadianvideo' : 'fc2video',
'ximalaya' : 'ximalaya',
'yinyuetai' : 'yinyuetai',
'miaopai' : 'yixia',
'yizhibo' : 'yizhibo',
'youku' : 'youku',
'iwara' : 'iwara',
'youtu' : 'youtube',
'youtube' : 'youtube',
'zhanqi' : 'zhanqi',
}
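# A short note, grounded in url_to_module() below: each key is a keyword taken
# from a URL's domain name, and each value names the extractor module under
# you_get.extractors. For example, 'youtu' maps to 'youtube', so a youtu.be
# link is dispatched to you_get.extractors.youtube.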
import json
import locale
import logging
import os
import re
import socket
import sys
import time
from urllib import request, parse, error
from http import cookiejar
from importlib import import_module
import argparse
from .version import __version__
from .util import log, term
from .util.git import get_version
from .util.strings import get_filename, unescape_html
from . import json_output as json_output_
dry_run = False
json_output = False
force = False
player = None
extractor_proxy = None
cookies = None
output_filename = None
fake_headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'UTF-8,*;q=0.5',
'Accept-Encoding': 'gzip,deflate,sdch',
'Accept-Language': 'en-US,en;q=0.8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
}
if sys.stdout.isatty():
default_encoding = sys.stdout.encoding.lower()
else:
default_encoding = locale.getpreferredencoding().lower()
def rc4(key, data):
    # all encryption algorithms should work on bytes
    assert type(key) == type(data) and type(key) == bytes
state = list(range(256))
j = 0
for i in range(256):
j += state[i] + key[i % len(key)]
j &= 0xff
state[i], state[j] = state[j], state[i]
i = 0
j = 0
out_list = []
for char in data:
i += 1
i &= 0xff
j += state[i]
j &= 0xff
state[i], state[j] = state[j], state[i]
prn = state[(state[i] + state[j]) & 0xff]
out_list.append(char ^ prn)
return bytes(out_list)
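# Usage sketch (made-up key and plaintext, for illustration only): RC4 is a
# symmetric stream cipher, so applying it twice with the same key restores
# the original bytes.
#
#     ciphertext = rc4(b'example-key', b'some plaintext')
#     assert rc4(b'example-key', ciphertext) == b'some plaintext'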
def general_m3u8_extractor(url, headers={}):
m3u8_list = get_content(url, headers=headers).split('\n')
urls = []
for line in m3u8_list:
line = line.strip()
if line and not line.startswith('#'):
if line.startswith('http'):
urls.append(line)
else:
seg_url = parse.urljoin(url, line)
urls.append(seg_url)
return urls
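# Usage sketch (hypothetical playlist URL): non-comment lines of the playlist
# come back as absolute URLs; relative segment paths are resolved against the
# playlist URL via parse.urljoin().
#
#     general_m3u8_extractor('http://example.com/hls/index.m3u8')
#     # -> ['http://example.com/hls/seg-0.ts', 'http://example.com/hls/seg-1.ts', ...]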
def maybe_print(*s):
try: print(*s)
except: pass
def tr(s):
if default_encoding == 'utf-8':
return s
else:
return s
#return str(s.encode('utf-8'))[2:-1]
# DEPRECATED in favor of match1()
def r1(pattern, text):
m = re.search(pattern, text)
if m:
return m.group(1)
# DEPRECATED in favor of match1()
def r1_of(patterns, text):
for p in patterns:
x = r1(p, text)
if x:
return x
def match1(text, *patterns):
"""Scans through a string for substrings matched some patterns (first-subgroups only).
Args:
text: A string to be scanned.
patterns: Arbitrary number of regex patterns.
Returns:
    When only one pattern is given, returns a string (None if no match is found).
    When more than one pattern is given, returns a list of strings ([] if no match is found).
"""
if len(patterns) == 1:
pattern = patterns[0]
match = re.search(pattern, text)
if match:
return match.group(1)
else:
return None
else:
ret = []
for pattern in patterns:
match = re.search(pattern, text)
if match:
ret.append(match.group(1))
return ret
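# Usage sketch (made-up inputs):
#
#     match1('hello_world', r'hello_(\w+)')    # -> 'world'
#     match1('v1.2', r'v(\d+)', r'\.(\d+)')    # -> ['1', '2']
#     match1('no match here', r'hello_(\w+)')  # -> None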
def matchall(text, patterns):
"""Scans through a string for substrings matched some patterns.
Args:
text: A string to be scanned.
patterns: a list of regex pattern.
Returns:
a list if matched. empty if not.
"""
ret = []
for pattern in patterns:
match = re.findall(pattern, text)
ret += match
return ret
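# Usage sketch (made-up input): matches are concatenated pattern by pattern,
# not in their order of appearance in the text.
#
#     matchall('a=1 b=2 a=3', [r'a=(\d)', r'b=(\d)'])  # -> ['1', '3', '2']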
def launch_player(player, urls):
import subprocess
import shlex
subprocess.call(shlex.split(player) + list(urls))
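# Usage sketch (assumes some player binary, e.g. mpv, is on PATH; the command
# string may carry extra player options, since it is split with shlex):
#
#     launch_player('mpv --fs', ['http://example.com/video.mp4'])
#     # runs: mpv --fs http://example.com/video.mp4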
def parse_query_param(url, param):
"""Parses the query string of a URL and returns the value of a parameter.
Args:
url: A URL.
param: A string representing the name of the parameter.
Returns:
The value of the parameter.
"""
try:
return parse.parse_qs(parse.urlparse(url).query)[param][0]
except:
return None
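# Usage sketch (made-up URL):
#
#     parse_query_param('http://example.com/watch?v=abc&t=10', 'v')  # -> 'abc'
#     parse_query_param('http://example.com/watch', 'v')             # -> None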
def unicodize(text):
return re.sub(r'\\u([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])', lambda x: chr(int(x.group(0)[2:], 16)), text)
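# Usage sketch: turns literal \uXXXX escapes (as found in raw JSON text) into
# the characters they encode.
#
#     unicodize(r'caf\u00e9')  # -> 'café'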
# DEPRECATED in favor of util.legitimize()
def escape_file_path(path):
path = path.replace('/', '-')
path = path.replace('\\', '-')
path = path.replace('*', '-')
path = path.replace('?', '-')
return path
def ungzip(data):
"""Decompresses data for Content-Encoding: gzip.
"""
from io import BytesIO
import gzip
buffer = BytesIO(data)
f = gzip.GzipFile(fileobj=buffer)
return f.read()
def undeflate(data):
"""Decompresses data for Content-Encoding: deflate.
(the zlib compression is used.)
"""
import zlib
decompressobj = zlib.decompressobj(-zlib.MAX_WBITS)
return decompressobj.decompress(data)+decompressobj.flush()
# DEPRECATED in favor of get_content()
def get_response(url, faker = False):
logging.debug('get_response: %s' % url)
# install cookies
if cookies:
opener = request.build_opener(request.HTTPCookieProcessor(cookies))
request.install_opener(opener)
if faker:
response = request.urlopen(request.Request(url, headers = fake_headers), None)
else:
response = request.urlopen(url)
data = response.read()
if response.info().get('Content-Encoding') == 'gzip':
data = ungzip(data)
elif response.info().get('Content-Encoding') == 'deflate':
data = undeflate(data)
response.data = data
return response
# DEPRECATED in favor of get_content()
def get_html(url, encoding = None, faker = False):
content = get_response(url, faker).data
return str(content, 'utf-8', 'ignore')
# DEPRECATED in favor of get_content()
def get_decoded_html(url, faker = False):
response = get_response(url, faker)
data = response.data
charset = r1(r'charset=([\w-]+)', response.headers['content-type'])
if charset:
return data.decode(charset, 'ignore')
else:
return data
def get_location(url):
logging.debug('get_location: %s' % url)
response = request.urlopen(url)
# urllib will follow redirections and it's too much code to tell urllib
# not to do that
return response.geturl()
def urlopen_with_retry(*args, **kwargs):
retry_time = 3
for i in range(retry_time):
try:
return request.urlopen(*args, **kwargs)
except socket.timeout as e:
logging.debug('request attempt %s timeout' % str(i + 1))
if i + 1 == retry_time:
raise e
# try to tackle youku CDN fails
except error.HTTPError as http_error:
            logging.debug('HTTP Error with code {}'.format(http_error.code))
if i + 1 == retry_time:
raise http_error
def get_content(url, headers={}, decoded=True):
"""Gets the content of a URL via sending a HTTP GET request.
Args:
url: A URL.
headers: Request headers used by the client.
        decoded: Whether to decode the response body using UTF-8 or the charset specified in Content-Type.
Returns:
The content as a string.
"""
logging.debug('get_content: %s' % url)
req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
response = urlopen_with_retry(req)
data = response.read()
# Handle HTTP compression for gzip and deflate (zlib)
content_encoding = response.getheader('Content-Encoding')
if content_encoding == 'gzip':
data = ungzip(data)
elif content_encoding == 'deflate':
data = undeflate(data)
# Decode the response body
if decoded:
charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)')
if charset is not None:
data = data.decode(charset)
else:
data = data.decode('utf-8', 'ignore')
return data
def post_content(url, headers={}, post_data={}, decoded=True):
"""Post the content of a URL via sending a HTTP POST request.
Args:
url: A URL.
headers: Request headers used by the client.
decoded: Whether decode the response body using UTF-8 or the charset specified in Content-Type.
Returns:
The content as a string.
"""
logging.debug('post_content: %s \n post_data: %s' % (url, post_data))
req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
response = urlopen_with_retry(req, data=post_data_enc)
data = response.read()
# Handle HTTP compression for gzip and deflate (zlib)
content_encoding = response.getheader('Content-Encoding')
if content_encoding == 'gzip':
data = ungzip(data)
elif content_encoding == 'deflate':
data = undeflate(data)
# Decode the response body
if decoded:
charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)')
if charset is not None:
data = data.decode(charset)
else:
data = data.decode('utf-8')
return data
def url_size(url, faker = False, headers = {}):
if faker:
response = urlopen_with_retry(request.Request(url, headers=fake_headers))
elif headers:
response = urlopen_with_retry(request.Request(url, headers=headers))
else:
response = urlopen_with_retry(url)
    size = response.headers['content-length']
    return int(size) if size is not None else float('inf')
def urls_size(urls, faker = False, headers = {}):
return sum([url_size(url, faker=faker, headers=headers) for url in urls])
def get_head(url, headers = {}, get_method = 'HEAD'):
logging.debug('get_head: %s' % url)
if headers:
req = request.Request(url, headers=headers)
else:
req = request.Request(url)
req.get_method = lambda: get_method
res = urlopen_with_retry(req)
return dict(res.headers)
def url_info(url, faker = False, headers = {}):
logging.debug('url_info: %s' % url)
if faker:
response = urlopen_with_retry(request.Request(url, headers=fake_headers))
elif headers:
response = urlopen_with_retry(request.Request(url, headers=headers))
else:
response = urlopen_with_retry(request.Request(url))
headers = response.headers
type = headers['content-type']
    if type == 'image/jpg; charset=UTF-8' or type == 'image/jpg': type = 'audio/mpeg'  # fix for netease
mapping = {
'video/3gpp': '3gp',
'video/f4v': 'flv',
'video/mp4': 'mp4',
'video/MP2T': 'ts',
'video/quicktime': 'mov',
'video/webm': 'webm',
'video/x-flv': 'flv',
'video/x-ms-asf': 'asf',
'audio/mp4': 'mp4',
'audio/mpeg': 'mp3',
'audio/wav': 'wav',
'audio/x-wav': 'wav',
'audio/wave': 'wav',
'image/jpeg': 'jpg',
'image/png': 'png',
'image/gif': 'gif',
'application/pdf': 'pdf',
}
if type in mapping:
ext = mapping[type]
else:
type = None
if headers['content-disposition']:
try:
filename = parse.unquote(r1(r'filename="?([^"]+)"?', headers['content-disposition']))
if len(filename.split('.')) > 1:
ext = filename.split('.')[-1]
else:
ext = None
except:
ext = None
else:
ext = None
if headers['transfer-encoding'] != 'chunked':
size = headers['content-length'] and int(headers['content-length'])
else:
size = None
return type, ext, size
def url_locations(urls, faker = False, headers = {}):
locations = []
for url in urls:
logging.debug('url_locations: %s' % url)
if faker:
response = urlopen_with_retry(request.Request(url, headers=fake_headers))
elif headers:
response = urlopen_with_retry(request.Request(url, headers=headers))
else:
response = urlopen_with_retry(request.Request(url))
locations.append(response.url)
return locations
def url_save(url, filepath, bar, refer=None, is_part=False, faker=False, headers=None, timeout=None, **kwargs):
tmp_headers = headers.copy() if headers is not None else {}
    # When a referer is specified via the refer param, the header key must be 'Referer' for the hack here
if refer is not None:
tmp_headers['Referer'] = refer
file_size = url_size(url, faker=faker, headers=tmp_headers)
if os.path.exists(filepath):
if not force and file_size == os.path.getsize(filepath):
if not is_part:
if bar:
bar.done()
print('Skipping %s: file already exists' % tr(os.path.basename(filepath)))
else:
if bar:
bar.update_received(file_size)
return
else:
if not is_part:
if bar:
bar.done()
print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
elif not os.path.exists(os.path.dirname(filepath)):
os.mkdir(os.path.dirname(filepath))
    temp_filepath = filepath + '.download' if file_size != float('inf') else filepath
received = 0
if not force:
open_mode = 'ab'
if os.path.exists(temp_filepath):
received += os.path.getsize(temp_filepath)
if bar:
bar.update_received(os.path.getsize(temp_filepath))
else:
open_mode = 'wb'
if received < file_size:
if faker:
tmp_headers = fake_headers
'''
if parameter headers passed in, we have it copied as tmp_header
elif headers:
headers = headers
else:
headers = {}
'''
if received:
tmp_headers['Range'] = 'bytes=' + str(received) + '-'
if refer:
tmp_headers['Referer'] = refer
if timeout:
response = urlopen_with_retry(request.Request(url, headers=tmp_headers), timeout=timeout)
else:
response = urlopen_with_retry(request.Request(url, headers=tmp_headers))
try:
range_start = int(response.headers['content-range'][6:].split('/')[0].split('-')[0])
end_length = int(response.headers['content-range'][6:].split('/')[1])
range_length = end_length - range_start
except:
content_length = response.headers['content-length']
            range_length = int(content_length) if content_length is not None else float('inf')
if file_size != received + range_length:
received = 0
if bar:
bar.received = 0
open_mode = 'wb'
with open(temp_filepath, open_mode) as output:
while True:
buffer = None
try:
buffer = response.read(1024 * 256)
except socket.timeout:
pass
if not buffer:
if received == file_size: # Download finished
break
# Unexpected termination. Retry request
tmp_headers['Range'] = 'bytes=' + str(received) + '-'
response = urlopen_with_retry(request.Request(url, headers=tmp_headers))
continue
output.write(buffer)
received += len(buffer)
if bar:
bar.update_received(len(buffer))
assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (received, os.path.getsize(temp_filepath), temp_filepath)
if os.access(filepath, os.W_OK):
os.remove(filepath) # on Windows rename could fail if destination filepath exists
os.rename(temp_filepath, filepath)
class SimpleProgressBar:
term_size = term.get_terminal_size()[1]
def __init__(self, total_size, total_pieces = 1):
self.displayed = False
self.total_size = total_size
self.total_pieces = total_pieces
self.current_piece = 1
self.received = 0
self.speed = ''
self.last_updated = time.time()
total_pieces_len = len(str(total_pieces))
        # 28 is the size of all statically known parts in self.bar
total_str = '%5s' % round(self.total_size / 1048576, 1)
total_str_width = max(len(total_str), 5)
self.bar_size = self.term_size - 28 - 2*total_pieces_len - 2*total_str_width
self.bar = '{:>4}%% ({:>%s}/%sMB) ├{:─<%s}┤[{:>%s}/{:>%s}] {}' % (
total_str_width, total_str, self.bar_size, total_pieces_len, total_pieces_len)
def update(self):
self.displayed = True
bar_size = self.bar_size
percent = round(self.received * 100 / self.total_size, 1)
if percent >= 100:
percent = 100
dots = bar_size * int(percent) // 100
plus = int(percent) - dots // bar_size * 100
        if plus > 0.8:
            plus = '█'
elif plus > 0.4:
plus = '>'
else:
plus = ''
        bar = '█' * dots + plus
bar = self.bar.format(percent, round(self.received / 1048576, 1), bar, self.current_piece, self.total_pieces, self.speed)
sys.stdout.write('\r' + bar)
sys.stdout.flush()
def update_received(self, n):
self.received += n
time_diff = time.time() - self.last_updated
bytes_ps = n / time_diff if time_diff else 0
if bytes_ps >= 1024 ** 3:
self.speed = '{:4.0f} GB/s'.format(bytes_ps / 1024 ** 3)
elif bytes_ps >= 1024 ** 2:
self.speed = '{:4.0f} MB/s'.format(bytes_ps / 1024 ** 2)
elif bytes_ps >= 1024:
self.speed = '{:4.0f} kB/s'.format(bytes_ps / 1024)
else:
self.speed = '{:4.0f} B/s'.format(bytes_ps)
self.last_updated = time.time()
self.update()
def update_piece(self, n):
self.current_piece = n
def done(self):
if self.displayed:
print()
self.displayed = False
class PiecesProgressBar:
def __init__(self, total_size, total_pieces = 1):
self.displayed = False
self.total_size = total_size
self.total_pieces = total_pieces
self.current_piece = 1
self.received = 0
def update(self):
self.displayed = True
bar = '{0:>5}%[{1:<40}] {2}/{3}'.format('', '=' * 40, self.current_piece, self.total_pieces)
sys.stdout.write('\r' + bar)
sys.stdout.flush()
def update_received(self, n):
self.received += n
self.update()
def update_piece(self, n):
self.current_piece = n
def done(self):
if self.displayed:
print()
self.displayed = False
class DummyProgressBar:
def __init__(self, *args):
pass
def update_received(self, n):
pass
def update_piece(self, n):
pass
def done(self):
pass
def get_output_filename(urls, title, ext, output_dir, merge):
# lame hack for the --output-filename option
global output_filename
if output_filename:
if ext:
return output_filename + '.' + ext
return output_filename
merged_ext = ext
if (len(urls) > 1) and merge:
from .processor.ffmpeg import has_ffmpeg_installed
if ext in ['flv', 'f4v']:
if has_ffmpeg_installed():
merged_ext = 'mp4'
else:
merged_ext = 'flv'
elif ext == 'mp4':
merged_ext = 'mp4'
elif ext == 'ts':
if has_ffmpeg_installed():
merged_ext = 'mkv'
else:
merged_ext = 'ts'
return '%s.%s' % (title, merged_ext)
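# Behavior sketch (hypothetical arguments): when several FLV parts are to be
# merged, the final container depends on whether FFmpeg is available.
#
#     get_output_filename(['u1', 'u2'], 'title', 'flv', '.', True)
#     # -> 'title.mp4' if FFmpeg is installed, else 'title.flv'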
def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}, **kwargs):
assert urls
if json_output:
json_output_.download_urls(urls=urls, title=title, ext=ext, total_size=total_size, refer=refer)
return
if dry_run:
print('Real URLs:\n%s' % '\n'.join(urls))
return
if player:
launch_player(player, urls)
return
if not total_size:
try:
total_size = urls_size(urls, faker=faker, headers=headers)
except:
import traceback
traceback.print_exc(file=sys.stdout)
pass
title = tr(get_filename(title))
output_filename = get_output_filename(urls, title, ext, output_dir, merge)
output_filepath = os.path.join(output_dir, output_filename)
if total_size:
if not force and os.path.exists(output_filepath) and os.path.getsize(output_filepath) >= total_size * 0.9:
print('Skipping %s: file already exists' % output_filepath)
print()
return
bar = SimpleProgressBar(total_size, len(urls))
else:
bar = PiecesProgressBar(total_size, len(urls))
if len(urls) == 1:
url = urls[0]
print('Downloading %s ...' % tr(output_filename))
bar.update()
url_save(url, output_filepath, bar, refer = refer, faker = faker, headers = headers, **kwargs)
bar.done()
else:
parts = []
print('Downloading %s.%s ...' % (tr(title), ext))
bar.update()
for i, url in enumerate(urls):
filename = '%s[%02d].%s' % (title, i, ext)
filepath = os.path.join(output_dir, filename)
parts.append(filepath)
#print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls))
bar.update_piece(i + 1)
url_save(url, filepath, bar, refer = refer, is_part = True, faker = faker, headers = headers, **kwargs)
bar.done()
if not merge:
print()
return
if 'av' in kwargs and kwargs['av']:
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_av
ret = ffmpeg_concat_av(parts, output_filepath, ext)
print('Merged into %s' % output_filename)
if ret == 0:
for part in parts: os.remove(part)
elif ext in ['flv', 'f4v']:
try:
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_flv_to_mp4
ffmpeg_concat_flv_to_mp4(parts, output_filepath)
else:
from .processor.join_flv import concat_flv
concat_flv(parts, output_filepath)
print('Merged into %s' % output_filename)
except:
raise
else:
for part in parts:
os.remove(part)
elif ext == 'mp4':
try:
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_mp4_to_mp4
ffmpeg_concat_mp4_to_mp4(parts, output_filepath)
else:
from .processor.join_mp4 import concat_mp4
concat_mp4(parts, output_filepath)
print('Merged into %s' % output_filename)
except:
raise
else:
for part in parts:
os.remove(part)
elif ext == "ts":
try:
from .processor.ffmpeg import has_ffmpeg_installed
if has_ffmpeg_installed():
from .processor.ffmpeg import ffmpeg_concat_ts_to_mkv
ffmpeg_concat_ts_to_mkv(parts, output_filepath)
else:
from .processor.join_ts import concat_ts
concat_ts(parts, output_filepath)
print('Merged into %s' % output_filename)
except:
raise
else:
for part in parts:
os.remove(part)
else:
print("Can't merge %s files" % ext)
print()
def download_rtmp_url(url, title, ext, params={}, total_size=0, output_dir='.', refer=None, merge=True, faker=False):
assert url
if dry_run:
print('Real URL:\n%s\n' % [url])
if params.get("-y",False): #None or unset ->False
print('Real Playpath:\n%s\n' % [params.get("-y")])
return
if player:
from .processor.rtmpdump import play_rtmpdump_stream
play_rtmpdump_stream(player, url, params)
return
from .processor.rtmpdump import has_rtmpdump_installed, download_rtmpdump_stream
assert has_rtmpdump_installed(), "RTMPDump not installed."
    download_rtmpdump_stream(url, title, ext, params, output_dir)
def download_url_ffmpeg(url, title, ext, params={}, total_size=0, output_dir='.', refer=None, merge=True, faker=False, stream=True):
assert url
if dry_run:
print('Real URL:\n%s\n' % [url])
if params.get("-y",False): #None or unset ->False
print('Real Playpath:\n%s\n' % [params.get("-y")])
return
if player:
launch_player(player, [url])
return
from .processor.ffmpeg import has_ffmpeg_installed, ffmpeg_download_stream
assert has_ffmpeg_installed(), "FFmpeg not installed."
global output_filename
if output_filename:
dotPos = output_filename.rfind(".")
title = output_filename[:dotPos]
ext = output_filename[dotPos+1:]
title = tr(get_filename(title))
ffmpeg_download_stream(url, title, ext, params, output_dir, stream=stream)
def playlist_not_supported(name):
def f(*args, **kwargs):
raise NotImplementedError('Playlist is not supported for ' + name)
return f
def print_info(site_info, title, type, size, **kwargs):
if json_output:
json_output_.print_info(site_info=site_info, title=title, type=type, size=size)
return
if type:
type = type.lower()
if type in ['3gp']:
type = 'video/3gpp'
elif type in ['asf', 'wmv']:
type = 'video/x-ms-asf'
elif type in ['flv', 'f4v']:
type = 'video/x-flv'
elif type in ['mkv']:
type = 'video/x-matroska'
elif type in ['mp3']:
type = 'audio/mpeg'
elif type in ['mp4']:
type = 'video/mp4'
elif type in ['mov']:
type = 'video/quicktime'
elif type in ['ts']:
type = 'video/MP2T'
elif type in ['webm']:
type = 'video/webm'
elif type in ['jpg']:
type = 'image/jpeg'
elif type in ['png']:
type = 'image/png'
elif type in ['gif']:
type = 'image/gif'
if type in ['video/3gpp']:
type_info = "3GPP multimedia file (%s)" % type
elif type in ['video/x-flv', 'video/f4v']:
type_info = "Flash video (%s)" % type
elif type in ['video/mp4', 'video/x-m4v']:
type_info = "MPEG-4 video (%s)" % type
elif type in ['video/MP2T']:
type_info = "MPEG-2 transport stream (%s)" % type
elif type in ['video/webm']:
type_info = "WebM video (%s)" % type
#elif type in ['video/ogg']:
# type_info = "Ogg video (%s)" % type
elif type in ['video/quicktime']:
type_info = "QuickTime video (%s)" % type
elif type in ['video/x-matroska']:
type_info = "Matroska video (%s)" % type
#elif type in ['video/x-ms-wmv']:
# type_info = "Windows Media video (%s)" % type
elif type in ['video/x-ms-asf']:
type_info = "Advanced Systems Format (%s)" % type
#elif type in ['video/mpeg']:
# type_info = "MPEG video (%s)" % type
elif type in ['audio/mp4', 'audio/m4a']:
type_info = "MPEG-4 audio (%s)" % type
elif type in ['audio/mpeg']:
type_info = "MP3 (%s)" % type
elif type in ['audio/wav', 'audio/wave', 'audio/x-wav']:
type_info = 'Waveform Audio File Format ({})'.format(type)
elif type in ['image/jpeg']:
type_info = "JPEG Image (%s)" % type
elif type in ['image/png']:
type_info = "Portable Network Graphics (%s)" % type
elif type in ['image/gif']:
type_info = "Graphics Interchange Format (%s)" % type
elif type in ['m3u8']:
if 'm3u8_type' in kwargs:
if kwargs['m3u8_type'] == 'master':
type_info = 'M3U8 Master {}'.format(type)
else:
type_info = 'M3U8 Playlist {}'.format(type)
else:
type_info = "Unknown type (%s)" % type
maybe_print("Site: ", site_info)
maybe_print("Title: ", unescape_html(tr(title)))
print("Type: ", type_info)
if type != 'm3u8':
print("Size: ", round(size / 1048576, 2), "MiB (" + str(size) + " Bytes)")
if type == 'm3u8' and 'm3u8_url' in kwargs:
print('M3U8 Url: {}'.format(kwargs['m3u8_url']))
print()
def mime_to_container(mime):
mapping = {
'video/3gpp': '3gp',
'video/mp4': 'mp4',
'video/webm': 'webm',
'video/x-flv': 'flv',
}
if mime in mapping:
return mapping[mime]
else:
return mime.split('/')[1]
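# Usage sketch:
#
#     mime_to_container('video/x-flv')      # -> 'flv' (explicit mapping)
#     mime_to_container('application/pdf')  # -> 'pdf' (falls back to the MIME subtype)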
def parse_host(host):
"""Parses host name and port number from a string.
"""
if re.match(r'^(\d+)$', host) is not None:
return ("0.0.0.0", int(host))
if re.match(r'^(\w+)://', host) is None:
host = "//" + host
o = parse.urlparse(host)
hostname = o.hostname or "0.0.0.0"
port = o.port or 0
return (hostname, port)
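# Usage sketch (made-up addresses):
#
#     parse_host('8080')            # -> ('0.0.0.0', 8080)
#     parse_host('127.0.0.1:1080')  # -> ('127.0.0.1', 1080)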
def set_proxy(proxy):
proxy_handler = request.ProxyHandler({
'http': '%s:%s' % proxy,
'https': '%s:%s' % proxy,
})
opener = request.build_opener(proxy_handler)
request.install_opener(opener)
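# Note: unlike set_http_proxy() below, set_proxy() expects a (host, port)
# tuple rather than a string, e.g. the result of parse_host():
#
#     set_proxy(parse_host('127.0.0.1:8087'))  # hypothetical local proxy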
def unset_proxy():
proxy_handler = request.ProxyHandler({})
opener = request.build_opener(proxy_handler)
request.install_opener(opener)
# DEPRECATED in favor of set_proxy() and unset_proxy()
def set_http_proxy(proxy):
if proxy == None: # Use system default setting
proxy_support = request.ProxyHandler()
elif proxy == '': # Don't use any proxy
proxy_support = request.ProxyHandler({})
else: # Use proxy
proxy_support = request.ProxyHandler({'http': '%s' % proxy, 'https': '%s' % proxy})
opener = request.build_opener(proxy_support)
request.install_opener(opener)
def print_more_compatible(*args, **kwargs):
    """Overloads the default print function, as Python < 3.3 does not support the 'flush' keyword.
    Although the function could be named print to get the built-in overloaded automatically,
    I'd rather leave it with a different name and only overload it when importing, to avoid confusion."""
    import builtins as __builtin__
# nothing happens on py3.3 and later
if sys.version_info[:2] >= (3, 3):
return __builtin__.print(*args, **kwargs)
# in lower pyver (e.g. 3.2.x), remove 'flush' keyword and flush it as requested
doFlush = kwargs.pop('flush', False)
ret = __builtin__.print(*args, **kwargs)
if doFlush:
kwargs.get('file', sys.stdout).flush()
return ret
def download_main(download, download_playlist, urls, playlist, **kwargs):
for url in urls:
if re.match(r'https?://', url) is None:
url = 'http://' + url
if playlist:
download_playlist(url, **kwargs)
else:
download(url, **kwargs)
def load_cookies(cookiefile):
global cookies
try:
cookies = cookiejar.MozillaCookieJar(cookiefile)
cookies.load()
except Exception:
import sqlite3
cookies = cookiejar.MozillaCookieJar()
con = sqlite3.connect(cookiefile)
cur = con.cursor()
try:
cur.execute("""SELECT host, path, isSecure, expiry, name, value
FROM moz_cookies""")
for item in cur.fetchall():
c = cookiejar.Cookie(
0, item[4], item[5], None, False, item[0],
item[0].startswith('.'), item[0].startswith('.'),
item[1], False, item[2], item[3], item[3]=="", None,
None, {},
)
cookies.set_cookie(c)
except Exception:
pass
# TODO: Chromium Cookies
# SELECT host_key, path, secure, expires_utc, name, encrypted_value
# FROM cookies
# http://n8henrie.com/2013/11/use-chromes-cookies-for-easier-downloading-with-python-requests/
def set_socks_proxy(proxy):
try:
import socks
socks_proxy_addrs = proxy.split(':')
socks.set_default_proxy(socks.SOCKS5,
socks_proxy_addrs[0],
int(socks_proxy_addrs[1]))
socket.socket = socks.socksocket
def getaddrinfo(*args):
return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))]
socket.getaddrinfo = getaddrinfo
except ImportError:
        log.w('Error importing PySocks library, socks proxy ignored. '
              'In order to use socks proxy, please install PySocks.')
def script_main(download, download_playlist, **kwargs):
logging.basicConfig(format='[%(levelname)s] %(message)s')
def print_version():
log.i('version %s, a tiny downloader that scrapes the web.'
% get_version(kwargs['repo_path']
if 'repo_path' in kwargs else __version__))
parser = argparse.ArgumentParser(
prog='you-get',
usage='you-get [OPTION]... URL...',
description='A tiny downloader that scrapes the web',
add_help=False,
)
parser.add_argument('-V', '--version', action='store_true',
help='Print version and exit')
parser.add_argument('-h', '--help', action='store_true',
help='Print this help message and exit')
dry_run_grp = parser.add_argument_group('Dry-run options', '(no actual downloading)')
dry_run_grp = dry_run_grp.add_mutually_exclusive_group()
dry_run_grp.add_argument('-i', '--info', action='store_true',
help='Print extracted information')
dry_run_grp.add_argument('-u', '--url', action='store_true',
help='Print extracted information with URLs')
dry_run_grp.add_argument('--json', action='store_true',
help='Print extracted URLs in JSON format')
download_grp = parser.add_argument_group('Download options')
download_grp.add_argument('-n', '--no-merge', action='store_true', default=False,
help='Do not merge video parts')
download_grp.add_argument('--no-caption', action='store_true',
help='Do not download captions (subtitles, lyrics, danmaku, ...)')
download_grp.add_argument('-f', '--force', action='store_true', default=False,
help='Force overwriting existing files')
download_grp.add_argument('-F', '--format', metavar='STREAM_ID',
help='Set video format to STREAM_ID')
download_grp.add_argument('-O', '--output-filename', metavar='FILE',
help='Set output filename')
download_grp.add_argument('-o', '--output-dir', metavar='DIR', default='.',
help='Set output directory')
download_grp.add_argument('-p', '--player', metavar='PLAYER',
help='Stream extracted URL to a PLAYER')
download_grp.add_argument('-c', '--cookies', metavar='COOKIES_FILE',
help='Load cookies.txt or cookies.sqlite')
download_grp.add_argument('-t', '--timeout', metavar='SECONDS', type=int, default=600,
help='Set socket timeout')
download_grp.add_argument('-d', '--debug', action='store_true',
help='Show traceback and other debug info')
download_grp.add_argument('-I', '--input-file', metavar='FILE', type=argparse.FileType('r'),
help='Read non-playlist URLs from FILE')
download_grp.add_argument('-P', '--password',
help='Set video visit password to PASSWORD')
download_grp.add_argument('-l', '--playlist', action='store_true',
help='Prefer to download a playlist')
proxy_grp = parser.add_argument_group('Proxy options')
proxy_grp = proxy_grp.add_mutually_exclusive_group()
proxy_grp.add_argument('-x', '--http-proxy', metavar='HOST:PORT',
help='Use an HTTP proxy for downloading')
proxy_grp.add_argument('-y', '--extractor-proxy', metavar='HOST:PORT',
help='Use an HTTP proxy for extracting only')
proxy_grp.add_argument('--no-proxy', action='store_true',
help='Never use a proxy')
    proxy_grp.add_argument('-s', '--socks-proxy', metavar='HOST:PORT',
                           help='Use a SOCKS5 proxy for downloading')
download_grp.add_argument('--stream',
help=argparse.SUPPRESS)
download_grp.add_argument('--itag',
help=argparse.SUPPRESS)
parser.add_argument('URL', nargs='*',
help=argparse.SUPPRESS)
args = parser.parse_args()
if args.help:
print_version()
parser.print_help()
sys.exit()
if args.version:
print_version()
sys.exit()
if args.debug:
# Set level of root logger to DEBUG
logging.getLogger().setLevel(logging.DEBUG)
global force
global dry_run
global json_output
global player
global extractor_proxy
global output_filename
output_filename = args.output_filename
extractor_proxy = args.extractor_proxy
info_only = args.info
if args.url:
dry_run = True
if args.json:
json_output = True
        # to fix extractors that do not use VideoExtractor
dry_run = True
info_only = False
if args.cookies:
load_cookies(args.cookies)
caption = True
stream_id = args.format or args.stream or args.itag
if args.no_caption:
caption = False
if args.player:
player = args.player
caption = False
if args.no_proxy:
set_http_proxy('')
else:
set_http_proxy(args.http_proxy)
if args.socks_proxy:
set_socks_proxy(args.socks_proxy)
URLs = []
if args.input_file:
logging.debug('you are trying to load urls from %s', args.input_file)
if args.playlist:
log.e("reading playlist from a file is unsupported and won't make your life easier")
sys.exit(2)
URLs.extend(args.input_file.read().splitlines())
args.input_file.close()
URLs.extend(args.URL)
if not URLs:
parser.print_help()
sys.exit()
socket.setdefaulttimeout(args.timeout)
try:
extra = {}
if extractor_proxy:
extra['extractor_proxy'] = extractor_proxy
if stream_id:
extra['stream_id'] = stream_id
download_main(
download, download_playlist,
URLs, args.playlist,
output_dir=args.output_dir, merge=not args.no_merge,
info_only=info_only, json_output=json_output, caption=caption, password=args.password,
**extra
)
except KeyboardInterrupt:
if args.debug:
raise
else:
sys.exit(1)
except UnicodeEncodeError:
if args.debug:
raise
log.e('[error] oops, the current environment does not seem to support Unicode.')
log.e('please set it to a UTF-8-aware locale first,')
log.e('so as to save the video (with some Unicode characters) correctly.')
log.e('you can do it like this:')
log.e(' (Windows) % chcp 65001 ')
log.e(' (Linux) $ LC_CTYPE=en_US.UTF-8')
sys.exit(1)
except Exception:
if not args.debug:
log.e('[error] oops, something went wrong.')
log.e('don\'t panic, c\'est la vie. please try the following steps:')
log.e(' (1) Rule out any network problem.')
log.e(' (2) Make sure you-get is up-to-date.')
log.e(' (3) Check if the issue is already known, on')
log.e(' https://github.com/soimort/you-get/wiki/Known-Bugs')
log.e(' https://github.com/soimort/you-get/issues')
log.e(' (4) Run the command with \'--debug\' option,')
log.e(' and report this issue with the full output.')
else:
print_version()
log.i(args)
raise
sys.exit(1)
def google_search(url):
keywords = r1(r'https?://(.*)', url)
url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
page = get_content(url, headers=fake_headers)
videos = re.findall(r'<a href="(https?://[^"]+)" onmousedown="[^"]+">([^<]+)<', page)
vdurs = re.findall(r'<span class="vdur _dwc">([^<]+)<', page)
durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
print("Google Videos search:")
for v in zip(videos, durs):
print("- video: %s [%s]" % (unescape_html(v[0][1]),
v[1] if v[1] else '?'))
print("# you-get %s" % log.sprint(v[0][0], log.UNDERLINE))
print()
print("Best matched result:")
    return videos[0][0]
def url_to_module(url):
try:
video_host = r1(r'https?://([^/]+)/', url)
video_url = r1(r'https?://[^/]+(.*)', url)
assert video_host and video_url
except AssertionError:
url = google_search(url)
video_host = r1(r'https?://([^/]+)/', url)
video_url = r1(r'https?://[^/]+(.*)', url)
if video_host.endswith('.com.cn') or video_host.endswith('.ac.cn'):
video_host = video_host[:-3]
domain = r1(r'(\.[^.]+\.[^.]+)$', video_host) or video_host
assert domain, 'unsupported url: ' + url
k = r1(r'([^.]+)', domain)
if k in SITES:
return import_module('.'.join(['you_get', 'extractors', SITES[k]])), url
else:
import http.client
video_host = r1(r'https?://([^/]+)/', url) # .cn could be removed
if url.startswith('https://'):
conn = http.client.HTTPSConnection(video_host)
else:
conn = http.client.HTTPConnection(video_host)
conn.request("HEAD", video_url, headers=fake_headers)
res = conn.getresponse()
location = res.getheader('location')
if location and location != url and not location.startswith('/'):
return url_to_module(location)
else:
return import_module('you_get.extractors.universal'), url
def any_download(url, **kwargs):
m, url = url_to_module(url)
m.download(url, **kwargs)
def any_download_playlist(url, **kwargs):
m, url = url_to_module(url)
m.download_playlist(url, **kwargs)
def main(**kwargs):
script_main(any_download, any_download_playlist, **kwargs)