mirror of https://github.com/soimort/you-get.git (synced 2025-02-09 03:37:52 +03:00)
use httpx or requests if available

commit 4bfa53983f (parent 671627be72)
@@ -15,6 +15,22 @@ from http import cookiejar
 from importlib import import_module
 from urllib import request, parse, error
 
+try:
+    import httpx
+    session = httpx.Client(transport=httpx.HTTPTransport(retries=3), follow_redirects=True, http2=True,
+                           headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0'}) # some website can accept 'Python-urllib' or 'python-requests' but not 'httpx'
+    __urllib__ = False
+except ImportError:
+    try:
+        import requests
+        from requests.adapters import HTTPAdapter
+        session = requests.Session()
+        session.mount('http://', HTTPAdapter(max_retries=3))
+        session.mount('https://', HTTPAdapter(max_retries=3))
+        __urllib__ = False
+    except ImportError:
+        __urllib__ = True
+
 from .version import __version__
 from .util import log, term
 from .util.git import get_version
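
The hunks that follow all branch on the __urllib__ flag set above: when httpx or requests imported successfully, traffic goes through the shared session; otherwise the original urllib code path is kept. A minimal standalone sketch of that dispatch (simplified illustration, not code from this patch; fetch_text is a hypothetical name):

# Illustration only: the same try/except fallback as the hunk above,
# followed by a helper that dispatches on __urllib__ the way the
# patched functions (get_content, get_location, ...) do.
try:
    import httpx
    session = httpx.Client()          # shared client, reused for every request
    __urllib__ = False
except ImportError:
    try:
        import requests
        session = requests.Session()  # requests fallback
        __urllib__ = False
    except ImportError:
        session = None                # stdlib urllib only
        __urllib__ = True

from urllib import request

def fetch_text(url, headers={}):
    if not __urllib__:
        return session.get(url, headers=headers).text
    with request.urlopen(request.Request(url, headers=headers)) as r:
        return r.read().decode('utf-8', 'ignore')

Usage would simply be fetch_text('https://example.com'); the caller never needs to know which backend was picked.
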
@@ -346,6 +362,12 @@ def undeflate(data):
 # an http.client implementation of get_content()
 # because urllib does not support "Connection: keep-alive"
 def getHttps(host, url, headers, debuglevel=0):
+    if not __urllib__:
+        if not (url.startswith('http://') or url.startswith('https://')):
+            url = 'https://' + host + url
+        r = session.get(url, headers=headers)
+        return r.text, r.headers.get('set-cookie')
+
     import http.client
 
     conn = http.client.HTTPSConnection(host)
@@ -378,6 +400,9 @@ def get_decoded_html(url, faker=False):
 def get_location(url, headers=None, get_method='HEAD'):
     logging.debug('get_location: %s' % url)
 
+    if not __urllib__:
+        return str(session.request(get_method, url, headers=headers).url)
+
     if headers:
         req = request.Request(url, headers=headers)
     else:
@@ -424,6 +449,11 @@ def get_content(url, headers={}, decoded=True):
 
     logging.debug('get_content: %s' % url)
 
+    if not __urllib__:
+        if cookies: session.cookies = cookies # https://www.python-httpx.org/compatibility/#cookies
+        r = session.get(url, headers=headers)
+        return r.text if decoded else r.content
+
     req = request.Request(url, headers=headers)
     if cookies:
         # NOTE: Do not use cookies.add_cookie_header(req)
@@ -477,6 +507,16 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
     else:
         logging.debug('post_content: %s\npost_data: %s' % (url, post_data))
 
+    if not __urllib__:
+        if cookies: session.cookies = cookies # https://www.python-httpx.org/compatibility/#cookies
+        r = session.post(url, headers=headers, data=kwargs.get('post_data_raw') or post_data) # https://www.python-httpx.org/compatibility/#request-content
+        return r.text if decoded else r.content
+
+    if kwargs.get('post_data_raw'):
+        post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
+    else:
+        post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
+
     req = request.Request(url, headers=headers)
     if cookies:
         # NOTE: Do not use cookies.add_cookie_header(req)
@@ -490,10 +530,6 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
             cookie_strings.append(cookie.name + '=' + cookie.value)
         cookie_headers = {'Cookie': '; '.join(cookie_strings)}
         req.headers.update(cookie_headers)
-    if kwargs.get('post_data_raw'):
-        post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
-    else:
-        post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
     response = urlopen_with_retry(req, data=post_data_enc)
     data = response.read()
 
@@ -518,14 +554,10 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
 
 
 def url_size(url, faker=False, headers={}):
-    if faker:
-        response = urlopen_with_retry(
-            request.Request(url, headers=fake_headers)
-        )
-    elif headers:
-        response = urlopen_with_retry(request.Request(url, headers=headers))
+    if __urllib__:
+        response = urlopen_with_retry(request.Request(url, headers=fake_headers if faker else headers))
     else:
-        response = urlopen_with_retry(url)
+        response = session.head(url, headers=fake_headers if faker else headers)
 
     size = response.headers['content-length']
     return int(size) if size is not None else float('inf')
@@ -535,13 +567,13 @@ def urls_size(urls, faker=False, headers={}):
     return sum([url_size(url, faker=faker, headers=headers) for url in urls])
 
 
-def get_head(url, headers=None, get_method='HEAD'):
+def get_head(url, headers={}, get_method='HEAD'):
     logging.debug('get_head: %s' % url)
 
-    if headers:
-        req = request.Request(url, headers=headers)
-    else:
-        req = request.Request(url)
+    if not __urllib__:
+        return session.request(get_method, url, headers=headers).headers
+
+    req = request.Request(url, headers=headers)
     req.get_method = lambda: get_method
     res = urlopen_with_retry(req)
     return res.headers
@@ -608,6 +640,16 @@ def url_info(url, faker=False, headers={}):
 
     return type, ext, size
 
+def iter_content(response):
+    while True:
+        try:
+            buffer = response.read(1024 * 256)
+        except socket.timeout:
+            break
+        if buffer:
+            yield buffer
+        else:
+            break
 
 def url_save(
         url, filepath, bar, refer=None, is_part=False, faker=False,
@@ -704,66 +746,68 @@ def url_save(
     else:
         headers = {}
     '''
-    if received:
-        # chunk_start will always be 0 if not chunked
-        tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-'
     if refer:
         tmp_headers['Referer'] = refer
 
-    if timeout:
-        response = urlopen_with_retry(
-            request.Request(url, headers=tmp_headers), timeout=timeout
-        )
-    else:
-        response = urlopen_with_retry(
-            request.Request(url, headers=tmp_headers)
-        )
-    try:
-        range_start = int(
-            response.headers[
-                'content-range'
-            ][6:].split('/')[0].split('-')[0]
-        )
-        end_length = int(
-            response.headers['content-range'][6:].split('/')[1]
-        )
-        range_length = end_length - range_start
-    except:
-        content_length = response.headers['content-length']
-        range_length = int(content_length) if content_length is not None \
-            else float('inf')
-
-    if is_chunked: # always append if chunked
-        open_mode = 'ab'
-    elif file_size != received + range_length: # is it ever necessary?
-        received = 0
-        if bar:
-            bar.received = 0
-        open_mode = 'wb'
-
-    with open(temp_filepath, open_mode) as output:
-        while True:
-            buffer = None
-            try:
-                buffer = response.read(1024 * 256)
-            except socket.timeout:
-                pass
-            if not buffer:
-                if is_chunked and received_chunk == range_length:
-                    break
-                elif not is_chunked and received == file_size: # Download finished
-                    break
-                # Unexpected termination. Retry request
-                tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-'
-                response = urlopen_with_retry(
-                    request.Request(url, headers=tmp_headers)
-                )
-                continue
-            output.write(buffer)
-            received += len(buffer)
-            received_chunk += len(buffer)
-            if bar:
-                bar.update_received(len(buffer))
+    while True:
+        if received:
+            # chunk_start will always be 0 if not chunked
+            tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-'
+        if __urllib__:
+            if timeout:
+                _response = urlopen_with_retry(
+                    request.Request(url, headers=tmp_headers), timeout=timeout
+                )
+            else:
+                _response = urlopen_with_retry(
+                    request.Request(url, headers=tmp_headers)
+                )
+        elif callable(session.stream): # HTTPX
+            _response = session.stream('GET', url, headers=tmp_headers, timeout=timeout)
+        else: # requests
+            _response = session.get(url, headers=tmp_headers, timeout=timeout, stream=True)
+        with _response as response:
+            try:
+                range_start = int(
+                    response.headers[
+                        'content-range'
+                    ][6:].split('/')[0].split('-')[0]
+                )
+                end_length = int(
+                    response.headers['content-range'][6:].split('/')[1]
+                )
+                range_length = end_length - range_start
+            except:
+                content_length = response.headers['content-length']
+                range_length = int(content_length) if content_length is not None \
+                    else float('inf')
+
+            if is_chunked: # always append if chunked
+                open_mode = 'ab'
+            elif file_size != received + range_length: # is it ever necessary?
+                received = 0
+                if bar:
+                    bar.received = 0
+                open_mode = 'wb'
+
+            with open(temp_filepath, open_mode) as output:
+                if __urllib__:
+                    iter = iter_content(response)
+                elif hasattr(response, 'iter_content'): # requests
+                    iter = response.iter_content(1024 * 256)
+                else: # HTTPX
+                    iter = response.iter_bytes(1024 * 256)
+                for buffer in iter:
+                    output.write(buffer)
+                    received += len(buffer)
+                    received_chunk += len(buffer)
+                    if bar:
+                        bar.update_received(len(buffer))
+        if is_chunked and received_chunk == range_length:
+            break
+        elif not is_chunked and received == file_size: # Download finished
+            break
+        # Unexpected termination. Retry request
 
     assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (
         received, os.path.getsize(temp_filepath), temp_filepath
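
The streaming part of the last hunk is the subtle bit: a urllib response is wrapped by the new iter_content() generator, a requests response exposes .iter_content(chunk_size), and an httpx response obtained via session.stream() exposes .iter_bytes(chunk_size). A minimal sketch of that dispatch in isolation (simplified; stream_to_file and use_urllib are hypothetical names, not part of the patch):

def stream_to_file(response, path, chunk_size=1024 * 256, use_urllib=False):
    # Pick the chunk iterator that matches whichever backend produced `response`.
    if use_urllib:
        def chunks():
            while True:
                buf = response.read(chunk_size)   # file-like http.client response
                if not buf:
                    break
                yield buf
        it = chunks()
    elif hasattr(response, 'iter_content'):       # requests.Response
        it = response.iter_content(chunk_size)
    else:                                         # httpx.Response (from Client.stream)
        it = response.iter_bytes(chunk_size)
    with open(path, 'wb') as f:
        for buf in it:
            f.write(buf)

Note that requests only streams when the GET was made with stream=True, while httpx streams through the Client.stream(...) context manager, which is exactly why the hunk above branches between session.get(..., stream=True) and session.stream('GET', ...).
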