use httpx or requests if available

URenko 2023-02-10 04:57:16 +00:00
parent 671627be72
commit 4bfa53983f


@@ -15,6 +15,22 @@ from http import cookiejar
 from importlib import import_module
 from urllib import request, parse, error
+try:
+    import httpx
+    session = httpx.Client(transport=httpx.HTTPTransport(retries=3), follow_redirects=True, http2=True,
+                           headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0'})  # some websites accept 'Python-urllib' or 'python-requests' but not 'httpx'
+    __urllib__ = False
+except ImportError:
+    try:
+        import requests
+        from requests.adapters import HTTPAdapter
+        session = requests.Session()
+        session.mount('http://', HTTPAdapter(max_retries=3))
+        session.mount('https://', HTTPAdapter(max_retries=3))
+        __urllib__ = False
+    except ImportError:
+        __urllib__ = True
+
 from .version import __version__
 from .util import log, term
 from .util.git import get_version
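The hunk above decides, once at import time, which HTTP client the rest of the module will use: httpx if it is installed, otherwise requests, otherwise the stock urllib code paths guarded by __urllib__. A minimal standalone sketch of that fallback (simplified; the client options here are illustrative, not the commit's exact ones):

try:
    import httpx
    session = httpx.Client(follow_redirects=True)
    __urllib__ = False
except ImportError:
    try:
        import requests
        session = requests.Session()
        __urllib__ = False
    except ImportError:
        session = None      # no third-party client available
        __urllib__ = True   # all call sites fall back to urllib.request

print('urllib only' if __urllib__ else type(session))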
@@ -346,6 +362,12 @@ def undeflate(data):
 # an http.client implementation of get_content()
 # because urllib does not support "Connection: keep-alive"
 def getHttps(host, url, headers, debuglevel=0):
+    if not __urllib__:
+        if not (url.startswith('http://') or url.startswith('https://')):
+            url = 'https://' + host + url
+        r = session.get(url, headers=headers)
+        return r.text, r.headers.get('set-cookie')
+
     import http.client
     conn = http.client.HTTPSConnection(host)
@@ -378,6 +400,9 @@ def get_decoded_html(url, faker=False):
 def get_location(url, headers=None, get_method='HEAD'):
     logging.debug('get_location: %s' % url)
+    if not __urllib__:
+        return str(session.request(get_method, url, headers=headers).url)
     if headers:
         req = request.Request(url, headers=headers)
     else:
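With a session available, get_location simply issues the request and reports the final URL the client ended up at: httpx is created with follow_redirects=True, and requests follows redirects by default. A hedged illustration using httpx alone (example.com is a placeholder URL):

import httpx

with httpx.Client(follow_redirects=True) as client:
    r = client.request('HEAD', 'http://example.com/')
    print(str(r.url))  # the URL after any redirects, which is what get_location returns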
@@ -424,6 +449,11 @@ def get_content(url, headers={}, decoded=True):
     logging.debug('get_content: %s' % url)
+    if not __urllib__:
+        if cookies: session.cookies = cookies  # https://www.python-httpx.org/compatibility/#cookies
+        r = session.get(url, headers=headers)
+        return r.text if decoded else r.content
+
     req = request.Request(url, headers=headers)
     if cookies:
         # NOTE: Do not use cookies.add_cookie_header(req)
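The cookie line follows the linked httpx compatibility note: rather than injecting a Cookie header per request as the urllib path below does, the whole cookiejar is handed to the session. A rough sketch of that assignment, assuming httpx is installed (the jar here stands in for the one you-get loads from the user's cookie file):

from http import cookiejar
import httpx

jar = cookiejar.CookieJar()   # placeholder for a jar loaded from a cookie file
client = httpx.Client()
client.cookies = jar          # httpx wraps the jar; later requests send its cookies
r = client.get('https://example.com/')
print(r.status_code)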
@@ -477,6 +507,16 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
     else:
         logging.debug('post_content: %s\npost_data: %s' % (url, post_data))
+    if not __urllib__:
+        if cookies: session.cookies = cookies  # https://www.python-httpx.org/compatibility/#cookies
+        r = session.post(url, headers=headers, data=kwargs.get('post_data_raw') or post_data)  # https://www.python-httpx.org/compatibility/#request-content
+        return r.text if decoded else r.content
+
+    if kwargs.get('post_data_raw'):
+        post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
+    else:
+        post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
+
     req = request.Request(url, headers=headers)
     if cookies:
         # NOTE: Do not use cookies.add_cookie_header(req)
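post_content accepts two body shapes: a dict that gets form-encoded, and a pre-encoded string passed in as post_data_raw; the second compatibility link concerns exactly that split. A hedged sketch of the two shapes with httpx (httpbin.org is only an example echo endpoint):

import httpx

client = httpx.Client()
# dict -> form-encoded body, equivalent to parse.urlencode(post_data) on the urllib path
r1 = client.post('https://httpbin.org/post', data={'key': 'value'})
# raw, already-encoded payload -> httpx prefers content= for str/bytes bodies
r2 = client.post('https://httpbin.org/post', content='{"key": "value"}')
print(r1.status_code, r2.status_code)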
@@ -490,10 +530,6 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
             cookie_strings.append(cookie.name + '=' + cookie.value)
         cookie_headers = {'Cookie': '; '.join(cookie_strings)}
         req.headers.update(cookie_headers)
-    if kwargs.get('post_data_raw'):
-        post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
-    else:
-        post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
     response = urlopen_with_retry(req, data=post_data_enc)
     data = response.read()
@@ -518,14 +554,10 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
 def url_size(url, faker=False, headers={}):
-    if faker:
-        response = urlopen_with_retry(
-            request.Request(url, headers=fake_headers)
-        )
-    elif headers:
-        response = urlopen_with_retry(request.Request(url, headers=headers))
+    if __urllib__:
+        response = urlopen_with_retry(request.Request(url, headers=fake_headers if faker else headers))
     else:
-        response = urlopen_with_retry(url)
+        response = session.head(url, headers=fake_headers if faker else headers)
     size = response.headers['content-length']
     return int(size) if size is not None else float('inf')
@@ -535,13 +567,13 @@ def urls_size(urls, faker=False, headers={}):
     return sum([url_size(url, faker=faker, headers=headers) for url in urls])
-def get_head(url, headers=None, get_method='HEAD'):
+def get_head(url, headers={}, get_method='HEAD'):
     logging.debug('get_head: %s' % url)
-    if headers:
-        req = request.Request(url, headers=headers)
-    else:
-        req = request.Request(url)
+    if not __urllib__:
+        return session.request(get_method, url, headers=headers).headers
+    req = request.Request(url, headers=headers)
     req.get_method = lambda: get_method
     res = urlopen_with_retry(req)
     return res.headers
@@ -608,6 +640,16 @@ def url_info(url, faker=False, headers={}):
     return type, ext, size
 
+def iter_content(response):
+    while True:
+        try:
+            buffer = response.read(1024 * 256)
+        except socket.timeout:
+            break
+        if buffer:
+            yield buffer
+        else:
+            break
 
 def url_save(
     url, filepath, bar, refer=None, is_part=False, faker=False,
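iter_content() gives a urllib response the same chunk-generator shape that requests' iter_content() and httpx's iter_bytes() already provide, so the rewritten download loop further down can treat all three clients uniformly. A hedged usage sketch with plain urllib (placeholder URL and filename):

from urllib import request

response = request.urlopen('https://example.com/')
with open('page.html', 'wb') as output:
    for buffer in iter_content(response):  # yields <=256 KiB chunks until EOF or a socket timeout
        output.write(buffer)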
@@ -704,66 +746,68 @@ def url_save(
     else:
         headers = {}
     '''
-    if received:
-        # chunk_start will always be 0 if not chunked
-        tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-'
     if refer:
         tmp_headers['Referer'] = refer
-    if timeout:
-        response = urlopen_with_retry(
-            request.Request(url, headers=tmp_headers), timeout=timeout
-        )
-    else:
-        response = urlopen_with_retry(
-            request.Request(url, headers=tmp_headers)
-        )
-    try:
-        range_start = int(
-            response.headers[
-                'content-range'
-            ][6:].split('/')[0].split('-')[0]
-        )
-        end_length = int(
-            response.headers['content-range'][6:].split('/')[1]
-        )
-        range_length = end_length - range_start
-    except:
-        content_length = response.headers['content-length']
-        range_length = int(content_length) if content_length is not None \
-            else float('inf')
-
-    if is_chunked:  # always append if chunked
-        open_mode = 'ab'
-    elif file_size != received + range_length:  # is it ever necessary?
-        received = 0
-        if bar:
-            bar.received = 0
-        open_mode = 'wb'
-
-    with open(temp_filepath, open_mode) as output:
-        while True:
-            buffer = None
-            try:
-                buffer = response.read(1024 * 256)
-            except socket.timeout:
-                pass
-            if not buffer:
-                if is_chunked and received_chunk == range_length:
-                    break
-                elif not is_chunked and received == file_size:  # Download finished
-                    break
-                # Unexpected termination. Retry request
-                tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-'
-                response = urlopen_with_retry(
-                    request.Request(url, headers=tmp_headers)
-                )
-                continue
-            output.write(buffer)
-            received += len(buffer)
-            received_chunk += len(buffer)
-            if bar:
-                bar.update_received(len(buffer))
+    while True:
+        if received:
+            # chunk_start will always be 0 if not chunked
+            tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-'
+        if __urllib__:
+            if timeout:
+                _response = urlopen_with_retry(
+                    request.Request(url, headers=tmp_headers), timeout=timeout
+                )
+            else:
+                _response = urlopen_with_retry(
+                    request.Request(url, headers=tmp_headers)
+                )
+        elif callable(session.stream):  # HTTPX
+            _response = session.stream('GET', url, headers=tmp_headers, timeout=timeout)
+        else:  # requests
+            _response = session.get(url, headers=tmp_headers, timeout=timeout, stream=True)
+        with _response as response:
+            try:
+                range_start = int(
+                    response.headers[
+                        'content-range'
+                    ][6:].split('/')[0].split('-')[0]
+                )
+                end_length = int(
+                    response.headers['content-range'][6:].split('/')[1]
+                )
+                range_length = end_length - range_start
+            except:
+                content_length = response.headers['content-length']
+                range_length = int(content_length) if content_length is not None \
+                    else float('inf')
+
+            if is_chunked:  # always append if chunked
+                open_mode = 'ab'
+            elif file_size != received + range_length:  # is it ever necessary?
+                received = 0
+                if bar:
+                    bar.received = 0
+                open_mode = 'wb'
+
+            with open(temp_filepath, open_mode) as output:
+                if __urllib__:
+                    iter = iter_content(response)
+                elif hasattr(response, 'iter_content'):  # requests
+                    iter = response.iter_content(1024 * 256)
+                else:  # HTTPX
+                    iter = response.iter_bytes(1024 * 256)
+                for buffer in iter:
+                    output.write(buffer)
+                    received += len(buffer)
+                    received_chunk += len(buffer)
+                    if bar:
+                        bar.update_received(len(buffer))
+        if is_chunked and received_chunk == range_length:
+            break
+        elif not is_chunked and received == file_size:  # Download finished
+            break
+        # Unexpected termination. Retry request
 
     assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (
         received, os.path.getsize(temp_filepath), temp_filepath
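Taken together, the rewritten url_save loop streams the body through whichever client is active and retries with a Range header if the transfer stops short. A self-contained sketch of just the three streaming paths, assuming session, __urllib__ and iter_content are set up as earlier in this commit (the URL is a placeholder):

def stream_chunks(url, chunk_size=1024 * 256):
    if __urllib__:
        from urllib import request
        yield from iter_content(request.urlopen(url))
    elif callable(session.stream):                      # httpx: stream() is a context manager
        with session.stream('GET', url) as response:
            yield from response.iter_bytes(chunk_size)
    else:                                               # requests: stream=True defers the body
        with session.get(url, stream=True) as response:
            yield from response.iter_content(chunk_size)

total = sum(len(chunk) for chunk in stream_chunks('https://example.com/'))
print(total, 'bytes downloaded')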