mirror of
https://github.com/soimort/you-get.git
synced 2025-01-24 22:15:03 +03:00
[universal] workaround for websites that block HEAD requests
This commit is contained in:
parent
03266c030a
commit
538f1796f2
@ -338,7 +338,7 @@ def get_content(url, headers={}, decoded=True):
|
|||||||
if charset is not None:
|
if charset is not None:
|
||||||
data = data.decode(charset)
|
data = data.decode(charset)
|
||||||
else:
|
else:
|
||||||
data = data.decode('utf-8')
|
data = data.decode('utf-8', 'ignore')
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
@ -395,12 +395,12 @@ def url_size(url, faker = False, headers = {}):
|
|||||||
def urls_size(urls, faker = False, headers = {}):
|
def urls_size(urls, faker = False, headers = {}):
|
||||||
return sum([url_size(url, faker=faker, headers=headers) for url in urls])
|
return sum([url_size(url, faker=faker, headers=headers) for url in urls])
|
||||||
|
|
||||||
def get_head(url, headers = {}):
|
def get_head(url, headers = {}, get_method = 'HEAD'):
|
||||||
if headers:
|
if headers:
|
||||||
req = request.Request(url, headers = headers)
|
req = request.Request(url, headers = headers)
|
||||||
else:
|
else:
|
||||||
req = request.Request(url)
|
req = request.Request(url)
|
||||||
req.get_method = lambda : 'HEAD'
|
req.get_method = lambda : get_method
|
||||||
res = request.urlopen(req)
|
res = request.urlopen(req)
|
||||||
return dict(res.headers)
|
return dict(res.headers)
|
||||||
|
|
||||||
|
@ -6,7 +6,10 @@ from ..common import *
|
|||||||
from .embed import *
|
from .embed import *
|
||||||
|
|
||||||
def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
|
def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
|
||||||
|
try:
|
||||||
content_type = get_head(url, headers=fake_headers)['Content-Type']
|
content_type = get_head(url, headers=fake_headers)['Content-Type']
|
||||||
|
except:
|
||||||
|
content_type = get_head(url, headers=fake_headers, get_method='GET')['Content-Type']
|
||||||
if content_type.startswith('text/html'):
|
if content_type.startswith('text/html'):
|
||||||
try:
|
try:
|
||||||
embed_download(url, output_dir, merge=merge, info_only=info_only)
|
embed_download(url, output_dir, merge=merge, info_only=info_only)
|
||||||
|
Loading…
Reference in New Issue
Block a user