[universal] workaround for websites that block HEAD requests

This commit is contained in:
Mort Yao 2016-11-26 17:09:28 +01:00
parent 03266c030a
commit 538f1796f2
No known key found for this signature in database
GPG Key ID: 07DA00CB78203251
2 changed files with 7 additions and 4 deletions

View File

@ -338,7 +338,7 @@ def get_content(url, headers={}, decoded=True):
if charset is not None:
data = data.decode(charset)
else:
data = data.decode('utf-8')
data = data.decode('utf-8', 'ignore')
return data
@ -395,12 +395,12 @@ def url_size(url, faker = False, headers = {}):
def urls_size(urls, faker = False, headers = {}):
    """Return the combined size of all the given URLs.

    Every entry of ``urls`` is measured with ``url_size`` using the same
    ``faker`` and ``headers`` values, and the individual results are added
    together.
    """
    total = 0
    for url in urls:
        total += url_size(url, faker=faker, headers=headers)
    return total
def get_head(url, headers=None, get_method='HEAD'):
    """Fetch only the response headers for ``url``.

    Args:
        url: The URL to request.
        headers: Optional mapping of request headers. ``None`` or an empty
            mapping sends no extra headers.
        get_method: HTTP method to issue. Defaults to ``'HEAD'``; callers
            can pass ``'GET'`` for sites that block HEAD requests.

    Returns:
        A plain ``dict`` built from the response headers.

    Raises:
        Any exception raised by ``request.urlopen`` is propagated unchanged.
    """
    # An empty mapping is equivalent to passing no headers at all.
    req = request.Request(url, headers=headers or {})
    # Force the requested method instead of the one derived from the request.
    req.get_method = lambda: get_method
    # Only the headers are needed, so close the response as soon as they
    # have been copied rather than leaving the connection open.
    with request.urlopen(req) as res:
        return dict(res.headers)

View File

@ -6,7 +6,10 @@ from ..common import *
from .embed import *
def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
content_type = get_head(url, headers=fake_headers)['Content-Type']
try:
content_type = get_head(url, headers=fake_headers)['Content-Type']
except:
content_type = get_head(url, headers=fake_headers, get_method='GET')['Content-Type']
if content_type.startswith('text/html'):
try:
embed_download(url, output_dir, merge=merge, info_only=info_only)