[universal] workaround for websites that block HEAD requests

This commit is contained in:
Mort Yao 2016-11-26 17:09:28 +01:00
parent 03266c030a
commit 538f1796f2
No known key found for this signature in database
GPG Key ID: 07DA00CB78203251
2 changed files with 7 additions and 4 deletions

View File

@@ -338,7 +338,7 @@ def get_content(url, headers={}, decoded=True):
if charset is not None: if charset is not None:
data = data.decode(charset) data = data.decode(charset)
else: else:
data = data.decode('utf-8') data = data.decode('utf-8', 'ignore')
return data return data
@@ -395,12 +395,12 @@ def url_size(url, faker = False, headers = {}):
def urls_size(urls, faker = False, headers = {}): def urls_size(urls, faker = False, headers = {}):
return sum([url_size(url, faker=faker, headers=headers) for url in urls]) return sum([url_size(url, faker=faker, headers=headers) for url in urls])
def get_head(url, headers = {}): def get_head(url, headers = {}, get_method = 'HEAD'):
if headers: if headers:
req = request.Request(url, headers = headers) req = request.Request(url, headers = headers)
else: else:
req = request.Request(url) req = request.Request(url)
req.get_method = lambda : 'HEAD' req.get_method = lambda : get_method
res = request.urlopen(req) res = request.urlopen(req)
return dict(res.headers) return dict(res.headers)

View File

@@ -6,7 +6,10 @@ from ..common import *
from .embed import * from .embed import *
def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs): def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
content_type = get_head(url, headers=fake_headers)['Content-Type'] try:
content_type = get_head(url, headers=fake_headers)['Content-Type']
except:
content_type = get_head(url, headers=fake_headers, get_method='GET')['Content-Type']
if content_type.startswith('text/html'): if content_type.startswith('text/html'):
try: try:
embed_download(url, output_dir, merge=merge, info_only=info_only) embed_download(url, output_dir, merge=merge, info_only=info_only)