From 6fc2cc375ea4bb2dcb6bab74ce67688af0efde59 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 26 Aug 2016 19:27:40 +0200 Subject: [PATCH] [universal] call embed_download only if content_type is text/html (#1369) --- src/you_get/extractors/universal.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py index b0d929c9..ebab70f8 100644 --- a/src/you_get/extractors/universal.py +++ b/src/you_get/extractors/universal.py @@ -6,20 +6,20 @@ from ..common import * from .embed import * def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - try: - embed_download(url, output_dir, merge=merge, info_only=info_only) - except: pass - else: return + content_type = get_head(url, headers=fake_headers)['Content-Type'] + if content_type.startswith('text/html'): + try: + embed_download(url, output_dir, merge=merge, info_only=info_only) + except: pass + else: return domains = url.split('/')[2].split('.') if len(domains) > 2: domains = domains[1:] site_info = '.'.join(domains) - response = get_response(url, faker=True) - content_type = response.headers['Content-Type'] - if content_type.startswith('text/html'): # extract an HTML page + response = get_response(url, faker=True) page = str(response.data) page_title = r1(r'([^<]*)', page)