From 2c7aa3b16189c386502b89286efa6319528118b8 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Wed, 21 Oct 2015 05:02:24 +0200
Subject: [PATCH] [universal] download images

---
 src/you_get/extractors/universal.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py
index 60599c9b..1bf595f5 100644
--- a/src/you_get/extractors/universal.py
+++ b/src/you_get/extractors/universal.py
@@ -27,14 +27,25 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
             page_title = unescape_html(page_title)
 
         # most common media file extensions on the Internet
-        media_exts = ['flv', 'mp3', 'mp4', 'webm']
+        media_exts = [r'\.flv', r'\.mp3', r'\.mp4', r'\.webm',
+                      r'[-_]1\d\d\d\.jpg', r'[-_][6-9]\d\d\.jpg', # tumblr
+                      r'[-_]1\d\d\dx[6-9]\d\d\.jpg',
+                      r's1600/[\w%]+\.jpg', # blogger
+                      r'img[6-9]\d\d/[\w%]+\.jpg' # oricon?
+        ]
 
         urls = []
         for i in media_exts:
-            urls += re.findall(r'(https?://[^;"\'\\]+\.' + i + r'[^;"\'\\]*)', page)
+            urls += re.findall(r'(https?://[^;"\'\\]+' + i + r'[^;"\'\\]*)', page)
 
-            q_urls = re.findall(r'(https?%3A%2F%2F[^;&]+\.' + i + r'[^;&]*)', page)
-            urls += [parse.unquote(url) for url in q_urls]
+            p_urls = re.findall(r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
+            urls += [parse.unquote(url) for url in p_urls]
+
+            q_urls = re.findall(r'(https?:\\\\/\\\\/[^;"\']+' + i + r'[^;"\']*)', page)
+            urls += [url.replace('\\\\/', '/') for url in q_urls]
+
+        # a link href to an image is often an interesting one
+        urls += re.findall(r'href="(https?://[^"]+\.jpg)"', page)
 
         # have some candy!
         candies = []
@@ -51,6 +62,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
         for candy in candies:
             try:
                 mime, ext, size = url_info(candy['url'], faker=True)
+                if not size: size = float('inf')  # falsy size -> infinity; 'Int' raised ValueError and the except below skipped the URL
             except:
                 continue
             else: