From 214deb6c8b61eee02386942d2d3e792f33bd583b Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 29 Oct 2015 20:47:18 +0100 Subject: [PATCH] [tumblr] filter out some non-image urls --- src/you_get/extractors/tumblr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py index f876b2d2..80f4d2e4 100644 --- a/src/you_get/extractors/tumblr.py +++ b/src/you_get/extractors/tumblr.py @@ -14,9 +14,9 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): page_title = r1(r'([^<\n]*)', html) - urls = re.findall(r'(https?://[^;"]+/tumblr_[^;"]+_\d+\.jpg)', html) +\ - re.findall(r'(https?://[^;"]+/tumblr_[^;"]+_\d+\.png)', html) +\ - re.findall(r'(https?://[^;"]+/tumblr_[^";]+_\d+\.gif)', html) + urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.jpg)', html) +\ + re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.png)', html) +\ + re.findall(r'(https?://[^;"&]+/tumblr_[^";]+_\d+\.gif)', html) tuggles = {} for url in urls: