[tumblr] Filter out some non-image URLs

This commit is contained in:
Mort Yao 2015-10-29 20:47:18 +01:00
parent 42ed56d2fd
commit 214deb6c8b

View File

@@ -14,9 +14,9 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
page_title = r1(r'<meta name="description" content="([^"\n]+)', html) or \
r1(r'<meta property="og:description" content="([^"\n]+)', html) or \
r1(r'<title>([^<\n]*)', html)
urls = re.findall(r'(https?://[^;"]+/tumblr_[^;"]+_\d+\.jpg)', html) +\
re.findall(r'(https?://[^;"]+/tumblr_[^;"]+_\d+\.png)', html) +\
re.findall(r'(https?://[^;"]+/tumblr_[^";]+_\d+\.gif)', html)
urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.jpg)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.png)', html) +\
re.findall(r'(https?://[^;"&]+/tumblr_[^";]+_\d+\.gif)', html)
tuggles = {}
for url in urls: