mirror of
https://github.com/soimort/you-get.git
synced 2025-01-23 21:45:02 +03:00
[tumblr] filter out some non-image urls
This commit is contained in:
parent
42ed56d2fd
commit
214deb6c8b
@ -14,9 +14,9 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
|
|||||||
page_title = r1(r'<meta name="description" content="([^"\n]+)', html) or \
|
page_title = r1(r'<meta name="description" content="([^"\n]+)', html) or \
|
||||||
r1(r'<meta property="og:description" content="([^"\n]+)', html) or \
|
r1(r'<meta property="og:description" content="([^"\n]+)', html) or \
|
||||||
r1(r'<title>([^<\n]*)', html)
|
r1(r'<title>([^<\n]*)', html)
|
||||||
urls = re.findall(r'(https?://[^;"]+/tumblr_[^;"]+_\d+\.jpg)', html) +\
|
urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.jpg)', html) +\
|
||||||
re.findall(r'(https?://[^;"]+/tumblr_[^;"]+_\d+\.png)', html) +\
|
re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.png)', html) +\
|
||||||
re.findall(r'(https?://[^;"]+/tumblr_[^";]+_\d+\.gif)', html)
|
re.findall(r'(https?://[^;"&]+/tumblr_[^";]+_\d+\.gif)', html)
|
||||||
|
|
||||||
tuggles = {}
|
tuggles = {}
|
||||||
for url in urls:
|
for url in urls:
|
||||||
|
Loading…
Reference in New Issue
Block a user