mirror of
https://github.com/soimort/you-get.git
synced 2025-01-24 14:05:01 +03:00
[universal] download images
This commit is contained in:
parent
5916c96a6e
commit
2c7aa3b161
@ -27,14 +27,25 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
|
|||||||
page_title = unescape_html(page_title)
|
page_title = unescape_html(page_title)
|
||||||
|
|
||||||
# most common media file extensions on the Internet
|
# most common media file extensions on the Internet
|
||||||
media_exts = ['flv', 'mp3', 'mp4', 'webm']
|
media_exts = ['\.flv', '\.mp3', '\.mp4', '\.webm',
|
||||||
|
'[-_]1\d\d\d\.jpg', '[-_][6-9]\d\d\.jpg', # tumblr
|
||||||
|
'[-_]1\d\d\dx[6-9]\d\d\.jpg',
|
||||||
|
's1600/[\w%]+\.jpg', # blogger
|
||||||
|
'img[6-9]\d\d/[\w%]+\.jpg' # oricon?
|
||||||
|
]
|
||||||
|
|
||||||
urls = []
|
urls = []
|
||||||
for i in media_exts:
|
for i in media_exts:
|
||||||
urls += re.findall(r'(https?://[^;"\'\\]+\.' + i + r'[^;"\'\\]*)', page)
|
urls += re.findall(r'(https?://[^;"\'\\]+' + i + r'[^;"\'\\]*)', page)
|
||||||
|
|
||||||
q_urls = re.findall(r'(https?%3A%2F%2F[^;&]+\.' + i + r'[^;&]*)', page)
|
p_urls = re.findall(r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
|
||||||
urls += [parse.unquote(url) for url in q_urls]
|
urls += [parse.unquote(url) for url in p_urls]
|
||||||
|
|
||||||
|
q_urls = re.findall(r'(https?:\\\\/\\\\/[^;"\']+' + i + r'[^;"\']*)', page)
|
||||||
|
urls += [url.replace('\\\\/', '/') for url in q_urls]
|
||||||
|
|
||||||
|
# a link href to an image is often an interesting one
|
||||||
|
urls += re.findall(r'href="(https?://[^"]+\.jpg)"', page)
|
||||||
|
|
||||||
# have some candy!
|
# have some candy!
|
||||||
candies = []
|
candies = []
|
||||||
@ -51,6 +62,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg
|
|||||||
for candy in candies:
|
for candy in candies:
|
||||||
try:
|
try:
|
||||||
mime, ext, size = url_info(candy['url'], faker=True)
|
mime, ext, size = url_info(candy['url'], faker=True)
|
||||||
|
if not size: size = float('Inf')
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user