[universal] download images

Mort Yao 2015-10-21 05:02:24 +02:00
parent 5916c96a6e
commit 2c7aa3b161

@@ -27,14 +27,25 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
     page_title = unescape_html(page_title)
 
     # most common media file extensions on the Internet
-    media_exts = ['flv', 'mp3', 'mp4', 'webm']
+    media_exts = ['\.flv', '\.mp3', '\.mp4', '\.webm',
+                  '[-_]1\d\d\d\.jpg', '[-_][6-9]\d\d\.jpg', # tumblr
+                  '[-_]1\d\d\dx[6-9]\d\d\.jpg',
+                  's1600/[\w%]+\.jpg', # blogger
+                  'img[6-9]\d\d/[\w%]+\.jpg' # oricon?
+                  ]
 
     urls = []
     for i in media_exts:
-        urls += re.findall(r'(https?://[^;"\'\\]+\.' + i + r'[^;"\'\\]*)', page)
+        urls += re.findall(r'(https?://[^;"\'\\]+' + i + r'[^;"\'\\]*)', page)
 
-        q_urls = re.findall(r'(https?%3A%2F%2F[^;&]+\.' + i + r'[^;&]*)', page)
-        urls += [parse.unquote(url) for url in q_urls]
+        p_urls = re.findall(r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
+        urls += [parse.unquote(url) for url in p_urls]
+
+        q_urls = re.findall(r'(https?:\\\\/\\\\/[^;"\']+' + i + r'[^;"\']*)', page)
+        urls += [url.replace('\\\\/', '/') for url in q_urls]
 
+    # a link href to an image is often an interesting one
+    urls += re.findall(r'href="(https?://[^"]+\.jpg)"', page)
+
     # have some candy!
     candies = []
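For a sense of what the widened patterns catch, here is a minimal, self-contained sketch run against a fabricated page snippet; the hostnames, markup, and file names are invented for illustration, while the patterns are the ones added in this hunk.

import re
from urllib import parse

# fabricated page covering the three URL shapes handled above:
# plain URLs, percent-encoded URLs, and URLs with backslash-escaped slashes
page = r'''
<img src="https://media.example.org/tumblr_abc_1280.jpg">
<a href="https://img.example.org/photos/cover.jpg">cover</a>
<iframe src="//player.example.org/embed?file=https%3A%2F%2Fcdn.example.org%2Fclip.mp4&autoplay=1"></iframe>
<script>var meta = '{"file":"https:\\/\\/cdn.example.org\\/song.mp3"}';</script>
'''

media_exts = [r'\.flv', r'\.mp3', r'\.mp4', r'\.webm',
              r'[-_]1\d\d\d\.jpg', r'[-_][6-9]\d\d\.jpg',  # tumblr
              r'[-_]1\d\d\dx[6-9]\d\d\.jpg',
              r's1600/[\w%]+\.jpg',                        # blogger
              r'img[6-9]\d\d/[\w%]+\.jpg']                 # oricon?

urls = []
for i in media_exts:
    # plain URLs
    urls += re.findall(r'(https?://[^;"\'\\]+' + i + r'[^;"\'\\]*)', page)
    # percent-encoded URLs (e.g. inside a query string), decoded with parse.unquote
    p_urls = re.findall(r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
    urls += [parse.unquote(url) for url in p_urls]
    # URLs whose slashes are escaped as \/ inside embedded JSON/JS
    q_urls = re.findall(r'(https?:\\\\/\\\\/[^;"\']+' + i + r'[^;"\']*)', page)
    urls += [url.replace('\\\\/', '/') for url in q_urls]

# any <a href="..."> pointing at a .jpg
urls += re.findall(r'href="(https?://[^"]+\.jpg)"', page)

print(urls)
# ['https://cdn.example.org/song.mp3',
#  'https://cdn.example.org/clip.mp4',
#  'https://media.example.org/tumblr_abc_1280.jpg',
#  'https://img.example.org/photos/cover.jpg']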
@@ -51,6 +62,7 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
     for candy in candies:
         try:
             mime, ext, size = url_info(candy['url'], faker=True)
+            if not size: size = float('Inf')
         except:
             continue
         else:
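A note on the new fallback: when the response carries no usable Content-Length, size may come back falsy, and float('Inf') turns it into positive infinity so the candidate still has a numeric size to work with. A tiny standalone check of that value, not tied to the extractor:

size = None                       # what a response without Content-Length may yield
if not size: size = float('Inf')  # 'Inf' parses to positive infinity; float('Int') would raise ValueError
print(size)                       # inf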