2012-12-18 20:26:35 +04:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
__all__ = ['tumblr_download']
|
|
|
|
|
|
|
|
from ..common import *
|
2015-11-11 09:53:58 +03:00
|
|
|
from .universal import *
|
2015-12-16 17:41:36 +03:00
|
|
|
from .dailymotion import dailymotion_download
|
2015-12-13 05:31:09 +03:00
|
|
|
from .vimeo import vimeo_download
|
2016-01-25 16:48:55 +03:00
|
|
|
from .vine import vine_download
|
2012-12-18 20:26:35 +04:00
|
|
|
|
2015-10-20 04:58:49 +03:00
|
|
|
def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    """Download media (photos, video, or audio) from a Tumblr URL.

    Dispatch order:
      1. Direct ``*.media.tumblr.com`` asset URLs go straight to the
         universal downloader.
      2. Photo/photoset/entry feeds: scrape every ``tumblr_*`` image URL,
         keep only the highest-quality variant of each image, and download.
      3. Otherwise (audio/video): locate the real media URL, possibly via
         an embedded iframe, delegating to the Vimeo/Dailymotion/Vine
         extractors when the iframe points at one of those hosts.

    Parameters mirror the you-get extractor convention:
      url       -- page or asset URL.
      output_dir-- destination directory.
      merge     -- whether to merge multipart downloads.
      info_only -- print metadata without downloading.
      kwargs    -- extra options forwarded to delegated extractors.
    """
    # Direct media asset (e.g. https://66.media.tumblr.com/...): no page to
    # scrape, hand straight to the universal downloader.
    if re.match(r'https?://\d+\.media\.tumblr\.com/', url):
        universal_download(url, output_dir, merge=merge, info_only=info_only,
                           **kwargs)  # forward extra options like the other delegations below
        return

    # Tumblr JSON-escapes slashes inside embedded markup; undo '\/' -> '/'
    # after URL-decoding so the regexes below see plain URLs.
    html = parse.unquote(get_html(url)).replace('\/', '/')
    feed = r1(r'<meta property="og:type" content="tumblr-feed:(\w+)" />', html)

    if feed in ['photo', 'photoset', 'entry'] or feed is None:
        # try to extract photos
        page_title = r1(r'<meta name="description" content="([^"\n]+)', html) or \
            r1(r'<meta property="og:description" content="([^"\n]+)', html) or \
            r1(r'<title>([^<\n]*)', html)
        image_urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.jpg)', html) +\
            re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.png)', html) +\
            re.findall(r'(https?://[^;"&]+/tumblr_[^";]+_\d+\.gif)', html)

        # Tumblr serves each photo at several resolutions named
        # tumblr_<id>_<quality>.<ext>; keep only the best variant per id.
        tuggles = {}
        # NOTE: iterate under a fresh name instead of rebinding the `url`
        # parameter (the original shadowed it, a latent bug for any code
        # after this loop that expects the page URL).
        for image_url in image_urls:
            filename = parse.unquote(image_url.split('/')[-1])
            title = '.'.join(filename.split('.')[:-1])
            tumblr_id = r1(r'^tumblr_(.+)_\d+$', title)
            quality = int(r1(r'^tumblr_.+_(\d+)$', title))
            ext = filename.split('.')[-1]
            size = int(get_head(image_url)['Content-Length'])
            if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality:
                tuggles[tumblr_id] = {
                    'title': title,
                    'url': image_url,
                    'quality': quality,
                    'ext': ext,
                    'size': size,
                }

        if tuggles:
            size = sum([tuggles[t]['size'] for t in tuggles])
            print_info(site_info, page_title, None, size)

            if not info_only:
                for t in tuggles:
                    title = tuggles[t]['title']
                    ext = tuggles[t]['ext']
                    size = tuggles[t]['size']
                    photo_url = tuggles[t]['url']
                    print_info(site_info, title, ext, size)
                    download_urls([photo_url], title, ext, size,
                                  output_dir=output_dir)
            return
        # No photos found: fall through and try the audio/video path.

    # feed == 'audio' or feed == 'video' or feed is None
    # try to extract video / audio
    real_url = r1(r'source src=\\x22([^\\]+)\\', html)
    if not real_url:
        real_url = r1(r'audio_file=([^&]+)&', html)
        if real_url:
            # Tumblr's audio CDN requires this query string to serve the file.
            real_url = real_url + '?plead=please-dont-download-this-or-our-lawyers-wont-let-us-host-audio'
    if not real_url:
        real_url = r1(r'<source src="([^"]*)"', html)
    if not real_url:
        iframe_url = r1(r'<[^>]+tumblr_video_container[^>]+><iframe[^>]+src=[\'"]([^\'"]*)[\'"]', html)
        if iframe_url:
            iframe_html = get_content(iframe_url, headers=fake_headers)
            real_url = r1(r'<video[^>]*>[\n ]*<source[^>]+src=[\'"]([^\'"]*)[\'"]', iframe_html)
        else:
            iframe_url = r1(r'<iframe[^>]+src=[\'"]([^\'"]*)[\'"]', html)
            # Guard against no iframe at all: the original sliced
            # iframe_url unconditionally and raised TypeError on None.
            if iframe_url:
                if iframe_url[:2] == '//':
                    iframe_url = 'http:' + iframe_url  # protocol-relative src
                if re.search(r'player\.vimeo\.com', iframe_url):
                    vimeo_download(iframe_url, output_dir, merge=merge, info_only=info_only,
                                   referer='http://tumblr.com/', **kwargs)
                    return
                elif re.search(r'dailymotion\.com', iframe_url):
                    dailymotion_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs)
                    return
                elif re.search(r'vine\.co', iframe_url):
                    vine_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs)
                    return
                else:
                    iframe_html = get_content(iframe_url)
                    real_url = r1(r'<source src="([^"]*)"', iframe_html)

    title = unescape_html(r1(r'<meta property="og:title" content="([^"]*)" />', html) or
                          r1(r'<meta property="og:description" content="([^"]*)" />', html) or
                          r1(r'<title>([^<\n]*)', html) or url.split("/")[4]).replace('\n', '')

    # `mime` instead of `type`: avoid shadowing the builtin.
    mime, ext, size = url_info(real_url)

    print_info(site_info, title, mime, size)
    if not info_only:
        download_urls([real_url], title, ext, size, output_dir, merge=merge)
|
|
|
|
|
|
|
|
# Human-readable site name used by print_info() in the metadata header.
site_info = "Tumblr.com"

# Extractor-module interface expected by you-get's dispatcher.
download = tumblr_download
# Tumblr pages are handled per-post; playlist downloading is not supported.
download_playlist = playlist_not_supported('tumblr')
|