# you-get/src/you_get/extractors/tumblr.py
#!/usr/bin/env python

__all__ = ['tumblr_download']

from ..common import *
from .universal import *
from .dailymotion import dailymotion_download
from .vimeo import vimeo_download
from .vine import vine_download
def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    """Download media from a Tumblr URL.

    Handles direct *.media.tumblr.com links, photo/photoset posts, audio
    posts, and video posts (including embedded Vimeo / Dailymotion / Vine
    players, which are delegated to their own extractors).

    Parameters:
        url        -- Tumblr post (or direct media) URL.
        output_dir -- directory to write downloads into.
        merge      -- passed through to downstream downloaders.
        info_only  -- if True, only print media info; download nothing.
    """
    # Direct media URLs need no page scraping at all.
    if re.match(r'https?://\d+\.media\.tumblr\.com/', url):
        universal_download(url, output_dir, merge=merge, info_only=info_only)
        return

    import ssl
    # NOTE(review): forcing TLSv1 was presumably a workaround for Tumblr's
    # servers at the time; ssl.PROTOCOL_TLSv1 is deprecated in modern
    # Python — confirm this is still required before touching it.
    ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
    cookie_handler = request.HTTPCookieProcessor()
    opener = request.build_opener(ssl_context, cookie_handler)
    request.install_opener(opener)

    page = get_html(url)
    form_key = match1(page, r'id="tumblr_form_key" content="([^"]+)"')
    if form_key is not None:
        # bypass GDPR consent page: POST the consent form, then re-fetch
        referer = 'https://www.tumblr.com/privacy/consent?redirect=%s' % parse.quote_plus(url)
        post_content('https://www.tumblr.com/svc/privacy/consent',
                     headers={
                         'Content-Type': 'application/json',
                         'User-Agent': fake_headers['User-Agent'],
                         'Referer': referer,
                         'X-tumblr-form-key': form_key,
                         'X-Requested-With': 'XMLHttpRequest'
                     },
                     post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url)

        page = get_html(url, faker=True)

    # The page embeds JSON-escaped markup: undo percent-encoding and the
    # literal backslash-slash escapes.  (Raw string fixes the invalid
    # escape sequence '\/' — same bytes, no DeprecationWarning.)
    html = parse.unquote(page).replace(r'\/', '/')

    feed = r1(r'<meta property="og:type" content="tumblr-feed:(\w+)" />', html)

    if feed in ['photo', 'photoset', 'entry'] or feed is None:
        # try to extract photos
        page_title = r1(r'<meta name="description" content="([^"\n]+)', html) or \
            r1(r'<meta property="og:description" content="([^"\n]+)', html) or \
            r1(r'<title>([^<\n]*)', html)
        urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.jpg)', html) + \
            re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.png)', html) + \
            re.findall(r'(https?://[^;"&]+/tumblr_[^";&]+_\d+\.gif)', html)

        tuggles = {}
        # FIX: use a dedicated loop variable instead of shadowing the `url`
        # parameter.  The original clobbered `url`, which broke the video /
        # universal_download fallback below whenever a photo-looking page
        # produced no downloadable photos.
        for img_url in urls:
            if img_url.endswith('.gif'):
                hd_url = img_url
            elif img_url.endswith('.jpg'):
                hd_url = r1(r'(.+)_\d+\.jpg$', img_url) + '_1280.jpg'  # FIXME: decide actual quality
            elif img_url.endswith('.png'):
                hd_url = r1(r'(.+)_\d+\.png$', img_url) + '_1280.png'  # FIXME: decide actual quality
            else:
                continue
            filename = parse.unquote(hd_url.split('/')[-1])
            title = '.'.join(filename.split('.')[:-1])
            tumblr_id = r1(r'^tumblr_(.+)_\d+$', title)
            quality = int(r1(r'^tumblr_.+_(\d+)$', title))
            ext = filename.split('.')[-1]
            try:
                size = int(get_head(hd_url)['Content-Length'])
                # keep only the highest-quality variant per tumblr id
                if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality:
                    tuggles[tumblr_id] = {
                        'title': title,
                        'url': hd_url,
                        'quality': quality,
                        'ext': ext,
                        'size': size,
                    }
            except Exception:
                # best-effort: skip any photo whose HEAD request fails
                pass

        if tuggles:
            size = sum([tuggles[t]['size'] for t in tuggles])
            print_info(site_info, page_title, None, size)

            if not info_only:
                for t in tuggles:
                    title = tuggles[t]['title']
                    ext = tuggles[t]['ext']
                    size = tuggles[t]['size']
                    print_info(site_info, title, ext, size)
                    download_urls([tuggles[t]['url']], title, ext, size,
                                  output_dir=output_dir)
            return

    # feed == 'audio' or feed == 'video' or feed is None
    # try to extract video / audio
    real_url = r1(r'source src=\\x22([^\\]+)\\', html)
    if not real_url:
        real_url = r1(r'audio_file=([^&]+)&', html)
        if real_url:
            real_url = real_url + '?plead=please-dont-download-this-or-our-lawyers-wont-let-us-host-audio'
    if not real_url:
        real_url = r1(r'<source src="([^"]*)"', html)
    if not real_url:
        iframe_url = r1(r'<[^>]+tumblr_video_container[^>]+><iframe[^>]+src=[\'"]([^\'"]*)[\'"]', html)
        if iframe_url is None:
            # no recognizable player at all: last-resort generic extraction
            universal_download(url, output_dir, merge=merge, info_only=info_only, **kwargs)
            return
        if iframe_url:
            iframe_html = get_content(iframe_url, headers=fake_headers)
            real_url = r1(r'<video[^>]*>[\n ]*<source[^>]+src=[\'"]([^\'"]*)[\'"]', iframe_html)
        else:
            # container matched but src was empty: fall back to the first
            # generic <iframe> (possibly a third-party player)
            iframe_url = r1(r'<iframe[^>]+src=[\'"]([^\'"]*)[\'"]', html)
            if iframe_url[:2] == '//':
                iframe_url = 'http:' + iframe_url
            if re.search(r'player\.vimeo\.com', iframe_url):
                vimeo_download(iframe_url, output_dir, merge=merge, info_only=info_only,
                               referer='http://tumblr.com/', **kwargs)
                return
            elif re.search(r'dailymotion\.com', iframe_url):
                dailymotion_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs)
                return
            elif re.search(r'vine\.co', iframe_url):
                vine_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs)
                return
            else:
                iframe_html = get_content(iframe_url)
                real_url = r1(r'<source src="([^"]*)"', iframe_html)

    title = unescape_html(r1(r'<meta property="og:title" content="([^"]*)" />', html) or
                          r1(r'<meta property="og:description" content="([^"]*)" />', html) or
                          r1(r'<title>([^<\n]*)', html) or url.split("/")[4]).replace('\n', '')

    # this is better: rebuild the canonical CDN URL from the video code
    vcode = r1(r'tumblr_(\w+)', real_url)
    real_url = 'https://vt.media.tumblr.com/tumblr_%s.mp4' % vcode

    # `mime` instead of `type`: avoid shadowing the builtin
    mime, ext, size = url_info(real_url, faker=True)

    print_info(site_info, title, mime, size)
    if not info_only:
        download_urls([real_url], title, ext, size, output_dir, merge=merge)
# Extractor registration: you-get's dispatcher looks up these module-level
# names.  Tumblr has no playlist support.
site_info = "Tumblr.com"
download = tumblr_download
download_playlist = playlist_not_supported('tumblr')