# you-get/src/you_get/extractors/tumblr.py
#!/usr/bin/env python

__all__ = ['tumblr_download']

from ..common import *
from .universal import *
from .dailymotion import dailymotion_download
from .vimeo import vimeo_download
from .vine import vine_download
def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    """Download media from a Tumblr URL.

    Handles direct *.media.tumblr.com links, photo/photoset posts, audio
    posts, and video posts (including embedded Vimeo / Dailymotion / Vine
    players, which are delegated to their own extractors).

    Parameters:
        url        -- Tumblr post (or direct media) URL.
        output_dir -- directory to write downloads into.
        merge      -- passed through to downstream downloaders.
        info_only  -- if True, only print media info; download nothing.
    """
    # Direct media URLs need no page scraping at all.
    if re.match(r'https?://\d+\.media\.tumblr\.com/', url):
        universal_download(url, output_dir, merge=merge, info_only=info_only)
        return

    import ssl
    # NOTE(review): forcing TLSv1 was presumably a workaround for Tumblr's
    # servers at the time; ssl.PROTOCOL_TLSv1 is deprecated in modern
    # Python — confirm this is still required before touching it.
    ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1))
    cookie_handler = request.HTTPCookieProcessor()
    opener = request.build_opener(ssl_context, cookie_handler)
    request.install_opener(opener)

    page = get_html(url)
    form_key = match1(page, r'id="tumblr_form_key" content="([^"]+)"')
    if form_key is not None:
        # bypass GDPR consent page: POST the consent form, then re-fetch
        referer = 'https://www.tumblr.com/privacy/consent?redirect=%s' % parse.quote_plus(url)
        post_content('https://www.tumblr.com/svc/privacy/consent',
                     headers={
                         'Content-Type': 'application/json',
                         'User-Agent': fake_headers['User-Agent'],
                         'Referer': referer,
                         'X-tumblr-form-key': form_key,
                         'X-Requested-With': 'XMLHttpRequest'
                     },
                     post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url)

        page = get_html(url, faker=True)

    # The page embeds JSON-escaped markup: undo percent-encoding and the
    # literal backslash-slash escapes.  (Raw string fixes the invalid
    # escape sequence '\/' — same bytes, no DeprecationWarning.)
    html = parse.unquote(page).replace(r'\/', '/')

    feed = r1(r'<meta property="og:type" content="tumblr-feed:(\w+)" />', html)

    if feed in ['photo', 'photoset', 'entry'] or feed is None:
        # try to extract photos
        page_title = r1(r'<meta name="description" content="([^"\n]+)', html) or \
            r1(r'<meta property="og:description" content="([^"\n]+)', html) or \
            r1(r'<title>([^<\n]*)', html)
        urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.jpg)', html) + \
            re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.png)', html) + \
            re.findall(r'(https?://[^;"&]+/tumblr_[^";&]+_\d+\.gif)', html)

        tuggles = {}
        # FIX: use a dedicated loop variable instead of shadowing the `url`
        # parameter.  The original clobbered `url`, which broke the video /
        # universal_download fallback below whenever a photo-looking page
        # produced no downloadable photos.
        for img_url in urls:
            if img_url.endswith('.gif'):
                hd_url = img_url
            elif img_url.endswith('.jpg'):
                hd_url = r1(r'(.+)_\d+\.jpg$', img_url) + '_1280.jpg'  # FIXME: decide actual quality
            elif img_url.endswith('.png'):
                hd_url = r1(r'(.+)_\d+\.png$', img_url) + '_1280.png'  # FIXME: decide actual quality
            else:
                continue
            filename = parse.unquote(hd_url.split('/')[-1])
            title = '.'.join(filename.split('.')[:-1])
            tumblr_id = r1(r'^tumblr_(.+)_\d+$', title)
            quality = int(r1(r'^tumblr_.+_(\d+)$', title))
            ext = filename.split('.')[-1]
            try:
                size = int(get_head(hd_url)['Content-Length'])
                # keep only the highest-quality variant per tumblr id
                if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality:
                    tuggles[tumblr_id] = {
                        'title': title,
                        'url': hd_url,
                        'quality': quality,
                        'ext': ext,
                        'size': size,
                    }
            except Exception:
                # best-effort: skip any photo whose HEAD request fails
                pass

        if tuggles:
            size = sum([tuggles[t]['size'] for t in tuggles])
            print_info(site_info, page_title, None, size)

            if not info_only:
                for t in tuggles:
                    title = tuggles[t]['title']
                    ext = tuggles[t]['ext']
                    size = tuggles[t]['size']
                    print_info(site_info, title, ext, size)
                    download_urls([tuggles[t]['url']], title, ext, size,
                                  output_dir=output_dir)
            return

    # feed == 'audio' or feed == 'video' or feed is None
    # try to extract video / audio
    real_url = r1(r'source src=\\x22([^\\]+)\\', html)
    if not real_url:
        real_url = r1(r'audio_file=([^&]+)&', html)
        if real_url:
            real_url = real_url + '?plead=please-dont-download-this-or-our-lawyers-wont-let-us-host-audio'
    if not real_url:
        real_url = r1(r'<source src="([^"]*)"', html)
    if not real_url:
        iframe_url = r1(r'<[^>]+tumblr_video_container[^>]+><iframe[^>]+src=[\'"]([^\'"]*)[\'"]', html)
        if iframe_url is None:
            # no recognizable player at all: last-resort generic extraction
            universal_download(url, output_dir, merge=merge, info_only=info_only, **kwargs)
            return
        if iframe_url:
            iframe_html = get_content(iframe_url, headers=fake_headers)
            real_url = r1(r'<video[^>]*>[\n ]*<source[^>]+src=[\'"]([^\'"]*)[\'"]', iframe_html)
        else:
            # container matched but src was empty: fall back to the first
            # generic <iframe> (possibly a third-party player)
            iframe_url = r1(r'<iframe[^>]+src=[\'"]([^\'"]*)[\'"]', html)
            if iframe_url[:2] == '//':
                iframe_url = 'http:' + iframe_url
            if re.search(r'player\.vimeo\.com', iframe_url):
                vimeo_download(iframe_url, output_dir, merge=merge, info_only=info_only,
                               referer='http://tumblr.com/', **kwargs)
                return
            elif re.search(r'dailymotion\.com', iframe_url):
                dailymotion_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs)
                return
            elif re.search(r'vine\.co', iframe_url):
                vine_download(iframe_url, output_dir, merge=merge, info_only=info_only, **kwargs)
                return
            else:
                iframe_html = get_content(iframe_url)
                real_url = r1(r'<source src="([^"]*)"', iframe_html)

    title = unescape_html(r1(r'<meta property="og:title" content="([^"]*)" />', html) or
                          r1(r'<meta property="og:description" content="([^"]*)" />', html) or
                          r1(r'<title>([^<\n]*)', html) or url.split("/")[4]).replace('\n', '')

    # this is better: rebuild the canonical CDN URL from the video code
    vcode = r1(r'tumblr_(\w+)', real_url)
    real_url = 'https://vt.media.tumblr.com/tumblr_%s.mp4' % vcode

    # `mime` instead of `type`: avoid shadowing the builtin
    mime, ext, size = url_info(real_url, faker=True)

    print_info(site_info, title, mime, size)
    if not info_only:
        download_urls([real_url], title, ext, size, output_dir, merge=merge)
# Extractor registration: you-get's dispatcher looks up these module-level
# names.  Tumblr has no playlist support.
site_info = "Tumblr.com"
download = tumblr_download
download_playlist = playlist_not_supported('tumblr')