#!/usr/bin/env python
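"""The 'universal' extractor: sniff a web page for common media URLs (HLS
playlists, MPEG-DASH manifests, and direct links to media files) and download
whatever looks interesting; a non-HTML URL is downloaded directly.
"""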

__all__ = ['universal_download']

from ..common import *
from .embed import *


def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
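    """Download media from an arbitrary URL.

    Try an embedded-player extractor first; failing that, scrape the HTML
    page for HLS playlists, MPEG-DASH manifests and direct media links.
    A non-HTML URL is downloaded directly. Extra keyword arguments are
    passed through to the embed extractor.
    """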
    try:
        content_type = get_head(url, headers=fake_headers)['Content-Type']
    except:
        # some servers do not answer HEAD requests; fall back to GET
        content_type = get_head(url, headers=fake_headers, get_method='GET')['Content-Type']

    if content_type.startswith('text/html'):
        # the page may contain an embedded player; try the embed extractor first
        try:
            embed_download(url, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
        except Exception:
            pass
        else:
            return

    domains = url.split('/')[2].split('.')
    if len(domains) > 2: domains = domains[1:]
    site_info = '.'.join(domains)

    if content_type.startswith('text/html'):
        # extract from an HTML page
        response = get_response(url, faker=True)
        page = str(response.data)

        page_title = r1(r'<title>([^<]*)', page)
        if page_title:
            page_title = unescape_html(page_title)

        # HLS playlists (M3U8): let FFmpeg fetch the segments and remux to MP4
        hls_urls = re.findall(r'(https?://[^;"\'\\]+' + r'\.m3u8?' +
                              r'[^;"\'\\]*)', page)
        if hls_urls:
            for hls_url in hls_urls:
                type_, ext, size = url_info(hls_url)
                print_info(site_info, page_title, type_, size)
                if not info_only:
                    download_url_ffmpeg(url=hls_url, title=page_title,
                                        ext='mp4', output_dir=output_dir)
            return

        # most common media file extensions on the Internet
        media_exts = [r'\.flv', r'\.mp3', r'\.mp4', r'\.webm',
                      r'[-_]1\d\d\d\.jpe?g', r'[-_][6-9]\d\d\.jpe?g',  # tumblr
                      r'[-_]1\d\d\dx[6-9]\d\d\.jpe?g',
                      r'[-_][6-9]\d\dx1\d\d\d\.jpe?g',
                      r'[-_][6-9]\d\dx[6-9]\d\d\.jpe?g',
                      r's1600/[\w%]+\.jpe?g',  # blogger
                      r'img[6-9]\d\d/[\w%]+\.jpe?g'  # oricon?
                      ]

        urls = []
        for i in media_exts:
            # plain URLs
            urls += re.findall(r'(https?://[^;"\'\\]+' + i + r'[^;"\'\\]*)', page)

            # percent-encoded URLs
            p_urls = re.findall(r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page)
            urls += [parse.unquote(url) for url in p_urls]

            # URLs whose slashes are backslash-escaped (e.g. inside embedded JSON)
            q_urls = re.findall(r'(https?:\\\\/\\\\/[^;"\']+' + i + r'[^;"\']*)', page)
            urls += [url.replace('\\\\/', '/') for url in q_urls]

        # a link href to an image is often an interesting one
        urls += re.findall(r'href="(https?://[^"]+\.jpe?g)"', page, re.I)
        urls += re.findall(r'href="(https?://[^"]+\.png)"', page, re.I)
        urls += re.findall(r'href="(https?://[^"]+\.gif)"', page, re.I)
2015-10-19 04:50:17 +03:00
|
|
|
|
2017-09-12 13:37:16 +03:00
|
|
|
# MPEG-DASH MPD
|
|
|
|
mpd_urls = re.findall(r'src="(https?://[^"]+\.mpd)"', page)
|
|
|
|
for mpd_url in mpd_urls:
|
|
|
|
cont = get_content(mpd_url)
|
|
|
|
base_url = r1(r'<BaseURL>(.*)</BaseURL>', cont)
|
|
|
|
urls += [ r1(r'(.*/)[^/]*', mpd_url) + base_url ]
|
|
|
|

        # have some candy!
        candies = []
        i = 1
        for url in set(urls):
            filename = parse.unquote(url.split('/')[-1])
            if 5 <= len(filename) <= 80:
                # use the filename (minus its extension) as the title
                title = '.'.join(filename.split('.')[:-1])
            else:
                # fall back to a running number for unusably short or long filenames
                title = '%s' % i
                i += 1

            candies.append({'url': url,
                            'title': title})

        for candy in candies:
            try:
                mime, ext, size = url_info(candy['url'], faker=True)
                if not size: size = float('Inf')
            except:
                continue
            else:
                print_info(site_info, candy['title'], ext, size)
                if not info_only:
                    download_urls([candy['url']], candy['title'], ext, size,
                                  output_dir=output_dir, merge=merge,
                                  faker=True)
        return

    else:
        # direct download of a non-HTML resource
        filename = parse.unquote(url.split('/')[-1])
        title = '.'.join(filename.split('.')[:-1])
        ext = filename.split('.')[-1]
        _, _, size = url_info(url, faker=True)
        print_info(site_info, title, ext, size)
        if not info_only:
            download_urls([url], title, ext, size,
                          output_dir=output_dir, merge=merge,
                          faker=True)
        return

site_info = None
download = universal_download
download_playlist = playlist_not_supported('universal')
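
# A minimal usage sketch (assuming the standard you-get package layout, where
# this module lives under you_get.extractors; the URL below is only a placeholder):
#
#     from you_get.extractors.universal import universal_download
#     universal_download('https://example.com/some/page', info_only=True)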