[instagram] fix extraction

This commit is contained in:
Mort Yao 2020-12-05 16:14:40 +01:00
parent 6f9cd8a069
commit 6e39a594e4
No known key found for this signature in database
GPG Key ID: 07DA00CB78203251

View File

@@ -9,7 +9,7 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
html = get_html(url)
vid = r1(r'instagram.com/\w+/([^/]+)', url)
description = r1(r'<meta property="og:title" content="([^"]*)"', html)
description = r1(r'<title>\s([^<]*)</title>', html)
title = "{} [{}]".format(description.replace("\n", " "), vid)
stream = r1(r'<meta property="og:video" content="([^"]*)"', html)
if stream:
@@ -19,11 +19,11 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
if not info_only:
download_urls([stream], title, ext, size, output_dir, merge=merge)
else:
data = re.search(r'window\._sharedData\s*=\s*(.*);</script>', html)
info = json.loads(data.group(1))
data = re.search(r'window\.__additionalDataLoaded\(\'[^\']+\',(.*)\);</script>', html)
post = json.loads(data.group(1))
if 'edge_sidecar_to_children' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
edges = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['edge_sidecar_to_children']['edges']
if 'edge_sidecar_to_children' in post['graphql']['shortcode_media']:
edges = post['graphql']['shortcode_media']['edge_sidecar_to_children']['edges']
for edge in edges:
title = edge['node']['shortcode']
image_url = edge['node']['display_url']
@@ -40,10 +40,10 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
total_size=size,
output_dir=output_dir)
else:
title = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['shortcode']
image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
image_url =info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
title = post['graphql']['shortcode_media']['shortcode']
image_url = post['graphql']['shortcode_media']['display_url']
if 'video_url' in post['graphql']['shortcode_media']:
image_url = post['graphql']['shortcode_media']['video_url']
ext = image_url.split('?')[0].split('.')[-1]
size = int(get_head(image_url)['Content-Length'])