you-get/src/you_get/extractors/twitter.py

#!/usr/bin/env python

__all__ = ['twitter_download']

from ..common import *
from .vine import vine_download


def extract_m3u(source):
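    # Collect every /ext_tw_video/ or /amplify_video/ path in the master
    # playlist, fetch the last variant listed, and return absolute URLs
    # for the streams that variant references.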
    r1 = get_content(source)
    s1 = re.findall(r'(/ext_tw_video/.*)', r1)
    s1 += re.findall(r'(/amplify_video/.*)', r1)
    r2 = get_content('https://video.twimg.com%s' % s1[-1])
    s2 = re.findall(r'(/ext_tw_video/.*)', r2)
    s2 += re.findall(r'(/amplify_video/.*)', r2)
    return ['https://video.twimg.com%s' % i for i in s2]
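
# NOTE: extract_m3u is no longer referenced in this module; it predates the
# guest-token API flow used in twitter_download below.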


def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    if re.match(r'https?://mobile', url):  # normalize mobile URL
        url = 'https://' + match1(url, r'//mobile\.(.+)')
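
    # A Twitter "moment" is a curated collection of tweets; recurse on each
    # embedded tweet's permalink and download it individually.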
    if re.match(r'https?://twitter\.com/i/moments/', url):  # moments
        html = get_html(url, faker=True)
        paths = re.findall(r'data-permalink-path="([^"]+)"', html)
        for path in paths:
            twitter_download('https://twitter.com' + path,
                             output_dir=output_dir,
                             merge=merge,
                             info_only=info_only,
                             **kwargs)
        return
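
    # Single tweet: read the screen name and item ID from the page metadata;
    # they form the title, and item_id keys the API lookup below.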
    html = get_html(url, faker=True)

    screen_name = r1(r'data-screen-name="([^"]*)"', html) or \
                  r1(r'<meta name="twitter:title" content="([^"]*)"', html)
    item_id = r1(r'data-item-id="([^"]*)"', html) or \
              r1(r'<meta name="twitter:site:id" content="([^"]*)"', html)
    page_title = "{} [{}]".format(screen_name, item_id)
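
    # Try the image branch first; if the page has no og:image tags the assert
    # raises, and the bare except below falls through to the video branch.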
    try:  # extract images
        urls = re.findall(r'property="og:image"\s*content="([^"]+:large)"', html)
        assert urls
        images = []
        for url in urls:
            url = ':'.join(url.split(':')[:-1]) + ':orig'
            filename = parse.unquote(url.split('/')[-1])
            title = '.'.join(filename.split('.')[:-1])
            ext = url.split(':')[-2].split('.')[-1]
            size = int(get_head(url)['Content-Length'])
            images.append({'title': title,
                           'url': url,
                           'ext': ext,
                           'size': size})
        size = sum([image['size'] for image in images])
        print_info(site_info, page_title, images[0]['ext'], size)

        if not info_only:
            for image in images:
                title = image['title']
                ext = image['ext']
                size = image['size']
                url = image['url']
                print_info(site_info, title, ext, size)
                download_urls([url], title, ext, size,
                              output_dir=output_dir)

    except:  # extract video
        #i_url = 'https://twitter.com/i/videos/' + item_id
        #i_content = get_content(i_url)
        #js_url = r1(r'src="([^"]+)"', i_content)
        #js_content = get_content(js_url)
        #authorization = r1(r'"(Bearer [^"]+)"', js_content)
        authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
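
        # The token above is the hard-coded bearer token of Twitter's public
        # web player (the commented-out lines show how it was formerly scraped
        # from the player JS at runtime). Pair it with a guest token to call
        # the API without logging in: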
        ga_url = 'https://api.twitter.com/1.1/guest/activate.json'
        ga_content = post_content(ga_url, headers={'authorization': authorization})
        guest_token = json.loads(ga_content)['guest_token']
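
        # Query the conversation timeline API for this tweet and keep the
        # highest-bitrate variant (sorted ascending, so it is the last entry).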
        api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id
        api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token})
        info = json.loads(api_content)
        variants = info['globalObjects']['tweets'][item_id]['extended_entities']['media'][0]['video_info']['variants']
        variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0))
        urls = [variants[-1]['url']]
        size = urls_size(urls)
        mime, ext = variants[-1]['content_type'], 'mp4'

        print_info(site_info, page_title, mime, size)
        if not info_only:
            download_urls(urls, page_title, ext, size, output_dir, merge=merge)

site_info = "Twitter.com"
download = twitter_download
download_playlist = playlist_not_supported('twitter')
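
# A minimal usage sketch (an assumption, not part of the original module);
# the tweet URL below is a placeholder:
#
#     from you_get.extractors.twitter import twitter_download
#     twitter_download('https://twitter.com/user/status/1234567890',
#                      info_only=True)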