mirror of
https://github.com/soimort/you-get.git
synced 2025-01-23 13:35:16 +03:00
[twitter] fix extraction
This commit is contained in:
parent
ad5825a8f6
commit
e733351a05
@ -23,7 +23,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs)
|
|||||||
if re.match(r'https?://mobile', url): # normalize mobile URL
|
if re.match(r'https?://mobile', url): # normalize mobile URL
|
||||||
url = 'https://' + match1(url, r'//mobile\.(.+)')
|
url = 'https://' + match1(url, r'//mobile\.(.+)')
|
||||||
|
|
||||||
if re.match(r'https?://twitter\.com/i/moments/', url): # moments
|
if re.match(r'https?://twitter\.com/i/moments/', url): # FIXME: moments
|
||||||
html = get_html(url, faker=True)
|
html = get_html(url, faker=True)
|
||||||
paths = re.findall(r'data-permalink-path="([^"]+)"', html)
|
paths = re.findall(r'data-permalink-path="([^"]+)"', html)
|
||||||
for path in paths:
|
for path in paths:
|
||||||
@ -34,114 +34,47 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs)
|
|||||||
**kwargs)
|
**kwargs)
|
||||||
return
|
return
|
||||||
|
|
||||||
headers = {
|
m = re.match('^https?://(mobile\.)?twitter\.com/([^/]+)/status/(\d+)', url)
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
|
assert m
|
||||||
}
|
screen_name, item_id = m.group(2), m.group(3)
|
||||||
host = 'www.twitter.com'
|
|
||||||
|
|
||||||
html, set_cookie = getHttps(host, url, headers=headers)
|
|
||||||
# "Found. Redirecting to..."
|
|
||||||
guest_id = r1('guest_id=([^;]+);', set_cookie)
|
|
||||||
headers['Cookie'] = 'guest_id=%s' % guest_id
|
|
||||||
|
|
||||||
html = get_content(url, headers=headers)
|
|
||||||
|
|
||||||
screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \
|
|
||||||
r1(r'<meta name="twitter:title" content="([^"]*)"', html)
|
|
||||||
item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', url) or r1(r'data-item-id="([^"]*)"', html) or \
|
|
||||||
r1(r'<meta name="twitter:site:id" content="([^"]*)"', html)
|
|
||||||
page_title = "{} [{}]".format(screen_name, item_id)
|
page_title = "{} [{}]".format(screen_name, item_id)
|
||||||
|
|
||||||
try:
|
# FIXME: this API won't work for protected or nsfw contents
|
||||||
authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
api_url = 'https://cdn.syndication.twimg.com/tweet-result?id=%s' % item_id
|
||||||
|
content = get_content(api_url)
|
||||||
|
info = json.loads(content)
|
||||||
|
|
||||||
# FIXME: 403 with cookies
|
author = info['user']['name']
|
||||||
ga_url = 'https://api.twitter.com/1.1/guest/activate.json'
|
url = 'https://twitter.com/%s/status/%s' % (info['user']['screen_name'], item_id)
|
||||||
ga_content = post_content(ga_url, headers={'authorization': authorization})
|
full_text = info['text']
|
||||||
guest_token = json.loads(ga_content)['guest_token']
|
|
||||||
|
|
||||||
api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id
|
if 'photos' in info:
|
||||||
api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token})
|
for photo in info['photos']:
|
||||||
|
photo_url = photo['url']
|
||||||
|
title = item_id + '_' + photo_url.split('.')[-2].split('/')[-1]
|
||||||
|
urls = [ photo_url + ':orig' ]
|
||||||
|
size = urls_size(urls)
|
||||||
|
ext = photo_url.split('.')[-1]
|
||||||
|
|
||||||
info = json.loads(api_content)
|
print_info(site_info, title, ext, size)
|
||||||
if item_id not in info['globalObjects']['tweets']:
|
if not info_only:
|
||||||
# something wrong here
|
download_urls(urls, title, ext, size, output_dir, merge=merge)
|
||||||
log.w('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'])
|
|
||||||
assert False
|
|
||||||
|
|
||||||
elif 'extended_entities' in info['globalObjects']['tweets'][item_id]:
|
if 'video' in info:
|
||||||
# if the tweet contains media, download them
|
for mediaDetail in info['mediaDetails']:
|
||||||
media = info['globalObjects']['tweets'][item_id]['extended_entities']['media']
|
variants = mediaDetail['video_info']['variants']
|
||||||
|
|
||||||
elif 'entities' in info['globalObjects']['tweets'][item_id]:
|
|
||||||
# if the tweet contains media from another tweet, download it
|
|
||||||
expanded_url = None
|
|
||||||
for j in info['globalObjects']['tweets'][item_id]['entities']['urls']:
|
|
||||||
if re.match(r'^https://twitter.com/.*', j['expanded_url']):
|
|
||||||
# FIXME: multiple valid expanded_url's?
|
|
||||||
expanded_url = j['expanded_url']
|
|
||||||
if expanded_url is not None:
|
|
||||||
item_id = r1(r'/status/(\d+)', expanded_url)
|
|
||||||
assert False
|
|
||||||
|
|
||||||
elif info['globalObjects']['tweets'][item_id].get('is_quote_status') == True:
|
|
||||||
# if the tweet does not contain media, but it quotes a tweet
|
|
||||||
# and the quoted tweet contains media, download them
|
|
||||||
item_id = info['globalObjects']['tweets'][item_id]['quoted_status_id_str']
|
|
||||||
|
|
||||||
api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id
|
|
||||||
api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token})
|
|
||||||
|
|
||||||
info = json.loads(api_content)
|
|
||||||
|
|
||||||
if 'extended_entities' in info['globalObjects']['tweets'][item_id]:
|
|
||||||
media = info['globalObjects']['tweets'][item_id]['extended_entities']['media']
|
|
||||||
else:
|
|
||||||
# quoted tweet has no media
|
|
||||||
return
|
|
||||||
|
|
||||||
else:
|
|
||||||
# no media, no quoted tweet
|
|
||||||
return
|
|
||||||
|
|
||||||
except:
|
|
||||||
log.w('[Warning] Falling back to deprecated Twitter API. Extraction may be incomplete.')
|
|
||||||
|
|
||||||
authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw'
|
|
||||||
|
|
||||||
# FIXME: 403 with cookies
|
|
||||||
ga_url = 'https://api.twitter.com/1.1/guest/activate.json'
|
|
||||||
ga_content = post_content(ga_url, headers={'authorization': authorization})
|
|
||||||
guest_token = json.loads(ga_content)['guest_token']
|
|
||||||
|
|
||||||
api_url = 'https://api.twitter.com/1.1/statuses/show/%s.json?tweet_mode=extended' % item_id
|
|
||||||
api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token})
|
|
||||||
info = json.loads(api_content)
|
|
||||||
media = info['extended_entities']['media']
|
|
||||||
|
|
||||||
for medium in media:
|
|
||||||
if 'video_info' in medium:
|
|
||||||
variants = medium['video_info']['variants']
|
|
||||||
variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0))
|
variants = sorted(variants, key=lambda kv: kv.get('bitrate', 0))
|
||||||
title = item_id + '_' + variants[-1]['url'].split('/')[-1].split('?')[0].split('.')[0]
|
title = item_id + '_' + variants[-1]['url'].split('/')[-1].split('?')[0].split('.')[0]
|
||||||
urls = [ variants[-1]['url'] ]
|
urls = [ variants[-1]['url'] ]
|
||||||
size = urls_size(urls)
|
size = urls_size(urls)
|
||||||
mime, ext = variants[-1]['content_type'], 'mp4'
|
mime, ext = variants[-1]['content_type'], 'mp4'
|
||||||
|
|
||||||
print_info(site_info, title, mime, size)
|
|
||||||
if not info_only:
|
|
||||||
download_urls(urls, title, ext, size, output_dir, merge=merge)
|
|
||||||
|
|
||||||
else:
|
|
||||||
title = item_id + '_' + medium['media_url_https'].split('.')[-2].split('/')[-1]
|
|
||||||
urls = [ medium['media_url_https'] + ':orig' ]
|
|
||||||
size = urls_size(urls)
|
|
||||||
ext = medium['media_url_https'].split('.')[-1]
|
|
||||||
|
|
||||||
print_info(site_info, title, ext, size)
|
print_info(site_info, title, ext, size)
|
||||||
if not info_only:
|
if not info_only:
|
||||||
download_urls(urls, title, ext, size, output_dir, merge=merge)
|
download_urls(urls, title, ext, size, output_dir, merge=merge)
|
||||||
|
|
||||||
|
# TODO: should we deal with quoted tweets?
|
||||||
|
|
||||||
|
|
||||||
site_info = "Twitter.com"
|
site_info = "Twitter.com"
|
||||||
download = twitter_download
|
download = twitter_download
|
||||||
|
Loading…
Reference in New Issue
Block a user