From 16970c041f10ce8799993f4717f5a333eb4f380b Mon Sep 17 00:00:00 2001 From: Rafael Lee Date: Wed, 30 Nov 2022 01:25:10 +0800 Subject: [PATCH] Add function to save tweets in json format --- src/you_get/extractors/twitter.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 7453fcff..542ca7f4 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -52,6 +52,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) api_url = 'https://api.twitter.com/2/timeline/conversation/%s.json?tweet_mode=extended' % item_id api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) info = json.loads(api_content) + twitter_write_json(info, screen_name, item_id) if item_id not in info['globalObjects']['tweets']: # something wrong here #log.wtf('[Failed] ' + info['timeline']['instructions'][0]['addEntries']['entries'][0]['content']['item']['content']['tombstone']['tombstoneInfo']['richText']['text'], exit_code=None) @@ -103,6 +104,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) api_url = 'https://api.twitter.com/1.1/statuses/show/%s.json?tweet_mode=extended' % item_id api_content = get_content(api_url, headers={'authorization': authorization, 'x-guest-token': guest_token}) info = json.loads(api_content) + twitter_write_json(info, screen_name, item_id) media = info['extended_entities']['media'] for medium in media: @@ -129,6 +131,30 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) download_urls(urls, title, ext, size, output_dir, merge=merge) +def twitter_write_json(info, screen_name, item_id): + # Saves tweets in human-readable JSON format: for an API response containing 'globalObjects', collects created_at, user_id_str, full_text and the author's name for every tweet and writes them to '<screen_name>_<item_id>.json'. NOTE(review): the file is opened by bare name, so it lands in the current working directory rather than twitter_download's output_dir, and open() is given no explicit encoding although ensure_ascii=False can emit non-ASCII text -- confirm both are intended. + # # uncomment these lines if you need the original JSON returned by the API + # info_string = json.dumps(info, indent=" ", ensure_ascii=False) + # with 
open(screen_name+'_'+item_id+"_tweet.json", 'w') as fw: + # fw.write(info_string) + if 'globalObjects' in info.keys(): + tweets = info['globalObjects']['tweets'] + info_users = info["globalObjects"]['users'] + tweets_simplified = {} + for key in tweets.keys(): + user_id_str = tweets[key]['user_id_str'] + tweets_simplified[key] = {} + tweets_simplified[key]['created_at'] = tweets[key]['created_at'] + tweets_simplified[key]['user_id_str'] = tweets[key]['user_id_str'] + tweets_simplified[key]['full_text'] = tweets[key]['full_text'] + tweets_simplified[key]['name'] = info_users[user_id_str]['name'] + + tweet_string = json.dumps( + tweets_simplified, indent=" ", ensure_ascii=False) + with open(screen_name+'_'+item_id+".json", 'w') as fw: + fw.write(tweet_string) + + site_info = "Twitter.com" download = twitter_download download_playlist = playlist_not_supported('twitter')