From 7b845b34ce18863e519ad3cce8e53431ba41664d Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Sun, 11 Dec 2022 17:43:07 +0100
Subject: [PATCH] [tiktok] fix extraction for alternative URLs

---
 src/you_get/common.py            | 15 +++++++++------
 src/you_get/extractors/tiktok.py | 12 ++++++------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/you_get/common.py b/src/you_get/common.py
index 1558baf6..c337a2a2 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -344,21 +344,24 @@ def undeflate(data):
 
 # an http.client implementation of get_content()
 # because urllib does not support "Connection: keep-alive"
-def getHttps(host, url, headers, gzip=True, deflate=False, debuglevel=0):
+def getHttps(host, url, headers, debuglevel=0):
     import http.client
 
     conn = http.client.HTTPSConnection(host)
     conn.set_debuglevel(debuglevel)
     conn.request("GET", url, headers=headers)
     resp = conn.getresponse()
+    set_cookie = resp.getheader('set-cookie')
 
     data = resp.read()
-    if gzip:
-        data = ungzip(data)
-    if deflate:
-        data = undeflate(data)
+    try:
+        data = ungzip(data) # gzip
+        data = undeflate(data) # deflate
+    except:
+        pass
 
-    return str(data, encoding='utf-8'), resp.getheader('set-cookie')
+    conn.close()
+    return str(data, encoding='utf-8'), set_cookie
 
 
 # DEPRECATED in favor of get_content()
diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py
index 641e5e97..2c4892f6 100644
--- a/src/you_get/extractors/tiktok.py
+++ b/src/you_get/extractors/tiktok.py
@@ -16,12 +16,12 @@ def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
     m = re.match('(https?://)?([^/]+)(/.*)', url)
     host = m.group(2)
     if host != 'www.tiktok.com': # non-canonical URL
-        url = get_location(url, headers=headers)
-        m = re.match('(https?://)?([^/]+)(/.*)', url)
-        host = m.group(2)
-
-    url = m.group(3).split('?')[0]
-    vid = url.split('/')[3] # should be a string of numbers
+        vid = r1(r'/video/(\d+)', url)
+        url = 'https://www.tiktok.com/@/video/%s/' % vid
+        host = 'www.tiktok.com'
+    else:
+        url = m.group(3).split('?')[0]
+        vid = url.split('/')[3] # should be a string of numbers
 
     html, set_cookie = getHttps(host, url, headers=headers)
     tt_chain_token = r1('tt_chain_token=([^;]+);', set_cookie)