diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py index 50780ac1..09a36bb8 100644 --- a/src/you_get/extractors/tiktok.py +++ b/src/you_get/extractors/tiktok.py @@ -5,16 +5,6 @@ __all__ = ['tiktok_download'] from ..common import * def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - while True: - m = re.match('(https?://)?([^/]+)(/.*)', url) - host = m.group(2) - if host == 'www.tiktok.com': # canonical URL reached - url = m.group(3).split('?')[0] - vid = url.split('/')[3] # should be a string of numbers - break - else: - url = get_location(url) - headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', 'Accept-Encoding': 'gzip, deflate', @@ -22,7 +12,20 @@ def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs): 'Connection': 'keep-alive' # important } + m = re.match('(https?://)?([^/]+)(/.*)', url) + host = m.group(2) + if host != 'www.tiktok.com': # non-canonical URL + html = getHttps(host, url, headers=headers, gzip=False) + url = r1(r'(https://www.tiktok.com/[^?"]+)', html) + # use canonical URL + m = re.match('(https?://)?([^/]+)(/.*)', url) + host = m.group(2) + + url = m.group(3).split('?')[0] + vid = url.split('/')[3] # should be a string of numbers + html = getHttps(host, url, headers=headers) + data = r1(r'window\[\'SIGI_STATE\'\]=(.*?);window\[\'SIGI_RETRY\'\]', html) or \ r1(r'', html) info = json.loads(data)