fix douyin extractor

2025-01-23 21:45:02 +03:00 · 2022-07-25 12:34:55 +08:00 · 2022-07-25 12:34:55 +08:00 · 4119a1493e
commit 4119a1493e
parent 82b376a0c6
1 changed files with 38 additions and 13 deletions
--- a/src/you_get/extractors/douyin.py
+++ b/src/you_get/extractors/douyin.py
@ -1,8 +1,6 @@
 # coding=utf-8
 import re
 import json
 from urllib.parse import unquote
 from ..common import (
    url_size,
@ -11,25 +9,52 @@ from ..common import (
    fake_headers,
    download_urls,
    playlist_not_supported,
    match1,
    get_location,
 )
 __all__ = ['douyin_download_by_url']
 def get_value(source: dict, path):
     """Walk *source* along *path* and return the value found there.

     Each element of *path* selects one nesting level: a ``str`` is used
     as a mapping key and an ``int`` as an index. Elements of any other
     type are ignored and the traversal continues with the next element.

     Args:
         source: The nested structure to read from (e.g. decoded JSON).
         path: Iterable of ``str`` keys and ``int`` indexes, outermost
             first.

     Returns:
         The value at the end of *path*, or ``None`` when a key is
         missing, a container is empty, an index is out of range, or an
         intermediate value does not support the requested lookup.
     """
     value = source
     try:
         for key in path:
             if isinstance(key, str):
                 if key not in value:
                     return None
                 value = value[key]
             elif isinstance(key, int):
                 # Empty containers short-circuit to None before indexing.
                 if len(value) == 0:
                     return None
                 value = value[key]
     except (AttributeError, IndexError, KeyError, TypeError):
         # Only lookup failures mean "not found"; anything else (notably
         # KeyboardInterrupt / SystemExit) must propagate to the caller.
         return None
     return value
 def douyin_download_by_url(url, **kwargs):
    # if short link, get the real url
    if 'v.douyin.com' in url:
        url = get_location(url)
    aweme_id = match1(url, r'/(\d+)/?')
    # get video info
    video_info_api = 'https://www.douyin.com/web/api/v2/aweme/iteminfo/?item_ids={}'
    url = video_info_api.format(aweme_id)
    page_content = get_content(url, headers=fake_headers)
-    # The video player and video source are rendered client-side, the data
+    video_info = json.loads(page_content)
-    # contains in a <script id="RENDER_DATA" type="application/json"> tag
+
-    # quoted, unquote the whole page content then search using regex with
+    # get video id and title
-    # regular string.
+    video_id = get_value(video_info, ['item_list', 0, 'video', 'vid'])
-    page_content = unquote(page_content)
+    title = get_value(video_info, ['item_list', 0, 'desc'])
-    title = re.findall(r'"desc":"([^"]*)"', page_content)[0].strip()
+
    # get video play url
    video_url = "https://aweme.snssdk.com/aweme/v1/playwm/?ratio=720p&line=0&video_id={}".format(video_id)
    video_format = 'mp4'
    # video URLs are in this pattern {"src":"THE_URL"}, in json format
    urls_pattern = r'"playAddr":(\[.*?\])'
    urls = json.loads(re.findall(urls_pattern, page_content)[0])
    video_url = 'https:' + urls[0]['src']
    size = url_size(video_url, faker=True)
    print_info(
        site_info='douyin.com', title=title,