fix douyin extractor

This commit is contained in:
owlwang 2022-07-25 12:34:55 +08:00 committed by GitHub
parent 82b376a0c6
commit 4119a1493e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,8 +1,6 @@
# coding=utf-8 # coding=utf-8
import re
import json import json
from urllib.parse import unquote
from ..common import ( from ..common import (
url_size, url_size,
@ -11,25 +9,52 @@ from ..common import (
fake_headers, fake_headers,
download_urls, download_urls,
playlist_not_supported, playlist_not_supported,
match1,
get_location,
) )
__all__ = ['douyin_download_by_url'] __all__ = ['douyin_download_by_url']
def get_value(source: dict, path):
    """Walk a nested structure along *path* and return what is found there.

    Each element of *path* is applied in order as a subscript (``value[key]``)
    to the result of the previous step, starting from *source*. String
    elements are typically mapping keys and integer elements sequence
    indices, e.g. ``['item_list', 0, 'video', 'vid']``.

    Args:
        source: The outermost container to start from.
        path: An iterable of keys/indices describing the route to the value.

    Returns:
        The value located at *path*, or ``None`` if any step cannot be
        resolved (missing key, index out of range, or an intermediate value
        that cannot be subscripted with the given key). An empty *path*
        returns *source* itself.
    """
    value = source
    for key in path:
        try:
            value = value[key]
        except (LookupError, TypeError):
            # LookupError covers KeyError (missing mapping key) and IndexError
            # (sequence too short); TypeError covers intermediate values that
            # are not subscriptable or reject this kind of key. Anything else
            # (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
            return None
    return value
def douyin_download_by_url(url, **kwargs): def douyin_download_by_url(url, **kwargs):
# if short link, get the real url
if 'v.douyin.com' in url:
url = get_location(url)
aweme_id = match1(url, r'/(\d+)/?')
# get video info
video_info_api = 'https://www.douyin.com/web/api/v2/aweme/iteminfo/?item_ids={}'
url = video_info_api.format(aweme_id)
page_content = get_content(url, headers=fake_headers) page_content = get_content(url, headers=fake_headers)
# The video player and video source are rendered client-side, the data video_info = json.loads(page_content)
# contains in a <script id="RENDER_DATA" type="application/json"> tag
# quoted, unquote the whole page content then search using regex with # get video id and title
# regular string. video_id = get_value(video_info, ['item_list', 0, 'video', 'vid'])
page_content = unquote(page_content) title = get_value(video_info, ['item_list', 0, 'desc'])
title = re.findall(r'"desc":"([^"]*)"', page_content)[0].strip()
# get video play url
video_url = "https://aweme.snssdk.com/aweme/v1/playwm/?ratio=720p&line=0&video_id={}".format(video_id)
video_format = 'mp4' video_format = 'mp4'
# video URLs are in this pattern {"src":"THE_URL"}, in json format
urls_pattern = r'"playAddr":(\[.*?\])'
urls = json.loads(re.findall(urls_pattern, page_content)[0])
video_url = 'https:' + urls[0]['src']
size = url_size(video_url, faker=True) size = url_size(video_url, faker=True)
print_info( print_info(
site_info='douyin.com', title=title, site_info='douyin.com', title=title,