Update douyin.py

Updated the extractors for the latest www.douyin.com
This commit is contained in:
Philip Xu 2021-07-06 06:24:52 -04:00 committed by GitHub
parent 5445f5ecde
commit 71780ae4aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,6 +1,7 @@
# coding=utf-8 # coding=utf-8
import re import re
import json
from urllib.parse import unquote from urllib.parse import unquote
from ..common import ( from ..common import (
@ -18,17 +19,17 @@ __all__ = ['douyin_download_by_url']
def douyin_download_by_url(url, **kwargs): def douyin_download_by_url(url, **kwargs):
page_content = get_content(url, headers=fake_headers) page_content = get_content(url, headers=fake_headers)
# The easiest way to get the title is, obviously, from <title> # The video player and video source are rendered client-side; the data
title = re.findall(r'<title.*>(.*)</title>', page_content)[0].strip() # is contained in a <script id="RENDER_DATA" type="application/json"> tag
# Remove the site name from title # in URL-quoted form, so unquote the whole page content first and then
site_name = ' - 抖音' # search it with ordinary (unescaped) regex patterns.
if title.endswith(site_name): page_content = unquote(page_content)
title = title[:-len(site_name)] title = re.findall(r'"desc":"([^"]*)"', page_content)[0].strip()
video_format = 'mp4' video_format = 'mp4'
# The video url is url escaped, as of today, there are 4 working CDN video # video URLs are in this pattern {"src":"THE_URL"}, in json format
# urls for the same video, I chose the shortest one. urls_pattern = r'"playAddr":(\[.*?\])'
cdn_pattern = r'(api\.amemv\.com.*PackSourceEnum_AWEME_DETAIL)' urls = json.loads(re.findall(urls_pattern, page_content)[0])
video_url = 'https://' + unquote(re.findall(cdn_pattern, page_content)[0]) video_url = 'https:' + urls[0]['src']
size = url_size(video_url, faker=True) size = url_size(video_url, faker=True)
print_info( print_info(
site_info='douyin.com', title=title, site_info='douyin.com', title=title,