From fd2a290ffe9193043b025ceb122042cf6623931d Mon Sep 17 00:00:00 2001 From: jackyzy823 Date: Sat, 28 May 2016 09:32:07 +0000 Subject: [PATCH] [QQ] more robust url extractor --- src/you_get/extractors/qq.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/you_get/extractors/qq.py b/src/you_get/extractors/qq.py index cb4aeebf..783bb2f5 100644 --- a/src/you_get/extractors/qq.py +++ b/src/you_get/extractors/qq.py @@ -4,7 +4,7 @@ __all__ = ['qq_download'] from ..common import * from .qie import download as qieDownload - +from urllib.parse import urlparse,parse_qs def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False): api = "http://h5vv.video.qq.com/getinfo?otype=json&platform=10901&vid=%s" % vid content = get_html(api) @@ -24,31 +24,35 @@ def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False): download_urls([url], title, ext, size, output_dir=output_dir, merge=merge) def qq_download(url, output_dir='.', merge=True, info_only=False, **kwargs): + if 'live.qq.com' in url: + qieDownload(url,output_dir=output_dir, merge=merge, info_only=info_only) + return + + #do redirect if 'v.qq.com/page' in url: # for URLs like this: # http://v.qq.com/page/k/9/7/k0194pwgw97.html - # it will redirect. - vid = match1(url, r'\b(\w+).html') - title = vid - elif 'kuaibao.qq.com' in url: + content = get_html(url) + url = match1(content,r'window\.location\.href="(.*?)"') + + if 'kuaibao.qq.com' in url: content = get_html(url) vid = match1(content, r'vid\s*=\s*"\s*([^"]+)"') title = match1(content, r'title">([^"]+)

') title = title.strip() if title else vid - elif 'live.qq.com' in url: - qieDownload(url,output_dir=output_dir, merge=merge, info_only=info_only) - exit() elif 'iframe/player.html' in url: vid = match1(url, r'\bvid=(\w+)') # for embedded URLs; don't know what the title is title = vid else: content = get_html(url) - vid = match1(content, r'vid\s*:\s*"\s*([^"]+)"') - title = match1(content, r'title\s*:\s*"\s*([^"]+)"') - # try to get the right title for URLs like this: - # http://v.qq.com/cover/p/ps6mnfqyrfo7es3.html?vid=q0181hpdvo5 - title = matchall(content, [r'title\s*:\s*"\s*([^"]+)"'])[-1] + vid = parse_qs(urlparse(url).query).get('vid') #for links specified vid like http://v.qq.com/cover/p/ps6mnfqyrfo7es3.html?vid=q0181hpdvo5 + vid = vid[0] if vid else match1(content, r'vid\s*:\s*"\s*([^"]+)"') #general fallback + title = match1(content,r''%vid) + title = match1(content, r'title">([^"]+)

') if not title else title + title = vid if not title else title #general fallback + + qq_download_by_vid(vid, title, output_dir, merge, info_only)