From db16bcb659ae98cfdbf69fa8c4acbf913fd6ab7f Mon Sep 17 00:00:00 2001 From: Zhang Date: Thu, 22 Dec 2016 22:33:37 +0800 Subject: [PATCH 01/22] [BiliBili] Better Multi-Part Video Naming --- src/you_get/extractors/bilibili.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 122dea0b..043c3753 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -167,10 +167,10 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs if not pages: cids = [cid] titles = [r1(r'', html) or title] - for i in range(len(cids)): + completeTitle=title+"-"+titles[i]#Build Better Title bilibili_download_by_cid(cids[i], - titles[i], + completeTitle, output_dir=output_dir, merge=merge, info_only=info_only) From bc5ff346d043e8097b81902d6f5392fc3e7869fc Mon Sep 17 00:00:00 2001 From: Zhang Date: Fri, 23 Dec 2016 11:50:51 +0800 Subject: [PATCH 02/22] [BiliBili] revert naming for single part videos --- src/you_get/extractors/bilibili.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 043c3753..2e54ed47 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -168,7 +168,11 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs cids = [cid] titles = [r1(r'', html) or title] for i in range(len(cids)): - completeTitle=title+"-"+titles[i]#Build Better Title + completeTitle=None + if (title == titles[i]): + completeTitle=title + else: + completeTitle=title+"-"+titles[i]#Build Better Title bilibili_download_by_cid(cids[i], completeTitle, output_dir=output_dir, From 7eca091d0df30f84520f3b665754828f33be95ae Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 10 Jan 2017 18:45:28 +0100 Subject: [PATCH 03/22] tag classifier: Python 3.6 --- you-get.json | 1 + 1 file changed, 1 insertion(+) diff --git a/you-get.json b/you-get.json index 084657d9..594742c2 100644 --- a/you-get.json +++ b/you-get.json @@ -24,6 +24,7 @@ "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", "Topic :: Internet", "Topic :: Internet :: WWW/HTTP", "Topic :: Multimedia", From 0c1553b97d981a5ab0ffc7605b8c70646423ce3f Mon Sep 17 00:00:00 2001 From: Valdemar Erk Date: Sun, 15 Jan 2017 12:43:34 +0100 Subject: [PATCH 04/22] Fixes #1612 --- src/you_get/extractors/panda.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/you_get/extractors/panda.py b/src/you_get/extractors/panda.py index 3f9ceade..45249bd2 100644 --- a/src/you_get/extractors/panda.py +++ b/src/you_get/extractors/panda.py @@ -8,22 +8,28 @@ import time def panda_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): roomid = url[url.rfind('/')+1:] - json_request_url = 'http://www.panda.tv/api_room?roomid={}&pub_key=&_={}'.format(roomid, int(time.time())) + json_request_url ="http://www.panda.tv/api_room_v2?roomid={}&__plat=pc_web&_={}".format(roomid, int(time.time())) content = get_html(json_request_url) - errno = json.loads(content)['errno'] - errmsg = json.loads(content)['errmsg'] + api_json = json.loads(content) + + errno = api_json["errno"] + errmsg = api_json["errmsg"] if errno: raise ValueError("Errno : {}, Errmsg : {}".format(errno, errmsg)) - - data = json.loads(content)['data'] - title = data.get('roominfo')['name'] - room_key = data.get('videoinfo')['room_key'] - plflag = data.get('videoinfo')['plflag'].split('_') - status = data.get('videoinfo')['status'] + data = api_json["data"] + title = data["roominfo"]["name"] + room_key = data["videoinfo"]["room_key"] + plflag = data["videoinfo"]["plflag"].split("_") + status = data["videoinfo"]["status"] if status is not "2": raise ValueError("The live stream is not online! (status:%s)" % status) - real_url = 'http://pl{}.live.panda.tv/live_panda/{}.flv'.format(plflag[1],room_key) + data2 = json.loads(data["videoinfo"]["plflag_list"]) + rid = data2["auth"]["rid"] + sign = data2["auth"]["sign"] + ts = data2["auth"]["time"] + real_url = "http://pl{}.live.panda.tv/live_panda/{}.flv?sign={}&ts={}&rid={}".format(plflag[1], room_key, sign, ts, rid) + print_info(site_info, title, 'flv', float('inf')) if not info_only: download_urls([real_url], title, 'flv', None, output_dir, merge = merge) From 374e1032db23cebb5f8f22a6de5eff4950bd7bf2 Mon Sep 17 00:00:00 2001 From: JayXon Date: Mon, 16 Jan 2017 09:56:24 -0800 Subject: [PATCH 05/22] [common] also retry if timeout in url_save, post_content, etc. --- src/you_get/common.py | 51 ++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index f320f6ab..bea6e62c 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -298,6 +298,13 @@ def get_location(url): # not to do that return response.geturl() +def urlopen_with_retry(*args, **kwargs): + for i in range(10): + try: + return request.urlopen(*args, **kwargs) + except socket.timeout: + logging.debug('request attempt %s timeout' % str(i + 1)) + def get_content(url, headers={}, decoded=True): """Gets the content of a URL via sending a HTTP GET request. @@ -317,13 +324,7 @@ def get_content(url, headers={}, decoded=True): cookies.add_cookie_header(req) req.headers.update(req.unredirected_hdrs) - for i in range(10): - try: - response = request.urlopen(req) - break - except socket.timeout: - logging.debug('request attempt %s timeout' % str(i + 1)) - + response = urlopen_with_retry(req) data = response.read() # Handle HTTP compression for gzip and deflate (zlib) @@ -362,7 +363,7 @@ def post_content(url, headers={}, post_data={}, decoded=True): cookies.add_cookie_header(req) req.headers.update(req.unredirected_hdrs) post_data_enc = bytes(parse.urlencode(post_data), 'utf-8') - response = request.urlopen(req, data = post_data_enc) + response = urlopen_with_retry(req, data=post_data_enc) data = response.read() # Handle HTTP compression for gzip and deflate (zlib) @@ -384,11 +385,11 @@ def post_content(url, headers={}, post_data={}, decoded=True): def url_size(url, faker = False, headers = {}): if faker: - response = request.urlopen(request.Request(url, headers = fake_headers), None) + response = urlopen_with_retry(request.Request(url, headers=fake_headers)) elif headers: - response = request.urlopen(request.Request(url, headers = headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) else: - response = request.urlopen(url) + response = urlopen_with_retry(url) size = response.headers['content-length'] return int(size) if size!=None else float('inf') @@ -398,20 +399,20 @@ def urls_size(urls, faker = False, headers = {}): def get_head(url, headers = {}, get_method = 'HEAD'): if headers: - req = request.Request(url, headers = headers) + req = request.Request(url, headers=headers) else: req = request.Request(url) - req.get_method = lambda : get_method - res = request.urlopen(req) + req.get_method = lambda: get_method + res = urlopen_with_retry(req) return dict(res.headers) def url_info(url, faker = False, headers = {}): if faker: - response = request.urlopen(request.Request(url, headers = fake_headers), None) + response = urlopen_with_retry(request.Request(url, headers=fake_headers)) elif headers: - response = request.urlopen(request.Request(url, headers = headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) else: - response = request.urlopen(request.Request(url)) + response = urlopen_with_retry(request.Request(url)) headers = response.headers @@ -460,11 +461,11 @@ def url_locations(urls, faker = False, headers = {}): locations = [] for url in urls: if faker: - response = request.urlopen(request.Request(url, headers = fake_headers), None) + response = urlopen_with_retry(request.Request(url, headers=fake_headers)) elif headers: - response = request.urlopen(request.Request(url, headers = headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) else: - response = request.urlopen(request.Request(url)) + response = urlopen_with_retry(request.Request(url)) locations.append(response.url) return locations @@ -514,10 +515,10 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, h if refer: headers['Referer'] = refer - response = request.urlopen(request.Request(url, headers = headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) try: range_start = int(response.headers['content-range'][6:].split('/')[0].split('-')[0]) - end_length = end = int(response.headers['content-range'][6:].split('/')[1]) + end_length = int(response.headers['content-range'][6:].split('/')[1]) range_length = end_length - range_start except: content_length = response.headers['content-length'] @@ -537,7 +538,7 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, h break else: # Unexpected termination. Retry request headers['Range'] = 'bytes=' + str(received) + '-' - response = request.urlopen(request.Request(url, headers = headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) output.write(buffer) received += len(buffer) if bar: @@ -597,7 +598,7 @@ def url_save_chunked(url, filepath, bar, dyn_callback=None, chunk_size=0, ignore if refer: headers['Referer'] = refer - response = request.urlopen(request.Request(url, headers=headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) with open(temp_filepath, open_mode) as output: this_chunk = received @@ -610,7 +611,7 @@ def url_save_chunked(url, filepath, bar, dyn_callback=None, chunk_size=0, ignore if chunk_size and (received - this_chunk) >= chunk_size: url = dyn_callback(received) this_chunk = received - response = request.urlopen(request.Request(url, headers=headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) if bar: bar.update_received(len(buffer)) From 0f131e38d4b7fed6cb232aa346df01858547f921 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 16 Jan 2017 23:29:21 +0100 Subject: [PATCH 06/22] [facebook] fix #1615 --- src/you_get/extractors/facebook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/facebook.py b/src/you_get/extractors/facebook.py index 2a96fcb0..9eb9fae9 100644 --- a/src/you_get/extractors/facebook.py +++ b/src/you_get/extractors/facebook.py @@ -11,11 +11,11 @@ def facebook_download(url, output_dir='.', merge=True, info_only=False, **kwargs title = r1(r'(.+)', html) sd_urls = list(set([ unicodize(str.replace(i, '\\/', '/')) - for i in re.findall(r'"sd_src_no_ratelimit":"([^"]*)"', html) + for i in re.findall(r'sd_src_no_ratelimit:"([^"]*)"', html) ])) hd_urls = list(set([ unicodize(str.replace(i, '\\/', '/')) - for i in re.findall(r'"hd_src_no_ratelimit":"([^"]*)"', html) + for i in re.findall(r'hd_src_no_ratelimit:"([^"]*)"', html) ])) urls = hd_urls if hd_urls else sd_urls From 015871dfa96d480ceed982ecdf45f911ee5b34a8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 16 Jan 2017 23:49:27 +0100 Subject: [PATCH 07/22] [acfun] correct active p title, fix #1617 --- src/you_get/extractors/acfun.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py index 87e005fb..6bb0dca4 100644 --- a/src/you_get/extractors/acfun.py +++ b/src/you_get/extractors/acfun.py @@ -77,6 +77,8 @@ def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs): title = unescape_html(title) title = escape_file_path(title) assert title + if match1(url, r'_(\d+)$'): # current P + title = title + " " + r1(r'active">([^<]*)', html) vid = r1('data-vid="(\d+)"', html) up = r1('data-name="([^"]+)"', html) From fc1646d74ea14012a03dc17aad395b5c5f1554b3 Mon Sep 17 00:00:00 2001 From: haoflynet Date: Sun, 22 Jan 2017 23:35:23 +0800 Subject: [PATCH 08/22] fix youku.py bug --- src/you_get/extractors/youku.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index d673e58c..65fcbc27 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -143,6 +143,9 @@ class Youku(VideoExtractor): }) else: proxy_handler = request.ProxyHandler({}) + if not request._opener: + opener = request.build_opener(proxy_handler) + request.install_opener(opener) for handler in (ssl_context, cookie_handler, proxy_handler): request._opener.add_handler(handler) request._opener.addheaders = [('Cookie','__ysuid={}'.format(time.time()))] From 10624ca5b34e542bb9004765889499dc0341d698 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 25 Jan 2017 21:21:09 +0100 Subject: [PATCH 09/22] [google] add UA in get_html --- src/you_get/extractors/google.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/google.py b/src/you_get/extractors/google.py index 18483920..febac780 100644 --- a/src/you_get/extractors/google.py +++ b/src/you_get/extractors/google.py @@ -51,7 +51,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw # attempt to extract images first # TBD: posts with > 4 images # TBD: album links - html = get_html(parse.unquote(url)) + html = get_html(parse.unquote(url), faker=True) real_urls = [] for src in re.findall(r'src="([^"]+)"[^>]*itemprop="image"', html): t = src.split('/') @@ -66,7 +66,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw try: url = "https://plus.google.com/" + r1(r'"(photos/\d+/albums/\d+/\d+)', html) - html = get_html(url) + html = get_html(url, faker=True) temp = re.findall(r'\[(\d+),\d+,\d+,"([^"]+)"\]', html) temp = sorted(temp, key = lambda x : fmt_level[x[0]]) urls = [unicodize(i[1]) for i in temp if i[0] == temp[0][0]] @@ -77,7 +77,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw post_author = r1(r'/\+([^/]+)/posts', post_url) if post_author: post_url = "https://plus.google.com/+%s/posts/%s" % (parse.quote(post_author), r1(r'posts/(.+)', post_url)) - post_html = get_html(post_url) + post_html = get_html(post_url, faker=True) title = r1(r']*>([^<\n]+)', post_html) if title is None: @@ -98,7 +98,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw elif service in ['docs', 'drive'] : # Google Docs - html = get_html(url) + html = get_html(url, faker=True) title = r1(r'"title":"([^"]*)"', html) or r1(r' 1: From f299d30161f2017318211099979845192a891025 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 25 Jan 2017 21:21:49 +0100 Subject: [PATCH 10/22] [common] update fake_headers --- src/you_get/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index bea6e62c..9ee38821 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -131,7 +131,7 @@ fake_headers = { 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0' + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0' } if sys.stdout.isatty(): From 4108e2112deac199fe948fdcf3793148fea3a141 Mon Sep 17 00:00:00 2001 From: Justsoos Date: Thu, 26 Jan 2017 16:31:56 +0800 Subject: [PATCH 11/22] fix:[zhanqi.tv]recode all --- src/you_get/extractors/zhanqi.py | 99 +++++++++++++------------------- 1 file changed, 39 insertions(+), 60 deletions(-) diff --git a/src/you_get/extractors/zhanqi.py b/src/you_get/extractors/zhanqi.py index 7d6b75b6..25e7e132 100644 --- a/src/you_get/extractors/zhanqi.py +++ b/src/you_get/extractors/zhanqi.py @@ -3,73 +3,52 @@ __all__ = ['zhanqi_download'] from ..common import * -import re -import base64 import json -import time -import hashlib -def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): - html = get_content(url) - video_type_patt = r'VideoType":"([^"]+)"' - video_type = match1(html, video_type_patt) +def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):#the programmers of zhanqi are noobs + host_name = url.split('/')[2] + first_folder_path = url.split('/')[3] + + if first_folder_path != 'videos': #url = "https://www.zhanqi.tv/huashan" + if first_folder_path == 'topic': #https://www.zhanqi.tv/topic/lyingman + first_folder_path = url.split('/')[4] + api_url = "https://www.zhanqi.tv/api/static/v2.1/room/domain/" + first_folder_path + ".json" + api_json = json.loads(get_html(api_url)) + data = api_json['data'] + status = data['status'] + if status != '4': + raise ValueError ("The live stream is not online!") + + nickname = data['nickname'] + title = nickname + ": " + data['title'] + + roomid = data['id'] + videoId = data['videoId'] + jump_url = "http://wshdl.load.cdn.zhanqi.tv/zqlive/" + videoId + ".flv?get_url=1" + jump_url = jump_url.strip('\r\n') + + real_url = get_html(jump_url) + real_url = real_url.strip('\r\n') - #rtmp_base_patt = r'VideoUrl":"([^"]+)"' - rtmp_id_patt = r'videoId":"([^"]+)"' - vod_m3u8_id_patt = r'VideoID":"([^"]+)"' - title_patt = r'

([^<]+)

' - title_patt_backup = r'([^<]{1,9999})' - title = match1(html, title_patt) or match1(html, title_patt_backup) - title = unescape_html(title) - rtmp_base = "http://wshdl.load.cdn.zhanqi.tv/zqlive" - vod_base = "http://dlvod.cdn.zhanqi.tv" - rtmp_real_base = "rtmp://dlrtmp.cdn.zhanqi.tv/zqlive/" - room_info = "http://www.zhanqi.tv/api/static/live.roomid/" - KEY_MASK = "#{&..?!(" - ak2_pattern = r'ak2":"\d-([^|]+)' - - if video_type == "LIVE": - rtmp_id = match1(html, rtmp_id_patt).replace('\\/','/') - #request_url = rtmp_base+'/'+rtmp_id+'.flv?get_url=1' - #real_url = get_html(request_url) - html2 = get_content(room_info + rtmp_id.split("_")[0] + ".json") - json_data = json.loads(html2) - cdns = json_data["data"]["flashvars"]["cdns"] - cdns = base64.b64decode(cdns).decode("utf-8") - cdn = match1(cdns, ak2_pattern) - cdn = base64.b64decode(cdn).decode("utf-8") - key = '' - i = 0 - while(i < len(cdn)): - key = key + chr(ord(cdn[i]) ^ ord(KEY_MASK[i % 8])) - i = i + 1 - time_hex = hex(int(time.time()))[2:] - key = hashlib.md5(bytes(key + "/zqlive/" + rtmp_id + time_hex, "utf-8")).hexdigest() - real_url = rtmp_real_base + '/' + rtmp_id + "?k=" + key + "&t=" + time_hex print_info(site_info, title, 'flv', float('inf')) if not info_only: - download_rtmp_url(real_url, title, 'flv', {}, output_dir, merge = merge) - #download_urls([real_url], title, 'flv', None, output_dir, merge = merge) - elif video_type == "VOD": - vod_m3u8_request = vod_base + match1(html, vod_m3u8_id_patt).replace('\\/','/') - vod_m3u8 = get_html(vod_m3u8_request) - part_url = re.findall(r'(/[^#]+)\.ts',vod_m3u8) - real_url = [] - for i in part_url: - i = vod_base + i + ".ts" - real_url.append(i) - type_ = '' - size = 0 - for url in real_url: - _, type_, temp = url_info(url) - size += temp or 0 + download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) - print_info(site_info, title, type_ or 'ts', size) + else: #url = 'https://www.zhanqi.tv/videos/Lyingman/2017/01/182308.html' + video_id = url.split('/')[-1].split('.')[0] + api_url = "https://www.zhanqi.tv/api/static/v2.1/video/" + video_id + ".json" + api_json = json.loads(get_html(api_url)) + data = api_json['data'] + + title = data['title'] + + video_url_id = data['flashvars']['VideoID'] + real_url = "http://dlvod.cdn.zhanqi.tv/" + video_url_id + + print_info(site_info, title, 'flv', float('inf')) if not info_only: - download_urls(real_url, title, type_ or 'ts', size, output_dir, merge = merge) - else: - NotImplementedError('Unknown_video_type') + download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) site_info = "zhanqi.tv" download = zhanqi_download -download_playlist = playlist_not_supported('zhanqi') +download_playlist = playlist_not_supported('zhanqi') \ No newline at end of file From 15ae8feb5b5e4467e5eed54ff18b32021efaa813 Mon Sep 17 00:00:00 2001 From: Justsoos Date: Sat, 28 Jan 2017 03:08:54 +0800 Subject: [PATCH 12/22] little fix --- src/you_get/extractors/zhanqi.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/you_get/extractors/zhanqi.py b/src/you_get/extractors/zhanqi.py index 25e7e132..f2c673ca 100644 --- a/src/you_get/extractors/zhanqi.py +++ b/src/you_get/extractors/zhanqi.py @@ -5,13 +5,13 @@ __all__ = ['zhanqi_download'] from ..common import * import json -def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):#the programmers of zhanqi are noobs +def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): host_name = url.split('/')[2] - first_folder_path = url.split('/')[3] + first_folder_path = url.split('/')[3].split('?')[0] - if first_folder_path != 'videos': #url = "https://www.zhanqi.tv/huashan" + if first_folder_path != 'videos': #url = "https://www.zhanqi.tv/huashan?param_s=1_0.2.0" if first_folder_path == 'topic': #https://www.zhanqi.tv/topic/lyingman - first_folder_path = url.split('/')[4] + first_folder_path = url.split('/')[4].split('?')[0] api_url = "https://www.zhanqi.tv/api/static/v2.1/room/domain/" + first_folder_path + ".json" api_json = json.loads(get_html(api_url)) data = api_json['data'] @@ -29,13 +29,15 @@ def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kw real_url = get_html(jump_url) real_url = real_url.strip('\r\n') + site_info = "www.zhanqi.tv" print_info(site_info, title, 'flv', float('inf')) if not info_only: download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) else: #url = 'https://www.zhanqi.tv/videos/Lyingman/2017/01/182308.html' - video_id = url.split('/')[-1].split('.')[0] + video_id = url.split('/')[-1].split('?')[0].split('.')[0] + assert video_id api_url = "https://www.zhanqi.tv/api/static/v2.1/video/" + video_id + ".json" api_json = json.loads(get_html(api_url)) data = api_json['data'] @@ -44,11 +46,11 @@ def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kw video_url_id = data['flashvars']['VideoID'] real_url = "http://dlvod.cdn.zhanqi.tv/" + video_url_id + site_info = "www.zhanqi.tv/videos" print_info(site_info, title, 'flv', float('inf')) if not info_only: download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) -site_info = "zhanqi.tv" download = zhanqi_download download_playlist = playlist_not_supported('zhanqi') \ No newline at end of file From 753879b49736e314b08c2122ddeef550a06646f8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 28 Jan 2017 03:20:17 +0100 Subject: [PATCH 13/22] [netease] fix #1642 --- src/you_get/extractors/netease.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/netease.py b/src/you_get/extractors/netease.py index d5f3b1fa..17ae70a9 100644 --- a/src/you_get/extractors/netease.py +++ b/src/you_get/extractors/netease.py @@ -22,9 +22,9 @@ def netease_hymn(): """ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - rid = match1(url, r'id=(.*)') + rid = match1(url, r'\Wid=(.*)') if rid is None: - rid = match1(url, r'/(\d+)/?$') + rid = match1(url, r'/(\d+)/?') if "album" in url: j = loads(get_content("http://music.163.com/api/album/%s?id=%s&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) From 5139b40b44265128088724a2619f3a3258728517 Mon Sep 17 00:00:00 2001 From: l34p Date: Wed, 1 Feb 2017 21:07:59 +0900 Subject: [PATCH 14/22] [youtube] fix broken link of html5player --- src/you_get/extractors/youtube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index c403cb74..b0097f13 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -165,7 +165,7 @@ class YouTube(VideoExtractor): video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) try: ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) - self.html5player = 'https:' + ytplayer_config['assets']['js'] + self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] # Workaround: get_video_info returns bad s. Why? stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') except: @@ -177,7 +177,7 @@ class YouTube(VideoExtractor): ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) self.title = ytplayer_config['args']['title'] - self.html5player = 'https:' + ytplayer_config['assets']['js'] + self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') elif video_info['status'] == ['fail']: @@ -193,7 +193,7 @@ class YouTube(VideoExtractor): # 150 Restricted from playback on certain sites # Parse video page instead self.title = ytplayer_config['args']['title'] - self.html5player = 'https:' + ytplayer_config['assets']['js'] + self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') else: log.wtf('[Error] The uploader has not made this video available in your country.') From 2f4dc0f9a0000ed7ab6ecbfc7d903eed3c71a49d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 17:33:57 +0100 Subject: [PATCH 15/22] [google] quick fix for Google+ videos --- src/you_get/extractors/google.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/google.py b/src/you_get/extractors/google.py index febac780..1f2c354c 100644 --- a/src/you_get/extractors/google.py +++ b/src/you_get/extractors/google.py @@ -65,7 +65,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw title = post_date + "_" + post_id try: - url = "https://plus.google.com/" + r1(r'"(photos/\d+/albums/\d+/\d+)', html) + url = "https://plus.google.com/" + r1(r'(photos/\d+/albums/\d+/\d+)\?authkey', html) html = get_html(url, faker=True) temp = re.findall(r'\[(\d+),\d+,\d+,"([^"]+)"\]', html) temp = sorted(temp, key = lambda x : fmt_level[x[0]]) From 8afb998d59be335b4746f1792d317e5f5386a5f1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 23:39:46 +0100 Subject: [PATCH 16/22] Remove dead sites (2017-02-01) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * JPopsuki TV http://www.jpopsuki.tv/ * 天天动听 http://www.dongting.com/ * THVideo http://thvideo.tv/ * 阡陌视频 http://qianmo.com/ --- README.md | 4 -- src/you_get/common.py | 4 -- src/you_get/extractors/__init__.py | 3 -- src/you_get/extractors/dongting.py | 55 -------------------- src/you_get/extractors/jpopsuki.py | 23 --------- src/you_get/extractors/qianmo.py | 40 -------------- src/you_get/extractors/thvideo.py | 83 ------------------------------ 7 files changed, 212 deletions(-) delete mode 100644 src/you_get/extractors/dongting.py delete mode 100644 src/you_get/extractors/jpopsuki.py delete mode 100644 src/you_get/extractors/qianmo.py delete mode 100644 src/you_get/extractors/thvideo.py diff --git a/README.md b/README.md index 98c403c3..57f49a68 100644 --- a/README.md +++ b/README.md @@ -347,7 +347,6 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | Metacafe | |✓| | | | Magisto | |✓| | | | Khan Academy | |✓| | | -| JPopsuki TV | |✓| | | | Internet Archive | |✓| | | | **Instagram** | |✓|✓| | | InfoQ | |✓| | | @@ -392,11 +391,8 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 齐鲁网 | |✓| | | | QQ
腾讯视频 | |✓| | | | 企鹅直播 | |✓| | | -| 阡陌视频 | |✓| | | -| THVideo | |✓| | | | Sina
新浪视频
微博秒拍视频 |
|✓| | | | Sohu
搜狐视频 | |✓| | | -| 天天动听 | | | |✓| | **Tudou
土豆** | |✓| | | | 虾米 | | | |✓| | 阳光卫视 | |✓| | | diff --git a/src/you_get/common.py b/src/you_get/common.py index 9ee38821..a4aea070 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -15,7 +15,6 @@ SITES = { 'cbs' : 'cbs', 'dailymotion' : 'dailymotion', 'dilidili' : 'dilidili', - 'dongting' : 'dongting', 'douban' : 'douban', 'douyu' : 'douyutv', 'ehow' : 'ehow', @@ -40,7 +39,6 @@ SITES = { 'iqiyi' : 'iqiyi', 'isuntv' : 'suntv', 'joy' : 'joy', - 'jpopsuki' : 'jpopsuki', 'kankanews' : 'bilibili', 'khanacademy' : 'khan', 'ku6' : 'ku6', @@ -63,7 +61,6 @@ SITES = { 'pinterest' : 'pinterest', 'pixnet' : 'pixnet', 'pptv' : 'pptv', - 'qianmo' : 'qianmo', 'qq' : 'qq', 'quanmin' : 'quanmin', 'showroom-live' : 'showroom', @@ -73,7 +70,6 @@ SITES = { 'soundcloud' : 'soundcloud', 'ted' : 'ted', 'theplatform' : 'theplatform', - 'thvideo' : 'thvideo', 'tucao' : 'tucao', 'tudou' : 'tudou', 'tumblr' : 'tumblr', diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 61b6a0d1..a027c396 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -33,7 +33,6 @@ from .interest import * from .iqilu import * from .iqiyi import * from .joy import * -from .jpopsuki import * from .ku6 import * from .kugou import * from .kuwo import * @@ -55,7 +54,6 @@ from .panda import * from .pinterest import * from .pixnet import * from .pptv import * -from .qianmo import * from .qie import * from .qq import * from .showroom import * @@ -64,7 +62,6 @@ from .sohu import * from .soundcloud import * from .suntv import * from .theplatform import * -from .thvideo import * from .tucao import * from .tudou import * from .tumblr import * diff --git a/src/you_get/extractors/dongting.py b/src/you_get/extractors/dongting.py deleted file mode 100644 index 56c1d394..00000000 --- a/src/you_get/extractors/dongting.py +++ /dev/null @@ -1,55 +0,0 @@ -# -*- coding: utf-8 -*- - -__all__ = ['dongting_download'] - -from ..common import * - -_unit_prefixes = 'bkmg' - -def parse_size(size): - m = re.match(r'([\d.]+)(.(?:i?B)?)', size, re.I) - if m: - return int(float(m.group(1)) * 1024 ** - _unit_prefixes.index(m.group(2).lower())) - else: - return 0 - -def dongting_download_lyric(lrc_url, file_name, output_dir): - j = get_html(lrc_url) - info = json.loads(j) - lrc = j['data']['lrc'] - filename = get_filename(file_name) - with open(output_dir + "/" + filename + '.lrc', 'w', encoding='utf-8') as x: - x.write(lrc) - -def dongting_download_song(sid, output_dir = '.', merge = True, info_only = False): - j = get_html('http://ting.hotchanson.com/detail.do?neid=%s&size=0' % sid) - info = json.loads(j) - - song_title = info['data']['songName'] - album_name = info['data']['albumName'] - artist = info['data']['singerName'] - ext = 'mp3' - size = parse_size(info['data']['itemList'][-1]['size']) - url = info['data']['itemList'][-1]['downUrl'] - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%s - %s - %s" % (song_title, album_name, artist) - download_urls([url], file_name, ext, size, output_dir, merge = merge) - lrc_url = ('http://lp.music.ttpod.com/lrc/down?' - 'lrcid=&artist=%s&title=%s') % ( - parse.quote(artist), parse.quote(song_title)) - try: - dongting_download_lyric(lrc_url, file_name, output_dir) - except: - pass - -def dongting_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False, **kwargs): - if re.match('http://www.dongting.com/\?song_id=\d+', url): - id = r1(r'http://www.dongting.com/\?song_id=(\d+)', url) - dongting_download_song(id, output_dir, merge, info_only) - -site_info = "Dongting.com" -download = dongting_download -download_playlist = playlist_not_supported("dongting") diff --git a/src/you_get/extractors/jpopsuki.py b/src/you_get/extractors/jpopsuki.py deleted file mode 100644 index eeac4f63..00000000 --- a/src/you_get/extractors/jpopsuki.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python - -__all__ = ['jpopsuki_download'] - -from ..common import * - -def jpopsuki_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - html = get_html(url, faker=True) - - title = r1(r'list - From Biligrab.""" - interface_url = 'http://thvideo.tv/api/playurl.php?cid={cid}-{p}'.format(cid = cid, p = p) - data = get_content(interface_url) - rawurl = [] - dom = parseString(data) - - for node in dom.getElementsByTagName('durl'): - url = node.getElementsByTagName('url')[0] - rawurl.append(url.childNodes[0].data) - return rawurl - -#---------------------------------------------------------------------- -def th_video_get_title(url, p): - """""" - if re.match(r'http://thvideo.tv/v/\w+', url): - html = get_content(url) - title = match1(html, r'cid=(.+)').split('**') - - if int(p) > 0: #not the 1st P or multi part - title = title + ' - ' + [i.split('=')[-1:][0].split('|')[1] for i in video_list][p] - - return title - -#---------------------------------------------------------------------- -def thvideo_download(url, output_dir = '.', merge = False, info_only = False, **kwargs): - if re.match(r'http://thvideo.tv/v/\w+', url): - if 'p' in kwargs and kwargs['p']: - p = kwargs['p'] - else: - p = int(match1(url, r'http://thvideo.tv/v/th\d+#(\d+)')) - p -= 1 - - if not p or p < 0: - p = 0 - - if 'title' in kwargs and kwargs['title']: - title = kwargs['title'] - else: - title = th_video_get_title(url, p) - - cid = match1(url, r'http://thvideo.tv/v/th(\d+)') - - type_ = '' - size = 0 - urls = thvideo_cid_to_url(cid, p) - - for url in urls: - _, type_, temp = url_info(url) - size += temp - - print_info(site_info, title, type_, size) - if not info_only: - download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge) - -#---------------------------------------------------------------------- -def thvideo_download_playlist(url, output_dir = '.', merge = False, info_only = False, **kwargs): - """""" - if re.match(r'http://thvideo.tv/v/\w+', url): - html = get_content(url) - video_list = match1(html, r'
  • cid=(.+)
  • ').split('**') - - title_base = th_video_get_title(url, 0) - for p, v in video_list: - part_title = [i.split('=')[-1:][0].split('|')[1] for i in video_list][p] - title = title_base + part_title - thvideo_download(url, output_dir, merge, - info_only, p = p, title = title) - -site_info = "THVideo" -download = thvideo_download -download_playlist = thvideo_download_playlist From 847e531b0d287d970bcbbdec13b8a2224151b0a8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 23:51:06 +0100 Subject: [PATCH 17/22] update .travis.yml (add python 3.6) and LICENSE (2017) --- .travis.yml | 1 + LICENSE.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9b73708d..2d780e81 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: - "3.3" - "3.4" - "3.5" + - "3.6" - "nightly" - "pypy3" script: make test diff --git a/LICENSE.txt b/LICENSE.txt index 54a06fe5..7b25d906 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,7 +1,7 @@ ============================================== This is a copy of the MIT license. ============================================== -Copyright (C) 2012, 2013, 2014, 2015, 2016 Mort Yao +Copyright (C) 2012-2017 Mort Yao Copyright (C) 2012 Boyu Guo Permission is hereby granted, free of charge, to any person obtaining a copy of From 62a535a4180a736608e56c440951d8b0e7b23ae8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 23:53:32 +0100 Subject: [PATCH 18/22] version 0.4.648 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 2e8e4f41..933c46ad 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.626' +__version__ = '0.4.648' From ed99b91d1893186437f52701be03048e50873b9a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 05:43:57 +0100 Subject: [PATCH 19/22] [xiami] fix #1650 --- src/you_get/extractors/xiami.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/you_get/extractors/xiami.py b/src/you_get/extractors/xiami.py index b056c08e..e321c42e 100644 --- a/src/you_get/extractors/xiami.py +++ b/src/you_get/extractors/xiami.py @@ -13,7 +13,7 @@ def location_dec(str): str = str[1:] rows = head cols = int(len(str)/rows) + 1 - + out = "" full_row = len(str) % head for c in range(cols): @@ -58,7 +58,7 @@ def xiami_download_song(sid, output_dir = '.', merge = True, info_only = False): type, ext, size = url_info(url, faker = True) if not ext: ext = 'mp3' - + print_info(site_info, song_title, ext, size) if not info_only: file_name = "%s - %s - %s" % (song_title, artist, album_name) @@ -95,7 +95,7 @@ def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = type, ext, size = url_info(url, faker = True) if not ext: ext = 'mp3' - + print_info(site_info, song_title, type, size) if not info_only: file_name = "%02d.%s - %s - %s" % (track_nr, song_title, artist, album_name) @@ -104,7 +104,7 @@ def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = xiami_download_lyric(lrc_url, file_name, output_dir) except: pass - + track_nr += 1 def xiami_download_album(aid, output_dir = '.', merge = True, info_only = False): @@ -140,22 +140,23 @@ def xiami_download_album(aid, output_dir = '.', merge = True, info_only = False) if not pic_exist: xiami_download_pic(pic_url, 'cover', output_dir) pic_exist = True - + track_nr += 1 def xiami_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False, **kwargs): if re.match(r'http://www.xiami.com/album/\d+', url): id = r1(r'http://www.xiami.com/album/(\d+)', url) xiami_download_album(id, output_dir, merge, info_only) - + if re.match(r'http://www.xiami.com/collect/\d+', url): id = r1(r'http://www.xiami.com/collect/(\d+)', url) xiami_download_showcollect(id, output_dir, merge, info_only) - + if re.match('http://www.xiami.com/song/\d+', url): - id = r1(r'http://www.xiami.com/song/(\d+)', url) + html = get_html(url, faker=True) + id = r1(r'rel="canonical" href="http://www.xiami.com/song/([^"]+)"', html) xiami_download_song(id, output_dir, merge, info_only) - + if re.match('http://www.xiami.com/song/detail/id/\d+', url): id = r1(r'http://www.xiami.com/song/detail/id/(\d+)', url) xiami_download_song(id, output_dir, merge, info_only) From 098b6a9dd8b4db5d3516ada1c3dc24fd8d645fba Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 05:50:00 +0100 Subject: [PATCH 20/22] [youtube] fix signature extraction --- src/you_get/extractors/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index b0097f13..ad1706be 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -52,7 +52,7 @@ class YouTube(VideoExtractor): return code js = js.replace('\n', ' ') - f1 = match1(js, r'\w+\.sig\|\|([$\w]+)\(\w+\.\w+\)') + f1 = match1(js, r'"signature",([\w]+)\(\w+\.\w+\)') f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \ match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) From c5dbb9766116e6362bd1c3e2a680dedb16979d6f Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 06:00:30 +0100 Subject: [PATCH 21/22] tests: remove test_freesound --- tests/test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 020455b0..ba15e447 100644 --- a/tests/test.py +++ b/tests/test.py @@ -8,9 +8,6 @@ from you_get.common import * class YouGetTests(unittest.TestCase): - def test_freesound(self): - freesound.download("http://www.freesound.org/people/Corsica_S/sounds/184419/", info_only=True) - def test_imgur(self): imgur.download("http://imgur.com/WVLk5nD", info_only=True) imgur.download("http://imgur.com/gallery/WVLk5nD", info_only=True) From 858435d5035b72832f5f2a63adbd176916a9a27a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 06:03:23 +0100 Subject: [PATCH 22/22] version 0.4.652 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 933c46ad..63d908c6 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.648' +__version__ = '0.4.652'