From fc1646d74ea14012a03dc17aad395b5c5f1554b3 Mon Sep 17 00:00:00 2001 From: haoflynet Date: Sun, 22 Jan 2017 23:35:23 +0800 Subject: [PATCH 01/20] fix youku.py bug --- src/you_get/extractors/youku.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index d673e58c..65fcbc27 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -143,6 +143,9 @@ class Youku(VideoExtractor): }) else: proxy_handler = request.ProxyHandler({}) + if not request._opener: + opener = request.build_opener(proxy_handler) + request.install_opener(opener) for handler in (ssl_context, cookie_handler, proxy_handler): request._opener.add_handler(handler) request._opener.addheaders = [('Cookie','__ysuid={}'.format(time.time()))] From 10624ca5b34e542bb9004765889499dc0341d698 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 25 Jan 2017 21:21:09 +0100 Subject: [PATCH 02/20] [google] add UA in get_html --- src/you_get/extractors/google.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/google.py b/src/you_get/extractors/google.py index 18483920..febac780 100644 --- a/src/you_get/extractors/google.py +++ b/src/you_get/extractors/google.py @@ -51,7 +51,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw # attempt to extract images first # TBD: posts with > 4 images # TBD: album links - html = get_html(parse.unquote(url)) + html = get_html(parse.unquote(url), faker=True) real_urls = [] for src in re.findall(r'src="([^"]+)"[^>]*itemprop="image"', html): t = src.split('/') @@ -66,7 +66,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw try: url = "https://plus.google.com/" + r1(r'"(photos/\d+/albums/\d+/\d+)', html) - html = get_html(url) + html = get_html(url, faker=True) temp = re.findall(r'\[(\d+),\d+,\d+,"([^"]+)"\]', html) temp = sorted(temp, key = lambda x : fmt_level[x[0]]) urls = [unicodize(i[1]) for i in temp if i[0] == temp[0][0]] @@ -77,7 +77,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw post_author = r1(r'/\+([^/]+)/posts', post_url) if post_author: post_url = "https://plus.google.com/+%s/posts/%s" % (parse.quote(post_author), r1(r'posts/(.+)', post_url)) - post_html = get_html(post_url) + post_html = get_html(post_url, faker=True) title = r1(r']*>([^<\n]+)', post_html) if title is None: @@ -98,7 +98,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw elif service in ['docs', 'drive'] : # Google Docs - html = get_html(url) + html = get_html(url, faker=True) title = r1(r'"title":"([^"]*)"', html) or r1(r' 1: From f299d30161f2017318211099979845192a891025 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 25 Jan 2017 21:21:49 +0100 Subject: [PATCH 03/20] [common] update fake_headers --- src/you_get/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index bea6e62c..9ee38821 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -131,7 +131,7 @@ fake_headers = { 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0' + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0' } if sys.stdout.isatty(): From 4108e2112deac199fe948fdcf3793148fea3a141 Mon Sep 17 00:00:00 2001 From: 
Justsoos Date: Thu, 26 Jan 2017 16:31:56 +0800 Subject: [PATCH 04/20] fix:[zhanqi.tv]recode all --- src/you_get/extractors/zhanqi.py | 99 +++++++++++++------------------- 1 file changed, 39 insertions(+), 60 deletions(-) diff --git a/src/you_get/extractors/zhanqi.py b/src/you_get/extractors/zhanqi.py index 7d6b75b6..25e7e132 100644 --- a/src/you_get/extractors/zhanqi.py +++ b/src/you_get/extractors/zhanqi.py @@ -3,73 +3,52 @@ __all__ = ['zhanqi_download'] from ..common import * -import re -import base64 import json -import time -import hashlib -def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): - html = get_content(url) - video_type_patt = r'VideoType":"([^"]+)"' - video_type = match1(html, video_type_patt) +def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):#the programmers of zhanqi are noobs + host_name = url.split('/')[2] + first_folder_path = url.split('/')[3] + + if first_folder_path != 'videos': #url = "https://www.zhanqi.tv/huashan" + if first_folder_path == 'topic': #https://www.zhanqi.tv/topic/lyingman + first_folder_path = url.split('/')[4] + api_url = "https://www.zhanqi.tv/api/static/v2.1/room/domain/" + first_folder_path + ".json" + api_json = json.loads(get_html(api_url)) + data = api_json['data'] + status = data['status'] + if status != '4': + raise ValueError ("The live stream is not online!") + + nickname = data['nickname'] + title = nickname + ": " + data['title'] + + roomid = data['id'] + videoId = data['videoId'] + jump_url = "http://wshdl.load.cdn.zhanqi.tv/zqlive/" + videoId + ".flv?get_url=1" + jump_url = jump_url.strip('\r\n') + + real_url = get_html(jump_url) + real_url = real_url.strip('\r\n') - #rtmp_base_patt = r'VideoUrl":"([^"]+)"' - rtmp_id_patt = r'videoId":"([^"]+)"' - vod_m3u8_id_patt = r'VideoID":"([^"]+)"' - title_patt = r'
([^<]+)
' - title_patt_backup = r'([^<]{1,9999})' - title = match1(html, title_patt) or match1(html, title_patt_backup) - title = unescape_html(title) - rtmp_base = "http://wshdl.load.cdn.zhanqi.tv/zqlive" - vod_base = "http://dlvod.cdn.zhanqi.tv" - rtmp_real_base = "rtmp://dlrtmp.cdn.zhanqi.tv/zqlive/" - room_info = "http://www.zhanqi.tv/api/static/live.roomid/" - KEY_MASK = "#{&..?!(" - ak2_pattern = r'ak2":"\d-([^|]+)' - - if video_type == "LIVE": - rtmp_id = match1(html, rtmp_id_patt).replace('\\/','/') - #request_url = rtmp_base+'/'+rtmp_id+'.flv?get_url=1' - #real_url = get_html(request_url) - html2 = get_content(room_info + rtmp_id.split("_")[0] + ".json") - json_data = json.loads(html2) - cdns = json_data["data"]["flashvars"]["cdns"] - cdns = base64.b64decode(cdns).decode("utf-8") - cdn = match1(cdns, ak2_pattern) - cdn = base64.b64decode(cdn).decode("utf-8") - key = '' - i = 0 - while(i < len(cdn)): - key = key + chr(ord(cdn[i]) ^ ord(KEY_MASK[i % 8])) - i = i + 1 - time_hex = hex(int(time.time()))[2:] - key = hashlib.md5(bytes(key + "/zqlive/" + rtmp_id + time_hex, "utf-8")).hexdigest() - real_url = rtmp_real_base + '/' + rtmp_id + "?k=" + key + "&t=" + time_hex print_info(site_info, title, 'flv', float('inf')) if not info_only: - download_rtmp_url(real_url, title, 'flv', {}, output_dir, merge = merge) - #download_urls([real_url], title, 'flv', None, output_dir, merge = merge) - elif video_type == "VOD": - vod_m3u8_request = vod_base + match1(html, vod_m3u8_id_patt).replace('\\/','/') - vod_m3u8 = get_html(vod_m3u8_request) - part_url = re.findall(r'(/[^#]+)\.ts',vod_m3u8) - real_url = [] - for i in part_url: - i = vod_base + i + ".ts" - real_url.append(i) - type_ = '' - size = 0 - for url in real_url: - _, type_, temp = url_info(url) - size += temp or 0 + download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) - print_info(site_info, title, type_ or 'ts', size) + else: #url = 'https://www.zhanqi.tv/videos/Lyingman/2017/01/182308.html' + video_id = url.split('/')[-1].split('.')[0] + api_url = "https://www.zhanqi.tv/api/static/v2.1/video/" + video_id + ".json" + api_json = json.loads(get_html(api_url)) + data = api_json['data'] + + title = data['title'] + + video_url_id = data['flashvars']['VideoID'] + real_url = "http://dlvod.cdn.zhanqi.tv/" + video_url_id + + print_info(site_info, title, 'flv', float('inf')) if not info_only: - download_urls(real_url, title, type_ or 'ts', size, output_dir, merge = merge) - else: - NotImplementedError('Unknown_video_type') + download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) site_info = "zhanqi.tv" download = zhanqi_download -download_playlist = playlist_not_supported('zhanqi') +download_playlist = playlist_not_supported('zhanqi') \ No newline at end of file From 15ae8feb5b5e4467e5eed54ff18b32021efaa813 Mon Sep 17 00:00:00 2001 From: Justsoos Date: Sat, 28 Jan 2017 03:08:54 +0800 Subject: [PATCH 05/20] little fix --- src/you_get/extractors/zhanqi.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/you_get/extractors/zhanqi.py b/src/you_get/extractors/zhanqi.py index 25e7e132..f2c673ca 100644 --- a/src/you_get/extractors/zhanqi.py +++ b/src/you_get/extractors/zhanqi.py @@ -5,13 +5,13 @@ __all__ = ['zhanqi_download'] from ..common import * import json -def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):#the programmers of zhanqi are noobs +def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, 
**kwargs): host_name = url.split('/')[2] - first_folder_path = url.split('/')[3] + first_folder_path = url.split('/')[3].split('?')[0] - if first_folder_path != 'videos': #url = "https://www.zhanqi.tv/huashan" + if first_folder_path != 'videos': #url = "https://www.zhanqi.tv/huashan?param_s=1_0.2.0" if first_folder_path == 'topic': #https://www.zhanqi.tv/topic/lyingman - first_folder_path = url.split('/')[4] + first_folder_path = url.split('/')[4].split('?')[0] api_url = "https://www.zhanqi.tv/api/static/v2.1/room/domain/" + first_folder_path + ".json" api_json = json.loads(get_html(api_url)) data = api_json['data'] @@ -29,13 +29,15 @@ def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kw real_url = get_html(jump_url) real_url = real_url.strip('\r\n') + site_info = "www.zhanqi.tv" print_info(site_info, title, 'flv', float('inf')) if not info_only: download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) else: #url = 'https://www.zhanqi.tv/videos/Lyingman/2017/01/182308.html' - video_id = url.split('/')[-1].split('.')[0] + video_id = url.split('/')[-1].split('?')[0].split('.')[0] + assert video_id api_url = "https://www.zhanqi.tv/api/static/v2.1/video/" + video_id + ".json" api_json = json.loads(get_html(api_url)) data = api_json['data'] @@ -44,11 +46,11 @@ def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kw video_url_id = data['flashvars']['VideoID'] real_url = "http://dlvod.cdn.zhanqi.tv/" + video_url_id + site_info = "www.zhanqi.tv/videos" print_info(site_info, title, 'flv', float('inf')) if not info_only: download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) -site_info = "zhanqi.tv" download = zhanqi_download download_playlist = playlist_not_supported('zhanqi') \ No newline at end of file From 753879b49736e314b08c2122ddeef550a06646f8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 28 Jan 2017 03:20:17 +0100 Subject: [PATCH 06/20] [netease] fix #1642 --- src/you_get/extractors/netease.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/netease.py b/src/you_get/extractors/netease.py index d5f3b1fa..17ae70a9 100644 --- a/src/you_get/extractors/netease.py +++ b/src/you_get/extractors/netease.py @@ -22,9 +22,9 @@ def netease_hymn(): """ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - rid = match1(url, r'id=(.*)') + rid = match1(url, r'\Wid=(.*)') if rid is None: - rid = match1(url, r'/(\d+)/?$') + rid = match1(url, r'/(\d+)/?') if "album" in url: j = loads(get_content("http://music.163.com/api/album/%s?id=%s&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) From 5139b40b44265128088724a2619f3a3258728517 Mon Sep 17 00:00:00 2001 From: l34p Date: Wed, 1 Feb 2017 21:07:59 +0900 Subject: [PATCH 07/20] [youtube] fix broken link of html5player --- src/you_get/extractors/youtube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index c403cb74..b0097f13 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -165,7 +165,7 @@ class YouTube(VideoExtractor): video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) try: ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) - self.html5player = 'https:' + ytplayer_config['assets']['js'] + self.html5player = 'https://www.youtube.com' 
+ ytplayer_config['assets']['js'] # Workaround: get_video_info returns bad s. Why? stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') except: @@ -177,7 +177,7 @@ class YouTube(VideoExtractor): ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) self.title = ytplayer_config['args']['title'] - self.html5player = 'https:' + ytplayer_config['assets']['js'] + self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') elif video_info['status'] == ['fail']: @@ -193,7 +193,7 @@ class YouTube(VideoExtractor): # 150 Restricted from playback on certain sites # Parse video page instead self.title = ytplayer_config['args']['title'] - self.html5player = 'https:' + ytplayer_config['assets']['js'] + self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') else: log.wtf('[Error] The uploader has not made this video available in your country.') From 2f4dc0f9a0000ed7ab6ecbfc7d903eed3c71a49d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 17:33:57 +0100 Subject: [PATCH 08/20] [google] quick fix for Google+ videos --- src/you_get/extractors/google.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/google.py b/src/you_get/extractors/google.py index febac780..1f2c354c 100644 --- a/src/you_get/extractors/google.py +++ b/src/you_get/extractors/google.py @@ -65,7 +65,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw title = post_date + "_" + post_id try: - url = "https://plus.google.com/" + r1(r'"(photos/\d+/albums/\d+/\d+)', html) + url = "https://plus.google.com/" + r1(r'(photos/\d+/albums/\d+/\d+)\?authkey', html) html = get_html(url, faker=True) temp = re.findall(r'\[(\d+),\d+,\d+,"([^"]+)"\]', html) temp = sorted(temp, key = lambda x : fmt_level[x[0]]) From 8afb998d59be335b4746f1792d317e5f5386a5f1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 23:39:46 +0100 Subject: [PATCH 09/20] Remove dead sites (2017-02-01) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * JPopsuki TV http://www.jpopsuki.tv/ * 天天动听 http://www.dongting.com/ * THVideo http://thvideo.tv/ * 阡陌视频 http://qianmo.com/ --- README.md | 4 -- src/you_get/common.py | 4 -- src/you_get/extractors/__init__.py | 3 -- src/you_get/extractors/dongting.py | 55 -------------------- src/you_get/extractors/jpopsuki.py | 23 --------- src/you_get/extractors/qianmo.py | 40 -------------- src/you_get/extractors/thvideo.py | 83 ------------------------------ 7 files changed, 212 deletions(-) delete mode 100644 src/you_get/extractors/dongting.py delete mode 100644 src/you_get/extractors/jpopsuki.py delete mode 100644 src/you_get/extractors/qianmo.py delete mode 100644 src/you_get/extractors/thvideo.py diff --git a/README.md b/README.md index 98c403c3..57f49a68 100644 --- a/README.md +++ b/README.md @@ -347,7 +347,6 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | Metacafe | |✓| | | | Magisto | |✓| | | | Khan Academy | |✓| | | -| JPopsuki TV | |✓| | | | Internet Archive | |✓| | | | **Instagram** | |✓|✓| | | InfoQ | |✓| | | @@ -392,11 +391,8 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 齐鲁网 | |✓| | | | QQ
腾讯视频 | |✓| | | | 企鹅直播 | |✓| | | -| 阡陌视频 | |✓| | | -| THVideo | |✓| | | | Sina<br/>新浪视频<br/>微博秒拍视频 | |✓| | | | Sohu<br/>搜狐视频 | |✓| | | -| 天天动听 | | | |✓| | **Tudou
土豆** | |✓| | | | 虾米 | | | |✓| | 阳光卫视 | |✓| | | diff --git a/src/you_get/common.py b/src/you_get/common.py index 9ee38821..a4aea070 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -15,7 +15,6 @@ SITES = { 'cbs' : 'cbs', 'dailymotion' : 'dailymotion', 'dilidili' : 'dilidili', - 'dongting' : 'dongting', 'douban' : 'douban', 'douyu' : 'douyutv', 'ehow' : 'ehow', @@ -40,7 +39,6 @@ SITES = { 'iqiyi' : 'iqiyi', 'isuntv' : 'suntv', 'joy' : 'joy', - 'jpopsuki' : 'jpopsuki', 'kankanews' : 'bilibili', 'khanacademy' : 'khan', 'ku6' : 'ku6', @@ -63,7 +61,6 @@ SITES = { 'pinterest' : 'pinterest', 'pixnet' : 'pixnet', 'pptv' : 'pptv', - 'qianmo' : 'qianmo', 'qq' : 'qq', 'quanmin' : 'quanmin', 'showroom-live' : 'showroom', @@ -73,7 +70,6 @@ SITES = { 'soundcloud' : 'soundcloud', 'ted' : 'ted', 'theplatform' : 'theplatform', - 'thvideo' : 'thvideo', 'tucao' : 'tucao', 'tudou' : 'tudou', 'tumblr' : 'tumblr', diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 61b6a0d1..a027c396 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -33,7 +33,6 @@ from .interest import * from .iqilu import * from .iqiyi import * from .joy import * -from .jpopsuki import * from .ku6 import * from .kugou import * from .kuwo import * @@ -55,7 +54,6 @@ from .panda import * from .pinterest import * from .pixnet import * from .pptv import * -from .qianmo import * from .qie import * from .qq import * from .showroom import * @@ -64,7 +62,6 @@ from .sohu import * from .soundcloud import * from .suntv import * from .theplatform import * -from .thvideo import * from .tucao import * from .tudou import * from .tumblr import * diff --git a/src/you_get/extractors/dongting.py b/src/you_get/extractors/dongting.py deleted file mode 100644 index 56c1d394..00000000 --- a/src/you_get/extractors/dongting.py +++ /dev/null @@ -1,55 +0,0 @@ -# -*- coding: utf-8 -*- - -__all__ = ['dongting_download'] - -from ..common import * - -_unit_prefixes = 'bkmg' - -def parse_size(size): - m = re.match(r'([\d.]+)(.(?:i?B)?)', size, re.I) - if m: - return int(float(m.group(1)) * 1024 ** - _unit_prefixes.index(m.group(2).lower())) - else: - return 0 - -def dongting_download_lyric(lrc_url, file_name, output_dir): - j = get_html(lrc_url) - info = json.loads(j) - lrc = j['data']['lrc'] - filename = get_filename(file_name) - with open(output_dir + "/" + filename + '.lrc', 'w', encoding='utf-8') as x: - x.write(lrc) - -def dongting_download_song(sid, output_dir = '.', merge = True, info_only = False): - j = get_html('http://ting.hotchanson.com/detail.do?neid=%s&size=0' % sid) - info = json.loads(j) - - song_title = info['data']['songName'] - album_name = info['data']['albumName'] - artist = info['data']['singerName'] - ext = 'mp3' - size = parse_size(info['data']['itemList'][-1]['size']) - url = info['data']['itemList'][-1]['downUrl'] - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%s - %s - %s" % (song_title, album_name, artist) - download_urls([url], file_name, ext, size, output_dir, merge = merge) - lrc_url = ('http://lp.music.ttpod.com/lrc/down?' 
- 'lrcid=&artist=%s&title=%s') % ( - parse.quote(artist), parse.quote(song_title)) - try: - dongting_download_lyric(lrc_url, file_name, output_dir) - except: - pass - -def dongting_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False, **kwargs): - if re.match('http://www.dongting.com/\?song_id=\d+', url): - id = r1(r'http://www.dongting.com/\?song_id=(\d+)', url) - dongting_download_song(id, output_dir, merge, info_only) - -site_info = "Dongting.com" -download = dongting_download -download_playlist = playlist_not_supported("dongting") diff --git a/src/you_get/extractors/jpopsuki.py b/src/you_get/extractors/jpopsuki.py deleted file mode 100644 index eeac4f63..00000000 --- a/src/you_get/extractors/jpopsuki.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python - -__all__ = ['jpopsuki_download'] - -from ..common import * - -def jpopsuki_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - html = get_html(url, faker=True) - - title = r1(r'list - From Biligrab.""" - interface_url = 'http://thvideo.tv/api/playurl.php?cid={cid}-{p}'.format(cid = cid, p = p) - data = get_content(interface_url) - rawurl = [] - dom = parseString(data) - - for node in dom.getElementsByTagName('durl'): - url = node.getElementsByTagName('url')[0] - rawurl.append(url.childNodes[0].data) - return rawurl - -#---------------------------------------------------------------------- -def th_video_get_title(url, p): - """""" - if re.match(r'http://thvideo.tv/v/\w+', url): - html = get_content(url) - title = match1(html, r'cid=(.+)').split('**') - - if int(p) > 0: #not the 1st P or multi part - title = title + ' - ' + [i.split('=')[-1:][0].split('|')[1] for i in video_list][p] - - return title - -#---------------------------------------------------------------------- -def thvideo_download(url, output_dir = '.', merge = False, info_only = False, **kwargs): - if re.match(r'http://thvideo.tv/v/\w+', url): - if 'p' in kwargs and kwargs['p']: - p = kwargs['p'] - else: - p = int(match1(url, r'http://thvideo.tv/v/th\d+#(\d+)')) - p -= 1 - - if not p or p < 0: - p = 0 - - if 'title' in kwargs and kwargs['title']: - title = kwargs['title'] - else: - title = th_video_get_title(url, p) - - cid = match1(url, r'http://thvideo.tv/v/th(\d+)') - - type_ = '' - size = 0 - urls = thvideo_cid_to_url(cid, p) - - for url in urls: - _, type_, temp = url_info(url) - size += temp - - print_info(site_info, title, type_, size) - if not info_only: - download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge) - -#---------------------------------------------------------------------- -def thvideo_download_playlist(url, output_dir = '.', merge = False, info_only = False, **kwargs): - """""" - if re.match(r'http://thvideo.tv/v/\w+', url): - html = get_content(url) - video_list = match1(html, r'
cid=(.+)
  • ').split('**') - - title_base = th_video_get_title(url, 0) - for p, v in video_list: - part_title = [i.split('=')[-1:][0].split('|')[1] for i in video_list][p] - title = title_base + part_title - thvideo_download(url, output_dir, merge, - info_only, p = p, title = title) - -site_info = "THVideo" -download = thvideo_download -download_playlist = thvideo_download_playlist From 847e531b0d287d970bcbbdec13b8a2224151b0a8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 23:51:06 +0100 Subject: [PATCH 10/20] update .travis.yml (add python 3.6) and LICENSE (2017) --- .travis.yml | 1 + LICENSE.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9b73708d..2d780e81 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: - "3.3" - "3.4" - "3.5" + - "3.6" - "nightly" - "pypy3" script: make test diff --git a/LICENSE.txt b/LICENSE.txt index 54a06fe5..7b25d906 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,7 +1,7 @@ ============================================== This is a copy of the MIT license. ============================================== -Copyright (C) 2012, 2013, 2014, 2015, 2016 Mort Yao +Copyright (C) 2012-2017 Mort Yao Copyright (C) 2012 Boyu Guo Permission is hereby granted, free of charge, to any person obtaining a copy of From 62a535a4180a736608e56c440951d8b0e7b23ae8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 23:53:32 +0100 Subject: [PATCH 11/20] version 0.4.648 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 2e8e4f41..933c46ad 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.626' +__version__ = '0.4.648' From ed99b91d1893186437f52701be03048e50873b9a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 05:43:57 +0100 Subject: [PATCH 12/20] [xiami] fix #1650 --- src/you_get/extractors/xiami.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/you_get/extractors/xiami.py b/src/you_get/extractors/xiami.py index b056c08e..e321c42e 100644 --- a/src/you_get/extractors/xiami.py +++ b/src/you_get/extractors/xiami.py @@ -13,7 +13,7 @@ def location_dec(str): str = str[1:] rows = head cols = int(len(str)/rows) + 1 - + out = "" full_row = len(str) % head for c in range(cols): @@ -58,7 +58,7 @@ def xiami_download_song(sid, output_dir = '.', merge = True, info_only = False): type, ext, size = url_info(url, faker = True) if not ext: ext = 'mp3' - + print_info(site_info, song_title, ext, size) if not info_only: file_name = "%s - %s - %s" % (song_title, artist, album_name) @@ -95,7 +95,7 @@ def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = type, ext, size = url_info(url, faker = True) if not ext: ext = 'mp3' - + print_info(site_info, song_title, type, size) if not info_only: file_name = "%02d.%s - %s - %s" % (track_nr, song_title, artist, album_name) @@ -104,7 +104,7 @@ def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = xiami_download_lyric(lrc_url, file_name, output_dir) except: pass - + track_nr += 1 def xiami_download_album(aid, output_dir = '.', merge = True, info_only = False): @@ -140,22 +140,23 @@ def xiami_download_album(aid, output_dir = '.', merge = True, info_only = False) if not pic_exist: xiami_download_pic(pic_url, 'cover', output_dir) pic_exist = True - + track_nr += 1 def xiami_download(url, 
output_dir = '.', stream_type = None, merge = True, info_only = False, **kwargs): if re.match(r'http://www.xiami.com/album/\d+', url): id = r1(r'http://www.xiami.com/album/(\d+)', url) xiami_download_album(id, output_dir, merge, info_only) - + if re.match(r'http://www.xiami.com/collect/\d+', url): id = r1(r'http://www.xiami.com/collect/(\d+)', url) xiami_download_showcollect(id, output_dir, merge, info_only) - + if re.match('http://www.xiami.com/song/\d+', url): - id = r1(r'http://www.xiami.com/song/(\d+)', url) + html = get_html(url, faker=True) + id = r1(r'rel="canonical" href="http://www.xiami.com/song/([^"]+)"', html) xiami_download_song(id, output_dir, merge, info_only) - + if re.match('http://www.xiami.com/song/detail/id/\d+', url): id = r1(r'http://www.xiami.com/song/detail/id/(\d+)', url) xiami_download_song(id, output_dir, merge, info_only) From 098b6a9dd8b4db5d3516ada1c3dc24fd8d645fba Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 05:50:00 +0100 Subject: [PATCH 13/20] [youtube] fix signature extraction --- src/you_get/extractors/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index b0097f13..ad1706be 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -52,7 +52,7 @@ class YouTube(VideoExtractor): return code js = js.replace('\n', ' ') - f1 = match1(js, r'\w+\.sig\|\|([$\w]+)\(\w+\.\w+\)') + f1 = match1(js, r'"signature",([\w]+)\(\w+\.\w+\)') f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \ match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) From c5dbb9766116e6362bd1c3e2a680dedb16979d6f Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 06:00:30 +0100 Subject: [PATCH 14/20] tests: remove test_freesound --- tests/test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 020455b0..ba15e447 100644 --- a/tests/test.py +++ b/tests/test.py @@ -8,9 +8,6 @@ from you_get.common import * class YouGetTests(unittest.TestCase): - def test_freesound(self): - freesound.download("http://www.freesound.org/people/Corsica_S/sounds/184419/", info_only=True) - def test_imgur(self): imgur.download("http://imgur.com/WVLk5nD", info_only=True) imgur.download("http://imgur.com/gallery/WVLk5nD", info_only=True) From 858435d5035b72832f5f2a63adbd176916a9a27a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 06:03:23 +0100 Subject: [PATCH 15/20] version 0.4.652 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 933c46ad..63d908c6 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.648' +__version__ = '0.4.652' From 69714046b838499c5fce166153ccbf907a69e4a2 Mon Sep 17 00:00:00 2001 From: l34p Date: Mon, 6 Feb 2017 18:04:52 +0900 Subject: [PATCH 16/20] [youtube] improve livestream detection, fix #1673 --- src/you_get/extractors/youtube.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index ad1706be..18b46c9d 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -149,18 +149,6 @@ class YouTube(VideoExtractor): if 'use_cipher_signature' not in video_info or 
video_info['use_cipher_signature'] == ['False']: self.title = parse.unquote_plus(video_info['title'][0]) - # YouTube Live - if 'url_encoded_fmt_stream_map' not in video_info: - hlsvp = video_info['hlsvp'][0] - - if 'info_only' in kwargs and kwargs['info_only']: - return - else: - download_url_ffmpeg(hlsvp, self.title, 'mp4') - exit(0) - - stream_list = video_info['url_encoded_fmt_stream_map'][0].split(',') - # Parse video page (for DASH) video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) try: @@ -169,6 +157,7 @@ class YouTube(VideoExtractor): # Workaround: get_video_info returns bad s. Why? stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') except: + stream_list = video_info['url_encoded_fmt_stream_map'][0].split(',') self.html5player = None else: @@ -209,6 +198,16 @@ class YouTube(VideoExtractor): else: log.wtf('[Failed] Invalid status.') + # YouTube Live + if ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1': + hlsvp = ytplayer_config['args']['hlsvp'] + + if 'info_only' in kwargs and kwargs['info_only']: + return + else: + download_url_ffmpeg(hlsvp, self.title, 'mp4') + exit(0) + for stream in stream_list: metadata = parse.parse_qs(stream) stream_itag = metadata['itag'][0] From 1997ea45ce2e0afda20f2d9f2f77d9df947d800d Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Thu, 9 Feb 2017 11:26:32 -0500 Subject: [PATCH 17/20] [common] log URLs in more functions with network requests This is a follow-up to #999. This commit adds the `function: URL` debug message, which was previously only emitted by get_content and post_content, to all high level utility functions with network requests except url_size, url_save and url_save_chunked (in order not to ruin progress bars).
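For reference, the added calls all follow one shape; below is a minimal sketch of how the trace surfaces. The explicit basicConfig() line is an assumption for standalone use — inside you-get, the --debug flag is what enables DEBUG-level logging:

```python
import logging

# Standalone stand-in for what you-get's --debug flag enables:
logging.basicConfig(level=logging.DEBUG)

def get_location(url):
    # Log the function name and the URL before the request is made,
    # so URLs that later fail still appear in the debug trace.
    logging.debug('get_location: %s' % url)
    # ... issue the actual request here ...
```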
--- src/you_get/common.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/you_get/common.py b/src/you_get/common.py index a4aea070..2edbc426 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -255,6 +255,8 @@ def undeflate(data): # DEPRECATED in favor of get_content() def get_response(url, faker = False): + logging.debug('get_response: %s' % url) + # install cookies if cookies: opener = request.build_opener(request.HTTPCookieProcessor(cookies)) @@ -275,11 +277,15 @@ def get_response(url, faker = False): # DEPRECATED in favor of get_content() def get_html(url, encoding = None, faker = False): + logging.debug('get_html: %s' % url) + content = get_response(url, faker).data return str(content, 'utf-8', 'ignore') # DEPRECATED in favor of get_content() def get_decoded_html(url, faker = False): + logging.debug('get_decoded_html: %s' % url) + response = get_response(url, faker) data = response.data charset = r1(r'charset=([\w-]+)', response.headers['content-type']) @@ -289,6 +295,8 @@ def get_decoded_html(url, faker = False): return data def get_location(url): + logging.debug('get_location: %s' % url) + response = request.urlopen(url) # urllib will follow redirections and it's too much code to tell urllib # not to do that @@ -394,6 +402,8 @@ def urls_size(urls, faker = False, headers = {}): return sum([url_size(url, faker=faker, headers=headers) for url in urls]) def get_head(url, headers = {}, get_method = 'HEAD'): + logging.debug('get_head: %s' % url) + if headers: req = request.Request(url, headers=headers) else: @@ -403,6 +413,8 @@ def get_head(url, headers = {}, get_method = 'HEAD'): return dict(res.headers) def url_info(url, faker = False, headers = {}): + logging.debug('url_info: %s' % url) + if faker: response = urlopen_with_retry(request.Request(url, headers=fake_headers)) elif headers: @@ -456,6 +468,8 @@ def url_info(url, faker = False, headers = {}): def url_locations(urls, faker = False, headers = {}): locations = [] for url in urls: + logging.debug('url_locations: %s' % url) + if faker: response = urlopen_with_retry(request.Request(url, headers=fake_headers)) elif headers: From c1ed0d7e15d327327922235894623f7a551438b3 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 12 Feb 2017 00:40:16 +0100 Subject: [PATCH 18/20] [youtube] fix signature extraction (regression in #1662) --- src/you_get/extractors/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index ad1706be..6f75a129 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -52,7 +52,7 @@ class YouTube(VideoExtractor): return code js = js.replace('\n', ' ') - f1 = match1(js, r'"signature",([\w]+)\(\w+\.\w+\)') + f1 = match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)') f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \ match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) From 31153bdb3df7b82fec2b5885302856d48fe041f9 Mon Sep 17 00:00:00 2001 From: Peter Xia Date: Mon, 13 Feb 2017 18:30:32 -0800 Subject: [PATCH 19/20] Multithreaded downloading. 
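The core of the change, reduced to a sketch (url_save is the existing helper in common.py; the refer/faker/headers plumbing from the real diff is omitted here):

```python
import os
from concurrent.futures import ThreadPoolExecutor

def download_parts(urls, title, ext, output_dir, bar):
    # Each part becomes one pool task; url_save() runs in a worker thread
    # and reports progress through the (now lock-protected) bar object.
    parts = []
    with ThreadPoolExecutor(max_workers=16) as pool:
        for i, url in enumerate(urls):
            filepath = os.path.join(output_dir, '%s[%02d].%s' % (title, i, ext))
            parts.append(filepath)
            pool.submit(url_save, url, filepath, bar, is_part=True)
    # Leaving the with-block joins the pool, i.e. waits for every part.
    return parts
```

The progress-bar classes grow a data_lock for the shared counters and a ui_lock for terminal output, since update_received() is now called from several threads at once.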
--- src/you_get/common.py | 64 +++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 2edbc426..75aa6b47 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -107,6 +107,8 @@ import time from urllib import request, parse, error from http import cookiejar from importlib import import_module +from concurrent.futures import ThreadPoolExecutor +from threading import Lock from .version import __version__ from .util import log, term @@ -638,10 +640,12 @@ class SimpleProgressBar: self.displayed = False self.total_size = total_size self.total_pieces = total_pieces - self.current_piece = 1 + self.current_piece = 0 self.received = 0 self.speed = '' self.last_updated = time.time() + self.data_lock = Lock() + self.ui_lock = Lock() total_pieces_len = len(str(total_pieces)) # 38 is the size of all statically known size in self.bar @@ -652,9 +656,13 @@ class SimpleProgressBar: total_str_width, total_str, self.bar_size, total_pieces_len, total_pieces_len) def update(self): + # Don't bother updating the UI if cannot aquire the lock + if not self.ui_lock.acquire(blocking=False) return; + self.data_lock.acquire() self.displayed = True bar_size = self.bar_size percent = round(self.received * 100 / self.total_size, 1) + self.data_lock.release() if percent >= 100: percent = 100 dots = bar_size * int(percent) // 100 @@ -669,8 +677,10 @@ class SimpleProgressBar: bar = self.bar.format(percent, round(self.received / 1048576, 1), bar, self.current_piece, self.total_pieces, self.speed) sys.stdout.write('\r' + bar) sys.stdout.flush() + self.ui_lock.release() def update_received(self, n): + self.data_lock.acquire() self.received += n time_diff = time.time() - self.last_updated bytes_ps = n / time_diff if time_diff else 0 @@ -683,15 +693,23 @@ class SimpleProgressBar: else: self.speed = '{:4.0f} B/s'.format(bytes_ps) self.last_updated = time.time() + self.data_lock.release() self.update() def update_piece(self, n): + self.data_lock.acquire() self.current_piece = n + self.data_lock.release() def done(self): + self.ui_lock.acquire() + self.data_lock.acquire() if self.displayed: print() self.displayed = False + self.data_lock.release() + self.ui_lock.release() + class PiecesProgressBar: def __init__(self, total_size, total_pieces = 1): @@ -700,24 +718,38 @@ class PiecesProgressBar: self.total_pieces = total_pieces self.current_piece = 1 self.received = 0 + self.data_lock = Lock() + self.ui_lock = Lock() def update(self): + self.ui_lock.acquire() + self.data_lock.acquire() self.displayed = True + self.data_lock.release() bar = '{0:>5}%[{1:<40}] {2}/{3}'.format('', '=' * 40, self.current_piece, self.total_pieces) sys.stdout.write('\r' + bar) sys.stdout.flush() + self.ui_lock.release() def update_received(self, n): + self.data_lock.acquire() self.received += n + self.data_lock.release() self.update() def update_piece(self, n): + self.data_lock.acquire() self.current_piece = n + self.data_lock.release() def done(self): + self.ui_lock.acquire() + self.data_lock.acquire() if self.displayed: print() self.displayed = False + self.data_lock.release() + self.ui_lock.release() class DummyProgressBar: def __init__(self, *args): @@ -795,13 +827,14 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg parts = [] print('Downloading %s.%s ...' 
% (tr(title), ext)) bar.update() - for i, url in enumerate(urls): - filename = '%s[%02d].%s' % (title, i, ext) - filepath = os.path.join(output_dir, filename) - parts.append(filepath) - #print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls)) - bar.update_piece(i + 1) - url_save(url, filepath, bar, refer = refer, is_part = True, faker = faker, headers = headers) + with ThreadPoolExecutor(max_workers=16) as e: + for i, url in enumerate(urls): + filename = '%s[%02d].%s' % (title, i, ext) + filepath = os.path.join(output_dir, filename) + parts.append(filepath) + #print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls)) + bar.update_piece(i + 1) + e.submit(url_save, url, filepath, bar, refer = refer, is_part = True, faker = faker, headers = headers) bar.done() if not merge: @@ -921,13 +954,14 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No else: parts = [] print('Downloading %s.%s ...' % (tr(title), ext)) - for i, url in enumerate(urls): - filename = '%s[%02d].%s' % (title, i, ext) - filepath = os.path.join(output_dir, filename) - parts.append(filepath) - #print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls)) - bar.update_piece(i + 1) - url_save_chunked(url, filepath, bar, refer = refer, is_part = True, faker = faker, headers = headers) + with ThreadPoolExecutor(max_workers=16) as e: + for i, url in enumerate(urls): + filename = '%s[%02d].%s' % (title, i, ext) + filepath = os.path.join(output_dir, filename) + parts.append(filepath) + #print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls)) + bar.update_piece(i + 1) + e.submit(url_save_chunked, url, filepath, bar, refer = refer, is_part = True, faker = faker, headers = headers) bar.done() if not merge: From b1f6a19ba784554b7185902cbaf1da68eebbf657 Mon Sep 17 00:00:00 2001 From: Peter Xia Date: Mon, 13 Feb 2017 19:31:22 -0800 Subject: [PATCH 20/20] Fix progress bar for multithreaded download. 
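Because parts now finish in arbitrary order, the caller-supplied index of the old update_piece(n) is meaningless; the bar instead counts completions itself, and url_save() bumps the counter when a part is done. A sketch of the new shape, simplified from the diff below:

```python
from threading import Lock

class SimpleProgressBar:
    def __init__(self, total_pieces):
        self.total_pieces = total_pieces
        self.current_piece = 0
        self.data_lock = Lock()

    def update_piece(self):
        # No index argument: whichever worker finishes next simply
        # increments the shared completion counter under the lock.
        with self.data_lock:
            self.current_piece += 1
```

This commit also supplies the colon missing from the non-blocking ui_lock.acquire() test introduced in the previous patch.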
--- src/you_get/common.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 75aa6b47..da167deb 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -555,6 +555,7 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, h received += len(buffer) if bar: bar.update_received(len(buffer)) + bar.update_piece() assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (received, os.path.getsize(temp_filepath), temp_filepath) @@ -626,6 +627,7 @@ def url_save_chunked(url, filepath, bar, dyn_callback=None, chunk_size=0, ignore response = urlopen_with_retry(request.Request(url, headers=headers)) if bar: bar.update_received(len(buffer)) + bar.update_piece() assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (received, os.path.getsize(temp_filepath)) @@ -657,7 +659,7 @@ class SimpleProgressBar: def update(self): # Don't bother updating the UI if cannot aquire the lock - if not self.ui_lock.acquire(blocking=False) return; + if not self.ui_lock.acquire(blocking=False): return self.data_lock.acquire() self.displayed = True bar_size = self.bar_size @@ -696,9 +698,9 @@ class SimpleProgressBar: self.data_lock.release() self.update() - def update_piece(self, n): + def update_piece(self): self.data_lock.acquire() - self.current_piece = n + self.current_piece += 1 self.data_lock.release() def done(self): @@ -737,9 +739,9 @@ class PiecesProgressBar: self.data_lock.release() self.update() - def update_piece(self, n): + def update_piece(self): self.data_lock.acquire() - self.current_piece = n + self.current_piece += 1 self.data_lock.release() def done(self): @@ -756,7 +758,7 @@ class DummyProgressBar: pass def update_received(self, n): pass - def update_piece(self, n): + def update_piece(self): pass def done(self): pass @@ -833,7 +835,6 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg filepath = os.path.join(output_dir, filename) parts.append(filepath) #print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls)) - bar.update_piece(i + 1) e.submit(url_save, url, filepath, bar, refer = refer, is_part = True, faker = faker, headers = headers) bar.done()
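A closing note on the locking convention in the last two patches: wherever both locks are held, ui_lock is taken before data_lock (see update() and done()), while update_received() and update_piece() release data_lock before update() runs. Holding to that single order is what rules out deadlock between a worker thread repainting the bar and the main thread finishing it. A minimal sketch of the convention, with names as in common.py:

```python
from threading import Lock

ui_lock = Lock()
data_lock = Lock()

def update():
    # Non-blocking acquire: if another thread is already repainting,
    # skip this repaint instead of queueing behind it.
    if not ui_lock.acquire(blocking=False):
        return
    try:
        with data_lock:  # always ui_lock -> data_lock, never the reverse
            pass         # snapshot shared progress state here
        # render the bar after releasing data_lock, then free the UI
    finally:
        ui_lock.release()
```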