From 935a4233cda2b506859f51137ab5eed28ddff77c Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 15 Aug 2018 13:19:25 +0200 Subject: [PATCH 01/67] [youku] get schwifty --- src/you_get/extractors/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index d5186328..d7c79385 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -78,7 +78,7 @@ class Youku(VideoExtractor): self.api_error_code = None self.api_error_msg = None - self.ccode = '0508' + self.ccode = '0511' # Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND' From 28e1f6cc15568004f8977ab1cb0f3c171cabeeaf Mon Sep 17 00:00:00 2001 From: Mao Chang <1702190+moaix@users.noreply.github.com> Date: Wed, 15 Aug 2018 22:58:17 +0800 Subject: [PATCH 02/67] fix lizhi --- src/you_get/extractors/lizhi.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/lizhi.py b/src/you_get/extractors/lizhi.py index 65988a9f..4991df31 100644 --- a/src/you_get/extractors/lizhi.py +++ b/src/you_get/extractors/lizhi.py @@ -2,8 +2,17 @@ __all__ = ['lizhi_download'] import json +import datetime from ..common import * +# +# Worked well but not perfect. +# TODO: add option --format={sd|hd} +# +def get_url(ep): + readable = datetime.datetime.fromtimestamp(int(ep['create_time']) / 1000).strftime('%Y/%m/%d') + return 'http://cdn5.lizhi.fm/audio/{}/{}_hd.mp3'.format(readable, ep['id']) + # radio_id: e.g. 549759 from http://www.lizhi.fm/549759/ # # Returns a list of tuples (audio_id, title, url) for each episode @@ -23,7 +32,7 @@ def lizhi_extract_playlist_info(radio_id): # (au_cnt), then handle pagination properly. api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band=%s' % radio_id api_response = json.loads(get_content(api_url)) - return [(ep['id'], ep['name'], ep['url']) for ep in api_response] + return [(ep['id'], ep['name'], get_url(ep)) for ep in api_response] def lizhi_download_audio(audio_id, title, url, output_dir='.', info_only=False): filetype, ext, size = url_info(url) From 82db2fe8f07e65616d3aff6faf5ec7b61d430534 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 15 Aug 2018 21:42:40 +0200 Subject: [PATCH 03/67] [baidu] you got this --- src/you_get/extractors/baidu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/baidu.py b/src/you_get/extractors/baidu.py index b30c9d86..65e62098 100644 --- a/src/you_get/extractors/baidu.py +++ b/src/you_get/extractors/baidu.py @@ -130,7 +130,7 @@ def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only= title = r1(r'title:"([^"]+)"', html) vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+)"', html) or re.findall(r'vhsrc="([^"]+)"', html) - if vhsrc is not None: + if len(vhsrc) > 0: ext = 'mp4' size = url_size(vhsrc[0]) print_info(site_info, title, ext, size) From e36404cf2243d8de52062d834f0676a0f95966a7 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 25 Aug 2018 15:41:07 +0200 Subject: [PATCH 04/67] [youku] fire in the hole! 
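
These ccode bumps are routine: the value rotates on Youku's side and only has
to match what the current web player sends. The ckey, when it eventually rotates
too, can still be recovered from the player script exactly as the in-source
comment kept alongside self.ckey describes. A minimal sketch of that grep as
Python (the player URL and the 256-character pattern are the ones quoted in the
comment; both are moving targets, so treat this as illustrative only):

    import re
    import urllib.request

    PLAYER_JS = 'http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js'

    def find_ckeys(js_url=PLAYER_JS):
        # same idea as: grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js
        js = urllib.request.urlopen(js_url).read().decode('utf-8', 'ignore')
        return [s.strip('"') for s in re.findall(r'"[0-9a-zA-Z+/=]{256}"', js)]
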
--- src/you_get/extractors/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index d7c79385..f2e67336 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -78,7 +78,7 @@ class Youku(VideoExtractor): self.api_error_code = None self.api_error_msg = None - self.ccode = '0511' + self.ccode = '0515' # Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND' From 2a1bb6978c6e58995e89e055bd3a16042f5c5636 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 27 Aug 2018 14:49:37 +0200 Subject: [PATCH 05/67] [twitter] match correct screen_name and item_id in a conversation --- src/you_get/extractors/twitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 8ed400db..1c027973 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -30,9 +30,9 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) return html = get_html(url, faker=True) - screen_name = r1(r'data-screen-name="([^"]*)"', html) or \ + screen_name = r1(r'twitter\.com/([^/]+)', url) or r1(r'data-screen-name="([^"]*)"', html) or \ r1(r' Date: Wed, 29 Aug 2018 16:44:13 +0200 Subject: [PATCH 06/67] [[util.os] fix Android termux compatibility (no permission to access /proc) --- src/you_get/util/os.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/you_get/util/os.py b/src/you_get/util/os.py index 11730e28..1a00d2b5 100644 --- a/src/you_get/util/os.py +++ b/src/you_get/util/os.py @@ -19,9 +19,11 @@ def detect_os(): elif 'linux' in syst: os = 'linux' # detect WSL https://github.com/Microsoft/BashOnWindows/issues/423 - with open('/proc/version', 'r') as f: - if 'microsoft' in f.read().lower(): - os = 'wsl' + try: + with open('/proc/version', 'r') as f: + if 'microsoft' in f.read().lower(): + os = 'wsl' + except: pass elif 'windows' in syst: os = 'windows' elif 'bsd' in syst: From 9ba7690cb9b33a21ec3e068d8b652b2c59d12797 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 29 Aug 2018 17:15:47 +0200 Subject: [PATCH 07/67] [bilibili] as you can see --- src/you_get/extractors/bilibili.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 96fc60c8..7234340a 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -144,11 +144,13 @@ class Bilibili(VideoExtractor): else: playinfo = re.search(r'__INITIAL_STATE__=(.*?);\(function\(\)', self.page) if playinfo is not None: - pages = json.loads(playinfo.group(1))['videoData']['pages'] - if len(pages) > 1: - qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query)) - page = pages[int(qs.get('p', 1)) - 1] - self.title = '{} #{}. 
{}'.format(self.title, page['page'], page['part']) + jsonPlayinfo = json.loads(playinfo.group(1)) + if 'videoData' in jsonPlayinfo: + pages = jsonPlayinfo['videoData']['pages'] + if len(pages) > 1: + qs = dict(parse.parse_qsl(urllib.parse.urlparse(self.url).query)) + page = pages[int(qs.get('p', 1)) - 1] + self.title = '{} #{}. {}'.format(self.title, page['page'], page['part']) if 'bangumi.bilibili.com/movie' in self.url: self.movie_entry(**kwargs) From bd47cb656ec5a7d6ca43ec8664f7c3908b3a8286 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 7 Sep 2018 22:21:24 +0200 Subject: [PATCH 08/67] [youtube] fix for new base.js (close #2641) --- src/you_get/extractors/youtube.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 33dc470e..5482f1e4 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -37,6 +37,9 @@ class YouTube(VideoExtractor): ] def decipher(js, s): + # Examples: + # - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js + # - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js def tr_js(code): code = re.sub(r'function', r'def', code) code = re.sub(r'(\W)(as|if|in|is|or)\(', r'\1_\2(', code) @@ -52,7 +55,8 @@ class YouTube(VideoExtractor): return code js = js.replace('\n', ' ') - f1 = match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)') + f1 = match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \ + match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)') f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \ match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) From 47b2164b2e21f11de2acbb28303f13f45ddacd6a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 7 Sep 2018 22:26:51 +0200 Subject: [PATCH 09/67] [youku] +1 --- src/you_get/extractors/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index f2e67336..e86b53b9 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -78,7 +78,7 @@ class Youku(VideoExtractor): self.api_error_code = None self.api_error_msg = None - self.ccode = '0515' + self.ccode = '0516' # Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND' From 932fc7a7b5a70e466a11cef0df92aa18aca9d18f Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 8 Sep 2018 00:46:15 +0200 Subject: [PATCH 10/67] [baidu] as you can see --- src/you_get/extractors/baidu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/baidu.py b/src/you_get/extractors/baidu.py index 65e62098..a8cb3d5d 100644 --- a/src/you_get/extractors/baidu.py +++ b/src/you_get/extractors/baidu.py @@ -129,7 +129,8 @@ def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only= html = get_html(url) title = r1(r'title:"([^"]+)"', html) - vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+)"', html) or re.findall(r'vhsrc="([^"]+)"', html) + vhsrc = re.findall(r'"BDE_Image"[^>]+src="([^"]+\.mp4)"', html) or \ + re.findall(r'vhsrc="([^"]+)"', html) if len(vhsrc) > 0: ext = 
'mp4'
         size = url_size(vhsrc[0])

From a062be55f8eda54f4154870c0dadab4c866cb9cc Mon Sep 17 00:00:00 2001
From: Mort Yao <soi@mort.ninja>
Date: Sat, 8 Sep 2018 00:48:10 +0200
Subject: [PATCH 11/67] version 0.4.1140

---
 src/you_get/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/you_get/version.py b/src/you_get/version.py
index 76969dc3..27ed8849 100644
--- a/src/you_get/version.py
+++ b/src/you_get/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python
 
 script_name = 'you-get'
-__version__ = '0.4.1128'
+__version__ = '0.4.1140'

From 2d8bf0e556fc509fc4dbfc0c884e602962ec837c Mon Sep 17 00:00:00 2001
From: Mort Yao <soi@mort.ninja>
Date: Tue, 11 Sep 2018 03:08:26 +0200
Subject: [PATCH 12/67] [bilibili] there's a solution you're not seeing (close #2642)
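
The fix here is just to send browser-like headers on both requests (the redirect
resolution and the page fetch). For reference, a rough standalone equivalent of
what faker=True / headers=fake_headers does for a single request — the header
values below are illustrative, the real dict lives in you_get.common:

    import urllib.request

    fake_headers = {
        # any current desktop-browser UA works; the point is not to look like urllib
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }

    def fetch_like_a_browser(url):
        req = urllib.request.Request(url, headers=fake_headers)
        return urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
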
---
 src/you_get/extractors/bilibili.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index 7234340a..ceeba3ef 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -115,7 +115,7 @@ class Bilibili(VideoExtractor):
             self.url = 'http://www.bilibili.com/video/av{}/'.format(aid)
         self.ua = fake_headers['User-Agent']
 
-        self.url = url_locations([self.url])[0]
+        self.url = url_locations([self.url], faker=True)[0]
         frag = urllib.parse.urlparse(self.url).fragment
         # http://www.bilibili.com/video/av3141144/index_2.html#page=3
         if frag:
@@ -125,7 +125,7 @@ class Bilibili(VideoExtractor):
             aid = re.search(r'av(\d+)', self.url).group(1)
             self.url = 'http://www.bilibili.com/video/av{}/index_{}.html'.format(aid, page)
         self.referer = self.url
-        self.page = get_content(self.url)
+        self.page = get_content(self.url, headers=fake_headers)
 
         m = re.search(r'<h1.*?>(.*?)</h1>', self.page) or re.search(r'<h1 title="([^"]+)">
', self.page) if m is not None: From 3e6387e51c14b9ef6dd437367723dbb8919812ef Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 11 Sep 2018 03:14:53 +0200 Subject: [PATCH 13/67] [bilibili] duang duang --- src/you_get/extractors/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index ceeba3ef..94eed2ea 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -381,7 +381,7 @@ def download_video_from_favlist(url, **kwargs): def bilibili_download_playlist_by_url(url, **kwargs): - url = url_locations([url])[0] + url = url_locations([url], faker=True)[0] kwargs['playlist'] = True # a bangumi here? possible? if 'live.bilibili' in url: From 89844858199bfc3b3a3317e686e5982c74949777 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 11 Sep 2018 17:31:47 +0200 Subject: [PATCH 14/67] [youtube] faster than light --- src/you_get/common.py | 137 ++++++++++++++++-------------- src/you_get/extractors/youtube.py | 26 +++++- 2 files changed, 97 insertions(+), 66 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index b19d602f..d212b62b 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -602,7 +602,12 @@ def url_save( # the key must be 'Referer' for the hack here if refer is not None: tmp_headers['Referer'] = refer - file_size = url_size(url, faker=faker, headers=tmp_headers) + if type(url) is list: + file_size = urls_size(url, faker=faker, headers=tmp_headers) + is_chunked, urls = True, url + else: + file_size = url_size(url, faker=faker, headers=tmp_headers) + is_chunked, urls = False, [url] continue_renameing = True while continue_renameing: @@ -655,70 +660,78 @@ def url_save( else: open_mode = 'wb' - if received < file_size: - if faker: - tmp_headers = fake_headers - ''' - if parameter headers passed in, we have it copied as tmp_header - elif headers: - headers = headers - else: - headers = {} - ''' - if received: - tmp_headers['Range'] = 'bytes=' + str(received) + '-' - if refer: - tmp_headers['Referer'] = refer + for url in urls: + received_chunk = 0 + if received < file_size: + if faker: + tmp_headers = fake_headers + ''' + if parameter headers passed in, we have it copied as tmp_header + elif headers: + headers = headers + else: + headers = {} + ''' + if received and not is_chunked: # only request a range when not chunked + tmp_headers['Range'] = 'bytes=' + str(received) + '-' + if refer: + tmp_headers['Referer'] = refer - if timeout: - response = urlopen_with_retry( - request.Request(url, headers=tmp_headers), timeout=timeout - ) - else: - response = urlopen_with_retry( - request.Request(url, headers=tmp_headers) - ) - try: - range_start = int( - response.headers[ - 'content-range' - ][6:].split('/')[0].split('-')[0] - ) - end_length = int( - response.headers['content-range'][6:].split('/')[1] - ) - range_length = end_length - range_start - except: - content_length = response.headers['content-length'] - range_length = int(content_length) if content_length is not None \ - else float('inf') + if timeout: + response = urlopen_with_retry( + request.Request(url, headers=tmp_headers), timeout=timeout + ) + else: + response = urlopen_with_retry( + request.Request(url, headers=tmp_headers) + ) + try: + range_start = int( + response.headers[ + 'content-range' + ][6:].split('/')[0].split('-')[0] + ) + end_length = int( + response.headers['content-range'][6:].split('/')[1] + ) + range_length = end_length - range_start + except: + 
content_length = response.headers['content-length'] + range_length = int(content_length) if content_length is not None \ + else float('inf') - if file_size != received + range_length: - received = 0 - if bar: - bar.received = 0 - open_mode = 'wb' - - with open(temp_filepath, open_mode) as output: - while True: - buffer = None - try: - buffer = response.read(1024 * 256) - except socket.timeout: - pass - if not buffer: - if received == file_size: # Download finished - break - # Unexpected termination. Retry request - tmp_headers['Range'] = 'bytes=' + str(received) + '-' - response = urlopen_with_retry( - request.Request(url, headers=tmp_headers) - ) - continue - output.write(buffer) - received += len(buffer) + if is_chunked: # always append if chunked + open_mode = 'ab' + elif file_size != received + range_length: # is it ever necessary? + received = 0 if bar: - bar.update_received(len(buffer)) + bar.received = 0 + open_mode = 'wb' + + with open(temp_filepath, open_mode) as output: + while True: + buffer = None + try: + buffer = response.read(1024 * 256) + except socket.timeout: + pass + if not buffer: + if is_chunked and received_chunk == range_length: + break + elif not is_chunked and received == file_size: # Download finished + break + # Unexpected termination. Retry request + if not is_chunked: # when + tmp_headers['Range'] = 'bytes=' + str(received) + '-' + response = urlopen_with_retry( + request.Request(url, headers=tmp_headers) + ) + continue + output.write(buffer) + received += len(buffer) + received_chunk += len(buffer) + if bar: + bar.update_received(len(buffer)) assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % ( received, os.path.getsize(temp_filepath), temp_filepath diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 5482f1e4..19864590 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -81,6 +81,16 @@ class YouTube(VideoExtractor): exec(code, globals(), locals()) return locals()['sig'] + def chunk_by_range(url, size): + urls = [] + chunk_size = 10485760 + start, end = 0, chunk_size - 1 + urls.append('%s&range=%s-%s' % (url, start, end)) + while end + 1 < size: # processed size < expected size + start, end = end + 1, end + chunk_size + urls.append('%s&range=%s-%s' % (url, start, end)) + return urls + def get_url_from_vid(vid): return 'https://youtu.be/{}'.format(vid) @@ -290,13 +300,15 @@ class YouTube(VideoExtractor): if not dash_size: try: dash_size = url_size(dash_url) except: continue + dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) + dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size)) self.dash_streams[itag] = { 'quality': '%sx%s' % (w, h), 'itag': itag, 'type': mimeType, 'mime': mimeType, 'container': 'mp4', - 'src': [dash_url, dash_mp4_a_url], + 'src': [dash_urls, dash_mp4_a_urls], 'size': int(dash_size) + int(dash_mp4_a_size) } elif mimeType == 'video/webm': @@ -310,13 +322,15 @@ class YouTube(VideoExtractor): if not dash_size: try: dash_size = url_size(dash_url) except: continue + dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) + dash_webm_a_urls = self.__class__.chunk_by_range(dash_webm_a_url, int(dash_webm_a_size)) self.dash_streams[itag] = { 'quality': '%sx%s' % (w, h), 'itag': itag, 'type': mimeType, 'mime': mimeType, 'container': 'webm', - 'src': [dash_url, dash_webm_a_url], + 'src': [dash_urls, dash_webm_a_urls], 'size': int(dash_size) + int(dash_webm_a_size) } except: @@ -353,13 +367,15 @@ class 
YouTube(VideoExtractor): dash_url += '&signature={}'.format(sig) dash_size = stream['clen'] itag = stream['itag'] + dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) + dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size)) self.dash_streams[itag] = { 'quality': stream['size'], 'itag': itag, 'type': mimeType, 'mime': mimeType, 'container': 'mp4', - 'src': [dash_url, dash_mp4_a_url], + 'src': [dash_urls, dash_mp4_a_urls], 'size': int(dash_size) + int(dash_mp4_a_size) } elif stream['type'].startswith('video/webm'): @@ -378,13 +394,15 @@ class YouTube(VideoExtractor): except UnboundLocalError as e: audio_url = dash_mp4_a_url audio_size = int(dash_mp4_a_size) + dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) + audio_urls = self.__class__.chunk_by_range(audio_url, int(audio_size)) self.dash_streams[itag] = { 'quality': stream['size'], 'itag': itag, 'type': mimeType, 'mime': mimeType, 'container': 'webm', - 'src': [dash_url, audio_url], + 'src': [dash_urls, audio_urls], 'size': int(dash_size) + int(audio_size) } From f8c39fbe4cbe83d8c1f316d3c221808dbfc22931 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 11 Sep 2018 23:18:39 +0200 Subject: [PATCH 15/67] [common] post_content: allow post_data_raw --- src/you_get/common.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index d212b62b..5ce52990 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -439,7 +439,7 @@ def get_content(url, headers={}, decoded=True): return data -def post_content(url, headers={}, post_data={}, decoded=True): +def post_content(url, headers={}, post_data={}, decoded=True, **kwargs): """Post the content of a URL via sending a HTTP POST request. 
Args: @@ -457,7 +457,10 @@ def post_content(url, headers={}, post_data={}, decoded=True): if cookies: cookies.add_cookie_header(req) req.headers.update(req.unredirected_hdrs) - post_data_enc = bytes(parse.urlencode(post_data), 'utf-8') + if kwargs.get('post_data_raw'): + post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8') + else: + post_data_enc = bytes(parse.urlencode(post_data), 'utf-8') response = urlopen_with_retry(req, data=post_data_enc) data = response.read() From f3cb2512a32f5fd14e91f0cded96cb5677a1b7fa Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 11 Sep 2018 23:23:50 +0200 Subject: [PATCH 16/67] [tumblr] take my consent --- src/you_get/extractors/tumblr.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py index fe4973be..f01c3352 100644 --- a/src/you_get/extractors/tumblr.py +++ b/src/you_get/extractors/tumblr.py @@ -13,7 +13,29 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): universal_download(url, output_dir, merge=merge, info_only=info_only) return - html = parse.unquote(get_html(url)).replace('\/', '/') + import ssl + ssl_context = request.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_TLSv1)) + cookie_handler = request.HTTPCookieProcessor() + opener = request.build_opener(ssl_context, cookie_handler) + request.install_opener(opener) + + page = get_html(url) + form_key = match1(page, r'id="tumblr_form_key" content="([^"]+)"') + if form_key is not None: + # bypass GDPR consent page + referer = 'https://www.tumblr.com/privacy/consent?redirect=%s' % parse.quote_plus(url) + post_content('https://www.tumblr.com/svc/privacy/consent', + headers={ + 'Content-Type': 'application/json', + 'User-Agent': fake_headers['User-Agent'], + 'Referer': referer, + 'X-tumblr-form-key': form_key, + 'X-Requested-With': 'XMLHttpRequest' + }, + post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url) + page = get_html(url) + + html = parse.unquote(page).replace('\/', '/') feed = r1(r'', html) if feed in ['photo', 'photoset', 'entry'] or feed is None: From cc69f0945aaaff6535af020d84effa7d3c89ffab Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 11 Sep 2018 23:44:51 +0200 Subject: [PATCH 17/67] [universal] let Pinterest go suck a lemon --- src/you_get/extractors/universal.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py index 57b9b2d1..43272cb8 100644 --- a/src/you_get/extractors/universal.py +++ b/src/you_get/extractors/universal.py @@ -106,6 +106,9 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg title = '%s' % i i += 1 + if r1(r'(https://pinterest.com/pin/)', url): + continue + candies.append({'url': url, 'title': title}) From 5dfee49688645497061841d981c50fe577e4ba32 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 11 Sep 2018 23:49:39 +0200 Subject: [PATCH 18/67] [common] post_content: make log right --- src/you_get/common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 5ce52990..f40b0220 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -450,8 +450,10 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs): Returns: The content 
as a string. """ - - logging.debug('post_content: %s \n post_data: %s' % (url, post_data)) + if kwargs.get('post_data_raw'): + logging.debug('post_content: %s\npost_data_raw: %s' % (url, kwargs['post_data_raw'])) + else: + logging.debug('post_content: %s\npost_data: %s' % (url, post_data)) req = request.Request(url, headers=headers) if cookies: From 76e831d443e9ca8e4344ad0ab5130ffc25eb9a73 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 11 Sep 2018 23:51:43 +0200 Subject: [PATCH 19/67] version 0.4.1148 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 27ed8849..4f5b1645 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1140' +__version__ = '0.4.1148' From fcdfce68d29d2f56b552fb3883f41fea7b7bf9de Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 16 Sep 2018 14:18:31 +0200 Subject: [PATCH 20/67] [tumblr] squanch this --- src/you_get/extractors/tumblr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py index f01c3352..bc37fa43 100644 --- a/src/you_get/extractors/tumblr.py +++ b/src/you_get/extractors/tumblr.py @@ -33,7 +33,7 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): 'X-Requested-With': 'XMLHttpRequest' }, post_data_raw='{"eu_resident":true,"gdpr_is_acceptable_age":true,"gdpr_consent_core":true,"gdpr_consent_first_party_ads":true,"gdpr_consent_third_party_ads":true,"gdpr_consent_search_history":true,"redirect_to":"%s","gdpr_reconsent":false}' % url) - page = get_html(url) + page = get_html(url, faker=True) html = parse.unquote(page).replace('\/', '/') feed = r1(r'', html) @@ -43,9 +43,9 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): page_title = r1(r'([^<\n]*)', html) - urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.jpg)', html) +\ - re.findall(r'(https?://[^;"&]+/tumblr_[^;"]+_\d+\.png)', html) +\ - re.findall(r'(https?://[^;"&]+/tumblr_[^";]+_\d+\.gif)', html) + urls = re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.jpg)', html) +\ + re.findall(r'(https?://[^;"&]+/tumblr_[^;"&]+_\d+\.png)', html) +\ + re.findall(r'(https?://[^;"&]+/tumblr_[^";&]+_\d+\.gif)', html) tuggles = {} for url in urls: From caabb083f7c78f9170347ed0d4f60330c26b6da9 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 16 Sep 2018 14:34:48 +0200 Subject: [PATCH 21/67] [tumblr] squanch that --- src/you_get/extractors/tumblr.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py index bc37fa43..d63aee72 100644 --- a/src/you_get/extractors/tumblr.py +++ b/src/you_get/extractors/tumblr.py @@ -121,11 +121,15 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs): r1(r'', html) or r1(r'([^<\n]*)', html) or url.split("/")[4]).replace('\n', '') - type, ext, size = url_info(real_url) + # this is better + vcode = r1(r'tumblr_(\w+)', real_url) + real_url = 'https://vt.media.tumblr.com/tumblr_%s.mp4' % vcode + + type, ext, size = url_info(real_url, faker=True) print_info(site_info, title, type, size) if not info_only: - download_urls([real_url], title, ext, size, output_dir, merge = merge) + download_urls([real_url], title, ext, size, output_dir, merge=merge) site_info = "Tumblr.com" download = tumblr_download From 
1ea4abdb779ce2073accad10fd795a4add418142 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Mon, 17 Sep 2018 13:52:04 +0200 Subject: [PATCH 22/67] [universal] lalalala --- src/you_get/extractors/universal.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py index 43272cb8..a1ab1536 100644 --- a/src/you_get/extractors/universal.py +++ b/src/you_get/extractors/universal.py @@ -67,9 +67,9 @@ def universal_download(url, output_dir='.', merge=True, info_only=False, **kwarg urls = [] for i in media_exts: - urls += re.findall(r'(https?://[^ ;"\'\\]+' + i + r'[^ ;"\'\\]*)', page) + urls += re.findall(r'(https?://[^ ;&"\'\\]+' + i + r'[^ ;&"\'\\]*)', page) - p_urls = re.findall(r'(https?%3A%2F%2F[^;&]+' + i + r'[^;&]*)', page) + p_urls = re.findall(r'(https?%3A%2F%2F[^;&"]+' + i + r'[^;&"]*)', page) urls += [parse.unquote(url) for url in p_urls] q_urls = re.findall(r'(https?:\\\\/\\\\/[^ ;"\']+' + i + r'[^ ;"\']*)', page) From fc8df5eb24b0856d4b0a7c99bebb58b901cb40f1 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Mon, 17 Sep 2018 13:52:44 +0200 Subject: [PATCH 23/67] [naver] call universal_download if video extraction fails --- src/you_get/extractors/naver.py | 42 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/src/you_get/extractors/naver.py b/src/you_get/extractors/naver.py index d79e5245..b9eef8d7 100644 --- a/src/you_get/extractors/naver.py +++ b/src/you_get/extractors/naver.py @@ -7,31 +7,35 @@ import re from ..util import log from ..common import get_content, download_urls, print_info, playlist_not_supported, url_size +from .universal import * __all__ = ['naver_download_by_url'] -def naver_download_by_url(url, info_only=False, **kwargs): +def naver_download_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs): ep = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}' page = get_content(url) - og_video_url = re.search(r"<meta\s+property=\"og:video:url\"\s+content='(.+?)'>", page).group(1) - params_dict = urllib.parse.parse_qs(urllib.parse.urlparse(og_video_url).query) - vid = params_dict['vid'][0] - key = params_dict['outKey'][0] - meta_str = get_content(ep.format(vid, key)) - meta_json = json.loads(meta_str) - if 'errorCode' in meta_json: - log.wtf(meta_json['errorCode']) - title = meta_json['meta']['subject'] - videos = meta_json['videos']['list'] - video_list = sorted(videos, key=lambda video: video['encodingOption']['width']) - video_url = video_list[-1]['source'] - # size = video_list[-1]['size'] - # result wrong size - size = url_size(video_url) - print_info(site_info, title, 'mp4', size) - if not info_only: - download_urls([video_url], title, 'mp4', size, **kwargs) + try: + og_video_url = re.search(r"<meta\s+property=\"og:video:url\"\s+content='(.+?)'>", page).group(1) + params_dict = urllib.parse.parse_qs(urllib.parse.urlparse(og_video_url).query) + vid = params_dict['vid'][0] + key = params_dict['outKey'][0] + meta_str = get_content(ep.format(vid, key)) + meta_json = json.loads(meta_str) + if 'errorCode' in meta_json: + log.wtf(meta_json['errorCode']) + title = meta_json['meta']['subject'] + videos = meta_json['videos']['list'] + video_list = sorted(videos, key=lambda video: video['encodingOption']['width']) + video_url = video_list[-1]['source'] + # size = video_list[-1]['size'] + # result wrong size + size = url_size(video_url) + print_info(site_info, title, 'mp4', size) + if not info_only: 
+            download_urls([video_url], title, 'mp4', size, **kwargs)
+    except:
+        universal_download(url, output_dir, merge=merge, info_only=info_only, **kwargs)
 
 site_info = "naver.com"
 download = naver_download_by_url

From 286a7788d2fc57b4888185eb795ab8ab7e6d5ca6 Mon Sep 17 00:00:00 2001
From: Mateusz Piotrowski <0mp@FreeBSD.org>
Date: Wed, 19 Sep 2018 09:38:26 +0200
Subject: [PATCH 24/67] Add installation instructions for FreeBSD

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index f6f8efdc..14500577 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,14 @@ You can install `you-get` easily via:
 $ brew install you-get
 ```
 
+### Option 8: pkg (FreeBSD only)
+
+You can install `you-get` easily via:
+
+```
+# pkg install you-get
+```
+
 ### Shell completion
 
 Completion definitions for Bash, Fish and Zsh can be found in [`contrib/completion`](https://github.com/soimort/you-get/tree/develop/contrib/completion). Please consult your shell's manual for how to take advantage of them.

From 4b7753f2d43701203d82d8826966ef34f2bd29e8 Mon Sep 17 00:00:00 2001
From: mq-liu <mingquan_liu@163.com>
Date: Fri, 21 Sep 2018 15:02:22 +0800
Subject: [PATCH 25/67] update sohu.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original download was particularly slow and would often fail.
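
The rewritten real_url() below asks the newer h5n endpoint for a server list and
takes the first entry. With hypothetical fileName/key/ch values, the exchange it
performs looks like:

    # hypothetical parameter values, only to show the request shape
    real_url('abc.mp4', 'SOMEKEY', 'tv')
    # GET https://data.vod.itc.cn/ip?new=abc.mp4&num=1&key=SOMEKEY&ch=tv&pt=1&pg=2&prod=h5n
    # response: {"servers": [{"url": "http://..."}, ...]}  -> first server's url
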
---
 src/you_get/extractors/sohu.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/you_get/extractors/sohu.py b/src/you_get/extractors/sohu.py
index 58eb1ac7..a8c81be5 100644
--- a/src/you_get/extractors/sohu.py
+++ b/src/you_get/extractors/sohu.py
@@ -15,9 +15,9 @@ Changelog:
     new api
 '''
 
-def real_url(host,vid,tvid,new,clipURL,ck):
-    url = 'http://'+host+'/?prot=9&prod=flash&pt=1&file='+clipURL+'&new='+new +'&key='+ ck+'&vid='+str(vid)+'&uid='+str(int(time.time()*1000))+'&t='+str(random())+'&rb=1'
-    return json.loads(get_html(url))['url']
+def real_url(fileName,key,ch):
+    url = "https://data.vod.itc.cn/ip?new=" + fileName + "&num=1&key=" + key + "&ch=" + ch + "&pt=1&pg=2&prod=h5n"
+    return json.loads(get_html(url))['servers'][0]['url']
 
 def sohu_download(url, output_dir = '.', merge = True, info_only = False, extractor_proxy=None, **kwargs):
     if re.match(r'http://share.vrs.sohu.com', url):
@@ -51,9 +51,8 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False, extrac
             title = data['tvName']
             size = sum(data['clipsBytes'])
             assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])
-            for new,clip,ck, in zip(data['su'], data['clipsURL'], data['ck']):
-                clipURL = urlparse(clip).path
-                urls.append(real_url(host,hqvid,tvid,new,clipURL,ck))
+            for fileName,key,ch, in zip(data['su'], data['ck'], data['ch']):
+                urls.append(real_url(fileName,key,ch))
             # assert data['clipsURL'][0].endswith('.mp4')
 
         else:
@@ -66,9 +65,8 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False, extrac
             title = data['tvName']
             size = sum(map(int,data['clipsBytes']))
             assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])
-            for new,clip,ck, in zip(data['su'], data['clipsURL'], data['ck']):
-                clipURL = urlparse(clip).path
-                urls.append(real_url(host,vid,tvid,new,clipURL,ck))
+            for fileName,key,ch, in zip(data['su'], data['ck'], data['ch']):
+                urls.append(real_url(fileName,key,ch))
 
     print_info(site_info, title, 'mp4', size)
     if not info_only:

From fcdfce68d29d2f56b552fb3883f41fea7b7bf9de Mon Sep 17 00:00:00 2001
From: Mort Yao <soi@mort.ninja>
Date: Sun, 14 Oct 2018 15:34:49 +0200
Subject: [PATCH 26/67] util.fs: \t not allowed in FAT (close #2646)

---
 src/you_get/util/fs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/you_get/util/fs.py b/src/you_get/util/fs.py
index b6b7069a..c04a10a7 100644
--- a/src/you_get/util/fs.py
+++ b/src/you_get/util/fs.py
@@ -13,6 +13,7 @@ def legitimize(text, os=detect_os()):
         ord('|'): '-',
     })
 
+    # FIXME: do some filesystem detection
     if os == 'windows' or os == 'cygwin' or os == 'wsl':
         # Windows (non-POSIX namespace)
         text = text.translate({
@@ -28,6 +29,7 @@ def legitimize(text, os=detect_os()):
             ord('>'): '-',
             ord('['): '(',
             ord(']'): ')',
+            ord('\t'): ' ',
         })
     else:
         # *nix

From f94c8d530df77254e3597dbd16a4ba40b2048e56 Mon Sep 17 00:00:00 2001
From: Mort Yao <soi@mort.ninja>
Date: Wed, 17 Oct 2018 22:26:55 +0200
Subject: [PATCH 27/67] util.log: add yes_or_no()

---
 src/you_get/util/log.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/you_get/util/log.py b/src/you_get/util/log.py
index a2c77ab5..67b26b78 100644
--- a/src/you_get/util/log.py
+++ b/src/you_get/util/log.py
@@ -96,3 +96,9 @@ def wtf(message, exit_code=1):
     print_log(message, RED, BOLD)
     if exit_code is not None:
         sys.exit(exit_code)
+
+def yes_or_no(message):
+    ans = str(input('%s (y/N) ' % message)).lower().strip()
+    if ans == 'y':
+        return True
+    return False

From 5026436e8a573a3a7656184738dfe6a537936291 Mon Sep 17 00:00:00 2001
From: Mort Yao <soi@mort.ninja>
Date: Wed, 17 Oct 2018 22:28:21 +0200
Subject: [PATCH 28/67] common: add proper warning and confirming before overwriting things

---
 src/you_get/common.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/you_get/common.py b/src/you_get/common.py
index f40b0220..88e7d8d3 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -622,7 +622,7 @@ def url_save(
             if not is_part:
                 if bar:
                     bar.done()
-                print(
+                log.w(
                     'Skipping {}: file already exists'.format(
                         tr(os.path.basename(filepath))
                     )
@@ -648,7 +648,10 @@ def url_save(
                     print('Changing name to %s' % tr(os.path.basename(filepath)), '...')
                     continue_renameing = True
                     continue
-                print('Overwriting %s' % tr(os.path.basename(filepath)), '...')
+                if log.yes_or_no('File with this name already exists. Overwrite?'):
+                    log.w('Overwriting %s ...'
% tr(os.path.basename(filepath))) + else: + return elif not os.path.exists(os.path.dirname(filepath)): os.mkdir(os.path.dirname(filepath)) @@ -925,7 +928,7 @@ def download_urls( if total_size: if not force and os.path.exists(output_filepath) and not auto_rename\ and os.path.getsize(output_filepath) >= total_size * 0.9: - print('Skipping %s: file already exists' % output_filepath) + log.w('Skipping %s: file already exists' % output_filepath) print() return bar = SimpleProgressBar(total_size, len(urls)) From fabb35a5b982d918e94abe89cd5a63a501b518cb Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Sat, 20 Oct 2018 16:22:14 +0200 Subject: [PATCH 29/67] [sohu] do not zip ch --- src/you_get/extractors/sohu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/you_get/extractors/sohu.py b/src/you_get/extractors/sohu.py index a8c81be5..607bf44c 100644 --- a/src/you_get/extractors/sohu.py +++ b/src/you_get/extractors/sohu.py @@ -15,7 +15,7 @@ Changelog: new api ''' -def real_url(fileName,key,ch): +def real_url(fileName, key, ch): url = "https://data.vod.itc.cn/ip?new=" + fileName + "&num=1&key=" + key + "&ch=" + ch + "&pt=1&pg=2&prod=h5n" return json.loads(get_html(url))['servers'][0]['url'] @@ -51,8 +51,8 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False, extrac title = data['tvName'] size = sum(data['clipsBytes']) assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su']) - for fileName,key,ch, in zip(data['su'], data['ck'], data['ch']): - urls.append(real_url(fileName,key,ch)) + for fileName, key in zip(data['su'], data['ck']): + urls.append(real_url(fileName, key, data['ch'])) # assert data['clipsURL'][0].endswith('.mp4') else: @@ -65,8 +65,8 @@ def sohu_download(url, output_dir = '.', merge = True, info_only = False, extrac title = data['tvName'] size = sum(map(int,data['clipsBytes'])) assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su']) - for fileName,key,ch, in zip(data['su'], data['ck'], data['ch']): - urls.append(real_url(fileName,key,ch)) + for fileName, key in zip(data['su'], data['ck']): + urls.append(real_url(fileName, key, data['ch'])) print_info(site_info, title, 'mp4', size) if not info_only: From 50b66f3151dc63ffb7b7e216056906afff150358 Mon Sep 17 00:00:00 2001 From: kxy000 <kxy000@qq.com> Date: Mon, 22 Oct 2018 23:54:10 +0800 Subject: [PATCH 30/67] Update pptv.py add user agent --- src/you_get/extractors/pptv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/pptv.py b/src/you_get/extractors/pptv.py index 17503c1c..8d95a5a1 100644 --- a/src/you_get/extractors/pptv.py +++ b/src/you_get/extractors/pptv.py @@ -192,14 +192,14 @@ class PPTV(VideoExtractor): if self.url and not self.vid: if not re.match(r'http://v.pptv.com/show/(\w+)\.html', self.url): raise('Unknown url pattern') - page_content = get_content(self.url) + page_content = get_content(self.url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}) self.vid = match1(page_content, r'webcfg\s*=\s*{"id":\s*(\d+)') if not self.vid: raise('Cannot find id') api_url = 'http://web-play.pptv.com/webplay3-0-{}.xml'.format(self.vid) api_url += '?appplt=flp&appid=pptv.flashplayer.vod&appver=3.4.2.28&type=&version=4' - dom = parseString(get_content(api_url)) + dom = parseString(get_content(api_url,{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/69.0.3497.100 Safari/537.36"})) self.title, m_items, m_streams, m_segs = parse_pptv_xml(dom) xml_streams = merge_meta(m_items, m_streams, m_segs) for stream_id in xml_streams: From dc1581869fe819e127bd17da83fccf5fc08d1339 Mon Sep 17 00:00:00 2001 From: beyond <yangbing@gozap.com> Date: Thu, 25 Oct 2018 11:12:36 +0800 Subject: [PATCH 31/67] Update miapai api --- src/you_get/extractors/yixia.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/yixia.py b/src/you_get/extractors/yixia.py index ff45730d..d3d1ef35 100644 --- a/src/you_get/extractors/yixia.py +++ b/src/you_get/extractors/yixia.py @@ -7,6 +7,24 @@ from urllib.parse import urlparse from json import loads import re +#---------------------------------------------------------------------- +def miaopai_download_by_smid(smid, output_dir = '.', merge = True, info_only = False): + """""" + api_endpoint = 'https://n.miaopai.com/api/aj_media/info.json?smid={smid}'.format(smid = smid) + + html = get_content(api_endpoint) + + api_content = loads(html) + + video_url = api_content['data']['meta_data'][0]['play_urls']['l'] + title = api_content['data']['description'] + + type, ext, size = url_info(video_url) + + print_info(site_info, title, type, size) + if not info_only: + download_urls([video_url], title, ext, size, output_dir, merge=merge) + #---------------------------------------------------------------------- def yixia_miaopai_download_by_scid(scid, output_dir = '.', merge = True, info_only = False): """""" @@ -47,7 +65,11 @@ def yixia_xiaokaxiu_download_by_scid(scid, output_dir = '.', merge = True, info_ def yixia_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): """wrapper""" hostname = urlparse(url).hostname - if 'miaopai.com' in hostname: #Miaopai + if 'n.miaopai.com' == hostname: + smid = match1(url, r'n\.miaopai\.com/media/([^.]+)') + miaopai_download_by_smid(smid, output_dir, merge, info_only) + return + elif 'miaopai.com' in hostname: #Miaopai yixia_download_by_scid = yixia_miaopai_download_by_scid site_info = "Yixia Miaopai" From 035294e573b9397bbe2278e1666c54268562e7e0 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Fri, 26 Oct 2018 22:36:24 +0200 Subject: [PATCH 32/67] [bilibili] the production of too many useful things results in --- src/you_get/extractors/bilibili.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 94eed2ea..ed9663c0 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -192,7 +192,12 @@ class Bilibili(VideoExtractor): index_id = int(re.search(r'index_(\d+)', self.url).group(1)) cid = page_list[index_id-1]['cid'] # change cid match rule except: - cid = re.search(r'"cid":(\d+)', self.page).group(1) + page = re.search(r'p=(\d+)', self.url) + if page is None: + p = 1 + else: + p = int(page.group(1)) + cid = re.search(r'"cid":(\d+),"page":%s' % p, self.page).group(1) if cid is not None: self.download_by_vid(cid, re.search('bangumi', self.url) is not None, **kwargs) else: From 389b55b12229ebe114120b6ccd1490446b75fdb4 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Fri, 26 Oct 2018 22:48:04 +0200 Subject: [PATCH 33/67] .travis.yml: skip more of flake8 --- .travis.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9df327b0..7e772c8c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,9 +8,10 @@ python: - 
"3.6" - "nightly" - "pypy3" -before_install: pip install flake8 +before_install: + - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then pip install flake8; fi before_script: - - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi + - if [[ $TRAVIS_PYTHON_VERSION != '3.2'* && $TRAVIS_PYTHON_VERSION != '3.3'* ]]; then flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics; fi script: make test sudo: false notifications: From 6d6c219a282c1887483c2a167735f802b8686467 Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Sun, 28 Oct 2018 13:33:28 +0100 Subject: [PATCH 34/67] version 0.4.1164 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 4f5b1645..e1a5349d 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1148' +__version__ = '0.4.1164' From 1f70be6aa937a6533b3e990334d4138283949deb Mon Sep 17 00:00:00 2001 From: lc4t <lc4t0.0@gmail.com> Date: Thu, 1 Nov 2018 11:57:21 +0800 Subject: [PATCH 35/67] use new bilibili live api, fix None Content-Type --- src/you_get/common.py | 2 +- src/you_get/extractors/bilibili.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 88e7d8d3..3d04e8a1 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -429,7 +429,7 @@ def get_content(url, headers={}, decoded=True): # Decode the response body if decoded: charset = match1( - response.getheader('Content-Type'), r'charset=([\w-]+)' + response.getheader('Content-Type', ''), r'charset=([\w-]+)' ) if charset is not None: data = data.decode(charset) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index ed9663c0..71cc7fc2 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -22,7 +22,7 @@ from .youku import youku_download_by_vid class Bilibili(VideoExtractor): name = 'Bilibili' - live_api = 'http://live.bilibili.com/api/playurl?cid={}&otype=json' + live_api = 'https://api.live.bilibili.com/room/v1/Room/playUrl?cid={}&quality=0&platform=web' api_url = 'http://interface.bilibili.com/v2/playurl?' bangumi_api_url = 'http://bangumi.bilibili.com/player/web_api/playurl?' 
live_room_init_api_url = 'https://api.live.bilibili.com/room/v1/Room/room_init?id={}' @@ -233,7 +233,7 @@ class Bilibili(VideoExtractor): api_url = self.live_api.format(self.room_id) json_data = json.loads(get_content(api_url)) - urls = [json_data['durl'][0]['url']] + urls = [json_data['data']['durl'][0]['url']] self.streams['live'] = {} self.streams['live']['src'] = urls From aa221f137817d4b30611dea8860b6c625cc3f5ee Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Wed, 7 Nov 2018 16:49:18 +0100 Subject: [PATCH 36/67] [youtube] whatever this (0,window.encodeURIComponent) thing is (fix #2652) --- src/you_get/extractors/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 19864590..b1a680b9 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -40,6 +40,7 @@ class YouTube(VideoExtractor): # Examples: # - https://www.youtube.com/yts/jsbin/player-da_DK-vflWlK-zq/base.js # - https://www.youtube.com/yts/jsbin/player-vflvABTsY/da_DK/base.js + # - https://www.youtube.com/yts/jsbin/player-vfls4aurX/da_DK/base.js def tr_js(code): code = re.sub(r'function', r'def', code) code = re.sub(r'(\W)(as|if|in|is|or)\(', r'\1_\2(', code) @@ -55,7 +56,8 @@ class YouTube(VideoExtractor): return code js = js.replace('\n', ' ') - f1 = match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \ + f1 = match1(js, r'\.set\(\w+\.sp,\(0,window\.encodeURIComponent\)\(([$\w]+)') or \ + match1(js, r'\.set\(\w+\.sp,([$\w]+)\(\w+\.s\)\)') or \ match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)') f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \ match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) From 046dcea5e805ff18473a6ecdd722ddfa437855fd Mon Sep 17 00:00:00 2001 From: Mort Yao <soi@mort.ninja> Date: Wed, 7 Nov 2018 16:59:58 +0100 Subject: [PATCH 37/67] version 0.4.1167 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index e1a5349d..883b7dca 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.1164' +__version__ = '0.4.1167' From 7d9ce6b8d01145a4d8215916d2acab29d2d08565 Mon Sep 17 00:00:00 2001 From: Yingdong Yang <storm-yyd@outlook.com> Date: Fri, 9 Nov 2018 13:36:03 +0800 Subject: [PATCH 38/67] fix iwara --- src/you_get/extractors/iwara.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/you_get/extractors/iwara.py b/src/you_get/extractors/iwara.py index 50d14fb8..a30159d7 100644 --- a/src/you_get/extractors/iwara.py +++ b/src/you_get/extractors/iwara.py @@ -17,20 +17,20 @@ headers = { def iwara_download(url, output_dir='.', merge=True, info_only=False, **kwargs): global headers - video_hash=match1(url, r'http://\w+.iwara.tv/videos/(\w+)') - video_url=match1(url, r'(http://\w+.iwara.tv)/videos/\w+') - html = get_content(url,headers=headers) + video_hash = match1(url, r'https?://\w+.iwara.tv/videos/(\w+)') + video_url = match1(url, r'(https?://\w+.iwara.tv)/videos/\w+') + html = get_content(url, headers=headers) title = r1(r'<title>(.*)', html) - api_url=video_url+'/api/video/'+video_hash - content=get_content(api_url,headers=headers) - data=json.loads(content) - type,ext,size=url_info(data[0]['uri'], headers=headers) - down_urls=data[0]['uri'] - print_info(down_urls,title+data[0]['resolution'],type,size) + api_url 
= video_url + '/api/video/' + video_hash + content = get_content(api_url, headers=headers) + data = json.loads(content) + down_urls = 'https:' + data[0]['uri'] + type, ext, size = url_info(down_urls, headers=headers) + print_info(site_info, title+data[0]['resolution'], type, size) if not info_only: - download_urls([down_urls], title, ext, size, output_dir, merge = merge,headers=headers) + download_urls([down_urls], title, ext, size, output_dir, merge=merge, headers=headers) -site_info = "iwara" +site_info = "Iwara" download = iwara_download download_playlist = playlist_not_supported('iwara') From 64e15159abc326dbe3afb16883e652942361cb5b Mon Sep 17 00:00:00 2001 From: Vcinly Date: Sun, 11 Nov 2018 13:03:29 +0800 Subject: [PATCH 39/67] support download bilibili uploader all videos --- src/you_get/extractors/bilibili.py | 34 ++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 71cc7fc2..053b4d19 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -384,6 +384,38 @@ def download_video_from_favlist(url, **kwargs): else: log.wtf("Fail to parse the fav title" + url, "") +def download_video_from_totallist(url, page, **kwargs): + # the url has format: https://space.bilibili.com/64169458/#/video + m = re.search(r'space\.bilibili\.com/(\d+)/.*?video', url) + mid = "" + if m is not None: + mid = m.group(1) + jsonresult = json.loads(get_content("https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&pagesize=100&tid=0&page={}&keyword=&order=pubdate&jsonp=jsonp".format(mid, page))) + if jsonresult['status']: + videos = jsonresult['data']['vlist'] + videocount = len(videos) + for i in range(videocount): + videoid = videos[i]["aid"] + videotitle = videos[i]["title"] + videourl = "https://www.bilibili.com/video/av{}".format(videoid) + print("Start downloading ", videotitle, " video ", videotitle) + kwargs["output_dir"] = kwargs["output_dir"] + '/' + str(videoid) + download_cover(videos[i]['pic'], videotitle, **kwargs) + Bilibili().download_by_url(videourl, subtitle=videotitle, **kwargs) + if page <= jsonresult['pages']: + page += 1 + download_video_from_totallist(url, page, **kwargs) + else: + log.wtf("Fail to get the files of page " + jsonresult) + sys.exit(2) + + else: + log.wtf("Fail to parse the fav title" + url, "") + +def download_cover(url, title, **kwargs): + if re.match(r'https?://', url) is None: + url = 'https:' + url + download_urls([url], title, "jpg", 0, kwargs["output_dir"]) def bilibili_download_playlist_by_url(url, **kwargs): url = url_locations([url], faker=True)[0] @@ -403,6 +435,8 @@ def bilibili_download_playlist_by_url(url, **kwargs): elif 'favlist' in url: # this a fav list folder download_video_from_favlist(url, **kwargs) + elif 'video' in url: + download_video_from_totallist(url, 1, **kwargs) else: aid = re.search(r'av(\d+)', url).group(1) page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid))) From 470b74f3aff77168e0d11c4f7caa470bb1cac238 Mon Sep 17 00:00:00 2001 From: Vcinly Date: Sat, 17 Nov 2018 22:44:25 +0800 Subject: [PATCH 40/67] [bilibili] fixed space videos url detect --- src/you_get/extractors/bilibili.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 053b4d19..10077bf8 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -399,10 
+399,8 @@ def download_video_from_totallist(url, page, **kwargs): videotitle = videos[i]["title"] videourl = "https://www.bilibili.com/video/av{}".format(videoid) print("Start downloading ", videotitle, " video ", videotitle) - kwargs["output_dir"] = kwargs["output_dir"] + '/' + str(videoid) - download_cover(videos[i]['pic'], videotitle, **kwargs) Bilibili().download_by_url(videourl, subtitle=videotitle, **kwargs) - if page <= jsonresult['pages']: + if page < jsonresult['data']['pages']: page += 1 download_video_from_totallist(url, page, **kwargs) else: @@ -410,12 +408,7 @@ def download_video_from_totallist(url, page, **kwargs): sys.exit(2) else: - log.wtf("Fail to parse the fav title" + url, "") - -def download_cover(url, title, **kwargs): - if re.match(r'https?://', url) is None: - url = 'https:' + url - download_urls([url], title, "jpg", 0, kwargs["output_dir"]) + log.wtf("Fail to parse the video title" + url, "") def bilibili_download_playlist_by_url(url, **kwargs): url = url_locations([url], faker=True)[0] @@ -435,7 +428,7 @@ def bilibili_download_playlist_by_url(url, **kwargs): elif 'favlist' in url: # this a fav list folder download_video_from_favlist(url, **kwargs) - elif 'video' in url: + elif re.match(r'https?://space.bilibili.com/\d+/#/video', url): download_video_from_totallist(url, 1, **kwargs) else: aid = re.search(r'av(\d+)', url).group(1) From 9f68d3c37a5c98e75cd884332f92fd27d6246c82 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 18 Nov 2018 23:50:38 +0100 Subject: [PATCH 41/67] [bilibili] fix bangumi thing --- src/you_get/extractors/bilibili.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 10077bf8..9ae54640 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -265,22 +265,9 @@ class Bilibili(VideoExtractor): episode_id = frag else: episode_id = re.search(r'first_ep_id\s*=\s*"(\d+)"', self.page) or re.search(r'\/ep(\d+)', self.url).group(1) - # cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data=dict(episode_id=episode_id)) - # cid = json.loads(cont)['result']['cid'] - cont = get_content('http://bangumi.bilibili.com/web_api/episode/{}.json'.format(episode_id)) - ep_info = json.loads(cont)['result']['currentEpisode'] - - bangumi_data = get_bangumi_info(str(ep_info['seasonId'])) - bangumi_payment = bangumi_data.get('payment') - if bangumi_payment and bangumi_payment['price'] != '0': - log.w("It's a paid item") - # ep_ids = collect_bangumi_epids(bangumi_data) - - index_title = ep_info['indexTitle'] - long_title = ep_info['longTitle'].strip() - cid = ep_info['danmaku'] - - self.title = '{} [{} {}]'.format(self.title, index_title, long_title) + data = json.loads(re.search(r'__INITIAL_STATE__=(.+);\(function', self.page).group(1)) + cid = data['epInfo']['cid'] + # index_title = data['epInfo']['index_title'] self.download_by_vid(cid, bangumi=True, **kwargs) From 98d9580dd344b36c65e59652292b63552ddf21cc Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 19 Nov 2018 00:14:04 +0100 Subject: [PATCH 42/67] [common] fix google_search --- src/you_get/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 3d04e8a1..6a239154 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -1575,9 +1575,9 @@ def google_search(url): url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords) page = 
From 98d9580dd344b36c65e59652292b63552ddf21cc Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Mon, 19 Nov 2018 00:14:04 +0100
Subject: [PATCH 42/67] [common] fix google_search

---
 src/you_get/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/you_get/common.py b/src/you_get/common.py
index 3d04e8a1..6a239154 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -1575,9 +1575,9 @@ def google_search(url):
     url = 'https://www.google.com/search?tbm=vid&q=%s' % parse.quote(keywords)
     page = get_content(url, headers=fake_headers)
     videos = re.findall(
-        r'<a href="(https://www\.youtube\.com/watch\?v=[\w-]+)" onmousedown="[^"]+">([^<]+)<', page
+        r'<a href="(https://www\.youtube\.com/watch\?v=[\w-]+)"[^>]*>([^<]+)<', page
     )
-    vdurs = re.findall(r'<span class="vdur _dwc">([^<]+)<', page)
+    vdurs = re.findall(r'<span class="vdur[^"]*">([^<]+)<', page)
     durs = [r1(r'(\d+:\d+)', unescape_html(dur)) for dur in vdurs]
     print('Google Videos search:')
     for v in zip(videos, durs):

From e14f21f323c5210ab2f04a0a861d1515c2178092 Mon Sep 17 00:00:00 2001
From: bitdust
Date: Wed, 21 Nov 2018 01:56:57 +0800
Subject: [PATCH 43/67] fix bilibili title regex match '<span>' with html
 attribute

---
 src/you_get/extractors/bilibili.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index 9ae54640..079501c6 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -130,7 +130,7 @@ class Bilibili(VideoExtractor):
         m = re.search(r'<h1 title="([^"]+)">(.*?)</h1>', self.page) or re.search(r'<meta name="title" content="([^"]+)"/>', self.page)
         if m is not None:
             self.title = m.group(1)
-            s = re.search(r'<span>([^<]+)</span>', m.group(1))
+            s = re.search(r'<span.*?>([^<]+)</span>', m.group(1))
         if s:
             self.title = unescape_html(s.group(1))
         if self.title is None:

From 5946a545751ae8376beec54032ea92e2fc6e710d Mon Sep 17 00:00:00 2001
From: FengLi666
Date: Thu, 22 Nov 2018 13:45:00 +0800
Subject: [PATCH 44/67] fix acfun bangumi page

---
 src/you_get/extractors/acfun.py | 35 +++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py
index 4b45c5e9..772132fe 100644
--- a/src/you_get/extractors/acfun.py
+++ b/src/you_get/extractors/acfun.py
@@ -105,27 +105,42 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
         pass

 def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url)
-    html = get_content(url)
+    assert re.match(r'http://[^\.]*\.*acfun\.[^\.]+/(\D|bangumi)/\D\D(\d+)', url)

-    title = r1(r'data-title="([^"]+)"', html)
+    if re.match(r'http://[^\.]*\.*acfun\.[^\.]+/\D/\D\D(\d+)', url):
+        html = get_content(url)
+        title = r1(r'data-title="([^"]+)"', html)
+        if match1(url, r'_(\d+)$'):  # current P
+            title = title + " " + r1(r'active">([^<]*)', html)
+        vid = r1('data-vid="(\d+)"', html)
+        up = r1('data-name="([^"]+)"', html)
+    # bangumi
+    elif re.match("http://[^\.]*\.*acfun\.[^\.]+/bangumi/ab(\d+)", url):
+        html = get_content(url)
+        title = match1(html, r'"newTitle"\s*:\s*"([^"]+)"')
+        if match1(url, r'_(\d+)$'):  # current P
+            title = title + " " + r1(r'active">([^<]*)', html)
+        vid = match1(html, r'videoId="(\d+)"')
+        up = "acfun"
+    else:
+        raise NotImplementedError(url)
+
+    assert title and vid
     title = unescape_html(title)
     title = escape_file_path(title)
-    assert title
-    if match1(url, r'_(\d+)$'):  # current P
-        title = title + " " + r1(r'active">([^<]*)', html)
-
-    vid = r1('data-vid="(\d+)"', html)
-    up = r1('data-name="([^"]+)"', html)
     p_title = r1('active">([^<]+)', html)
     title = '%s (%s)' % (title, up)
-    if p_title: title = '%s - %s' % (title, p_title)
+    if p_title:
+        title = '%s - %s' % (title, p_title)
+
+
     acfun_download_by_vid(vid, title, output_dir=output_dir, merge=merge, info_only=info_only, **kwargs)
+
 site_info = "AcFun.tv"
 download = acfun_download
 download_playlist = playlist_not_supported('acfun')

From ab8a3a2ccff0292f38fa8f229b4cfb8784d6bcd6 Mon Sep 17 00:00:00 2001
From: URenko <18209292+URenko@users.noreply.github.com>
Date: Sun, 25 Nov 2018 20:07:52 +0800
Subject: [PATCH 45/67] fix acfun flv support

---
 src/you_get/extractors/acfun.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py
index 772132fe..200a3f54 100644
--- a/src/you_get/extractors/acfun.py
+++ b/src/you_get/extractors/acfun.py
@@ -85,9 +85,13 @@ def acfun_download_by_vid(vid, title, output_dir='.', merge=True, info_only=Fals
             _, _, seg_size = url_info(url)
             size += seg_size
         #fallback to flvhd is not quite possible
-        print_info(site_info, title, 'mp4', size)
+        if re.search(r'fid=[0-9A-Z\-]*.flv', preferred[0][0]):
+            ext = 'flv'
+        else:
+            ext = 'mp4'
+        print_info(site_info, title, ext, size)
         if not info_only:
-            download_urls(preferred[0], title, 'mp4', size, output_dir=output_dir, merge=merge)
+            download_urls(preferred[0], title, ext, size, output_dir=output_dir, merge=merge)
     else:
         raise NotImplementedError(sourceType)
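A self-contained version of the container sniff introduced in the last hunk; note the patch's pattern leaves the dot unescaped, so it would also match an id ending in, say, 'Xflv' — the sketch below escapes it:

    import re

    def guess_container(first_seg_url):
        # assumption: AcFun's CDN tags FLV segments with a "fid=<ID>.flv" query field
        return 'flv' if re.search(r'fid=[0-9A-Z\-]*\.flv', first_seg_url) else 'mp4'

    print(guess_container('http://cdn.example/seg?fid=0A1B-2C3D.flv&k=1'))  # flv
    print(guess_container('http://cdn.example/seg?fid=0A1B-2C3D.mp4&k=1'))  # mp4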
From 1f52bd01ae062c2b51511aa76cd56c939dc0e02d Mon Sep 17 00:00:00 2001
From: astronaut <519537870@qq.com>
Date: Mon, 26 Nov 2018 20:57:46 +0800
Subject: [PATCH 46/67] support bilibili audio

---
 src/you_get/extractor.py           |  2 +-
 src/you_get/extractors/bilibili.py | 78 +++++++++++++++++++++++++++++-
 2 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py
index 4c9ccaa5..8aeed560 100644
--- a/src/you_get/extractor.py
+++ b/src/you_get/extractor.py
@@ -211,7 +211,7 @@ class VideoExtractor():
             ext = self.dash_streams[stream_id]['container']
             total_size = self.dash_streams[stream_id]['size']

-        if ext == 'm3u8':
+        if ext == 'm3u8' or ext == 'm4a':
             ext = 'mp4'

         if not urls:
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index 079501c6..24821d77 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -137,7 +137,6 @@ class Bilibili(VideoExtractor):
         m = re.search(r'property="og:title" content="([^"]+)"', self.page)
         if m is not None:
             self.title = m.group(1)
-
         if 'subtitle' in kwargs:
             subtitle = kwargs['subtitle']
             self.title = '{} {}'.format(self.title, subtitle)
@@ -162,6 +161,8 @@ class Bilibili(VideoExtractor):
             self.live_entry(**kwargs)
         elif 'vc.bilibili.com' in self.url:
             self.vc_entry(**kwargs)
+        elif 'audio/au' in self.url:
+            self.audio_entry(**kwargs)
         else:
             self.entry(**kwargs)

@@ -173,6 +174,30 @@ class Bilibili(VideoExtractor):
         self.title = page_list[0]['pagename']
         self.download_by_vid(page_list[0]['cid'], True, bangumi_movie=True, **kwargs)

+    def audio_entry(self, **kwargs):
+        assert re.match(r'https?://www.bilibili.com/audio/au\d+', self.url)
+        patt = r"(\d+)"
+        audio_id = re.search(patt, self.url).group(1)
+        audio_info_url = \
+            'https://www.bilibili.com/audio/music-service-c/web/song/info?sid={}'.format(audio_id)
+        audio_info_response = json.loads(get_content(audio_info_url))
+        if audio_info_response['msg'] != 'success':
+            log.wtf('fetch audio information failed!')
+            sys.exit(2)
+        self.title = audio_info_response['data']['title']
+        # TODO: there is no quality option for now
+        audio_download_url = \
+            'https://www.bilibili.com/audio/music-service-c/web/url?sid={}&privilege=2&quality=2'.format(audio_id)
+        audio_download_response = json.loads(get_content(audio_download_url))
+        if audio_download_response['msg'] != 'success':
+            log.wtf('fetch audio resource failed!')
+            sys.exit(2)
+        self.streams['mp4'] = {}
+        self.streams['mp4']['src'] = [audio_download_response['data']['cdns'][0]]
+        self.streams['mp4']['container'] = 'm4a'
+        self.streams['mp4']['size'] = audio_download_response['data']['size']
+
+
     def entry(self, **kwargs):
         # tencent player
         tc_flashvars = re.search(r'"bili-cid=\d+&bili-aid=\d+&vid=([^"]+)"', self.page)
@@ -370,6 +395,29 @@ def download_video_from_favlist(url, **kwargs):
     else:
         log.wtf("Fail to parse the fav title" + url, "")

+def download_music_from_favlist(url, page, **kwargs):
+    m = re.search(r'https?://www.bilibili.com/audio/mycollection/(\d+)', url)
+    if m is not None:
+        sid = m.group(1)
+        json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-coll?"
+                                             "sid={}&pn={}&ps=100".format(sid, page)))
+        if json_result['msg'] == 'success':
+            music_list = json_result['data']['data']
+            music_count = len(music_list)
+            for i in range(music_count):
+                audio_id = music_list[i]['id']
+                audio_title = music_list[i]['title']
+                audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id)
+                print("Start downloading music ", audio_title)
+                Bilibili().download_by_url(audio_url, **kwargs)
+            if page < json_result['data']['pageCount']:
+                page += 1
+                download_music_from_favlist(url, page, **kwargs)
+        else:
+            log.wtf("Fail to get music list of page " + str(json_result))
+            sys.exit(2)
+    else:
+        log.wtf("Fail to parse the sid from " + url, "")

 def download_video_from_totallist(url, page, **kwargs):
     # the url has format: https://space.bilibili.com/64169458/#/video
     m = re.search(r'space\.bilibili\.com/(\d+)/.*?video', url)
     mid = ""
@@ -397,6 +445,30 @@ def download_video_from_totallist(url, page, **kwargs):
     else:
         log.wtf("Fail to parse the video title" + url, "")

+def download_music_from_totallist(url, page, **kwargs):
+    m = re.search(r'https?://www.bilibili.com/audio/am(\d+)\?type=\d', url)
+    if m is not None:
+        sid = m.group(1)
+        json_result = json.loads(get_content("https://www.bilibili.com/audio/music-service-c/web/song/of-menu?"
+                                             "sid={}&pn={}&ps=100".format(sid, page)))
+        if json_result['msg'] == 'success':
+            music_list = json_result['data']['data']
+            music_count = len(music_list)
+            for i in range(music_count):
+                audio_id = music_list[i]['id']
+                audio_title = music_list[i]['title']
+                audio_url = "https://www.bilibili.com/audio/au{}".format(audio_id)
+                print("Start downloading music ", audio_title)
+                Bilibili().download_by_url(audio_url, **kwargs)
+            if page < json_result['data']['pageCount']:
+                page += 1
+                download_music_from_totallist(url, page, **kwargs)
+        else:
+            log.wtf("Fail to get music list of page " + str(json_result))
+            sys.exit(2)
+    else:
+        log.wtf("Fail to parse the sid from " + url, "")
+
 def bilibili_download_playlist_by_url(url, **kwargs):
     url = url_locations([url], faker=True)[0]
     kwargs['playlist'] = True
@@ -417,6 +489,10 @@ def bilibili_download_playlist_by_url(url, **kwargs):
         download_video_from_favlist(url, **kwargs)
     elif re.match(r'https?://space.bilibili.com/\d+/#/video', url):
         download_video_from_totallist(url, 1, **kwargs)
+    elif re.match(r'https://www.bilibili.com/audio/mycollection/\d+', url):
+        download_music_from_favlist(url, 1, **kwargs)
+    elif re.match(r'https?://www.bilibili.com/audio/am\d+\?type=\d', url):
+        download_music_from_totallist(url, 1, **kwargs)
    else:
         aid = re.search(r'av(\d+)', url).group(1)
         page_list = json.loads(get_content('http://www.bilibili.com/widget/getPageList?aid={}'.format(aid)))
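The audio support boils down to two JSON endpoints: song/info for metadata and url for CDN mirrors. A minimal sketch, assuming the sid/privilege/quality parameters and the msg/data envelope shown in the patch:

    import json
    from urllib.request import urlopen

    def bilibili_audio(sid):
        base = 'https://www.bilibili.com/audio/music-service-c/web'
        info = json.loads(urlopen('{}/song/info?sid={}'.format(base, sid)).read().decode())
        link = json.loads(urlopen('{}/url?sid={}&privilege=2&quality=2'.format(base, sid)).read().decode())
        assert info['msg'] == 'success' and link['msg'] == 'success'
        # first CDN mirror, declared size in bytes, and the display title
        return link['data']['cdns'][0], link['data']['size'], info['data']['title']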
From 32a2e24785e835a790754eb58f3eaaf024db056a Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Fri, 30 Nov 2018 14:12:43 +0100
Subject: [PATCH 47/67] [youku] sometimes naive

---
 src/you_get/extractors/youku.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py
index e86b53b9..75a49c70 100644
--- a/src/you_get/extractors/youku.py
+++ b/src/you_get/extractors/youku.py
@@ -78,7 +78,7 @@ class Youku(VideoExtractor):
         self.api_error_code = None
         self.api_error_msg = None

-        self.ccode = '0516'
+        self.ccode = '0590'
         # Found in http://g.alicdn.com/player/ykplayer/0.5.64/youku-player.min.js
         # grep -oE '"[0-9a-zA-Z+/=]{256}"' youku-player.min.js
         self.ckey = 'DIl58SLFxFNndSV1GFNnMQVYkx1PP5tKe1siZu/86PR1u/Wh1Ptd+WOZsHHWxysSfAOhNJpdVWsdVJNsfJ8Sxd8WKVvNfAS8aS8fAOzYARzPyPc3JvtnPHjTdKfESTdnuTW6ZPvk2pNDh4uFzotgdMEFkzQ5wZVXl2Pf1/Y6hLK0OnCNxBj3+nb0v72gZ6b0td+WOZsHHWxysSo/0y9D2K42SaB8Y/+aD2K42SaB8Y/+ahU+WOZsHcrxysooUeND'
From fdb021371487955318fdec7b94cb48f483c90f76 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Fri, 30 Nov 2018 17:36:59 +0100
Subject: [PATCH 48/67] [youtube] don't fail the whole playlist

---
 src/you_get/extractors/youtube.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py
index b1a680b9..bc1bc469 100644
--- a/src/you_get/extractors/youtube.py
+++ b/src/you_get/extractors/youtube.py
@@ -144,7 +144,10 @@ class YouTube(VideoExtractor):
         for video in videos:
             vid = parse_query_param(video, 'v')
             index = parse_query_param(video, 'index')
-            self.__class__().download_by_url(self.__class__.get_url_from_vid(vid), index=index, **kwargs)
+            try:
+                self.__class__().download_by_url(self.__class__.get_url_from_vid(vid), index=index, **kwargs)
+            except:
+                pass

     def prepare(self, **kwargs):
         assert self.url or self.vid
@@ -160,7 +163,8 @@ class YouTube(VideoExtractor):
         ytplayer_config = None
         if 'status' not in video_info:
-            log.wtf('[Failed] Unknown status.')
+            log.wtf('[Failed] Unknown status.', exit_code=None)
+            raise
         elif video_info['status'] == ['ok']:
             if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']:
                 self.title = parse.unquote_plus(video_info['title'][0])
@@ -192,7 +196,8 @@ class YouTube(VideoExtractor):
                 except:
                     msg = re.search('class="message">([^<]+)<', video_page).group(1)
-                    log.wtf('[Failed] "%s"' % msg.strip())
+                    log.wtf('[Failed] "%s"' % msg.strip(), exit_code=None)
+                    raise

             if 'title' in ytplayer_config['args']:
                 # 150 Restricted from playback on certain sites
                 self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js']
                 stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
             else:
-                log.wtf('[Error] The uploader has not made this video available in your country.')
+                log.wtf('[Error] The uploader has not made this video available in your country.', exit_code=None)
+                raise

             #self.title = re.search('
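What makes the playlist resilient is the pairing above: log.wtf(..., exit_code=None) is assumed to only print instead of calling sys.exit(), and the raise that follows hands the failure to the new try/except around each playlist item. A toy repro of the control flow (all names here are stand-ins):

    class log:
        @staticmethod
        def wtf(message, exit_code=1):
            print('[error]', message)
            if exit_code is not None:     # assumed behaviour of you_get.util.log.wtf
                raise SystemExit(exit_code)

    def download_one(ok):
        if not ok:
            log.wtf('[Failed] Unknown status.', exit_code=None)  # log loudly, keep the process alive
            raise RuntimeError('broken video')                   # surface the failure to the caller

    for ok in (True, False, True):
        try:
            download_one(ok)
            print('downloaded one video')
        except Exception:
            pass  # one broken entry no longer kills the whole playlist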
Date: Fri, 30 Nov 2018 18:29:22 +0100
Subject: [PATCH 49/67] [tiktok] new site support

---
 src/you_get/common.py              |  1 +
 src/you_get/extractors/__init__.py |  3 ++-
 src/you_get/extractors/tiktok.py   | 21 +++++++++++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 src/you_get/extractors/tiktok.py

diff --git a/src/you_get/common.py b/src/you_get/common.py
index 6a239154..47893910 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -102,6 +102,7 @@ SITES = {
     'soundcloud'       : 'soundcloud',
     'ted'              : 'ted',
     'theplatform'      : 'theplatform',
+    'tiktok'           : 'tiktok',
     'tucao'            : 'tucao',
     'tudou'            : 'tudou',
     'tumblr'           : 'tumblr',
diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py
index 649a911f..302433c0 100755
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@@ -67,6 +67,7 @@ from .sohu import *
 from .soundcloud import *
 from .suntv import *
 from .theplatform import *
+from .tiktok import *
 from .tucao import *
 from .tudou import *
 from .tumblr import *
@@ -88,4 +89,4 @@ from .ted import *
 from .khan import *
 from .zhanqi import *
 from .kuaishou import *
-from .zhibo import *
\ No newline at end of file
+from .zhibo import *
diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py
new file mode 100644
index 00000000..9718abde
--- /dev/null
+++ b/src/you_get/extractors/tiktok.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+
+__all__ = ['tiktok_download']
+
+from ..common import *
+
+def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    html = get_html(url)
+    title = r1(r'<title>(.*?)</title>', html)
+    dataText = r1(r'var data = \[(.*)\] ', html)
+    data = json.loads(dataText)
+    source = 'http:' + data['video']['play_addr']['url_list'][0]
+    mime, ext, size = url_info(source)
+
+    print_info(site_info, title, mime, size)
+    if not info_only:
+        download_urls([source], title, ext, size, output_dir, merge=merge)
+
+site_info = "TikTok.com"
+download = tiktok_download
+download_playlist = playlist_not_supported('tiktok')

From 5fece0bd1cb1e68f19993e024bab968de2778d83 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Fri, 30 Nov 2018 18:54:05 +0100
Subject: [PATCH 50/67] [tiktok] more URL patterns

---
 src/you_get/extractors/tiktok.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/you_get/extractors/tiktok.py b/src/you_get/extractors/tiktok.py
index 9718abde..e9ff63ab 100644
--- a/src/you_get/extractors/tiktok.py
+++ b/src/you_get/extractors/tiktok.py
@@ -7,7 +7,9 @@ from ..common import *
 def tiktok_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
     html = get_html(url)
     title = r1(r'<title>(.*?)</title>', html)
-    dataText = r1(r'var data = \[(.*)\] ', html)
+    video_id = r1(r'/video/(\d+)', url) or r1(r'musical\?id=(\d+)', html)
+    title = '%s [%s]' % (title, video_id)
+    dataText = r1(r'var data = \[(.*)\] ', html) or r1(r'var data = (\{.*\})', html)
     data = json.loads(dataText)
     source = 'http:' + data['video']['play_addr']['url_list'][0]
     mime, ext, size = url_info(source)
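The whole TikTok extractor hinges on the page inlining its player state as a JS assignment; pulling the stream URL back out is one regex and one json.loads. A sketch against the object form matched by the second pattern above:

    import json
    import re

    def tiktok_source(html):
        # the page carries either "var data = [ {...} ] " or "var data = {...}"
        data = json.loads(re.search(r'var data = (\{.*\})', html).group(1))
        return 'http:' + data['video']['play_addr']['url_list'][0]  # URL is protocol-relative

    demo = 'var data = {"video": {"play_addr": {"url_list": ["//v.example/1.mp4"]}}}'
    print(tiktok_source(demo))  # http://v.example/1.mp4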
From 0e90b9b00053e178eab032909fa8f1af16a55f90 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Fri, 30 Nov 2018 21:51:11 +0100
Subject: [PATCH 51/67] version 0.4.1181

---
 src/you_get/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/you_get/version.py b/src/you_get/version.py
index 883b7dca..e0a79a8e 100644
--- a/src/you_get/version.py
+++ b/src/you_get/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python

 script_name = 'you-get'
-__version__ = '0.4.1167'
+__version__ = '0.4.1181'

From fe3eeacd543f2850f47cc9cbe8efe425129c3084 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Fri, 30 Nov 2018 22:01:26 +0100
Subject: [PATCH 52/67] claim to support Python 3.7

---
 you-get.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/you-get.json b/you-get.json
index 594742c2..56f8212a 100644
--- a/you-get.json
+++ b/you-get.json
@@ -25,6 +25,7 @@
         "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
         "Topic :: Internet",
         "Topic :: Internet :: WWW/HTTP",
         "Topic :: Multimedia",

From 0930e0784e59a049ae82d29f4a44ad8471cbd622 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Fri, 30 Nov 2018 22:07:04 +0100
Subject: [PATCH 53/67] update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 14500577..856f6c80 100644
--- a/README.md
+++ b/README.md
@@ -424,6 +424,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
 | 西瓜视频 | |✓| | |
 | 快手 | |✓|✓| |
 | 抖音 | |✓| | |
+| TikTok | |✓| | |
 | 中国体育(TV) | |✓| | |

 For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.

From 265818d39bcb1afae231ad108f4492b022ca9bbc Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Fri, 30 Nov 2018 22:08:38 +0100
Subject: [PATCH 54/67] .travis.yml: add 3.7-dev

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 7e772c8c..c11cbe34 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,6 +6,7 @@ python:
   - "3.4"
   - "3.5"
   - "3.6"
+  - "3.7-dev"
   - "nightly"
   - "pypy3"
 before_install:

From 45f951b9b3267279f53a956454010decbbdef0ae Mon Sep 17 00:00:00 2001
From: FengLi666
Date: Sat, 1 Dec 2018 22:34:49 +0800
Subject: [PATCH 55/67] fix bilibili bangumi page: delete out-of-date regex
 which causes errors

---
 src/you_get/extractors/bilibili.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
index 24821d77..5ed7f28d 100644
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -284,12 +284,6 @@ class Bilibili(VideoExtractor):
             self.streams['vc']['size'] = int(item['video_size'])

     def bangumi_entry(self, **kwargs):
-        bangumi_id = re.search(r'(\d+)', self.url).group(1)
-        frag = urllib.parse.urlparse(self.url).fragment
-        if frag:
-            episode_id = frag
-        else:
-            episode_id = re.search(r'first_ep_id\s*=\s*"(\d+)"', self.page) or re.search(r'\/ep(\d+)', self.url).group(1)
         data = json.loads(re.search(r'__INITIAL_STATE__=(.+);\(function', self.page).group(1))
         cid = data['epInfo']['cid']
         # index_title = data['epInfo']['index_title']

From e37291dfd86a3cb6bf780585ab51cd308bf8de26 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Wed, 5 Dec 2018 14:11:59 +0100
Subject: [PATCH 56/67] [instagram] hey ho

---
 src/you_get/extractors/instagram.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py
index 65fc01f5..6537b606 100755
--- a/src/you_get/extractors/instagram.py
+++ b/src/you_get/extractors/instagram.py
@@ -29,7 +29,7 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
             image_url = edge['node']['display_url']
             if 'video_url' in edge['node']:
                 image_url = edge['node']['video_url']
-            image_url = image_url.split('?')[0]
+            image_url = image_url.split('?')[0] # (not here: '?_nc_ht=instagram.fcph1-1.fna.fbcdn.net')
             ext = image_url.split('.')[-1]
             size = int(get_head(image_url)['Content-Length'])
             print_info(site_info, title, ext, size)
@@ -44,7 +44,7 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
         image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
         if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
             image_url =info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
-        image_url = image_url.split('?')[0]
+        image_url = image_url.split('?')[0] + '?_nc_ht=instagram.fcph1-1.fna.fbcdn.net'
         ext = image_url.split('.')[-1]
         size = int(get_head(image_url)['Content-Length'])
         print_info(site_info, title, ext, size)
From 926c7b283a640532044731d2b5ba6d700bb81702 Mon Sep 17 00:00:00 2001
From: perror <15058342792@163.com>
Date: Thu, 6 Dec 2018 00:55:08 +0800
Subject: [PATCH 57/67] fix ixigua downloading failure

---
 src/you_get/extractors/ixigua.py | 128 +++++++++++++++++++++++++++++--
 1 file changed, 123 insertions(+), 5 deletions(-)

diff --git a/src/you_get/extractors/ixigua.py b/src/you_get/extractors/ixigua.py
index 59133442..3cf07b09 100644
--- a/src/you_get/extractors/ixigua.py
+++ b/src/you_get/extractors/ixigua.py
@@ -1,14 +1,132 @@
 #!/usr/bin/env python

-__all__ = ['ixigua_download']
+import base64

-from .toutiao import download as toutiao_download
-from .toutiao import download_playlist as toutiao_download_playlist
+import binascii
+
+from ..common import *
+import random
+import ctypes
+from json import loads
+
+__all__ = ['ixigua_download', 'ixigua_download_playlist_by_url']
+
+headers = {
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 "
+                  "Safari/537.36",
+}
+
+
+def int_overflow(val):
+    maxint = 2147483647
+    if not -maxint - 1 <= val <= maxint:
+        val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
+    return val
+
+
+def unsigned_right_shitf(n, i):
+    if n < 0:
+        n = ctypes.c_uint32(n).value
+    if i < 0:
+        return -int_overflow(n << abs(i))
+    return int_overflow(n >> i)
+
+
+def get_video_url_from_video_id(video_id):
+    """Splicing URLs according to video ID to get video details"""
+    # from js
+    data = [""] * 256
+    for index, _ in enumerate(data):
+        t = index
+        for i in range(8):
+            t = -306674912 ^ unsigned_right_shitf(t, 1) if 1 & t else unsigned_right_shitf(t, 1)
+        data[index] = t
+
+    def tmp():
+        rand_num = random.random()
+        path = "/video/urls/v/1/toutiao/mp4/{video_id}?r={random_num}".format(video_id=video_id,
+                                                                              random_num=str(rand_num)[2:])
+        e = o = r = -1
+        i, a = 0, len(path)
+        while i < a:
+            e = ord(path[i])
+            i += 1
+            if e < 128:
+                r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ e)]
+            else:
+                if e < 2048:
+                    r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (192 | e >> 6 & 31))]
+                    r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
+                else:
+                    if 55296 <= e < 57344:
+                        e = (1023 & e) + 64
+                        i += 1
+                        o = 1023 & ord(path[i])
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (240 | e >> 8 & 7))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 2 & 63))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | o >> 6 & 15 | (3 & e) << 4))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & o))]
+                    else:
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (224 | e >> 12 & 15))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | e >> 6 & 63))]
+                        r = unsigned_right_shitf(r, 8) ^ data[255 & (r ^ (128 | 63 & e))]
+
+        return "https://ib.365yg.com{path}&s={param}".format(path=path, param=unsigned_right_shitf(r ^ -1, 0))
+
+    while 1:
+        url = tmp()
+        if url.split("=")[-1][0] != "-":  # the s parameter must not be negative
+            return url


 def ixigua_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
-    return toutiao_download(url.replace('ixigua', '365yg'))
+    # example url: https://www.ixigua.com/i6631065141750268420/#mid=63024814422
+    html = get_html(url, faker=True)
+    video_id = match1(html, r"videoId\s*:\s*'([^']+)'")
+    title = match1(html, r"title: '(\S+)',")
+    if not video_id:
+        log.e("video_id not found, url:{}".format(url))
+        return
+    video_info_url = get_video_url_from_video_id(video_id)
+    video_info = loads(get_content(video_info_url))
+    if video_info.get("code", 1) != 0:
+        log.e("Get video info from {} error: server return code {}".format(video_info_url, video_info.get("code", 1)))
+        return
+    if not video_info.get("data", None):
+        log.e("Get video info from {} error: The server returns JSON value"
+              " without data or data is empty".format(video_info_url))
+        return
+    if not video_info["data"].get("video_list", None):
+        log.e("Get video info from {} error: The server returns JSON value"
+              " without data.video_list or data.video_list is empty".format(video_info_url))
+        return
+    if not video_info["data"]["video_list"].get("video_1", None):
+        log.e("Get video info from {} error: The server returns JSON value"
+              " without data.video_list.video_1 or data.video_list.video_1 is empty".format(video_info_url))
+        return
+    size = int(video_info["data"]["video_list"]["video_1"]["size"])
+    print_info(site_info=site_info, title=title, type="mp4", size=size)  # this site serves mp4 files only
+    if not info_only:
+        video_url = base64.b64decode(video_info["data"]["video_list"]["video_1"]["main_url"].encode("utf-8"))
+        download_urls([video_url.decode("utf-8")], title, "mp4", size, output_dir, merge=merge, headers=headers, **kwargs)
+
+
+def ixigua_download_playlist_by_url(url, output_dir='.', merge=True, info_only=False, **kwargs):
+    assert "user" in url, "Only a user's published video list is supported. Please provide a URL like:" \
+                          " https://www.ixigua.com/c/user/6907091136/"
+
+    user_id = url.split("/")[-2] if url[-1] == "/" else url.split("/")[-1]
+    params = {"max_behot_time": "0", "max_repin_time": "0", "count": "20", "page_type": "0", "user_id": user_id}
+    while 1:
+        url = "https://www.ixigua.com/c/user/article/?" + "&".join(["{}={}".format(k, v) for k, v in params.items()])
+        video_list = loads(get_content(url, headers=headers))
+        params["max_behot_time"] = video_list["next"]["max_behot_time"]
+        for video in video_list["data"]:
+            ixigua_download("https://www.ixigua.com/i{}/".format(video["item_id"]), output_dir, merge, info_only,
+                            **kwargs)
+        if video_list["next"]["max_behot_time"] == 0:
+            break


 site_info = "ixigua.com"
 download = ixigua_download
-download_playlist = toutiao_download_playlist
+download_playlist = ixigua_download_playlist_by_url
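The 256-entry table and shift cascade above are a line-by-line port of the player JS, but structurally they are plain byte-wise CRC-32: the signed constant -306674912 is 0xEDB88320 (the reflected CRC-32 polynomial), r starts at -1 (0xFFFFFFFF), and unsigned_right_shitf(r ^ -1, 0) is the final XOR plus unsigned conversion. If that reading holds, the signing step reduces to binascii.crc32 — a cross-check sketch, not a drop-in replacement:

    import binascii
    import random

    def sign_video_path(video_id):
        # assumption: the hand-rolled loop is byte-wise CRC-32 of the UTF-8 path
        path = '/video/urls/v/1/toutiao/mp4/{}?r={}'.format(video_id, str(random.random())[2:])
        s = binascii.crc32(path.encode('utf-8'))  # unsigned in Python 3, so no retry-on-negative loop
        return 'https://ib.365yg.com{}&s={}'.format(path, s)

    print(sign_video_path('6631065141750268420'))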
From 58e806d72e57f919d3a4f9fd6a30c9691fa46903 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Wed, 5 Dec 2018 23:24:24 +0100
Subject: [PATCH 58/67] [youtube] use prefix to avoid potential namespace
 conflict (fix #2666)

---
 src/you_get/extractors/youtube.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py
index bc1bc469..9f2d2863 100644
--- a/src/you_get/extractors/youtube.py
+++ b/src/you_get/extractors/youtube.py
@@ -62,7 +62,7 @@ class YouTube(VideoExtractor):
         f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \
                 match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1))
         f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def)
-        f1def = 'function %s%s' % (f1, f1def)
+        f1def = 'function main_%s%s' % (f1, f1def)  # prefix to avoid potential namespace conflict
         code = tr_js(f1def)
         f2s = set(re.findall(r'([$\w]+)\(\w+,\d+\)', f1def))
         for f2 in f2s:
@@ -79,7 +79,7 @@ class YouTube(VideoExtractor):
         f1 = re.sub(r'(as|if|in|is|or)', r'_\1', f1)
         f1 = re.sub(r'\$', '_dollar', f1)
-        code = code + 'sig=%s(s)' % f1
+        code = code + 'sig=main_%s(s)' % f1  # prefix to avoid potential namespace conflict
         exec(code, globals(), locals())
         return locals()['sig']
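The deciphering code is transliterated JS exec'd in the extractor's scope, so an unlucky player function name could collide with an existing Python name once sanitized (issue #2666). A toy run of the sanitize-then-prefix step used above (the input names are made up):

    import re

    def pythonize(js_name):
        name = re.sub(r'(as|if|in|is|or)', r'_\1', js_name)  # dodge Python keywords
        name = re.sub(r'\$', '_dollar', name)                # '$' is only legal in JS identifiers
        return 'main_' + name                                # the namespace prefix from this patch

    print(pythonize('xy'))  # main_xy
    print(pythonize('i$'))  # main_i_dollar
    print(pythonize('or'))  # main__or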
From 7dbfece21ffbe586ae08731a0f2a86e0882c38ad Mon Sep 17 00:00:00 2001
From: lniwn
Date: Thu, 6 Dec 2018 21:24:10 +0800
Subject: [PATCH 59/67] [miaopai] fix weibo.com download error

---
 src/you_get/extractors/miaopai.py | 52 ++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/src/you_get/extractors/miaopai.py b/src/you_get/extractors/miaopai.py
index f37d45b0..b827024b 100644
--- a/src/you_get/extractors/miaopai.py
+++ b/src/you_get/extractors/miaopai.py
@@ -2,9 +2,12 @@

 __all__ = ['miaopai_download']

+import string
+import random
 from ..common import *
 import urllib.error
 import urllib.parse
+from ..util import fs

 fake_headers_mobile = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -20,6 +23,10 @@ def miaopai_download_by_fid(fid, output_dir = '.', merge = False, info_only = Fa
     mobile_page = get_content(page_url, headers=fake_headers_mobile)
     url = match1(mobile_page, r'
Date: Mon, 10 Dec 2018 17:22:52 +0100
Subject: [PATCH 61/67] [instagram] let's go

---
 src/you_get/extractors/instagram.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py
index 6537b606..9dd7207d 100755
--- a/src/you_get/extractors/instagram.py
+++ b/src/you_get/extractors/instagram.py
@@ -29,9 +29,14 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
             image_url = edge['node']['display_url']
             if 'video_url' in edge['node']:
                 image_url = edge['node']['video_url']
-            image_url = image_url.split('?')[0] # (not here: '?_nc_ht=instagram.fcph1-1.fna.fbcdn.net')
-            ext = image_url.split('.')[-1]
-            size = int(get_head(image_url)['Content-Length'])
+            try:
+                image_url = image_url.split('?')[0]
+                ext = image_url.split('.')[-1]
+                size = int(get_head(image_url)['Content-Length'])
+            except:
+                image_url = image_url.split('?')[0] + '?_nc_ht=instagram.fcph1-1.fna.fbcdn.net'
+                ext = image_url.split('.')[-1]
+                size = int(get_head(image_url)['Content-Length'])
             print_info(site_info, title, ext, size)
             if not info_only:
                 download_urls(urls=[image_url],
@@ -44,9 +49,14 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
         image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
         if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
             image_url =info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
-        image_url = image_url.split('?')[0] + '?_nc_ht=instagram.fcph1-1.fna.fbcdn.net'
-        ext = image_url.split('.')[-1]
-        size = int(get_head(image_url)['Content-Length'])
+        try:
+            image_url = image_url.split('?')[0]
+            ext = image_url.split('.')[-1]
+            size = int(get_head(image_url)['Content-Length'])
+        except:
+            image_url = image_url.split('?')[0] + '?_nc_ht=instagram.fcph1-1.fna.fbcdn.net'
+            ext = image_url.split('.')[-1]
+            size = int(get_head(image_url)['Content-Length'])
         print_info(site_info, title, ext, size)
         if not info_only:
             download_urls(urls=[image_url],
From 5d7df2eb1bd1a8bd572e9ad656696870d0f297d4 Mon Sep 17 00:00:00 2001
From: Yang Bo
Date: Sun, 16 Dec 2018 07:23:59 +0000
Subject: [PATCH 62/67] Fix zhibo.tv regular expression.

---
 src/you_get/extractors/zhibo.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/you_get/extractors/zhibo.py b/src/you_get/extractors/zhibo.py
index 4aaa293e..1d2eadea 100644
--- a/src/you_get/extractors/zhibo.py
+++ b/src/you_get/extractors/zhibo.py
@@ -37,11 +37,14 @@ def zhibo_download(url, output_dir = '.', merge = True, info_only = False, **kwa
     if is_live is not "1":
         raise ValueError("The live stream is not online! (Errno:%s)" % is_live)

-    ourStreamName = r1(r"window.ourStreamName=\'([s\S'\s\.]*)\'\;[\s\S]*window.rtmpDefaultSource", html)
-    rtmpPollUrl = r1(r"window.rtmpPollUrl=\'([s\S'\s\.]*)\'\;[\s\S]*window.hlsDefaultSource", html)
-
-    #real_url = 'rtmp://220.194.213.56/live.zhibo.tv/8live/' + ourStreamName
-    real_url = rtmpPollUrl + ourStreamName
+    match = re.search(r"""
+        ourStreamName .*?
+        '(.*?)' .*?
+        rtmpHighSource .*?
+        '(.*?)' .*?
+        '(.*?)'
+    """, html, re.S | re.X)
+    real_url = match.group(3) + match.group(1) + match.group(2)

     print_info(site_info, title, 'flv', float('inf'))
     if not info_only:
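The zhibo fix above leans on re.X so one pattern can pick the stream name, quality suffix and RTMP base out of loosely ordered JS, with re.S letting .*? cross line breaks. A demo against a made-up page snippet:

    import re

    html = ("var ourStreamName = 'room42'; var rtmpHighSource = '_hd'; "
            "var rtmpUrl = 'rtmp://edge.example/live/';")
    m = re.search(r"""
        ourStreamName .*?   # anchor on the variable name
        '(.*?)' .*?         # 1: stream name
        rtmpHighSource .*?
        '(.*?)' .*?         # 2: quality suffix
        '(.*?)'             # 3: RTMP base URL
    """, html, re.S | re.X)
    print(m.group(3) + m.group(1) + m.group(2))  # rtmp://edge.example/live/room42_hd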
From afb2db7c3c943fcc327f7bff254bece4ae5717f8 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Mon, 17 Dec 2018 12:07:56 +0100
Subject: [PATCH 63/67] version 0.4.1193

---
 src/you_get/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/you_get/version.py b/src/you_get/version.py
index e0a79a8e..e89eb41a 100644
--- a/src/you_get/version.py
+++ b/src/you_get/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python

 script_name = 'you-get'
-__version__ = '0.4.1181'
+__version__ = '0.4.1193'

From fef2298b956219a2856632199e604e380da486f0 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Tue, 18 Dec 2018 15:53:56 +0100
Subject: [PATCH 64/67] [instagram] they're forming in straight line

---
 src/you_get/extractors/instagram.py | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py
index 9dd7207d..567e0dd7 100755
--- a/src/you_get/extractors/instagram.py
+++ b/src/you_get/extractors/instagram.py
@@ -29,14 +29,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
             image_url = edge['node']['display_url']
             if 'video_url' in edge['node']:
                 image_url = edge['node']['video_url']
-            try:
-                image_url = image_url.split('?')[0]
-                ext = image_url.split('.')[-1]
-                size = int(get_head(image_url)['Content-Length'])
-            except:
-                image_url = image_url.split('?')[0] + '?_nc_ht=instagram.fcph1-1.fna.fbcdn.net'
-                ext = image_url.split('.')[-1]
-                size = int(get_head(image_url)['Content-Length'])
+            ext = image_url.split('?')[0].split('.')[-1]
+            size = int(get_head(image_url)['Content-Length'])
+
             print_info(site_info, title, ext, size)
             if not info_only:
                 download_urls(urls=[image_url],
@@ -49,14 +44,9 @@ def instagram_download(url, output_dir='.', merge=True, info_only=False, **kwarg
         image_url = info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['display_url']
         if 'video_url' in info['entry_data']['PostPage'][0]['graphql']['shortcode_media']:
             image_url =info['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url']
-        try:
-            image_url = image_url.split('?')[0]
-            ext = image_url.split('.')[-1]
-            size = int(get_head(image_url)['Content-Length'])
-        except:
-            image_url = image_url.split('?')[0] + '?_nc_ht=instagram.fcph1-1.fna.fbcdn.net'
-            ext = image_url.split('.')[-1]
-            size = int(get_head(image_url)['Content-Length'])
+        ext = image_url.split('?')[0].split('.')[-1]
+        size = int(get_head(image_url)['Content-Length'])
+
         print_info(site_info, title, ext, size)
         if not info_only:
             download_urls(urls=[image_url],

From 98fedfb2a277a2c4e77fc85adc3865025bc696f2 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Tue, 18 Dec 2018 18:20:01 +0100
Subject: [PATCH 65/67] [miaopai] handle weibo.com/tv/v URLs

---
 src/you_get/extractors/miaopai.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/you_get/extractors/miaopai.py b/src/you_get/extractors/miaopai.py
index b827024b..304ac176 100644
--- a/src/you_get/extractors/miaopai.py
+++ b/src/you_get/extractors/miaopai.py
@@ -67,7 +67,10 @@ def miaopai_download_by_wbmp(wbmp_url, fid, info_only=False, **kwargs):

 def miaopai_download_direct(url, info_only, **kwargs):
     mobile_page = get_content(url, headers=fake_headers_mobile)
-    title = re.search(r'([\'"])title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
+    try:
+        title = re.search(r'([\'"])title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
+    except:
+        title = re.search(r'([\'"])status_title\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
     title = title.replace('\n', '_')
     stream_url = re.search(r'([\'"])stream_url\1:\s*([\'"])(.+?)\2,', mobile_page).group(3)
     ext = 'mp4'
@@ -78,6 +81,9 @@ def miaopai_download_direct(url, info_only, **kwargs):

 # ----------------------------------------------------------------------
 def miaopai_download(url, output_dir = '.', merge = False, info_only = False, **kwargs):
+    if match1(url, r'weibo\.com/tv/v/(\w+)'):
+        return miaopai_download_direct(url, info_only=info_only, output_dir=output_dir, merge=merge, **kwargs)
+
     fid = match1(url, r'\?fid=(\d{4}:\w+)')
     if fid is not None:
         miaopai_download_by_fid(fid, output_dir, merge, info_only)

From b8470667568d723265ae1414b07be7c8cfdaa947 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Fri, 21 Dec 2018 15:46:11 +0100
Subject: [PATCH 66/67] [naver] fix #2671

---
 src/you_get/extractors/naver.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/you_get/extractors/naver.py b/src/you_get/extractors/naver.py
index b9eef8d7..add884e9 100644
--- a/src/you_get/extractors/naver.py
+++ b/src/you_get/extractors/naver.py
@@ -16,10 +16,15 @@ def naver_download_by_url(url, output_dir='.', merge=True, info_only=False, **kw
     ep = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'
     page = get_content(url)
     try:
-        og_video_url = re.search(r"<meta\s+property=\"og:video:url\"\s+content=\"(.+?)\">", page).group(1)
-        params_dict = urllib.parse.parse_qs(urllib.parse.urlparse(og_video_url).query)
-        vid = params_dict['vid'][0]
-        key = params_dict['outKey'][0]
+        temp = re.search(r"<meta\s+property=\"og:video:url\"\s+content=\"(.+?)\">", page)
+        if temp is not None:
+            og_video_url = temp.group(1)
+            params_dict = urllib.parse.parse_qs(urllib.parse.urlparse(og_video_url).query)
+            vid = params_dict['vid'][0]
+            key = params_dict['outKey'][0]
+        else:
+            vid = re.search(r"\"videoId\"\s*:\s*\"(.+?)\"", page).group(1)
+            key = re.search(r"\"inKey\"\s*:\s*\"(.+?)\"", page).group(1)
         meta_str = get_content(ep.format(vid, key))
         meta_json = json.loads(meta_str)
         if 'errorCode' in meta_json:

From 4e98f7bcae333ad974a940bbd8fdb540cc9e1e9e Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Sat, 22 Dec 2018 23:14:30 +0100
Subject: [PATCH 67/67] [tumblr] always download the high res (1280) version
 of images

---
 src/you_get/extractors/tumblr.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py
index d63aee72..91b348fc 100644
--- a/src/you_get/extractors/tumblr.py
+++ b/src/you_get/extractors/tumblr.py
@@ -49,17 +49,18 @@ def tumblr_download(url, output_dir='.', merge=True, info_only=False, **kwargs):

     tuggles = {}
     for url in urls:
-        filename = parse.unquote(url.split('/')[-1])
+        hd_url = r1(r'(.+)_\d+\.jpg$', url) + '_1280.jpg'  # FIXME: decide actual quality
+        filename = parse.unquote(hd_url.split('/')[-1])
         title = '.'.join(filename.split('.')[:-1])
         tumblr_id = r1(r'^tumblr_(.+)_\d+$', title)
         quality = int(r1(r'^tumblr_.+_(\d+)$', title))
         ext = filename.split('.')[-1]
         try:
-            size = int(get_head(url)['Content-Length'])
+            size = int(get_head(hd_url)['Content-Length'])
             if tumblr_id not in tuggles or tuggles[tumblr_id]['quality'] < quality:
                 tuggles[tumblr_id] = {
                     'title': title,
-                    'url': url,
+                    'url': hd_url,
                     'quality': quality,
                     'ext': ext,
                     'size': size,
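That FIXME is worth taking seriously: r1 returns None whenever a photo URL is not a _NNN-suffixed .jpg, and None + '_1280.jpg' raises TypeError. A defensive variant of the same rewrite (the extension list is an assumption):

    import re

    def to_hd_url(url):
        # rewrite ..._500.jpg to ..._1280.jpg, but keep the original URL rather
        # than crash on .png/.gif or on filenames without a size suffix
        m = re.match(r'(.+)_\d+\.(jpg|jpeg|png|gif)$', url)
        return '{}_1280.{}'.format(m.group(1), m.group(2)) if m else url

    print(to_hd_url('https://media.example/tumblr_abc_500.jpg'))  # ..._1280.jpg
    print(to_hd_url('https://media.example/cover.webp'))          # unchanged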