From 89844858199bfc3b3a3317e686e5982c74949777 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 11 Sep 2018 17:31:47 +0200 Subject: [PATCH] [youtube] faster than light --- src/you_get/common.py | 137 ++++++++++++++++-------------- src/you_get/extractors/youtube.py | 26 +++++- 2 files changed, 97 insertions(+), 66 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index b19d602f..d212b62b 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -602,7 +602,12 @@ def url_save( # the key must be 'Referer' for the hack here if refer is not None: tmp_headers['Referer'] = refer - file_size = url_size(url, faker=faker, headers=tmp_headers) + if type(url) is list: + file_size = urls_size(url, faker=faker, headers=tmp_headers) + is_chunked, urls = True, url + else: + file_size = url_size(url, faker=faker, headers=tmp_headers) + is_chunked, urls = False, [url] continue_renameing = True while continue_renameing: @@ -655,70 +660,78 @@ def url_save( else: open_mode = 'wb' - if received < file_size: - if faker: - tmp_headers = fake_headers - ''' - if parameter headers passed in, we have it copied as tmp_header - elif headers: - headers = headers - else: - headers = {} - ''' - if received: - tmp_headers['Range'] = 'bytes=' + str(received) + '-' - if refer: - tmp_headers['Referer'] = refer + for url in urls: + received_chunk = 0 + if received < file_size: + if faker: + tmp_headers = fake_headers + ''' + if parameter headers passed in, we have it copied as tmp_header + elif headers: + headers = headers + else: + headers = {} + ''' + if received and not is_chunked: # only request a range when not chunked + tmp_headers['Range'] = 'bytes=' + str(received) + '-' + if refer: + tmp_headers['Referer'] = refer - if timeout: - response = urlopen_with_retry( - request.Request(url, headers=tmp_headers), timeout=timeout - ) - else: - response = urlopen_with_retry( - request.Request(url, headers=tmp_headers) - ) - try: - range_start = int( - response.headers[ - 'content-range' - ][6:].split('/')[0].split('-')[0] - ) - end_length = int( - response.headers['content-range'][6:].split('/')[1] - ) - range_length = end_length - range_start - except: - content_length = response.headers['content-length'] - range_length = int(content_length) if content_length is not None \ - else float('inf') + if timeout: + response = urlopen_with_retry( + request.Request(url, headers=tmp_headers), timeout=timeout + ) + else: + response = urlopen_with_retry( + request.Request(url, headers=tmp_headers) + ) + try: + range_start = int( + response.headers[ + 'content-range' + ][6:].split('/')[0].split('-')[0] + ) + end_length = int( + response.headers['content-range'][6:].split('/')[1] + ) + range_length = end_length - range_start + except: + content_length = response.headers['content-length'] + range_length = int(content_length) if content_length is not None \ + else float('inf') - if file_size != received + range_length: - received = 0 - if bar: - bar.received = 0 - open_mode = 'wb' - - with open(temp_filepath, open_mode) as output: - while True: - buffer = None - try: - buffer = response.read(1024 * 256) - except socket.timeout: - pass - if not buffer: - if received == file_size: # Download finished - break - # Unexpected termination. Retry request - tmp_headers['Range'] = 'bytes=' + str(received) + '-' - response = urlopen_with_retry( - request.Request(url, headers=tmp_headers) - ) - continue - output.write(buffer) - received += len(buffer) + if is_chunked: # always append if chunked + open_mode = 'ab' + elif file_size != received + range_length: # is it ever necessary? + received = 0 if bar: - bar.update_received(len(buffer)) + bar.received = 0 + open_mode = 'wb' + + with open(temp_filepath, open_mode) as output: + while True: + buffer = None + try: + buffer = response.read(1024 * 256) + except socket.timeout: + pass + if not buffer: + if is_chunked and received_chunk == range_length: + break + elif not is_chunked and received == file_size: # Download finished + break + # Unexpected termination. Retry request + if not is_chunked: # when + tmp_headers['Range'] = 'bytes=' + str(received) + '-' + response = urlopen_with_retry( + request.Request(url, headers=tmp_headers) + ) + continue + output.write(buffer) + received += len(buffer) + received_chunk += len(buffer) + if bar: + bar.update_received(len(buffer)) assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % ( received, os.path.getsize(temp_filepath), temp_filepath diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 5482f1e4..19864590 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -81,6 +81,16 @@ class YouTube(VideoExtractor): exec(code, globals(), locals()) return locals()['sig'] + def chunk_by_range(url, size): + urls = [] + chunk_size = 10485760 + start, end = 0, chunk_size - 1 + urls.append('%s&range=%s-%s' % (url, start, end)) + while end + 1 < size: # processed size < expected size + start, end = end + 1, end + chunk_size + urls.append('%s&range=%s-%s' % (url, start, end)) + return urls + def get_url_from_vid(vid): return 'https://youtu.be/{}'.format(vid) @@ -290,13 +300,15 @@ class YouTube(VideoExtractor): if not dash_size: try: dash_size = url_size(dash_url) except: continue + dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) + dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size)) self.dash_streams[itag] = { 'quality': '%sx%s' % (w, h), 'itag': itag, 'type': mimeType, 'mime': mimeType, 'container': 'mp4', - 'src': [dash_url, dash_mp4_a_url], + 'src': [dash_urls, dash_mp4_a_urls], 'size': int(dash_size) + int(dash_mp4_a_size) } elif mimeType == 'video/webm': @@ -310,13 +322,15 @@ class YouTube(VideoExtractor): if not dash_size: try: dash_size = url_size(dash_url) except: continue + dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) + dash_webm_a_urls = self.__class__.chunk_by_range(dash_webm_a_url, int(dash_webm_a_size)) self.dash_streams[itag] = { 'quality': '%sx%s' % (w, h), 'itag': itag, 'type': mimeType, 'mime': mimeType, 'container': 'webm', - 'src': [dash_url, dash_webm_a_url], + 'src': [dash_urls, dash_webm_a_urls], 'size': int(dash_size) + int(dash_webm_a_size) } except: @@ -353,13 +367,15 @@ class YouTube(VideoExtractor): dash_url += '&signature={}'.format(sig) dash_size = stream['clen'] itag = stream['itag'] + dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) + dash_mp4_a_urls = self.__class__.chunk_by_range(dash_mp4_a_url, int(dash_mp4_a_size)) self.dash_streams[itag] = { 'quality': stream['size'], 'itag': itag, 'type': mimeType, 'mime': mimeType, 'container': 'mp4', - 'src': [dash_url, dash_mp4_a_url], + 'src': [dash_urls, dash_mp4_a_urls], 'size': int(dash_size) + int(dash_mp4_a_size) } elif stream['type'].startswith('video/webm'): @@ -378,13 +394,15 @@ class YouTube(VideoExtractor): except UnboundLocalError as e: audio_url = dash_mp4_a_url audio_size = int(dash_mp4_a_size) + dash_urls = self.__class__.chunk_by_range(dash_url, int(dash_size)) + audio_urls = self.__class__.chunk_by_range(audio_url, int(audio_size)) self.dash_streams[itag] = { 'quality': stream['size'], 'itag': itag, 'type': mimeType, 'mime': mimeType, 'container': 'webm', - 'src': [dash_url, audio_url], + 'src': [dash_urls, audio_urls], 'size': int(dash_size) + int(audio_size) }