From 4cb24e07d2d09a5c2a6df2d72eb03c8a915e9154 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 17 Sep 2012 02:33:01 +0200 Subject: [PATCH] fix iambus/youku-lixian#11: download .ts files for iQIYI --- you_get/common.py | 107 ++++++++++++++++++++++++++++++++++-- you_get/downloader/iqiyi.py | 16 ++---- 2 files changed, 107 insertions(+), 16 deletions(-) diff --git a/you_get/common.py b/you_get/common.py index 7ab9cf42..ef1f7318 100644 --- a/you_get/common.py +++ b/you_get/common.py @@ -119,7 +119,7 @@ def url_info(url, faker = False): 'video/3gpp': '3gp', 'video/f4v': 'flv', 'video/mp4': 'mp4', - 'video/mp2t': 'ts', + 'video/MP2T': 'ts', 'video/webm': 'webm', 'video/x-flv': 'flv' } @@ -212,6 +212,64 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False): os.remove(filepath) # on Windows rename could fail if destination filepath exists os.rename(temp_filepath, filepath) +def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = False): + if os.path.exists(filepath): + if not force: + if not is_part: + if bar: + bar.done() + print('Skipping %s: file already exists' % tr(os.path.basename(filepath))) + else: + if bar: + bar.update_received(os.path.getsize(filepath)) + return + else: + if not is_part: + if bar: + bar.done() + print('Overwriting %s' % tr(os.path.basename(filepath)), '...') + elif not os.path.exists(os.path.dirname(filepath)): + os.mkdir(os.path.dirname(filepath)) + + temp_filepath = filepath + '.download' + received = 0 + if not force: + open_mode = 'ab' + + if os.path.exists(temp_filepath): + received += os.path.getsize(temp_filepath) + if bar: + bar.update_received(os.path.getsize(temp_filepath)) + else: + open_mode = 'wb' + + if faker: + headers = fake_headers + else: + headers = {} + if received: + headers['Range'] = 'bytes=' + str(received) + '-' + if refer: + headers['Referer'] = refer + + response = request.urlopen(request.Request(url, headers = headers), None) + + with open(temp_filepath, open_mode) as output: + while True: + buffer = response.read(1024 * 256) + if not buffer: + break + output.write(buffer) + received += len(buffer) + if bar: + bar.update_received(len(buffer)) + + assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (received, os.path.getsize(temp_filepath)) + + if os.access(filepath, os.W_OK): + os.remove(filepath) # on Windows rename could fail if destination filepath exists + os.rename(temp_filepath, filepath) + class SimpleProgressBar: def __init__(self, total_size, total_pieces = 1): self.displayed = False @@ -289,7 +347,7 @@ class DummyProgressBar: def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, merge = True, faker = False): assert urls - assert ext in ('3gp', 'flv', 'mp4', 'ts', 'webm') + assert ext in ('3gp', 'flv', 'mp4', 'webm') if not total_size: try: total_size = urls_size(urls) @@ -339,7 +397,46 @@ def download_urls(urls, title, ext, total_size, output_dir = '.', refer = None, concat_mp4(parts, os.path.join(output_dir, title + '.mp4')) for part in parts: os.remove(part) - elif ext == 'ts': + else: + print("Can't merge %s files" % ext) + + print() + +def download_urls_chunked(urls, title, ext, total_size, output_dir = '.', refer = None, merge = True, faker = False): + assert urls + assert ext in ('ts') + title = escape_file_path(title) + filename = '%s.%s' % (title, ext) + filepath = os.path.join(output_dir, filename) + if total_size: + if not force and os.path.exists(filepath) and os.path.getsize(filepath) >= total_size * 0.9: + print('Skipping %s: file already exists' % tr(filepath)) + print() + return + bar = SimpleProgressBar(total_size, len(urls)) + else: + bar = PiecesProgressBar(total_size, len(urls)) + + if len(urls) == 1: + url = urls[0] + print('Downloading %s ...' % tr(filename)) + url_save_chunked(url, filepath, bar, refer = refer, faker = faker) + bar.done() + else: + parts = [] + print('Downloading %s.%s ...' % (tr(title), ext)) + for i, url in enumerate(urls): + filename = '%s[%02d].%s' % (title, i, ext) + filepath = os.path.join(output_dir, filename) + parts.append(filepath) + #print 'Downloading %s [%s/%s]...' % (tr(filename), i + 1, len(urls)) + bar.update_piece(i + 1) + url_save_chunked(url, filepath, bar, refer = refer, is_part = True, faker = faker) + bar.done() + if not merge: + print() + return + if ext == 'ts': from .processor.join_ts import concat_ts concat_ts(parts, os.path.join(output_dir, title + '.ts')) for part in parts: @@ -362,7 +459,7 @@ def print_info(site_info, title, type, size): elif type in ['mp4']: type = 'video/mp4' elif type in ['ts']: - type = 'video/mp2t' + type = 'video/MP2T' elif type in ['webm']: type = 'video/webm' @@ -372,7 +469,7 @@ def print_info(site_info, title, type, size): type_info = "Flash video (%s)" % type elif type in ['video/mp4', 'video/x-m4v']: type_info = "MPEG-4 video (%s)" % type - elif type in ['video/mp2t']: + elif type in ['video/MP2T']: type_info = "MPEG-2 transport stream (%s)" % type elif type in ['video/webm']: type_info = "WebM video (%s)" % type diff --git a/you_get/downloader/iqiyi.py b/you_get/downloader/iqiyi.py index 4ecbf018..d0c87ef3 100644 --- a/you_get/downloader/iqiyi.py +++ b/you_get/downloader/iqiyi.py @@ -4,13 +4,6 @@ __all__ = ['iqiyi_download'] from ..common import * -import re - -def real_url(url): - import time - import json - return json.loads(get_html(url[:-3] + 'hml?v=' + str(int(time.time()) + 1921658928)))['l'] # XXX: what is 1921658928? - def iqiyi_download(url, output_dir = '.', merge = True, info_only = False): html = get_html(url) #title = r1(r'title\s*:\s*"([^"]+)"', html) @@ -23,18 +16,19 @@ def iqiyi_download(url, output_dir = '.', merge = True, info_only = False): assert videoId info_url = 'http://cache.video.qiyi.com/v/%s' % videoId info_xml = get_html(info_url) - + from xml.dom.minidom import parseString doc = parseString(info_xml) title = doc.getElementsByTagName('title')[0].firstChild.nodeValue size = int(doc.getElementsByTagName('totalBytes')[0].firstChild.nodeValue) urls = [n.firstChild.nodeValue for n in doc.getElementsByTagName('file')] assert urls[0].endswith('.f4v'), urls[0] - #urls = map(real_url, urls) + tss = ["http://61.155.192.31/videos2" + url[33:-4] + ".ts" for url in urls] # use MPEG-TS .ts files temporarily + # FIXME: find the key to access .f4v files - print_info(site_info, title, 'flv', size) + print_info(site_info, title, 'ts', size) if not info_only: - download_urls(urls, title, 'flv', size, output_dir = output_dir, merge = merge) + download_urls_chunked(tss, title, 'ts', size, output_dir = output_dir, merge = merge) # use MPEG-TS .ts files temporarily site_info = "iQIYI.com" download = iqiyi_download