From 8674f43c36055fd1ceb06844a3bb445637e12271 Mon Sep 17 00:00:00 2001 From: zhangchao Date: Tue, 27 Sep 2016 23:04:01 +0800 Subject: [PATCH 001/765] fix bug about renaming videos when downloading with ffmpeg --- src/you_get/common.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/you_get/common.py b/src/you_get/common.py index 7f76aaac..c2b585a6 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -916,6 +916,11 @@ def download_url_ffmpeg(url,title, ext,params={}, total_size=0, output_dir='.', from .processor.ffmpeg import has_ffmpeg_installed, ffmpeg_download_stream assert has_ffmpeg_installed(), "FFmpeg not installed." + global output_filename + if(output_filename) + dotPos = output_filename.rfind(".") + title = output_filename[:dotPos] + ext = output_filename[dotPos+1:] ffmpeg_download_stream(url, title, ext, params, output_dir) def playlist_not_supported(name): From 4bbafeb9e48e76b7b622f2133685905b362a9096 Mon Sep 17 00:00:00 2001 From: liushuyu Date: Thu, 20 Oct 2016 13:09:30 -0600 Subject: [PATCH 002/765] icourse: add supprt --- src/you_get/common.py | 1 + src/you_get/extractors/__init__.py | 1 + src/you_get/extractors/icourses.py | 129 +++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+) create mode 100644 src/you_get/extractors/icourses.py diff --git a/src/you_get/common.py b/src/you_get/common.py index 948b0ca2..ca867673 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -29,6 +29,7 @@ SITES = { 'huaban' : 'huaban', 'huomao' : 'huomaotv', 'iask' : 'sina', + 'icourses' : 'icourses', 'ifeng' : 'ifeng', 'imgur' : 'imgur', 'in' : 'alive', diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index e69bc2fd..61b6a0d1 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -24,6 +24,7 @@ from .funshion import * from .google import * from .heavymusic import * from .huaban import * +from .icourses import * from .ifeng import * from .imgur import 
* from .infoq import * diff --git a/src/you_get/extractors/icourses.py b/src/you_get/extractors/icourses.py new file mode 100644 index 00000000..5f9b8edf --- /dev/null +++ b/src/you_get/extractors/icourses.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +from ..common import * +from urllib import parse +import xml.etree.ElementTree as ET +import datetime +import hashlib +import base64 +import logging +from urllib import error +import re + +__all__ = ['icourses_download'] + + +def icourses_download(url, info_only, merge=False, output_dir='.', **kwargs): + title, real_url = icourses_cn_url_parser( + url, info_only=info_only, **kwargs) + if real_url is not None: + for tries in range(0, 3): + try: + _, type_, size = url_info(real_url, faker=True) + break + except error.HTTPError: + logging.warning('Failed to fetch the video file! Retrying...') + title, real_url = icourses_cn_url_parser(url) + print_info(site_info, title, type_, size) + if not info_only: + download_urls([real_url], title, 'flv', + total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True) + + +def icourses_playlist_download(url, **kwargs): + import random + from time import sleep + html = get_content(url) + page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' + video_js_number = r'changeforvideo\((.*?)\)' + fs_flag = r'' + page_navi_vars = re.search(pattern=page_type_patt, string=html) + dummy_page = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'.format( + page_navi_vars.group(2), page_navi_vars.group(1)) + html = get_content(dummy_page) + fs_status = match1(html, fs_flag) + video_list = re.findall(pattern=video_js_number, string=html) + for video in video_list: + video_args = video.replace('\'', '').split(',') + video_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}&firstShowFlag={}'.format( + video_args[0], video_args[1], fs_status or '1') + sleep(random.Random().randint(0, 5)) # Prevent from blockage + 
icourses_download(url=video_url, **kwargs) + + +def icourses_cn_url_parser(url, **kwargs): + PLAYER_BASE_VER = '150606-1' + ENCRYPT_MOD_VER = '151020' + ENCRYPT_SALT = '3DAPmXsZ4o' # It took really long time to find this... + html = get_content(url) + if re.search(pattern=r'showSectionNode\(.*\)', string=html): + logging.warning('Switching to playlist mode!') + return icourses_playlist_download(url, **kwargs) + flashvars_patt = r'var\ flashvars\=((.|\n)*)};' + server_time_patt = r'MPlayer.swf\?v\=(\d+)' + uuid_patt = r'uuid:(\d+)' + other_args_patt = r'other:"(.*)"' + res_url_patt = r'IService:\'([^\']+)' + title_a_patt = r'
(.*?)' + title_b_patt = r'
((.|\n)*?)
' + title_a = match1(html, title_a_patt).strip() + title_b = match1(html, title_b_patt).strip() + title = title_a + title_b # WIP, FIXME + title = re.sub('( +|\n|\t|\r|\ \;)', '', + unescape_html(title).replace(' ', '')) + server_time = match1(html, server_time_patt) + flashvars = match1(html, flashvars_patt) + uuid = match1(flashvars, uuid_patt) + other_args = match1(flashvars, other_args_patt) + res_url = match1(flashvars, res_url_patt) + url_parts = {'v': server_time, 'other': other_args, + 'uuid': uuid, 'IService': res_url} + req_url = '%s?%s' % (res_url, parse.urlencode(url_parts)) + logging.debug('Requesting video resource location...') + xml_resp = get_html(req_url) + xml_obj = ET.fromstring(xml_resp) + logging.debug('The result was {}'.format(xml_obj.get('status'))) + if xml_obj.get('status') != 'success': + raise ValueError('Server returned error!') + common_args = {'lv': PLAYER_BASE_VER, 'ls': 'play', + 'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'), + 'start': 0} + media_host = xml_obj.find(".//*[@name='host']").text + media_url = media_host + xml_obj.find(".//*[@name='url']").text + # This is what they called `SSLModule`... But obviously, just a kind of + # encryption, takes absolutely no effect in protecting data intergrity + if xml_obj.find(".//*[@name='ssl']").text != 'true': + logging.debug('The encryption mode is disabled') + # when the so-called `SSLMode` is not activated, the parameters, `h` + # and `p` can be found in response + arg_h = xml_obj.find(".//*[@name='h']").text + assert arg_h + arg_r = xml_obj.find(".//*[@name='p']").text or ENCRYPT_MOD_VER + url_args = common_args.copy() + url_args.update({'h': arg_h, 'r': arg_r}) + final_url = '{}?{}'.format( + media_url, parse.urlencode(url_args)) + return title, final_url + # when the `SSLMode` is activated, we need to receive the timestamp and the + # time offset (?) 
value from the server + logging.debug('The encryption mode is in effect') + ssl_callback = get_html('{}/ssl/ssl.shtml'.format(media_host)).split(',') + ssl_timestamp = int(datetime.datetime.strptime( + ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0])) + sign_this = ENCRYPT_SALT + \ + parse.urlparse(media_url).path + str(ssl_timestamp) + arg_h = base64.b64encode(hashlib.md5(bytes(sign_this, 'utf-8')).digest()) + # Post-processing, may subject to change, so leaving this alone... + arg_h = arg_h.decode('utf-8').strip('=').replace('+', + '-').replace('/', '_') + arg_r = ssl_timestamp + url_args = common_args.copy() + url_args.update({'h': arg_h, 'r': arg_r, 'p': ENCRYPT_MOD_VER}) + final_url = '{}?{}'.format( + media_url, parse.urlencode(url_args)) + logging.debug('Concat`ed URL: {}'.format(final_url)) + return title, final_url + + +site_info = 'icourses.cn' +download = icourses_download +download_playlist = icourses_playlist_download From 5351121186c2c8c94bc7b24419ea5ca305582462 Mon Sep 17 00:00:00 2001 From: liushuyu Date: Tue, 25 Oct 2016 12:52:30 -0600 Subject: [PATCH 003/765] icouses: Code clean up --- src/you_get/extractors/icourses.py | 197 +++++++++++++++-------------- 1 file changed, 105 insertions(+), 92 deletions(-) diff --git a/src/you_get/extractors/icourses.py b/src/you_get/extractors/icourses.py index 5f9b8edf..5c2f8cda 100644 --- a/src/you_get/extractors/icourses.py +++ b/src/you_get/extractors/icourses.py @@ -13,8 +13,9 @@ __all__ = ['icourses_download'] def icourses_download(url, info_only, merge=False, output_dir='.', **kwargs): - title, real_url = icourses_cn_url_parser( - url, info_only=info_only, **kwargs) + icourses_parser = ICousesExactor(url=url) + real_url = icourses_parser.icourses_cn_url_parser(**kwargs) + title = icourses_parser.title if real_url is not None: for tries in range(0, 3): try: @@ -22,108 +23,120 @@ def icourses_download(url, info_only, merge=False, output_dir='.', **kwargs): break except error.HTTPError: 
logging.warning('Failed to fetch the video file! Retrying...') - title, real_url = icourses_cn_url_parser(url) + real_url = icourses_parser.icourses_cn_url_parser() + title = icourses_parser.title print_info(site_info, title, type_, size) if not info_only: download_urls([real_url], title, 'flv', total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True) -def icourses_playlist_download(url, **kwargs): - import random - from time import sleep - html = get_content(url) - page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' - video_js_number = r'changeforvideo\((.*?)\)' - fs_flag = r'' - page_navi_vars = re.search(pattern=page_type_patt, string=html) - dummy_page = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'.format( - page_navi_vars.group(2), page_navi_vars.group(1)) - html = get_content(dummy_page) - fs_status = match1(html, fs_flag) - video_list = re.findall(pattern=video_js_number, string=html) - for video in video_list: - video_args = video.replace('\'', '').split(',') - video_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}&firstShowFlag={}'.format( - video_args[0], video_args[1], fs_status or '1') - sleep(random.Random().randint(0, 5)) # Prevent from blockage - icourses_download(url=video_url, **kwargs) +# Why not using VideoExtractor: This site needs specical download method +class ICousesExactor(object): + def __init__(self, url): + self.url = url + self.title = '' + return -def icourses_cn_url_parser(url, **kwargs): - PLAYER_BASE_VER = '150606-1' - ENCRYPT_MOD_VER = '151020' - ENCRYPT_SALT = '3DAPmXsZ4o' # It took really long time to find this... 
- html = get_content(url) - if re.search(pattern=r'showSectionNode\(.*\)', string=html): - logging.warning('Switching to playlist mode!') - return icourses_playlist_download(url, **kwargs) - flashvars_patt = r'var\ flashvars\=((.|\n)*)};' - server_time_patt = r'MPlayer.swf\?v\=(\d+)' - uuid_patt = r'uuid:(\d+)' - other_args_patt = r'other:"(.*)"' - res_url_patt = r'IService:\'([^\']+)' - title_a_patt = r'
(.*?)' - title_b_patt = r'
((.|\n)*?)
' - title_a = match1(html, title_a_patt).strip() - title_b = match1(html, title_b_patt).strip() - title = title_a + title_b # WIP, FIXME - title = re.sub('( +|\n|\t|\r|\ \;)', '', - unescape_html(title).replace(' ', '')) - server_time = match1(html, server_time_patt) - flashvars = match1(html, flashvars_patt) - uuid = match1(flashvars, uuid_patt) - other_args = match1(flashvars, other_args_patt) - res_url = match1(flashvars, res_url_patt) - url_parts = {'v': server_time, 'other': other_args, - 'uuid': uuid, 'IService': res_url} - req_url = '%s?%s' % (res_url, parse.urlencode(url_parts)) - logging.debug('Requesting video resource location...') - xml_resp = get_html(req_url) - xml_obj = ET.fromstring(xml_resp) - logging.debug('The result was {}'.format(xml_obj.get('status'))) - if xml_obj.get('status') != 'success': - raise ValueError('Server returned error!') - common_args = {'lv': PLAYER_BASE_VER, 'ls': 'play', - 'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'), - 'start': 0} - media_host = xml_obj.find(".//*[@name='host']").text - media_url = media_host + xml_obj.find(".//*[@name='url']").text - # This is what they called `SSLModule`... 
But obviously, just a kind of - # encryption, takes absolutely no effect in protecting data intergrity - if xml_obj.find(".//*[@name='ssl']").text != 'true': - logging.debug('The encryption mode is disabled') - # when the so-called `SSLMode` is not activated, the parameters, `h` - # and `p` can be found in response - arg_h = xml_obj.find(".//*[@name='h']").text - assert arg_h - arg_r = xml_obj.find(".//*[@name='p']").text or ENCRYPT_MOD_VER + def icourses_playlist_download(self, **kwargs): + import random + from time import sleep + html = get_content(url) + page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' + video_js_number = r'changeforvideo\((.*?)\)' + fs_flag = r'' + page_navi_vars = re.search(pattern=page_type_patt, string=html) + dummy_page = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'.format( + page_navi_vars.group(2), page_navi_vars.group(1)) + html = get_content(dummy_page) + fs_status = match1(html, fs_flag) + video_list = re.findall(pattern=video_js_number, string=html) + for video in video_list: + video_args = video.replace('\'', '').split(',') + video_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}&firstShowFlag={}'.format( + video_args[0], video_args[1], fs_status or '1') + sleep(random.Random().randint(0, 5)) # Prevent from blockage + icourses_download(video_url, **kwargs) + + def icourses_cn_url_parser(self, **kwargs): + PLAYER_BASE_VER = '150606-1' + ENCRYPT_MOD_VER = '151020' + ENCRYPT_SALT = '3DAPmXsZ4o' # It took really long time to find this... + html = get_content(self.url) + if re.search(pattern=r'showSectionNode\(.*\)', string=html): + logging.warning('Switching to playlist mode!') + return self.icourses_playlist_download(**kwargs) + flashvars_patt = r'var\ flashvars\=((.|\n)*)};' + server_time_patt = r'MPlayer.swf\?v\=(\d+)' + uuid_patt = r'uuid:(\d+)' + other_args_patt = r'other:"(.*)"' + res_url_patt = r'IService:\'([^\']+)' + title_a_patt = r'
(.*?)' + title_b_patt = r'
((.|\n)*?)
' + title_a = match1(html, title_a_patt).strip() + title_b = match1(html, title_b_patt).strip() + title = title_a + title_b # WIP, FIXME + title = re.sub('( +|\n|\t|\r|\ \;)', '', + unescape_html(title).replace(' ', '')) + server_time = match1(html, server_time_patt) + flashvars = match1(html, flashvars_patt) + uuid = match1(flashvars, uuid_patt) + other_args = match1(flashvars, other_args_patt) + res_url = match1(flashvars, res_url_patt) + url_parts = {'v': server_time, 'other': other_args, + 'uuid': uuid, 'IService': res_url} + req_url = '%s?%s' % (res_url, parse.urlencode(url_parts)) + logging.debug('Requesting video resource location...') + xml_resp = get_html(req_url) + xml_obj = ET.fromstring(xml_resp) + logging.debug('The result was {}'.format(xml_obj.get('status'))) + if xml_obj.get('status') != 'success': + raise ValueError('Server returned error!') + common_args = {'lv': PLAYER_BASE_VER, 'ls': 'play', + 'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'), + 'start': 0} + media_host = xml_obj.find(".//*[@name='host']").text + media_url = media_host + xml_obj.find(".//*[@name='url']").text + # This is what they called `SSLModule`... But obviously, just a kind of + # encryption, takes absolutely no effect in protecting data intergrity + if xml_obj.find(".//*[@name='ssl']").text != 'true': + logging.debug('The encryption mode is disabled') + # when the so-called `SSLMode` is not activated, the parameters, `h` + # and `p` can be found in response + arg_h = xml_obj.find(".//*[@name='h']").text + assert arg_h + arg_r = xml_obj.find(".//*[@name='p']").text or ENCRYPT_MOD_VER + url_args = common_args.copy() + url_args.update({'h': arg_h, 'r': arg_r}) + final_url = '{}?{}'.format( + media_url, parse.urlencode(url_args)) + self.title = title + return final_url + # when the `SSLMode` is activated, we need to receive the timestamp and the + # time offset (?) 
value from the server + logging.debug('The encryption mode is in effect') + ssl_callback = get_html( + '{}/ssl/ssl.shtml'.format(media_host)).split(',') + ssl_timestamp = int(datetime.datetime.strptime( + ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0])) + sign_this = ENCRYPT_SALT + \ + parse.urlparse(media_url).path + str(ssl_timestamp) + arg_h = base64.b64encode(hashlib.md5( + bytes(sign_this, 'utf-8')).digest()) + # Post-processing, may subject to change, so leaving this alone... + arg_h = arg_h.decode('utf-8').strip('=').replace('+', + '-').replace('/', '_') + arg_r = ssl_timestamp url_args = common_args.copy() - url_args.update({'h': arg_h, 'r': arg_r}) + url_args.update({'h': arg_h, 'r': arg_r, 'p': ENCRYPT_MOD_VER}) final_url = '{}?{}'.format( media_url, parse.urlencode(url_args)) - return title, final_url - # when the `SSLMode` is activated, we need to receive the timestamp and the - # time offset (?) value from the server - logging.debug('The encryption mode is in effect') - ssl_callback = get_html('{}/ssl/ssl.shtml'.format(media_host)).split(',') - ssl_timestamp = int(datetime.datetime.strptime( - ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0])) - sign_this = ENCRYPT_SALT + \ - parse.urlparse(media_url).path + str(ssl_timestamp) - arg_h = base64.b64encode(hashlib.md5(bytes(sign_this, 'utf-8')).digest()) - # Post-processing, may subject to change, so leaving this alone... 
- arg_h = arg_h.decode('utf-8').strip('=').replace('+', - '-').replace('/', '_') - arg_r = ssl_timestamp - url_args = common_args.copy() - url_args.update({'h': arg_h, 'r': arg_r, 'p': ENCRYPT_MOD_VER}) - final_url = '{}?{}'.format( - media_url, parse.urlencode(url_args)) - logging.debug('Concat`ed URL: {}'.format(final_url)) - return title, final_url + logging.debug('Crafted URL: {}'.format(final_url)) + self.title = title + return final_url site_info = 'icourses.cn' download = icourses_download -download_playlist = icourses_playlist_download +# download_playlist = icourses_playlist_download From ae4e533ec9d28fb1598fb91dfa87ce16cb06bc92 Mon Sep 17 00:00:00 2001 From: liushuyu Date: Tue, 25 Oct 2016 14:03:21 -0600 Subject: [PATCH 004/765] common: add dynamic url support for `url_save_chunked` --- src/you_get/common.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 948b0ca2..0f7fd0e3 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -547,7 +547,11 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, h os.remove(filepath) # on Windows rename could fail if destination filepath exists os.rename(temp_filepath, filepath) -def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = False, headers = {}): +def url_save_chunked(url, filepath, bar, dyn_callback=None, chunk_size=0, ignore_range=False, refer=None, is_part=False, faker=False, headers={}): + def dyn_update_url(received): + if callable(dyn_callback): + logging.debug('Calling callback %s for new URL from %s' % (dyn_callback.__name__, received)) + return dyn_callback(received) if os.path.exists(filepath): if not force: if not is_part: @@ -585,19 +589,26 @@ def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = else: headers = {} if received: - headers['Range'] = 'bytes=' + str(received) + '-' + url = dyn_update_url(received) + if not 
ignore_range: + headers['Range'] = 'bytes=' + str(received) + '-' if refer: headers['Referer'] = refer - response = request.urlopen(request.Request(url, headers = headers), None) + response = request.urlopen(request.Request(url, headers=headers), None) with open(temp_filepath, open_mode) as output: + this_chunk = received while True: buffer = response.read(1024 * 256) if not buffer: break output.write(buffer) received += len(buffer) + if chunk_size and (received - this_chunk) >= chunk_size: + url = dyn_callback(received) + this_chunk = received + response = request.urlopen(request.Request(url, headers=headers), None) if bar: bar.update_received(len(buffer)) @@ -846,7 +857,7 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg print() -def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}): +def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}, **kwargs): assert urls if dry_run: print('Real URLs:\n%s\n' % urls) @@ -860,7 +871,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No filename = '%s.%s' % (title, ext) filepath = os.path.join(output_dir, filename) - if total_size and ext in ('ts'): + if total_size: if not force and os.path.exists(filepath[:-3] + '.mkv'): print('Skipping %s: file already exists' % filepath[:-3] + '.mkv') print() @@ -875,7 +886,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No print('Downloading %s ...' 
% tr(filename)) filepath = os.path.join(output_dir, filename) parts.append(filepath) - url_save_chunked(url, filepath, bar, refer = refer, faker = faker, headers = headers) + url_save_chunked(url, filepath, bar, refer = refer, faker = faker, headers = headers, **kwargs) bar.done() if not merge: From 2183448c9098c1abd0e9cf47fa305e3775e1e098 Mon Sep 17 00:00:00 2001 From: liushuyu Date: Tue, 25 Oct 2016 14:15:23 -0600 Subject: [PATCH 005/765] icourses: implement fake `keep connection alive` --- src/you_get/extractors/icourses.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/you_get/extractors/icourses.py b/src/you_get/extractors/icourses.py index 5c2f8cda..cb2ff74a 100644 --- a/src/you_get/extractors/icourses.py +++ b/src/you_get/extractors/icourses.py @@ -1,6 +1,8 @@ #!/usr/bin/env python from ..common import * from urllib import parse +import random +from time import sleep import xml.etree.ElementTree as ET import datetime import hashlib @@ -12,23 +14,24 @@ import re __all__ = ['icourses_download'] -def icourses_download(url, info_only, merge=False, output_dir='.', **kwargs): +def icourses_download(url, merge=False, output_dir='.', **kwargs): icourses_parser = ICousesExactor(url=url) real_url = icourses_parser.icourses_cn_url_parser(**kwargs) title = icourses_parser.title if real_url is not None: - for tries in range(0, 3): + for tries in range(0, 5): try: _, type_, size = url_info(real_url, faker=True) break except error.HTTPError: logging.warning('Failed to fetch the video file! 
Retrying...') + sleep(random.Random().randint(0, 5)) # Prevent from blockage real_url = icourses_parser.icourses_cn_url_parser() title = icourses_parser.title print_info(site_info, title, type_, size) - if not info_only: - download_urls([real_url], title, 'flv', - total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True) + if not kwargs['info_only']: + download_urls_chunked([real_url], title, 'flv', + total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True, ignore_range=True, chunk_size=15000000, dyn_callback=icourses_parser.icourses_cn_url_parser) # Why not using VideoExtractor: This site needs specical download method @@ -40,9 +43,7 @@ class ICousesExactor(object): return def icourses_playlist_download(self, **kwargs): - import random - from time import sleep - html = get_content(url) + html = get_content(self.url) page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' video_js_number = r'changeforvideo\((.*?)\)' fs_flag = r'' @@ -59,7 +60,7 @@ class ICousesExactor(object): sleep(random.Random().randint(0, 5)) # Prevent from blockage icourses_download(video_url, **kwargs) - def icourses_cn_url_parser(self, **kwargs): + def icourses_cn_url_parser(self, received=0, **kwargs): PLAYER_BASE_VER = '150606-1' ENCRYPT_MOD_VER = '151020' ENCRYPT_SALT = '3DAPmXsZ4o' # It took really long time to find this... 
@@ -93,9 +94,14 @@ class ICousesExactor(object): logging.debug('The result was {}'.format(xml_obj.get('status'))) if xml_obj.get('status') != 'success': raise ValueError('Server returned error!') - common_args = {'lv': PLAYER_BASE_VER, 'ls': 'play', + if received: + play_type = 'seek' + else: + play_type = 'play' + received -= 1 + common_args = {'lv': PLAYER_BASE_VER, 'ls': play_type, 'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'), - 'start': 0} + 'start': received + 1} media_host = xml_obj.find(".//*[@name='host']").text media_url = media_host + xml_obj.find(".//*[@name='url']").text # This is what they called `SSLModule`... But obviously, just a kind of From ac33461c88344d86f74b69572f2f27d03fd708b5 Mon Sep 17 00:00:00 2001 From: Cheng Gu Date: Thu, 27 Oct 2016 17:44:02 +0800 Subject: [PATCH 006/765] fix(huomao): adapt to new url format --- src/you_get/extractors/huomaotv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/huomaotv.py b/src/you_get/extractors/huomaotv.py index 4852ff06..6e98c800 100644 --- a/src/you_get/extractors/huomaotv.py +++ b/src/you_get/extractors/huomaotv.py @@ -6,7 +6,7 @@ from ..common import * def get_mobile_room_url(room_id): - return 'http://www.huomao.com/mobile/mob_live?cid=%s' % room_id + return 'http://www.huomao.com/mobile/mob_live/%s' % room_id def get_m3u8_url(stream_id): From 0f3fe97e9caedf976286193aff5dddf430d80962 Mon Sep 17 00:00:00 2001 From: Cheng Gu Date: Thu, 27 Oct 2016 17:44:54 +0800 Subject: [PATCH 007/765] update: add huomao.com --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b994ebd1..182fc12a 100644 --- a/README.md +++ b/README.md @@ -407,6 +407,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 花瓣 | | |✓| | | Naver
네이버 | |✓| | | | 芒果TV | |✓| | | +| 火猫TV | |✓| | | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. From feffcb656ad2c33b17fb2e20598f8137fc69789c Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 30 Oct 2016 00:24:31 +0200 Subject: [PATCH 008/765] [processor.ffmpeg] fix params in ffmpeg_download_stream --- src/you_get/processor/ffmpeg.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py index 1c0ba1a3..c6da97f7 100644 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -212,15 +212,6 @@ def ffmpeg_download_stream(files, title, ext, params={}, output_dir='.'): if not (output_dir == '.'): output = output_dir + '/' + output - ffmpeg_params = [] - #should these exist... - if params is not None: - if len(params) > 0: - for k, v in params: - ffmpeg_params.append(k) - ffmpeg_params.append(v) - - print('Downloading streaming content with FFmpeg, press q to stop recording...') ffmpeg_params = [FFMPEG] + ['-y', '-re', '-i'] ffmpeg_params.append(files) #not the same here!!!! 
@@ -230,6 +221,12 @@ def ffmpeg_download_stream(files, title, ext, params={}, output_dir='.'): else: ffmpeg_params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc'] + if params is not None: + if len(params) > 0: + for k, v in params: + ffmpeg_params.append(k) + ffmpeg_params.append(v) + ffmpeg_params.append(output) print(' '.join(ffmpeg_params)) From 4b55884e86df68c56ae9fce85293f9b757e97576 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 30 Oct 2016 00:26:25 +0200 Subject: [PATCH 009/765] [dailymotion] use ffmpeg_download_stream, fix #1466 --- src/you_get/extractors/dailymotion.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/dailymotion.py b/src/you_get/extractors/dailymotion.py index 8b701cd1..2e96c160 100644 --- a/src/you_get/extractors/dailymotion.py +++ b/src/you_get/extractors/dailymotion.py @@ -4,6 +4,11 @@ __all__ = ['dailymotion_download'] from ..common import * +def extract_m3u(url): + content = get_content(url) + m3u_url = re.findall(r'http://.*', content)[0] + return match1(m3u_url, r'([^#]+)') + def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): """Downloads Dailymotion videos by URL. 
""" @@ -13,7 +18,7 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, title = match1(html, r'"video_title"\s*:\s*"([^"]+)"') or \ match1(html, r'"title"\s*:\s*"([^"]+)"') - for quality in ['720','480','380','240','auto']: + for quality in ['1080','720','480','380','240','auto']: try: real_url = info[quality][0]["url"] if real_url: @@ -21,11 +26,12 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, except KeyError: pass - type, ext, size = url_info(real_url) + m3u_url = extract_m3u(real_url) + mime, ext, size = 'video/mp4', 'mp4', 0 - print_info(site_info, title, type, size) + print_info(site_info, title, mime, size) if not info_only: - download_urls([real_url], title, ext, size, output_dir, merge = merge) + download_url_ffmpeg(m3u_url, title, ext, output_dir=output_dir, merge=merge) site_info = "Dailymotion.com" download = dailymotion_download From a4f4fb362616862cc283b05122e74be346f1a309 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 30 Oct 2016 16:16:04 +0100 Subject: [PATCH 010/765] Revert "fix for #1405" (fix #1485) This reverts commit 38ba0dbe48ecac4b7a354e4cf5766cf9415fb3c9. 
--- src/you_get/extractors/youku.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index 1fb09e8c..853a75ba 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -314,9 +314,6 @@ class Youku(VideoExtractor): q = q ) ksegs += [i['server'] for i in json.loads(get_content(u))] - - if (parse_host(ksegs[len(ksegs)-1])[0] == "vali.cp31.ott.cibntv.net"): - ksegs.pop(len(ksegs)-1) except error.HTTPError as e: # Use fallback stream data in case of HTTP 404 log.e('[Error] ' + str(e)) From e8514d1370bc748946940c7c2f757db5c9cf42c8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 3 Nov 2016 01:44:04 +0100 Subject: [PATCH 011/765] version 0.4.575 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 6d91656c..6d4f6c4f 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.555' +__version__ = '0.4.575' From 391ca5643a355c310db786e467c6929fd5dde53f Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Wed, 2 Nov 2016 20:44:40 -0400 Subject: [PATCH 012/765] [embed] correct tudou pattern Hyphen-minus (-) is a valid character in Tudou's video ID. It's even present in the second pattern of tudou_embed_patterns, just not the first. 
--- src/you_get/extractors/embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py index a177e663..fc4015c4 100644 --- a/src/you_get/extractors/embed.py +++ b/src/you_get/extractors/embed.py @@ -25,7 +25,7 @@ youku_embed_patterns = [ 'youku\.com/v_show/id_([a-zA-Z0-9=]+)', """ http://www.tudou.com/programs/view/html5embed.action?type=0&code=3LS_URGvl54&lcode=&resourceId=0_06_05_99 """ -tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_]+)\&', +tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_-]+)\&', 'www\.tudou\.com/v/([a-zA-Z0-9_-]+)/[^"]*v\.swf' ] From 2b0fe3443f844690305caa0a468d1b744c72ced5 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 3 Nov 2016 17:03:01 +0100 Subject: [PATCH 013/765] [test] remove test_vimeo --- tests/test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 638206af..0fa2979a 100644 --- a/tests/test.py +++ b/tests/test.py @@ -21,9 +21,6 @@ class YouGetTests(unittest.TestCase): def test_mixcloud(self): mixcloud.download("http://www.mixcloud.com/DJVadim/north-america-are-you-ready/", info_only=True) - def test_vimeo(self): - vimeo.download("http://vimeo.com/56810854", info_only=True) - def test_youtube(self): youtube.download("http://www.youtube.com/watch?v=pzKerr0JIPA", info_only=True) youtube.download("http://youtu.be/pzKerr0JIPA", info_only=True) From bc590cbd62ca4350598551e41910c719864f0c36 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 3 Nov 2016 21:32:13 +0100 Subject: [PATCH 014/765] [douban] add support: movie.douban.com --- README.md | 4 ++-- src/you_get/extractors/douban.py | 23 +++++++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 182fc12a..40a26803 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ $ you-get https://github.com/soimort/you-get/archive/master.zip or use [chocolatey 
package manager](https://chocolatey.org): ``` -> choco upgrade you-get +> choco upgrade you-get ``` In order to get the latest ```develop``` branch without messing up the PIP, you can try: @@ -373,7 +373,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 爆米花网 | |✓| | | | **bilibili
哔哩哔哩** | |✓| | | | Dilidili | |✓| | | -| 豆瓣 | | | |✓| +| 豆瓣 | |✓| |✓| | 斗鱼 | |✓| | | | Panda
熊猫 | |✓| | | | 凤凰视频 | |✓| | | diff --git a/src/you_get/extractors/douban.py b/src/you_get/extractors/douban.py index 187e99c0..1a4a67d1 100644 --- a/src/you_get/extractors/douban.py +++ b/src/you_get/extractors/douban.py @@ -7,12 +7,23 @@ from ..common import * def douban_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): html = get_html(url) - if 'subject' in url: + + if re.match(r'https?://movie', url): + title = match1(html, 'name="description" content="([^"]+)') + tid = match1(url, 'trailer/(\d+)') + real_url = 'https://movie.douban.com/trailer/video_url?tid=%s' % tid + type, ext, size = url_info(real_url) + + print_info(site_info, title, type, size) + if not info_only: + download_urls([real_url], title, ext, size, output_dir, merge = merge) + + elif 'subject' in url: titles = re.findall(r'data-title="([^"]*)">', html) song_id = re.findall(r'
  • Date: Thu, 3 Nov 2016 22:03:56 +0100 Subject: [PATCH 015/765] [bilibili] fix support for bangumi --- src/you_get/extractors/bilibili.py | 108 +++++++++++++++-------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index c18290b8..122dea0b 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -119,66 +119,70 @@ def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_o def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_content(url) - if re.match(r'https?://bangumi\.bilibili\.com/', url): - # quick hack for bangumi URLs - url = r1(r'"([^"]+)" class="v-av-link"', html) - html = get_content(url) - title = r1_of([r'', r']*>\s*([^<>]+)\s*'], html) if title: title = unescape_html(title) title = escape_file_path(title) - flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"', - r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) - assert flashvars - flashvars = flashvars.replace(': ', '=') - t, cid = flashvars.split('=', 1) - cid = cid.split('&')[0] - if t == 'cid': - if re.match(r'https?://live\.bilibili\.com/', url): - title = r1(r'\s*([^<>]+)\s*', html) - bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) + if re.match(r'https?://bangumi\.bilibili\.com/', url): + # quick hack for bangumi URLs + episode_id = r1(r'data-current-episode-id="(\d+)"', html) + cont = post_content('http://bangumi.bilibili.com/web_api/get_source', + post_data={'episode_id': episode_id}) + cid = json.loads(cont)['result']['cid'] + bilibili_download_by_cid(str(cid), title, output_dir=output_dir, merge=merge, info_only=info_only) - else: - # multi-P - cids = [] - pages = re.findall('', html) - for i, page in enumerate(pages): - html = get_html("http://www.bilibili.com%s" % page) - flashvars = r1_of([r'(cid=\d+)', - 
r'flashvars="([^"]+)"', - r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) - if flashvars: - t, cid = flashvars.split('=', 1) - cids.append(cid.split('&')[0]) - if url.endswith(page): - cids = [cid.split('&')[0]] - titles = [titles[i]] - break - - # no multi-P - if not pages: - cids = [cid] - titles = [r1(r'', html) or title] - - for i in range(len(cids)): - bilibili_download_by_cid(cids[i], - titles[i], - output_dir=output_dir, - merge=merge, - info_only=info_only) - - elif t == 'vid': - sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) - elif t == 'ykid': - youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) - elif t == 'uid': - tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) else: - raise NotImplementedError(flashvars) + flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"', + r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) + assert flashvars + flashvars = flashvars.replace(': ', '=') + t, cid = flashvars.split('=', 1) + cid = cid.split('&')[0] + if t == 'cid': + if re.match(r'https?://live\.bilibili\.com/', url): + title = r1(r'\s*([^<>]+)\s*', html) + bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) + + else: + # multi-P + cids = [] + pages = re.findall('', html) + for i, page in enumerate(pages): + html = get_html("http://www.bilibili.com%s" % page) + flashvars = r1_of([r'(cid=\d+)', + r'flashvars="([^"]+)"', + r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) + if flashvars: + t, cid = flashvars.split('=', 1) + cids.append(cid.split('&')[0]) + if url.endswith(page): + cids = [cid.split('&')[0]] + titles = [titles[i]] + break + + # no multi-P + if not pages: + cids = [cid] + titles = [r1(r'', html) or title] + + for i in range(len(cids)): + bilibili_download_by_cid(cids[i], + titles[i], + 
output_dir=output_dir, + merge=merge, + info_only=info_only) + + elif t == 'vid': + sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + elif t == 'ykid': + youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + elif t == 'uid': + tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) + else: + raise NotImplementedError(flashvars) if not info_only and not dry_run: if not kwargs['caption']: From d04997ec9bc2ce68655334063e5cce840053a0b0 Mon Sep 17 00:00:00 2001 From: Rokic Date: Tue, 8 Nov 2016 02:09:39 +0800 Subject: [PATCH 016/765] fix #1415 Songs from netease cloud music playlist will have a prefix indicates their order in the list. --- src/you_get/extractors/netease.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/you_get/extractors/netease.py b/src/you_get/extractors/netease.py index 63ee59b8..d5f3b1fa 100644 --- a/src/you_get/extractors/netease.py +++ b/src/you_get/extractors/netease.py @@ -54,13 +54,15 @@ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=Fals os.mkdir(new_dir) cover_url = j['result']['coverImgUrl'] download_urls([cover_url], "cover", "jpg", 0, new_dir) - - for i in j['result']['tracks']: - netease_song_download(i, output_dir=new_dir, info_only=info_only) + + prefix_width = len(str(len(j['result']['tracks']))) + for n, i in enumerate(j['result']['tracks']): + playlist_prefix = '%%.%dd_' % prefix_width % n + netease_song_download(i, output_dir=new_dir, info_only=info_only, playlist_prefix=playlist_prefix) try: # download lyrics assert kwargs['caption'] l = loads(get_content("http://music.163.com/api/song/lyric/?id=%s&lv=-1&csrf_token=" % i['id'], headers={"Referer": "http://music.163.com/"})) - netease_lyric_download(i, l["lrc"]["lyric"], output_dir=new_dir, info_only=info_only) + netease_lyric_download(i, l["lrc"]["lyric"], output_dir=new_dir, 
info_only=info_only, playlist_prefix=playlist_prefix) except: pass elif "song" in url: @@ -85,10 +87,10 @@ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=Fals j = loads(get_content("http://music.163.com/api/mv/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) netease_video_download(j['data'], output_dir=output_dir, info_only=info_only) -def netease_lyric_download(song, lyric, output_dir='.', info_only=False): +def netease_lyric_download(song, lyric, output_dir='.', info_only=False, playlist_prefix=""): if info_only: return - title = "%s. %s" % (song['position'], song['name']) + title = "%s%s. %s" % (playlist_prefix, song['position'], song['name']) filename = '%s.lrc' % get_filename(title) print('Saving %s ...' % filename, end="", flush=True) with open(os.path.join(output_dir, filename), @@ -103,8 +105,8 @@ def netease_video_download(vinfo, output_dir='.', info_only=False): netease_download_common(title, url_best, output_dir=output_dir, info_only=info_only) -def netease_song_download(song, output_dir='.', info_only=False): - title = "%s. %s" % (song['position'], song['name']) +def netease_song_download(song, output_dir='.', info_only=False, playlist_prefix=""): + title = "%s%s. 
%s" % (playlist_prefix, song['position'], song['name']) songNet = 'p' + song['mp3Url'].split('/')[2][1:] if 'hMusic' in song and song['hMusic'] != None: From 51dd7ad8e6b757687a4c06af7b6b3fb3dfa5f5b1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 9 Nov 2016 17:13:02 +0100 Subject: [PATCH 017/765] [youtube] use url_encoded_fmt_stream_map from video page, fix #1502 --- src/you_get/extractors/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 33e3923e..64af5c14 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -155,6 +155,8 @@ class YouTube(VideoExtractor): try: ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) self.html5player = 'https:' + ytplayer_config['assets']['js'] + # Workaround: get_video_info returns bad s. Why? + stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') except: self.html5player = None From 78ba20266c6b5e1fef1398af60ea8361bf57fff0 Mon Sep 17 00:00:00 2001 From: moyo Date: Sun, 13 Nov 2016 17:41:00 +0800 Subject: [PATCH 018/765] 1. Change container from FLV to TS 2. Fix video url matcher 3. Use m3u8 ext-info for fast size calculate 4. 
Use m3u8 url for video playing --- src/you_get/extractors/mgtv.py | 74 ++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/src/you_get/extractors/mgtv.py b/src/you_get/extractors/mgtv.py index aeb42490..3ce62efe 100644 --- a/src/you_get/extractors/mgtv.py +++ b/src/you_get/extractors/mgtv.py @@ -12,11 +12,11 @@ import re class MGTV(VideoExtractor): name = "芒果 (MGTV)" - # Last updated: 2015-11-24 + # Last updated: 2016-11-13 stream_types = [ - {'id': 'hd', 'container': 'flv', 'video_profile': '超清'}, - {'id': 'sd', 'container': 'flv', 'video_profile': '高清'}, - {'id': 'ld', 'container': 'flv', 'video_profile': '标清'}, + {'id': 'hd', 'container': 'ts', 'video_profile': '超清'}, + {'id': 'sd', 'container': 'ts', 'video_profile': '高清'}, + {'id': 'ld', 'container': 'ts', 'video_profile': '标清'}, ] id_dic = {i['video_profile']:(i['id']) for i in stream_types} @@ -27,7 +27,7 @@ class MGTV(VideoExtractor): def get_vid_from_url(url): """Extracts video ID from URL. """ - return match1(url, 'http://www.mgtv.com/v/\d/\d+/\w+/(\d+).html') + return match1(url, 'http://www.mgtv.com/b/\d+/(\d+).html') #---------------------------------------------------------------------- @staticmethod @@ -44,10 +44,15 @@ class MGTV(VideoExtractor): content = get_content(content['info']) #get the REAL M3U url, maybe to be changed later? 
segment_list = [] + segments_size = 0 for i in content.split(): if not i.startswith('#'): #not the best way, better we use the m3u8 package segment_list.append(base_url + i) - return segment_list + # use ext-info for fast size calculate + elif i.startswith('#EXT-MGTV-File-SIZE:'): + segments_size += int(i[i.rfind(':')+1:]) + + return m3u_url, segments_size, segment_list def download_playlist_by_url(self, url, **kwargs): pass @@ -69,28 +74,25 @@ class MGTV(VideoExtractor): quality_id = self.id_dic[s['video_profile']] url = stream_available[s['video_profile']] url = re.sub( r'(\&arange\=\d+)', '', url) #Un-Hum - segment_list_this = self.get_mgtv_real_url(url) - - container_this_stream = '' - size_this_stream = 0 + m3u8_url, m3u8_size, segment_list_this = self.get_mgtv_real_url(url) + stream_fileid_list = [] for i in segment_list_this: - _, container_this_stream, size_this_seg = url_info(i) - size_this_stream += size_this_seg stream_fileid_list.append(os.path.basename(i).split('.')[0]) - + #make pieces pieces = [] for i in zip(stream_fileid_list, segment_list_this): pieces.append({'fileid': i[0], 'segs': i[1],}) self.streams[quality_id] = { - 'container': 'flv', + 'container': s['container'], 'video_profile': s['video_profile'], - 'size': size_this_stream, - 'pieces': pieces + 'size': m3u8_size, + 'pieces': pieces, + 'm3u8_url': m3u8_url } - + if not kwargs['info_only']: self.streams[quality_id]['src'] = segment_list_this @@ -107,6 +109,44 @@ class MGTV(VideoExtractor): # Extract stream with the best quality stream_id = self.streams_sorted[0]['id'] + def download(self, **kwargs): + + if 'stream_id' in kwargs and kwargs['stream_id']: + stream_id = kwargs['stream_id'] + else: + stream_id = 'null' + + # print video info only + if 'info_only' in kwargs and kwargs['info_only']: + if stream_id != 'null': + if 'index' not in kwargs: + self.p(stream_id) + else: + self.p_i(stream_id) + else: + # Display all available streams + if 'index' not in kwargs: + self.p([]) + else: + 
stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag'] + self.p_i(stream_id) + + # default to use the best quality + if stream_id == 'null': + stream_id = self.streams_sorted[0]['id'] + + stream_info = self.streams[stream_id] + + if not kwargs['info_only']: + if player: + # with m3u8 format because some video player can process urls automatically (e.g. mpv) + launch_player(player, [stream_info['m3u8_url']]) + else: + download_urls(stream_info['src'], self.title, stream_info['container'], stream_info['size'], + output_dir=kwargs['output_dir'], + merge=kwargs['merge'], + av=stream_id in self.dash_streams) + site = MGTV() download = site.download_by_url download_playlist = site.download_playlist_by_url \ No newline at end of file From 65713cae2cf1c122be72c2d6fdaf854b35260562 Mon Sep 17 00:00:00 2001 From: L Date: Mon, 14 Nov 2016 21:49:13 +0800 Subject: [PATCH 019/765] update yixia_download url match rule resolved #1346 --- src/you_get/extractors/yixia.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/yixia.py b/src/you_get/extractors/yixia.py index ca5c4bd6..7d5ba290 100644 --- a/src/you_get/extractors/yixia.py +++ b/src/you_get/extractors/yixia.py @@ -51,11 +51,11 @@ def yixia_download(url, output_dir = '.', merge = True, info_only = False, **kwa yixia_download_by_scid = yixia_miaopai_download_by_scid site_info = "Yixia Miaopai" - if re.match(r'http://www.miaopai.com/show/channel/\w+', url): #PC + if re.match(r'http://www.miaopai.com/show/channel/.+', url): #PC scid = match1(url, r'http://www.miaopai.com/show/channel/(.+)\.htm') - elif re.match(r'http://www.miaopai.com/show/\w+', url): #PC + elif re.match(r'http://www.miaopai.com/show/.+', url): #PC scid = match1(url, r'http://www.miaopai.com/show/(.+)\.htm') - elif re.match(r'http://m.miaopai.com/show/channel/\w+', url): #Mobile + elif re.match(r'http://m.miaopai.com/show/channel/.+', url): #Mobile scid = 
match1(url, r'http://m.miaopai.com/show/channel/(.+)\.htm') elif 'xiaokaxiu.com' in hostname: #Xiaokaxiu From a7635e96a5e20cc4025fbcb236254e7a69c6556c Mon Sep 17 00:00:00 2001 From: Zhang Cheng Date: Thu, 17 Nov 2016 11:18:01 +0800 Subject: [PATCH 020/765] [mgtv] add bsf:a aac_adtstoasc to ffmpeg args, fix #1458. --- src/you_get/processor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py index 1c0ba1a3..dcc8e1c8 100644 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -125,7 +125,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i'] params.append(output + '.txt') - params += ['-c', 'copy', output] + params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output] subprocess.check_call(params) os.remove(output + '.txt') From 250672f42d475eba1b7a69b48683cf0d0576698a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 19 Nov 2016 20:47:18 +0100 Subject: [PATCH 021/765] version 0.4.595 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 6d4f6c4f..28919906 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.575' +__version__ = '0.4.595' From fe2720544a804926a24aba33f44c98d3706ec3bd Mon Sep 17 00:00:00 2001 From: Yohohaha <390342156@qq.com> Date: Fri, 25 Nov 2016 20:03:38 +0800 Subject: [PATCH 022/765] fix syntax error --- src/you_get/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 9faaa939..0100cae7 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -969,7 +969,7 @@ def download_url_ffmpeg(url,title, ext,params={}, total_size=0, output_dir='.', from .processor.ffmpeg import has_ffmpeg_installed, 
ffmpeg_download_stream assert has_ffmpeg_installed(), "FFmpeg not installed." global output_filename - if(output_filename) + if(output_filename): dotPos = output_filename.rfind(".") title = output_filename[:dotPos] ext = output_filename[dotPos+1:] From e65c2d23a0cdfe622c15a740f1c04384c7813563 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 26 Nov 2016 13:07:21 +0100 Subject: [PATCH 023/765] [tudou] fix #1526 --- src/you_get/extractors/tudou.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/tudou.py b/src/you_get/extractors/tudou.py index 6bbbc12b..8c434437 100644 --- a/src/you_get/extractors/tudou.py +++ b/src/you_get/extractors/tudou.py @@ -32,11 +32,11 @@ def tudou_download_by_id(id, title, output_dir = '.', merge = True, info_only = def tudou_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): if 'acfun.tudou.com' in url: #wrong way! url = url.replace('acfun.tudou.com', 'www.acfun.tv') - you_get.extractors.acfun.acfun_download(url, output_dir, - merge, + you_get.extractors.acfun.acfun_download(url, output_dir, + merge, info_only) return #throw you back - + # Embedded player id = r1(r'http://www.tudou.com/v/([^/]+)/', url) if id: @@ -44,7 +44,7 @@ def tudou_download(url, output_dir = '.', merge = True, info_only = False, **kwa html = get_decoded_html(url) - title = r1(r'kw\s*[:=]\s*[\'\"]([^\n]+?)\'\s*\n', html).replace("\\'", "\'") + title = r1(r'\Wkw\s*[:=]\s*[\'\"]([^\n]+?)\'\s*\n', html).replace("\\'", "\'") assert title title = unescape_html(title) From 03266c030a254dac2103a3c2a2d086e36fb9dc9a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 26 Nov 2016 12:35:50 +0100 Subject: [PATCH 024/765] [youtube] fix dash-mpd for live streams (no yt:contentLength field) --- src/you_get/extractors/youtube.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 64af5c14..61dc2cb7 100644 --- 
a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -258,11 +258,17 @@ class YouTube(VideoExtractor): burls = rep.getElementsByTagName('BaseURL') dash_mp4_a_url = burls[0].firstChild.nodeValue dash_mp4_a_size = burls[0].getAttribute('yt:contentLength') + if not dash_mp4_a_size: + try: dash_mp4_a_size = url_size(dash_mp4_a_url) + except: continue elif mimeType == 'audio/webm': rep = aset.getElementsByTagName('Representation')[-1] burls = rep.getElementsByTagName('BaseURL') dash_webm_a_url = burls[0].firstChild.nodeValue dash_webm_a_size = burls[0].getAttribute('yt:contentLength') + if not dash_webm_a_size: + try: dash_webm_a_size = url_size(dash_webm_a_url) + except: continue elif mimeType == 'video/mp4': for rep in aset.getElementsByTagName('Representation'): w = int(rep.getAttribute('width')) @@ -271,6 +277,9 @@ class YouTube(VideoExtractor): burls = rep.getElementsByTagName('BaseURL') dash_url = burls[0].firstChild.nodeValue dash_size = burls[0].getAttribute('yt:contentLength') + if not dash_size: + try: dash_size = url_size(dash_url) + except: continue self.dash_streams[itag] = { 'quality': '%sx%s' % (w, h), 'itag': itag, @@ -288,6 +297,9 @@ class YouTube(VideoExtractor): burls = rep.getElementsByTagName('BaseURL') dash_url = burls[0].firstChild.nodeValue dash_size = burls[0].getAttribute('yt:contentLength') + if not dash_size: + try: dash_size = url_size(dash_url) + except: continue self.dash_streams[itag] = { 'quality': '%sx%s' % (w, h), 'itag': itag, From 538f1796f203297ef9e66c0a9d07691daa28df97 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 26 Nov 2016 17:09:28 +0100 Subject: [PATCH 025/765] [universal] workaround for websites that block HEAD requests --- src/you_get/common.py | 6 +++--- src/you_get/extractors/universal.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 0100cae7..27998cf5 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ 
-338,7 +338,7 @@ def get_content(url, headers={}, decoded=True): if charset is not None: data = data.decode(charset) else: - data = data.decode('utf-8') + data = data.decode('utf-8', 'ignore') return data @@ -395,12 +395,12 @@ def url_size(url, faker = False, headers = {}): def urls_size(urls, faker = False, headers = {}): return sum([url_size(url, faker=faker, headers=headers) for url in urls]) -def get_head(url, headers = {}): +def get_head(url, headers = {}, get_method = 'HEAD'): if headers: req = request.Request(url, headers = headers) else: req = request.Request(url) - req.get_method = lambda : 'HEAD' + req.get_method = lambda : get_method res = request.urlopen(req) return dict(res.headers) diff --git a/src/you_get/extractors/universal.py b/src/you_get/extractors/universal.py index ebab70f8..a4262f61 100644 --- a/src/you_get/extractors/universal.py +++ b/src/you_get/extractors/universal.py @@ -6,7 +6,10 @@ from ..common import * from .embed import * def universal_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - content_type = get_head(url, headers=fake_headers)['Content-Type'] + try: + content_type = get_head(url, headers=fake_headers)['Content-Type'] + except: + content_type = get_head(url, headers=fake_headers, get_method='GET')['Content-Type'] if content_type.startswith('text/html'): try: embed_download(url, output_dir, merge=merge, info_only=info_only) From 8e150e69897724d315c3e31cbc187511a0d2d54c Mon Sep 17 00:00:00 2001 From: sheerluck Date: Mon, 28 Nov 2016 18:01:42 +0300 Subject: [PATCH 026/765] fix for NameError: name 'output_json' is not defined --- src/you_get/extractors/qq.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/qq.py b/src/you_get/extractors/qq.py index f1707527..c9ee7c0f 100644 --- a/src/you_get/extractors/qq.py +++ b/src/you_get/extractors/qq.py @@ -56,12 +56,12 @@ def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False): if not info_only: 
download_urls(part_urls, parts_ti, ext, total_size, output_dir=output_dir, merge=merge) else: - fvkey = output_json['vl']['vi'][0]['fvkey'] - mp4 = output_json['vl']['vi'][0]['cl'].get('ci', None) + fvkey = video_json['vl']['vi'][0]['fvkey'] + mp4 = video_json['vl']['vi'][0]['cl'].get('ci', None) if mp4: mp4 = mp4[0]['keyid'].replace('.10', '.p') + '.mp4' else: - mp4 = output_json['vl']['vi'][0]['fn'] + mp4 = video_json['vl']['vi'][0]['fn'] url = '%s/%s?vkey=%s' % ( parts_prefix, mp4, fvkey ) _, ext, size = url_info(url, faker=True) From 474f4d724a796426db99c398dfe56756549cd223 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 3 Dec 2016 17:40:29 +0100 Subject: [PATCH 027/765] [common] pass valid filename in download_url_ffmpeg --- src/you_get/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 27998cf5..7db4fba2 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -968,11 +968,15 @@ def download_url_ffmpeg(url,title, ext,params={}, total_size=0, output_dir='.', from .processor.ffmpeg import has_ffmpeg_installed, ffmpeg_download_stream assert has_ffmpeg_installed(), "FFmpeg not installed." 
+ global output_filename - if(output_filename): + if output_filename: dotPos = output_filename.rfind(".") title = output_filename[:dotPos] ext = output_filename[dotPos+1:] + + title = tr(get_filename(title)) + ffmpeg_download_stream(url, title, ext, params, output_dir) def playlist_not_supported(name): From 61d9bf124edf5bd89283eb5e373cabae5e8953b6 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 3 Dec 2016 17:41:23 +0100 Subject: [PATCH 028/765] [youtube] download hlsvp via ffmpeg --- src/you_get/extractors/youtube.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 61dc2cb7..c403cb74 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -148,6 +148,17 @@ class YouTube(VideoExtractor): elif video_info['status'] == ['ok']: if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']: self.title = parse.unquote_plus(video_info['title'][0]) + + # YouTube Live + if 'url_encoded_fmt_stream_map' not in video_info: + hlsvp = video_info['hlsvp'][0] + + if 'info_only' in kwargs and kwargs['info_only']: + return + else: + download_url_ffmpeg(hlsvp, self.title, 'mp4') + exit(0) + stream_list = video_info['url_encoded_fmt_stream_map'][0].split(',') # Parse video page (for DASH) From 606e0a786e2ab631288d2f4567ed1d37334ae52e Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Sun, 4 Dec 2016 19:36:17 -0500 Subject: [PATCH 029/765] [lizhi] overhaul Lizhi extractor has stopped working. In particular, there are two major changes: - URL format change: no more #/ in URL paths; - The /api/audio/{radio_id}/{audio_id} API now returns 404. This is a rewrite based on the /api/radio_audios API. 
--- src/you_get/extractors/lizhi.py | 74 ++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/src/you_get/extractors/lizhi.py b/src/you_get/extractors/lizhi.py index 56dbf756..65988a9f 100644 --- a/src/you_get/extractors/lizhi.py +++ b/src/you_get/extractors/lizhi.py @@ -4,37 +4,55 @@ __all__ = ['lizhi_download'] import json from ..common import * -def lizhi_download_playlist(url, output_dir = '.', merge = True, info_only = False, **kwargs): - # like this http://www.lizhi.fm/#/31365/ - #api desc: s->start l->length band->some radio - #http://www.lizhi.fm/api/radio_audios?s=0&l=100&band=31365 - band_id = match1(url,r'#/(\d+)') - #try to get a considerable large l to reduce html parsing task. - api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band='+band_id - content_json = json.loads(get_content(api_url)) - for sound in content_json: - title = sound["name"] - res_url = sound["url"] - songtype, ext, size = url_info(res_url,faker=True) - print_info(site_info, title, songtype, size) - if not info_only: - #no referer no speed! - download_urls([res_url], title, ext, size, output_dir, merge=merge ,refer = 'http://www.lizhi.fm',faker=True) - pass +# radio_id: e.g. 549759 from http://www.lizhi.fm/549759/ +# +# Returns a list of tuples (audio_id, title, url) for each episode +# (audio) in the radio playlist. url is the direct link to the audio +# file. +def lizhi_extract_playlist_info(radio_id): + # /api/radio_audios API parameters: + # + # - s: starting episode + # - l: count (per page) + # - band: radio_id + # + # We use l=65535 for poor man's pagination (that is, no pagination + # at all -- hope all fits on a single page). + # + # TODO: Use /api/radio?band={radio_id} to get number of episodes + # (au_cnt), then handle pagination properly. 
+ api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band=%s' % radio_id + api_response = json.loads(get_content(api_url)) + return [(ep['id'], ep['name'], ep['url']) for ep in api_response] -def lizhi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): - # url like http://www.lizhi.fm/#/549759/18864883431656710 - api_id = match1(url,r'#/(\d+/\d+)') - api_url = 'http://www.lizhi.fm/api/audio/'+api_id - content_json = json.loads(get_content(api_url)) - title = content_json["audio"]["name"] - res_url = content_json["audio"]["url"] - songtype, ext, size = url_info(res_url,faker=True) - print_info(site_info, title, songtype, size) +def lizhi_download_audio(audio_id, title, url, output_dir='.', info_only=False): + filetype, ext, size = url_info(url) + print_info(site_info, title, filetype, size) if not info_only: - #no referer no speed! - download_urls([res_url], title, ext, size, output_dir, merge=merge ,refer = 'http://www.lizhi.fm',faker=True) + download_urls([url], title, ext, size, output_dir=output_dir) +def lizhi_download_playlist(url, output_dir='.', info_only=False, **kwargs): + # Sample URL: http://www.lizhi.fm/549759/ + radio_id = match1(url,r'/(\d+)') + if not radio_id: + raise NotImplementedError('%s not supported' % url) + for audio_id, title, url in lizhi_extract_playlist_info(radio_id): + lizhi_download_audio(audio_id, title, url, output_dir=output_dir, info_only=info_only) + +def lizhi_download(url, output_dir='.', info_only=False, **kwargs): + # Sample URL: http://www.lizhi.fm/549759/18864883431656710/ + m = re.search(r'/(?P\d+)/(?P\d+)', url) + if not m: + raise NotImplementedError('%s not supported' % url) + radio_id = m.group('radio_id') + audio_id = m.group('audio_id') + # Look for the audio_id among the full list of episodes + for aid, title, url in lizhi_extract_playlist_info(radio_id): + if aid == audio_id: + lizhi_download_audio(audio_id, title, url, output_dir=output_dir, info_only=info_only) + break + else: + 
raise NotImplementedError('Audio #%s not found in playlist #%s' % (audio_id, radio_id)) site_info = "lizhi.fm" download = lizhi_download From a6d3c13684cff5811e3c1c6bac93698355cc3a43 Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Mon, 5 Dec 2016 23:45:28 -0500 Subject: [PATCH 030/765] [embed] add support for bilibili's embedded player Sample embed: for http://www.bilibili.com/video/av5079467/: --- src/you_get/extractors/embed.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py index fc4015c4..3bdb924c 100644 --- a/src/you_get/extractors/embed.py +++ b/src/you_get/extractors/embed.py @@ -2,6 +2,7 @@ __all__ = ['embed_download'] from ..common import * +from .bilibili import bilibili_download from .iqiyi import iqiyi_download_by_vid from .le import letvcloud_download_by_vu from .netease import netease_download @@ -42,6 +43,11 @@ netease_embed_patterns = [ '(http://\w+\.163\.com/movie/[^\'"]+)' ] vimeo_embed_patters = [ 'player\.vimeo\.com/video/(\d+)' ] +""" +check the share button on http://www.bilibili.com/video/av5079467/ +""" +bilibili_embed_patterns = [ 'static\.hdslb\.com/miniloader\.swf.*aid=(\d+)' ] + def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwargs): content = get_content(url, headers=fake_headers) @@ -78,6 +84,12 @@ def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwa found = True vimeo_download_by_id(url, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + aids = matchall(content, bilibili_embed_patterns) + for aid in aids: + found = True + url = 'http://www.bilibili.com/video/av%s/' % aid + bilibili_download(url, output_dir=output_dir, merge=merge, info_only=info_only) + if not found: raise NotImplementedError(url) From 44e60c3e2193d3198899f211a8b7c9767b0b6d5e Mon Sep 17 00:00:00 2001 From: Valdemar Erk Date: Sat, 10 Dec 2016 12:23:35 +0100 Subject: [PATCH 031/765] Initial support for 
yizhibo.com --- src/you_get/common.py | 1 + src/you_get/extractors/yizhibo.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 src/you_get/extractors/yizhibo.py diff --git a/src/you_get/common.py b/src/you_get/common.py index 7db4fba2..fd727cf4 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -91,6 +91,7 @@ SITES = { 'xiaojiadianvideo' : 'fc2video', 'yinyuetai' : 'yinyuetai', 'miaopai' : 'yixia', + 'yizhibo' : 'yizhibo', 'youku' : 'youku', 'youtu' : 'youtube', 'youtube' : 'youtube', diff --git a/src/you_get/extractors/yizhibo.py b/src/you_get/extractors/yizhibo.py new file mode 100644 index 00000000..f524a0a8 --- /dev/null +++ b/src/you_get/extractors/yizhibo.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +__all__ = ['yizhibo_download'] + +from ..common import * +import json +import time + +def yizhibo_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): + video_id = url[url.rfind('/')+1:].split(".")[0] + json_request_url = 'http://www.yizhibo.com/live/h5api/get_basic_live_info?scid={}'.format(video_id) + content = get_html(json_request_url) + error = json.loads(content)['result'] + if (error != 1): + raise ValueError("Error : {}".format(error)) + + data = json.loads(content)#['data'] + title = data.get('data')['live_title'] + if (title == ''): + title = data.get('data')['nickname'] + real_url = data.get('data')['play_url'] + + print_info(site_info, title, 'flv', float('inf')) + if not info_only: + download_url_ffmpeg(real_url, title, 'flv', None, output_dir, merge = merge) + +site_info = "yizhibo.com" +download = yizhibo_download +download_playlist = playlist_not_supported('yizhibo') From 0f33e471ad65c2c2dfb0a1e4480cb39d1f2430a2 Mon Sep 17 00:00:00 2001 From: Valdemar Erk Date: Sat, 10 Dec 2016 12:26:06 +0100 Subject: [PATCH 032/765] minor correction --- src/you_get/extractors/yizhibo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/yizhibo.py 
b/src/you_get/extractors/yizhibo.py index f524a0a8..0744e1f9 100644 --- a/src/you_get/extractors/yizhibo.py +++ b/src/you_get/extractors/yizhibo.py @@ -14,7 +14,7 @@ def yizhibo_download(url, output_dir = '.', merge = True, info_only = False, **k if (error != 1): raise ValueError("Error : {}".format(error)) - data = json.loads(content)#['data'] + data = json.loads(content) title = data.get('data')['live_title'] if (title == ''): title = data.get('data')['nickname'] From 0f1d5beb1494ca6b64b90e3d8d5949de29b2c31b Mon Sep 17 00:00:00 2001 From: Valdemar Erk Date: Sun, 11 Dec 2016 01:46:23 +0100 Subject: [PATCH 033/765] Changed the plugin to use download_urls instead of ffmpeg --- src/you_get/extractors/yizhibo.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/you_get/extractors/yizhibo.py b/src/you_get/extractors/yizhibo.py index 0744e1f9..37fa043c 100644 --- a/src/you_get/extractors/yizhibo.py +++ b/src/you_get/extractors/yizhibo.py @@ -9,7 +9,7 @@ import time def yizhibo_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): video_id = url[url.rfind('/')+1:].split(".")[0] json_request_url = 'http://www.yizhibo.com/live/h5api/get_basic_live_info?scid={}'.format(video_id) - content = get_html(json_request_url) + content = get_content(json_request_url) error = json.loads(content)['result'] if (error != 1): raise ValueError("Error : {}".format(error)) @@ -18,11 +18,17 @@ def yizhibo_download(url, output_dir = '.', merge = True, info_only = False, **k title = data.get('data')['live_title'] if (title == ''): title = data.get('data')['nickname'] - real_url = data.get('data')['play_url'] - - print_info(site_info, title, 'flv', float('inf')) + m3u8_url = data.get('data')['play_url'] + m3u8 = get_content(m3u8_url) + base_url = "/".join(data.get('data')['play_url'].split("/")[:7])+"/" + part_url = re.findall(r'([0-9]+\.ts)', m3u8) + real_url = [] + for i in part_url: + url = base_url + i + real_url.append(url) + 
print_info(site_info, title, 'ts', float('inf')) if not info_only: - download_url_ffmpeg(real_url, title, 'flv', None, output_dir, merge = merge) + download_urls(real_url, title, 'ts', float('inf'), output_dir, merge = merge) site_info = "yizhibo.com" download = yizhibo_download From e0554b2d7b7a214c988100ac32187208b22e1d26 Mon Sep 17 00:00:00 2001 From: Valdemar Erk Date: Sun, 11 Dec 2016 01:49:13 +0100 Subject: [PATCH 034/765] Made player use the m3u8 file. --- src/you_get/extractors/yizhibo.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/yizhibo.py b/src/you_get/extractors/yizhibo.py index 37fa043c..11ce86ad 100644 --- a/src/you_get/extractors/yizhibo.py +++ b/src/you_get/extractors/yizhibo.py @@ -28,6 +28,8 @@ def yizhibo_download(url, output_dir = '.', merge = True, info_only = False, **k real_url.append(url) print_info(site_info, title, 'ts', float('inf')) if not info_only: + if player: + launch_player(player, [m3u8_url]) download_urls(real_url, title, 'ts', float('inf'), output_dir, merge = merge) site_info = "yizhibo.com" From 9905620b5297483e5e10195aad90a14be1d360fd Mon Sep 17 00:00:00 2001 From: Valdemar Erk Date: Fri, 16 Dec 2016 09:36:29 +0100 Subject: [PATCH 035/765] Fix for magisto --- src/you_get/extractors/magisto.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/you_get/extractors/magisto.py b/src/you_get/extractors/magisto.py index 2a53be02..b2e8e502 100644 --- a/src/you_get/extractors/magisto.py +++ b/src/you_get/extractors/magisto.py @@ -3,15 +3,19 @@ __all__ = ['magisto_download'] from ..common import * +import json def magisto_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) - - title1 = r1(r' Date: Thu, 22 Dec 2016 22:33:37 +0800 Subject: [PATCH 036/765] [BiliBili] Better Multi-Part Video Naming --- src/you_get/extractors/bilibili.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 122dea0b..043c3753 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -167,10 +167,10 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs if not pages: cids = [cid] titles = [r1(r'', html) or title] - for i in range(len(cids)): + completeTitle=title+"-"+titles[i]#Build Better Title bilibili_download_by_cid(cids[i], - titles[i], + completeTitle, output_dir=output_dir, merge=merge, info_only=info_only) From bc5ff346d043e8097b81902d6f5392fc3e7869fc Mon Sep 17 00:00:00 2001 From: Zhang Date: Fri, 23 Dec 2016 11:50:51 +0800 Subject: [PATCH 037/765] [BiliBili] revert naming for single part videos --- src/you_get/extractors/bilibili.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 043c3753..2e54ed47 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -168,7 +168,11 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs cids = [cid] titles = [r1(r'', html) or title] for i in range(len(cids)): - completeTitle=title+"-"+titles[i]#Build Better Title + completeTitle=None + if (title == titles[i]): + completeTitle=title + else: + completeTitle=title+"-"+titles[i]#Build Better Title bilibili_download_by_cid(cids[i], completeTitle, output_dir=output_dir, From af4db738a2f2e9e23ef192145a0ece286f1a4c67 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 24 Dec 2016 15:49:47 +0100 Subject: [PATCH 038/765] [test] remove mixcloud --- tests/test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 0fa2979a..020455b0 100644 --- a/tests/test.py +++ b/tests/test.py @@ -18,9 +18,6 @@ class YouGetTests(unittest.TestCase): def test_magisto(self): magisto.download("http://www.magisto.com/album/video/f3x9AAQORAkfDnIFDA", info_only=True) - 
def test_mixcloud(self): - mixcloud.download("http://www.mixcloud.com/DJVadim/north-america-are-you-ready/", info_only=True) - def test_youtube(self): youtube.download("http://www.youtube.com/watch?v=pzKerr0JIPA", info_only=True) youtube.download("http://youtu.be/pzKerr0JIPA", info_only=True) From b493af9a69878544ddc6a1fdb71ca61b48bd57ab Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Thu, 15 Dec 2016 23:37:35 -0500 Subject: [PATCH 039/765] [ffmpeg] fix concat list when output dir is not pwd Relative paths in the concat list are considered relative to the parent directory of the script, not the calling directory. This isn't entirely obvious from the documentation, but it is easy to infer from the concat demuxer's concept of "safety", and easy to test (confirmed on FFmpeg 3.2.2). See https://ffmpeg.org/ffmpeg-all.html#concat-1 for details. This commit fixes the wrong relative paths when --output-dir is specified and not pwd. This commit also - Factors out common concat list writer code; - Slightly simplifies the code to collect FFmpeg params (on Py35+ we can further simplify by unpacking LOGLEVEL with the star operator right in the list literal). --- src/you_get/processor/ffmpeg.py | 56 ++++++++++++++------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py index a8599e52..433aff3f 100644 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -26,6 +26,18 @@ LOGLEVEL = ['-loglevel', 'quiet'] def has_ffmpeg_installed(): return FFMPEG is not None +# Given a list of segments and the output path, generates the concat +# list and returns the path to the concat list. 
+def generate_concat_list(files, output): + concat_list_path = output + '.txt' + concat_list_dir = os.path.dirname(concat_list_path) + with open(concat_list_path, 'w', encoding='utf-8') as concat_list: + for file in files: + if os.path.isfile(file): + relpath = os.path.relpath(file, start=concat_list_dir) + concat_list.write('file %s\n' % parameterize(relpath)) + return concat_list_path + def ffmpeg_concat_av(files, output, ext): print('Merging video parts... ', end="", flush=True) params = [FFMPEG] + LOGLEVEL @@ -52,17 +64,9 @@ def ffmpeg_convert_ts_to_mkv(files, output='output.mkv'): def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): - concat_list = open(output + '.txt', 'w', encoding="utf-8") - for file in files: - if os.path.isfile(file): - concat_list.write("file %s\n" % parameterize(file)) - concat_list.close() - - params = [FFMPEG] + LOGLEVEL - params.extend(['-f', 'concat', '-safe', '-1', '-y', '-i']) - params.append(output + '.txt') - params += ['-c', 'copy', output] - + concat_list = generate_concat_list(files, output) + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + '-i', concat_list, '-c', 'copy', output] if subprocess.call(params) == 0: os.remove(output + '.txt') return True @@ -115,18 +119,10 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): print('Merging video parts... 
', end="", flush=True) # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): - concat_list = open(output + '.txt', 'w', encoding="utf-8") - for file in files: - if os.path.isfile(file): - # for escaping rules, see: - # https://www.ffmpeg.org/ffmpeg-utils.html#Quoting-and-escaping - concat_list.write("file %s\n" % parameterize(file)) - concat_list.close() - - params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i'] - params.append(output + '.txt') - params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output] - + concat_list = generate_concat_list(files, output) + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + '-i', concat_list, '-c', 'copy', + '-bsf:a', 'aac_adtstoasc', output] subprocess.check_call(params) os.remove(output + '.txt') return True @@ -162,16 +158,10 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): print('Merging video parts... ', end="", flush=True) # Use concat demuxer on FFmpeg >= 1.1 if FFMPEG == 'ffmpeg' and (FFMPEG_VERSION[0] >= 2 or (FFMPEG_VERSION[0] == 1 and FFMPEG_VERSION[1] >= 1)): - concat_list = open(output + '.txt', 'w', encoding="utf-8") - for file in files: - if os.path.isfile(file): - concat_list.write("file %s\n" % parameterize(file)) - concat_list.close() - - params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i'] - params.append(output + '.txt') - params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output] - + concat_list = generate_concat_list(files, output) + params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', + '-i', concat_list, '-c', 'copy', + '-bsf:a', 'aac_adtstoasc', output] subprocess.check_call(params) os.remove(output + '.txt') return True From f7b6f6b40f97813206252f9c41dbe05bda592918 Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Sun, 25 Dec 2016 13:48:00 -0500 Subject: [PATCH 040/765] ffmpeg: set loglevel to info in debug mode 
Occasionally, the FFmpeg invocation fails (which could be due to bugs in you-get; see #1558 for instance), but -loglevel quiet means nothing is printed other than the exit status (pretty much always 1) in Python's traceback, which is not helpful at all. This commit restores FFmpeg's regular output (-loglevel info) when --debug is specified. We're not using verbose, debug or trace because those levels are mostly only useful for debugging FFmpeg itself, which is not our goal. Due to lack of meaningful API to access the global logging level, this is a hack based on two assumptions: 1. When --debug is enabled, the root logger level is set to DEBUG; 2. processor.ffmpeg is lazily imported, after command line options are parsed. --- src/you_get/processor/ffmpeg.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) mode change 100644 => 100755 src/you_get/processor/ffmpeg.py diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py old mode 100644 new mode 100755 index a8599e52..f5b3cd38 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +import logging import os.path import subprocess from ..util.strings import parameterize @@ -21,7 +22,10 @@ def get_usable_ffmpeg(cmd): return None FFMPEG, FFMPEG_VERSION = get_usable_ffmpeg('ffmpeg') or get_usable_ffmpeg('avconv') or (None, None) -LOGLEVEL = ['-loglevel', 'quiet'] +if logging.getLogger().isEnabledFor(logging.DEBUG): + LOGLEVEL = ['-loglevel', 'info'] +else: + LOGLEVEL = ['-loglevel', 'quiet'] def has_ffmpeg_installed(): return FFMPEG is not None From 927a1cb91f854cb5260f67b15d9811f763955407 Mon Sep 17 00:00:00 2001 From: liujianshan Date: Thu, 29 Dec 2016 19:47:53 +0800 Subject: [PATCH 041/765] Fix soku.com vid download error problem --- src/you_get/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py index 594b908e..332440dd 100644 --- 
a/src/you_get/extractor.py +++ b/src/you_get/extractor.py @@ -206,7 +206,7 @@ class VideoExtractor(): output_dir=kwargs['output_dir'], merge=kwargs['merge'], av=stream_id in self.dash_streams) - if not kwargs['caption']: + if 'caption' not in kwargs or not kwargs['caption']: print('Skipping captions.') return for lang in self.caption_tracks: From 76399e8561c421ead7a590ef857a98eccb16af61 Mon Sep 17 00:00:00 2001 From: ChenYuan Date: Sun, 1 Jan 2017 00:44:56 +0800 Subject: [PATCH 042/765] fix bilibili bangumi modify the regex to get eposide id --- src/you_get/extractors/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 122dea0b..aecb072c 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -127,7 +127,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs if re.match(r'https?://bangumi\.bilibili\.com/', url): # quick hack for bangumi URLs - episode_id = r1(r'data-current-episode-id="(\d+)"', html) + episode_id = r1(r'first_ep_id = "(\d+)"', html) cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data={'episode_id': episode_id}) cid = json.loads(cont)['result']['cid'] From 60b6834e547e328b1dee86dc748689292beba0e8 Mon Sep 17 00:00:00 2001 From: Valdemar Erk Date: Tue, 3 Jan 2017 23:58:56 +0100 Subject: [PATCH 043/765] Quanmin support. --- README.md | 1 + src/you_get/common.py | 1 + src/you_get/extractors/quanmin.py | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+) create mode 100644 src/you_get/extractors/quanmin.py diff --git a/README.md b/README.md index 40a26803..98c403c3 100644 --- a/README.md +++ b/README.md @@ -408,6 +408,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | Naver
    네이버 | |✓| | | | 芒果TV | |✓| | | | 火猫TV | |✓| | | +| 全民Tv | |✓| | | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. diff --git a/src/you_get/common.py b/src/you_get/common.py index 7db4fba2..f320f6ab 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -65,6 +65,7 @@ SITES = { 'pptv' : 'pptv', 'qianmo' : 'qianmo', 'qq' : 'qq', + 'quanmin' : 'quanmin', 'showroom-live' : 'showroom', 'sina' : 'sina', 'smgbb' : 'bilibili', diff --git a/src/you_get/extractors/quanmin.py b/src/you_get/extractors/quanmin.py new file mode 100644 index 00000000..99e8790c --- /dev/null +++ b/src/you_get/extractors/quanmin.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python + +__all__ = ['quanmin_download'] + +from ..common import * +import json +import time + +def quanmin_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): + roomid = url[url.rfind("/")+1:] + json_request_url = 'http://www.quanmin.tv/json/rooms/{}/info4.json'.format(roomid) + + content = get_html(json_request_url) + data = json.loads(content) + + title = data["title"] + real_url = "http://flv.quanmin.tv/live/{}.flv".format(roomid) + + print_info(site_info, title, 'flv', float('inf')) + if not info_only: + download_urls([real_url], title, 'flv', None, output_dir, merge = merge) + +site_info = "quanmin.tv" +download = quanmin_download +download_playlist = playlist_not_supported('quanmin') From fc2c77effaae54970e40246a1ceded8bcced6dc5 Mon Sep 17 00:00:00 2001 From: Valdemar Erk Date: Wed, 4 Jan 2017 13:56:32 +0100 Subject: [PATCH 044/765] Fixes quanmin, when stream is offline. 
--- src/you_get/extractors/quanmin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/you_get/extractors/quanmin.py b/src/you_get/extractors/quanmin.py index 99e8790c..89d63ea9 100644 --- a/src/you_get/extractors/quanmin.py +++ b/src/you_get/extractors/quanmin.py @@ -9,11 +9,13 @@ import time def quanmin_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): roomid = url[url.rfind("/")+1:] json_request_url = 'http://www.quanmin.tv/json/rooms/{}/info4.json'.format(roomid) - content = get_html(json_request_url) data = json.loads(content) title = data["title"] + + if not data["play_status"]: + raise ValueError("The live stream is not online!") real_url = "http://flv.quanmin.tv/live/{}.flv".format(roomid) print_info(site_info, title, 'flv', float('inf')) From f452eec729ac961c35043a11007f4fd1bfb79c20 Mon Sep 17 00:00:00 2001 From: lilydjwg Date: Sun, 8 Jan 2017 21:36:03 +0800 Subject: [PATCH 045/765] [qq] support for videos embedded in weixin example url: http://mp.weixin.qq.com/s?__biz=MzA3OTgxODI4NQ==&mid=2653200488&idx=1&sn=bd6d0279b2430cc208d9da74226871db&chksm=847dbb2ab30a323c4b1735887158daf1e295abe586aff0a646ce4257a48010f80bcfb1379c95&scene=0#rd --- src/you_get/extractors/qq.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/qq.py b/src/you_get/extractors/qq.py index c9ee7c0f..f2c3d9ec 100644 --- a/src/you_get/extractors/qq.py +++ b/src/you_get/extractors/qq.py @@ -73,7 +73,14 @@ def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False): def qq_download(url, output_dir='.', merge=True, info_only=False, **kwargs): """""" if 'live.qq.com' in url: - qieDownload(url,output_dir=output_dir, merge=merge, info_only=info_only) + qieDownload(url, output_dir=output_dir, merge=merge, info_only=info_only) + return + + if 'mp.weixin.qq.com/s?' 
in url: + content = get_html(url) + vids = matchall(content, [r'\bvid=(\w+)']) + for vid in vids: + qq_download_by_vid(vid, vid, output_dir, merge, info_only) return #do redirect @@ -101,8 +108,6 @@ def qq_download(url, output_dir='.', merge=True, info_only=False, **kwargs): title = match1(content, r'"title":"([^"]+)"') if not title else title title = vid if not title else title #general fallback - - qq_download_by_vid(vid, title, output_dir, merge, info_only) site_info = "QQ.com" From 64dca2182e3a507b516dca7ed0adfc9102904f1f Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 9 Jan 2017 01:14:03 +0100 Subject: [PATCH 046/765] [youku] do not override existing proxy handler (fix #1546, close #1548) --- src/you_get/extractors/youku.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index 853a75ba..d673e58c 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -143,9 +143,9 @@ class Youku(VideoExtractor): }) else: proxy_handler = request.ProxyHandler({}) - opener = request.build_opener(ssl_context, cookie_handler, proxy_handler) - opener.addheaders = [('Cookie','__ysuid={}'.format(time.time()))] - request.install_opener(opener) + for handler in (ssl_context, cookie_handler, proxy_handler): + request._opener.add_handler(handler) + request._opener.addheaders = [('Cookie','__ysuid={}'.format(time.time()))] assert self.url or self.vid @@ -162,7 +162,7 @@ class Youku(VideoExtractor): api12_url = kwargs['api12_url'] #86 self.ctype = kwargs['ctype'] self.title = kwargs['title'] - + else: api_url = 'http://play.youku.com/play/get.json?vid=%s&ct=10' % self.vid api12_url = 'http://play.youku.com/play/get.json?vid=%s&ct=12' % self.vid @@ -330,36 +330,36 @@ class Youku(VideoExtractor): def open_download_by_vid(self, client_id, vid, **kwargs): """self, str, str, **kwargs->None - + Arguments: client_id: An ID per client. 
For now we only know Acfun's such ID. - + vid: An video ID for each video, starts with "C". - + kwargs['embsig']: Youku COOP's anti hotlinking. For Acfun, an API call must be done to Acfun's server, or the "playsign" of the content of sign_url shall be empty. - + Misc: Override the original one with VideoExtractor. - + Author: Most of the credit are to @ERioK, who gave his POC. - + History: Jul.28.2016 Youku COOP now have anti hotlinking via embsig. """ self.f_code_1 = '10ehfkbv' #can be retrived by running r.translate with the keys and the list e self.f_code_2 = 'msjv7h2b' - + # as in VideoExtractor self.url = None self.vid = vid self.name = "优酷开放平台 (Youku COOP)" #A little bit of work before self.prepare - + #Change as Jul.28.2016 Youku COOP updates its platform to add ant hotlinking if kwargs['embsig']: sign_url = "https://api.youku.com/players/custom.json?client_id={client_id}&video_id={video_id}&embsig={embsig}".format(client_id = client_id, video_id = vid, embsig = kwargs['embsig']) @@ -371,9 +371,9 @@ class Youku(VideoExtractor): #to be injected and replace ct10 and 12 api85_url = 'http://play.youku.com/partner/get.json?cid={client_id}&vid={vid}&ct=85&sign={playsign}'.format(client_id = client_id, vid = vid, playsign = playsign) api86_url = 'http://play.youku.com/partner/get.json?cid={client_id}&vid={vid}&ct=86&sign={playsign}'.format(client_id = client_id, vid = vid, playsign = playsign) - + self.prepare(api_url = api85_url, api12_url = api86_url, ctype = 86, **kwargs) - + #exact copy from original VideoExtractor if 'extractor_proxy' in kwargs and kwargs['extractor_proxy']: unset_proxy() From 4b782f92be59e92ad38c3b44fe09d2be3e20c582 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 10 Jan 2017 17:25:37 +0100 Subject: [PATCH 047/765] [nanagogo] skip erroneous posts --- src/you_get/extractors/nanagogo.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/nanagogo.py b/src/you_get/extractors/nanagogo.py index 222659f6..9cce9e4c 100644 
--- a/src/you_get/extractors/nanagogo.py +++ b/src/you_get/extractors/nanagogo.py @@ -17,6 +17,8 @@ def nanagogo_download(url, output_dir='.', merge=True, info_only=False, **kwargs info = json.loads(get_content(api_url)) items = [] + if info['data']['posts']['post'] is None: + return for i in info['data']['posts']['post']['body']: if 'image' in i: image_url = i['image'] From c401c9b9f83050873fddc2c2ac26fc5e79984e35 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 10 Jan 2017 17:31:57 +0100 Subject: [PATCH 048/765] [bilibili] fix #1605 --- src/you_get/extractors/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index aecb072c..920ab779 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -127,7 +127,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs if re.match(r'https?://bangumi\.bilibili\.com/', url): # quick hack for bangumi URLs - episode_id = r1(r'first_ep_id = "(\d+)"', html) + episode_id = r1(r'#(\d+)$', url) or r1(r'first_ep_id = "(\d+)"', html) cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data={'episode_id': episode_id}) cid = json.loads(cont)['result']['cid'] From a7cd3e2c6e5019dbc07d4c974fe0a751095555bf Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 10 Jan 2017 17:45:09 +0100 Subject: [PATCH 049/765] [bilibili] bangumi titling with episode_id --- src/you_get/extractors/bilibili.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index 920ab779..5f00ffe9 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -131,6 +131,7 @@ def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs cont = post_content('http://bangumi.bilibili.com/web_api/get_source', post_data={'episode_id': episode_id}) cid = 
json.loads(cont)['result']['cid'] + title = '%s [%s]' % (title, episode_id) bilibili_download_by_cid(str(cid), title, output_dir=output_dir, merge=merge, info_only=info_only) else: From 866876e59ffefef55353c4a6ca819681014ab763 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 10 Jan 2017 17:46:04 +0100 Subject: [PATCH 050/765] version 0.4.626 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 28919906..2e8e4f41 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.595' +__version__ = '0.4.626' From 7eca091d0df30f84520f3b665754828f33be95ae Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Tue, 10 Jan 2017 18:45:28 +0100 Subject: [PATCH 051/765] tag classifier: Python 3.6 --- you-get.json | 1 + 1 file changed, 1 insertion(+) diff --git a/you-get.json b/you-get.json index 084657d9..594742c2 100644 --- a/you-get.json +++ b/you-get.json @@ -24,6 +24,7 @@ "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", "Topic :: Internet", "Topic :: Internet :: WWW/HTTP", "Topic :: Multimedia", From 0c1553b97d981a5ab0ffc7605b8c70646423ce3f Mon Sep 17 00:00:00 2001 From: Valdemar Erk Date: Sun, 15 Jan 2017 12:43:34 +0100 Subject: [PATCH 052/765] Fixes #1612 --- src/you_get/extractors/panda.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/you_get/extractors/panda.py b/src/you_get/extractors/panda.py index 3f9ceade..45249bd2 100644 --- a/src/you_get/extractors/panda.py +++ b/src/you_get/extractors/panda.py @@ -8,22 +8,28 @@ import time def panda_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): roomid = url[url.rfind('/')+1:] - json_request_url = 'http://www.panda.tv/api_room?roomid={}&pub_key=&_={}'.format(roomid, 
int(time.time())) + json_request_url ="http://www.panda.tv/api_room_v2?roomid={}&__plat=pc_web&_={}".format(roomid, int(time.time())) content = get_html(json_request_url) - errno = json.loads(content)['errno'] - errmsg = json.loads(content)['errmsg'] + api_json = json.loads(content) + + errno = api_json["errno"] + errmsg = api_json["errmsg"] if errno: raise ValueError("Errno : {}, Errmsg : {}".format(errno, errmsg)) - - data = json.loads(content)['data'] - title = data.get('roominfo')['name'] - room_key = data.get('videoinfo')['room_key'] - plflag = data.get('videoinfo')['plflag'].split('_') - status = data.get('videoinfo')['status'] + data = api_json["data"] + title = data["roominfo"]["name"] + room_key = data["videoinfo"]["room_key"] + plflag = data["videoinfo"]["plflag"].split("_") + status = data["videoinfo"]["status"] if status is not "2": raise ValueError("The live stream is not online! (status:%s)" % status) - real_url = 'http://pl{}.live.panda.tv/live_panda/{}.flv'.format(plflag[1],room_key) + data2 = json.loads(data["videoinfo"]["plflag_list"]) + rid = data2["auth"]["rid"] + sign = data2["auth"]["sign"] + ts = data2["auth"]["time"] + real_url = "http://pl{}.live.panda.tv/live_panda/{}.flv?sign={}&ts={}&rid={}".format(plflag[1], room_key, sign, ts, rid) + print_info(site_info, title, 'flv', float('inf')) if not info_only: download_urls([real_url], title, 'flv', None, output_dir, merge = merge) From 374e1032db23cebb5f8f22a6de5eff4950bd7bf2 Mon Sep 17 00:00:00 2001 From: JayXon Date: Mon, 16 Jan 2017 09:56:24 -0800 Subject: [PATCH 053/765] [common] also retry if timeout in url_save, post_content, etc. 
--- src/you_get/common.py | 51 ++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index f320f6ab..bea6e62c 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -298,6 +298,13 @@ def get_location(url): # not to do that return response.geturl() +def urlopen_with_retry(*args, **kwargs): + for i in range(10): + try: + return request.urlopen(*args, **kwargs) + except socket.timeout: + logging.debug('request attempt %s timeout' % str(i + 1)) + def get_content(url, headers={}, decoded=True): """Gets the content of a URL via sending a HTTP GET request. @@ -317,13 +324,7 @@ def get_content(url, headers={}, decoded=True): cookies.add_cookie_header(req) req.headers.update(req.unredirected_hdrs) - for i in range(10): - try: - response = request.urlopen(req) - break - except socket.timeout: - logging.debug('request attempt %s timeout' % str(i + 1)) - + response = urlopen_with_retry(req) data = response.read() # Handle HTTP compression for gzip and deflate (zlib) @@ -362,7 +363,7 @@ def post_content(url, headers={}, post_data={}, decoded=True): cookies.add_cookie_header(req) req.headers.update(req.unredirected_hdrs) post_data_enc = bytes(parse.urlencode(post_data), 'utf-8') - response = request.urlopen(req, data = post_data_enc) + response = urlopen_with_retry(req, data=post_data_enc) data = response.read() # Handle HTTP compression for gzip and deflate (zlib) @@ -384,11 +385,11 @@ def post_content(url, headers={}, post_data={}, decoded=True): def url_size(url, faker = False, headers = {}): if faker: - response = request.urlopen(request.Request(url, headers = fake_headers), None) + response = urlopen_with_retry(request.Request(url, headers=fake_headers)) elif headers: - response = request.urlopen(request.Request(url, headers = headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) else: - response = request.urlopen(url) + response = 
urlopen_with_retry(url) size = response.headers['content-length'] return int(size) if size!=None else float('inf') @@ -398,20 +399,20 @@ def urls_size(urls, faker = False, headers = {}): def get_head(url, headers = {}, get_method = 'HEAD'): if headers: - req = request.Request(url, headers = headers) + req = request.Request(url, headers=headers) else: req = request.Request(url) - req.get_method = lambda : get_method - res = request.urlopen(req) + req.get_method = lambda: get_method + res = urlopen_with_retry(req) return dict(res.headers) def url_info(url, faker = False, headers = {}): if faker: - response = request.urlopen(request.Request(url, headers = fake_headers), None) + response = urlopen_with_retry(request.Request(url, headers=fake_headers)) elif headers: - response = request.urlopen(request.Request(url, headers = headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) else: - response = request.urlopen(request.Request(url)) + response = urlopen_with_retry(request.Request(url)) headers = response.headers @@ -460,11 +461,11 @@ def url_locations(urls, faker = False, headers = {}): locations = [] for url in urls: if faker: - response = request.urlopen(request.Request(url, headers = fake_headers), None) + response = urlopen_with_retry(request.Request(url, headers=fake_headers)) elif headers: - response = request.urlopen(request.Request(url, headers = headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) else: - response = request.urlopen(request.Request(url)) + response = urlopen_with_retry(request.Request(url)) locations.append(response.url) return locations @@ -514,10 +515,10 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, h if refer: headers['Referer'] = refer - response = request.urlopen(request.Request(url, headers = headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) try: range_start = 
int(response.headers['content-range'][6:].split('/')[0].split('-')[0]) - end_length = end = int(response.headers['content-range'][6:].split('/')[1]) + end_length = int(response.headers['content-range'][6:].split('/')[1]) range_length = end_length - range_start except: content_length = response.headers['content-length'] @@ -537,7 +538,7 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, h break else: # Unexpected termination. Retry request headers['Range'] = 'bytes=' + str(received) + '-' - response = request.urlopen(request.Request(url, headers = headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) output.write(buffer) received += len(buffer) if bar: @@ -597,7 +598,7 @@ def url_save_chunked(url, filepath, bar, dyn_callback=None, chunk_size=0, ignore if refer: headers['Referer'] = refer - response = request.urlopen(request.Request(url, headers=headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) with open(temp_filepath, open_mode) as output: this_chunk = received @@ -610,7 +611,7 @@ def url_save_chunked(url, filepath, bar, dyn_callback=None, chunk_size=0, ignore if chunk_size and (received - this_chunk) >= chunk_size: url = dyn_callback(received) this_chunk = received - response = request.urlopen(request.Request(url, headers=headers), None) + response = urlopen_with_retry(request.Request(url, headers=headers)) if bar: bar.update_received(len(buffer)) From 0f131e38d4b7fed6cb232aa346df01858547f921 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 16 Jan 2017 23:29:21 +0100 Subject: [PATCH 054/765] [facebook] fix #1615 --- src/you_get/extractors/facebook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/facebook.py b/src/you_get/extractors/facebook.py index 2a96fcb0..9eb9fae9 100644 --- a/src/you_get/extractors/facebook.py +++ b/src/you_get/extractors/facebook.py @@ -11,11 +11,11 @@ def facebook_download(url, output_dir='.', 
merge=True, info_only=False, **kwargs title = r1(r'(.+)', html) sd_urls = list(set([ unicodize(str.replace(i, '\\/', '/')) - for i in re.findall(r'"sd_src_no_ratelimit":"([^"]*)"', html) + for i in re.findall(r'sd_src_no_ratelimit:"([^"]*)"', html) ])) hd_urls = list(set([ unicodize(str.replace(i, '\\/', '/')) - for i in re.findall(r'"hd_src_no_ratelimit":"([^"]*)"', html) + for i in re.findall(r'hd_src_no_ratelimit:"([^"]*)"', html) ])) urls = hd_urls if hd_urls else sd_urls From 015871dfa96d480ceed982ecdf45f911ee5b34a8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 16 Jan 2017 23:49:27 +0100 Subject: [PATCH 055/765] [acfun] correct active p title, fix #1617 --- src/you_get/extractors/acfun.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py index 87e005fb..6bb0dca4 100644 --- a/src/you_get/extractors/acfun.py +++ b/src/you_get/extractors/acfun.py @@ -77,6 +77,8 @@ def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs): title = unescape_html(title) title = escape_file_path(title) assert title + if match1(url, r'_(\d+)$'): # current P + title = title + " " + r1(r'active">([^<]*)', html) vid = r1('data-vid="(\d+)"', html) up = r1('data-name="([^"]+)"', html) From a520eb051e797b70eddfecaf5c934259c071bf3c Mon Sep 17 00:00:00 2001 From: AlanYang Date: Thu, 19 Jan 2017 11:15:42 +0800 Subject: [PATCH 056/765] fixed mgtv.com 1.17 change api address and stream domain --- src/you_get/extractors/mgtv.py | 9 +++++---- src/you_get/json_output.py | 5 +++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/mgtv.py b/src/you_get/extractors/mgtv.py index 3ce62efe..1656ac3c 100644 --- a/src/you_get/extractors/mgtv.py +++ b/src/you_get/extractors/mgtv.py @@ -21,7 +21,7 @@ class MGTV(VideoExtractor): id_dic = {i['video_profile']:(i['id']) for i in stream_types} - api_endpoint = 'http://v.api.mgtv.com/player/video?video_id={video_id}' + api_endpoint = 
'http://pcweb.api.mgtv.com/player/video?video_id={video_id}' @staticmethod def get_vid_from_url(url): @@ -63,6 +63,7 @@ class MGTV(VideoExtractor): content = get_content(self.api_endpoint.format(video_id = self.vid)) content = loads(content) self.title = content['data']['info']['title'] + domain = content['data']['stream_domain'][0] #stream_avalable = [i['name'] for i in content['data']['stream']] stream_available = {} @@ -73,7 +74,7 @@ class MGTV(VideoExtractor): if s['video_profile'] in stream_available.keys(): quality_id = self.id_dic[s['video_profile']] url = stream_available[s['video_profile']] - url = re.sub( r'(\&arange\=\d+)', '', url) #Un-Hum + url = domain + re.sub( r'(\&arange\=\d+)', '', url) #Un-Hum m3u8_url, m3u8_size, segment_list_this = self.get_mgtv_real_url(url) stream_fileid_list = [] @@ -144,8 +145,8 @@ class MGTV(VideoExtractor): else: download_urls(stream_info['src'], self.title, stream_info['container'], stream_info['size'], output_dir=kwargs['output_dir'], - merge=kwargs['merge'], - av=stream_id in self.dash_streams) + merge=kwargs.get('merge', True)) + # av=stream_id in self.dash_streams) site = MGTV() download = site.download_by_url diff --git a/src/you_get/json_output.py b/src/you_get/json_output.py index 86a42abc..3e1bac9f 100644 --- a/src/you_get/json_output.py +++ b/src/you_get/json_output.py @@ -31,6 +31,11 @@ def print_info(site_info=None, title=None, type=None, size=None): def download_urls(urls=None, title=None, ext=None, total_size=None, refer=None): ve = last_info + if not ve: + ve = VideoExtractor() + ve.name = '' + ve.url = urls + ve.title=title # save download info in streams stream = {} stream['container'] = ext From fc1646d74ea14012a03dc17aad395b5c5f1554b3 Mon Sep 17 00:00:00 2001 From: haoflynet Date: Sun, 22 Jan 2017 23:35:23 +0800 Subject: [PATCH 057/765] fix youku.py bug --- src/you_get/extractors/youku.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/you_get/extractors/youku.py 
b/src/you_get/extractors/youku.py index d673e58c..65fcbc27 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -143,6 +143,9 @@ class Youku(VideoExtractor): }) else: proxy_handler = request.ProxyHandler({}) + if not request._opener: + opener = request.build_opener(proxy_handler) + request.install_opener(opener) for handler in (ssl_context, cookie_handler, proxy_handler): request._opener.add_handler(handler) request._opener.addheaders = [('Cookie','__ysuid={}'.format(time.time()))] From 61225b1552df86dbecf1be22c6b5433cd3412f44 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Tue, 24 Jan 2017 12:36:57 +0800 Subject: [PATCH 058/765] fix non-ascii url --- src/you_get/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/you_get/common.py b/src/you_get/common.py index bea6e62c..51b81cad 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -259,6 +259,7 @@ def undeflate(data): # DEPRECATED in favor of get_content() def get_response(url, faker = False): + url = parse.quote(url,':/') # install cookies if cookies: opener = request.build_opener(request.HTTPCookieProcessor(cookies)) From 10624ca5b34e542bb9004765889499dc0341d698 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 25 Jan 2017 21:21:09 +0100 Subject: [PATCH 059/765] [google] add UA in get_html --- src/you_get/extractors/google.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/google.py b/src/you_get/extractors/google.py index 18483920..febac780 100644 --- a/src/you_get/extractors/google.py +++ b/src/you_get/extractors/google.py @@ -51,7 +51,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw # attempt to extract images first # TBD: posts with > 4 images # TBD: album links - html = get_html(parse.unquote(url)) + html = get_html(parse.unquote(url), faker=True) real_urls = [] for src in re.findall(r'src="([^"]+)"[^>]*itemprop="image"', html): t = src.split('/') @@ -66,7 +66,7 
@@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw try: url = "https://plus.google.com/" + r1(r'"(photos/\d+/albums/\d+/\d+)', html) - html = get_html(url) + html = get_html(url, faker=True) temp = re.findall(r'\[(\d+),\d+,\d+,"([^"]+)"\]', html) temp = sorted(temp, key = lambda x : fmt_level[x[0]]) urls = [unicodize(i[1]) for i in temp if i[0] == temp[0][0]] @@ -77,7 +77,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw post_author = r1(r'/\+([^/]+)/posts', post_url) if post_author: post_url = "https://plus.google.com/+%s/posts/%s" % (parse.quote(post_author), r1(r'posts/(.+)', post_url)) - post_html = get_html(post_url) + post_html = get_html(post_url, faker=True) title = r1(r']*>([^<\n]+)', post_html) if title is None: @@ -98,7 +98,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw elif service in ['docs', 'drive'] : # Google Docs - html = get_html(url) + html = get_html(url, faker=True) title = r1(r'"title":"([^"]*)"', html) or r1(r' 1: From f299d30161f2017318211099979845192a891025 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 25 Jan 2017 21:21:49 +0100 Subject: [PATCH 060/765] [common] update fake_headers --- src/you_get/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index bea6e62c..9ee38821 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -131,7 +131,7 @@ fake_headers = { 'Accept-Charset': 'UTF-8,*;q=0.5', 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'en-US,en;q=0.8', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20100101 Firefox/13.0' + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0' } if sys.stdout.isatty(): From 4108e2112deac199fe948fdcf3793148fea3a141 Mon Sep 17 00:00:00 2001 From: Justsoos Date: Thu, 26 Jan 2017 16:31:56 +0800 Subject: [PATCH 061/765] fix:[zhanqi.tv]recode all --- 
src/you_get/extractors/zhanqi.py | 99 +++++++++++++------------------- 1 file changed, 39 insertions(+), 60 deletions(-) diff --git a/src/you_get/extractors/zhanqi.py b/src/you_get/extractors/zhanqi.py index 7d6b75b6..25e7e132 100644 --- a/src/you_get/extractors/zhanqi.py +++ b/src/you_get/extractors/zhanqi.py @@ -3,73 +3,52 @@ __all__ = ['zhanqi_download'] from ..common import * -import re -import base64 import json -import time -import hashlib -def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): - html = get_content(url) - video_type_patt = r'VideoType":"([^"]+)"' - video_type = match1(html, video_type_patt) +def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):#the programmers of zhanqi are noobs + host_name = url.split('/')[2] + first_folder_path = url.split('/')[3] + + if first_folder_path != 'videos': #url = "https://www.zhanqi.tv/huashan" + if first_folder_path == 'topic': #https://www.zhanqi.tv/topic/lyingman + first_folder_path = url.split('/')[4] + api_url = "https://www.zhanqi.tv/api/static/v2.1/room/domain/" + first_folder_path + ".json" + api_json = json.loads(get_html(api_url)) + data = api_json['data'] + status = data['status'] + if status != '4': + raise ValueError ("The live stream is not online!") + + nickname = data['nickname'] + title = nickname + ": " + data['title'] + + roomid = data['id'] + videoId = data['videoId'] + jump_url = "http://wshdl.load.cdn.zhanqi.tv/zqlive/" + videoId + ".flv?get_url=1" + jump_url = jump_url.strip('\r\n') + + real_url = get_html(jump_url) + real_url = real_url.strip('\r\n') - #rtmp_base_patt = r'VideoUrl":"([^"]+)"' - rtmp_id_patt = r'videoId":"([^"]+)"' - vod_m3u8_id_patt = r'VideoID":"([^"]+)"' - title_patt = r'

    ([^<]+)

    ' - title_patt_backup = r'([^<]{1,9999})' - title = match1(html, title_patt) or match1(html, title_patt_backup) - title = unescape_html(title) - rtmp_base = "http://wshdl.load.cdn.zhanqi.tv/zqlive" - vod_base = "http://dlvod.cdn.zhanqi.tv" - rtmp_real_base = "rtmp://dlrtmp.cdn.zhanqi.tv/zqlive/" - room_info = "http://www.zhanqi.tv/api/static/live.roomid/" - KEY_MASK = "#{&..?!(" - ak2_pattern = r'ak2":"\d-([^|]+)' - - if video_type == "LIVE": - rtmp_id = match1(html, rtmp_id_patt).replace('\\/','/') - #request_url = rtmp_base+'/'+rtmp_id+'.flv?get_url=1' - #real_url = get_html(request_url) - html2 = get_content(room_info + rtmp_id.split("_")[0] + ".json") - json_data = json.loads(html2) - cdns = json_data["data"]["flashvars"]["cdns"] - cdns = base64.b64decode(cdns).decode("utf-8") - cdn = match1(cdns, ak2_pattern) - cdn = base64.b64decode(cdn).decode("utf-8") - key = '' - i = 0 - while(i < len(cdn)): - key = key + chr(ord(cdn[i]) ^ ord(KEY_MASK[i % 8])) - i = i + 1 - time_hex = hex(int(time.time()))[2:] - key = hashlib.md5(bytes(key + "/zqlive/" + rtmp_id + time_hex, "utf-8")).hexdigest() - real_url = rtmp_real_base + '/' + rtmp_id + "?k=" + key + "&t=" + time_hex print_info(site_info, title, 'flv', float('inf')) if not info_only: - download_rtmp_url(real_url, title, 'flv', {}, output_dir, merge = merge) - #download_urls([real_url], title, 'flv', None, output_dir, merge = merge) - elif video_type == "VOD": - vod_m3u8_request = vod_base + match1(html, vod_m3u8_id_patt).replace('\\/','/') - vod_m3u8 = get_html(vod_m3u8_request) - part_url = re.findall(r'(/[^#]+)\.ts',vod_m3u8) - real_url = [] - for i in part_url: - i = vod_base + i + ".ts" - real_url.append(i) - type_ = '' - size = 0 - for url in real_url: - _, type_, temp = url_info(url) - size += temp or 0 + download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) - print_info(site_info, title, type_ or 'ts', size) + else: #url = 
'https://www.zhanqi.tv/videos/Lyingman/2017/01/182308.html' + video_id = url.split('/')[-1].split('.')[0] + api_url = "https://www.zhanqi.tv/api/static/v2.1/video/" + video_id + ".json" + api_json = json.loads(get_html(api_url)) + data = api_json['data'] + + title = data['title'] + + video_url_id = data['flashvars']['VideoID'] + real_url = "http://dlvod.cdn.zhanqi.tv/" + video_url_id + + print_info(site_info, title, 'flv', float('inf')) if not info_only: - download_urls(real_url, title, type_ or 'ts', size, output_dir, merge = merge) - else: - NotImplementedError('Unknown_video_type') + download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) site_info = "zhanqi.tv" download = zhanqi_download -download_playlist = playlist_not_supported('zhanqi') +download_playlist = playlist_not_supported('zhanqi') \ No newline at end of file From 15ae8feb5b5e4467e5eed54ff18b32021efaa813 Mon Sep 17 00:00:00 2001 From: Justsoos Date: Sat, 28 Jan 2017 03:08:54 +0800 Subject: [PATCH 062/765] little fix --- src/you_get/extractors/zhanqi.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/you_get/extractors/zhanqi.py b/src/you_get/extractors/zhanqi.py index 25e7e132..f2c673ca 100644 --- a/src/you_get/extractors/zhanqi.py +++ b/src/you_get/extractors/zhanqi.py @@ -5,13 +5,13 @@ __all__ = ['zhanqi_download'] from ..common import * import json -def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):#the programmers of zhanqi are noobs +def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): host_name = url.split('/')[2] - first_folder_path = url.split('/')[3] + first_folder_path = url.split('/')[3].split('?')[0] - if first_folder_path != 'videos': #url = "https://www.zhanqi.tv/huashan" + if first_folder_path != 'videos': #url = "https://www.zhanqi.tv/huashan?param_s=1_0.2.0" if first_folder_path == 'topic': #https://www.zhanqi.tv/topic/lyingman - 
first_folder_path = url.split('/')[4] + first_folder_path = url.split('/')[4].split('?')[0] api_url = "https://www.zhanqi.tv/api/static/v2.1/room/domain/" + first_folder_path + ".json" api_json = json.loads(get_html(api_url)) data = api_json['data'] @@ -29,13 +29,15 @@ def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kw real_url = get_html(jump_url) real_url = real_url.strip('\r\n') + site_info = "www.zhanqi.tv" print_info(site_info, title, 'flv', float('inf')) if not info_only: download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) else: #url = 'https://www.zhanqi.tv/videos/Lyingman/2017/01/182308.html' - video_id = url.split('/')[-1].split('.')[0] + video_id = url.split('/')[-1].split('?')[0].split('.')[0] + assert video_id api_url = "https://www.zhanqi.tv/api/static/v2.1/video/" + video_id + ".json" api_json = json.loads(get_html(api_url)) data = api_json['data'] @@ -44,11 +46,11 @@ def zhanqi_download(url, output_dir = '.', merge = True, info_only = False, **kw video_url_id = data['flashvars']['VideoID'] real_url = "http://dlvod.cdn.zhanqi.tv/" + video_url_id + site_info = "www.zhanqi.tv/videos" print_info(site_info, title, 'flv', float('inf')) if not info_only: download_url_ffmpeg(real_url, title, 'flv', {}, output_dir = output_dir, merge = merge) -site_info = "zhanqi.tv" download = zhanqi_download download_playlist = playlist_not_supported('zhanqi') \ No newline at end of file From 753879b49736e314b08c2122ddeef550a06646f8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sat, 28 Jan 2017 03:20:17 +0100 Subject: [PATCH 063/765] [netease] fix #1642 --- src/you_get/extractors/netease.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/you_get/extractors/netease.py b/src/you_get/extractors/netease.py index d5f3b1fa..17ae70a9 100644 --- a/src/you_get/extractors/netease.py +++ b/src/you_get/extractors/netease.py @@ -22,9 +22,9 @@ def netease_hymn(): """ def 
netease_cloud_music_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - rid = match1(url, r'id=(.*)') + rid = match1(url, r'\Wid=(.*)') if rid is None: - rid = match1(url, r'/(\d+)/?$') + rid = match1(url, r'/(\d+)/?') if "album" in url: j = loads(get_content("http://music.163.com/api/album/%s?id=%s&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) From 7d72596f06b46299c60ff0c1761cb9167060606a Mon Sep 17 00:00:00 2001 From: Justsoos Date: Sun, 29 Jan 2017 18:37:10 +0800 Subject: [PATCH 064/765] fix quanmin.py fix #1653 --- src/you_get/extractors/quanmin.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/quanmin.py b/src/you_get/extractors/quanmin.py index 89d63ea9..668e84f9 100644 --- a/src/you_get/extractors/quanmin.py +++ b/src/you_get/extractors/quanmin.py @@ -7,13 +7,15 @@ import json import time def quanmin_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): - roomid = url[url.rfind("/")+1:] - json_request_url = 'http://www.quanmin.tv/json/rooms/{}/info4.json'.format(roomid) + roomid = url.split('/')[3].split('?')[0] #add ?parameter.split and change the unstable url(from the back) rfind to split(from the front) + + #json_request_url = 'http://www.quanmin.tv/json/rooms/{}/info4.json'.format(roomid) #"http://www.quanmin.tv/json/rooms/308137/info4.json" switch to "http://m.quanmin.tv/json/rooms/308137/noinfo6.json" + json_request_url = 'http://m.quanmin.tv/json/rooms/{}/noinfo6.json'.format(roomid) content = get_html(json_request_url) data = json.loads(content) title = data["title"] - + if not data["play_status"]: raise ValueError("The live stream is not online!") real_url = "http://flv.quanmin.tv/live/{}.flv".format(roomid) From 5139b40b44265128088724a2619f3a3258728517 Mon Sep 17 00:00:00 2001 From: l34p Date: Wed, 1 Feb 2017 21:07:59 +0900 Subject: [PATCH 065/765] [youtube] fix broken link of html5player --- src/you_get/extractors/youtube.py | 6 
+++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index c403cb74..b0097f13 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -165,7 +165,7 @@ class YouTube(VideoExtractor): video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) try: ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) - self.html5player = 'https:' + ytplayer_config['assets']['js'] + self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] # Workaround: get_video_info returns bad s. Why? stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') except: @@ -177,7 +177,7 @@ class YouTube(VideoExtractor): ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) self.title = ytplayer_config['args']['title'] - self.html5player = 'https:' + ytplayer_config['assets']['js'] + self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') elif video_info['status'] == ['fail']: @@ -193,7 +193,7 @@ class YouTube(VideoExtractor): # 150 Restricted from playback on certain sites # Parse video page instead self.title = ytplayer_config['args']['title'] - self.html5player = 'https:' + ytplayer_config['assets']['js'] + self.html5player = 'https://www.youtube.com' + ytplayer_config['assets']['js'] stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') else: log.wtf('[Error] The uploader has not made this video available in your country.') From 2f4dc0f9a0000ed7ab6ecbfc7d903eed3c71a49d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 17:33:57 +0100 Subject: [PATCH 066/765] [google] quick fix for Google+ videos --- src/you_get/extractors/google.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/you_get/extractors/google.py b/src/you_get/extractors/google.py index febac780..1f2c354c 100644 --- a/src/you_get/extractors/google.py +++ b/src/you_get/extractors/google.py @@ -65,7 +65,7 @@ def google_download(url, output_dir = '.', merge = True, info_only = False, **kw title = post_date + "_" + post_id try: - url = "https://plus.google.com/" + r1(r'"(photos/\d+/albums/\d+/\d+)', html) + url = "https://plus.google.com/" + r1(r'(photos/\d+/albums/\d+/\d+)\?authkey', html) html = get_html(url, faker=True) temp = re.findall(r'\[(\d+),\d+,\d+,"([^"]+)"\]', html) temp = sorted(temp, key = lambda x : fmt_level[x[0]]) From 8afb998d59be335b4746f1792d317e5f5386a5f1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 23:39:46 +0100 Subject: [PATCH 067/765] Remove dead sites (2017-02-01) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * JPopsuki TV http://www.jpopsuki.tv/ * 天天动听 http://www.dongting.com/ * THVideo http://thvideo.tv/ * 阡陌视频 http://qianmo.com/ --- README.md | 4 -- src/you_get/common.py | 4 -- src/you_get/extractors/__init__.py | 3 -- src/you_get/extractors/dongting.py | 55 -------------------- src/you_get/extractors/jpopsuki.py | 23 --------- src/you_get/extractors/qianmo.py | 40 -------------- src/you_get/extractors/thvideo.py | 83 ------------------------------ 7 files changed, 212 deletions(-) delete mode 100644 src/you_get/extractors/dongting.py delete mode 100644 src/you_get/extractors/jpopsuki.py delete mode 100644 src/you_get/extractors/qianmo.py delete mode 100644 src/you_get/extractors/thvideo.py diff --git a/README.md b/README.md index 98c403c3..57f49a68 100644 --- a/README.md +++ b/README.md @@ -347,7 +347,6 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | Metacafe | |✓| | | | Magisto | |✓| | | | Khan Academy | |✓| | | -| JPopsuki TV | |✓| | | | Internet Archive | |✓| | | | **Instagram** | |✓|✓| | | InfoQ | |✓| | | @@ -392,11 +391,8 @@ Use `--url`/`-u` to 
get a list of downloadable resource URLs extracted from the | 齐鲁网 | |✓| | | | QQ
    腾讯视频 | |✓| | | | 企鹅直播 | |✓| | | -| 阡陌视频 | |✓| | | -| THVideo | |✓| | | | Sina
    新浪视频
    微博秒拍视频 |
    |✓| | | | Sohu
    搜狐视频 | |✓| | | -| 天天动听 | | | |✓| | **Tudou
    土豆** | |✓| | | | 虾米 | | | |✓| | 阳光卫视 | |✓| | | diff --git a/src/you_get/common.py b/src/you_get/common.py index 9ee38821..a4aea070 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -15,7 +15,6 @@ SITES = { 'cbs' : 'cbs', 'dailymotion' : 'dailymotion', 'dilidili' : 'dilidili', - 'dongting' : 'dongting', 'douban' : 'douban', 'douyu' : 'douyutv', 'ehow' : 'ehow', @@ -40,7 +39,6 @@ SITES = { 'iqiyi' : 'iqiyi', 'isuntv' : 'suntv', 'joy' : 'joy', - 'jpopsuki' : 'jpopsuki', 'kankanews' : 'bilibili', 'khanacademy' : 'khan', 'ku6' : 'ku6', @@ -63,7 +61,6 @@ SITES = { 'pinterest' : 'pinterest', 'pixnet' : 'pixnet', 'pptv' : 'pptv', - 'qianmo' : 'qianmo', 'qq' : 'qq', 'quanmin' : 'quanmin', 'showroom-live' : 'showroom', @@ -73,7 +70,6 @@ SITES = { 'soundcloud' : 'soundcloud', 'ted' : 'ted', 'theplatform' : 'theplatform', - 'thvideo' : 'thvideo', 'tucao' : 'tucao', 'tudou' : 'tudou', 'tumblr' : 'tumblr', diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 61b6a0d1..a027c396 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -33,7 +33,6 @@ from .interest import * from .iqilu import * from .iqiyi import * from .joy import * -from .jpopsuki import * from .ku6 import * from .kugou import * from .kuwo import * @@ -55,7 +54,6 @@ from .panda import * from .pinterest import * from .pixnet import * from .pptv import * -from .qianmo import * from .qie import * from .qq import * from .showroom import * @@ -64,7 +62,6 @@ from .sohu import * from .soundcloud import * from .suntv import * from .theplatform import * -from .thvideo import * from .tucao import * from .tudou import * from .tumblr import * diff --git a/src/you_get/extractors/dongting.py b/src/you_get/extractors/dongting.py deleted file mode 100644 index 56c1d394..00000000 --- a/src/you_get/extractors/dongting.py +++ /dev/null @@ -1,55 +0,0 @@ -# -*- coding: utf-8 -*- - -__all__ = ['dongting_download'] - -from ..common import * - 
-_unit_prefixes = 'bkmg' - -def parse_size(size): - m = re.match(r'([\d.]+)(.(?:i?B)?)', size, re.I) - if m: - return int(float(m.group(1)) * 1024 ** - _unit_prefixes.index(m.group(2).lower())) - else: - return 0 - -def dongting_download_lyric(lrc_url, file_name, output_dir): - j = get_html(lrc_url) - info = json.loads(j) - lrc = j['data']['lrc'] - filename = get_filename(file_name) - with open(output_dir + "/" + filename + '.lrc', 'w', encoding='utf-8') as x: - x.write(lrc) - -def dongting_download_song(sid, output_dir = '.', merge = True, info_only = False): - j = get_html('http://ting.hotchanson.com/detail.do?neid=%s&size=0' % sid) - info = json.loads(j) - - song_title = info['data']['songName'] - album_name = info['data']['albumName'] - artist = info['data']['singerName'] - ext = 'mp3' - size = parse_size(info['data']['itemList'][-1]['size']) - url = info['data']['itemList'][-1]['downUrl'] - - print_info(site_info, song_title, ext, size) - if not info_only: - file_name = "%s - %s - %s" % (song_title, album_name, artist) - download_urls([url], file_name, ext, size, output_dir, merge = merge) - lrc_url = ('http://lp.music.ttpod.com/lrc/down?' 
- 'lrcid=&artist=%s&title=%s') % ( - parse.quote(artist), parse.quote(song_title)) - try: - dongting_download_lyric(lrc_url, file_name, output_dir) - except: - pass - -def dongting_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False, **kwargs): - if re.match('http://www.dongting.com/\?song_id=\d+', url): - id = r1(r'http://www.dongting.com/\?song_id=(\d+)', url) - dongting_download_song(id, output_dir, merge, info_only) - -site_info = "Dongting.com" -download = dongting_download -download_playlist = playlist_not_supported("dongting") diff --git a/src/you_get/extractors/jpopsuki.py b/src/you_get/extractors/jpopsuki.py deleted file mode 100644 index eeac4f63..00000000 --- a/src/you_get/extractors/jpopsuki.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python - -__all__ = ['jpopsuki_download'] - -from ..common import * - -def jpopsuki_download(url, output_dir='.', merge=True, info_only=False, **kwargs): - html = get_html(url, faker=True) - - title = r1(r'list - From Biligrab.""" - interface_url = 'http://thvideo.tv/api/playurl.php?cid={cid}-{p}'.format(cid = cid, p = p) - data = get_content(interface_url) - rawurl = [] - dom = parseString(data) - - for node in dom.getElementsByTagName('durl'): - url = node.getElementsByTagName('url')[0] - rawurl.append(url.childNodes[0].data) - return rawurl - -#---------------------------------------------------------------------- -def th_video_get_title(url, p): - """""" - if re.match(r'http://thvideo.tv/v/\w+', url): - html = get_content(url) - title = match1(html, r'cid=(.+)
  • ').split('**') - - if int(p) > 0: #not the 1st P or multi part - title = title + ' - ' + [i.split('=')[-1:][0].split('|')[1] for i in video_list][p] - - return title - -#---------------------------------------------------------------------- -def thvideo_download(url, output_dir = '.', merge = False, info_only = False, **kwargs): - if re.match(r'http://thvideo.tv/v/\w+', url): - if 'p' in kwargs and kwargs['p']: - p = kwargs['p'] - else: - p = int(match1(url, r'http://thvideo.tv/v/th\d+#(\d+)')) - p -= 1 - - if not p or p < 0: - p = 0 - - if 'title' in kwargs and kwargs['title']: - title = kwargs['title'] - else: - title = th_video_get_title(url, p) - - cid = match1(url, r'http://thvideo.tv/v/th(\d+)') - - type_ = '' - size = 0 - urls = thvideo_cid_to_url(cid, p) - - for url in urls: - _, type_, temp = url_info(url) - size += temp - - print_info(site_info, title, type_, size) - if not info_only: - download_urls(urls, title, type_, total_size=None, output_dir=output_dir, merge=merge) - -#---------------------------------------------------------------------- -def thvideo_download_playlist(url, output_dir = '.', merge = False, info_only = False, **kwargs): - """""" - if re.match(r'http://thvideo.tv/v/\w+', url): - html = get_content(url) - video_list = match1(html, r'
  • cid=(.+)
  • ').split('**') - - title_base = th_video_get_title(url, 0) - for p, v in video_list: - part_title = [i.split('=')[-1:][0].split('|')[1] for i in video_list][p] - title = title_base + part_title - thvideo_download(url, output_dir, merge, - info_only, p = p, title = title) - -site_info = "THVideo" -download = thvideo_download -download_playlist = thvideo_download_playlist From 847e531b0d287d970bcbbdec13b8a2224151b0a8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 23:51:06 +0100 Subject: [PATCH 068/765] update .travis.yml (add python 3.6) and LICENSE (2017) --- .travis.yml | 1 + LICENSE.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9b73708d..2d780e81 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: - "3.3" - "3.4" - "3.5" + - "3.6" - "nightly" - "pypy3" script: make test diff --git a/LICENSE.txt b/LICENSE.txt index 54a06fe5..7b25d906 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,7 +1,7 @@ ============================================== This is a copy of the MIT license. 
============================================== -Copyright (C) 2012, 2013, 2014, 2015, 2016 Mort Yao +Copyright (C) 2012-2017 Mort Yao Copyright (C) 2012 Boyu Guo Permission is hereby granted, free of charge, to any person obtaining a copy of From 62a535a4180a736608e56c440951d8b0e7b23ae8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 1 Feb 2017 23:53:32 +0100 Subject: [PATCH 069/765] version 0.4.648 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 2e8e4f41..933c46ad 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.626' +__version__ = '0.4.648' From ed99b91d1893186437f52701be03048e50873b9a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 05:43:57 +0100 Subject: [PATCH 070/765] [xiami] fix #1650 --- src/you_get/extractors/xiami.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/you_get/extractors/xiami.py b/src/you_get/extractors/xiami.py index b056c08e..e321c42e 100644 --- a/src/you_get/extractors/xiami.py +++ b/src/you_get/extractors/xiami.py @@ -13,7 +13,7 @@ def location_dec(str): str = str[1:] rows = head cols = int(len(str)/rows) + 1 - + out = "" full_row = len(str) % head for c in range(cols): @@ -58,7 +58,7 @@ def xiami_download_song(sid, output_dir = '.', merge = True, info_only = False): type, ext, size = url_info(url, faker = True) if not ext: ext = 'mp3' - + print_info(site_info, song_title, ext, size) if not info_only: file_name = "%s - %s - %s" % (song_title, artist, album_name) @@ -95,7 +95,7 @@ def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = type, ext, size = url_info(url, faker = True) if not ext: ext = 'mp3' - + print_info(site_info, song_title, type, size) if not info_only: file_name = "%02d.%s - %s - %s" % (track_nr, song_title, artist, album_name) @@ -104,7 +104,7 
@@ def xiami_download_showcollect(cid, output_dir = '.', merge = True, info_only = xiami_download_lyric(lrc_url, file_name, output_dir) except: pass - + track_nr += 1 def xiami_download_album(aid, output_dir = '.', merge = True, info_only = False): @@ -140,22 +140,23 @@ def xiami_download_album(aid, output_dir = '.', merge = True, info_only = False) if not pic_exist: xiami_download_pic(pic_url, 'cover', output_dir) pic_exist = True - + track_nr += 1 def xiami_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False, **kwargs): if re.match(r'http://www.xiami.com/album/\d+', url): id = r1(r'http://www.xiami.com/album/(\d+)', url) xiami_download_album(id, output_dir, merge, info_only) - + if re.match(r'http://www.xiami.com/collect/\d+', url): id = r1(r'http://www.xiami.com/collect/(\d+)', url) xiami_download_showcollect(id, output_dir, merge, info_only) - + if re.match('http://www.xiami.com/song/\d+', url): - id = r1(r'http://www.xiami.com/song/(\d+)', url) + html = get_html(url, faker=True) + id = r1(r'rel="canonical" href="http://www.xiami.com/song/([^"]+)"', html) xiami_download_song(id, output_dir, merge, info_only) - + if re.match('http://www.xiami.com/song/detail/id/\d+', url): id = r1(r'http://www.xiami.com/song/detail/id/(\d+)', url) xiami_download_song(id, output_dir, merge, info_only) From 098b6a9dd8b4db5d3516ada1c3dc24fd8d645fba Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 05:50:00 +0100 Subject: [PATCH 071/765] [youtube] fix signature extraction --- src/you_get/extractors/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index b0097f13..ad1706be 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -52,7 +52,7 @@ class YouTube(VideoExtractor): return code js = js.replace('\n', ' ') - f1 = match1(js, r'\w+\.sig\|\|([$\w]+)\(\w+\.\w+\)') + f1 = match1(js, 
r'"signature",([\w]+)\(\w+\.\w+\)') f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \ match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) From c5dbb9766116e6362bd1c3e2a680dedb16979d6f Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 06:00:30 +0100 Subject: [PATCH 072/765] tests: remove test_freesound --- tests/test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 020455b0..ba15e447 100644 --- a/tests/test.py +++ b/tests/test.py @@ -8,9 +8,6 @@ from you_get.common import * class YouGetTests(unittest.TestCase): - def test_freesound(self): - freesound.download("http://www.freesound.org/people/Corsica_S/sounds/184419/", info_only=True) - def test_imgur(self): imgur.download("http://imgur.com/WVLk5nD", info_only=True) imgur.download("http://imgur.com/gallery/WVLk5nD", info_only=True) From 858435d5035b72832f5f2a63adbd176916a9a27a Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 2 Feb 2017 06:03:23 +0100 Subject: [PATCH 073/765] version 0.4.652 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 933c46ad..63d908c6 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.648' +__version__ = '0.4.652' From b310fdc2d583fb006ebb3a46be10488054ef1561 Mon Sep 17 00:00:00 2001 From: Justsoos Date: Sat, 4 Feb 2017 23:14:33 +0800 Subject: [PATCH 074/765] delete comments --- src/you_get/extractors/quanmin.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/you_get/extractors/quanmin.py b/src/you_get/extractors/quanmin.py index 668e84f9..bf1af659 100644 --- a/src/you_get/extractors/quanmin.py +++ b/src/you_get/extractors/quanmin.py @@ -7,9 +7,8 @@ import json import time def quanmin_download(url, output_dir = '.', merge = 
True, info_only = False, **kwargs): - roomid = url.split('/')[3].split('?')[0] #add ?parameter.split and change the unstable url(from the back) rfind to split(from the front) + roomid = url.split('/')[3].split('?')[0] - #json_request_url = 'http://www.quanmin.tv/json/rooms/{}/info4.json'.format(roomid) #"http://www.quanmin.tv/json/rooms/308137/info4.json" switch to "http://m.quanmin.tv/json/rooms/308137/noinfo6.json" json_request_url = 'http://m.quanmin.tv/json/rooms/{}/noinfo6.json'.format(roomid) content = get_html(json_request_url) data = json.loads(content) From 69714046b838499c5fce166153ccbf907a69e4a2 Mon Sep 17 00:00:00 2001 From: l34p Date: Mon, 6 Feb 2017 18:04:52 +0900 Subject: [PATCH 075/765] [youtube] improve livestream detection, fix #1673 --- src/you_get/extractors/youtube.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index ad1706be..18b46c9d 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -149,18 +149,6 @@ class YouTube(VideoExtractor): if 'use_cipher_signature' not in video_info or video_info['use_cipher_signature'] == ['False']: self.title = parse.unquote_plus(video_info['title'][0]) - # YouTube Live - if 'url_encoded_fmt_stream_map' not in video_info: - hlsvp = video_info['hlsvp'][0] - - if 'info_only' in kwargs and kwargs['info_only']: - return - else: - download_url_ffmpeg(hlsvp, self.title, 'mp4') - exit(0) - - stream_list = video_info['url_encoded_fmt_stream_map'][0].split(',') - # Parse video page (for DASH) video_page = get_content('https://www.youtube.com/watch?v=%s' % self.vid) try: @@ -169,6 +157,7 @@ class YouTube(VideoExtractor): # Workaround: get_video_info returns bad s. Why? 
stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') except: + stream_list = video_info['url_encoded_fmt_stream_map'][0].split(',') self.html5player = None else: @@ -209,6 +198,16 @@ class YouTube(VideoExtractor): else: log.wtf('[Failed] Invalid status.') + # YouTube Live + if ytplayer_config['args'].get('livestream') == '1' or ytplayer_config['args'].get('live_playback') == '1': + hlsvp = ytplayer_config['args']['hlsvp'] + + if 'info_only' in kwargs and kwargs['info_only']: + return + else: + download_url_ffmpeg(hlsvp, self.title, 'mp4') + exit(0) + for stream in stream_list: metadata = parse.parse_qs(stream) stream_itag = metadata['itag'][0] From 1997ea45ce2e0afda20f2d9f2f77d9df947d800d Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Thu, 9 Feb 2017 11:26:32 -0500 Subject: [PATCH 076/765] [common] log URLs in more functions with network requests This is a follow-up to #999. This commit adds the `<function>: <url>` debug message, which was previously only emitted by get_content and post_content, to all high level utility functions with network requests except url_size, url_save and url_save_chunked (in order not to ruin progress bars).
--- src/you_get/common.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/you_get/common.py b/src/you_get/common.py index a4aea070..2edbc426 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -255,6 +255,8 @@ def undeflate(data): # DEPRECATED in favor of get_content() def get_response(url, faker = False): + logging.debug('get_response: %s' % url) + # install cookies if cookies: opener = request.build_opener(request.HTTPCookieProcessor(cookies)) @@ -275,11 +277,15 @@ def get_response(url, faker = False): # DEPRECATED in favor of get_content() def get_html(url, encoding = None, faker = False): + logging.debug('get_html: %s' % url) + content = get_response(url, faker).data return str(content, 'utf-8', 'ignore') # DEPRECATED in favor of get_content() def get_decoded_html(url, faker = False): + logging.debug('get_decoded_html: %s' % url) + response = get_response(url, faker) data = response.data charset = r1(r'charset=([\w-]+)', response.headers['content-type']) @@ -289,6 +295,8 @@ def get_decoded_html(url, faker = False): return data def get_location(url): + logging.debug('get_location: %s' % url) + response = request.urlopen(url) # urllib will follow redirections and it's too much code to tell urllib # not to do that @@ -394,6 +402,8 @@ def urls_size(urls, faker = False, headers = {}): return sum([url_size(url, faker=faker, headers=headers) for url in urls]) def get_head(url, headers = {}, get_method = 'HEAD'): + logging.debug('get_head: %s' % url) + if headers: req = request.Request(url, headers=headers) else: @@ -403,6 +413,8 @@ def get_head(url, headers = {}, get_method = 'HEAD'): return dict(res.headers) def url_info(url, faker = False, headers = {}): + logging.debug('url_info: %s' % url) + if faker: response = urlopen_with_retry(request.Request(url, headers=fake_headers)) elif headers: @@ -456,6 +468,8 @@ def url_info(url, faker = False, headers = {}): def url_locations(urls, faker = False, headers = {}): locations = [] for 
url in urls: + logging.debug('url_locations: %s' % url) + if faker: response = urlopen_with_retry(request.Request(url, headers=fake_headers)) elif headers: From c1ed0d7e15d327327922235894623f7a551438b3 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 12 Feb 2017 00:40:16 +0100 Subject: [PATCH 077/765] [youtube] fix signature extraction (regression in #1662) --- src/you_get/extractors/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index ad1706be..6f75a129 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -52,7 +52,7 @@ class YouTube(VideoExtractor): return code js = js.replace('\n', ' ') - f1 = match1(js, r'"signature",([\w]+)\(\w+\.\w+\)') + f1 = match1(js, r'"signature",([$\w]+)\(\w+\.\w+\)') f1def = match1(js, r'function %s(\(\w+\)\{[^\{]+\})' % re.escape(f1)) or \ match1(js, r'\W%s=function(\(\w+\)\{[^\{]+\})' % re.escape(f1)) f1def = re.sub(r'([$\w]+\.)([$\w]+\(\w+,\d+\))', r'\2', f1def) From 4d0dac29681a18520dabe1fc6a6deb81fe20f49d Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Thu, 2 Feb 2017 03:59:44 -0500 Subject: [PATCH 078/765] [ffmpeg] call ffmpeg with stdin redirected to the null device Prevent FFmpeg from consuming stdin and interpreting the character stream as a stream of interactive commands, specifically: ? show this help + increase verbosity - decrease verbosity c Send command to first matching filter supporting it C Send/Queue command to all matching filters D cycle through available debug modes h dump packets/hex press to cycle through the 3 states q quit s Show QP histogram This prevents misclicking a key or key sequence (e.g., h) produces a large amount of debugging output which may confuse the unseasoned user. It is also useful in a batch environment where an unsuspecting user may not realize you-get could consume stdin through FFmpeg, e.g. 
while read url; do you-get $url; done '0') or (vers[0] == 'avconv') @@ -24,8 +33,10 @@ def get_usable_ffmpeg(cmd): FFMPEG, FFMPEG_VERSION = get_usable_ffmpeg('ffmpeg') or get_usable_ffmpeg('avconv') or (None, None) if logging.getLogger().isEnabledFor(logging.DEBUG): LOGLEVEL = ['-loglevel', 'info'] + STDIN = None else: LOGLEVEL = ['-loglevel', 'quiet'] + STDIN = DEVNULL def has_ffmpeg_installed(): return FFMPEG is not None @@ -54,14 +65,14 @@ def ffmpeg_concat_av(files, output, ext): params.extend(['-c:a', 'vorbis']) params.extend(['-strict', 'experimental']) params.append(output) - return subprocess.call(params) + return subprocess.call(params, stdin=STDIN) def ffmpeg_convert_ts_to_mkv(files, output='output.mkv'): for file in files: if os.path.isfile(file): params = [FFMPEG] + LOGLEVEL params.extend(['-y', '-i', file, output]) - subprocess.call(params) + subprocess.call(params, stdin=STDIN) return @@ -71,7 +82,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): concat_list = generate_concat_list(files, output) params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', '-i', concat_list, '-c', 'copy', output] - if subprocess.call(params) == 0: + if subprocess.call(params, stdin=STDIN) == 0: os.remove(output + '.txt') return True else: @@ -81,7 +92,7 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): if os.path.isfile(file): params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.extend([file, file + '.mpg']) - subprocess.call(params) + subprocess.call(params, stdin=STDIN) inputs = [open(file + '.mpg', 'rb') for file in files] with open(output + '.mpg', 'wb') as o: @@ -92,9 +103,8 @@ def ffmpeg_concat_mp4_to_mpg(files, output='output.mpg'): params.append(output + '.mpg') params += ['-vcodec', 'copy', '-acodec', 'copy'] params.append(output) - subprocess.call(params) - if subprocess.call(params) == 0: + if subprocess.call(params, stdin=STDIN) == 0: for file in files: os.remove(file + '.mpg') os.remove(output + '.mpg') @@ -112,7 +122,7 @@ def 
ffmpeg_concat_ts_to_mkv(files, output='output.mkv'): params += ['-f', 'matroska', '-c', 'copy', output] try: - if subprocess.call(params) == 0: + if subprocess.call(params, stdin=STDIN) == 0: return True else: return False @@ -127,7 +137,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc', output] - subprocess.check_call(params) + subprocess.check_call(params, stdin=STDIN) os.remove(output + '.txt') return True @@ -138,7 +148,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): params += ['-map', '0', '-c', 'copy', '-f', 'mpegts', '-bsf:v', 'h264_mp4toannexb'] params.append(file + '.ts') - subprocess.call(params) + subprocess.call(params, stdin=STDIN) params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append('concat:') @@ -151,7 +161,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'): else: params += ['-c', 'copy', '-absf', 'aac_adtstoasc', output] - if subprocess.call(params) == 0: + if subprocess.call(params, stdin=STDIN) == 0: for file in files: os.remove(file + '.ts') return True @@ -166,7 +176,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): params = [FFMPEG] + LOGLEVEL + ['-y', '-f', 'concat', '-safe', '-1', '-i', concat_list, '-c', 'copy', '-bsf:a', 'aac_adtstoasc', output] - subprocess.check_call(params) + subprocess.check_call(params, stdin=STDIN) os.remove(output + '.txt') return True @@ -177,7 +187,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): params += ['-c', 'copy', '-f', 'mpegts', '-bsf:v', 'h264_mp4toannexb'] params.append(file + '.ts') - subprocess.call(params) + subprocess.call(params, stdin=STDIN) params = [FFMPEG] + LOGLEVEL + ['-y', '-i'] params.append('concat:') @@ -190,7 +200,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): else: params += ['-c', 'copy', '-absf', 'aac_adtstoasc', output] - subprocess.check_call(params) + 
subprocess.check_call(params, stdin=STDIN) for file in files: os.remove(file + '.ts') return True From 8799197befd1f52278a4344fc41ba94cc45c548a Mon Sep 17 00:00:00 2001 From: YK Liu Date: Mon, 20 Feb 2017 15:09:38 +0800 Subject: [PATCH 079/765] Print audiolang in json output --- src/you_get/json_output.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/you_get/json_output.py b/src/you_get/json_output.py index 3e1bac9f..0e610a41 100644 --- a/src/you_get/json_output.py +++ b/src/you_get/json_output.py @@ -11,6 +11,11 @@ def output(video_extractor, pretty_print=True): out['title'] = ve.title out['site'] = ve.name out['streams'] = ve.streams + try: + if ve.audiolang: + out['audiolang'] = ve.audiolang + except NameError: + pass if pretty_print: print(json.dumps(out, indent=4, sort_keys=True, ensure_ascii=False)) else: From 9b9d80b32deb6bae475d3d85f376e6d69c6c0835 Mon Sep 17 00:00:00 2001 From: MaxwellGoblin Date: Sat, 25 Feb 2017 02:31:07 +0800 Subject: [PATCH 080/765] do not print size when the container is m3u8 --- src/you_get/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py index 332440dd..af7cc824 100644 --- a/src/you_get/extractor.py +++ b/src/you_get/extractor.py @@ -98,7 +98,7 @@ class VideoExtractor(): if 'quality' in stream: print(" quality: %s" % stream['quality']) - if 'size' in stream: + if 'size' in stream and stream['container'].lower() != 'm3u8': print(" size: %s MiB (%s bytes)" % (round(stream['size'] / 1048576, 1), stream['size'])) if 'itag' in stream: From 251a1bff489d2eb34bfa52b54b55dbab6069bd63 Mon Sep 17 00:00:00 2001 From: MaxwellGoblin Date: Fri, 24 Feb 2017 22:54:59 +0800 Subject: [PATCH 081/765] ckplayer.py: fix when got data without a ckplayer->info and clean the code --- src/you_get/extractors/ckplayer.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/you_get/extractors/ckplayer.py 
b/src/you_get/extractors/ckplayer.py index 09e95557..91159897 100644 --- a/src/you_get/extractors/ckplayer.py +++ b/src/you_get/extractors/ckplayer.py @@ -9,7 +9,6 @@ __all__ = ['ckplayer_download'] from xml.etree import cElementTree as ET from copy import copy from ..common import * - #---------------------------------------------------------------------- def ckplayer_get_info_by_xml(ckinfo): """str->dict @@ -20,20 +19,22 @@ def ckplayer_get_info_by_xml(ckinfo): 'links': [], 'size': 0, 'flashvars': '',} - if '_text' in dictify(e)['ckplayer']['info'][0]['title'][0]: #title - video_dict['title'] = dictify(e)['ckplayer']['info'][0]['title'][0]['_text'].strip() + dictified = dictify(e)['ckplayer'] + if 'info' in dictified: + if '_text' in dictified['info'][0]['title'][0]: #title + video_dict['title'] = dictified['info'][0]['title'][0]['_text'].strip() #if dictify(e)['ckplayer']['info'][0]['title'][0]['_text'].strip(): #duration #video_dict['title'] = dictify(e)['ckplayer']['info'][0]['title'][0]['_text'].strip() - if '_text' in dictify(e)['ckplayer']['video'][0]['size'][0]: #size exists for 1 piece - video_dict['size'] = sum([int(i['size'][0]['_text']) for i in dictify(e)['ckplayer']['video']]) + if '_text' in dictified['video'][0]['size'][0]: #size exists for 1 piece + video_dict['size'] = sum([int(i['size'][0]['_text']) for i in dictified['video']]) - if '_text' in dictify(e)['ckplayer']['video'][0]['file'][0]: #link exist - video_dict['links'] = [i['file'][0]['_text'].strip() for i in dictify(e)['ckplayer']['video']] + if '_text' in dictified['video'][0]['file'][0]: #link exist + video_dict['links'] = [i['file'][0]['_text'].strip() for i in dictified['video']] - if '_text' in dictify(e)['ckplayer']['flashvars'][0]: - video_dict['flashvars'] = dictify(e)['ckplayer']['flashvars'][0]['_text'].strip() + if '_text' in dictified['flashvars'][0]: + video_dict['flashvars'] = dictified['flashvars'][0]['_text'].strip() return video_dict From 
925415fa2b831c6fb5856de0e3739c31c101c1a9 Mon Sep 17 00:00:00 2001 From: MaxwellGoblin Date: Sat, 25 Feb 2017 00:31:30 +0800 Subject: [PATCH 082/765] add support for dilidili.mobi and dilidili.wang --- src/you_get/extractors/dilidili.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/you_get/extractors/dilidili.py b/src/you_get/extractors/dilidili.py index 082f84e1..f7b5922d 100644 --- a/src/you_get/extractors/dilidili.py +++ b/src/you_get/extractors/dilidili.py @@ -21,8 +21,9 @@ headers = { #---------------------------------------------------------------------- def dilidili_parser_data_to_stream_types(typ ,vid ,hd2 ,sign, tmsign, ulk): """->list""" + another_url = 'https://newplayer.jfrft.com/parse.php?xmlurl=null&type={typ}&vid={vid}&hd={hd2}&sign={sign}&tmsign={tmsign}&userlink={ulk}'.format(typ = typ, vid = vid, hd2 = hd2, sign = sign, tmsign = tmsign, ulk = ulk) parse_url = 'http://player.005.tv/parse.php?xmlurl=null&type={typ}&vid={vid}&hd={hd2}&sign={sign}&tmsign={tmsign}&userlink={ulk}'.format(typ = typ, vid = vid, hd2 = hd2, sign = sign, tmsign = tmsign, ulk = ulk) - html = get_content(parse_url, headers=headers) + html = get_content(another_url, headers=headers) info = re.search(r'(\{[^{]+\})(\{[^{]+\})(\{[^{]+\})(\{[^{]+\})(\{[^{]+\})', html).groups() info = [i.strip('{}').split('->') for i in info] @@ -35,13 +36,22 @@ def dilidili_parser_data_to_stream_types(typ ,vid ,hd2 ,sign, tmsign, ulk): #---------------------------------------------------------------------- def dilidili_download(url, output_dir = '.', merge = False, info_only = False, **kwargs): - if re.match(r'http://www.dilidili.com/watch\S+', url): + global headers + re_str = r'http://www.dilidili.com/watch\S+' + if re.match(r'http://www.dilidili.wang', url): + re_str = r'http://www.dilidili.wang/watch\S+' + headers['Referer'] = 'http://www.dilidili.wang/' + elif re.match(r'http://www.dilidili.mobi', url): + re_str = 
r'http://www.dilidili.mobi/watch\S+' + headers['Referer'] = 'http://www.dilidili.mobi/' + + if re.match(re_str, url): html = get_content(url) title = match1(html, r'(.+)丨(.+)') #title # player loaded via internal iframe frame_url = re.search(r'