From 677040a39ef448f8ccf3f4c8056642b6a9a9d585 Mon Sep 17 00:00:00 2001 From: NickeyKim Date: Thu, 19 May 2016 14:48:45 +0900 Subject: [PATCH 01/16] add Naver support add Naver support --- README.md | 1 + src/you_get/common.py | 1 + src/you_get/extractors/__init__.py | 1 + src/you_get/extractors/naver.py | 48 ++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+) create mode 100644 src/you_get/extractors/naver.py diff --git a/README.md b/README.md index a4f4fcd9..d0a91aa8 100644 --- a/README.md +++ b/README.md @@ -373,6 +373,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 战旗TV | |✓| | | | 央视网 | |✓| | | | 花瓣 | | |✓| | +| Naver
네이버 | |✓| | | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. diff --git a/src/you_get/common.py b/src/you_get/common.py index 6c65bd49..9668df58 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -52,6 +52,7 @@ SITES = { 'mixcloud' : 'mixcloud', 'mtv81' : 'mtv81', 'musicplayon' : 'musicplayon', + 'naver' : 'naver', '7gogo' : 'nanagogo', 'nicovideo' : 'nicovideo', 'panda' : 'panda', diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 1bb7a7ab..6fdaa340 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -45,6 +45,7 @@ from .mixcloud import * from .mtv81 import * from .musicplayon import * from .nanagogo import * +from .naver import * from .netease import * from .nicovideo import * from .panda import * diff --git a/src/you_get/extractors/naver.py b/src/you_get/extractors/naver.py new file mode 100644 index 00000000..7f7d5548 --- /dev/null +++ b/src/you_get/extractors/naver.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +__all__ = ['naver_download'] +import urllib.request, urllib.parse +from ..common import * + +def naver_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): + + assert re.search(r'http://tvcast.naver.com/v/', url), "URL is not supported" + + html = get_html(url) + contentid = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',html) + videoid = contentid.group(1) + inkey = contentid.group(2) + assert videoid + assert inkey + info_key = urllib.parse.urlencode({'vid': videoid, 'inKey': inkey, }) + down_key = urllib.parse.urlencode({'masterVid': videoid,'protocol': 'p2p','inKey': inkey, }) + inf_xml = get_html('http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?%s' % info_key ) + + from xml.dom.minidom import parseString + doc_info = parseString(inf_xml) + Subject = doc_info.getElementsByTagName('Subject')[0].firstChild + title = Subject.data + assert title + + xml = get_html('http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?%s' % down_key ) + doc = parseString(xml) + + encodingoptions = doc.getElementsByTagName('EncodingOption') + old_height = doc.getElementsByTagName('height')[0] + real_url= '' + #to download the highest resolution one, + for node in encodingoptions: + new_height = node.getElementsByTagName('height')[0] + domain_node = node.getElementsByTagName('Domain')[0] + uri_node = node.getElementsByTagName('uri')[0] + if int(new_height.firstChild.data) > int (old_height.firstChild.data): + real_url= domain_node.firstChild.data+ '/' +uri_node.firstChild.data + + type, ext, size = url_info(real_url) + print_info(site_info, title, type, size) + if not info_only: + download_urls([real_url], title, ext, size, output_dir, merge = merge) + +site_info = "tvcast.naver.com" +download = naver_download +download_playlist = playlist_not_supported('naver') From e6af1c6265be5a45f978f3321db20cfe9cb30a05 Mon Sep 17 00:00:00 2001 From: jmargeta Date: Thu, 19 May 2016 22:57:48 +0200 Subject: [PATCH 02/16] Support for embedded Vimeo videos Example URL: - http://www.miracletutorials.com/howto-embed-vimeo/ --- src/you_get/extractors/embed.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py index fd463c92..a177e663 100644 --- a/src/you_get/extractors/embed.py +++ b/src/you_get/extractors/embed.py @@ -8,6 +8,7 @@ from .netease import netease_download from .qq import qq_download_by_vid from .sina import sina_download_by_vid from .tudou import tudou_download_by_id +from .vimeo import vimeo_download_by_id from .yinyuetai import yinyuetai_download_by_id from .youku import youku_download_by_vid @@ -39,6 +40,9 @@ iqiyi_embed_patterns = [ 'player\.video\.qiyi\.com/([^/]+)/[^/]+/[^/]+/[^/]+\.sw netease_embed_patterns = [ '(http://\w+\.163\.com/movie/[^\'"]+)' ] +vimeo_embed_patters = [ 'player\.vimeo\.com/video/(\d+)' ] + + def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwargs): content = get_content(url, headers=fake_headers) found = False @@ -69,6 +73,11 @@ def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwa found = True netease_download(url, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + urls = matchall(content, vimeo_embed_patters) + for url in urls: + found = True + vimeo_download_by_id(url, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + if not found: raise NotImplementedError(url) From 2cd4656b32c1005edebf3047ba7f1f7470abfbf0 Mon Sep 17 00:00:00 2001 From: David Zhuang Date: Fri, 20 May 2016 04:28:30 -0400 Subject: [PATCH 03/16] [QiE]Add Support --- src/you_get/extractors/__init__.py | 1 + src/you_get/extractors/qie.py | 78 ++++++++++++++++++++++++++++++ src/you_get/extractors/qq.py | 4 ++ 3 files changed, 83 insertions(+) create mode 100644 src/you_get/extractors/qie.py diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 1bb7a7ab..e2cf656c 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -52,6 +52,7 @@ from .pinterest import * from .pixnet import * from .pptv import * from .qianmo import * +from .qie import * from .qq import * from .sina import * from .sohu import * diff --git a/src/you_get/extractors/qie.py b/src/you_get/extractors/qie.py new file mode 100644 index 00000000..2288106a --- /dev/null +++ b/src/you_get/extractors/qie.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from ..common import * +from ..extractor import VideoExtractor + +from json import loads + +class QiE(VideoExtractor): + name = "QiE (企鹅直播)" + + # Last updated: 2015-11-24 + stream_types = [ + {'id': 'normal', 'container': 'flv', 'video_profile': '标清'}, + {'id': 'middle', 'container': 'flv', 'video_profile': '550'}, + {'id': 'middle2', 'container': 'flv', 'video_profile': '900'}, + ] + + id_dic = {i['video_profile']:(i['id']) for i in stream_types} + + api_endpoint = 'http://www.qie.tv/api/v1/room/{room_id}' + + @staticmethod + def get_vid_from_url(url): + """Extracts video ID from live.qq.com. + """ + html = get_content(url) + return match1(html, r'room_id\":(\d+)') + + def download_playlist_by_url(self, url, **kwargs): + pass + + def prepare(self, **kwargs): + if self.url: + self.vid = self.get_vid_from_url(self.url) + + content = get_content(self.api_endpoint.format(room_id = self.vid)) + content = loads(content) + self.title = content['data']['room_name'] + rtmp_url = content['data']['rtmp_url'] + #stream_avalable = [i['name'] for i in content['data']['stream']] + stream_available = {} + stream_available['normal'] = rtmp_url + '/' + content['data']['rtmp_live'] + if len(content['data']['rtmp_multi_bitrate']) > 0: + for k , v in content['data']['rtmp_multi_bitrate'].items(): + stream_available[k] = rtmp_url + '/' + v + + for s in self.stream_types: + if s['id'] in stream_available.keys(): + quality_id = s['id'] + url = stream_available[quality_id] + self.streams[quality_id] = { + 'container': 'flv', + 'video_profile': s['video_profile'], + 'size': 0, + 'url': url + } + + def extract(self, **kwargs): + for i in self.streams: + s = self.streams[i] + s['src'] = [s['url']] + if 'stream_id' in kwargs and kwargs['stream_id']: + # Extract the stream + stream_id = kwargs['stream_id'] + + if stream_id not in self.streams: + log.e('[Error] Invalid video format.') + log.e('Run \'-i\' command with no specific video format to view all available formats.') + exit(2) + else: + # Extract stream with the best quality + stream_id = self.streams_sorted[0]['id'] + s['src'] = [s['url']] + +site = QiE() +download = site.download_by_url +download_playlist = playlist_not_supported('QiE') \ No newline at end of file diff --git a/src/you_get/extractors/qq.py b/src/you_get/extractors/qq.py index ebe1a9ad..cb4aeebf 100644 --- a/src/you_get/extractors/qq.py +++ b/src/you_get/extractors/qq.py @@ -3,6 +3,7 @@ __all__ = ['qq_download'] from ..common import * +from .qie import download as qieDownload def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False): api = "http://h5vv.video.qq.com/getinfo?otype=json&platform=10901&vid=%s" % vid @@ -34,6 +35,9 @@ def qq_download(url, output_dir='.', merge=True, info_only=False, **kwargs): vid = match1(content, r'vid\s*=\s*"\s*([^"]+)"') title = match1(content, r'title">([^"]+)

') title = title.strip() if title else vid + elif 'live.qq.com' in url: + qieDownload(url,output_dir=output_dir, merge=merge, info_only=info_only) + exit() elif 'iframe/player.html' in url: vid = match1(url, r'\bvid=(\w+)') # for embedded URLs; don't know what the title is From b4eb73965ccadd9ba78e9d65d1c05b93c9467979 Mon Sep 17 00:00:00 2001 From: cnbeining Date: Thu, 7 Apr 2016 16:43:17 -0400 Subject: [PATCH 04/16] Add -bsf:a aac_adtstoasc when merging As in #813. --- src/you_get/processor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 src/you_get/processor/ffmpeg.py diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py old mode 100644 new mode 100755 index 320eb642..e7ee35d6 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -169,7 +169,7 @@ def ffmpeg_concat_mp4_to_mp4(files, output='output.mp4'): params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i'] params.append(output + '.txt') - params += ['-c', 'copy', output] + params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output] subprocess.check_call(params) os.remove(output + '.txt') From 67d18c766ae832864ad8188e71f981db24983025 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 22 May 2016 17:28:01 +0200 Subject: [PATCH 05/16] processor/ffmpeg.py: 644 --- src/you_get/processor/ffmpeg.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 src/you_get/processor/ffmpeg.py diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py old mode 100755 new mode 100644 From d1d62ae3040c98b7c3efb81937db211480ecb859 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 23 May 2016 17:28:29 +0200 Subject: [PATCH 06/16] [twitter] fix #1139 --- src/you_get/extractors/twitter.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 251cb8d5..1794081b 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -5,6 +5,13 @@ __all__ = ['twitter_download'] from ..common import * from .vine import vine_download +def extract_m3u(source): + r1 = get_content(source) + s1 = re.findall(r'(/ext_tw_video/.*)', r1) + r2 = get_content('https://video.twimg.com%s' % s1[-1]) + s2 = re.findall(r'(/ext_tw_video/.*)', r2) + return 'https://video.twimg.com%s' % s2[-1] + def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) screen_name = r1(r'data-screen-name="([^"]*)"', html) or \ @@ -63,6 +70,7 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) source = r1(r'\s*', vmap) if not item_id: page_title = i['tweet_id'] + source = extract_m3u(source) mime, ext, size = url_info(source) print_info(site_info, page_title, mime, size) From d2d6fcb95397b969272a53dc9abf852f1181916d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 23 May 2016 22:13:12 +0200 Subject: [PATCH 07/16] [twitter] fix #1139 (really) --- src/you_get/extractors/twitter.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/twitter.py b/src/you_get/extractors/twitter.py index 1794081b..b0c89189 100644 --- a/src/you_get/extractors/twitter.py +++ b/src/you_get/extractors/twitter.py @@ -10,7 +10,7 @@ def extract_m3u(source): s1 = re.findall(r'(/ext_tw_video/.*)', r1) r2 = get_content('https://video.twimg.com%s' % s1[-1]) s2 = re.findall(r'(/ext_tw_video/.*)', r2) - return 'https://video.twimg.com%s' % s2[-1] + return ['https://video.twimg.com%s' % i for i in s2] def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) @@ -70,12 +70,13 @@ def twitter_download(url, output_dir='.', merge=True, info_only=False, **kwargs) source = r1(r'\s*', vmap) if not item_id: page_title = i['tweet_id'] - source = extract_m3u(source) - mime, ext, size = url_info(source) + urls = extract_m3u(source) + size = urls_size(urls) + mime, ext = 'video/mp4', 'mp4' print_info(site_info, page_title, mime, size) if not info_only: - download_urls([source], page_title, ext, size, output_dir, merge=merge) + download_urls(urls, page_title, ext, size, output_dir, merge=merge) site_info = "Twitter.com" download = twitter_download From 186762b4b51679d1af40139e782e2b722a52c633 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 23 May 2016 22:26:36 +0200 Subject: [PATCH 08/16] update README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index d0a91aa8..60cb125a 100644 --- a/README.md +++ b/README.md @@ -360,6 +360,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | PPTV聚力 | |✓| | | | 齐鲁网 | |✓| | | | QQ
腾讯视频 | |✓| | | +| 企鹅直播 | |✓| | | | 阡陌视频 | |✓| | | | THVideo | |✓| | | | Sina
新浪视频
微博秒拍视频 |
|✓| | | @@ -374,6 +375,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 央视网 | |✓| | | | 花瓣 | | |✓| | | Naver
네이버 | |✓| | | +| 芒果TV | |✓| | | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. From 5f99e1dfd3ca5d6c83b11e5c060a8c0bfe611d8d Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Mon, 23 May 2016 22:38:19 +0200 Subject: [PATCH 09/16] version 0.4.424 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 0e7b6632..5a9d5581 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.390' +__version__ = '0.4.424' From 854386f22fd7c38462e3d89f46c94f5c13ac197b Mon Sep 17 00:00:00 2001 From: JayXon Date: Sun, 22 May 2016 23:07:10 -0700 Subject: [PATCH 10/16] Retry request if timeout in get_content --- src/you_get/common.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 9668df58..dadba69c 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -98,6 +98,7 @@ import logging import os import platform import re +import socket import sys import time from urllib import request, parse, error @@ -308,7 +309,14 @@ def get_content(url, headers={}, decoded=True): if cookies: cookies.add_cookie_header(req) req.headers.update(req.unredirected_hdrs) - response = request.urlopen(req) + + for i in range(10): + try: + response = request.urlopen(req, timeout=10) + break + except socket.timeout: + logging.debug('request attempt %s timeout' % str(i + 1)) + data = response.read() # Handle HTTP compression for gzip and deflate (zlib) From 48640e279f451cb2d365154870ed20e2d8bcc3e4 Mon Sep 17 00:00:00 2001 From: JayXon Date: Tue, 24 May 2016 20:58:28 -0700 Subject: [PATCH 11/16] Add -t and --timeout option To set socket timeout, default is 600 seconds. --- src/you_get/common.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index dadba69c..100f3869 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -312,7 +312,7 @@ def get_content(url, headers={}, decoded=True): for i in range(10): try: - response = request.urlopen(req, timeout=10) + response = request.urlopen(req) break except socket.timeout: logging.debug('request attempt %s timeout' % str(i + 1)) @@ -1071,11 +1071,12 @@ def script_main(script_name, download, download_playlist, **kwargs): -x | --http-proxy Use an HTTP proxy for downloading. -y | --extractor-proxy Use an HTTP proxy for extracting only. --no-proxy Never use a proxy. + -t | --timeout Set socket timeout. -d | --debug Show traceback and other debug info. ''' - short_opts = 'Vhfiuc:ndF:O:o:p:x:y:' - opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang='] + short_opts = 'Vhfiuc:ndF:O:o:p:x:y:t:' + opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang=', 'timeout='] if download_playlist: short_opts = 'l' + short_opts opts = ['playlist'] + opts @@ -1105,6 +1106,7 @@ def script_main(script_name, download, download_playlist, **kwargs): proxy = None extractor_proxy = None traceback = False + timeout = 600 for o, a in opts: if o in ('-V', '--version'): version() @@ -1178,6 +1180,8 @@ def script_main(script_name, download, download_playlist, **kwargs): extractor_proxy = a elif o in ('--lang',): lang = a + elif o in ('-t', '--timeout'): + timeout = int(a) else: log.e("try 'you-get --help' for more options") sys.exit(2) @@ -1187,6 +1191,8 @@ def script_main(script_name, download, download_playlist, **kwargs): set_http_proxy(proxy) + socket.setdefaulttimeout(timeout) + try: if stream_id: if not extractor_proxy: From c89c977189372c64978b9d8aac0906047c6affde Mon Sep 17 00:00:00 2001 From: osinx zhou Date: Wed, 25 May 2016 15:35:41 +0800 Subject: [PATCH 12/16] Support http://baidu.ku6.com #1163 --- src/you_get/extractors/ku6.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/ku6.py b/src/you_get/extractors/ku6.py index d9a1ef12..7f28c75b 100644 --- a/src/you_get/extractors/ku6.py +++ b/src/you_get/extractors/ku6.py @@ -27,13 +27,30 @@ def ku6_download_by_id(id, title = None, output_dir = '.', merge = True, info_on download_urls(urls, title, ext, size, output_dir, merge = merge) def ku6_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): - patterns = [r'http://v.ku6.com/special/show_\d+/(.*)\.\.\.html', - r'http://v.ku6.com/show/(.*)\.\.\.html', - r'http://my.ku6.com/watch\?.*v=(.*)\.\..*'] - id = r1_of(patterns, url) + id = None + + if match1(url, r'http://baidu.ku6.com/watch/(.*)\.html') is not None: + id = baidu_ku6(url) + else: + patterns = [r'http://v.ku6.com/special/show_\d+/(.*)\.\.\.html', + r'http://v.ku6.com/show/(.*)\.\.\.html', + r'http://my.ku6.com/watch\?.*v=(.*)\.\..*'] + id = r1_of(patterns, url) ku6_download_by_id(id, output_dir = output_dir, merge = merge, info_only = info_only) +def baidu_ku6(url): + id = None + + h1 = get_html(url) + isrc = match1(h1, r'