diff --git a/README.md b/README.md index a4f4fcd9..60cb125a 100644 --- a/README.md +++ b/README.md @@ -360,6 +360,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | PPTV聚力 | |✓| | | | 齐鲁网 | |✓| | | | QQ
腾讯视频 | |✓| | | +| 企鹅直播 | |✓| | | | 阡陌视频 | |✓| | | | THVideo | |✓| | | | Sina
新浪视频
微博秒拍视频 |
|✓| | | @@ -373,6 +374,8 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 战旗TV | |✓| | | | 央视网 | |✓| | | | 花瓣 | | |✓| | +| Naver
네이버 | |✓| | | +| 芒果TV | |✓| | | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. diff --git a/src/you_get/common.py b/src/you_get/common.py index 6c65bd49..100f3869 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -52,6 +52,7 @@ SITES = { 'mixcloud' : 'mixcloud', 'mtv81' : 'mtv81', 'musicplayon' : 'musicplayon', + 'naver' : 'naver', '7gogo' : 'nanagogo', 'nicovideo' : 'nicovideo', 'panda' : 'panda', @@ -97,6 +98,7 @@ import logging import os import platform import re +import socket import sys import time from urllib import request, parse, error @@ -307,7 +309,14 @@ def get_content(url, headers={}, decoded=True): if cookies: cookies.add_cookie_header(req) req.headers.update(req.unredirected_hdrs) - response = request.urlopen(req) + + for i in range(10): + try: + response = request.urlopen(req) + break + except socket.timeout: + logging.debug('request attempt %s timeout' % str(i + 1)) + data = response.read() # Handle HTTP compression for gzip and deflate (zlib) @@ -1062,11 +1071,12 @@ def script_main(script_name, download, download_playlist, **kwargs): -x | --http-proxy Use an HTTP proxy for downloading. -y | --extractor-proxy Use an HTTP proxy for extracting only. --no-proxy Never use a proxy. + -t | --timeout Set socket timeout. -d | --debug Show traceback and other debug info. ''' - short_opts = 'Vhfiuc:ndF:O:o:p:x:y:' - opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang='] + short_opts = 'Vhfiuc:ndF:O:o:p:x:y:t:' + opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang=', 'timeout='] if download_playlist: short_opts = 'l' + short_opts opts = ['playlist'] + opts @@ -1096,6 +1106,7 @@ def script_main(script_name, download, download_playlist, **kwargs): proxy = None extractor_proxy = None traceback = False + timeout = 600 for o, a in opts: if o in ('-V', '--version'): version() @@ -1169,6 +1180,8 @@ def script_main(script_name, download, download_playlist, **kwargs): extractor_proxy = a elif o in ('--lang',): lang = a + elif o in ('-t', '--timeout'): + timeout = int(a) else: log.e("try 'you-get --help' for more options") sys.exit(2) @@ -1178,6 +1191,8 @@ def script_main(script_name, download, download_playlist, **kwargs): set_http_proxy(proxy) + socket.setdefaulttimeout(timeout) + try: if stream_id: if not extractor_proxy: diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 1bb7a7ab..20a7f7cf 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -45,6 +45,7 @@ from .mixcloud import * from .mtv81 import * from .musicplayon import * from .nanagogo import * +from .naver import * from .netease import * from .nicovideo import * from .panda import * @@ -52,6 +53,7 @@ from .pinterest import * from .pixnet import * from .pptv import * from .qianmo import * +from .qie import * from .qq import * from .sina import * from .sohu import * diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py index fd463c92..a177e663 100644 --- a/src/you_get/extractors/embed.py +++ b/src/you_get/extractors/embed.py @@ -8,6 +8,7 @@ from .netease import netease_download from .qq import qq_download_by_vid from .sina import sina_download_by_vid from .tudou import tudou_download_by_id +from .vimeo import vimeo_download_by_id from .yinyuetai import yinyuetai_download_by_id from .youku import youku_download_by_vid @@ -39,6 +40,9 @@ iqiyi_embed_patterns = [ 'player\.video\.qiyi\.com/([^/]+)/[^/]+/[^/]+/[^/]+\.sw netease_embed_patterns = [ '(http://\w+\.163\.com/movie/[^\'"]+)' ] +vimeo_embed_patters = [ 'player\.vimeo\.com/video/(\d+)' ] + + def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwargs): content = get_content(url, headers=fake_headers) found = False @@ -69,6 +73,11 @@ def embed_download(url, output_dir = '.', merge = True, info_only = False ,**kwa found = True netease_download(url, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + urls = matchall(content, vimeo_embed_patters) + for url in urls: + found = True + vimeo_download_by_id(url, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + if not found: raise NotImplementedError(url) diff --git a/src/you_get/extractors/ku6.py b/src/you_get/extractors/ku6.py index d9a1ef12..7f28c75b 100644 --- a/src/you_get/extractors/ku6.py +++ b/src/you_get/extractors/ku6.py @@ -27,13 +27,30 @@ def ku6_download_by_id(id, title = None, output_dir = '.', merge = True, info_on download_urls(urls, title, ext, size, output_dir, merge = merge) def ku6_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): - patterns = [r'http://v.ku6.com/special/show_\d+/(.*)\.\.\.html', - r'http://v.ku6.com/show/(.*)\.\.\.html', - r'http://my.ku6.com/watch\?.*v=(.*)\.\..*'] - id = r1_of(patterns, url) + id = None + + if match1(url, r'http://baidu.ku6.com/watch/(.*)\.html') is not None: + id = baidu_ku6(url) + else: + patterns = [r'http://v.ku6.com/special/show_\d+/(.*)\.\.\.html', + r'http://v.ku6.com/show/(.*)\.\.\.html', + r'http://my.ku6.com/watch\?.*v=(.*)\.\..*'] + id = r1_of(patterns, url) ku6_download_by_id(id, output_dir = output_dir, merge = merge, info_only = info_only) +def baidu_ku6(url): + id = None + + h1 = get_html(url) + isrc = match1(h1, r'