Merge pull request #1 from soimort/develop

Project Updating
Yohohaha 2016-11-25 19:16:16 +08:00 committed by GitHub
commit c44a7ec1b9
21 changed files with 675 additions and 166 deletions

View File

@ -37,7 +37,7 @@ Interested? [Install it](#installation) now and [get started by examples](#getti
Are you a Python programmer? Then check out [the source](https://github.com/soimort/you-get) and fork it!
![](http://i.imgur.com/GfthFAz.png)
![](https://i.imgur.com/GfthFAz.png)
## Installation
@ -339,6 +339,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| **Tumblr** | <https://www.tumblr.com/> |✓|✓|✓|
| TED | <http://www.ted.com/> |✓| | |
| SoundCloud | <https://soundcloud.com/> | | |✓|
| SHOWROOM | <https://www.showroom-live.com/> |✓| | |
| Pinterest | <https://www.pinterest.com/> | |✓| |
| MusicPlayOn | <http://en.musicplayon.com/> |✓| | |
| MTV81 | <http://www.mtv81.com/> |✓| | |
@ -372,7 +373,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| 爆米花网 | <http://www.baomihua.com/> |✓| | |
| **bilibili<br/>哔哩哔哩** | <http://www.bilibili.com/> |✓| | |
| Dilidili | <http://www.dilidili.com/> |✓| | |
| 豆瓣 | <http://www.douban.com/> | | |✓|
| 豆瓣 | <http://www.douban.com/> |✓| |✓|
| 斗鱼 | <http://www.douyutv.com/> |✓| | |
| Panda<br/>熊猫 | <http://www.panda.tv/> |✓| | |
| 凤凰视频 | <http://v.ifeng.com/> |✓| | |
@ -406,6 +407,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| 花瓣 | <http://huaban.com/> | |✓| |
| Naver<br/>네이버 | <http://tvcast.naver.com/> |✓| | |
| 芒果TV | <http://www.mgtv.com/> |✓| | |
| 火猫TV | <http://www.huomao.com/> |✓| | |
For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.

View File

@ -27,7 +27,9 @@ SITES = {
'google' : 'google',
'heavy-music' : 'heavymusic',
'huaban' : 'huaban',
'huomao' : 'huomaotv',
'iask' : 'sina',
'icourses' : 'icourses',
'ifeng' : 'ifeng',
'imgur' : 'imgur',
'in' : 'alive',
@ -340,6 +342,45 @@ def get_content(url, headers={}, decoded=True):
return data
def post_content(url, headers={}, post_data={}, decoded=True):
"""Post the content of a URL via sending a HTTP POST request.
Args:
url: A URL.
headers: Request headers used by the client.
decoded: Whether decode the response body using UTF-8 or the charset specified in Content-Type.
Returns:
The content as a string.
"""
logging.debug('post_content: %s \n post_data: %s' % (url, post_data))
req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
response = request.urlopen(req, data = post_data_enc)
data = response.read()
# Handle HTTP compression for gzip and deflate (zlib)
content_encoding = response.getheader('Content-Encoding')
if content_encoding == 'gzip':
data = ungzip(data)
elif content_encoding == 'deflate':
data = undeflate(data)
# Decode the response body
if decoded:
charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)')
if charset is not None:
data = data.decode(charset)
else:
data = data.decode('utf-8')
return data
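For illustration, a minimal call sketch of the new helper (the episode id below is made up; the bilibili bangumi change later in this diff calls it the same way):

    import json
    from you_get.common import post_content

    cont = post_content('http://bangumi.bilibili.com/web_api/get_source',
                        post_data={'episode_id': '84851'})
    cid = json.loads(cont)['result']['cid']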
def url_size(url, faker = False, headers = {}):
if faker:
response = request.urlopen(request.Request(url, headers = fake_headers), None)
@ -507,7 +548,11 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, h
os.remove(filepath) # on Windows rename could fail if destination filepath exists
os.rename(temp_filepath, filepath)
def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = False, headers = {}):
def url_save_chunked(url, filepath, bar, dyn_callback=None, chunk_size=0, ignore_range=False, refer=None, is_part=False, faker=False, headers={}):
def dyn_update_url(received):
if callable(dyn_callback):
logging.debug('Calling callback %s for new URL from %s' % (dyn_callback.__name__, received))
return dyn_callback(received)
if os.path.exists(filepath):
if not force:
if not is_part:
@ -545,19 +590,26 @@ def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker =
else:
headers = {}
if received:
url = dyn_update_url(received)
if not ignore_range:
headers['Range'] = 'bytes=' + str(received) + '-'
if refer:
headers['Referer'] = refer
response = request.urlopen(request.Request(url, headers = headers), None)
response = request.urlopen(request.Request(url, headers=headers), None)
with open(temp_filepath, open_mode) as output:
this_chunk = received
while True:
buffer = response.read(1024 * 256)
if not buffer:
break
output.write(buffer)
received += len(buffer)
if chunk_size and (received - this_chunk) >= chunk_size:
url = dyn_callback(received)
this_chunk = received
response = request.urlopen(request.Request(url, headers=headers), None)
if bar:
bar.update_received(len(buffer))
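For illustration, a sketch of the new callback contract: given the number of bytes received so far, dyn_callback returns a fresh URL to resume from (the host and refresh scheme below are made up; the icourses extractor added in this diff uses the mechanism the same way):

    from you_get.common import download_urls_chunked

    def refresh_url(received):
        # hypothetical endpoint that re-issues a short-lived media URL
        return 'http://media.example.com/clip.flv?start=%d' % received

    download_urls_chunked([refresh_url(0)], 'clip', 'flv', total_size=150000000,
                          ignore_range=True, chunk_size=15000000,
                          dyn_callback=refresh_url, faker=True)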
@ -806,7 +858,7 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg
print()
def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}):
def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}, **kwargs):
assert urls
if dry_run:
print('Real URLs:\n%s\n' % urls)
@ -820,7 +872,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No
filename = '%s.%s' % (title, ext)
filepath = os.path.join(output_dir, filename)
if total_size and ext in ('ts'):
if total_size:
if not force and os.path.exists(filepath[:-3] + '.mkv'):
print('Skipping %s: file already exists' % filepath[:-3] + '.mkv')
print()
@ -835,7 +887,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No
print('Downloading %s ...' % tr(filename))
filepath = os.path.join(output_dir, filename)
parts.append(filepath)
url_save_chunked(url, filepath, bar, refer = refer, faker = faker, headers = headers)
url_save_chunked(url, filepath, bar, refer = refer, faker = faker, headers = headers, **kwargs)
bar.done()
if not merge:

View File

@ -24,6 +24,7 @@ from .funshion import *
from .google import *
from .heavymusic import *
from .huaban import *
from .icourses import *
from .ifeng import *
from .imgur import *
from .infoq import *

View File

@ -73,14 +73,14 @@ def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
assert re.match(r'http://[^\.]+.acfun.[^\.]+/\D/\D\D(\d+)', url)
html = get_html(url)
title = r1(r'<h1 id="txt-title-view">([^<>]+)<', html)
title = r1(r'data-title="([^"]+)"', html)
title = unescape_html(title)
title = escape_file_path(title)
assert title
video = re.search('data-vid="(\d+)"\s*data-scode=""[^<]*title="([^"]+)"', html)
vid = video.group(1)
title = title + ' - ' + video.group(2)
vid = r1('data-vid="(\d+)"', html)
up = r1('data-name="([^"]+)"', html)
title = title + ' - ' + up
acfun_download_by_vid(vid, title,
output_dir=output_dir,
merge=merge,

View File

@ -7,8 +7,10 @@ from ..common import *
from .embed import *
from .universal import *
def baidu_get_song_data(sid):
data = json.loads(get_html('http://music.baidu.com/data/music/fmlink?songIds=%s' % sid, faker = True))['data']
data = json.loads(get_html(
'http://music.baidu.com/data/music/fmlink?songIds=%s' % sid, faker=True))['data']
if data['xcode'] != '':
# inside china mainland
@ -17,22 +19,28 @@ def baidu_get_song_data(sid):
# outside china mainland
return None
def baidu_get_song_url(data):
return data['songLink']
def baidu_get_song_artist(data):
return data['artistName']
def baidu_get_song_album(data):
return data['albumName']
def baidu_get_song_title(data):
return data['songName']
def baidu_get_song_lyric(data):
lrc = data['lrcLink']
return None if lrc == '' else "http://music.baidu.com%s" % lrc
def baidu_download_song(sid, output_dir='.', merge=True, info_only=False):
data = baidu_get_song_data(sid)
if data is not None:
@ -51,7 +59,8 @@ def baidu_download_song(sid, output_dir='.', merge=True, info_only=False):
type, ext, size = url_info(url, faker=True)
print_info(site_info, title, type, size)
if not info_only:
download_urls([url], file_name, ext, size, output_dir, merge=merge, faker=True)
download_urls([url], file_name, ext, size,
output_dir, merge=merge, faker=True)
try:
type, ext, size = url_info(lrc, faker=True)
@ -61,12 +70,14 @@ def baidu_download_song(sid, output_dir='.', merge=True, info_only=False):
except:
pass
def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False):
html = get_html('http://music.baidu.com/album/%s' % aid, faker = True)
def baidu_download_album(aid, output_dir='.', merge=True, info_only=False):
html = get_html('http://music.baidu.com/album/%s' % aid, faker=True)
album_name = r1(r'<h2 class="album-name">(.+?)<\/h2>', html)
artist = r1(r'<span class="author_list" title="(.+?)">', html)
output_dir = '%s/%s - %s' % (output_dir, artist, album_name)
ids = json.loads(r1(r'<span class="album-add" data-adddata=\'(.+?)\'>', html).replace('&quot', '').replace(';', '"'))['ids']
ids = json.loads(r1(r'<span class="album-add" data-adddata=\'(.+?)\'>',
html).replace('&quot', '').replace(';', '"'))['ids']
track_nr = 1
for id in ids:
song_data = baidu_get_song_data(id)
@ -75,38 +86,29 @@ def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False)
song_lrc = baidu_get_song_lyric(song_data)
file_name = '%02d.%s' % (track_nr, song_title)
type, ext, size = url_info(song_url, faker = True)
type, ext, size = url_info(song_url, faker=True)
print_info(site_info, song_title, type, size)
if not info_only:
download_urls([song_url], file_name, ext, size, output_dir, merge = merge, faker = True)
download_urls([song_url], file_name, ext, size,
output_dir, merge=merge, faker=True)
if song_lrc:
type, ext, size = url_info(song_lrc, faker = True)
type, ext, size = url_info(song_lrc, faker=True)
print_info(site_info, song_title, type, size)
if not info_only:
download_urls([song_lrc], file_name, ext, size, output_dir, faker = True)
download_urls([song_lrc], file_name, ext,
size, output_dir, faker=True)
track_nr += 1
def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False, **kwargs):
if re.match(r'http://imgsrc.baidu.com', url):
universal_download(url, output_dir, merge=merge, info_only=info_only)
return
elif re.match(r'http://pan.baidu.com', url):
html = get_html(url)
def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=False, **kwargs):
title = r1(r'server_filename="([^"]+)"', html)
if len(title.split('.')) > 1:
title = ".".join(title.split('.')[:-1])
real_url = r1(r'\\"dlink\\":\\"([^"]*)\\"', html).replace('\\\\/', '/')
type, ext, size = url_info(real_url, faker = True)
print_info(site_info, title, ext, size)
if re.match(r'http://pan.baidu.com', url):
real_url, title, ext, size = baidu_pan_download(url)
if not info_only:
download_urls([real_url], title, ext, size, output_dir, merge = merge)
download_urls([real_url], title, ext, size,
output_dir, url, merge=merge, faker=True)
elif re.match(r'http://music.baidu.com/album/\d+', url):
id = r1(r'http://music.baidu.com/album/(\d+)', url)
baidu_download_album(id, output_dir, merge, info_only)
@ -124,17 +126,20 @@ def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info
html = get_html(url)
title = r1(r'title:"([^"]+)"', html)
items = re.findall(r'//imgsrc.baidu.com/forum/w[^"]+/([^/"]+)', html)
items = re.findall(
r'//imgsrc.baidu.com/forum/w[^"]+/([^/"]+)', html)
urls = ['http://imgsrc.baidu.com/forum/pic/item/' + i
for i in set(items)]
# handle albums
kw = r1(r'kw=([^&]+)', html) or r1(r"kw:'([^']+)'", html)
tid = r1(r'tid=(\d+)', html) or r1(r"tid:'([^']+)'", html)
album_url = 'http://tieba.baidu.com/photo/g/bw/picture/list?kw=%s&tid=%s' % (kw, tid)
album_url = 'http://tieba.baidu.com/photo/g/bw/picture/list?kw=%s&tid=%s' % (
kw, tid)
album_info = json.loads(get_content(album_url))
for i in album_info['data']['pic_list']:
urls.append('http://imgsrc.baidu.com/forum/pic/item/' + i['pic_id'] + '.jpg')
urls.append(
'http://imgsrc.baidu.com/forum/pic/item/' + i['pic_id'] + '.jpg')
ext = 'jpg'
size = float('Inf')
@ -144,6 +149,170 @@ def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info
download_urls(urls, title, ext, size,
output_dir=output_dir, merge=False)
def baidu_pan_download(url):
errno_patt = r'errno":([^"]+),'
refer_url = ""
fake_headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'UTF-8,*;q=0.5',
'Accept-Encoding': 'gzip,deflate,sdch',
'Accept-Language': 'en-US,en;q=0.8',
'Host': 'pan.baidu.com',
'Origin': 'http://pan.baidu.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2500.0 Safari/537.36',
'Referer': refer_url
}
if cookies:
print('Use user specified cookies')
else:
print('Generating cookies...')
fake_headers['Cookie'] = baidu_pan_gen_cookies(url)
refer_url = "http://pan.baidu.com"
html = get_content(url, fake_headers, decoded=True)
isprotected = False
sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse(
html)
if sign == None:
if re.findall(r'\baccess-code\b', html):
isprotected = True
sign, timestamp, bdstoken, appid, primary_id, fs_id, uk, fake_headers, psk = baidu_pan_protected_share(
url)
# raise NotImplementedError("Password required!")
if isprotected != True:
raise AssertionError("Share not found or canceled: %s" % url)
if bdstoken == None:
bdstoken = ""
if isprotected != True:
sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse(
html)
request_url = "http://pan.baidu.com/api/sharedownload?sign=%s&timestamp=%s&bdstoken=%s&channel=chunlei&clienttype=0&web=1&app_id=%s" % (
sign, timestamp, bdstoken, appid)
refer_url = url
post_data = {
'encrypt': 0,
'product': 'share',
'uk': uk,
'primaryid': primary_id,
'fid_list': '[' + fs_id + ']'
}
if isprotected == True:
post_data['sekey'] = psk
response_content = post_content(request_url, fake_headers, post_data, True)
errno = match1(response_content, errno_patt)
if errno != "0":
raise AssertionError(
"Server refused to provide download link! (Errno:%s)" % errno)
real_url = r1(r'dlink":"([^"]+)"', response_content).replace('\\/', '/')
title = r1(r'server_filename":"([^"]+)"', response_content)
assert real_url
type, ext, size = url_info(real_url, faker=True)
title_wrapped = json.loads('{"wrapper":"%s"}' % title)
title = title_wrapped['wrapper']
logging.debug(real_url)
print_info(site_info, title, ext, size)
print('Hold on...')
time.sleep(5)
return real_url, title, ext, size
def baidu_pan_parse(html):
sign_patt = r'sign":"([^"]+)"'
timestamp_patt = r'timestamp":([^"]+),'
appid_patt = r'app_id":"([^"]+)"'
bdstoken_patt = r'bdstoken":"([^"]+)"'
fs_id_patt = r'fs_id":([^"]+),'
uk_patt = r'uk":([^"]+),'
errno_patt = r'errno":([^"]+),'
primary_id_patt = r'shareid":([^"]+),'
sign = match1(html, sign_patt)
timestamp = match1(html, timestamp_patt)
appid = match1(html, appid_patt)
bdstoken = match1(html, bdstoken_patt)
fs_id = match1(html, fs_id_patt)
uk = match1(html, uk_patt)
primary_id = match1(html, primary_id_patt)
return sign, timestamp, bdstoken, appid, primary_id, fs_id, uk
def baidu_pan_gen_cookies(url, post_data=None):
from http import cookiejar
cookiejar = cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cookiejar))
resp = opener.open('http://pan.baidu.com')
if post_data != None:
resp = opener.open(url, bytes(parse.urlencode(post_data), 'utf-8'))
return cookjar2hdr(cookiejar)
def baidu_pan_protected_share(url):
print('This share is protected by password!')
inpwd = input('Please provide unlock password: ')
inpwd = inpwd.replace(' ', '').replace('\t', '')
print('Please wait...')
post_pwd = {
'pwd': inpwd,
'vcode': None,
'vstr': None
}
from http import cookiejar
import time
cookiejar = cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cookiejar))
resp = opener.open('http://pan.baidu.com')
resp = opener.open(url)
init_url = resp.geturl()
verify_url = 'http://pan.baidu.com/share/verify?%s&t=%s&channel=chunlei&clienttype=0&web=1' % (
init_url.split('?', 1)[1], int(time.time()))
refer_url = init_url
fake_headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'UTF-8,*;q=0.5',
'Accept-Encoding': 'gzip,deflate,sdch',
'Accept-Language': 'en-US,en;q=0.8',
'Host': 'pan.baidu.com',
'Origin': 'http://pan.baidu.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2500.0 Safari/537.36',
'Referer': refer_url
}
opener.addheaders = dict2triplet(fake_headers)
pwd_resp = opener.open(verify_url, bytes(
parse.urlencode(post_pwd), 'utf-8'))
pwd_resp_str = ungzip(pwd_resp.read()).decode('utf-8')
pwd_res = json.loads(pwd_resp_str)
if pwd_res['errno'] != 0:
raise AssertionError(
'Server returned an error: %s (Incorrect password?)' % pwd_res['errno'])
pg_resp = opener.open('http://pan.baidu.com/share/link?%s' %
init_url.split('?', 1)[1])
content = ungzip(pg_resp.read()).decode('utf-8')
sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse(
content)
psk = query_cookiejar(cookiejar, 'BDCLND')
psk = parse.unquote(psk)
fake_headers['Cookie'] = cookjar2hdr(cookiejar)
return sign, timestamp, bdstoken, appid, primary_id, fs_id, uk, fake_headers, psk
def cookjar2hdr(cookiejar):
cookie_str = ''
for i in cookiejar:
cookie_str = cookie_str + i.name + '=' + i.value + ';'
return cookie_str[:-1]
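# e.g. cookies BAIDUID=abc and BDCLND=xyz come back as 'BAIDUID=abc;BDCLND=xyz'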
def query_cookiejar(cookiejar, name):
for i in cookiejar:
if i.name == name:
return i.value
def dict2triplet(dictin):
out_triplet = []
for i in dictin:
out_triplet.append((i, dictin[i]))
return out_triplet
site_info = "Baidu.com"
download = baidu_download
download_playlist = playlist_not_supported("baidu")

View File

@ -119,17 +119,21 @@ def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_o
def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
html = get_content(url)
if re.match(r'https?://bangumi\.bilibili\.com/', url):
# quick hack for bangumi URLs
url = r1(r'"([^"]+)" class="v-av-link"', html)
html = get_content(url)
title = r1_of([r'<meta name="title" content="\s*([^<>]{1,999})\s*" />',
r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
if title:
title = unescape_html(title)
title = escape_file_path(title)
if re.match(r'https?://bangumi\.bilibili\.com/', url):
# quick hack for bangumi URLs
episode_id = r1(r'data-current-episode-id="(\d+)"', html)
cont = post_content('http://bangumi.bilibili.com/web_api/get_source',
post_data={'episode_id': episode_id})
cid = json.loads(cont)['result']['cid']
bilibili_download_by_cid(str(cid), title, output_dir=output_dir, merge=merge, info_only=info_only)
else:
flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"',
r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
assert flashvars

View File

@ -4,6 +4,11 @@ __all__ = ['dailymotion_download']
from ..common import *
def extract_m3u(url):
content = get_content(url)
m3u_url = re.findall(r'http://.*', content)[0]
return match1(m3u_url, r'([^#]+)')
def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
"""Downloads Dailymotion videos by URL.
"""
@ -13,7 +18,7 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False,
title = match1(html, r'"video_title"\s*:\s*"([^"]+)"') or \
match1(html, r'"title"\s*:\s*"([^"]+)"')
for quality in ['720','480','380','240','auto']:
for quality in ['1080','720','480','380','240','auto']:
try:
real_url = info[quality][0]["url"]
if real_url:
@ -21,11 +26,12 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False,
except KeyError:
pass
type, ext, size = url_info(real_url)
m3u_url = extract_m3u(real_url)
mime, ext, size = 'video/mp4', 'mp4', 0
print_info(site_info, title, type, size)
print_info(site_info, title, mime, size)
if not info_only:
download_urls([real_url], title, ext, size, output_dir, merge = merge)
download_url_ffmpeg(m3u_url, title, ext, output_dir=output_dir, merge=merge)
site_info = "Dailymotion.com"
download = dailymotion_download

View File

@ -7,7 +7,18 @@ from ..common import *
def douban_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
html = get_html(url)
if 'subject' in url:
if re.match(r'https?://movie', url):
title = match1(html, 'name="description" content="([^"]+)')
tid = match1(url, 'trailer/(\d+)')
real_url = 'https://movie.douban.com/trailer/video_url?tid=%s' % tid
type, ext, size = url_info(real_url)
print_info(site_info, title, type, size)
if not info_only:
download_urls([real_url], title, ext, size, output_dir, merge = merge)
elif 'subject' in url:
titles = re.findall(r'data-title="([^"]*)">', html)
song_id = re.findall(r'<li class="song-item" id="([^"]*)"', html)
song_ssid = re.findall(r'data-ssid="([^"]*)"', html)

View File

@ -25,7 +25,7 @@ youku_embed_patterns = [ 'youku\.com/v_show/id_([a-zA-Z0-9=]+)',
"""
http://www.tudou.com/programs/view/html5embed.action?type=0&amp;code=3LS_URGvl54&amp;lcode=&amp;resourceId=0_06_05_99
"""
tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_]+)\&',
tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_-]+)\&',
'www\.tudou\.com/v/([a-zA-Z0-9_-]+)/[^"]*v\.swf'
]

View File

@ -0,0 +1,36 @@
#!/usr/bin/env python
__all__ = ['huomaotv_download']
from ..common import *
def get_mobile_room_url(room_id):
return 'http://www.huomao.com/mobile/mob_live/%s' % room_id
def get_m3u8_url(stream_id):
return 'http://live-ws.huomaotv.cn/live/%s/playlist.m3u8' % stream_id
def huomaotv_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
room_id_pattern = r'huomao.com/(\d+)'
room_id = match1(url, room_id_pattern)
html = get_content(get_mobile_room_url(room_id))
stream_id_pattern = r'id="html_stream" value="(\w+)"'
stream_id = match1(html, stream_id_pattern)
m3u8_url = get_m3u8_url(stream_id)
title = match1(html, r'<title>([^<]{1,9999})</title>')
print_info(site_info, title, 'm3u8', float('inf'))
if not info_only:
download_url_ffmpeg(m3u8_url, title, 'm3u8', None, output_dir=output_dir, merge=merge)
site_info = 'huomao.com'
download = huomaotv_download
download_playlist = playlist_not_supported('huomao')
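For illustration, with a made-up room id the URL helpers compose as follows:

    from you_get.extractors.huomaotv import get_mobile_room_url, get_m3u8_url

    get_mobile_room_url('123456')   # -> 'http://www.huomao.com/mobile/mob_live/123456'
    get_m3u8_url('stream_abc')      # -> 'http://live-ws.huomaotv.cn/live/stream_abc/playlist.m3u8'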

View File

@ -0,0 +1,148 @@
#!/usr/bin/env python
from ..common import *
from urllib import parse
import random
from time import sleep
import xml.etree.ElementTree as ET
import datetime
import hashlib
import base64
import logging
from urllib import error
import re
__all__ = ['icourses_download']
def icourses_download(url, merge=False, output_dir='.', **kwargs):
icourses_parser = ICousesExactor(url=url)
real_url = icourses_parser.icourses_cn_url_parser(**kwargs)
title = icourses_parser.title
if real_url is not None:
for tries in range(0, 5):
try:
_, type_, size = url_info(real_url, faker=True)
break
except error.HTTPError:
logging.warning('Failed to fetch the video file! Retrying...')
sleep(random.Random().randint(0, 5))  # avoid getting blocked
real_url = icourses_parser.icourses_cn_url_parser()
title = icourses_parser.title
print_info(site_info, title, type_, size)
if not kwargs['info_only']:
download_urls_chunked([real_url], title, 'flv',
total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True, ignore_range=True, chunk_size=15000000, dyn_callback=icourses_parser.icourses_cn_url_parser)
# Why not use VideoExtractor: this site needs a special download method
class ICousesExactor(object):
def __init__(self, url):
self.url = url
self.title = ''
return
def icourses_playlist_download(self, **kwargs):
html = get_content(self.url)
page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)'
video_js_number = r'changeforvideo\((.*?)\)'
fs_flag = r'<input type="hidden" value=(\w+) id="firstShowFlag">'
page_navi_vars = re.search(pattern=page_type_patt, string=html)
dummy_page = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'.format(
page_navi_vars.group(2), page_navi_vars.group(1))
html = get_content(dummy_page)
fs_status = match1(html, fs_flag)
video_list = re.findall(pattern=video_js_number, string=html)
for video in video_list:
video_args = video.replace('\'', '').split(',')
video_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}&firstShowFlag={}'.format(
video_args[0], video_args[1], fs_status or '1')
sleep(random.Random().randint(0, 5))  # avoid getting blocked
icourses_download(video_url, **kwargs)
def icourses_cn_url_parser(self, received=0, **kwargs):
PLAYER_BASE_VER = '150606-1'
ENCRYPT_MOD_VER = '151020'
ENCRYPT_SALT = '3DAPmXsZ4o'  # It took a really long time to find this...
html = get_content(self.url)
if re.search(pattern=r'showSectionNode\(.*\)', string=html):
logging.warning('Switching to playlist mode!')
return self.icourses_playlist_download(**kwargs)
flashvars_patt = r'var\ flashvars\=((.|\n)*)};'
server_time_patt = r'MPlayer.swf\?v\=(\d+)'
uuid_patt = r'uuid:(\d+)'
other_args_patt = r'other:"(.*)"'
res_url_patt = r'IService:\'([^\']+)'
title_a_patt = r'<div class="con"> <a.*?>(.*?)</a>'
title_b_patt = r'<div class="con"> <a.*?/a>((.|\n)*?)</div>'
title_a = match1(html, title_a_patt).strip()
title_b = match1(html, title_b_patt).strip()
title = title_a + title_b # WIP, FIXME
title = re.sub('( +|\n|\t|\r|\&nbsp\;)', '',
unescape_html(title).replace(' ', ''))
server_time = match1(html, server_time_patt)
flashvars = match1(html, flashvars_patt)
uuid = match1(flashvars, uuid_patt)
other_args = match1(flashvars, other_args_patt)
res_url = match1(flashvars, res_url_patt)
url_parts = {'v': server_time, 'other': other_args,
'uuid': uuid, 'IService': res_url}
req_url = '%s?%s' % (res_url, parse.urlencode(url_parts))
logging.debug('Requesting video resource location...')
xml_resp = get_html(req_url)
xml_obj = ET.fromstring(xml_resp)
logging.debug('The result was {}'.format(xml_obj.get('status')))
if xml_obj.get('status') != 'success':
raise ValueError('Server returned error!')
if received:
play_type = 'seek'
else:
play_type = 'play'
received -= 1
common_args = {'lv': PLAYER_BASE_VER, 'ls': play_type,
'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'),
'start': received + 1}
media_host = xml_obj.find(".//*[@name='host']").text
media_url = media_host + xml_obj.find(".//*[@name='url']").text
# This is what they call the `SSLModule`... but it is really just a kind of
# encryption that does nothing to protect data integrity
if xml_obj.find(".//*[@name='ssl']").text != 'true':
logging.debug('The encryption mode is disabled')
# when the so-called `SSLMode` is not activated, the parameters, `h`
# and `p` can be found in response
arg_h = xml_obj.find(".//*[@name='h']").text
assert arg_h
arg_r = xml_obj.find(".//*[@name='p']").text or ENCRYPT_MOD_VER
url_args = common_args.copy()
url_args.update({'h': arg_h, 'r': arg_r})
final_url = '{}?{}'.format(
media_url, parse.urlencode(url_args))
self.title = title
return final_url
# when the `SSLMode` is activated, we need to receive the timestamp and the
# time offset (?) value from the server
logging.debug('The encryption mode is in effect')
ssl_callback = get_html(
'{}/ssl/ssl.shtml'.format(media_host)).split(',')
ssl_timestamp = int(datetime.datetime.strptime(
ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0]))
sign_this = ENCRYPT_SALT + \
parse.urlparse(media_url).path + str(ssl_timestamp)
arg_h = base64.b64encode(hashlib.md5(
bytes(sign_this, 'utf-8')).digest())
# Post-processing; may be subject to change, so leaving this alone...
arg_h = arg_h.decode('utf-8').strip('=').replace('+',
'-').replace('/', '_')
arg_r = ssl_timestamp
url_args = common_args.copy()
url_args.update({'h': arg_h, 'r': arg_r, 'p': ENCRYPT_MOD_VER})
final_url = '{}?{}'.format(
media_url, parse.urlencode(url_args))
logging.debug('Crafted URL: {}'.format(final_url))
self.title = title
return final_url
site_info = 'icourses.cn'
download = icourses_download
# download_playlist = icourses_playlist_download
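For reference, a standalone sketch of the hash computed in the `SSLMode` branch above (the media URL and timestamp are made-up values):

    import base64
    import hashlib
    from urllib import parse

    ENCRYPT_SALT = '3DAPmXsZ4o'
    media_url = 'http://media.example.com/some/path/clip.flv'   # hypothetical
    ssl_timestamp = 1480000000                                   # server time + offset

    sign_this = ENCRYPT_SALT + parse.urlparse(media_url).path + str(ssl_timestamp)
    arg_h = base64.b64encode(hashlib.md5(sign_this.encode('utf-8')).digest())
    # make it URL-safe: drop '=' padding, swap '+' and '/' for '-' and '_'
    arg_h = arg_h.decode('utf-8').strip('=').replace('+', '-').replace('/', '_')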

View File

@ -12,11 +12,11 @@ import re
class MGTV(VideoExtractor):
name = "芒果 (MGTV)"
# Last updated: 2015-11-24
# Last updated: 2016-11-13
stream_types = [
{'id': 'hd', 'container': 'flv', 'video_profile': '超清'},
{'id': 'sd', 'container': 'flv', 'video_profile': '高清'},
{'id': 'ld', 'container': 'flv', 'video_profile': '标清'},
{'id': 'hd', 'container': 'ts', 'video_profile': '超清'},
{'id': 'sd', 'container': 'ts', 'video_profile': '高清'},
{'id': 'ld', 'container': 'ts', 'video_profile': '标清'},
]
id_dic = {i['video_profile']:(i['id']) for i in stream_types}
@ -27,7 +27,7 @@ class MGTV(VideoExtractor):
def get_vid_from_url(url):
"""Extracts video ID from URL.
"""
return match1(url, 'http://www.mgtv.com/v/\d/\d+/\w+/(\d+).html')
return match1(url, 'http://www.mgtv.com/b/\d+/(\d+).html')
#----------------------------------------------------------------------
@staticmethod
@ -44,10 +44,15 @@ class MGTV(VideoExtractor):
content = get_content(content['info'])  # get the REAL M3U8 URL; this may need to change later
segment_list = []
segments_size = 0
for i in content.split():
if not i.startswith('#'):  # not the best way; better to use the m3u8 package
segment_list.append(base_url + i)
return segment_list
# use the EXT info for a fast size calculation
elif i.startswith('#EXT-MGTV-File-SIZE:'):
segments_size += int(i[i.rfind(':')+1:])
return m3u_url, segments_size, segment_list
def download_playlist_by_url(self, url, **kwargs):
pass
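For illustration, how the size-from-playlist trick works on a made-up snippet (#EXT-MGTV-File-SIZE is an MGTV-specific tag, not standard HLS):

    sample = """#EXTM3U
    #EXT-MGTV-File-SIZE:1048576
    0.ts
    #EXT-MGTV-File-SIZE:2097152
    1.ts"""

    segments_size = 0
    for i in sample.split():
        if i.startswith('#EXT-MGTV-File-SIZE:'):
            segments_size += int(i[i.rfind(':') + 1:])
    print(segments_size)  # 3145728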
@ -69,14 +74,10 @@ class MGTV(VideoExtractor):
quality_id = self.id_dic[s['video_profile']]
url = stream_available[s['video_profile']]
url = re.sub( r'(\&arange\=\d+)', '', url) #Un-Hum
segment_list_this = self.get_mgtv_real_url(url)
m3u8_url, m3u8_size, segment_list_this = self.get_mgtv_real_url(url)
container_this_stream = ''
size_this_stream = 0
stream_fileid_list = []
for i in segment_list_this:
_, container_this_stream, size_this_seg = url_info(i)
size_this_stream += size_this_seg
stream_fileid_list.append(os.path.basename(i).split('.')[0])
#make pieces
@ -85,10 +86,11 @@ class MGTV(VideoExtractor):
pieces.append({'fileid': i[0], 'segs': i[1],})
self.streams[quality_id] = {
'container': 'flv',
'container': s['container'],
'video_profile': s['video_profile'],
'size': size_this_stream,
'pieces': pieces
'size': m3u8_size,
'pieces': pieces,
'm3u8_url': m3u8_url
}
if not kwargs['info_only']:
@ -107,6 +109,44 @@ class MGTV(VideoExtractor):
# Extract stream with the best quality
stream_id = self.streams_sorted[0]['id']
def download(self, **kwargs):
if 'stream_id' in kwargs and kwargs['stream_id']:
stream_id = kwargs['stream_id']
else:
stream_id = 'null'
# print video info only
if 'info_only' in kwargs and kwargs['info_only']:
if stream_id != 'null':
if 'index' not in kwargs:
self.p(stream_id)
else:
self.p_i(stream_id)
else:
# Display all available streams
if 'index' not in kwargs:
self.p([])
else:
stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag']
self.p_i(stream_id)
# default to use the best quality
if stream_id == 'null':
stream_id = self.streams_sorted[0]['id']
stream_info = self.streams[stream_id]
if not kwargs['info_only']:
if player:
# hand over the m3u8 URL directly, since some video players (e.g. mpv) can handle it themselves
launch_player(player, [stream_info['m3u8_url']])
else:
download_urls(stream_info['src'], self.title, stream_info['container'], stream_info['size'],
output_dir=kwargs['output_dir'],
merge=kwargs['merge'],
av=stream_id in self.dash_streams)
site = MGTV()
download = site.download_by_url
download_playlist = site.download_playlist_by_url

View File

@ -55,12 +55,14 @@ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=Fals
cover_url = j['result']['coverImgUrl']
download_urls([cover_url], "cover", "jpg", 0, new_dir)
for i in j['result']['tracks']:
netease_song_download(i, output_dir=new_dir, info_only=info_only)
prefix_width = len(str(len(j['result']['tracks'])))
for n, i in enumerate(j['result']['tracks']):
playlist_prefix = '%%.%dd_' % prefix_width % n
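# e.g. a 120-track playlist gives prefix_width == 3, so n == 7 yields '007_'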
netease_song_download(i, output_dir=new_dir, info_only=info_only, playlist_prefix=playlist_prefix)
try: # download lyrics
assert kwargs['caption']
l = loads(get_content("http://music.163.com/api/song/lyric/?id=%s&lv=-1&csrf_token=" % i['id'], headers={"Referer": "http://music.163.com/"}))
netease_lyric_download(i, l["lrc"]["lyric"], output_dir=new_dir, info_only=info_only)
netease_lyric_download(i, l["lrc"]["lyric"], output_dir=new_dir, info_only=info_only, playlist_prefix=playlist_prefix)
except: pass
elif "song" in url:
@ -85,10 +87,10 @@ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=Fals
j = loads(get_content("http://music.163.com/api/mv/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"}))
netease_video_download(j['data'], output_dir=output_dir, info_only=info_only)
def netease_lyric_download(song, lyric, output_dir='.', info_only=False):
def netease_lyric_download(song, lyric, output_dir='.', info_only=False, playlist_prefix=""):
if info_only: return
title = "%s. %s" % (song['position'], song['name'])
title = "%s%s. %s" % (playlist_prefix, song['position'], song['name'])
filename = '%s.lrc' % get_filename(title)
print('Saving %s ...' % filename, end="", flush=True)
with open(os.path.join(output_dir, filename),
@ -103,8 +105,8 @@ def netease_video_download(vinfo, output_dir='.', info_only=False):
netease_download_common(title, url_best,
output_dir=output_dir, info_only=info_only)
def netease_song_download(song, output_dir='.', info_only=False):
title = "%s. %s" % (song['position'], song['name'])
def netease_song_download(song, output_dir='.', info_only=False, playlist_prefix=""):
title = "%s%s. %s" % (playlist_prefix, song['position'], song['name'])
songNet = 'p' + song['mp3Url'].split('/')[2][1:]
if 'hMusic' in song and song['hMusic'] != None:

View File

@ -7,17 +7,62 @@ from .qie import download as qieDownload
from urllib.parse import urlparse,parse_qs
def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False):
api = "http://h5vv.video.qq.com/getinfo?otype=json&platform=10901&vid=%s" % vid
content = get_html(api)
output_json = json.loads(match1(content, r'QZOutputJson=(.*)')[:-1])
url = output_json['vl']['vi'][0]['ul']['ui'][0]['url']
info_api = 'http://vv.video.qq.com/getinfo?otype=json&appver=3%2E2%2E19%2E333&platform=11&defnpayver=1&vid=' + vid
info = get_html(info_api)
video_json = json.loads(match1(info, r'QZOutputJson=(.*)')[:-1])
parts_vid = video_json['vl']['vi'][0]['vid']
parts_ti = video_json['vl']['vi'][0]['ti']
parts_prefix = video_json['vl']['vi'][0]['ul']['ui'][0]['url']
parts_formats = video_json['fl']['fi']
# Find the best quality:
# only fhd (1080p) and shd (720p) are considered here;
# 480p usually comes as a single file and is downloaded as the fallback.
best_quality = ''
for part_format in parts_formats:
if part_format['name'] == 'fhd':
best_quality = 'fhd'
break
if part_format['name'] == 'shd':
best_quality = 'shd'
for part_format in parts_formats:
if (not best_quality == '') and (not part_format['name'] == best_quality):
continue
part_format_id = part_format['id']
part_format_sl = part_format['sl']
if part_format_sl == 0:
part_urls= []
total_size = 0
try:
# For fhd (1080p), every part is about 100 MB and 6 minutes long;
# trying up to 100 parts caps the longest single video at about 10 hours.
for part in range(1,100):
filename = vid + '.p' + str(part_format_id % 1000) + '.' + str(part) + '.mp4'
key_api = "http://vv.video.qq.com/getkey?otype=json&platform=11&format=%s&vid=%s&filename=%s" % (part_format_id, parts_vid, filename)
#print(filename)
#print(key_api)
part_info = get_html(key_api)
key_json = json.loads(match1(part_info, r'QZOutputJson=(.*)')[:-1])
#print(key_json)
vkey = key_json['key']
url = '%s/%s?vkey=%s' % (parts_prefix, filename, vkey)
part_urls.append(url)
_, ext, size = url_info(url, faker=True)
total_size += size
except:
pass
print_info(site_info, parts_ti, ext, total_size)
if not info_only:
download_urls(part_urls, parts_ti, ext, total_size, output_dir=output_dir, merge=merge)
else:
fvkey = output_json['vl']['vi'][0]['fvkey']
mp4 = output_json['vl']['vi'][0]['cl'].get('ci', None)
if mp4:
mp4 = mp4[0]['keyid'].replace('.10', '.p') + '.mp4'
else:
mp4 = output_json['vl']['vi'][0]['fn']
url = '%s/%s?vkey=%s' % ( url, mp4, fvkey )
url = '%s/%s?vkey=%s' % ( parts_prefix, mp4, fvkey )
_, ext, size = url_info(url, faker=True)
print_info(site_info, title, ext, size)

View File

@ -51,11 +51,11 @@ def yixia_download(url, output_dir = '.', merge = True, info_only = False, **kwa
yixia_download_by_scid = yixia_miaopai_download_by_scid
site_info = "Yixia Miaopai"
if re.match(r'http://www.miaopai.com/show/channel/\w+', url): #PC
if re.match(r'http://www.miaopai.com/show/channel/.+', url): #PC
scid = match1(url, r'http://www.miaopai.com/show/channel/(.+)\.htm')
elif re.match(r'http://www.miaopai.com/show/\w+', url): #PC
elif re.match(r'http://www.miaopai.com/show/.+', url): #PC
scid = match1(url, r'http://www.miaopai.com/show/(.+)\.htm')
elif re.match(r'http://m.miaopai.com/show/channel/\w+', url): #Mobile
elif re.match(r'http://m.miaopai.com/show/channel/.+', url): #Mobile
scid = match1(url, r'http://m.miaopai.com/show/channel/(.+)\.htm')
elif 'xiaokaxiu.com' in hostname: #Xiaokaxiu

View File

@ -314,9 +314,6 @@ class Youku(VideoExtractor):
q = q
)
ksegs += [i['server'] for i in json.loads(get_content(u))]
if (parse_host(ksegs[len(ksegs)-1])[0] == "vali.cp31.ott.cibntv.net"):
ksegs.pop(len(ksegs)-1)
except error.HTTPError as e:
# Use fallback stream data in case of HTTP 404
log.e('[Error] ' + str(e))

View File

@ -155,6 +155,8 @@ class YouTube(VideoExtractor):
try:
ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1))
self.html5player = 'https:' + ytplayer_config['assets']['js']
# Workaround: get_video_info returns a bad 's' (signature). Why?
stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
except:
self.html5player = None
@ -236,7 +238,7 @@ class YouTube(VideoExtractor):
start = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m), s).replace('.', ',')
m, s = divmod(finish, 60); h, m = divmod(m, 60)
finish = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m), s).replace('.', ',')
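# e.g. finish == 75.5 becomes '00:01:15,500' (SubRip wants a comma before the milliseconds)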
content = text.firstChild.nodeValue
content = unescape_html(text.firstChild.nodeValue)
srt += '%s\n' % str(seq)
srt += '%s --> %s\n' % (start, finish)

View File

@ -125,7 +125,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'):
params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i']
params.append(output + '.txt')
params += ['-c', 'copy', output]
params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output]
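# the aac_adtstoasc bitstream filter converts ADTS-framed AAC into the raw form the MP4 container expects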
subprocess.check_call(params)
os.remove(output + '.txt')
@ -212,15 +212,6 @@ def ffmpeg_download_stream(files, title, ext, params={}, output_dir='.'):
if not (output_dir == '.'):
output = output_dir + '/' + output
ffmpeg_params = []
#should these exist...
if params is not None:
if len(params) > 0:
for k, v in params:
ffmpeg_params.append(k)
ffmpeg_params.append(v)
print('Downloading streaming content with FFmpeg, press q to stop recording...')
ffmpeg_params = [FFMPEG] + ['-y', '-re', '-i']
ffmpeg_params.append(files) #not the same here!!!!
@ -230,6 +221,12 @@ def ffmpeg_download_stream(files, title, ext, params={}, output_dir='.'):
else:
ffmpeg_params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc']
if params is not None:
if len(params) > 0:
for k, v in params:
ffmpeg_params.append(k)
ffmpeg_params.append(v)
ffmpeg_params.append(output)
print(' '.join(ffmpeg_params))

View File

@ -10,6 +10,7 @@ def legitimize(text, os=platform.system()):
text = text.translate({
0: None,
ord('/'): '-',
ord('|'): '-',
})
if os == 'Windows':
@ -20,7 +21,6 @@ def legitimize(text, os=platform.system()):
ord('*'): '-',
ord('?'): '-',
ord('\\'): '-',
ord('|'): '-',
ord('\"'): '\'',
# Reserved in Windows VFAT
ord('+'): '-',

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python
script_name = 'you-get'
__version__ = '0.4.555'
__version__ = '0.4.595'

View File

@ -21,9 +21,6 @@ class YouGetTests(unittest.TestCase):
def test_mixcloud(self):
mixcloud.download("http://www.mixcloud.com/DJVadim/north-america-are-you-ready/", info_only=True)
def test_vimeo(self):
vimeo.download("http://vimeo.com/56810854", info_only=True)
def test_youtube(self):
youtube.download("http://www.youtube.com/watch?v=pzKerr0JIPA", info_only=True)
youtube.download("http://youtu.be/pzKerr0JIPA", info_only=True)