Merge pull request #1 from soimort/develop

Project Updating
This commit is contained in:
Yohohaha 2016-11-25 19:16:16 +08:00 committed by GitHub
commit c44a7ec1b9
21 changed files with 675 additions and 166 deletions

View File

@ -37,7 +37,7 @@ Interested? [Install it](#installation) now and [get started by examples](#getti
Are you a Python programmer? Then check out [the source](https://github.com/soimort/you-get) and fork it! Are you a Python programmer? Then check out [the source](https://github.com/soimort/you-get) and fork it!
![](http://i.imgur.com/GfthFAz.png) ![](https://i.imgur.com/GfthFAz.png)
## Installation ## Installation
@ -339,6 +339,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| **Tumblr** | <https://www.tumblr.com/> |✓|✓|✓| | **Tumblr** | <https://www.tumblr.com/> |✓|✓|✓|
| TED | <http://www.ted.com/> |✓| | | | TED | <http://www.ted.com/> |✓| | |
| SoundCloud | <https://soundcloud.com/> | | |✓| | SoundCloud | <https://soundcloud.com/> | | |✓|
| SHOWROOM | <https://www.showroom-live.com/> |✓| | |
| Pinterest | <https://www.pinterest.com/> | |✓| | | Pinterest | <https://www.pinterest.com/> | |✓| |
| MusicPlayOn | <http://en.musicplayon.com/> |✓| | | | MusicPlayOn | <http://en.musicplayon.com/> |✓| | |
| MTV81 | <http://www.mtv81.com/> |✓| | | | MTV81 | <http://www.mtv81.com/> |✓| | |
@ -372,7 +373,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| 爆米花网 | <http://www.baomihua.com/> |✓| | | | 爆米花网 | <http://www.baomihua.com/> |✓| | |
| **bilibili<br/>哔哩哔哩** | <http://www.bilibili.com/> |✓| | | | **bilibili<br/>哔哩哔哩** | <http://www.bilibili.com/> |✓| | |
| Dilidili | <http://www.dilidili.com/> |✓| | | | Dilidili | <http://www.dilidili.com/> |✓| | |
| 豆瓣 | <http://www.douban.com/> | | |✓| | 豆瓣 | <http://www.douban.com/> || |✓|
| 斗鱼 | <http://www.douyutv.com/> |✓| | | | 斗鱼 | <http://www.douyutv.com/> |✓| | |
| Panda<br/>熊猫 | <http://www.panda.tv/> |✓| | | | Panda<br/>熊猫 | <http://www.panda.tv/> |✓| | |
| 凤凰视频 | <http://v.ifeng.com/> |✓| | | | 凤凰视频 | <http://v.ifeng.com/> |✓| | |
@ -406,6 +407,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| 花瓣 | <http://huaban.com/> | |✓| | | 花瓣 | <http://huaban.com/> | |✓| |
| Naver<br/>네이버 | <http://tvcast.naver.com/> |✓| | | | Naver<br/>네이버 | <http://tvcast.naver.com/> |✓| | |
| 芒果TV | <http://www.mgtv.com/> |✓| | | | 芒果TV | <http://www.mgtv.com/> |✓| | |
| 火猫TV | <http://www.huomao.com/> |✓| | |
For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.

View File

@ -27,7 +27,9 @@ SITES = {
'google' : 'google', 'google' : 'google',
'heavy-music' : 'heavymusic', 'heavy-music' : 'heavymusic',
'huaban' : 'huaban', 'huaban' : 'huaban',
'huomao' : 'huomaotv',
'iask' : 'sina', 'iask' : 'sina',
'icourses' : 'icourses',
'ifeng' : 'ifeng', 'ifeng' : 'ifeng',
'imgur' : 'imgur', 'imgur' : 'imgur',
'in' : 'alive', 'in' : 'alive',
@ -340,6 +342,45 @@ def get_content(url, headers={}, decoded=True):
return data return data
def post_content(url, headers={}, post_data={}, decoded=True):
"""Post the content of a URL via sending a HTTP POST request.
Args:
url: A URL.
headers: Request headers used by the client.
decoded: Whether decode the response body using UTF-8 or the charset specified in Content-Type.
Returns:
The content as a string.
"""
logging.debug('post_content: %s \n post_data: %s' % (url, post_data))
req = request.Request(url, headers=headers)
if cookies:
cookies.add_cookie_header(req)
req.headers.update(req.unredirected_hdrs)
post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
response = request.urlopen(req, data = post_data_enc)
data = response.read()
# Handle HTTP compression for gzip and deflate (zlib)
content_encoding = response.getheader('Content-Encoding')
if content_encoding == 'gzip':
data = ungzip(data)
elif content_encoding == 'deflate':
data = undeflate(data)
# Decode the response body
if decoded:
charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)')
if charset is not None:
data = data.decode(charset)
else:
data = data.decode('utf-8')
return data
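For reference, a minimal standalone sketch of the same POST-and-decode pattern the new helper implements, using only the standard library (the endpoint and fields are placeholders):

from urllib import request, parse

def post_form(url, fields, headers={}):
    # URL-encode the form fields and send them as the POST body
    body = parse.urlencode(fields).encode('utf-8')
    req = request.Request(url, data=body, headers=headers)
    with request.urlopen(req) as resp:
        data = resp.read()
        # honor the charset announced in Content-Type, default to UTF-8
        ctype = resp.getheader('Content-Type') or ''
        charset = ctype.split('charset=')[-1] if 'charset=' in ctype else 'utf-8'
        return data.decode(charset)

# print(post_form('http://httpbin.org/post', {'encrypt': 0, 'product': 'share'}))  # placeholder call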
def url_size(url, faker = False, headers = {}): def url_size(url, faker = False, headers = {}):
if faker: if faker:
response = request.urlopen(request.Request(url, headers = fake_headers), None) response = request.urlopen(request.Request(url, headers = fake_headers), None)
@ -507,7 +548,11 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, h
os.remove(filepath) # on Windows rename could fail if destination filepath exists os.remove(filepath) # on Windows rename could fail if destination filepath exists
os.rename(temp_filepath, filepath) os.rename(temp_filepath, filepath)
def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = False, headers = {}): def url_save_chunked(url, filepath, bar, dyn_callback=None, chunk_size=0, ignore_range=False, refer=None, is_part=False, faker=False, headers={}):
def dyn_update_url(received):
if callable(dyn_callback):
logging.debug('Calling callback %s for new URL from %s' % (dyn_callback.__name__, received))
return dyn_callback(received)
if os.path.exists(filepath): if os.path.exists(filepath):
if not force: if not force:
if not is_part: if not is_part:
@ -545,19 +590,26 @@ def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker =
else: else:
headers = {} headers = {}
if received: if received:
url = dyn_update_url(received)
if not ignore_range:
headers['Range'] = 'bytes=' + str(received) + '-' headers['Range'] = 'bytes=' + str(received) + '-'
if refer: if refer:
headers['Referer'] = refer headers['Referer'] = refer
response = request.urlopen(request.Request(url, headers = headers), None) response = request.urlopen(request.Request(url, headers=headers), None)
with open(temp_filepath, open_mode) as output: with open(temp_filepath, open_mode) as output:
this_chunk = received
while True: while True:
buffer = response.read(1024 * 256) buffer = response.read(1024 * 256)
if not buffer: if not buffer:
break break
output.write(buffer) output.write(buffer)
received += len(buffer) received += len(buffer)
if chunk_size and (received - this_chunk) >= chunk_size:
url = dyn_callback(received)
this_chunk = received
response = request.urlopen(request.Request(url, headers=headers), None)
if bar: if bar:
bar.update_received(len(buffer)) bar.update_received(len(buffer))
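The new dyn_callback/chunk_size/ignore_range parameters are for hosts whose links stop serving after a fixed amount of data (the icourses extractor below refreshes every 15 MB). A minimal standalone sketch of the refresh loop, with make_url standing in for the callback (a hypothetical name):

from urllib import request

def save_chunked(make_url, filepath, chunk_size=15000000):
    # make_url(received) is assumed to return a fresh URL valid from byte `received`
    received = 0
    with open(filepath, 'wb') as output:
        resp = request.urlopen(make_url(received))
        mark = received
        while True:
            buf = resp.read(256 * 1024)
            if not buf:
                break
            output.write(buf)
            received += len(buf)
            if chunk_size and received - mark >= chunk_size:
                # the old link is about to expire; ask for a new one and keep going
                mark = received
                resp = request.urlopen(make_url(received))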
@ -806,7 +858,7 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg
print() print()
def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}): def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}, **kwargs):
assert urls assert urls
if dry_run: if dry_run:
print('Real URLs:\n%s\n' % urls) print('Real URLs:\n%s\n' % urls)
@ -820,7 +872,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No
filename = '%s.%s' % (title, ext) filename = '%s.%s' % (title, ext)
filepath = os.path.join(output_dir, filename) filepath = os.path.join(output_dir, filename)
if total_size and ext in ('ts'): if total_size:
if not force and os.path.exists(filepath[:-3] + '.mkv'): if not force and os.path.exists(filepath[:-3] + '.mkv'):
print('Skipping %s: file already exists' % filepath[:-3] + '.mkv') print('Skipping %s: file already exists' % filepath[:-3] + '.mkv')
print() print()
@ -835,7 +887,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No
print('Downloading %s ...' % tr(filename)) print('Downloading %s ...' % tr(filename))
filepath = os.path.join(output_dir, filename) filepath = os.path.join(output_dir, filename)
parts.append(filepath) parts.append(filepath)
url_save_chunked(url, filepath, bar, refer = refer, faker = faker, headers = headers) url_save_chunked(url, filepath, bar, refer = refer, faker = faker, headers = headers, **kwargs)
bar.done() bar.done()
if not merge: if not merge:

View File

@ -24,6 +24,7 @@ from .funshion import *
from .google import * from .google import *
from .heavymusic import * from .heavymusic import *
from .huaban import * from .huaban import *
from .icourses import *
from .ifeng import * from .ifeng import *
from .imgur import * from .imgur import *
from .infoq import * from .infoq import *

View File

@ -73,14 +73,14 @@ def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
assert re.match(r'http://[^\.]+.acfun.[^\.]+/\D/\D\D(\d+)', url) assert re.match(r'http://[^\.]+.acfun.[^\.]+/\D/\D\D(\d+)', url)
html = get_html(url) html = get_html(url)
title = r1(r'<h1 id="txt-title-view">([^<>]+)<', html) title = r1(r'data-title="([^"]+)"', html)
title = unescape_html(title) title = unescape_html(title)
title = escape_file_path(title) title = escape_file_path(title)
assert title assert title
video = re.search('data-vid="(\d+)"\s*data-scode=""[^<]*title="([^"]+)"', html) vid = r1('data-vid="(\d+)"', html)
vid = video.group(1) up = r1('data-name="([^"]+)"', html)
title = title + ' - ' + video.group(2) title = title + ' - ' + up
acfun_download_by_vid(vid, title, acfun_download_by_vid(vid, title,
output_dir=output_dir, output_dir=output_dir,
merge=merge, merge=merge,

View File

@ -7,8 +7,10 @@ from ..common import *
from .embed import * from .embed import *
from .universal import * from .universal import *
def baidu_get_song_data(sid): def baidu_get_song_data(sid):
data = json.loads(get_html('http://music.baidu.com/data/music/fmlink?songIds=%s' % sid, faker = True))['data'] data = json.loads(get_html(
'http://music.baidu.com/data/music/fmlink?songIds=%s' % sid, faker=True))['data']
if data['xcode'] != '': if data['xcode'] != '':
# inside china mainland # inside china mainland
@ -17,22 +19,28 @@ def baidu_get_song_data(sid):
# outside china mainland # outside china mainland
return None return None
def baidu_get_song_url(data): def baidu_get_song_url(data):
return data['songLink'] return data['songLink']
def baidu_get_song_artist(data): def baidu_get_song_artist(data):
return data['artistName'] return data['artistName']
def baidu_get_song_album(data): def baidu_get_song_album(data):
return data['albumName'] return data['albumName']
def baidu_get_song_title(data): def baidu_get_song_title(data):
return data['songName'] return data['songName']
def baidu_get_song_lyric(data): def baidu_get_song_lyric(data):
lrc = data['lrcLink'] lrc = data['lrcLink']
return None if lrc is '' else "http://music.baidu.com%s" % lrc return None if lrc is '' else "http://music.baidu.com%s" % lrc
def baidu_download_song(sid, output_dir='.', merge=True, info_only=False): def baidu_download_song(sid, output_dir='.', merge=True, info_only=False):
data = baidu_get_song_data(sid) data = baidu_get_song_data(sid)
if data is not None: if data is not None:
@ -51,7 +59,8 @@ def baidu_download_song(sid, output_dir='.', merge=True, info_only=False):
type, ext, size = url_info(url, faker=True) type, ext, size = url_info(url, faker=True)
print_info(site_info, title, type, size) print_info(site_info, title, type, size)
if not info_only: if not info_only:
download_urls([url], file_name, ext, size, output_dir, merge=merge, faker=True) download_urls([url], file_name, ext, size,
output_dir, merge=merge, faker=True)
try: try:
type, ext, size = url_info(lrc, faker=True) type, ext, size = url_info(lrc, faker=True)
@ -61,12 +70,14 @@ def baidu_download_song(sid, output_dir='.', merge=True, info_only=False):
except: except:
pass pass
def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False):
html = get_html('http://music.baidu.com/album/%s' % aid, faker = True) def baidu_download_album(aid, output_dir='.', merge=True, info_only=False):
html = get_html('http://music.baidu.com/album/%s' % aid, faker=True)
album_name = r1(r'<h2 class="album-name">(.+?)<\/h2>', html) album_name = r1(r'<h2 class="album-name">(.+?)<\/h2>', html)
artist = r1(r'<span class="author_list" title="(.+?)">', html) artist = r1(r'<span class="author_list" title="(.+?)">', html)
output_dir = '%s/%s - %s' % (output_dir, artist, album_name) output_dir = '%s/%s - %s' % (output_dir, artist, album_name)
ids = json.loads(r1(r'<span class="album-add" data-adddata=\'(.+?)\'>', html).replace('&quot', '').replace(';', '"'))['ids'] ids = json.loads(r1(r'<span class="album-add" data-adddata=\'(.+?)\'>',
html).replace('&quot', '').replace(';', '"'))['ids']
track_nr = 1 track_nr = 1
for id in ids: for id in ids:
song_data = baidu_get_song_data(id) song_data = baidu_get_song_data(id)
@ -75,38 +86,29 @@ def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False)
song_lrc = baidu_get_song_lyric(song_data) song_lrc = baidu_get_song_lyric(song_data)
file_name = '%02d.%s' % (track_nr, song_title) file_name = '%02d.%s' % (track_nr, song_title)
type, ext, size = url_info(song_url, faker = True) type, ext, size = url_info(song_url, faker=True)
print_info(site_info, song_title, type, size) print_info(site_info, song_title, type, size)
if not info_only: if not info_only:
download_urls([song_url], file_name, ext, size, output_dir, merge = merge, faker = True) download_urls([song_url], file_name, ext, size,
output_dir, merge=merge, faker=True)
if song_lrc: if song_lrc:
type, ext, size = url_info(song_lrc, faker = True) type, ext, size = url_info(song_lrc, faker=True)
print_info(site_info, song_title, type, size) print_info(site_info, song_title, type, size)
if not info_only: if not info_only:
download_urls([song_lrc], file_name, ext, size, output_dir, faker = True) download_urls([song_lrc], file_name, ext,
size, output_dir, faker=True)
track_nr += 1 track_nr += 1
def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False, **kwargs):
if re.match(r'http://imgsrc.baidu.com', url):
universal_download(url, output_dir, merge=merge, info_only=info_only)
return
elif re.match(r'http://pan.baidu.com', url): def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=False, **kwargs):
html = get_html(url)
title = r1(r'server_filename="([^"]+)"', html) if re.match(r'http://pan.baidu.com', url):
if len(title.split('.')) > 1: real_url, title, ext, size = baidu_pan_download(url)
title = ".".join(title.split('.')[:-1])
real_url = r1(r'\\"dlink\\":\\"([^"]*)\\"', html).replace('\\\\/', '/')
type, ext, size = url_info(real_url, faker = True)
print_info(site_info, title, ext, size)
if not info_only: if not info_only:
download_urls([real_url], title, ext, size, output_dir, merge = merge) download_urls([real_url], title, ext, size,
output_dir, url, merge=merge, faker=True)
elif re.match(r'http://music.baidu.com/album/\d+', url): elif re.match(r'http://music.baidu.com/album/\d+', url):
id = r1(r'http://music.baidu.com/album/(\d+)', url) id = r1(r'http://music.baidu.com/album/(\d+)', url)
baidu_download_album(id, output_dir, merge, info_only) baidu_download_album(id, output_dir, merge, info_only)
@ -124,17 +126,20 @@ def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info
html = get_html(url) html = get_html(url)
title = r1(r'title:"([^"]+)"', html) title = r1(r'title:"([^"]+)"', html)
items = re.findall(r'//imgsrc.baidu.com/forum/w[^"]+/([^/"]+)', html) items = re.findall(
r'//imgsrc.baidu.com/forum/w[^"]+/([^/"]+)', html)
urls = ['http://imgsrc.baidu.com/forum/pic/item/' + i urls = ['http://imgsrc.baidu.com/forum/pic/item/' + i
for i in set(items)] for i in set(items)]
# handle albums # handle albums
kw = r1(r'kw=([^&]+)', html) or r1(r"kw:'([^']+)'", html) kw = r1(r'kw=([^&]+)', html) or r1(r"kw:'([^']+)'", html)
tid = r1(r'tid=(\d+)', html) or r1(r"tid:'([^']+)'", html) tid = r1(r'tid=(\d+)', html) or r1(r"tid:'([^']+)'", html)
album_url = 'http://tieba.baidu.com/photo/g/bw/picture/list?kw=%s&tid=%s' % (kw, tid) album_url = 'http://tieba.baidu.com/photo/g/bw/picture/list?kw=%s&tid=%s' % (
kw, tid)
album_info = json.loads(get_content(album_url)) album_info = json.loads(get_content(album_url))
for i in album_info['data']['pic_list']: for i in album_info['data']['pic_list']:
urls.append('http://imgsrc.baidu.com/forum/pic/item/' + i['pic_id'] + '.jpg') urls.append(
'http://imgsrc.baidu.com/forum/pic/item/' + i['pic_id'] + '.jpg')
ext = 'jpg' ext = 'jpg'
size = float('Inf') size = float('Inf')
@ -144,6 +149,170 @@ def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info
download_urls(urls, title, ext, size, download_urls(urls, title, ext, size,
output_dir=output_dir, merge=False) output_dir=output_dir, merge=False)
def baidu_pan_download(url):
errno_patt = r'errno":([^"]+),'
refer_url = ""
fake_headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'UTF-8,*;q=0.5',
'Accept-Encoding': 'gzip,deflate,sdch',
'Accept-Language': 'en-US,en;q=0.8',
'Host': 'pan.baidu.com',
'Origin': 'http://pan.baidu.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2500.0 Safari/537.36',
'Referer': refer_url
}
if cookies:
print('Use user specified cookies')
else:
print('Generating cookies...')
fake_headers['Cookie'] = baidu_pan_gen_cookies(url)
refer_url = "http://pan.baidu.com"
html = get_content(url, fake_headers, decoded=True)
isprotected = False
sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse(
html)
if sign == None:
if re.findall(r'\baccess-code\b', html):
isprotected = True
sign, timestamp, bdstoken, appid, primary_id, fs_id, uk, fake_headers, psk = baidu_pan_protected_share(
url)
# raise NotImplementedError("Password required!")
if isprotected != True:
raise AssertionError("Share not found or canceled: %s" % url)
if bdstoken == None:
bdstoken = ""
if isprotected != True:
sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse(
html)
request_url = "http://pan.baidu.com/api/sharedownload?sign=%s&timestamp=%s&bdstoken=%s&channel=chunlei&clienttype=0&web=1&app_id=%s" % (
sign, timestamp, bdstoken, appid)
refer_url = url
post_data = {
'encrypt': 0,
'product': 'share',
'uk': uk,
'primaryid': primary_id,
'fid_list': '[' + fs_id + ']'
}
if isprotected == True:
post_data['sekey'] = psk
response_content = post_content(request_url, fake_headers, post_data, True)
errno = match1(response_content, errno_patt)
if errno != "0":
raise AssertionError(
"Server refused to provide download link! (Errno:%s)" % errno)
real_url = r1(r'dlink":"([^"]+)"', response_content).replace('\\/', '/')
title = r1(r'server_filename":"([^"]+)"', response_content)
assert real_url
type, ext, size = url_info(real_url, faker=True)
title_wrapped = json.loads('{"wrapper":"%s"}' % title)
title = title_wrapped['wrapper']
logging.debug(real_url)
print_info(site_info, title, ext, size)
print('Hold on...')
time.sleep(5)
return real_url, title, ext, size
def baidu_pan_parse(html):
sign_patt = r'sign":"([^"]+)"'
timestamp_patt = r'timestamp":([^"]+),'
appid_patt = r'app_id":"([^"]+)"'
bdstoken_patt = r'bdstoken":"([^"]+)"'
fs_id_patt = r'fs_id":([^"]+),'
uk_patt = r'uk":([^"]+),'
errno_patt = r'errno":([^"]+),'
primary_id_patt = r'shareid":([^"]+),'
sign = match1(html, sign_patt)
timestamp = match1(html, timestamp_patt)
appid = match1(html, appid_patt)
bdstoken = match1(html, bdstoken_patt)
fs_id = match1(html, fs_id_patt)
uk = match1(html, uk_patt)
primary_id = match1(html, primary_id_patt)
return sign, timestamp, bdstoken, appid, primary_id, fs_id, uk
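baidu_pan_parse just lifts fields out of the inline yunData blob on the share page with regexes; for example, against a made-up snippet of the same shape (match1 re-implemented here so the sketch is standalone):

import re

def match1(text, pattern):
    m = re.search(pattern, text)
    return m.group(1) if m else None

# hypothetical snippet shaped like the yunData blob on a share page
sample = '"sign":"abc123","timestamp":1480000000,"shareid":4567,"uk":890,"fs_id":12345,'
print(match1(sample, r'sign":"([^"]+)"'))      # abc123
print(match1(sample, r'timestamp":([^"]+),'))  # 1480000000
print(match1(sample, r'shareid":([^"]+),'))    # 4567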
def baidu_pan_gen_cookies(url, post_data=None):
from http import cookiejar
cookiejar = cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cookiejar))
resp = opener.open('http://pan.baidu.com')
if post_data != None:
resp = opener.open(url, bytes(parse.urlencode(post_data), 'utf-8'))
return cookjar2hdr(cookiejar)
def baidu_pan_protected_share(url):
print('This share is protected by password!')
inpwd = input('Please provide unlock password: ')
inpwd = inpwd.replace(' ', '').replace('\t', '')
print('Please wait...')
post_pwd = {
'pwd': inpwd,
'vcode': None,
'vstr': None
}
from http import cookiejar
import time
cookiejar = cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cookiejar))
resp = opener.open('http://pan.baidu.com')
resp = opener.open(url)
init_url = resp.geturl()
verify_url = 'http://pan.baidu.com/share/verify?%s&t=%s&channel=chunlei&clienttype=0&web=1' % (
init_url.split('?', 1)[1], int(time.time()))
refer_url = init_url
fake_headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'UTF-8,*;q=0.5',
'Accept-Encoding': 'gzip,deflate,sdch',
'Accept-Language': 'en-US,en;q=0.8',
'Host': 'pan.baidu.com',
'Origin': 'http://pan.baidu.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2500.0 Safari/537.36',
'Referer': refer_url
}
opener.addheaders = dict2triplet(fake_headers)
pwd_resp = opener.open(verify_url, bytes(
parse.urlencode(post_pwd), 'utf-8'))
pwd_resp_str = ungzip(pwd_resp.read()).decode('utf-8')
pwd_res = json.loads(pwd_resp_str)
if pwd_res['errno'] != 0:
raise AssertionError(
'Server returned an error: %s (Incorrect password?)' % pwd_res['errno'])
pg_resp = opener.open('http://pan.baidu.com/share/link?%s' %
init_url.split('?', 1)[1])
content = ungzip(pg_resp.read()).decode('utf-8')
sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse(
content)
psk = query_cookiejar(cookiejar, 'BDCLND')
psk = parse.unquote(psk)
fake_headers['Cookie'] = cookjar2hdr(cookiejar)
return sign, timestamp, bdstoken, appid, primary_id, fs_id, uk, fake_headers, psk
def cookjar2hdr(cookiejar):
cookie_str = ''
for i in cookiejar:
cookie_str = cookie_str + i.name + '=' + i.value + ';'
return cookie_str[:-1]
def query_cookiejar(cookiejar, name):
for i in cookiejar:
if i.name == name:
return i.value
def dict2triplet(dictin):
out_triplet = []
for i in dictin:
out_triplet.append((i, dictin[i]))
return out_triplet
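These three helpers bridge an http.cookiejar.CookieJar and the plain strings/(name, value) pairs urllib wants. A standalone illustration (requires network; BDCLND only exists after the password-verify step, so it may well be None here):

from http import cookiejar
from urllib import request

cj = cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(cj))
opener.open('http://pan.baidu.com')   # fills cj with whatever cookies the server sets

# cookjar2hdr: flatten the jar into a single Cookie header value
cookie_header = ';'.join('%s=%s' % (c.name, c.value) for c in cj)

# query_cookiejar: read one cookie back by name (None if absent)
bdclnd = next((c.value for c in cj if c.name == 'BDCLND'), None)

# dict2triplet: turn a header dict into the (name, value) pairs opener.addheaders expects
opener.addheaders = [(k, v) for k, v in {'Referer': 'http://pan.baidu.com'}.items()]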
site_info = "Baidu.com" site_info = "Baidu.com"
download = baidu_download download = baidu_download
download_playlist = playlist_not_supported("baidu") download_playlist = playlist_not_supported("baidu")

View File

@ -119,17 +119,21 @@ def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_o
def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs): def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
html = get_content(url) html = get_content(url)
if re.match(r'https?://bangumi\.bilibili\.com/', url):
# quick hack for bangumi URLs
url = r1(r'"([^"]+)" class="v-av-link"', html)
html = get_content(url)
title = r1_of([r'<meta name="title" content="\s*([^<>]{1,999})\s*" />', title = r1_of([r'<meta name="title" content="\s*([^<>]{1,999})\s*" />',
r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html) r'<h1[^>]*>\s*([^<>]+)\s*</h1>'], html)
if title: if title:
title = unescape_html(title) title = unescape_html(title)
title = escape_file_path(title) title = escape_file_path(title)
if re.match(r'https?://bangumi\.bilibili\.com/', url):
# quick hack for bangumi URLs
episode_id = r1(r'data-current-episode-id="(\d+)"', html)
cont = post_content('http://bangumi.bilibili.com/web_api/get_source',
post_data={'episode_id': episode_id})
cid = json.loads(cont)['result']['cid']
bilibili_download_by_cid(str(cid), title, output_dir=output_dir, merge=merge, info_only=info_only)
else:
flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"', flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"',
r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html)
assert flashvars assert flashvars

View File

@ -4,6 +4,11 @@ __all__ = ['dailymotion_download']
from ..common import * from ..common import *
def extract_m3u(url):
content = get_content(url)
m3u_url = re.findall(r'http://.*', content)[0]
return match1(m3u_url, r'([^#]+)')
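Dailymotion's per-quality URL now points at a tiny manifest whose first non-comment line is the real playlist; extract_m3u grabs that line and strips any #fragment. Against a fabricated manifest:

import re

manifest = '''#EXTM3U
http://proxy.example/video/240/playlist.m3u8#cell=core'''

m3u_url = re.findall(r'http://.*', manifest)[0]
clean = re.match(r'([^#]+)', m3u_url).group(1)
print(clean)   # http://proxy.example/video/240/playlist.m3u8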
def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
"""Downloads Dailymotion videos by URL. """Downloads Dailymotion videos by URL.
""" """
@ -13,7 +18,7 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False,
title = match1(html, r'"video_title"\s*:\s*"([^"]+)"') or \ title = match1(html, r'"video_title"\s*:\s*"([^"]+)"') or \
match1(html, r'"title"\s*:\s*"([^"]+)"') match1(html, r'"title"\s*:\s*"([^"]+)"')
for quality in ['720','480','380','240','auto']: for quality in ['1080','720','480','380','240','auto']:
try: try:
real_url = info[quality][0]["url"] real_url = info[quality][0]["url"]
if real_url: if real_url:
@ -21,11 +26,12 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False,
except KeyError: except KeyError:
pass pass
type, ext, size = url_info(real_url) m3u_url = extract_m3u(real_url)
mime, ext, size = 'video/mp4', 'mp4', 0
print_info(site_info, title, type, size) print_info(site_info, title, mime, size)
if not info_only: if not info_only:
download_urls([real_url], title, ext, size, output_dir, merge = merge) download_url_ffmpeg(m3u_url, title, ext, output_dir=output_dir, merge=merge)
site_info = "Dailymotion.com" site_info = "Dailymotion.com"
download = dailymotion_download download = dailymotion_download

View File

@ -7,7 +7,18 @@ from ..common import *
def douban_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): def douban_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
html = get_html(url) html = get_html(url)
if 'subject' in url:
if re.match(r'https?://movie', url):
title = match1(html, 'name="description" content="([^"]+)')
tid = match1(url, 'trailer/(\d+)')
real_url = 'https://movie.douban.com/trailer/video_url?tid=%s' % tid
type, ext, size = url_info(real_url)
print_info(site_info, title, type, size)
if not info_only:
download_urls([real_url], title, ext, size, output_dir, merge = merge)
elif 'subject' in url:
titles = re.findall(r'data-title="([^"]*)">', html) titles = re.findall(r'data-title="([^"]*)">', html)
song_id = re.findall(r'<li class="song-item" id="([^"]*)"', html) song_id = re.findall(r'<li class="song-item" id="([^"]*)"', html)
song_ssid = re.findall(r'data-ssid="([^"]*)"', html) song_ssid = re.findall(r'data-ssid="([^"]*)"', html)

View File

@ -25,7 +25,7 @@ youku_embed_patterns = [ 'youku\.com/v_show/id_([a-zA-Z0-9=]+)',
""" """
http://www.tudou.com/programs/view/html5embed.action?type=0&amp;code=3LS_URGvl54&amp;lcode=&amp;resourceId=0_06_05_99 http://www.tudou.com/programs/view/html5embed.action?type=0&amp;code=3LS_URGvl54&amp;lcode=&amp;resourceId=0_06_05_99
""" """
tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_]+)\&', tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_-]+)\&',
'www\.tudou\.com/v/([a-zA-Z0-9_-]+)/[^"]*v\.swf' 'www\.tudou\.com/v/([a-zA-Z0-9_-]+)/[^"]*v\.swf'
] ]

View File

@ -0,0 +1,36 @@
#!/usr/bin/env python
__all__ = ['huomaotv_download']
from ..common import *
def get_mobile_room_url(room_id):
return 'http://www.huomao.com/mobile/mob_live/%s' % room_id
def get_m3u8_url(stream_id):
return 'http://live-ws.huomaotv.cn/live/%s/playlist.m3u8' % stream_id
def huomaotv_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
room_id_pattern = r'huomao.com/(\d+)'
room_id = match1(url, room_id_pattern)
html = get_content(get_mobile_room_url(room_id))
stream_id_pattern = r'id="html_stream" value="(\w+)"'
stream_id = match1(html, stream_id_pattern)
m3u8_url = get_m3u8_url(stream_id)
title = match1(html, r'<title>([^<]{1,9999})</title>')
print_info(site_info, title, 'm3u8', float('inf'))
if not info_only:
download_url_ffmpeg(m3u8_url, title, 'm3u8', None, output_dir=output_dir, merge=merge)
site_info = 'huomao.com'
download = huomaotv_download
download_playlist = playlist_not_supported('huomao')
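The whole extractor hangs off the numeric room id: fetch the mobile page, read the hidden html_stream value, and plug it into the live-ws playlist template. Condensed into a standalone sketch (the room id is a placeholder):

import re
from urllib.request import urlopen

room_id = re.search(r'huomao\.com/(\d+)', 'http://www.huomao.com/123456').group(1)
mobile_page = urlopen('http://www.huomao.com/mobile/mob_live/%s' % room_id).read().decode('utf-8')
stream_id = re.search(r'id="html_stream" value="(\w+)"', mobile_page).group(1)
m3u8_url = 'http://live-ws.huomaotv.cn/live/%s/playlist.m3u8' % stream_id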

View File

@ -0,0 +1,148 @@
#!/usr/bin/env python
from ..common import *
from urllib import parse
import random
from time import sleep
import xml.etree.ElementTree as ET
import datetime
import hashlib
import base64
import logging
from urllib import error
import re
__all__ = ['icourses_download']
def icourses_download(url, merge=False, output_dir='.', **kwargs):
icourses_parser = ICousesExactor(url=url)
real_url = icourses_parser.icourses_cn_url_parser(**kwargs)
title = icourses_parser.title
if real_url is not None:
for tries in range(0, 5):
try:
_, type_, size = url_info(real_url, faker=True)
break
except error.HTTPError:
logging.warning('Failed to fetch the video file! Retrying...')
sleep(random.Random().randint(0, 5)) # Back off a little to avoid being blocked
real_url = icourses_parser.icourses_cn_url_parser()
title = icourses_parser.title
print_info(site_info, title, type_, size)
if not kwargs['info_only']:
download_urls_chunked([real_url], title, 'flv',
total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True, ignore_range=True, chunk_size=15000000, dyn_callback=icourses_parser.icourses_cn_url_parser)
# Why we don't use VideoExtractor: this site needs a special download method
class ICousesExactor(object):
def __init__(self, url):
self.url = url
self.title = ''
return
def icourses_playlist_download(self, **kwargs):
html = get_content(self.url)
page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)'
video_js_number = r'changeforvideo\((.*?)\)'
fs_flag = r'<input type="hidden" value=(\w+) id="firstShowFlag">'
page_navi_vars = re.search(pattern=page_type_patt, string=html)
dummy_page = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'.format(
page_navi_vars.group(2), page_navi_vars.group(1))
html = get_content(dummy_page)
fs_status = match1(html, fs_flag)
video_list = re.findall(pattern=video_js_number, string=html)
for video in video_list:
video_args = video.replace('\'', '').split(',')
video_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}&firstShowFlag={}'.format(
video_args[0], video_args[1], fs_status or '1')
sleep(random.Random().randint(0, 5)) # Back off a little to avoid being blocked
icourses_download(video_url, **kwargs)
def icourses_cn_url_parser(self, received=0, **kwargs):
PLAYER_BASE_VER = '150606-1'
ENCRYPT_MOD_VER = '151020'
ENCRYPT_SALT = '3DAPmXsZ4o' # It took a really long time to find this...
html = get_content(self.url)
if re.search(pattern=r'showSectionNode\(.*\)', string=html):
logging.warning('Switching to playlist mode!')
return self.icourses_playlist_download(**kwargs)
flashvars_patt = r'var\ flashvars\=((.|\n)*)};'
server_time_patt = r'MPlayer.swf\?v\=(\d+)'
uuid_patt = r'uuid:(\d+)'
other_args_patt = r'other:"(.*)"'
res_url_patt = r'IService:\'([^\']+)'
title_a_patt = r'<div class="con"> <a.*?>(.*?)</a>'
title_b_patt = r'<div class="con"> <a.*?/a>((.|\n)*?)</div>'
title_a = match1(html, title_a_patt).strip()
title_b = match1(html, title_b_patt).strip()
title = title_a + title_b # WIP, FIXME
title = re.sub('( +|\n|\t|\r|\&nbsp\;)', '',
unescape_html(title).replace(' ', ''))
server_time = match1(html, server_time_patt)
flashvars = match1(html, flashvars_patt)
uuid = match1(flashvars, uuid_patt)
other_args = match1(flashvars, other_args_patt)
res_url = match1(flashvars, res_url_patt)
url_parts = {'v': server_time, 'other': other_args,
'uuid': uuid, 'IService': res_url}
req_url = '%s?%s' % (res_url, parse.urlencode(url_parts))
logging.debug('Requesting video resource location...')
xml_resp = get_html(req_url)
xml_obj = ET.fromstring(xml_resp)
logging.debug('The result was {}'.format(xml_obj.get('status')))
if xml_obj.get('status') != 'success':
raise ValueError('Server returned error!')
if received:
play_type = 'seek'
else:
play_type = 'play'
received -= 1
common_args = {'lv': PLAYER_BASE_VER, 'ls': play_type,
'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'),
'start': received + 1}
media_host = xml_obj.find(".//*[@name='host']").text
media_url = media_host + xml_obj.find(".//*[@name='url']").text
# This is what they call the `SSLModule`... but it is really just a bit of
# obfuscation and does nothing to protect data integrity
if xml_obj.find(".//*[@name='ssl']").text != 'true':
logging.debug('The encryption mode is disabled')
# when the so-called `SSLMode` is not activated, the parameters `h`
# and `p` can be found in the response
arg_h = xml_obj.find(".//*[@name='h']").text
assert arg_h
arg_r = xml_obj.find(".//*[@name='p']").text or ENCRYPT_MOD_VER
url_args = common_args.copy()
url_args.update({'h': arg_h, 'r': arg_r})
final_url = '{}?{}'.format(
media_url, parse.urlencode(url_args))
self.title = title
return final_url
# when the `SSLMode` is activated, we need to receive the timestamp and the
# time offset (?) value from the server
logging.debug('The encryption mode is in effect')
ssl_callback = get_html(
'{}/ssl/ssl.shtml'.format(media_host)).split(',')
ssl_timestamp = int(datetime.datetime.strptime(
ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0]))
sign_this = ENCRYPT_SALT + \
parse.urlparse(media_url).path + str(ssl_timestamp)
arg_h = base64.b64encode(hashlib.md5(
bytes(sign_this, 'utf-8')).digest())
# Post-processing; may be subject to change, so leaving it as is...
arg_h = arg_h.decode('utf-8').strip('=').replace('+',
'-').replace('/', '_')
arg_r = ssl_timestamp
url_args = common_args.copy()
url_args.update({'h': arg_h, 'r': arg_r, 'p': ENCRYPT_MOD_VER})
final_url = '{}?{}'.format(
media_url, parse.urlencode(url_args))
logging.debug('Crafted URL: {}'.format(final_url))
self.title = title
return final_url
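When the `SSLMode` is active, `h` is an MD5 of the salt, the media path and a server-synchronised timestamp, base64-encoded and made URL-safe. Just the signing step, as a standalone sketch (path and timestamp are examples):

import base64, hashlib
from urllib import parse

ENCRYPT_SALT = '3DAPmXsZ4o'
media_url = 'http://media.example.icourses.cn/video/lesson01.flv'   # example only
ssl_timestamp = 1480000000                                          # example only

sign_this = ENCRYPT_SALT + parse.urlparse(media_url).path + str(ssl_timestamp)
arg_h = base64.b64encode(hashlib.md5(sign_this.encode('utf-8')).digest())
arg_h = arg_h.decode('utf-8').strip('=').replace('+', '-').replace('/', '_')
print(arg_h)   # goes into the final URL as ?h=...&r=<timestamp>&p=151020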
site_info = 'icourses.cn'
download = icourses_download
# download_playlist = icourses_playlist_download

View File

@ -12,11 +12,11 @@ import re
class MGTV(VideoExtractor): class MGTV(VideoExtractor):
name = "芒果 (MGTV)" name = "芒果 (MGTV)"
# Last updated: 2015-11-24 # Last updated: 2016-11-13
stream_types = [ stream_types = [
{'id': 'hd', 'container': 'flv', 'video_profile': '超清'}, {'id': 'hd', 'container': 'ts', 'video_profile': '超清'},
{'id': 'sd', 'container': 'flv', 'video_profile': '高清'}, {'id': 'sd', 'container': 'ts', 'video_profile': '高清'},
{'id': 'ld', 'container': 'flv', 'video_profile': '标清'}, {'id': 'ld', 'container': 'ts', 'video_profile': '标清'},
] ]
id_dic = {i['video_profile']:(i['id']) for i in stream_types} id_dic = {i['video_profile']:(i['id']) for i in stream_types}
@ -27,7 +27,7 @@ class MGTV(VideoExtractor):
def get_vid_from_url(url): def get_vid_from_url(url):
"""Extracts video ID from URL. """Extracts video ID from URL.
""" """
return match1(url, 'http://www.mgtv.com/v/\d/\d+/\w+/(\d+).html') return match1(url, 'http://www.mgtv.com/b/\d+/(\d+).html')
#---------------------------------------------------------------------- #----------------------------------------------------------------------
@staticmethod @staticmethod
@ -44,10 +44,15 @@ class MGTV(VideoExtractor):
content = get_content(content['info']) #get the REAL M3U url, maybe to be changed later? content = get_content(content['info']) #get the REAL M3U url, maybe to be changed later?
segment_list = [] segment_list = []
segments_size = 0
for i in content.split(): for i in content.split():
if not i.startswith('#'): #not the best way, better we use the m3u8 package if not i.startswith('#'): #not the best way, better we use the m3u8 package
segment_list.append(base_url + i) segment_list.append(base_url + i)
return segment_list # use the EXT-MGTV-File-SIZE tag for a fast size calculation
elif i.startswith('#EXT-MGTV-File-SIZE:'):
segments_size += int(i[i.rfind(':')+1:])
return m3u_url, segments_size, segment_list
def download_playlist_by_url(self, url, **kwargs): def download_playlist_by_url(self, url, **kwargs):
pass pass
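The size of each stream is now read from the #EXT-MGTV-File-SIZE tags embedded in the playlist instead of HEAD-requesting every segment. A standalone sketch over a fabricated playlist:

playlist = '''#EXTM3U
#EXT-MGTV-File-SIZE:1048576
seg-0.ts
#EXT-MGTV-File-SIZE:2097152
seg-1.ts'''

base_url = 'http://example.mgtv.com/path/'     # example only
segments, total = [], 0
for line in playlist.split():
    if line.startswith('#EXT-MGTV-File-SIZE:'):
        total += int(line[line.rfind(':') + 1:])
    elif not line.startswith('#'):
        segments.append(base_url + line)
print(total, segments)   # 3145728 and the two full segment URLs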
@ -69,14 +74,10 @@ class MGTV(VideoExtractor):
quality_id = self.id_dic[s['video_profile']] quality_id = self.id_dic[s['video_profile']]
url = stream_available[s['video_profile']] url = stream_available[s['video_profile']]
url = re.sub( r'(\&arange\=\d+)', '', url) #Un-Hum url = re.sub( r'(\&arange\=\d+)', '', url) #Un-Hum
segment_list_this = self.get_mgtv_real_url(url) m3u8_url, m3u8_size, segment_list_this = self.get_mgtv_real_url(url)
container_this_stream = ''
size_this_stream = 0
stream_fileid_list = [] stream_fileid_list = []
for i in segment_list_this: for i in segment_list_this:
_, container_this_stream, size_this_seg = url_info(i)
size_this_stream += size_this_seg
stream_fileid_list.append(os.path.basename(i).split('.')[0]) stream_fileid_list.append(os.path.basename(i).split('.')[0])
#make pieces #make pieces
@ -85,10 +86,11 @@ class MGTV(VideoExtractor):
pieces.append({'fileid': i[0], 'segs': i[1],}) pieces.append({'fileid': i[0], 'segs': i[1],})
self.streams[quality_id] = { self.streams[quality_id] = {
'container': 'flv', 'container': s['container'],
'video_profile': s['video_profile'], 'video_profile': s['video_profile'],
'size': size_this_stream, 'size': m3u8_size,
'pieces': pieces 'pieces': pieces,
'm3u8_url': m3u8_url
} }
if not kwargs['info_only']: if not kwargs['info_only']:
@ -107,6 +109,44 @@ class MGTV(VideoExtractor):
# Extract stream with the best quality # Extract stream with the best quality
stream_id = self.streams_sorted[0]['id'] stream_id = self.streams_sorted[0]['id']
def download(self, **kwargs):
if 'stream_id' in kwargs and kwargs['stream_id']:
stream_id = kwargs['stream_id']
else:
stream_id = 'null'
# print video info only
if 'info_only' in kwargs and kwargs['info_only']:
if stream_id != 'null':
if 'index' not in kwargs:
self.p(stream_id)
else:
self.p_i(stream_id)
else:
# Display all available streams
if 'index' not in kwargs:
self.p([])
else:
stream_id = self.streams_sorted[0]['id'] if 'id' in self.streams_sorted[0] else self.streams_sorted[0]['itag']
self.p_i(stream_id)
# default to use the best quality
if stream_id == 'null':
stream_id = self.streams_sorted[0]['id']
stream_info = self.streams[stream_id]
if not kwargs['info_only']:
if player:
# hand the m3u8 URL to the player, since some video players (e.g. mpv) can process such URLs directly
launch_player(player, [stream_info['m3u8_url']])
else:
download_urls(stream_info['src'], self.title, stream_info['container'], stream_info['size'],
output_dir=kwargs['output_dir'],
merge=kwargs['merge'],
av=stream_id in self.dash_streams)
site = MGTV() site = MGTV()
download = site.download_by_url download = site.download_by_url
download_playlist = site.download_playlist_by_url download_playlist = site.download_playlist_by_url

View File

@ -55,12 +55,14 @@ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=Fals
cover_url = j['result']['coverImgUrl'] cover_url = j['result']['coverImgUrl']
download_urls([cover_url], "cover", "jpg", 0, new_dir) download_urls([cover_url], "cover", "jpg", 0, new_dir)
for i in j['result']['tracks']: prefix_width = len(str(len(j['result']['tracks'])))
netease_song_download(i, output_dir=new_dir, info_only=info_only) for n, i in enumerate(j['result']['tracks']):
playlist_prefix = '%%.%dd_' % prefix_width % n
netease_song_download(i, output_dir=new_dir, info_only=info_only, playlist_prefix=playlist_prefix)
try: # download lyrics try: # download lyrics
assert kwargs['caption'] assert kwargs['caption']
l = loads(get_content("http://music.163.com/api/song/lyric/?id=%s&lv=-1&csrf_token=" % i['id'], headers={"Referer": "http://music.163.com/"})) l = loads(get_content("http://music.163.com/api/song/lyric/?id=%s&lv=-1&csrf_token=" % i['id'], headers={"Referer": "http://music.163.com/"}))
netease_lyric_download(i, l["lrc"]["lyric"], output_dir=new_dir, info_only=info_only) netease_lyric_download(i, l["lrc"]["lyric"], output_dir=new_dir, info_only=info_only, playlist_prefix=playlist_prefix)
except: pass except: pass
elif "song" in url: elif "song" in url:
@ -85,10 +87,10 @@ def netease_cloud_music_download(url, output_dir='.', merge=True, info_only=Fals
j = loads(get_content("http://music.163.com/api/mv/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"})) j = loads(get_content("http://music.163.com/api/mv/detail/?id=%s&ids=[%s]&csrf_token=" % (rid, rid), headers={"Referer": "http://music.163.com/"}))
netease_video_download(j['data'], output_dir=output_dir, info_only=info_only) netease_video_download(j['data'], output_dir=output_dir, info_only=info_only)
def netease_lyric_download(song, lyric, output_dir='.', info_only=False): def netease_lyric_download(song, lyric, output_dir='.', info_only=False, playlist_prefix=""):
if info_only: return if info_only: return
title = "%s. %s" % (song['position'], song['name']) title = "%s%s. %s" % (playlist_prefix, song['position'], song['name'])
filename = '%s.lrc' % get_filename(title) filename = '%s.lrc' % get_filename(title)
print('Saving %s ...' % filename, end="", flush=True) print('Saving %s ...' % filename, end="", flush=True)
with open(os.path.join(output_dir, filename), with open(os.path.join(output_dir, filename),
@ -103,8 +105,8 @@ def netease_video_download(vinfo, output_dir='.', info_only=False):
netease_download_common(title, url_best, netease_download_common(title, url_best,
output_dir=output_dir, info_only=info_only) output_dir=output_dir, info_only=info_only)
def netease_song_download(song, output_dir='.', info_only=False): def netease_song_download(song, output_dir='.', info_only=False, playlist_prefix=""):
title = "%s. %s" % (song['position'], song['name']) title = "%s%s. %s" % (playlist_prefix, song['position'], song['name'])
songNet = 'p' + song['mp3Url'].split('/')[2][1:] songNet = 'p' + song['mp3Url'].split('/')[2][1:]
if 'hMusic' in song and song['hMusic'] != None: if 'hMusic' in song and song['hMusic'] != None:
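The playlist change above gives every track a zero-padded position prefix so the files sort in playlist order; the width comes from the track count and is applied with a two-stage %-format. A quick standalone illustration:

tracks = ['song-a', 'song-b', 'song-c']              # stand-ins for the API track objects
prefix_width = len(str(len(tracks)))                 # 1 digit for <10 tracks, 2 for <100, ...
for n, name in enumerate(tracks):
    playlist_prefix = '%%.%dd_' % prefix_width % n   # '%.1d_' % 0 -> '0_'
    print(playlist_prefix + name)                    # 0_song-a, 1_song-b, 2_song-c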

View File

@ -7,17 +7,62 @@ from .qie import download as qieDownload
from urllib.parse import urlparse,parse_qs from urllib.parse import urlparse,parse_qs
def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False): def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False):
api = "http://h5vv.video.qq.com/getinfo?otype=json&platform=10901&vid=%s" % vid info_api = 'http://vv.video.qq.com/getinfo?otype=json&appver=3%2E2%2E19%2E333&platform=11&defnpayver=1&vid=' + vid
content = get_html(api) info = get_html(info_api)
output_json = json.loads(match1(content, r'QZOutputJson=(.*)')[:-1]) video_json = json.loads(match1(info, r'QZOutputJson=(.*)')[:-1])
url = output_json['vl']['vi'][0]['ul']['ui'][0]['url'] parts_vid = video_json['vl']['vi'][0]['vid']
parts_ti = video_json['vl']['vi'][0]['ti']
parts_prefix = video_json['vl']['vi'][0]['ul']['ui'][0]['url']
parts_formats = video_json['fl']['fi']
# find best quality
# only looking for fhd(1080p) and shd(720p) here.
# 480p usually comes as a single file and is downloaded as the fallback.
best_quality = ''
for part_format in parts_formats:
if part_format['name'] == 'fhd':
best_quality = 'fhd'
break
if part_format['name'] == 'shd':
best_quality = 'shd'
for part_format in parts_formats:
if (not best_quality == '') and (not part_format['name'] == best_quality):
continue
part_format_id = part_format['id']
part_format_sl = part_format['sl']
if part_format_sl == 0:
part_urls= []
total_size = 0
try:
# For fhd(1080p), every part is about 100 MB and 6 minutes long;
# trying up to 100 parts here caps the longest single video at roughly 10 hours.
for part in range(1,100):
filename = vid + '.p' + str(part_format_id % 1000) + '.' + str(part) + '.mp4'
key_api = "http://vv.video.qq.com/getkey?otype=json&platform=11&format=%s&vid=%s&filename=%s" % (part_format_id, parts_vid, filename)
#print(filename)
#print(key_api)
part_info = get_html(key_api)
key_json = json.loads(match1(part_info, r'QZOutputJson=(.*)')[:-1])
#print(key_json)
vkey = key_json['key']
url = '%s/%s?vkey=%s' % (parts_prefix, filename, vkey)
part_urls.append(url)
_, ext, size = url_info(url, faker=True)
total_size += size
except:
pass
print_info(site_info, parts_ti, ext, total_size)
if not info_only:
download_urls(part_urls, parts_ti, ext, total_size, output_dir=output_dir, merge=merge)
else:
fvkey = output_json['vl']['vi'][0]['fvkey'] fvkey = video_json['vl']['vi'][0]['fvkey']
mp4 = output_json['vl']['vi'][0]['cl'].get('ci', None) mp4 = video_json['vl']['vi'][0]['cl'].get('ci', None)
if mp4: if mp4:
mp4 = mp4[0]['keyid'].replace('.10', '.p') + '.mp4' mp4 = mp4[0]['keyid'].replace('.10', '.p') + '.mp4'
else: else:
mp4 = output_json['vl']['vi'][0]['fn'] mp4 = video_json['vl']['vi'][0]['fn']
url = '%s/%s?vkey=%s' % ( url, mp4, fvkey ) url = '%s/%s?vkey=%s' % ( parts_prefix, mp4, fvkey )
_, ext, size = url_info(url, faker=True) _, ext, size = url_info(url, faker=True)
print_info(site_info, title, ext, size) print_info(site_info, title, ext, size)
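The new getinfo response lists the available formats under fl.fi; the code first scans for fhd, falls back to shd, and otherwise keeps the single-file route. A standalone sketch of that selection step (the format list is invented, in the same shape):

parts_formats = [                      # shape of video_json['fl']['fi'], values invented
    {'name': 'sd', 'id': 100002, 'sl': 0},
    {'name': 'shd', 'id': 100003, 'sl': 0},
    {'name': 'fhd', 'id': 100004, 'sl': 0},
]

best_quality = ''
for fmt in parts_formats:
    if fmt['name'] == 'fhd':
        best_quality = 'fhd'
        break
    if fmt['name'] == 'shd':
        best_quality = 'shd'

chosen = [f for f in parts_formats if not best_quality or f['name'] == best_quality]
print(best_quality, chosen)            # 'fhd', only the fhd entry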

View File

@ -51,11 +51,11 @@ def yixia_download(url, output_dir = '.', merge = True, info_only = False, **kwa
yixia_download_by_scid = yixia_miaopai_download_by_scid yixia_download_by_scid = yixia_miaopai_download_by_scid
site_info = "Yixia Miaopai" site_info = "Yixia Miaopai"
if re.match(r'http://www.miaopai.com/show/channel/\w+', url): #PC if re.match(r'http://www.miaopai.com/show/channel/.+', url): #PC
scid = match1(url, r'http://www.miaopai.com/show/channel/(.+)\.htm') scid = match1(url, r'http://www.miaopai.com/show/channel/(.+)\.htm')
elif re.match(r'http://www.miaopai.com/show/\w+', url): #PC elif re.match(r'http://www.miaopai.com/show/.+', url): #PC
scid = match1(url, r'http://www.miaopai.com/show/(.+)\.htm') scid = match1(url, r'http://www.miaopai.com/show/(.+)\.htm')
elif re.match(r'http://m.miaopai.com/show/channel/\w+', url): #Mobile elif re.match(r'http://m.miaopai.com/show/channel/.+', url): #Mobile
scid = match1(url, r'http://m.miaopai.com/show/channel/(.+)\.htm') scid = match1(url, r'http://m.miaopai.com/show/channel/(.+)\.htm')
elif 'xiaokaxiu.com' in hostname: #Xiaokaxiu elif 'xiaokaxiu.com' in hostname: #Xiaokaxiu

View File

@ -314,9 +314,6 @@ class Youku(VideoExtractor):
q = q q = q
) )
ksegs += [i['server'] for i in json.loads(get_content(u))] ksegs += [i['server'] for i in json.loads(get_content(u))]
if (parse_host(ksegs[len(ksegs)-1])[0] == "vali.cp31.ott.cibntv.net"):
ksegs.pop(len(ksegs)-1)
except error.HTTPError as e: except error.HTTPError as e:
# Use fallback stream data in case of HTTP 404 # Use fallback stream data in case of HTTP 404
log.e('[Error] ' + str(e)) log.e('[Error] ' + str(e))

View File

@ -155,6 +155,8 @@ class YouTube(VideoExtractor):
try: try:
ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1))
self.html5player = 'https:' + ytplayer_config['assets']['js'] self.html5player = 'https:' + ytplayer_config['assets']['js']
# Workaround: get_video_info returns bad s. Why?
stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
except: except:
self.html5player = None self.html5player = None
@ -236,7 +238,7 @@ class YouTube(VideoExtractor):
start = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m), s).replace('.', ',') start = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m), s).replace('.', ',')
m, s = divmod(finish, 60); h, m = divmod(m, 60) m, s = divmod(finish, 60); h, m = divmod(m, 60)
finish = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m), s).replace('.', ',') finish = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m), s).replace('.', ',')
content = text.firstChild.nodeValue content = unescape_html(text.firstChild.nodeValue)
srt += '%s\n' % str(seq) srt += '%s\n' % str(seq)
srt += '%s --> %s\n' % (start, finish) srt += '%s --> %s\n' % (start, finish)

View File

@ -125,7 +125,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'):
params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i'] params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i']
params.append(output + '.txt') params.append(output + '.txt')
params += ['-c', 'copy', output] params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output]
subprocess.check_call(params) subprocess.check_call(params)
os.remove(output + '.txt') os.remove(output + '.txt')
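The added -bsf:a aac_adtstoasc bitstream filter is required when stream-copying AAC audio from ADTS-framed input (FLV/TS segments) into MP4. Roughly the command the function now builds (the LOGLEVEL value is assumed):

FFMPEG, LOGLEVEL = 'ffmpeg', ['-loglevel', 'quiet']   # LOGLEVEL value assumed
output = 'output.mp4'

params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i']
params.append(output + '.txt')
params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output]
print(' '.join(params))
# ffmpeg -loglevel quiet -f concat -safe -1 -y -i output.mp4.txt -c copy -bsf:a aac_adtstoasc output.mp4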
@ -212,15 +212,6 @@ def ffmpeg_download_stream(files, title, ext, params={}, output_dir='.'):
if not (output_dir == '.'): if not (output_dir == '.'):
output = output_dir + '/' + output output = output_dir + '/' + output
ffmpeg_params = []
#should these exist...
if params is not None:
if len(params) > 0:
for k, v in params:
ffmpeg_params.append(k)
ffmpeg_params.append(v)
print('Downloading streaming content with FFmpeg, press q to stop recording...') print('Downloading streaming content with FFmpeg, press q to stop recording...')
ffmpeg_params = [FFMPEG] + ['-y', '-re', '-i'] ffmpeg_params = [FFMPEG] + ['-y', '-re', '-i']
ffmpeg_params.append(files) #not the same here!!!! ffmpeg_params.append(files) #not the same here!!!!
@ -230,6 +221,12 @@ def ffmpeg_download_stream(files, title, ext, params={}, output_dir='.'):
else: else:
ffmpeg_params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc'] ffmpeg_params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc']
if params is not None:
if len(params) > 0:
for k, v in params:
ffmpeg_params.append(k)
ffmpeg_params.append(v)
ffmpeg_params.append(output) ffmpeg_params.append(output)
print(' '.join(ffmpeg_params)) print(' '.join(ffmpeg_params))

View File

@ -10,6 +10,7 @@ def legitimize(text, os=platform.system()):
text = text.translate({ text = text.translate({
0: None, 0: None,
ord('/'): '-', ord('/'): '-',
ord('|'): '-',
}) })
if os == 'Windows': if os == 'Windows':
@ -20,7 +21,6 @@ def legitimize(text, os=platform.system()):
ord('*'): '-', ord('*'): '-',
ord('?'): '-', ord('?'): '-',
ord('\\'): '-', ord('\\'): '-',
ord('|'): '-',
ord('\"'): '\'', ord('\"'): '\'',
# Reserved in Windows VFAT # Reserved in Windows VFAT
ord('+'): '-', ord('+'): '-',
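Moving ord('|') into the platform-independent table means pipes are now sanitized on every OS, not only on Windows; str.translate does the substitution from a {codepoint: replacement} mapping. A small illustration:

text = 'part1|part2/part3'
print(text.translate({
    0: None,            # strip NUL bytes
    ord('/'): '-',      # path separator, replaced on every platform
    ord('|'): '-',      # now also replaced everywhere, not only on Windows
}))
# part1-part2-part3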

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python
script_name = 'you-get' script_name = 'you-get'
__version__ = '0.4.555' __version__ = '0.4.595'

View File

@ -21,9 +21,6 @@ class YouGetTests(unittest.TestCase):
def test_mixcloud(self): def test_mixcloud(self):
mixcloud.download("http://www.mixcloud.com/DJVadim/north-america-are-you-ready/", info_only=True) mixcloud.download("http://www.mixcloud.com/DJVadim/north-america-are-you-ready/", info_only=True)
def test_vimeo(self):
vimeo.download("http://vimeo.com/56810854", info_only=True)
def test_youtube(self): def test_youtube(self):
youtube.download("http://www.youtube.com/watch?v=pzKerr0JIPA", info_only=True) youtube.download("http://www.youtube.com/watch?v=pzKerr0JIPA", info_only=True)
youtube.download("http://youtu.be/pzKerr0JIPA", info_only=True) youtube.download("http://youtu.be/pzKerr0JIPA", info_only=True)