From fb97d7d0c3a4b5c4db28b2f08dee82faa914b7c4 Mon Sep 17 00:00:00 2001 From: Bochun Bai Date: Mon, 3 Oct 2016 00:18:50 +0800 Subject: [PATCH 01/29] Tencent Video use best quality Prefer 1080p and 720p if available --- src/you_get/extractors/qq.py | 75 ++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 15 deletions(-) diff --git a/src/you_get/extractors/qq.py b/src/you_get/extractors/qq.py index 9ca8af82..f1707527 100644 --- a/src/you_get/extractors/qq.py +++ b/src/you_get/extractors/qq.py @@ -7,22 +7,67 @@ from .qie import download as qieDownload from urllib.parse import urlparse,parse_qs def qq_download_by_vid(vid, title, output_dir='.', merge=True, info_only=False): - api = "http://h5vv.video.qq.com/getinfo?otype=json&platform=10901&vid=%s" % vid - content = get_html(api) - output_json = json.loads(match1(content, r'QZOutputJson=(.*)')[:-1]) - url = output_json['vl']['vi'][0]['ul']['ui'][0]['url'] - fvkey = output_json['vl']['vi'][0]['fvkey'] - mp4 = output_json['vl']['vi'][0]['cl'].get('ci', None) - if mp4: - mp4 = mp4[0]['keyid'].replace('.10', '.p') + '.mp4' - else: - mp4 = output_json['vl']['vi'][0]['fn'] - url = '%s/%s?vkey=%s' % ( url, mp4, fvkey ) - _, ext, size = url_info(url, faker=True) + info_api = 'http://vv.video.qq.com/getinfo?otype=json&appver=3%2E2%2E19%2E333&platform=11&defnpayver=1&vid=' + vid + info = get_html(info_api) + video_json = json.loads(match1(info, r'QZOutputJson=(.*)')[:-1]) + parts_vid = video_json['vl']['vi'][0]['vid'] + parts_ti = video_json['vl']['vi'][0]['ti'] + parts_prefix = video_json['vl']['vi'][0]['ul']['ui'][0]['url'] + parts_formats = video_json['fl']['fi'] + # find best quality + # only looking for fhd(1080p) and shd(720p) here. + # 480p usually come with a single file, will be downloaded as fallback. + best_quality = '' + for part_format in parts_formats: + if part_format['name'] == 'fhd': + best_quality = 'fhd' + break - print_info(site_info, title, ext, size) - if not info_only: - download_urls([url], title, ext, size, output_dir=output_dir, merge=merge) + if part_format['name'] == 'shd': + best_quality = 'shd' + + for part_format in parts_formats: + if (not best_quality == '') and (not part_format['name'] == best_quality): + continue + part_format_id = part_format['id'] + part_format_sl = part_format['sl'] + if part_format_sl == 0: + part_urls= [] + total_size = 0 + try: + # For fhd(1080p), every part is about 100M and 6 minutes + # try 100 parts here limited download longest single video of 10 hours. + for part in range(1,100): + filename = vid + '.p' + str(part_format_id % 1000) + '.' 
+ str(part) + '.mp4' + key_api = "http://vv.video.qq.com/getkey?otype=json&platform=11&format=%s&vid=%s&filename=%s" % (part_format_id, parts_vid, filename) + #print(filename) + #print(key_api) + part_info = get_html(key_api) + key_json = json.loads(match1(part_info, r'QZOutputJson=(.*)')[:-1]) + #print(key_json) + vkey = key_json['key'] + url = '%s/%s?vkey=%s' % (parts_prefix, filename, vkey) + part_urls.append(url) + _, ext, size = url_info(url, faker=True) + total_size += size + except: + pass + print_info(site_info, parts_ti, ext, total_size) + if not info_only: + download_urls(part_urls, parts_ti, ext, total_size, output_dir=output_dir, merge=merge) + else: + fvkey = output_json['vl']['vi'][0]['fvkey'] + mp4 = output_json['vl']['vi'][0]['cl'].get('ci', None) + if mp4: + mp4 = mp4[0]['keyid'].replace('.10', '.p') + '.mp4' + else: + mp4 = output_json['vl']['vi'][0]['fn'] + url = '%s/%s?vkey=%s' % ( parts_prefix, mp4, fvkey ) + _, ext, size = url_info(url, faker=True) + + print_info(site_info, title, ext, size) + if not info_only: + download_urls([url], title, ext, size, output_dir=output_dir, merge=merge) def qq_download(url, output_dir='.', merge=True, info_only=False, **kwargs): From 3b3e5cfe38fde46afe0ebf2717802c44e8028706 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 5 Oct 2016 14:38:02 +0200 Subject: [PATCH 02/29] update README (close #1422) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a99e57fd..1b653308 100644 --- a/README.md +++ b/README.md @@ -339,6 +339,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | **Tumblr** | |✓|✓|✓| | TED | |✓| | | | SoundCloud | | | |✓| +| SHOWROOM | |✓| | | | Pinterest | | |✓| | | MusicPlayOn | |✓| | | | MTV81 | |✓| | | From c9ffae970e3ebc6131c4b4a6593320ab6fe65675 Mon Sep 17 00:00:00 2001 From: Vicent Tsai Date: Mon, 10 Oct 2016 00:13:01 +0800 Subject: [PATCH 03/29] [AcFun] fix #1429 --- src/you_get/extractors/acfun.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py index 4638cb8f..87e005fb 100644 --- a/src/you_get/extractors/acfun.py +++ b/src/you_get/extractors/acfun.py @@ -73,14 +73,14 @@ def acfun_download(url, output_dir='.', merge=True, info_only=False, **kwargs): assert re.match(r'http://[^\.]+.acfun.[^\.]+/\D/\D\D(\d+)', url) html = get_html(url) - title = r1(r'
([^<>]+)<', html) + title = r1(r'data-title="([^"]+)"', html) title = unescape_html(title) title = escape_file_path(title) assert title - video = re.search('data-vid="(\d+)"\s*data-scode=""[^<]*title="([^"]+)"', html) - vid = video.group(1) - title = title + ' - ' + video.group(2) + vid = r1('data-vid="(\d+)"', html) + up = r1('data-name="([^"]+)"', html) + title = title + ' - ' + up acfun_download_by_vid(vid, title, output_dir=output_dir, merge=merge, From 21fc4d4a0999e70b7886d3abd0cfaa6e4244dbae Mon Sep 17 00:00:00 2001 From: chinat Date: Mon, 10 Oct 2016 11:28:45 +0800 Subject: [PATCH 04/29] translate char | to - in filename for ffmpeg concat --- src/you_get/util/fs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/util/fs.py b/src/you_get/util/fs.py index 36e0b29d..4f415bf0 100644 --- a/src/you_get/util/fs.py +++ b/src/you_get/util/fs.py @@ -10,6 +10,7 @@ def legitimize(text, os=platform.system()): text = text.translate({ 0: None, ord('/'): '-', + ord('|'): '-', }) if os == 'Windows': @@ -20,7 +21,6 @@ def legitimize(text, os=platform.system()): ord('*'): '-', ord('?'): '-', ord('\\'): '-', - ord('|'): '-', ord('\"'): '\'', # Reserved in Windows VFAT ord('+'): '-', From e2f86641ab5354fa103f29709a21ffdb4cf574f8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Fri, 14 Oct 2016 20:32:05 +0200 Subject: [PATCH 05/29] update README: use https --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1b653308..b994ebd1 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Interested? [Install it](#installation) now and [get started by examples](#getti Are you a Python programmer? Then check out [the source](https://github.com/soimort/you-get) and fork it! -![](http://i.imgur.com/GfthFAz.png) +![](https://i.imgur.com/GfthFAz.png) ## Installation From 669d7b558655f4ffa530ad24573936f90119ced2 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 19 Oct 2016 20:47:17 +0200 Subject: [PATCH 06/29] [youtube] unescape HTML entities, fix #1462 --- src/you_get/extractors/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 0ef390ed..33e3923e 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -236,7 +236,7 @@ class YouTube(VideoExtractor): start = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m), s).replace('.', ',') m, s = divmod(finish, 60); h, m = divmod(m, 60) finish = '{:0>2}:{:0>2}:{:06.3f}'.format(int(h), int(m), s).replace('.', ',') - content = text.firstChild.nodeValue + content = unescape_html(text.firstChild.nodeValue) srt += '%s\n' % str(seq) srt += '%s --> %s\n' % (start, finish) From 95a8d1e8afdd6df60b3e87ac739cf836b0d0d837 Mon Sep 17 00:00:00 2001 From: Cheng Gu Date: Thu, 20 Oct 2016 14:19:45 +0800 Subject: [PATCH 07/29] [huomaotv] add huomao.com suppport --- src/you_get/common.py | 1 + src/you_get/extractors/huomaotv.py | 36 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 src/you_get/extractors/huomaotv.py diff --git a/src/you_get/common.py b/src/you_get/common.py index 7f76aaac..3a60bf12 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -27,6 +27,7 @@ SITES = { 'google' : 'google', 'heavy-music' : 'heavymusic', 'huaban' : 'huaban', + 'huomao' : 'huomaotv', 'iask' : 'sina', 'ifeng' : 'ifeng', 'imgur' : 'imgur', diff --git a/src/you_get/extractors/huomaotv.py b/src/you_get/extractors/huomaotv.py new file mode 100644 index 
00000000..4852ff06 --- /dev/null +++ b/src/you_get/extractors/huomaotv.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python + +__all__ = ['huomaotv_download'] + +from ..common import * + + +def get_mobile_room_url(room_id): + return 'http://www.huomao.com/mobile/mob_live?cid=%s' % room_id + + +def get_m3u8_url(stream_id): + return 'http://live-ws.huomaotv.cn/live/%s/playlist.m3u8' % stream_id + + +def huomaotv_download(url, output_dir='.', merge=True, info_only=False, **kwargs): + room_id_pattern = r'huomao.com/(\d+)' + room_id = match1(url, room_id_pattern) + html = get_content(get_mobile_room_url(room_id)) + + stream_id_pattern = r'id="html_stream" value="(\w+)"' + stream_id = match1(html, stream_id_pattern) + + m3u8_url = get_m3u8_url(stream_id) + + title = match1(html, r'([^<]{1,9999})') + + print_info(site_info, title, 'm3u8', float('inf')) + + if not info_only: + download_url_ffmpeg(m3u8_url, title, 'm3u8', None, output_dir=output_dir, merge=merge) + + +site_info = 'huomao.com' +download = huomaotv_download +download_playlist = playlist_not_supported('huomao') From 78fa1161310216e102f84f950b01d43c08899550 Mon Sep 17 00:00:00 2001 From: liushuyu Date: Fri, 14 Oct 2016 00:03:56 -0600 Subject: [PATCH 08/29] Add POST method to common.py --- src/you_get/common.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/you_get/common.py b/src/you_get/common.py index 6c65bd49..3e152732 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -327,6 +327,45 @@ def get_content(url, headers={}, decoded=True): return data +def post_content(url, headers={}, post_data={}, decoded=True): + """Post the content of a URL via sending a HTTP POST request. + + Args: + url: A URL. + headers: Request headers used by the client. + decoded: Whether decode the response body using UTF-8 or the charset specified in Content-Type. + + Returns: + The content as a string. 
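+
+        Example (hypothetical; the endpoint and form fields below are
+        made-up placeholders, not a real API):
+
+            html = post_content('http://example.com/login',
+                                headers=fake_headers,
+                                post_data={'user': 'me', 'pwd': 'secret'})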
+ """ + + logging.debug('post_content: %s \n post_data: %s' % (url, post_data)) + + req = request.Request(url, headers=headers) + if cookies: + cookies.add_cookie_header(req) + req.headers.update(req.unredirected_hdrs) + post_data_enc = bytes(parse.urlencode(post_data), 'utf-8') + response = request.urlopen(req, data = post_data_enc) + data = response.read() + + # Handle HTTP compression for gzip and deflate (zlib) + content_encoding = response.getheader('Content-Encoding') + if content_encoding == 'gzip': + data = ungzip(data) + elif content_encoding == 'deflate': + data = undeflate(data) + + # Decode the response body + if decoded: + charset = match1(response.getheader('Content-Type'), r'charset=([\w-]+)') + if charset is not None: + data = data.decode(charset) + else: + data = data.decode('utf-8') + + return data + def url_size(url, faker = False, headers = {}): if faker: response = request.urlopen(request.Request(url, headers = fake_headers), None) From aef17dcb9926f9e2d056a92796dcf58537a42d63 Mon Sep 17 00:00:00 2001 From: liushuyu Date: Fri, 14 Oct 2016 00:03:56 -0600 Subject: [PATCH 09/29] Add Baidu Cloud support --- src/you_get/extractors/baidu.py | 225 ++++++++++++++++++++++++++++---- 1 file changed, 197 insertions(+), 28 deletions(-) mode change 100755 => 100644 src/you_get/extractors/baidu.py diff --git a/src/you_get/extractors/baidu.py b/src/you_get/extractors/baidu.py old mode 100755 new mode 100644 index aa9caa0c..d5efaf0b --- a/src/you_get/extractors/baidu.py +++ b/src/you_get/extractors/baidu.py @@ -7,8 +7,10 @@ from ..common import * from .embed import * from .universal import * + def baidu_get_song_data(sid): - data = json.loads(get_html('http://music.baidu.com/data/music/fmlink?songIds=%s' % sid, faker = True))['data'] + data = json.loads(get_html( + 'http://music.baidu.com/data/music/fmlink?songIds=%s' % sid, faker=True))['data'] if data['xcode'] != '': # inside china mainland @@ -17,22 +19,28 @@ def baidu_get_song_data(sid): # outside china mainland return None + def baidu_get_song_url(data): return data['songLink'] + def baidu_get_song_artist(data): return data['artistName'] + def baidu_get_song_album(data): return data['albumName'] + def baidu_get_song_title(data): return data['songName'] + def baidu_get_song_lyric(data): lrc = data['lrcLink'] return None if lrc is '' else "http://music.baidu.com%s" % lrc + def baidu_download_song(sid, output_dir='.', merge=True, info_only=False): data = baidu_get_song_data(sid) if data is not None: @@ -51,7 +59,8 @@ def baidu_download_song(sid, output_dir='.', merge=True, info_only=False): type, ext, size = url_info(url, faker=True) print_info(site_info, title, type, size) if not info_only: - download_urls([url], file_name, ext, size, output_dir, merge=merge, faker=True) + download_urls([url], file_name, ext, size, + output_dir, merge=merge, faker=True) try: type, ext, size = url_info(lrc, faker=True) @@ -61,12 +70,14 @@ def baidu_download_song(sid, output_dir='.', merge=True, info_only=False): except: pass -def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False): - html = get_html('http://music.baidu.com/album/%s' % aid, faker = True) + +def baidu_download_album(aid, output_dir='.', merge=True, info_only=False): + html = get_html('http://music.baidu.com/album/%s' % aid, faker=True) album_name = r1(r'
(.+?)<\/h2>', html) artist = r1(r'', html) output_dir = '%s/%s - %s' % (output_dir, artist, album_name) - ids = json.loads(r1(r'', html).replace('"', '').replace(';', '"'))['ids'] + ids = json.loads(r1(r'', + html).replace('"', '').replace(';', '"'))['ids'] track_nr = 1 for id in ids: song_data = baidu_get_song_data(id) @@ -75,38 +86,29 @@ def baidu_download_album(aid, output_dir = '.', merge = True, info_only = False) song_lrc = baidu_get_song_lyric(song_data) file_name = '%02d.%s' % (track_nr, song_title) - type, ext, size = url_info(song_url, faker = True) + type, ext, size = url_info(song_url, faker=True) print_info(site_info, song_title, type, size) if not info_only: - download_urls([song_url], file_name, ext, size, output_dir, merge = merge, faker = True) + download_urls([song_url], file_name, ext, size, + output_dir, merge=merge, faker=True) if song_lrc: - type, ext, size = url_info(song_lrc, faker = True) + type, ext, size = url_info(song_lrc, faker=True) print_info(site_info, song_title, type, size) if not info_only: - download_urls([song_lrc], file_name, ext, size, output_dir, faker = True) + download_urls([song_lrc], file_name, ext, + size, output_dir, faker=True) track_nr += 1 -def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False, **kwargs): - if re.match(r'http://imgsrc.baidu.com', url): - universal_download(url, output_dir, merge=merge, info_only=info_only) - return - elif re.match(r'http://pan.baidu.com', url): - html = get_html(url) +def baidu_download(url, output_dir='.', stream_type=None, merge=True, info_only=False, **kwargs): - title = r1(r'server_filename="([^"]+)"', html) - if len(title.split('.')) > 1: - title = ".".join(title.split('.')[:-1]) - - real_url = r1(r'\\"dlink\\":\\"([^"]*)\\"', html).replace('\\\\/', '/') - type, ext, size = url_info(real_url, faker = True) - - print_info(site_info, title, ext, size) + if re.match(r'http://pan.baidu.com', url): + real_url, title, ext, size = baidu_pan_download(url) if not info_only: - download_urls([real_url], title, ext, size, output_dir, merge = merge) - + download_urls([real_url], title, ext, size, + output_dir, url, merge=merge, faker=True) elif re.match(r'http://music.baidu.com/album/\d+', url): id = r1(r'http://music.baidu.com/album/(\d+)', url) baidu_download_album(id, output_dir, merge, info_only) @@ -124,17 +126,20 @@ def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info html = get_html(url) title = r1(r'title:"([^"]+)"', html) - items = re.findall(r'//imgsrc.baidu.com/forum/w[^"]+/([^/"]+)', html) + items = re.findall( + r'//imgsrc.baidu.com/forum/w[^"]+/([^/"]+)', html) urls = ['http://imgsrc.baidu.com/forum/pic/item/' + i for i in set(items)] # handle albums kw = r1(r'kw=([^&]+)', html) or r1(r"kw:'([^']+)'", html) tid = r1(r'tid=(\d+)', html) or r1(r"tid:'([^']+)'", html) - album_url = 'http://tieba.baidu.com/photo/g/bw/picture/list?kw=%s&tid=%s' % (kw, tid) + album_url = 'http://tieba.baidu.com/photo/g/bw/picture/list?kw=%s&tid=%s' % ( + kw, tid) album_info = json.loads(get_content(album_url)) for i in album_info['data']['pic_list']: - urls.append('http://imgsrc.baidu.com/forum/pic/item/' + i['pic_id'] + '.jpg') + urls.append( + 'http://imgsrc.baidu.com/forum/pic/item/' + i['pic_id'] + '.jpg') ext = 'jpg' size = float('Inf') @@ -144,6 +149,170 @@ def baidu_download(url, output_dir = '.', stream_type = None, merge = True, info download_urls(urls, title, ext, size, output_dir=output_dir, merge=False) + +def baidu_pan_download(url): + 
errno_patt = r'errno":([^"]+),' + refer_url = "" + fake_headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Charset': 'UTF-8,*;q=0.5', + 'Accept-Encoding': 'gzip,deflate,sdch', + 'Accept-Language': 'en-US,en;q=0.8', + 'Host': 'pan.baidu.com', + 'Origin': 'http://pan.baidu.com', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2500.0 Safari/537.36', + 'Referer': refer_url + } + if cookies: + print('Use user specified cookies') + else: + print('Generating cookies...') + fake_headers['Cookie'] = baidu_pan_gen_cookies(url) + refer_url = "http://pan.baidu.com" + html = get_content(url, fake_headers, decoded=True) + isprotected = False + sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse( + html) + if sign == None: + if re.findall(r'\baccess-code\b', html): + isprotected = True + sign, timestamp, bdstoken, appid, primary_id, fs_id, uk, fake_headers, psk = baidu_pan_protected_share( + url) + # raise NotImplementedError("Password required!") + if isprotected != True: + raise AssertionError("Share not found or canceled: %s" % url) + if bdstoken == None: + bdstoken = "" + if isprotected != True: + sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse( + html) + request_url = "http://pan.baidu.com/api/sharedownload?sign=%s×tamp=%s&bdstoken=%s&channel=chunlei&clienttype=0&web=1&app_id=%s" % ( + sign, timestamp, bdstoken, appid) + refer_url = url + post_data = { + 'encrypt': 0, + 'product': 'share', + 'uk': uk, + 'primaryid': primary_id, + 'fid_list': '[' + fs_id + ']' + } + if isprotected == True: + post_data['sekey'] = psk + response_content = post_content(request_url, fake_headers, post_data, True) + errno = match1(response_content, errno_patt) + if errno != "0": + raise AssertionError( + "Server refused to provide download link! 
(Errno:%s)" % errno) + real_url = r1(r'dlink":"([^"]+)"', response_content).replace('\\/', '/') + title = r1(r'server_filename":"([^"]+)"', response_content) + assert real_url + type, ext, size = url_info(real_url, faker=True) + title_wrapped = json.loads('{"wrapper":"%s"}' % title) + title = title_wrapped['wrapper'] + logging.debug(real_url) + print_info(site_info, title, ext, size) + print('Hold on...') + time.sleep(5) + return real_url, title, ext, size + + +def baidu_pan_parse(html): + sign_patt = r'sign":"([^"]+)"' + timestamp_patt = r'timestamp":([^"]+),' + appid_patt = r'app_id":"([^"]+)"' + bdstoken_patt = r'bdstoken":"([^"]+)"' + fs_id_patt = r'fs_id":([^"]+),' + uk_patt = r'uk":([^"]+),' + errno_patt = r'errno":([^"]+),' + primary_id_patt = r'shareid":([^"]+),' + sign = match1(html, sign_patt) + timestamp = match1(html, timestamp_patt) + appid = match1(html, appid_patt) + bdstoken = match1(html, bdstoken_patt) + fs_id = match1(html, fs_id_patt) + uk = match1(html, uk_patt) + primary_id = match1(html, primary_id_patt) + return sign, timestamp, bdstoken, appid, primary_id, fs_id, uk + + +def baidu_pan_gen_cookies(url, post_data=None): + from http import cookiejar + cookiejar = cookiejar.CookieJar() + opener = request.build_opener(request.HTTPCookieProcessor(cookiejar)) + resp = opener.open('http://pan.baidu.com') + if post_data != None: + resp = opener.open(url, bytes(parse.urlencode(post_data), 'utf-8')) + return cookjar2hdr(cookiejar) + + +def baidu_pan_protected_share(url): + print('This share is protected by password!') + inpwd = input('Please provide unlock password: ') + inpwd = inpwd.replace(' ', '').replace('\t', '') + print('Please wait...') + post_pwd = { + 'pwd': inpwd, + 'vcode': None, + 'vstr': None + } + from http import cookiejar + import time + cookiejar = cookiejar.CookieJar() + opener = request.build_opener(request.HTTPCookieProcessor(cookiejar)) + resp = opener.open('http://pan.baidu.com') + resp = opener.open(url) + init_url = resp.geturl() + verify_url = 'http://pan.baidu.com/share/verify?%s&t=%s&channel=chunlei&clienttype=0&web=1' % ( + init_url.split('?', 1)[1], int(time.time())) + refer_url = init_url + fake_headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Charset': 'UTF-8,*;q=0.5', + 'Accept-Encoding': 'gzip,deflate,sdch', + 'Accept-Language': 'en-US,en;q=0.8', + 'Host': 'pan.baidu.com', + 'Origin': 'http://pan.baidu.com', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:13.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2500.0 Safari/537.36', + 'Referer': refer_url + } + opener.addheaders = dict2triplet(fake_headers) + pwd_resp = opener.open(verify_url, bytes( + parse.urlencode(post_pwd), 'utf-8')) + pwd_resp_str = ungzip(pwd_resp.read()).decode('utf-8') + pwd_res = json.loads(pwd_resp_str) + if pwd_res['errno'] != 0: + raise AssertionError( + 'Server returned an error: %s (Incorrect password?)' % pwd_res['errno']) + pg_resp = opener.open('http://pan.baidu.com/share/link?%s' % + init_url.split('?', 1)[1]) + content = ungzip(pg_resp.read()).decode('utf-8') + sign, timestamp, bdstoken, appid, primary_id, fs_id, uk = baidu_pan_parse( + content) + psk = query_cookiejar(cookiejar, 'BDCLND') + psk = parse.unquote(psk) + fake_headers['Cookie'] = cookjar2hdr(cookiejar) + return sign, timestamp, bdstoken, appid, primary_id, fs_id, uk, fake_headers, psk + + +def cookjar2hdr(cookiejar): + cookie_str = '' + for i in cookiejar: + cookie_str = cookie_str + i.name + '=' + i.value + ';' + return cookie_str[:-1] 
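+# A sketch of what cookjar2hdr produces, with made-up cookie names: a jar
+# holding BDUSS=abc and STOKEN=def serializes to 'BDUSS=abc;STOKEN=def' --
+# name=value pairs joined by ';', trailing separator stripped, ready for use
+# as a Cookie request header.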
+ + +def query_cookiejar(cookiejar, name): + for i in cookiejar: + if i.name == name: + return i.value + + +def dict2triplet(dictin): + out_triplet = [] + for i in dictin: + out_triplet.append((i, dictin[i])) + return out_triplet + site_info = "Baidu.com" download = baidu_download download_playlist = playlist_not_supported("baidu") From 4bbafeb9e48e76b7b622f2133685905b362a9096 Mon Sep 17 00:00:00 2001 From: liushuyu Date: Thu, 20 Oct 2016 13:09:30 -0600 Subject: [PATCH 10/29] icourse: add supprt --- src/you_get/common.py | 1 + src/you_get/extractors/__init__.py | 1 + src/you_get/extractors/icourses.py | 129 +++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+) create mode 100644 src/you_get/extractors/icourses.py diff --git a/src/you_get/common.py b/src/you_get/common.py index 948b0ca2..ca867673 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -29,6 +29,7 @@ SITES = { 'huaban' : 'huaban', 'huomao' : 'huomaotv', 'iask' : 'sina', + 'icourses' : 'icourses', 'ifeng' : 'ifeng', 'imgur' : 'imgur', 'in' : 'alive', diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index e69bc2fd..61b6a0d1 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -24,6 +24,7 @@ from .funshion import * from .google import * from .heavymusic import * from .huaban import * +from .icourses import * from .ifeng import * from .imgur import * from .infoq import * diff --git a/src/you_get/extractors/icourses.py b/src/you_get/extractors/icourses.py new file mode 100644 index 00000000..5f9b8edf --- /dev/null +++ b/src/you_get/extractors/icourses.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +from ..common import * +from urllib import parse +import xml.etree.ElementTree as ET +import datetime +import hashlib +import base64 +import logging +from urllib import error +import re + +__all__ = ['icourses_download'] + + +def icourses_download(url, info_only, merge=False, output_dir='.', **kwargs): + title, real_url = icourses_cn_url_parser( + url, info_only=info_only, **kwargs) + if real_url is not None: + for tries in range(0, 3): + try: + _, type_, size = url_info(real_url, faker=True) + break + except error.HTTPError: + logging.warning('Failed to fetch the video file! 
Retrying...') + title, real_url = icourses_cn_url_parser(url) + print_info(site_info, title, type_, size) + if not info_only: + download_urls([real_url], title, 'flv', + total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True) + + +def icourses_playlist_download(url, **kwargs): + import random + from time import sleep + html = get_content(url) + page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' + video_js_number = r'changeforvideo\((.*?)\)' + fs_flag = r'' + page_navi_vars = re.search(pattern=page_type_patt, string=html) + dummy_page = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'.format( + page_navi_vars.group(2), page_navi_vars.group(1)) + html = get_content(dummy_page) + fs_status = match1(html, fs_flag) + video_list = re.findall(pattern=video_js_number, string=html) + for video in video_list: + video_args = video.replace('\'', '').split(',') + video_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}&firstShowFlag={}'.format( + video_args[0], video_args[1], fs_status or '1') + sleep(random.Random().randint(0, 5)) # Prevent from blockage + icourses_download(url=video_url, **kwargs) + + +def icourses_cn_url_parser(url, **kwargs): + PLAYER_BASE_VER = '150606-1' + ENCRYPT_MOD_VER = '151020' + ENCRYPT_SALT = '3DAPmXsZ4o' # It took really long time to find this... + html = get_content(url) + if re.search(pattern=r'showSectionNode\(.*\)', string=html): + logging.warning('Switching to playlist mode!') + return icourses_playlist_download(url, **kwargs) + flashvars_patt = r'var\ flashvars\=((.|\n)*)};' + server_time_patt = r'MPlayer.swf\?v\=(\d+)' + uuid_patt = r'uuid:(\d+)' + other_args_patt = r'other:"(.*)"' + res_url_patt = r'IService:\'([^\']+)' + title_a_patt = r'
(.*?)'
+    title_b_patt = r'
((.|\n)*?)
' + title_a = match1(html, title_a_patt).strip() + title_b = match1(html, title_b_patt).strip() + title = title_a + title_b # WIP, FIXME + title = re.sub('( +|\n|\t|\r|\ \;)', '', + unescape_html(title).replace(' ', '')) + server_time = match1(html, server_time_patt) + flashvars = match1(html, flashvars_patt) + uuid = match1(flashvars, uuid_patt) + other_args = match1(flashvars, other_args_patt) + res_url = match1(flashvars, res_url_patt) + url_parts = {'v': server_time, 'other': other_args, + 'uuid': uuid, 'IService': res_url} + req_url = '%s?%s' % (res_url, parse.urlencode(url_parts)) + logging.debug('Requesting video resource location...') + xml_resp = get_html(req_url) + xml_obj = ET.fromstring(xml_resp) + logging.debug('The result was {}'.format(xml_obj.get('status'))) + if xml_obj.get('status') != 'success': + raise ValueError('Server returned error!') + common_args = {'lv': PLAYER_BASE_VER, 'ls': 'play', + 'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'), + 'start': 0} + media_host = xml_obj.find(".//*[@name='host']").text + media_url = media_host + xml_obj.find(".//*[@name='url']").text + # This is what they called `SSLModule`... But obviously, just a kind of + # encryption, takes absolutely no effect in protecting data intergrity + if xml_obj.find(".//*[@name='ssl']").text != 'true': + logging.debug('The encryption mode is disabled') + # when the so-called `SSLMode` is not activated, the parameters, `h` + # and `p` can be found in response + arg_h = xml_obj.find(".//*[@name='h']").text + assert arg_h + arg_r = xml_obj.find(".//*[@name='p']").text or ENCRYPT_MOD_VER + url_args = common_args.copy() + url_args.update({'h': arg_h, 'r': arg_r}) + final_url = '{}?{}'.format( + media_url, parse.urlencode(url_args)) + return title, final_url + # when the `SSLMode` is activated, we need to receive the timestamp and the + # time offset (?) value from the server + logging.debug('The encryption mode is in effect') + ssl_callback = get_html('{}/ssl/ssl.shtml'.format(media_host)).split(',') + ssl_timestamp = int(datetime.datetime.strptime( + ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0])) + sign_this = ENCRYPT_SALT + \ + parse.urlparse(media_url).path + str(ssl_timestamp) + arg_h = base64.b64encode(hashlib.md5(bytes(sign_this, 'utf-8')).digest()) + # Post-processing, may subject to change, so leaving this alone... 
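+    # (Concretely: strip the '=' padding, then map '+' to '-' and '/' to '_',
+    #  converting standard base64 output into the URL-safe alphabet.)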
+ arg_h = arg_h.decode('utf-8').strip('=').replace('+', + '-').replace('/', '_') + arg_r = ssl_timestamp + url_args = common_args.copy() + url_args.update({'h': arg_h, 'r': arg_r, 'p': ENCRYPT_MOD_VER}) + final_url = '{}?{}'.format( + media_url, parse.urlencode(url_args)) + logging.debug('Concat`ed URL: {}'.format(final_url)) + return title, final_url + + +site_info = 'icourses.cn' +download = icourses_download +download_playlist = icourses_playlist_download From 5351121186c2c8c94bc7b24419ea5ca305582462 Mon Sep 17 00:00:00 2001 From: liushuyu Date: Tue, 25 Oct 2016 12:52:30 -0600 Subject: [PATCH 11/29] icouses: Code clean up --- src/you_get/extractors/icourses.py | 197 +++++++++++++++-------------- 1 file changed, 105 insertions(+), 92 deletions(-) diff --git a/src/you_get/extractors/icourses.py b/src/you_get/extractors/icourses.py index 5f9b8edf..5c2f8cda 100644 --- a/src/you_get/extractors/icourses.py +++ b/src/you_get/extractors/icourses.py @@ -13,8 +13,9 @@ __all__ = ['icourses_download'] def icourses_download(url, info_only, merge=False, output_dir='.', **kwargs): - title, real_url = icourses_cn_url_parser( - url, info_only=info_only, **kwargs) + icourses_parser = ICousesExactor(url=url) + real_url = icourses_parser.icourses_cn_url_parser(**kwargs) + title = icourses_parser.title if real_url is not None: for tries in range(0, 3): try: @@ -22,108 +23,120 @@ def icourses_download(url, info_only, merge=False, output_dir='.', **kwargs): break except error.HTTPError: logging.warning('Failed to fetch the video file! Retrying...') - title, real_url = icourses_cn_url_parser(url) + real_url = icourses_parser.icourses_cn_url_parser() + title = icourses_parser.title print_info(site_info, title, type_, size) if not info_only: download_urls([real_url], title, 'flv', total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True) -def icourses_playlist_download(url, **kwargs): - import random - from time import sleep - html = get_content(url) - page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' - video_js_number = r'changeforvideo\((.*?)\)' - fs_flag = r'' - page_navi_vars = re.search(pattern=page_type_patt, string=html) - dummy_page = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'.format( - page_navi_vars.group(2), page_navi_vars.group(1)) - html = get_content(dummy_page) - fs_status = match1(html, fs_flag) - video_list = re.findall(pattern=video_js_number, string=html) - for video in video_list: - video_args = video.replace('\'', '').split(',') - video_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}&firstShowFlag={}'.format( - video_args[0], video_args[1], fs_status or '1') - sleep(random.Random().randint(0, 5)) # Prevent from blockage - icourses_download(url=video_url, **kwargs) +# Why not using VideoExtractor: This site needs specical download method +class ICousesExactor(object): + def __init__(self, url): + self.url = url + self.title = '' + return -def icourses_cn_url_parser(url, **kwargs): - PLAYER_BASE_VER = '150606-1' - ENCRYPT_MOD_VER = '151020' - ENCRYPT_SALT = '3DAPmXsZ4o' # It took really long time to find this... 
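    # (How the salt enters the signature, per the code further down: the 'h'
    #  parameter is the URL-safe base64 of md5(ENCRYPT_SALT + media URL path
    #  + timestamp).)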
- html = get_content(url) - if re.search(pattern=r'showSectionNode\(.*\)', string=html): - logging.warning('Switching to playlist mode!') - return icourses_playlist_download(url, **kwargs) - flashvars_patt = r'var\ flashvars\=((.|\n)*)};' - server_time_patt = r'MPlayer.swf\?v\=(\d+)' - uuid_patt = r'uuid:(\d+)' - other_args_patt = r'other:"(.*)"' - res_url_patt = r'IService:\'([^\']+)' - title_a_patt = r'
(.*?)' - title_b_patt = r'
((.|\n)*?)
' - title_a = match1(html, title_a_patt).strip() - title_b = match1(html, title_b_patt).strip() - title = title_a + title_b # WIP, FIXME - title = re.sub('( +|\n|\t|\r|\ \;)', '', - unescape_html(title).replace(' ', '')) - server_time = match1(html, server_time_patt) - flashvars = match1(html, flashvars_patt) - uuid = match1(flashvars, uuid_patt) - other_args = match1(flashvars, other_args_patt) - res_url = match1(flashvars, res_url_patt) - url_parts = {'v': server_time, 'other': other_args, - 'uuid': uuid, 'IService': res_url} - req_url = '%s?%s' % (res_url, parse.urlencode(url_parts)) - logging.debug('Requesting video resource location...') - xml_resp = get_html(req_url) - xml_obj = ET.fromstring(xml_resp) - logging.debug('The result was {}'.format(xml_obj.get('status'))) - if xml_obj.get('status') != 'success': - raise ValueError('Server returned error!') - common_args = {'lv': PLAYER_BASE_VER, 'ls': 'play', - 'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'), - 'start': 0} - media_host = xml_obj.find(".//*[@name='host']").text - media_url = media_host + xml_obj.find(".//*[@name='url']").text - # This is what they called `SSLModule`... But obviously, just a kind of - # encryption, takes absolutely no effect in protecting data intergrity - if xml_obj.find(".//*[@name='ssl']").text != 'true': - logging.debug('The encryption mode is disabled') - # when the so-called `SSLMode` is not activated, the parameters, `h` - # and `p` can be found in response - arg_h = xml_obj.find(".//*[@name='h']").text - assert arg_h - arg_r = xml_obj.find(".//*[@name='p']").text or ENCRYPT_MOD_VER + def icourses_playlist_download(self, **kwargs): + import random + from time import sleep + html = get_content(url) + page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' + video_js_number = r'changeforvideo\((.*?)\)' + fs_flag = r'' + page_navi_vars = re.search(pattern=page_type_patt, string=html) + dummy_page = 'http://www.icourses.cn/jpk/viewCharacterDetail.action?sectionId={}&courseId={}'.format( + page_navi_vars.group(2), page_navi_vars.group(1)) + html = get_content(dummy_page) + fs_status = match1(html, fs_flag) + video_list = re.findall(pattern=video_js_number, string=html) + for video in video_list: + video_args = video.replace('\'', '').split(',') + video_url = 'http://www.icourses.cn/jpk/changeforVideo.action?resId={}&courseId={}&firstShowFlag={}'.format( + video_args[0], video_args[1], fs_status or '1') + sleep(random.Random().randint(0, 5)) # Prevent from blockage + icourses_download(video_url, **kwargs) + + def icourses_cn_url_parser(self, **kwargs): + PLAYER_BASE_VER = '150606-1' + ENCRYPT_MOD_VER = '151020' + ENCRYPT_SALT = '3DAPmXsZ4o' # It took really long time to find this... + html = get_content(self.url) + if re.search(pattern=r'showSectionNode\(.*\)', string=html): + logging.warning('Switching to playlist mode!') + return self.icourses_playlist_download(**kwargs) + flashvars_patt = r'var\ flashvars\=((.|\n)*)};' + server_time_patt = r'MPlayer.swf\?v\=(\d+)' + uuid_patt = r'uuid:(\d+)' + other_args_patt = r'other:"(.*)"' + res_url_patt = r'IService:\'([^\']+)' + title_a_patt = r'
(.*?)' + title_b_patt = r'
((.|\n)*?)
' + title_a = match1(html, title_a_patt).strip() + title_b = match1(html, title_b_patt).strip() + title = title_a + title_b # WIP, FIXME + title = re.sub('( +|\n|\t|\r|\ \;)', '', + unescape_html(title).replace(' ', '')) + server_time = match1(html, server_time_patt) + flashvars = match1(html, flashvars_patt) + uuid = match1(flashvars, uuid_patt) + other_args = match1(flashvars, other_args_patt) + res_url = match1(flashvars, res_url_patt) + url_parts = {'v': server_time, 'other': other_args, + 'uuid': uuid, 'IService': res_url} + req_url = '%s?%s' % (res_url, parse.urlencode(url_parts)) + logging.debug('Requesting video resource location...') + xml_resp = get_html(req_url) + xml_obj = ET.fromstring(xml_resp) + logging.debug('The result was {}'.format(xml_obj.get('status'))) + if xml_obj.get('status') != 'success': + raise ValueError('Server returned error!') + common_args = {'lv': PLAYER_BASE_VER, 'ls': 'play', + 'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'), + 'start': 0} + media_host = xml_obj.find(".//*[@name='host']").text + media_url = media_host + xml_obj.find(".//*[@name='url']").text + # This is what they called `SSLModule`... But obviously, just a kind of + # encryption, takes absolutely no effect in protecting data intergrity + if xml_obj.find(".//*[@name='ssl']").text != 'true': + logging.debug('The encryption mode is disabled') + # when the so-called `SSLMode` is not activated, the parameters, `h` + # and `p` can be found in response + arg_h = xml_obj.find(".//*[@name='h']").text + assert arg_h + arg_r = xml_obj.find(".//*[@name='p']").text or ENCRYPT_MOD_VER + url_args = common_args.copy() + url_args.update({'h': arg_h, 'r': arg_r}) + final_url = '{}?{}'.format( + media_url, parse.urlencode(url_args)) + self.title = title + return final_url + # when the `SSLMode` is activated, we need to receive the timestamp and the + # time offset (?) value from the server + logging.debug('The encryption mode is in effect') + ssl_callback = get_html( + '{}/ssl/ssl.shtml'.format(media_host)).split(',') + ssl_timestamp = int(datetime.datetime.strptime( + ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0])) + sign_this = ENCRYPT_SALT + \ + parse.urlparse(media_url).path + str(ssl_timestamp) + arg_h = base64.b64encode(hashlib.md5( + bytes(sign_this, 'utf-8')).digest()) + # Post-processing, may subject to change, so leaving this alone... + arg_h = arg_h.decode('utf-8').strip('=').replace('+', + '-').replace('/', '_') + arg_r = ssl_timestamp url_args = common_args.copy() - url_args.update({'h': arg_h, 'r': arg_r}) + url_args.update({'h': arg_h, 'r': arg_r, 'p': ENCRYPT_MOD_VER}) final_url = '{}?{}'.format( media_url, parse.urlencode(url_args)) - return title, final_url - # when the `SSLMode` is activated, we need to receive the timestamp and the - # time offset (?) value from the server - logging.debug('The encryption mode is in effect') - ssl_callback = get_html('{}/ssl/ssl.shtml'.format(media_host)).split(',') - ssl_timestamp = int(datetime.datetime.strptime( - ssl_callback[1], "%b %d %H:%M:%S %Y").timestamp() + int(ssl_callback[0])) - sign_this = ENCRYPT_SALT + \ - parse.urlparse(media_url).path + str(ssl_timestamp) - arg_h = base64.b64encode(hashlib.md5(bytes(sign_this, 'utf-8')).digest()) - # Post-processing, may subject to change, so leaving this alone... 
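+            # (An equivalent, shorter form, noted in case this is revisited:
+            #  base64.urlsafe_b64encode(hashlib.md5(sign_this.encode('utf-8'))
+            #      .digest()).decode('utf-8').rstrip('=')
+            #  -- urlsafe_b64encode performs the '+'/'-' and '/'/'_' swaps itself.)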
- arg_h = arg_h.decode('utf-8').strip('=').replace('+', - '-').replace('/', '_') - arg_r = ssl_timestamp - url_args = common_args.copy() - url_args.update({'h': arg_h, 'r': arg_r, 'p': ENCRYPT_MOD_VER}) - final_url = '{}?{}'.format( - media_url, parse.urlencode(url_args)) - logging.debug('Concat`ed URL: {}'.format(final_url)) - return title, final_url + logging.debug('Crafted URL: {}'.format(final_url)) + self.title = title + return final_url site_info = 'icourses.cn' download = icourses_download -download_playlist = icourses_playlist_download +# download_playlist = icourses_playlist_download From ae4e533ec9d28fb1598fb91dfa87ce16cb06bc92 Mon Sep 17 00:00:00 2001 From: liushuyu Date: Tue, 25 Oct 2016 14:03:21 -0600 Subject: [PATCH 12/29] common: add dynamic url support for `url_save_chunked` --- src/you_get/common.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 948b0ca2..0f7fd0e3 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -547,7 +547,11 @@ def url_save(url, filepath, bar, refer = None, is_part = False, faker = False, h os.remove(filepath) # on Windows rename could fail if destination filepath exists os.rename(temp_filepath, filepath) -def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = False, headers = {}): +def url_save_chunked(url, filepath, bar, dyn_callback=None, chunk_size=0, ignore_range=False, refer=None, is_part=False, faker=False, headers={}): + def dyn_update_url(received): + if callable(dyn_callback): + logging.debug('Calling callback %s for new URL from %s' % (dyn_callback.__name__, received)) + return dyn_callback(received) if os.path.exists(filepath): if not force: if not is_part: @@ -585,19 +589,26 @@ def url_save_chunked(url, filepath, bar, refer = None, is_part = False, faker = else: headers = {} if received: - headers['Range'] = 'bytes=' + str(received) + '-' + url = dyn_update_url(received) + if not ignore_range: + headers['Range'] = 'bytes=' + str(received) + '-' if refer: headers['Referer'] = refer - response = request.urlopen(request.Request(url, headers = headers), None) + response = request.urlopen(request.Request(url, headers=headers), None) with open(temp_filepath, open_mode) as output: + this_chunk = received while True: buffer = response.read(1024 * 256) if not buffer: break output.write(buffer) received += len(buffer) + if chunk_size and (received - this_chunk) >= chunk_size: + url = dyn_callback(received) + this_chunk = received + response = request.urlopen(request.Request(url, headers=headers), None) if bar: bar.update_received(len(buffer)) @@ -846,7 +857,7 @@ def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merg print() -def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}): +def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False, headers = {}, **kwargs): assert urls if dry_run: print('Real URLs:\n%s\n' % urls) @@ -860,7 +871,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', refer=No filename = '%s.%s' % (title, ext) filepath = os.path.join(output_dir, filename) - if total_size and ext in ('ts'): + if total_size: if not force and os.path.exists(filepath[:-3] + '.mkv'): print('Skipping %s: file already exists' % filepath[:-3] + '.mkv') print() @@ -875,7 +886,7 @@ def download_urls_chunked(urls, title, ext, total_size, output_dir='.', 
refer=No print('Downloading %s ...' % tr(filename)) filepath = os.path.join(output_dir, filename) parts.append(filepath) - url_save_chunked(url, filepath, bar, refer = refer, faker = faker, headers = headers) + url_save_chunked(url, filepath, bar, refer = refer, faker = faker, headers = headers, **kwargs) bar.done() if not merge: From 2183448c9098c1abd0e9cf47fa305e3775e1e098 Mon Sep 17 00:00:00 2001 From: liushuyu Date: Tue, 25 Oct 2016 14:15:23 -0600 Subject: [PATCH 13/29] icourses: implement fake `keep connection alive` --- src/you_get/extractors/icourses.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/you_get/extractors/icourses.py b/src/you_get/extractors/icourses.py index 5c2f8cda..cb2ff74a 100644 --- a/src/you_get/extractors/icourses.py +++ b/src/you_get/extractors/icourses.py @@ -1,6 +1,8 @@ #!/usr/bin/env python from ..common import * from urllib import parse +import random +from time import sleep import xml.etree.ElementTree as ET import datetime import hashlib @@ -12,23 +14,24 @@ import re __all__ = ['icourses_download'] -def icourses_download(url, info_only, merge=False, output_dir='.', **kwargs): +def icourses_download(url, merge=False, output_dir='.', **kwargs): icourses_parser = ICousesExactor(url=url) real_url = icourses_parser.icourses_cn_url_parser(**kwargs) title = icourses_parser.title if real_url is not None: - for tries in range(0, 3): + for tries in range(0, 5): try: _, type_, size = url_info(real_url, faker=True) break except error.HTTPError: logging.warning('Failed to fetch the video file! Retrying...') + sleep(random.Random().randint(0, 5)) # Prevent from blockage real_url = icourses_parser.icourses_cn_url_parser() title = icourses_parser.title print_info(site_info, title, type_, size) - if not info_only: - download_urls([real_url], title, 'flv', - total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True) + if not kwargs['info_only']: + download_urls_chunked([real_url], title, 'flv', + total_size=size, output_dir=output_dir, refer=url, merge=merge, faker=True, ignore_range=True, chunk_size=15000000, dyn_callback=icourses_parser.icourses_cn_url_parser) # Why not using VideoExtractor: This site needs specical download method @@ -40,9 +43,7 @@ class ICousesExactor(object): return def icourses_playlist_download(self, **kwargs): - import random - from time import sleep - html = get_content(url) + html = get_content(self.url) page_type_patt = r'showSectionNode\(this,(\d+),(\d+)\)' video_js_number = r'changeforvideo\((.*?)\)' fs_flag = r'' @@ -59,7 +60,7 @@ class ICousesExactor(object): sleep(random.Random().randint(0, 5)) # Prevent from blockage icourses_download(video_url, **kwargs) - def icourses_cn_url_parser(self, **kwargs): + def icourses_cn_url_parser(self, received=0, **kwargs): PLAYER_BASE_VER = '150606-1' ENCRYPT_MOD_VER = '151020' ENCRYPT_SALT = '3DAPmXsZ4o' # It took really long time to find this... 
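(How this meshes with PATCH 12's url_save_chunked, whose dyn_callback hook
hands back the running byte offset after every chunk_size bytes -- here
15000000, per the call above -- and expects a freshly signed URL in return.
The hunk below turns that offset into a 'seek' request. A sketch, with a
made-up page_url:)

    parser = ICousesExactor(url=page_url)
    first = parser.icourses_cn_url_parser()                     # 'play', start=0
    resumed = parser.icourses_cn_url_parser(received=15000000)  # 'seek' resume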
@@ -93,9 +94,14 @@ class ICousesExactor(object): logging.debug('The result was {}'.format(xml_obj.get('status'))) if xml_obj.get('status') != 'success': raise ValueError('Server returned error!') - common_args = {'lv': PLAYER_BASE_VER, 'ls': 'play', + if received: + play_type = 'seek' + else: + play_type = 'play' + received -= 1 + common_args = {'lv': PLAYER_BASE_VER, 'ls': play_type, 'lt': datetime.datetime.now().strftime('%m-%d/%H:%M:%S'), - 'start': 0} + 'start': received + 1} media_host = xml_obj.find(".//*[@name='host']").text media_url = media_host + xml_obj.find(".//*[@name='url']").text # This is what they called `SSLModule`... But obviously, just a kind of From ac33461c88344d86f74b69572f2f27d03fd708b5 Mon Sep 17 00:00:00 2001 From: Cheng Gu Date: Thu, 27 Oct 2016 17:44:02 +0800 Subject: [PATCH 14/29] fix(huomao): adapt to new url format --- src/you_get/extractors/huomaotv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/huomaotv.py b/src/you_get/extractors/huomaotv.py index 4852ff06..6e98c800 100644 --- a/src/you_get/extractors/huomaotv.py +++ b/src/you_get/extractors/huomaotv.py @@ -6,7 +6,7 @@ from ..common import * def get_mobile_room_url(room_id): - return 'http://www.huomao.com/mobile/mob_live?cid=%s' % room_id + return 'http://www.huomao.com/mobile/mob_live/%s' % room_id def get_m3u8_url(stream_id): From 0f3fe97e9caedf976286193aff5dddf430d80962 Mon Sep 17 00:00:00 2001 From: Cheng Gu Date: Thu, 27 Oct 2016 17:44:54 +0800 Subject: [PATCH 15/29] update: add huomao.com --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b994ebd1..182fc12a 100644 --- a/README.md +++ b/README.md @@ -407,6 +407,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 花瓣 | | |✓| | | Naver
네이버 | |✓| | | | 芒果TV | |✓| | | +| 火猫TV | |✓| | | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. From feffcb656ad2c33b17fb2e20598f8137fc69789c Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 30 Oct 2016 00:24:31 +0200 Subject: [PATCH 16/29] [processor.ffmpeg] fix params in ffmpeg_download_stream --- src/you_get/processor/ffmpeg.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py index 1c0ba1a3..c6da97f7 100644 --- a/src/you_get/processor/ffmpeg.py +++ b/src/you_get/processor/ffmpeg.py @@ -212,15 +212,6 @@ def ffmpeg_download_stream(files, title, ext, params={}, output_dir='.'): if not (output_dir == '.'): output = output_dir + '/' + output - ffmpeg_params = [] - #should these exist... - if params is not None: - if len(params) > 0: - for k, v in params: - ffmpeg_params.append(k) - ffmpeg_params.append(v) - - print('Downloading streaming content with FFmpeg, press q to stop recording...') ffmpeg_params = [FFMPEG] + ['-y', '-re', '-i'] ffmpeg_params.append(files) #not the same here!!!! @@ -230,6 +221,12 @@ def ffmpeg_download_stream(files, title, ext, params={}, output_dir='.'): else: ffmpeg_params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc'] + if params is not None: + if len(params) > 0: + for k, v in params: + ffmpeg_params.append(k) + ffmpeg_params.append(v) + ffmpeg_params.append(output) print(' '.join(ffmpeg_params)) From 4b55884e86df68c56ae9fce85293f9b757e97576 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 30 Oct 2016 00:26:25 +0200 Subject: [PATCH 17/29] [dailymotion] use ffmpeg_download_stream, fix #1466 --- src/you_get/extractors/dailymotion.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/dailymotion.py b/src/you_get/extractors/dailymotion.py index 8b701cd1..2e96c160 100644 --- a/src/you_get/extractors/dailymotion.py +++ b/src/you_get/extractors/dailymotion.py @@ -4,6 +4,11 @@ __all__ = ['dailymotion_download'] from ..common import * +def extract_m3u(url): + content = get_content(url) + m3u_url = re.findall(r'http://.*', content)[0] + return match1(m3u_url, r'([^#]+)') + def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): """Downloads Dailymotion videos by URL. 
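
    (Hypothetical call; the video id is a placeholder:)

        dailymotion_download('http://www.dailymotion.com/video/x0xxxxx',
                             output_dir='.', info_only=True)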
""" @@ -13,7 +18,7 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, title = match1(html, r'"video_title"\s*:\s*"([^"]+)"') or \ match1(html, r'"title"\s*:\s*"([^"]+)"') - for quality in ['720','480','380','240','auto']: + for quality in ['1080','720','480','380','240','auto']: try: real_url = info[quality][0]["url"] if real_url: @@ -21,11 +26,12 @@ def dailymotion_download(url, output_dir = '.', merge = True, info_only = False, except KeyError: pass - type, ext, size = url_info(real_url) + m3u_url = extract_m3u(real_url) + mime, ext, size = 'video/mp4', 'mp4', 0 - print_info(site_info, title, type, size) + print_info(site_info, title, mime, size) if not info_only: - download_urls([real_url], title, ext, size, output_dir, merge = merge) + download_url_ffmpeg(m3u_url, title, ext, output_dir=output_dir, merge=merge) site_info = "Dailymotion.com" download = dailymotion_download From a4f4fb362616862cc283b05122e74be346f1a309 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Sun, 30 Oct 2016 16:16:04 +0100 Subject: [PATCH 18/29] Revert "fix for #1405" (fix #1485) This reverts commit 38ba0dbe48ecac4b7a354e4cf5766cf9415fb3c9. --- src/you_get/extractors/youku.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index 1fb09e8c..853a75ba 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -314,9 +314,6 @@ class Youku(VideoExtractor): q = q ) ksegs += [i['server'] for i in json.loads(get_content(u))] - - if (parse_host(ksegs[len(ksegs)-1])[0] == "vali.cp31.ott.cibntv.net"): - ksegs.pop(len(ksegs)-1) except error.HTTPError as e: # Use fallback stream data in case of HTTP 404 log.e('[Error] ' + str(e)) From e8514d1370bc748946940c7c2f757db5c9cf42c8 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 3 Nov 2016 01:44:04 +0100 Subject: [PATCH 19/29] version 0.4.575 --- src/you_get/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/version.py b/src/you_get/version.py index 6d91656c..6d4f6c4f 100644 --- a/src/you_get/version.py +++ b/src/you_get/version.py @@ -1,4 +1,4 @@ #!/usr/bin/env python script_name = 'you-get' -__version__ = '0.4.555' +__version__ = '0.4.575' From 391ca5643a355c310db786e467c6929fd5dde53f Mon Sep 17 00:00:00 2001 From: Zhiming Wang Date: Wed, 2 Nov 2016 20:44:40 -0400 Subject: [PATCH 20/29] [embed] correct tudou pattern Hyphen-minus (-) is a valid character in Tudou's video ID. It's even present in the second pattern of tudou_embed_patterns, just not the first. 
--- src/you_get/extractors/embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/you_get/extractors/embed.py b/src/you_get/extractors/embed.py index a177e663..fc4015c4 100644 --- a/src/you_get/extractors/embed.py +++ b/src/you_get/extractors/embed.py @@ -25,7 +25,7 @@ youku_embed_patterns = [ 'youku\.com/v_show/id_([a-zA-Z0-9=]+)', """ http://www.tudou.com/programs/view/html5embed.action?type=0&code=3LS_URGvl54&lcode=&resourceId=0_06_05_99 """ -tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_]+)\&', +tudou_embed_patterns = [ 'tudou\.com[a-zA-Z0-9\/\?=\&\.\;]+code=([a-zA-Z0-9_-]+)\&', 'www\.tudou\.com/v/([a-zA-Z0-9_-]+)/[^"]*v\.swf' ] From 2b0fe3443f844690305caa0a468d1b744c72ced5 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 3 Nov 2016 17:03:01 +0100 Subject: [PATCH 21/29] [test] remove test_vimeo --- tests/test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test.py b/tests/test.py index 638206af..0fa2979a 100644 --- a/tests/test.py +++ b/tests/test.py @@ -21,9 +21,6 @@ class YouGetTests(unittest.TestCase): def test_mixcloud(self): mixcloud.download("http://www.mixcloud.com/DJVadim/north-america-are-you-ready/", info_only=True) - def test_vimeo(self): - vimeo.download("http://vimeo.com/56810854", info_only=True) - def test_youtube(self): youtube.download("http://www.youtube.com/watch?v=pzKerr0JIPA", info_only=True) youtube.download("http://youtu.be/pzKerr0JIPA", info_only=True) From bc590cbd62ca4350598551e41910c719864f0c36 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Thu, 3 Nov 2016 21:32:13 +0100 Subject: [PATCH 22/29] [douban] add support: movie.douban.com --- README.md | 4 ++-- src/you_get/extractors/douban.py | 23 +++++++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 182fc12a..40a26803 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ $ you-get https://github.com/soimort/you-get/archive/master.zip or use [chocolatey package manager](https://chocolatey.org): ``` -> choco upgrade you-get +> choco upgrade you-get ``` In order to get the latest ```develop``` branch without messing up the PIP, you can try: @@ -373,7 +373,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 爆米花网 | |✓| | | | **bilibili
<br/>哔哩哔哩** | |✓| | |
| Dilidili | |✓| | |
-| 豆瓣 | | | |✓|
+| 豆瓣 | |✓| |✓|
| 斗鱼 | |✓| | |
| Panda<br/>
熊猫 | |✓| | | | 凤凰视频 | |✓| | | diff --git a/src/you_get/extractors/douban.py b/src/you_get/extractors/douban.py index 187e99c0..1a4a67d1 100644 --- a/src/you_get/extractors/douban.py +++ b/src/you_get/extractors/douban.py @@ -7,12 +7,23 @@ from ..common import * def douban_download(url, output_dir = '.', merge = True, info_only = False, **kwargs): html = get_html(url) - if 'subject' in url: + + if re.match(r'https?://movie', url): + title = match1(html, 'name="description" content="([^"]+)') + tid = match1(url, 'trailer/(\d+)') + real_url = 'https://movie.douban.com/trailer/video_url?tid=%s' % tid + type, ext, size = url_info(real_url) + + print_info(site_info, title, type, size) + if not info_only: + download_urls([real_url], title, ext, size, output_dir, merge = merge) + + elif 'subject' in url: titles = re.findall(r'data-title="([^"]*)">', html) song_id = re.findall(r'
  • Date: Thu, 3 Nov 2016 22:03:56 +0100 Subject: [PATCH 23/29] [bilibili] fix support for bangumi --- src/you_get/extractors/bilibili.py | 108 +++++++++++++++-------------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py index c18290b8..122dea0b 100644 --- a/src/you_get/extractors/bilibili.py +++ b/src/you_get/extractors/bilibili.py @@ -119,66 +119,70 @@ def bilibili_live_download_by_cid(cid, title, output_dir='.', merge=True, info_o def bilibili_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_content(url) - if re.match(r'https?://bangumi\.bilibili\.com/', url): - # quick hack for bangumi URLs - url = r1(r'"([^"]+)" class="v-av-link"', html) - html = get_content(url) - title = r1_of([r'', r']*>\s*([^<>]+)\s*
  • '], html) if title: title = unescape_html(title) title = escape_file_path(title) - flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"', - r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) - assert flashvars - flashvars = flashvars.replace(': ', '=') - t, cid = flashvars.split('=', 1) - cid = cid.split('&')[0] - if t == 'cid': - if re.match(r'https?://live\.bilibili\.com/', url): - title = r1(r'\s*([^<>]+)\s*', html) - bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) + if re.match(r'https?://bangumi\.bilibili\.com/', url): + # quick hack for bangumi URLs + episode_id = r1(r'data-current-episode-id="(\d+)"', html) + cont = post_content('http://bangumi.bilibili.com/web_api/get_source', + post_data={'episode_id': episode_id}) + cid = json.loads(cont)['result']['cid'] + bilibili_download_by_cid(str(cid), title, output_dir=output_dir, merge=merge, info_only=info_only) - else: - # multi-P - cids = [] - pages = re.findall('', html) - for i, page in enumerate(pages): - html = get_html("http://www.bilibili.com%s" % page) - flashvars = r1_of([r'(cid=\d+)', - r'flashvars="([^"]+)"', - r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) - if flashvars: - t, cid = flashvars.split('=', 1) - cids.append(cid.split('&')[0]) - if url.endswith(page): - cids = [cid.split('&')[0]] - titles = [titles[i]] - break - - # no multi-P - if not pages: - cids = [cid] - titles = [r1(r'', html) or title] - - for i in range(len(cids)): - bilibili_download_by_cid(cids[i], - titles[i], - output_dir=output_dir, - merge=merge, - info_only=info_only) - - elif t == 'vid': - sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) - elif t == 'ykid': - youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) - elif t == 'uid': - tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) else: - raise NotImplementedError(flashvars) + flashvars = r1_of([r'(cid=\d+)', r'(cid: \d+)', r'flashvars="([^"]+)"', + r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) + assert flashvars + flashvars = flashvars.replace(': ', '=') + t, cid = flashvars.split('=', 1) + cid = cid.split('&')[0] + if t == 'cid': + if re.match(r'https?://live\.bilibili\.com/', url): + title = r1(r'\s*([^<>]+)\s*', html) + bilibili_live_download_by_cid(cid, title, output_dir=output_dir, merge=merge, info_only=info_only) + + else: + # multi-P + cids = [] + pages = re.findall('', html) + for i, page in enumerate(pages): + html = get_html("http://www.bilibili.com%s" % page) + flashvars = r1_of([r'(cid=\d+)', + r'flashvars="([^"]+)"', + r'"https://[a-z]+\.bilibili\.com/secure,(cid=\d+)(?:&aid=\d+)?"'], html) + if flashvars: + t, cid = flashvars.split('=', 1) + cids.append(cid.split('&')[0]) + if url.endswith(page): + cids = [cid.split('&')[0]] + titles = [titles[i]] + break + + # no multi-P + if not pages: + cids = [cid] + titles = [r1(r'', html) or title] + + for i in range(len(cids)): + bilibili_download_by_cid(cids[i], + titles[i], + output_dir=output_dir, + merge=merge, + info_only=info_only) + + elif t == 'vid': + sina_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + elif t == 'ykid': + youku_download_by_vid(cid, title=title, output_dir=output_dir, merge=merge, info_only=info_only) + elif t == 'uid': + tudou_download_by_id(cid, title, output_dir=output_dir, merge=merge, 
From 51dd7ad8e6b757687a4c06af7b6b3fb3dfa5f5b1 Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Wed, 9 Nov 2016 17:13:02 +0100
Subject: [PATCH 25/29] [youtube] use url_encoded_fmt_stream_map from video
 page, fix #1502

---
 src/you_get/extractors/youtube.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py
index 33e3923e..64af5c14 100644
--- a/src/you_get/extractors/youtube.py
+++ b/src/you_get/extractors/youtube.py
@@ -155,6 +155,8 @@ class YouTube(VideoExtractor):
             try:
                 ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1))
                 self.html5player = 'https:' + ytplayer_config['assets']['js']
+                # Workaround: get_video_info returns bad s. Why?
+                stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',')
             except:
                 self.html5player = None
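
Background for PATCH 25: url_encoded_fmt_stream_map is a comma-separated list in which every element is itself a URL-encoded query string (itag, url, the obfuscated signature field s, and so on, as the fields were laid out at the time). A hedged sketch of unpacking the entries after the split(',') shown in the patch:

    from urllib.parse import parse_qs

    def parse_stream_map(stream_map):
        # one dict per stream variant; parse_qs yields lists, keep the first value
        return [{k: v[0] for k, v in parse_qs(entry).items()}
                for entry in stream_map.split(',')]
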
%s" % (playlist_prefix, song['position'], song['name']) songNet = 'p' + song['mp3Url'].split('/')[2][1:] if 'hMusic' in song and song['hMusic'] != None: From 51dd7ad8e6b757687a4c06af7b6b3fb3dfa5f5b1 Mon Sep 17 00:00:00 2001 From: Mort Yao Date: Wed, 9 Nov 2016 17:13:02 +0100 Subject: [PATCH 25/29] [youtube] use url_encoded_fmt_stream_map from video page, fix #1502 --- src/you_get/extractors/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/you_get/extractors/youtube.py b/src/you_get/extractors/youtube.py index 33e3923e..64af5c14 100644 --- a/src/you_get/extractors/youtube.py +++ b/src/you_get/extractors/youtube.py @@ -155,6 +155,8 @@ class YouTube(VideoExtractor): try: ytplayer_config = json.loads(re.search('ytplayer.config\s*=\s*([^\n]+?});', video_page).group(1)) self.html5player = 'https:' + ytplayer_config['assets']['js'] + # Workaround: get_video_info returns bad s. Why? + stream_list = ytplayer_config['args']['url_encoded_fmt_stream_map'].split(',') except: self.html5player = None From 78ba20266c6b5e1fef1398af60ea8361bf57fff0 Mon Sep 17 00:00:00 2001 From: moyo Date: Sun, 13 Nov 2016 17:41:00 +0800 Subject: [PATCH 26/29] 1. Change container from FLV to TS 2. Fix video url matcher 3. Use m3u8 ext-info for fast size calculate 4. Use m3u8 url for video playing --- src/you_get/extractors/mgtv.py | 74 ++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/src/you_get/extractors/mgtv.py b/src/you_get/extractors/mgtv.py index aeb42490..3ce62efe 100644 --- a/src/you_get/extractors/mgtv.py +++ b/src/you_get/extractors/mgtv.py @@ -12,11 +12,11 @@ import re class MGTV(VideoExtractor): name = "芒果 (MGTV)" - # Last updated: 2015-11-24 + # Last updated: 2016-11-13 stream_types = [ - {'id': 'hd', 'container': 'flv', 'video_profile': '超清'}, - {'id': 'sd', 'container': 'flv', 'video_profile': '高清'}, - {'id': 'ld', 'container': 'flv', 'video_profile': '标清'}, + {'id': 'hd', 'container': 'ts', 'video_profile': '超清'}, + {'id': 'sd', 'container': 'ts', 'video_profile': '高清'}, + {'id': 'ld', 'container': 'ts', 'video_profile': '标清'}, ] id_dic = {i['video_profile']:(i['id']) for i in stream_types} @@ -27,7 +27,7 @@ class MGTV(VideoExtractor): def get_vid_from_url(url): """Extracts video ID from URL. """ - return match1(url, 'http://www.mgtv.com/v/\d/\d+/\w+/(\d+).html') + return match1(url, 'http://www.mgtv.com/b/\d+/(\d+).html') #---------------------------------------------------------------------- @staticmethod @@ -44,10 +44,15 @@ class MGTV(VideoExtractor): content = get_content(content['info']) #get the REAL M3U url, maybe to be changed later? 
From 65713cae2cf1c122be72c2d6fdaf854b35260562 Mon Sep 17 00:00:00 2001
From: L
Date: Mon, 14 Nov 2016 21:49:13 +0800
Subject: [PATCH 27/29] update yixia_download url match rule

resolved #1346
---
 src/you_get/extractors/yixia.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/you_get/extractors/yixia.py b/src/you_get/extractors/yixia.py
index ca5c4bd6..7d5ba290 100644
--- a/src/you_get/extractors/yixia.py
+++ b/src/you_get/extractors/yixia.py
@@ -51,11 +51,11 @@ def yixia_download(url, output_dir = '.', merge = True, info_only = False, **kwa
         yixia_download_by_scid = yixia_miaopai_download_by_scid
         site_info = "Yixia Miaopai"
 
-        if re.match(r'http://www.miaopai.com/show/channel/\w+', url): #PC
+        if re.match(r'http://www.miaopai.com/show/channel/.+', url): #PC
             scid = match1(url, r'http://www.miaopai.com/show/channel/(.+)\.htm')
-        elif re.match(r'http://www.miaopai.com/show/\w+', url): #PC
+        elif re.match(r'http://www.miaopai.com/show/.+', url): #PC
             scid = match1(url, r'http://www.miaopai.com/show/(.+)\.htm')
-        elif re.match(r'http://m.miaopai.com/show/channel/\w+', url): #Mobile
+        elif re.match(r'http://m.miaopai.com/show/channel/.+', url): #Mobile
             scid = match1(url, r'http://m.miaopai.com/show/channel/(.+)\.htm')
 
     elif 'xiaokaxiu.com' in hostname: #Xiaokaxiu
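
Why PATCH 27 loosens \w+ to .+: \w matches only word characters, so a scid beginning with anything else (for instance an id starting with '~', as in the hypothetical below) made the old, unanchored re.match reject a URL that the capturing match1 pattern would have handled fine. A demonstration:

    import re

    url = 'http://www.miaopai.com/show/~8CKnarGPZWxYqUsAVRsWiA__.htm'  # hypothetical scid
    print(bool(re.match(r'http://www.miaopai.com/show/\w+', url)))  # False: '~' is not \w
    print(bool(re.match(r'http://www.miaopai.com/show/.+', url)))   # True
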
From a7635e96a5e20cc4025fbcb236254e7a69c6556c Mon Sep 17 00:00:00 2001
From: Zhang Cheng
Date: Thu, 17 Nov 2016 11:18:01 +0800
Subject: [PATCH 28/29] [mgtv] add bsf:a aac_adtstoasc to ffmpeg args, fix
 #1458.

---
 src/you_get/processor/ffmpeg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/you_get/processor/ffmpeg.py b/src/you_get/processor/ffmpeg.py
index 1c0ba1a3..dcc8e1c8 100644
--- a/src/you_get/processor/ffmpeg.py
+++ b/src/you_get/processor/ffmpeg.py
@@ -125,7 +125,7 @@ def ffmpeg_concat_flv_to_mp4(files, output='output.mp4'):
 
     params = [FFMPEG] + LOGLEVEL + ['-f', 'concat', '-safe', '-1', '-y', '-i']
     params.append(output + '.txt')
-    params += ['-c', 'copy', output]
+    params += ['-c', 'copy', '-bsf:a', 'aac_adtstoasc', output]
 
     subprocess.check_call(params)
     os.remove(output + '.txt')

From 250672f42d475eba1b7a69b48683cf0d0576698a Mon Sep 17 00:00:00 2001
From: Mort Yao
Date: Sat, 19 Nov 2016 20:47:18 +0100
Subject: [PATCH 29/29] version 0.4.595

---
 src/you_get/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/you_get/version.py b/src/you_get/version.py
index 6d4f6c4f..28919906 100644
--- a/src/you_get/version.py
+++ b/src/you_get/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python
 
 script_name = 'you-get'
-__version__ = '0.4.575'
+__version__ = '0.4.595'
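
A closing note on PATCH 28: MPEG-TS segments carry AAC audio in ADTS framing, while the MP4 container expects raw AAC with an AudioSpecificConfig header, so a stream-copied concat without the aac_adtstoasc bitstream filter yields broken audio (ffmpeg's "Malformed AAC bitstream" complaint, most likely the failure behind #1458). The patched helper ends up running roughly the command below; the sketch reproduces it standalone, with list.txt and out.mp4 as placeholder names:

    import subprocess

    # ffmpeg -f concat -safe -1 -y -i list.txt -c copy -bsf:a aac_adtstoasc out.mp4
    subprocess.check_call(['ffmpeg', '-f', 'concat', '-safe', '-1', '-y',
                           '-i', 'list.txt',
                           '-c', 'copy', '-bsf:a', 'aac_adtstoasc', 'out.mp4'])
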