diff --git a/src/you_get/.idea/encodings.xml b/src/you_get/.idea/encodings.xml
new file mode 100644
index 00000000..e206d70d
--- /dev/null
+++ b/src/you_get/.idea/encodings.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/src/you_get/.idea/misc.xml b/src/you_get/.idea/misc.xml
new file mode 100644
index 00000000..8d36e051
--- /dev/null
+++ b/src/you_get/.idea/misc.xml
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
diff --git a/src/you_get/.idea/modules.xml b/src/you_get/.idea/modules.xml
new file mode 100644
index 00000000..6403ccc7
--- /dev/null
+++ b/src/you_get/.idea/modules.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
diff --git a/src/you_get/.idea/scopes/scope_settings.xml b/src/you_get/.idea/scopes/scope_settings.xml
new file mode 100644
index 00000000..922003b8
--- /dev/null
+++ b/src/you_get/.idea/scopes/scope_settings.xml
@@ -0,0 +1,5 @@
+
+
+
+
\ No newline at end of file
diff --git a/src/you_get/.idea/vcs.xml b/src/you_get/.idea/vcs.xml
new file mode 100644
index 00000000..c80f2198
--- /dev/null
+++ b/src/you_get/.idea/vcs.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/src/you_get/.idea/you_get.iml b/src/you_get/.idea/you_get.iml
new file mode 100644
index 00000000..a34a8570
--- /dev/null
+++ b/src/you_get/.idea/you_get.iml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
diff --git a/src/you_get/__init__.py b/src/you_get/__init__.py
old mode 100644
new mode 100755
diff --git a/src/you_get/__main__.py b/src/you_get/__main__.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/__init__.py b/src/you_get/cli_wrapper/__init__.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/downloader/__init__.py b/src/you_get/cli_wrapper/downloader/__init__.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/openssl/__init__.py b/src/you_get/cli_wrapper/openssl/__init__.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/player/__init__.py b/src/you_get/cli_wrapper/player/__init__.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/player/__main__.py b/src/you_get/cli_wrapper/player/__main__.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/player/dragonplayer.py b/src/you_get/cli_wrapper/player/dragonplayer.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/player/gnome_mplayer.py b/src/you_get/cli_wrapper/player/gnome_mplayer.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/player/mplayer.py b/src/you_get/cli_wrapper/player/mplayer.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/player/vlc.py b/src/you_get/cli_wrapper/player/vlc.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/player/wmp.py b/src/you_get/cli_wrapper/player/wmp.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/transcoder/__init__.py b/src/you_get/cli_wrapper/transcoder/__init__.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/transcoder/ffmpeg.py b/src/you_get/cli_wrapper/transcoder/ffmpeg.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/transcoder/libav.py b/src/you_get/cli_wrapper/transcoder/libav.py
old mode 100644
new mode 100755
diff --git a/src/you_get/cli_wrapper/transcoder/mencoder.py b/src/you_get/cli_wrapper/transcoder/mencoder.py
old mode 100644
new mode 100755
diff --git a/src/you_get/common.py b/src/you_get/common.py
old mode 100644
new mode 100755
index 00f06254..5b0fc947
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -18,6 +18,7 @@ force = False
 player = None
 extractor_proxy = None
 cookies_txt = None
+dry_infos = {}
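+# Populated by download_urls() when dry_run is set: keeps the real URLs, file extension and total size.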
 
 fake_headers = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -464,6 +465,7 @@ class PiecesProgressBar:
             print()
             self.displayed = False
 
+
 class DummyProgressBar:
     def __init__(self, *args):
         pass
@@ -477,7 +479,10 @@ class DummyProgressBar:
 def download_urls(urls, title, ext, total_size, output_dir='.', refer=None, merge=True, faker=False):
     assert urls
     if dry_run:
-        print('Real URLs:\n%s\n' % urls)
+        dry_infos.clear()
+        dry_infos.update({'urls':urls, 'ext':ext, 'total_size':total_size})
+
+        print('Real URLs dry_infos:\n%s\n' % dry_infos['urls'])
         return
 
     if player:
@@ -899,7 +904,7 @@ def script_main(script_name, download, download_playlist = None):
             sys.exit(1)
 
 def url_to_module(url):
-    from .extractors import netease, w56, acfun, baidu, baomihua, bilibili, blip, catfun, cntv, cbs, coursera, dailymotion, dongting, douban, douyutv, ehow, facebook, freesound, google, sina, ifeng, alive, instagram, iqiyi, joy, jpopsuki, khan, ku6, kugou, kuwo, letv, magisto, miomio, mixcloud, mtv81, nicovideo, pptv, qq, sohu, songtaste, soundcloud, ted, theplatform, tudou, tucao, tumblr, vid48, videobam, vimeo, vine, vk, xiami, yinyuetai, youku, youtube
+    from .extractors import netease, w56, acfun, baidu, bilibili, blip, catfun, cntv, cbs, coursera, dailymotion, dongting, douban, douyutv, ehow, facebook, freesound, google, sina, ifeng, alive, instagram, iqiyi, joy, jpopsuki, khan, ku6, kugou, kuwo, letv, magisto, miomio, mixcloud, mtv81, nicovideo, pptv, qq, sohu, songtaste, soundcloud, ted, theplatform, tudou, tucao, tumblr, vid48, videobam, vimeo, vine, vk, xiami, yinyuetai, youku, youtube
 
     video_host = r1(r'https?://([^/]+)/', url)
     video_url = r1(r'https?://[^/]+(.*)', url)
@@ -916,7 +921,6 @@ def url_to_module(url):
         '56': w56,
         'acfun': acfun,
         'baidu': baidu,
-        'baomihua': baomihua,
         'bilibili': bilibili,
         'blip': blip,
         'catfun': catfun,
@@ -984,9 +988,11 @@ def url_to_module(url):
             raise NotImplementedError(url)
         else:
             return url_to_module(location)
-
+extractor = []
 def any_download(url, **kwargs):
     m, url = url_to_module(url)
+    extractor.clear()
+    extractor.append(m)
     m.download(url, **kwargs)
 
 def any_download_playlist(url, **kwargs):
diff --git a/src/you_get/extractor.py b/src/you_get/extractor.py
old mode 100644
new mode 100755
index 14fc5b7b..04f4df5e
--- a/src/you_get/extractor.py
+++ b/src/you_get/extractor.py
@@ -2,7 +2,6 @@
 
 from .common import match1, download_urls, parse_host, set_proxy, unset_proxy
 from .util import log
-
 class Extractor():
     def __init__(self, *args):
         self.url = None
diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/acfun.py b/src/you_get/extractors/acfun.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/alive.py b/src/you_get/extractors/alive.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/bilibili.py b/src/you_get/extractors/bilibili.py
old mode 100644
new mode 100755
index 1869f955..9851537b
--- a/src/you_get/extractors/bilibili.py
+++ b/src/you_get/extractors/bilibili.py
@@ -26,6 +26,7 @@ def get_srt_xml(id):
     url = 'http://comment.bilibili.com/%s.xml' % id
     return get_html(url)
 
+
 def parse_srt_p(p):
     fields = p.split(',')
     assert len(fields) == 8, fields
@@ -49,15 +50,31 @@ def parse_srt_p(p):
 
     font_size = int(font_size)
 
-    font_color = '#%06x' % int(font_color)
+    font_color = int(font_color)
 
-    return pool, mode, font_size, font_color
+    return time, font_color, mode, font_size, 'bilibili_'+user_id, pub_time
 
 def parse_srt_xml(xml):
-    d = re.findall(r'<d p="([^"]+)">(.*)</d>', xml)
-    for x, y in d:
-        p = parse_srt_p(x)
-    raise NotImplementedError()
+    ret = []
+    d = re.findall(r'<d p="([^"]+)">(.*?)</d>', xml)
+    if len(d) > 3000:
+        d = d[:3000]
+    for parameters, text in d:
+        item = {}
+        time, font_color, mode, font_size, uuid, publishTime = parse_srt_p(parameters)
+        item['text'] = text
+        item['color'] = font_color
+        item['fontSize'] = font_size
+        item['direct'] = mode
+        item['startTime'] = time
+        item['uuid'] = uuid
+        item['publishTime'] = publishTime
+        ret.append(item)
+
+    return ret
+
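+# Danmaku helper: download the comment XML for a video id and parse it into a list of dicts.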
+def get_Danmu(id):
+    return parse_srt_xml(get_srt_xml(id))
 
 def parse_cid_playurl(xml):
     from xml.dom.minidom import parseString
diff --git a/src/you_get/extractors/blip.py b/src/you_get/extractors/blip.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/catfun.py b/src/you_get/extractors/catfun.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/cbs.py b/src/you_get/extractors/cbs.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/cntv.py b/src/you_get/extractors/cntv.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/coursera.py b/src/you_get/extractors/coursera.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/dailymotion.py b/src/you_get/extractors/dailymotion.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/dongting.py b/src/you_get/extractors/dongting.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/douban.py b/src/you_get/extractors/douban.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/douyutv.py b/src/you_get/extractors/douyutv.py
old mode 100644
new mode 100755
index 8674add0..b4d347e9
--- a/src/you_get/extractors/douyutv.py
+++ b/src/you_get/extractors/douyutv.py
@@ -9,12 +9,10 @@ import json
 def douyutv_download(url, output_dir = '.', merge = True, info_only = False):
     html = get_html(url)
     room_id_patt = r'"room_id":(\d{1,99}),'
-    title_patt = r'<div class="headline clearfix">\s*<h1>([^<]{1,9999})</h1>'
-    title_patt_backup = r'<title>([^<]{1,9999})</title>'
+    title_patt = r'<div class="headline clearfix">\s*<h1>([^<]{1,9999})</h1>\s*</div>'
 
     roomid = re.findall(room_id_patt,html)[0]
-    title = r1_of([title_patt,title_patt_backup], html)
-    title = unescape_html(title)
+    title = unescape_html(re.findall(title_patt,html)[0])
 
     conf = get_html("http://www.douyutv.com/api/client/room/"+roomid)
     metadata = json.loads(conf)
diff --git a/src/you_get/extractors/ehow.py b/src/you_get/extractors/ehow.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/facebook.py b/src/you_get/extractors/facebook.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/freesound.py b/src/you_get/extractors/freesound.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/google.py b/src/you_get/extractors/google.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/ifeng.py b/src/you_get/extractors/ifeng.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/instagram.py b/src/you_get/extractors/instagram.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/iqiyi.py b/src/you_get/extractors/iqiyi.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/joy.py b/src/you_get/extractors/joy.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/jpopsuki.py b/src/you_get/extractors/jpopsuki.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/ku6.py b/src/you_get/extractors/ku6.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/kugou.py b/src/you_get/extractors/kugou.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/kuwo.py b/src/you_get/extractors/kuwo.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/letv.py b/src/you_get/extractors/letv.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/magisto.py b/src/you_get/extractors/magisto.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/miomio.py b/src/you_get/extractors/miomio.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/mixcloud.py b/src/you_get/extractors/mixcloud.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/mtv81.py b/src/you_get/extractors/mtv81.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/netease.py b/src/you_get/extractors/netease.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/nicovideo.py b/src/you_get/extractors/nicovideo.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/pptv.py b/src/you_get/extractors/pptv.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/qq.py b/src/you_get/extractors/qq.py
old mode 100644
new mode 100755
index 5a7f8472..573c844f
--- a/src/you_get/extractors/qq.py
+++ b/src/you_get/extractors/qq.py
@@ -2,8 +2,8 @@
 __all__ = ['qq_download']
 
-from ..common import *
-import uuid
+from you_get.common import *
+import uuid, urllib
 
 #QQMUSIC
 #SINGLE
 #1. http://y.qq.com/#type=song&mid=000A9lMb0iEqwN
@@ -17,43 +17,49 @@ import uuid
 #can download as video through qq_download_by_id
 #1. http://y.qq.com/y/static/mv/mv_play.html?vid=i0014ufczcw
-def qq_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False):
-    xml = get_html('http://www.acfun.tv/getinfo?vids=%s' % id)
-    from xml.dom.minidom import parseString
-    doc = parseString(xml)
-    doc_root = doc.getElementsByTagName('root')[0]
-    doc_vl = doc_root.getElementsByTagName('vl')[0]
-    doc_vi = doc_vl.getElementsByTagName('vi')[0]
-    fn = doc_vi.getElementsByTagName('fn')[0].firstChild.data
-    # fclip = doc_vi.getElementsByTagName('fclip')[0].firstChild.data
-    # fc=doc_vi.getElementsByTagName('fc')[0].firstChild.data
-    fvkey = doc_vi.getElementsByTagName('fvkey')[0].firstChild.data
-    doc_ul = doc_vi.getElementsByTagName('ul')
+def qq_download_by_id(id, title=None, output_dir='.', merge=True, info_only=False, urls_only=False):
+    data = {'vids': id, 'otype': 'json'}
+    url = urllib.request.Request('http://vv.video.qq.com/getinfo', urllib.parse.urlencode(data).encode('utf-8'))
+    f = urllib.request.urlopen(url)
+    json_str = f.read()
+    data = json.loads(json_str[13:-1].decode('utf-8'))
+    format_id = 10202
+    file_id = 1
+    for format_info in data['fl']['fi']:
+        if format_info['sl'] > 0:
+            format_id = format_info['id']
+            file_id = format_info['sl']
+            break
+    file_name = data['vl']['vi'][0]['fn']
+    split_pos = file_name.rfind('.')
+    file_name = file_name[:split_pos] + '.%d' % file_id + file_name[split_pos:]
+    video_urls = [ui['url'] for ui in data['vl']['vi'][0]['ul']['ui']]
 
-    url = doc_ul[0].getElementsByTagName('url')[1].firstChild.data
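+    # Exchange the chosen format id and renamed file for a playback key (vkey) via the getkey endpoint.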
+    data = {'format': format_id, 'otype': 'json', 'vid': id, 'filename': file_name}
+    url = urllib.request.Request('http://vv.video.qq.com/getkey', urllib.parse.urlencode(data).encode('utf-8'))
+    f = urllib.request.urlopen(url)
+    json_str = f.read()
+    data = json.loads(json_str[13:-1].decode('utf-8'))
+    video_key = data['key']
 
-    # print(i.firstChild.data)
-    urls=[]
-    ext=fn[-3:]
-    size=0
-    for i in doc.getElementsByTagName("cs"):
-        size+=int(i.firstChild.data)
+    urls = []
+    size = 0
+    ext = ''
+    for url in video_urls:
+        try:
+            url = "%s%s?vkey=%s" % (url, file_name, video_key)
+            _, ext, size = url_info(url)
+            urls = [url]
+            break
+        except:
+            print(url)
 
-    # size=sum(map(int,doc.getElementsByTagName("cs")))
-    locid=str(uuid.uuid4())
-    for i in doc.getElementsByTagName("ci"):
-        urls.append(url+fn[:-4] + "." + i.getElementsByTagName("idx")[0].firstChild.data + fn[-4:] + '?vkey=' + fvkey+ '&sdtfrom=v1000&type='+ fn[-3:0] +'&locid=' + locid + "&&level=1&platform=11&br=133&fmt=hd&sp=0")
+    if urls_only:
+        return urls, size, ext, {}
 
-    # if int(fclip) > 0:
-    #     fn = fn[:-4] + "." + fclip + fn[-4:]
-    #     url = url + fn + '?vkey=' + fvkey
-
-    # _, ext, size = url_info(url)
-
     print_info(site_info, title, ext, size)
     if not info_only:
-        download_urls(urls, title, ext, size, output_dir=output_dir, merge=merge)
+        download_urls([url], title, 'flv', size, output_dir = output_dir, merge = merge)
 
 def qq_download(url, output_dir = '.', merge = True, info_only = False):
     if re.match(r'http://v.qq.com/([^\?]+)\?vid', url):
@@ -97,3 +103,9 @@
 site_info = "QQ.com"
 download = qq_download
 download_playlist = playlist_not_supported('qq')
+
+
+if __name__ == '__main__':
+    #print(qq_download('http://v.qq.com/cover/c/crfns95chw1snp2/t0012q2nz5m.html', urls_only = True))
+    # print(get_videoId('http://v.qq.com/cover/k/kuegopa6s70qeu1.html?vid=t0013jyqbo7'))
+    print(qq_download_by_id('u001428c4av', urls_only=True))
\ No newline at end of file
diff --git a/src/you_get/extractors/sina.py b/src/you_get/extractors/sina.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/sohu.py b/src/you_get/extractors/sohu.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/songtaste.py b/src/you_get/extractors/songtaste.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/soundcloud.py b/src/you_get/extractors/soundcloud.py
old mode 100644
new mode 100755
diff --git a/src/you_get/extractors/ted.py b/src/you_get/extractors/ted.py
old mode 100644
new mode 100755
index 0c2d2c83..eb99843d
--- a/src/you_get/extractors/ted.py
+++ b/src/you_get/extractors/ted.py
@@ -3,7 +3,43 @@
 __all__ = ['ted_download']
 
 from ..common import *
-import json
+import json, os, inspect, logging, time
+from pprint import pprint
+#sys.path += [os.path.dirname(os.path.dirname(os.path.dirname(__file__)))]
+currentDir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentDir = os.path.dirname(os.path.dirname(currentDir))
+se_parentDir = os.path.dirname(parentDir)
+sys.path.append(parentDir)
+sys.path.append(se_parentDir)
+#print currentDir
+#print parentDir
+#print se_parentDir
+# print sys.path
+
+from you_get.common import *
+
+TED_D_DINFO = False
+TED_D_DSUB = False
+TED_D_DFINFO = False
+
+TED_TALKS_URL_PAT = "http://www.ted.com/talks/%s"
+FOUND = False
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+log_handle = logging.StreamHandler(sys.stdout)
+log_handle.setFormatter(logging.Formatter('%(asctime)-15s [%(levelname)s] %(message)s'))
+logger.addHandler(log_handle)
+
+fake_headers_here = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+    #'Accept-Charset': 'UTF-8,*;q=0.5',
+    'Accept-Encoding': 'gzip,deflate,sdch',
+    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
+    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
+    'Connection': 'keep-alive',
+    'Cookie':'',
+    }
 
 def ted_download(url, output_dir='.', merge=True, info_only=False):
     html = get_html(url)
@@ -19,6 +55,298 @@ def ted_download(url, output_dir='.', merge=True, info_only=False):
             download_urls([url], title, ext, size, output_dir, merge=merge)
             break
 
+# For ted_download_by_id
+re_url = re.compile('"nativeDownloads":.*"high":"(.+)\?.+},"sub')
+re_slug = re.compile('"slug":"(.*?)"')
+#re_vid = re.compile('http://.+\/(.*\.mp4)')
+re_name = re.compile('"external":.*?,"name":"(.*?)","title":')
+# Inner video ID
+re_in_id = re.compile('http://www.ted.com/talks/(.*?)')
+
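+# Fetch a TED talk page by its slug and return (urls, size, ext, headers) without downloading.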
+def ted_download_by_id(id, title, output_dir = '.', stream_type = None, merge = True, info_only = False, urls_only = False):
+    # ret: urls,size,ext,headers = callMap[videoType](videoId,"title", urls_only = True)
+
+    try:
+        url = TED_TALKS_URL_PAT % id
+        vpage = get_html(url)
+    except:
+        logger.info("###ted_download_by_id: TED id home can not be accessed")
+        return [url], 0, 'mp4', {}
+
+    logger.info("###ted_download_by_id")
+    logger.info("page url is" + url)
+
+    #print "page content is"
+    # print vpage
+
+    v_url = re.findall(re_url, vpage)[0]
+    v_title = re.findall(re_name, vpage)[0]
+    size = urls_size([v_url], True, None)
+    #size is not used
+    # size = -1
+    urls = [v_url]
+
+    logger.info("###ted_download_by_id")
+    #logger.info("name + v_url + size \n" )
+    #print "%r, %r, %r" % (v_title, v_url, size)
+    logger.info("name: " + str(v_title) + " url:" + str(v_url) + " size: " + str(size))
+
+    # print "ret is",(urls, size, 'mp4', 'fake_headers')
+
+    return urls, size, 'mp4', {}
+
+
+def ted_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False, urls_only = False):
+
+    logger.info("###ted_download")
+    logger.info("page url is " + url)
+
+    vpage = get_html(url)
+    v_title = re.findall(re_name, vpage)[0]
+    v_url = re.findall(re_url, vpage)[0]
+
+    type, ext, size = url_info(v_url)
+    print_info(site_info, v_title, type, size)
+
+    logger.info("v_title is " + str(v_title) + "type is " + str(type) + "size is " + str(size) )
+
+    if not info_only:
+        download_urls([v_url], v_title, ext, size, output_dir, merge = merge)
+
+
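+# Extract the talk slug from a ted.com/talks URL.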
+def get_videoId(url):
+    v_in_id = re.findall(re_in_id, url)[0]
+    return v_in_id
+
+
+def srt_time(tst):
+    """Format Time from TED Subtitles format to SRT time Format."""
+    secs, mins, hours = ((tst / 1000) % 60), (tst / 60000), (tst / 3600000)
+    right_srt_time = ("{0:02d}:{1:02d}:{2:02d},{3:3.0f}".
+                      format(int(hours), int(mins), int(secs),
+                             divmod(secs, 1)[1] * 1000))
+    return right_srt_time
+
+
+def srt_sec_time(tst):
+    """Format Time from TED Subtitles format to SRT time Format."""
+    secs = tst / 1000
+    return secs
+
+
+# regex expressions to search into the webpage
+re_dm_intro = re.compile('"introDuration":(\d+\.?\d+),')
+re_dm_id = re.compile('"id":(\d+),')
+re_dm_url = re.compile('"nativeDownloads":.*"high":"(.+)\?.+},"sub')
+re_dm_vid = re.compile('http://.+\/(.*\.mp4)')
+
+def ted_get_danmu(video_id):
+    """
+    Get Danmu for the unique video_id
+    """
+    logger.info("###ted_get_danmu")
+
+    url = TED_TALKS_URL_PAT % video_id
+    logger.info("page url is " + url)
+
+    try:
+        vpage = get_html(url)
+    except:
+        logger.info("###ted_get_danmu:request faild, ret null danmu list")
+        return []
+
+    ret_list = []
+
+    tt_intro = ((float(re_dm_intro.findall(vpage)[0]) + 1) * 1000)
+    tt_id = int(re_dm_id.findall(vpage)[0])
+    tt_url = re_dm_url.findall(vpage)[0]
+    tt_v_fname = re_dm_vid.findall(tt_url)[0]
+
+    #logger.info("###tt_intro is " + str(tt_intro))
+    subs = get_subs(tt_id, tt_intro, tt_v_fname)
+
+    # we only process english caption currrently
+    # 0(eng) 0(item list)
+    eng_sub = subs[0][0]
+
+    for i in eng_sub:
+        r_item = {}
+        p_item = parse_item(i)
+
+        r_item['text'] = p_item["content"]
+        r_item['color'] = p_item["font_color"]
+        r_item['fontSize'] = p_item["font_size"]
+        r_item['direct'] = p_item["mode"]
+        r_item['startTime'] = p_item["time"]
+        r_item['uuid'] = p_item["uuid"]
+        r_item['publishTime'] = p_item["pub_time"]
+
+        ret_list.append(r_item)
+        #logger.info("###parsed sub item")
+        #pprint(r_item)
+
+    logger.info("###ted_get_danmu:parsed sub item list info:" + " len: " + str(len(ret_list)))
+
+    if TED_D_DINFO:
+        logger.info("###ted_get_danmu:last two items" + " ret_list len " + str(len(ret_list) ) )
+        if len(ret_list) > 0:
+            pprint(ret_list[-1])
+        if len(ret_list) > 1:
+            pprint(ret_list[-2])
+        pass
+
+    if TED_D_DFINFO:
+        logger.info("###ted_get_danmu:full ret list" )
+        logger.info(str(ret_list))
+
+    return ret_list
+
+
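+# Build one danmaku dict from a TED caption: fixed bottom mode, white colour and font size 25.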
+def parse_item(item):
+    """
+    Return a tuple for a+ danmu element
+    """
+    s_time = float(item["start"])
+
+    # Mode is the direct opt
+    # mode 1~3: scrolling
+    # mode 4: bottom
+    # mode 5: top
+    # mode 6: reverse?
+    # mode 7: position
+    # mode 8: advanced
+    mode = 4
+    assert 1 <= mode <= 8
+
+    # pool 0: normal
+    # pool 1: srt
+    # pool 2: special?
+    #pool = int(pool)
+    pool = 0
+    assert 0 <= pool <= 2
+
+    font_size = 25
+    font_color = 16777215
+    pub_time = str(int(time.time() * 1000000 ))[-10:]
+
+    return {"time":s_time, "font_color":font_color, "mode":mode, "font_size":font_size,
+            "uuid":"s_defuuid_z9", "pub_time":pub_time, "content":item["content"]}
+
+
+def get_subs(tt_id, tt_intro, tt_video_fname):
+    """
+    Get the sutitles, currently for english
+    """
+
+    subs = ["{0}.{1}.srt".format(tt_video_fname[:-4], lang) for lang in ('eng', 'chi')]
+    ret_subs = []
+
+    for sub in subs:
+        #logger.info("###get_subs:pls input to continue s sub getting:")
+        #raw_input()
+
+        subtitle = get_single_sub(tt_id, tt_intro, sub)
+        if subtitle:
+            ret_subs.append(subtitle)
+            #logger.info("###get_subs:Subtitle '{0}' downloaded.".format(sub) )
+
+    if TED_D_DSUB:
+        # raw_input()
+        logger.info("\n")
+        for idx, sub in enumerate(subs):
+
+            with open(sub, 'w') as srt_file:
+                for item in ret_subs[idx][0]:
+                    srt_file.write(str(item))
+
+                srt_file.write("\n#############\n")
+                srt_file.write("\nSRT formated data\n")
+                srt_file.write(ret_subs[idx][1])
+
+            logger.info("###get_subs:Debug:Subtitle '{0}' downloaded.".format(sub))
+
+    return ret_subs
+
+
+def get_single_sub(tt_id, tt_intro, sub):
+    """
+    Get TED Subtitle in JSON format & convert it to SRT Subtitle.
+    """
+
+    srt_content = ''
+    srt_items = []
+    tt_url = 'http://www.ted.com/talks'
+    sub_url = '{0}/subtitles/id/{1}/lang/{2}'.format(tt_url, tt_id, sub[-7:-4])
+
+    # Get JSON sub
+    json_file = request.urlopen(sub_url).readlines()
+    logger.info("###get_single_sub: sub url is " + sub_url)
+
+    if json_file:
+        try:
+            json_object = json.loads(json_file[0].decode('utf-8'))
+            logger.info("###get_single_sub: json load orig data")
+            #logger.info(json_object)
+            if 'captions' in json_object:
+                caption_idx = 1
+                if not json_object['captions']:
+                    logger.info("Subtitle '{0}' not available.".format(sub))
+                for caption in json_object['captions']:
+                    start = tt_intro + caption['startTime']
+                    end = start + caption['duration']
+                    idx_line = '{0}'.format(caption_idx)
+                    time_line = '{0} --> {1}'.format(srt_time(start),
+                                                     srt_time(end))
+                    text_line = '{0}'.format(caption['content'].encode("utf-8"))
+
+                    # Append the srt items and content parellelly
+                    srt_items.append({"index":caption_idx, "start":srt_sec_time(start),
+                                      "duration":srt_sec_time(caption['duration']), "content":text_line})
+                    srt_content += '\n'.join([idx_line, time_line, text_line, '\n'])
+                    caption_idx += 1
+
+            elif 'status' in json_object:
+                logger.info("This is an error message returned by TED:{0}{0} - "
+                            "{1}{0}{0}Probably because the subtitle '{2}' is not "
+                            "available.{0}".format(os.linesep, json_object['status']['message'], sub))
+
+        except ValueError:
+            logger.info("Subtitle '{0}' it's a malformed json file.".format(sub))
+
+    return (srt_items, srt_content)
+
+
+def options():
+    """Defines the command line arguments and options for the script."""
+
+    desc = "Downloads the subtitles and the video (optional) for a TED Talk."
+    usage = "Beautifull TED"
+    parser = optparse.OptionParser(usage=usage, version="%prog " + __version__,
+                                   description=desc)
+
+    parser.add_option("-s", "--only_subs", action='store_true',
+                      dest="no_video",
+                      help="download only the subs, not the video ",
+                      default=False)
+    return parser
+
+
+def check_exec_posix(prog):
+    """
+    Check if the program is installed in a *NIX platform.
+ """ + return True + + +def main(): + """main section""" + pass + + +# module info +get_Danmu = ted_get_danmu + + site_info = "TED.com" download = ted_download download_playlist = playlist_not_supported('ted') diff --git a/src/you_get/extractors/theplatform.py b/src/you_get/extractors/theplatform.py old mode 100644 new mode 100755 diff --git a/src/you_get/extractors/tucao.py b/src/you_get/extractors/tucao.py old mode 100644 new mode 100755 diff --git a/src/you_get/extractors/tudou.py b/src/you_get/extractors/tudou.py old mode 100644 new mode 100755 index a9f78a6d..95cf96fd --- a/src/you_get/extractors/tudou.py +++ b/src/you_get/extractors/tudou.py @@ -7,7 +7,7 @@ from xml.dom.minidom import parseString def tudou_download_by_iid(iid, title, output_dir = '.', merge = True, info_only = False): data = json.loads(get_decoded_html('http://www.tudou.com/outplay/goto/getItemSegs.action?iid=%s' % iid)) - temp = max([data[i] for i in data if 'size' in data[i][0]], key=lambda x:x[0]["size"]) + temp = max([data[i] for i in data], key=lambda x:x[0]["size"]) vids, size = [t["k"] for t in temp], sum([t["size"] for t in temp]) urls = [[n.firstChild.nodeValue.strip() for n in diff --git a/src/you_get/extractors/tumblr.py b/src/you_get/extractors/tumblr.py old mode 100644 new mode 100755 index 079de707..8a2e2ed1 --- a/src/you_get/extractors/tumblr.py +++ b/src/you_get/extractors/tumblr.py @@ -7,23 +7,15 @@ from ..common import * import re def tumblr_download(url, output_dir = '.', merge = True, info_only = False): - html = parse.unquote(get_html(url)).replace('\/', '/') - feed = r1(r'', html) - - if feed == 'audio': - real_url = r1(r'source src=\\x22([^\\]+)\\', html) - if not real_url: - real_url = r1(r'audio_file=([^&]+)&', html) + '?plead=please-dont-download-this-or-our-lawyers-wont-let-us-host-audio' - elif feed == 'video': - iframe_url = r1(r'