you-get/src/you_get/extractors/tudou.py

106 lines
4.3 KiB
Python
Raw Normal View History

2012-08-31 19:20:38 +04:00
#!/usr/bin/env python
2012-08-20 19:54:03 +04:00
# Public names exported by a star-import of this module.
__all__ = ['tudou_download', 'tudou_download_playlist', 'tudou_download_by_id', 'tudou_download_by_iid']
2012-08-31 19:20:38 +04:00
from ..common import *
2014-09-09 01:37:12 +04:00
from xml.dom.minidom import parseString
import you_get.extractors.acfun
2012-08-20 19:54:03 +04:00
def tudou_download_by_iid(iid, title, output_dir = '.', merge = True, info_only = False):
    """Resolve a Tudou item id to its segment URLs and download them.

    The segment listing for *iid* is fetched as JSON. Among the entries whose
    first segment carries a 'size' field, the one with the largest combined
    segment size is selected. Each segment's "k" value is then resolved to
    one or more URLs through the ct.v2.tudou.com endpoint.

    Args:
        iid: Tudou item id, interpolated into the segment-listing URL.
        title: Title passed to print_info() and download_urls().
        output_dir: Directory handed to download_urls().
        merge: Forwarded to download_urls().
        info_only: When true, only print the info and skip the download.
    """
    def combined_size(segments):
        # Total of the 'size' fields over all segments of one entry.
        return sum([segment['size'] for segment in segments])

    listing_url = 'http://www.tudou.com/outplay/goto/getItemSegs.action?iid=%s' % iid
    listing = json.loads(get_decoded_html(listing_url))

    # Only entries whose first segment reports a size are candidates.
    candidates = [listing[name] for name in listing if 'size' in listing[name][0]]
    chosen = max(candidates, key=combined_size)

    vids = [segment["k"] for segment in chosen]
    size = combined_size(chosen)

    urls = []
    for vid in vids:
        document = parseString(get_html('http://ct.v2.tudou.com/f?id=%s' % vid))
        # Every <f> element holds one URL as its text content.
        urls.extend(
            node.firstChild.nodeValue.strip()
            for node in document.getElementsByTagName('f')
        )

    # The first path component of the first URL is used as the extension.
    ext = r1(r'http://[\w.]*/(\w+)/[\w.]*', urls[0])

    print_info(site_info, title, ext, size)
    if info_only:
        return
    download_urls(urls, title, ext, size, output_dir=output_dir, merge = merge)
2012-08-20 19:54:03 +04:00
2014-06-24 05:59:47 +04:00
def tudou_download_by_id(id, title, output_dir = '.', merge = True, info_only = False):
    """Download a Tudou video identified by its program-view id.

    The program page for *id* is fetched and scanned for the numeric item id
    (``iid``) and the page title (``kw``). The actual download is delegated
    to tudou_download_by_iid().

    Args:
        id: Identifier used in the ``/programs/view/<id>/`` page URL.
        title: Fallback title, used only when the page carries no ``kw``
            field. Previously this argument was always discarded.
        output_dir: Forwarded to tudou_download_by_iid().
        merge: Forwarded to tudou_download_by_iid().
        info_only: Forwarded to tudou_download_by_iid().
    """
    html = get_html('http://www.tudou.com/programs/view/%s/' % id)

    iid = r1(r'iid\s*[:=]\s*(\S+)', html)

    page_title = r1(r'kw\s*[:=]\s*[\'\"]([^\n]+?)\'\s*\n', html)
    if page_title is not None:
        # The page escapes single quotes inside the title as \' -- undo that.
        title = page_title.replace("\\'", "\'")
    elif not title:
        # Neither the page nor the caller supplied a title.
        title = ''

    tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)
2012-08-20 19:54:03 +04:00
2014-12-20 05:08:40 +03:00
def tudou_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
    """Download the video behind a Tudou URL, or only print its info.

    Dispatch order:
      1. ``acfun.tudou.com`` URLs are rewritten to ``www.acfun.tv`` and
         handed to the AcFun extractor.
      2. Embedded-player URLs (``/v/<id>/``) go to tudou_download_by_id().
      3. Otherwise the page is fetched; if it exposes a ``vcode`` / ``viden``
         value the Youku extractor is used with ``src='tudou'``.
      4. Otherwise the page's ``iid`` is used with tudou_download_by_iid(),
         or, when no ``iid`` is present, the URL is treated as a playlist.

    Args:
        url: Page URL to process.
        output_dir: Directory for downloaded files.
        merge: Whether segments should be merged (forwarded to the
            downloader that is eventually used).
        info_only: When true, only print the info and skip the download.
        **kwargs: Extra options, forwarded only to the Youku extractor.

    Returns:
        Whatever the delegated downloader returns for the embedded-player,
        Youku and playlist branches; None otherwise.
    """
    if 'acfun.tudou.com' in url:
        # AcFun pages served under a tudou.com host belong to the AcFun extractor.
        url = url.replace('acfun.tudou.com', 'www.acfun.tv')
        you_get.extractors.acfun.acfun_download(url, output_dir,
                                                merge,
                                                info_only)
        return

    # Embedded player
    embed_id = r1(r'http://www.tudou.com/v/([^/]+)/', url)
    if embed_id:
        # output_dir and merge must be forwarded; they used to be dropped here,
        # so embedded-player URLs always used the defaults.
        return tudou_download_by_id(embed_id, title="", output_dir=output_dir,
                                    merge=merge, info_only=info_only)

    html = get_content(url)

    raw_title = r1(r'\Wkw\s*[:=]\s*[\'\"]([^\n]+?)\'\s*\n', html)
    if raw_title is not None:
        # The page escapes single quotes inside the title as \' -- undo that.
        title = unescape_html(raw_title.replace("\\'", "\'"))
    else:
        # No kw field: fall back to the subtitle element's title attribute.
        title = match1(html, r'id=\"subtitle\"\s*title\s*=\s*\"([^\"]+)\"')
        if title is None:
            title = ''

    vcode = r1(r'vcode\s*[:=]\s*\'([^\']+)\'', html)
    if vcode is None:
        vcode = match1(html, r'viden\s*[:=]\s*\"([\w+/=]+)\"')
    if vcode:
        from .youku import youku_download_by_vid
        return youku_download_by_vid(vcode, title=title, output_dir=output_dir, merge=merge, info_only=info_only, src='tudou', **kwargs)

    iid = r1(r'iid\s*[:=]\s*(\d+)', html)
    if not iid:
        return tudou_download_playlist(url, output_dir, merge, info_only)

    tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)
2015-10-14 02:49:37 +03:00
# obsolete?
2012-08-20 19:54:03 +04:00
def parse_playlist(url):
    """Return ``(title, item_id)`` pairs for a Tudou album or playlist page.

    The album id is taken from the URL when it has the
    ``/playlist/p/a<digits>.html`` form, otherwise from the page's ``aid``
    field. The album title is read from the page using a pattern that
    depends on the kind of URL.

    Args:
        url: An ``/albumcover/`` or ``/playlist/p/`` page URL.

    Returns:
        A list of ``(album_title + '-' + item_title, str(item_id))`` tuples
        built from the 'message' entries of the album-items service.

    Raises:
        NotImplementedError: If *url* is neither an albumcover nor a
            playlist URL.
        AssertionError: If the album id or album title cannot be found.
    """
    # Raw string: '\d' and '\.' are invalid escape sequences in a plain literal.
    aid = r1(r'http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url)
    html = get_decoded_html(url)
    if not aid:
        aid = r1(r"aid\s*[:=]\s*'(\d+)'", html)
    if re.match(r'http://www.tudou.com/albumcover/', url):
        atitle = r1(r"title\s*:\s*'([^']+)'", html)
    elif re.match(r'http://www.tudou.com/playlist/p/', url):
        atitle = r1(r'atitle\s*=\s*"([^"]+)"', html)
    else:
        raise NotImplementedError(url)
    assert aid
    assert atitle
    import json
    #url = 'http://www.tudou.com/playlist/service/getZyAlbumItems.html?aid='+aid
    url = 'http://www.tudou.com/playlist/service/getAlbumItems.html?aid='+aid
    return [(atitle + '-' + x['title'], str(x['itemId'])) for x in json.loads(get_html(url))['message']]
2015-10-14 02:49:37 +03:00
def parse_plist(url):
    """Return ``(kw, iid)`` pairs for every item of a Tudou playlist page.

    The page's ``lcode`` value is extracted and passed to the plist.action
    service; the 'items' entries of its JSON reply supply the pairs.
    """
    page = get_decoded_html(url)
    lcode = r1(r"lcode:\s*'([^']+)'", page)
    reply = get_content('http://www.tudou.com/crp/plist.action?lcode=' + lcode)
    entries = json.loads(reply)['items']
    return [(entry['kw'], entry['iid']) for entry in entries]
def tudou_download_playlist(url, output_dir = '.', merge = True, info_only = False, **kwargs):
    """Download every video listed on a Tudou playlist page.

    The playlist is resolved with parse_plist(); each entry is then handed
    to tudou_download_by_iid() after a progress line is printed. Extra
    keyword arguments are accepted but not used.
    """
    videos = parse_plist(url)
    total = len(videos)
    for position, (title, iid) in enumerate(videos, 1):
        print('Processing %s of %s videos...' % (position, total))
        tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)
2012-08-20 19:54:03 +04:00
# Site name passed to print_info() by tudou_download_by_iid().
site_info = "Tudou.com"
# Module-level alias for the single-video downloader.
# NOTE(review): presumably looked up by name by the extractor dispatcher -- confirm.
download = tudou_download
2014-06-24 05:59:47 +04:00
# Module-level alias for the playlist downloader.
# NOTE(review): presumably looked up by name by the extractor dispatcher -- confirm.
download_playlist = tudou_download_playlist