you-get/src/you_get/extractors/tudou.py

#!/usr/bin/env python

__all__ = ['tudou_download', 'tudou_download_playlist', 'tudou_download_by_id', 'tudou_download_by_iid']

from ..common import *
from xml.dom.minidom import parseString
import you_get.extractors.acfun

def tudou_download_by_iid(iid, title, output_dir = '.', merge = True, info_only = False):
    """Download (or just describe) the Tudou item identified by *iid*.

    The segment listing for the item is fetched as JSON from
    getItemSegs.action.  Among the listed entries whose first segment
    carries a 'size' field, the one with the largest summed 'size' is
    chosen.  Each of its segments is then resolved through
    ct.v2.tudou.com, whose XML answer lists the real URLs in <f> elements.

    iid        -- item id substituted into the getItemSegs.action query.
    title      -- title handed to print_info() and download_urls().
    output_dir -- forwarded to download_urls().
    merge      -- forwarded to download_urls().
    info_only  -- when true, only print_info() is called; nothing is
                  downloaded.
    """
    def total_size(segments):
        # Summed 'size' over every segment of one listed entry.
        return sum([segment['size'] for segment in segments])

    listing = json.loads(get_decoded_html('http://www.tudou.com/outplay/goto/getItemSegs.action?iid=%s' % iid))

    # Only entries whose first segment reports a 'size' can be compared.
    candidates = []
    for key in listing:
        segments = listing[key]
        if 'size' in segments[0]:
            candidates.append(segments)
    chosen = max(candidates, key=total_size)

    vids = [segment["k"] for segment in chosen]
    size = total_size(chosen)

    # Resolve every segment key to its URL(s), preserving segment order.
    urls = []
    for vid in vids:
        document = parseString(get_html('http://ct.v2.tudou.com/f?id=%s' % vid))
        urls.extend(node.firstChild.nodeValue.strip()
                    for node in document.getElementsByTagName('f'))

    # The container type is read from the first path component of the first URL.
    ext = r1(r'http://[\w.]*/(\w+)/[\w.]*', urls[0])

    print_info(site_info, title, ext, size)
    if info_only:
        return
    download_urls(urls, title, ext, size, output_dir=output_dir, merge = merge)

def tudou_download_by_id(id, title, output_dir = '.', merge = True, info_only = False):
    """Resolve a Tudou programme id to its iid and download it.

    The page http://www.tudou.com/programs/view/<id>/ is fetched and
    scanned for an ``iid`` assignment and a ``kw`` (title) assignment.

    Note that the *title* argument is never used: the title is always
    re-derived from the page, and is the empty string when no ``kw``
    value can be found.

    output_dir, merge and info_only are forwarded unchanged to
    tudou_download_by_iid().
    """
    page = get_html('http://www.tudou.com/programs/view/%s/' % id)

    iid = r1(r'iid\s*[:=]\s*(\S+)', page)

    # Start from the fallback; a missing match surfaces as AttributeError
    # when .replace() is attempted on the result of r1().
    title = ''
    try:
        title = r1(r'kw\s*[:=]\s*[\'\"]([^\n]+?)\'\s*\n', page).replace("\\'", "\'")
    except AttributeError:
        pass

    tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)

def tudou_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
    """Download (or just describe) the video behind a Tudou URL.

    The URL is dispatched as follows, first match wins:

    1. ``acfun.tudou.com`` URLs are rewritten to ``www.acfun.tv`` and
       handed to the AcFun extractor.
    2. Embedded-player URLs (``http://www.tudou.com/v/<id>/``) go to
       tudou_download_by_id().
    3. Otherwise the page is fetched.  If it carries a ``vcode`` or
       ``viden`` value, the download is delegated to
       youku_download_by_vid() with ``src='tudou'``.
    4. If the page carries a numeric ``iid``, tudou_download_by_iid()
       is used.
    5. Failing all of the above, the URL is treated as a playlist.

    url        -- page or embedded-player URL.
    output_dir -- directory passed on to whichever downloader is chosen.
    merge      -- merge flag passed on to whichever downloader is chosen.
    info_only  -- when true, downstream downloaders only print information.
    **kwargs   -- forwarded only to youku_download_by_vid().
    """
    if 'acfun.tudou.com' in url:  #wrong way!
        url = url.replace('acfun.tudou.com', 'www.acfun.tv')
        you_get.extractors.acfun.acfun_download(url, output_dir,
                                               merge,
                                               info_only)
        return  #throw you back

    # Embedded player
    embed_id = r1(r'http://www.tudou.com/v/([^/]+)/', url)
    if embed_id:
        # Forward output_dir and merge as well; previously they were dropped
        # here, so embedded URLs always used the callee's defaults.
        return tudou_download_by_id(embed_id, title="", output_dir=output_dir, merge=merge, info_only=info_only)

    html = get_content(url)

    try:
        # A missing ``kw`` match surfaces as AttributeError on .replace().
        title = r1(r'\Wkw\s*[:=]\s*[\'\"]([^\n]+?)\'\s*\n', html).replace("\\'", "\'")
        title = unescape_html(title)
    except AttributeError:
        # Fall back to the title attribute of the element with id="subtitle".
        title = match1(html, r'id=\"subtitle\"\s*title\s*=\s*\"([^\"]+)\"')
        if title is None:
            title = ''

    vcode = r1(r'vcode\s*[:=]\s*\'([^\']+)\'', html)
    if vcode is None:
        vcode = match1(html, r'viden\s*[:=]\s*\"([\w+/=]+)\"')
    if vcode:
        from .youku import youku_download_by_vid
        return youku_download_by_vid(vcode, title=title, output_dir=output_dir, merge=merge, info_only=info_only, src='tudou', **kwargs)

    iid = r1(r'iid\s*[:=]\s*(\d+)', html)
    if not iid:
        return tudou_download_playlist(url, output_dir, merge, info_only)

    tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)

# obsolete?
def parse_playlist(url):
    """Return ``(title, item_id)`` pairs for a Tudou album or playlist page.

    The album id is taken from a ``/playlist/p/a<digits>...html`` URL or,
    failing that, from an ``aid`` assignment in the page.  The album title
    is read from the page; where it is looked for depends on whether the
    URL is an ``albumcover`` or a ``playlist/p`` URL.  The item list
    itself comes from the getAlbumItems.html service, under the
    'message' key of its JSON answer.

    Each returned title is ``<album title>-<item title>`` and each item id
    is converted to ``str``.

    Raises NotImplementedError for any other kind of URL, and
    AssertionError when the album id or album title cannot be found.

    NOTE(review): nothing in the visible part of this module calls this
    function; tudou_download_playlist() uses parse_plist() instead.
    """
    # Raw string: the pattern contains regex escapes that are not valid
    # Python string escapes.
    aid = r1(r'http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url)
    html = get_decoded_html(url)
    if not aid:
        aid = r1(r"aid\s*[:=]\s*'(\d+)'", html)
    if re.match(r'http://www.tudou.com/albumcover/', url):
        atitle = r1(r"title\s*:\s*'([^']+)'", html)
    elif re.match(r'http://www.tudou.com/playlist/p/', url):
        atitle = r1(r'atitle\s*=\s*"([^"]+)"', html)
    else:
        raise NotImplementedError(url)
    assert aid
    assert atitle
    import json
    #url = 'http://www.tudou.com/playlist/service/getZyAlbumItems.html?aid='+aid
    url = 'http://www.tudou.com/playlist/service/getAlbumItems.html?aid='+aid
    return [(atitle + '-' + x['title'], str(x['itemId'])) for x in json.loads(get_html(url))['message']]

def parse_plist(url):
    """Return ``(kw, iid)`` pairs for every item of a Tudou playlist page.

    The page is scanned for an ``lcode: '...'`` value, which is then used
    to query plist.action; the pairs are built from the 'items' list of
    its JSON answer, in the order given there.
    """
    page = get_decoded_html(url)
    lcode = r1(r"lcode:\s*'([^']+)'", page)
    listing = json.loads(get_content('http://www.tudou.com/crp/plist.action?lcode=' + lcode))

    entries = []
    for item in listing['items']:
        entries.append((item['kw'], item['iid']))
    return entries

def tudou_download_playlist(url, output_dir = '.', merge = True, info_only = False, **kwargs):
    """Download every video of the Tudou playlist at *url*, one by one.

    The playlist is expanded with parse_plist(); a progress line is
    printed before each item, which is then handed to
    tudou_download_by_iid() together with output_dir, merge and
    info_only.  Extra keyword arguments are accepted but not used.
    """
    videos = parse_plist(url)
    total = len(videos)
    for position, (title, iid) in enumerate(videos, 1):
        print('Processing %s of %s videos...' % (position, total))
        tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)

# Site label that tudou_download_by_iid() passes to print_info().
site_info = "Tudou.com"
# Module-level aliases for the single-video and playlist entry points
# (presumably the generic names the extractor dispatcher looks up -- confirm).
download = tudou_download
download_playlist = tudou_download_playlist
refactor for Python packaging 2012-08-31 19:20:38 +04:00			`#!/usr/bin/env python`
initial commit 2012-08-20 19:54:03 +04:00
			`__all__ = ['tudou_download', 'tudou_download_playlist', 'tudou_download_by_id', 'tudou_download_by_iid']`

refactor for Python packaging 2012-08-31 19:20:38 +04:00			`from ..common import *`
Tudou: fix incomplete download 2014-09-09 01:37:12 +04:00			`from xml.dom.minidom import parseString`
[Tudou]Fix acfun.tudou.com redirect, fix #695 2016-04-28 09:10:11 +03:00			`import you_get.extractors.acfun`
initial commit 2012-08-20 19:54:03 +04:00
			`def tudou_download_by_iid(iid, title, output_dir = '.', merge = True, info_only = False):`
Fixed Tudou 2013-08-02 13:25:16 +04:00			`data = json.loads(get_decoded_html('http://www.tudou.com/outplay/goto/getItemSegs.action?iid=%s' % iid))`
fix bug in tudou, fix #612 2015-08-28 10:15:56 +03:00			`temp = max([data[i] for i in data if 'size' in data[i][0]], key=lambda x:sum([part['size'] for part in x]))`
Tudou: fix incomplete download 2014-09-09 01:37:12 +04:00			`vids, size = [t["k"] for t in temp], sum([t["size"] for t in temp])`
[tudou] fix #925 2016-02-18 17:50:14 +03:00
			`urls = []`
			`for vid in vids:`
			`for i in parseString(get_html('http://ct.v2.tudou.com/f?id=%s' % vid)).getElementsByTagName('f'):`
			`urls.append(i.firstChild.nodeValue.strip())`
Fixed Tudou 2013-08-02 13:25:16 +04:00
Tudou: fix incomplete download 2014-09-09 01:37:12 +04:00			`ext = r1(r'http://[\w.]/(\w+)/[\w.]', urls[0])`
Fixed Tudou 2013-08-02 13:25:16 +04:00
			`print_info(site_info, title, ext, size)`
initial commit 2012-08-20 19:54:03 +04:00			`if not info_only:`
Tudou: fix incomplete download 2014-09-09 01:37:12 +04:00			`download_urls(urls, title, ext, size, output_dir=output_dir, merge = merge)`
initial commit 2012-08-20 19:54:03 +04:00
Youku: fix #331, refactoring 2014-06-24 05:59:47 +04:00			`def tudou_download_by_id(id, title, output_dir = '.', merge = True, info_only = False):`
initial commit 2012-08-20 19:54:03 +04:00			`html = get_html('http://www.tudou.com/programs/view/%s/' % id)`
Youku: fix #331, refactoring 2014-06-24 05:59:47 +04:00
			`iid = r1(r'iid\s[:=]\s(\S+)', html)`
fix tudou.py when there is no title 2017-02-24 20:57:33 +03:00			`try:`
			`title = r1(r'kw\s[:=]\s[\'\"]([^\n]+?)\'\s*\n', html).replace("\\'", "\'")`
			`except AttributeError:`
			`title = ''`
Tudou: fix #176 2013-05-04 01:33:38 +04:00			`tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)`
initial commit 2012-08-20 19:54:03 +04:00
Tudou: fix #460 2014-12-20 05:08:40 +03:00			`def tudou_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):`
[Tudou]Fix acfun.tudou.com redirect, fix #695 2016-04-28 09:10:11 +03:00			`if 'acfun.tudou.com' in url: #wrong way!`
			`url = url.replace('acfun.tudou.com', 'www.acfun.tv')`
[tudou] fix #1526 2016-11-26 15:07:21 +03:00			`you_get.extractors.acfun.acfun_download(url, output_dir,`
			`merge,`
[Tudou]Fix acfun.tudou.com redirect, fix #695 2016-04-28 09:10:11 +03:00			`info_only)`
			`return #throw you back`
[tudou] fix #1526 2016-11-26 15:07:21 +03:00
Tudou: fix #176 2013-05-04 01:33:38 +04:00			`# Embedded player`
			`id = r1(r'http://www.tudou.com/v/([^/]+)/', url)`
			`if id:`
			`return tudou_download_by_id(id, title="", info_only=info_only)`
Youku: fix #331, refactoring 2014-06-24 05:59:47 +04:00
new matching patterns 2017-04-20 12:24:30 +03:00			`html = get_content(url)`
Youku: fix #331, refactoring 2014-06-24 05:59:47 +04:00
fix tudou.py when there is no title 2017-02-24 20:57:33 +03:00			`try:`
			`title = r1(r'\Wkw\s[:=]\s[\'\"]([^\n]+?)\'\s*\n', html).replace("\\'", "\'")`
			`assert title`
			`title = unescape_html(title)`
			`except AttributeError:`
new matching patterns 2017-04-20 12:24:30 +03:00			`title = match1(html, r'id=\"subtitle\"\stitle\s=\s*\"([^\"]+)\"')`
			`if title is None:`
			`title = ''`
Youku: fix #331, refactoring 2014-06-24 05:59:47 +04:00
Tudou: use Youku vcode, fix #116 2013-02-26 01:50:29 +04:00			`vcode = r1(r'vcode\s[:=]\s\'([^\']+)\'', html)`
new matching patterns 2017-04-20 12:24:30 +03:00			`if vcode is None:`
			`vcode = match1(html, r'viden\s[:=]\s\"([\w+/=]+)\"')`
Tudou: use Youku vcode, fix #116 2013-02-26 01:50:29 +04:00			`if vcode:`
Youku: fix #331, refactoring 2014-06-24 05:59:47 +04:00			`from .youku import youku_download_by_vid`
[youku tudou]update api 2017-05-25 09:01:38 +03:00			`return youku_download_by_vid(vcode, title=title, output_dir=output_dir, merge=merge, info_only=info_only, src='tudou', **kwargs)`
Youku: fix #331, refactoring 2014-06-24 05:59:47 +04:00
Tudou: use Youku vcode, fix #116 2013-02-26 01:50:29 +04:00			`iid = r1(r'iid\s[:=]\s(\d+)', html)`
			`if not iid:`
			`return tudou_download_playlist(url, output_dir, merge, info_only)`
Youku: fix #331, refactoring 2014-06-24 05:59:47 +04:00
initial commit 2012-08-20 19:54:03 +04:00			`tudou_download_by_iid(iid, title, output_dir = output_dir, merge = merge, info_only = info_only)`

[tudou] fix playlists 2015-10-14 02:49:37 +03:00			`# obsolete?`
initial commit 2012-08-20 19:54:03 +04:00			`def parse_playlist(url):`
			`aid = r1('http://www.tudou.com/playlist/p/a(\d+)(?:i\d+)?\.html', url)`
			`html = get_decoded_html(url)`
			`if not aid:`
			`aid = r1(r"aid\s[:=]\s'(\d+)'", html)`
			`if re.match(r'http://www.tudou.com/albumcover/', url):`
			`atitle = r1(r"title\s:\s'([^']+)'", html)`
			`elif re.match(r'http://www.tudou.com/playlist/p/', url):`
			`atitle = r1(r'atitle\s=\s"([^"]+)"', html)`
			`else:`
			`raise NotImplementedError(url)`
			`assert aid`
			`assert atitle`
			`import json`
			`#url = 'http://www.tudou.com/playlist/service/getZyAlbumItems.html?aid='+aid`
			`url = 'http://www.tudou.com/playlist/service/getAlbumItems.html?aid='+aid`
			`return [(atitle + '-' + x['title'], str(x['itemId'])) for x in json.loads(get_html(url))['message']]`

[tudou] fix playlists 2015-10-14 02:49:37 +03:00			`def parse_plist(url):`
			`html = get_decoded_html(url)`
			`lcode = r1(r"lcode:\s*'([^']+)'", html)`
			`plist_info = json.loads(get_content('http://www.tudou.com/crp/plist.action?lcode=' + lcode))`
			`return ([(item['kw'], item['iid']) for item in plist_info['items']])`

fix extractors not use VideoExtractor after add --json option 2015-09-26 08:45:39 +03:00			`def tudou_download_playlist(url, output_dir = '.', merge = True, info_only = False, **kwargs):`
[tudou] fix playlists 2015-10-14 02:49:37 +03:00			`videos = parse_plist(url)`
initial commit 2012-08-20 19:54:03 +04:00			`for i, (title, id) in enumerate(videos):`
automatically handle playlist URLs 2012-09-01 12:18:59 +04:00			`print('Processing %s of %s videos...' % (i + 1, len(videos)))`
merge youku-lixian commits: d19ea15, 980266d 2012-08-31 02:19:22 +04:00			`tudou_download_by_iid(id, title, output_dir = output_dir, merge = merge, info_only = info_only)`
initial commit 2012-08-20 19:54:03 +04:00
			`site_info = "Tudou.com"`
			`download = tudou_download`
Youku: fix #331, refactoring 2014-06-24 05:59:47 +04:00			`download_playlist = tudou_download_playlist`