merge youku-lixian commits: d19ea15, 980266d

This commit is contained in:
Mort Yao 2012-08-31 00:19:22 +02:00
parent 146bae2f97
commit a4bf334acf
3 changed files with 39 additions and 15 deletions

1
.gitignore vendored
View File

@ -1,6 +1,7 @@
_*
*.py[cod]
*.download
*.flv
*.mp4
*.webm

View File

@ -62,13 +62,11 @@ def parse_playlist(url):
url = 'http://www.tudou.com/playlist/service/getAlbumItems.html?aid='+aid
return [(atitle + '-' + x['title'], str(x['itemId'])) for x in json.loads(get_html(url))['message']]
def tudou_download_playlist(url, create_dir = False, output_dir = '.', merge = True):
if create_dir:
raise NotImplementedError('please report a bug so I can implement this')
def tudou_download_playlist(url, output_dir = '.', merge = True, info_only = False):
videos = parse_playlist(url)
for i, (title, id) in enumerate(videos):
print('Downloading %s of %s videos...' % (i + 1, len(videos)))
tudou_download_by_iid(id, title, output_dir = output_dir, merge = merge)
tudou_download_by_iid(id, title, output_dir = output_dir, merge = merge, info_only = info_only)
site_info = "Tudou.com"
download = tudou_download

View File

@ -37,25 +37,47 @@ def youku_url(url):
return url
raise Exception('Invalid Youku URL: '+url)
def parse_page(url):
url = youku_url(url)
page = get_html(url)
id2 = re.search(r"var\s+videoId2\s*=\s*'(\S+)'", page).group(1)
def parse_video_title(url, page):
if re.search(r'v_playlist', url):
# if we are playing a video from playlist, the meta title might be incorrect
title = re.search(r'<title>([^<>]*)</title>', page).group(1)
# if we are playing a video from a playlist, the meta title might be incorrect
title = r1_of([r'<div class="show_title" title="([^"]+)">[^<]', r'<title>([^<>]*)</title>'], page)
else:
title = re.search(r'<meta name="title" content="([^"]*)">', page).group(1)
title = r1_of([r'<div class="show_title" title="([^"]+)">[^<]', r'<meta name="title" content="([^"]*)"'], page)
assert title
title = trim_title(title)
if re.search(r'v_playlist', url) and re.search(r'-.*\S+', title):
title = re.sub(r'^[^-]+-\s*', '', title) # remove the special name from title for playlist video
title = re.sub(r'—专辑:.*', '', title) # remove the special name from title for playlist video
title = unescape_html(title)
subtitle = re.search(r'<span class="subtitle" id="subtitle">([^<>]*)</span>', page)
if subtitle:
subtitle = subtitle.group(1).strip()
if subtitle == title:
subtitle = None
return id2, title, subtitle
if subtitle:
title += '-' + subtitle
return title
def parse_playlist_title(url, page):
    """Extract the playlist (album) title from a Youku page.

    url  -- address of the page; a URL containing 'v_playlist' selects the
            <title> element, anything else the <meta name="title"> tag.
    page -- HTML text of that page.

    Returns the title after trim_title(), prefix/album clean-up and
    unescape_html().  Raises AttributeError when the expected title tag
    is not present in the page (re.search returns None).
    """
    from_playlist = re.search(r'v_playlist', url)
    if from_playlist:
        # when playing a video from a playlist, the meta title might be incorrect
        title_pattern = r'<title>([^<>]*)</title>'
    else:
        title_pattern = r'<meta name="title" content="([^"]*)"'
    name = trim_title(re.search(title_pattern, page).group(1))

    # Playlist pages prefix the title with "<something>-"; drop that prefix.
    if from_playlist and re.search(r'-.*\S+', name):
        name = re.sub(r'^[^-]+-\s*', '', name)

    # Keep only the album name found between the book-title marks.
    # NOTE(review): indentation was lost in this view; this substitution is
    # taken to run for every URL, not only playlist URLs -- confirm.
    name = re.sub(r'^.*—专辑:《(.+)》', r'\1', name)
    return unescape_html(name)
def parse_page(url):
    """Fetch a Youku video page and return its (id2, title) pair.

    The URL is first passed through youku_url(), the page is downloaded
    with get_html(), the id is read from the page's ``var videoId2 = '...'``
    JavaScript assignment and the title comes from parse_video_title().

    Raises AttributeError when the page has no videoId2 assignment
    (re.search returns None).
    """
    canonical_url = youku_url(url)
    html = get_html(canonical_url)
    id_match = re.search(r"var\s+videoId2\s*=\s*'(\S+)'", html)
    video_id2 = id_match.group(1)
    return video_id2, parse_video_title(canonical_url, html)
def get_info(videoId2):
    """Return the decoded JSON from Youku's getPlayList endpoint for videoId2.

    videoId2 is appended as-is to the endpoint URL, so it must be a string.
    """
    endpoint = 'http://v.youku.com/player/getPlayList/VideoIDS/' + videoId2
    response_text = get_html(endpoint)
    return json.loads(response_text)
@ -108,9 +130,8 @@ def youku_download_by_id(id2, title, output_dir = '.', stream_type = None, merge
download_urls(urls, title, file_type_of_url(urls[0]), total_size, output_dir, merge = merge)
def youku_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False):
id2, title, subtitle = parse_page(url)
if subtitle:
title += '-' + subtitle
id2, title = parse_page(url)
title = title.replace('?', '-')
youku_download_by_id(id2, title, output_dir, merge = merge, info_only = info_only)
@ -161,6 +182,10 @@ def youku_download_playlist(url, output_dir = '.', merge = True, info_only = Fal
assert re.match(r'http://v.youku.com/v_show/id_([\w=]+).html', url), 'URL not supported as playlist'
ids = parse_playlist(url)
title = parse_playlist_title(url, get_html(url))
title = title.replace('?', '-')
output_dir = os.path.join(output_dir, title)
for i, id in enumerate(ids):
print('Processing %s of %s videos...' % (i + 1, len(ids)))
youku_download(id, output_dir, merge = merge, info_only = info_only)