[^<]', r'
([^<>]*)', page)
if subtitle:
subtitle = subtitle.group(1).strip()
if subtitle == title:
subtitle = None
if subtitle:
title += '-' + subtitle
return title
# Extract a playlist title from the already-fetched page HTML.
# Two different patterns are applied depending on whether the URL contains
# 'v_playlist'; in both cases the first capture group of the match is used.
# NOTE(review): both regex literals below are truncated in this copy of the
# file (the HTML-tag part of each pattern is missing, as is the end of the
# function) -- restore them from the upstream source before relying on this.
def parse_playlist_title(url, page):
if re.search(r'v_playlist', url):
# If we are playing a video from a playlist, the meta title might be incorrect.
title = re.search(r'
([^<>]*)', page).group(1)
else:
title = re.search(r'
> 16
c = source.pop(index)
mixed += c
ids = info['data'][0]['streamfileids'][stream_type].split('*')[:-1]
vid = ''.join(mixed[int(i)] for i in ids)
sid = '%s%s%s' % (int(time() * 1000), randint(1000, 1999), randint(1000, 9999))
urls = []
for s in segs[stream_type]:
no = '%02x' % int(s['no'])
url = 'http://f.youku.com/player/getFlvPath/sid/%s_%s/st/%s/fileid/%s%s%s?K=%s&ts=%s' % (sid, no, file_type, vid[:8], no.upper(), vid[10:], s['k'], s['seconds'])
urls.append((url, int(s['size'])))
return urls
def file_type_of_url(url):
    """Return the path component that follows ``/st/`` in *url*.

    The value is taken from the first ``/st/<value>/`` occurrence and is
    returned as a ``str``.

    Raises:
        AttributeError: if *url* contains no ``/st/<value>/`` segment
            (the regex search yields ``None``).
    """
    st_segment = re.search(r'/st/([^/]+)/', url)
    return str(st_segment.group(1))
def youku_download_by_id(id2, title, output_dir='.', stream_type=None, merge=True, info_only=False):
    """Print information about, and optionally download, the video *id2*.

    Args:
        id2: video identifier handed to ``get_info``.
        title: title used for the info printout and for the download.
        output_dir: directory passed through to ``download_urls``.
        stream_type: stream selector forwarded to ``find_video``.
        merge: forwarded to ``download_urls``.
        info_only: when true, only the info line is printed and nothing
            is downloaded.
    """
    segments = find_video(get_info(id2), stream_type)
    # find_video yields (url, size) pairs; split them into two parallel tuples.
    segment_urls, segment_sizes = zip(*segments)

    # The extension is derived from the first (not yet redirected) URL.
    ext = file_type_of_url(segment_urls[0])
    total_size = sum(segment_sizes)

    # Use real (redirected) URLs for resuming of downloads
    real_urls = url_locations(segment_urls)

    print_info(site_info, title, ext, total_size)
    if info_only:
        return
    download_urls(real_urls, title, ext, total_size, output_dir, merge=merge)
def youku_download(url, output_dir='.', stream_type=None, merge=True, info_only=False):
    """Download (or just describe) the video found at *url*.

    The page is resolved to an id and a title via ``parse_page``; any ``?``
    in the title is replaced with ``-`` before it is passed on.

    Args:
        url: page URL handed to ``parse_page``.
        output_dir: directory forwarded to ``youku_download_by_id``.
        stream_type: requested stream type; ``None`` leaves the choice to
            ``youku_download_by_id``.
        merge: forwarded to ``youku_download_by_id``.
        info_only: when true, nothing is downloaded.
    """
    id2, title = parse_page(url)
    title = title.replace('?', '-')
    # stream_type used to be accepted here but never forwarded, so the
    # caller's choice was silently ignored; pass it through explicitly.
    youku_download_by_id(
        id2,
        title,
        output_dir,
        stream_type=stream_type,
        merge=merge,
        info_only=info_only,
    )
def parse_playlist_videos(html):
    """Return every ``<x>`` captured from ``id="A_<x>"`` in *html*.

    Matches are returned as a list of strings in the order they occur;
    the list is empty when nothing matches.
    """
    return [hit.group(1) for hit in re.finditer(r'id="A_(\w+)"', html)]
# Build the URLs of the follow-up pages (page 2 .. last page) of a playlist listing.
#
# A region of `html` is located with a regex; every href="..." value inside
# that region is collected, and the LAST href is split into
# (<prefix ending in 'page_'>, <page number>, <suffix starting with '_'>).
# One URL per page number from 2 up to and including that number is
# returned, each prefixed with 'http://v.youku.com' and suffixed with
# '?__rt=1&__ro=listShow'. When the region is not found, [] is returned.
#
# NOTE(review): the pattern passed to re.search below is truncated in this
# copy of the file (the HTML it matched is missing) -- restore it from the
# upstream source.
# NOTE(review): if the matched region contains no href, urls[-1] raises
# IndexError; if the last href does not fit the page_ pattern, re.match
# returns None and .groups() raises AttributeError.
def parse_playlist_pages(html):
m = re.search(r'
', html, flags = re.S)
if m:
urls = re.findall(r'href="([^"]+)"', m.group())
# The last link carries the highest page number.
x1, x2, x3 = re.match(r'^(.*page_)(\d+)(_.*)$', urls[-1]).groups()
# Page 1 is not included: the range starts at 2.
return ['http://v.youku.com%s%s%s?__rt=1&__ro=listShow' % (x1, i, x3) for i in range(2, int(x2) + 1)]
else:
return []
def parse_playlist(url):
    """Collect the video ids of the show that the page at *url* belongs to.

    The page is fetched and its ``videoId`` and ``showid`` JavaScript
    variables are read; these select the first page of the official
    listing. Ids are gathered from that page and from every further page
    reported by ``parse_playlist_pages``.

    Returns:
        list of id strings, first listing page first.

    Raises:
        AttributeError: if either JavaScript variable is absent from the
            page (the regex search yields ``None``).
    """
    show_html = get_html(url)
    video_id = re.search(r"var\s+videoId\s*=\s*'(\d+)'", show_html).group(1)
    show_id = re.search(r'var\s+showid\s*=\s*"(\d+)"', show_html).group(1)

    first_page_url = (
        'http://v.youku.com/v_vpofficiallist/page_1_showid_%s_id_%s.html?__rt=1&__ro=listShow'
        % (show_id, video_id)
    )
    first_page = get_html(first_page_url)

    collected = parse_playlist_videos(first_page)
    for page_url in parse_playlist_pages(first_page):
        collected.extend(parse_playlist_videos(get_html(page_url)))
    return collected
# Expand a playlist URL into the list of per-item player URLs.
#
# The numeric playlist id is taken from `url` using three accepted URL shapes
# (playlist_show, v_playlist, user_playlist). The canonical playlist_show
# page for that id is fetched, a count `n` is read from it, and one
# 'http://v.youku.com/v_playlist/f<id>o0p<i>.html' URL is returned for each
# i in range(n).
#
# NOTE(review): r1_of is defined elsewhere; presumably it returns the first
# capture group of the first pattern that matches -- confirm.
# NOTE(review): the pattern that extracts `n` (re.search below) is truncated
# in this copy of the file -- restore it from the upstream source.
# NOTE(review): `id` shadows the builtin, and the validity check is an
# assert, which is skipped when Python runs with -O.
def parse_vplaylist(url):
id = r1_of([r'^http://www.youku.com/playlist_show/id_(\d+)(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html',
r'^http://v.youku.com/v_playlist/f(\d+)o[01]p\d+.html',
r'^http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html'],
url)
assert id, 'not valid vplaylist url: ' + url
# Normalise to the playlist_show page, whatever form the input URL had.
url = 'http://www.youku.com/playlist_show/id_%s.html' % id
n = int(re.search(r'
(\d+)', get_html(url)).group(1))
# Item positions are zero-based: p0 .. p(n-1).
return ['http://v.youku.com/v_playlist/f%so0p%s.html' % (id, i) for i in range(n)]
def youku_download_playlist(url, output_dir='.', merge=True, info_only=False):
    """Download every entry of the playlist or show identified by *url*.

    A ``show_page`` URL is first translated with
    ``find_video_id_from_show_page``. URLs in one of the three playlist
    forms are expanded with ``parse_vplaylist``; otherwise the URL must be
    a ``v_show`` page and is expanded with ``parse_playlist``. Entries are
    downloaded into a sub-directory of *output_dir* named after the
    playlist title (with ``?`` replaced by ``-``).

    Args:
        url: playlist, show-page or v_show URL.
        output_dir: parent directory for the playlist's own directory.
        merge: forwarded to ``youku_download``.
        info_only: forwarded to ``youku_download``.

    Raises:
        AssertionError: if *url* matches none of the supported forms.
    """
    if re.match(r'http://www.youku.com/show_page/id_\w+.html', url):
        url = find_video_id_from_show_page(url)

    # All three of these URL shapes are handled by parse_vplaylist.
    vplaylist_patterns = (
        r'http://www.youku.com/playlist_show/id_\d+(?:_ascending_\d_mode_pic(?:_page_\d+)?)?.html',
        r'http://v.youku.com/v_playlist/f\d+o[01]p\d+.html',
        r'http://u.youku.com/user_playlist/pid_(\d+)_id_[\w=]+(?:_page_\d+)?.html',
    )
    if any(re.match(pattern, url) for pattern in vplaylist_patterns):
        ids = parse_vplaylist(url)
    else:
        assert re.match(r'http://v.youku.com/v_show/id_([\w=]+).html', url), 'URL not supported as playlist'
        ids = parse_playlist(url)

    playlist_title = parse_playlist_title(url, get_html(url)).replace('?', '-')
    output_dir = os.path.join(output_dir, playlist_title)

    total = len(ids)
    for position, entry in enumerate(ids, 1):
        print('Processing %s of %s videos...' % (position, total))
        youku_download(entry, output_dir, merge=merge, info_only=info_only)
# Site name passed to print_info by youku_download_by_id.
site_info = "Youku.com"
# Module-level aliases for the two entry points.
# NOTE(review): presumably the generic names a dispatcher looks up on each
# site module -- confirm against the caller.
download = youku_download
download_playlist = youku_download_playlist