merge youku-lixian commits: d19ea15, 980266d

2025-02-02 16:24:00 +03:00 · 2012-08-31 00:19:22 +02:00 · 2012-08-31 00:19:22 +02:00 · a4bf334acf
commit a4bf334acf
parent 146bae2f97
3 changed files with 39 additions and 15 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 _*
 *.py[cod]

+*.download
 *.flv
 *.mp4
 *.webm
--- a/get_tudou.py
+++ b/get_tudou.py
@@ -62,13 +62,11 @@ def parse_playlist(url):
    url = 'http://www.tudou.com/playlist/service/getAlbumItems.html?aid='+aid
    return [(atitle + '-' + x['title'], str(x['itemId'])) for x in json.loads(get_html(url))['message']]

-def tudou_download_playlist(url, create_dir = False, output_dir = '.', merge = True):
-    if create_dir:
-        raise NotImplementedError('please report a bug so I can implement this')
+def tudou_download_playlist(url, output_dir = '.', merge = True, info_only = False):
    videos = parse_playlist(url)
    for i, (title, id) in enumerate(videos):
        print('Downloading %s of %s videos...' % (i + 1, len(videos)))
-        tudou_download_by_iid(id, title, output_dir = output_dir, merge = merge)
+        tudou_download_by_iid(id, title, output_dir = output_dir, merge = merge, info_only = info_only)

 site_info = "Tudou.com"
 download = tudou_download
--- a/get_youku.py
+++ b/get_youku.py
@@ -37,25 +37,47 @@ def youku_url(url):
        return url
    raise Exception('Invalid Youku URL: '+url)

-def parse_page(url):
-    url = youku_url(url)
-    page = get_html(url)
-    id2 = re.search(r"var\s+videoId2\s*=\s*'(\S+)'", page).group(1)
+def parse_video_title(url, page):
    if re.search(r'v_playlist', url):
-        # if we are playing a video from playlist, the meta title might be incorrect
-        title = re.search(r'<title>([^<>]*)</title>', page).group(1)
+        # if we are playing a viedo from play list, the meta title might be incorrect
+        title = r1_of([r'<div class="show_title" title="([^"]+)">[^<]', r'<title>([^<>]*)</title>'], page)
    else:
-        title = re.search(r'<meta name="title" content="([^"]*)">', page).group(1)
+        title = r1_of([r'<div class="show_title" title="([^"]+)">[^<]', r'<meta name="title" content="([^"]*)"'], page)
+    assert title
    title = trim_title(title)
    if re.search(r'v_playlist', url) and re.search(r'-.*\S+', title):
        title = re.sub(r'^[^-]+-\s*', '', title) # remove the special name from title for playlist video
+    title = re.sub(r'—专辑：.*', '', title) # remove the special name from title for playlist video
    title = unescape_html(title)
+    
    subtitle = re.search(r'<span class="subtitle" id="subtitle">([^<>]*)</span>', page)
    if subtitle:
        subtitle = subtitle.group(1).strip()
    if subtitle == title:
        subtitle = None
-    return id2, title, subtitle
+    if subtitle:
+        title += '-' + subtitle
+    return title
+
+def parse_playlist_title(url, page):
+    if re.search(r'v_playlist', url):
+        # if we are playing a viedo from play list, the meta title might be incorrect
+        title = re.search(r'<title>([^<>]*)</title>', page).group(1)
+    else:
+        title = re.search(r'<meta name="title" content="([^"]*)"', page).group(1)
+    title = trim_title(title)
+    if re.search(r'v_playlist', url) and re.search(r'-.*\S+', title):
+        title = re.sub(r'^[^-]+-\s*', '', title)
+    title = re.sub(r'^.*—专辑：《(.+)》', r'\1', title)
+    title = unescape_html(title)
+    return title
+
+def parse_page(url):
+    url = youku_url(url)
+    page = get_html(url)
+    id2 = re.search(r"var\s+videoId2\s*=\s*'(\S+)'", page).group(1)
+    title = parse_video_title(url, page)
+    return id2, title

 def get_info(videoId2):
    return json.loads(get_html('http://v.youku.com/player/getPlayList/VideoIDS/' + videoId2))
@@ -108,9 +130,8 @@ def youku_download_by_id(id2, title, output_dir = '.', stream_type = None, merge
        download_urls(urls, title, file_type_of_url(urls[0]), total_size, output_dir, merge = merge)

 def youku_download(url, output_dir = '.', stream_type = None, merge = True, info_only = False):
-    id2, title, subtitle = parse_page(url)
-    if subtitle:
-        title += '-' + subtitle
+    id2, title = parse_page(url)
+    title = title.replace('?', '-')
    
    youku_download_by_id(id2, title, output_dir, merge = merge, info_only = info_only)

@@ -161,6 +182,10 @@ def youku_download_playlist(url, output_dir = '.', merge = True, info_only = Fal
        assert re.match(r'http://v.youku.com/v_show/id_([\w=]+).html', url), 'URL not supported as playlist'
        ids = parse_playlist(url)
    
+    title = parse_playlist_title(url, get_html(url))
+    title = title.replace('?', '-')
+    output_dir = os.path.join(output_dir, title)
+    
    for i, id in enumerate(ids):
        print('Processing %s of %s videos...' % (i + 1, len(ids)))
        youku_download(id, output_dir, merge = merge, info_only = info_only)