From b95b1a10ee01299358fd23e9ef85e1738cdfacd2 Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Wed, 20 Jul 2016 12:04:15 +0800 Subject: [PATCH 1/2] fix playlist download --- src/you_get/extractors/youku.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index fefaf5ee..e04a51fc 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -76,7 +76,7 @@ class Youku(VideoExtractor): for x in xs: if x not in mem: mem.add(x) - yield(x) + return mem def get_vid_from_url(url): """Extracts video ID from URL. @@ -89,7 +89,7 @@ class Youku(VideoExtractor): def get_playlist_id_from_url(url): """Extracts playlist ID from URL. """ - return match1(url, r'youku\.com/playlist_show/id_([a-zA-Z0-9=]+)') + return match1(url, r'youku\.com/albumlist/show\?id=([a-zA-Z0-9=]+)') def download_playlist_by_url(self, url, **kwargs): self.url = url @@ -97,16 +97,19 @@ class Youku(VideoExtractor): try: playlist_id = self.__class__.get_playlist_id_from_url(self.url) assert playlist_id - - video_page = get_content('http://www.youku.com/playlist_show/id_%s' % playlist_id) + video_page = get_content('http://list.youku.com/albumlist/show?id=%s' % playlist_id) videos = Youku.oset(re.findall(r'href="(http://v\.youku\.com/[^?"]+)', video_page)) - # Parse multi-page playlists - for extra_page_url in Youku.oset(re.findall('href="(http://www\.youku\.com/playlist_show/id_%s_[^?"]+)' % playlist_id, video_page)): - extra_page = get_content(extra_page_url) - videos |= Youku.oset(re.findall(r'href="(http://v\.youku\.com/[^?"]+)', extra_page)) - - except: + last_page_url = re.findall(r'href="(/albumlist/show\?id=%s[^"]+)" title="末页"' % playlist_id, video_page)[0] + num_pages = int(re.findall(r'page=([0-9]+)\.htm', last_page_url)[0]) + if (num_pages > 0): + # download one by one + for pn in range(1, num_pages + 1): + extra_page_url = re.sub(r'page=([0-9]+)\.htm', r'page=%s.htm' % pn, last_page_url) + extra_page = get_content('http://list.youku.com' + extra_page_url) + videos |= Youku.oset(re.findall(r'href="(http://v\.youku\.com/[^?"]+)', extra_page)) + except Exception as e: + print(e) # Show full list of episodes if match1(url, r'youku\.com/show_page/id_([a-zA-Z0-9=]+)'): ep_id = match1(url, r'youku\.com/show_page/id_([a-zA-Z0-9=]+)') From cb2878b8cfd974605112cc6c4fff2c405c92f39f Mon Sep 17 00:00:00 2001 From: Chuntao Hong Date: Wed, 20 Jul 2016 12:09:20 +0800 Subject: [PATCH 2/2] avoid unnecessary downloading page 1 --- src/you_get/extractors/youku.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/you_get/extractors/youku.py b/src/you_get/extractors/youku.py index e04a51fc..345347d0 100644 --- a/src/you_get/extractors/youku.py +++ b/src/you_get/extractors/youku.py @@ -104,12 +104,11 @@ class Youku(VideoExtractor): num_pages = int(re.findall(r'page=([0-9]+)\.htm', last_page_url)[0]) if (num_pages > 0): # download one by one - for pn in range(1, num_pages + 1): + for pn in range(2, num_pages + 1): extra_page_url = re.sub(r'page=([0-9]+)\.htm', r'page=%s.htm' % pn, last_page_url) extra_page = get_content('http://list.youku.com' + extra_page_url) videos |= Youku.oset(re.findall(r'href="(http://v\.youku\.com/[^?"]+)', extra_page)) - except Exception as e: - print(e) + except: # Show full list of episodes if match1(url, r'youku\.com/show_page/id_([a-zA-Z0-9=]+)'): ep_id = match1(url, r'youku\.com/show_page/id_([a-zA-Z0-9=]+)')