[lizhi] overhaul

The Lizhi extractor has stopped working because of two major changes on the site: (1) the URL format changed, so URL paths no longer contain "#/"; (2) the /api/audio/{radio_id}/{audio_id} API now returns 404. This commit rewrites the extractor on top of the /api/radio_audios API.
2025-01-24 14:05:01 +03:00 · 2016-12-04 19:36:17 -05:00 · 2016-12-04 19:36:17 -05:00 · 606e0a786e
commit 606e0a786e
parent 61d9bf124e
1 changed files with 46 additions and 28 deletions
--- a/src/you_get/extractors/lizhi.py
+++ b/src/you_get/extractors/lizhi.py
@@ -4,37 +4,55 @@ __all__ = ['lizhi_download']
 import json
 from ..common import *

-def lizhi_download_playlist(url, output_dir = '.', merge = True, info_only = False, **kwargs):
-    # like this http://www.lizhi.fm/#/31365/
-    #api desc: s->start l->length band->some radio
-    #http://www.lizhi.fm/api/radio_audios?s=0&l=100&band=31365
-    band_id = match1(url,r'#/(\d+)')
-    #try to get a considerable large l to reduce html parsing task.
-    api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band='+band_id
-    content_json = json.loads(get_content(api_url))
-    for sound in content_json:
-        title = sound["name"]
-        res_url = sound["url"]
-        songtype, ext, size = url_info(res_url,faker=True)
-        print_info(site_info, title, songtype, size)
-        if not info_only:
-            #no referer no speed!
-            download_urls([res_url], title, ext, size, output_dir, merge=merge ,refer = 'http://www.lizhi.fm',faker=True)    
-    pass
+# radio_id: e.g. 549759 from http://www.lizhi.fm/549759/
+#
+# Returns a list of tuples (audio_id, title, url) for each episode
+# (audio) in the radio playlist. url is the direct link to the audio
+# file.
+def lizhi_extract_playlist_info(radio_id):
+    # /api/radio_audios API parameters:
+    #
+    # - s: starting episode
+    # - l: count (per page)
+    # - band: radio_id
+    #
+    # We use l=65535 for poor man's pagination (that is, no pagination
+    # at all -- hope all fits on a single page).
+    #
+    # TODO: Use /api/radio?band={radio_id} to get number of episodes
+    # (au_cnt), then handle pagination properly.
+    api_url = 'http://www.lizhi.fm/api/radio_audios?s=0&l=65535&band=%s' % radio_id
+    api_response = json.loads(get_content(api_url))
+    return [(ep['id'], ep['name'], ep['url']) for ep in api_response]

-def lizhi_download(url, output_dir = '.', merge = True, info_only = False, **kwargs):
-    # url like http://www.lizhi.fm/#/549759/18864883431656710
-    api_id = match1(url,r'#/(\d+/\d+)')
-    api_url = 'http://www.lizhi.fm/api/audio/'+api_id
-    content_json = json.loads(get_content(api_url))
-    title = content_json["audio"]["name"]
-    res_url = content_json["audio"]["url"]
-    songtype, ext, size = url_info(res_url,faker=True)
-    print_info(site_info, title, songtype, size)
+def lizhi_download_audio(audio_id, title, url, output_dir='.', info_only=False):
+    filetype, ext, size = url_info(url)
+    print_info(site_info, title, filetype, size)
    if not info_only:
-        #no referer no speed!
-        download_urls([res_url], title, ext, size, output_dir, merge=merge ,refer = 'http://www.lizhi.fm',faker=True)    
+        download_urls([url], title, ext, size, output_dir=output_dir)

+def lizhi_download_playlist(url, output_dir='.', info_only=False, **kwargs):
+    # Sample URL: http://www.lizhi.fm/549759/
+    radio_id = match1(url,r'/(\d+)')
+    if not radio_id:
+        raise NotImplementedError('%s not supported' % url)
+    for audio_id, title, url in lizhi_extract_playlist_info(radio_id):
+        lizhi_download_audio(audio_id, title, url, output_dir=output_dir, info_only=info_only)
+
+def lizhi_download(url, output_dir='.', info_only=False, **kwargs):
+    # Sample URL: http://www.lizhi.fm/549759/18864883431656710/
+    m = re.search(r'/(?P<radio_id>\d+)/(?P<audio_id>\d+)', url)
+    if not m:
+        raise NotImplementedError('%s not supported' % url)
+    radio_id = m.group('radio_id')
+    audio_id = m.group('audio_id')
+    # Look for the audio_id among the full list of episodes
+    for aid, title, url in lizhi_extract_playlist_info(radio_id):
+        if aid == audio_id:
+            lizhi_download_audio(audio_id, title, url, output_dir=output_dir, info_only=info_only)
+            break
+    else:
+        raise NotImplementedError('Audio #%s not found in playlist #%s' % (audio_id, radio_id))

 site_info = "lizhi.fm"
 download = lizhi_download