From 5c2bb3fa3448efeaf014009bd1a1daa43680683e Mon Sep 17 00:00:00 2001 From: Riceball LEE Date: Tue, 15 Dec 2020 21:41:41 +0800 Subject: [PATCH 1/3] feat: add lrts extractor --- README.md | 1 + src/you_get/common.py | 1 + src/you_get/extractors/lrts.py | 70 ++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 src/you_get/extractors/lrts.py diff --git a/README.md b/README.md index ce412afd..6a23faf8 100644 --- a/README.md +++ b/README.md @@ -414,6 +414,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | 酷我音乐 | | | |✓| | 乐视网 | |✓| | | | 荔枝FM | | | |✓| +| 懒人听书 | | | |✓| | 秒拍 | |✓| | | | MioMio弹幕网 | |✓| | | | MissEvan
猫耳FM | | | |✓| diff --git a/src/you_get/common.py b/src/you_get/common.py index 79fc74d1..2b6e05d2 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -76,6 +76,7 @@ SITES = { 'letv' : 'le', 'lizhi' : 'lizhi', 'longzhu' : 'longzhu', + 'lrts' : 'lrts', 'magisto' : 'magisto', 'metacafe' : 'metacafe', 'mgtv' : 'mgtv', diff --git a/src/you_get/extractors/lrts.py b/src/you_get/extractors/lrts.py new file mode 100644 index 00000000..23abab5c --- /dev/null +++ b/src/you_get/extractors/lrts.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +__all__ = ['lrts_download'] + +import logging +from ..common import * + +def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): + html = get_html(url) + matched = re.search(r"/book/(\d+)", url) + if not matched: + raise AssertionError("not found book number: %s" % url) + book_no = matched.group(1) + book_title = book_no + matched = re.search(r"(.*)-(.*)", html) + if matched: + book_title = matched.group(1) + + matched = re.search(r"var totalCount='(\d+)'", html) + if not matched: + raise AssertionError("not found total count in html") + total_count = int(matched.group(1)) + logging.debug('total: %s' % total_count) + page_size = 10 + logging.debug('total page count: %s' % ((total_count // page_size) + 1)) + headers = { + 'Referer': url + } + items = [] + if (total_count > page_size): + for page in range((total_count // page_size) + 1): + page_url = 'http://www.lrts.me/ajax/book/%s/%s/%s' % (book_no, page, page_size) + response_content = json.loads(post_content(page_url, headers)) + if response_content['status'] != 'success': + raise AssertionError("got the page failed: %s" % (page_url)) + data = response_content['data']['data'] + if data: + for i in data: + i['resName'] = parse.unquote(i['resName']) + items.extend(data) + else: + break + + headers = { + 'Referer': 'http://www.lrts.me/playlist' + } + + for item in items: + i_url = 'http://www.lrts.me/ajax/path/4/%s/%s' % (item['fatherResId'], item['resId']) + response_content = json.loads(post_content(i_url, headers)) + # logging.debug(response_content) + if response_content['status'] == 'success' and response_content['data']: + item['ok'] = True + item['url'] = response_content['data'] + + items = list(filter(lambda i: 'ok' in i and i['ok'], items)) + print('Downloading %s: %s count ...' % (book_title, len(items))) + + for item in items: + title = item['resName'] + file_url = item['url'] + # if not file_url: continue + _, _, size = url_info(file_url) + print_info(site_info, title, 'mp3', size) + if not info_only: + download_urls([file_url], title, 'mp3', size, output_dir, merge=merge) + +site_info = "lrts.me" +download = lrts_download +download_playlist = lrts_download From e37836a40bd38feb7f2f616852883578b7153a6c Mon Sep 17 00:00:00 2001 From: Riceball LEE Date: Wed, 16 Dec 2020 10:31:52 +0800 Subject: [PATCH 2/3] feat: add arguments to specify the playlist first, last, page-size options --- src/you_get/common.py | 17 ++++++++++++++++- src/you_get/extractors/lrts.py | 26 ++++++++++++++++++++------ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/you_get/common.py b/src/you_get/common.py index 2b6e05d2..7fe9d51d 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -1557,6 +1557,21 @@ def script_main(download, download_playlist, **kwargs): '-l', '--playlist', action='store_true', help='Prefer to download a playlist' ) + + playlist_grp = parser.add_argument_group('Playlist optional options') + playlist_grp.add_argument( + '-first', '--first', metavar='FIRST', + help='the first number' + ) + playlist_grp.add_argument( + '-last', '--last', metavar='LAST', + help='the last number' + ) + playlist_grp.add_argument( + '-size', '--page-size', metavar='PAGE_SIZE', + help='the page size number' + ) + download_grp.add_argument( '-a', '--auto-rename', action='store_true', default=False, help='Auto rename same name different files' @@ -1674,7 +1689,7 @@ def script_main(download, download_playlist, **kwargs): socket.setdefaulttimeout(args.timeout) try: - extra = {} + extra = {'args': args} if extractor_proxy: extra['extractor_proxy'] = extractor_proxy if stream_id: diff --git a/src/you_get/extractors/lrts.py b/src/you_get/extractors/lrts.py index 23abab5c..d206491d 100644 --- a/src/you_get/extractors/lrts.py +++ b/src/you_get/extractors/lrts.py @@ -4,15 +4,18 @@ __all__ = ['lrts_download'] import logging from ..common import * +from ..util import log, term def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): html = get_html(url) + args = kwargs.get('args') + if not args: args = {} matched = re.search(r"/book/(\d+)", url) if not matched: raise AssertionError("not found book number: %s" % url) book_no = matched.group(1) book_title = book_no - matched = re.search(r"(.*)-(.*)", html) + matched = re.search(r"([^-]*)[-](.*)[,](.*)", html) if matched: book_title = matched.group(1) @@ -20,15 +23,25 @@ def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): if not matched: raise AssertionError("not found total count in html") total_count = int(matched.group(1)) - logging.debug('total: %s' % total_count) + log.i('%s total: %s' % (book_title, total_count)) + first_page = 0 + if ('first' in args and args.first!= None): + first_page = int(args.first) + page_size = 10 - logging.debug('total page count: %s' % ((total_count // page_size) + 1)) + if ('page_size' in args and args.page_size != None): + page_size = int(args.page_size) + last_page = (total_count // page_size) + 1 + if ('last' in args and args.last != None): + last_page = int(args.last) + + log.i('page size is %s, page from %s to %s' % (page_size, first_page, last_page)) headers = { 'Referer': url } items = [] if (total_count > page_size): - for page in range((total_count // page_size) + 1): + for page in range(first_page, last_page): page_url = 'http://www.lrts.me/ajax/book/%s/%s/%s' % (book_no, page, page_size) response_content = json.loads(post_content(page_url, headers)) if response_content['status'] != 'success': @@ -48,13 +61,14 @@ def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): for item in items: i_url = 'http://www.lrts.me/ajax/path/4/%s/%s' % (item['fatherResId'], item['resId']) response_content = json.loads(post_content(i_url, headers)) - # logging.debug(response_content) + logging.debug(response_content) if response_content['status'] == 'success' and response_content['data']: item['ok'] = True item['url'] = response_content['data'] + logging.debug('ok') items = list(filter(lambda i: 'ok' in i and i['ok'], items)) - print('Downloading %s: %s count ...' % (book_title, len(items))) + log.i('Downloading %s: %s count ...' % (book_title, len(items))) for item in items: title = item['resName'] From 9432ce3c71d4f7df1a090a4c8defa76cd9ff06f3 Mon Sep 17 00:00:00 2001 From: Riceball LEE Date: Sun, 27 Dec 2020 15:45:21 +0800 Subject: [PATCH 3/3] fix(lrts): can not download audio for the count less than pagesize --- src/you_get/extractors/lrts.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/src/you_get/extractors/lrts.py b/src/you_get/extractors/lrts.py index d206491d..94d12a25 100644 --- a/src/you_get/extractors/lrts.py +++ b/src/you_get/extractors/lrts.py @@ -40,20 +40,18 @@ def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): 'Referer': url } items = [] - if (total_count > page_size): - for page in range(first_page, last_page): - page_url = 'http://www.lrts.me/ajax/book/%s/%s/%s' % (book_no, page, page_size) - response_content = json.loads(post_content(page_url, headers)) - if response_content['status'] != 'success': - raise AssertionError("got the page failed: %s" % (page_url)) - data = response_content['data']['data'] - if data: - for i in data: - i['resName'] = parse.unquote(i['resName']) - items.extend(data) - else: - break - + for page in range(first_page, last_page): + page_url = 'http://www.lrts.me/ajax/book/%s/%s/%s' % (book_no, page, page_size) + response_content = json.loads(post_content(page_url, headers)) + if response_content['status'] != 'success': + raise AssertionError("got the page failed: %s" % (page_url)) + data = response_content['data']['data'] + if data: + for i in data: + i['resName'] = parse.unquote(i['resName']) + items.extend(data) + else: + break headers = { 'Referer': 'http://www.lrts.me/playlist' } @@ -61,7 +59,6 @@ def lrts_download(url, output_dir='.', merge=True, info_only=False, **kwargs): for item in items: i_url = 'http://www.lrts.me/ajax/path/4/%s/%s' % (item['fatherResId'], item['resId']) response_content = json.loads(post_content(i_url, headers)) - logging.debug(response_content) if response_content['status'] == 'success' and response_content['data']: item['ok'] = True item['url'] = response_content['data']