From 343c410973b6f48f457daa579b07b9ee370da5e6 Mon Sep 17 00:00:00 2001 From: wenLiangcan Date: Fri, 11 Mar 2016 17:57:47 +0800 Subject: [PATCH 1/4] Add huaban.com support. --- .gitignore | 2 + README.md | 1 + src/you_get/common.py | 1 + src/you_get/extractors/__init__.py | 1 + src/you_get/extractors/huaban.py | 130 +++++++++++++++++++++++++++++ 5 files changed, 135 insertions(+) create mode 100644 src/you_get/extractors/huaban.py diff --git a/.gitignore b/.gitignore index 354bb109..d22d3afe 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,5 @@ _* *.ts *.webm *.xml +/.env +/.idea diff --git a/README.md b/README.md index 2591edbd..abdf39e5 100644 --- a/README.md +++ b/README.md @@ -371,6 +371,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the | **Youku
优酷** | |✓| | | | 战旗TV | |✓| | | | 央视网 | |✓| | | +| 花瓣 | | |✓| | For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page. diff --git a/src/you_get/common.py b/src/you_get/common.py index a76dc5b1..f15481a3 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -24,6 +24,7 @@ SITES = { 'fun' : 'funshion', 'google' : 'google', 'heavy-music' : 'heavymusic', + 'huaban' : 'huaban', 'iask' : 'sina', 'ifeng' : 'ifeng', 'imgur' : 'imgur', diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py index 15c0c722..5af9cdd3 100755 --- a/src/you_get/extractors/__init__.py +++ b/src/you_get/extractors/__init__.py @@ -21,6 +21,7 @@ from .freesound import * from .funshion import * from .google import * from .heavymusic import * +from .huaban import * from .ifeng import * from .imgur import * from .infoq import * diff --git a/src/you_get/extractors/huaban.py b/src/you_get/extractors/huaban.py new file mode 100644 index 00000000..a011ae35 --- /dev/null +++ b/src/you_get/extractors/huaban.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python + +import json +import os +import re +import traceback +import urllib.parse as urlparse + +from ..common import * + +__all__ = ['huaban_download'] + +site_info = '花瓣 (Huaban)' + +LIMIT = 100 + + +class EnhancedPiecesProgressBar(PiecesProgressBar): + BAR_LEN = 40 + + def update(self): + self.displayed = True + bar = '{0:>5}%[{1}] {2}/{3}'.format( + '', '=' * self.done_bar + '-' * self.todo_bar, + self.current_piece, self.total_pieces) + sys.stdout.write('\r' + bar) + sys.stdout.flush() + + @property + def done_bar(self): + return self.BAR_LEN // self.total_pieces * self.current_piece + + @property + def todo_bar(self): + return self.BAR_LEN - self.done_bar + + +class Board: + def __init__(self, title, pins): + self.title = title + self.pins = pins + self.pin_count = len(pins) + + +class Pin: + host = 'http://img.hb.aicdn.com/' + + def 
__init__(self, pin_json): + img_file = pin_json['file'] + self.key = img_file['key'] + self.url = urlparse.urljoin(self.host, self.key) + self.ext = img_file['type'].split('/')[-1] + + +def construct_url(url, **params): + param_str = urlparse.urlencode(params) + return url + '?' + param_str + + +def extract_json_data(url, **params): + url = construct_url(url, **params) + html = get_content(url, headers=fake_headers) + json_string = match1(html, r'app.page\["board"\] = (.*?});') + json_data = json.loads(json_string) + return json_data + + +def extract_board_data(url): + json_data = extract_json_data(url, limit=LIMIT) + pin_list = json_data['pins'] + title = json_data['title'] + pin_count = json_data['pin_count'] + pin_count -= len(pin_list) + + while pin_count > 0: + json_data = extract_json_data(url, max=pin_list[-1]['pin_id'], + limit=LIMIT) + pins = json_data['pins'] + pin_list += pins + pin_count -= len(pins) + + return Board(title, list(map(Pin, pin_list))) + + +def get_num_len(num): + return len(str(num)) + + +def huaban_download_board(url, output_dir, **kwargs): + board = extract_board_data(url) + output_dir = os.path.join(output_dir, board.title) + bar = EnhancedPiecesProgressBar(float('Inf'), board.pin_count) + + print("Site: ", site_info) + print("Title: ", board.title) + print() + + if dry_run: + urls = '\n'.join(map(lambda p: p.url, board.pins)) + print('Real URLs:\n{}'.format(urls)) + return + + print('Downloading {} images in {} ...'.format(board.pin_count, + board.title)) + try: + bar.update() + name_len = get_num_len(board.pin_count) + for i, pin in enumerate(board.pins): + filename = '{0}[{1}].{2}'.format(board.title, + str(i).zfill(name_len), pin.ext) + filepath = os.path.join(output_dir, filename) + bar.update_piece(i + 1) + url_save(pin.url, filepath, bar, is_part=True, faker=True) + bar.done() + except KeyboardInterrupt: + pass + except: + traceback.print_exception(*sys.exc_info()) + + +def huaban_download(url, output_dir='.', **kwargs): + if 
re.match(r'http://huaban\.com/boards/\d+/', url): + huaban_download_board(url, output_dir, **kwargs) + else: + print('Only board (画板) pages are supported currently') + print('ex: http://huaban.com/boards/12345678/') + + +download = huaban_download +download_playlist = playlist_not_supported("huaban") From 911794a3725c5bdda65b02470f7845b33587ddbe Mon Sep 17 00:00:00 2001 From: wenLiangcan Date: Sat, 12 Mar 2016 09:52:53 +0800 Subject: [PATCH 2/4] Use pin id as output filename. --- src/you_get/extractors/huaban.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/you_get/extractors/huaban.py b/src/you_get/extractors/huaban.py index a011ae35..efb082c1 100644 --- a/src/you_get/extractors/huaban.py +++ b/src/you_get/extractors/huaban.py @@ -47,8 +47,8 @@ class Pin: def __init__(self, pin_json): img_file = pin_json['file'] - self.key = img_file['key'] - self.url = urlparse.urljoin(self.host, self.key) + self.id = pin_json['pin_id'] + self.url = urlparse.urljoin(self.host, img_file['key']) self.ext = img_file['type'].split('/')[-1] @@ -106,8 +106,7 @@ def huaban_download_board(url, output_dir, **kwargs): bar.update() name_len = get_num_len(board.pin_count) for i, pin in enumerate(board.pins): - filename = '{0}[{1}].{2}'.format(board.title, - str(i).zfill(name_len), pin.ext) + filename = '{0}.{1}'.format(pin.id, pin.ext) filepath = os.path.join(output_dir, filename) bar.update_piece(i + 1) url_save(pin.url, filepath, bar, is_part=True, faker=True) From ce10df775cf3ee8a438ab93055a9a5c70fa6fc42 Mon Sep 17 00:00:00 2001 From: wenLiangcan Date: Sat, 12 Mar 2016 09:54:50 +0800 Subject: [PATCH 3/4] Fix progressbar animation. 
--- src/you_get/extractors/huaban.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/you_get/extractors/huaban.py b/src/you_get/extractors/huaban.py index efb082c1..a8283a72 100644 --- a/src/you_get/extractors/huaban.py +++ b/src/you_get/extractors/huaban.py @@ -3,6 +3,7 @@ import json import os import re +import math import traceback import urllib.parse as urlparse @@ -28,7 +29,7 @@ class EnhancedPiecesProgressBar(PiecesProgressBar): @property def done_bar(self): - return self.BAR_LEN // self.total_pieces * self.current_piece + return math.ceil(self.BAR_LEN / self.total_pieces * self.current_piece) @property def todo_bar(self): @@ -82,10 +83,6 @@ def extract_board_data(url): return Board(title, list(map(Pin, pin_list))) -def get_num_len(num): - return len(str(num)) - - def huaban_download_board(url, output_dir, **kwargs): board = extract_board_data(url) output_dir = os.path.join(output_dir, board.title) @@ -104,7 +101,6 @@ def huaban_download_board(url, output_dir, **kwargs): board.title)) try: bar.update() - name_len = get_num_len(board.pin_count) for i, pin in enumerate(board.pins): filename = '{0}.{1}'.format(pin.id, pin.ext) filepath = os.path.join(output_dir, filename) From 052a6410430f024cfbc78cfbb08ba9fb52429d6f Mon Sep 17 00:00:00 2001 From: wenLiangcan Date: Tue, 15 Mar 2016 12:05:02 +0800 Subject: [PATCH 4/4] [Huaban] Refactoring. Remove customized file handling and logging code and make use of `download_urls()` and `print_info()`.
--- src/you_get/extractors/huaban.py | 52 ++++---------------------------- 1 file changed, 6 insertions(+), 46 deletions(-) diff --git a/src/you_get/extractors/huaban.py b/src/you_get/extractors/huaban.py index a8283a72..8acf938b 100644 --- a/src/you_get/extractors/huaban.py +++ b/src/you_get/extractors/huaban.py @@ -16,26 +16,6 @@ site_info = '花瓣 (Huaban)' LIMIT = 100 -class EnhancedPiecesProgressBar(PiecesProgressBar): - BAR_LEN = 40 - - def update(self): - self.displayed = True - bar = '{0:>5}%[{1}] {2}/{3}'.format( - '', '=' * self.done_bar + '-' * self.todo_bar, - self.current_piece, self.total_pieces) - sys.stdout.write('\r' + bar) - sys.stdout.flush() - - @property - def done_bar(self): - return math.ceil(self.BAR_LEN / self.total_pieces * self.current_piece) - - @property - def todo_bar(self): - return self.BAR_LEN - self.done_bar - - class Board: def __init__(self, title, pins): self.title = title @@ -48,7 +28,7 @@ class Pin: def __init__(self, pin_json): img_file = pin_json['file'] - self.id = pin_json['pin_id'] + self.id = str(pin_json['pin_id']) self.url = urlparse.urljoin(self.host, img_file['key']) self.ext = img_file['type'].split('/')[-1] @@ -84,33 +64,13 @@ def extract_board_data(url): def huaban_download_board(url, output_dir, **kwargs): + kwargs['merge'] = False board = extract_board_data(url) output_dir = os.path.join(output_dir, board.title) - bar = EnhancedPiecesProgressBar(float('Inf'), board.pin_count) - - print("Site: ", site_info) - print("Title: ", board.title) - print() - - if dry_run: - urls = '\n'.join(map(lambda p: p.url, board.pins)) - print('Real URLs:\n{}'.format(urls)) - return - - print('Downloading {} images in {} ...'.format(board.pin_count, - board.title)) - try: - bar.update() - for i, pin in enumerate(board.pins): - filename = '{0}.{1}'.format(pin.id, pin.ext) - filepath = os.path.join(output_dir, filename) - bar.update_piece(i + 1) - url_save(pin.url, filepath, bar, is_part=True, faker=True) - bar.done() - except 
KeyboardInterrupt: - pass - except: - traceback.print_exception(*sys.exc_info()) + print_info(site_info, board.title, 'jpg', float('Inf')) + for pin in board.pins: + download_urls([pin.url], pin.id, pin.ext, float('Inf'), + output_dir=output_dir, faker=True, **kwargs) def huaban_download(url, output_dir='.', **kwargs):