#!/usr/bin/env python
"""Extractor for image boards hosted on Huaban (花瓣).

Only board pages (``http://huaban.com/boards/<digits>/``) are handled: every
pin listed for the board is downloaded into a directory named after the
board's title.
"""

# This module was reconstructed from a whitespace-collapsed commit diff that
# adds ``src/you_get/extractors/huaban.py``.  The same commit also:
#   * .gitignore                          - ignores ``/.env`` and ``/.idea``
#   * README.md                           - adds the supported-sites table row
#                                           ``| 花瓣 | | |✓| |``
#   * src/you_get/common.py               - adds ``'huaban' : 'huaban'`` to SITES
#   * src/you_get/extractors/__init__.py  - adds ``from .huaban import *``

import json
import os
import re
import math
import traceback
import urllib.parse as urlparse

from ..common import *

__all__ = ['huaban_download']

site_info = '花瓣 (Huaban)'

# Sent as the ``limit`` query parameter on every board request
# (presumably the page size -- confirm against the site's behaviour).
LIMIT = 100


class Board:
    """A board's title together with the pins collected for it."""

    def __init__(self, title, pins):
        """Store *title* and *pins*; ``pin_count`` is ``len(pins)``."""
        self.title = title
        self.pins = pins
        self.pin_count = len(pins)


class Pin:
    """A single downloadable image, built from one pin JSON object."""

    # Base URL that a pin's file key is joined onto.
    host = 'http://img.hb.aicdn.com/'

    def __init__(self, pin_json):
        """Read ``pin_id`` and ``file`` (``key``, ``type``) from *pin_json*.

        Raises:
            KeyError: if any of those keys is missing.
        """
        img_file = pin_json['file']
        self.id = str(pin_json['pin_id'])
        self.url = urlparse.urljoin(self.host, img_file['key'])
        # 'image/jpeg' -> 'jpeg': the last path component of the type string.
        self.ext = img_file['type'].split('/')[-1]


def construct_url(url, **params):
    """Return *url* with *params* appended as a URL-encoded query string.

    The separator is chosen from what *url* already contains, so a URL that
    already has a query string gets ``&`` rather than a second ``?``.
    With no *params* the URL is returned unchanged.
    """
    if not params:
        return url
    if '?' not in url:
        separator = '?'
    elif url.endswith(('?', '&')):
        separator = ''
    else:
        separator = '&'
    return url + separator + urlparse.urlencode(params)


def extract_json_data(url, **params):
    """Fetch *url* (with *params* as query string) and parse the board JSON.

    The page is searched for the ``app.page["board"] = {...};`` assignment
    and the object literal is decoded with ``json.loads``.

    Raises:
        ValueError: if the assignment is not found in the page, or the
            captured text is not valid JSON.
    """
    url = construct_url(url, **params)
    html = get_content(url, headers=fake_headers)
    json_string = match1(html, r'app\.page\["board"\] = (.*?});')
    # match1 is expected to yield nothing when the pattern is absent; fail
    # with a clear message instead of handing that to json.loads.
    if not json_string:
        raise ValueError('Huaban: board data not found in page: ' + url)
    return json.loads(json_string)


def extract_board_data(url):
    """Collect every pin of the board at *url* and return a ``Board``.

    The first request supplies the title, the total ``pin_count`` and the
    first batch of pins.  Further batches are requested with ``max`` set to
    the last pin id seen, until the reported total has been reached.
    """
    json_data = extract_json_data(url, limit=LIMIT)
    pin_list = json_data['pins']
    title = json_data['title']
    remaining = json_data['pin_count'] - len(pin_list)

    # ``pin_list`` must be non-empty to supply a ``max`` cursor.
    while remaining > 0 and pin_list:
        json_data = extract_json_data(url, max=pin_list[-1]['pin_id'],
                                      limit=LIMIT)
        pins = json_data['pins']
        if not pins:
            # The reported total can exceed what is actually served; without
            # this guard the counter would never reach zero.
            break
        pin_list += pins
        remaining -= len(pins)

    return Board(title, [Pin(pin_json) for pin_json in pin_list])


def huaban_download_board(url, output_dir, **kwargs):
    """Download every pin of the board at *url* into a per-board directory.

    Files go to ``output_dir/<board title>``; each pin is saved under its id
    with the extension taken from the pin's file type.  ``merge`` is forced
    to ``False`` in *kwargs* before they are passed to ``download_urls``.
    """
    kwargs['merge'] = False
    board = extract_board_data(url)
    output_dir = os.path.join(output_dir, board.title)
    # Sizes are not known up front, so infinity is reported.
    print_info(site_info, board.title, 'jpg', float('Inf'))
    for pin in board.pins:
        download_urls([pin.url], pin.id, pin.ext, float('Inf'),
                      output_dir=output_dir, faker=True, **kwargs)


def huaban_download(url, output_dir='.', **kwargs):
    """Entry point: download a Huaban board, or explain what is supported.

    Board URLs of the form ``http(s)://huaban.com/boards/<digits>/`` are
    downloaded; for anything else a short usage hint is printed.
    """
    if re.match(r'https?://huaban\.com/boards/\d+/', url):
        huaban_download_board(url, output_dir, **kwargs)
    else:
        print('Only board (画板) pages are supported currently')
        print('ex: http://huaban.com/boards/12345678/')


download = huaban_download
download_playlist = playlist_not_supported("huaban")