From 343c410973b6f48f457daa579b07b9ee370da5e6 Mon Sep 17 00:00:00 2001
From: wenLiangcan
Date: Fri, 11 Mar 2016 17:57:47 +0800
Subject: [PATCH] Add huaban.com support.

---
 .gitignore                         |   2 +
 README.md                          |   1 +
 src/you_get/common.py              |   1 +
 src/you_get/extractors/__init__.py |   1 +
 src/you_get/extractors/huaban.py   | 132 +++++++++++++++++++++++++++++
 5 files changed, 137 insertions(+)
 create mode 100644 src/you_get/extractors/huaban.py

diff --git a/.gitignore b/.gitignore
index 354bb109..d22d3afe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,3 +79,5 @@ _*
 *.ts
 *.webm
 *.xml
+/.env
+/.idea
diff --git a/README.md b/README.md
index 2591edbd..abdf39e5 100644
--- a/README.md
+++ b/README.md
@@ -371,6 +371,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
 | **Youku<br/>优酷** | |✓| | |
 | 战旗TV | |✓| | |
 | 央视网 | |✓| | |
+| 花瓣 | | |✓| |
 
 For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.
 
diff --git a/src/you_get/common.py b/src/you_get/common.py
index a76dc5b1..f15481a3 100755
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -24,6 +24,7 @@ SITES = {
     'fun' : 'funshion',
     'google' : 'google',
     'heavy-music' : 'heavymusic',
+    'huaban' : 'huaban',
     'iask' : 'sina',
     'ifeng' : 'ifeng',
     'imgur' : 'imgur',
diff --git a/src/you_get/extractors/__init__.py b/src/you_get/extractors/__init__.py
index 15c0c722..5af9cdd3 100755
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@@ -21,6 +21,7 @@ from .freesound import *
 from .funshion import *
 from .google import *
 from .heavymusic import *
+from .huaban import *
 from .ifeng import *
 from .imgur import *
 from .infoq import *
diff --git a/src/you_get/extractors/huaban.py b/src/you_get/extractors/huaban.py
new file mode 100644
index 00000000..a011ae35
--- /dev/null
+++ b/src/you_get/extractors/huaban.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+
+import json
+import os
+import re
+import sys
+import traceback
+import urllib.parse as urlparse
+
+from ..common import *
+
+__all__ = ['huaban_download']
+
+site_info = '花瓣 (Huaban)'
+
+LIMIT = 100
+
+
+class EnhancedPiecesProgressBar(PiecesProgressBar):
+    BAR_LEN = 40
+
+    def update(self):
+        self.displayed = True
+        bar = '{0:>5}%[{1}] {2}/{3}'.format(
+            '', '=' * self.done_bar + '-' * self.todo_bar,
+            self.current_piece, self.total_pieces)
+        sys.stdout.write('\r' + bar)
+        sys.stdout.flush()
+
+    @property
+    def done_bar(self):
+        return self.BAR_LEN * self.current_piece // self.total_pieces
+
+    @property
+    def todo_bar(self):
+        return self.BAR_LEN - self.done_bar
+
+
+class Board:
+    def __init__(self, title, pins):
+        self.title = title
+        self.pins = pins
+        self.pin_count = len(pins)
+
+
+class Pin:
+    host = 'http://img.hb.aicdn.com/'
+
+    def __init__(self, pin_json):
+        img_file = pin_json['file']
+        self.key = img_file['key']
+        self.url = urlparse.urljoin(self.host, self.key)
+        self.ext = img_file['type'].split('/')[-1]
+
+
+def construct_url(url, **params):
+    param_str = urlparse.urlencode(params)
+    return url + '?' + param_str
+
+
+def extract_json_data(url, **params):
+    url = construct_url(url, **params)
+    html = get_content(url, headers=fake_headers)
+    json_string = match1(html, r'app.page\["board"\] = (.*?});')
+    json_data = json.loads(json_string)
+    return json_data
+
+
+def extract_board_data(url):
+    json_data = extract_json_data(url, limit=LIMIT)
+    pin_list = json_data['pins']
+    title = json_data['title']
+    pin_count = json_data['pin_count']
+    pin_count -= len(pin_list)
+
+    while pin_count > 0:
+        json_data = extract_json_data(url, max=pin_list[-1]['pin_id'],
+                                      limit=LIMIT)
+        pins = json_data['pins']
+        pin_list += pins
+        pin_count -= len(pins)
+
+    return Board(title, list(map(Pin, pin_list)))
+
+
+def get_num_len(num):
+    return len(str(num))
+
+
+def huaban_download_board(url, output_dir, **kwargs):
+    board = extract_board_data(url)
+    output_dir = os.path.join(output_dir, board.title)
+    bar = EnhancedPiecesProgressBar(float('Inf'), board.pin_count)
+
+    print("Site: ", site_info)
+    print("Title: ", board.title)
+    print()
+
+    if dry_run:
+        urls = '\n'.join(map(lambda p: p.url, board.pins))
+        print('Real URLs:\n{}'.format(urls))
+        return
+
+    print('Downloading {} images in {} ...'.format(board.pin_count,
+                                                   board.title))
+    try:
+        os.makedirs(output_dir, exist_ok=True)
+        bar.update()
+        name_len = get_num_len(board.pin_count)
+        for i, pin in enumerate(board.pins):
+            filename = '{0}[{1}].{2}'.format(board.title,
+                                             str(i).zfill(name_len), pin.ext)
+            filepath = os.path.join(output_dir, filename)
+            bar.update_piece(i + 1)
+            url_save(pin.url, filepath, bar, is_part=True, faker=True)
+        bar.done()
+    except KeyboardInterrupt:
+        pass
+    except:
+        traceback.print_exception(*sys.exc_info())
+
+
+def huaban_download(url, output_dir='.', **kwargs):
+    if re.match(r'http://huaban\.com/boards/\d+/', url):
+        huaban_download_board(url, output_dir, **kwargs)
+    else:
+        print('Only board (画板) pages are supported currently')
+        print('ex: http://huaban.com/boards/12345678/')
+
+
+download = huaban_download
+download_playlist = playlist_not_supported("huaban")
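
A minimal way to exercise the new extractor is to call huaban_download() directly.
The sketch below is only illustrative: it assumes the patched you_get package is on
the import path, reuses the placeholder board URL from the extractor's own usage hint
(board id 12345678 is not a real board), and picks an arbitrary output directory name.

    from you_get.extractors.huaban import huaban_download

    # Downloads every pin of the board into ./downloads/<board title>/;
    # any other huaban.com URL only prints the "boards only" usage hint.
    huaban_download('http://huaban.com/boards/12345678/', output_dir='downloads')

With the new 'huaban' entry in SITES, the usual command line form
(you-get http://huaban.com/boards/12345678/) should dispatch to the same module.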