Add huaban.com support.

2025-01-23 13:35:16 +03:00 · 2016-03-11 17:57:47 +08:00 · 2016-03-11 17:57:47 +08:00 · 343c410973
commit 343c410973
parent b97feb8d83
5 changed files with 135 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -79,3 +79,5 @@ _*
 *.ts
 *.webm
 *.xml
 /.env
 /.idea
--- a/README.md
+++ b/README.md
@ -371,6 +371,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
 | **Youku<br/>优酷** | <http://www.youku.com/> |✓| | |
 | 战旗TV   | <http://www.zhanqi.tv/lives>   |✓| | |
 | 央视网   | <http://www.cntv.cn/>          |✓| | |
 | 花瓣     | <http://huaban.com/>           | |✓| |
 For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.
--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@ -24,6 +24,7 @@ SITES = {
    'fun'              : 'funshion',
    'google'           : 'google',
    'heavy-music'      : 'heavymusic',
    'huaban'           : 'huaban',
    'iask'             : 'sina',
    'ifeng'            : 'ifeng',
    'imgur'            : 'imgur',
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@ -21,6 +21,7 @@ from .freesound import *
 from .funshion import *
 from .google import *
 from .heavymusic import *
 from .huaban import *
 from .ifeng import *
 from .imgur import *
 from .infoq import *
--- a/src/you_get/extractors/huaban.py
+++ b/src/you_get/extractors/huaban.py
@ -0,0 +1,130 @@
 #!/usr/bin/env python
 import json
 import os
 import re
 import traceback
 import urllib.parse as urlparse
 from ..common import *
 # Public API of this module: only the download entry point is exported.
 __all__ = ['huaban_download']
 # Human-readable site name, printed on the "Site:" line of the board summary.
 site_info = '花瓣 (Huaban)'
 # Value sent as the `limit` query parameter on every board-page request
 # (presumably the maximum number of pins returned per response).
 LIMIT = 100
 class EnhancedPiecesProgressBar(PiecesProgressBar):
    """Piece-counting progress bar that also draws a fixed-width gauge.

    The gauge is ``BAR_LEN`` characters wide: ``=`` for the completed share
    and ``-`` for the rest, followed by ``current_piece/total_pieces``.
    ``current_piece`` and ``total_pieces`` are not set in this class; they
    are presumably maintained by the ``PiecesProgressBar`` base class.
    """

    # Width of the gauge in characters.
    BAR_LEN = 40

    def update(self):
        """Redraw the progress line in place on stdout."""
        self.displayed = True
        # The leading five-character field before '%' is intentionally
        # formatted from an empty string, i.e. it renders as blanks.
        bar = '{0:>5}%[{1}] {2}/{3}'.format(
            '', '=' * self.done_bar + '-' * self.todo_bar,
            self.current_piece, self.total_pieces)
        # '\r' returns to the start of the line so the bar overwrites itself.
        sys.stdout.write('\r' + bar)
        sys.stdout.flush()

    @property
    def done_bar(self):
        """Number of gauge cells to draw as completed (0..BAR_LEN)."""
        total = self.total_pieces
        if total <= 0:
            # Nothing to download: show an empty gauge instead of dividing
            # by zero.
            return 0
        # Multiply before dividing. Dividing first truncates to 0 whenever
        # there are more pieces than gauge cells, and never reaches BAR_LEN
        # when the piece count does not divide it evenly.
        filled = self.BAR_LEN * self.current_piece // total
        return max(0, min(self.BAR_LEN, filled))

    @property
    def todo_bar(self):
        """Number of gauge cells to draw as still pending."""
        return self.BAR_LEN - self.done_bar
 class Board:
    """A board: its title, its pins, and the number of pins it holds."""

    def __init__(self, title, pins):
        """Store *title* and *pins* and cache ``len(pins)`` as ``pin_count``."""
        # Count once up front so callers can read the size as an attribute.
        self.pin_count = len(pins)
        self.pins = pins
        self.title = title
 class Pin:
    """A single pinned image, built from one entry of the board JSON.

    Attributes set by the constructor:
        key: value of ``pin_json['file']['key']``.
        url: ``key`` joined onto ``host``.
        ext: the part of ``pin_json['file']['type']`` after the last ``/``
            (the whole value when it contains no ``/``).
    """

    # Base URL that image keys are resolved against.
    host = 'http://img.hb.aicdn.com/'

    def __init__(self, pin_json):
        """Read the ``file`` entry of *pin_json* and derive key, url and ext."""
        file_info = pin_json['file']
        image_key = file_info['key']
        self.key = image_key
        self.url = urlparse.urljoin(self.host, image_key)
        content_type = file_info['type']
        # e.g. 'image/jpeg' -> 'jpeg'
        self.ext = content_type.rpartition('/')[2]
 def construct_url(url, **params):
    """Return *url* with *params* appended as a URL-encoded query string.

    Args:
        url: Base URL; it may already carry a query string.
        **params: Query parameters to append, encoded with ``urlencode``.

    Returns:
        The combined URL. When no parameters are given, *url* is returned
        unchanged rather than with a dangling ``?``.
    """
    if not params:
        return url
    query = urlparse.urlencode(params)
    if '?' not in url:
        return url + '?' + query
    # The URL already has a query part: extend it instead of adding a
    # second '?', which would produce a malformed URL.
    if url.endswith(('?', '&')):
        return url + query
    return url + '&' + query
 def extract_json_data(url, **params):
    """Fetch a board page and return the board JSON embedded in its HTML.

    The page is requested with *params* as its query string, and the value
    assigned to ``app.page["board"]`` in the returned markup is parsed as
    JSON.

    Args:
        url: Board page URL.
        **params: Query parameters forwarded to ``construct_url``.

    Returns:
        The decoded JSON value (the callers in this module index it like a
        dict with ``pins``, ``title`` and ``pin_count`` keys).

    Raises:
        ValueError: If the page does not contain the board assignment, or
            the captured text is not valid JSON.
    """
    page_url = construct_url(url, **params)
    html = get_content(page_url, headers=fake_headers)
    json_string = match1(html, r'app.page\["board"\] = (.*?});')
    # NOTE(review): assumes match1 yields a falsy value when nothing matches;
    # fail with a clear message instead of an opaque error from json.loads.
    if not json_string:
        raise ValueError('No board data found in page: {}'.format(page_url))
    return json.loads(json_string)
 def extract_board_data(url):
    """Collect every pin of the board at *url* and return it as a ``Board``.

    The first request asks for up to ``LIMIT`` pins. While the board's
    reported ``pin_count`` exceeds the number of pins collected so far,
    further pages are requested with ``max`` set to the ``pin_id`` of the
    last pin already collected.

    Args:
        url: Board page URL.

    Returns:
        A ``Board`` holding the title and one ``Pin`` per collected entry.
    """
    json_data = extract_json_data(url, limit=LIMIT)
    pin_list = json_data['pins']
    title = json_data['title']
    remaining = json_data['pin_count'] - len(pin_list)
    # Stop when nothing has been collected at all: there is then no last
    # pin to page from.
    while remaining > 0 and pin_list:
        page = extract_json_data(url, max=pin_list[-1]['pin_id'],
                                 limit=LIMIT)
        pins = page['pins']
        if not pins:
            # The reported pin_count can exceed what the site actually
            # returns; without this check the same request would repeat
            # forever.
            break
        pin_list.extend(pins)
        remaining -= len(pins)
    return Board(title, [Pin(pin_json) for pin_json in pin_list])
 def get_num_len(num):
    """Return how many characters ``str(num)`` occupies."""
    rendered = str(num)
    return len(rendered)
 def huaban_download_board(url, output_dir, **kwargs):
    """Download every image of the board at *url*.

    Images are saved under ``output_dir/<board title>/`` and named
    ``<board title>[<index>].<ext>``, where the index starts at zero and is
    zero-padded to the width of the pin count. When the module-level
    ``dry_run`` flag is set, only the image URLs are printed.

    Args:
        url: Board page URL.
        output_dir: Directory under which the board's folder is placed.
        **kwargs: Accepted for interface compatibility; not used here.
    """
    board = extract_board_data(url)
    output_dir = os.path.join(output_dir, board.title)
    bar = EnhancedPiecesProgressBar(float('Inf'), board.pin_count)

    print("Site:      ", site_info)
    print("Title:     ", board.title)
    print()

    if dry_run:
        urls = '\n'.join(pin.url for pin in board.pins)
        print('Real URLs:\n{}'.format(urls))
        return

    print('Downloading {} images in {} ...'.format(board.pin_count,
                                                   board.title))
    if not board.pins:
        # Nothing to fetch; also avoids drawing a progress bar over zero
        # pieces.
        return

    try:
        bar.update()
        name_len = get_num_len(board.pin_count)
        for i, pin in enumerate(board.pins):
            filename = '{0}[{1}].{2}'.format(board.title,
                                             str(i).zfill(name_len), pin.ext)
            filepath = os.path.join(output_dir, filename)
            bar.update_piece(i + 1)
            url_save(pin.url, filepath, bar, is_part=True, faker=True)
        bar.done()
    except KeyboardInterrupt:
        # Deliberate: a user interrupt ends the download quietly.
        pass
    except Exception:
        # Best effort: report the failure without raising. Catching
        # Exception (not a bare except) lets SystemExit propagate.
        traceback.print_exc()
 def huaban_download(url, output_dir='.', **kwargs):
    """Entry point: download the board at *url*, or explain what is supported.

    Only board pages (``http(s)://huaban.com/boards/<digits>/``) are
    handled; any other URL just prints a short usage hint.

    Args:
        url: Page URL given by the user.
        output_dir: Directory under which downloads are placed.
        **kwargs: Forwarded unchanged to ``huaban_download_board``.
    """
    # Accept both schemes: an https board URL is still a board URL and
    # should not be answered with the "only boards are supported" hint.
    if re.match(r'https?://huaban\.com/boards/\d+/', url):
        huaban_download_board(url, output_dir, **kwargs)
    else:
        print('Only board (画板) pages are supported currently')
        print('ex: http://huaban.com/boards/12345678/')
 # Module-level alias for the single-URL download entry point.
 download = huaban_download
 # `playlist_not_supported` is not defined in this file (it arrives through the
 # `..common` star import); presumably it yields a handler that reports
 # playlists as unsupported for this site — confirm against common.
 download_playlist = playlist_not_supported("huaban")