Add huaban.com support.

This commit is contained in:
wenLiangcan 2016-03-11 17:57:47 +08:00
parent b97feb8d83
commit 343c410973
5 changed files with 135 additions and 0 deletions

2
.gitignore vendored
View File

@ -79,3 +79,5 @@ _*
*.ts *.ts
*.webm *.webm
*.xml *.xml
/.env
/.idea

View File

@ -371,6 +371,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
| **Youku<br/>优酷** | <http://www.youku.com/> |✓| | |
| 战旗TV | <http://www.zhanqi.tv/lives> |✓| | |
| 央视网 | <http://www.cntv.cn/> |✓| | |
| 花瓣 | <http://huaban.com/> | |✓| |

For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.

View File

@ -24,6 +24,7 @@ SITES = {
'fun' : 'funshion', 'fun' : 'funshion',
'google' : 'google', 'google' : 'google',
'heavy-music' : 'heavymusic', 'heavy-music' : 'heavymusic',
'huaban' : 'huaban',
'iask' : 'sina', 'iask' : 'sina',
'ifeng' : 'ifeng', 'ifeng' : 'ifeng',
'imgur' : 'imgur', 'imgur' : 'imgur',

View File

@ -21,6 +21,7 @@ from .freesound import *
from .funshion import * from .funshion import *
from .google import * from .google import *
from .heavymusic import * from .heavymusic import *
from .huaban import *
from .ifeng import * from .ifeng import *
from .imgur import * from .imgur import *
from .infoq import * from .infoq import *

View File

@ -0,0 +1,130 @@
#!/usr/bin/env python
import json
import os
import re
import traceback
import urllib.parse as urlparse
from ..common import *
# Names exported by `from ... import *`: only the download entry point.
__all__ = ['huaban_download']
# Human-readable site name; printed on the "Site:" line before a download.
site_info = '花瓣 (Huaban)'
# Value sent as the `limit` query parameter on every board-page request.
LIMIT = 100
class EnhancedPiecesProgressBar(PiecesProgressBar):
    """Piece-counting progress bar that also draws a fixed-width text bar.

    The bar is BAR_LEN characters wide: '=' for finished pieces and '-' for
    the rest, followed by "current/total".

    NOTE(review): `current_piece`, `total_pieces` and `displayed` are not set
    in this class; presumably they are maintained by PiecesProgressBar.
    """

    # Width, in characters, of the drawn bar.
    BAR_LEN = 40

    def update(self):
        """Redraw the bar in place on the current terminal line."""
        self.displayed = True
        bar = '{0:>5}%[{1}] {2}/{3}'.format(
            '', '=' * self.done_bar + '-' * self.todo_bar,
            self.current_piece, self.total_pieces)
        # '\r' returns to the start of the line so each redraw overwrites
        # the previous one.
        sys.stdout.write('\r' + bar)
        sys.stdout.flush()

    @property
    def done_bar(self):
        """Number of '=' characters to draw, always within 0..BAR_LEN."""
        if self.total_pieces <= 0:
            # Nothing to download: avoid dividing by zero.
            return 0
        # Multiply before dividing; dividing first floors to 0 whenever
        # total_pieces exceeds BAR_LEN, which left the bar permanently empty.
        filled = self.BAR_LEN * self.current_piece // self.total_pieces
        return max(0, min(self.BAR_LEN, filled))

    @property
    def todo_bar(self):
        """Number of '-' characters to draw (the unfinished remainder)."""
        return self.BAR_LEN - self.done_bar
class Board:
    """Plain container for a board: its title, its pins and how many there are."""

    def __init__(self, title, pins):
        """Store *title* and *pins*; `pin_count` is the length of *pins*."""
        self.title, self.pins = title, pins
        self.pin_count = len(self.pins)
class Pin:
    """One image of a board, built from the pin's JSON object.

    Attributes:
        key: the value of pin_json['file']['key'].
        url: `key` joined onto `host`.
        ext: the text after the last '/' of pin_json['file']['type']
            (e.g. 'image/jpeg' -> 'jpeg').
    """

    # Base URL that a pin's key is joined onto to form the image URL.
    host = 'http://img.hb.aicdn.com/'

    def __init__(self, pin_json):
        file_info = pin_json['file']
        self.key = file_info['key']
        self.url = urlparse.urljoin(self.host, self.key)
        # rpartition keeps the whole string when there is no '/'.
        _, _, self.ext = file_info['type'].rpartition('/')
def construct_url(url, **params):
    """Return *url* with *params* added to its query string.

    Args:
        url: the base URL; it may already carry a query string or fragment.
        **params: query parameters, URL-encoded in the order given.

    Returns:
        The URL with the encoded parameters appended to any existing query
        (joined with '&') and placed before any fragment. With no
        parameters, *url* is returned unchanged.
    """
    if not params:
        # Avoid producing a dangling '?'.
        return url
    parts = urlparse.urlsplit(url)
    extra = urlparse.urlencode(params)
    # Extend an existing query rather than starting a second '?'.
    query = parts.query + '&' + extra if parts.query else extra
    return urlparse.urlunsplit(parts._replace(query=query))
def extract_json_data(url, **params):
    """Fetch a board page and return the board JSON embedded in it.

    Args:
        url: the board page URL.
        **params: query parameters added to *url* before fetching.

    Returns:
        The decoded JSON object assigned to `app.page["board"]` in the page.

    Raises:
        ValueError: if the page has no `app.page["board"] = {...};`
            assignment, or (as json.JSONDecodeError, a ValueError subclass)
            if the captured text is not valid JSON.

    NOTE(review): get_content, fake_headers and match1 are not defined in
    this file; presumably they come from `..common`.
    """
    url = construct_url(url, **params)
    html = get_content(url, headers=fake_headers)
    # Non-greedy capture up to the first '};' after the assignment.
    json_string = match1(html, r'app\.page\["board"\] = (.*?});')
    if not json_string:
        # Fail with a clear message instead of the opaque TypeError that
        # json.loads(None) would raise.
        raise ValueError('No board data found in page: {}'.format(url))
    return json.loads(json_string)
def extract_board_data(url):
    """Collect all pins of the board at *url* and return them as a Board.

    The first request asks for LIMIT pins. While the board's reported
    `pin_count` exceeds the number collected, further requests are made
    with `max` set to the `pin_id` of the last pin collected so far.

    Args:
        url: the board page URL.

    Returns:
        A Board with the board title and one Pin per collected pin object.
    """
    json_data = extract_json_data(url, limit=LIMIT)
    pin_list = json_data['pins']
    title = json_data['title']
    remaining = json_data['pin_count'] - len(pin_list)

    # `pin_list` must be non-empty to supply the `max` value; an empty
    # first page would otherwise raise IndexError on pin_list[-1].
    while remaining > 0 and pin_list:
        json_data = extract_json_data(url, max=pin_list[-1]['pin_id'],
                                      limit=LIMIT)
        pins = json_data['pins']
        if not pins:
            # The site returned fewer pins than `pin_count` promised; stop
            # here, otherwise `remaining` never shrinks and the loop spins
            # forever.
            break
        pin_list.extend(pins)
        remaining -= len(pins)

    return Board(title, [Pin(pin_json) for pin_json in pin_list])
def get_num_len(num):
    """Return how many characters `str(num)` has (a minus sign counts)."""
    text = str(num)
    return len(text)
def _safe_path_component(name):
    """Return *name* made safe to use as a single file-system path component."""
    # Path separators would otherwise add directory levels.
    cleaned = re.sub(r'[\\/]', '_', name)
    # '.' and '..' would otherwise resolve to the current/parent directory.
    return '_' if cleaned in ('.', '..') else cleaned


def huaban_download_board(url, output_dir, **kwargs):
    """Download every image of the board at *url*.

    Images are saved under `output_dir/<board title>/` and named
    `<board title>[<zero-padded index>].<ext>`, with the index starting at 0.
    Path separators in the title are replaced by '_' in both names.

    Args:
        url: the board page URL.
        output_dir: directory under which the board's own directory goes.
        **kwargs: accepted but not used here.

    KeyboardInterrupt stops the download silently. Any other exception
    raised while saving is printed as a traceback and not re-raised.

    NOTE(review): `dry_run` and `url_save` are not defined in this file;
    presumably they come from `..common` - confirm.
    """
    board = extract_board_data(url)
    # The title comes from the remote page, so it must not be able to
    # redirect where files are written.
    safe_title = _safe_path_component(board.title)
    output_dir = os.path.join(output_dir, safe_title)
    # The byte size is unknown up front, hence the infinite total size;
    # progress is tracked per piece (one piece per image).
    bar = EnhancedPiecesProgressBar(float('Inf'), board.pin_count)

    print("Site: ", site_info)
    print("Title: ", board.title)
    print()

    if dry_run:
        urls = '\n'.join(pin.url for pin in board.pins)
        print('Real URLs:\n{}'.format(urls))
        return

    print('Downloading {} images in {} ...'.format(board.pin_count,
                                                   board.title))
    try:
        bar.update()
        name_len = get_num_len(board.pin_count)
        for i, pin in enumerate(board.pins):
            filename = '{0}[{1}].{2}'.format(safe_title,
                                             str(i).zfill(name_len), pin.ext)
            filepath = os.path.join(output_dir, filename)
            # Pieces are reported 1-based although file names are 0-based.
            bar.update_piece(i + 1)
            url_save(pin.url, filepath, bar, is_part=True, faker=True)
        bar.done()
    except KeyboardInterrupt:
        pass
    except Exception:
        # Report, but do not propagate, ordinary failures. Unlike a bare
        # `except:`, this lets SystemExit through.
        traceback.print_exc()
# Board page URL: http or https, optional "www." prefix, numeric board id,
# then end of string or one of '/', '?', '#'.
BOARD_URL_RE = re.compile(
    r'https?://(?:www\.)?huaban\.com/boards/\d+(?:[/?#]|$)')


def huaban_download(url, output_dir='.', **kwargs):
    """Download the board at *url*, or explain that only boards are supported.

    Args:
        url: a huaban.com URL.
        output_dir: directory under which the board's images are saved.
        **kwargs: passed through to huaban_download_board.

    URLs that are not board pages only produce a short usage message.
    """
    if BOARD_URL_RE.match(url):
        huaban_download_board(url, output_dir, **kwargs)
    else:
        print('Only board (画板) pages are supported currently')
        print('ex: http://huaban.com/boards/12345678/')
# Module-level alias for the entry point; presumably the name the
# surrounding framework looks up on extractor modules - TODO confirm.
download = huaban_download
# NOTE(review): playlist_not_supported is not defined in this file (likely
# from ..common); presumably it builds a handler that reports playlist
# downloads as unsupported for the named site - confirm.
download_playlist = playlist_not_supported("huaban")