you-get/src/you_get/extractors/huaban.py

86 lines
2.2 KiB
Python
Raw Normal View History

2016-03-11 12:57:47 +03:00
#!/usr/bin/env python
import json
import os
import re
2016-03-12 04:54:50 +03:00
import math
2016-03-11 12:57:47 +03:00
import traceback
import urllib.parse as urlparse
from ..common import *
__all__ = ['huaban_download']
site_info = '花瓣 (Huaban)'
LIMIT = 100
class Board:
def __init__(self, title, pins):
self.title = title
self.pins = pins
self.pin_count = len(pins)
class Pin:
host = 'http://img.hb.aicdn.com/'
def __init__(self, pin_json):
img_file = pin_json['file']
self.id = str(pin_json['pin_id'])
2016-03-12 04:52:53 +03:00
self.url = urlparse.urljoin(self.host, img_file['key'])
2016-03-11 12:57:47 +03:00
self.ext = img_file['type'].split('/')[-1]
def construct_url(url, **params):
param_str = urlparse.urlencode(params)
return url + '?' + param_str
def extract_json_data(url, **params):
url = construct_url(url, **params)
html = get_content(url, headers=fake_headers)
json_string = match1(html, r'app.page\["board"\] = (.*?});')
json_data = json.loads(json_string)
return json_data
def extract_board_data(url):
json_data = extract_json_data(url, limit=LIMIT)
pin_list = json_data['pins']
title = json_data['title']
pin_count = json_data['pin_count']
pin_count -= len(pin_list)
while pin_count > 0:
json_data = extract_json_data(url, max=pin_list[-1]['pin_id'],
limit=LIMIT)
pins = json_data['pins']
pin_list += pins
pin_count -= len(pins)
return Board(title, list(map(Pin, pin_list)))
def huaban_download_board(url, output_dir, **kwargs):
kwargs['merge'] = False
2016-03-11 12:57:47 +03:00
board = extract_board_data(url)
output_dir = os.path.join(output_dir, board.title)
print_info(site_info, board.title, 'jpg', float('Inf'))
for pin in board.pins:
download_urls([pin.url], pin.id, pin.ext, float('Inf'),
output_dir=output_dir, faker=True, **kwargs)
2016-03-11 12:57:47 +03:00
def huaban_download(url, output_dir='.', **kwargs):
if re.match(r'http://huaban\.com/boards/\d+/', url):
huaban_download_board(url, output_dir, **kwargs)
else:
print('Only board (画板) pages are supported currently')
print('ex: http://huaban.com/boards/12345678/')
download = huaban_download
download_playlist = playlist_not_supported("huaban")