Merge branch 'develop' of https://github.com/wenLiangcan/you-get into wenLiangcan-develop

2025-02-03 00:33:58 +03:00 · 2016-03-15 22:28:18 +01:00 · 2016-03-15 22:28:18 +01:00 · 73eb64431a
commit 73eb64431a
parent 2c49cf5018 052a641043
5 changed files with 90 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -79,3 +79,5 @@ _*
 *.ts
 *.webm
 *.xml
+/.env
+/.idea
--- a/README.md
+++ b/README.md
@@ -371,6 +371,7 @@ Use `--url`/`-u` to get a list of downloadable resource URLs extracted from the
 | **Youku<br/>优酷** | <http://www.youku.com/> |✓| | |
 | 战旗TV   | <http://www.zhanqi.tv/lives>   |✓| | |
 | 央视网   | <http://www.cntv.cn/>          |✓| | |
+| 花瓣     | <http://huaban.com/>           | |✓| |

 For all other sites not on the list, the universal extractor will take care of finding and downloading interesting resources from the page.

--- a/src/you_get/common.py
+++ b/src/you_get/common.py
@@ -24,6 +24,7 @@ SITES = {
    'fun'              : 'funshion',
    'google'           : 'google',
    'heavy-music'      : 'heavymusic',
+    'huaban'           : 'huaban',
    'iask'             : 'sina',
    'ifeng'            : 'ifeng',
    'imgur'            : 'imgur',
--- a/src/you_get/extractors/__init__.py
+++ b/src/you_get/extractors/__init__.py
@@ -21,6 +21,7 @@ from .freesound import *
 from .funshion import *
 from .google import *
 from .heavymusic import *
+from .huaban import *
 from .ifeng import *
 from .imgur import *
 from .infoq import *
--- a/src/you_get/extractors/huaban.py
+++ b/src/you_get/extractors/huaban.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+import json
+import os
+import re
+import math
+import traceback
+import urllib.parse as urlparse
+
+from ..common import *
+
+# Names exported by ``from .huaban import *``: only the main entry point.
+__all__ = ['huaban_download']
+
+# Site label passed to print_info() when a board is downloaded.
+site_info = '花瓣 (Huaban)'
+
+# Value sent as the ``limit`` query parameter on every board page request.
+LIMIT = 100
+
+
+class Board:
+    """A Huaban board: its title together with the pins collected from it.
+
+    Attributes:
+        title: The board title, stored as given.
+        pins: The collection of pins, stored as given.
+        pin_count: ``len(pins)`` measured once, at construction time.
+    """
+
+    def __init__(self, title, pins):
+        self.title, self.pins = title, pins
+        # Snapshot of the size; not updated if ``pins`` is mutated later.
+        self.pin_count = len(self.pins)
+
+
+class Pin:
+    """One image on a board, built from a single pin JSON object.
+
+    Attributes:
+        id: The pin's ``pin_id`` converted to a string.
+        url: ``host`` joined with the image file's ``key``.
+        ext: The part of the file's ``type`` after the last ``/``
+            (for example ``jpeg`` for ``image/jpeg``).
+    """
+
+    # Base URL that image keys are resolved against.
+    host = 'http://img.hb.aicdn.com/'
+
+    def __init__(self, pin_json):
+        file_info = pin_json['file']
+        self.id = str(pin_json['pin_id'])
+        self.url = urlparse.urljoin(self.host, file_info['key'])
+        # rpartition yields the text after the last '/', or the whole
+        # string when there is no '/' at all.
+        self.ext = file_info['type'].rpartition('/')[2]
+
+
+def construct_url(url, **params):
+    """Return ``url`` followed by ``?`` and the URL-encoded ``params``.
+
+    The ``?`` is always appended, even when no parameters are given, and
+    no check is made for a query string already present in ``url``.
+    """
+    query = urlparse.urlencode(params)
+    return '?'.join((url, query))
+
+
+def extract_json_data(url, **params):
+    """Fetch a board page and return the board JSON embedded in it.
+
+    The page is requested with ``params`` as its query string, and the
+    object assigned to ``app.page["board"]`` in the returned HTML is
+    parsed with ``json.loads``.
+
+    Raises:
+        ValueError: If the page does not contain the board assignment,
+            or if the captured text is not valid JSON.
+    """
+    url = construct_url(url, **params)
+    html = get_content(url, headers=fake_headers)
+    # The dot is escaped so that only the literal text ``app.page`` matches.
+    json_string = match1(html, r'app\.page\["board"\] = (.*?});')
+    # NOTE(review): match1 presumably returns None when nothing matches;
+    # confirm against common.py. Failing here gives a clearer error than
+    # handing a non-string to json.loads.
+    if not json_string:
+        raise ValueError('huaban: board data not found in page: ' + url)
+    return json.loads(json_string)
+
+
+def extract_board_data(url):
+    """Collect every pin of the board at ``url`` and return a ``Board``.
+
+    The first request asks for up to ``LIMIT`` pins. While the board's
+    reported ``pin_count`` exceeds the number of pins gathered so far,
+    further pages are requested with ``max`` set to the ``pin_id`` of the
+    last pin already collected.
+
+    Paging stops early if a request returns no pins, so a ``pin_count``
+    that overstates the pins the server actually returns cannot cause an
+    endless loop.
+    """
+    json_data = extract_json_data(url, limit=LIMIT)
+    pin_list = json_data['pins']
+    title = json_data['title']
+    remaining = json_data['pin_count'] - len(pin_list)
+
+    # ``pin_list`` must be non-empty to supply the ``max`` cursor below.
+    while remaining > 0 and pin_list:
+        json_data = extract_json_data(url, max=pin_list[-1]['pin_id'],
+                                      limit=LIMIT)
+        pins = json_data['pins']
+        if not pins:
+            # No progress is possible; looping again would repeat the
+            # same request forever.
+            break
+        pin_list += pins
+        remaining -= len(pins)
+
+    return Board(title, list(map(Pin, pin_list)))
+
+
+def huaban_download_board(url, output_dir, **kwargs):
+    """Download every pin of the board at ``url``.
+
+    Files are written to a sub-directory of ``output_dir`` named after the
+    board title. ``merge`` is forced to ``False`` in the keyword arguments
+    forwarded to ``download_urls``; any other keyword arguments are passed
+    through unchanged.
+    """
+    kwargs['merge'] = False
+    board = extract_board_data(url)
+    board_dir = os.path.join(output_dir, board.title)
+    # NOTE(review): infinity is passed as the size argument, presumably
+    # meaning "size unknown"; confirm against print_info/download_urls.
+    print_info(site_info, board.title, 'jpg', float('Inf'))
+    for item in board.pins:
+        download_urls(
+            [item.url], item.id, item.ext, float('Inf'),
+            output_dir=board_dir, faker=True, **kwargs
+        )
+
+
+def huaban_download(url, output_dir='.', **kwargs):
+    """Entry point: download a Huaban board, or explain what is supported.
+
+    Args:
+        url: Page URL. Board URLs of the form
+            ``http(s)://huaban.com/boards/<digits>`` are accepted, with or
+            without a trailing slash.
+        output_dir: Directory under which the board's folder is created.
+        **kwargs: Forwarded to ``huaban_download_board``.
+
+    For any other URL a short usage hint is printed and nothing is
+    downloaded.
+    """
+    # ``(?:/|$)`` ends the board id at a slash or at the end of the URL,
+    # so the trailing slash is optional but ``boards/123abc`` is rejected.
+    if re.match(r'https?://huaban\.com/boards/\d+(?:/|$)', url):
+        huaban_download_board(url, output_dir, **kwargs)
+    else:
+        print('Only board (画板) pages are supported currently')
+        print('ex: http://huaban.com/boards/12345678/')
+
+
+# Module-level aliases. NOTE(review): presumably the names the you-get
+# dispatcher looks up on an extractor module; confirm against common.py.
+download = huaban_download
+# Result of playlist_not_supported("huaban"), bound as the playlist handler.
+download_playlist = playlist_not_supported("huaban")