From a5abbc87fbe8d7ed0cf2cce111b137cf1a984602 Mon Sep 17 00:00:00 2001 From: Huichao Xue Date: Thu, 14 Apr 2016 21:04:17 -0700 Subject: [PATCH] Auto-set a Chinese extractor proxy. I used BeautifulSoup to parse a proxy listing website http://www.proxynova.com/proxy-server-list/country-cn/, and then pick from them. Tested on my local machine and it worked. --- setup.py | 1 + src/you_get/common.py | 10 ++++++++-- src/you_get/proxy_picker.py | 20 ++++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 src/you_get/proxy_picker.py diff --git a/setup.py b/setup.py index 21246c5f..34068b2a 100755 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ setup( packages = find_packages('src'), package_dir = {'' : 'src'}, + install_requires=['bs4'], test_suite = 'tests', diff --git a/src/you_get/common.py b/src/you_get/common.py index e20be32b..3618987c 100755 --- a/src/you_get/common.py +++ b/src/you_get/common.py @@ -106,6 +106,7 @@ from .util import log, term from .util.git import get_version from .util.strings import get_filename, unescape_html from . import json_output as json_output_ +from . import proxy_picker dry_run = False json_output = False @@ -1029,6 +1030,7 @@ def download_main(download, download_playlist, urls, playlist, **kwargs): else: download(url, **kwargs) + def script_main(script_name, download, download_playlist, **kwargs): def version(): log.i('version %s, a tiny downloader that scrapes the web.' @@ -1061,10 +1063,11 @@ def script_main(script_name, download, download_playlist, **kwargs): -y | --extractor-proxy Use an HTTP proxy for extracting only. --no-proxy Never use a proxy. -d | --debug Show traceback and other debug info. + -C | --china Pick a Chinese proxy for extracting. 
'''
 
-    short_opts = 'Vhfiuc:ndF:O:o:p:x:y:'
-    opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang=']
+    short_opts = 'Vhfiuc:ndF:O:o:p:x:y:C'
+    opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang=', "china"]
     if download_playlist:
         short_opts = 'l' + short_opts
         opts = ['playlist'] + opts
@@ -1167,6 +1170,9 @@ def script_main(script_name, download, download_playlist, **kwargs):
             extractor_proxy = a
         elif o in ('--lang',):
             lang = a
+        elif o in ('-C', '--china'):
+            extractor_proxy = proxy_picker.pick_a_chinese_proxy()
+            print("Using Chinese proxy {}".format(extractor_proxy))
         else:
             log.e("try 'you-get --help' for more options")
             sys.exit(2)
diff --git a/src/you_get/proxy_picker.py b/src/you_get/proxy_picker.py
new file mode 100644
index 00000000..81db36fa
--- /dev/null
+++ b/src/you_get/proxy_picker.py
@@ -0,0 +1,41 @@
+"""Pick a proxy located in China for extractor requests."""
+
+from bs4 import BeautifulSoup
+from urllib import request
+import random
+
+PROXY_LIST_URL = "http://www.proxynova.com/proxy-server-list/country-cn/"
+
+
+def pick_a_chinese_proxy():
+    """Return a randomly chosen Chinese proxy as a ``host:port`` string.
+
+    Candidates are scraped from the table rows of the page at
+    ``PROXY_LIST_URL``.
+
+    Raises:
+        RuntimeError: if no proxy could be extracted from the page.
+    """
+    # Always use the live listing; close the connection once it is read.
+    with request.urlopen(PROXY_LIST_URL) as response:
+        content = response.read()
+
+    # 'html.parser' ships with Python; 'lxml' is not a declared dependency.
+    soup = BeautifulSoup(content, 'html.parser')
+    all_proxies = []
+    # The first table row is skipped (presumably the header row).
+    for row in soup.find_all('tr')[1:]:
+        ip_span = row.find('span', {'class': 'row_proxy_ip'})
+        cells = row.find_all('td')
+        if ip_span is None or len(cells) < 2:
+            # Not a proxy row; skip it instead of swallowing every exception.
+            continue
+        ip = ip_span.text.strip()
+        port = cells[1].text.strip()
+        if ip and port:
+            all_proxies.append("{}:{}".format(ip, port))
+
+    if not all_proxies:
+        raise RuntimeError(
+            "no Chinese proxy found at {}".format(PROXY_LIST_URL))
+    return random.choice(all_proxies)