Auto-set a Chinese extractor proxy.

I used BeautifulSoup to parse the proxy listing at http://www.proxynova.com/proxy-server-list/country-cn/ and then pick one of the listed proxies at random. Tested on my local machine and it worked.
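With this change the proxy is chosen automatically when the new flag is passed; intended usage looks roughly like this (a sketch, with a placeholder URL):

    you-get -C <video URL>
    you-get --china <video URL>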
Huichao Xue 2016-04-14 21:04:17 -07:00
parent 244f77e6b7
commit a5abbc87fb
3 changed files with 29 additions and 2 deletions

setup.py

@@ -32,6 +32,7 @@ setup(
     packages = find_packages('src'),
     package_dir = {'' : 'src'},
+    install_requires=['bs4'],
     test_suite = 'tests',
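Because install_requires now lists bs4, a fresh install of the package pulls BeautifulSoup in automatically; on an existing checkout the dependency has to be installed once by hand, or the package reinstalled (a sketch; exact commands depend on the environment):

    pip install bs4
    # or, from the repository root, reinstall so setup.py resolves the new dependency:
    pip install -e .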

src/you_get/common.py

@@ -106,6 +106,7 @@ from .util import log, term
 from .util.git import get_version
 from .util.strings import get_filename, unescape_html
 from . import json_output as json_output_
+from . import proxy_picker

 dry_run = False
 json_output = False
@@ -1029,6 +1030,7 @@ def download_main(download, download_playlist, urls, playlist, **kwargs):
         else:
             download(url, **kwargs)

 def script_main(script_name, download, download_playlist, **kwargs):
     def version():
         log.i('version %s, a tiny downloader that scrapes the web.'
@@ -1061,10 +1063,11 @@ def script_main(script_name, download, download_playlist, **kwargs):
     -y | --extractor-proxy <HOST:PORT>  Use an HTTP proxy for extracting only.
          --no-proxy                     Never use a proxy.
     -d | --debug                        Show traceback and other debug info.
+    -C | --china                        Pick a Chinese proxy for extracting.
     '''
-    short_opts = 'Vhfiuc:ndF:O:o:p:x:y:'
-    opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang=']
+    short_opts = 'Vhfiuc:ndF:O:o:p:x:y:C'
+    opts = ['version', 'help', 'force', 'info', 'url', 'cookies', 'no-caption', 'no-merge', 'no-proxy', 'debug', 'json', 'format=', 'stream=', 'itag=', 'output-filename=', 'output-dir=', 'player=', 'http-proxy=', 'extractor-proxy=', 'lang=', "china"]
     if download_playlist:
         short_opts = 'l' + short_opts
         opts = ['playlist'] + opts
@@ -1167,6 +1170,9 @@ def script_main(script_name, download, download_playlist, **kwargs):
             extractor_proxy = a
         elif o in ('--lang',):
             lang = a
+        elif o in ('-C', '--china'):
+            extractor_proxy = proxy_picker.pick_a_chinese_proxy()
+            print("Using Chinese proxy {}".format(extractor_proxy))
         else:
             log.e("try 'you-get --help' for more options")
             sys.exit(2)
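The new option takes no argument: 'C' is appended to short_opts without a trailing colon, "china" is added without a trailing '=', and the handler above ignores the option value. A minimal getopt sketch of that behaviour (the argument list here is illustrative, not taken from the commit):

    import getopt

    # No colon after 'C' and no '=' after 'china' means the flag takes no value.
    opts, args = getopt.getopt(['-C', 'http://example.com/video'], 'C', ['china'])
    print(opts)  # [('-C', '')]
    print(args)  # ['http://example.com/video']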

src/you_get/proxy_picker.py (new file)

@@ -0,0 +1,20 @@
+from bs4 import BeautifulSoup
+from urllib import request
+import random
+
+
+def pick_a_chinese_proxy():
+    # Download the list of Chinese proxies from proxynova.com.
+    content = request.urlopen(
+        "http://www.proxynova.com/proxy-server-list/country-cn/").read()
+    soup = BeautifulSoup(content, 'lxml')
+    all_proxies = []
+    # Each table row holds one proxy: the IP sits in a span, the port in the
+    # second cell; rows without that structure (headers, ads) are skipped.
+    for row in soup.find_all('tr')[1:]:
+        try:
+            ip = row.find_all('span', {'class' : 'row_proxy_ip'})[0].text.strip()
+            port = row.find_all('td')[1].text.strip()
+            cur_proxy = "{}:{}".format(ip, port)
+            all_proxies.append(cur_proxy)
+        except Exception:
+            pass
+    # Pick one of the scraped proxies at random.
+    return random.choice(all_proxies)
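A quick way to try the picker on its own is a tiny driver script; this is only a sketch and assumes the package is importable as you_get (per the src/ layout in setup.py):

    from you_get import proxy_picker

    if __name__ == '__main__':
        proxy = proxy_picker.pick_a_chinese_proxy()
        print("Picked proxy: {}".format(proxy))  # e.g. "1.2.3.4:8080"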