squid+haproxy实现爬虫代理服务器

通过squid搭建正向代理,并结合haproxy实现负载均衡。增减代理服务器或更换代理服务器ip时,不需要更改爬虫脚本的配置,直接由haproxy实现负载均衡和故障转移。

1、安装配置squid

yum install squid

编辑配置文件 /etc/squid/squid.conf
注意修改三个地方
acl localnet src 192.168.0.0/16
启用定义的acl
http_access allow localnet
代理监听端口
http_port 3128
配置具体内容如下

acl manager proto cache_object                                                    
acl localhost src 127.0.0.1/32 ::1                                                
acl to_localhost dst 127.0.0.0/8 0.0.0.0/32 ::1                                   
acl localnet src 192.168.0.0/16 # RFC1918 possible internal network               
acl localnet src fc00::/7       # RFC 4193 local private network range            
acl localnet src fe80::/10      # RFC 4291 link-local (directly plugged) machines 
acl SSL_ports port 443                                                            
acl Safe_ports port 80          # http                                            
acl Safe_ports port 21          # ftp                                             
acl Safe_ports port 443         # https                                           
acl Safe_ports port 70          # gopher                                          
acl Safe_ports port 210         # wais                                            
acl Safe_ports port 1025-65535  # unregistered ports                              
acl Safe_ports port 280         # http-mgmt                                       
acl Safe_ports port 488         # gss-http                                        
acl Safe_ports port 591         # filemaker                                       
acl Safe_ports port 777         # multiling http                                  
acl CONNECT method CONNECT                                                        
http_access allow manager localhost                                               
http_access deny manager                                                          
http_access deny !Safe_ports                                                      
http_access deny CONNECT !SSL_ports                                               
http_access allow localnet                                                        
http_access allow localhost                                                       
http_access deny all                                                              
http_port 3128                                                                    
coredump_dir /var/spool/squid                                                     
refresh_pattern ^ftp:           1440    20%     10080                             
refresh_pattern ^gopher:        1440    0%      1440                              
refresh_pattern -i (/cgi-bin/|\?) 0     0%      0                                 
refresh_pattern .               0       20%     4320

启动squid
/etc/init.d/squid start
2、安装配置haproxy

yum install haproxy

编辑配置文件 /etc/haproxy/haproxy.cfg

global                                                                                                             
    log 127.0.0.1   local3                                                                                         
    maxconn 65535                                                                                                  
    #uid 99                                                                                                        
    #gid 99                                                                                                        
    user        haproxy                                                                                            
    group       haproxy                                                                                            
    daemon                                                                                                         
    pidfile     /var/run/haproxy.pid                                                                               
    stats socket /var/lib/haproxy/stats                                                                            
defaults                                                                                                           
        log     global                                                                                             
        mode    http                                                                                               
        #option httplog                                                                                            
        option  tcplog                                                                                             
        option  dontlognull                                                                                        
        #option forwardfor                                                                                         
        #option httpclose                                                                                          
        retries 2                                                                                                  
        option redispatch                                                                                          
        maxconn 65535                                                                                              
        timeout connect  5000                                                                                      
        timeout client   50000                                                                                     
        timeout server  50000                                                                                      
listen  http_80:80    0.0.0.0:80                                                                                   
        mode  tcp  #配置tcp模式
        maxconn  65535                                                                                             
        balance leastconn                                                                                          
        #balance roundrobin                                                                                        
        #balance source                                                                                            
        server   192.168.15.88  192.168.15.88:3128  check   weight 3 check maxconn 10000 check inter 5s port 3128  
        server   192.168.15.89  192.168.15.89:3128  check   weight 3 check maxconn 10000 check inter 5s port 3128  
listen  https_443:443    0.0.0.0:443                                                                               
        mode  tcp  #配置tcp模式
        maxconn  65535                                                                                             
        balance leastconn                                                                                          
        #balance roundrobin                                                                                        
        #balance source                                                                                            
        server   192.168.15.88  192.168.15.88:3128  check   weight 3 check maxconn 10000 check inter 5s port 3128  
        server   192.168.15.89  192.168.15.89:3128  check   weight 3 check maxconn 10000 check inter 5s port 3128  
listen stats_auth 0.0.0.0:8080                                                                                     
        stats refresh 30s                                                                                          
        stats enable                                                                                               
        stats uri  /admin-status #管理地址                                                                         
        stats auth  admin:7Cg3Qco< #管理帐号:管理密码                                                              
        stats admin if TRUE

配置haproxy日志文件
vim /etc/rsyslog.d/haproxy.conf

$ModLoad imudp                   
$UDPServerRun 514                
local3.*     /var/log/haproxy.log
&~

启动haproxy
/etc/init.d/haproxy start
重启rsyslog服务
/etc/init.d/rsyslog restart

3、python脚本服务器指定haproxy为代理服务器
export http_proxy="http://192.168.15.178:80"
export https_proxy="http://192.168.15.178:443"

 

发表评论