Proxy Pool Management: Quản Lý Nhiều Proxies Hiệu Quả
Scraping quy mô lớn cần nhiều proxies. Bài viết hướng dẫn quản lý proxy pool cho enterprise scraping.
Tại Sao Cần Proxy Pool?
- Avoid bans: Rotate IPs liên tục
- Increase throughput: Parallel requests
- Redundancy: Backup khi proxy fail
- Geo-distribution: Multi-location scraping
Basic Proxy Pool
import random
import requests
class ProxyPool:
    """Random-choice proxy pool with failure tracking.

    Proxies that raise a network error during a request are blacklisted
    until every proxy has failed, at which point the blacklist resets so
    the pool never runs dry.
    """

    def __init__(self, proxies):
        self.proxies = proxies  # all known proxy URLs
        self.failed = set()     # proxies that recently failed

    def get_proxy(self):
        """Return a random proxy that is not currently marked failed.

        When every proxy has failed, the failed set is cleared and a
        proxy is picked from the full list again.
        """
        available = [p for p in self.proxies if p not in self.failed]
        if not available:
            self.failed.clear()  # all failed: reset and retry everything
            available = self.proxies
        return random.choice(available)

    def mark_failed(self, proxy):
        """Blacklist *proxy* until the failed set is next reset."""
        self.failed.add(proxy)

    def request(self, url, **kwargs):
        """GET *url* through a pooled proxy and return the response.

        On a network/protocol error the chosen proxy is marked failed
        and the exception is re-raised so the caller can retry.
        """
        proxy = self.get_proxy()
        try:
            response = requests.get(
                url,
                proxies={'http': proxy, 'https': proxy},
                timeout=10,
                **kwargs
            )
            return response
        except requests.RequestException:
            # Only network errors indict the proxy. The original caught
            # Exception, so programming errors (e.g. a TypeError from bad
            # kwargs) wrongly poisoned the blacklist too.
            self.mark_failed(proxy)
            raise
# Usage: build a pool from three authenticated endpoints, then fetch a
# page through whichever proxy the pool hands out.
proxy_urls = [
    'http://user:pass@proxy1.vinaproxy.com:8080',
    'http://user:pass@proxy2.vinaproxy.com:8080',
    'http://user:pass@proxy3.vinaproxy.com:8080',
]
pool = ProxyPool(proxy_urls)
response = pool.request('https://example.com')
Weighted Proxy Selection
class WeightedProxyPool:
    """Proxy pool that favours proxies with a good track record.

    Each proxy carries a selection weight that is nudged up on success
    and cut down on failure; selection is a weight-biased random choice.
    """

    def __init__(self, proxies):
        # Per-proxy bookkeeping: selection weight plus success/fail counters.
        self.proxies = {}
        for p in proxies:
            self.proxies[p] = {'weight': 1.0, 'success': 0, 'fail': 0}

    def get_proxy(self):
        """Pick one proxy at random, biased by current weights."""
        candidates = list(self.proxies.keys())
        weights = [self.proxies[p]['weight'] for p in candidates]
        return random.choices(candidates, weights=weights)[0]

    def report_success(self, proxy):
        """Record a success and grow the proxy's weight (capped at 2.0)."""
        entry = self.proxies[proxy]
        entry['success'] += 1
        entry['weight'] = min(2.0, entry['weight'] * 1.1)

    def report_failure(self, proxy):
        """Record a failure and halve the proxy's weight (floored at 0.1)."""
        entry = self.proxies[proxy]
        entry['fail'] += 1
        entry['weight'] = max(0.1, entry['weight'] * 0.5)

    def stats(self):
        """Print the success rate and current weight for every proxy."""
        for proxy, data in self.proxies.items():
            total = data['success'] + data['fail']
            rate = data['success'] / total * 100 if total > 0 else 0
            print(f"{proxy}: {rate:.1f}% success, weight={data['weight']:.2f}")
Health Check
import concurrent.futures
def check_proxy_health(proxy):
    """Probe *proxy* against httpbin.org and report its health.

    Returns:
        tuple: (proxy, healthy_bool, latency_seconds_or_None).
    """
    try:
        response = requests.get(
            'https://httpbin.org/ip',
            proxies={'http': proxy, 'https': proxy},
            timeout=5
        )
        return proxy, response.status_code == 200, response.elapsed.total_seconds()
    except requests.RequestException:
        # Narrow catch: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit, making the check uninterruptible.
        return proxy, False, None
def health_check_all(proxies):
    """Probe every proxy concurrently and return the list of healthy ones.

    Prints one status line per proxy as a side effect.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        results = list(executor.map(check_proxy_health, proxies))
    healthy = []
    for proxy, is_healthy, latency in results:
        status = "✅" if is_healthy else "❌"
        # `is not None`: a 0.0s latency is falsy, so the original printed
        # "N/A" for a perfectly valid (if improbable) reading.
        latency_str = f"{latency:.2f}s" if latency is not None else "N/A"
        print(f"{status} {proxy} - {latency_str}")
        if is_healthy:
            healthy.append(proxy)
    return healthy
# Run periodically (e.g. hourly via a scheduler) to refresh the pool;
# `all_proxies` is assumed to be the full proxy list defined elsewhere.
healthy_proxies = health_check_all(all_proxies)
Rotation Strategies
# 1. Per-request rotation: pick a fresh proxy for every single request.
for url in urls:
    proxy = pool.get_proxy()
    # Map both schemes: the original only set 'http', so any https://
    # URL silently bypassed the proxy and exposed the real IP.
    response = requests.get(url, proxies={'http': proxy, 'https': proxy})
# 2. Per-domain rotation: each domain keeps the same proxy once assigned.
domain_proxy = {}

def get_proxy_for_domain(domain):
    """Return the proxy pinned to *domain*, assigning one on first use."""
    try:
        return domain_proxy[domain]
    except KeyError:
        # First request for this domain: pin a proxy to it.
        domain_proxy[domain] = pool.get_proxy()
        return domain_proxy[domain]
# 3. Time-based rotation: keep a single proxy and swap it out once every
# `interval` seconds instead of on every request.
import time
last_rotation = time.time()        # timestamp of the most recent swap
current_proxy = pool.get_proxy()   # proxy currently in use
def get_rotating_proxy(interval=60):
    """Return the current proxy, rotating to a fresh one when more than
    *interval* seconds have passed since the last rotation.
    """
    global last_rotation, current_proxy
    if time.time() - last_rotation > interval:
        current_proxy = pool.get_proxy()
        last_rotation = time.time()
    return current_proxy
Async Pool
import aiohttp
import asyncio
class AsyncProxyPool:
    """Round-robin proxy pool for use with aiohttp clients."""

    def __init__(self, proxies):
        self.proxies = proxies  # ordered list cycled over by the cursor
        self.index = 0          # monotonically increasing cursor

    def get_proxy(self):
        """Hand out the next proxy in strict round-robin order."""
        slot = self.index % len(self.proxies)
        self.index += 1
        return self.proxies[slot]

    async def fetch(self, url):
        """GET *url* through the next pooled proxy and return the body text."""
        chosen = self.get_proxy()
        async with aiohttp.ClientSession() as session:
            async with session.get(url, proxy=chosen) as response:
                return await response.text()
VinaProxy Pool Features
- Built-in auto-rotation
- Millions of IPs trong pool
- Giá chỉ $0.5/GB
