Proxy Pool Management: Quản Lý Nhiều Proxies Hiệu Quả

Trở lại Tin tức
Tin tức

Proxy Pool Management: Quản Lý Nhiều Proxies Hiệu Quả

Scraping quy mô lớn cần nhiều proxies. Bài viết hướng dẫn quản lý proxy pool cho enterprise scraping.

Tại Sao Cần Proxy Pool?

  • Avoid bans: Rotate IPs liên tục
  • Increase throughput: Parallel requests
  • Redundancy: Backup khi proxy fail
  • Geo-distribution: Multi-location scraping

Basic Proxy Pool

import random
import requests

class ProxyPool:
    """Random-choice proxy pool with failure tracking.

    Proxies reported as failed are excluded from selection; once every
    proxy has failed, the failure set is cleared so the pool keeps
    serving instead of raising.
    """

    def __init__(self, proxies):
        self.proxies = proxies
        self.failed = set()   # proxies excluded from selection until reset

    def get_proxy(self):
        """Return a random proxy that has not been marked failed."""
        available = [p for p in self.proxies if p not in self.failed]
        if not available:
            self.failed.clear()  # Reset failed list
            available = self.proxies
        return random.choice(available)

    def mark_failed(self, proxy):
        """Exclude *proxy* from selection until the pool resets."""
        self.failed.add(proxy)

    def request(self, url, **kwargs):
        """GET *url* through a pooled proxy; mark the proxy failed on error.

        Bug fix: the original passed hard-coded ``timeout=10`` and
        ``proxies=...`` alongside ``**kwargs``, so a caller supplying its
        own ``timeout`` or ``proxies`` raised ``TypeError: got multiple
        values for keyword argument``.  Defaults are now applied with
        ``setdefault`` so callers may override them.
        """
        proxy = self.get_proxy()
        kwargs.setdefault('timeout', 10)
        kwargs.setdefault('proxies', {'http': proxy, 'https': proxy})
        try:
            return requests.get(url, **kwargs)
        except Exception:
            self.mark_failed(proxy)
            raise

# Usage: build a pool from three authenticated proxy endpoints.
pool = ProxyPool([
    'http://user:pass@proxy1.vinaproxy.com:8080',
    'http://user:pass@proxy2.vinaproxy.com:8080',
    'http://user:pass@proxy3.vinaproxy.com:8080',
])

# NOTE(review): this line performs a live HTTP request when the snippet runs.
response = pool.request('https://example.com')

Weighted Proxy Selection

class WeightedProxyPool:
    """Proxy pool that biases selection toward historically healthy proxies.

    Each proxy carries a weight in [0.1, 2.0]: a reported success raises
    it by 10%, a reported failure halves it.
    """

    def __init__(self, proxies):
        # Per-proxy bookkeeping: {proxy: {'weight': 1.0, 'success': 0, 'fail': 0}}
        self.proxies = {}
        for proxy in proxies:
            self.proxies[proxy] = {'weight': 1.0, 'success': 0, 'fail': 0}

    def get_proxy(self):
        """Draw one proxy at random, weighted by its current weight."""
        candidates = list(self.proxies.keys())
        current_weights = [self.proxies[c]['weight'] for c in candidates]
        return random.choices(candidates, weights=current_weights)[0]

    def report_success(self, proxy):
        """Record a success and raise the weight by 10% (capped at 2.0)."""
        entry = self.proxies[proxy]
        entry['success'] += 1
        entry['weight'] = min(2.0, entry['weight'] * 1.1)

    def report_failure(self, proxy):
        """Record a failure and halve the weight (floored at 0.1)."""
        entry = self.proxies[proxy]
        entry['fail'] += 1
        entry['weight'] = max(0.1, entry['weight'] * 0.5)

    def stats(self):
        """Print per-proxy success rate and current weight."""
        for proxy, data in self.proxies.items():
            total = data['success'] + data['fail']
            rate = data['success'] / total * 100 if total > 0 else 0
            print(f"{proxy}: {rate:.1f}% success, weight={data['weight']:.2f}")

Health Check

import concurrent.futures

def check_proxy_health(proxy):
    """Probe *proxy* against httpbin and report its status.

    Returns a 3-tuple: the proxy URL, True when the probe came back with
    HTTP 200, and the measured latency in seconds (None on failure).
    """
    try:
        response = requests.get(
            'https://httpbin.org/ip',
            proxies={'http': proxy, 'https': proxy},
            timeout=5
        )
        return proxy, response.status_code == 200, response.elapsed.total_seconds()
    except Exception:
        # Bug fix: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit, making the check uninterruptible.
        return proxy, False, None

def health_check_all(proxies):
    """Probe every proxy concurrently (10 threads).

    Prints one status line per proxy and returns the healthy subset.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as workers:
        outcomes = list(workers.map(check_proxy_health, proxies))

    for proxy, ok, latency in outcomes:
        marker = "✅" if ok else "❌"
        latency_text = f"{latency:.2f}s" if latency else "N/A"
        print(f"{marker} {proxy} - {latency_text}")

    return [proxy for proxy, ok, _ in outcomes if ok]

# Run the health check periodically (e.g. hourly from a scheduler).
# NOTE(review): `all_proxies` is assumed to be the full proxy list defined
# elsewhere — it is not shown in this snippet.
healthy_proxies = health_check_all(all_proxies)

Rotation Strategies

# 1. Per-request rotation: pick a fresh proxy for every URL.
# Bug fix: the original mapped only the 'http' scheme, so any https:// URL
# silently bypassed the proxy; both schemes are now routed.
for url in urls:
    proxy = pool.get_proxy()
    response = requests.get(url, proxies={'http': proxy, 'https': proxy})

# 2. Per-domain rotation: pin one proxy per domain (sticky assignment).
domain_proxy = {}
def get_proxy_for_domain(domain):
    """Return the proxy pinned to *domain*, assigning one on first use."""
    try:
        return domain_proxy[domain]
    except KeyError:
        chosen = pool.get_proxy()
        domain_proxy[domain] = chosen
        return chosen

# 3. Time-based rotation: keep one proxy for `interval` seconds, then swap.
import time
last_rotation = time.time()
current_proxy = pool.get_proxy()

def get_rotating_proxy(interval=60):
    """Return the current proxy, replacing it once *interval* seconds pass."""
    global last_rotation, current_proxy
    elapsed = time.time() - last_rotation
    if elapsed > interval:
        current_proxy = pool.get_proxy()
        last_rotation = time.time()
    return current_proxy

Async Pool

import aiohttp
import asyncio

class AsyncProxyPool:
    """Minimal async pool: hands out proxies round-robin for aiohttp fetches."""

    def __init__(self, proxies):
        self.proxies = proxies
        self.index = 0  # monotonically increasing cursor into the rotation

    def get_proxy(self):
        """Return the next proxy in strict round-robin order."""
        position = self.index % len(self.proxies)
        self.index += 1
        return self.proxies[position]

    async def fetch(self, url):
        """GET *url* through the next pooled proxy and return the body text."""
        chosen = self.get_proxy()
        async with aiohttp.ClientSession() as session:
            async with session.get(url, proxy=chosen) as response:
                return await response.text()

VinaProxy Pool Features

  • Built-in auto-rotation
  • Millions of IPs trong pool
  • Giá chỉ $0.5/GB

Dùng Thử Ngay →