Quản Lý Proxy Pool: Hướng Dẫn Cho Scale Lớn

Trở lại Tin tức
Tin tức

Quản Lý Proxy Pool: Hướng Dẫn Cho Scale Lớn

Scraping quy mô lớn cần nhiều proxy. Bài viết này hướng dẫn cách quản lý proxy pool hiệu quả.

Proxy Pool Là Gì?

Proxy pool là tập hợp nhiều proxy, dùng để:

  • Rotate IPs tự động
  • Distribute load
  • Bypass rate limits
  • Increase reliability

Basic Pool Manager

import random
from collections import deque

class ProxyPool:
    """Simple proxy pool with round-robin and random selection.

    Proxies reported through :meth:`mark_failed` are skipped by both
    :meth:`get_proxy` and :meth:`get_random` until :meth:`mark_success`
    clears them again.

    Attributes:
        proxies: ``deque`` of proxy URLs in rotation order.
        failed: set of proxy URLs currently considered failed.
    """

    def __init__(self, proxies):
        """Create a pool from an iterable of proxy URLs."""
        self.proxies = deque(proxies)
        self.failed = set()

    def get_proxy(self):
        """Return the next non-failed proxy in round-robin order.

        Returns:
            The proxy URL, or ``None`` when the pool is empty or every
            proxy is currently marked as failed.
        """
        # Inspect each proxy at most once per call; the rotation advances
        # past failed entries so they do not stall the round-robin.
        for _ in range(len(self.proxies)):
            proxy = self.proxies[0]
            self.proxies.rotate(-1)
            if proxy not in self.failed:
                return proxy
        return None

    def get_random(self):
        """Return a random non-failed proxy, or ``None`` if none is available."""
        available = [p for p in self.proxies if p not in self.failed]
        return random.choice(available) if available else None

    def mark_failed(self, proxy):
        """Mark ``proxy`` as failed so selection methods skip it."""
        self.failed.add(proxy)

    def mark_success(self, proxy):
        """Mark ``proxy`` as working again (no-op if it was not failed)."""
        self.failed.discard(proxy)

# Usage: build a pool from three authenticated proxy URLs
# (scheme://user:password@host:port) and take the first proxy in rotation.
pool = ProxyPool([
    'http://user:pass@proxy1.vinaproxy.com:8080',
    'http://user:pass@proxy2.vinaproxy.com:8080',
    'http://user:pass@proxy3.vinaproxy.com:8080'
])

# Each call to get_proxy() advances the round-robin rotation by one.
proxy = pool.get_proxy()

Advanced Pool với Health Check

import requests
import threading
import time

class AdvancedProxyPool:
    """Proxy pool that tracks per-proxy health and latency.

    ``self.proxies`` maps each proxy URL to a dict with two keys:

    * ``'healthy'`` -- ``True`` until a health check fails.
    * ``'latency'`` -- seconds taken by the last successful health check;
      ``0`` means the proxy has never been measured.

    ``self.lock`` guards every read and write of ``self.proxies`` because
    :meth:`check_all` updates it from several threads.
    """

    def __init__(self, proxies):
        """Create a pool from an iterable of proxy URLs, all assumed healthy."""
        self.proxies = {p: {'healthy': True, 'latency': 0} for p in proxies}
        self.lock = threading.Lock()

    def health_check(self, proxy):
        """Probe ``proxy`` with one HTTP request and record the outcome.

        Returns:
            ``True`` if the request succeeded with a non-error HTTP status,
            ``False`` otherwise.
        """
        try:
            # perf_counter is monotonic, so wall-clock adjustments cannot
            # produce a negative or inflated latency.
            start = time.perf_counter()
            response = requests.get('https://httpbin.org/ip',
                                    proxies={'http': proxy, 'https': proxy},
                                    timeout=10)
            # A proxy answering 407/502/... is reachable but not usable.
            response.raise_for_status()
            latency = time.perf_counter() - start
        except Exception:
            # Best-effort probe: any failure means "unhealthy". Unlike a bare
            # ``except:``, KeyboardInterrupt and SystemExit still propagate.
            with self.lock:
                # setdefault keeps the failure path from raising KeyError for
                # a proxy that was not in the table yet.
                entry = self.proxies.setdefault(
                    proxy, {'healthy': False, 'latency': 0})
                entry['healthy'] = False
            return False

        with self.lock:
            self.proxies[proxy] = {'healthy': True, 'latency': latency}
        return True

    def check_all(self):
        """Health-check every proxy in parallel, one thread per proxy."""
        with self.lock:
            targets = list(self.proxies)
        threads = [threading.Thread(target=self.health_check, args=(p,))
                   for p in targets]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

    def get_fastest(self):
        """Return the healthy proxy with the lowest measured latency.

        Proxies that have never been measured (latency ``0``) are only
        considered when no healthy proxy has a measurement yet; otherwise
        their placeholder latency would always win.

        Returns:
            The proxy URL, or ``None`` if no proxy is healthy.
        """
        with self.lock:
            healthy = [(p, d['latency'])
                       for p, d in self.proxies.items() if d['healthy']]
        if not healthy:
            return None
        measured = [item for item in healthy if item[1] > 0]
        candidates = measured or healthy
        return min(candidates, key=lambda item: item[1])[0]

    def get_random_healthy(self):
        """Return a random healthy proxy, or ``None`` if none is healthy."""
        with self.lock:
            healthy = [p for p, d in self.proxies.items() if d['healthy']]
        return random.choice(healthy) if healthy else None

Weighted Pool

class WeightedProxyPool:
    """Proxy pool that selects proxies with probability proportional to weight.

    ``self.proxies`` maps proxy URL -> numeric weight. The mapping passed to
    the constructor is copied, so :meth:`adjust_weight` never mutates the
    caller's dict.
    """

    # Multipliers applied by adjust_weight() and the bounds weights stay within.
    SUCCESS_FACTOR = 1.1
    FAILURE_FACTOR = 0.5
    MAX_WEIGHT = 10
    MIN_WEIGHT = 0.1

    def __init__(self, proxies_with_weights):
        """Create a pool from a ``{'proxy_url': weight}`` mapping."""
        self.proxies = dict(proxies_with_weights)

    def get_weighted_random(self):
        """Pick a proxy at random; a higher weight is more likely to be selected.

        Proxies whose weight is zero or negative are never selected.

        Returns:
            The chosen proxy URL, or ``None`` when the pool is empty or no
            proxy has a positive weight.
        """
        candidates = [(p, w) for p, w in self.proxies.items() if w > 0]
        if not candidates:
            return None
        population, weights = zip(*candidates)
        # random.choices performs the cumulative-weight selection itself,
        # avoiding the float-rounding edge of a hand-rolled scan.
        return random.choices(population, weights=weights, k=1)[0]

    def adjust_weight(self, proxy, success):
        """Raise the weight of ``proxy`` after a success, halve it after a failure.

        The result is clamped to ``[MIN_WEIGHT, MAX_WEIGHT]``. Unknown
        proxies are ignored.
        """
        if proxy not in self.proxies:
            return
        current = self.proxies[proxy]
        if success:
            self.proxies[proxy] = min(current * self.SUCCESS_FACTOR,
                                      self.MAX_WEIGHT)
        else:
            self.proxies[proxy] = max(current * self.FAILURE_FACTOR,
                                      self.MIN_WEIGHT)

# Usage: the mapping is {'proxy_url': weight}; a proxy with a larger weight
# is more likely to be returned by get_weighted_random().
pool = WeightedProxyPool({
    'http://proxy1:8080': 5,  # highest weight: selected most often
    'http://proxy2:8080': 3,  # medium weight
    'http://proxy3:8080': 1   # lowest weight: selected least often
})

Site-Specific Pools

class SiteProxyManager:
    """Route requests to a dedicated proxy pool per target site.

    ``self.pools`` maps a site key (a domain such as ``'shopee.vn'``, or the
    special key ``'default'``) to a pool object exposing ``get_proxy()``.
    """

    def __init__(self):
        """Create a manager with no pools registered."""
        self.pools = {}

    def add_pool(self, site, proxies):
        """Register (or replace) the pool used for ``site``."""
        self.pools[site] = ProxyPool(proxies)

    def get_for_site(self, url):
        """Return a proxy from the pool that best matches ``url``.

        Lookup order:

        1. the URL's exact ``netloc`` (as written, including any port);
        2. the lower-cased hostname, then each parent domain, so
           ``www.shopee.vn`` and ``shopee.vn:443`` both match a pool
           registered as ``'shopee.vn'``;
        3. the ``'default'`` pool.

        Returns:
            Whatever the selected pool's ``get_proxy()`` returns, or
            ``None`` when no pool matches and no default pool exists.
        """
        from urllib.parse import urlparse

        parsed = urlparse(url)

        exact = self.pools.get(parsed.netloc)
        if exact is not None:
            return exact.get_proxy()

        # ``hostname`` drops userinfo and port and is lower-cased; it is
        # None for URLs without a network location.
        host = (parsed.hostname or '').rstrip('.')
        labels = host.split('.') if host else []
        for start in range(len(labels)):
            pool = self.pools.get('.'.join(labels[start:]))
            if pool is not None:
                return pool.get_proxy()

        # Fallback to the default pool, if one was registered.
        default_pool = self.pools.get('default')
        return default_pool.get_proxy() if default_pool is not None else None

# Example: dedicated proxies for each site, plus a 'default' pool used for
# every URL whose domain has no pool of its own.
# NOTE(review): shopee_proxies, lazada_proxies and general_proxies are not
# defined in this snippet -- presumably lists of proxy URLs supplied by the
# caller; confirm before running.
manager = SiteProxyManager()
manager.add_pool('shopee.vn', shopee_proxies)
manager.add_pool('lazada.vn', lazada_proxies)
manager.add_pool('default', general_proxies)

Auto-Scaling Pool

class AutoScalingPool:
    """Pool that requests more proxies when too few are healthy.

    Attributes:
        min_healthy: minimum number of healthy proxies to maintain.
        request_timeout: timeout in seconds for the provider API call.
        proxies: list of proxy records; each record is indexed with
            ``['healthy']``.
            NOTE(review): presumably the provider returns dicts with a
            'healthy' key -- confirm against the API response schema.
    """

    def __init__(self, min_healthy=5, request_timeout=10):
        """Create an empty pool.

        Args:
            min_healthy: threshold below which more proxies are requested.
            request_timeout: seconds to wait for the provider API before
                giving up, so a stalled connection cannot block forever.
        """
        self.min_healthy = min_healthy
        self.request_timeout = request_timeout
        self.proxies = []

    def check_and_scale(self):
        """Request and append one batch of proxies if too few are healthy.

        Raises:
            Whatever :meth:`request_more_proxies` raises; the pool is left
            unchanged in that case.
        """
        healthy_count = sum(1 for p in self.proxies if p['healthy'])

        if healthy_count >= self.min_healthy:
            return

        new_proxies = self.request_more_proxies()
        self.proxies.extend(new_proxies)
        print(f"Added {len(new_proxies)} proxies")

    def request_more_proxies(self):
        """Fetch a batch of proxies from the provider API.

        Returns:
            The value stored under the ``'proxies'`` key of the JSON response.

        Raises:
            requests.RequestException: on connection errors, timeouts, or an
                HTTP error status.
            KeyError: if the response has no ``'proxies'`` key.
        """
        # VinaProxy API endpoint
        response = requests.get('https://api.vinaproxy.com/v1/proxies',
                                timeout=self.request_timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        return response.json()['proxies']

Best Practices

  • Health check mỗi 5-10 phút
  • Loại bỏ các proxy thất bại từ 3 lần liên tiếp trở lên
  • Log proxy performance
  • Separate pools cho different sites
  • Monitor success rate

VinaProxy Pool Features

  • Built-in rotation
  • Auto health management
  • Millions of IPs
  • Giá chỉ $0.5/GB

Scale Với VinaProxy →