Quản Lý Proxy Pool: Hướng Dẫn Cho Scale Lớn
Scraping ở quy mô lớn cần nhiều proxy. Bài viết này hướng dẫn cách quản lý proxy pool hiệu quả.
Proxy Pool Là Gì?
Proxy pool là tập hợp nhiều proxy, dùng để:
- Rotate IPs tự động
- Distribute load
- Bypass rate limits
- Increase reliability
Basic Pool Manager
import random
from collections import deque
class ProxyPool:
    """Rotating pool of proxy URLs with simple failure tracking.

    Proxies can be handed out round-robin (``get_proxy``) or at random
    (``get_random``). A proxy reported through ``mark_failed`` is skipped by
    both selectors until ``mark_success`` clears it again.
    """

    def __init__(self, proxies):
        """Create a pool from an iterable of hashable proxy identifiers."""
        self.proxies = deque(proxies)
        self.failed = set()

    def get_proxy(self):
        """Return the next non-failed proxy in round-robin order.

        Returns:
            The selected proxy, or ``None`` when the pool is empty or every
            proxy is currently marked as failed.
        """
        # Inspect each proxy at most once so an all-failed pool terminates
        # instead of spinning forever.
        for _ in range(len(self.proxies)):
            proxy = self.proxies[0]
            self.proxies.rotate(-1)
            if proxy not in self.failed:
                return proxy
        return None

    def get_random(self):
        """Return a random non-failed proxy, or ``None`` if none is available."""
        available = [p for p in self.proxies if p not in self.failed]
        return random.choice(available) if available else None

    def mark_failed(self, proxy):
        """Exclude ``proxy`` from selection until it is marked successful."""
        self.failed.add(proxy)

    def mark_success(self, proxy):
        """Make ``proxy`` eligible for selection again (no-op if not failed)."""
        self.failed.discard(proxy)
# Usage: build a pool from three authenticated endpoints, then take the
# first proxy in rotation order.
proxy_urls = [
    'http://user:pass@proxy1.vinaproxy.com:8080',
    'http://user:pass@proxy2.vinaproxy.com:8080',
    'http://user:pass@proxy3.vinaproxy.com:8080',
]
pool = ProxyPool(proxy_urls)
proxy = pool.get_proxy()
Advanced Pool với Health Check
import requests
import threading
import time
class AdvancedProxyPool:
    """Proxy pool that records health and measured latency per proxy.

    ``self.proxies`` maps each proxy URL to a dict with the keys
    ``'healthy'`` (bool) and ``'latency'`` (seconds; ``0`` until the first
    successful check). ``self.lock`` guards that mapping because
    ``check_all`` updates it from several threads at once.
    """

    def __init__(self, proxies):
        """Register every proxy as healthy with a latency of 0."""
        self.proxies = {p: {'healthy': True, 'latency': 0} for p in proxies}
        self.lock = threading.Lock()

    def health_check(self, proxy):
        """Send one test request through ``proxy`` and record the outcome.

        On success the proxy's entry is replaced with the measured latency.
        On failure the proxy is flagged unhealthy and its previous latency
        is kept. A proxy that was not registered before is added.

        Returns:
            ``True`` if the request succeeded with a non-error HTTP status,
            ``False`` otherwise.
        """
        # Monotonic clock: wall-clock adjustments must not skew the latency.
        start = time.monotonic()
        try:
            response = requests.get(
                'https://httpbin.org/ip',
                proxies={'http': proxy, 'https': proxy},
                timeout=10,
            )
            # A 4xx/5xx answer means the request did not really get through,
            # so it must not count as a healthy proxy.
            response.raise_for_status()
        except requests.RequestException:
            with self.lock:
                # setdefault avoids a KeyError inside the handler when the
                # proxy was never registered.
                entry = self.proxies.setdefault(
                    proxy, {'healthy': False, 'latency': 0})
                entry['healthy'] = False
            return False
        latency = time.monotonic() - start
        with self.lock:
            self.proxies[proxy] = {'healthy': True, 'latency': latency}
        return True

    def check_all(self):
        """Health-check every registered proxy, one thread per proxy."""
        # Snapshot the keys under the lock so the dict is never iterated
        # while a worker thread is writing to it.
        with self.lock:
            targets = list(self.proxies)
        threads = [threading.Thread(target=self.health_check, args=(p,))
                   for p in targets]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

    def get_fastest(self):
        """Return the healthy proxy with the lowest recorded latency.

        Ties go to the proxy registered first. Returns ``None`` when no
        proxy is healthy.
        """
        with self.lock:
            latencies = {p: d['latency']
                         for p, d in self.proxies.items() if d['healthy']}
        if not latencies:
            return None
        return min(latencies, key=latencies.get)

    def get_random_healthy(self):
        """Return a random healthy proxy, or ``None`` when there is none."""
        with self.lock:
            healthy = [p for p, d in self.proxies.items() if d['healthy']]
        return random.choice(healthy) if healthy else None
Weighted Pool
class WeightedProxyPool:
    """Proxy pool that selects proxies with probability proportional to weight."""

    def __init__(self, proxies_with_weights):
        """Create the pool from a ``{proxy_url: weight}`` mapping.

        The mapping is copied so that ``adjust_weight`` never modifies the
        caller's dictionary.
        """
        self.proxies = dict(proxies_with_weights)

    def get_weighted_random(self):
        """Pick a proxy at random; a higher weight makes selection more likely.

        Returns:
            The selected proxy, or ``None`` when the pool is empty.
        """
        if not self.proxies:
            return None
        total = sum(self.proxies.values())
        r = random.uniform(0, total)
        cumulative = 0
        proxy = None
        for proxy, weight in self.proxies.items():
            cumulative += weight
            if r <= cumulative:
                return proxy
        # ``sum()`` and the running total can differ by a rounding error for
        # float weights, so ``r`` may land just above the final cumulative
        # value. Return the last proxy rather than falling through to None.
        return proxy

    def adjust_weight(self, proxy, success):
        """Raise the weight after a success, lower it after a failure.

        A success multiplies the weight by 1.1 (capped at 10); a failure
        halves it (floored at 0.1). Unknown proxies are ignored.
        """
        if proxy in self.proxies:
            if success:
                self.proxies[proxy] = min(self.proxies[proxy] * 1.1, 10)
            else:
                self.proxies[proxy] = max(self.proxies[proxy] * 0.5, 0.1)
# Usage: map each proxy URL to its starting selection weight.
initial_weights = {
    'http://proxy1:8080': 5,  # High weight
    'http://proxy2:8080': 3,  # Medium
    'http://proxy3:8080': 1,  # Low
}
pool = WeightedProxyPool(initial_weights)
Site-Specific Pools
class SiteProxyManager:
    """Registry of per-site proxy pools with an optional ``'default'`` pool."""

    def __init__(self):
        """Start with no registered pools."""
        self.pools = {}

    def add_pool(self, site, proxies):
        """Register (or replace) the pool used for ``site``.

        ``site`` is a domain such as ``'shopee.vn'``, or ``'default'`` for
        the fallback pool.
        """
        self.pools[site] = ProxyPool(proxies)

    def get_for_site(self, url):
        """Return a proxy from the pool that best matches ``url``.

        Pools are tried in this order: the exact network location of the
        URL, its lower-cased hostname (port and credentials stripped), each
        parent domain of that hostname, and finally the ``'default'`` pool.

        Returns:
            A proxy from the first matching pool, or ``None`` when nothing
            matches and no ``'default'`` pool is registered.
        """
        from urllib.parse import urlparse

        parsed = urlparse(url)
        # The raw netloc goes first so keys that matched before still do.
        candidates = [parsed.netloc]
        host = parsed.hostname
        if host:
            # 'www.shopee.vn' -> 'www.shopee.vn', 'shopee.vn', 'vn'
            labels = host.split('.')
            candidates.extend(
                '.'.join(labels[i:]) for i in range(len(labels)))
        for name in candidates:
            pool = self.pools.get(name)
            if pool is not None:
                return pool.get_proxy()
        # Fall back to the default pool, if one was registered.
        default_pool = self.pools.get('default')
        return default_pool.get_proxy() if default_pool is not None else None
# Example: a dedicated proxy list per site, plus a shared fallback pool.
manager = SiteProxyManager()
for site, site_proxies in (
    ('shopee.vn', shopee_proxies),
    ('lazada.vn', lazada_proxies),
    ('default', general_proxies),
):
    manager.add_pool(site, site_proxies)
Auto-Scaling Pool
class AutoScalingPool:
    """Pool that requests more proxies when too few are healthy."""

    def __init__(self, min_healthy=5):
        """Create an empty pool.

        Args:
            min_healthy: Number of healthy proxies below which
                ``check_and_scale`` requests more.
        """
        self.min_healthy = min_healthy
        self.proxies = []

    def check_and_scale(self):
        """Request and append more proxies if too few are healthy."""
        # NOTE(review): assumes each entry is a dict carrying a 'healthy'
        # flag -- confirm against the provider's response schema. Entries
        # without the flag are counted as not healthy instead of raising
        # KeyError.
        healthy_count = sum(1 for p in self.proxies if p.get('healthy'))
        if healthy_count < self.min_healthy:
            new_proxies = self.request_more_proxies()
            self.proxies.extend(new_proxies)
            print(f"Added {len(new_proxies)} proxies")

    def request_more_proxies(self):
        """Fetch additional proxies from the provider API.

        Returns:
            The value stored under ``'proxies'`` in the JSON response.

        Raises:
            requests.RequestException: On connection errors, timeouts, or
                an HTTP error status.
        """
        # Without a timeout ``requests`` may block forever on a stalled
        # connection.
        response = requests.get('https://api.vinaproxy.com/v1/proxies',
                                timeout=10)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        return response.json()['proxies']
Best Practices
- Health check mỗi 5-10 phút
- Loại bỏ các proxy fail từ 3 lần liên tiếp trở lên
- Log proxy performance
- Tách pool riêng cho từng site
- Monitor success rate
VinaProxy Pool Features
- Built-in rotation
- Auto health management
- Millions of IPs
- Giá chỉ $0.5/GB
