Proxy Pool Management: Quản Lý Nhiều Proxies Hiệu Quả
Scraping quy mô lớn cần nhiều proxies. Bài viết hướng dẫn quản lý proxy pool cho enterprise scraping.
Tại Sao Cần Proxy Pool?
- Avoid bans: Rotate IPs liên tục
- Increase throughput: Parallel requests
- Redundancy: Backup khi proxy fail
- Geo-distribution: Multi-location scraping
Basic Proxy Pool
import random
import requests
class ProxyPool:
    """Random-choice proxy pool with failure tracking.

    Proxies that raise a network error during a request are blacklisted
    until every proxy has failed, at which point the blacklist resets so
    the pool never runs dry.
    """

    def __init__(self, proxies):
        self.proxies = proxies  # all known proxy URLs
        self.failed = set()     # proxies that recently failed

    def get_proxy(self):
        """Return a random proxy that is not currently marked failed.

        When every proxy has failed, the failed set is cleared and a
        proxy is picked from the full list again.
        """
        available = [p for p in self.proxies if p not in self.failed]
        if not available:
            self.failed.clear()  # all failed: reset and retry everything
            available = self.proxies
        return random.choice(available)

    def mark_failed(self, proxy):
        """Blacklist *proxy* until the failed set is next reset."""
        self.failed.add(proxy)

    def request(self, url, **kwargs):
        """GET *url* through a pooled proxy and return the response.

        On a network/protocol error the chosen proxy is marked failed
        and the exception is re-raised so the caller can retry.
        """
        proxy = self.get_proxy()
        try:
            response = requests.get(
                url,
                proxies={'http': proxy, 'https': proxy},
                timeout=10,
                **kwargs
            )
            return response
        except requests.RequestException:
            # Only network errors indict the proxy. The original caught
            # Exception, so programming errors (e.g. a TypeError from bad
            # kwargs) wrongly poisoned the blacklist too.
            self.mark_failed(proxy)
            raise
# Usage: build a pool from three authenticated endpoints, then fetch a
# page through whichever proxy the pool hands out.
proxy_urls = [
    'http://user:pass@proxy1.vinaproxy.com:8080',
    'http://user:pass@proxy2.vinaproxy.com:8080',
    'http://user:pass@proxy3.vinaproxy.com:8080',
]
pool = ProxyPool(proxy_urls)
response = pool.request('https://example.com')
Weighted Proxy Selection
class WeightedProxyPool:
    """Proxy pool that favours proxies with a good track record.

    Each proxy carries a selection weight that is nudged up on success
    and cut down on failure; selection is a weight-biased random choice.
    """

    def __init__(self, proxies):
        # Per-proxy bookkeeping: selection weight plus success/fail counters.
        self.proxies = {}
        for p in proxies:
            self.proxies[p] = {'weight': 1.0, 'success': 0, 'fail': 0}

    def get_proxy(self):
        """Pick one proxy at random, biased by current weights."""
        candidates = list(self.proxies.keys())
        weights = [self.proxies[p]['weight'] for p in candidates]
        return random.choices(candidates, weights=weights)[0]

    def report_success(self, proxy):
        """Record a success and grow the proxy's weight (capped at 2.0)."""
        entry = self.proxies[proxy]
        entry['success'] += 1
        entry['weight'] = min(2.0, entry['weight'] * 1.1)

    def report_failure(self, proxy):
        """Record a failure and halve the proxy's weight (floored at 0.1)."""
        entry = self.proxies[proxy]
        entry['fail'] += 1
        entry['weight'] = max(0.1, entry['weight'] * 0.5)

    def stats(self):
        """Print the success rate and current weight for every proxy."""
        for proxy, data in self.proxies.items():
            total = data['success'] + data['fail']
            rate = data['success'] / total * 100 if total > 0 else 0
            print(f"{proxy}: {rate:.1f}% success, weight={data['weight']:.2f}")
Health Check
import concurrent.futures
def check_proxy_health(proxy):
    """Probe *proxy* against httpbin.org and report its health.

    Returns:
        tuple: (proxy, healthy_bool, latency_seconds_or_None).
    """
    try:
        response = requests.get(
            'https://httpbin.org/ip',
            proxies={'http': proxy, 'https': proxy},
            timeout=5
        )
        return proxy, response.status_code == 200, response.elapsed.total_seconds()
    except requests.RequestException:
        # Narrow catch: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit, making the check uninterruptible.
        return proxy, False, None
def health_check_all(proxies):
    """Probe every proxy concurrently and return the list of healthy ones.

    Prints one status line per proxy as a side effect.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        results = list(executor.map(check_proxy_health, proxies))
    healthy = []
    for proxy, is_healthy, latency in results:
        status = "✅" if is_healthy else "❌"
        # `is not None`: a 0.0s latency is falsy, so the original printed
        # "N/A" for a perfectly valid (if improbable) reading.
        latency_str = f"{latency:.2f}s" if latency is not None else "N/A"
        print(f"{status} {proxy} - {latency_str}")
        if is_healthy:
            healthy.append(proxy)
    return healthy
# Run periodically (e.g. hourly via a scheduler) to refresh the pool;
# `all_proxies` is assumed to be the full proxy list defined elsewhere.
healthy_proxies = health_check_all(all_proxies)
Rotation Strategies
# 1. Per-request rotation: pick a fresh proxy for every single request.
for url in urls:
    proxy = pool.get_proxy()
    # Map both schemes: the original only set 'http', so any https://
    # URL silently bypassed the proxy and exposed the real IP.
    response = requests.get(url, proxies={'http': proxy, 'https': proxy})
# 2. Per-domain rotation: each domain keeps the same proxy once assigned.
domain_proxy = {}

def get_proxy_for_domain(domain):
    """Return the proxy pinned to *domain*, assigning one on first use."""
    try:
        return domain_proxy[domain]
    except KeyError:
        # First request for this domain: pin a proxy to it.
        domain_proxy[domain] = pool.get_proxy()
        return domain_proxy[domain]
# 3. Time-based rotation: keep a single proxy and swap it out once every
# `interval` seconds instead of on every request.
import time
last_rotation = time.time()        # timestamp of the most recent swap
current_proxy = pool.get_proxy()   # proxy currently in use
def get_rotating_proxy(interval=60):
    """Return the current proxy, rotating to a fresh one when more than
    *interval* seconds have passed since the last rotation.
    """
    global last_rotation, current_proxy
    if time.time() - last_rotation > interval:
        current_proxy = pool.get_proxy()
        last_rotation = time.time()
    return current_proxy
Async Pool
import aiohttp
import asyncio
class AsyncProxyPool:
    """Round-robin proxy pool for use with aiohttp clients."""

    def __init__(self, proxies):
        self.proxies = proxies  # ordered list cycled over by the cursor
        self.index = 0          # monotonically increasing cursor

    def get_proxy(self):
        """Hand out the next proxy in strict round-robin order."""
        slot = self.index % len(self.proxies)
        self.index += 1
        return self.proxies[slot]

    async def fetch(self, url):
        """GET *url* through the next pooled proxy and return the body text."""
        chosen = self.get_proxy()
        async with aiohttp.ClientSession() as session:
            async with session.get(url, proxy=chosen) as response:
                return await response.text()
VinaProxy Pool Features
- Built-in auto-rotation
- Millions of IPs trong pool
- Giá chỉ $0.5/GB
