Monitoring Scrapers: Theo Dõi Và Alert Khi Có Vấn Đề
Scrapers cần giám sát liên tục. Bài viết hướng dẫn monitoring và alerting cho scraping systems.
Tại Sao Cần Monitoring?
- Phát hiện sớm failures
- Track success rate
- Detect website changes
- Monitor proxy health
- Optimize performance
Metrics Cần Track
- Success rate: % requests thành công
- Response time: Latency trung bình
- Data volume: Items scraped/hour
- Error rate: % failures theo type
- Proxy usage: Bandwidth consumed
Basic Logging
import logging
from datetime import datetime

# Root-logger setup: write INFO and above to scraper.log,
# one timestamped line per event.
logging.basicConfig(
filename='scraper.log',
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
class ScraperMetrics:
    """Track success/failure counts for a scraping run and log a summary."""

    def __init__(self):
        self.success = 0                   # successful requests
        self.failed = 0                    # failed requests
        self.start_time = datetime.now()   # wall-clock start, for throughput

    def log_success(self, url):
        """Record one successful fetch of *url*."""
        self.success += 1
        logging.info(f"SUCCESS: {url}")

    def log_error(self, url, error):
        """Record one failed fetch of *url*, with the triggering *error*."""
        self.failed += 1
        logging.error(f"FAILED: {url} - {error}")

    def report(self):
        """Log totals, success rate, and requests/second.

        Safe to call at any time, including immediately after start:
        both the rate and the throughput guard against division by zero.
        """
        total = self.success + self.failed
        rate = (self.success / total * 100) if total > 0 else 0
        # total_seconds() instead of .seconds: .seconds ignores the .days
        # component (wraps at 24h) and truncates sub-second durations to 0,
        # which made the original `total/duration` raise ZeroDivisionError
        # whenever report() ran within the first second.
        duration = (datetime.now() - self.start_time).total_seconds()
        throughput = total / duration if duration > 0 else 0.0
        logging.info("=== REPORT ===")
        logging.info(f"Total: {total}, Success: {self.success}, Failed: {self.failed}")
        logging.info(f"Success Rate: {rate:.1f}%")
        logging.info(f"Duration: {duration:.0f}s, Rate: {throughput:.1f}/s")
Telegram Alerts
import requests

def send_telegram_alert(message):
    """Send *message* to a Telegram chat via the Bot API.

    The token and chat id are placeholders — load the real values from
    environment variables rather than hard-coding them.  Failures are
    logged instead of raised so a broken alert channel can never crash
    the scraper it is supposed to report on.
    """
    bot_token = 'YOUR_BOT_TOKEN'
    chat_id = 'YOUR_CHAT_ID'
    url = f'https://api.telegram.org/bot{bot_token}/sendMessage'
    data = {
        'chat_id': chat_id,
        'text': message,
        'parse_mode': 'Markdown'
    }
    try:
        # timeout: a hung Telegram API call must not block the scraper forever
        response = requests.post(url, data=data, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Telegram alert failed: {e}")
# Usage: wrap the whole scrape so any unhandled error fires a Telegram alert.
try:
    scrape_website()
except Exception as e:
    # Top-level boundary: the broad Exception catch is deliberate here —
    # anything that escapes the scraper should produce a notification.
    send_telegram_alert(f"🚨 *Scraper Error*\n\n`{str(e)}`")
Slack Alerts
def send_slack_alert(message):
    """Post *message* to Slack through an incoming-webhook URL.

    The webhook URL is a placeholder — read the real one from config or
    an environment variable.  Errors are logged, not raised, so the
    alerting path cannot take down the scraper.
    """
    webhook_url = 'https://hooks.slack.com/services/XXX/YYY/ZZZ'
    payload = {
        'text': message,
        'username': 'Scraper Bot',
        'icon_emoji': ':robot_face:'
    }
    try:
        # timeout: an unresponsive webhook must not hang the scraper
        response = requests.post(webhook_url, json=payload, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Slack alert failed: {e}")
Health Check Endpoint
from flask import Flask, jsonify
import threading

app = Flask(__name__)
metrics = ScraperMetrics()

@app.route('/health')
def health():
    """Expose the scraper's counters as JSON for external health checks."""
    total = metrics.success + metrics.failed
    return jsonify({
        'status': 'running',
        'success': metrics.success,
        'failed': metrics.failed,
        # Guard the zero-requests case explicitly.  The original padded the
        # denominator with +1, which misreported every rate (e.g. 2/2
        # successes showed as 66.7% instead of 100%).
        'success_rate': (metrics.success / total * 100) if total else 0.0
    })

# Run Flask in a daemon thread so it never blocks interpreter shutdown.
# use_reloader=False: the debug reloader re-execs the process and breaks
# when Flask runs off the main thread.
threading.Thread(
    target=lambda: app.run(port=5000, use_reloader=False),
    daemon=True,
).start()
Website Change Detection
import hashlib
def detect_layout_change(url, expected_selectors):
    """Return True if every selector in *expected_selectors* is still
    present on *url*; alert and return False otherwise.

    A missing selector is a strong signal the site's HTML layout changed
    and the scraper's parsing rules need updating.

    NOTE(review): relies on `requests` and `BeautifulSoup` being imported
    at module level — confirm `from bs4 import BeautifulSoup` exists
    elsewhere in this file.
    """
    # timeout: a hung site must not stall the monitoring pass
    response = requests.get(url, timeout=15)
    # Surface HTTP errors (404/500) as exceptions — parsing an error page
    # would otherwise be misreported as a layout verdict.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    missing = [sel for sel in expected_selectors if not soup.select(sel)]
    if missing:
        send_telegram_alert(
            f"⚠️ Website changed!\n"
            f"URL: {url}\n"
            f"Missing: {missing}"
        )
        return False
    return True
# Check before scraping: only run the scrape when the expected CSS
# selectors are still present on the target page.
selectors = ['.product-name', '.price', '.description']
if detect_layout_change('https://shop.com', selectors):
    scrape_products()
else:
    logging.error("Website layout changed, scraper needs update")
Automated Recovery
import time
def scrape_with_retry(url, max_retries=3):
    """Scrape *url*, retrying up to *max_retries* times with exponential
    backoff; send an alert and return None once every attempt has failed.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            return scrape(url)
        except Exception as e:
            logging.warning(f"Attempt {attempt+1} failed: {e}")
            # Sleep 1s, 2s, 4s, ... — but not after the final attempt.
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
        attempt += 1
    send_telegram_alert(f"❌ All retries failed for {url}")
    return None
VinaProxy + Monitoring
- Track proxy usage trong dashboard
- Monitor success rates
- Giá chỉ $0.5/GB
