Debug Web Scrapers: Tìm Và Sửa Lỗi Hiệu Quả
Scrapers thường gặp lỗi khó debug. Bài viết hướng dẫn kỹ thuật debug scrapers hiệu quả.
Lỗi Phổ Biến
- Empty results: Selector sai hoặc page không load
- Timeouts: Server chậm hoặc blocked
- 403/429: IP blocked hoặc rate limited
- Parsing errors: HTML structure thay đổi
- Encoding issues: Unicode problems
1. Save Raw Response
import requests
def debug_request(url):
    """Fetch *url* and dump the raw response to disk for offline inspection.

    Saves the body to debug_response.html and prints status code, headers
    and content length, so a failing selector can be checked against the
    HTML the server actually returned (open the file in a browser).
    """
    # timeout is essential: requests has NO default timeout, so a hung
    # server would otherwise block the scraper forever.
    response = requests.get(url, headers={'User-Agent': '...'}, timeout=30)
    # Save raw HTML for inspection
    with open('debug_response.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    print(f"Status: {response.status_code}")
    print(f"Headers: {dict(response.headers)}")
    print(f"Content length: {len(response.text)}")
    return response
# Example: run once, then open the saved debug_response.html in a browser
# to compare what the script received with what the site shows you.
debug_request('https://example.com/products')
2. Screenshot với Playwright
from playwright.sync_api import sync_playwright
def debug_with_screenshot(url):
    """Open *url* in a visible Chromium window and capture debug artifacts.

    Writes a full-page screenshot (debug_screenshot.png) and the rendered
    HTML after JavaScript ran (debug_page.html), then pauses with the
    Playwright inspector so selectors can be tried interactively.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)  # Show browser
        page = browser.new_page()
        page.goto(url)
        page.wait_for_timeout(3000)  # crude fixed wait; fine for a debug helper
        # Screenshot of the entire page, not just the viewport
        page.screenshot(path='debug_screenshot.png', full_page=True)
        # Save the *rendered* DOM — unlike a raw requests dump
        html = page.content()
        # encoding='utf-8' prevents UnicodeEncodeError on non-ASCII pages
        # (the platform default encoding is not UTF-8 everywhere, e.g. Windows)
        with open('debug_page.html', 'w', encoding='utf-8') as f:
            f.write(html)
        # Interactive debugging
        page.pause()  # Opens inspector
        browser.close()
3. Test Selectors Interactively
from bs4 import BeautifulSoup
import requests
# IPython/Jupyter for interactive testing.
# Define the page under test first — the original snippet used an
# undefined `url`, which raises NameError when run as-is.
url = 'https://example.com/products'
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, 'lxml')
# Test selectors
print(soup.select('.product-name'))  # Check if found (empty list = bad selector)
print(soup.select_one('.price'))  # Single element (None = not found)
# CSS selector debugging
for elem in soup.select('div.product'):
    print(elem.prettify())  # Pretty print HTML
4. Add Verbose Logging
import logging

logging.basicConfig(level=logging.DEBUG)

def scrape_with_logging(url):
    """Scrape *url* with DEBUG logging at every step so failures are traceable.

    Logs the HTTP status, content type, number of matched products and each
    product's name element; any exception is logged with a full traceback.
    """
    # Lazy %-style args: the message is only formatted if the level is enabled
    logging.debug("Starting scrape: %s", url)
    try:
        # timeout so a hung server fails loudly instead of blocking forever
        response = requests.get(url, timeout=30)
        logging.debug("Status: %s", response.status_code)
        logging.debug("Content-Type: %s", response.headers.get('content-type'))
        soup = BeautifulSoup(response.text, 'lxml')
        products = soup.select('.product')
        logging.debug("Found %d products", len(products))
        for i, product in enumerate(products):
            name = product.select_one('.name')
            logging.debug("Product %d: name=%s", i, name)
    except Exception as e:
        # Broad catch is deliberate at this debug boundary;
        # exc_info=True keeps the full traceback in the log.
        logging.error("Error: %s", e, exc_info=True)
5. Compare Working vs Broken
# Save responses over time
import hashlib
import os
from datetime import datetime
def save_versioned_response(url, name):
    """Download *url* and save it as a timestamped snapshot under debug/.

    Keeping dated copies lets you diff a working capture against a broken
    one when the site's HTML changes. Prints the saved path plus a short
    MD5 prefix for a quick "did the content change at all?" check.
    Returns the path of the saved file.
    """
    response = requests.get(url, timeout=30)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    # Ensure the target directory exists — open() below would fail otherwise
    os.makedirs('debug', exist_ok=True)
    filename = f"debug/{name}_{timestamp}.html"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)
    # Hash for quick comparison (MD5 is fine here — not security-sensitive)
    content_hash = hashlib.md5(response.text.encode()).hexdigest()[:8]
    # Original print was garbled ("(unknown)" in place of the filename)
    print(f"Saved: {filename} (hash: {content_hash})")
    return filename
# Compare: diff debug/page_old.html debug/page_new.html
6. Check If Blocked
def check_if_blocked(response):
    """Heuristically detect whether *response* indicates the scraper was blocked.

    Looks for common block phrases in the body text and for block-typical
    HTTP status codes; prints a warning and returns True on the first hit,
    otherwise returns False.
    """
    indicators = (
        'access denied',
        'blocked',
        'captcha',
        'rate limit',
        'too many requests',
        'bot detected',
    )
    body = response.text.lower()
    # First matching indicator phrase, or None if the body looks clean
    hit = next((phrase for phrase in indicators if phrase in body), None)
    if hit is not None:
        print(f"⚠️ Possible block: '{hit}' found")
        return True
    if response.status_code in (403, 429, 503):
        print(f"⚠️ Blocked status code: {response.status_code}")
        return True
    return False
7. Network Debugging
# Use mitmproxy or Charles to inspect traffic
# See actual requests from browser vs script
# Or in Playwright: attach these listeners *before* page.goto() so every
# request/response the page issues is echoed to the console.
# NOTE(review): assumes `page` is an open Playwright Page from an active
# browser context (as in debug_with_screenshot above) — not defined here.
page.on('request', lambda req: print(f">> {req.method} {req.url}"))
page.on('response', lambda res: print(f"<< {res.status} {res.url}"))
VinaProxy + Debugging
- Switch proxies để test nếu IP blocked
- Reliable connections cho debugging
- Giá chỉ $0.5/GB
