Debug Web Scrapers: Tìm Và Sửa Lỗi Hiệu Quả
Scrapers thường gặp lỗi khó debug. Bài viết hướng dẫn kỹ thuật debug scrapers hiệu quả.
Lỗi Phổ Biến
- Empty results: Selector sai hoặc page không load
- Timeouts: Server chậm hoặc blocked
- 403/429: IP blocked hoặc rate limited
- Parsing errors: HTML structure thay đổi
- Encoding issues: Unicode problems
1. Save Raw Response
import requests
def debug_request(url):
    """Fetch *url* and dump the raw response to disk for offline inspection.

    Saves the body to debug_response.html and prints status code, headers
    and content length, so a failing selector can be checked against the
    HTML the server actually returned (open the file in a browser).
    """
    # timeout is essential: requests has NO default timeout, so a hung
    # server would otherwise block the scraper forever.
    response = requests.get(url, headers={'User-Agent': '...'}, timeout=30)
    # Save raw HTML for inspection
    with open('debug_response.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    print(f"Status: {response.status_code}")
    print(f"Headers: {dict(response.headers)}")
    print(f"Content length: {len(response.text)}")
    return response
# Example: run once, then open the saved debug_response.html in a browser
# to compare what the script received with what the site shows you.
debug_request('https://example.com/products')
2. Screenshot với Playwright
from playwright.sync_api import sync_playwright
def debug_with_screenshot(url):
    """Open *url* in a visible Chromium window and capture debug artifacts.

    Writes a full-page screenshot (debug_screenshot.png) and the rendered
    HTML after JavaScript ran (debug_page.html), then pauses with the
    Playwright inspector so selectors can be tried interactively.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)  # Show browser
        page = browser.new_page()
        page.goto(url)
        page.wait_for_timeout(3000)  # crude fixed wait; fine for a debug helper
        # Screenshot of the entire page, not just the viewport
        page.screenshot(path='debug_screenshot.png', full_page=True)
        # Save the *rendered* DOM — unlike a raw requests dump
        html = page.content()
        # encoding='utf-8' prevents UnicodeEncodeError on non-ASCII pages
        # (the platform default encoding is not UTF-8 everywhere, e.g. Windows)
        with open('debug_page.html', 'w', encoding='utf-8') as f:
            f.write(html)
        # Interactive debugging
        page.pause()  # Opens inspector
        browser.close()
3. Test Selectors Interactively
from bs4 import BeautifulSoup
import requests
# IPython/Jupyter for interactive testing.
# Define the page under test first — the original snippet used an
# undefined `url`, which raises NameError when run as-is.
url = 'https://example.com/products'
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.text, 'lxml')
# Test selectors
print(soup.select('.product-name'))  # Check if found (empty list = bad selector)
print(soup.select_one('.price'))  # Single element (None = not found)
# CSS selector debugging
for elem in soup.select('div.product'):
    print(elem.prettify())  # Pretty print HTML
4. Add Verbose Logging
import logging

logging.basicConfig(level=logging.DEBUG)

def scrape_with_logging(url):
    """Scrape *url* with DEBUG logging at every step so failures are traceable.

    Logs the HTTP status, content type, number of matched products and each
    product's name element; any exception is logged with a full traceback.
    """
    # Lazy %-style args: the message is only formatted if the level is enabled
    logging.debug("Starting scrape: %s", url)
    try:
        # timeout so a hung server fails loudly instead of blocking forever
        response = requests.get(url, timeout=30)
        logging.debug("Status: %s", response.status_code)
        logging.debug("Content-Type: %s", response.headers.get('content-type'))
        soup = BeautifulSoup(response.text, 'lxml')
        products = soup.select('.product')
        logging.debug("Found %d products", len(products))
        for i, product in enumerate(products):
            name = product.select_one('.name')
            logging.debug("Product %d: name=%s", i, name)
    except Exception as e:
        # Broad catch is deliberate at this debug boundary;
        # exc_info=True keeps the full traceback in the log.
        logging.error("Error: %s", e, exc_info=True)
5. Compare Working vs Broken
# Save responses over time
import hashlib
import os
from datetime import datetime
def save_versioned_response(url, name):
    """Download *url* and save it as a timestamped snapshot under debug/.

    Keeping dated copies lets you diff a working capture against a broken
    one when the site's HTML changes. Prints the saved path plus a short
    MD5 prefix for a quick "did the content change at all?" check.
    Returns the path of the saved file.
    """
    response = requests.get(url, timeout=30)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    # Ensure the target directory exists — open() below would fail otherwise
    os.makedirs('debug', exist_ok=True)
    filename = f"debug/{name}_{timestamp}.html"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)
    # Hash for quick comparison (MD5 is fine here — not security-sensitive)
    content_hash = hashlib.md5(response.text.encode()).hexdigest()[:8]
    # Original print was garbled ("(unknown)" in place of the filename)
    print(f"Saved: {filename} (hash: {content_hash})")
    return filename
# Compare: diff debug/page_old.html debug/page_new.html
6. Check If Blocked
def check_if_blocked(response):
    """Heuristically detect whether *response* indicates the scraper was blocked.

    Looks for common block phrases in the body text and for block-typical
    HTTP status codes; prints a warning and returns True on the first hit,
    otherwise returns False.
    """
    indicators = (
        'access denied',
        'blocked',
        'captcha',
        'rate limit',
        'too many requests',
        'bot detected',
    )
    body = response.text.lower()
    # First matching indicator phrase, or None if the body looks clean
    hit = next((phrase for phrase in indicators if phrase in body), None)
    if hit is not None:
        print(f"⚠️ Possible block: '{hit}' found")
        return True
    if response.status_code in (403, 429, 503):
        print(f"⚠️ Blocked status code: {response.status_code}")
        return True
    return False
7. Network Debugging
# Use mitmproxy or Charles to inspect traffic
# See actual requests from browser vs script
# Or in Playwright: attach these listeners *before* page.goto() so every
# request/response the page issues is echoed to the console.
# NOTE(review): assumes `page` is an open Playwright Page from an active
# browser context (as in debug_with_screenshot above) — not defined here.
page.on('request', lambda req: print(f">> {req.method} {req.url}"))
page.on('response', lambda res: print(f"<< {res.status} {res.url}"))
VinaProxy + Debugging
- Switch proxies để test nếu IP blocked
- Reliable connections cho debugging
- Giá chỉ $0.5/GB
