Debug Web Scrapers: Tìm Và Sửa Lỗi Hiệu Quả

Trở lại Tin tức
Tin tức

Debug Web Scrapers: Tìm Và Sửa Lỗi Hiệu Quả

Scrapers thường gặp những lỗi khó debug. Bài viết này hướng dẫn các kỹ thuật debug scrapers hiệu quả.

Lỗi Phổ Biến

  • Empty results: Selector sai hoặc page không load
  • Timeouts: Server chậm hoặc blocked
  • 403/429: IP blocked hoặc rate limited
  • Parsing errors: HTML structure thay đổi
  • Encoding issues: Unicode problems

1. Save Raw Response

import requests

def debug_request(url, timeout=10):
    """Fetch *url* and dump the raw response to disk for offline inspection.

    Writes the decoded body to ``debug_response.html`` in the current
    working directory (overwriting any previous dump) and prints the
    status code, the response headers and the body length.

    Args:
        url: Address to fetch with a GET request.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without this
            a stalled server would hang the debugging session forever.

    Returns:
        The ``requests.Response`` object, so the caller can keep
        inspecting it.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, headers={'User-Agent': '...'}, timeout=timeout)
    body = response.text  # decode once; reused for the dump and the length

    # Save raw HTML for inspection
    with open('debug_response.html', 'w', encoding='utf-8') as f:
        f.write(body)

    print(f"Status: {response.status_code}")
    print(f"Headers: {dict(response.headers)}")
    print(f"Content length: {len(body)}")

    return response

# Fetch the page, then open the saved debug_response.html in a browser
# to see exactly what the server returned to the script.
debug_request('https://example.com/products')

2. Screenshot với Playwright

from playwright.sync_api import sync_playwright

def debug_with_screenshot(url):
    """Open *url* in a visible Chromium window and capture debug artifacts.

    Saves a full-page screenshot to ``debug_screenshot.png`` and the
    page's HTML to ``debug_page.html`` (both in the current working
    directory), then pauses so the page can be inspected interactively.

    Args:
        url: Address to navigate to.

    Returns:
        None.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)  # Show browser
        try:
            page = browser.new_page()

            page.goto(url)
            page.wait_for_timeout(3000)

            # Screenshot
            page.screenshot(path='debug_screenshot.png', full_page=True)

            # Save HTML. Explicit UTF-8 so pages with non-ASCII text do not
            # fail on platforms whose default file encoding is not UTF-8.
            html = page.content()
            with open('debug_page.html', 'w', encoding='utf-8') as f:
                f.write(html)

            # Interactive debugging
            page.pause()  # Opens inspector
        finally:
            # Close the browser even when navigation or capture raised.
            browser.close()

3. Test Selectors Interactively

from bs4 import BeautifulSoup
import requests

# IPython/Jupyter for interactive testing.
# NOTE: `url` is not defined in this snippet; it must already exist in the
# interactive session before these lines are run.
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

# Test selectors
print(soup.select('.product-name'))  # Every match; an empty result means the selector found nothing
print(soup.select_one('.price'))     # Single element (first match only)

# CSS selector debugging: dump each matched element to see its real markup
for elem in soup.select('div.product'):
    print(elem.prettify())  # Pretty print HTML

4. Add Verbose Logging

import logging

logging.basicConfig(level=logging.DEBUG)

def scrape_with_logging(url, timeout=10):
    """Scrape *url* while logging every intermediate step at DEBUG level.

    Logs the HTTP status, the Content-Type header, how many ``.product``
    elements were found and, for each of them, the element matched by
    ``.name``.

    This is a best-effort debugging helper: it never raises. Any failure
    is logged at ERROR level together with its traceback.

    Args:
        url: Address to fetch with a GET request.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without this
            a stalled server would block the scrape forever.

    Returns:
        None.
    """
    # Lazy %-style arguments: the message is only formatted when the
    # record is actually emitted.
    logging.debug("Starting scrape: %s", url)

    try:
        response = requests.get(url, timeout=timeout)
        logging.debug("Status: %s", response.status_code)
        logging.debug("Content-Type: %s", response.headers.get('content-type'))

        soup = BeautifulSoup(response.text, 'lxml')

        products = soup.select('.product')
        logging.debug("Found %d products", len(products))

        for i, product in enumerate(products):
            # `name` is the matched element (or None), logged as-is so a
            # missing selector shows up as "name=None".
            name = product.select_one('.name')
            logging.debug("Product %d: name=%s", i, name)

    except Exception as e:
        # Deliberately broad: this is the top-level boundary of a debug
        # helper, and the point is to record the failure, not to crash.
        logging.exception("Error: %s", e)

5. Compare Working vs Broken

# Save responses over time
import hashlib
from datetime import datetime

def save_versioned_response(url, name, timeout=10):
    """Fetch *url* and save the body as a timestamped snapshot under ``debug/``.

    The snapshot is written to ``debug/<name>_<YYYYmmdd_HHMMSS>.html``
    (local time), and the path is printed together with a short content
    hash so two snapshots can be compared at a glance.

    Args:
        url: Address to fetch with a GET request.
        name: Prefix used for the snapshot filename.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        OSError: If the snapshot cannot be written.
    """
    import os  # local import keeps this snippet self-contained

    response = requests.get(url, timeout=timeout)
    body = response.text
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Create the output directory on first use instead of failing with
    # FileNotFoundError.
    os.makedirs('debug', exist_ok=True)

    filename = f"debug/{name}_{timestamp}.html"
    # Write with the same encoding that is hashed below, so the hash
    # describes the bytes actually stored on disk.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(body)

    # Hash for quick comparison (a change fingerprint, not a security
    # measure).
    content_hash = hashlib.md5(body.encode('utf-8')).hexdigest()[:8]
    print(f"Saved: {filename} (hash: {content_hash})")

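# Compare two saved snapshots (files are named debug/<name>_<timestamp>.html), e.g.:
#   diff debug/page_20240101_120000.html debug/page_20240102_120000.html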

6. Check If Blocked

def check_if_blocked(response):
    """Report whether *response* looks like an anti-bot block.

    The body text is searched case-insensitively for well-known block
    phrases first; if none is present, the status code is checked
    against 403, 429 and 503. The first indicator found is printed.

    Args:
        response: Object exposing ``text`` (str) and ``status_code`` (int).

    Returns:
        True if a block indicator was found, otherwise False.
    """
    phrases = (
        'access denied',
        'blocked',
        'captcha',
        'rate limit',
        'too many requests',
        'bot detected',
    )
    body = response.text.lower()

    # First phrase (in declaration order) that occurs in the body, if any.
    hit = next((phrase for phrase in phrases if phrase in body), None)
    if hit is not None:
        print(f"⚠️ Possible block: '{hit}' found")
        return True

    if response.status_code not in (403, 429, 503):
        return False

    print(f"⚠️ Blocked status code: {response.status_code}")
    return True

7. Network Debugging

# Use mitmproxy or Charles to inspect traffic
# See actual requests from browser vs script

# Or in Playwright: register listeners that print each request and response.
# NOTE: `page` is not defined in this snippet — presumably an open Playwright
# page such as the one created in debug_with_screenshot(); confirm it exists
# before running these lines.
page.on('request', lambda req: print(f">> {req.method} {req.url}"))
page.on('response', lambda res: print(f"<< {res.status} {res.url}"))

VinaProxy + Debugging

  • Switch proxies để test nếu IP blocked
  • Reliable connections cho debugging
  • Giá chỉ $0.5/GB

Dùng Thử Ngay →