Testing Web Scrapers: Đảm Bảo Scrapers Hoạt Động Đúng
Scrapers cần testing như mọi software khác. Bài viết hướng dẫn test scrapers một cách chuyên nghiệp.
Tại Sao Cần Test?
- Catch bugs trước production
- Detect khi website thay đổi
- Validate data quality
- Regression testing
- CI/CD integration
1. Unit Tests
import pytest
from scraper import parse_price, parse_product
# Test parsing functions
def test_parse_price():
    """parse_price should normalize currency strings and reject non-numeric text.

    Covers: USD symbol, thousands separator with VND suffix, a
    "contact us" placeholder, and empty input.
    """
    assert parse_price("$100.00") == 100.0
    assert parse_price("1,234.56 đ") == 1234.56
    # Fix: compare to None with `is`, not `==` (PEP 8 / E711).
    assert parse_price("Liên hệ") is None  # "contact for price" placeholder
    assert parse_price("") is None
def test_parse_product():
    """parse_product should extract name and price from a product HTML snippet.

    NOTE(review): the fixture below has been reconstructed with markup — the
    original article snippet lost its HTML tags; adjust the tag/class names
    to match what parse_product actually selects.
    """
    html = '''
    <div class="product">
        <h2 class="name">Test Product</h2>
        <span class="price">$50.00</span>
    </div>
    '''
    product = parse_product(html)
    assert product['name'] == 'Test Product'
    assert product['price'] == 50.0
# Run: pytest test_scraper.py
2. Mock HTTP Responses
import responses
import requests
from scraper import scrape_product
@responses.activate
def test_scrape_product():
    """scrape_product should parse title and price from a mocked HTTP response.

    The `responses` library intercepts requests so no real network traffic
    occurs. NOTE(review): the response body has been reconstructed with
    markup — the original article snippet lost its HTML tags; adjust to the
    real page structure.
    """
    responses.add(
        responses.GET,
        'https://shop.com/product/123',
        body='''
        <h1 class="title">Test Product</h1>
        <span class="price">$99.99</span>
        ''',
        status=200,
    )
    result = scrape_product('https://shop.com/product/123')
    assert result['title'] == 'Test Product'
    assert result['price'] == 99.99
@responses.activate
def test_handle_404():
    """A missing product page (HTTP 404) should make scrape_product return None."""
    missing_url = 'https://shop.com/product/missing'
    responses.add(responses.GET, missing_url, status=404)
    assert scrape_product(missing_url) is None
3. Fixture Files
# Save real HTML for testing
# tests/fixtures/product_page.html
import pytest
from pathlib import Path
@pytest.fixture
def product_html():
    """Return the contents of the saved product-page HTML fixture."""
    fixture_path = Path(__file__).parent / 'fixtures' / 'product_page.html'
    return fixture_path.read_text()
def test_parse_real_page(product_html):
    """Parsing a captured real page should yield a named product with a positive price."""
    parsed = parse_product_page(product_html)
    for required_key in ('name', 'price'):
        assert required_key in parsed
    assert parsed['price'] > 0
4. Integration Tests
import pytest
@pytest.mark.integration
def test_live_scrape():
    """Hit a stable live endpoint to verify end-to-end fetching; run sparingly."""
    resp = requests.get('https://httpbin.org/html')
    # httpbin's /html page always embeds this Moby-Dick excerpt.
    assert resp.status_code == 200
    assert 'Herman Melville' in resp.text
@pytest.mark.integration
def test_proxy_works():
    """Requests routed through the configured proxy should reach httpbin."""
    proxy = 'http://user:pass@proxy.vinaproxy.com:8080'
    proxy_map = {'http': proxy, 'https': proxy}
    resp = requests.get('https://httpbin.org/ip', proxies=proxy_map)
    assert resp.status_code == 200
    # /ip echoes the caller's apparent origin address.
    assert 'origin' in resp.json()
5. Data Validation
from pydantic import BaseModel, validator
from typing import Optional
class Product(BaseModel):
    """Validated shape for a scraped product record."""

    name: str
    price: float
    url: str
    # Fix: explicit None default. Pydantic v1 treated Optional[...] as
    # implicitly optional (default None); v2 does not, so without this the
    # field becomes required. Behavior under v1 is unchanged.
    image_url: Optional[str] = None

    @validator('price')
    def price_must_be_positive(cls, v):
        # Zero/negative prices are scraping artifacts, never real data.
        if v <= 0:
            raise ValueError('Price must be positive')
        return v

    @validator('url')
    def url_must_be_valid(cls, v):
        # Cheap sanity check: require an absolute http(s) URL.
        if not v.startswith('http'):
            raise ValueError('Invalid URL')
        return v
# Validate scraped data
def test_product_validation():
    """Scraped data must satisfy the Product schema.

    Fix: the original referenced an undefined `url` variable; define a
    concrete URL inside the test so it is self-contained.
    """
    url = 'https://shop.com/product/123'
    data = scrape_product(url)
    product = Product(**data)  # Raises ValidationError if invalid
    assert product.price > 0
6. Snapshot Testing
def test_selector_still_works():
    """Detect when website structure changes"""
    page = requests.get('https://shop.com')
    soup = BeautifulSoup(page.text, 'lxml')
    # Each selector is load-bearing for the scraper; a miss means the site
    # markup changed and parsing will silently break.
    required_selectors = {
        '.product-card': "Product cards not found",
        '.price': "Prices not found",
        '.product-name': "Names not found",
    }
    for selector, failure_message in required_selectors.items():
        assert soup.select(selector), failure_message
CI/CD Integration
# .github/workflows/test.yml
# Fix: indentation restored — the flattened snippet was not valid YAML.
# Runs the scraper test suite on every push and pull request.
name: Test Scrapers
on: [push, pull_request]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - run: pip install -r requirements.txt
      - run: pytest tests/ -v
VinaProxy + Testing
- Test với real proxies
- Verify geo-targeting works
- Giá chỉ $0.5/GB
