Scrape JavaScript Websites: Xử Lý SPAs Và Dynamic Content

Trở lại Tin tức
Tin tức

Scrape JavaScript Websites: Xử Lý SPAs Và Dynamic Content

Nhiều website hiện đại dùng JavaScript để render nội dung sau khi trang đã tải, nên HTML ban đầu không chứa dữ liệu cần lấy. Bài viết này hướng dẫn cách scrape các site nặng JavaScript bằng Playwright và Selenium.

Tại Sao JavaScript Là Thách Thức?

  • Content được render sau page load
  • Data fetch qua AJAX/API calls
  • React/Vue/Angular SPAs
  • Infinite scroll, lazy loading

1. Playwright (Recommended)

from playwright.sync_api import sync_playwright

def scrape_spa(url):
    """Scrape a JavaScript-rendered single-page app with Playwright.

    Args:
        url: Address of the page to scrape.

    Returns:
        List of dicts with 'name' and 'price' keys, one per product card
        that has both fields rendered.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)

            # Wait for JavaScript to render the product list, then for the
            # network to go idle so lazily fetched data has arrived.
            page.wait_for_selector('.product-list')
            page.wait_for_load_state('networkidle')

            data = []
            for product in page.query_selector_all('.product'):
                name_el = product.query_selector('.name')
                price_el = product.query_selector('.price')
                # query_selector returns None when the child is missing;
                # skip malformed cards instead of crashing on None.
                if name_el is None or price_el is None:
                    continue
                data.append({
                    'name': name_el.inner_text(),
                    'price': price_el.inner_text(),
                })
            return data
        finally:
            # Release the browser even if a wait above times out.
            browser.close()

2. Selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Launch a Chrome session and navigate to the single-page app.
driver = webdriver.Chrome()
driver.get('https://spa-site.com')

# Block (up to 10 s) until the JS-rendered product list is in the DOM.
product_list = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'product-list'))
)

# Walk every product card and dump its visible text.
for card in driver.find_elements(By.CLASS_NAME, 'product'):
    print(card.text)

driver.quit()

3. Execute JavaScript

# Playwright
result = page.evaluate('''
    () => {
        const products = [];
        document.querySelectorAll('.product').forEach(el => {
            products.push({
                name: el.querySelector('.name').textContent,
                price: el.querySelector('.price').textContent
            });
        });
        return products;
    }
''')

# Selenium
result = driver.execute_script('''
    return Array.from(document.querySelectorAll('.product'))
        .map(el => ({
            name: el.querySelector('.name').textContent,
            price: el.querySelector('.price').textContent
        }));
''')

4. Infinite Scroll

def scrape_infinite_scroll(page, max_scrolls=10):
    """Scroll an infinite-scroll page until no new content loads.

    Args:
        page: Playwright page object already navigated to the target URL.
        max_scrolls: Safety cap on the number of scroll attempts.

    Returns:
        All element handles matching '.item' present after the final scroll.
    """
    last_height = 0

    for i in range(max_scrolls):
        # Jump to the bottom to trigger the site's lazy loader.
        page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
        page.wait_for_timeout(2000)

        # If the document height did not grow, nothing new was loaded.
        new_height = page.evaluate('document.body.scrollHeight')
        if new_height == last_height:
            print("No more content")
            break
        last_height = new_height

        # Progress report only; the final extraction happens after the loop.
        items = page.query_selector_all('.item')
        print(f"Scroll {i+1}: {len(items)} items")

    return page.query_selector_all('.item')

5. Wait Strategies

# Wait until an element matching the selector is attached and visible.
page.wait_for_selector('.data-loaded')

# Wait for a node containing the given text (Playwright text selector).
page.wait_for_selector('text=Loading complete')

# Wait for network idle (no requests for 500ms).
page.wait_for_load_state('networkidle')

# Poll the page until the JS expression evaluates to a truthy value.
page.wait_for_function('window.dataLoaded === true')

# Same as the first wait, but with an explicit cap (timeout is in ms).
page.wait_for_selector('.result', timeout=30000)

6. Intercept API Calls

def scrape_via_api(url):
    """Harvest product data from the site's own XHR/fetch API responses.

    Instead of parsing the rendered DOM, listen for network responses and
    collect the JSON payloads the page fetches for itself.

    Args:
        url: Page whose API traffic should be intercepted.

    Returns:
        List of product records accumulated from matching API responses.
    """
    api_data = []

    def handle_response(response):
        # Only the products endpoint is of interest.
        if '/api/products' in response.url:
            try:
                data = response.json()
                api_data.extend(data['products'])
            except (ValueError, KeyError):
                # Non-JSON body or unexpected schema: skip this response
                # rather than crashing the whole crawl from the callback.
                pass

    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.on('response', handle_response)
            page.goto(url)
            # Give the page time to fire its API calls.
            page.wait_for_timeout(5000)
        finally:
            # Release the browser even if navigation fails.
            browser.close()

    return api_data

7. Headless Detection Bypass

# Launch Chromium with Blink's automation feature disabled so checks based
# on navigator.webdriver are harder to trigger.
# NOTE(review): `p` is the sync_playwright handle from an enclosing
# `with sync_playwright() as p:` block — not defined in this snippet.
browser = p.chromium.launch(
    headless=True,
    args=[
        '--disable-blink-features=AutomationControlled'
    ]
)

# Use a realistic desktop user agent and viewport so the headless browser's
# fingerprint matches a normal user session.
context = browser.new_context(
    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64)...',
    viewport={'width': 1920, 'height': 1080}
)

VinaProxy + JavaScript Scraping

  • Residential IPs cho browser automation
  • Bypass anti-bot detection
  • Giá chỉ $0.5/GB

Dùng Thử Ngay →