Scrape JavaScript Websites: Xử Lý SPAs Và Dynamic Content
Nhiều websites dùng JavaScript để render. Bài viết hướng dẫn scrape JavaScript-heavy sites.
Tại Sao JavaScript Là Thách Thức?
- Content được render sau page load
- Data fetch qua AJAX/API calls
- React/Vue/Angular SPAs
- Infinite scroll, lazy loading
1. Playwright (Recommended)
from playwright.sync_api import sync_playwright
def scrape_spa(url):
    """Scrape a JavaScript-rendered page (SPA) and return product data.

    Launches a headless Chromium browser, waits for the client-side render
    to finish, then extracts name/price pairs from '.product' elements.

    Args:
        url: Address of the page to scrape.

    Returns:
        list[dict]: One {'name': ..., 'price': ...} dict per product found.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url)
            # Wait for JavaScript to render the product list.
            page.wait_for_selector('.product-list')
            # Also wait for the network to go idle so late AJAX data is in.
            page.wait_for_load_state('networkidle')
            data = []
            for product in page.query_selector_all('.product'):
                name_el = product.query_selector('.name')
                price_el = product.query_selector('.price')
                # Skip malformed cards instead of crashing on a None element.
                if name_el is None or price_el is None:
                    continue
                data.append({
                    'name': name_el.inner_text(),
                    'price': price_el.inner_text(),
                })
            return data
        finally:
            # Always release the browser, even if a wait/navigation raises.
            browser.close()
2. Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
try:
    driver.get('https://spa-site.com')
    # Block (up to 10 s) until the JS-rendered list is present in the DOM.
    wait = WebDriverWait(driver, 10)
    element = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'product-list'))
    )
    # Extract data
    products = driver.find_elements(By.CLASS_NAME, 'product')
    for product in products:
        print(product.text)
finally:
    # Quit even if the wait times out, so the Chrome process is not leaked.
    driver.quit()
3. Execute JavaScript
# Playwright: run a JS function in the page and get its JSON-serializable
# return value back in Python. Assumes a `page` object from an earlier
# sync_playwright session is in scope.
result = page.evaluate('''
() => {
const products = [];
document.querySelectorAll('.product').forEach(el => {
products.push({
name: el.querySelector('.name').textContent,
price: el.querySelector('.price').textContent
});
});
return products;
}
''')
# Selenium equivalent: `return` inside the script hands the value back to
# Python. Assumes a `driver` instance from an earlier session is in scope.
# NOTE(review): this rebinds `result`, overwriting the Playwright value —
# the two snippets are alternatives, not meant to run together.
result = driver.execute_script('''
return Array.from(document.querySelectorAll('.product'))
.map(el => ({
name: el.querySelector('.name').textContent,
price: el.querySelector('.price').textContent
}));
''')
4. Infinite Scroll
def scrape_infinite_scroll(page, max_scrolls=10):
    """Scroll an infinite-scroll page until no new content loads.

    Repeatedly jumps to the bottom of the page; stops early when the page
    height stops growing (nothing new was appended) or after `max_scrolls`
    attempts, whichever comes first.

    Args:
        page: A Playwright page already navigated to the target URL.
        max_scrolls: Upper bound on scroll attempts, to avoid looping forever.

    Returns:
        All '.item' element handles present after the final scroll.
    """
    # (The original kept an unused `all_items` list here — removed.)
    last_height = 0
    for i in range(max_scrolls):
        # Jump to the bottom to trigger the site's lazy-load handler.
        page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
        page.wait_for_timeout(2000)  # give AJAX content time to arrive
        # An unchanged page height means nothing new was appended.
        new_height = page.evaluate('document.body.scrollHeight')
        if new_height == last_height:
            print("No more content")
            break
        last_height = new_height
        # Progress report only; the final snapshot is taken after the loop.
        items = page.query_selector_all('.item')
        print(f"Scroll {i+1}: {len(items)} items")
    return page.query_selector_all('.item')
5. Wait Strategies
# Playwright wait strategies (each snippet assumes a `page` in scope).
# Wait for a specific element to be attached and visible.
page.wait_for_selector('.data-loaded')
# Wait for an element containing the given text to appear.
page.wait_for_selector('text=Loading complete')
# Wait for network idle (no in-flight requests for 500ms).
page.wait_for_load_state('networkidle')
# Wait for an arbitrary in-page JS expression to become truthy.
page.wait_for_function('window.dataLoaded === true')
# Custom wait with an explicit timeout (milliseconds; default is 30s).
page.wait_for_selector('.result', timeout=30000)
6. Intercept API Calls
def scrape_via_api(url):
    """Capture product data from the site's own XHR/API responses.

    Instead of parsing rendered HTML, listens to every network response the
    page makes and collects the JSON payload of '/api/products' calls.

    Args:
        url: Page whose background API traffic should be intercepted.

    Returns:
        list: Accumulated 'products' entries from all matching responses.
    """
    api_data = []

    def handle_response(response):
        # Only parse the endpoint we care about; other responses may not
        # even be JSON.
        if '/api/products' in response.url:
            data = response.json()
            # .get: a payload without 'products' is skipped instead of
            # raising KeyError inside the event callback.
            api_data.extend(data.get('products', []))

    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            # Register the listener before navigating so early requests
            # fired during page load are also captured.
            page.on('response', handle_response)
            page.goto(url)
            # Fixed grace period for background AJAX calls to finish.
            page.wait_for_timeout(5000)
        finally:
            # Release the browser even if navigation fails.
            browser.close()
    return api_data
7. Headless Detection Bypass
# Launch Chromium without the Blink automation flag that many anti-bot
# scripts probe (e.g. via navigator.webdriver). Assumes `p` is an active
# sync_playwright handle from an earlier snippet.
browser = p.chromium.launch(
headless=True,
args=[
'--disable-blink-features=AutomationControlled'
]
)
# Present a realistic desktop user agent and viewport so the headless
# session looks like a normal browser.
# NOTE(review): the '...' user agent is truncated for the article —
# substitute a full, current UA string in real code.
context = browser.new_context(
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64)...',
viewport={'width': 1920, 'height': 1080}
)
VinaProxy + JavaScript Scraping
- Residential IPs cho browser automation
- Bypass anti-bot detection
- Giá chỉ $0.5/GB
