Concurrent Scraping: Speeding Up With Parallel Requests
Sequential scraping is too slow. This article walks through concurrent scraping techniques that can deliver a 10-50x speedup.
Sequential vs Concurrent
```python
# Sequential: 100 URLs × 1s = 100 seconds
for url in urls:
    response = requests.get(url)  # Waits for each

# Concurrent: 100 URLs ÷ 10 workers = 10 seconds
# 10x faster!
```
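The speedup comes from scraping being I/O-bound: each request spends most of its time waiting on the network, so while one worker waits, the others can make progress.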
1. ThreadPoolExecutor
```python
from concurrent.futures import ThreadPoolExecutor
import requests

def scrape(url):
    response = requests.get(url)
    return {'url': url, 'content': response.text}

urls = [...]  # 1000 URLs

# Scrape with 20 concurrent threads
with ThreadPoolExecutor(max_workers=20) as executor:
    results = list(executor.map(scrape, urls))

print(f"Scraped {len(results)} pages")
```
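Note that `executor.map` preserves input order but re-raises the first exception when you consume the results, which can abort the whole batch. If individual failures should be tolerated, a common alternative is `submit()` with `as_completed()`; a minimal sketch reusing `scrape` and `urls` from above:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

results, errors = [], []
with ThreadPoolExecutor(max_workers=20) as executor:
    # Map each future back to its URL so failures can be reported
    futures = {executor.submit(scrape, url): url for url in urls}
    for future in as_completed(futures):
        try:
            results.append(future.result())
        except Exception as exc:  # Connection errors, timeouts, etc.
            errors.append((futures[future], exc))
```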
2. asyncio + aiohttp
```python
import asyncio
import aiohttp

async def scrape_async(session, url):
    async with session.get(url) as response:
        return {
            'url': url,
            'status': response.status,
            'content': await response.text()
        }

async def scrape_all(urls, max_concurrent=50):
    connector = aiohttp.TCPConnector(limit=max_concurrent)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [scrape_async(session, url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return results

# Run
urls = [...]  # 1000 URLs
results = asyncio.run(scrape_all(urls))
print(f"Scraped {len(results)} pages")
```
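Because `gather` is called with `return_exceptions=True`, failed requests come back as exception objects inside `results` instead of cancelling the other tasks; filter them out before use:

```python
pages = [r for r in results if not isinstance(r, Exception)]
print(f"{len(pages)} succeeded, {len(results) - len(pages)} failed")
```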
3. Semaphore Rate Limiting
```python
import asyncio
import aiohttp

async def scrape_with_limit(session, url, semaphore):
    async with semaphore:
        async with session.get(url) as response:
            await asyncio.sleep(0.1)  # Rate limit
            return await response.text()

async def main(urls, max_concurrent=10):
    semaphore = asyncio.Semaphore(max_concurrent)
    async with aiohttp.ClientSession() as session:
        tasks = [scrape_with_limit(session, url, semaphore) for url in urls]
        return await asyncio.gather(*tasks)
```
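The semaphore caps the number of requests in flight at `max_concurrent` no matter how many tasks `gather` creates, and the `sleep(0.1)` adds a small per-worker delay on top of that cap.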
4. grequests (Simple Async)
```python
import grequests

urls = [...]  # List of URLs

# Create unsent async requests
requests_list = [grequests.get(url) for url in urls]

# Send all (max 10 concurrent)
responses = grequests.map(requests_list, size=10)

for response in responses:
    if response:  # Failed requests come back as None
        print(response.url, response.status_code)
```
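grequests is built on gevent and monkey-patches the standard library at import time, so it is safest to import it before requests and other networking modules. By default `grequests.map` returns `None` for failed requests (hence the `if response:` check); pass an `exception_handler` callback if you want to log the failures instead.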
5. Multiprocessing (CPU-bound)
```python
from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup

def scrape_and_parse(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # CPU-intensive parsing (extract_complex_data is your own parser)
    data = extract_complex_data(soup)
    return data

# Use multiple CPU cores; the __main__ guard is required on
# platforms that spawn worker processes (Windows, macOS)
if __name__ == '__main__':
    with Pool(processes=8) as pool:
        results = pool.map(scrape_and_parse, urls)
```
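Worker processes sidestep the GIL, so the parsing step genuinely runs on all cores; the trade-off is that arguments and return values must be picklable and each worker adds startup overhead. If parsing is cheap relative to the download, threads or asyncio are usually the better fit.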
6. Playwright Parallel
```python
import asyncio
from playwright.async_api import async_playwright

async def scrape_page(browser, url):
    page = await browser.new_page()
    await page.goto(url)
    content = await page.content()
    await page.close()
    return content

async def main(urls):
    async with async_playwright() as p:
        browser = await p.chromium.launch()

        # Scrape 5 pages concurrently
        semaphore = asyncio.Semaphore(5)

        async def limited_scrape(url):
            async with semaphore:
                return await scrape_page(browser, url)

        tasks = [limited_scrape(url) for url in urls]
        results = await asyncio.gather(*tasks)

        await browser.close()
        return results
```
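All pages here share one browser instance, which keeps memory use down; if tasks must not share cookies or local storage, open an isolated context per task with `browser.new_context()` and create the page from that context instead.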
Best Practices
- Start small: begin with 5-10 concurrent workers, increase gradually
- Add delays: 0.1-0.5s between requests per worker
- Handle errors: don't let one failure crash the whole run (see the sketch after this list)
- Monitor: track success rates so blocks are noticed early
- Respect robots.txt: honor the Crawl-delay directive
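A minimal sketch that ties these practices into the aiohttp pipeline above; the timeout, delay, retry count, and the `stats` dict are illustrative choices, not required values:

```python
import asyncio
import aiohttp

async def polite_scrape(session, url, semaphore, stats, retries=3, delay=0.2):
    async with semaphore:
        for attempt in range(retries):
            try:
                timeout = aiohttp.ClientTimeout(total=30)
                async with session.get(url, timeout=timeout) as response:
                    response.raise_for_status()
                    stats['ok'] += 1
                    return await response.text()
            except (aiohttp.ClientError, asyncio.TimeoutError):
                # Back off a little more on each retry
                await asyncio.sleep(delay * (attempt + 1))
        stats['failed'] += 1  # Give up on this URL without crashing the rest
        return None
```

Drive it the same way as `scrape_all` above, sharing `stats = {'ok': 0, 'failed': 0}` across tasks so the success rate can be reported at the end.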
Choosing The Right Approach
| Method | Best For |
|---|---|
| ThreadPoolExecutor | Simple, I/O-bound |
| asyncio/aiohttp | High concurrency, performance |
| Multiprocessing | CPU-intensive parsing |
| Playwright async | JavaScript-heavy sites |
VinaProxy + Concurrent Scraping
- Rotating proxies for every request
- No IP blocks even at high concurrency
- Priced at just $0.5/GB
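As a sketch of how a rotating proxy slots into the aiohttp examples above (the endpoint URL and credentials below are placeholders, not real VinaProxy values):

```python
# Hypothetical rotating-proxy endpoint; substitute your real credentials
PROXY = "http://user:password@rotating-gateway.example.com:8000"

async def scrape_via_proxy(session, url):
    # aiohttp routes a single request through a proxy via the proxy= argument
    async with session.get(url, proxy=PROXY) as response:
        return await response.text()
```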
