Concurrent Scraping: Tăng Tốc Với Parallel Requests

Trở lại Tin tức
Tin tức

Concurrent Scraping: Tăng Tốc Với Parallel Requests

Sequential scraping quá chậm. Bài viết hướng dẫn concurrent scraping để tăng tốc gấp 10-50 lần.

Sequential vs Concurrent

# Sequential baseline: 100 URLs × 1 s each = 100 s of wall-clock time.
for url in urls:
    response = requests.get(url)  # blocks until this single request finishes

# Concurrent: the same 100 URLs ÷ 10 workers ≈ 10 seconds
# 10x faster!

1. ThreadPoolExecutor

from concurrent.futures import ThreadPoolExecutor
import requests

def scrape(url, timeout=30):
    """Fetch one page and return its body.

    Args:
        url: Absolute URL to download.
        timeout: Seconds to wait for the server before giving up.
            Defaults to 30 so existing callers keep working.

    Returns:
        dict with the requested ``url`` and the response ``content``.
    """
    # Without an explicit timeout a single stalled connection would
    # block its ThreadPoolExecutor worker thread indefinitely.
    response = requests.get(url, timeout=timeout)
    return {'url': url, 'content': response.text}

urls = [...]  # 1000 URLs

# Fan the downloads out across 20 worker threads; executor.map()
# yields results in input order, so `results` lines up with `urls`.
with ThreadPoolExecutor(max_workers=20) as pool:
    results = list(pool.map(scrape, urls))

print(f"Scraped {len(results)} pages")

2. asyncio + aiohttp

import asyncio
import aiohttp

async def scrape_async(session, url):
    """Fetch a single URL through the shared aiohttp session.

    Returns a dict carrying the url, the HTTP status code and the
    decoded body text.
    """
    async with session.get(url) as resp:
        body = await resp.text()
        return {'url': url, 'status': resp.status, 'content': body}

async def scrape_all(urls, max_concurrent=50):
    """Download every URL concurrently, capped at *max_concurrent* sockets.

    With return_exceptions=True a failed request shows up as an
    exception object in the result list instead of aborting the batch.
    """
    connector = aiohttp.TCPConnector(limit=max_concurrent)
    async with aiohttp.ClientSession(connector=connector) as session:
        pending = [scrape_async(session, u) for u in urls]
        return await asyncio.gather(*pending, return_exceptions=True)

# Run
urls = [...]  # placeholder for the real list of 1000 URLs
# asyncio.run() creates the event loop, drives scrape_all to
# completion and tears the loop down again.
results = asyncio.run(scrape_all(urls))
print(f"Scraped {len(results)} pages")

3. Semaphore Rate Limiting

import asyncio
import aiohttp

async def scrape_with_limit(session, url, semaphore):
    """Fetch *url* while holding one slot of *semaphore*.

    The semaphore bounds how many requests are in flight at once; the
    short sleep keeps the crawl polite toward the target server.
    """
    async with semaphore, session.get(url) as resp:
        await asyncio.sleep(0.1)  # Rate limit
        return await resp.text()

async def main(urls, max_concurrent=10):
    """Scrape all *urls*, never more than *max_concurrent* in flight."""
    gate = asyncio.Semaphore(max_concurrent)
    async with aiohttp.ClientSession() as session:
        jobs = [scrape_with_limit(session, u, gate) for u in urls]
        return await asyncio.gather(*jobs)

4. grequests (Simple Async)

import grequests

urls = [...]  # List of URLs

# Create async requests
# Build the unsent requests first, then fire them off in batches.
pending = [grequests.get(url) for url in urls]

# map() sends everything, keeping at most 10 requests in flight.
responses = grequests.map(pending, size=10)

# Failed requests come back as None, hence the truthiness check.
for resp in responses:
    if resp:
        print(resp.url, resp.status_code)

5. Multiprocessing (CPU-bound)

from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup

def scrape_and_parse(url):
    """Download *url* and run the expensive extraction over its HTML."""
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml')
    # The parsing is the CPU-bound part — this is exactly why the
    # function is dispatched to a process pool rather than threads.
    return extract_complex_data(soup)

# Use multiple CPU cores
# Use multiple CPU cores for the CPU-bound parsing step.
# The __main__ guard is mandatory with multiprocessing: under the
# 'spawn' start method (the default on Windows and macOS) every worker
# re-imports this module, and an unguarded Pool at module level would
# recursively spawn workers forever.
if __name__ == '__main__':
    with Pool(processes=8) as pool:
        results = pool.map(scrape_and_parse, urls)

6. Playwright Parallel

import asyncio
from playwright.async_api import async_playwright

async def scrape_page(browser, url):
    """Render *url* in a fresh tab and return the final HTML.

    Args:
        browser: A launched Playwright browser instance shared by callers.
        url: The page to load.

    Returns:
        The rendered page content as a string.
    """
    page = await browser.new_page()
    try:
        await page.goto(url)
        return await page.content()
    finally:
        # Close the tab even when goto()/content() raises; otherwise
        # every failed URL leaks an open page inside the shared browser.
        await page.close()

async def main(urls):
    """Scrape *urls* with one Chromium instance, at most 5 tabs at a time.

    Args:
        urls: Iterable of page URLs to render.

    Returns:
        List of rendered-HTML strings, in the same order as *urls*.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()

        # Scrape 5 pages concurrently
        semaphore = asyncio.Semaphore(5)

        async def limited_scrape(url):
            async with semaphore:
                return await scrape_page(browser, url)

        tasks = [limited_scrape(url) for url in urls]
        try:
            results = await asyncio.gather(*tasks)
        finally:
            # Close the browser even if a scrape raised, so the
            # Chromium process does not outlive this coroutine.
            await browser.close()
    return results

Best Practices

  • Start small: Begin with 5-10 concurrent, increase gradually
  • Add delays: 0.1-0.5s between requests per worker
  • Handle errors: Don't let a single failed request abort the whole batch
  • Monitor: Track success rates
  • Respect robots.txt: Crawl-delay directive

Choosing The Right Approach

Method Best For
ThreadPoolExecutor Simple, I/O-bound
asyncio/aiohttp High concurrency, performance
Multiprocessing CPU-intensive parsing
Playwright async JavaScript-heavy sites

VinaProxy + Concurrent Scraping

  • Rotating proxies cho mỗi request
  • No IP blocks với high concurrency
  • Giá chỉ $0.5/GB

Dùng Thử Ngay →