WebSocket Scraping: Thu Thập Real-time Data

Trở lại Tin tức
Tin tức

WebSocket Scraping: Thu Thập Real-time Data

Nhiều sites dùng WebSocket để truyền real-time data. Bài viết này hướng dẫn cách scrape WebSocket streams.

WebSocket vs HTTP

  • HTTP: Request-response, one-time
  • WebSocket: Persistent connection, bi-directional
  • Use cases: Live prices, chat, notifications, gaming

Tìm WebSocket Connections

# Chrome DevTools:
# 1. Open Network tab
# 2. Filter by "WS"
# 3. Click on WebSocket connection
# 4. View Messages tab for data

# Look for URLs like:
# wss://stream.example.com/realtime
# wss://api.example.com/ws

Python websockets Library

import asyncio
import websockets
import json

async def scrape_websocket(url):
    """Subscribe to the 'prices' channel at ``url`` and store every message.

    Opens a WebSocket connection, sends a JSON subscribe request, then loops
    without an exit condition: each incoming message is decoded as JSON,
    printed, and passed to ``save_to_db``. The loop ends only when receiving,
    decoding, or saving raises.
    """
    subscribe_request = json.dumps({
        'action': 'subscribe',
        'channel': 'prices'
    })

    async with websockets.connect(url) as connection:
        # Ask the server to start sending the channel's messages.
        await connection.send(subscribe_request)

        # Consume the stream one message at a time.
        while True:
            data = json.loads(await connection.recv())
            print(f"Received: {data}")

            # Hand the decoded payload to the storage helper.
            save_to_db(data)

# Run
asyncio.run(scrape_websocket('wss://stream.example.com'))

Xử lý Timeout và Reconnect

import asyncio
import websockets

async def robust_websocket(url, timeout=30, reconnect_delay=5):
    """Consume messages from ``url`` forever, reconnecting after any failure.

    Args:
        url: WebSocket endpoint to connect to.
        timeout: Seconds to wait for a message before probing the connection
            with a ping; also the maximum wait for the matching pong.
        reconnect_delay: Seconds to sleep before opening a new connection
            after the previous one closed or raised.

    Every received message is passed to ``process``. The function has no
    normal exit; cancel the enclosing task to stop it.
    """
    while True:
        try:
            async with websockets.connect(url) as ws:
                while True:
                    # Only the receive call is guarded by the idle timeout, so
                    # a TimeoutError raised inside process() is not mistaken
                    # for an idle connection.
                    try:
                        message = await asyncio.wait_for(
                            ws.recv(),
                            timeout=timeout
                        )
                    except asyncio.TimeoutError:
                        # Idle: send a ping and wait for the pong. Awaiting
                        # ping() only sends the frame and hands back a waiter;
                        # the waiter must be awaited to learn whether the
                        # peer is still responding.
                        pong_waiter = await ws.ping()
                        try:
                            await asyncio.wait_for(pong_waiter, timeout=timeout)
                        except asyncio.TimeoutError:
                            raise TimeoutError(
                                f"no pong received within {timeout}s"
                            ) from None
                        continue
                    process(message)
        except websockets.ConnectionClosed:
            print("Connection closed, reconnecting...")
        except Exception as e:
            # Top-level boundary of the reconnect loop: report and retry.
            print(f"Error: {e}, reconnecting...")
        # Reached only after one of the handlers above ran; the try body
        # itself never completes normally.
        await asyncio.sleep(reconnect_delay)

Playwright WebSocket Interception

from playwright.sync_api import sync_playwright

def intercept_websocket(url, duration_ms=10000):
    """Open ``url`` in Chromium and collect WebSocket frames the page receives.

    Args:
        url: Page to navigate to.
        duration_ms: How long, in milliseconds, to keep the page open after
            navigation while frames are collected.

    Returns:
        A list of the payloads delivered to the 'framereceived' handler, in
        the order the handler was called.
    """
    messages = []

    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()

            # Attach a frame listener to every WebSocket the page opens.
            def handle_ws(ws):
                ws.on('framereceived', lambda payload: messages.append(payload))

            page.on('websocket', handle_ws)

            page.goto(url)
            page.wait_for_timeout(duration_ms)  # Collect for the requested window
        finally:
            # Close the browser even if navigation or the wait raised.
            browser.close()

    return messages

# Capture live data
data = intercept_websocket('https://trading-site.com')

Crypto Price Streaming (Binance BTC/USDT)

import asyncio
import websockets
import json

async def stream_stock_prices():
    """Print each trade message from the Binance ``btcusdt@trade`` stream.

    Connects to the hard-coded stream URL and loops without an exit
    condition. Every message is decoded as JSON; its ``'p'`` field is printed
    as a two-decimal price, followed by the ``'q'`` (quantity) and ``'T'``
    (time) fields and a separator line.
    """
    endpoint = 'wss://stream.binance.com:9443/ws/btcusdt@trade'

    async with websockets.connect(endpoint) as conn:
        while True:
            tick = json.loads(await conn.recv())

            price = float(tick['p'])
            print(f"BTC Price: ${price:.2f}")
            print(f"Quantity: {tick['q']}")
            print(f"Time: {tick['T']}")
            print("---")

asyncio.run(stream_stock_prices())

Socket.IO

import socketio

# Socket.IO client shared by the handlers below.
sio = socketio.Client()

@sio.on('connect')
def connect():
    """Announce the connection and subscribe to the 'updates' channel."""
    subscription = {'channel': 'updates'}
    print('Connected!')
    sio.emit('subscribe', subscription)

@sio.on('message')
def message(data):
    """Print an incoming 'message' event payload and pass it to ``save_data``."""
    print(f'Received: {data}')
    save_data(data)

@sio.on('disconnect')
def disconnect():
    """Announce that the client has disconnected."""
    print('Disconnected!')

# Connect, then block until the client disconnects.
sio.connect('https://realtime.example.com')
sio.wait()

Save Streaming Data

import asyncio
import websockets
import json
from datetime import datetime

async def stream_to_file(url, output_file):
    """Append every message received from ``url`` to ``output_file``.

    Each message becomes one record of the form ``<timestamp>|<message>``
    followed by a newline, where the timestamp is the naive local time in
    ISO-8601 format. The file is opened in append mode and flushed after
    every record. The loop has no exit condition; it ends only when
    receiving or writing raises.

    Args:
        url: WebSocket endpoint to connect to.
        output_file: Path of the log file to append to.
    """
    async with websockets.connect(url) as ws:
        # Pin the encoding: the platform default may be unable to encode
        # non-ASCII payloads and differs between machines.
        with open(output_file, 'a', encoding='utf-8') as f:
            while True:
                message = await ws.recv()
                timestamp = datetime.now().isoformat()
                # NOTE(review): a bytes message would be written as its
                # repr (b'...'), and a message containing a newline would
                # span several lines. Confirm the stream sends single-line text.
                f.write(f"{timestamp}|{message}\n")
                # Flush per record so the log is current if the process dies.
                f.flush()

# Stream to file
asyncio.run(stream_to_file('wss://stream.example.com', 'data.log'))

VinaProxy + WebSocket

  • Stable connections qua proxy
  • Multiple streams cùng lúc
  • Giá chỉ $0.5/GB

Dùng Thử Ngay →