WebSocket Scraping: Thu Thập Real-time Data
Nhiều site dùng WebSocket để truyền real-time data. Bài viết này hướng dẫn cách scrape các WebSocket stream.
WebSocket vs HTTP
- HTTP: Request-response, one-time
- WebSocket: Persistent connection, bi-directional
- Use cases: Live prices, chat, notifications, gaming
Tìm WebSocket Connections
# Chrome DevTools:
# 1. Open Network tab
# 2. Filter by "WS"
# 3. Click on WebSocket connection
# 4. View Messages tab for data
# Look for URLs like:
# wss://stream.example.com/realtime
# wss://api.example.com/ws
Python websockets Library
import asyncio
import websockets
import json
async def scrape_websocket(url):
    """Subscribe to the 'prices' channel at *url* and store every message.

    Opens a WebSocket connection, sends one JSON subscription request, then
    loops forever: each received frame is parsed as JSON, printed, and passed
    to ``save_to_db``. The coroutine ends only when an exception propagates
    out of the loop.
    """
    subscription = json.dumps({
        'action': 'subscribe',
        'channel': 'prices'
    })
    async with websockets.connect(url) as conn:
        # Ask the server to start pushing the channel we care about.
        await conn.send(subscription)

        # Consume frames one at a time; there is no exit condition here.
        while True:
            data = json.loads(await conn.recv())
            print(f"Received: {data}")
            # save_to_db is not defined in this snippet; it must be
            # provided elsewhere.
            save_to_db(data)


# Run
asyncio.run(scrape_websocket('wss://stream.example.com'))
Với Timeout và Reconnect
import asyncio
import websockets
async def robust_websocket(url, timeout=30, reconnect_delay=5):
    """Consume messages from *url* forever, reconnecting after any failure.

    Args:
        url: WebSocket endpoint to connect to.
        timeout: Seconds to wait for a message before sending a ping.
        reconnect_delay: Seconds to pause before each reconnection attempt.

    Every received message is passed to ``process`` (not defined in this
    snippet; it must be provided elsewhere). The coroutine never returns on
    its own.
    """
    while True:
        try:
            async with websockets.connect(url) as ws:
                while True:
                    try:
                        message = await asyncio.wait_for(
                            ws.recv(),
                            timeout=timeout
                        )
                    except asyncio.TimeoutError:
                        # Send ping to keep alive
                        await ws.ping()
                    else:
                        # Kept outside the try body so a timeout raised while
                        # processing is not mistaken for an idle socket.
                        process(message)
        except websockets.ConnectionClosed:
            print("Connection closed, reconnecting...")
        except Exception as e:
            # Boundary of the reconnect loop: report the error and retry.
            print(f"Error: {e}, reconnecting...")
        # Shared back-off for both failure paths before reconnecting.
        await asyncio.sleep(reconnect_delay)
Playwright WebSocket Interception
from playwright.sync_api import sync_playwright
def intercept_websocket(url):
    """Open *url* in a Playwright Chromium page and record WebSocket frames.

    Each payload delivered to a 'framereceived' handler is appended to a
    list. After navigation the page is kept open for 10 seconds, then the
    browser is closed and the collected payloads are returned.
    """
    captured = []

    def record_frame(payload):
        captured.append(payload)

    def attach_listener(socket):
        # Receives the object passed with the page's 'websocket' event.
        socket.on('framereceived', record_frame)

    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        tab = chromium.new_page()
        # Intercept WebSocket messages; registered before navigation starts.
        tab.on('websocket', attach_listener)
        tab.goto(url)
        tab.wait_for_timeout(10000)  # Collect for 10 seconds
        chromium.close()

    return captured


# Capture live data
data = intercept_websocket('https://trading-site.com')
Stock Price Streaming
import asyncio
import websockets
import json
async def stream_stock_prices():
    """Print every message from the Binance ``btcusdt@trade`` stream.

    Each message is decoded as JSON and its 'p', 'q' and 'T' fields are
    printed as price, quantity and time, followed by a separator line.
    Runs until an exception propagates out of the loop.
    """
    endpoint = 'wss://stream.binance.com:9443/ws/btcusdt@trade'
    async with websockets.connect(endpoint) as stream:
        while True:
            event = json.loads(await stream.recv())
            # 'p' is converted to float so it can be shown with two decimals.
            price = float(event['p'])
            print(f"BTC Price: ${price:.2f}")
            print(f"Quantity: {event['q']}")
            print(f"Time: {event['T']}")
            print("---")


asyncio.run(stream_stock_prices())
Socket.IO
import socketio
sio = socketio.Client()


@sio.on('connect')
def connect():
    """Announce the connection and emit a 'subscribe' event for 'updates'."""
    print('Connected!')
    sio.emit('subscribe', {'channel': 'updates'})


@sio.on('message')
def message(data):
    """Print the payload of a 'message' event and pass it to ``save_data``."""
    print(f'Received: {data}')
    # save_data is not defined in this snippet; it must be provided elsewhere.
    save_data(data)


@sio.on('disconnect')
def disconnect():
    """Report that the client has disconnected."""
    print('Disconnected!')


# Connect
sio.connect('https://realtime.example.com')
sio.wait()
Save Streaming Data
import asyncio
import websockets
import json
from datetime import datetime
async def stream_to_file(url, output_file):
    """Append every message received from *url* to *output_file*.

    Args:
        url: WebSocket endpoint to connect to.
        output_file: Path of the log file, opened in append mode.

    Each line is written as ``<timestamp>|<message>``, where the timestamp
    is ``datetime.now().isoformat()``. Runs until an exception propagates
    out of the loop.
    """
    async with websockets.connect(url) as ws:
        # Explicit UTF-8 so non-ASCII messages do not depend on the
        # platform's default encoding.
        with open(output_file, 'a', encoding='utf-8') as f:
            while True:
                message = await ws.recv()
                timestamp = datetime.now().isoformat()
                f.write(f"{timestamp}|{message}\n")
                # Flush after every message so the file stays current.
                f.flush()


# Stream to file
asyncio.run(stream_to_file('wss://stream.example.com', 'data.log'))
VinaProxy + WebSocket
- Stable connections qua proxy
- Multiple streams cùng lúc
- Giá chỉ $0.5/GB
