Proxy Cho Social Media Scraping: Facebook, Instagram, TikTok
Social media có anti-bot mạnh nhất. Bài viết hướng dẫn scrape social media với proxy.
Tại Sao Social Media Khó Scrape?
- Rate limiting: Giới hạn requests nghiêm ngặt
- Login required: Nhiều data cần auth
- JavaScript heavy: Dynamic rendering
- Advanced detection: Fingerprinting, behavior analysis
Chọn Proxy Đúng
| Platform | Recommended Proxy | Lý Do |
|---|---|---|
| Facebook | Residential | Strict IP checks |
| Instagram | Mobile/Residential | Mobile-first |
| TikTok | Mobile | App-native detection |
| Twitter/X | Residential | Rate limit per IP |
| LinkedIn | Residential sticky | Session-based |
Facebook Scraping
from playwright.sync_api import sync_playwright
def scrape_facebook_page(page_url):
    """Collect the text of the posts visible on a Facebook page.

    Launches a headed Chromium instance through the configured proxy,
    opens ``page_url``, scrolls a few times so more posts are loaded, and
    reads the text of every element matching
    ``[data-testid="post_message"]``.

    Args:
        page_url: URL of the Facebook page to open.

    Returns:
        A list with the inner text of each matched post element (empty if
        nothing matched).

    Raises:
        Whatever Playwright raises on navigation or launch failure; the
        browser is closed before the exception propagates.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(
            proxy={
                'server': 'http://proxy.vinaproxy.com:8080',
                'username': 'user',
                'password': 'pass_country-vn'  # Match target audience
            },
            headless=False  # Visible for debugging
        )
        # Close the browser explicitly even when navigation or extraction
        # fails, instead of only on the success path.
        try:
            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64)...'
            )
            page = context.new_page()
            page.goto(page_url, wait_until='networkidle')

            # Scroll to trigger loading of more posts, pausing after each
            # step so the newly requested content has time to render.
            for _ in range(3):
                page.evaluate('window.scrollBy(0, 1000)')
                page.wait_for_timeout(2000)

            # Extract posts
            posts = page.query_selector_all('[data-testid="post_message"]')
            return [post.inner_text() for post in posts]
        finally:
            browser.close()
Instagram Scraping
import requests
import time
def scrape_instagram_profile(username, proxy, timeout=30):
    """Fetch basic public statistics for an Instagram profile.

    Sends one GET request to Instagram's ``web_profile_info`` endpoint
    through ``proxy`` and extracts the follower/following/post counts and
    the biography from the JSON response.

    Args:
        username: Instagram handle to look up. It is passed as a query
            parameter so special characters are URL-encoded.
        proxy: Proxy URL used for both HTTP and HTTPS traffic.
        timeout: Seconds to wait for the server before giving up, so a
            stalled proxy cannot block the caller forever.

    Returns:
        A dict with the keys ``followers``, ``following``, ``posts`` and
        ``bio``, or ``None`` when the response status is not 200 or the
        body does not contain a user record.

    Raises:
        requests.exceptions.RequestException: On connection errors or
            timeouts.
    """
    headers = {
        'User-Agent': 'Instagram 219.0.0.12.117 Android',
        'Accept': '*/*',
    }
    # Mobile proxy recommended
    proxies = {'http': proxy, 'https': proxy}

    # Instagram API endpoint (public)
    url = 'https://www.instagram.com/api/v1/users/web_profile_info/'
    response = requests.get(
        url,
        params={'username': username},
        headers=headers,
        proxies=proxies,
        timeout=timeout,
    )
    if response.status_code != 200:
        return None

    # A 200 response can still carry a non-JSON body or a null/missing
    # user record; treat those like any other failed lookup.
    try:
        user = response.json()['data']['user']
    except (ValueError, KeyError, TypeError):
        return None
    if not user:
        return None

    try:
        return {
            'followers': user['edge_followed_by']['count'],
            'following': user['edge_follow']['count'],
            'posts': user['edge_owner_to_timeline_media']['count'],
            'bio': user['biography']
        }
    except (KeyError, TypeError):
        return None
# Use mobile proxy
# NOTE(review): the `_type-mobile` suffix on the password presumably selects
# the provider's mobile pool — confirm against the provider's documentation.
proxy = 'http://user:pass_type-mobile@proxy.vinaproxy.com:8080'
# Example lookup of the public 'nike' profile through that proxy.
profile = scrape_instagram_profile('nike', proxy)
TikTok Scraping
def _playwright_proxy_settings(proxy):
    """Convert a proxy URL into the dict expected by Playwright's ``proxy`` option."""
    from urllib.parse import unquote

    scheme, separator, remainder = proxy.partition('://')
    if not separator or '@' not in remainder:
        # No inline credentials: pass the value through untouched.
        return {'server': proxy}

    # Playwright takes credentials as separate fields rather than inside
    # the server URL, so split ``user:pass@host:port`` apart.
    credentials, _, host = remainder.rpartition('@')
    username, _, password = credentials.partition(':')
    return {
        'server': f'{scheme}://{host}',
        'username': unquote(username),
        'password': unquote(password),
    }


def scrape_tiktok_user(username, proxy):
    """Scrape follower and like counts from a TikTok user page.

    Opens ``https://www.tiktok.com/@<username>`` in Chromium with a
    phone-sized viewport, waits for the first post tile to appear, and
    reads the follower and like counters.

    Args:
        username: TikTok handle without the leading ``@``.
        proxy: Proxy URL. Credentials embedded as ``user:pass@`` are
            forwarded to Playwright as separate username/password fields.

    Returns:
        A dict with the keys ``followers`` and ``likes``; each value is the
        element's text, or ``None`` when the element was not found.

    Raises:
        Whatever Playwright raises, e.g. a timeout when no post item
        appears within 10 seconds. The browser is closed before the
        exception propagates.
    """
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(proxy=_playwright_proxy_settings(proxy))
        # Close the browser explicitly on every exit path, not only on
        # success.
        try:
            # Mobile viewport for TikTok
            context = browser.new_context(
                viewport={'width': 390, 'height': 844},
                user_agent='Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X)...',
                device_scale_factor=3
            )
            page = context.new_page()
            page.goto(f'https://www.tiktok.com/@{username}')
            page.wait_for_selector('[data-e2e="user-post-item"]', timeout=10000)

            # Extract stats
            followers = page.query_selector('[data-e2e="followers-count"]')
            likes = page.query_selector('[data-e2e="likes-count"]')
            return {
                'followers': followers.inner_text() if followers else None,
                'likes': likes.inner_text() if likes else None
            }
        finally:
            browser.close()
Best Practices
1. Slow Down
import random
import time
def human_delay(min_seconds=3, max_seconds=8):
    """Sleep for a random, human-like interval.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.

    Returns:
        The number of seconds actually slept, which is useful for logging.

    Raises:
        ValueError: If the drawn delay is negative (raised by
            ``time.sleep``).
    """
    delay = random.uniform(min_seconds, max_seconds)
    time.sleep(delay)
    return delay
2. Session Persistence
# Use a sticky session for social platforms
# NOTE(review): `session-abc123` presumably names the session and `time-30`
# its lifetime in minutes — confirm against the provider's documentation.
proxy = 'http://user:pass_session-abc123_time-30@proxy.vinaproxy.com:8080'
# Same IP for 30 minutes
3. Rotate User Agents
# Pool of User-Agent strings to rotate through. The values shown here are
# truncated placeholders (they end in '...').
user_agents = [
'Mozilla/5.0 (iPhone; CPU iPhone OS 15_0...',
'Mozilla/5.0 (Linux; Android 12...',
'Mozilla/5.0 (Windows NT 10.0...',
]
# Pick one entry at random; this runs once, when the line is executed.
headers = {'User-Agent': random.choice(user_agents)}
4. Handle Rate Limits
def request_with_backoff(url, proxies, max_retries=5, base_delay=60, timeout=30):
    """GET ``url`` and retry with exponential backoff while rate limited.

    Args:
        url: Address to request.
        proxies: Proxy mapping passed straight to ``requests.get``.
        max_retries: Maximum number of attempts.
        base_delay: Seconds to wait after the first HTTP 429; the wait
            doubles on every further attempt.
        timeout: Seconds to wait for each response, so a stalled proxy
            cannot block the loop indefinitely.

    Returns:
        The first response whose status is not 429, or ``None`` if every
        attempt was rate limited.

    Raises:
        requests.exceptions.RequestException: On connection errors or
            timeouts.
    """
    for attempt in range(max_retries):
        response = requests.get(url, proxies=proxies, timeout=timeout)
        if response.status_code != 429:
            return response

        # Rate limited. Do not sleep after the final attempt: nothing
        # would be retried afterwards.
        if attempt == max_retries - 1:
            break

        # Prefer the server's own hint when it is a plain number of
        # seconds; otherwise fall back to exponential backoff.
        retry_after = response.headers.get('Retry-After', '').strip()
        if retry_after.isascii() and retry_after.isdigit():
            wait = int(retry_after)
        else:
            wait = 2 ** attempt * base_delay
        print(f"Rate limited, waiting {wait}s")
        time.sleep(wait)
    return None
Legal Considerations
- Chỉ scrape public data
- Respect ToS
- Don’t overload servers
- GDPR compliance cho personal data
VinaProxy Cho Social
- Mobile proxies available
- Residential IPs trusted
- Sticky sessions support
- Giá chỉ $0.5/GB
