Proxy Cho Social Media Scraping: Facebook, Instagram, TikTok

Trở lại Tin tức
Tin tức

Proxy Cho Social Media Scraping: Facebook, Instagram, TikTok

Các nền tảng social media có hệ thống anti-bot thuộc hàng mạnh nhất hiện nay. Bài viết này hướng dẫn cách scrape social media với proxy.

Tại Sao Social Media Khó Scrape?

  • Rate limiting: Giới hạn requests nghiêm ngặt
  • Login required: Nhiều data cần auth
  • JavaScript heavy: Dynamic rendering
  • Advanced detection: Fingerprinting, behavior analysis

Chọn Proxy Đúng

| Platform | Recommended Proxy | Lý Do |
|---|---|---|
| Facebook | Residential | Strict IP checks |
| Instagram | Mobile/Residential | Mobile-first |
| TikTok | Mobile | App-native detection |
| Twitter/X | Residential | Rate limit per IP |
| LinkedIn | Residential sticky | Session-based |

Facebook Scraping

from playwright.sync_api import sync_playwright

def scrape_facebook_page(page_url, max_scrolls=3, headless=False):
    """Collect the text of the posts rendered on a Facebook page.

    Opens the page in a proxied Chromium instance, scrolls down a few times
    so that more posts get loaded, and returns the text of every element
    matching the post-message selector.

    Args:
        page_url: URL of the Facebook page to open.
        max_scrolls: Number of 1000px scroll steps to perform; each step is
            followed by a 2 second pause. Defaults to 3.
        headless: Whether to run the browser without a visible window.
            Defaults to False (visible, convenient for debugging); pass True
            on machines without a display.

    Returns:
        A list of strings, one per matched post element. Empty when nothing
        matches the selector.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(
            proxy={
                'server': 'http://proxy.vinaproxy.com:8080',
                'username': 'user',
                # NOTE(review): the "_country-vn" suffix presumably selects the
                # exit country so it matches the target audience - confirm
                # against the proxy provider's documentation.
                'password': 'pass_country-vn'
            },
            headless=headless
        )

        # Close the browser even when navigation or extraction raises.
        try:
            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64)...'
            )

            page = context.new_page()
            page.goto(page_url, wait_until='networkidle')

            # Scroll to load more posts.
            for _ in range(max_scrolls):
                page.evaluate('window.scrollBy(0, 1000)')
                page.wait_for_timeout(2000)

            # Extract the text of each post element currently in the DOM.
            posts = page.query_selector_all('[data-testid="post_message"]')
            return [post.inner_text() for post in posts]
        finally:
            browser.close()

Instagram Scraping

import requests
import time

def scrape_instagram_profile(username, proxy, timeout=30):
    """Fetch basic counters for a public Instagram profile.

    Sends a single GET request to Instagram's web profile-info endpoint
    through the given proxy and extracts a few fields from the JSON reply.

    Args:
        username: Instagram handle to look up. It is sent as a query
            parameter, so special characters are URL-encoded.
        proxy: Proxy URL used for both HTTP and HTTPS traffic.
        timeout: Seconds to wait for the server before giving up.
            Defaults to 30. Without a timeout the request could block
            indefinitely on a stalled proxy.

    Returns:
        A dict with the keys 'followers', 'following', 'posts' and 'bio',
        or None when the response status is not 200, the body is not JSON,
        or the expected fields are missing.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    headers = {
        'User-Agent': 'Instagram 219.0.0.12.117 Android',
        'Accept': '*/*',
    }

    # The same proxy handles both schemes.
    proxies = {'http': proxy, 'https': proxy}

    url = 'https://www.instagram.com/api/v1/users/web_profile_info/'

    response = requests.get(
        url,
        params={'username': username},
        headers=headers,
        proxies=proxies,
        timeout=timeout,
    )

    if response.status_code != 200:
        return None

    try:
        user = response.json()['data']['user']
        return {
            'followers': user['edge_followed_by']['count'],
            'following': user['edge_follow']['count'],
            'posts': user['edge_owner_to_timeline_media']['count'],
            'bio': user['biography']
        }
    except (ValueError, KeyError, TypeError):
        # ValueError: body was not JSON; KeyError/TypeError: the payload
        # lacks the expected structure (e.g. 'user' is null).
        return None

# Route the lookup through a mobile proxy (credentials are part of the URL).
proxy = 'http://user:pass_type-mobile@proxy.vinaproxy.com:8080'
profile = scrape_instagram_profile(username='nike', proxy=proxy)

TikTok Scraping

def scrape_tiktok_user(username, proxy):
    """Scrape follower and like counters from a TikTok profile page.

    Opens the profile in a proxied Chromium instance configured with a
    phone-sized viewport, waits for the post grid to appear, and reads the
    counters from the page.

    Args:
        username: TikTok handle, without the leading '@'.
        proxy: Proxy URL. Credentials may be embedded
            ('http://user:pass@host:port'); they are split out and passed
            to Playwright as separate fields.

    Returns:
        A dict with the keys 'followers' and 'likes'. Each value is the
        element's text, or None when the element is not found.

    Raises:
        playwright.sync_api.TimeoutError: When no post item appears within
            10 seconds.
    """
    from urllib.parse import unquote, urlsplit

    from playwright.sync_api import sync_playwright

    # Playwright takes proxy credentials as separate 'username'/'password'
    # fields; credentials left inside the server URL are not reliably
    # honoured by the browser, so split them out when present.
    proxy_settings = {'server': proxy}
    if '://' in proxy and '@' in proxy:
        parts = urlsplit(proxy)
        if parts.username is not None:
            host_and_port = parts.netloc.rpartition('@')[2]
            proxy_settings = {
                'server': f'{parts.scheme}://{host_and_port}',
                'username': unquote(parts.username),
                'password': unquote(parts.password or ''),
            }

    with sync_playwright() as p:
        browser = p.chromium.launch(proxy=proxy_settings)

        # Close the browser even when the wait below times out.
        try:
            # Mobile viewport for TikTok.
            context = browser.new_context(
                viewport={'width': 390, 'height': 844},
                user_agent='Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X)...',
                device_scale_factor=3
            )

            page = context.new_page()
            page.goto(f'https://www.tiktok.com/@{username}')
            page.wait_for_selector('[data-e2e="user-post-item"]', timeout=10000)

            # Extract stats; either element may be absent.
            followers = page.query_selector('[data-e2e="followers-count"]')
            likes = page.query_selector('[data-e2e="likes-count"]')

            return {
                'followers': followers.inner_text() if followers else None,
                'likes': likes.inner_text() if likes else None
            }
        finally:
            browser.close()

Best Practices

1. Slow Down

import random
import time

def human_delay(min_seconds=3, max_seconds=8):
    """Sleep for a random, human-like interval.

    Args:
        min_seconds: Lower bound of the pause, in seconds. Defaults to 3.
        max_seconds: Upper bound of the pause, in seconds. Defaults to 8.

    Raises:
        ValueError: If the bounds are negative or min_seconds exceeds
            max_seconds.
    """
    if min_seconds < 0 or max_seconds < min_seconds:
        raise ValueError(
            'expected 0 <= min_seconds <= max_seconds, '
            f'got {min_seconds!r} and {max_seconds!r}'
        )
    # A uniformly random pause avoids the fixed cadence of a fixed sleep.
    time.sleep(random.uniform(min_seconds, max_seconds))

2. Session Persistence

# Use a sticky session for social platforms: the session id and its lifetime
# are carried in the password part, keeping the same IP for 30 minutes.
proxy = (
    'http://user:pass_session-abc123_time-30'
    '@proxy.vinaproxy.com:8080'
)

3. Rotate User Agents

# Pool of User-Agent strings to rotate through (iPhone, Android, Windows).
user_agents = [
    'Mozilla/5.0 (iPhone; CPU iPhone OS 15_0...',
    'Mozilla/5.0 (Linux; Android 12...',
    'Mozilla/5.0 (Windows NT 10.0...',
]

# Request headers carrying one randomly selected User-Agent from the pool.
headers = {}
headers['User-Agent'] = random.choice(user_agents)

4. Handle Rate Limits

def request_with_backoff(url, proxies, max_retries=5, timeout=30):
    """GET a URL, backing off exponentially while the server answers 429.

    Args:
        url: Address to request.
        proxies: Proxy mapping passed through to ``requests.get``.
        max_retries: Maximum number of attempts. Defaults to 5.
        timeout: Seconds to wait for each response. Defaults to 30. Without
            a timeout a stalled connection would block forever.

    Returns:
        The first response whose status is not 429 (whatever that status
        is), or None when every attempt was rate limited.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            a request times out.
    """
    for attempt in range(max_retries):
        response = requests.get(url, proxies=proxies, timeout=timeout)

        if response.status_code != 429:
            return response

        # No attempts left: give up right away instead of sleeping through a
        # backoff period that no retry would follow.
        if attempt == max_retries - 1:
            break

        # Exponential backoff: 60s, 120s, 240s, ...
        wait = 2 ** attempt * 60
        print(f"Rate limited, waiting {wait}s")
        time.sleep(wait)

    return None

Legal Considerations

  • Chỉ scrape public data
  • Respect ToS
  • Don’t overload servers
  • GDPR compliance cho personal data

VinaProxy Cho Social

  • Mobile proxies available
  • Residential IPs trusted
  • Sticky sessions support
  • Giá chỉ $0.5/GB

Social Scraping Ngay →