Form Automation: Submit Forms Tự Động Với Python
Nhiều data chỉ accessible qua forms. Bài viết hướng dẫn automate form submissions để scrape data.
Use Cases
- Search forms: Query databases
- Login forms: Access protected content
- Filters: Apply search criteria
- Data entry: Bulk submissions
1. Requests với POST
import requests

# Simple form submission: these fields are sent form-encoded in the POST body.
data = {
    'username': 'user123',
    'password': 'pass456',
    'remember': 'true',
}

# Create the Session *before* logging in. requests follows the redirect that
# typically comes after a successful login, and response.cookies only holds
# the cookies set by the final response, so copying it into a fresh Session
# afterwards can miss the login cookie. A Session stores cookies from every
# hop of the redirect chain.
session = requests.Session()
response = session.post(
    'https://example.com/login',
    data=data,
    timeout=10,  # requests waits forever by default; never hang on a dead server
)

# Check if logged in
if 'Welcome' in response.text:
    print("Login successful!")
    # Continue with `session`: it already carries the login cookies.
2. Session Management
import requests

# One Session object keeps cookies (and the connection pool) across requests.
session = requests.Session()

# Login
login_data = {'email': 'user@example.com', 'password': 'secret'}
login_response = session.post(
    'https://example.com/login',
    data=login_data,
    timeout=10,
)
# Fail loudly on 4xx/5xx instead of continuing with an unauthenticated session.
login_response.raise_for_status()

# Now session has cookies, can access protected pages
response = session.get('https://example.com/dashboard', timeout=10)
# Without this check an error page would be stored as if it were the data.
response.raise_for_status()
protected_data = response.text
3. Handle CSRF Tokens
import requests
from bs4 import BeautifulSoup

# The token is tied to the cookies issued with the login page, so the GET
# and the POST must go through the same Session.
session = requests.Session()

# Get login page to find CSRF token
login_page = session.get('https://example.com/login', timeout=10)
login_page.raise_for_status()
soup = BeautifulSoup(login_page.text, 'lxml')

# select_one() returns None when nothing matches; check explicitly so a
# changed page layout gives a clear error rather than
# "'NoneType' object is not subscriptable".
token_input = soup.select_one('input[name="csrf_token"]')
if token_input is None or not token_input.get('value'):
    raise RuntimeError('csrf_token input not found on the login page')
csrf_token = token_input['value']

# Submit with CSRF token
data = {
    'csrf_token': csrf_token,
    'email': 'user@example.com',
    'password': 'secret',
}
response = session.post('https://example.com/login', data=data, timeout=10)
4. Playwright Form Filling
from playwright.sync_api import sync_playwright

# Drive a real browser: fill in the search form, submit it, and print the
# text of every result once the results container has rendered.
with sync_playwright() as pw:
    browser = pw.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com/search')

    # Populate the form: text input, dropdown, then checkbox.
    page.fill('input[name="keyword"]', 'python scraping')
    page.select_option('select[name="category"]', 'technology')
    page.check('input[name="in_stock"]')

    # Send the form and block until the results container appears.
    page.click('button[type="submit"]')
    page.wait_for_selector('.search-results')

    # Print the text of each result element.
    for hit in page.query_selector_all('.result-item'):
        print(hit.inner_text())

    browser.close()
5. Multi-step Forms
# Walk a three-step wizard. Before touching each new step, wait for an
# element that only exists on that step.
with sync_playwright() as pw:
    browser = pw.chromium.launch()
    page = browser.new_page()
    page.goto('https://example.com/wizard')

    # Step 1: personal details
    for selector, value in (
        ('#name', 'John Doe'),
        ('#email', 'john@example.com'),
    ):
        page.fill(selector, value)
    page.click('button.next')

    # Step 2: address (wait until the step has rendered)
    page.wait_for_selector('#address')
    for selector, value in (
        ('#address', '123 Main St'),
        ('#city', 'Ho Chi Minh'),
    ):
        page.fill(selector, value)
    page.click('button.next')

    # Step 3: confirmation screen, then final submit
    page.wait_for_selector('.confirmation')
    page.click('button.submit')

    # Read back the success message once it shows up.
    success_selector = '.success-message'
    page.wait_for_selector(success_selector)
    message = page.inner_text(success_selector)
    print(message)

    browser.close()
6. File Upload
# With requests
# Open the file in a `with` block so the handle is closed once the upload
# finishes (or fails); a bare open() inside the dict would leak it.
data = {'title': 'My Document'}
with open('file.pdf', 'rb') as document:
    # `files` makes requests send a multipart/form-data body; `data` carries
    # the ordinary form fields alongside the file part.
    files = {'document': document}
    response = requests.post(
        'https://example.com/upload',
        files=files,
        data=data,
        timeout=30,
    )

# With Playwright
# Assumes `page` is an open Playwright page, as in the earlier examples.
page.set_input_files('input[type="file"]', 'path/to/file.pdf')
7. Dropdown và Radio Buttons
# Playwright: assumes `page` is an open Playwright page, as in the earlier examples.
# Select dropdown: three alternative ways to choose an option in the same
# <select>. Presumably only one is needed in real code; run in sequence,
# each call re-selects on '#country'.
page.select_option('#country', 'vietnam')  # positional argument: the option to select
page.select_option('#country', label='Vietnam')  # by the option's label
page.select_option('#country', index=5)  # by position in the option list
# Radio buttons: pick one choice of the group by matching its value attribute
page.check('input[name="gender"][value="male"]')
# Checkboxes: check() turns a box on, uncheck() turns it off
page.check('#agree_terms')
page.uncheck('#newsletter')
8. JavaScript Forms
# Some forms need JavaScript execution.
# page.evaluate() runs the script inside the page: it sets the value of
# '#hidden_field' directly in the DOM, then submits the first <form> on the
# page. Assumes `page` is an open Playwright page.
# NOTE(review): per the HTML spec, form.submit() does not fire the `submit`
# event or run constraint validation. Confirm the target form does not rely
# on a submit handler.
page.evaluate('''
document.querySelector('#hidden_field').value = 'secret_value';
document.querySelector('form').submit();
''')
VinaProxy + Form Automation
- Submit forms từ different IPs
- Bypass rate limits
- Giá chỉ $0.5/GB
