Xử Lý JSON Trong Web Scraping: Parse Và Extract Data
Nhiều APIs trả về JSON thay vì HTML. Bài viết hướng dẫn parse và extract JSON hiệu quả.
JSON Là Gì?
{
"products": [
{"id": 1, "name": "iPhone", "price": 999},
{"id": 2, "name": "Samsung", "price": 899}
],
"total": 2,
"page": 1
}
Parse JSON Response
import requests
# A timeout keeps the scraper from hanging forever on an unresponsive server
response = requests.get('https://api.example.com/products', timeout=10)
# Fail fast on 4xx/5xx instead of trying to parse an error page as JSON
response.raise_for_status()

# Parse the JSON body automatically
data = response.json()

# Or parse it manually from the raw text
import json
data = json.loads(response.text)

# Access the data
for product in data['products']:
    print(f"{product['name']}: ${product['price']}")
Nested JSON
# Deeply nested structure: dicts inside dicts, with a list at the innermost level
data = {
    "store": {
        "products": {
            "electronics": [
                {"name": "TV", "specs": {"size": 55}},
            ],
        },
    },
}

# Access nested values one level at a time: dict keys, then a list index
electronics = data['store']['products']['electronics']
tv_size = electronics[0]['specs']['size']
print(tv_size)  # 55
Safe Access (Avoid KeyError)
# Method 1: .get() with a default -- assuming `data` is a dict, a missing key
# yields the fallback value instead of raising KeyError
name = data.get('name', 'Unknown')
price = data.get('price', 0)
# Method 2: Nested safe access
def safe_get(data, *keys, default=None):
    """Return the value found by following *keys* through nested dicts/lists.

    Each key is applied in order: dict levels are looked up by key, list
    levels by integer index (negative indices count from the end, as usual).

    Args:
        data: The root dict or list to traverse.
        *keys: The path to follow, one dict key or list index per level.
        default: Value returned as soon as any step of the path is missing.

    Returns:
        The value at the end of the path, or ``default`` if a dict key is
        absent, a list index is out of range, or an intermediate value is
        neither a dict nor a list indexed by an int.
    """
    # Private sentinel so a stored None (or a value equal to `default`) is
    # never mistaken for a missing key.
    missing = object()
    for key in keys:
        if isinstance(data, dict):
            data = data.get(key, missing)
            if data is missing:
                # Stop here: continuing would traverse into `default` itself.
                return default
        elif isinstance(data, list) and isinstance(key, int):
            try:
                data = data[key]
            except IndexError:
                # Covers both too-large and too-negative indices.
                return default
        else:
            return default
    return data
# Usage: if any step of this path is missing, the result is 'N/A'
specs = safe_get(data, 'products', 0, 'specs', 'size', default='N/A')
JSON Lines (JSONL)
# File with one JSON object per line, e.g.:
# {"id": 1, "name": "A"}
# {"id": 2, "name": "B"}
# JSON text is UTF-8; do not rely on the platform's default encoding
with open('data.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Blank lines (e.g. an extra newline at the end of the file) are not
        # valid JSON and would make json.loads raise; skip them
        if not line:
            continue
        item = json.loads(line)
        print(item['name'])
Flatten Nested JSON
def flatten_json(data, prefix=''):
    """Collapse nested dicts into a single-level dict with dot-joined keys.

    Args:
        data: The dict to flatten. Values that are dicts are expanded
            recursively; every other value (including lists) is kept as-is.
        prefix: Key path prepended to every top-level key, joined with a dot.
            When empty, top-level keys are used unchanged.

    Returns:
        A new dict mapping each dotted key path to its non-dict value.
    """
    def pairs(mapping, parent):
        # Depth-first walk yielding (dotted_key, value) for each non-dict value.
        for name, child in mapping.items():
            path = name if not parent else f"{parent}.{name}"
            if isinstance(child, dict):
                yield from pairs(child, path)
            else:
                yield path, child

    return dict(pairs(data, prefix))
# Input: a dict nested two levels deep under "user"
nested = {"user": {"name": "John", "address": {"city": "HN"}}}
# Output: each non-dict value keyed by its dot-joined path
flat = flatten_json(nested)
# {'user.name': 'John', 'user.address.city': 'HN'}
JSON To CSV
import json
import csv
# Expects data.json to hold a list of flat dicts (one dict per CSV row)
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect column names from every record (first-seen order), not just the
# first one: DictWriter raises ValueError on a row containing an unknown key,
# and indexing data[0] would fail on an empty list.
fieldnames = list(dict.fromkeys(key for row in data for key in row))

# newline='' lets the csv module control line endings itself
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    # Keys missing from a record are written as empty cells (DictWriter's default restval)
    writer.writerows(data)
Handle Large JSON
# Stream parse với ijson
import ijson
with open('large.json', 'rb') as stream:
    # Iterate the elements under the 'products.item' prefix one at a time
    products = ijson.items(stream, 'products.item')
    for product in products:
        process(product)  # The whole document is never loaded into RAM
VinaProxy + JSON APIs
- Scrape JSON APIs không bị block
- Rotate IPs cho rate limits
- Giá chỉ $0.5/GB
