Visualize Scraped Data: Tạo Charts Từ Dữ Liệu Thu Thập
Dữ liệu thu thập được sẽ khó mang lại giá trị nếu không được trực quan hóa (visualize). Bài viết này hướng dẫn cách tạo biểu đồ (charts) từ dữ liệu đã scrape.
Tools Phổ Biến
- Matplotlib: Basic plotting
- Seaborn: Statistical visualization
- Plotly: Interactive charts
- Pandas: Quick plots
1. Matplotlib Basics
import matplotlib.pyplot as plt
import pandas as pd

# Load scraped data
df = pd.read_csv('products.csv')

# Bar chart - Top 10 products by price.
# nlargest() returns the rows ordered from highest to lowest price.
top_10 = df.nlargest(10, 'price')

plt.figure(figsize=(12, 6))
plt.barh(top_10['name'], top_10['price'])
# barh() places the first row at the bottom of the axis; flip the y-axis so
# the most expensive product appears at the top of the chart.
plt.gca().invert_yaxis()
plt.xlabel('Price (VND)')
plt.title('Top 10 Most Expensive Products')
plt.tight_layout()
plt.savefig('top_products.png')
plt.show()
2. Price Distribution
import matplotlib.pyplot as plt

# Histogram of the 'price' column, split into 50 bins.
# `df` is the products DataFrame loaded from products.csv in the first snippet.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['price'], bins=50, edgecolor='black')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
ax.set_title('Price Distribution')
fig.savefig('price_distribution.png')
3. Time Series
# Price tracking over time
prices_df = pd.read_csv('price_history.csv')
# Parse the date strings so matplotlib treats the x-axis as real dates.
prices_df['date'] = pd.to_datetime(prices_df['date'])

plt.figure(figsize=(12, 6))
# Limit the chart to the first five distinct products found in the file.
for product in prices_df['product'].unique()[:5]:
    product_data = prices_df[prices_df['product'] == product]
    # plot() connects points in row order; sort by date so the line moves
    # forward in time even if the CSV rows are not chronological.
    product_data = product_data.sort_values('date')
    plt.plot(product_data['date'], product_data['price'], label=product)
plt.xlabel('Date')
plt.ylabel('Price')
plt.title('Price Trends Over Time')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('price_trends.png')
4. Seaborn Statistical Plots
import seaborn as sns

# Box plot by category
plt.figure(figsize=(12, 6))
sns.boxplot(x='category', y='price', data=df)
plt.xticks(rotation=45)
plt.title('Price Distribution by Category')
plt.tight_layout()
plt.savefig('price_by_category.png')

# Heatmap - Correlation
numeric_cols = ['price', 'rating', 'reviews', 'sold']
correlation = df[numeric_cols].corr()
# Start a new figure: without it, heatmap() draws onto the current axes,
# i.e. on top of the box plot above, and correlation.png would contain both.
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Feature Correlation')
plt.tight_layout()
plt.savefig('correlation.png')
5. Plotly Interactive
import plotly.express as px

# Interactive scatter: price against rating, with marker size taken from
# 'reviews', colour from 'category', and the product name shown on hover.
fig = px.scatter(
    data_frame=df,
    title='Price vs Rating',
    x='price',
    y='rating',
    color='category',
    size='reviews',
    hover_name='name',
)
fig.write_html('interactive_scatter.html')
fig.show()

# Interactive line chart: one line per product from the price history data.
fig = px.line(
    data_frame=prices_df,
    title='Price History',
    x='date',
    y='price',
    color='product',
)
fig.write_html('price_history.html')
6. Dashboard với Streamlit
# app.py
import streamlit as st
import pandas as pd
import plotly.express as px

st.title('Scraped Data Dashboard')

# Load data
df = pd.read_csv('products.csv')

# Sidebar filters
# Drop missing categories first: a NaN option can never match the equality
# filter below, so selecting it would give an empty table and 'nan' metrics.
category = st.sidebar.selectbox('Category', df['category'].dropna().unique())
filtered = df[df['category'] == category]

# Metrics for the selected category, shown in three side-by-side columns
col1, col2, col3 = st.columns(3)
col1.metric('Total Products', len(filtered))
col2.metric('Avg Price', f"{filtered['price'].mean():,.0f}")
col3.metric('Avg Rating', f"{filtered['rating'].mean():.1f}")

# Chart: price of the first 20 rows in the selected category
fig = px.bar(filtered.head(20), x='name', y='price')
st.plotly_chart(fig)

# Run: streamlit run app.py
7. Export Charts
# High-quality export
# NOTE: `data` is a placeholder for the values to plot; it is not defined
# in this snippet.
plt.figure(figsize=(12, 8), dpi=300)
plt.plot(data)
plt.savefig('chart.png', dpi=300, bbox_inches='tight')

# PDF export
plt.savefig('chart.pdf', format='pdf')

# Multiple charts in one PDF
from matplotlib.backends.backend_pdf import PdfPages

# NOTE: `charts` and `create_chart` are placeholders not defined in this
# snippet; create_chart presumably returns a matplotlib Figure - confirm.
with PdfPages('report.pdf') as pdf:
    for chart in charts:
        fig = create_chart(chart)
        pdf.savefig(fig)
        # Close the figure that was just written. A bare plt.close() only
        # closes the *current* figure, which may not be `fig`.
        plt.close(fig)
VinaProxy + Data Visualization
- Scrape → Analyze → Visualize
- Build market intelligence dashboards
- Giá chỉ $0.5/GB
