Visualize Scraped Data: Tạo Charts Từ Dữ Liệu Thu Thập

Dữ liệu thu thập được sẽ khó mang lại giá trị nếu không được trực quan hóa (visualize). Bài viết này hướng dẫn cách tạo charts từ scraped data.

Tools Phổ Biến

Matplotlib: Basic plotting
Seaborn: Statistical visualization
Plotly: Interactive charts
Pandas: Quick plots

1. Matplotlib Basics

import matplotlib.pyplot as plt
import pandas as pd

# Load scraped data
df = pd.read_csv('products.csv')

# Bar chart - Top 10 products by price
# nlargest returns the rows ordered from highest to lowest price.
top_10 = df.nlargest(10, 'price')
plt.figure(figsize=(12, 6))
plt.barh(top_10['name'], top_10['price'])
# barh places the first row at the bottom of the axis; flip the y-axis so
# the most expensive product is listed at the top of the chart.
plt.gca().invert_yaxis()
plt.xlabel('Price (VND)')
plt.title('Top 10 Most Expensive Products')
plt.tight_layout()
plt.savefig('top_products.png')
plt.show()

2. Price Distribution

import matplotlib.pyplot as plt

# Histogram of the 'price' column, split into 50 bins
plt.figure(figsize=(10, 6))
# Drop missing prices explicitly so the result does not depend on how the
# installed Matplotlib/NumPy versions treat NaN values.
plt.hist(df['price'].dropna(), bins=50, edgecolor='black')
plt.xlabel('Price')
plt.ylabel('Count')
plt.title('Price Distribution')
# Fit labels and title inside the figure before writing the image.
plt.tight_layout()
plt.savefig('price_distribution.png')

3. Time Series

# Price tracking over time
prices_df = pd.read_csv('price_history.csv')
prices_df['date'] = pd.to_datetime(prices_df['date'])

plt.figure(figsize=(12, 6))
# Plot only the first five distinct products (in order of appearance) to
# keep the chart readable.
for product in prices_df['product'].unique()[:5]:
    # plot() connects points in row order, so sort each product's rows by
    # date; otherwise an unsorted CSV produces zig-zag lines.
    product_data = prices_df[prices_df['product'] == product].sort_values('date')
    plt.plot(product_data['date'], product_data['price'], label=product)

plt.xlabel('Date')
plt.ylabel('Price')
plt.title('Price Trends Over Time')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('price_trends.png')

4. Seaborn Statistical Plots

import seaborn as sns

# Box plot by category
plt.figure(figsize=(12, 6))
sns.boxplot(x='category', y='price', data=df)
plt.xticks(rotation=45)
plt.title('Price Distribution by Category')
plt.tight_layout()
plt.savefig('price_by_category.png')

# Heatmap - Correlation
numeric_cols = ['price', 'rating', 'reviews', 'sold']
correlation = df[numeric_cols].corr()
# Start a fresh figure: without it the heatmap is drawn into the current
# axes, i.e. on top of the box plot created above, and correlation.png
# would contain both charts.
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Feature Correlation')
plt.tight_layout()
plt.savefig('correlation.png')

5. Plotly Interactive

import plotly.express as px

# Interactive scatter: marker size comes from the 'reviews' column and
# marker colour from the 'category' column.
scatter_options = {
    'x': 'price',
    'y': 'rating',
    'size': 'reviews',
    'color': 'category',
    'hover_name': 'name',
    'title': 'Price vs Rating',
}
fig = px.scatter(df, **scatter_options)
fig.write_html('interactive_scatter.html')
fig.show()

# Interactive line chart: one line per value of the 'product' column.
line_options = {
    'x': 'date',
    'y': 'price',
    'color': 'product',
    'title': 'Price History',
}
fig = px.line(prices_df, **line_options)
fig.write_html('price_history.html')

6. Dashboard với Streamlit

# app.py
import streamlit as st
import pandas as pd
import plotly.express as px

st.title('Scraped Data Dashboard')

# Load data
df = pd.read_csv('products.csv')

# Sidebar filters
# Exclude missing categories from the options: a NaN entry never equals any
# row in the comparison below, so selecting it would give an empty table
# and "nan" metrics.
category = st.sidebar.selectbox('Category', df['category'].dropna().unique())
filtered = df[df['category'] == category]

# Metrics for the selected category, shown side by side in three columns
col1, col2, col3 = st.columns(3)
col1.metric('Total Products', len(filtered))
col2.metric('Avg Price', f"{filtered['price'].mean():,.0f}")
col3.metric('Avg Rating', f"{filtered['rating'].mean():.1f}")

# Chart: price of the first 20 rows in the selected category
fig = px.bar(filtered.head(20), x='name', y='price')
st.plotly_chart(fig)

# Run: streamlit run app.py

7. Export Charts

# High-quality export
# NOTE: `data` is a placeholder that this snippet does not define; supply
# your own series to plot.
fig = plt.figure(figsize=(12, 8), dpi=300)
fig.gca().plot(data)
fig.savefig('chart.png', dpi=300, bbox_inches='tight')

# PDF export of the same figure
fig.savefig('chart.pdf', format='pdf')

# Multiple charts in one PDF
from matplotlib.backends.backend_pdf import PdfPages

# NOTE: `charts` and `create_chart` are placeholders that this snippet does
# not define; create_chart is assumed to return a Matplotlib Figure.
with PdfPages('report.pdf') as pdf:
    for chart in charts:
        fig = create_chart(chart)
        # Append the figure as a new page of report.pdf.
        pdf.savefig(fig)
        # Close this specific figure: a bare plt.close() only closes the
        # *current* figure, which is not necessarily `fig`, so figures
        # could otherwise accumulate in memory across iterations.
        plt.close(fig)

VinaProxy + Data Visualization

Scrape → Analyze → Visualize
Build market intelligence dashboards
Giá chỉ $0.5/GB

Dùng Thử Ngay →

Visualize Scraped Data: Tạo Charts Từ Dữ Liệu Thu Thập

Tools Phổ Biến

1. Matplotlib Basics

2. Price Distribution

3. Time Series

4. Seaborn Statistical Plots

5. Plotly Interactive

6. Dashboard với Streamlit

7. Export Charts

VinaProxy + Data Visualization

admin