Xây Dựng Dự Án Scraping Hoàn Chỉnh: Từ A-Z

Trở lại Tin tức
Tin tức

Xây Dựng Dự Án Scraping Hoàn Chỉnh: Từ A-Z

Bài viết hướng dẫn xây dựng scraping project từ đầu đến production.

Project Structure

my-scraper/
├── src/
│   ├── __init__.py
│   ├── scraper.py
│   ├── parser.py
│   ├── storage.py
│   └── utils.py
├── tests/
│   ├── test_scraper.py
│   └── fixtures/
├── config/
│   └── settings.py
├── data/
│   └── output/
├── logs/
├── requirements.txt
├── Dockerfile
├── docker-compose.yml
└── README.md

1. Setup Environment

# Create project
mkdir my-scraper && cd my-scraper

# Virtual environment (isolates dependencies from the system Python)
python -m venv venv
source venv/bin/activate  # Linux/Mac
# venv\Scripts\activate  # Windows

# Install dependencies
pip install requests beautifulsoup4 lxml pandas python-dotenv

# Create requirements.txt (freezes exact versions for reproducible installs)
pip freeze > requirements.txt

2. Configuration (config/settings.py)

import os
from dotenv import load_dotenv

# Load overrides from a local .env file into the process environment.
load_dotenv()

# Proxy settings
# NOTE(review): the fallback embeds placeholder credentials in source; real
# credentials must come from the PROXY_URL environment variable, never be
# committed — confirm .env is in .gitignore.
PROXY_URL = os.getenv('PROXY_URL', 'http://user:pass@proxy.vinaproxy.com:8080')

# Scraping settings
REQUEST_TIMEOUT = 30  # per-request timeout, seconds
DELAY_MIN = 1  # minimum polite delay between pages, seconds
DELAY_MAX = 3  # maximum polite delay between pages, seconds
MAX_RETRIES = 3  # attempts per URL before giving up

# Target
BASE_URL = 'https://example.com'

# Storage
OUTPUT_DIR = 'data/output'
# SQLAlchemy-style URL; the sqlite3 storage code strips the 'sqlite:///' prefix.
DATABASE_URL = os.getenv('DATABASE_URL', 'sqlite:///data/scraper.db')

3. Scraper (src/scraper.py)

import requests
import time
import random
import logging
from config.settings import *

class Scraper:
    """HTTP client that fetches pages through the configured proxy with retries."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0...',
            'Accept': 'text/html,application/xhtml+xml'
        })
        self.logger = logging.getLogger(__name__)

    def fetch(self, url, retries=MAX_RETRIES):
        """Fetch *url* with up to *retries* attempts and exponential backoff.

        Returns the Response on success, or None when every attempt failed.
        """
        for attempt in range(retries):
            try:
                response = self.session.get(
                    url,
                    proxies={'http': PROXY_URL, 'https': PROXY_URL},
                    timeout=REQUEST_TIMEOUT
                )
                response.raise_for_status()
                return response
            # Catch only request-level failures (connection, timeout, HTTP
            # error); the previous bare Exception also hid programming errors.
            except requests.RequestException as e:
                # Lazy %-style args avoid formatting when the level is filtered.
                self.logger.warning("Attempt %d failed: %s", attempt + 1, e)
                if attempt < retries - 1:
                    time.sleep(2 ** attempt)  # backoff: 1s, 2s, 4s, ...
        return None

    def scrape_page(self, url):
        """Return the page HTML as a string, or None when the fetch failed."""
        response = self.fetch(url)
        if response:
            # Random delay so requests don't hit the target at a fixed rate.
            time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))
            return response.text
        return None

4. Parser (src/parser.py)

from bs4 import BeautifulSoup

class Parser:
    """Extracts product fields from a product-page HTML document."""

    def parse_product(self, html):
        """Parse *html* and return a dict with name, price, description, image.

        Missing fields come back as None rather than raising.
        """
        soup = BeautifulSoup(html, 'lxml')

        return {
            'name': self._get_text(soup, '.product-name'),
            'price': self._get_price(soup, '.price'),
            'description': self._get_text(soup, '.description'),
            'image': self._get_attr(soup, '.product-image img', 'src')
        }

    def _get_text(self, soup, selector):
        # Stripped text of the first matching element, or None when absent.
        elem = soup.select_one(selector)
        return elem.text.strip() if elem else None

    def _get_price(self, soup, selector):
        # Parse a formatted price such as "1,234,000 d" into the int 1234000.
        # Fix: the re import previously sat inside the conditional branch.
        import re  # local import keeps this snippet self-contained

        text = self._get_text(soup, selector)
        if not text:
            return None
        match = re.search(r'[\d,]+', text)
        return int(match.group().replace(',', '')) if match else None

    def _get_attr(self, soup, selector, attr):
        # Attribute value of the first matching element, or None when absent.
        elem = soup.select_one(selector)
        return elem.get(attr) if elem else None

5. Storage (src/storage.py)

import pandas as pd
import sqlite3
from config.settings import OUTPUT_DIR, DATABASE_URL

class Storage:
    """Persists scraped records to CSV files and a SQLite database."""

    def save_csv(self, data, filename):
        """Write *data* (list of dicts) to OUTPUT_DIR/<filename>; return the path."""
        df = pd.DataFrame(data)
        # Bug fix: the path previously hard-coded "(unknown)" and ignored
        # the *filename* argument entirely.
        path = f"{OUTPUT_DIR}/{filename}"
        df.to_csv(path, index=False)
        return path

    def save_db(self, data, table_name):
        """Append *data* (list of dicts) to *table_name* in the SQLite database."""
        # DATABASE_URL is SQLAlchemy-style; sqlite3 needs the bare file path.
        db_path = DATABASE_URL.replace('sqlite:///', '')
        conn = sqlite3.connect(db_path)
        try:
            df = pd.DataFrame(data)
            df.to_sql(table_name, conn, if_exists='append', index=False)
        finally:
            # Release the connection even when to_sql raises.
            conn.close()

6. Main Script

from src.scraper import Scraper
from src.parser import Parser
from src.storage import Storage
import logging

logging.basicConfig(level=logging.INFO)

def main():
    """Scrape each product URL, parse it, and persist the results to CSV."""
    scraper = Scraper()
    parser = Parser()
    storage = Storage()

    # Fix: the original list contained a literal Ellipsis placeholder (`...`)
    # which would have been passed to scrape_page. Replace with real URLs.
    urls = ['https://example.com/product/1']
    products = []

    for url in urls:
        html = scraper.scrape_page(url)
        if html:
            product = parser.parse_product(html)
            product['url'] = url
            products.append(product)
            # Lazy %-args avoid formatting cost when the level is filtered.
            logging.info("Scraped: %s", product['name'])

    storage.save_csv(products, 'products.csv')
    logging.info("Saved %d products", len(products))

if __name__ == '__main__':
    main()

7. Docker

# Dockerfile
FROM python:3.11-slim
WORKDIR /app
# Copy only the dependency manifest first so the pip layer is cached
# across source-code changes.
COPY requirements.txt .
# --no-cache-dir keeps the pip download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["python", "main.py"]

VinaProxy + Your Project

  • Professional proxy infrastructure
  • Just add PROXY_URL to .env
  • Giá chỉ $0.5/GB

Dùng Thử Ngay →