Image Scraping: Bulk-Downloading Images From the Web


Images are one of the most important data types to scrape. This article walks through downloading images in bulk from websites.

Use Cases

  • Product images: E-commerce catalogs
  • Training data: ML/AI datasets
  • Archiving: Website backup
  • Research: Visual analysis

Basic Image Download

import requests
from pathlib import Path

def download_image(url, folder='images'):
    Path(folder).mkdir(exist_ok=True)

    # Take the last path segment and drop any query string
    filename = url.split('/')[-1].split('?')[0] or 'image'
    filepath = Path(folder) / filename

    # stream=True avoids holding large images in memory at once
    response = requests.get(url, stream=True, timeout=30)
    if response.status_code == 200:
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
        return filepath
    return None

# Download single image
download_image('https://example.com/image.jpg')
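
Not every URL ends in a usable filename (e.g. /image?id=123 has no extension). A minimal sketch, assuming the server sends an accurate Content-Type header, that guesses the extension with the standard mimetypes module; filename_from_response is a hypothetical helper, not part of the code above:

import mimetypes

def filename_from_response(url, response, fallback='image'):
    # Prefer the name embedded in the URL path
    name = url.split('/')[-1].split('?')[0] or fallback
    if '.' not in name:
        # No extension in the URL: derive one from Content-Type
        # (e.g. image/png -> .png); fall back to .bin if unknown
        content_type = response.headers.get('Content-Type', '').split(';')[0]
        name += mimetypes.guess_extension(content_type) or '.bin'
    return name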

Extract All Images From Page

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_all_images(page_url):
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'lxml')
    
    images = []
    for img in soup.select('img'):
        src = img.get('src') or img.get('data-src')
        if src:
            # Convert relative to absolute URL
            full_url = urljoin(page_url, src)
            images.append({
                'url': full_url,
                'alt': img.get('alt', ''),
                'width': img.get('width'),
                'height': img.get('height')
            })
    
    return images

# Get and download all
images = get_all_images('https://shop.com/products')
for img in images:
    download_image(img['url'])
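
Responsive pages often put the real candidates in a srcset attribute rather than src. A minimal sketch of picking the widest candidate, assuming the standard "url 640w" descriptor format:

def best_from_srcset(srcset):
    # Each comma-separated candidate is "url" plus an optional width descriptor
    best_url, best_width = None, -1
    for candidate in srcset.split(','):
        parts = candidate.strip().split()
        if not parts:
            continue
        url = parts[0]
        # '640w' -> 640; density descriptors like '2x' fall back to 0
        width = int(parts[1][:-1]) if len(parts) > 1 and parts[1].endswith('w') else 0
        if width > best_width:
            best_url, best_width = url, width
    return best_url

# In the loop above, prefer srcset when present:
# src = best_from_srcset(img['srcset']) if img.get('srcset') else img.get('src')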

Handle Lazy Loading

from playwright.sync_api import sync_playwright

def get_lazy_images(url):
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)
        
        # Scroll to load all images
        for _ in range(10):
            page.evaluate('window.scrollBy(0, 1000)')
            page.wait_for_timeout(500)
        
        # Get all image sources
        images = page.evaluate('''
            () => Array.from(document.querySelectorAll('img'))
                .map(img => img.src || img.dataset.src)
                .filter(src => src)
        ''')
        
        browser.close()
        return images
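
The function only collects URLs; feed them to download_image from the first section (the gallery URL here is a placeholder):

# Collect lazily loaded image URLs, then reuse download_image
for url in get_lazy_images('https://example.com/gallery'):
    download_image(url)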

Async Bulk Download

import aiohttp
import asyncio
from pathlib import Path

async def download_image_async(session, url, folder):
    filename = url.split('/')[-1].split('?')[0] or 'image'
    filepath = Path(folder) / filename

    try:
        async with session.get(url) as response:
            if response.status == 200:
                content = await response.read()
                with open(filepath, 'wb') as f:
                    f.write(content)
                return filepath
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # Skip failed URLs rather than aborting the whole batch
        pass
    return None

async def bulk_download(urls, folder='images', max_concurrent=10):
    Path(folder).mkdir(exist_ok=True)
    
    connector = aiohttp.TCPConnector(limit=max_concurrent)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [download_image_async(session, url, folder) for url in urls]
        results = await asyncio.gather(*tasks)
    
    return [r for r in results if r]

# Download 1000 images fast
urls = [...]  # List of image URLs
asyncio.run(bulk_download(urls))
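
One stalled connection can hang an otherwise fast batch. A sketch of the same function with a per-request budget, using aiohttp's ClientTimeout; the 30-second total is an assumption, tune it for your network:

async def bulk_download_timed(urls, folder='images', max_concurrent=10):
    Path(folder).mkdir(exist_ok=True)
    # total=30 caps connect + read time for each request
    timeout = aiohttp.ClientTimeout(total=30)
    connector = aiohttp.TCPConnector(limit=max_concurrent)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        tasks = [download_image_async(session, url, folder) for url in urls]
        results = await asyncio.gather(*tasks)
    return [r for r in results if r]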

Image Validation

from PIL import Image, UnidentifiedImageError
import io

def validate_image(content):
    try:
        img = Image.open(io.BytesIO(content))
        return {
            'valid': True,
            'format': img.format,  # e.g. 'JPEG', 'PNG'
            'size': img.size,      # (width, height) in pixels
            'mode': img.mode       # e.g. 'RGB', 'RGBA'
        }
    except (UnidentifiedImageError, OSError):
        return {'valid': False}

# Validate before saving
response = requests.get(image_url, timeout=30)
info = validate_image(response.content)
if info['valid']:
    with open('image.' + info['format'].lower(), 'wb') as f:
        f.write(response.content)
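
The reported size is also useful for dropping tracking pixels and icons. A minimal sketch, assuming anything under 100x100 pixels is noise:

def is_usable(content, min_width=100, min_height=100):
    # Reuse validate_image to reject corrupt files and tiny placeholders
    info = validate_image(content)
    if not info['valid']:
        return False
    width, height = info['size']
    return width >= min_width and height >= min_height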

Avoid Duplicates

import hashlib
import requests
from pathlib import Path

def get_image_hash(content):
    # MD5 of the raw bytes identifies exact duplicates
    return hashlib.md5(content).hexdigest()

downloaded_hashes = set()

def download_unique(url, folder='images'):
    Path(folder).mkdir(exist_ok=True)
    response = requests.get(url, timeout=30)
    img_hash = get_image_hash(response.content)

    if img_hash in downloaded_hashes:
        return False  # Duplicate: same bytes seen under another URL

    downloaded_hashes.add(img_hash)
    filename = url.split('/')[-1].split('?')[0] or img_hash
    (Path(folder) / filename).write_bytes(response.content)
    return True
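
Typical usage over a scraped page, counting how many downloads were actually new:

images = get_all_images('https://shop.com/products')
new_count = sum(download_unique(img['url']) for img in images)
print(f'{new_count} unique images out of {len(images)}')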

VinaProxy + Image Scraping

  • High bandwidth for image downloads
  • CDN bypass with residential IPs (see the sketch below)
  • Pricing from just $0.5/GB
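
A minimal sketch of routing the downloader through a proxy with requests; the gateway address and credentials below are placeholders, not real VinaProxy values:

# Hypothetical gateway; substitute your actual proxy credentials
PROXY = 'http://username:password@gateway.vinaproxy.example:8000'
proxies = {'http': PROXY, 'https': PROXY}

response = requests.get('https://example.com/image.jpg',
                        proxies=proxies, stream=True, timeout=30)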

Try It Now →