Image Scraping: Tải Hình Ảnh Hàng Loạt Từ Web
Hình ảnh là loại dữ liệu quan trọng thường cần scrape. Bài viết này hướng dẫn cách download images hàng loạt từ websites một cách hiệu quả.
Use Cases
- Product images: E-commerce catalogs
- Training data: ML/AI datasets
- Archiving: Website backup
- Research: Visual analysis
Basic Image Download
import requests
from pathlib import Path
def download_image(url, folder='images', timeout=30):
    """Download a single image into *folder* and return its path.

    Args:
        url: Direct URL of the image.
        folder: Destination directory, created if missing.
        timeout: Seconds to wait for the server — requests blocks
            forever when no timeout is given.

    Returns:
        Path of the saved file, or None when the download fails.
    """
    Path(folder).mkdir(parents=True, exist_ok=True)
    # Drop the query string; fall back to a default name for URLs that
    # end with '/' (the original produced an empty filename there).
    filename = url.split('/')[-1].split('?')[0] or 'image'
    filepath = Path(folder) / filename
    try:
        response = requests.get(url, stream=True, timeout=timeout)
    except requests.RequestException:
        # Network failure — keep the "None on failure" contract instead
        # of crashing a bulk run.
        return None
    if response.status_code == 200:
        with open(filepath, 'wb') as f:
            # Stream in 1 KiB chunks so large images never sit fully in memory.
            for chunk in response.iter_content(1024):
                f.write(chunk)
        return filepath
    return None

if __name__ == '__main__':
    # Download single image
    download_image('https://example.com/image.jpg')
Extract All Images From Page
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def get_all_images(page_url, timeout=30):
    """Collect metadata for every <img> tag on *page_url*.

    Falls back to the ``data-src`` attribute (common lazy-load markup)
    and converts relative paths to absolute URLs.

    Args:
        page_url: Page to scan.
        timeout: Seconds to wait for the server — requests blocks
            forever without one.

    Returns:
        List of dicts with 'url', 'alt', 'width', 'height' keys
        (width/height are the raw attribute strings, or None).
    """
    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    images = []
    for img in soup.select('img'):
        # Lazy-loaded images often keep the real URL in data-src.
        src = img.get('src') or img.get('data-src')
        if src:
            images.append({
                'url': urljoin(page_url, src),  # relative -> absolute
                'alt': img.get('alt', ''),
                'width': img.get('width'),
                'height': img.get('height'),
            })
    return images

if __name__ == '__main__':
    # Get and download all
    images = get_all_images('https://shop.com/products')
    for img in images:
        download_image(img['url'])
Handle Lazy Loading
from playwright.sync_api import sync_playwright
def get_lazy_images(url, scrolls=10, scroll_px=1000, pause_ms=500):
    """Render *url* in headless Chromium and return all image URLs.

    Scrolling triggers JavaScript lazy-loaders so images that start
    with an empty ``src`` get populated before we read them.

    Args:
        url: Page to render.
        scrolls: Number of scroll steps (generalized from the fixed 10).
        scroll_px: Pixels scrolled per step.
        pause_ms: Wait after each step so lazy content can load.

    Returns:
        List of image source URLs (src, falling back to data-src).
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(url)
            # Scroll to load all images
            for _ in range(scrolls):
                page.evaluate(f'window.scrollBy(0, {scroll_px})')
                page.wait_for_timeout(pause_ms)
            # Get all image sources
            images = page.evaluate('''
                () => Array.from(document.querySelectorAll('img'))
                    .map(img => img.src || img.dataset.src)
                    .filter(src => src)
            ''')
        finally:
            # Original leaked the browser process if goto()/evaluate()
            # raised; always close it.
            browser.close()
    return images
Async Bulk Download
import aiohttp
import asyncio
from pathlib import Path
async def download_image_async(session, url, folder):
    """Fetch one image over *session* and save it into *folder*.

    Best-effort: returns the saved Path, or None when the download
    fails for a network or filesystem reason.
    """
    # Drop the query string; fall back when the URL ends with '/'.
    filename = url.split('/')[-1].split('?')[0] or 'image'
    filepath = Path(folder) / filename
    try:
        async with session.get(url) as response:
            if response.status == 200:
                content = await response.read()
                # NOTE(review): synchronous file write inside a coroutine —
                # fine for small images, blocks the event loop for huge ones.
                with open(filepath, 'wb') as f:
                    f.write(content)
                return filepath
    except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
        # Narrowed from a bare `except:`, which also swallowed
        # asyncio.CancelledError and KeyboardInterrupt.
        pass
    return None
async def bulk_download(urls, folder='images', max_concurrent=10):
    """Concurrently download every URL in *urls* into *folder*.

    Concurrency is capped via the TCP connector's connection limit.
    Returns the list of paths that were actually saved; failed
    downloads (None results) are dropped.
    """
    Path(folder).mkdir(exist_ok=True)
    connector = aiohttp.TCPConnector(limit=max_concurrent)
    async with aiohttp.ClientSession(connector=connector) as session:
        downloads = (download_image_async(session, u, folder) for u in urls)
        results = await asyncio.gather(*downloads)
    return [path for path in results if path]

# Download 1000 images fast
urls = [...]  # List of image URLs
asyncio.run(bulk_download(urls))
Image Validation
from PIL import Image
import io
def validate_image(content):
    """Check that *content* is decodable image data via Pillow.

    Args:
        content: Raw image bytes.

    Returns:
        ``{'valid': True, 'format': ..., 'size': ..., 'mode': ...}``
        when the bytes parse as an image, otherwise ``{'valid': False}``.
    """
    try:
        img = Image.open(io.BytesIO(content))
        return {
            'valid': True,
            'format': img.format,  # e.g. 'JPEG', 'PNG'
            'size': img.size,      # (width, height) tuple
            'mode': img.mode,      # e.g. 'RGB'
        }
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
        # propagate; Pillow raises UnidentifiedImageError/OSError here.
        return {'valid': False}
# Filter valid images
# Example: fetch one image and save it only when Pillow can parse it.
# NOTE(review): `image_url` and `save_image` are assumed to be defined
# elsewhere — they do not appear in this article's snippets.
response = requests.get(image_url)
info = validate_image(response.content)
if info['valid']:
    save_image(response.content)
Avoid Duplicates
import hashlib
def get_image_hash(content):
    """Return the hex MD5 digest of *content*.

    Used purely for duplicate detection, not for security.
    """
    digest = hashlib.md5(content)
    return digest.hexdigest()
# Hashes of every image saved so far. Module-level on purpose: repeated
# download_unique() calls within one scraping session share this state.
downloaded_hashes = set()

def download_unique(url, timeout=30):
    """Download *url* and save it only when its content is new.

    Args:
        url: Image URL to fetch.
        timeout: Seconds to wait — requests blocks forever without one.

    Returns:
        True when the image was saved, False when its content hash was
        already seen (duplicate).
    """
    response = requests.get(url, timeout=timeout)
    img_hash = get_image_hash(response.content)
    if img_hash not in downloaded_hashes:
        downloaded_hashes.add(img_hash)
        save_image(response.content)
        return True
    return False  # Duplicate
VinaProxy + Image Scraping
- High bandwidth cho image downloads
- CDN bypass với residential IPs
- Giá chỉ $0.5/GB
