Sitemap Scraping: Tìm Tất Cả URLs Của Website
Sitemap là "bản đồ kho báu" của website. Bài viết này hướng dẫn cách extract URLs từ sitemap để scrape toàn bộ site.
Sitemap Là Gì?
- XML file liệt kê tất cả URLs của website
- Thường ở: /sitemap.xml, /sitemap_index.xml
- Search engines dùng để crawl
- Perfect cho scrapers!
Tìm Sitemap
# Common sitemap locations to probe when robots.txt gives no hint.
sitemap_urls = [
    'https://example.com/sitemap.xml',
    'https://example.com/sitemap_index.xml',
    'https://example.com/sitemap/sitemap.xml',
    'https://example.com/sitemaps/sitemap.xml',
]


def extract_sitemaps_from_robots(robots_text):
    """Return every sitemap URL declared in a robots.txt body.

    Args:
        robots_text: raw text content of a robots.txt file.

    Returns:
        list[str]: URLs from all ``Sitemap:`` lines, in file order.
    """
    found = []
    for line in robots_text.split('\n'):
        # Field names in robots.txt are case-insensitive, so "sitemap:"
        # and "SITEMAP:" are just as valid as "Sitemap:".
        if line.lower().lstrip().startswith('sitemap:'):
            # Split on the first colon only: the URL itself contains colons.
            found.append(line.split(':', 1)[1].strip())
    return found


if __name__ == "__main__":
    # robots.txt is the authoritative place a site advertises its sitemaps.
    # timeout prevents the script from hanging forever on a dead host.
    response = requests.get('https://example.com/robots.txt', timeout=30)
    for sitemap_url in extract_sitemaps_from_robots(response.text):
        print(f"Found: {sitemap_url}")
Parse Simple Sitemap
import requests
from bs4 import BeautifulSoup
def parse_sitemap(sitemap_url, timeout=30):
    """Fetch an XML sitemap and return every URL it lists.

    Args:
        sitemap_url: URL of the sitemap file.
        timeout: seconds to wait for the HTTP response; the original
            request had no timeout and could hang indefinitely.

    Returns:
        list[str]: text of every ``<loc>`` element, in document order.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (previously an error page was silently parsed as XML).
    """
    response = requests.get(sitemap_url, timeout=timeout)
    response.raise_for_status()
    # Parse the raw bytes so the XML declaration's encoding is honoured
    # instead of requests' guessed text encoding.
    soup = BeautifulSoup(response.content, 'xml')
    return [loc.text for loc in soup.find_all('loc')]
# Example: collect every URL the site's main sitemap advertises.
urls = parse_sitemap('https://example.com/sitemap.xml')
print(f"Found {len(urls)} URLs")
Parse Sitemap Index
def parse_sitemap_index(index_url, timeout=30):
    """Expand a sitemap index into the URLs of all its child sitemaps.

    A sitemap index is an XML file whose ``<sitemap><loc>`` entries point
    at further sitemap files; each child is fetched and parsed in turn.

    Args:
        index_url: URL of the sitemap index XML.
        timeout: seconds to wait for the index download (previously the
            request could block forever).

    Returns:
        list[str]: page URLs aggregated from every child sitemap.
    """
    response = requests.get(index_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'xml')
    all_urls = []
    for sitemap in soup.find_all('sitemap'):
        loc = sitemap.find('loc')
        if loc is None:
            # Skip malformed <sitemap> entries instead of crashing on .text.
            continue
        child_url = loc.text
        print(f"Parsing: {child_url}")
        # Parse each child sitemap and merge its URLs into the result.
        all_urls.extend(parse_sitemap(child_url))
    return all_urls
# Walk the index and aggregate page URLs from every child sitemap.
urls = parse_sitemap_index('https://example.com/sitemap_index.xml')
Extract Metadata
def parse_sitemap_full(sitemap_url, timeout=30):
    """Fetch a sitemap and return per-URL metadata, not just locations.

    Args:
        sitemap_url: URL of the XML sitemap.
        timeout: seconds to wait for the HTTP response.

    Returns:
        list[dict]: one dict per ``<url>`` element with keys ``loc``,
        ``lastmod``, ``changefreq`` and ``priority``; any field the
        sitemap omits is ``None`` (including ``loc`` on malformed
        entries, which previously raised AttributeError).
    """
    response = requests.get(sitemap_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'xml')

    def _text(parent, tag):
        # Look each child element up once instead of twice per field.
        child = parent.find(tag)
        return child.text if child is not None else None

    pages = []
    for url_elem in soup.find_all('url'):
        pages.append({
            'loc': _text(url_elem, 'loc'),
            'lastmod': _text(url_elem, 'lastmod'),
            'changefreq': _text(url_elem, 'changefreq'),
            'priority': _text(url_elem, 'priority'),
        })
    return pages
pages = parse_sitemap_full('https://example.com/sitemap.xml')
# Filter by lastmod. ISO-8601 date strings of the same format compare
# correctly as plain strings, so lexicographic > works here.
recent = [p for p in pages if p['lastmod'] and p['lastmod'] > '2026-01-01']
Filter URLs By Pattern
import re
def filter_urls(urls, patterns):
    """Keep only the URLs that match at least one regex pattern.

    Args:
        urls: iterable of URL strings.
        patterns: iterable of regex pattern strings; each is searched
            (not fully matched), so ``/product/`` matches anywhere in
            the URL.

    Returns:
        list[str]: matching URLs in their original order; empty when
        either input is empty.
    """
    # Compile every pattern once up front instead of re-compiling
    # (and re-looking-up the regex cache) for each URL in the loop.
    compiled = [re.compile(pattern) for pattern in patterns]
    return [url for url in urls
            if any(rx.search(url) for rx in compiled)]
# Keep only product pages (English and Vietnamese URL slugs).
product_urls = filter_urls(urls, [
    r'/product/',
    r'/san-pham/',
    r'/p/',
])
# Keep only blog/news posts (English and Vietnamese URL slugs).
blog_urls = filter_urls(urls, [
    r'/blog/',
    r'/bai-viet/',
    r'/tin-tuc/',
])
Async Sitemap Fetching
import aiohttp
import asyncio
async def fetch_sitemap(session, url):
    """Download one sitemap body via the shared aiohttp session.

    The session is passed in so all concurrent fetches reuse its
    connection pool.
    """
    async with session.get(url) as response:
        return await response.text()
async def parse_all_sitemaps(sitemap_urls):
    """Fetch every sitemap concurrently and collect all <loc> URLs.

    Downloads run in parallel over one shared session; the XML bodies
    are parsed after all fetches complete.
    """
    async with aiohttp.ClientSession() as session:
        bodies = await asyncio.gather(
            *(fetch_sitemap(session, sitemap_url) for sitemap_url in sitemap_urls)
        )
    collected = []
    for body in bodies:
        document = BeautifulSoup(body, 'xml')
        collected.extend(loc.text for loc in document.find_all('loc'))
    return collected
urls = asyncio.run(parse_all_sitemaps(sitemap_urls))
Audit Sitemap: Kiểm Tra URLs Có Hoạt Động
# Audit a sitemap: verify the URLs it advertises actually resolve.
def audit_sitemap(sitemap_url, sample_size=100, timeout=10):
    """Spot-check that URLs listed in a sitemap are reachable.

    Args:
        sitemap_url: URL of the sitemap to audit.
        sample_size: how many URLs to probe; sitemaps can list tens of
            thousands, and checking them all would be slow and impolite.
        timeout: per-request timeout in seconds.
    """
    sitemap_urls = set(parse_sitemap(sitemap_url))
    # NOTE: set() deduplicates but makes the sampled order arbitrary.
    sample = list(sitemap_urls)[:sample_size]
    for url in sample:
        # HEAD avoids downloading bodies. requests.head does NOT follow
        # redirects by default, so without allow_redirects=True a live
        # URL behind a 301 would be reported as broken.
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        status = "✅" if response.status_code == 200 else "❌"
        print(f"{status} {response.status_code} - {url}")
VinaProxy + Sitemap Scraping
- Crawl entire sites efficiently
- Bypass rate limits
- Giá chỉ $0.5/GB
