Sitemap Scraping: Tìm Tất Cả URLs Của Website

Trở lại Tin tức
Tin tức

Sitemap Scraping: Tìm Tất Cả URLs Của Website

Sitemap là treasure map của website. Bài viết hướng dẫn extract URLs từ sitemaps để scrape toàn bộ site.

Sitemap Là Gì?

  • XML file liệt kê tất cả URLs của website
  • Thường ở: /sitemap.xml, /sitemap_index.xml
  • Search engines dùng để crawl
  • Perfect cho scrapers!

Tìm Sitemap

# Common locations to probe when a site does not advertise its sitemap
sitemap_urls = [
    'https://example.com/sitemap.xml',
    'https://example.com/sitemap_index.xml',
    'https://example.com/sitemap/sitemap.xml',
    'https://example.com/sitemaps/sitemap.xml',
]

# Check robots.txt — the most reliable way to discover the real sitemap.
# NOTE: the "Sitemap:" directive is case-insensitive per the sitemaps
# protocol (sites emit "sitemap:" too), so match a lowercased prefix
# instead of the exact substring 'Sitemap:'.
response = requests.get('https://example.com/robots.txt', timeout=10)
for line in response.text.splitlines():
    if line.strip().lower().startswith('sitemap:'):
        # Split only on the first ':' so the URL's "https://" stays intact
        sitemap_url = line.split(':', 1)[1].strip()
        print(f"Found: {sitemap_url}")

Parse Simple Sitemap

import requests
from bs4 import BeautifulSoup

def parse_sitemap(sitemap_url, timeout=10):
    """Fetch a sitemap XML file and return every <loc> URL it lists.

    Args:
        sitemap_url: Absolute URL of the sitemap (e.g. ".../sitemap.xml").
        timeout: Seconds to wait for the HTTP response (default 10).

    Returns:
        list[str]: All URLs found in <loc> elements, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(sitemap_url, timeout=timeout)
    # Fail loudly on 404/403 instead of silently parsing an error page
    response.raise_for_status()
    # NOTE: the 'xml' parser requires lxml to be installed
    soup = BeautifulSoup(response.text, 'xml')

    return [loc.text for loc in soup.find_all('loc')]

# Example: collect every URL listed in the site's main sitemap
urls = parse_sitemap('https://example.com/sitemap.xml')
print(f"Found {len(urls)} URLs")

Parse Sitemap Index

def parse_sitemap_index(index_url, timeout=10):
    """Expand a sitemap index into the full list of page URLs.

    A sitemap index is an XML file whose <sitemap><loc> entries point to
    child sitemaps; each child sitemap is fetched and parsed in turn.

    Args:
        index_url: URL of the sitemap index (e.g. ".../sitemap_index.xml").
        timeout: Seconds to wait for the index HTTP response (default 10).

    Returns:
        list[str]: URLs aggregated from every child sitemap.
    """
    response = requests.get(index_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'xml')

    all_urls = []

    # Walk the child sitemaps listed in the index
    for sitemap in soup.find_all('sitemap'):
        loc = sitemap.find('loc')
        if loc is None:
            # Malformed <sitemap> entry without a <loc>; skip, don't crash
            continue
        child_url = loc.text
        print(f"Parsing: {child_url}")

        # Delegate each child to the simple-sitemap parser
        all_urls.extend(parse_sitemap(child_url))

    return all_urls

# Parse entire site: every page URL reachable through the index's children
urls = parse_sitemap_index('https://example.com/sitemap_index.xml')

Extract Metadata

def parse_sitemap_full(sitemap_url, timeout=10):
    """Parse a sitemap keeping per-URL metadata, not just the location.

    Args:
        sitemap_url: URL of the sitemap XML file.
        timeout: Seconds to wait for the HTTP response (default 10).

    Returns:
        list[dict]: One dict per <url> entry with keys 'loc', 'lastmod',
        'changefreq', 'priority'; optional fields are None when absent.
    """
    def _child_text(elem, tag):
        # find() once per tag (the original called find() twice per field)
        child = elem.find(tag)
        return child.text if child else None

    response = requests.get(sitemap_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'xml')

    pages = []
    for url_elem in soup.find_all('url'):
        loc = _child_text(url_elem, 'loc')
        if loc is None:
            # A <url> without <loc> is invalid per the protocol; skip it
            # instead of raising AttributeError on .text
            continue
        pages.append({
            'loc': loc,
            'lastmod': _child_text(url_elem, 'lastmod'),
            'changefreq': _child_text(url_elem, 'changefreq'),
            'priority': _child_text(url_elem, 'priority'),
        })

    return pages

pages = parse_sitemap_full('https://example.com/sitemap.xml')
# Filter by lastmod
# ISO-8601 date strings (YYYY-MM-DD) compare correctly lexicographically,
# so plain string ">" works here as long as formats match
recent = [p for p in pages if p['lastmod'] and p['lastmod'] > '2026-01-01']

Filter URLs By Pattern

import re

def filter_urls(urls, patterns):
    """Keep only the URLs that match at least one regex pattern.

    Args:
        urls: Iterable of URL strings.
        patterns: Iterable of regex pattern strings (or compiled patterns).

    Returns:
        list[str]: URLs matching any pattern, original order preserved.
    """
    # Compile once, outside the loop, instead of re-resolving each raw
    # pattern for every URL
    compiled = [re.compile(p) for p in patterns]
    return [url for url in urls if any(c.search(url) for c in compiled)]

# Get only product pages
# (`urls` comes from one of the parse_sitemap* helpers above)
product_urls = filter_urls(urls, [
    r'/product/',
    r'/san-pham/',
    r'/p/',
])

# Get only blog posts
blog_urls = filter_urls(urls, [
    r'/blog/',
    r'/bai-viet/',
    r'/tin-tuc/',
])

Async Sitemap Fetching

import aiohttp
import asyncio

async def fetch_sitemap(session, url):
    """Download one sitemap and return its body as text.

    Args:
        session: An open aiohttp.ClientSession.
        url: Sitemap URL to fetch.

    Returns:
        str: Raw response body.

    Raises:
        aiohttp.ClientResponseError: On a 4xx/5xx response.
    """
    async with session.get(url) as response:
        # Surface HTTP errors instead of returning an error page's HTML
        response.raise_for_status()
        return await response.text()

async def parse_all_sitemaps(sitemap_urls):
    """Fetch every sitemap concurrently and collect all <loc> URLs."""
    # Download all sitemaps in parallel over one shared connection pool
    async with aiohttp.ClientSession() as session:
        bodies = await asyncio.gather(
            *(fetch_sitemap(session, sitemap_url) for sitemap_url in sitemap_urls)
        )

    # Flatten: every <loc> entry from every downloaded sitemap, in order
    return [
        loc.text
        for body in bodies
        for loc in BeautifulSoup(body, 'xml').find_all('loc')
    ]

urls = asyncio.run(parse_all_sitemaps(sitemap_urls))

Sitemap Audit: Kiểm Tra URLs Có Accessible Không

# Compare sitemap với actual content
def audit_sitemap(sitemap_url, sample_size=100, timeout=10):
    """Spot-check that URLs listed in a sitemap actually respond with 200.

    Args:
        sitemap_url: URL of the sitemap to audit.
        sample_size: How many listed URLs to probe (default 100).
        timeout: Seconds to wait per HEAD request (default 10).
    """
    sitemap_urls = set(parse_sitemap(sitemap_url))

    # Sample check: verify URLs are accessible
    # (HEAD avoids downloading the response bodies)
    sample = list(sitemap_urls)[:sample_size]

    for url in sample:
        # requests.head does NOT follow redirects by default; follow them
        # so a 301 -> 200 chain is not reported as a broken URL
        response = requests.head(url, timeout=timeout, allow_redirects=True)
        status = "✅" if response.status_code == 200 else "❌"
        print(f"{status} {response.status_code} - {url}")

VinaProxy + Sitemap Scraping

  • Crawl entire sites efficiently
  • Bypass rate limits
  • Giá chỉ $0.5/GB

Dùng Thử Ngay →