News Aggregation Với Web Scraping: Tổng Hợp Tin Tức Tự Động
Tự xây dựng news aggregator theo chủ đề bạn quan tâm. Bài viết hướng dẫn scrape và tổng hợp tin tức.
Use Cases
- Industry monitoring: Theo dõi tin ngành
- Competitor news: PR và announcements đối thủ
- Market research: Trends và sentiment
- Personal feed: Custom news reader
Scrape News Sites
import html
import re
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def scrape_news_site(url, selectors):
    """Scrape article teasers from a news listing page.

    Parameters
    ----------
    url : str
        Listing page to fetch; also recorded as each article's source
        and used as the base for resolving relative links.
    selectors : dict
        CSS selectors with keys 'article', 'title', 'link' and an
        optional 'summary'.

    Returns
    -------
    list[dict]
        One dict per article: title, absolute url, summary, source and
        ISO scrape timestamp. Items missing a title or link are skipped.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    response = requests.get(url, headers={'User-Agent': '...'}, timeout=10)
    # Fail fast on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    articles = []
    for item in soup.select(selectors['article']):
        title_el = item.select_one(selectors['title'])
        link_el = item.select_one(selectors['link'])
        if title_el is None or link_el is None or not link_el.has_attr('href'):
            # Skip ads / malformed teasers that lack the expected markup
            # (select_one returns None, which would otherwise crash here).
            continue
        summary = ''
        if selectors.get('summary'):
            summary_el = item.select_one(selectors['summary'])
            if summary_el is not None:
                summary = summary_el.text.strip()
        articles.append({
            'title': title_el.text.strip(),
            # urljoin turns relative hrefs into absolute URLs so later
            # deduplication and the digest links work across sites.
            'url': urljoin(url, link_el['href']),
            'summary': summary,
            'source': url,
            'scraped_at': datetime.now().isoformat()
        })
    return articles
# Configuration for multiple news sources: each entry pairs a listing URL
# with the CSS selectors needed to pull article teasers out of that site's
# markup. Adding a new source is just another dict — no code changes.
sources = [
    {
        'url': 'https://vnexpress.net/cong-nghe',
        'selectors': {
            'article': '.item-news',
            'title': '.title-news a',
            'link': '.title-news a',
            'summary': '.description'
        }
    },
    {
        'url': 'https://techcrunch.com',
        'selectors': {
            'article': 'article',
            'title': 'h2 a',
            'link': 'h2 a',
            'summary': '.post-block__content'
        }
    }
]
all_news = []
for source in sources:
    # One unreachable or broken site should not abort the whole
    # aggregation run — log the failure and keep going.
    try:
        news = scrape_news_site(source['url'], source['selectors'])
    except requests.RequestException as exc:
        print(f"Failed to scrape {source['url']}: {exc}")
        continue
    all_news.extend(news)
    print(f"Scraped {len(news)} from {source['url']}")
RSS Alternative
import feedparser
def parse_rss(feed_url):
    """Fetch an RSS/Atom feed and map each entry to an article dict.

    Returns a list of dicts with 'title', 'url', 'summary' and
    'published' keys; summary/published fall back to '' when the feed
    omits them.
    """
    parsed = feedparser.parse(feed_url)
    return [
        {
            'title': entry.title,
            'url': entry.link,
            'summary': entry.get('summary', ''),
            'published': entry.get('published', '')
        }
        for entry in parsed.entries
    ]
# RSS feeds (when available) — cheaper and more stable than scraping HTML.
rss_feeds = [
    'https://vnexpress.net/rss/cong-nghe.rss',
    'https://techcrunch.com/feed/'
]
for feed_url in rss_feeds:
    all_news.extend(parse_rss(feed_url))
Deduplication
from urllib.parse import urlparse
def normalize_url(url):
    """Canonicalize *url* into a dedup fingerprint.

    Drops the scheme, query string and fragment, lowercases the host
    (hostnames are case-insensitive per RFC 3986) and strips a trailing
    slash, so http/https variants and tracking-parameter copies of the
    same story collapse to one key.
    """
    parsed = urlparse(url)
    return f"{parsed.netloc.lower()}{parsed.path}".rstrip('/')
def deduplicate(articles):
    """Return *articles* with URL-duplicates removed.

    The first occurrence of each normalized URL wins; relative order of
    the survivors is preserved.
    """
    unique = []
    seen = set()
    for article in articles:
        fingerprint = normalize_url(article['url'])
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique.append(article)
    return unique

all_news = deduplicate(all_news)
Keyword Filtering
keywords = ['AI', 'startup', 'funding', 'technology']
def matches_keywords(article, keywords):
    """Return True if any keyword appears as a whole word in the
    article's title or summary (case-insensitive).

    Whole-word matching (``\\b`` boundaries) avoids false positives such
    as the keyword 'AI' matching inside 'said' or 'email'; ``re.escape``
    keeps keywords containing regex metacharacters safe.
    """
    text = f"{article['title']} {article.get('summary', '')}"
    return any(
        re.search(rf"\b{re.escape(kw)}\b", text, re.IGNORECASE) is not None
        for kw in keywords
    )
filtered = [a for a in all_news if matches_keywords(a, keywords)]
Daily Digest Email
def create_digest(articles):
    """Render up to 20 articles as an HTML digest body for email.

    NOTE(review): the original snippet's HTML tags were stripped during
    extraction (leaving an unterminated string literal); the markup
    below reconstructs the evident intent — a heading followed by a
    title/summary section per article.

    Scraped titles and summaries are untrusted input, so they are
    HTML-escaped before being embedded in the markup.
    """
    parts = ["<h1>Daily News Digest</h1>"]
    for article in articles[:20]:
        title = html.escape(article['title'])
        url = html.escape(article.get('url', '#'))
        summary = html.escape(article.get('summary', '')[:200])
        parts.append(
            f'<h3><a href="{url}">{title}</a></h3>'
            f'<p>{summary}...</p>'
        )
    return '\n'.join(parts)
VinaProxy + News Aggregation
- Scrape global news sites
- Bypass geo-restrictions
- Giá chỉ $0.5/GB
