News Aggregation Với Web Scraping: Tổng Hợp Tin Tức Tự Động
Tự xây dựng news aggregator theo chủ đề bạn quan tâm. Bài viết hướng dẫn scrape và tổng hợp tin tức.
Use Cases
- Industry monitoring: Theo dõi tin ngành
- Competitor news: PR và announcements đối thủ
- Market research: Trends và sentiment
- Personal feed: Custom news reader
Scrape News Sites
import html
import re
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def scrape_news_site(url, selectors):
    """Scrape article teasers from a news listing page.

    Parameters
    ----------
    url : str
        Listing page to fetch; also recorded as each article's source
        and used as the base for resolving relative links.
    selectors : dict
        CSS selectors with keys 'article', 'title', 'link' and an
        optional 'summary'.

    Returns
    -------
    list[dict]
        One dict per article: title, absolute url, summary, source and
        ISO scrape timestamp. Items missing a title or link are skipped.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    response = requests.get(url, headers={'User-Agent': '...'}, timeout=10)
    # Fail fast on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    articles = []
    for item in soup.select(selectors['article']):
        title_el = item.select_one(selectors['title'])
        link_el = item.select_one(selectors['link'])
        if title_el is None or link_el is None or not link_el.has_attr('href'):
            # Skip ads / malformed teasers that lack the expected markup
            # (select_one returns None, which would otherwise crash here).
            continue
        summary = ''
        if selectors.get('summary'):
            summary_el = item.select_one(selectors['summary'])
            if summary_el is not None:
                summary = summary_el.text.strip()
        articles.append({
            'title': title_el.text.strip(),
            # urljoin turns relative hrefs into absolute URLs so later
            # deduplication and the digest links work across sites.
            'url': urljoin(url, link_el['href']),
            'summary': summary,
            'source': url,
            'scraped_at': datetime.now().isoformat()
        })
    return articles
# Configuration for multiple news sources: each entry pairs a listing URL
# with the CSS selectors needed to pull article teasers out of that site's
# markup. Adding a new source is just another dict — no code changes.
sources = [
    {
        'url': 'https://vnexpress.net/cong-nghe',
        'selectors': {
            'article': '.item-news',
            'title': '.title-news a',
            'link': '.title-news a',
            'summary': '.description'
        }
    },
    {
        'url': 'https://techcrunch.com',
        'selectors': {
            'article': 'article',
            'title': 'h2 a',
            'link': 'h2 a',
            'summary': '.post-block__content'
        }
    }
]
all_news = []
for source in sources:
    # One unreachable or broken site should not abort the whole
    # aggregation run — log the failure and keep going.
    try:
        news = scrape_news_site(source['url'], source['selectors'])
    except requests.RequestException as exc:
        print(f"Failed to scrape {source['url']}: {exc}")
        continue
    all_news.extend(news)
    print(f"Scraped {len(news)} from {source['url']}")
RSS Alternative
import feedparser
def parse_rss(feed_url):
    """Fetch an RSS/Atom feed and map each entry to an article dict.

    Returns a list of dicts with 'title', 'url', 'summary' and
    'published' keys; summary/published fall back to '' when the feed
    omits them.
    """
    parsed = feedparser.parse(feed_url)
    return [
        {
            'title': entry.title,
            'url': entry.link,
            'summary': entry.get('summary', ''),
            'published': entry.get('published', '')
        }
        for entry in parsed.entries
    ]
# RSS feeds (when available) — cheaper and more stable than scraping HTML.
rss_feeds = [
    'https://vnexpress.net/rss/cong-nghe.rss',
    'https://techcrunch.com/feed/'
]
for feed_url in rss_feeds:
    all_news.extend(parse_rss(feed_url))
Deduplication
from urllib.parse import urlparse
def normalize_url(url):
    """Canonicalize *url* into a dedup fingerprint.

    Drops the scheme, query string and fragment, lowercases the host
    (hostnames are case-insensitive per RFC 3986) and strips a trailing
    slash, so http/https variants and tracking-parameter copies of the
    same story collapse to one key.
    """
    parsed = urlparse(url)
    return f"{parsed.netloc.lower()}{parsed.path}".rstrip('/')
def deduplicate(articles):
    """Return *articles* with URL-duplicates removed.

    The first occurrence of each normalized URL wins; relative order of
    the survivors is preserved.
    """
    unique = []
    seen = set()
    for article in articles:
        fingerprint = normalize_url(article['url'])
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique.append(article)
    return unique

all_news = deduplicate(all_news)
Keyword Filtering
keywords = ['AI', 'startup', 'funding', 'technology']
def matches_keywords(article, keywords):
    """Return True if any keyword appears as a whole word in the
    article's title or summary (case-insensitive).

    Whole-word matching (``\\b`` boundaries) avoids false positives such
    as the keyword 'AI' matching inside 'said' or 'email'; ``re.escape``
    keeps keywords containing regex metacharacters safe.
    """
    text = f"{article['title']} {article.get('summary', '')}"
    return any(
        re.search(rf"\b{re.escape(kw)}\b", text, re.IGNORECASE) is not None
        for kw in keywords
    )
filtered = [a for a in all_news if matches_keywords(a, keywords)]
Daily Digest Email
def create_digest(articles):
    """Render up to 20 articles as an HTML digest body for email.

    NOTE(review): the original snippet's HTML tags were stripped during
    extraction (leaving an unterminated string literal); the markup
    below reconstructs the evident intent — a heading followed by a
    title/summary section per article.

    Scraped titles and summaries are untrusted input, so they are
    HTML-escaped before being embedded in the markup.
    """
    parts = ["<h1>Daily News Digest</h1>"]
    for article in articles[:20]:
        title = html.escape(article['title'])
        url = html.escape(article.get('url', '#'))
        summary = html.escape(article.get('summary', '')[:200])
        parts.append(
            f'<h3><a href="{url}">{title}</a></h3>'
            f'<p>{summary}...</p>'
        )
    return '\n'.join(parts)
VinaProxy + News Aggregation
- Scrape global news sites
- Bypass geo-restrictions
- Giá chỉ $0.5/GB
