Files
site11/backup-services/news-aggregator/backend/app/main.py
jungwoo choi 070032006e feat: Implement async queue-based news pipeline with microservices
Major architectural transformation from synchronous to asynchronous processing:

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)

## Key Features
- Redis-based job queue with deduplication
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with site11_ prefix

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API: key supplied via environment configuration (secret redacted — API keys must never be committed to version control or commit messages)
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-13 19:22:14 +09:00

365 lines
13 KiB
Python

"""
News Aggregator Service
RSS 피드 제목을 구글 검색으로 확장하는 통합 서비스
"""
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Optional, Dict, Any
from datetime import datetime
import httpx
import asyncio
from pydantic import BaseModel
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application; title/description/version surface in the auto-generated
# OpenAPI docs.
app = FastAPI(
    title="News Aggregator Service",
    description="RSS 피드와 구글 검색을 통합한 뉴스 수집 서비스",
    version="1.0.0"
)

# CORS configuration: wide open (any origin/method/header).
# NOTE(review): the CORS spec disallows allow_origins=["*"] together with
# allow_credentials=True — browsers will reject credentialed requests.
# Presumably acceptable for an internal Docker network; confirm before
# exposing this service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Configuration: Docker-internal hostnames of the two upstream services.
RSS_SERVICE_URL = "http://rss-feed-backend:8000"
GOOGLE_SEARCH_SERVICE_URL = "http://google-search-backend:8000"
# Response Models
class NewsItem(BaseModel):
    """One aggregated news item: an RSS headline plus the Google search
    results retrieved for it."""
    rss_title: str                             # original RSS headline
    rss_link: Optional[str] = None             # link from the RSS entry, if any
    google_results: List[Dict[str, Any]] = []  # raw result dicts from the search service
    search_keyword: str                        # cleaned title actually sent to Google
    # Fix: the annotation must admit None since None is the default —
    # `timestamp: datetime = None` lies to validators and type checkers.
    timestamp: Optional[datetime] = None       # when this item was assembled
class AggregatedNews(BaseModel):
    """Aggregated response: RSS feed metadata plus one NewsItem per
    processed entry."""
    keyword: str                # search keyword (or a "Location: ..."/"Topic: ..." label)
    rss_feed_url: str           # URL of the RSS feed that was fetched
    total_rss_entries: int      # entry count reported by the RSS service
    processed_entries: int      # entries actually processed (bounded by `limit`)
    news_items: List[NewsItem]  # one item per processed RSS entry
    processing_time: float      # wall-clock seconds for the whole aggregation
# Shared module-level HTTP client: reused across all requests for connection
# pooling, closed in the shutdown handler. 30-second total timeout per request.
client = httpx.AsyncClient(timeout=30.0)
@app.on_event("startup")
async def startup():
    """Log service start.

    NOTE(review): `on_event` is deprecated in recent FastAPI releases in
    favor of lifespan handlers — confirm the pinned FastAPI version
    before migrating.
    """
    logger.info("News Aggregator Service starting...")
@app.on_event("shutdown")
async def shutdown():
    """Close the shared HTTP client so pooled connections are released."""
    await client.aclose()
    logger.info("News Aggregator Service stopped")
@app.get("/")
async def root():
    """Service metadata plus a map of the available endpoints."""
    endpoint_map = {
        "aggregate": "GET /api/aggregate",
        "aggregate_by_location": "GET /api/aggregate/location",
        "aggregate_by_topic": "GET /api/aggregate/topic",
        "health": "GET /health",
    }
    return {
        "service": "News Aggregator Service",
        "version": "1.0.0",
        "description": "RSS 피드와 구글 검색 통합 서비스",
        "endpoints": endpoint_map,
    }
@app.get("/health")
async def health_check():
    """Probe both downstream services and report aggregate health.

    Returns:
        "healthy"  — both the RSS and Google Search services answered 200;
        "degraded" — at least one answered with a non-200 status;
        "unhealthy" — a probe raised (connection error, timeout, ...).
    """
    try:
        # Improvement: probe both dependencies concurrently instead of
        # sequentially; a raising coroutine propagates to the except below,
        # matching the original "any failure -> unhealthy" behavior.
        rss_response, google_response = await asyncio.gather(
            client.get(f"{RSS_SERVICE_URL}/health"),
            client.get(f"{GOOGLE_SEARCH_SERVICE_URL}/health"),
        )
        rss_healthy = rss_response.status_code == 200
        google_healthy = google_response.status_code == 200
        return {
            "status": "healthy" if (rss_healthy and google_healthy) else "degraded",
            "services": {
                "rss_feed": "healthy" if rss_healthy else "unhealthy",
                "google_search": "healthy" if google_healthy else "unhealthy"
            },
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        # Report the failure in the payload instead of surfacing a 500.
        return {
            "status": "unhealthy",
            "error": str(e),
            "timestamp": datetime.now().isoformat()
        }
@app.get("/api/aggregate", response_model=AggregatedNews)
async def aggregate_news(
    q: str = Query(..., description="검색 키워드"),
    limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
    google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Search the RSS feed for a keyword, then re-search each title on Google.

    Pipeline:
      1. Fetch the Google News RSS feed for the keyword.
      2. Search Google for each RSS entry title (in parallel).
      3. Return the combined result.

    Raises:
        HTTPException: 500 when the RSS service reports failure or an
            unexpected error occurs; the downstream status code when the
            RSS HTTP call itself fails.
    """
    start_time = datetime.now()

    def _clean(title: str) -> str:
        # Google News titles usually end with " - <publisher>"; keep only the
        # last segment so the search query is not polluted by the source name.
        return title.split(" - ")[-1] if " - " in title else title

    try:
        # Step 1: fetch the RSS feed for the keyword.
        logger.info(f"Fetching RSS feed for keyword: {q}")
        rss_response = await client.get(
            f"{RSS_SERVICE_URL}/api/google-rss/search",
            params={"q": q, "lang": lang, "country": country}
        )
        rss_response.raise_for_status()
        rss_data = rss_response.json()
        if not rss_data.get("success"):
            raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")

        # Step 2: collect entries, falling back to sample_titles when the
        # payload carries no full "entries" list.
        entries = rss_data.get("entries", [])
        if not entries:
            titles = rss_data.get("sample_titles", [])[:limit]
            entries = [{"title": title, "link": "", "published": ""} for title in titles]
        else:
            entries = entries[:limit]

        # Fire all Google searches concurrently; return_exceptions keeps one
        # failed search from cancelling the rest.
        search_tasks = [
            search_google(_clean(entry.get("title", "")), google_results_per_title, lang, country)
            for entry in entries
        ]
        logger.info(f"Searching Google for {len(search_tasks)} RSS entries")
        search_results = await asyncio.gather(*search_tasks, return_exceptions=True)

        # Step 3: merge each RSS entry with its Google results; a failed
        # search degrades to an empty result list.
        news_items = []
        for entry, result in zip(entries, search_results):
            title = entry.get("title", "")
            news_items.append(NewsItem(
                rss_title=title,
                rss_link=entry.get("link", ""),
                google_results=[] if isinstance(result, Exception) else result,
                search_keyword=_clean(title),
                timestamp=datetime.now()
            ))

        processing_time = (datetime.now() - start_time).total_seconds()
        return AggregatedNews(
            keyword=q,
            rss_feed_url=rss_data.get("feed_url", ""),
            total_rss_entries=rss_data.get("entry_count", 0),
            processed_entries=len(news_items),
            news_items=news_items,
            processing_time=processing_time
        )
    except HTTPException:
        # Bug fix: let the deliberate HTTPException raised above propagate
        # untouched instead of being re-wrapped by the generic handler below.
        raise
    except httpx.HTTPStatusError as e:
        logger.error(f"HTTP error: {e}")
        raise HTTPException(status_code=e.response.status_code, detail=str(e))
    except Exception as e:
        logger.error(f"Error in aggregate_news: {e}")
        raise HTTPException(status_code=500, detail=str(e))
async def search_google(query: str, num_results: int, lang: str, country: str) -> List[Dict[str, Any]]:
    """Call the Google Search service, preferring the full-content API.

    Falls back to the basic search endpoint when the full-content call
    fails, and returns an empty list when both fail — callers treat an
    empty list as "no results" rather than an error.
    """
    try:
        # Full-content API first (results already include full_content).
        response = await client.get(
            f"{GOOGLE_SEARCH_SERVICE_URL}/api/search/full",
            params={
                "q": query,
                "num": num_results,
                "lang": lang,
                "country": country
            }
        )
        response.raise_for_status()
        data = response.json()
        results = data.get("results", [])
        logger.info(f"Google search for '{query}' returned {len(results)} results with full content")
        return results
    except Exception as e:
        logger.error(f"Google search error for '{query}': {e}")
        # Fallback: basic search without full content.
        try:
            response = await client.get(
                f"{GOOGLE_SEARCH_SERVICE_URL}/api/search",
                params={
                    "q": query,
                    "num": num_results,
                    "lang": lang,
                    "country": country
                }
            )
            response.raise_for_status()
            data = response.json()
            return data.get("results", [])
        except Exception:
            # Bug fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt and asyncio.CancelledError during shutdown.
            return []
@app.get("/api/aggregate/location", response_model=AggregatedNews)
async def aggregate_news_by_location(
    location: str = Query(..., description="지역명 (예: Seoul, Tokyo)"),
    limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
    google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Fetch the location-based RSS feed and Google-search each title.

    Raises:
        HTTPException: 500 when the RSS service reports failure or an
            unexpected error occurs.
    """
    start_time = datetime.now()
    try:
        # Fetch the location-based RSS feed.
        logger.info(f"Fetching RSS feed for location: {location}")
        rss_response = await client.get(
            f"{RSS_SERVICE_URL}/api/google-rss/location",
            params={"location": location, "lang": lang, "country": country}
        )
        rss_response.raise_for_status()
        rss_data = rss_response.json()
        if not rss_data.get("success"):
            raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")

        # Location feeds expose only sample titles, not full entries.
        titles = rss_data.get("sample_titles", [])[:limit]
        # Strip the trailing " - <publisher>" suffix before searching.
        keywords = [t.split(" - ")[-1] if " - " in t else t for t in titles]
        search_results = await asyncio.gather(
            *(search_google(kw, google_results_per_title, lang, country) for kw in keywords),
            return_exceptions=True
        )

        # A failed search degrades to an empty result list for that title.
        news_items = [
            NewsItem(
                rss_title=title,
                google_results=[] if isinstance(result, Exception) else result,
                search_keyword=keyword,
                timestamp=datetime.now()
            )
            for title, keyword, result in zip(titles, keywords, search_results)
        ]

        processing_time = (datetime.now() - start_time).total_seconds()
        return AggregatedNews(
            keyword=f"Location: {location}",
            rss_feed_url=rss_data.get("feed_url", ""),
            total_rss_entries=rss_data.get("entry_count", 0),
            processed_entries=len(news_items),
            news_items=news_items,
            processing_time=processing_time
        )
    except HTTPException:
        # Bug fix: don't let the generic handler below re-wrap the deliberate
        # HTTPException raised on RSS failure.
        raise
    except Exception as e:
        logger.error(f"Error in aggregate_news_by_location: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/aggregate/topic", response_model=AggregatedNews)
async def aggregate_news_by_topic(
    category: str = Query(..., description="카테고리 (TECHNOLOGY, BUSINESS, HEALTH 등)"),
    limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
    google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Fetch the topic-based RSS feed and Google-search each title.

    Raises:
        HTTPException: 500 when the RSS service reports failure or an
            unexpected error occurs.
    """
    start_time = datetime.now()
    try:
        # Fetch the topic-based RSS feed.
        logger.info(f"Fetching RSS feed for topic: {category}")
        rss_response = await client.get(
            f"{RSS_SERVICE_URL}/api/google-rss/topic",
            params={"category": category, "lang": lang, "country": country}
        )
        rss_response.raise_for_status()
        rss_data = rss_response.json()
        if not rss_data.get("success"):
            raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")

        # Topic feeds expose only sample titles, not full entries.
        titles = rss_data.get("sample_titles", [])[:limit]
        # Strip the trailing " - <publisher>" suffix before searching.
        keywords = [t.split(" - ")[-1] if " - " in t else t for t in titles]
        search_results = await asyncio.gather(
            *(search_google(kw, google_results_per_title, lang, country) for kw in keywords),
            return_exceptions=True
        )

        # A failed search degrades to an empty result list for that title.
        news_items = [
            NewsItem(
                rss_title=title,
                google_results=[] if isinstance(result, Exception) else result,
                search_keyword=keyword,
                timestamp=datetime.now()
            )
            for title, keyword, result in zip(titles, keywords, search_results)
        ]

        processing_time = (datetime.now() - start_time).total_seconds()
        return AggregatedNews(
            keyword=f"Topic: {category}",
            rss_feed_url=rss_data.get("feed_url", ""),
            total_rss_entries=rss_data.get("entry_count", 0),
            processed_entries=len(news_items),
            news_items=news_items,
            processing_time=processing_time
        )
    except HTTPException:
        # Bug fix: don't let the generic handler below re-wrap the deliberate
        # HTTPException raised on RSS failure.
        raise
    except Exception as e:
        logger.error(f"Error in aggregate_news_by_topic: {e}")
        raise HTTPException(status_code=500, detail=str(e))