365 lines
13 KiB
Python
365 lines
13 KiB
Python
"""
|
|
News Aggregator Service
|
|
RSS 피드 제목을 구글 검색으로 확장하는 통합 서비스
|
|
"""
|
|
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from typing import List, Optional, Dict, Any
|
|
from datetime import datetime
|
|
import httpx
|
|
import asyncio
|
|
from pydantic import BaseModel
|
|
import logging
|
|
|
|
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="News Aggregator Service",
    description="RSS 피드와 구글 검색을 통합한 뉴스 수집 서비스",
    version="1.0.0"
)

# CORS configuration: fully open (all origins/methods/headers).
# NOTE(review): "*" origins with allow_credentials=True is permissive —
# tighten for production deployments.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Configuration: upstream service base URLs.
# Presumably docker-compose/k8s service names — confirm against deployment.
RSS_SERVICE_URL = "http://rss-feed-backend:8000"
GOOGLE_SEARCH_SERVICE_URL = "http://google-search-backend:8000"
|
|
# Response Models
|
|
class NewsItem(BaseModel):
    """A single aggregated item: one RSS entry plus its Google search results."""
    # Original RSS entry title (may carry a trailing " - publisher" suffix)
    rss_title: str
    # Link from the RSS entry; None/empty when not available
    rss_link: Optional[str] = None
    # Raw result dicts returned by the Google search service
    google_results: List[Dict[str, Any]] = []
    # Keyword actually sent to Google (cleaned variant of rss_title)
    search_keyword: str
    # When this item was assembled.
    # Fix: annotated Optional — the original bare `datetime = None` default
    # contradicts its annotation and is rejected by pydantic v2.
    timestamp: Optional[datetime] = None
|
|
|
class AggregatedNews(BaseModel):
    """Aggregated response for one keyword/location/topic query."""
    # Echo of the query (keyword, or a "Location: ..." / "Topic: ..." label)
    keyword: str
    # URL of the RSS feed that was fetched
    rss_feed_url: str
    # Total entries reported by the RSS service (before the `limit` cut)
    total_rss_entries: int
    # Number of entries actually searched and returned in `news_items`
    processed_entries: int
    # Per-entry combined results
    news_items: List[NewsItem]
    # Wall-clock processing time in seconds
    processing_time: float
|
|
|
|
# HTTP Client: one shared async client for all upstream calls,
# closed in the shutdown handler below.
client = httpx.AsyncClient(timeout=30.0)
|
|
|
@app.on_event("startup")
|
|
async def startup():
|
|
"""서비스 시작"""
|
|
logger.info("News Aggregator Service starting...")
|
|
|
|
@app.on_event("shutdown")
|
|
async def shutdown():
|
|
"""서비스 종료"""
|
|
await client.aclose()
|
|
logger.info("News Aggregator Service stopped")
|
|
|
|
@app.get("/")
|
|
async def root():
|
|
return {
|
|
"service": "News Aggregator Service",
|
|
"version": "1.0.0",
|
|
"description": "RSS 피드와 구글 검색 통합 서비스",
|
|
"endpoints": {
|
|
"aggregate": "GET /api/aggregate",
|
|
"aggregate_by_location": "GET /api/aggregate/location",
|
|
"aggregate_by_topic": "GET /api/aggregate/topic",
|
|
"health": "GET /health"
|
|
}
|
|
}
|
|
|
|
@app.get("/health")
|
|
async def health_check():
|
|
"""헬스 체크"""
|
|
try:
|
|
# Check RSS service
|
|
rss_response = await client.get(f"{RSS_SERVICE_URL}/health")
|
|
rss_healthy = rss_response.status_code == 200
|
|
|
|
# Check Google Search service
|
|
google_response = await client.get(f"{GOOGLE_SEARCH_SERVICE_URL}/health")
|
|
google_healthy = google_response.status_code == 200
|
|
|
|
return {
|
|
"status": "healthy" if (rss_healthy and google_healthy) else "degraded",
|
|
"services": {
|
|
"rss_feed": "healthy" if rss_healthy else "unhealthy",
|
|
"google_search": "healthy" if google_healthy else "unhealthy"
|
|
},
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
except Exception as e:
|
|
return {
|
|
"status": "unhealthy",
|
|
"error": str(e),
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
@app.get("/api/aggregate", response_model=AggregatedNews)
|
|
async def aggregate_news(
|
|
q: str = Query(..., description="검색 키워드"),
|
|
limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
|
|
google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
|
|
lang: str = Query("ko", description="언어 코드"),
|
|
country: str = Query("KR", description="국가 코드")
|
|
):
|
|
"""
|
|
키워드로 RSS 피드를 검색하고, 각 제목을 구글에서 재검색
|
|
|
|
1. 키워드로 Google News RSS 피드 가져오기
|
|
2. RSS 피드의 각 제목을 구글 검색
|
|
3. 통합 결과 반환
|
|
"""
|
|
start_time = datetime.now()
|
|
|
|
try:
|
|
# Step 1: Get RSS feed from keyword
|
|
logger.info(f"Fetching RSS feed for keyword: {q}")
|
|
rss_response = await client.get(
|
|
f"{RSS_SERVICE_URL}/api/google-rss/search",
|
|
params={"q": q, "lang": lang, "country": country}
|
|
)
|
|
rss_response.raise_for_status()
|
|
rss_data = rss_response.json()
|
|
|
|
if not rss_data.get("success"):
|
|
raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")
|
|
|
|
# Step 2: Process each RSS entry with Google search
|
|
news_items = []
|
|
entries = rss_data.get("entries", [])
|
|
|
|
# If no entries field, fallback to sample_titles
|
|
if not entries:
|
|
titles = rss_data.get("sample_titles", [])[:limit]
|
|
entries = [{"title": title, "link": "", "published": ""} for title in titles]
|
|
else:
|
|
entries = entries[:limit]
|
|
|
|
# Create tasks for parallel processing
|
|
search_tasks = []
|
|
for entry in entries:
|
|
title = entry.get("title", "")
|
|
# Clean title for better search results
|
|
clean_title = title.split(" - ")[-1] if " - " in title else title
|
|
search_tasks.append(
|
|
search_google(clean_title, google_results_per_title, lang, country)
|
|
)
|
|
|
|
# Execute searches in parallel
|
|
logger.info(f"Searching Google for {len(search_tasks)} RSS entries")
|
|
search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
|
|
|
|
# Combine results
|
|
for i, entry in enumerate(entries):
|
|
google_results = []
|
|
if not isinstance(search_results[i], Exception):
|
|
google_results = search_results[i]
|
|
|
|
title = entry.get("title", "")
|
|
news_items.append(NewsItem(
|
|
rss_title=title,
|
|
rss_link=entry.get("link", ""),
|
|
google_results=google_results,
|
|
search_keyword=title.split(" - ")[-1] if " - " in title else title,
|
|
timestamp=datetime.now()
|
|
))
|
|
|
|
# Calculate processing time
|
|
processing_time = (datetime.now() - start_time).total_seconds()
|
|
|
|
return AggregatedNews(
|
|
keyword=q,
|
|
rss_feed_url=rss_data.get("feed_url", ""),
|
|
total_rss_entries=rss_data.get("entry_count", 0),
|
|
processed_entries=len(news_items),
|
|
news_items=news_items,
|
|
processing_time=processing_time
|
|
)
|
|
|
|
except httpx.HTTPStatusError as e:
|
|
logger.error(f"HTTP error: {e}")
|
|
raise HTTPException(status_code=e.response.status_code, detail=str(e))
|
|
except Exception as e:
|
|
logger.error(f"Error in aggregate_news: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
async def search_google(query: str, num_results: int, lang: str, country: str) -> List[Dict[str, Any]]:
    """Call the Google search service, preferring the full-content endpoint.

    Falls back to the basic search endpoint when the full-content call fails,
    and returns an empty list when both fail — best-effort, never raises.

    Args:
        query: Search keyword.
        num_results: Number of results to request.
        lang: Language code (e.g. "ko").
        country: Country code (e.g. "KR").

    Returns:
        List of raw result dicts from the search service (possibly empty).
    """
    params = {
        "q": query,
        "num": num_results,
        "lang": lang,
        "country": country
    }
    try:
        # Full-content API: results already include full_content, so they
        # are returned as-is.
        response = await client.get(
            f"{GOOGLE_SEARCH_SERVICE_URL}/api/search/full",
            params=params
        )
        response.raise_for_status()
        data = response.json()
        results = data.get("results", [])
        logger.info(f"Google search for '{query}' returned {len(results)} results with full content")
        return results
    except Exception as e:
        logger.error(f"Google search error for '{query}': {e}")
        # Fallback to basic search without full content
        try:
            response = await client.get(
                f"{GOOGLE_SEARCH_SERVICE_URL}/api/search",
                params=params
            )
            response.raise_for_status()
            data = response.json()
            return data.get("results", [])
        except Exception as fallback_error:
            # Fix: the original bare `except:` also swallowed BaseException,
            # including asyncio.CancelledError, which can break task
            # cancellation when this coroutine runs under asyncio.gather.
            logger.error(f"Fallback search error for '{query}': {fallback_error}")
            return []
|
|
|
|
@app.get("/api/aggregate/location", response_model=AggregatedNews)
|
|
async def aggregate_news_by_location(
|
|
location: str = Query(..., description="지역명 (예: Seoul, Tokyo)"),
|
|
limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
|
|
google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
|
|
lang: str = Query("ko", description="언어 코드"),
|
|
country: str = Query("KR", description="국가 코드")
|
|
):
|
|
"""지역 기반 RSS 피드를 가져와서 각 제목을 구글 검색"""
|
|
start_time = datetime.now()
|
|
|
|
try:
|
|
# Get location-based RSS feed
|
|
logger.info(f"Fetching RSS feed for location: {location}")
|
|
rss_response = await client.get(
|
|
f"{RSS_SERVICE_URL}/api/google-rss/location",
|
|
params={"location": location, "lang": lang, "country": country}
|
|
)
|
|
rss_response.raise_for_status()
|
|
rss_data = rss_response.json()
|
|
|
|
if not rss_data.get("success"):
|
|
raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")
|
|
|
|
# Process titles
|
|
news_items = []
|
|
titles = rss_data.get("sample_titles", [])[:limit]
|
|
|
|
search_tasks = []
|
|
for title in titles:
|
|
clean_title = title.split(" - ")[-1] if " - " in title else title
|
|
search_tasks.append(
|
|
search_google(clean_title, google_results_per_title, lang, country)
|
|
)
|
|
|
|
search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
|
|
|
|
for i, title in enumerate(titles):
|
|
google_results = []
|
|
if not isinstance(search_results[i], Exception):
|
|
google_results = search_results[i]
|
|
|
|
news_items.append(NewsItem(
|
|
rss_title=title,
|
|
google_results=google_results,
|
|
search_keyword=title.split(" - ")[-1] if " - " in title else title,
|
|
timestamp=datetime.now()
|
|
))
|
|
|
|
processing_time = (datetime.now() - start_time).total_seconds()
|
|
|
|
return AggregatedNews(
|
|
keyword=f"Location: {location}",
|
|
rss_feed_url=rss_data.get("feed_url", ""),
|
|
total_rss_entries=rss_data.get("entry_count", 0),
|
|
processed_entries=len(news_items),
|
|
news_items=news_items,
|
|
processing_time=processing_time
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in aggregate_news_by_location: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@app.get("/api/aggregate/topic", response_model=AggregatedNews)
|
|
async def aggregate_news_by_topic(
|
|
category: str = Query(..., description="카테고리 (TECHNOLOGY, BUSINESS, HEALTH 등)"),
|
|
limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
|
|
google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
|
|
lang: str = Query("ko", description="언어 코드"),
|
|
country: str = Query("KR", description="국가 코드")
|
|
):
|
|
"""주제별 RSS 피드를 가져와서 각 제목을 구글 검색"""
|
|
start_time = datetime.now()
|
|
|
|
try:
|
|
# Get topic-based RSS feed
|
|
logger.info(f"Fetching RSS feed for topic: {category}")
|
|
rss_response = await client.get(
|
|
f"{RSS_SERVICE_URL}/api/google-rss/topic",
|
|
params={"category": category, "lang": lang, "country": country}
|
|
)
|
|
rss_response.raise_for_status()
|
|
rss_data = rss_response.json()
|
|
|
|
if not rss_data.get("success"):
|
|
raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")
|
|
|
|
# Process titles
|
|
news_items = []
|
|
titles = rss_data.get("sample_titles", [])[:limit]
|
|
|
|
search_tasks = []
|
|
for title in titles:
|
|
clean_title = title.split(" - ")[-1] if " - " in title else title
|
|
search_tasks.append(
|
|
search_google(clean_title, google_results_per_title, lang, country)
|
|
)
|
|
|
|
search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
|
|
|
|
for i, title in enumerate(titles):
|
|
google_results = []
|
|
if not isinstance(search_results[i], Exception):
|
|
google_results = search_results[i]
|
|
|
|
news_items.append(NewsItem(
|
|
rss_title=title,
|
|
google_results=google_results,
|
|
search_keyword=title.split(" - ")[-1] if " - " in title else title,
|
|
timestamp=datetime.now()
|
|
))
|
|
|
|
processing_time = (datetime.now() - start_time).total_seconds()
|
|
|
|
return AggregatedNews(
|
|
keyword=f"Topic: {category}",
|
|
rss_feed_url=rss_data.get("feed_url", ""),
|
|
total_rss_entries=rss_data.get("entry_count", 0),
|
|
processed_entries=len(news_items),
|
|
news_items=news_items,
|
|
processing_time=processing_time
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in aggregate_news_by_topic: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e)) |