Initial commit - cleaned repository
This commit is contained in:
365
backup-services/news-aggregator/backend/app/main.py
Normal file
365
backup-services/news-aggregator/backend/app/main.py
Normal file
@ -0,0 +1,365 @@
|
||||
"""
|
||||
News Aggregator Service
|
||||
RSS 피드 제목을 구글 검색으로 확장하는 통합 서비스
|
||||
"""
|
||||
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from typing import List, Optional, Dict, Any
|
||||
from datetime import datetime
|
||||
import httpx
|
||||
import asyncio
|
||||
from pydantic import BaseModel
|
||||
import logging
|
||||
|
||||
# Module-level logging setup.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application object.
app = FastAPI(
    title="News Aggregator Service",
    description="RSS 피드와 구글 검색을 통합한 뉴스 수집 서비스",
    version="1.0.0",
)

# CORS: wide open so any frontend origin can call this service.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Upstream service base URLs (docker-compose service names).
RSS_SERVICE_URL = "http://rss-feed-backend:8000"
GOOGLE_SEARCH_SERVICE_URL = "http://google-search-backend:8000"
|
||||
|
||||
# Response Models
class NewsItem(BaseModel):
    """A single aggregated news item: one RSS headline plus its Google results."""
    rss_title: str                            # original RSS entry title
    rss_link: Optional[str] = None            # link from the RSS entry, if any
    google_results: List[Dict[str, Any]] = []  # results from the Google search service
    search_keyword: str                       # cleaned title actually sent to Google
    # Fix: was `timestamp: datetime = None` — a None default on a non-optional
    # field, which pydantic v2 rejects at model definition time.
    timestamp: Optional[datetime] = None
|
||||
|
||||
class AggregatedNews(BaseModel):
    """Aggregated news result returned by the /api/aggregate* endpoints."""
    keyword: str                # search keyword (or "Location: ..." / "Topic: ...")
    rss_feed_url: str           # URL of the RSS feed that was fetched
    total_rss_entries: int      # entry count reported by the RSS service
    processed_entries: int      # entries actually searched on Google
    news_items: List[NewsItem]  # per-entry combined results
    processing_time: float      # wall-clock seconds for the whole aggregation
|
||||
|
||||
# Shared HTTP client for all outbound calls; closed in the shutdown handler.
client = httpx.AsyncClient(timeout=30.0)
|
||||
|
||||
@app.on_event("startup")
async def startup():
    """Log that the service is starting."""
    logger.info("News Aggregator Service starting...")
|
||||
|
||||
@app.on_event("shutdown")
async def shutdown():
    """Close the shared HTTP client and log shutdown."""
    await client.aclose()
    logger.info("News Aggregator Service stopped")
|
||||
|
||||
@app.get("/")
async def root():
    """Service metadata plus a short index of the available endpoints."""
    endpoints = {
        "aggregate": "GET /api/aggregate",
        "aggregate_by_location": "GET /api/aggregate/location",
        "aggregate_by_topic": "GET /api/aggregate/topic",
        "health": "GET /health",
    }
    return {
        "service": "News Aggregator Service",
        "version": "1.0.0",
        "description": "RSS 피드와 구글 검색 통합 서비스",
        "endpoints": endpoints,
    }
|
||||
|
||||
@app.get("/health")
async def health_check():
    """Health check.

    Probes both upstream services independently and concurrently. Fix: in the
    original, a connection error to either service aborted the whole check and
    returned a single opaque "unhealthy" with no per-service detail; now a
    failing probe only marks that one service unhealthy.
    """
    async def _probe(base_url: str) -> bool:
        # Any failure (connection error, timeout, non-200) counts as unhealthy
        # for that service, without aborting the other probe.
        try:
            response = await client.get(f"{base_url}/health")
            return response.status_code == 200
        except Exception:
            return False

    rss_healthy, google_healthy = await asyncio.gather(
        _probe(RSS_SERVICE_URL), _probe(GOOGLE_SEARCH_SERVICE_URL)
    )

    if rss_healthy and google_healthy:
        status = "healthy"
    elif rss_healthy or google_healthy:
        status = "degraded"
    else:
        status = "unhealthy"

    return {
        "status": status,
        "services": {
            "rss_feed": "healthy" if rss_healthy else "unhealthy",
            "google_search": "healthy" if google_healthy else "unhealthy",
        },
        "timestamp": datetime.now().isoformat(),
    }
|
||||
|
||||
@app.get("/api/aggregate", response_model=AggregatedNews)
async def aggregate_news(
    q: str = Query(..., description="검색 키워드"),
    limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
    google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Search an RSS feed by keyword, then re-search each headline on Google.

    1. Fetch the Google News RSS feed for the keyword.
    2. Run a Google search for each RSS entry title (in parallel).
    3. Return the combined result.
    """
    start_time = datetime.now()

    try:
        # Step 1: fetch the RSS feed for the keyword.
        logger.info(f"Fetching RSS feed for keyword: {q}")
        rss_response = await client.get(
            f"{RSS_SERVICE_URL}/api/google-rss/search",
            params={"q": q, "lang": lang, "country": country}
        )
        rss_response.raise_for_status()
        rss_data = rss_response.json()

        if not rss_data.get("success"):
            raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")

        # Step 2: search Google for each RSS entry.
        news_items = []
        entries = rss_data.get("entries", [])

        # Fall back to sample_titles when the RSS service returns no entries.
        if not entries:
            titles = rss_data.get("sample_titles", [])[:limit]
            entries = [{"title": title, "link": "", "published": ""} for title in titles]
        else:
            entries = entries[:limit]

        # Build one search coroutine per entry.
        # NOTE(review): `[-1]` keeps the segment AFTER the last " - "; for
        # typical "Headline - Publisher" Google News titles this searches the
        # publisher name, not the headline — confirm intent (perhaps [0] was meant).
        search_tasks = []
        for entry in entries:
            title = entry.get("title", "")
            clean_title = title.split(" - ")[-1] if " - " in title else title
            search_tasks.append(
                search_google(clean_title, google_results_per_title, lang, country)
            )

        # Execute searches in parallel; per-task exceptions are captured.
        logger.info(f"Searching Google for {len(search_tasks)} RSS entries")
        search_results = await asyncio.gather(*search_tasks, return_exceptions=True)

        # Combine each RSS entry with its Google results (empty list on failure).
        for entry, result in zip(entries, search_results):
            google_results = [] if isinstance(result, Exception) else result
            title = entry.get("title", "")
            news_items.append(NewsItem(
                rss_title=title,
                rss_link=entry.get("link", ""),
                google_results=google_results,
                search_keyword=title.split(" - ")[-1] if " - " in title else title,
                timestamp=datetime.now()
            ))

        processing_time = (datetime.now() - start_time).total_seconds()

        return AggregatedNews(
            keyword=q,
            rss_feed_url=rss_data.get("feed_url", ""),
            total_rss_entries=rss_data.get("entry_count", 0),
            processed_entries=len(news_items),
            news_items=news_items,
            processing_time=processing_time
        )

    except HTTPException:
        # Fix: without this clause the HTTPException raised above was caught
        # by the broad `except Exception` below and re-wrapped, mangling the
        # detail into "500: ...". Re-raise it untouched.
        raise
    except httpx.HTTPStatusError as e:
        logger.error(f"HTTP error: {e}")
        raise HTTPException(status_code=e.response.status_code, detail=str(e))
    except Exception as e:
        logger.error(f"Error in aggregate_news: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
async def search_google(query: str, num_results: int, lang: str, country: str) -> List[Dict[str, Any]]:
    """Call the Google search service, preferring the full-content endpoint.

    Falls back to the basic /api/search endpoint when the full-content call
    fails, and returns an empty list when both fail — best-effort semantics
    so a single failed search never breaks the whole aggregation.
    """
    params = {
        "q": query,
        "num": num_results,
        "lang": lang,
        "country": country
    }
    try:
        # Primary: results with full page content already included.
        response = await client.get(
            f"{GOOGLE_SEARCH_SERVICE_URL}/api/search/full",
            params=params
        )
        response.raise_for_status()
        data = response.json()
        results = data.get("results", [])
        logger.info(f"Google search for '{query}' returned {len(results)} results with full content")
        return results
    except Exception as e:
        logger.error(f"Google search error for '{query}': {e}")
        # Fallback: basic search without full content.
        try:
            response = await client.get(
                f"{GOOGLE_SEARCH_SERVICE_URL}/api/search",
                params=params
            )
            response.raise_for_status()
            data = response.json()
            return data.get("results", [])
        except Exception as fallback_error:
            # Fix: was a bare `except:` that also swallowed SystemExit /
            # KeyboardInterrupt and hid the failure entirely; narrow and log.
            logger.error(f"Fallback Google search failed for '{query}': {fallback_error}")
            return []
|
||||
|
||||
@app.get("/api/aggregate/location", response_model=AggregatedNews)
async def aggregate_news_by_location(
    location: str = Query(..., description="지역명 (예: Seoul, Tokyo)"),
    limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
    google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Fetch the location-based RSS feed and Google-search each headline."""
    start_time = datetime.now()

    try:
        # Fetch the location-based RSS feed.
        logger.info(f"Fetching RSS feed for location: {location}")
        rss_response = await client.get(
            f"{RSS_SERVICE_URL}/api/google-rss/location",
            params={"location": location, "lang": lang, "country": country}
        )
        rss_response.raise_for_status()
        rss_data = rss_response.json()

        if not rss_data.get("success"):
            raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")

        # One Google search per sample title, run in parallel.
        news_items = []
        titles = rss_data.get("sample_titles", [])[:limit]

        # NOTE(review): `[-1]` keeps the segment after the last " - "; for
        # "Headline - Publisher" titles that is the publisher — confirm intent.
        search_tasks = []
        for title in titles:
            clean_title = title.split(" - ")[-1] if " - " in title else title
            search_tasks.append(
                search_google(clean_title, google_results_per_title, lang, country)
            )

        search_results = await asyncio.gather(*search_tasks, return_exceptions=True)

        # Pair each title with its results (empty list when the search failed).
        for title, result in zip(titles, search_results):
            google_results = [] if isinstance(result, Exception) else result
            news_items.append(NewsItem(
                rss_title=title,
                google_results=google_results,
                search_keyword=title.split(" - ")[-1] if " - " in title else title,
                timestamp=datetime.now()
            ))

        processing_time = (datetime.now() - start_time).total_seconds()

        return AggregatedNews(
            keyword=f"Location: {location}",
            rss_feed_url=rss_data.get("feed_url", ""),
            total_rss_entries=rss_data.get("entry_count", 0),
            processed_entries=len(news_items),
            news_items=news_items,
            processing_time=processing_time
        )

    except HTTPException:
        # Fix: keep deliberate HTTP errors (e.g. the RSS-failure 500 above)
        # intact instead of re-wrapping them in the broad handler below.
        raise
    except Exception as e:
        logger.error(f"Error in aggregate_news_by_location: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.get("/api/aggregate/topic", response_model=AggregatedNews)
async def aggregate_news_by_topic(
    category: str = Query(..., description="카테고리 (TECHNOLOGY, BUSINESS, HEALTH 등)"),
    limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
    google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Fetch the topic-based RSS feed and Google-search each headline."""
    start_time = datetime.now()

    try:
        # Fetch the topic-based RSS feed.
        logger.info(f"Fetching RSS feed for topic: {category}")
        rss_response = await client.get(
            f"{RSS_SERVICE_URL}/api/google-rss/topic",
            params={"category": category, "lang": lang, "country": country}
        )
        rss_response.raise_for_status()
        rss_data = rss_response.json()

        if not rss_data.get("success"):
            raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")

        # One Google search per sample title, run in parallel.
        news_items = []
        titles = rss_data.get("sample_titles", [])[:limit]

        # NOTE(review): `[-1]` keeps the segment after the last " - "; for
        # "Headline - Publisher" titles that is the publisher — confirm intent.
        search_tasks = []
        for title in titles:
            clean_title = title.split(" - ")[-1] if " - " in title else title
            search_tasks.append(
                search_google(clean_title, google_results_per_title, lang, country)
            )

        search_results = await asyncio.gather(*search_tasks, return_exceptions=True)

        # Pair each title with its results (empty list when the search failed).
        for title, result in zip(titles, search_results):
            google_results = [] if isinstance(result, Exception) else result
            news_items.append(NewsItem(
                rss_title=title,
                google_results=google_results,
                search_keyword=title.split(" - ")[-1] if " - " in title else title,
                timestamp=datetime.now()
            ))

        processing_time = (datetime.now() - start_time).total_seconds()

        return AggregatedNews(
            keyword=f"Topic: {category}",
            rss_feed_url=rss_data.get("feed_url", ""),
            total_rss_entries=rss_data.get("entry_count", 0),
            processed_entries=len(news_items),
            news_items=news_items,
            processing_time=processing_time
        )

    except HTTPException:
        # Fix: keep deliberate HTTP errors (e.g. the RSS-failure 500 above)
        # intact instead of re-wrapping them in the broad handler below.
        raise
    except Exception as e:
        logger.error(f"Error in aggregate_news_by_topic: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
Reference in New Issue
Block a user