Initial commit - cleaned repository

This commit is contained in:
jungwoo choi
2025-09-28 20:41:57 +09:00
commit e3c28f796a
188 changed files with 28102 additions and 0 deletions

View File

@ -0,0 +1,13 @@
FROM python:3.11-slim
WORKDIR /app
# Install dependencies first so this layer is cached unless requirements change
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application source
COPY . .
# Run the application.
# Fix: removed --reload — the file-watching auto-reloader is a development
# feature (extra process, wasted CPU, unexpected restarts) and must not run
# in a production container image.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@ -0,0 +1,365 @@
"""
News Aggregator Service
RSS 피드 제목을 구글 검색으로 확장하는 통합 서비스
"""
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Optional, Dict, Any
from datetime import datetime
import httpx
import asyncio
from pydantic import BaseModel
import logging
# Configure logging once for the whole service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="News Aggregator Service",
    description="RSS 피드와 구글 검색을 통합한 뉴스 수집 서비스",
    version="1.0.0"
)

# CORS configuration.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# maximally permissive — confirm this service is only reachable on an
# internal network before exposing it.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Base URLs of the two sibling services (Docker-network hostnames).
RSS_SERVICE_URL = "http://rss-feed-backend:8000"
GOOGLE_SEARCH_SERVICE_URL = "http://google-search-backend:8000"
# Response Models
class NewsItem(BaseModel):
    """A single aggregated news item: one RSS headline plus the Google
    search results that were retrieved for it."""
    rss_title: str                             # headline exactly as in the RSS feed
    rss_link: Optional[str] = None             # link from the RSS entry, if any
    google_results: List[Dict[str, Any]] = []  # raw result dicts from the search service
    search_keyword: str                        # cleaned title actually sent to Google
    # Fix: was annotated `datetime = None`, i.e. a non-optional field with a
    # None default, which contradicts the annotation; make it Optional.
    timestamp: Optional[datetime] = None
class AggregatedNews(BaseModel):
    """Aggregated response: metadata about the RSS feed that was queried
    plus the list of enriched news items."""
    keyword: str                # original query (or "Location: X" / "Topic: X")
    rss_feed_url: str           # URL of the RSS feed that was fetched
    total_rss_entries: int      # entry count reported by the RSS service
    processed_entries: int      # entries actually enriched (capped by `limit`)
    news_items: List[NewsItem]  # the enriched items
    processing_time: float      # wall-clock seconds for the whole aggregation
# HTTP Client — one shared AsyncClient for all outbound calls;
# closed in the shutdown handler.
client = httpx.AsyncClient(timeout=30.0)
@app.on_event("startup")
async def startup():
    """Log service start-up (no resources need initialising here)."""
    logger.info("News Aggregator Service starting...")
@app.on_event("shutdown")
async def shutdown():
    """Close the shared HTTP client on service shutdown."""
    await client.aclose()
    logger.info("News Aggregator Service stopped")
@app.get("/")
async def root():
return {
"service": "News Aggregator Service",
"version": "1.0.0",
"description": "RSS 피드와 구글 검색 통합 서비스",
"endpoints": {
"aggregate": "GET /api/aggregate",
"aggregate_by_location": "GET /api/aggregate/location",
"aggregate_by_topic": "GET /api/aggregate/topic",
"health": "GET /health"
}
}
@app.get("/health")
async def health_check():
"""헬스 체크"""
try:
# Check RSS service
rss_response = await client.get(f"{RSS_SERVICE_URL}/health")
rss_healthy = rss_response.status_code == 200
# Check Google Search service
google_response = await client.get(f"{GOOGLE_SEARCH_SERVICE_URL}/health")
google_healthy = google_response.status_code == 200
return {
"status": "healthy" if (rss_healthy and google_healthy) else "degraded",
"services": {
"rss_feed": "healthy" if rss_healthy else "unhealthy",
"google_search": "healthy" if google_healthy else "unhealthy"
},
"timestamp": datetime.now().isoformat()
}
except Exception as e:
return {
"status": "unhealthy",
"error": str(e),
"timestamp": datetime.now().isoformat()
}
@app.get("/api/aggregate", response_model=AggregatedNews)
async def aggregate_news(
q: str = Query(..., description="검색 키워드"),
limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
lang: str = Query("ko", description="언어 코드"),
country: str = Query("KR", description="국가 코드")
):
"""
키워드로 RSS 피드를 검색하고, 각 제목을 구글에서 재검색
1. 키워드로 Google News RSS 피드 가져오기
2. RSS 피드의 각 제목을 구글 검색
3. 통합 결과 반환
"""
start_time = datetime.now()
try:
# Step 1: Get RSS feed from keyword
logger.info(f"Fetching RSS feed for keyword: {q}")
rss_response = await client.get(
f"{RSS_SERVICE_URL}/api/google-rss/search",
params={"q": q, "lang": lang, "country": country}
)
rss_response.raise_for_status()
rss_data = rss_response.json()
if not rss_data.get("success"):
raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")
# Step 2: Process each RSS entry with Google search
news_items = []
entries = rss_data.get("entries", [])
# If no entries field, fallback to sample_titles
if not entries:
titles = rss_data.get("sample_titles", [])[:limit]
entries = [{"title": title, "link": "", "published": ""} for title in titles]
else:
entries = entries[:limit]
# Create tasks for parallel processing
search_tasks = []
for entry in entries:
title = entry.get("title", "")
# Clean title for better search results
clean_title = title.split(" - ")[-1] if " - " in title else title
search_tasks.append(
search_google(clean_title, google_results_per_title, lang, country)
)
# Execute searches in parallel
logger.info(f"Searching Google for {len(search_tasks)} RSS entries")
search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
# Combine results
for i, entry in enumerate(entries):
google_results = []
if not isinstance(search_results[i], Exception):
google_results = search_results[i]
title = entry.get("title", "")
news_items.append(NewsItem(
rss_title=title,
rss_link=entry.get("link", ""),
google_results=google_results,
search_keyword=title.split(" - ")[-1] if " - " in title else title,
timestamp=datetime.now()
))
# Calculate processing time
processing_time = (datetime.now() - start_time).total_seconds()
return AggregatedNews(
keyword=q,
rss_feed_url=rss_data.get("feed_url", ""),
total_rss_entries=rss_data.get("entry_count", 0),
processed_entries=len(news_items),
news_items=news_items,
processing_time=processing_time
)
except httpx.HTTPStatusError as e:
logger.error(f"HTTP error: {e}")
raise HTTPException(status_code=e.response.status_code, detail=str(e))
except Exception as e:
logger.error(f"Error in aggregate_news: {e}")
raise HTTPException(status_code=500, detail=str(e))
async def search_google(query: str, num_results: int, lang: str, country: str) -> List[Dict[str, Any]]:
    """Call the Google-search service for *query* and return its result dicts.

    Tries the full-content endpoint first; on any failure falls back to the
    basic search endpoint, and finally to an empty list, so that a single
    failed search never fails the whole aggregation.
    """
    # Both endpoints take the same query parameters — build them once.
    params = {
        "q": query,
        "num": num_results,
        "lang": lang,
        "country": country
    }
    try:
        # Call the full-content API directly.
        response = await client.get(
            f"{GOOGLE_SEARCH_SERVICE_URL}/api/search/full",
            params=params
        )
        response.raise_for_status()
        data = response.json()
        results = data.get("results", [])
        # Results already include full_content, so return them as-is.
        logger.info(f"Google search for '{query}' returned {len(results)} results with full content")
        return results
    except Exception as e:
        logger.error(f"Google search error for '{query}': {e}")
        # Fallback to basic search without full content
        try:
            response = await client.get(
                f"{GOOGLE_SEARCH_SERVICE_URL}/api/search",
                params=params
            )
            response.raise_for_status()
            data = response.json()
            return data.get("results", [])
        except Exception as fallback_error:
            # Fix: was a bare `except:` which in an async function also
            # swallows asyncio.CancelledError; narrow to Exception and log
            # instead of failing silently.
            logger.error(f"Fallback search also failed for '{query}': {fallback_error}")
            return []
@app.get("/api/aggregate/location", response_model=AggregatedNews)
async def aggregate_news_by_location(
location: str = Query(..., description="지역명 (예: Seoul, Tokyo)"),
limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
lang: str = Query("ko", description="언어 코드"),
country: str = Query("KR", description="국가 코드")
):
"""지역 기반 RSS 피드를 가져와서 각 제목을 구글 검색"""
start_time = datetime.now()
try:
# Get location-based RSS feed
logger.info(f"Fetching RSS feed for location: {location}")
rss_response = await client.get(
f"{RSS_SERVICE_URL}/api/google-rss/location",
params={"location": location, "lang": lang, "country": country}
)
rss_response.raise_for_status()
rss_data = rss_response.json()
if not rss_data.get("success"):
raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")
# Process titles
news_items = []
titles = rss_data.get("sample_titles", [])[:limit]
search_tasks = []
for title in titles:
clean_title = title.split(" - ")[-1] if " - " in title else title
search_tasks.append(
search_google(clean_title, google_results_per_title, lang, country)
)
search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
for i, title in enumerate(titles):
google_results = []
if not isinstance(search_results[i], Exception):
google_results = search_results[i]
news_items.append(NewsItem(
rss_title=title,
google_results=google_results,
search_keyword=title.split(" - ")[-1] if " - " in title else title,
timestamp=datetime.now()
))
processing_time = (datetime.now() - start_time).total_seconds()
return AggregatedNews(
keyword=f"Location: {location}",
rss_feed_url=rss_data.get("feed_url", ""),
total_rss_entries=rss_data.get("entry_count", 0),
processed_entries=len(news_items),
news_items=news_items,
processing_time=processing_time
)
except Exception as e:
logger.error(f"Error in aggregate_news_by_location: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/aggregate/topic", response_model=AggregatedNews)
async def aggregate_news_by_topic(
category: str = Query(..., description="카테고리 (TECHNOLOGY, BUSINESS, HEALTH 등)"),
limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
lang: str = Query("ko", description="언어 코드"),
country: str = Query("KR", description="국가 코드")
):
"""주제별 RSS 피드를 가져와서 각 제목을 구글 검색"""
start_time = datetime.now()
try:
# Get topic-based RSS feed
logger.info(f"Fetching RSS feed for topic: {category}")
rss_response = await client.get(
f"{RSS_SERVICE_URL}/api/google-rss/topic",
params={"category": category, "lang": lang, "country": country}
)
rss_response.raise_for_status()
rss_data = rss_response.json()
if not rss_data.get("success"):
raise HTTPException(status_code=500, detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}")
# Process titles
news_items = []
titles = rss_data.get("sample_titles", [])[:limit]
search_tasks = []
for title in titles:
clean_title = title.split(" - ")[-1] if " - " in title else title
search_tasks.append(
search_google(clean_title, google_results_per_title, lang, country)
)
search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
for i, title in enumerate(titles):
google_results = []
if not isinstance(search_results[i], Exception):
google_results = search_results[i]
news_items.append(NewsItem(
rss_title=title,
google_results=google_results,
search_keyword=title.split(" - ")[-1] if " - " in title else title,
timestamp=datetime.now()
))
processing_time = (datetime.now() - start_time).total_seconds()
return AggregatedNews(
keyword=f"Topic: {category}",
rss_feed_url=rss_data.get("feed_url", ""),
total_rss_entries=rss_data.get("entry_count", 0),
processed_entries=len(news_items),
news_items=news_items,
processing_time=processing_time
)
except Exception as e:
logger.error(f"Error in aggregate_news_by_topic: {e}")
raise HTTPException(status_code=500, detail=str(e))

View File

@ -0,0 +1,5 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
httpx==0.25.2
pydantic==2.5.0
python-multipart==0.0.6

View File

@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
News Aggregator Service Test
RSS 피드 제목을 구글 full content 검색으로 확장하는 통합 테스트
"""
import asyncio
import httpx
import json
from datetime import datetime
from typing import Dict, Any
# Service URL — the aggregator service under test (published on host port 8018).
SERVICE_URL = "http://localhost:8018"
async def test_aggregate_with_full_content():
    """Search the RSS feed by keyword and verify that the Google results
    include full content for each headline."""
    async with httpx.AsyncClient(timeout=60.0) as client:
        print("\n" + "="*60)
        print("뉴스 통합 서비스 Full Content 테스트")
        print("="*60)
        # Test with keyword "인공지능"
        print("\n1. 키워드 '인공지능'으로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)
        response = await client.get(
            f"{SERVICE_URL}/api/aggregate",
            params={
                "q": "인공지능",
                "limit": 3,  # only 3 items for the test
                "google_results_per_title": 2,  # 2 Google results per headline
                "lang": "ko",
                "country": "KR"
            }
        )
        if response.status_code == 200:
            data = response.json()
            print(f"✓ RSS 피드 URL: {data['rss_feed_url']}")
            print(f"✓ 전체 RSS 항목 수: {data['total_rss_entries']}")
            print(f"✓ 처리된 항목 수: {data['processed_entries']}")
            print(f"✓ 처리 시간: {data['processing_time']:.2f}")
            # Check each news item for full content
            for i, item in enumerate(data['news_items'], 1):
                print(f"\n [{i}] RSS 제목: {item['rss_title'][:50]}...")
                print(f" 검색 키워드: {item['search_keyword'][:50]}...")
                print(f" 구글 검색 결과 수: {len(item['google_results'])}")
                # Check if google results have full_content
                for j, result in enumerate(item['google_results'], 1):
                    has_full_content = 'full_content' in result
                    if has_full_content:
                        full_content = result.get('full_content', '')
                        if isinstance(full_content, str):
                            content_length = len(full_content)
                        else:
                            content_length = len(str(full_content))
                    else:
                        content_length = 0
                    print(f" - 결과 {j}: {result.get('title', 'N/A')[:40]}...")
                    # Fix: both ternary branches printed '' (the check-mark
                    # characters were evidently lost); restore ✓/✗ to match
                    # the markers used elsewhere in this script.
                    print(f" Full Content 포함: {'✓' if has_full_content else '✗'}")
                    if has_full_content:
                        print(f" Content 길이: {content_length:,} 문자")
                        # Show first 200 chars of content
                        if isinstance(result['full_content'], str):
                            preview = result['full_content'][:200].replace('\n', ' ')
                            print(f" 미리보기: {preview}...")
                        else:
                            print(f" Content 타입: {type(result['full_content'])}")
                            print(f" Content 데이터: {str(result['full_content'])[:200]}...")
        else:
            print(f"✗ 오류: {response.status_code}")
            print(f" 상세: {response.text}")
async def test_aggregate_by_location():
    """Exercise the location-based aggregation endpoint and report
    full-content statistics (count, total size, average size)."""
    async with httpx.AsyncClient(timeout=60.0) as client:
        print("\n" + "="*60)
        print("지역 기반 뉴스 통합 Full Content 테스트")
        print("="*60)
        print("\n2. 지역 'Seoul'로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)
        response = await client.get(
            f"{SERVICE_URL}/api/aggregate/location",
            params={
                "location": "Seoul",
                "limit": 2,
                "google_results_per_title": 2,
                "lang": "ko",
                "country": "KR"
            }
        )
        if response.status_code == 200:
            data = response.json()
            print(f"✓ 지역: {data['keyword']}")
            print(f"✓ RSS 피드 URL: {data['rss_feed_url']}")
            print(f"✓ 처리된 항목 수: {data['processed_entries']}")
            # Check full content availability
            full_content_count = 0
            total_content_size = 0
            for item in data['news_items']:
                for result in item['google_results']:
                    if 'full_content' in result:
                        full_content_count += 1
                        content = result['full_content']
                        # full_content may be a non-str payload; measure its
                        # stringified length in that case.
                        if isinstance(content, str):
                            total_content_size += len(content)
                        else:
                            total_content_size += len(str(content))
            print(f"\n📊 Full Content 통계:")
            print(f" - Full Content 포함 결과: {full_content_count}")
            print(f" - 전체 Content 크기: {total_content_size:,} 문자")
            # max(..., 1) guards against division by zero when no results
            print(f" - 평균 Content 크기: {total_content_size//max(full_content_count, 1):,} 문자")
        else:
            print(f"✗ 오류: {response.status_code}")
async def test_aggregate_by_topic():
    """Exercise the topic-based aggregation endpoint and analyse whether the
    collected content is substantial enough for AI summarisation."""
    async with httpx.AsyncClient(timeout=60.0) as client:
        print("\n" + "="*60)
        print("주제별 뉴스 통합 Full Content 테스트")
        print("="*60)
        print("\n3. 주제 'TECHNOLOGY'로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)
        response = await client.get(
            f"{SERVICE_URL}/api/aggregate/topic",
            params={
                "category": "TECHNOLOGY",
                "limit": 2,
                "google_results_per_title": 3,
                "lang": "ko",
                "country": "KR"
            }
        )
        if response.status_code == 200:
            data = response.json()
            print(f"✓ 주제: {data['keyword']}")
            print(f"✓ 처리 시간: {data['processing_time']:.2f}")
            # Analyze content quality for AI summarization
            print("\n📝 AI 요약을 위한 Content 품질 분석:")
            for i, item in enumerate(data['news_items'], 1):
                print(f"\n 뉴스 항목 {i}:")
                for j, result in enumerate(item['google_results'], 1):
                    if 'full_content' in result:
                        content = result['full_content']
                        # Normalise non-str payloads once instead of
                        # duplicating the three checks in both branches.
                        if not isinstance(content, str):
                            content = str(content)
                        # Quality indicators for summarisation.
                        has_paragraphs = '\n\n' in content or '</p>' in content
                        has_sufficient_length = len(content) > 500
                        # Hangul syllables occupy U+AC00..U+D7A3.
                        has_korean = any(ord(char) >= 0xAC00 and ord(char) <= 0xD7A3 for char in content[:min(100, len(content))])
                        print(f" 결과 {j} 품질 체크:")
                        # Fix: all four ternaries printed '' for both branches
                        # (check-mark characters were lost); restore ✓/✗.
                        print(f" - 충분한 길이 (>500자): {'✓' if has_sufficient_length else '✗'}")
                        print(f" - 단락 구조 포함: {'✓' if has_paragraphs else '✗'}")
                        print(f" - 한국어 콘텐츠: {'✓' if has_korean else '✗'}")
                        print(f" - AI 요약 가능: {'✓' if (has_sufficient_length and has_paragraphs) else '✗'}")
        else:
            print(f"✗ 오류: {response.status_code}")
async def test_health_check():
    """Verify the aggregator's /health endpoint and its downstream services."""
    async with httpx.AsyncClient() as client:
        divider = "="*60
        print("\n" + divider)
        print("서비스 Health Check")
        print(divider)
        response = await client.get(f"{SERVICE_URL}/health")
        # Guard clause: report the failure and bail out early.
        if response.status_code != 200:
            print(f"✗ Health check 실패: {response.status_code}")
            return
        data = response.json()
        services = data['services']
        print(f"✓ 통합 서비스 상태: {data['status']}")
        print(f" - RSS 서비스: {services['rss_feed']}")
        print(f" - Google 검색 서비스: {services['google_search']}")
async def main():
    """Run the whole integration-test suite, health check first."""
    banner = "="*70
    print("\n" + banner)
    print(" News Aggregator Full Content Integration Test ")
    print(" RSS 피드 + Google Full Content 통합 테스트 ")
    print(banner)
    # Run each test coroutine sequentially, in the original order.
    for test_coro in (test_health_check,
                      test_aggregate_with_full_content,
                      test_aggregate_by_location,
                      test_aggregate_by_topic):
        await test_coro()
    print("\n" + banner)
    print(" 테스트 완료 - Full Content 통합 확인 ")
    print(banner)
    print("\n✅ 모든 테스트가 완료되었습니다.")
    print(" RSS 피드 제목을 구글 full content로 검색하는 기능이 정상 작동합니다.")
    print(" AI 요약을 위한 충분한 콘텐츠가 수집되고 있습니다.")
if __name__ == "__main__":
asyncio.run(main())