diff --git a/backup-services/ai-writer/backend/Dockerfile b/backup-services/ai-writer/backend/Dockerfile new file mode 100644 index 0000000..a296111 --- /dev/null +++ b/backup-services/ai-writer/backend/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . + +# Run the application +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file diff --git a/services/google-search/backend/app/__init__.py b/backup-services/ai-writer/backend/app/__init__.py similarity index 100% rename from services/google-search/backend/app/__init__.py rename to backup-services/ai-writer/backend/app/__init__.py diff --git a/backup-services/ai-writer/backend/app/article_generator.py b/backup-services/ai-writer/backend/app/article_generator.py new file mode 100644 index 0000000..2712cf0 --- /dev/null +++ b/backup-services/ai-writer/backend/app/article_generator.py @@ -0,0 +1,218 @@ +""" +Article Generation Module +Claude API를 사용한 기사 생성 로직 +""" +from typing import Dict, Any, List, Optional +from datetime import datetime +import json +import uuid +import logging +from anthropic import AsyncAnthropic +from pydantic import BaseModel, Field + +logger = logging.getLogger(__name__) + +# Data Models +class NewsSource(BaseModel): + """뉴스 소스 정보""" + title: str + url: str + published_date: Optional[str] = None + source_site: str = "Unknown" + +class EventInfo(BaseModel): + """이벤트 정보""" + name: str + date: Optional[str] = None + location: Optional[str] = None + +class Entities(BaseModel): + """추출된 엔티티""" + people: List[str] = Field(default_factory=list) + organizations: List[str] = Field(default_factory=list) + groups: List[str] = Field(default_factory=list) + countries: List[str] = Field(default_factory=list) + events: List[EventInfo] = Field(default_factory=list) + keywords: List[str] = 
Field(default_factory=list) + +class SubTopic(BaseModel): + """기사 소주제""" + title: str + content: List[str] + +class GeneratedArticle(BaseModel): + """생성된 기사""" + news_id: str = Field(default_factory=lambda: str(uuid.uuid4())) + title: str + summary: str + subtopics: List[SubTopic] + categories: List[str] + entities: Entities + sources: List[NewsSource] = Field(default_factory=list) + created_at: datetime = Field(default_factory=datetime.now) + generation_metadata: Dict[str, Any] = Field(default_factory=dict) + +async def generate_article_with_claude( + news_data: Dict[str, Any], + style: str = "professional", + claude_api_key: str = None +) -> GeneratedArticle: + """Claude API를 사용하여 기사 생성""" + + if not claude_api_key: + import os + claude_api_key = os.getenv("CLAUDE_API_KEY") + + # Initialize Claude client + claude_client = AsyncAnthropic(api_key=claude_api_key) + + # Collect source information + sources_info = [] + + # Prepare the prompt + system_prompt = """당신은 전문적인 한국 언론사의 수석 기자입니다. + 제공된 데이터를 기반으로 깊이 있고 통찰력 있는 기사를 작성해야 합니다. + 기사는 다음 요구사항을 충족해야 합니다: + + 1. 소주제는 최소 2개, 최대 6개로 구성해야 합니다 + 2. 각 소주제는 최소 1개, 최대 10개의 문단으로 구성해야 합니다 + 3. 전문적이고 객관적인 어조를 유지해야 합니다 + 4. 사실에 기반한 분석과 통찰을 제공해야 합니다 + 5. 한국 독자를 대상으로 작성되어야 합니다 + 6. 이벤트 정보는 가능한 일시와 장소를 포함해야 합니다 + 7. 핵심 키워드를 최대 10개까지 추출해야 합니다 + + 반드시 다음 JSON 형식으로 응답하세요: + { + "title": "기사 제목", + "summary": "한 줄 요약 (100자 이내)", + "subtopics": [ + { + "title": "소주제 제목", + "content": ["문단1", "문단2", ...] // 1-10개 문단 + } + ], // 2-6개 소주제 + "categories": ["카테고리1", "카테고리2"], + "entities": { + "people": ["인물1", "인물2"], + "organizations": ["기관1", "기관2"], + "groups": ["단체1", "단체2"], + "countries": ["나라1", "나라2"], + "events": [ + { + "name": "이벤트명", + "date": "2025년 1월 15일", // 선택사항 + "location": "서울 코엑스" // 선택사항 + } + ], + "keywords": ["키워드1", "키워드2", ...] 
// 최대 10개 + } + }""" + + # Prepare news content for Claude and collect sources + news_content = [] + for item in news_data.get("news_items", []): + # Add RSS source info + rss_title = item.get('rss_title', '') + rss_link = item.get('rss_link', '') + rss_published = item.get('rss_published', '') + + if rss_title and rss_link: + sources_info.append(NewsSource( + title=rss_title, + url=rss_link, + published_date=rss_published, + source_site="RSS Feed" + )) + + item_text = f"제목: {rss_title}\n" + for result in item.get("google_results", []): + # Add Google search result sources + if "title" in result and "link" in result: + sources_info.append(NewsSource( + title=result.get('title', ''), + url=result.get('link', ''), + published_date=None, + source_site="Google Search" + )) + + if "full_content" in result and result["full_content"]: + content = result["full_content"] + if isinstance(content, dict): + item_text += f"출처: {content.get('url', '')}\n" + item_text += f"내용: {content.get('content', '')[:1000]}...\n\n" + else: + item_text += f"내용: {str(content)[:1000]}...\n\n" + news_content.append(item_text) + + combined_content = "\n".join(news_content[:10]) # Limit to prevent token overflow + + user_prompt = f"""다음 뉴스 데이터를 기반으로 종합적인 기사를 작성하세요: + +키워드: {news_data.get('keyword', '')} +수집된 뉴스 수: {len(news_data.get('news_items', []))} + +뉴스 내용: +{combined_content} + +스타일: {style} +- professional: 전통적인 뉴스 기사 스타일 +- analytical: 분석적이고 심층적인 스타일 +- investigative: 탐사보도 스타일 + +위의 데이터를 종합하여 통찰력 있는 기사를 JSON 형식으로 작성해주세요.""" + + try: + # Call Claude API + response = await claude_client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=4000, + temperature=0.7, + system=system_prompt, + messages=[ + {"role": "user", "content": user_prompt} + ] + ) + + # Parse response + content = response.content[0].text + + # Extract JSON from response + json_start = content.find('{') + json_end = content.rfind('}') + 1 + if json_start != -1 and json_end > json_start: + json_str = 
content[json_start:json_end] + article_data = json.loads(json_str) + else: + raise ValueError("No valid JSON found in response") + + # Create article object + article = GeneratedArticle( + title=article_data.get("title", ""), + summary=article_data.get("summary", ""), + subtopics=[ + SubTopic( + title=st.get("title", ""), + content=st.get("content", []) + ) for st in article_data.get("subtopics", []) + ], + categories=article_data.get("categories", []), + entities=Entities(**article_data.get("entities", {})), + sources=sources_info, + generation_metadata={ + "style": style, + "keyword": news_data.get('keyword', ''), + "model": "claude-3-5-sonnet-20241022", + "timestamp": datetime.now().isoformat() + } + ) + + logger.info(f"Successfully generated article: {article.title}") + return article + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse Claude response as JSON: {e}") + raise + except Exception as e: + logger.error(f"Error generating article with Claude: {e}") + raise \ No newline at end of file diff --git a/backup-services/ai-writer/backend/app/main.py b/backup-services/ai-writer/backend/app/main.py new file mode 100644 index 0000000..1d5751a --- /dev/null +++ b/backup-services/ai-writer/backend/app/main.py @@ -0,0 +1,746 @@ +""" +AI Writer Service +Claude API를 사용한 전문적인 뉴스 기사 생성 서비스 +""" +from fastapi import FastAPI, HTTPException, BackgroundTasks +from fastapi.middleware.cors import CORSMiddleware +from typing import List, Dict, Any, Optional +from datetime import datetime +from pydantic import BaseModel, Field +import httpx +import asyncio +import logging +import json +import uuid +from anthropic import AsyncAnthropic +import os + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +app = FastAPI( + title="AI Writer Service", + description="Claude API를 사용한 전문적인 뉴스 기사 생성 서비스", + version="1.0.0" +) + +# CORS 설정 +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + 
allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Configuration +NEWS_AGGREGATOR_URL = os.getenv("NEWS_AGGREGATOR_URL", "http://news-aggregator-backend:8000") +CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY", "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") +MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://mongodb:27017") +DB_NAME = os.getenv("DB_NAME", "ai_writer_db") + +# Claude client +claude_client = AsyncAnthropic(api_key=CLAUDE_API_KEY) + +# HTTP Client +http_client = httpx.AsyncClient(timeout=120.0) + +# Queue Manager +from app.queue_manager import RedisQueueManager +from app.queue_models import NewsJobData, JobResult, JobStatus, QueueStats +queue_manager = RedisQueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") +) + +# MongoDB client (optional for storing generated articles) +from motor.motor_asyncio import AsyncIOMotorClient +mongo_client = None +db = None + +# Data Models +class NewsSource(BaseModel): + """참고한 뉴스 소스 정보""" + title: str = Field(..., description="뉴스 제목") + url: str = Field(..., description="뉴스 URL") + published_date: Optional[str] = Field(None, description="발행일") + source_site: Optional[str] = Field(None, description="출처 사이트") +class SubTopic(BaseModel): + """기사 소주제""" + title: str = Field(..., description="소주제 제목") + content: List[str] = Field(..., description="소주제 내용 (문단 리스트)", min_items=1, max_items=10) + +class Event(BaseModel): + """이벤트 정보""" + name: str = Field(..., description="이벤트명") + date: Optional[str] = Field(None, description="일시") + location: Optional[str] = Field(None, description="장소") + +class NewsEntities(BaseModel): + """뉴스에 포함된 개체들""" + people: List[str] = Field(default_factory=list, description="뉴스에 포함된 인물") + organizations: List[str] = Field(default_factory=list, description="뉴스에 포함된 기관") + groups: List[str] = Field(default_factory=list, description="뉴스에 포함된 단체") + countries: List[str] = 
Field(default_factory=list, description="뉴스에 포함된 나라") + events: List[Event] = Field(default_factory=list, description="뉴스에 포함된 일정/이벤트 (일시와 장소 포함)") + keywords: List[str] = Field(default_factory=list, description="핵심 키워드 (최대 10개)", max_items=10) + +class GeneratedArticle(BaseModel): + """생성된 기사""" + news_id: str = Field(..., description="뉴스 아이디") + title: str = Field(..., description="뉴스 제목") + created_at: str = Field(..., description="생성년월일시분초") + summary: str = Field(..., description="한 줄 요약") + subtopics: List[SubTopic] = Field(..., description="소주제 리스트", min_items=2, max_items=6) + categories: List[str] = Field(..., description="카테고리 리스트") + entities: NewsEntities = Field(..., description="뉴스에 포함된 개체들") + source_keyword: Optional[str] = Field(None, description="원본 검색 키워드") + source_count: Optional[int] = Field(None, description="참조한 소스 수") + sources: List[NewsSource] = Field(default_factory=list, description="참고한 뉴스 소스 목록") + +class ArticleGenerationRequest(BaseModel): + """기사 생성 요청""" + keyword: str = Field(..., description="검색 키워드") + limit: int = Field(5, description="처리할 RSS 항목 수", ge=1, le=20) + google_results_per_title: int = Field(3, description="각 제목당 구글 검색 결과 수", ge=1, le=10) + lang: str = Field("ko", description="언어 코드") + country: str = Field("KR", description="국가 코드") + style: str = Field("professional", description="기사 스타일 (professional/analytical/investigative)") + +class PerItemGenerationRequest(BaseModel): + """개별 아이템별 기사 생성 요청""" + keyword: str = Field(..., description="검색 키워드") + limit: Optional[int] = Field(None, description="처리할 RSS 항목 수 (None이면 전체)") + google_results_per_title: int = Field(3, description="각 제목당 구글 검색 결과 수", ge=1, le=10) + lang: str = Field("ko", description="언어 코드") + country: str = Field("KR", description="국가 코드") + style: str = Field("professional", description="기사 스타일 (professional/analytical/investigative)") + skip_existing: bool = Field(True, description="이미 생성된 기사는 건너뛰기") + +@app.on_event("startup") +async def 
startup(): + """서비스 시작""" + global mongo_client, db + try: + mongo_client = AsyncIOMotorClient(MONGODB_URL) + db = mongo_client[DB_NAME] + logger.info("AI Writer Service starting...") + logger.info(f"Connected to MongoDB: {MONGODB_URL}") + + # Redis 큐 연결 + await queue_manager.connect() + logger.info("Connected to Redis queue") + except Exception as e: + logger.error(f"Failed to connect to services: {e}") + +@app.on_event("shutdown") +async def shutdown(): + """서비스 종료""" + await http_client.aclose() + if mongo_client: + mongo_client.close() + await queue_manager.disconnect() + logger.info("AI Writer Service stopped") + +@app.get("/") +async def root(): + return { + "service": "AI Writer Service", + "version": "1.0.0", + "description": "Claude API를 사용한 전문적인 뉴스 기사 생성 서비스", + "endpoints": { + "generate_article": "POST /api/generate", + "generate_per_item": "POST /api/generate/per-item", + "generate_from_aggregated": "POST /api/generate/from-aggregated", + "get_article": "GET /api/articles/{article_id}", + "list_articles": "GET /api/articles", + "health": "GET /health" + } + } + +@app.get("/health") +async def health_check(): + """헬스 체크""" + try: + # Check News Aggregator service + aggregator_response = await http_client.get(f"{NEWS_AGGREGATOR_URL}/health") + aggregator_healthy = aggregator_response.status_code == 200 + + # Check MongoDB + mongo_healthy = False + if db is not None: + await db.command("ping") + mongo_healthy = True + + return { + "status": "healthy" if (aggregator_healthy and mongo_healthy) else "degraded", + "services": { + "news_aggregator": "healthy" if aggregator_healthy else "unhealthy", + "mongodb": "healthy" if mongo_healthy else "unhealthy", + "claude_api": "configured" + }, + "timestamp": datetime.now().isoformat() + } + except Exception as e: + return { + "status": "unhealthy", + "error": str(e), + "timestamp": datetime.now().isoformat() + } + +async def generate_article_with_claude(news_data: Dict[str, Any], style: str = "professional") -> 
GeneratedArticle: + """Claude API를 사용하여 기사 생성""" + + # Collect source information + sources_info = [] + + # Prepare the prompt + system_prompt = """당신은 전문적인 한국 언론사의 수석 기자입니다. + 제공된 데이터를 기반으로 깊이 있고 통찰력 있는 기사를 작성해야 합니다. + 기사는 다음 요구사항을 충족해야 합니다: + + 1. 소주제는 최소 2개, 최대 6개로 구성해야 합니다 + 2. 각 소주제는 최소 1개, 최대 10개의 문단으로 구성해야 합니다 + 3. 전문적이고 객관적인 어조를 유지해야 합니다 + 4. 사실에 기반한 분석과 통찰을 제공해야 합니다 + 5. 한국 독자를 대상으로 작성되어야 합니다 + 6. 이벤트 정보는 가능한 일시와 장소를 포함해야 합니다 + 7. 핵심 키워드를 최대 10개까지 추출해야 합니다 + + 반드시 다음 JSON 형식으로 응답하세요: + { + "title": "기사 제목", + "summary": "한 줄 요약 (100자 이내)", + "subtopics": [ + { + "title": "소주제 제목", + "content": ["문단1", "문단2", ...] // 1-10개 문단 + } + ], // 2-6개 소주제 + "categories": ["카테고리1", "카테고리2"], + "entities": { + "people": ["인물1", "인물2"], + "organizations": ["기관1", "기관2"], + "groups": ["단체1", "단체2"], + "countries": ["나라1", "나라2"], + "events": [ + { + "name": "이벤트명", + "date": "2025년 1월 15일", // 선택사항 + "location": "서울 코엑스" // 선택사항 + } + ], + "keywords": ["키워드1", "키워드2", ...] // 최대 10개 + } + }""" + + # Prepare news content for Claude and collect sources + news_content = [] + for item in news_data.get("news_items", []): + # Add RSS source info + rss_title = item.get('rss_title', '') + rss_link = item.get('rss_link', '') + rss_published = item.get('rss_published', '') + + if rss_title and rss_link: + sources_info.append(NewsSource( + title=rss_title, + url=rss_link, + published_date=rss_published, + source_site="RSS Feed" + )) + + item_text = f"제목: {rss_title}\n" + for result in item.get("google_results", []): + # Add Google search result sources + if "title" in result and "link" in result: + sources_info.append(NewsSource( + title=result.get('title', ''), + url=result.get('link', ''), + published_date=None, + source_site="Google Search" + )) + + if "full_content" in result and result["full_content"]: + content = result["full_content"] + if isinstance(content, dict): + item_text += f"출처: {content.get('url', '')}\n" + item_text += f"내용: {content.get('content', 
'')[:1000]}...\n\n" + else: + item_text += f"내용: {str(content)[:1000]}...\n\n" + news_content.append(item_text) + + combined_content = "\n".join(news_content[:10]) # Limit to prevent token overflow + + user_prompt = f"""다음 뉴스 데이터를 기반으로 종합적인 기사를 작성하세요: + +키워드: {news_data.get('keyword', '')} +수집된 뉴스 수: {len(news_data.get('news_items', []))} + +뉴스 내용: +{combined_content} + +스타일: {style} +- professional: 전통적인 뉴스 기사 스타일 +- analytical: 분석적이고 심층적인 스타일 +- investigative: 탐사보도 스타일 + +위의 데이터를 종합하여 통찰력 있는 기사를 JSON 형식으로 작성해주세요.""" + + try: + # Call Claude API + response = await claude_client.messages.create( + model="claude-3-5-sonnet-20241022", # Latest Claude model + max_tokens=4000, + temperature=0.7, + system=system_prompt, + messages=[ + {"role": "user", "content": user_prompt} + ] + ) + + # Parse Claude's response + content = response.content[0].text + + # Extract JSON from response + import re + json_match = re.search(r'\{.*\}', content, re.DOTALL) + if json_match: + article_data = json.loads(json_match.group()) + else: + # If no JSON found, try to parse the entire content + article_data = json.loads(content) + + # Create GeneratedArticle object + entities_data = article_data.get("entities", {}) + events_data = entities_data.get("events", []) + + # Parse events - handle both old string format and new object format + parsed_events = [] + for event in events_data: + if isinstance(event, str): + # Old format: just event name as string + parsed_events.append(Event(name=event)) + elif isinstance(event, dict): + # New format: event object with name, date, location + parsed_events.append(Event( + name=event.get("name", ""), + date=event.get("date"), + location=event.get("location") + )) + + article = GeneratedArticle( + news_id=str(uuid.uuid4()), + title=article_data.get("title", "제목 없음"), + created_at=datetime.now().isoformat(), + summary=article_data.get("summary", ""), + subtopics=[ + SubTopic( + title=st.get("title", ""), + content=st.get("content", []) + ) for st in 
article_data.get("subtopics", []) + ], + categories=article_data.get("categories", []), + entities=NewsEntities( + people=entities_data.get("people", []), + organizations=entities_data.get("organizations", []), + groups=entities_data.get("groups", []), + countries=entities_data.get("countries", []), + events=parsed_events, + keywords=entities_data.get("keywords", []) + ), + source_keyword=news_data.get("keyword"), + source_count=len(news_data.get("news_items", [])), + sources=sources_info + ) + + return article + + except Exception as e: + logger.error(f"Error generating article with Claude: {e}") + raise HTTPException(status_code=500, detail=f"Failed to generate article: {str(e)}") + +@app.post("/api/generate") +async def generate_article(request: ArticleGenerationRequest): + """ + 뉴스 수집부터 기사 생성까지 전체 파이프라인 실행 + RSS → Google Search → AI 기사 생성 + 단일 종합 기사 생성 (기존 방식) + """ + try: + # Step 1: Get aggregated news from News Aggregator service + logger.info(f"Fetching aggregated news for keyword: {request.keyword}") + + aggregator_response = await http_client.get( + f"{NEWS_AGGREGATOR_URL}/api/aggregate", + params={ + "q": request.keyword, + "limit": request.limit, + "google_results_per_title": request.google_results_per_title, + "lang": request.lang, + "country": request.country + } + ) + aggregator_response.raise_for_status() + news_data = aggregator_response.json() + + if not news_data.get("news_items"): + raise HTTPException(status_code=404, detail="No news items found for the given keyword") + + # Step 2: Generate article using Claude + logger.info(f"Generating article with Claude for {len(news_data['news_items'])} news items") + article = await generate_article_with_claude(news_data, request.style) + + # Step 3: Store article in MongoDB (optional) + if db is not None: + try: + article_dict = article.dict() + await db.articles.insert_one(article_dict) + logger.info(f"Article saved with ID: {article.news_id}") + except Exception as e: + logger.error(f"Failed to save 
article to MongoDB: {e}") + + return article + + except httpx.HTTPStatusError as e: + logger.error(f"HTTP error from aggregator service: {e}") + raise HTTPException(status_code=e.response.status_code, detail=str(e)) + except Exception as e: + logger.error(f"Error in generate_article: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.post("/api/generate/from-aggregated", response_model=GeneratedArticle) +async def generate_from_aggregated_data(news_data: Dict[str, Any], style: str = "professional"): + """ + 이미 수집된 뉴스 데이터로부터 직접 기사 생성 + (News Aggregator 결과를 직접 입력받아 처리) + """ + try: + if not news_data.get("news_items"): + raise HTTPException(status_code=400, detail="No news items in provided data") + + # Generate article using Claude + logger.info(f"Generating article from {len(news_data['news_items'])} news items") + article = await generate_article_with_claude(news_data, style) + + # Store article in MongoDB + if db is not None: + try: + article_dict = article.dict() + await db.articles.insert_one(article_dict) + logger.info(f"Article saved with ID: {article.news_id}") + except Exception as e: + logger.error(f"Failed to save article to MongoDB: {e}") + + return article + + except Exception as e: + logger.error(f"Error in generate_from_aggregated_data: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/articles/{article_id}", response_model=GeneratedArticle) +async def get_article(article_id: str): + """저장된 기사 조회""" + if db is None: + raise HTTPException(status_code=503, detail="Database not available") + + article = await db.articles.find_one({"news_id": article_id}) + if not article: + raise HTTPException(status_code=404, detail="Article not found") + + # Convert MongoDB document to GeneratedArticle + article.pop("_id", None) + return GeneratedArticle(**article) + +@app.get("/api/articles") +async def list_articles( + skip: int = 0, + limit: int = 10, + keyword: Optional[str] = None, + category: Optional[str] = None +): + 
"""저장된 기사 목록 조회""" + if db is None: + raise HTTPException(status_code=503, detail="Database not available") + + query = {} + if keyword: + query["source_keyword"] = {"$regex": keyword, "$options": "i"} + if category: + query["categories"] = category + + cursor = db.articles.find(query).skip(skip).limit(limit).sort("created_at", -1) + articles = [] + async for article in cursor: + article.pop("_id", None) + articles.append(article) + + total = await db.articles.count_documents(query) + + return { + "articles": articles, + "total": total, + "skip": skip, + "limit": limit + } + +@app.post("/api/generate/batch") +async def generate_batch_articles(keywords: List[str], style: str = "professional"): + """여러 키워드에 대한 기사 일괄 생성""" + results = [] + errors = [] + + for keyword in keywords[:5]: # Limit to 5 keywords to prevent overload + try: + request = ArticleGenerationRequest( + keyword=keyword, + style=style + ) + article = await generate_article(request) + results.append({ + "keyword": keyword, + "status": "success", + "article_id": article.news_id, + "title": article.title + }) + except Exception as e: + errors.append({ + "keyword": keyword, + "status": "error", + "error": str(e) + }) + + return { + "success": results, + "errors": errors, + "total_processed": len(results) + len(errors) + } + +@app.post("/api/generate/per-item") +async def generate_articles_per_rss_item(request: PerItemGenerationRequest): + """ + RSS 피드의 각 아이템별로 개별 기사 생성 + 각 RSS 아이템이 독립적인 기사가 됨 + 중복 생성 방지 기능 포함 + """ + try: + # Step 1: Get aggregated news from News Aggregator service + logger.info(f"Fetching aggregated news for keyword: {request.keyword}") + + # limit이 None이면 모든 항목 처리 (최대 100개로 제한) + actual_limit = request.limit if request.limit is not None else 100 + + aggregator_response = await http_client.get( + f"{NEWS_AGGREGATOR_URL}/api/aggregate", + params={ + "q": request.keyword, + "limit": actual_limit, + "google_results_per_title": request.google_results_per_title, + "lang": request.lang, + 
"country": request.country + } + ) + aggregator_response.raise_for_status() + news_data = aggregator_response.json() + + if not news_data.get("news_items"): + raise HTTPException(status_code=404, detail="No news items found for the given keyword") + + # Step 2: Check for existing articles if skip_existing is True + existing_titles = set() + skipped_count = 0 + + if request.skip_existing and db is not None: + # RSS 제목으로 중복 체크 (최근 24시간 내) + from datetime import datetime, timedelta + cutoff_time = (datetime.now() - timedelta(hours=24)).isoformat() + + existing_cursor = db.articles.find( + { + "source_keyword": request.keyword, + "created_at": {"$gte": cutoff_time} + }, + {"sources": 1} + ) + + async for doc in existing_cursor: + for source in doc.get("sources", []): + if source.get("source_site") == "RSS Feed": + existing_titles.add(source.get("title", "")) + + # Step 3: Generate individual article for each RSS item + generated_articles = [] + + for item in news_data["news_items"]: + try: + rss_title = item.get('rss_title', '') + + # Skip if already exists + if request.skip_existing and rss_title in existing_titles: + logger.info(f"Skipping already generated article: {rss_title}") + skipped_count += 1 + continue + + logger.info(f"Generating article for RSS item: {rss_title or 'Unknown'}") + + # Create individual news_data for this item + individual_news_data = { + "keyword": news_data.get("keyword"), + "news_items": [item] # Single item only + } + + # Generate article for this single item + article = await generate_article_with_claude(individual_news_data, request.style) + + # Store in MongoDB + if db is not None: + try: + article_dict = article.dict() + await db.articles.insert_one(article_dict) + logger.info(f"Article saved with ID: {article.news_id}") + except Exception as e: + logger.error(f"Failed to save article to MongoDB: {e}") + + generated_articles.append(article) + + except Exception as e: + logger.error(f"Failed to generate article for item: {e}") + # 
Continue with next item even if one fails + continue + + if not generated_articles and skipped_count == 0: + raise HTTPException(status_code=500, detail="Failed to generate any articles") + + # Return all generated articles + return { + "total_generated": len(generated_articles), + "total_items": len(news_data["news_items"]), + "skipped_duplicates": skipped_count, + "articles": generated_articles + } + + except httpx.HTTPStatusError as e: + logger.error(f"HTTP error from aggregator service: {e}") + raise HTTPException(status_code=e.response.status_code, detail=str(e)) + except Exception as e: + logger.error(f"Error in generate_articles_per_rss_item: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +# Queue Management Endpoints + +@app.post("/api/queue/enqueue") +async def enqueue_items(request: PerItemGenerationRequest): + """ + RSS 아이템들을 큐에 추가 (비동기 처리) + Consumer 워커가 백그라운드에서 처리 + """ + try: + # Step 1: Get aggregated news from News Aggregator service + logger.info(f"Fetching aggregated news for enqueue: {request.keyword}") + + actual_limit = request.limit if request.limit is not None else 100 + + aggregator_response = await http_client.get( + f"{NEWS_AGGREGATOR_URL}/api/aggregate", + params={ + "q": request.keyword, + "limit": actual_limit, + "google_results_per_title": request.google_results_per_title, + "lang": request.lang, + "country": request.country + } + ) + aggregator_response.raise_for_status() + news_data = aggregator_response.json() + + if not news_data.get("news_items"): + raise HTTPException(status_code=404, detail="No news items found for the given keyword") + + # Step 2: Check for existing articles if skip_existing is True + existing_titles = set() + skipped_count = 0 + + if request.skip_existing and db is not None: + from datetime import datetime, timedelta + cutoff_time = (datetime.now() - timedelta(hours=24)).isoformat() + + existing_cursor = db.articles.find( + { + "source_keyword": request.keyword, + "created_at": {"$gte": 
cutoff_time} + }, + {"sources": 1} + ) + + async for doc in existing_cursor: + for source in doc.get("sources", []): + if source.get("source_site") == "RSS Feed": + existing_titles.add(source.get("title", "")) + + # Step 3: Enqueue items for processing + enqueued_jobs = [] + + for item in news_data["news_items"]: + rss_title = item.get('rss_title', '') + + # Skip if already exists + if request.skip_existing and rss_title in existing_titles: + logger.info(f"Skipping already generated article: {rss_title}") + skipped_count += 1 + continue + + # Create job data + job_data = NewsJobData( + job_id=str(uuid.uuid4()), + keyword=request.keyword, + rss_title=rss_title, + rss_link=item.get('rss_link'), + rss_published=item.get('rss_published'), + google_results=item.get('google_results', []), + style=request.style, + created_at=datetime.now() + ) + + # Enqueue job + job_id = await queue_manager.enqueue(job_data) + enqueued_jobs.append({ + "job_id": job_id, + "title": rss_title[:100] + }) + + logger.info(f"Enqueued job {job_id} for: {rss_title}") + + return { + "total_enqueued": len(enqueued_jobs), + "total_items": len(news_data["news_items"]), + "skipped_duplicates": skipped_count, + "jobs": enqueued_jobs, + "message": f"{len(enqueued_jobs)} jobs added to queue for processing" + } + + except httpx.HTTPStatusError as e: + logger.error(f"HTTP error from aggregator service: {e}") + raise HTTPException(status_code=e.response.status_code, detail=str(e)) + except Exception as e: + logger.error(f"Error in enqueue_items: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/queue/stats", response_model=QueueStats) +async def get_queue_stats(): + """큐 상태 및 통계 조회""" + try: + stats = await queue_manager.get_stats() + return stats + except Exception as e: + logger.error(f"Error getting queue stats: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.delete("/api/queue/clear") +async def clear_queue(): + """큐 초기화 (관리자용)""" + try: + await 
queue_manager.clear_queue() + return {"message": "Queue cleared successfully"} + except Exception as e: + logger.error(f"Error clearing queue: {e}") + raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file diff --git a/backup-services/ai-writer/backend/app/queue_manager.py b/backup-services/ai-writer/backend/app/queue_manager.py new file mode 100644 index 0000000..2e0695a --- /dev/null +++ b/backup-services/ai-writer/backend/app/queue_manager.py @@ -0,0 +1,250 @@ +""" +Redis Queue Manager for AI Writer Service +Redis를 사용한 작업 큐 관리 +""" +import redis.asyncio as redis +import json +import uuid +from typing import Optional, List, Dict, Any +from datetime import datetime, timedelta +import logging +from queue_models import NewsJobData, JobResult, JobStatus, QueueStats + +logger = logging.getLogger(__name__) + +class RedisQueueManager: + """Redis 기반 작업 큐 매니저""" + + def __init__(self, redis_url: str = "redis://redis:6379"): + self.redis_url = redis_url + self.redis_client: Optional[redis.Redis] = None + + # Redis 키 정의 + self.QUEUE_KEY = "ai_writer:queue:pending" + self.PROCESSING_KEY = "ai_writer:queue:processing" + self.COMPLETED_KEY = "ai_writer:queue:completed" + self.FAILED_KEY = "ai_writer:queue:failed" + self.STATS_KEY = "ai_writer:stats" + self.WORKERS_KEY = "ai_writer:workers" + self.LOCK_PREFIX = "ai_writer:lock:" + + async def connect(self): + """Redis 연결""" + if not self.redis_client: + self.redis_client = await redis.from_url( + self.redis_url, + encoding="utf-8", + decode_responses=True + ) + logger.info("Connected to Redis queue") + + async def disconnect(self): + """Redis 연결 해제""" + if self.redis_client: + await self.redis_client.close() + self.redis_client = None + logger.info("Disconnected from Redis queue") + + async def enqueue(self, job_data: NewsJobData) -> str: + """작업을 큐에 추가""" + try: + if not job_data.job_id: + job_data.job_id = str(uuid.uuid4()) + + # JSON으로 직렬화 + job_json = job_data.json() + + # 우선순위에 따라 큐에 추가 + if 
job_data.priority > 0: + # 높은 우선순위는 앞쪽에 + await self.redis_client.lpush(self.QUEUE_KEY, job_json) + else: + # 일반 우선순위는 뒤쪽에 + await self.redis_client.rpush(self.QUEUE_KEY, job_json) + + # 통계 업데이트 + await self.redis_client.hincrby(self.STATS_KEY, "total_jobs", 1) + await self.redis_client.hincrby(self.STATS_KEY, "pending_jobs", 1) + + logger.info(f"Job {job_data.job_id} enqueued") + return job_data.job_id + + except Exception as e: + logger.error(f"Failed to enqueue job: {e}") + raise + + async def dequeue(self, timeout: int = 0) -> Optional[NewsJobData]: + """큐에서 작업 가져오기 (블로킹 가능)""" + try: + # 대기 중인 작업을 가져와서 처리 중 목록으로 이동 + if timeout > 0: + result = await self.redis_client.blmove( + self.QUEUE_KEY, + self.PROCESSING_KEY, + timeout, + "LEFT", + "RIGHT" + ) + else: + result = await self.redis_client.lmove( + self.QUEUE_KEY, + self.PROCESSING_KEY, + "LEFT", + "RIGHT" + ) + + if result: + # 통계 업데이트 + await self.redis_client.hincrby(self.STATS_KEY, "pending_jobs", -1) + await self.redis_client.hincrby(self.STATS_KEY, "processing_jobs", 1) + + return NewsJobData.parse_raw(result) + + return None + + except Exception as e: + logger.error(f"Failed to dequeue job: {e}") + return None + + async def mark_completed(self, job_id: str, article_id: str): + """작업을 완료로 표시""" + try: + # 처리 중 목록에서 작업 찾기 + processing_jobs = await self.redis_client.lrange(self.PROCESSING_KEY, 0, -1) + + for job_json in processing_jobs: + job = NewsJobData.parse_raw(job_json) + if job.job_id == job_id: + # 처리 중 목록에서 제거 + await self.redis_client.lrem(self.PROCESSING_KEY, 1, job_json) + + # 완료 결과 생성 + result = JobResult( + job_id=job_id, + status=JobStatus.COMPLETED, + article_id=article_id, + completed_at=datetime.now() + ) + + # 완료 목록에 추가 (최대 1000개 유지) + await self.redis_client.lpush(self.COMPLETED_KEY, result.json()) + await self.redis_client.ltrim(self.COMPLETED_KEY, 0, 999) + + # 통계 업데이트 + await self.redis_client.hincrby(self.STATS_KEY, "processing_jobs", -1) + await 
self.redis_client.hincrby(self.STATS_KEY, "completed_jobs", 1) + + logger.info(f"Job {job_id} marked as completed") + break + + except Exception as e: + logger.error(f"Failed to mark job as completed: {e}") + + async def mark_failed(self, job_id: str, error_message: str): + """작업을 실패로 표시""" + try: + # 처리 중 목록에서 작업 찾기 + processing_jobs = await self.redis_client.lrange(self.PROCESSING_KEY, 0, -1) + + for job_json in processing_jobs: + job = NewsJobData.parse_raw(job_json) + if job.job_id == job_id: + # 처리 중 목록에서 제거 + await self.redis_client.lrem(self.PROCESSING_KEY, 1, job_json) + + # 재시도 확인 + if job.retry_count < job.max_retries: + job.retry_count += 1 + # 다시 큐에 추가 + await self.redis_client.rpush(self.QUEUE_KEY, job.json()) + await self.redis_client.hincrby(self.STATS_KEY, "pending_jobs", 1) + logger.info(f"Job {job_id} requeued (retry {job.retry_count}/{job.max_retries})") + else: + # 실패 결과 생성 + result = JobResult( + job_id=job_id, + status=JobStatus.FAILED, + error_message=error_message, + completed_at=datetime.now() + ) + + # 실패 목록에 추가 + await self.redis_client.lpush(self.FAILED_KEY, result.json()) + await self.redis_client.ltrim(self.FAILED_KEY, 0, 999) + + # 통계 업데이트 + await self.redis_client.hincrby(self.STATS_KEY, "failed_jobs", 1) + logger.error(f"Job {job_id} marked as failed: {error_message}") + + await self.redis_client.hincrby(self.STATS_KEY, "processing_jobs", -1) + break + + except Exception as e: + logger.error(f"Failed to mark job as failed: {e}") + + async def get_stats(self) -> QueueStats: + """큐 통계 조회""" + try: + stats_data = await self.redis_client.hgetall(self.STATS_KEY) + + # 활성 워커 수 계산 + workers = await self.redis_client.smembers(self.WORKERS_KEY) + active_workers = 0 + for worker_id in workers: + # 워커가 최근 1분 이내에 활동했는지 확인 + last_ping = await self.redis_client.get(f"{self.WORKERS_KEY}:{worker_id}") + if last_ping: + last_ping_time = datetime.fromisoformat(last_ping) + if datetime.now() - last_ping_time < timedelta(minutes=1): + active_workers += 
1 + + return QueueStats( + pending_jobs=int(stats_data.get("pending_jobs", 0)), + processing_jobs=int(stats_data.get("processing_jobs", 0)), + completed_jobs=int(stats_data.get("completed_jobs", 0)), + failed_jobs=int(stats_data.get("failed_jobs", 0)), + total_jobs=int(stats_data.get("total_jobs", 0)), + workers_active=active_workers + ) + + except Exception as e: + logger.error(f"Failed to get stats: {e}") + return QueueStats( + pending_jobs=0, + processing_jobs=0, + completed_jobs=0, + failed_jobs=0, + total_jobs=0, + workers_active=0 + ) + + async def register_worker(self, worker_id: str): + """워커 등록""" + await self.redis_client.sadd(self.WORKERS_KEY, worker_id) + await self.redis_client.set( + f"{self.WORKERS_KEY}:{worker_id}", + datetime.now().isoformat(), + ex=300 # 5분 후 자동 만료 + ) + + async def ping_worker(self, worker_id: str): + """워커 활동 업데이트""" + await self.redis_client.set( + f"{self.WORKERS_KEY}:{worker_id}", + datetime.now().isoformat(), + ex=300 + ) + + async def unregister_worker(self, worker_id: str): + """워커 등록 해제""" + await self.redis_client.srem(self.WORKERS_KEY, worker_id) + await self.redis_client.delete(f"{self.WORKERS_KEY}:{worker_id}") + + async def clear_queue(self): + """큐 초기화 (테스트용)""" + await self.redis_client.delete(self.QUEUE_KEY) + await self.redis_client.delete(self.PROCESSING_KEY) + await self.redis_client.delete(self.COMPLETED_KEY) + await self.redis_client.delete(self.FAILED_KEY) + await self.redis_client.delete(self.STATS_KEY) + logger.info("Queue cleared") \ No newline at end of file diff --git a/backup-services/ai-writer/backend/app/queue_models.py b/backup-services/ai-writer/backend/app/queue_models.py new file mode 100644 index 0000000..6cb9402 --- /dev/null +++ b/backup-services/ai-writer/backend/app/queue_models.py @@ -0,0 +1,49 @@ +""" +Queue Models for AI Writer Service +Redis 큐에서 사용할 데이터 모델 정의 +""" +from pydantic import BaseModel, Field +from typing import Optional, List, Dict, Any +from datetime import datetime +from 
enum import Enum + +class JobStatus(str, Enum): + """작업 상태""" + PENDING = "pending" + PROCESSING = "processing" + COMPLETED = "completed" + FAILED = "failed" + SKIPPED = "skipped" + +class NewsJobData(BaseModel): + """큐에 들어갈 뉴스 작업 데이터""" + job_id: str = Field(..., description="작업 고유 ID") + keyword: str = Field(..., description="원본 검색 키워드") + rss_title: str = Field(..., description="RSS 제목") + rss_link: Optional[str] = Field(None, description="RSS 링크") + rss_published: Optional[str] = Field(None, description="RSS 발행일") + google_results: List[Dict[str, Any]] = Field(default_factory=list, description="구글 검색 결과") + style: str = Field("professional", description="기사 스타일") + created_at: datetime = Field(default_factory=datetime.now, description="작업 생성 시간") + priority: int = Field(0, description="우선순위 (높을수록 우선)") + retry_count: int = Field(0, description="재시도 횟수") + max_retries: int = Field(3, description="최대 재시도 횟수") + +class JobResult(BaseModel): + """작업 결과""" + job_id: str = Field(..., description="작업 고유 ID") + status: JobStatus = Field(..., description="작업 상태") + article_id: Optional[str] = Field(None, description="생성된 기사 ID") + error_message: Optional[str] = Field(None, description="에러 메시지") + processing_time: Optional[float] = Field(None, description="처리 시간(초)") + completed_at: Optional[datetime] = Field(None, description="완료 시간") + +class QueueStats(BaseModel): + """큐 통계""" + pending_jobs: int = Field(..., description="대기 중인 작업 수") + processing_jobs: int = Field(..., description="처리 중인 작업 수") + completed_jobs: int = Field(..., description="완료된 작업 수") + failed_jobs: int = Field(..., description="실패한 작업 수") + total_jobs: int = Field(..., description="전체 작업 수") + workers_active: int = Field(..., description="활성 워커 수") + average_processing_time: Optional[float] = Field(None, description="평균 처리 시간(초)") \ No newline at end of file diff --git a/backup-services/ai-writer/backend/app/worker.py b/backup-services/ai-writer/backend/app/worker.py new file mode 100644 index 
0000000..e859904 --- /dev/null +++ b/backup-services/ai-writer/backend/app/worker.py @@ -0,0 +1,201 @@ +""" +AI Writer Consumer Worker +큐에서 작업을 가져와 기사를 생성하는 백그라운드 워커 +""" +import asyncio +import logging +import signal +import sys +import uuid +from datetime import datetime +from typing import Optional +import os + +from motor.motor_asyncio import AsyncIOMotorClient +from anthropic import AsyncAnthropic + +from queue_manager import RedisQueueManager +from queue_models import NewsJobData, JobStatus +from article_generator import generate_article_with_claude + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +class AIWriterWorker: + """AI Writer 백그라운드 워커""" + + def __init__(self, worker_id: Optional[str] = None): + self.worker_id = worker_id or str(uuid.uuid4()) + self.queue_manager = RedisQueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + + # MongoDB 설정 + self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017") + self.db_name = os.getenv("DB_NAME", "ai_writer_db") + self.mongo_client = None + self.db = None + + # Claude 클라이언트 + self.claude_api_key = os.getenv("CLAUDE_API_KEY") + self.claude_client = AsyncAnthropic(api_key=self.claude_api_key) + + # 실행 상태 + self.running = False + self.tasks = [] + + async def start(self, num_workers: int = 1): + """워커 시작""" + logger.info(f"Starting AI Writer Worker {self.worker_id} with {num_workers} concurrent workers") + + try: + # Redis 연결 + await self.queue_manager.connect() + await self.queue_manager.register_worker(self.worker_id) + + # MongoDB 연결 + self.mongo_client = AsyncIOMotorClient(self.mongodb_url) + self.db = self.mongo_client[self.db_name] + logger.info("Connected to MongoDB") + + self.running = True + + # 여러 워커 태스크 생성 + for i in range(num_workers): + task = asyncio.create_task(self._process_jobs(f"{self.worker_id}-{i}")) + self.tasks.append(task) + + # 워커 핑 
태스크 + ping_task = asyncio.create_task(self._ping_worker()) + self.tasks.append(ping_task) + + # 모든 태스크 대기 + await asyncio.gather(*self.tasks) + + except Exception as e: + logger.error(f"Worker error: {e}") + finally: + await self.stop() + + async def stop(self): + """워커 정지""" + logger.info(f"Stopping AI Writer Worker {self.worker_id}") + self.running = False + + # 태스크 취소 + for task in self.tasks: + task.cancel() + + # 워커 등록 해제 + await self.queue_manager.unregister_worker(self.worker_id) + + # 연결 해제 + await self.queue_manager.disconnect() + if self.mongo_client: + self.mongo_client.close() + + logger.info(f"Worker {self.worker_id} stopped") + + async def _process_jobs(self, sub_worker_id: str): + """작업 처리 루프""" + logger.info(f"Sub-worker {sub_worker_id} started") + + while self.running: + try: + # 큐에서 작업 가져오기 (5초 타임아웃) + job = await self.queue_manager.dequeue(timeout=5) + + if job: + logger.info(f"[{sub_worker_id}] Processing job {job.job_id}: {job.rss_title[:50]}") + start_time = datetime.now() + + try: + # 기사 생성 + article = await self._generate_article(job) + + # MongoDB에 저장 + if article and self.db is not None: + article_dict = article.dict() + await self.db.articles.insert_one(article_dict) + + # 처리 시간 계산 + processing_time = (datetime.now() - start_time).total_seconds() + + # 완료 표시 + await self.queue_manager.mark_completed( + job.job_id, + article.news_id + ) + + logger.info(f"[{sub_worker_id}] Job {job.job_id} completed in {processing_time:.2f}s") + else: + raise Exception("Failed to generate article") + + except Exception as e: + logger.error(f"[{sub_worker_id}] Job {job.job_id} failed: {e}") + await self.queue_manager.mark_failed(job.job_id, str(e)) + + except asyncio.CancelledError: + break + except Exception as e: + logger.error(f"[{sub_worker_id}] Worker error: {e}") + await asyncio.sleep(1) + + logger.info(f"Sub-worker {sub_worker_id} stopped") + + async def _generate_article(self, job: NewsJobData): + """기사 생성""" + # 작업 데이터를 기존 형식으로 변환 + news_data = { + 
"keyword": job.keyword, + "news_items": [{ + "rss_title": job.rss_title, + "rss_link": job.rss_link, + "rss_published": job.rss_published, + "google_results": job.google_results + }] + } + + # 기사 생성 (기존 함수 재사용) + return await generate_article_with_claude(news_data, job.style) + + async def _ping_worker(self): + """워커 활동 신호 전송""" + while self.running: + try: + await self.queue_manager.ping_worker(self.worker_id) + await asyncio.sleep(30) # 30초마다 핑 + except asyncio.CancelledError: + break + except Exception as e: + logger.error(f"Ping error: {e}") + +def signal_handler(signum, frame): + """시그널 핸들러""" + logger.info(f"Received signal {signum}") + sys.exit(0) + +async def main(): + """메인 함수""" + # 시그널 핸들러 등록 + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + # 워커 수 설정 (환경변수 또는 기본값) + num_workers = int(os.getenv("WORKER_COUNT", "3")) + + # 워커 시작 + worker = AIWriterWorker() + try: + await worker.start(num_workers=num_workers) + except KeyboardInterrupt: + logger.info("Keyboard interrupt received") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/backup-services/ai-writer/backend/article_전기차_analytical.json b/backup-services/ai-writer/backend/article_전기차_analytical.json new file mode 100644 index 0000000..1600e04 --- /dev/null +++ b/backup-services/ai-writer/backend/article_전기차_analytical.json @@ -0,0 +1,62 @@ +{ + "news_id": "49bdf2f3-4dbc-47eb-8c49-5d9536f41d87", + "title": "유럽 전기차 시장의 새로운 전환점: 현대차·기아의 소형 전기차 전략과 글로벌 경쟁 구도", + "created_at": "2025-09-13T00:29:13.376541", + "summary": "현대차와 기아가 IAA 2025에서 소형 전기차 콘셉트 모델을 공개하며 유럽 시장 공략을 가속화, 배터리 협력과 가격 경쟁력으로 승부수", + "subtopics": [ + { + "title": "현대차·기아의 유럽 소형 전기차 시장 공략", + "content": [ + "현대자동차와 기아가 IAA 2025에서 콘셉트 쓰리와 EV2를 공개하며 유럽 소형 전기차 시장 공략에 박차를 가하고 있다. 이는 유럽의 급성장하는 소형 전기차 수요에 대응하기 위한 전략적 움직임으로 평가된다.", + "특히 두 모델은 실용성과 경제성을 모두 갖춘 제품으로, 유럽 소비자들의 니즈를 정확히 겨냥했다는 평가를 받고 있다. 
현대차그룹은 이를 통해 유럽 시장에서의 입지를 더욱 강화할 것으로 전망된다.", + "현지 전문가들은 현대차그룹의 이번 전략이 유럽 전기차 시장의 '골든타임'을 잡기 위한 시의적절한 움직임이라고 분석하고 있다." + ] + }, + { + "title": "배터리 공급망 전략의 중요성 부각", + "content": [ + "전기차 시장에서 배터리 공급망 확보가 핵심 경쟁력으로 부상하고 있다. IAA 모빌리티에서 폴스타가 SK온을 배터리 파트너로 공개적으로 언급한 것이 주목받고 있다.", + "배터리 제조사 선정에 대한 정보가 제한적인 가운데, 안정적인 배터리 공급망 구축이 전기차 제조사들의 성패를 좌우할 것으로 예상된다.", + "특히 소형 전기차의 경우 가격 경쟁력이 중요한 만큼, 효율적인 배터리 수급 전략이 시장 점유율 확대의 관건이 될 전망이다." + ] + }, + { + "title": "글로벌 전기차 시장의 경쟁 구도 변화", + "content": [ + "유럽 전기차 시장에서 소형 모델을 중심으로 한 경쟁이 본격화되면서, 제조사들의 전략적 포지셔닝이 더욱 중요해지고 있다.", + "현대차그룹은 품질과 기술력을 바탕으로 한 프리미엄 이미지와 함께, 합리적인 가격대의 소형 전기차 라인업으로 시장 공략을 가속화하고 있다.", + "이러한 변화는 글로벌 자동차 산업의 패러다임 전환을 반영하며, 향후 전기차 시장의 주도권 경쟁이 더욱 치열해질 것으로 예상된다." + ] + } + ], + "categories": [ + "자동차", + "경제", + "환경", + "기술" + ], + "entities": { + "people": [], + "organizations": [ + "현대자동차", + "기아", + "SK온", + "폴스타" + ], + "groups": [ + "유럽 자동차 제조사", + "배터리 제조업체" + ], + "countries": [ + "대한민국", + "독일", + "유럽연합" + ], + "events": [ + "IAA 2025", + "IAA 모빌리티" + ] + }, + "source_keyword": "전기차", + "source_count": 3 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/article_전기차_investigative.json b/backup-services/ai-writer/backend/article_전기차_investigative.json new file mode 100644 index 0000000..3162f6f --- /dev/null +++ b/backup-services/ai-writer/backend/article_전기차_investigative.json @@ -0,0 +1,66 @@ +{ + "news_id": "8a51bead-4558-4351-a5b2-b5e5ba1b3d38", + "title": "현대차·기아, 유럽 전기차 시장서 소형 모델로 새 돌파구 모색", + "created_at": "2025-09-13T00:29:35.661926", + "summary": "IAA 모빌리티 2025에서 현대차·기아가 소형 전기차 콘셉트카를 공개하며 유럽 시장 공략 가속화. 배터리 공급망 확보와 가격 경쟁력이 성공 관건", + "subtopics": [ + { + "title": "유럽 소형 전기차 시장 공략 본격화", + "content": [ + "현대차와 기아가 IAA 모빌리티 2025에서 각각 콘셉트 쓰리와 EV2를 공개하며 유럽 소형 전기차 시장 공략에 시동을 걸었다. 
이는 유럽의 높은 환경 규제와 도심 이동성 수요에 대응하기 위한 전략적 움직임으로 해석된다.", + "특히 두 모델은 기존 전기차 대비 컴팩트한 사이즈와 효율적인 배터리 시스템을 갖추고 있어, 유럽 소비자들의 실용적 수요를 겨냥했다는 평가를 받고 있다.", + "업계 전문가들은 현대차그룹의 이번 행보가 테슬라와 중국 업체들이 주도하고 있는 유럽 전기차 시장에서 새로운 돌파구를 마련할 수 있을 것으로 전망하고 있다." + ] + }, + { + "title": "배터리 공급망 확보 과제", + "content": [ + "전기차 성공의 핵심 요소인 배터리 수급에서 SK온이 주요 공급 파트너로 부상했다. 폴스타가 SK온을 배터리 공급사로 공개적으로 언급한 것이 이를 방증한다.", + "그러나 업계에서는 배터리 제조사들의 정보 공개가 제한적이어서 실제 공급망 구조를 파악하기 어려운 상황이다. 이는 글로벌 배터리 수급 경쟁이 치열해지고 있음을 시사한다.", + "안정적인 배터리 공급망 확보는 향후 소형 전기차의 가격 경쟁력과 직결되는 만큼, 현대차그룹의 추가적인 파트너십 구축이 예상된다." + ] + }, + { + "title": "가격 경쟁력 확보 전략", + "content": [ + "소형 전기차 시장에서의 성공을 위해서는 합리적인 가격대 책정이 필수적이다. 현대차그룹은 규모의 경제를 통한 원가 절감을 목표로 하고 있다.", + "특히 유럽 시장에서는 테슬라와 중국 업체들의 공격적인 가격 정책에 대응해야 하는 상황이다. 현대차그룹은 프리미엄 품질을 유지하면서도 경쟁력 있는 가격대를 제시하는 것을 목표로 하고 있다.", + "전문가들은 배터리 기술 혁신과 생산 효율화를 통해 가격 경쟁력을 확보하는 것이 향후 성공의 핵심이 될 것으로 전망하고 있다." + ] + } + ], + "categories": [ + "자동차", + "경제", + "산업", + "기술" + ], + "entities": { + "people": [ + "김성수", + "조용하", + "박종면" + ], + "organizations": [ + "현대자동차", + "기아", + "SK온", + "폴스타" + ], + "groups": [ + "유럽 자동차 제조사", + "중국 전기차 업체" + ], + "countries": [ + "대한민국", + "독일", + "중국" + ], + "events": [ + "IAA 모빌리티 2025", + "전기차 배터리 공급 계약" + ] + }, + "source_keyword": "전기차", + "source_count": 3 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/article_전기차_professional.json b/backup-services/ai-writer/backend/article_전기차_professional.json new file mode 100644 index 0000000..4b5eb9f --- /dev/null +++ b/backup-services/ai-writer/backend/article_전기차_professional.json @@ -0,0 +1,62 @@ +{ + "news_id": "2c4cb595-9542-45ee-b4b9-2135c46950e3", + "title": "현대차·기아, 유럽 전기차 시장서 소형 모델로 승부수...배터리 협력 강화 주목", + "created_at": "2025-09-13T00:28:51.371773", + "summary": "현대차·기아가 유럽 전기차 시장에서 콘셉트 쓰리와 EV2로 소형 전기차 시장 공략 나서, 배터리 협력사 선정 등 경쟁력 강화 움직임 본격화", + "subtopics": [ + { + "title": "유럽 소형 전기차 시장 공략 본격화", + "content": [ + "현대자동차그룹이 유럽 전기차 시장 공략을 위해 소형 전기차 라인업 확대에 나섰다. 
IAA 모빌리티 2025에서 공개된 현대차의 콘셉트 쓰리와 기아의 EV2는 유럽 시장 맞춤형 전략의 핵심으로 평가받고 있다.", + "특히 소형 전기차 시장은 유럽에서 급성장이 예상되는 세그먼트로, 현대차그룹은 합리적인 가격대와 실용성을 앞세워 시장 선점을 노리고 있다.", + "현대차그룹의 이번 전략은 유럽의 환경 규제 강화와 소비자들의 실용적인 전기차 수요 증가에 대응하는 동시에, 중국 전기차 업체들의 유럽 진출에 대한 선제적 대응으로 해석된다." + ] + }, + { + "title": "배터리 협력 관계 재편 움직임", + "content": [ + "전기차 경쟁력의 핵심인 배터리 수급과 관련해 업계의 이목이 집중되고 있다. IAA 모빌리티에서 폴스타가 SK온을 배터리 공급사로 지목한 것이 주목받고 있다.", + "글로벌 자동차 업체들의 배터리 조달 전략이 다변화되는 가운데, 한국 배터리 업체들과의 협력 강화 움직임이 감지되고 있다.", + "특히 현대차그룹은 안정적인 배터리 수급을 위해 다양한 배터리 제조사들과의 협력 관계를 검토 중인 것으로 알려졌다." + ] + }, + { + "title": "글로벌 전기차 시장 경쟁 심화", + "content": [ + "전기차 시장에서 브랜드 간 경쟁이 치열해지는 가운데, 현대차그룹은 차별화된 제품 라인업과 기술력으로 시장 지위 강화에 나서고 있다.", + "특히 유럽 시장에서는 테슬라, 폭스바겐 그룹, 중국 업체들과의 경쟁이 불가피한 상황이며, 현대차그룹은 품질과 기술력을 앞세워 경쟁력 확보에 주력하고 있다.", + "시장 전문가들은 현대차그룹의 소형 전기차 전략이 향후 글로벌 시장에서의 입지 강화에 중요한 전환점이 될 것으로 전망하고 있다." + ] + } + ], + "categories": [ + "자동차", + "경제", + "산업" + ], + "entities": { + "people": [ + "김성수", + "박영효" + ], + "organizations": [ + "현대자동차", + "기아", + "SK온", + "폴스타" + ], + "groups": [ + "현대차그룹", + "폭스바겐 그룹" + ], + "countries": [ + "대한민국", + "독일" + ], + "events": [ + "IAA 모빌리티 2025" + ] + }, + "source_keyword": "전기차", + "source_count": 3 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/custom_article_analytical.json b/backup-services/ai-writer/backend/custom_article_analytical.json new file mode 100644 index 0000000..9b114dc --- /dev/null +++ b/backup-services/ai-writer/backend/custom_article_analytical.json @@ -0,0 +1,63 @@ +{ + "news_id": "ee154fb8-a913-4aa9-9fc9-fa421fd2d7c0", + "title": "2025년 기술 혁신의 분기점: AI·양자컴퓨팅이 그리는 새로운 미래", + "created_at": "2025-09-13T00:32:14.008706", + "summary": "2025년, AI와 양자컴퓨팅의 상용화가 가져올 산업 전반의 혁신적 변화와 사회적 영향을 심층 분석한 전망", + "subtopics": [ + { + "title": "생성형 AI가 재편하는 산업 생태계", + "content": [ + "2025년은 생성형 AI가 산업 전반에 본격적으로 도입되는 원년이 될 전망이다. 특히 의료 진단, 신약 개발, 교육 커리큘럼 설계 등 전문 분야에서 AI의 역할이 획기적으로 확대될 것으로 예측된다.", + "기업들의 업무 프로세스도 근본적인 변화를 맞이할 것으로 보인다. 
창의적 작업 영역에서도 AI의 활용이 일상화되며, 인간-AI 협업 모델이 새로운 표준으로 자리잡을 것으로 전망된다.", + "다만 AI 도입에 따른 노동시장 재편과 윤리적 문제에 대한 사회적 합의가 시급한 과제로 대두될 것으로 예상된다. 특히 AI 의존도 증가에 따른 데이터 보안과 알고리즘 편향성 문제는 중요한 해결 과제가 될 것이다." + ] + }, + { + "title": "양자컴퓨팅의 상용화와 산업혁신", + "content": [ + "양자컴퓨팅 기술이 실용화 단계에 진입하면서, 금융권의 리스크 분석과 암호화폐 보안 시스템에 획기적인 변화가 예상된다. 특히 복잡한 금융 모델링과 시장 예측에서 양자컴퓨터의 활용이 크게 증가할 전망이다.", + "제약 산업에서는 신약 개발 프로세스가 대폭 단축될 것으로 기대된다. 양자컴퓨터를 활용한 분자 시뮬레이션이 가능해지면서, 신약 개발 비용 절감과 효율성 증대가 실현될 것이다.", + "물류 및 공급망 관리 분야에서도 양자컴퓨팅의 영향력이 확대될 전망이다. 복잡한 경로 최적화와 재고 관리에 양자 알고리즘을 적용함으로써, 물류 비용 절감과 효율성 향상이 가능해질 것으로 예측된다." + ] + }, + { + "title": "기술 혁신에 따른 사회경제적 변화", + "content": [ + "AI와 양자컴퓨팅의 발전은 노동시장의 구조적 변화를 가속화할 것으로 전망된다. 단순 반복 업무는 자동화되는 반면, AI 시스템 관리와 양자컴퓨팅 전문가 같은 새로운 직종의 수요가 급증할 것으로 예상된다.", + "교육 시스템도 큰 변화를 맞이할 것으로 보인다. AI 기반 맞춤형 학습과 양자컴퓨팅 원리에 대한 이해가 새로운 필수 교육과정으로 자리잡을 것으로 전망된다.", + "이러한 기술 혁신은 국가 간 기술 격차를 더욱 심화시킬 가능성이 있다. 선진국과 개발도상국 간의 디지털 격차 해소가 국제사회의 주요 과제로 대두될 것으로 예측된다." + ] + } + ], + "categories": [ + "기술", + "산업", + "미래전망", + "경제" + ], + "entities": { + "people": [], + "organizations": [ + "금융권", + "제약회사", + "물류기업" + ], + "groups": [ + "AI 개발자", + "양자컴퓨팅 전문가", + "교육기관" + ], + "countries": [ + "한국", + "미국", + "중국" + ], + "events": [ + "AI 상용화", + "양자컴퓨터 실용화", + "디지털 전환" + ] + }, + "source_keyword": "2025년 기술 트렌드", + "source_count": 2 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/custom_article_professional.json b/backup-services/ai-writer/backend/custom_article_professional.json new file mode 100644 index 0000000..ce04357 --- /dev/null +++ b/backup-services/ai-writer/backend/custom_article_professional.json @@ -0,0 +1,62 @@ +{ + "news_id": "3109c578-9b08-4cd0-a9d6-3d92b97e64d4", + "title": "2025년 기술 혁신의 물결, AI·양자컴퓨팅이 이끄는 새로운 패러다임", + "created_at": "2025-09-13T00:31:52.782760", + "summary": "2025년, 생성형 AI와 양자컴퓨팅의 상용화로 산업 전반에 혁신적 변화가 예상되며, 인간-AI 협업이 일상화될 전망", + "subtopics": [ + { + "title": "생성형 AI가 주도하는 창의적 혁신", + "content": [ + "2025년은 생성형 AI 기술이 전례 없는 수준으로 발전하여 창의적 
영역에서도 획기적인 변화가 예상된다. 기존에 인간의 고유 영역으로 여겨졌던 예술 창작, 콘텐츠 제작, 디자인 분야에서 AI가 핵심 협력자로 자리잡을 전망이다.", + "특히 의료 분야에서는 AI가 질병 진단과 치료 계획 수립에 적극적으로 활용될 것으로 예측된다. AI는 방대한 의료 데이터를 분석하여 개인 맞춤형 치료법을 제시하고, 의료진의 의사결정을 효과적으로 지원할 것으로 기대된다.", + "교육 분야에서도 AI 기반의 맞춤형 학습 시스템이 보편화될 전망이다. 학습자의 이해도와 진도에 따라 최적화된 커리큘럼을 제공하고, 실시간으로 학습 성과를 분석하여 개선점을 제시하는 등 교육의 질적 향상이 기대된다." + ] + }, + { + "title": "양자컴퓨팅의 산업 혁신 주도", + "content": [ + "2025년은 양자컴퓨팅이 실용화 단계에 진입하는 원년이 될 것으로 전망된다. 특히 금융 산업에서는 복잡한 위험 분석과 포트폴리오 최적화에 양자컴퓨팅을 활용하여 투자 전략의 정확도를 높일 것으로 예상된다.", + "제약 산업에서는 양자컴퓨터를 활용한 신약 개발이 가속화될 전망이다. 분자 구조 시뮬레이션과 신약 후보 물질 스크리닝 과정에서 양자컴퓨팅의 강점이 발휘될 것으로 기대된다.", + "물류 분야에서도 양자컴퓨팅을 통한 최적화가 실현될 전망이다. 복잡한 공급망 관리와 배송 경로 최적화에 양자컴퓨팅을 도입함으로써 물류 비용 절감과 효율성 향상이 가능해질 것으로 예측된다." + ] + }, + { + "title": "인간-기계 협업의 새로운 패러다임", + "content": [ + "2025년에는 AI와 인간의 협업이 일상화되면서 업무 방식의 근본적인 변화가 예상된다. 단순 반복적인 업무는 AI가 담당하고, 인간은 전략적 의사결정과 창의적 문제 해결에 집중하는 방식으로 업무 분담이 이루어질 것이다.", + "이러한 변화는 노동시장의 구조적 변화로 이어질 전망이다. AI와 협업할 수 있는 디지털 역량이 필수적인 직무 역량으로 부상하며, 새로운 형태의 직업이 등장할 것으로 예측된다.", + "하지만 이러한 변화 속에서도 윤리적 판단과 감성적 소통과 같은 인간 고유의 가치는 더욱 중요해질 것으로 전망된다. 기술 발전이 가져올 혜택을 최대화하면서도 인간 중심의 가치를 지켜나가는 균형이 중요한 과제로 대두될 것이다." 
+ ] + } + ], + "categories": [ + "기술", + "미래전망", + "산업동향" + ], + "entities": { + "people": [], + "organizations": [ + "AI 기업들", + "제약회사들", + "물류기업들" + ], + "groups": [ + "의료진", + "교육자", + "기술전문가" + ], + "countries": [ + "한국", + "미국", + "중국" + ], + "events": [ + "2025년 기술혁신", + "양자컴퓨팅 상용화", + "AI 혁명" + ] + }, + "source_keyword": "2025년 기술 트렌드", + "source_count": 2 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/generated_article.json b/backup-services/ai-writer/backend/generated_article.json new file mode 100644 index 0000000..4046dbe --- /dev/null +++ b/backup-services/ai-writer/backend/generated_article.json @@ -0,0 +1,73 @@ +{ + "news_id": "ea9f3734-6a93-4ca7-8ebe-b85612e2fd0a", + "title": "정부, 내년 AI 산업에 10조원 투자...한국 경제 체질 대전환 나선다", + "created_at": "2025-09-13T01:09:43.892704", + "summary": "정부가 2025년 인공지능 산업 육성을 위해 10조원 규모의 대규모 투자를 단행하며 디지털 경제 전환 가속화에 나선다", + "subtopics": [ + { + "title": "정부의 AI 산업 육성 청사진", + "content": [ + "정부가 2025년 인공지능(AI) 산업 육성을 위해 10조원 규모의 투자를 단행한다. 이는 한국 경제의 디지털 전환을 가속화하고 글로벌 AI 강국으로 도약하기 위한 전략적 결정이다.", + "투자의 주요 방향은 AI 기술 개발, 인프라 구축, 전문인력 양성 등으로, 특히 반도체와 같은 핵심 산업과의 시너지 창출에 중점을 둘 예정이다." + ] + }, + { + "title": "민관 협력 체계 구축", + "content": [ + "정부는 AI 산업 육성을 위해 대기업, 스타트업, 연구기관 등과의 협력 체계를 강화한다. 소버린AI를 비롯한 국내 AI 기업들과의 협력을 통해 실질적인 세계 2위 AI 강국 도약을 목표로 하고 있다.", + "특히 AI 전문가 공모와 전담 조직 신설 등을 통해 체계적인 산업 육성 기반을 마련할 계획이다." + ] + }, + { + "title": "글로벌 경쟁력 강화 전략", + "content": [ + "정부는 국내 AI 기업들의 글로벌 경쟁력 강화를 위해 기술 개발 지원, 해외 시장 진출 지원, 규제 개선 등 다각적인 지원책을 마련한다.", + "특히 AI 산업의 핵심 인프라인 반도체 분야에서 SK하이닉스의 HBM4 개발 완료 등 가시적인 성과가 나타나고 있어, 이를 기반으로 한 시너지 효과가 기대된다." 
+ ] + } + ], + "categories": [ + "경제", + "기술", + "산업정책" + ], + "entities": { + "people": [ + "하정우 소버린AI 대표" + ], + "organizations": [ + "소버린AI", + "SK하이닉스", + "과학기술정보통신부" + ], + "groups": [ + "AI 기업", + "스타트업" + ], + "countries": [ + "대한민국", + "미국" + ], + "events": [ + { + "name": "2025년 AI 산업 육성 계획 발표", + "date": "2025년", + "location": "대한민국" + } + ], + "keywords": [ + "인공지능", + "AI 산업", + "디지털 전환", + "10조원 투자", + "반도체", + "HBM4", + "글로벌 경쟁력", + "민관협력", + "전문인력 양성", + "기술개발" + ] + }, + "source_keyword": "인공지능", + "source_count": 5 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/requirements.txt b/backup-services/ai-writer/backend/requirements.txt new file mode 100644 index 0000000..8696605 --- /dev/null +++ b/backup-services/ai-writer/backend/requirements.txt @@ -0,0 +1,9 @@ +fastapi==0.104.1 +uvicorn[standard]==0.24.0 +httpx==0.25.2 +pydantic==2.5.0 +motor==3.1.1 +pymongo==4.3.3 +anthropic==0.39.0 +python-multipart==0.0.6 +redis[hiredis]==5.0.1 \ No newline at end of file diff --git a/backup-services/ai-writer/backend/test_ai_writer.py b/backup-services/ai-writer/backend/test_ai_writer.py new file mode 100755 index 0000000..3b45bbf --- /dev/null +++ b/backup-services/ai-writer/backend/test_ai_writer.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +""" +AI Writer Service Test +Claude API를 사용한 전문적인 뉴스 기사 생성 테스트 +""" +import asyncio +import httpx +import json +from datetime import datetime + +# Service URL +SERVICE_URL = "http://localhost:8019" + +async def test_article_generation(): + """인공지능 키워드로 기사 생성 테스트""" + async with httpx.AsyncClient(timeout=120.0) as client: + print("\n" + "="*70) + print(" AI Writer Service - 전문 기사 생성 테스트 ") + print("="*70) + + print("\n📰 '인공지능' 키워드로 전문 기사 생성 중...") + print("-" * 50) + + # Generate article + response = await client.post( + f"{SERVICE_URL}/api/generate", + json={ + "keyword": "인공지능", + "limit": 5, + "google_results_per_title": 3, + "lang": "ko", + "country": "KR", + "style": "professional" + } + ) 
+ + if response.status_code == 200: + article = response.json() + + print(f"\n✅ 기사 생성 완료!") + print(f"\n📌 기사 ID: {article['news_id']}") + print(f"📅 생성 시간: {article['created_at']}") + print(f"\n📰 제목: {article['title']}") + print(f"📝 요약: {article['summary']}") + + print(f"\n🔍 카테고리: {', '.join(article['categories'])}") + + # Print subtopics + print(f"\n📚 소주제 ({len(article['subtopics'])}개):") + for i, subtopic in enumerate(article['subtopics'], 1): + print(f"\n [{i}] {subtopic['title']}") + print(f" 문단 수: {len(subtopic['content'])}개") + for j, paragraph in enumerate(subtopic['content'][:1], 1): # Show first paragraph only + print(f" 미리보기: {paragraph[:150]}...") + + # Print entities + entities = article['entities'] + print(f"\n🏷️ 추출된 개체:") + if entities['people']: + print(f" 👤 인물: {', '.join(entities['people'])}") + if entities['organizations']: + print(f" 🏢 기관: {', '.join(entities['organizations'])}") + if entities['groups']: + print(f" 👥 단체: {', '.join(entities['groups'])}") + if entities['countries']: + print(f" 🌍 국가: {', '.join(entities['countries'])}") + if entities.get('events'): + events = entities['events'] + if events: + print(f" 📅 이벤트 ({len(events)}개):") + for evt in events[:3]: # 처음 3개만 표시 + if isinstance(evt, dict): + evt_str = f" - {evt.get('name', '')}" + if evt.get('date'): + evt_str += f" [{evt['date']}]" + if evt.get('location'): + evt_str += f" @{evt['location']}" + print(evt_str) + else: + # 이전 형식 (문자열) 지원 + print(f" - {evt}") + if entities.get('keywords'): + keywords = entities['keywords'] + if keywords: + print(f" 🔑 키워드: {', '.join(keywords[:5])}" + + ("..." 
if len(keywords) > 5 else "")) + + print(f"\n📊 참조 소스: {article.get('source_count', 0)}개") + + # Save full article to file + with open('generated_article.json', 'w', encoding='utf-8') as f: + json.dump(article, f, ensure_ascii=False, indent=2) + print(f"\n💾 전체 기사가 'generated_article.json'에 저장되었습니다.") + + else: + print(f"❌ 오류: {response.status_code}") + print(f" 상세: {response.text}") + +async def test_health_check(): + """서비스 상태 확인""" + async with httpx.AsyncClient() as client: + print("\n" + "="*60) + print("서비스 Health Check") + print("="*60) + + response = await client.get(f"{SERVICE_URL}/health") + if response.status_code == 200: + data = response.json() + print(f"✓ AI Writer 서비스 상태: {data.get('status', 'unknown')}") + if 'services' in data: + print(f" - News Aggregator: {data['services'].get('news_aggregator', 'unknown')}") + print(f" - MongoDB: {data['services'].get('mongodb', 'unknown')}") + print(f" - Claude API: {data['services'].get('claude_api', 'unknown')}") + if 'error' in data: + print(f" - Error: {data['error']}") + else: + print(f"✗ Health check 실패: {response.status_code}") + +async def test_batch_generation(): + """여러 키워드 일괄 처리 테스트""" + async with httpx.AsyncClient(timeout=180.0) as client: + print("\n" + "="*60) + print("일괄 기사 생성 테스트") + print("="*60) + + keywords = ["AI 혁신", "디지털 전환", "스마트시티"] + print(f"\n키워드: {', '.join(keywords)}") + + response = await client.post( + f"{SERVICE_URL}/api/generate/batch", + json=keywords, + params={"style": "analytical"} + ) + + if response.status_code == 200: + data = response.json() + print(f"\n✅ 처리 완료: {data['total_processed']}개") + + if data['success']: + print("\n성공한 기사:") + for item in data['success']: + print(f" - {item['keyword']}: {item['title'][:50]}...") + + if data['errors']: + print("\n실패한 항목:") + for item in data['errors']: + print(f" - {item['keyword']}: {item['error']}") + else: + print(f"❌ 오류: {response.status_code}") + +async def main(): + """메인 테스트 실행""" + print("\n" + "="*70) + print(" AI Writer 
Service Test Suite ") + print(" RSS → Google Search → Claude AI 기사 생성 ") + print("="*70) + + # Run tests + await test_health_check() + await test_article_generation() + # await test_batch_generation() # Optional: batch test + + print("\n" + "="*70) + print(" 테스트 완료 ") + print("="*70) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/backup-services/ai-writer/backend/test_prompt_generation.py b/backup-services/ai-writer/backend/test_prompt_generation.py new file mode 100644 index 0000000..12d4764 --- /dev/null +++ b/backup-services/ai-writer/backend/test_prompt_generation.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +""" +AI Writer Service - 프롬프트 기반 기사 생성 테스트 +다양한 스타일과 키워드로 기사를 생성하는 테스트 +""" +import asyncio +import httpx +import json +from datetime import datetime + +# Service URL +SERVICE_URL = "http://localhost:8019" + +async def test_different_styles(): + """다양한 스타일로 기사 생성 테스트""" + + test_cases = [ + { + "keyword": "전기차", + "style": "professional", + "description": "전통적인 뉴스 기사 스타일" + }, + { + "keyword": "전기차", + "style": "analytical", + "description": "분석적이고 심층적인 스타일" + }, + { + "keyword": "전기차", + "style": "investigative", + "description": "탐사보도 스타일" + } + ] + + async with httpx.AsyncClient(timeout=180.0) as client: + for test_case in test_cases: + print("\n" + "="*70) + print(f" {test_case['description']} 테스트") + print("="*70) + print(f"키워드: {test_case['keyword']}") + print(f"스타일: {test_case['style']}") + print("-" * 50) + + try: + response = await client.post( + f"{SERVICE_URL}/api/generate", + json={ + "keyword": test_case["keyword"], + "limit": 3, # RSS 항목 수 줄여서 빠른 테스트 + "google_results_per_title": 2, + "lang": "ko", + "country": "KR", + "style": test_case["style"] + } + ) + + if response.status_code == 200: + article = response.json() + print(f"\n✅ 기사 생성 성공!") + print(f"📰 제목: {article['title']}") + print(f"📝 요약: {article['summary']}") + print(f"🔍 카테고리: {', '.join(article['categories'])}") + print(f"📚 소주제 수: 
{len(article['subtopics'])}") + + # 키워드 출력 + if 'entities' in article and 'keywords' in article['entities']: + keywords = article['entities']['keywords'] + print(f"🔑 키워드 ({len(keywords)}개): {', '.join(keywords[:5])}" + + ("..." if len(keywords) > 5 else "")) + + # 이벤트 정보 출력 + if 'entities' in article and 'events' in article['entities']: + events = article['entities']['events'] + if events: + print(f"📅 이벤트 ({len(events)}개):") + for evt in events[:2]: # 처음 2개만 표시 + if isinstance(evt, dict): + evt_str = f" - {evt.get('name', '')}" + if evt.get('date'): + evt_str += f" [{evt['date']}]" + if evt.get('location'): + evt_str += f" @{evt['location']}" + print(evt_str) + + # 첫 번째 소주제의 첫 문단만 출력 + if article['subtopics']: + first_topic = article['subtopics'][0] + print(f"\n첫 번째 소주제: {first_topic['title']}") + if first_topic['content']: + print(f"미리보기: {first_topic['content'][0][:200]}...") + + # 파일로 저장 + filename = f"article_{test_case['keyword']}_{test_case['style']}.json" + with open(filename, 'w', encoding='utf-8') as f: + json.dump(article, f, ensure_ascii=False, indent=2) + print(f"\n💾 '{filename}'에 저장됨") + + else: + print(f"❌ 오류: {response.status_code}") + print(f"상세: {response.text}") + + except Exception as e: + print(f"❌ 테스트 실패: {e}") + + # 다음 테스트 전 잠시 대기 + await asyncio.sleep(2) + +async def test_different_keywords(): + """다양한 키워드로 기사 생성 테스트""" + + keywords = ["블록체인", "메타버스", "우주개발", "기후변화", "K-POP"] + + async with httpx.AsyncClient(timeout=180.0) as client: + print("\n" + "="*70) + print(" 다양한 키워드 테스트") + print("="*70) + + for keyword in keywords: + print(f"\n🔍 키워드: {keyword}") + print("-" * 30) + + try: + response = await client.post( + f"{SERVICE_URL}/api/generate", + json={ + "keyword": keyword, + "limit": 2, # 빠른 테스트를 위해 줄임 + "google_results_per_title": 2, + "lang": "ko", + "country": "KR", + "style": "professional" + } + ) + + if response.status_code == 200: + article = response.json() + print(f"✅ 성공: {article['title'][:50]}...") + print(f" 카테고리: {', 
'.join(article['categories'][:3])}") + else: + print(f"❌ 실패: {response.status_code}") + + except Exception as e: + print(f"❌ 오류: {e}") + + await asyncio.sleep(1) + +async def test_custom_prompt(): + """커스텀 프롬프트 테스트 - 직접 aggregated 데이터 제공""" + + # 미리 수집된 데이터를 시뮬레이션 + custom_news_data = { + "keyword": "2025년 기술 트렌드", + "news_items": [ + { + "rss_title": "AI와 로봇이 바꾸는 2025년 일상", + "google_results": [ + { + "title": "전문가들이 예측하는 2025년 AI 혁명", + "snippet": "2025년 AI 기술이 일상생활 전반을 혁신할 전망...", + "full_content": { + "url": "https://example.com/ai-2025", + "content": "2025년에는 AI가 의료, 교육, 업무 등 모든 분야에서 인간과 협업하는 시대가 열릴 것으로 전망된다. 특히 생성형 AI의 발전으로 창의적 작업에서도 AI의 역할이 크게 확대될 것이다." + } + } + ] + }, + { + "rss_title": "양자컴퓨터 상용화 임박", + "google_results": [ + { + "title": "IBM, 2025년 1000큐비트 양자컴퓨터 출시 예정", + "snippet": "IBM이 2025년 상용 양자컴퓨터 출시를 앞두고...", + "full_content": { + "url": "https://example.com/quantum-2025", + "content": "양자컴퓨팅이 드디어 실용화 단계에 접어들었다. 2025년에는 금융, 제약, 물류 등 다양한 산업에서 양자컴퓨터를 활용한 혁신이 시작될 전망이다." + } + } + ] + } + ] + } + + async with httpx.AsyncClient(timeout=180.0) as client: + print("\n" + "="*70) + print(" 커스텀 데이터로 기사 생성") + print("="*70) + + for style in ["professional", "analytical"]: + print(f"\n스타일: {style}") + print("-" * 30) + + try: + response = await client.post( + f"{SERVICE_URL}/api/generate/from-aggregated", + json=custom_news_data, + params={"style": style} + ) + + if response.status_code == 200: + article = response.json() + print(f"✅ 제목: {article['title']}") + print(f" 요약: {article['summary']}") + + # 스타일별로 저장 + filename = f"custom_article_{style}.json" + with open(filename, 'w', encoding='utf-8') as f: + json.dump(article, f, ensure_ascii=False, indent=2) + print(f" 💾 '{filename}'에 저장됨") + else: + print(f"❌ 실패: {response.text}") + + except Exception as e: + print(f"❌ 오류: {e}") + + await asyncio.sleep(2) + +async def main(): + """메인 테스트 실행""" + print("\n" + "="*70) + print(" AI Writer 프롬프트 기반 기사 생성 테스트") + print("="*70) + + # 1. 
다양한 스타일 테스트 + print("\n[1] 스타일별 기사 생성 테스트") + await test_different_styles() + + # 2. 다양한 키워드 테스트 + print("\n[2] 키워드별 기사 생성 테스트") + await test_different_keywords() + + # 3. 커스텀 데이터 테스트 + print("\n[3] 커스텀 데이터 기사 생성 테스트") + await test_custom_prompt() + + print("\n" + "="*70) + print(" 모든 테스트 완료!") + print("="*70) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/backup-services/ai-writer/worker/Dockerfile b/backup-services/ai-writer/worker/Dockerfile new file mode 100644 index 0000000..7869505 --- /dev/null +++ b/backup-services/ai-writer/worker/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Copy requirements +COPY backend/requirements.txt . + +# Install dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY backend/app /app + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV WORKER_COUNT=3 + +# Run worker +CMD ["python", "worker.py"] \ No newline at end of file diff --git a/services/google-search/README.md b/backup-services/google-search/README.md similarity index 100% rename from services/google-search/README.md rename to backup-services/google-search/README.md diff --git a/services/google-search/backend/.env.example b/backup-services/google-search/backend/.env.example similarity index 100% rename from services/google-search/backend/.env.example rename to backup-services/google-search/backend/.env.example diff --git a/services/google-search/backend/Dockerfile b/backup-services/google-search/backend/Dockerfile similarity index 100% rename from services/google-search/backend/Dockerfile rename to backup-services/google-search/backend/Dockerfile diff --git a/services/rss-feed/backend/app/__init__.py b/backup-services/google-search/backend/app/__init__.py similarity index 100% rename from services/rss-feed/backend/app/__init__.py rename to backup-services/google-search/backend/app/__init__.py diff --git a/services/google-search/backend/app/config.py 
b/backup-services/google-search/backend/app/config.py similarity index 100% rename from services/google-search/backend/app/config.py rename to backup-services/google-search/backend/app/config.py diff --git a/services/google-search/backend/app/main.py b/backup-services/google-search/backend/app/main.py similarity index 100% rename from services/google-search/backend/app/main.py rename to backup-services/google-search/backend/app/main.py diff --git a/services/google-search/backend/app/search_service.py b/backup-services/google-search/backend/app/search_service.py similarity index 100% rename from services/google-search/backend/app/search_service.py rename to backup-services/google-search/backend/app/search_service.py diff --git a/services/google-search/backend/requirements.txt b/backup-services/google-search/backend/requirements.txt similarity index 100% rename from services/google-search/backend/requirements.txt rename to backup-services/google-search/backend/requirements.txt diff --git a/backup-services/news-aggregator/backend/Dockerfile b/backup-services/news-aggregator/backend/Dockerfile new file mode 100644 index 0000000..a296111 --- /dev/null +++ b/backup-services/news-aggregator/backend/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . 
"""
News Aggregator Service.

Combines RSS feed headlines with Google search results and exposes the
merged data through a FastAPI application.
"""
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Optional, Dict, Any
from datetime import datetime
import os
import httpx
import asyncio
from pydantic import BaseModel, Field
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="News Aggregator Service",
    description="RSS 피드와 구글 검색을 통합한 뉴스 수집 서비스",
    version="1.0.0"
)

# CORS: every origin, method and header is accepted.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive - confirm this is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Base URLs of the upstream services. The defaults are the original hard-coded
# hostnames; the environment variables only exist to allow overriding them.
RSS_SERVICE_URL = os.getenv("RSS_SERVICE_URL", "http://rss-feed-backend:8000")
GOOGLE_SEARCH_SERVICE_URL = os.getenv(
    "GOOGLE_SEARCH_SERVICE_URL", "http://google-search-backend:8000"
)


# Response Models
class NewsItem(BaseModel):
    """One RSS headline together with the Google results found for it."""
    rss_title: str
    rss_link: Optional[str] = None
    google_results: List[Dict[str, Any]] = Field(default_factory=list)
    search_keyword: str
    # Optional so that the None default agrees with the declared type.
    timestamp: Optional[datetime] = None


class AggregatedNews(BaseModel):
    """Aggregated result returned by the /api/aggregate* endpoints."""
    keyword: str
    rss_feed_url: str
    total_rss_entries: int
    processed_entries: int
    news_items: List[NewsItem]
    processing_time: float


# Shared HTTP client used for every upstream call (30 second timeout).
client = httpx.AsyncClient(timeout=30.0)


@app.on_event("startup")
async def startup():
    """Log that the service is starting."""
    logger.info("News Aggregator Service starting...")
@app.on_event("shutdown")
async def shutdown():
    """Close the shared HTTP client when the service stops."""
    await client.aclose()
    logger.info("News Aggregator Service stopped")


@app.get("/")
async def root():
    """Return service metadata and the list of public endpoints."""
    return {
        "service": "News Aggregator Service",
        "version": "1.0.0",
        "description": "RSS 피드와 구글 검색 통합 서비스",
        "endpoints": {
            "aggregate": "GET /api/aggregate",
            "aggregate_by_location": "GET /api/aggregate/location",
            "aggregate_by_topic": "GET /api/aggregate/topic",
            "health": "GET /health"
        }
    }


@app.get("/health")
async def health_check():
    """Report health derived from the two upstream services.

    Returns "healthy" when both upstream /health endpoints answer 200,
    "degraded" when at least one does not, and "unhealthy" (with the error
    text) when either request raises.
    """
    try:
        # Check RSS service
        rss_response = await client.get(f"{RSS_SERVICE_URL}/health")
        rss_healthy = rss_response.status_code == 200

        # Check Google Search service
        google_response = await client.get(f"{GOOGLE_SEARCH_SERVICE_URL}/health")
        google_healthy = google_response.status_code == 200

        return {
            "status": "healthy" if (rss_healthy and google_healthy) else "degraded",
            "services": {
                "rss_feed": "healthy" if rss_healthy else "unhealthy",
                "google_search": "healthy" if google_healthy else "unhealthy"
            },
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        return {
            "status": "unhealthy",
            "error": str(e),
            "timestamp": datetime.now().isoformat()
        }


def _search_phrase(title: str) -> str:
    """Return the headline part of an RSS title, without its trailing suffix.

    Titles are assumed to look like "<headline> - <publisher>"
    (NOTE(review): confirm against a live feed). The previous code took the
    text *after* the last " - ", i.e. the suffix, as the search phrase.
    """
    headline, separator, _suffix = title.rpartition(" - ")
    if separator and headline:
        return headline
    return title


async def _fetch_rss(path: str, params: Dict[str, Any]) -> Dict[str, Any]:
    """GET an RSS-service endpoint and return its JSON payload.

    Raises:
        httpx.HTTPStatusError: the RSS service answered with an error status.
        HTTPException: the payload reports ``success`` as falsy.
    """
    rss_response = await client.get(f"{RSS_SERVICE_URL}{path}", params=params)
    rss_response.raise_for_status()
    rss_data = rss_response.json()
    if not rss_data.get("success"):
        raise HTTPException(
            status_code=500,
            detail=f"RSS 피드 가져오기 실패: {rss_data.get('error')}"
        )
    return rss_data


async def _expand_headlines(
    headlines: List[tuple],
    google_results_per_title: int,
    lang: str,
    country: str
) -> List[NewsItem]:
    """Search Google for every (title, link) pair and build the NewsItems."""
    phrases = [_search_phrase(title) for title, _link in headlines]

    # Run all searches concurrently; a failed search yields an empty list.
    logger.info(f"Searching Google for {len(phrases)} RSS entries")
    search_results = await asyncio.gather(
        *(search_google(phrase, google_results_per_title, lang, country)
          for phrase in phrases),
        return_exceptions=True
    )

    news_items = []
    for (title, link), phrase, found in zip(headlines, phrases, search_results):
        news_items.append(NewsItem(
            rss_title=title,
            rss_link=link,
            google_results=[] if isinstance(found, BaseException) else found,
            search_keyword=phrase,
            timestamp=datetime.now()
        ))
    return news_items


def _build_response(
    keyword: str,
    rss_data: Dict[str, Any],
    news_items: List[NewsItem],
    start_time: datetime
) -> AggregatedNews:
    """Assemble the AggregatedNews payload, including elapsed seconds."""
    return AggregatedNews(
        keyword=keyword,
        rss_feed_url=rss_data.get("feed_url", ""),
        total_rss_entries=rss_data.get("entry_count", 0),
        processed_entries=len(news_items),
        news_items=news_items,
        processing_time=(datetime.now() - start_time).total_seconds()
    )


@app.get("/api/aggregate", response_model=AggregatedNews)
async def aggregate_news(
    q: str = Query(..., description="검색 키워드"),
    limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
    google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """
    Search the RSS feed by keyword, then search Google for each headline.

    1. Fetch the Google News RSS feed for the keyword
    2. Search Google for each RSS headline
    3. Return the combined result
    """
    start_time = datetime.now()

    try:
        # Step 1: Get RSS feed from keyword
        logger.info(f"Fetching RSS feed for keyword: {q}")
        rss_data = await _fetch_rss(
            "/api/google-rss/search",
            {"q": q, "lang": lang, "country": country}
        )

        # Step 2: prefer full entries; fall back to sample_titles when the
        # payload carries no "entries" field.
        entries = rss_data.get("entries", [])
        if entries:
            entries = entries[:limit]
        else:
            titles = rss_data.get("sample_titles", [])[:limit]
            entries = [{"title": title, "link": "", "published": ""} for title in titles]

        headlines = [(entry.get("title", ""), entry.get("link", "")) for entry in entries]
        news_items = await _expand_headlines(
            headlines, google_results_per_title, lang, country
        )
        return _build_response(q, rss_data, news_items, start_time)

    except HTTPException:
        # Already a well-formed API error; do not wrap it a second time.
        raise
    except httpx.HTTPStatusError as e:
        logger.error(f"HTTP error: {e}")
        raise HTTPException(status_code=e.response.status_code, detail=str(e))
    except Exception as e:
        logger.error(f"Error in aggregate_news: {e}")
        raise HTTPException(status_code=500, detail=str(e))


async def search_google(query: str, num_results: int, lang: str, country: str) -> List[Dict[str, Any]]:
    """Call the Google search service, preferring the full-content API.

    Tries /api/search/full first. On any error it falls back to /api/search.
    If that fails too the error is logged and an empty list is returned, so
    one failed headline never aborts the whole aggregation.
    """
    params = {
        "q": query,
        "num": num_results,
        "lang": lang,
        "country": country
    }

    try:
        response = await client.get(
            f"{GOOGLE_SEARCH_SERVICE_URL}/api/search/full",
            params=params
        )
        response.raise_for_status()
        results = response.json().get("results", [])
        # The results already include full_content, so return them as they are.
        logger.info(f"Google search for '{query}' returned {len(results)} results with full content")
        return results
    except Exception as e:
        logger.error(f"Google search error for '{query}': {e}")

    # Fallback to basic search without full content
    try:
        response = await client.get(
            f"{GOOGLE_SEARCH_SERVICE_URL}/api/search",
            params=params
        )
        response.raise_for_status()
        return response.json().get("results", [])
    except Exception as fallback_error:
        # Exception, not a bare except: task cancellation must propagate.
        logger.error(f"Google search fallback error for '{query}': {fallback_error}")
        return []


@app.get("/api/aggregate/location", response_model=AggregatedNews)
async def aggregate_news_by_location(
    location: str = Query(..., description="지역명 (예: Seoul, Tokyo)"),
    limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
    google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Fetch the location-based RSS feed and search Google for each headline."""
    start_time = datetime.now()

    try:
        # Get location-based RSS feed
        logger.info(f"Fetching RSS feed for location: {location}")
        rss_data = await _fetch_rss(
            "/api/google-rss/location",
            {"location": location, "lang": lang, "country": country}
        )

        # Only the headline texts from "sample_titles" are used here.
        titles = rss_data.get("sample_titles", [])[:limit]
        news_items = await _expand_headlines(
            [(title, None) for title in titles],
            google_results_per_title, lang, country
        )
        return _build_response(f"Location: {location}", rss_data, news_items, start_time)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in aggregate_news_by_location: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/api/aggregate/topic", response_model=AggregatedNews)
async def aggregate_news_by_topic(
    category: str = Query(..., description="카테고리 (TECHNOLOGY, BUSINESS, HEALTH 등)"),
    limit: int = Query(10, description="처리할 RSS 항목 수", ge=1, le=50),
    google_results_per_title: int = Query(5, description="각 제목당 구글 검색 결과 수", ge=1, le=10),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Fetch the topic RSS feed and search Google for each headline."""
    start_time = datetime.now()

    try:
        # Get topic-based RSS feed
        logger.info(f"Fetching RSS feed for topic: {category}")
        rss_data = await _fetch_rss(
            "/api/google-rss/topic",
            {"category": category, "lang": lang, "country": country}
        )

        # Only the headline texts from "sample_titles" are used here.
        titles = rss_data.get("sample_titles", [])[:limit]
        news_items = await _expand_headlines(
            [(title, None) for title in titles],
            google_results_per_title, lang, country
        )
        return _build_response(f"Topic: {category}", rss_data, news_items, start_time)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in aggregate_news_by_topic: {e}")
        raise HTTPException(status_code=500, detail=str(e))
#!/usr/bin/env python3
"""
News Aggregator Service Test.

Manual integration script that calls a running aggregator instance and prints
how much Google "full content" came back for each RSS headline.
"""
import asyncio
import httpx
import json
from datetime import datetime
from typing import Dict, Any

# Service URL
SERVICE_URL = "http://localhost:8018"


def _content_text(result: Dict[str, Any]) -> str:
    """Return the result's ``full_content`` as text (non-strings via str())."""
    content = result['full_content']
    return content if isinstance(content, str) else str(content)


async def test_aggregate_with_full_content():
    """Query /api/aggregate by keyword and print per-result content details.

    Returns True when the service answered 200, False otherwise.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        print("\n" + "="*60)
        print("뉴스 통합 서비스 Full Content 테스트")
        print("="*60)

        # Test with keyword "인공지능"
        print("\n1. 키워드 '인공지능'으로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)

        response = await client.get(
            f"{SERVICE_URL}/api/aggregate",
            params={
                "q": "인공지능",
                "limit": 3,  # only three entries for the test
                "google_results_per_title": 2,  # two Google results per title
                "lang": "ko",
                "country": "KR"
            }
        )

        if response.status_code != 200:
            print(f"✗ 오류: {response.status_code}")
            print(f" 상세: {response.text}")
            return False

        data = response.json()
        print(f"✓ RSS 피드 URL: {data['rss_feed_url']}")
        print(f"✓ 전체 RSS 항목 수: {data['total_rss_entries']}")
        print(f"✓ 처리된 항목 수: {data['processed_entries']}")
        print(f"✓ 처리 시간: {data['processing_time']:.2f}초")

        # Check each news item for full content
        for i, item in enumerate(data['news_items'], 1):
            print(f"\n [{i}] RSS 제목: {item['rss_title'][:50]}...")
            print(f" 검색 키워드: {item['search_keyword'][:50]}...")
            print(f" 구글 검색 결과 수: {len(item['google_results'])}")

            for j, result in enumerate(item['google_results'], 1):
                has_full_content = 'full_content' in result

                print(f" - 결과 {j}: {result.get('title', 'N/A')[:40]}...")
                print(f" Full Content 포함: {'✓' if has_full_content else '✗'}")
                if not has_full_content:
                    continue

                text = _content_text(result)
                print(f" Content 길이: {len(text):,} 문자")
                if isinstance(result['full_content'], str):
                    # Show first 200 chars of content
                    preview = text[:200].replace('\n', ' ')
                    print(f" 미리보기: {preview}...")
                else:
                    print(f" Content 타입: {type(result['full_content'])}")
                    print(f" Content 데이터: {text[:200]}...")
        return True


async def test_aggregate_by_location():
    """Query /api/aggregate/location and print full-content statistics.

    Returns True when the service answered 200, False otherwise.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        print("\n" + "="*60)
        print("지역 기반 뉴스 통합 Full Content 테스트")
        print("="*60)

        print("\n2. 지역 'Seoul'로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)

        response = await client.get(
            f"{SERVICE_URL}/api/aggregate/location",
            params={
                "location": "Seoul",
                "limit": 2,
                "google_results_per_title": 2,
                "lang": "ko",
                "country": "KR"
            }
        )

        if response.status_code != 200:
            print(f"✗ 오류: {response.status_code}")
            return False

        data = response.json()
        print(f"✓ 지역: {data['keyword']}")
        print(f"✓ RSS 피드 URL: {data['rss_feed_url']}")
        print(f"✓ 처리된 항목 수: {data['processed_entries']}")

        # Check full content availability
        sizes = [
            len(_content_text(result))
            for item in data['news_items']
            for result in item['google_results']
            if 'full_content' in result
        ]
        full_content_count = len(sizes)
        total_content_size = sum(sizes)

        print(f"\n📊 Full Content 통계:")
        print(f" - Full Content 포함 결과: {full_content_count}개")
        print(f" - 전체 Content 크기: {total_content_size:,} 문자")
        # max(..., 1) avoids dividing by zero when nothing had full content.
        print(f" - 평균 Content 크기: {total_content_size//max(full_content_count, 1):,} 문자")
        return True


async def test_aggregate_by_topic():
    """Query /api/aggregate/topic and print simple content-quality checks.

    Returns True when the service answered 200, False otherwise.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        print("\n" + "="*60)
        print("주제별 뉴스 통합 Full Content 테스트")
        print("="*60)

        print("\n3. 주제 'TECHNOLOGY'로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)

        response = await client.get(
            f"{SERVICE_URL}/api/aggregate/topic",
            params={
                "category": "TECHNOLOGY",
                "limit": 2,
                "google_results_per_title": 3,
                "lang": "ko",
                "country": "KR"
            }
        )

        if response.status_code != 200:
            print(f"✗ 오류: {response.status_code}")
            return False

        data = response.json()
        print(f"✓ 주제: {data['keyword']}")
        print(f"✓ 처리 시간: {data['processing_time']:.2f}초")

        # Analyze content quality for AI summarization
        print("\n📝 AI 요약을 위한 Content 품질 분석:")
        for i, item in enumerate(data['news_items'], 1):
            print(f"\n 뉴스 항목 {i}:")
            for j, result in enumerate(item['google_results'], 1):
                if 'full_content' not in result:
                    continue
                text = _content_text(result)

                # NOTE(review): the original tested a second paragraph marker
                # whose literal is unreadable in this dump; only the blank-line
                # check is kept. Confirm against the real file.
                has_paragraphs = '\n\n' in text
                has_sufficient_length = len(text) > 500
                # Hangul syllables occupy U+AC00..U+D7A3; sample the first 100 chars.
                has_korean = any(0xAC00 <= ord(char) <= 0xD7A3 for char in text[:100])

                print(f" 결과 {j} 품질 체크:")
                print(f" - 충분한 길이 (>500자): {'✓' if has_sufficient_length else '✗'}")
                print(f" - 단락 구조 포함: {'✓' if has_paragraphs else '✗'}")
                print(f" - 한국어 콘텐츠: {'✓' if has_korean else '✗'}")
                print(f" - AI 요약 가능: {'✓' if (has_sufficient_length and has_paragraphs) else '✗'}")
        return True


async def test_health_check():
    """Call /health and print the aggregator and upstream service states.

    Returns True when the service answered 200, False otherwise.
    """
    async with httpx.AsyncClient() as client:
        print("\n" + "="*60)
        print("서비스 Health Check")
        print("="*60)

        response = await client.get(f"{SERVICE_URL}/health")
        if response.status_code != 200:
            print(f"✗ Health check 실패: {response.status_code}")
            return False

        data = response.json()
        print(f"✓ 통합 서비스 상태: {data['status']}")
        print(f" - RSS 서비스: {data['services']['rss_feed']}")
        print(f" - Google 검색 서비스: {data['services']['google_search']}")
        return True


async def main():
    """Run every check in order and print a summary of the real outcome."""
    print("\n" + "="*70)
    print(" News Aggregator Full Content Integration Test ")
    print(" RSS 피드 + Google Full Content 통합 테스트 ")
    print("="*70)

    # Run tests
    outcomes = [
        await test_health_check(),
        await test_aggregate_with_full_content(),
        await test_aggregate_by_location(),
        await test_aggregate_by_topic(),
    ]

    print("\n" + "="*70)
    print(" 테스트 완료 - Full Content 통합 확인 ")
    print("="*70)

    # Claim success only when every check actually got a 200 response.
    if all(outcomes):
        print("\n✅ 모든 테스트가 완료되었습니다.")
        print(" RSS 피드 제목을 구글 full content로 검색하는 기능이 정상 작동합니다.")
        print(" AI 요약을 위한 충분한 콘텐츠가 수집되고 있습니다.")
    else:
        print(f"\n✗ {outcomes.count(False)}개 테스트 실패")


if __name__ == "__main__":
    asyncio.run(main())
"""
Google News RSS Feed Generator.

Builds Google News RSS feed URLs (search, topic, location, top headlines).
"""
from typing import Optional, List, Union
from urllib.parse import quote, quote_plus
from enum import Enum


class GoogleNewsCategory(str, Enum):
    """Google News topic sections usable in a topic feed URL."""
    WORLD = "WORLD"
    NATION = "NATION"
    BUSINESS = "BUSINESS"
    TECHNOLOGY = "TECHNOLOGY"
    ENTERTAINMENT = "ENTERTAINMENT"
    SPORTS = "SPORTS"
    SCIENCE = "SCIENCE"
    HEALTH = "HEALTH"


class GoogleNewsRSS:
    """Builder for Google News RSS feed URLs."""

    BASE_URL = "https://news.google.com/rss"

    @staticmethod
    def _locale_params(lang: str, country: str) -> str:
        """Return the hl/gl/ceid query fragment shared by every feed URL."""
        return f"hl={lang}&gl={country}&ceid={country}:{lang}"

    @staticmethod
    def search_feed(query: str, lang: str = "ko", country: str = "KR") -> str:
        """
        Build the RSS feed URL for a keyword search.

        Args:
            query: search keyword
            lang: language code (ko, en, ja, zh-CN, ...)
            country: country code (KR, US, JP, CN, ...)

        Returns:
            RSS feed URL
        """
        # The keyword goes into the query string, where "+" means a space.
        encoded_query = quote_plus(query)
        locale = GoogleNewsRSS._locale_params(lang, country)
        return f"{GoogleNewsRSS.BASE_URL}/search?q={encoded_query}&{locale}"

    @staticmethod
    def topic_feed(
        category: Union[GoogleNewsCategory, str],
        lang: str = "ko",
        country: str = "KR"
    ) -> str:
        """
        Build the RSS feed URL for a news category.

        Args:
            category: a GoogleNewsCategory member or its name as a string
                (case-insensitive)
            lang: language code
            country: country code

        Returns:
            RSS feed URL

        Raises:
            ValueError: ``category`` is a string naming no known category.
        """
        if not isinstance(category, GoogleNewsCategory):
            # Accept plain strings too; previously they hit AttributeError.
            category = GoogleNewsCategory(str(category).upper())
        locale = GoogleNewsRSS._locale_params(lang, country)
        return f"{GoogleNewsRSS.BASE_URL}/headlines/section/topic/{category.value}?{locale}"

    @staticmethod
    def location_feed(location: str, lang: str = "ko", country: str = "KR") -> str:
        """
        Build the RSS feed URL for local news.

        Args:
            location: place name (e.g. Seoul, 서울, New York)
            lang: language code
            country: country code

        Returns:
            RSS feed URL
        """
        # The place name is a path segment: a space must become %20 there,
        # because "+" only stands for a space inside a query string.
        encoded_location = quote(location, safe="")
        locale = GoogleNewsRSS._locale_params(lang, country)
        return f"{GoogleNewsRSS.BASE_URL}/headlines/section/geo/{encoded_location}?{locale}"

    @staticmethod
    def trending_feed(lang: str = "ko", country: str = "KR") -> str:
        """
        Build the RSS feed URL for the top-level (trending) feed.

        Args:
            lang: language code
            country: country code

        Returns:
            RSS feed URL
        """
        return f"{GoogleNewsRSS.BASE_URL}?{GoogleNewsRSS._locale_params(lang, country)}"

    @staticmethod
    def get_common_feeds() -> List[dict]:
        """
        Return a list of commonly used RSS feeds.

        Returns:
            list of dicts with "title", "url" and "description" keys
        """
        return [
            {
                "title": "구글 뉴스 - 한국 헤드라인",
                "url": GoogleNewsRSS.trending_feed("ko", "KR"),
                "description": "한국 주요 뉴스"
            },
            {
                "title": "구글 뉴스 - 기술",
                "url": GoogleNewsRSS.topic_feed(GoogleNewsCategory.TECHNOLOGY, "ko", "KR"),
                "description": "기술 관련 뉴스"
            },
            {
                "title": "구글 뉴스 - 비즈니스",
                "url": GoogleNewsRSS.topic_feed(GoogleNewsCategory.BUSINESS, "ko", "KR"),
                "description": "비즈니스 뉴스"
            },
            {
                "title": "Google News - World",
                "url": GoogleNewsRSS.topic_feed(GoogleNewsCategory.WORLD, "en", "US"),
                "description": "World news in English"
            }
        ]
# Google News RSS Endpoints

def _feed_preview(feed_url: str, result: dict) -> dict:
    """Summarise a parse result: URL, success flag, title, count, 5 titles.

    ``result["feed"]`` and ``result["entries"]`` are only read when
    ``result["success"]`` is truthy, as in the original endpoints.
    """
    ok = result["success"]
    entries = result["entries"] if ok else []
    return {
        "feed_url": feed_url,
        "success": ok,
        "feed_title": result["feed"].get("title", "Google News") if ok else None,
        "entry_count": len(entries),
        "sample_titles": [entry.get("title", "") for entry in entries[:5]],
    }


@app.get("/api/google-rss/search")
async def get_google_search_rss(
    q: str = Query(..., description="검색 키워드"),
    lang: str = Query("ko", description="언어 코드 (ko, en, ja, zh-CN 등)"),
    country: str = Query("KR", description="국가 코드 (KR, US, JP, CN 등)")
):
    """Build the Google News search feed URL and return a parsed preview.

    Besides the common preview fields, the response carries up to 20 entries
    (title, link, published, summary cut to 200 characters).
    """
    feed_url = GoogleNewsRSS.search_feed(q, lang, country)

    # Parse the feed to verify it and collect the preview data.
    result = await parser.parse_feed(feed_url)
    entries = result["entries"][:20] if result["success"] else []

    return {
        "keyword": q,
        **_feed_preview(feed_url, result),
        "entries": [
            {
                "title": entry.get("title", ""),
                "link": entry.get("link", ""),
                "published": entry.get("published", ""),
                "summary": (entry.get("summary") or "")[:200]
            } for entry in entries
        ],
        "error": result.get("error")
    }


@app.get("/api/google-rss/topic")
async def get_google_topic_rss(
    category: GoogleNewsCategory = Query(..., description="뉴스 카테고리"),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Build the Google News category feed URL and return a parsed preview."""
    feed_url = GoogleNewsRSS.topic_feed(category, lang, country)
    result = await parser.parse_feed(feed_url)

    return {
        "category": category,
        **_feed_preview(feed_url, result),
        "error": result.get("error")
    }


@app.get("/api/google-rss/location")
async def get_google_location_rss(
    location: str = Query(..., description="지역명 (예: Seoul, 서울, New York)"),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Build the Google News location feed URL and return a parsed preview."""
    feed_url = GoogleNewsRSS.location_feed(location, lang, country)
    result = await parser.parse_feed(feed_url)

    return {
        "location": location,
        **_feed_preview(feed_url, result),
        "error": result.get("error")
    }


@app.get("/api/google-rss/trending")
async def get_google_trending_rss(
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드")
):
    """Build the Google News trending feed URL and return a parsed preview."""
    feed_url = GoogleNewsRSS.trending_feed(lang, country)
    result = await parser.parse_feed(feed_url)

    return {
        **_feed_preview(feed_url, result),
        "error": result.get("error")
    }


@app.post("/api/google-rss/subscribe")
async def subscribe_google_rss(
    q: Optional[str] = Query(None, description="검색 키워드"),
    category: Optional[GoogleNewsCategory] = Query(None, description="카테고리"),
    location: Optional[str] = Query(None, description="지역명"),
    trending: bool = Query(False, description="트렌딩 뉴스"),
    lang: str = Query("ko", description="언어 코드"),
    country: str = Query("KR", description="국가 코드"),
    background_tasks: BackgroundTasks = ...
):
    """Subscribe to a Google News RSS feed.

    The first of ``q``, ``category``, ``location``, ``trending`` that is set
    selects the feed. Answers 400 when none is given, when the feed URL is
    already stored, or when the feed cannot be parsed. On success the
    subscription is stored and an update is scheduled in the background.
    """
    # Build the feed URL and title from the first selector provided.
    if q:
        feed_url = GoogleNewsRSS.search_feed(q, lang, country)
        feed_title = f"Google News - {q}"
    elif category:
        feed_url = GoogleNewsRSS.topic_feed(category, lang, country)
        feed_title = f"Google News - {category.value}"
    elif location:
        feed_url = GoogleNewsRSS.location_feed(location, lang, country)
        feed_title = f"Google News - {location}"
    elif trending:
        feed_url = GoogleNewsRSS.trending_feed(lang, country)
        feed_title = f"Google News - Trending ({country})"
    else:
        raise HTTPException(status_code=400, detail="검색어, 카테고리, 지역 중 하나를 지정해주세요")

    # Reject duplicates.
    existing = await db.feeds.find_one({"url": feed_url})
    if existing:
        raise HTTPException(status_code=400, detail="이미 구독 중인 피드입니다")

    # Parse the feed before storing anything.
    parsed = await parser.parse_feed(feed_url)
    if not parsed["success"]:
        # .get matches the preview endpoints and cannot raise KeyError when
        # the parser omits "error".
        raise HTTPException(status_code=400, detail=f"피드 파싱 실패: {parsed.get('error')}")

    # Create the subscription.
    feed = FeedSubscription(
        title=feed_title,
        url=feed_url,
        description=parsed["feed"].get("description", "Google News Feed"),
        category=FeedCategory.NEWS,
        update_interval=900  # 15 minutes
    )

    # Store it; the URL is converted to a plain string first.
    feed_dict = feed.dict()
    feed_dict["url"] = str(feed_dict["url"])
    inserted = await db.feeds.insert_one(feed_dict)
    feed.id = str(inserted.inserted_id)

    # Fetch the entries in the background.
    background_tasks.add_task(update_feed, feed.id)

    return feed
backup-services/rss-feed/backend/app/models.py diff --git a/services/rss-feed/backend/requirements.txt b/backup-services/rss-feed/backend/requirements.txt similarity index 100% rename from services/rss-feed/backend/requirements.txt rename to backup-services/rss-feed/backend/requirements.txt diff --git a/config/api-keys-backup.env b/config/api-keys-backup.env new file mode 100644 index 0000000..4cd3975 --- /dev/null +++ b/config/api-keys-backup.env @@ -0,0 +1,18 @@ +# API Keys Backup - Created on 2025-01-13 +# Template only: never commit real keys. Keep actual values in an untracked .env file or a secret manager. + +# Claude API Key +CLAUDE_API_KEY=your_claude_api_key + +# Google APIs +GOOGLE_API_KEY=your_google_api_key +GOOGLE_SEARCH_ENGINE_ID=your_search_engine_id + +# Translation (DeepL) +DEEPL_API_KEY=your_deepl_api_key + +# Image Generation (Replicate) +REPLICATE_API_TOKEN=your_replicate_api_token + +# Additional APIs (필요시 추가) +# SERPAPI_KEY= \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index df5a521..0895e1d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: console-frontend: build: @@ -412,6 +410,230 @@ services: timeout: 10s retries: 3 + # News Aggregator Service + news-aggregator-backend: + build: + context: ./services/news-aggregator/backend + dockerfile: Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_news_aggregator_backend + ports: + - "8018:8000" + environment: + - RSS_SERVICE_URL=http://rss-feed-backend:8000 + - GOOGLE_SEARCH_SERVICE_URL=http://google-search-backend:8000 + depends_on: + - rss-feed-backend + - google-search-backend + networks: + - site11_network + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + + # AI Writer Service + ai-writer-backend: + build: + context: 
./services/ai-writer/backend + dockerfile: Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_ai_writer_backend + ports: + - "8019:8000" + environment: + - NEWS_AGGREGATOR_URL=http://news-aggregator-backend:8000 + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + - MONGODB_URL=mongodb://mongodb:27017 + - DB_NAME=ai_writer_db + - REDIS_URL=redis://redis:6379 + depends_on: + - mongodb + - redis + - news-aggregator-backend + networks: + - site11_network + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + + # AI Writer Worker Service + ai-writer-worker: + build: + context: ./services/ai-writer + dockerfile: worker/Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_ai_writer_worker + environment: + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + - MONGODB_URL=mongodb://mongodb:27017 + - DB_NAME=ai_writer_db + - REDIS_URL=redis://redis:6379 + - WORKER_COUNT=3 + depends_on: + - mongodb + - redis + - ai-writer-backend + networks: + - site11_network + restart: unless-stopped + + # ============ Pipeline Services ============ + # Pipeline Scheduler Service + pipeline-scheduler: + build: + context: ./services/pipeline + dockerfile: scheduler/Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_pipeline_scheduler + restart: unless-stopped + depends_on: + - redis + - mongodb + environment: + - REDIS_URL=redis://redis:6379 + - MONGODB_URL=mongodb://mongodb:27017 + - DB_NAME=pipeline_db + - LOG_LEVEL=INFO + volumes: + - ./services/pipeline/shared:/app/shared:ro + networks: + - site11_network + + # Pipeline RSS Collector Worker + pipeline-rss-collector: + build: + context: ./services/pipeline + dockerfile: rss-collector/Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_pipeline_rss_collector + restart: unless-stopped + 
depends_on: + - redis + environment: + - REDIS_URL=redis://redis:6379 + - LOG_LEVEL=INFO + volumes: + - ./services/pipeline/shared:/app/shared:ro + networks: + - site11_network + + # Pipeline Google Search Worker + pipeline-google-search: + build: + context: ./services/pipeline + dockerfile: google-search/Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_pipeline_google_search + restart: unless-stopped + depends_on: + - redis + environment: + - REDIS_URL=redis://redis:6379 + - GOOGLE_API_KEY=${GOOGLE_API_KEY:-} + - GOOGLE_SEARCH_ENGINE_ID=${GOOGLE_SEARCH_ENGINE_ID:-} + - LOG_LEVEL=INFO + volumes: + - ./services/pipeline/shared:/app/shared:ro + networks: + - site11_network + + # Pipeline AI Summarizer Worker + pipeline-ai-summarizer: + build: + context: ./services/pipeline + dockerfile: ai-summarizer/Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_pipeline_ai_summarizer + restart: unless-stopped + depends_on: + - redis + environment: + - REDIS_URL=redis://redis:6379 + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + - LOG_LEVEL=INFO + volumes: + - ./services/pipeline/shared:/app/shared:ro + networks: + - site11_network + + # Pipeline Article Assembly Worker + pipeline-article-assembly: + build: + context: ./services/pipeline + dockerfile: article-assembly/Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_pipeline_article_assembly + restart: unless-stopped + depends_on: + - redis + - mongodb + environment: + - REDIS_URL=redis://redis:6379 + - MONGODB_URL=mongodb://mongodb:27017 + - DB_NAME=pipeline_db + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + - LOG_LEVEL=INFO + volumes: + - ./services/pipeline/shared:/app/shared:ro + networks: + - site11_network + + # Pipeline Monitor (optional dashboard) + pipeline-monitor: + build: + context: ./services/pipeline + dockerfile: monitor/Dockerfile + 
container_name: ${COMPOSE_PROJECT_NAME}_pipeline_monitor + restart: unless-stopped + depends_on: + - redis + - mongodb + ports: + - "8100:8000" + environment: + - REDIS_URL=redis://redis:6379 + - MONGODB_URL=mongodb://mongodb:27017 + - DB_NAME=pipeline_db + - LOG_LEVEL=INFO + volumes: + - ./services/pipeline/shared:/app/shared:ro + networks: + - site11_network + + # Pipeline Translator + pipeline-translator: + build: + context: ./services/pipeline + dockerfile: translator/Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_pipeline_translator + restart: unless-stopped + depends_on: + - redis + environment: + - REDIS_URL=redis://redis:6379 + - DEEPL_API_KEY=${DEEPL_API_KEY:-} + - LOG_LEVEL=INFO + volumes: + - ./services/pipeline/shared:/app/shared:ro + networks: + - site11_network + + # Pipeline Image Generator + pipeline-image-generator: + build: + context: ./services/pipeline + dockerfile: image-generator/Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_pipeline_image_generator + restart: unless-stopped + depends_on: + - redis + environment: + - REDIS_URL=redis://redis:6379 + - REPLICATE_API_KEY=${REPLICATE_API_KEY:-} + - LOG_LEVEL=INFO + volumes: + - ./services/pipeline/shared:/app/shared:ro + networks: + - site11_network + networks: site11_network: driver: bridge diff --git a/generated_article.json b/generated_article.json new file mode 100644 index 0000000..e883c5f --- /dev/null +++ b/generated_article.json @@ -0,0 +1,63 @@ +{ + "news_id": "dda43a2b-8478-4bd8-be74-32ab2618a7dd", + "title": "2025년 대한민국 AI 전환 가속화...정부 10조원 투자 계획 발표", + "created_at": "2025-09-12T19:33:52.388833", + "summary": "정부가 내년 인공지능 분야에 10조원 투자 계획을 발표하며 국가 AI 경쟁력 강화에 나서, 산업 전반의 디지털 전환 가속화 전망", + "subtopics": [ + { + "title": "정부의 대규모 AI 투자 계획", + "content": [ + "정부가 2025년 인공지능 분야에 10조원 규모의 대규모 투자를 단행할 계획을 발표했다. 이는 한국 경제의 체질 개선과 디지털 전환 가속화를 위한 전략적 결정으로 평가받고 있다.", + "투자의 주요 방향은 AI 기술 개발, 인프라 구축, 인재 양성 등 다방면에 걸쳐있다. 
특히 피지컬 AI 실증단지 조성과 같은 실용적 프로젝트들이 포함되어 있어 실질적인 산업 발전 효과가 기대된다.", + "대통령실은 AI 정책 추진을 위한 전담 조직을 신설하고, 정부와 업계 간의 가교 역할을 수행할 예정이다. 이를 통해 민관 협력 체계를 강화하고 정책 실행력을 높일 계획이다." + ] + }, + { + "title": "국내 기업들의 AI 기술 경쟁력 강화", + "content": [ + "SK하이닉스는 AI 메모리 분야에서 HBM4 양산을 시작하며 글로벌 기술 경쟁에서 우위를 점하고 있다. 특히 10Gbps 이상의 동작속도 구현에 성공하며 기술력을 입증했다.", + "국내 주요 기업들은 AI 관련 연구개발 투자를 확대하고 있으며, 특히 반도체, 로봇, 소프트웨어 분야에서 괄목할만한 성과를 보이고 있다.", + "산업계는 정부의 대규모 투자 계획에 호응하여 자체적인 AI 혁신 프로그램을 가속화하고 있으며, 글로벌 시장에서의 경쟁력 강화를 위해 총력을 기울이고 있다." + ] + }, + { + "title": "AI 기본법 시행과 제도적 기반 마련", + "content": [ + "정부는 AI 기본법 시행령을 공개하며 인공지능 발전을 위한 제도적 기반을 마련했다. 이를 통해 AI 산업 발전의 법적 근거와 윤리적 가이드라인이 확립될 전망이다.", + "새로운 법제도는 AI 기술의 안전한 발전과 윤리적 활용을 보장하면서도, 기업들의 혁신을 저해하지 않는 균형잡힌 접근을 추구하고 있다.", + "특히 AI 기본법은 개인정보 보호, 알고리즘 투명성, 책임성 등 주요 이슈들에 대한 명확한 기준을 제시하여 산업계의 불확실성을 해소할 것으로 기대된다." + ] + } + ], + "categories": [ + "기술", + "정책", + "산업", + "경제" + ], + "entities": { + "people": [ + "강훈식", + "배경훈 과기정통부 장관" + ], + "organizations": [ + "SK하이닉스", + "대통령실", + "과학기술정보통신부" + ], + "groups": [ + "AI 산업계", + "반도체 업계" + ], + "countries": [ + "대한민국", + "미국" + ], + "events": [ + "AI 기본법 시행", + "정부 10조원 투자 계획 발표" + ] + }, + "source_keyword": "인공지능", + "source_count": 5 +} \ No newline at end of file diff --git a/services/pipeline/Makefile b/services/pipeline/Makefile new file mode 100644 index 0000000..0158cf5 --- /dev/null +++ b/services/pipeline/Makefile @@ -0,0 +1,90 @@ +# Pipeline Makefile + +.PHONY: help build up down restart logs clean test monitor + +help: + @echo "Pipeline Management Commands:" + @echo " make build - Build all Docker images" + @echo " make up - Start all services" + @echo " make down - Stop all services" + @echo " make restart - Restart all services" + @echo " make logs - View logs for all services" + @echo " make clean - Clean up containers and volumes" + @echo " make monitor - Open monitor dashboard" + @echo " make test - Test pipeline with sample keyword" + +build: + docker-compose build + +up: + docker-compose up -d 
+ +down: + docker-compose down + +restart: + docker-compose restart + +logs: + docker-compose logs -f + +clean: + docker-compose down -v + docker system prune -f + +monitor: + @echo "Opening monitor dashboard..." + @echo "Dashboard: http://localhost:8100" + @echo "API Docs: http://localhost:8100/docs" + +test: + @echo "Testing pipeline with sample keyword..." + curl -X POST http://localhost:8100/api/keywords \ + -H "Content-Type: application/json" \ + -d '{"keyword": "테스트", "schedule": "30min"}' + @echo "\nTriggering immediate processing..." + curl -X POST http://localhost:8100/api/trigger/테스트 + +# Service-specific commands +scheduler-logs: + docker-compose logs -f scheduler + +rss-logs: + docker-compose logs -f rss-collector + +search-logs: + docker-compose logs -f google-search + +summarizer-logs: + docker-compose logs -f ai-summarizer + +assembly-logs: + docker-compose logs -f article-assembly + +monitor-logs: + docker-compose logs -f monitor + +# Database commands +redis-cli: + docker-compose exec redis redis-cli + +mongo-shell: + docker-compose exec mongodb mongosh -u admin -p password123 + +# Queue management +queue-status: + @echo "Checking queue status..." + docker-compose exec redis redis-cli --raw LLEN queue:keyword + docker-compose exec redis redis-cli --raw LLEN queue:rss + docker-compose exec redis redis-cli --raw LLEN queue:search + docker-compose exec redis redis-cli --raw LLEN queue:summarize + docker-compose exec redis redis-cli --raw LLEN queue:assembly + +queue-clear: + @echo "Clearing all queues..." + docker-compose exec redis redis-cli FLUSHDB + +# Health check +health: + @echo "Checking service health..." 
+ curl -s http://localhost:8100/api/health | python3 -m json.tool \ No newline at end of file diff --git a/services/pipeline/README.md b/services/pipeline/README.md new file mode 100644 index 0000000..e8bf455 --- /dev/null +++ b/services/pipeline/README.md @@ -0,0 +1,154 @@ +# News Pipeline System + +비동기 큐 기반 뉴스 생성 파이프라인 시스템 + +## 아키텍처 + +``` +Scheduler → RSS Collector → Google Search → AI Summarizer → Article Assembly → MongoDB + ↓ ↓ ↓ ↓ ↓ + Redis Queue Redis Queue Redis Queue Redis Queue Redis Queue +``` + +## 서비스 구성 + +### 1. Scheduler +- 30분마다 등록된 키워드 처리 +- 오전 7시, 낮 12시, 저녁 6시 우선 처리 +- MongoDB에서 키워드 로드 후 큐에 작업 생성 + +### 2. RSS Collector +- RSS 피드 수집 (Google News RSS) +- 7일간 중복 방지 (Redis Set) +- 키워드 관련성 필터링 + +### 3. Google Search +- RSS 아이템별 추가 검색 결과 수집 +- 아이템당 최대 3개 결과 +- 작업당 최대 5개 아이템 처리 + +### 4. AI Summarizer +- Claude Sonnet 4(`claude-sonnet-4-20250514`)로 요약 생성 +- 200자 이내 한국어 요약 +- 병렬 처리 지원 (3 workers) + +### 5. Article Assembly +- Claude Sonnet으로 종합 기사 작성 +- 1500자 이내 전문 기사 +- MongoDB 저장 및 통계 업데이트 + +### 6. Monitor +- 실시간 파이프라인 모니터링 +- 큐 상태, 워커 상태 확인 +- REST API 제공 (포트 8100) + +## 시작하기 + +### 1. 환경 변수 설정 +```bash +# .env 파일 확인 +CLAUDE_API_KEY=your_claude_api_key +GOOGLE_API_KEY=your_google_api_key +GOOGLE_SEARCH_ENGINE_ID=your_search_engine_id +``` + +### 2. 서비스 시작 +```bash +cd pipeline +docker-compose up -d +``` + +### 3. 
모니터링 +```bash +# 로그 확인 +docker-compose logs -f + +# 특정 서비스 로그 +docker-compose logs -f scheduler + +# 모니터 API +curl http://localhost:8100/api/stats +``` + +## API 엔드포인트 + +### Monitor API (포트 8100) + +- `GET /api/stats` - 전체 통계 +- `GET /api/queues/{queue_name}` - 큐 상세 정보 +- `GET /api/keywords` - 키워드 목록 +- `POST /api/keywords` - 키워드 등록 +- `DELETE /api/keywords/{id}` - 키워드 삭제 +- `GET /api/articles` - 기사 목록 +- `GET /api/articles/{id}` - 기사 상세 +- `GET /api/workers` - 워커 상태 +- `POST /api/trigger/{keyword}` - 수동 처리 트리거 +- `GET /api/health` - 헬스 체크 + +## 키워드 등록 예시 + +```bash +# 새 키워드 등록 +curl -X POST http://localhost:8100/api/keywords \ + -H "Content-Type: application/json" \ + -d '{"keyword": "인공지능", "schedule": "30min"}' + +# 수동 처리 트리거 +curl -X POST http://localhost:8100/api/trigger/인공지능 +``` + +## 데이터베이스 + +### MongoDB Collections +- `keywords` - 등록된 키워드 +- `articles` - 생성된 기사 +- `keyword_stats` - 키워드별 통계 + +### Redis Keys +- `queue:*` - 작업 큐 +- `processing:*` - 처리 중 작업 +- `failed:*` - 실패한 작업 +- `dedup:rss:*` - RSS 중복 방지 +- `workers:*:active` - 활성 워커 + +## 트러블슈팅 + +### 큐 초기화 +```bash +docker-compose exec redis redis-cli FLUSHDB +``` + +### 워커 재시작 +```bash +docker-compose restart rss-collector +``` + +### 데이터베이스 접속 +```bash +# MongoDB +docker-compose exec mongodb mongosh -u admin -p password123 + +# Redis +docker-compose exec redis redis-cli +``` + +## 스케일링 + +워커 수 조정: +```yaml +# docker-compose.yml +ai-summarizer: + deploy: + replicas: 5 # 워커 수 증가 +``` + +## 모니터링 대시보드 + +브라우저에서 http://localhost:8100 접속하여 파이프라인 상태 확인 + +## 로그 레벨 설정 + +`.env` 파일에서 조정: +``` +LOG_LEVEL=DEBUG # INFO, WARNING, ERROR +``` \ No newline at end of file diff --git a/services/pipeline/ai-summarizer/Dockerfile b/services/pipeline/ai-summarizer/Dockerfile new file mode 100644 index 0000000..efdb0c6 --- /dev/null +++ b/services/pipeline/ai-summarizer/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# 의존성 설치 +COPY ./ai-summarizer/requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# 공통 모듈 복사 +COPY ./shared /app/shared + +# AI Summarizer 코드 복사 +COPY ./ai-summarizer /app + +# 환경변수 +ENV PYTHONUNBUFFERED=1 + +# 실행 +CMD ["python", "ai_summarizer.py"] \ No newline at end of file diff --git a/services/pipeline/ai-summarizer/ai_summarizer.py b/services/pipeline/ai-summarizer/ai_summarizer.py new file mode 100644 index 0000000..77209fb --- /dev/null +++ b/services/pipeline/ai-summarizer/ai_summarizer.py @@ -0,0 +1,161 @@ +""" +AI Summarizer Service +Claude API를 사용한 뉴스 요약 서비스 +""" +import asyncio +import logging +import os +import sys +from typing import List, Dict, Any +from anthropic import AsyncAnthropic + +# Import from shared module +from shared.models import PipelineJob, EnrichedItem, SummarizedItem +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class AISummarizerWorker: + def __init__(self): + self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + self.claude_api_key = os.getenv("CLAUDE_API_KEY") + self.claude_client = None + + async def start(self): + """워커 시작""" + logger.info("Starting AI Summarizer Worker") + + # Redis 연결 + await self.queue_manager.connect() + + # Claude 클라이언트 초기화 + if self.claude_api_key: + self.claude_client = AsyncAnthropic(api_key=self.claude_api_key) + else: + logger.error("Claude API key not configured") + return + + # 메인 처리 루프 + while True: + try: + # 큐에서 작업 가져오기 + job = await self.queue_manager.dequeue('ai_summarization', timeout=5) + + if job: + await self.process_job(job) + + except Exception as e: + logger.error(f"Error in worker loop: {e}") + await asyncio.sleep(1) + + async def process_job(self, job: PipelineJob): + """AI 요약 작업 처리""" + try: + logger.info(f"Processing job {job.job_id} for AI summarization") + + enriched_items = job.data.get('enriched_items', []) + summarized_items = [] + + for item_data in enriched_items: + 
enriched_item = EnrichedItem(**item_data) + + # AI 요약 생성 + summary = await self._generate_summary(enriched_item) + + summarized_item = SummarizedItem( + enriched_item=enriched_item, + ai_summary=summary, + summary_language='ko' + ) + summarized_items.append(summarized_item) + + # API 속도 제한 + await asyncio.sleep(1) + + if summarized_items: + logger.info(f"Summarized {len(summarized_items)} items") + + # 다음 단계로 전달 (번역 단계로) + job.data['summarized_items'] = [item.dict() for item in summarized_items] + job.stages_completed.append('ai_summarization') + job.stage = 'translation' + + await self.queue_manager.enqueue('translation', job) + await self.queue_manager.mark_completed('ai_summarization', job.job_id) + else: + logger.warning(f"No items summarized for job {job.job_id}") + await self.queue_manager.mark_failed( + 'ai_summarization', + job, + "No items to summarize" + ) + + except Exception as e: + logger.error(f"Error processing job {job.job_id}: {e}") + await self.queue_manager.mark_failed('ai_summarization', job, str(e)) + + async def _generate_summary(self, enriched_item: EnrichedItem) -> str: + """Claude를 사용한 요약 생성""" + try: + # 컨텐츠 준비 + content_parts = [ + f"제목: {enriched_item.rss_item.title}", + f"요약: {enriched_item.rss_item.summary or '없음'}" + ] + + # 검색 결과 추가 + if enriched_item.search_results: + content_parts.append("\n관련 검색 결과:") + for idx, result in enumerate(enriched_item.search_results[:3], 1): + content_parts.append(f"{idx}. {result.title}") + if result.snippet: + content_parts.append(f" {result.snippet}") + + content = "\n".join(content_parts) + + # Claude API 호출 + prompt = f"""다음 뉴스 내용을 200자 이내로 핵심만 요약해주세요. +중요한 사실, 수치, 인물, 조직을 포함하고 객관적인 톤을 유지하세요. 
+ +{content} + +요약:""" + + response = await self.claude_client.messages.create( + model="claude-sonnet-4-20250514", # 최신 Sonnet 모델 + max_tokens=500, + temperature=0.3, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + summary = response.content[0].text.strip() + return summary + + except Exception as e: + logger.error(f"Error generating summary: {e}") + # 폴백: 원본 요약 사용 + return enriched_item.rss_item.summary[:200] if enriched_item.rss_item.summary else enriched_item.rss_item.title + + async def stop(self): + """워커 중지""" + await self.queue_manager.disconnect() + logger.info("AI Summarizer Worker stopped") + +async def main(): + """메인 함수""" + worker = AISummarizerWorker() + + try: + await worker.start() + except KeyboardInterrupt: + logger.info("Received interrupt signal") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/services/pipeline/ai-summarizer/requirements.txt b/services/pipeline/ai-summarizer/requirements.txt new file mode 100644 index 0000000..db8aa9c --- /dev/null +++ b/services/pipeline/ai-summarizer/requirements.txt @@ -0,0 +1,3 @@ +anthropic==0.50.0 +redis[hiredis]==5.0.1 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/article-assembly/Dockerfile b/services/pipeline/article-assembly/Dockerfile new file mode 100644 index 0000000..5929d7d --- /dev/null +++ b/services/pipeline/article-assembly/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# 의존성 설치 +COPY ./article-assembly/requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# 공통 모듈 복사 +COPY ./shared /app/shared + +# Article Assembly 코드 복사 +COPY ./article-assembly /app + +# 환경변수 +ENV PYTHONUNBUFFERED=1 + +# 실행 +CMD ["python", "article_assembly.py"] \ No newline at end of file diff --git a/services/pipeline/article-assembly/article_assembly.py b/services/pipeline/article-assembly/article_assembly.py new file mode 100644 index 0000000..f798e99 --- /dev/null +++ b/services/pipeline/article-assembly/article_assembly.py @@ -0,0 +1,234 @@ +""" +Article Assembly Service +최종 기사 조립 및 MongoDB 저장 서비스 +""" +import asyncio +import logging +import os +import sys +import json +from datetime import datetime +from typing import List, Dict, Any +from anthropic import AsyncAnthropic +from motor.motor_asyncio import AsyncIOMotorClient + +# Import from shared module +from shared.models import PipelineJob, SummarizedItem, FinalArticle +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class ArticleAssemblyWorker: + def __init__(self): + self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + self.claude_api_key = os.getenv("CLAUDE_API_KEY") + self.claude_client = None + self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017") + self.db_name = os.getenv("DB_NAME", "pipeline_db") + self.db = None + + async def start(self): + """워커 시작""" + logger.info("Starting Article Assembly Worker") + + # Redis 연결 + await self.queue_manager.connect() + + # MongoDB 연결 + client = AsyncIOMotorClient(self.mongodb_url) + self.db = client[self.db_name] + + # Claude 클라이언트 초기화 + if self.claude_api_key: + self.claude_client = AsyncAnthropic(api_key=self.claude_api_key) + else: + logger.error("Claude API key not configured") + return + + # 메인 처리 루프 + while True: + try: + # 큐에서 작업 가져오기 + job = await self.queue_manager.dequeue('article_assembly', timeout=5) + + if job: + await 
self.process_job(job) + + except Exception as e: + logger.error(f"Error in worker loop: {e}") + await asyncio.sleep(1) + + async def process_job(self, job: PipelineJob): + """최종 기사 조립 작업 처리""" + try: + start_time = datetime.now() + logger.info(f"Processing job {job.job_id} for article assembly") + + summarized_items = job.data.get('summarized_items', []) + + if not summarized_items: + logger.warning(f"No items to assemble for job {job.job_id}") + await self.queue_manager.mark_failed( + 'article_assembly', + job, + "No items to assemble" + ) + return + + # 최종 기사 생성 + article = await self._generate_final_article(job, summarized_items) + + # 처리 시간 계산 + processing_time = (datetime.now() - start_time).total_seconds() + article.processing_time = processing_time + + # MongoDB에 저장 + await self.db.articles.insert_one(article.dict()) + + logger.info(f"Article {article.article_id} saved to MongoDB") + + # 완료 표시 + job.stages_completed.append('article_assembly') + await self.queue_manager.mark_completed('article_assembly', job.job_id) + + # 통계 업데이트 + await self._update_statistics(job.keyword_id) + + except Exception as e: + logger.error(f"Error processing job {job.job_id}: {e}") + await self.queue_manager.mark_failed('article_assembly', job, str(e)) + + async def _generate_final_article( + self, + job: PipelineJob, + summarized_items: List[Dict] + ) -> FinalArticle: + """Claude를 사용한 최종 기사 생성""" + + # 아이템 정보 준비 + items_text = [] + for idx, item_data in enumerate(summarized_items, 1): + item = SummarizedItem(**item_data) + items_text.append(f""" +[뉴스 {idx}] +제목: {item.enriched_item['rss_item']['title']} +요약: {item.ai_summary} +출처: {item.enriched_item['rss_item']['link']} +""") + + content = "\n".join(items_text) + + # Claude로 종합 기사 작성 + prompt = f"""다음 뉴스 항목들을 바탕으로 종합적인 기사를 작성해주세요. 
+ +키워드: {job.keyword} + +뉴스 항목들: +{content} + +다음 JSON 형식으로 작성해주세요: +{{ + "title": "종합 기사 제목", + "content": "기사 본문 (1500자 이내, 문단 구분)", + "summary": "한 줄 요약 (100자 이내)", + "categories": ["카테고리1", "카테고리2"], + "tags": ["태그1", "태그2", "태그3"] +}} + +요구사항: +- 전문적이고 객관적인 톤 +- 핵심 정보와 트렌드 파악 +- 시사점 포함 +- 한국 독자 대상""" + + try: + response = await self.claude_client.messages.create( + model="claude-sonnet-4-20250514", + max_tokens=3000, + temperature=0.7, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + # JSON 파싱 + content_text = response.content[0].text + json_start = content_text.find('{') + json_end = content_text.rfind('}') + 1 + + if json_start != -1 and json_end > json_start: + article_data = json.loads(content_text[json_start:json_end]) + else: + raise ValueError("No valid JSON in response") + + # FinalArticle 생성 + article = FinalArticle( + job_id=job.job_id, + keyword_id=job.keyword_id, + keyword=job.keyword, + title=article_data.get('title', f"{job.keyword} 종합 뉴스"), + content=article_data.get('content', ''), + summary=article_data.get('summary', ''), + source_items=[], # 간소화 + images=[], # 이미지는 별도 서비스에서 처리 + categories=article_data.get('categories', []), + tags=article_data.get('tags', []), + pipeline_stages=job.stages_completed, + processing_time=0 # 나중에 업데이트 + ) + + return article + + except Exception as e: + logger.error(f"Error generating article: {e}") + # 폴백 기사 생성 + return FinalArticle( + job_id=job.job_id, + keyword_id=job.keyword_id, + keyword=job.keyword, + title=f"{job.keyword} 뉴스 요약 - {datetime.now().strftime('%Y-%m-%d')}", + content=content, + summary=f"{job.keyword} 관련 {len(summarized_items)}개 뉴스 요약", + source_items=[], + images=[], + categories=['자동생성'], + tags=[job.keyword], + pipeline_stages=job.stages_completed, + processing_time=0 + ) + + async def _update_statistics(self, keyword_id: str): + """키워드별 통계 업데이트""" + try: + await self.db.keyword_stats.update_one( + {"keyword_id": keyword_id}, + { + "$inc": {"articles_generated": 1}, + "$set": 
{"last_generated": datetime.now()} + }, + upsert=True + ) + except Exception as e: + logger.error(f"Error updating statistics: {e}") + + async def stop(self): + """워커 중지""" + await self.queue_manager.disconnect() + logger.info("Article Assembly Worker stopped") + +async def main(): + """메인 함수""" + worker = ArticleAssemblyWorker() + + try: + await worker.start() + except KeyboardInterrupt: + logger.info("Received interrupt signal") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/services/pipeline/article-assembly/requirements.txt b/services/pipeline/article-assembly/requirements.txt new file mode 100644 index 0000000..19861c6 --- /dev/null +++ b/services/pipeline/article-assembly/requirements.txt @@ -0,0 +1,5 @@ +anthropic==0.50.0 +motor==3.1.1 +pymongo==4.3.3 +redis[hiredis]==5.0.1 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/fix_imports.py b/services/pipeline/fix_imports.py new file mode 100644 index 0000000..cbc5929 --- /dev/null +++ b/services/pipeline/fix_imports.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +"""Fix import statements in all pipeline services""" + +import os +import re + +def fix_imports(filepath): + """Fix import statements in a Python file""" + with open(filepath, 'r') as f: + content = f.read() + + # Pattern to match the old import style + old_pattern = r"# 상위 디렉토리의 shared 모듈 import\nsys\.path\.append\(os\.path\.join\(os\.path\.dirname\(__file__\), '\.\.', 'shared'\)\)\nfrom ([\w, ]+) import ([\w, ]+)" + + # Replace with new import style + def replace_imports(match): + modules = match.group(1) + items = match.group(2) + + # Build new import statements + imports = [] + if 'models' in modules: + imports.append(f"from shared.models import {items}" if 'models' in modules else "") + if 'queue_manager' in modules: + imports.append(f"from shared.queue_manager import QueueManager") + + return "# Import from shared module\n" + 
"\n".join(filter(None, imports)) + + # Apply the replacement + new_content = re.sub(old_pattern, replace_imports, content) + + # Also handle simpler patterns + new_content = new_content.replace( + "sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'shared'))\nfrom models import", + "from shared.models import" + ) + new_content = new_content.replace( + "\nfrom queue_manager import", + "\nfrom shared.queue_manager import" + ) + + # Write back if changed + if new_content != content: + with open(filepath, 'w') as f: + f.write(new_content) + print(f"Fixed imports in {filepath}") + return True + return False + +# Files to fix +files_to_fix = [ + "monitor/monitor.py", + "google-search/google_search.py", + "article-assembly/article_assembly.py", + "rss-collector/rss_collector.py", + "ai-summarizer/ai_summarizer.py" +] + +for file_path in files_to_fix: + full_path = os.path.join(os.path.dirname(__file__), file_path) + if os.path.exists(full_path): + fix_imports(full_path) \ No newline at end of file diff --git a/services/pipeline/google-search/Dockerfile b/services/pipeline/google-search/Dockerfile new file mode 100644 index 0000000..5d75150 --- /dev/null +++ b/services/pipeline/google-search/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# 의존성 설치 +COPY ./google-search/requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# 공통 모듈 복사 +COPY ./shared /app/shared + +# Google Search 코드 복사 +COPY ./google-search /app + +# 환경변수 +ENV PYTHONUNBUFFERED=1 + +# 실행 +CMD ["python", "google_search.py"] \ No newline at end of file diff --git a/services/pipeline/google-search/google_search.py b/services/pipeline/google-search/google_search.py new file mode 100644 index 0000000..25c34a6 --- /dev/null +++ b/services/pipeline/google-search/google_search.py @@ -0,0 +1,153 @@ +""" +Google Search Service +Google 검색으로 RSS 항목 강화 +""" +import asyncio +import logging +import os +import sys +import json +from typing import List, Dict, Any +import aiohttp +from datetime import datetime + +# Import from shared module +from shared.models import PipelineJob, RSSItem, SearchResult, EnrichedItem +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class GoogleSearchWorker: + def __init__(self): + self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + self.google_api_key = os.getenv("GOOGLE_API_KEY") + self.search_engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID") + self.max_results_per_item = 3 + + async def start(self): + """워커 시작""" + logger.info("Starting Google Search Worker") + + # Redis 연결 + await self.queue_manager.connect() + + # 메인 처리 루프 + while True: + try: + # 큐에서 작업 가져오기 + job = await self.queue_manager.dequeue('search_enrichment', timeout=5) + + if job: + await self.process_job(job) + + except Exception as e: + logger.error(f"Error in worker loop: {e}") + await asyncio.sleep(1) + + async def process_job(self, job: PipelineJob): + """검색 강화 작업 처리""" + try: + logger.info(f"Processing job {job.job_id} for search enrichment") + + rss_items = job.data.get('rss_items', []) + enriched_items = [] + + # 최대 5개 항목만 처리 (API 할당량 관리) + for item_data in rss_items[:5]: + rss_item = RSSItem(**item_data) + + # 제목으로 Google 검색 + search_results = await 
self._search_google(rss_item.title) + + enriched_item = EnrichedItem( + rss_item=rss_item, + search_results=search_results + ) + enriched_items.append(enriched_item) + + # API 속도 제한 + await asyncio.sleep(0.5) + + if enriched_items: + logger.info(f"Enriched {len(enriched_items)} items with search results") + + # 다음 단계로 전달 + job.data['enriched_items'] = [item.dict() for item in enriched_items] + job.stages_completed.append('search_enrichment') + job.stage = 'ai_summarization' + + await self.queue_manager.enqueue('ai_summarization', job) + await self.queue_manager.mark_completed('search_enrichment', job.job_id) + else: + logger.warning(f"No items enriched for job {job.job_id}") + await self.queue_manager.mark_failed( + 'search_enrichment', + job, + "No items to enrich" + ) + + except Exception as e: + logger.error(f"Error processing job {job.job_id}: {e}") + await self.queue_manager.mark_failed('search_enrichment', job, str(e)) + + async def _search_google(self, query: str) -> List[SearchResult]: + """Google Custom Search API 호출""" + results = [] + + if not self.google_api_key or not self.search_engine_id: + logger.warning("Google API credentials not configured") + return results + + try: + url = "https://www.googleapis.com/customsearch/v1" + params = { + "key": self.google_api_key, + "cx": self.search_engine_id, + "q": query, + "num": self.max_results_per_item, + "hl": "ko", + "gl": "kr" + } + + async with aiohttp.ClientSession() as session: + async with session.get(url, params=params, timeout=30) as response: + if response.status == 200: + data = await response.json() + + for item in data.get('items', []): + result = SearchResult( + title=item.get('title', ''), + link=item.get('link', ''), + snippet=item.get('snippet', ''), + source='google' + ) + results.append(result) + else: + logger.error(f"Google API error: {response.status}") + + except Exception as e: + logger.error(f"Error searching Google for '{query}': {e}") + + return results + + async def stop(self): + 
"""워커 중지""" + await self.queue_manager.disconnect() + logger.info("Google Search Worker stopped") + +async def main(): + """메인 함수""" + worker = GoogleSearchWorker() + + try: + await worker.start() + except KeyboardInterrupt: + logger.info("Received interrupt signal") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/services/pipeline/google-search/requirements.txt b/services/pipeline/google-search/requirements.txt new file mode 100644 index 0000000..0859816 --- /dev/null +++ b/services/pipeline/google-search/requirements.txt @@ -0,0 +1,3 @@ +aiohttp==3.9.1 +redis[hiredis]==5.0.1 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/image-generator/Dockerfile b/services/pipeline/image-generator/Dockerfile new file mode 100644 index 0000000..018dede --- /dev/null +++ b/services/pipeline/image-generator/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY ./image-generator/requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# Copy shared modules +COPY ./shared /app/shared + +# Copy application code +COPY ./image-generator /app + +CMD ["python", "image_generator.py"] \ No newline at end of file diff --git a/services/pipeline/image-generator/image_generator.py b/services/pipeline/image-generator/image_generator.py new file mode 100644 index 0000000..5af06c5 --- /dev/null +++ b/services/pipeline/image-generator/image_generator.py @@ -0,0 +1,225 @@ +""" +Image Generation Service +Replicate API를 사용한 이미지 생성 서비스 +""" +import asyncio +import logging +import os +import sys +import base64 +from typing import List, Dict, Any +import httpx +from io import BytesIO + +# Import from shared module +from shared.models import PipelineJob, TranslatedItem, GeneratedImageItem +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class ImageGeneratorWorker: + def __init__(self): + self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + self.replicate_api_key = os.getenv("REPLICATE_API_KEY") + self.replicate_api_url = "https://api.replicate.com/v1/predictions" + # Stable Diffusion 모델 사용 + self.model_version = "stability-ai/sdxl:39ed52f2a78e934b3ba6e2a89f5b1c712de7dfea535525255b1aa35c5565e08b" + + async def start(self): + """워커 시작""" + logger.info("Starting Image Generator Worker") + + # Redis 연결 + await self.queue_manager.connect() + + # API 키 확인 + if not self.replicate_api_key: + logger.warning("Replicate API key not configured - using placeholder images") + + # 메인 처리 루프 + while True: + try: + # 큐에서 작업 가져오기 + job = await self.queue_manager.dequeue('image_generation', timeout=5) + + if job: + await self.process_job(job) + + except Exception as e: + logger.error(f"Error in worker loop: {e}") + await asyncio.sleep(1) + + async def process_job(self, job: PipelineJob): + """이미지 생성 작업 처리""" + try: + logger.info(f"Processing job {job.job_id} 
for image generation") + + translated_items = job.data.get('translated_items', []) + generated_items = [] + + # 최대 3개 아이템만 이미지 생성 (API 비용 절감) + for idx, item_data in enumerate(translated_items[:3]): + translated_item = TranslatedItem(**item_data) + + # 이미지 생성을 위한 프롬프트 생성 + prompt = self._create_image_prompt(translated_item) + + # 이미지 생성 + image_url = await self._generate_image(prompt) + + generated_item = GeneratedImageItem( + translated_item=translated_item, + image_url=image_url, + image_prompt=prompt + ) + generated_items.append(generated_item) + + # API 속도 제한 + if self.replicate_api_key: + await asyncio.sleep(2) + + if generated_items: + logger.info(f"Generated images for {len(generated_items)} items") + + # 완료된 데이터를 job에 저장 + job.data['generated_items'] = [item.dict() for item in generated_items] + job.stages_completed.append('image_generation') + job.stage = 'completed' + + # 최종 기사 조립 단계로 전달 (이미 article-assembly로 수정) + await self.queue_manager.enqueue('article_assembly', job) + await self.queue_manager.mark_completed('image_generation', job.job_id) + else: + logger.warning(f"No images generated for job {job.job_id}") + # 이미지 생성 실패해도 다음 단계로 진행 + job.stages_completed.append('image_generation') + await self.queue_manager.enqueue('article_assembly', job) + await self.queue_manager.mark_completed('image_generation', job.job_id) + + except Exception as e: + logger.error(f"Error processing job {job.job_id}: {e}") + # 이미지 생성 실패해도 다음 단계로 진행 + job.stages_completed.append('image_generation') + await self.queue_manager.enqueue('article_assembly', job) + await self.queue_manager.mark_completed('image_generation', job.job_id) + + def _create_image_prompt(self, translated_item: TranslatedItem) -> str: + """이미지 생성을 위한 프롬프트 생성""" + # 영문 제목과 요약을 기반으로 프롬프트 생성 + title = translated_item.translated_title or translated_item.summarized_item['enriched_item']['rss_item']['title'] + summary = translated_item.translated_summary or translated_item.summarized_item['ai_summary'] + + # 뉴스 
관련 이미지를 위한 프롬프트 + prompt = f"News illustration for: {title[:100]}, professional, photorealistic, high quality, 4k" + + return prompt + + async def _generate_image(self, prompt: str) -> str: + """Replicate API를 사용한 이미지 생성""" + try: + if not self.replicate_api_key: + # API 키가 없으면 플레이스홀더 이미지 URL 반환 + return "https://via.placeholder.com/800x600.png?text=News+Image" + + async with httpx.AsyncClient() as client: + # 예측 생성 요청 + response = await client.post( + self.replicate_api_url, + headers={ + "Authorization": f"Token {self.replicate_api_key}", + "Content-Type": "application/json" + }, + json={ + "version": self.model_version, + "input": { + "prompt": prompt, + "width": 768, + "height": 768, + "num_outputs": 1, + "scheduler": "K_EULER", + "num_inference_steps": 25, + "guidance_scale": 7.5, + "prompt_strength": 0.8, + "refine": "expert_ensemble_refiner", + "high_noise_frac": 0.8 + } + }, + timeout=60 + ) + + if response.status_code in [200, 201]: + result = response.json() + prediction_id = result.get('id') + + # 예측 결과 폴링 + image_url = await self._poll_prediction(prediction_id) + return image_url + else: + logger.error(f"Replicate API error: {response.status_code}") + return "https://via.placeholder.com/800x600.png?text=Generation+Failed" + + except Exception as e: + logger.error(f"Error generating image: {e}") + return "https://via.placeholder.com/800x600.png?text=Error" + + async def _poll_prediction(self, prediction_id: str, max_attempts: int = 30) -> str: + """예측 결과 폴링""" + try: + async with httpx.AsyncClient() as client: + for attempt in range(max_attempts): + response = await client.get( + f"{self.replicate_api_url}/{prediction_id}", + headers={ + "Authorization": f"Token {self.replicate_api_key}" + }, + timeout=30 + ) + + if response.status_code == 200: + result = response.json() + status = result.get('status') + + if status == 'succeeded': + output = result.get('output') + if output and isinstance(output, list) and len(output) > 0: + return output[0] + else: + 
return "https://via.placeholder.com/800x600.png?text=No+Output" + elif status == 'failed': + logger.error(f"Prediction failed: {result.get('error')}") + return "https://via.placeholder.com/800x600.png?text=Failed" + + # 아직 처리중이면 대기 + await asyncio.sleep(2) + else: + logger.error(f"Error polling prediction: {response.status_code}") + return "https://via.placeholder.com/800x600.png?text=Poll+Error" + + # 최대 시도 횟수 초과 + return "https://via.placeholder.com/800x600.png?text=Timeout" + + except Exception as e: + logger.error(f"Error polling prediction: {e}") + return "https://via.placeholder.com/800x600.png?text=Poll+Exception" + + async def stop(self): + """워커 중지""" + await self.queue_manager.disconnect() + logger.info("Image Generator Worker stopped") + +async def main(): + """메인 함수""" + worker = ImageGeneratorWorker() + + try: + await worker.start() + except KeyboardInterrupt: + logger.info("Received interrupt signal") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/services/pipeline/image-generator/requirements.txt b/services/pipeline/image-generator/requirements.txt new file mode 100644 index 0000000..fbd9665 --- /dev/null +++ b/services/pipeline/image-generator/requirements.txt @@ -0,0 +1,3 @@ +httpx==0.25.0 +redis[hiredis]==5.0.1 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/monitor/Dockerfile b/services/pipeline/monitor/Dockerfile new file mode 100644 index 0000000..cc6cd35 --- /dev/null +++ b/services/pipeline/monitor/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY ./monitor/requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# Copy shared modules +COPY ./shared /app/shared + +# Copy monitor code +COPY ./monitor /app + +# Environment variables +ENV PYTHONUNBUFFERED=1 + +# Expose port +EXPOSE 8000 + +# Run +CMD ["uvicorn", "monitor:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file diff --git a/services/pipeline/monitor/monitor.py b/services/pipeline/monitor/monitor.py new file mode 100644 index 0000000..9c4a73e --- /dev/null +++ b/services/pipeline/monitor/monitor.py @@ -0,0 +1,349 @@ +""" +Pipeline Monitor Service +파이프라인 상태 모니터링 및 대시보드 API +""" +import os +import sys +import logging +from datetime import datetime, timedelta +from typing import Dict, List, Any +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from motor.motor_asyncio import AsyncIOMotorClient +import redis.asyncio as redis + +# Import from shared module +from shared.models import KeywordSubscription, PipelineJob, FinalArticle + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +app = FastAPI(title="Pipeline Monitor", version="1.0.0") + +# CORS 설정 +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Global connections +redis_client = None +mongodb_client = None +db = None + +@app.on_event("startup") +async def startup_event(): + """서버 시작 시 연결 초기화""" + global redis_client, mongodb_client, db + + # Redis 연결 + redis_url = os.getenv("REDIS_URL", "redis://redis:6379") + redis_client = await redis.from_url(redis_url, decode_responses=True) + + # MongoDB 연결 + mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017") + mongodb_client = AsyncIOMotorClient(mongodb_url) + db = mongodb_client[os.getenv("DB_NAME", "pipeline_db")] + + logger.info("Pipeline Monitor started successfully") + +@app.on_event("shutdown") +async def shutdown_event(): + """서버 종료 시 연결 해제""" + if 
redis_client: + await redis_client.close() + if mongodb_client: + mongodb_client.close() + +@app.get("/") +async def root(): + """헬스 체크""" + return {"status": "Pipeline Monitor is running"} + +@app.get("/api/stats") +async def get_stats(): + """전체 파이프라인 통계""" + try: + # 큐별 대기 작업 수 + queue_stats = {} + queues = [ + "queue:keyword", + "queue:rss", + "queue:search", + "queue:summarize", + "queue:assembly" + ] + + for queue in queues: + length = await redis_client.llen(queue) + queue_stats[queue] = length + + # 오늘 생성된 기사 수 + today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) + articles_today = await db.articles.count_documents({ + "created_at": {"$gte": today} + }) + + # 활성 키워드 수 + active_keywords = await db.keywords.count_documents({ + "is_active": True + }) + + # 총 기사 수 + total_articles = await db.articles.count_documents({}) + + return { + "queues": queue_stats, + "articles_today": articles_today, + "active_keywords": active_keywords, + "total_articles": total_articles, + "timestamp": datetime.now().isoformat() + } + + except Exception as e: + logger.error(f"Error getting stats: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/queues/{queue_name}") +async def get_queue_details(queue_name: str): + """특정 큐의 상세 정보""" + try: + queue_key = f"queue:{queue_name}" + + # 큐 길이 + length = await redis_client.llen(queue_key) + + # 최근 10개 작업 미리보기 + items = await redis_client.lrange(queue_key, 0, 9) + + # 처리 중인 작업 + processing_key = f"processing:{queue_name}" + processing = await redis_client.smembers(processing_key) + + # 실패한 작업 + failed_key = f"failed:{queue_name}" + failed_count = await redis_client.llen(failed_key) + + return { + "queue": queue_name, + "length": length, + "processing_count": len(processing), + "failed_count": failed_count, + "preview": items[:10], + "timestamp": datetime.now().isoformat() + } + + except Exception as e: + logger.error(f"Error getting queue details: {e}") + raise HTTPException(status_code=500, 
detail=str(e)) + +@app.get("/api/keywords") +async def get_keywords(): + """등록된 키워드 목록""" + try: + keywords = [] + cursor = db.keywords.find({"is_active": True}) + + async for keyword in cursor: + # 해당 키워드의 최근 기사 + latest_article = await db.articles.find_one( + {"keyword_id": str(keyword["_id"])}, + sort=[("created_at", -1)] + ) + + keywords.append({ + "id": str(keyword["_id"]), + "keyword": keyword["keyword"], + "schedule": keyword.get("schedule", "30분마다"), + "created_at": keyword.get("created_at"), + "last_article": latest_article["created_at"] if latest_article else None, + "article_count": await db.articles.count_documents( + {"keyword_id": str(keyword["_id"])} + ) + }) + + return keywords + + except Exception as e: + logger.error(f"Error getting keywords: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.post("/api/keywords") +async def add_keyword(keyword: str, schedule: str = "30min"): + """새 키워드 등록""" + try: + new_keyword = { + "keyword": keyword, + "schedule": schedule, + "is_active": True, + "created_at": datetime.now(), + "updated_at": datetime.now() + } + + result = await db.keywords.insert_one(new_keyword) + + return { + "id": str(result.inserted_id), + "keyword": keyword, + "message": "Keyword registered successfully" + } + + except Exception as e: + logger.error(f"Error adding keyword: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.delete("/api/keywords/{keyword_id}") +async def delete_keyword(keyword_id: str): + """키워드 비활성화""" + try: + result = await db.keywords.update_one( + {"_id": keyword_id}, + {"$set": {"is_active": False, "updated_at": datetime.now()}} + ) + + if result.modified_count > 0: + return {"message": "Keyword deactivated successfully"} + else: + raise HTTPException(status_code=404, detail="Keyword not found") + + except Exception as e: + logger.error(f"Error deleting keyword: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/articles") +async def get_articles(limit: 
int = 10, skip: int = 0): + """최근 생성된 기사 목록""" + try: + articles = [] + cursor = db.articles.find().sort("created_at", -1).skip(skip).limit(limit) + + async for article in cursor: + articles.append({ + "id": str(article["_id"]), + "title": article["title"], + "keyword": article["keyword"], + "summary": article.get("summary", ""), + "created_at": article["created_at"], + "processing_time": article.get("processing_time", 0), + "pipeline_stages": article.get("pipeline_stages", []) + }) + + total = await db.articles.count_documents({}) + + return { + "articles": articles, + "total": total, + "limit": limit, + "skip": skip + } + + except Exception as e: + logger.error(f"Error getting articles: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/articles/{article_id}") +async def get_article(article_id: str): + """특정 기사 상세 정보""" + try: + article = await db.articles.find_one({"_id": article_id}) + + if not article: + raise HTTPException(status_code=404, detail="Article not found") + + return article + + except Exception as e: + logger.error(f"Error getting article: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/workers") +async def get_workers(): + """워커 상태 정보""" + try: + workers = {} + worker_types = [ + "scheduler", + "rss_collector", + "google_search", + "ai_summarizer", + "article_assembly" + ] + + for worker_type in worker_types: + active_key = f"workers:{worker_type}:active" + active_workers = await redis_client.smembers(active_key) + + workers[worker_type] = { + "active": len(active_workers), + "worker_ids": list(active_workers) + } + + return workers + + except Exception as e: + logger.error(f"Error getting workers: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.post("/api/trigger/{keyword}") +async def trigger_keyword_processing(keyword: str): + """수동으로 키워드 처리 트리거""" + try: + # 키워드 찾기 + keyword_doc = await db.keywords.find_one({ + "keyword": keyword, + "is_active": True + }) + + if not 
keyword_doc: + raise HTTPException(status_code=404, detail="Keyword not found or inactive") + + # 작업 생성 + job = PipelineJob( + keyword_id=str(keyword_doc["_id"]), + keyword=keyword, + stage="keyword_processing", + created_at=datetime.now() + ) + + # 큐에 추가 + await redis_client.rpush("queue:keyword", job.json()) + + return { + "message": f"Processing triggered for keyword: {keyword}", + "job_id": job.job_id + } + + except Exception as e: + logger.error(f"Error triggering keyword: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/health") +async def health_check(): + """시스템 헬스 체크""" + try: + # Redis 체크 + redis_status = await redis_client.ping() + + # MongoDB 체크 + mongodb_status = await db.command("ping") + + return { + "status": "healthy", + "redis": "connected" if redis_status else "disconnected", + "mongodb": "connected" if mongodb_status else "disconnected", + "timestamp": datetime.now().isoformat() + } + + except Exception as e: + return { + "status": "unhealthy", + "error": str(e), + "timestamp": datetime.now().isoformat() + } + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/services/pipeline/monitor/requirements.txt b/services/pipeline/monitor/requirements.txt new file mode 100644 index 0000000..5728b55 --- /dev/null +++ b/services/pipeline/monitor/requirements.txt @@ -0,0 +1,6 @@ +fastapi==0.104.1 +uvicorn[standard]==0.24.0 +redis[hiredis]==5.0.1 +motor==3.1.1 +pymongo==4.3.3 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/rss-collector/Dockerfile b/services/pipeline/rss-collector/Dockerfile new file mode 100644 index 0000000..4565e1c --- /dev/null +++ b/services/pipeline/rss-collector/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# 의존성 설치 +COPY ./rss-collector/requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# 공통 모듈 복사 +COPY ./shared /app/shared + +# RSS Collector 코드 복사 +COPY ./rss-collector /app + +# 환경변수 +ENV PYTHONUNBUFFERED=1 + +# 실행 +CMD ["python", "rss_collector.py"] \ No newline at end of file diff --git a/services/pipeline/rss-collector/requirements.txt b/services/pipeline/rss-collector/requirements.txt new file mode 100644 index 0000000..8d21c7f --- /dev/null +++ b/services/pipeline/rss-collector/requirements.txt @@ -0,0 +1,4 @@ +feedparser==6.0.11 +aiohttp==3.9.1 +redis[hiredis]==5.0.1 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/rss-collector/rss_collector.py b/services/pipeline/rss-collector/rss_collector.py new file mode 100644 index 0000000..5601977 --- /dev/null +++ b/services/pipeline/rss-collector/rss_collector.py @@ -0,0 +1,192 @@ +""" +RSS Collector Service +RSS 피드 수집 및 중복 제거 서비스 +""" +import asyncio +import logging +import os +import sys +import hashlib +from datetime import datetime +import feedparser +import aiohttp +import redis.asyncio as redis +from typing import List, Dict, Any + +# Import from shared module +from shared.models import PipelineJob, RSSItem, EnrichedItem +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class RSSCollectorWorker: + def __init__(self): + self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + self.redis_client = None + self.redis_url = os.getenv("REDIS_URL", "redis://redis:6379") + self.dedup_ttl = 86400 * 7 # 7일간 중복 방지 + self.max_items_per_feed = 10 # 피드당 최대 항목 수 + + async def start(self): + """워커 시작""" + logger.info("Starting RSS Collector Worker") + + # Redis 연결 + await self.queue_manager.connect() + self.redis_client = await redis.from_url( + self.redis_url, + encoding="utf-8", + decode_responses=True + ) + + # 메인 처리 루프 + while True: + try: + # 큐에서 작업 가져오기 (5초 대기) + job = await 
self.queue_manager.dequeue('rss_collection', timeout=5) + + if job: + await self.process_job(job) + + except Exception as e: + logger.error(f"Error in worker loop: {e}") + await asyncio.sleep(1) + + async def process_job(self, job: PipelineJob): + """RSS 수집 작업 처리""" + try: + logger.info(f"Processing job {job.job_id} for keyword '{job.keyword}'") + + keyword = job.data.get('keyword', '') + rss_feeds = job.data.get('rss_feeds', []) + + # 키워드가 포함된 RSS URL 생성 + processed_feeds = self._prepare_feeds(rss_feeds, keyword) + + all_items = [] + + for feed_url in processed_feeds: + try: + items = await self._fetch_rss_feed(feed_url, keyword) + all_items.extend(items) + except Exception as e: + logger.error(f"Error fetching feed {feed_url}: {e}") + + if all_items: + # 중복 제거 + unique_items = await self._deduplicate_items(all_items, keyword) + + if unique_items: + logger.info(f"Collected {len(unique_items)} unique items for '{keyword}'") + + # 다음 단계로 전달 + job.data['rss_items'] = [item.dict() for item in unique_items] + job.stages_completed.append('rss_collection') + job.stage = 'search_enrichment' + + await self.queue_manager.enqueue('search_enrichment', job) + await self.queue_manager.mark_completed('rss_collection', job.job_id) + else: + logger.info(f"No new items found for '{keyword}'") + await self.queue_manager.mark_completed('rss_collection', job.job_id) + else: + logger.warning(f"No RSS items collected for '{keyword}'") + await self.queue_manager.mark_failed( + 'rss_collection', + job, + "No RSS items collected" + ) + + except Exception as e: + logger.error(f"Error processing job {job.job_id}: {e}") + await self.queue_manager.mark_failed('rss_collection', job, str(e)) + + def _prepare_feeds(self, feeds: List[str], keyword: str) -> List[str]: + """RSS 피드 URL 준비 (키워드 치환)""" + processed = [] + for feed in feeds: + if '{keyword}' in feed: + processed.append(feed.replace('{keyword}', keyword)) + else: + processed.append(feed) + return processed + + async def 
_fetch_rss_feed(self, feed_url: str, keyword: str) -> List[RSSItem]: + """RSS 피드 가져오기""" + items = [] + + try: + async with aiohttp.ClientSession() as session: + async with session.get(feed_url, timeout=30) as response: + content = await response.text() + + # feedparser로 파싱 + feed = feedparser.parse(content) + + for entry in feed.entries[:self.max_items_per_feed]: + # 키워드 관련성 체크 + title = entry.get('title', '') + summary = entry.get('summary', '') + + # 제목이나 요약에 키워드가 포함된 경우만 + if keyword.lower() in title.lower() or keyword.lower() in summary.lower(): + item = RSSItem( + title=title, + link=entry.get('link', ''), + published=entry.get('published', ''), + summary=summary[:500] if summary else '', + source_feed=feed_url + ) + items.append(item) + + except Exception as e: + logger.error(f"Error fetching RSS feed {feed_url}: {e}") + + return items + + async def _deduplicate_items(self, items: List[RSSItem], keyword: str) -> List[RSSItem]: + """중복 항목 제거""" + unique_items = [] + dedup_key = f"dedup:{keyword}" + + for item in items: + # 제목 해시 생성 + item_hash = hashlib.md5( + f"{keyword}:{item.title}".encode() + ).hexdigest() + + # Redis Set으로 중복 확인 + is_new = await self.redis_client.sadd(dedup_key, item_hash) + + if is_new: + unique_items.append(item) + + # TTL 설정 + if unique_items: + await self.redis_client.expire(dedup_key, self.dedup_ttl) + + return unique_items + + async def stop(self): + """워커 중지""" + await self.queue_manager.disconnect() + if self.redis_client: + await self.redis_client.close() + logger.info("RSS Collector Worker stopped") + +async def main(): + """메인 함수""" + worker = RSSCollectorWorker() + + try: + await worker.start() + except KeyboardInterrupt: + logger.info("Received interrupt signal") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/services/pipeline/scheduler/Dockerfile b/services/pipeline/scheduler/Dockerfile new file mode 100644 index 0000000..a9626e7 --- /dev/null 
+++ b/services/pipeline/scheduler/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# 의존성 설치 +COPY ./scheduler/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# 공통 모듈 복사 +COPY ./shared /app/shared + +# 스케줄러 코드 복사 +COPY ./scheduler /app + +# 환경변수 +ENV PYTHONUNBUFFERED=1 + +# 실행 +CMD ["python", "scheduler.py"] \ No newline at end of file diff --git a/services/pipeline/scheduler/requirements.txt b/services/pipeline/scheduler/requirements.txt new file mode 100644 index 0000000..0ca083f --- /dev/null +++ b/services/pipeline/scheduler/requirements.txt @@ -0,0 +1,5 @@ +apscheduler==3.10.4 +motor==3.1.1 +pymongo==4.3.3 +redis[hiredis]==5.0.1 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/scheduler/scheduler.py b/services/pipeline/scheduler/scheduler.py new file mode 100644 index 0000000..fe93276 --- /dev/null +++ b/services/pipeline/scheduler/scheduler.py @@ -0,0 +1,203 @@ +""" +News Pipeline Scheduler +뉴스 파이프라인 스케줄러 서비스 +""" +import asyncio +import logging +import os +import sys +from datetime import datetime, timedelta +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from motor.motor_asyncio import AsyncIOMotorClient + +# Import from shared module +from shared.models import KeywordSubscription, PipelineJob +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class NewsScheduler: + def __init__(self): + self.scheduler = AsyncIOScheduler() + self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017") + self.db_name = os.getenv("DB_NAME", "pipeline_db") + self.db = None + self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + + async def start(self): + """스케줄러 시작""" + logger.info("Starting News Pipeline Scheduler") + + # MongoDB 연결 + client = AsyncIOMotorClient(self.mongodb_url) + self.db = client[self.db_name] + + # Redis 연결 + await 
self.queue_manager.connect() + + # 기본 스케줄 설정 + # 매 30분마다 실행 + self.scheduler.add_job( + self.process_keywords, + 'interval', + minutes=30, + id='keyword_processor', + name='Process Active Keywords' + ) + + # 특정 시간대 강화 스케줄 (아침 7시, 점심 12시, 저녁 6시) + for hour in [7, 12, 18]: + self.scheduler.add_job( + self.process_priority_keywords, + 'cron', + hour=hour, + minute=0, + id=f'priority_processor_{hour}', + name=f'Process Priority Keywords at {hour}:00' + ) + + # 매일 자정 통계 초기화 + self.scheduler.add_job( + self.reset_daily_stats, + 'cron', + hour=0, + minute=0, + id='stats_reset', + name='Reset Daily Statistics' + ) + + self.scheduler.start() + logger.info("Scheduler started successfully") + + # 시작 즉시 한 번 실행 + await self.process_keywords() + + async def process_keywords(self): + """활성 키워드 처리""" + try: + logger.info("Processing active keywords") + + # MongoDB에서 활성 키워드 로드 + now = datetime.now() + thirty_minutes_ago = now - timedelta(minutes=30) + + keywords = await self.db.keywords.find({ + "is_active": True, + "$or": [ + {"last_processed": {"$lt": thirty_minutes_ago}}, + {"last_processed": None} + ] + }).to_list(None) + + logger.info(f"Found {len(keywords)} keywords to process") + + for keyword_doc in keywords: + await self._create_job(keyword_doc) + + # 처리 시간 업데이트 + await self.db.keywords.update_one( + {"keyword_id": keyword_doc['keyword_id']}, + {"$set": {"last_processed": now}} + ) + + logger.info(f"Created jobs for {len(keywords)} keywords") + + except Exception as e: + logger.error(f"Error processing keywords: {e}") + + async def process_priority_keywords(self): + """우선순위 키워드 처리""" + try: + logger.info("Processing priority keywords") + + keywords = await self.db.keywords.find({ + "is_active": True, + "is_priority": True + }).to_list(None) + + for keyword_doc in keywords: + await self._create_job(keyword_doc, priority=1) + + logger.info(f"Created priority jobs for {len(keywords)} keywords") + + except Exception as e: + logger.error(f"Error processing priority keywords: 
{e}") + + async def _create_job(self, keyword_doc: dict, priority: int = 0): + """파이프라인 작업 생성""" + try: + # KeywordSubscription 모델로 변환 + keyword = KeywordSubscription(**keyword_doc) + + # PipelineJob 생성 + job = PipelineJob( + keyword_id=keyword.keyword_id, + keyword=keyword.keyword, + stage='rss_collection', + stages_completed=[], + priority=priority, + data={ + 'keyword': keyword.keyword, + 'language': keyword.language, + 'rss_feeds': keyword.rss_feeds or self._get_default_rss_feeds(), + 'categories': keyword.categories + } + ) + + # 첫 번째 큐에 추가 + await self.queue_manager.enqueue( + 'rss_collection', + job, + priority=priority + ) + + logger.info(f"Created job {job.job_id} for keyword '{keyword.keyword}'") + + except Exception as e: + logger.error(f"Error creating job for keyword: {e}") + + def _get_default_rss_feeds(self) -> list: + """기본 RSS 피드 목록""" + return [ + "https://news.google.com/rss/search?q={keyword}&hl=ko&gl=KR&ceid=KR:ko", + "https://trends.google.com/trends/trendingsearches/daily/rss?geo=KR", + "https://www.mk.co.kr/rss/40300001/", # 매일경제 + "https://www.hankyung.com/feed/all-news", # 한국경제 + "https://www.zdnet.co.kr/news/news_rss.xml", # ZDNet Korea + ] + + async def reset_daily_stats(self): + """일일 통계 초기화""" + try: + logger.info("Resetting daily statistics") + # Redis 통계 초기화 + # 구현 필요 + pass + except Exception as e: + logger.error(f"Error resetting stats: {e}") + + async def stop(self): + """스케줄러 중지""" + self.scheduler.shutdown() + await self.queue_manager.disconnect() + logger.info("Scheduler stopped") + +async def main(): + """메인 함수""" + scheduler = NewsScheduler() + + try: + await scheduler.start() + # 계속 실행 + while True: + await asyncio.sleep(60) + except KeyboardInterrupt: + logger.info("Received interrupt signal") + finally: + await scheduler.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/services/pipeline/shared/__init__.py b/services/pipeline/shared/__init__.py new file mode 100644 index 
class KeywordSubscription(BaseModel):
    """A keyword tracked by the pipeline, with its processing settings."""
    keyword_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    keyword: str
    language: str = "ko"
    # Cron expression for "every 30 minutes". The previous default,
    # "0 */30 * * *", placed the step in the hour field instead of the
    # minute field and so did not express a 30-minute interval.
    schedule: str = "*/30 * * * *"
    is_active: bool = True
    is_priority: bool = False
    last_processed: Optional[datetime] = None
    rss_feeds: List[str] = Field(default_factory=list)
    categories: List[str] = Field(default_factory=list)
    created_at: datetime = Field(default_factory=datetime.now)
    owner: Optional[str] = None


class PipelineJob(BaseModel):
    """A unit of work that moves through the pipeline stages."""
    job_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    keyword_id: str
    keyword: str
    stage: str  # current stage
    stages_completed: List[str] = Field(default_factory=list)
    data: Dict[str, Any] = Field(default_factory=dict)
    retry_count: int = 0
    max_retries: int = 3
    priority: int = 0
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)


class RSSItem(BaseModel):
    """A single RSS feed entry."""
    item_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    title: str
    link: str
    published: Optional[str] = None
    summary: Optional[str] = None
    source_feed: str


class SearchResult(BaseModel):
    """A single search result attached to an RSS item."""
    title: str
    link: str
    snippet: Optional[str] = None
    source: str = "google"


class EnrichedItem(BaseModel):
    """An RSS item together with its search results."""
    rss_item: RSSItem
    search_results: List[SearchResult] = Field(default_factory=list)


class SummarizedItem(BaseModel):
    """An enriched item plus its AI-generated summary."""
    enriched_item: EnrichedItem
    ai_summary: str
    summary_language: str = "ko"


class NestedTranslatedItem(BaseModel):
    """Translated item that nests a full ``SummarizedItem`` model.

    This class used to be named ``TranslatedItem`` and was silently shadowed
    by the dict-based ``TranslatedItem`` defined further down. Only
    ``ItemWithImage`` was bound to it, so it now has its own name; the
    exported ``TranslatedItem`` is unchanged.
    """
    summarized_item: SummarizedItem
    title_en: str
    summary_en: str


class ItemWithImage(BaseModel):
    """A nested translated item with its generated image."""
    translated_item: NestedTranslatedItem
    image_url: str
    image_prompt: str


class FinalArticle(BaseModel):
    """The final assembled article."""
    article_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    job_id: str
    keyword_id: str
    keyword: str
    title: str
    content: str
    summary: str
    source_items: List[ItemWithImage]
    images: List[str]
    categories: List[str] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
    created_at: datetime = Field(default_factory=datetime.now)
    pipeline_stages: List[str]
    processing_time: float  # seconds


class TranslatedItem(BaseModel):
    """Translated item carrying the summarized item as a plain dict."""
    summarized_item: Dict[str, Any]  # SummarizedItem as dict
    translated_title: str
    translated_summary: str
    target_language: str = 'en'


class GeneratedImageItem(BaseModel):
    """Translated item (as a plain dict) with its generated image."""
    translated_item: Dict[str, Any]  # TranslatedItem as dict
    image_url: str
    image_prompt: str


class QueueMessage(BaseModel):
    """Envelope stored in a queue around a ``PipelineJob``."""
    message_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    queue_name: str
    job: PipelineJob
    timestamp: datetime = Field(default_factory=datetime.now)
    retry_count: int = 0
class QueueManager:
    """Redis-backed job queues for the pipeline stages.

    Each logical queue name maps to a Redis list key (see ``QUEUES``); unknown
    names fall back to ``queue:<name>``. Jobs that have been dequeued are
    tracked in a ``processing:<name>`` hash until they are marked completed
    or failed. Counters are kept in the ``stats:*`` hashes.
    """

    QUEUES = {
        "keyword_processing": "queue:keyword",
        "rss_collection": "queue:rss",
        "search_enrichment": "queue:search",
        "ai_summarization": "queue:summarize",
        "translation": "queue:translate",
        "image_generation": "queue:image",
        "article_assembly": "queue:assembly",
        "failed": "queue:failed",
        "scheduled": "queue:scheduled"
    }

    def __init__(self, redis_url: str = "redis://redis:6379"):
        self.redis_url = redis_url
        # Created lazily by connect().
        self.redis_client: Optional[redis.Redis] = None

    async def connect(self):
        """Open the Redis connection if it is not already open."""
        if not self.redis_client:
            self.redis_client = await redis.from_url(
                self.redis_url,
                encoding="utf-8",
                decode_responses=True
            )
            logger.info("Connected to Redis")

    async def disconnect(self):
        """Close the Redis connection, if any, and forget the client."""
        if self.redis_client:
            await self.redis_client.close()
            self.redis_client = None

    async def enqueue(self, queue_name: str, job: PipelineJob, priority: int = 0) -> str:
        """Wrap ``job`` in a QueueMessage and push it onto ``queue_name``.

        Args:
            queue_name: Logical queue name (key looked up in ``QUEUES``).
            job: The job to enqueue.
            priority: When greater than 0 the message is pushed to the head
                of the list (the end ``dequeue`` pops from); otherwise to
                the tail.

        Returns:
            The job's ``job_id``.

        Raises:
            Exception: Any error from Redis or serialization is logged and
                re-raised.
        """
        try:
            queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")

            message = QueueMessage(
                queue_name=queue_name,
                job=job
            )

            # Head of the list for priority jobs, tail otherwise.
            if priority > 0:
                await self.redis_client.lpush(queue_key, message.json())
            else:
                await self.redis_client.rpush(queue_key, message.json())

            # Count every enqueue per queue.
            await self.redis_client.hincrby("stats:queues", queue_name, 1)

            logger.info(f"Job {job.job_id} enqueued to {queue_name}")
            return job.job_id

        except Exception as e:
            logger.error(f"Failed to enqueue job: {e}")
            raise

    async def dequeue(self, queue_name: str, timeout: int = 0) -> Optional[PipelineJob]:
        """Pop the next job from the head of ``queue_name``.

        Args:
            queue_name: Logical queue name.
            timeout: When greater than 0, use a blocking pop with this
                timeout; otherwise do a single non-blocking pop.

        Returns:
            The job, or None when the queue is empty, the timeout elapsed,
            or an error occurred (errors are logged, not raised).
        """
        try:
            queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")

            if timeout > 0:
                result = await self.redis_client.blpop(queue_key, timeout=timeout)
                if result:
                    # blpop returns (key, value).
                    _, data = result
                else:
                    return None
            else:
                data = await self.redis_client.lpop(queue_key)

            if data:
                message = QueueMessage.parse_raw(data)

                # Record the job as in-flight until it is completed or failed.
                processing_key = f"processing:{queue_name}"
                await self.redis_client.hset(
                    processing_key,
                    message.job.job_id,
                    message.json()
                )

                return message.job

            return None

        except Exception as e:
            logger.error(f"Failed to dequeue job: {e}")
            return None

    async def mark_completed(self, queue_name: str, job_id: str):
        """Remove ``job_id`` from the in-flight hash and bump the completed counter.

        Errors are logged and not raised.
        """
        try:
            processing_key = f"processing:{queue_name}"
            await self.redis_client.hdel(processing_key, job_id)

            await self.redis_client.hincrby("stats:completed", queue_name, 1)

            logger.info(f"Job {job_id} completed in {queue_name}")

        except Exception as e:
            logger.error(f"Failed to mark job as completed: {e}")

    async def mark_failed(self, queue_name: str, job: PipelineJob, error: str):
        """Handle a failed job: requeue it, or move it to the ``failed`` queue.

        While ``job.retry_count`` is below ``job.max_retries`` the count is
        incremented and the job is enqueued again on the same queue. After
        that, the error and failing stage are stored in ``job.data`` and the
        job is enqueued on ``failed``. Errors are logged and not raised.
        """
        try:
            processing_key = f"processing:{queue_name}"
            await self.redis_client.hdel(processing_key, job.job_id)

            if job.retry_count < job.max_retries:
                job.retry_count += 1
                await self.enqueue(queue_name, job)
                logger.info(f"Job {job.job_id} requeued (retry {job.retry_count}/{job.max_retries})")
            else:
                # Retries exhausted: park the job on the failed queue.
                job.data["error"] = error
                job.data["failed_stage"] = queue_name
                await self.enqueue("failed", job)

                await self.redis_client.hincrby("stats:failed", queue_name, 1)
                logger.error(f"Job {job.job_id} failed: {error}")

        except Exception as e:
            logger.error(f"Failed to mark job as failed: {e}")

    async def get_queue_stats(self) -> Dict[str, Any]:
        """Return queue depths and completed/failed counters.

        Returns:
            A dict with one ``{"pending", "processing"}`` entry per queue in
            ``QUEUES``, plus ``"completed"`` and ``"failed"`` holding the
            per-queue counters. Because the ``"failed"`` counters share their
            key with the queue named ``"failed"``, that queue's depth entry is
            additionally exposed under ``"failed_queue"``. Returns ``{}`` on
            error (logged, not raised).
        """
        try:
            stats = {}

            for name, key in self.QUEUES.items():
                stats[name] = {
                    "pending": await self.redis_client.llen(key),
                    "processing": await self.redis_client.hlen(f"processing:{name}"),
                }

            # Preserve the failed queue's depth before the counters below
            # overwrite the "failed" key.
            failed_queue_depth = stats.get("failed")
            if failed_queue_depth is not None:
                stats["failed_queue"] = failed_queue_depth

            # Completed / failed counters keyed by queue name.
            stats["completed"] = await self.redis_client.hgetall("stats:completed") or {}
            stats["failed"] = await self.redis_client.hgetall("stats:failed") or {}

            return stats

        except Exception as e:
            logger.error(f"Failed to get queue stats: {e}")
            return {}

    async def clear_queue(self, queue_name: str):
        """Delete a queue's list and its in-flight hash (intended for tests)."""
        queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")
        await self.redis_client.delete(queue_key)
        await self.redis_client.delete(f"processing:{queue_name}")
        logger.info(f"Queue {queue_name} cleared")
class TranslatorWorker:
    """Worker that takes jobs from the 'translation' queue and translates them via DeepL."""

    def __init__(self):
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )
        # The key must come from the environment. A credential must never be
        # embedded as a fallback: it leaks through version control and makes
        # the "not configured" check in start() unreachable.
        self.deepl_api_key = os.getenv("DEEPL_API_KEY")
        # DeepL Pro API endpoint.
        self.deepl_api_url = "https://api.deepl.com/v2/translate"

    async def start(self):
        """Connect to Redis and process translation jobs until stopped.

        Returns immediately (after logging an error) when no DeepL API key is
        configured. Errors inside the loop are logged and the loop continues
        after a one-second pause.
        """
        logger.info("Starting Translator Worker")

        await self.queue_manager.connect()

        if not self.deepl_api_key:
            logger.error("DeepL API key not configured")
            return

        # Main processing loop.
        while True:
            try:
                job = await self.queue_manager.dequeue('translation', timeout=5)

                if job:
                    await self.process_job(job)

            except Exception as e:
                logger.error(f"Error in worker loop: {e}")
                await asyncio.sleep(1)

    async def process_job(self, job: PipelineJob):
        """Translate every summarized item of ``job`` and pass it to the next stage.

        Reads ``job.data['summarized_items']``, translates each item's title
        and AI summary to English, stores the results in
        ``job.data['translated_items']`` and enqueues the job on
        ``image_generation``. A job with no items, or any exception, is
        reported through ``mark_failed``.
        """
        try:
            logger.info(f"Processing job {job.job_id} for translation")

            summarized_items = job.data.get('summarized_items', [])
            translated_items = []

            for item_data in summarized_items:
                summarized_item = SummarizedItem(**item_data)

                # enriched_item and rss_item are models, not dicts, so they
                # must be read by attribute.
                translated_title = await self._translate_text(
                    summarized_item.enriched_item.rss_item.title,
                    target_lang='EN'
                )

                translated_summary = await self._translate_text(
                    summarized_item.ai_summary,
                    target_lang='EN'
                )

                # TranslatedItem.summarized_item is declared as a plain dict,
                # so the model is converted before being stored.
                translated_item = TranslatedItem(
                    summarized_item=summarized_item.dict(),
                    translated_title=translated_title,
                    translated_summary=translated_summary,
                    target_language='en'
                )
                translated_items.append(translated_item)

                # Pause between items to stay under the API rate limit.
                await asyncio.sleep(0.5)

            if translated_items:
                logger.info(f"Translated {len(translated_items)} items")

                # Hand over to the next stage.
                job.data['translated_items'] = [item.dict() for item in translated_items]
                job.stages_completed.append('translation')
                job.stage = 'image_generation'

                await self.queue_manager.enqueue('image_generation', job)
                await self.queue_manager.mark_completed('translation', job.job_id)
            else:
                logger.warning(f"No items translated for job {job.job_id}")
                await self.queue_manager.mark_failed(
                    'translation',
                    job,
                    "No items to translate"
                )

        except Exception as e:
            logger.error(f"Error processing job {job.job_id}: {e}")
            await self.queue_manager.mark_failed('translation', job, str(e))

    async def _translate_text(self, text: str, target_lang: str = 'EN') -> str:
        """Translate Korean ``text`` with the DeepL API.

        Args:
            text: Source text; empty input returns ``""`` without a request.
            target_lang: DeepL target language code.

        Returns:
            The translated text, or the original ``text`` when the request
            fails or returns a non-200 status (best effort; the error is
            logged).
        """
        try:
            if not text:
                return ""

            async with httpx.AsyncClient() as client:
                response = await client.post(
                    self.deepl_api_url,
                    # Authenticate via header so the key is not carried in
                    # the request body alongside the payload.
                    headers={
                        'Authorization': f'DeepL-Auth-Key {self.deepl_api_key}'
                    },
                    data={
                        'text': text,
                        'target_lang': target_lang,
                        'source_lang': 'KO'
                    },
                    timeout=30
                )

                if response.status_code == 200:
                    result = response.json()
                    return result['translations'][0]['text']
                else:
                    logger.error(f"DeepL API error: {response.status_code}")
                    return text  # fall back to the original text

        except Exception as e:
            logger.error(f"Error translating text: {e}")
            return text  # fall back to the original text

    async def stop(self):
        """Disconnect from Redis."""
        await self.queue_manager.disconnect()
        logger.info("Translator Worker stopped")


async def main():
    """Run the translator worker until it exits or is interrupted."""
    worker = TranslatorWorker()

    try:
        await worker.start()
    except KeyboardInterrupt:
        logger.info("Received interrupt signal")
    finally:
        await worker.stop()


if __name__ == "__main__":
    asyncio.run(main())