From 070032006ea5bc6ad5058e4ead844a726ef6f0d5 Mon Sep 17 00:00:00 2001
From: jungwoo choi
Date: Sat, 13 Sep 2025 19:22:14 +0900
Subject: [PATCH] feat: Implement async queue-based news pipeline with microservices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Major architectural transformation from synchronous to asynchronous processing:

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)

## Key Features
- Redis-based job queue with deduplication
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with site11_ prefix

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API: [REDACTED — API keys must not appear in commit messages]
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 backup-services/ai-writer/backend/Dockerfile | 13 +
 .../ai-writer}/backend/app/__init__.py | 0
 .../backend/app/article_generator.py | 218 +++++
 backup-services/ai-writer/backend/app/main.py | 746 ++++++++++++++++++
 .../ai-writer/backend/app/queue_manager.py | 250 ++++++
 .../ai-writer/backend/app/queue_models.py | 49 ++
 .../ai-writer/backend/app/worker.py | 201 +++++
 .../backend/article_์ 
„๊ธฐ์ฐจ_analytical.json | 62 ++ .../backend/article_์ „๊ธฐ์ฐจ_investigative.json | 66 ++ .../backend/article_์ „๊ธฐ์ฐจ_professional.json | 62 ++ .../backend/custom_article_analytical.json | 63 ++ .../backend/custom_article_professional.json | 62 ++ .../ai-writer/backend/generated_article.json | 73 ++ .../ai-writer/backend/requirements.txt | 9 + .../ai-writer/backend/test_ai_writer.py | 168 ++++ .../backend/test_prompt_generation.py | 240 ++++++ backup-services/ai-writer/worker/Dockerfile | 19 + .../google-search/README.md | 0 .../google-search/backend/.env.example | 0 .../google-search/backend/Dockerfile | 0 .../google-search}/backend/app/__init__.py | 0 .../google-search/backend/app/config.py | 0 .../google-search/backend/app/main.py | 0 .../backend/app/search_service.py | 0 .../google-search/backend/requirements.txt | 0 .../news-aggregator/backend/Dockerfile | 13 + .../news-aggregator/backend/app/__init__.py | 0 .../news-aggregator/backend/app/main.py | 365 +++++++++ .../news-aggregator/backend/requirements.txt | 5 + .../backend/test_aggregator.py | 214 +++++ .../rss-feed/README.md | 0 .../rss-feed/backend/Dockerfile | 0 .../rss-feed/backend/app/__init__.py | 0 .../rss-feed/backend/app/config.py | 0 .../rss-feed/backend/app/feed_parser.py | 0 .../rss-feed/backend/app/google_rss.py | 115 +++ .../rss-feed/backend/app/main.py | 158 +++- .../rss-feed/backend/app/models.py | 0 .../rss-feed/backend/requirements.txt | 0 config/api-keys-backup.env | 18 + docker-compose.yml | 226 +++++- generated_article.json | 63 ++ services/pipeline/Makefile | 90 +++ services/pipeline/README.md | 154 ++++ services/pipeline/ai-summarizer/Dockerfile | 19 + .../pipeline/ai-summarizer/ai_summarizer.py | 161 ++++ .../pipeline/ai-summarizer/requirements.txt | 3 + services/pipeline/article-assembly/Dockerfile | 19 + .../article-assembly/article_assembly.py | 234 ++++++ .../article-assembly/requirements.txt | 5 + services/pipeline/fix_imports.py | 62 ++ 
services/pipeline/google-search/Dockerfile | 19 + .../pipeline/google-search/google_search.py | 153 ++++ .../pipeline/google-search/requirements.txt | 3 + services/pipeline/image-generator/Dockerfile | 15 + .../image-generator/image_generator.py | 225 ++++++ .../pipeline/image-generator/requirements.txt | 3 + services/pipeline/monitor/Dockerfile | 22 + services/pipeline/monitor/monitor.py | 349 ++++++++ services/pipeline/monitor/requirements.txt | 6 + services/pipeline/rss-collector/Dockerfile | 19 + .../pipeline/rss-collector/requirements.txt | 4 + .../pipeline/rss-collector/rss_collector.py | 192 +++++ services/pipeline/scheduler/Dockerfile | 19 + services/pipeline/scheduler/requirements.txt | 5 + services/pipeline/scheduler/scheduler.py | 203 +++++ services/pipeline/shared/__init__.py | 1 + services/pipeline/shared/models.py | 113 +++ services/pipeline/shared/queue_manager.py | 173 ++++ services/pipeline/shared/requirements.txt | 5 + services/pipeline/translator/Dockerfile | 15 + services/pipeline/translator/requirements.txt | 3 + services/pipeline/translator/translator.py | 154 ++++ 73 files changed, 5922 insertions(+), 4 deletions(-) create mode 100644 backup-services/ai-writer/backend/Dockerfile rename {services/google-search => backup-services/ai-writer}/backend/app/__init__.py (100%) create mode 100644 backup-services/ai-writer/backend/app/article_generator.py create mode 100644 backup-services/ai-writer/backend/app/main.py create mode 100644 backup-services/ai-writer/backend/app/queue_manager.py create mode 100644 backup-services/ai-writer/backend/app/queue_models.py create mode 100644 backup-services/ai-writer/backend/app/worker.py create mode 100644 backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_analytical.json create mode 100644 backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_investigative.json create mode 100644 backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_professional.json create mode 100644 
backup-services/ai-writer/backend/custom_article_analytical.json create mode 100644 backup-services/ai-writer/backend/custom_article_professional.json create mode 100644 backup-services/ai-writer/backend/generated_article.json create mode 100644 backup-services/ai-writer/backend/requirements.txt create mode 100755 backup-services/ai-writer/backend/test_ai_writer.py create mode 100644 backup-services/ai-writer/backend/test_prompt_generation.py create mode 100644 backup-services/ai-writer/worker/Dockerfile rename {services => backup-services}/google-search/README.md (100%) rename {services => backup-services}/google-search/backend/.env.example (100%) rename {services => backup-services}/google-search/backend/Dockerfile (100%) rename {services/rss-feed => backup-services/google-search}/backend/app/__init__.py (100%) rename {services => backup-services}/google-search/backend/app/config.py (100%) rename {services => backup-services}/google-search/backend/app/main.py (100%) rename {services => backup-services}/google-search/backend/app/search_service.py (100%) rename {services => backup-services}/google-search/backend/requirements.txt (100%) create mode 100644 backup-services/news-aggregator/backend/Dockerfile create mode 100644 backup-services/news-aggregator/backend/app/__init__.py create mode 100644 backup-services/news-aggregator/backend/app/main.py create mode 100644 backup-services/news-aggregator/backend/requirements.txt create mode 100755 backup-services/news-aggregator/backend/test_aggregator.py rename {services => backup-services}/rss-feed/README.md (100%) rename {services => backup-services}/rss-feed/backend/Dockerfile (100%) create mode 100644 backup-services/rss-feed/backend/app/__init__.py rename {services => backup-services}/rss-feed/backend/app/config.py (100%) rename {services => backup-services}/rss-feed/backend/app/feed_parser.py (100%) create mode 100644 backup-services/rss-feed/backend/app/google_rss.py rename {services => 
backup-services}/rss-feed/backend/app/main.py (69%) rename {services => backup-services}/rss-feed/backend/app/models.py (100%) rename {services => backup-services}/rss-feed/backend/requirements.txt (100%) create mode 100644 config/api-keys-backup.env create mode 100644 generated_article.json create mode 100644 services/pipeline/Makefile create mode 100644 services/pipeline/README.md create mode 100644 services/pipeline/ai-summarizer/Dockerfile create mode 100644 services/pipeline/ai-summarizer/ai_summarizer.py create mode 100644 services/pipeline/ai-summarizer/requirements.txt create mode 100644 services/pipeline/article-assembly/Dockerfile create mode 100644 services/pipeline/article-assembly/article_assembly.py create mode 100644 services/pipeline/article-assembly/requirements.txt create mode 100644 services/pipeline/fix_imports.py create mode 100644 services/pipeline/google-search/Dockerfile create mode 100644 services/pipeline/google-search/google_search.py create mode 100644 services/pipeline/google-search/requirements.txt create mode 100644 services/pipeline/image-generator/Dockerfile create mode 100644 services/pipeline/image-generator/image_generator.py create mode 100644 services/pipeline/image-generator/requirements.txt create mode 100644 services/pipeline/monitor/Dockerfile create mode 100644 services/pipeline/monitor/monitor.py create mode 100644 services/pipeline/monitor/requirements.txt create mode 100644 services/pipeline/rss-collector/Dockerfile create mode 100644 services/pipeline/rss-collector/requirements.txt create mode 100644 services/pipeline/rss-collector/rss_collector.py create mode 100644 services/pipeline/scheduler/Dockerfile create mode 100644 services/pipeline/scheduler/requirements.txt create mode 100644 services/pipeline/scheduler/scheduler.py create mode 100644 services/pipeline/shared/__init__.py create mode 100644 services/pipeline/shared/models.py create mode 100644 services/pipeline/shared/queue_manager.py create mode 100644 
services/pipeline/shared/requirements.txt create mode 100644 services/pipeline/translator/Dockerfile create mode 100644 services/pipeline/translator/requirements.txt create mode 100644 services/pipeline/translator/translator.py diff --git a/backup-services/ai-writer/backend/Dockerfile b/backup-services/ai-writer/backend/Dockerfile new file mode 100644 index 0000000..a296111 --- /dev/null +++ b/backup-services/ai-writer/backend/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . + +# Run the application +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file diff --git a/services/google-search/backend/app/__init__.py b/backup-services/ai-writer/backend/app/__init__.py similarity index 100% rename from services/google-search/backend/app/__init__.py rename to backup-services/ai-writer/backend/app/__init__.py diff --git a/backup-services/ai-writer/backend/app/article_generator.py b/backup-services/ai-writer/backend/app/article_generator.py new file mode 100644 index 0000000..2712cf0 --- /dev/null +++ b/backup-services/ai-writer/backend/app/article_generator.py @@ -0,0 +1,218 @@ +""" +Article Generation Module +Claude API๋ฅผ ์‚ฌ์šฉํ•œ ๊ธฐ์‚ฌ ์ƒ์„ฑ ๋กœ์ง +""" +from typing import Dict, Any, List, Optional +from datetime import datetime +import json +import uuid +import logging +from anthropic import AsyncAnthropic +from pydantic import BaseModel, Field + +logger = logging.getLogger(__name__) + +# Data Models +class NewsSource(BaseModel): + """๋‰ด์Šค ์†Œ์Šค ์ •๋ณด""" + title: str + url: str + published_date: Optional[str] = None + source_site: str = "Unknown" + +class EventInfo(BaseModel): + """์ด๋ฒคํŠธ ์ •๋ณด""" + name: str + date: Optional[str] = None + location: Optional[str] = None + +class Entities(BaseModel): + """์ถ”์ถœ๋œ ์—”ํ‹ฐํ‹ฐ""" + people: List[str] = 
Field(default_factory=list) + organizations: List[str] = Field(default_factory=list) + groups: List[str] = Field(default_factory=list) + countries: List[str] = Field(default_factory=list) + events: List[EventInfo] = Field(default_factory=list) + keywords: List[str] = Field(default_factory=list) + +class SubTopic(BaseModel): + """๊ธฐ์‚ฌ ์†Œ์ฃผ์ œ""" + title: str + content: List[str] + +class GeneratedArticle(BaseModel): + """์ƒ์„ฑ๋œ ๊ธฐ์‚ฌ""" + news_id: str = Field(default_factory=lambda: str(uuid.uuid4())) + title: str + summary: str + subtopics: List[SubTopic] + categories: List[str] + entities: Entities + sources: List[NewsSource] = Field(default_factory=list) + created_at: datetime = Field(default_factory=datetime.now) + generation_metadata: Dict[str, Any] = Field(default_factory=dict) + +async def generate_article_with_claude( + news_data: Dict[str, Any], + style: str = "professional", + claude_api_key: str = None +) -> GeneratedArticle: + """Claude API๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๊ธฐ์‚ฌ ์ƒ์„ฑ""" + + if not claude_api_key: + import os + claude_api_key = os.getenv("CLAUDE_API_KEY") + + # Initialize Claude client + claude_client = AsyncAnthropic(api_key=claude_api_key) + + # Collect source information + sources_info = [] + + # Prepare the prompt + system_prompt = """๋‹น์‹ ์€ ์ „๋ฌธ์ ์ธ ํ•œ๊ตญ ์–ธ๋ก ์‚ฌ์˜ ์ˆ˜์„ ๊ธฐ์ž์ž…๋‹ˆ๋‹ค. + ์ œ๊ณต๋œ ๋ฐ์ดํ„ฐ๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ๊นŠ์ด ์žˆ๊ณ  ํ†ต์ฐฐ๋ ฅ ์žˆ๋Š” ๊ธฐ์‚ฌ๋ฅผ ์ž‘์„ฑํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. + ๊ธฐ์‚ฌ๋Š” ๋‹ค์Œ ์š”๊ตฌ์‚ฌํ•ญ์„ ์ถฉ์กฑํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค: + + 1. ์†Œ์ฃผ์ œ๋Š” ์ตœ์†Œ 2๊ฐœ, ์ตœ๋Œ€ 6๊ฐœ๋กœ ๊ตฌ์„ฑํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 2. ๊ฐ ์†Œ์ฃผ์ œ๋Š” ์ตœ์†Œ 1๊ฐœ, ์ตœ๋Œ€ 10๊ฐœ์˜ ๋ฌธ๋‹จ์œผ๋กœ ๊ตฌ์„ฑํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 3. ์ „๋ฌธ์ ์ด๊ณ  ๊ฐ๊ด€์ ์ธ ์–ด์กฐ๋ฅผ ์œ ์ง€ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 4. ์‚ฌ์‹ค์— ๊ธฐ๋ฐ˜ํ•œ ๋ถ„์„๊ณผ ํ†ต์ฐฐ์„ ์ œ๊ณตํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 5. ํ•œ๊ตญ ๋…์ž๋ฅผ ๋Œ€์ƒ์œผ๋กœ ์ž‘์„ฑ๋˜์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 6. ์ด๋ฒคํŠธ ์ •๋ณด๋Š” ๊ฐ€๋Šฅํ•œ ์ผ์‹œ์™€ ์žฅ์†Œ๋ฅผ ํฌํ•จํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 7. 
ํ•ต์‹ฌ ํ‚ค์›Œ๋“œ๋ฅผ ์ตœ๋Œ€ 10๊ฐœ๊นŒ์ง€ ์ถ”์ถœํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + + ๋ฐ˜๋“œ์‹œ ๋‹ค์Œ JSON ํ˜•์‹์œผ๋กœ ์‘๋‹ตํ•˜์„ธ์š”: + { + "title": "๊ธฐ์‚ฌ ์ œ๋ชฉ", + "summary": "ํ•œ ์ค„ ์š”์•ฝ (100์ž ์ด๋‚ด)", + "subtopics": [ + { + "title": "์†Œ์ฃผ์ œ ์ œ๋ชฉ", + "content": ["๋ฌธ๋‹จ1", "๋ฌธ๋‹จ2", ...] // 1-10๊ฐœ ๋ฌธ๋‹จ + } + ], // 2-6๊ฐœ ์†Œ์ฃผ์ œ + "categories": ["์นดํ…Œ๊ณ ๋ฆฌ1", "์นดํ…Œ๊ณ ๋ฆฌ2"], + "entities": { + "people": ["์ธ๋ฌผ1", "์ธ๋ฌผ2"], + "organizations": ["๊ธฐ๊ด€1", "๊ธฐ๊ด€2"], + "groups": ["๋‹จ์ฒด1", "๋‹จ์ฒด2"], + "countries": ["๋‚˜๋ผ1", "๋‚˜๋ผ2"], + "events": [ + { + "name": "์ด๋ฒคํŠธ๋ช…", + "date": "2025๋…„ 1์›” 15์ผ", // ์„ ํƒ์‚ฌํ•ญ + "location": "์„œ์šธ ์ฝ”์—‘์Šค" // ์„ ํƒ์‚ฌํ•ญ + } + ], + "keywords": ["ํ‚ค์›Œ๋“œ1", "ํ‚ค์›Œ๋“œ2", ...] // ์ตœ๋Œ€ 10๊ฐœ + } + }""" + + # Prepare news content for Claude and collect sources + news_content = [] + for item in news_data.get("news_items", []): + # Add RSS source info + rss_title = item.get('rss_title', '') + rss_link = item.get('rss_link', '') + rss_published = item.get('rss_published', '') + + if rss_title and rss_link: + sources_info.append(NewsSource( + title=rss_title, + url=rss_link, + published_date=rss_published, + source_site="RSS Feed" + )) + + item_text = f"์ œ๋ชฉ: {rss_title}\n" + for result in item.get("google_results", []): + # Add Google search result sources + if "title" in result and "link" in result: + sources_info.append(NewsSource( + title=result.get('title', ''), + url=result.get('link', ''), + published_date=None, + source_site="Google Search" + )) + + if "full_content" in result and result["full_content"]: + content = result["full_content"] + if isinstance(content, dict): + item_text += f"์ถœ์ฒ˜: {content.get('url', '')}\n" + item_text += f"๋‚ด์šฉ: {content.get('content', '')[:1000]}...\n\n" + else: + item_text += f"๋‚ด์šฉ: {str(content)[:1000]}...\n\n" + news_content.append(item_text) + + combined_content = "\n".join(news_content[:10]) # Limit to prevent token overflow + + user_prompt = 
f"""๋‹ค์Œ ๋‰ด์Šค ๋ฐ์ดํ„ฐ๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์ข…ํ•ฉ์ ์ธ ๊ธฐ์‚ฌ๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”: + +ํ‚ค์›Œ๋“œ: {news_data.get('keyword', '')} +์ˆ˜์ง‘๋œ ๋‰ด์Šค ์ˆ˜: {len(news_data.get('news_items', []))} + +๋‰ด์Šค ๋‚ด์šฉ: +{combined_content} + +์Šคํƒ€์ผ: {style} +- professional: ์ „ํ†ต์ ์ธ ๋‰ด์Šค ๊ธฐ์‚ฌ ์Šคํƒ€์ผ +- analytical: ๋ถ„์„์ ์ด๊ณ  ์‹ฌ์ธต์ ์ธ ์Šคํƒ€์ผ +- investigative: ํƒ์‚ฌ๋ณด๋„ ์Šคํƒ€์ผ + +์œ„์˜ ๋ฐ์ดํ„ฐ๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ํ†ต์ฐฐ๋ ฅ ์žˆ๋Š” ๊ธฐ์‚ฌ๋ฅผ JSON ํ˜•์‹์œผ๋กœ ์ž‘์„ฑํ•ด์ฃผ์„ธ์š”.""" + + try: + # Call Claude API + response = await claude_client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=4000, + temperature=0.7, + system=system_prompt, + messages=[ + {"role": "user", "content": user_prompt} + ] + ) + + # Parse response + content = response.content[0].text + + # Extract JSON from response + json_start = content.find('{') + json_end = content.rfind('}') + 1 + if json_start != -1 and json_end > json_start: + json_str = content[json_start:json_end] + article_data = json.loads(json_str) + else: + raise ValueError("No valid JSON found in response") + + # Create article object + article = GeneratedArticle( + title=article_data.get("title", ""), + summary=article_data.get("summary", ""), + subtopics=[ + SubTopic( + title=st.get("title", ""), + content=st.get("content", []) + ) for st in article_data.get("subtopics", []) + ], + categories=article_data.get("categories", []), + entities=Entities(**article_data.get("entities", {})), + sources=sources_info, + generation_metadata={ + "style": style, + "keyword": news_data.get('keyword', ''), + "model": "claude-3-5-sonnet-20241022", + "timestamp": datetime.now().isoformat() + } + ) + + logger.info(f"Successfully generated article: {article.title}") + return article + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse Claude response as JSON: {e}") + raise + except Exception as e: + logger.error(f"Error generating article with Claude: {e}") + raise \ No newline at end of 
file diff --git a/backup-services/ai-writer/backend/app/main.py b/backup-services/ai-writer/backend/app/main.py new file mode 100644 index 0000000..1d5751a --- /dev/null +++ b/backup-services/ai-writer/backend/app/main.py @@ -0,0 +1,746 @@ +""" +AI Writer Service +Claude API๋ฅผ ์‚ฌ์šฉํ•œ ์ „๋ฌธ์ ์ธ ๋‰ด์Šค ๊ธฐ์‚ฌ ์ƒ์„ฑ ์„œ๋น„์Šค +""" +from fastapi import FastAPI, HTTPException, BackgroundTasks +from fastapi.middleware.cors import CORSMiddleware +from typing import List, Dict, Any, Optional +from datetime import datetime +from pydantic import BaseModel, Field +import httpx +import asyncio +import logging +import json +import uuid +from anthropic import AsyncAnthropic +import os + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +app = FastAPI( + title="AI Writer Service", + description="Claude API๋ฅผ ์‚ฌ์šฉํ•œ ์ „๋ฌธ์ ์ธ ๋‰ด์Šค ๊ธฐ์‚ฌ ์ƒ์„ฑ ์„œ๋น„์Šค", + version="1.0.0" +) + +# CORS ์„ค์ • +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Configuration +NEWS_AGGREGATOR_URL = os.getenv("NEWS_AGGREGATOR_URL", "http://news-aggregator-backend:8000") +CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY", "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") +MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://mongodb:27017") +DB_NAME = os.getenv("DB_NAME", "ai_writer_db") + +# Claude client +claude_client = AsyncAnthropic(api_key=CLAUDE_API_KEY) + +# HTTP Client +http_client = httpx.AsyncClient(timeout=120.0) + +# Queue Manager +from app.queue_manager import RedisQueueManager +from app.queue_models import NewsJobData, JobResult, JobStatus, QueueStats +queue_manager = RedisQueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") +) + +# MongoDB client (optional for storing generated articles) +from motor.motor_asyncio import AsyncIOMotorClient +mongo_client = None 
+db = None + +# Data Models +class NewsSource(BaseModel): + """์ฐธ๊ณ ํ•œ ๋‰ด์Šค ์†Œ์Šค ์ •๋ณด""" + title: str = Field(..., description="๋‰ด์Šค ์ œ๋ชฉ") + url: str = Field(..., description="๋‰ด์Šค URL") + published_date: Optional[str] = Field(None, description="๋ฐœํ–‰์ผ") + source_site: Optional[str] = Field(None, description="์ถœ์ฒ˜ ์‚ฌ์ดํŠธ") +class SubTopic(BaseModel): + """๊ธฐ์‚ฌ ์†Œ์ฃผ์ œ""" + title: str = Field(..., description="์†Œ์ฃผ์ œ ์ œ๋ชฉ") + content: List[str] = Field(..., description="์†Œ์ฃผ์ œ ๋‚ด์šฉ (๋ฌธ๋‹จ ๋ฆฌ์ŠคํŠธ)", min_items=1, max_items=10) + +class Event(BaseModel): + """์ด๋ฒคํŠธ ์ •๋ณด""" + name: str = Field(..., description="์ด๋ฒคํŠธ๋ช…") + date: Optional[str] = Field(None, description="์ผ์‹œ") + location: Optional[str] = Field(None, description="์žฅ์†Œ") + +class NewsEntities(BaseModel): + """๋‰ด์Šค์— ํฌํ•จ๋œ ๊ฐœ์ฒด๋“ค""" + people: List[str] = Field(default_factory=list, description="๋‰ด์Šค์— ํฌํ•จ๋œ ์ธ๋ฌผ") + organizations: List[str] = Field(default_factory=list, description="๋‰ด์Šค์— ํฌํ•จ๋œ ๊ธฐ๊ด€") + groups: List[str] = Field(default_factory=list, description="๋‰ด์Šค์— ํฌํ•จ๋œ ๋‹จ์ฒด") + countries: List[str] = Field(default_factory=list, description="๋‰ด์Šค์— ํฌํ•จ๋œ ๋‚˜๋ผ") + events: List[Event] = Field(default_factory=list, description="๋‰ด์Šค์— ํฌํ•จ๋œ ์ผ์ •/์ด๋ฒคํŠธ (์ผ์‹œ์™€ ์žฅ์†Œ ํฌํ•จ)") + keywords: List[str] = Field(default_factory=list, description="ํ•ต์‹ฌ ํ‚ค์›Œ๋“œ (์ตœ๋Œ€ 10๊ฐœ)", max_items=10) + +class GeneratedArticle(BaseModel): + """์ƒ์„ฑ๋œ ๊ธฐ์‚ฌ""" + news_id: str = Field(..., description="๋‰ด์Šค ์•„์ด๋””") + title: str = Field(..., description="๋‰ด์Šค ์ œ๋ชฉ") + created_at: str = Field(..., description="์ƒ์„ฑ๋…„์›”์ผ์‹œ๋ถ„์ดˆ") + summary: str = Field(..., description="ํ•œ ์ค„ ์š”์•ฝ") + subtopics: List[SubTopic] = Field(..., description="์†Œ์ฃผ์ œ ๋ฆฌ์ŠคํŠธ", min_items=2, max_items=6) + categories: List[str] = Field(..., description="์นดํ…Œ๊ณ ๋ฆฌ ๋ฆฌ์ŠคํŠธ") + entities: NewsEntities = 
Field(..., description="๋‰ด์Šค์— ํฌํ•จ๋œ ๊ฐœ์ฒด๋“ค") + source_keyword: Optional[str] = Field(None, description="์›๋ณธ ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ") + source_count: Optional[int] = Field(None, description="์ฐธ์กฐํ•œ ์†Œ์Šค ์ˆ˜") + sources: List[NewsSource] = Field(default_factory=list, description="์ฐธ๊ณ ํ•œ ๋‰ด์Šค ์†Œ์Šค ๋ชฉ๋ก") + +class ArticleGenerationRequest(BaseModel): + """๊ธฐ์‚ฌ ์ƒ์„ฑ ์š”์ฒญ""" + keyword: str = Field(..., description="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ") + limit: int = Field(5, description="์ฒ˜๋ฆฌํ•  RSS ํ•ญ๋ชฉ ์ˆ˜", ge=1, le=20) + google_results_per_title: int = Field(3, description="๊ฐ ์ œ๋ชฉ๋‹น ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ˆ˜", ge=1, le=10) + lang: str = Field("ko", description="์–ธ์–ด ์ฝ”๋“œ") + country: str = Field("KR", description="๊ตญ๊ฐ€ ์ฝ”๋“œ") + style: str = Field("professional", description="๊ธฐ์‚ฌ ์Šคํƒ€์ผ (professional/analytical/investigative)") + +class PerItemGenerationRequest(BaseModel): + """๊ฐœ๋ณ„ ์•„์ดํ…œ๋ณ„ ๊ธฐ์‚ฌ ์ƒ์„ฑ ์š”์ฒญ""" + keyword: str = Field(..., description="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ") + limit: Optional[int] = Field(None, description="์ฒ˜๋ฆฌํ•  RSS ํ•ญ๋ชฉ ์ˆ˜ (None์ด๋ฉด ์ „์ฒด)") + google_results_per_title: int = Field(3, description="๊ฐ ์ œ๋ชฉ๋‹น ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ˆ˜", ge=1, le=10) + lang: str = Field("ko", description="์–ธ์–ด ์ฝ”๋“œ") + country: str = Field("KR", description="๊ตญ๊ฐ€ ์ฝ”๋“œ") + style: str = Field("professional", description="๊ธฐ์‚ฌ ์Šคํƒ€์ผ (professional/analytical/investigative)") + skip_existing: bool = Field(True, description="์ด๋ฏธ ์ƒ์„ฑ๋œ ๊ธฐ์‚ฌ๋Š” ๊ฑด๋„ˆ๋›ฐ๊ธฐ") + +@app.on_event("startup") +async def startup(): + """์„œ๋น„์Šค ์‹œ์ž‘""" + global mongo_client, db + try: + mongo_client = AsyncIOMotorClient(MONGODB_URL) + db = mongo_client[DB_NAME] + logger.info("AI Writer Service starting...") + logger.info(f"Connected to MongoDB: {MONGODB_URL}") + + # Redis ํ ์—ฐ๊ฒฐ + await queue_manager.connect() + logger.info("Connected to Redis queue") + except Exception as e: + logger.error(f"Failed to connect to 
services: {e}") + +@app.on_event("shutdown") +async def shutdown(): + """์„œ๋น„์Šค ์ข…๋ฃŒ""" + await http_client.aclose() + if mongo_client: + mongo_client.close() + await queue_manager.disconnect() + logger.info("AI Writer Service stopped") + +@app.get("/") +async def root(): + return { + "service": "AI Writer Service", + "version": "1.0.0", + "description": "Claude API๋ฅผ ์‚ฌ์šฉํ•œ ์ „๋ฌธ์ ์ธ ๋‰ด์Šค ๊ธฐ์‚ฌ ์ƒ์„ฑ ์„œ๋น„์Šค", + "endpoints": { + "generate_article": "POST /api/generate", + "generate_per_item": "POST /api/generate/per-item", + "generate_from_aggregated": "POST /api/generate/from-aggregated", + "get_article": "GET /api/articles/{article_id}", + "list_articles": "GET /api/articles", + "health": "GET /health" + } + } + +@app.get("/health") +async def health_check(): + """ํ—ฌ์Šค ์ฒดํฌ""" + try: + # Check News Aggregator service + aggregator_response = await http_client.get(f"{NEWS_AGGREGATOR_URL}/health") + aggregator_healthy = aggregator_response.status_code == 200 + + # Check MongoDB + mongo_healthy = False + if db is not None: + await db.command("ping") + mongo_healthy = True + + return { + "status": "healthy" if (aggregator_healthy and mongo_healthy) else "degraded", + "services": { + "news_aggregator": "healthy" if aggregator_healthy else "unhealthy", + "mongodb": "healthy" if mongo_healthy else "unhealthy", + "claude_api": "configured" + }, + "timestamp": datetime.now().isoformat() + } + except Exception as e: + return { + "status": "unhealthy", + "error": str(e), + "timestamp": datetime.now().isoformat() + } + +async def generate_article_with_claude(news_data: Dict[str, Any], style: str = "professional") -> GeneratedArticle: + """Claude API๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๊ธฐ์‚ฌ ์ƒ์„ฑ""" + + # Collect source information + sources_info = [] + + # Prepare the prompt + system_prompt = """๋‹น์‹ ์€ ์ „๋ฌธ์ ์ธ ํ•œ๊ตญ ์–ธ๋ก ์‚ฌ์˜ ์ˆ˜์„ ๊ธฐ์ž์ž…๋‹ˆ๋‹ค. + ์ œ๊ณต๋œ ๋ฐ์ดํ„ฐ๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ๊นŠ์ด ์žˆ๊ณ  ํ†ต์ฐฐ๋ ฅ ์žˆ๋Š” ๊ธฐ์‚ฌ๋ฅผ ์ž‘์„ฑํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. 
+ ๊ธฐ์‚ฌ๋Š” ๋‹ค์Œ ์š”๊ตฌ์‚ฌํ•ญ์„ ์ถฉ์กฑํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค: + + 1. ์†Œ์ฃผ์ œ๋Š” ์ตœ์†Œ 2๊ฐœ, ์ตœ๋Œ€ 6๊ฐœ๋กœ ๊ตฌ์„ฑํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 2. ๊ฐ ์†Œ์ฃผ์ œ๋Š” ์ตœ์†Œ 1๊ฐœ, ์ตœ๋Œ€ 10๊ฐœ์˜ ๋ฌธ๋‹จ์œผ๋กœ ๊ตฌ์„ฑํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 3. ์ „๋ฌธ์ ์ด๊ณ  ๊ฐ๊ด€์ ์ธ ์–ด์กฐ๋ฅผ ์œ ์ง€ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 4. ์‚ฌ์‹ค์— ๊ธฐ๋ฐ˜ํ•œ ๋ถ„์„๊ณผ ํ†ต์ฐฐ์„ ์ œ๊ณตํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 5. ํ•œ๊ตญ ๋…์ž๋ฅผ ๋Œ€์ƒ์œผ๋กœ ์ž‘์„ฑ๋˜์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 6. ์ด๋ฒคํŠธ ์ •๋ณด๋Š” ๊ฐ€๋Šฅํ•œ ์ผ์‹œ์™€ ์žฅ์†Œ๋ฅผ ํฌํ•จํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + 7. ํ•ต์‹ฌ ํ‚ค์›Œ๋“œ๋ฅผ ์ตœ๋Œ€ 10๊ฐœ๊นŒ์ง€ ์ถ”์ถœํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค + + ๋ฐ˜๋“œ์‹œ ๋‹ค์Œ JSON ํ˜•์‹์œผ๋กœ ์‘๋‹ตํ•˜์„ธ์š”: + { + "title": "๊ธฐ์‚ฌ ์ œ๋ชฉ", + "summary": "ํ•œ ์ค„ ์š”์•ฝ (100์ž ์ด๋‚ด)", + "subtopics": [ + { + "title": "์†Œ์ฃผ์ œ ์ œ๋ชฉ", + "content": ["๋ฌธ๋‹จ1", "๋ฌธ๋‹จ2", ...] // 1-10๊ฐœ ๋ฌธ๋‹จ + } + ], // 2-6๊ฐœ ์†Œ์ฃผ์ œ + "categories": ["์นดํ…Œ๊ณ ๋ฆฌ1", "์นดํ…Œ๊ณ ๋ฆฌ2"], + "entities": { + "people": ["์ธ๋ฌผ1", "์ธ๋ฌผ2"], + "organizations": ["๊ธฐ๊ด€1", "๊ธฐ๊ด€2"], + "groups": ["๋‹จ์ฒด1", "๋‹จ์ฒด2"], + "countries": ["๋‚˜๋ผ1", "๋‚˜๋ผ2"], + "events": [ + { + "name": "์ด๋ฒคํŠธ๋ช…", + "date": "2025๋…„ 1์›” 15์ผ", // ์„ ํƒ์‚ฌํ•ญ + "location": "์„œ์šธ ์ฝ”์—‘์Šค" // ์„ ํƒ์‚ฌํ•ญ + } + ], + "keywords": ["ํ‚ค์›Œ๋“œ1", "ํ‚ค์›Œ๋“œ2", ...] 
// ์ตœ๋Œ€ 10๊ฐœ + } + }""" + + # Prepare news content for Claude and collect sources + news_content = [] + for item in news_data.get("news_items", []): + # Add RSS source info + rss_title = item.get('rss_title', '') + rss_link = item.get('rss_link', '') + rss_published = item.get('rss_published', '') + + if rss_title and rss_link: + sources_info.append(NewsSource( + title=rss_title, + url=rss_link, + published_date=rss_published, + source_site="RSS Feed" + )) + + item_text = f"์ œ๋ชฉ: {rss_title}\n" + for result in item.get("google_results", []): + # Add Google search result sources + if "title" in result and "link" in result: + sources_info.append(NewsSource( + title=result.get('title', ''), + url=result.get('link', ''), + published_date=None, + source_site="Google Search" + )) + + if "full_content" in result and result["full_content"]: + content = result["full_content"] + if isinstance(content, dict): + item_text += f"์ถœ์ฒ˜: {content.get('url', '')}\n" + item_text += f"๋‚ด์šฉ: {content.get('content', '')[:1000]}...\n\n" + else: + item_text += f"๋‚ด์šฉ: {str(content)[:1000]}...\n\n" + news_content.append(item_text) + + combined_content = "\n".join(news_content[:10]) # Limit to prevent token overflow + + user_prompt = f"""๋‹ค์Œ ๋‰ด์Šค ๋ฐ์ดํ„ฐ๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์ข…ํ•ฉ์ ์ธ ๊ธฐ์‚ฌ๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”: + +ํ‚ค์›Œ๋“œ: {news_data.get('keyword', '')} +์ˆ˜์ง‘๋œ ๋‰ด์Šค ์ˆ˜: {len(news_data.get('news_items', []))} + +๋‰ด์Šค ๋‚ด์šฉ: +{combined_content} + +์Šคํƒ€์ผ: {style} +- professional: ์ „ํ†ต์ ์ธ ๋‰ด์Šค ๊ธฐ์‚ฌ ์Šคํƒ€์ผ +- analytical: ๋ถ„์„์ ์ด๊ณ  ์‹ฌ์ธต์ ์ธ ์Šคํƒ€์ผ +- investigative: ํƒ์‚ฌ๋ณด๋„ ์Šคํƒ€์ผ + +์œ„์˜ ๋ฐ์ดํ„ฐ๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ํ†ต์ฐฐ๋ ฅ ์žˆ๋Š” ๊ธฐ์‚ฌ๋ฅผ JSON ํ˜•์‹์œผ๋กœ ์ž‘์„ฑํ•ด์ฃผ์„ธ์š”.""" + + try: + # Call Claude API + response = await claude_client.messages.create( + model="claude-3-5-sonnet-20241022", # Latest Claude model + max_tokens=4000, + temperature=0.7, + system=system_prompt, + messages=[ + {"role": "user", "content": user_prompt} 
@app.post("/api/generate")
async def generate_article(request: ArticleGenerationRequest):
    """Run the whole pipeline for one keyword and return a single combined article.

    Fetches aggregated news (RSS items plus Google search results) from the
    News Aggregator service, asks Claude to write one article covering all
    returned items, and stores it in MongoDB when a database handle is
    available. A failed save is logged and does not fail the request.

    Args:
        request: Keyword, limits, language/country and article style.

    Returns:
        The generated article.

    Raises:
        HTTPException: 404 when the aggregator returns no news items; the
            aggregator's own status code when it answers with an HTTP error;
            500 for any other failure.
    """
    try:
        # Step 1: Get aggregated news from News Aggregator service
        logger.info(f"Fetching aggregated news for keyword: {request.keyword}")

        aggregator_response = await http_client.get(
            f"{NEWS_AGGREGATOR_URL}/api/aggregate",
            params={
                "q": request.keyword,
                "limit": request.limit,
                "google_results_per_title": request.google_results_per_title,
                "lang": request.lang,
                "country": request.country
            }
        )
        aggregator_response.raise_for_status()
        news_data = aggregator_response.json()

        if not news_data.get("news_items"):
            raise HTTPException(status_code=404, detail="No news items found for the given keyword")

        # Step 2: Generate article using Claude
        logger.info(f"Generating article with Claude for {len(news_data['news_items'])} news items")
        article = await generate_article_with_claude(news_data, request.style)

        # Step 3: Store article in MongoDB (optional)
        if db is not None:
            try:
                article_dict = article.dict()
                await db.articles.insert_one(article_dict)
                logger.info(f"Article saved with ID: {article.news_id}")
            except Exception as e:
                # Persistence is best-effort: the caller still gets the article.
                logger.error(f"Failed to save article to MongoDB: {e}")

        return article

    except HTTPException:
        # HTTP errors raised on purpose above (e.g. the 404) must reach the
        # client unchanged instead of being re-wrapped as a 500 below.
        raise
    except httpx.HTTPStatusError as e:
        logger.error(f"HTTP error from aggregator service: {e}")
        raise HTTPException(status_code=e.response.status_code, detail=str(e))
    except Exception as e:
        logger.error(f"Error in generate_article: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/articles")
async def list_articles(
    skip: int = 0,
    limit: int = 10,
    keyword: Optional[str] = None,
    category: Optional[str] = None
):
    """List stored articles, sorted by ``created_at`` descending.

    Args:
        skip: Number of matching documents to skip.
        limit: Maximum number of documents to return.
        keyword: Optional case-insensitive substring to match against each
            article's ``source_keyword``. It is matched literally.
        category: Optional category that must appear in ``categories``.

    Returns:
        A dict with the page of ``articles`` (without ``_id``), the ``total``
        number of matching documents, and the ``skip``/``limit`` used.

    Raises:
        HTTPException: 503 when no database handle is available.
    """
    import re  # local import: only needed to escape the keyword below

    if db is None:
        raise HTTPException(status_code=503, detail="Database not available")

    query = {}
    if keyword:
        # Escape the user-supplied text so regex metacharacters are matched
        # literally; a raw pattern could be invalid (server error) or
        # deliberately expensive to evaluate.
        query["source_keyword"] = {"$regex": re.escape(keyword), "$options": "i"}
    if category:
        query["categories"] = category

    cursor = db.articles.find(query).skip(skip).limit(limit).sort("created_at", -1)
    articles = []
    async for article in cursor:
        # Drop the MongoDB _id field before returning the document.
        article.pop("_id", None)
        articles.append(article)

    total = await db.articles.count_documents(query)

    return {
        "articles": articles,
        "total": total,
        "skip": skip,
        "limit": limit
    }
@app.post("/api/generate/batch")
async def generate_batch_articles(keywords: List[str], style: str = "professional"):
    """Generate one article per keyword and report successes and failures.

    Only the first five keywords are processed. Each keyword goes through
    ``generate_article``; a failure for one keyword is recorded and does not
    stop the rest.

    Returns:
        A dict with ``success`` entries (keyword, article id, title),
        ``errors`` entries (keyword, error text) and ``total_processed``.
    """
    max_keywords = 5  # cap the batch to prevent overload
    succeeded, failed = [], []

    for kw in keywords[:max_keywords]:
        try:
            generated = await generate_article(
                ArticleGenerationRequest(keyword=kw, style=style)
            )
            outcome = {
                "keyword": kw,
                "status": "success",
                "article_id": generated.news_id,
                "title": generated.title,
            }
        except Exception as exc:
            failed.append({
                "keyword": kw,
                "status": "error",
                "error": str(exc),
            })
        else:
            succeeded.append(outcome)

    return {
        "success": succeeded,
        "errors": failed,
        "total_processed": len(succeeded) + len(failed),
    }
(์ตœ๊ทผ 24์‹œ๊ฐ„ ๋‚ด) + from datetime import datetime, timedelta + cutoff_time = (datetime.now() - timedelta(hours=24)).isoformat() + + existing_cursor = db.articles.find( + { + "source_keyword": request.keyword, + "created_at": {"$gte": cutoff_time} + }, + {"sources": 1} + ) + + async for doc in existing_cursor: + for source in doc.get("sources", []): + if source.get("source_site") == "RSS Feed": + existing_titles.add(source.get("title", "")) + + # Step 3: Generate individual article for each RSS item + generated_articles = [] + + for item in news_data["news_items"]: + try: + rss_title = item.get('rss_title', '') + + # Skip if already exists + if request.skip_existing and rss_title in existing_titles: + logger.info(f"Skipping already generated article: {rss_title}") + skipped_count += 1 + continue + + logger.info(f"Generating article for RSS item: {rss_title or 'Unknown'}") + + # Create individual news_data for this item + individual_news_data = { + "keyword": news_data.get("keyword"), + "news_items": [item] # Single item only + } + + # Generate article for this single item + article = await generate_article_with_claude(individual_news_data, request.style) + + # Store in MongoDB + if db is not None: + try: + article_dict = article.dict() + await db.articles.insert_one(article_dict) + logger.info(f"Article saved with ID: {article.news_id}") + except Exception as e: + logger.error(f"Failed to save article to MongoDB: {e}") + + generated_articles.append(article) + + except Exception as e: + logger.error(f"Failed to generate article for item: {e}") + # Continue with next item even if one fails + continue + + if not generated_articles and skipped_count == 0: + raise HTTPException(status_code=500, detail="Failed to generate any articles") + + # Return all generated articles + return { + "total_generated": len(generated_articles), + "total_items": len(news_data["news_items"]), + "skipped_duplicates": skipped_count, + "articles": generated_articles + } + + except 
@app.post("/api/queue/enqueue")
async def enqueue_items(request: PerItemGenerationRequest):
    """Queue one article-generation job per RSS item for background processing.

    Fetches aggregated news for the keyword from the News Aggregator service.
    When ``skip_existing`` is set and a database handle exists, it skips items
    whose RSS title already appears as a source of an article created for the
    same keyword in the last 24 hours. Every remaining item is enqueued
    through ``queue_manager``.

    Returns:
        A dict with the number of enqueued jobs, the number of items received,
        the number skipped as duplicates, the list of enqueued jobs and a
        summary message.

    Raises:
        HTTPException: 404 when the aggregator returns no news items; the
            aggregator's own status code when it answers with an HTTP error;
            500 for any other failure.
    """
    # Imported unconditionally at function scope. The original imported these
    # inside the skip_existing branch, which made `datetime` a local name that
    # stayed unbound (UnboundLocalError at `datetime.now()` below) whenever
    # that branch was not taken.
    from datetime import datetime, timedelta

    try:
        # Step 1: Get aggregated news from News Aggregator service
        logger.info(f"Fetching aggregated news for enqueue: {request.keyword}")

        # No explicit limit means "everything", capped at 100 items.
        actual_limit = request.limit if request.limit is not None else 100

        aggregator_response = await http_client.get(
            f"{NEWS_AGGREGATOR_URL}/api/aggregate",
            params={
                "q": request.keyword,
                "limit": actual_limit,
                "google_results_per_title": request.google_results_per_title,
                "lang": request.lang,
                "country": request.country
            }
        )
        aggregator_response.raise_for_status()
        news_data = aggregator_response.json()

        if not news_data.get("news_items"):
            raise HTTPException(status_code=404, detail="No news items found for the given keyword")

        # Step 2: Check for existing articles if skip_existing is True
        existing_titles = set()
        skipped_count = 0

        if request.skip_existing and db is not None:
            # created_at is compared as an ISO-8601 string, matching how it is
            # written elsewhere in this service.
            cutoff_time = (datetime.now() - timedelta(hours=24)).isoformat()

            existing_cursor = db.articles.find(
                {
                    "source_keyword": request.keyword,
                    "created_at": {"$gte": cutoff_time}
                },
                {"sources": 1}
            )

            async for doc in existing_cursor:
                for source in doc.get("sources", []):
                    if source.get("source_site") == "RSS Feed":
                        existing_titles.add(source.get("title", ""))

        # Step 3: Enqueue items for processing
        enqueued_jobs = []

        for item in news_data["news_items"]:
            rss_title = item.get('rss_title', '')

            # Skip if already exists
            if request.skip_existing and rss_title in existing_titles:
                logger.info(f"Skipping already generated article: {rss_title}")
                skipped_count += 1
                continue

            # Create job data
            job_data = NewsJobData(
                job_id=str(uuid.uuid4()),
                keyword=request.keyword,
                rss_title=rss_title,
                rss_link=item.get('rss_link'),
                rss_published=item.get('rss_published'),
                google_results=item.get('google_results', []),
                style=request.style,
                created_at=datetime.now()
            )

            # Enqueue job
            job_id = await queue_manager.enqueue(job_data)
            enqueued_jobs.append({
                "job_id": job_id,
                "title": rss_title[:100]
            })

            logger.info(f"Enqueued job {job_id} for: {rss_title}")

        return {
            "total_enqueued": len(enqueued_jobs),
            "total_items": len(news_data["news_items"]),
            "skipped_duplicates": skipped_count,
            "jobs": enqueued_jobs,
            "message": f"{len(enqueued_jobs)} jobs added to queue for processing"
        }

    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 404 above) intact instead of
        # letting the generic handler turn them into a 500.
        raise
    except httpx.HTTPStatusError as e:
        logger.error(f"HTTP error from aggregator service: {e}")
        raise HTTPException(status_code=e.response.status_code, detail=str(e))
    except Exception as e:
        logger.error(f"Error in enqueue_items: {e}")
        raise HTTPException(status_code=500, detail=str(e))
class RedisQueueManager:
    """Redis-backed job queue for the AI Writer service.

    Jobs are stored as JSON strings in Redis lists: a pending list, a
    processing list, and capped lists of completed and failed results.
    Running counters are kept in a Redis hash, and worker heartbeats in a set
    plus one expiring key per worker.
    """

    def __init__(self, redis_url: str = "redis://redis:6379"):
        self.redis_url = redis_url
        self.redis_client: Optional[redis.Redis] = None

        # Redis key layout
        self.QUEUE_KEY = "ai_writer:queue:pending"
        self.PROCESSING_KEY = "ai_writer:queue:processing"
        self.COMPLETED_KEY = "ai_writer:queue:completed"
        self.FAILED_KEY = "ai_writer:queue:failed"
        self.STATS_KEY = "ai_writer:stats"
        self.WORKERS_KEY = "ai_writer:workers"
        self.LOCK_PREFIX = "ai_writer:lock:"

    async def connect(self):
        """Open the Redis connection if it is not already open."""
        if not self.redis_client:
            self.redis_client = await redis.from_url(
                self.redis_url,
                encoding="utf-8",
                decode_responses=True
            )
            logger.info("Connected to Redis queue")

    async def disconnect(self):
        """Close the Redis connection if it is open."""
        if self.redis_client:
            await self.redis_client.close()
            self.redis_client = None
            logger.info("Disconnected from Redis queue")

    async def enqueue(self, job_data: NewsJobData) -> str:
        """Add a job to the pending list and return its job id.

        Jobs with ``priority > 0`` are pushed to the head of the list, all
        others to the tail. Any failure is logged and re-raised.
        """
        try:
            if not job_data.job_id:
                job_data.job_id = str(uuid.uuid4())

            # Serialize to JSON
            job_json = job_data.json()

            if job_data.priority > 0:
                # Higher priority goes to the front (dequeue pops from the left)
                await self.redis_client.lpush(self.QUEUE_KEY, job_json)
            else:
                # Normal priority goes to the back
                await self.redis_client.rpush(self.QUEUE_KEY, job_json)

            # Update counters
            await self.redis_client.hincrby(self.STATS_KEY, "total_jobs", 1)
            await self.redis_client.hincrby(self.STATS_KEY, "pending_jobs", 1)

            logger.info(f"Job {job_data.job_id} enqueued")
            return job_data.job_id

        except Exception as e:
            logger.error(f"Failed to enqueue job: {e}")
            raise

    async def dequeue(self, timeout: int = 0) -> Optional[NewsJobData]:
        """Move the next pending job to the processing list and return it.

        Args:
            timeout: When greater than zero, block up to this many seconds
                waiting for a job; otherwise return immediately.

        Returns:
            The job, or ``None`` when no job was available or an error
            occurred (errors are logged, not raised).
        """
        try:
            # Move head of pending -> tail of processing in one Redis command
            if timeout > 0:
                result = await self.redis_client.blmove(
                    self.QUEUE_KEY,
                    self.PROCESSING_KEY,
                    timeout,
                    "LEFT",
                    "RIGHT"
                )
            else:
                result = await self.redis_client.lmove(
                    self.QUEUE_KEY,
                    self.PROCESSING_KEY,
                    "LEFT",
                    "RIGHT"
                )

            if result:
                # Update counters
                await self.redis_client.hincrby(self.STATS_KEY, "pending_jobs", -1)
                await self.redis_client.hincrby(self.STATS_KEY, "processing_jobs", 1)

                return NewsJobData.parse_raw(result)

            return None

        except Exception as e:
            logger.error(f"Failed to dequeue job: {e}")
            return None

    async def _take_processing_job(self, job_id: str) -> Optional[NewsJobData]:
        """Remove the job with *job_id* from the processing list and return it.

        Returns ``None`` when no entry with that id is present, or when the
        entry was found but had already been removed by the time LREM ran.
        Callers must skip counter updates in that case so the stats do not
        drift.
        """
        processing_jobs = await self.redis_client.lrange(self.PROCESSING_KEY, 0, -1)

        for job_json in processing_jobs:
            job = NewsJobData.parse_raw(job_json)
            if job.job_id != job_id:
                continue
            # LREM reports how many entries it removed; zero means this call
            # did not take ownership of the job.
            removed = await self.redis_client.lrem(self.PROCESSING_KEY, 1, job_json)
            if removed:
                return job

        return None

    async def mark_completed(self, job_id: str, article_id: str):
        """Record a job as completed.

        Removes the job from the processing list, pushes a ``JobResult`` onto
        the completed list (trimmed to the newest 1000 entries) and updates
        the counters. Errors are logged, not raised.
        """
        try:
            job = await self._take_processing_job(job_id)
            if job is None:
                logger.warning(f"Job {job_id} not found in processing list; not marked as completed")
                return

            result = JobResult(
                job_id=job_id,
                status=JobStatus.COMPLETED,
                article_id=article_id,
                completed_at=datetime.now()
            )

            # Keep at most 1000 completed results
            await self.redis_client.lpush(self.COMPLETED_KEY, result.json())
            await self.redis_client.ltrim(self.COMPLETED_KEY, 0, 999)

            # Update counters
            await self.redis_client.hincrby(self.STATS_KEY, "processing_jobs", -1)
            await self.redis_client.hincrby(self.STATS_KEY, "completed_jobs", 1)

            logger.info(f"Job {job_id} marked as completed")

        except Exception as e:
            logger.error(f"Failed to mark job as completed: {e}")

    async def mark_failed(self, job_id: str, error_message: str):
        """Record a job failure, re-queueing it while retries remain.

        Removes the job from the processing list. If ``retry_count`` is below
        ``max_retries`` the job is pushed back onto the pending list with its
        retry count incremented; otherwise a ``JobResult`` is pushed onto the
        failed list (trimmed to the newest 1000 entries). Errors are logged,
        not raised.
        """
        try:
            job = await self._take_processing_job(job_id)
            if job is None:
                logger.warning(f"Job {job_id} not found in processing list; not marked as failed")
                return

            if job.retry_count < job.max_retries:
                job.retry_count += 1
                # Put the job back at the tail of the pending list
                await self.redis_client.rpush(self.QUEUE_KEY, job.json())
                await self.redis_client.hincrby(self.STATS_KEY, "pending_jobs", 1)
                logger.info(f"Job {job_id} requeued (retry {job.retry_count}/{job.max_retries})")
            else:
                result = JobResult(
                    job_id=job_id,
                    status=JobStatus.FAILED,
                    error_message=error_message,
                    completed_at=datetime.now()
                )

                # Keep at most 1000 failed results
                await self.redis_client.lpush(self.FAILED_KEY, result.json())
                await self.redis_client.ltrim(self.FAILED_KEY, 0, 999)

                await self.redis_client.hincrby(self.STATS_KEY, "failed_jobs", 1)
                logger.error(f"Job {job_id} marked as failed: {error_message}")

            # Either way the job is no longer being processed
            await self.redis_client.hincrby(self.STATS_KEY, "processing_jobs", -1)

        except Exception as e:
            logger.error(f"Failed to mark job as failed: {e}")

    async def get_stats(self) -> QueueStats:
        """Return the queue counters and the number of recently active workers.

        A worker counts as active when its heartbeat key exists and holds a
        timestamp less than one minute old. On any error an all-zero
        ``QueueStats`` is returned and the error is logged.
        """
        try:
            stats_data = await self.redis_client.hgetall(self.STATS_KEY)

            # Count workers that pinged within the last minute
            workers = await self.redis_client.smembers(self.WORKERS_KEY)
            active_workers = 0
            for worker_id in workers:
                last_ping = await self.redis_client.get(f"{self.WORKERS_KEY}:{worker_id}")
                if last_ping:
                    last_ping_time = datetime.fromisoformat(last_ping)
                    if datetime.now() - last_ping_time < timedelta(minutes=1):
                        active_workers += 1

            return QueueStats(
                pending_jobs=int(stats_data.get("pending_jobs", 0)),
                processing_jobs=int(stats_data.get("processing_jobs", 0)),
                completed_jobs=int(stats_data.get("completed_jobs", 0)),
                failed_jobs=int(stats_data.get("failed_jobs", 0)),
                total_jobs=int(stats_data.get("total_jobs", 0)),
                workers_active=active_workers
            )

        except Exception as e:
            logger.error(f"Failed to get stats: {e}")
            return QueueStats(
                pending_jobs=0,
                processing_jobs=0,
                completed_jobs=0,
                failed_jobs=0,
                total_jobs=0,
                workers_active=0
            )

    async def register_worker(self, worker_id: str):
        """Add the worker to the worker set and write its first heartbeat."""
        await self.redis_client.sadd(self.WORKERS_KEY, worker_id)
        await self.redis_client.set(
            f"{self.WORKERS_KEY}:{worker_id}",
            datetime.now().isoformat(),
            ex=300  # heartbeat key expires after 5 minutes
        )

    async def ping_worker(self, worker_id: str):
        """Refresh the worker's heartbeat timestamp (expires after 5 minutes)."""
        await self.redis_client.set(
            f"{self.WORKERS_KEY}:{worker_id}",
            datetime.now().isoformat(),
            ex=300
        )

    async def unregister_worker(self, worker_id: str):
        """Remove the worker from the worker set and delete its heartbeat key."""
        await self.redis_client.srem(self.WORKERS_KEY, worker_id)
        await self.redis_client.delete(f"{self.WORKERS_KEY}:{worker_id}")

    async def clear_queue(self):
        """Delete the pending, processing, completed and failed lists and the counters."""
        await self.redis_client.delete(self.QUEUE_KEY)
        await self.redis_client.delete(self.PROCESSING_KEY)
        await self.redis_client.delete(self.COMPLETED_KEY)
        await self.redis_client.delete(self.FAILED_KEY)
        await self.redis_client.delete(self.STATS_KEY)
        logger.info("Queue cleared")
class JobStatus(str, Enum):
    """Status values a queued job can have.

    The ``str`` mix-in makes every member a string equal to its value.
    """
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    SKIPPED = "skipped"
class AIWriterWorker:
    """Background consumer that turns queued news jobs into stored articles.

    One instance registers itself with the Redis queue, runs a configurable
    number of concurrent job-processing loops plus a heartbeat loop, and
    writes each generated article to MongoDB.
    """

    def __init__(self, worker_id: Optional[str] = None):
        self.worker_id = worker_id or str(uuid.uuid4())
        self.queue_manager = RedisQueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )

        # MongoDB settings (the connection itself is opened in start())
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "ai_writer_db")
        self.mongo_client = None
        self.db = None

        # Claude client
        self.claude_api_key = os.getenv("CLAUDE_API_KEY")
        self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)

        # Run state
        self.running = False
        self.tasks = []
        # Guards stop() so that a second call (start() calls it in its
        # finally block and main() calls it again) is a no-op.
        self._stopped = False

    async def start(self, num_workers: int = 1):
        """Connect to Redis and MongoDB and run until the tasks finish.

        Args:
            num_workers: Number of concurrent job-processing loops to run.

        Errors are logged rather than raised; ``stop()`` always runs on exit.
        """
        logger.info(f"Starting AI Writer Worker {self.worker_id} with {num_workers} concurrent workers")
        self._stopped = False

        try:
            # Redis connection
            await self.queue_manager.connect()
            await self.queue_manager.register_worker(self.worker_id)

            # MongoDB connection
            self.mongo_client = AsyncIOMotorClient(self.mongodb_url)
            self.db = self.mongo_client[self.db_name]
            logger.info("Connected to MongoDB")

            self.running = True

            # One task per concurrent processing loop
            for i in range(num_workers):
                task = asyncio.create_task(self._process_jobs(f"{self.worker_id}-{i}"))
                self.tasks.append(task)

            # Heartbeat task
            ping_task = asyncio.create_task(self._ping_worker())
            self.tasks.append(ping_task)

            # Wait for all tasks
            await asyncio.gather(*self.tasks)

        except Exception as e:
            logger.error(f"Worker error: {e}")
        finally:
            await self.stop()

    async def stop(self):
        """Cancel the tasks, unregister the worker and close connections.

        Safe to call more than once, and safe to call when ``start()`` failed
        before the Redis connection was established.
        """
        if self._stopped:
            return
        self._stopped = True

        logger.info(f"Stopping AI Writer Worker {self.worker_id}")
        self.running = False

        # Cancel the tasks and wait for them to finish, so none of them is
        # still using Redis or MongoDB when the connections are closed below.
        for task in self.tasks:
            task.cancel()
        if self.tasks:
            await asyncio.gather(*self.tasks, return_exceptions=True)
        self.tasks = []

        # Unregistering is best-effort: it fails when Redis was never
        # connected, and that must not prevent the remaining cleanup.
        try:
            await self.queue_manager.unregister_worker(self.worker_id)
        except Exception as e:
            logger.error(f"Failed to unregister worker {self.worker_id}: {e}")

        # Close connections
        await self.queue_manager.disconnect()
        if self.mongo_client:
            self.mongo_client.close()

        logger.info(f"Worker {self.worker_id} stopped")

    async def _process_jobs(self, sub_worker_id: str):
        """Processing loop: dequeue a job, generate its article, store it.

        A job that raises is reported through ``mark_failed``; the loop keeps
        running until ``self.running`` is cleared or the task is cancelled.
        """
        logger.info(f"Sub-worker {sub_worker_id} started")

        while self.running:
            try:
                # Wait up to 5 seconds for a job
                job = await self.queue_manager.dequeue(timeout=5)

                if job:
                    logger.info(f"[{sub_worker_id}] Processing job {job.job_id}: {job.rss_title[:50]}")
                    start_time = datetime.now()

                    try:
                        # Generate the article
                        article = await self._generate_article(job)

                        # Store in MongoDB
                        if article and self.db is not None:
                            article_dict = article.dict()
                            await self.db.articles.insert_one(article_dict)

                            # Elapsed processing time
                            processing_time = (datetime.now() - start_time).total_seconds()

                            # Mark as completed
                            await self.queue_manager.mark_completed(
                                job.job_id,
                                article.news_id
                            )

                            logger.info(f"[{sub_worker_id}] Job {job.job_id} completed in {processing_time:.2f}s")
                        else:
                            raise Exception("Failed to generate article")

                    except Exception as e:
                        logger.error(f"[{sub_worker_id}] Job {job.job_id} failed: {e}")
                        await self.queue_manager.mark_failed(job.job_id, str(e))

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"[{sub_worker_id}] Worker error: {e}")
                await asyncio.sleep(1)

        logger.info(f"Sub-worker {sub_worker_id} stopped")

    async def _generate_article(self, job: NewsJobData):
        """Convert the job into the aggregated-news dict shape and generate an article."""
        # A single-item payload in the format generate_article_with_claude takes
        news_data = {
            "keyword": job.keyword,
            "news_items": [{
                "rss_title": job.rss_title,
                "rss_link": job.rss_link,
                "rss_published": job.rss_published,
                "google_results": job.google_results
            }]
        }

        # Reuse the shared generator function
        return await generate_article_with_claude(news_data, job.style)

    async def _ping_worker(self):
        """Heartbeat loop: refresh this worker's liveness key every 30 seconds."""
        while self.running:
            try:
                await self.queue_manager.ping_worker(self.worker_id)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Ping error: {e}")

            # Sleep after failures too; otherwise a Redis outage turns this
            # loop into a busy loop that floods the log.
            try:
                await asyncio.sleep(30)
            except asyncio.CancelledError:
                break
KeyboardInterrupt: + logger.info("Keyboard interrupt received") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_analytical.json b/backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_analytical.json new file mode 100644 index 0000000..1600e04 --- /dev/null +++ b/backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_analytical.json @@ -0,0 +1,62 @@ +{ + "news_id": "49bdf2f3-4dbc-47eb-8c49-5d9536f41d87", + "title": "์œ ๋Ÿฝ ์ „๊ธฐ์ฐจ ์‹œ์žฅ์˜ ์ƒˆ๋กœ์šด ์ „ํ™˜์ : ํ˜„๋Œ€์ฐจยท๊ธฐ์•„์˜ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์ „๋žต๊ณผ ๊ธ€๋กœ๋ฒŒ ๊ฒฝ์Ÿ ๊ตฌ๋„", + "created_at": "2025-09-13T00:29:13.376541", + "summary": "ํ˜„๋Œ€์ฐจ์™€ ๊ธฐ์•„๊ฐ€ IAA 2025์—์„œ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์ฝ˜์…‰ํŠธ ๋ชจ๋ธ์„ ๊ณต๊ฐœํ•˜๋ฉฐ ์œ ๋Ÿฝ ์‹œ์žฅ ๊ณต๋žต์„ ๊ฐ€์†ํ™”, ๋ฐฐํ„ฐ๋ฆฌ ํ˜‘๋ ฅ๊ณผ ๊ฐ€๊ฒฉ ๊ฒฝ์Ÿ๋ ฅ์œผ๋กœ ์Šน๋ถ€์ˆ˜", + "subtopics": [ + { + "title": "ํ˜„๋Œ€์ฐจยท๊ธฐ์•„์˜ ์œ ๋Ÿฝ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์‹œ์žฅ ๊ณต๋žต", + "content": [ + "ํ˜„๋Œ€์ž๋™์ฐจ์™€ ๊ธฐ์•„๊ฐ€ IAA 2025์—์„œ ์ฝ˜์…‰ํŠธ ์“ฐ๋ฆฌ์™€ EV2๋ฅผ ๊ณต๊ฐœํ•˜๋ฉฐ ์œ ๋Ÿฝ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์‹œ์žฅ ๊ณต๋žต์— ๋ฐ•์ฐจ๋ฅผ ๊ฐ€ํ•˜๊ณ  ์žˆ๋‹ค. ์ด๋Š” ์œ ๋Ÿฝ์˜ ๊ธ‰์„ฑ์žฅํ•˜๋Š” ์†Œํ˜• ์ „๊ธฐ์ฐจ ์ˆ˜์š”์— ๋Œ€์‘ํ•˜๊ธฐ ์œ„ํ•œ ์ „๋žต์  ์›€์ง์ž„์œผ๋กœ ํ‰๊ฐ€๋œ๋‹ค.", + "ํŠนํžˆ ๋‘ ๋ชจ๋ธ์€ ์‹ค์šฉ์„ฑ๊ณผ ๊ฒฝ์ œ์„ฑ์„ ๋ชจ๋‘ ๊ฐ–์ถ˜ ์ œํ’ˆ์œผ๋กœ, ์œ ๋Ÿฝ ์†Œ๋น„์ž๋“ค์˜ ๋‹ˆ์ฆˆ๋ฅผ ์ •ํ™•ํžˆ ๊ฒจ๋ƒฅํ–ˆ๋‹ค๋Š” ํ‰๊ฐ€๋ฅผ ๋ฐ›๊ณ  ์žˆ๋‹ค. ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์€ ์ด๋ฅผ ํ†ตํ•ด ์œ ๋Ÿฝ ์‹œ์žฅ์—์„œ์˜ ์ž…์ง€๋ฅผ ๋”์šฑ ๊ฐ•ํ™”ํ•  ๊ฒƒ์œผ๋กœ ์ „๋ง๋œ๋‹ค.", + "ํ˜„์ง€ ์ „๋ฌธ๊ฐ€๋“ค์€ ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์˜ ์ด๋ฒˆ ์ „๋žต์ด ์œ ๋Ÿฝ ์ „๊ธฐ์ฐจ ์‹œ์žฅ์˜ '๊ณจ๋“ ํƒ€์ž„'์„ ์žก๊ธฐ ์œ„ํ•œ ์‹œ์˜์ ์ ˆํ•œ ์›€์ง์ž„์ด๋ผ๊ณ  ๋ถ„์„ํ•˜๊ณ  ์žˆ๋‹ค." + ] + }, + { + "title": "๋ฐฐํ„ฐ๋ฆฌ ๊ณต๊ธ‰๋ง ์ „๋žต์˜ ์ค‘์š”์„ฑ ๋ถ€๊ฐ", + "content": [ + "์ „๊ธฐ์ฐจ ์‹œ์žฅ์—์„œ ๋ฐฐํ„ฐ๋ฆฌ ๊ณต๊ธ‰๋ง ํ™•๋ณด๊ฐ€ ํ•ต์‹ฌ ๊ฒฝ์Ÿ๋ ฅ์œผ๋กœ ๋ถ€์ƒํ•˜๊ณ  ์žˆ๋‹ค. 
IAA ๋ชจ๋นŒ๋ฆฌํ‹ฐ์—์„œ ํด์Šคํƒ€๊ฐ€ SK์˜จ์„ ๋ฐฐํ„ฐ๋ฆฌ ํŒŒํŠธ๋„ˆ๋กœ ๊ณต๊ฐœ์ ์œผ๋กœ ์–ธ๊ธ‰ํ•œ ๊ฒƒ์ด ์ฃผ๋ชฉ๋ฐ›๊ณ  ์žˆ๋‹ค.", + "๋ฐฐํ„ฐ๋ฆฌ ์ œ์กฐ์‚ฌ ์„ ์ •์— ๋Œ€ํ•œ ์ •๋ณด๊ฐ€ ์ œํ•œ์ ์ธ ๊ฐ€์šด๋ฐ, ์•ˆ์ •์ ์ธ ๋ฐฐํ„ฐ๋ฆฌ ๊ณต๊ธ‰๋ง ๊ตฌ์ถ•์ด ์ „๊ธฐ์ฐจ ์ œ์กฐ์‚ฌ๋“ค์˜ ์„ฑํŒจ๋ฅผ ์ขŒ์šฐํ•  ๊ฒƒ์œผ๋กœ ์˜ˆ์ƒ๋œ๋‹ค.", + "ํŠนํžˆ ์†Œํ˜• ์ „๊ธฐ์ฐจ์˜ ๊ฒฝ์šฐ ๊ฐ€๊ฒฉ ๊ฒฝ์Ÿ๋ ฅ์ด ์ค‘์š”ํ•œ ๋งŒํผ, ํšจ์œจ์ ์ธ ๋ฐฐํ„ฐ๋ฆฌ ์ˆ˜๊ธ‰ ์ „๋žต์ด ์‹œ์žฅ ์ ์œ ์œจ ํ™•๋Œ€์˜ ๊ด€๊ฑด์ด ๋  ์ „๋ง์ด๋‹ค." + ] + }, + { + "title": "๊ธ€๋กœ๋ฒŒ ์ „๊ธฐ์ฐจ ์‹œ์žฅ์˜ ๊ฒฝ์Ÿ ๊ตฌ๋„ ๋ณ€ํ™”", + "content": [ + "์œ ๋Ÿฝ ์ „๊ธฐ์ฐจ ์‹œ์žฅ์—์„œ ์†Œํ˜• ๋ชจ๋ธ์„ ์ค‘์‹ฌ์œผ๋กœ ํ•œ ๊ฒฝ์Ÿ์ด ๋ณธ๊ฒฉํ™”๋˜๋ฉด์„œ, ์ œ์กฐ์‚ฌ๋“ค์˜ ์ „๋žต์  ํฌ์ง€์…”๋‹์ด ๋”์šฑ ์ค‘์š”ํ•ด์ง€๊ณ  ์žˆ๋‹ค.", + "ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์€ ํ’ˆ์งˆ๊ณผ ๊ธฐ์ˆ ๋ ฅ์„ ๋ฐ”ํƒ•์œผ๋กœ ํ•œ ํ”„๋ฆฌ๋ฏธ์—„ ์ด๋ฏธ์ง€์™€ ํ•จ๊ป˜, ํ•ฉ๋ฆฌ์ ์ธ ๊ฐ€๊ฒฉ๋Œ€์˜ ์†Œํ˜• ์ „๊ธฐ์ฐจ ๋ผ์ธ์—…์œผ๋กœ ์‹œ์žฅ ๊ณต๋žต์„ ๊ฐ€์†ํ™”ํ•˜๊ณ  ์žˆ๋‹ค.", + "์ด๋Ÿฌํ•œ ๋ณ€ํ™”๋Š” ๊ธ€๋กœ๋ฒŒ ์ž๋™์ฐจ ์‚ฐ์—…์˜ ํŒจ๋Ÿฌ๋‹ค์ž„ ์ „ํ™˜์„ ๋ฐ˜์˜ํ•˜๋ฉฐ, ํ–ฅํ›„ ์ „๊ธฐ์ฐจ ์‹œ์žฅ์˜ ์ฃผ๋„๊ถŒ ๊ฒฝ์Ÿ์ด ๋”์šฑ ์น˜์—ดํ•ด์งˆ ๊ฒƒ์œผ๋กœ ์˜ˆ์ƒ๋œ๋‹ค." 
+ ] + } + ], + "categories": [ + "์ž๋™์ฐจ", + "๊ฒฝ์ œ", + "ํ™˜๊ฒฝ", + "๊ธฐ์ˆ " + ], + "entities": { + "people": [], + "organizations": [ + "ํ˜„๋Œ€์ž๋™์ฐจ", + "๊ธฐ์•„", + "SK์˜จ", + "ํด์Šคํƒ€" + ], + "groups": [ + "์œ ๋Ÿฝ ์ž๋™์ฐจ ์ œ์กฐ์‚ฌ", + "๋ฐฐํ„ฐ๋ฆฌ ์ œ์กฐ์—…์ฒด" + ], + "countries": [ + "๋Œ€ํ•œ๋ฏผ๊ตญ", + "๋…์ผ", + "์œ ๋Ÿฝ์—ฐํ•ฉ" + ], + "events": [ + "IAA 2025", + "IAA ๋ชจ๋นŒ๋ฆฌํ‹ฐ" + ] + }, + "source_keyword": "์ „๊ธฐ์ฐจ", + "source_count": 3 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_investigative.json b/backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_investigative.json new file mode 100644 index 0000000..3162f6f --- /dev/null +++ b/backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_investigative.json @@ -0,0 +1,66 @@ +{ + "news_id": "8a51bead-4558-4351-a5b2-b5e5ba1b3d38", + "title": "ํ˜„๋Œ€์ฐจยท๊ธฐ์•„, ์œ ๋Ÿฝ ์ „๊ธฐ์ฐจ ์‹œ์žฅ์„œ ์†Œํ˜• ๋ชจ๋ธ๋กœ ์ƒˆ ๋ŒํŒŒ๊ตฌ ๋ชจ์ƒ‰", + "created_at": "2025-09-13T00:29:35.661926", + "summary": "IAA ๋ชจ๋นŒ๋ฆฌํ‹ฐ 2025์—์„œ ํ˜„๋Œ€์ฐจยท๊ธฐ์•„๊ฐ€ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์ฝ˜์…‰ํŠธ์นด๋ฅผ ๊ณต๊ฐœํ•˜๋ฉฐ ์œ ๋Ÿฝ ์‹œ์žฅ ๊ณต๋žต ๊ฐ€์†ํ™”. ๋ฐฐํ„ฐ๋ฆฌ ๊ณต๊ธ‰๋ง ํ™•๋ณด์™€ ๊ฐ€๊ฒฉ ๊ฒฝ์Ÿ๋ ฅ์ด ์„ฑ๊ณต ๊ด€๊ฑด", + "subtopics": [ + { + "title": "์œ ๋Ÿฝ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์‹œ์žฅ ๊ณต๋žต ๋ณธ๊ฒฉํ™”", + "content": [ + "ํ˜„๋Œ€์ฐจ์™€ ๊ธฐ์•„๊ฐ€ IAA ๋ชจ๋นŒ๋ฆฌํ‹ฐ 2025์—์„œ ๊ฐ๊ฐ ์ฝ˜์…‰ํŠธ ์“ฐ๋ฆฌ์™€ EV2๋ฅผ ๊ณต๊ฐœํ•˜๋ฉฐ ์œ ๋Ÿฝ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์‹œ์žฅ ๊ณต๋žต์— ์‹œ๋™์„ ๊ฑธ์—ˆ๋‹ค. 
์ด๋Š” ์œ ๋Ÿฝ์˜ ๋†’์€ ํ™˜๊ฒฝ ๊ทœ์ œ์™€ ๋„์‹ฌ ์ด๋™์„ฑ ์ˆ˜์š”์— ๋Œ€์‘ํ•˜๊ธฐ ์œ„ํ•œ ์ „๋žต์  ์›€์ง์ž„์œผ๋กœ ํ•ด์„๋œ๋‹ค.", + "ํŠนํžˆ ๋‘ ๋ชจ๋ธ์€ ๊ธฐ์กด ์ „๊ธฐ์ฐจ ๋Œ€๋น„ ์ปดํŒฉํŠธํ•œ ์‚ฌ์ด์ฆˆ์™€ ํšจ์œจ์ ์ธ ๋ฐฐํ„ฐ๋ฆฌ ์‹œ์Šคํ…œ์„ ๊ฐ–์ถ”๊ณ  ์žˆ์–ด, ์œ ๋Ÿฝ ์†Œ๋น„์ž๋“ค์˜ ์‹ค์šฉ์  ์ˆ˜์š”๋ฅผ ๊ฒจ๋ƒฅํ–ˆ๋‹ค๋Š” ํ‰๊ฐ€๋ฅผ ๋ฐ›๊ณ  ์žˆ๋‹ค.", + "์—…๊ณ„ ์ „๋ฌธ๊ฐ€๋“ค์€ ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์˜ ์ด๋ฒˆ ํ–‰๋ณด๊ฐ€ ํ…Œ์Šฌ๋ผ์™€ ์ค‘๊ตญ ์—…์ฒด๋“ค์ด ์ฃผ๋„ํ•˜๊ณ  ์žˆ๋Š” ์œ ๋Ÿฝ ์ „๊ธฐ์ฐจ ์‹œ์žฅ์—์„œ ์ƒˆ๋กœ์šด ๋ŒํŒŒ๊ตฌ๋ฅผ ๋งˆ๋ จํ•  ์ˆ˜ ์žˆ์„ ๊ฒƒ์œผ๋กœ ์ „๋งํ•˜๊ณ  ์žˆ๋‹ค." + ] + }, + { + "title": "๋ฐฐํ„ฐ๋ฆฌ ๊ณต๊ธ‰๋ง ํ™•๋ณด ๊ณผ์ œ", + "content": [ + "์ „๊ธฐ์ฐจ ์„ฑ๊ณต์˜ ํ•ต์‹ฌ ์š”์†Œ์ธ ๋ฐฐํ„ฐ๋ฆฌ ์ˆ˜๊ธ‰์—์„œ SK์˜จ์ด ์ฃผ์š” ๊ณต๊ธ‰ ํŒŒํŠธ๋„ˆ๋กœ ๋ถ€์ƒํ–ˆ๋‹ค. ํด์Šคํƒ€๊ฐ€ SK์˜จ์„ ๋ฐฐํ„ฐ๋ฆฌ ๊ณต๊ธ‰์‚ฌ๋กœ ๊ณต๊ฐœ์ ์œผ๋กœ ์–ธ๊ธ‰ํ•œ ๊ฒƒ์ด ์ด๋ฅผ ๋ฐฉ์ฆํ•œ๋‹ค.", + "๊ทธ๋Ÿฌ๋‚˜ ์—…๊ณ„์—์„œ๋Š” ๋ฐฐํ„ฐ๋ฆฌ ์ œ์กฐ์‚ฌ๋“ค์˜ ์ •๋ณด ๊ณต๊ฐœ๊ฐ€ ์ œํ•œ์ ์ด์–ด์„œ ์‹ค์ œ ๊ณต๊ธ‰๋ง ๊ตฌ์กฐ๋ฅผ ํŒŒ์•…ํ•˜๊ธฐ ์–ด๋ ค์šด ์ƒํ™ฉ์ด๋‹ค. ์ด๋Š” ๊ธ€๋กœ๋ฒŒ ๋ฐฐํ„ฐ๋ฆฌ ์ˆ˜๊ธ‰ ๊ฒฝ์Ÿ์ด ์น˜์—ดํ•ด์ง€๊ณ  ์žˆ์Œ์„ ์‹œ์‚ฌํ•œ๋‹ค.", + "์•ˆ์ •์ ์ธ ๋ฐฐํ„ฐ๋ฆฌ ๊ณต๊ธ‰๋ง ํ™•๋ณด๋Š” ํ–ฅํ›„ ์†Œํ˜• ์ „๊ธฐ์ฐจ์˜ ๊ฐ€๊ฒฉ ๊ฒฝ์Ÿ๋ ฅ๊ณผ ์ง๊ฒฐ๋˜๋Š” ๋งŒํผ, ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์˜ ์ถ”๊ฐ€์ ์ธ ํŒŒํŠธ๋„ˆ์‹ญ ๊ตฌ์ถ•์ด ์˜ˆ์ƒ๋œ๋‹ค." + ] + }, + { + "title": "๊ฐ€๊ฒฉ ๊ฒฝ์Ÿ๋ ฅ ํ™•๋ณด ์ „๋žต", + "content": [ + "์†Œํ˜• ์ „๊ธฐ์ฐจ ์‹œ์žฅ์—์„œ์˜ ์„ฑ๊ณต์„ ์œ„ํ•ด์„œ๋Š” ํ•ฉ๋ฆฌ์ ์ธ ๊ฐ€๊ฒฉ๋Œ€ ์ฑ…์ •์ด ํ•„์ˆ˜์ ์ด๋‹ค. ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์€ ๊ทœ๋ชจ์˜ ๊ฒฝ์ œ๋ฅผ ํ†ตํ•œ ์›๊ฐ€ ์ ˆ๊ฐ์„ ๋ชฉํ‘œ๋กœ ํ•˜๊ณ  ์žˆ๋‹ค.", + "ํŠนํžˆ ์œ ๋Ÿฝ ์‹œ์žฅ์—์„œ๋Š” ํ…Œ์Šฌ๋ผ์™€ ์ค‘๊ตญ ์—…์ฒด๋“ค์˜ ๊ณต๊ฒฉ์ ์ธ ๊ฐ€๊ฒฉ ์ •์ฑ…์— ๋Œ€์‘ํ•ด์•ผ ํ•˜๋Š” ์ƒํ™ฉ์ด๋‹ค. ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์€ ํ”„๋ฆฌ๋ฏธ์—„ ํ’ˆ์งˆ์„ ์œ ์ง€ํ•˜๋ฉด์„œ๋„ ๊ฒฝ์Ÿ๋ ฅ ์žˆ๋Š” ๊ฐ€๊ฒฉ๋Œ€๋ฅผ ์ œ์‹œํ•˜๋Š” ๊ฒƒ์„ ๋ชฉํ‘œ๋กœ ํ•˜๊ณ  ์žˆ๋‹ค.", + "์ „๋ฌธ๊ฐ€๋“ค์€ ๋ฐฐํ„ฐ๋ฆฌ ๊ธฐ์ˆ  ํ˜์‹ ๊ณผ ์ƒ์‚ฐ ํšจ์œจํ™”๋ฅผ ํ†ตํ•ด ๊ฐ€๊ฒฉ ๊ฒฝ์Ÿ๋ ฅ์„ ํ™•๋ณดํ•˜๋Š” ๊ฒƒ์ด ํ–ฅํ›„ ์„ฑ๊ณต์˜ ํ•ต์‹ฌ์ด ๋  ๊ฒƒ์œผ๋กœ ์ „๋งํ•˜๊ณ  ์žˆ๋‹ค." 
+ ] + } + ], + "categories": [ + "์ž๋™์ฐจ", + "๊ฒฝ์ œ", + "์‚ฐ์—…", + "๊ธฐ์ˆ " + ], + "entities": { + "people": [ + "๊น€์„ฑ์ˆ˜", + "์กฐ์šฉํ•˜", + "๋ฐ•์ข…๋ฉด" + ], + "organizations": [ + "ํ˜„๋Œ€์ž๋™์ฐจ", + "๊ธฐ์•„", + "SK์˜จ", + "ํด์Šคํƒ€" + ], + "groups": [ + "์œ ๋Ÿฝ ์ž๋™์ฐจ ์ œ์กฐ์‚ฌ", + "์ค‘๊ตญ ์ „๊ธฐ์ฐจ ์—…์ฒด" + ], + "countries": [ + "๋Œ€ํ•œ๋ฏผ๊ตญ", + "๋…์ผ", + "์ค‘๊ตญ" + ], + "events": [ + "IAA ๋ชจ๋นŒ๋ฆฌํ‹ฐ 2025", + "์ „๊ธฐ์ฐจ ๋ฐฐํ„ฐ๋ฆฌ ๊ณต๊ธ‰ ๊ณ„์•ฝ" + ] + }, + "source_keyword": "์ „๊ธฐ์ฐจ", + "source_count": 3 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_professional.json b/backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_professional.json new file mode 100644 index 0000000..4b5eb9f --- /dev/null +++ b/backup-services/ai-writer/backend/article_์ „๊ธฐ์ฐจ_professional.json @@ -0,0 +1,62 @@ +{ + "news_id": "2c4cb595-9542-45ee-b4b9-2135c46950e3", + "title": "ํ˜„๋Œ€์ฐจยท๊ธฐ์•„, ์œ ๋Ÿฝ ์ „๊ธฐ์ฐจ ์‹œ์žฅ์„œ ์†Œํ˜• ๋ชจ๋ธ๋กœ ์Šน๋ถ€์ˆ˜...๋ฐฐํ„ฐ๋ฆฌ ํ˜‘๋ ฅ ๊ฐ•ํ™” ์ฃผ๋ชฉ", + "created_at": "2025-09-13T00:28:51.371773", + "summary": "ํ˜„๋Œ€์ฐจยท๊ธฐ์•„๊ฐ€ ์œ ๋Ÿฝ ์ „๊ธฐ์ฐจ ์‹œ์žฅ์—์„œ ์ฝ˜์…‰ํŠธ ์“ฐ๋ฆฌ์™€ EV2๋กœ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์‹œ์žฅ ๊ณต๋žต ๋‚˜์„œ, ๋ฐฐํ„ฐ๋ฆฌ ํ˜‘๋ ฅ์‚ฌ ์„ ์ • ๋“ฑ ๊ฒฝ์Ÿ๋ ฅ ๊ฐ•ํ™” ์›€์ง์ž„ ๋ณธ๊ฒฉํ™”", + "subtopics": [ + { + "title": "์œ ๋Ÿฝ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์‹œ์žฅ ๊ณต๋žต ๋ณธ๊ฒฉํ™”", + "content": [ + "ํ˜„๋Œ€์ž๋™์ฐจ๊ทธ๋ฃน์ด ์œ ๋Ÿฝ ์ „๊ธฐ์ฐจ ์‹œ์žฅ ๊ณต๋žต์„ ์œ„ํ•ด ์†Œํ˜• ์ „๊ธฐ์ฐจ ๋ผ์ธ์—… ํ™•๋Œ€์— ๋‚˜์„ฐ๋‹ค. 
IAA ๋ชจ๋นŒ๋ฆฌํ‹ฐ 2025์—์„œ ๊ณต๊ฐœ๋œ ํ˜„๋Œ€์ฐจ์˜ ์ฝ˜์…‰ํŠธ ์“ฐ๋ฆฌ์™€ ๊ธฐ์•„์˜ EV2๋Š” ์œ ๋Ÿฝ ์‹œ์žฅ ๋งž์ถคํ˜• ์ „๋žต์˜ ํ•ต์‹ฌ์œผ๋กœ ํ‰๊ฐ€๋ฐ›๊ณ  ์žˆ๋‹ค.", + "ํŠนํžˆ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์‹œ์žฅ์€ ์œ ๋Ÿฝ์—์„œ ๊ธ‰์„ฑ์žฅ์ด ์˜ˆ์ƒ๋˜๋Š” ์„ธ๊ทธ๋จผํŠธ๋กœ, ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์€ ํ•ฉ๋ฆฌ์ ์ธ ๊ฐ€๊ฒฉ๋Œ€์™€ ์‹ค์šฉ์„ฑ์„ ์•ž์„ธ์›Œ ์‹œ์žฅ ์„ ์ ์„ ๋…ธ๋ฆฌ๊ณ  ์žˆ๋‹ค.", + "ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์˜ ์ด๋ฒˆ ์ „๋žต์€ ์œ ๋Ÿฝ์˜ ํ™˜๊ฒฝ ๊ทœ์ œ ๊ฐ•ํ™”์™€ ์†Œ๋น„์ž๋“ค์˜ ์‹ค์šฉ์ ์ธ ์ „๊ธฐ์ฐจ ์ˆ˜์š” ์ฆ๊ฐ€์— ๋Œ€์‘ํ•˜๋Š” ๋™์‹œ์—, ์ค‘๊ตญ ์ „๊ธฐ์ฐจ ์—…์ฒด๋“ค์˜ ์œ ๋Ÿฝ ์ง„์ถœ์— ๋Œ€ํ•œ ์„ ์ œ์  ๋Œ€์‘์œผ๋กœ ํ•ด์„๋œ๋‹ค." + ] + }, + { + "title": "๋ฐฐํ„ฐ๋ฆฌ ํ˜‘๋ ฅ ๊ด€๊ณ„ ์žฌํŽธ ์›€์ง์ž„", + "content": [ + "์ „๊ธฐ์ฐจ ๊ฒฝ์Ÿ๋ ฅ์˜ ํ•ต์‹ฌ์ธ ๋ฐฐํ„ฐ๋ฆฌ ์ˆ˜๊ธ‰๊ณผ ๊ด€๋ จํ•ด ์—…๊ณ„์˜ ์ด๋ชฉ์ด ์ง‘์ค‘๋˜๊ณ  ์žˆ๋‹ค. IAA ๋ชจ๋นŒ๋ฆฌํ‹ฐ์—์„œ ํด์Šคํƒ€๊ฐ€ SK์˜จ์„ ๋ฐฐํ„ฐ๋ฆฌ ๊ณต๊ธ‰์‚ฌ๋กœ ์ง€๋ชฉํ•œ ๊ฒƒ์ด ์ฃผ๋ชฉ๋ฐ›๊ณ  ์žˆ๋‹ค.", + "๊ธ€๋กœ๋ฒŒ ์ž๋™์ฐจ ์—…์ฒด๋“ค์˜ ๋ฐฐํ„ฐ๋ฆฌ ์กฐ๋‹ฌ ์ „๋žต์ด ๋‹ค๋ณ€ํ™”๋˜๋Š” ๊ฐ€์šด๋ฐ, ํ•œ๊ตญ ๋ฐฐํ„ฐ๋ฆฌ ์—…์ฒด๋“ค๊ณผ์˜ ํ˜‘๋ ฅ ๊ฐ•ํ™” ์›€์ง์ž„์ด ๊ฐ์ง€๋˜๊ณ  ์žˆ๋‹ค.", + "ํŠนํžˆ ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์€ ์•ˆ์ •์ ์ธ ๋ฐฐํ„ฐ๋ฆฌ ์ˆ˜๊ธ‰์„ ์œ„ํ•ด ๋‹ค์–‘ํ•œ ๋ฐฐํ„ฐ๋ฆฌ ์ œ์กฐ์‚ฌ๋“ค๊ณผ์˜ ํ˜‘๋ ฅ ๊ด€๊ณ„๋ฅผ ๊ฒ€ํ†  ์ค‘์ธ ๊ฒƒ์œผ๋กœ ์•Œ๋ ค์กŒ๋‹ค." + ] + }, + { + "title": "๊ธ€๋กœ๋ฒŒ ์ „๊ธฐ์ฐจ ์‹œ์žฅ ๊ฒฝ์Ÿ ์‹ฌํ™”", + "content": [ + "์ „๊ธฐ์ฐจ ์‹œ์žฅ์—์„œ ๋ธŒ๋žœ๋“œ ๊ฐ„ ๊ฒฝ์Ÿ์ด ์น˜์—ดํ•ด์ง€๋Š” ๊ฐ€์šด๋ฐ, ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์€ ์ฐจ๋ณ„ํ™”๋œ ์ œํ’ˆ ๋ผ์ธ์—…๊ณผ ๊ธฐ์ˆ ๋ ฅ์œผ๋กœ ์‹œ์žฅ ์ง€์œ„ ๊ฐ•ํ™”์— ๋‚˜์„œ๊ณ  ์žˆ๋‹ค.", + "ํŠนํžˆ ์œ ๋Ÿฝ ์‹œ์žฅ์—์„œ๋Š” ํ…Œ์Šฌ๋ผ, ํญ์Šค๋ฐ”๊ฒ ๊ทธ๋ฃน, ์ค‘๊ตญ ์—…์ฒด๋“ค๊ณผ์˜ ๊ฒฝ์Ÿ์ด ๋ถˆ๊ฐ€ํ”ผํ•œ ์ƒํ™ฉ์ด๋ฉฐ, ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์€ ํ’ˆ์งˆ๊ณผ ๊ธฐ์ˆ ๋ ฅ์„ ์•ž์„ธ์›Œ ๊ฒฝ์Ÿ๋ ฅ ํ™•๋ณด์— ์ฃผ๋ ฅํ•˜๊ณ  ์žˆ๋‹ค.", + "์‹œ์žฅ ์ „๋ฌธ๊ฐ€๋“ค์€ ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน์˜ ์†Œํ˜• ์ „๊ธฐ์ฐจ ์ „๋žต์ด ํ–ฅํ›„ ๊ธ€๋กœ๋ฒŒ ์‹œ์žฅ์—์„œ์˜ ์ž…์ง€ ๊ฐ•ํ™”์— ์ค‘์š”ํ•œ ์ „ํ™˜์ ์ด ๋  ๊ฒƒ์œผ๋กœ ์ „๋งํ•˜๊ณ  ์žˆ๋‹ค." 
+ ] + } + ], + "categories": [ + "์ž๋™์ฐจ", + "๊ฒฝ์ œ", + "์‚ฐ์—…" + ], + "entities": { + "people": [ + "๊น€์„ฑ์ˆ˜", + "๋ฐ•์˜ํšจ" + ], + "organizations": [ + "ํ˜„๋Œ€์ž๋™์ฐจ", + "๊ธฐ์•„", + "SK์˜จ", + "ํด์Šคํƒ€" + ], + "groups": [ + "ํ˜„๋Œ€์ฐจ๊ทธ๋ฃน", + "ํญ์Šค๋ฐ”๊ฒ ๊ทธ๋ฃน" + ], + "countries": [ + "๋Œ€ํ•œ๋ฏผ๊ตญ", + "๋…์ผ" + ], + "events": [ + "IAA ๋ชจ๋นŒ๋ฆฌํ‹ฐ 2025" + ] + }, + "source_keyword": "์ „๊ธฐ์ฐจ", + "source_count": 3 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/custom_article_analytical.json b/backup-services/ai-writer/backend/custom_article_analytical.json new file mode 100644 index 0000000..9b114dc --- /dev/null +++ b/backup-services/ai-writer/backend/custom_article_analytical.json @@ -0,0 +1,63 @@ +{ + "news_id": "ee154fb8-a913-4aa9-9fc9-fa421fd2d7c0", + "title": "2025๋…„ ๊ธฐ์ˆ  ํ˜์‹ ์˜ ๋ถ„๊ธฐ์ : AIยท์–‘์ž์ปดํ“จํŒ…์ด ๊ทธ๋ฆฌ๋Š” ์ƒˆ๋กœ์šด ๋ฏธ๋ž˜", + "created_at": "2025-09-13T00:32:14.008706", + "summary": "2025๋…„, AI์™€ ์–‘์ž์ปดํ“จํŒ…์˜ ์ƒ์šฉํ™”๊ฐ€ ๊ฐ€์ ธ์˜ฌ ์‚ฐ์—… ์ „๋ฐ˜์˜ ํ˜์‹ ์  ๋ณ€ํ™”์™€ ์‚ฌํšŒ์  ์˜ํ–ฅ์„ ์‹ฌ์ธต ๋ถ„์„ํ•œ ์ „๋ง", + "subtopics": [ + { + "title": "์ƒ์„ฑํ˜• AI๊ฐ€ ์žฌํŽธํ•˜๋Š” ์‚ฐ์—… ์ƒํƒœ๊ณ„", + "content": [ + "2025๋…„์€ ์ƒ์„ฑํ˜• AI๊ฐ€ ์‚ฐ์—… ์ „๋ฐ˜์— ๋ณธ๊ฒฉ์ ์œผ๋กœ ๋„์ž…๋˜๋Š” ์›๋…„์ด ๋  ์ „๋ง์ด๋‹ค. ํŠนํžˆ ์˜๋ฃŒ ์ง„๋‹จ, ์‹ ์•ฝ ๊ฐœ๋ฐœ, ๊ต์œก ์ปค๋ฆฌํ˜๋Ÿผ ์„ค๊ณ„ ๋“ฑ ์ „๋ฌธ ๋ถ„์•ผ์—์„œ AI์˜ ์—ญํ• ์ด ํš๊ธฐ์ ์œผ๋กœ ํ™•๋Œ€๋  ๊ฒƒ์œผ๋กœ ์˜ˆ์ธก๋œ๋‹ค.", + "๊ธฐ์—…๋“ค์˜ ์—…๋ฌด ํ”„๋กœ์„ธ์Šค๋„ ๊ทผ๋ณธ์ ์ธ ๋ณ€ํ™”๋ฅผ ๋งž์ดํ•  ๊ฒƒ์œผ๋กœ ๋ณด์ธ๋‹ค. ์ฐฝ์˜์  ์ž‘์—… ์˜์—ญ์—์„œ๋„ AI์˜ ํ™œ์šฉ์ด ์ผ์ƒํ™”๋˜๋ฉฐ, ์ธ๊ฐ„-AI ํ˜‘์—… ๋ชจ๋ธ์ด ์ƒˆ๋กœ์šด ํ‘œ์ค€์œผ๋กœ ์ž๋ฆฌ์žก์„ ๊ฒƒ์œผ๋กœ ์ „๋ง๋œ๋‹ค.", + "๋‹ค๋งŒ AI ๋„์ž…์— ๋”ฐ๋ฅธ ๋…ธ๋™์‹œ์žฅ ์žฌํŽธ๊ณผ ์œค๋ฆฌ์  ๋ฌธ์ œ์— ๋Œ€ํ•œ ์‚ฌํšŒ์  ํ•ฉ์˜๊ฐ€ ์‹œ๊ธ‰ํ•œ ๊ณผ์ œ๋กœ ๋Œ€๋‘๋  ๊ฒƒ์œผ๋กœ ์˜ˆ์ƒ๋œ๋‹ค. 
ํŠนํžˆ AI ์˜์กด๋„ ์ฆ๊ฐ€์— ๋”ฐ๋ฅธ ๋ฐ์ดํ„ฐ ๋ณด์•ˆ๊ณผ ์•Œ๊ณ ๋ฆฌ์ฆ˜ ํŽธํ–ฅ์„ฑ ๋ฌธ์ œ๋Š” ์ค‘์š”ํ•œ ํ•ด๊ฒฐ ๊ณผ์ œ๊ฐ€ ๋  ๊ฒƒ์ด๋‹ค." + ] + }, + { + "title": "์–‘์ž์ปดํ“จํŒ…์˜ ์ƒ์šฉํ™”์™€ ์‚ฐ์—…ํ˜์‹ ", + "content": [ + "์–‘์ž์ปดํ“จํŒ… ๊ธฐ์ˆ ์ด ์‹ค์šฉํ™” ๋‹จ๊ณ„์— ์ง„์ž…ํ•˜๋ฉด์„œ, ๊ธˆ์œต๊ถŒ์˜ ๋ฆฌ์Šคํฌ ๋ถ„์„๊ณผ ์•”ํ˜ธํ™”ํ ๋ณด์•ˆ ์‹œ์Šคํ…œ์— ํš๊ธฐ์ ์ธ ๋ณ€ํ™”๊ฐ€ ์˜ˆ์ƒ๋œ๋‹ค. ํŠนํžˆ ๋ณต์žกํ•œ ๊ธˆ์œต ๋ชจ๋ธ๋ง๊ณผ ์‹œ์žฅ ์˜ˆ์ธก์—์„œ ์–‘์ž์ปดํ“จํ„ฐ์˜ ํ™œ์šฉ์ด ํฌ๊ฒŒ ์ฆ๊ฐ€ํ•  ์ „๋ง์ด๋‹ค.", + "์ œ์•ฝ ์‚ฐ์—…์—์„œ๋Š” ์‹ ์•ฝ ๊ฐœ๋ฐœ ํ”„๋กœ์„ธ์Šค๊ฐ€ ๋Œ€ํญ ๋‹จ์ถ•๋  ๊ฒƒ์œผ๋กœ ๊ธฐ๋Œ€๋œ๋‹ค. ์–‘์ž์ปดํ“จํ„ฐ๋ฅผ ํ™œ์šฉํ•œ ๋ถ„์ž ์‹œ๋ฎฌ๋ ˆ์ด์…˜์ด ๊ฐ€๋Šฅํ•ด์ง€๋ฉด์„œ, ์‹ ์•ฝ ๊ฐœ๋ฐœ ๋น„์šฉ ์ ˆ๊ฐ๊ณผ ํšจ์œจ์„ฑ ์ฆ๋Œ€๊ฐ€ ์‹คํ˜„๋  ๊ฒƒ์ด๋‹ค.", + "๋ฌผ๋ฅ˜ ๋ฐ ๊ณต๊ธ‰๋ง ๊ด€๋ฆฌ ๋ถ„์•ผ์—์„œ๋„ ์–‘์ž์ปดํ“จํŒ…์˜ ์˜ํ–ฅ๋ ฅ์ด ํ™•๋Œ€๋  ์ „๋ง์ด๋‹ค. ๋ณต์žกํ•œ ๊ฒฝ๋กœ ์ตœ์ ํ™”์™€ ์žฌ๊ณ  ๊ด€๋ฆฌ์— ์–‘์ž ์•Œ๊ณ ๋ฆฌ์ฆ˜์„ ์ ์šฉํ•จ์œผ๋กœ์จ, ๋ฌผ๋ฅ˜ ๋น„์šฉ ์ ˆ๊ฐ๊ณผ ํšจ์œจ์„ฑ ํ–ฅ์ƒ์ด ๊ฐ€๋Šฅํ•ด์งˆ ๊ฒƒ์œผ๋กœ ์˜ˆ์ธก๋œ๋‹ค." + ] + }, + { + "title": "๊ธฐ์ˆ  ํ˜์‹ ์— ๋”ฐ๋ฅธ ์‚ฌํšŒ๊ฒฝ์ œ์  ๋ณ€ํ™”", + "content": [ + "AI์™€ ์–‘์ž์ปดํ“จํŒ…์˜ ๋ฐœ์ „์€ ๋…ธ๋™์‹œ์žฅ์˜ ๊ตฌ์กฐ์  ๋ณ€ํ™”๋ฅผ ๊ฐ€์†ํ™”ํ•  ๊ฒƒ์œผ๋กœ ์ „๋ง๋œ๋‹ค. ๋‹จ์ˆœ ๋ฐ˜๋ณต ์—…๋ฌด๋Š” ์ž๋™ํ™”๋˜๋Š” ๋ฐ˜๋ฉด, AI ์‹œ์Šคํ…œ ๊ด€๋ฆฌ์™€ ์–‘์ž์ปดํ“จํŒ… ์ „๋ฌธ๊ฐ€ ๊ฐ™์€ ์ƒˆ๋กœ์šด ์ง์ข…์˜ ์ˆ˜์š”๊ฐ€ ๊ธ‰์ฆํ•  ๊ฒƒ์œผ๋กœ ์˜ˆ์ƒ๋œ๋‹ค.", + "๊ต์œก ์‹œ์Šคํ…œ๋„ ํฐ ๋ณ€ํ™”๋ฅผ ๋งž์ดํ•  ๊ฒƒ์œผ๋กœ ๋ณด์ธ๋‹ค. AI ๊ธฐ๋ฐ˜ ๋งž์ถคํ˜• ํ•™์Šต๊ณผ ์–‘์ž์ปดํ“จํŒ… ์›๋ฆฌ์— ๋Œ€ํ•œ ์ดํ•ด๊ฐ€ ์ƒˆ๋กœ์šด ํ•„์ˆ˜ ๊ต์œก๊ณผ์ •์œผ๋กœ ์ž๋ฆฌ์žก์„ ๊ฒƒ์œผ๋กœ ์ „๋ง๋œ๋‹ค.", + "์ด๋Ÿฌํ•œ ๊ธฐ์ˆ  ํ˜์‹ ์€ ๊ตญ๊ฐ€ ๊ฐ„ ๊ธฐ์ˆ  ๊ฒฉ์ฐจ๋ฅผ ๋”์šฑ ์‹ฌํ™”์‹œํ‚ฌ ๊ฐ€๋Šฅ์„ฑ์ด ์žˆ๋‹ค. ์„ ์ง„๊ตญ๊ณผ ๊ฐœ๋ฐœ๋„์ƒ๊ตญ ๊ฐ„์˜ ๋””์ง€ํ„ธ ๊ฒฉ์ฐจ ํ•ด์†Œ๊ฐ€ ๊ตญ์ œ์‚ฌํšŒ์˜ ์ฃผ์š” ๊ณผ์ œ๋กœ ๋Œ€๋‘๋  ๊ฒƒ์œผ๋กœ ์˜ˆ์ธก๋œ๋‹ค." 
+ ] + } + ], + "categories": [ + "๊ธฐ์ˆ ", + "์‚ฐ์—…", + "๋ฏธ๋ž˜์ „๋ง", + "๊ฒฝ์ œ" + ], + "entities": { + "people": [], + "organizations": [ + "๊ธˆ์œต๊ถŒ", + "์ œ์•ฝํšŒ์‚ฌ", + "๋ฌผ๋ฅ˜๊ธฐ์—…" + ], + "groups": [ + "AI ๊ฐœ๋ฐœ์ž", + "์–‘์ž์ปดํ“จํŒ… ์ „๋ฌธ๊ฐ€", + "๊ต์œก๊ธฐ๊ด€" + ], + "countries": [ + "ํ•œ๊ตญ", + "๋ฏธ๊ตญ", + "์ค‘๊ตญ" + ], + "events": [ + "AI ์ƒ์šฉํ™”", + "์–‘์ž์ปดํ“จํ„ฐ ์‹ค์šฉํ™”", + "๋””์ง€ํ„ธ ์ „ํ™˜" + ] + }, + "source_keyword": "2025๋…„ ๊ธฐ์ˆ  ํŠธ๋ Œ๋“œ", + "source_count": 2 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/custom_article_professional.json b/backup-services/ai-writer/backend/custom_article_professional.json new file mode 100644 index 0000000..ce04357 --- /dev/null +++ b/backup-services/ai-writer/backend/custom_article_professional.json @@ -0,0 +1,62 @@ +{ + "news_id": "3109c578-9b08-4cd0-a9d6-3d92b97e64d4", + "title": "2025๋…„ ๊ธฐ์ˆ  ํ˜์‹ ์˜ ๋ฌผ๊ฒฐ, AIยท์–‘์ž์ปดํ“จํŒ…์ด ์ด๋„๋Š” ์ƒˆ๋กœ์šด ํŒจ๋Ÿฌ๋‹ค์ž„", + "created_at": "2025-09-13T00:31:52.782760", + "summary": "2025๋…„, ์ƒ์„ฑํ˜• AI์™€ ์–‘์ž์ปดํ“จํŒ…์˜ ์ƒ์šฉํ™”๋กœ ์‚ฐ์—… ์ „๋ฐ˜์— ํ˜์‹ ์  ๋ณ€ํ™”๊ฐ€ ์˜ˆ์ƒ๋˜๋ฉฐ, ์ธ๊ฐ„-AI ํ˜‘์—…์ด ์ผ์ƒํ™”๋  ์ „๋ง", + "subtopics": [ + { + "title": "์ƒ์„ฑํ˜• AI๊ฐ€ ์ฃผ๋„ํ•˜๋Š” ์ฐฝ์˜์  ํ˜์‹ ", + "content": [ + "2025๋…„์€ ์ƒ์„ฑํ˜• AI ๊ธฐ์ˆ ์ด ์ „๋ก€ ์—†๋Š” ์ˆ˜์ค€์œผ๋กœ ๋ฐœ์ „ํ•˜์—ฌ ์ฐฝ์˜์  ์˜์—ญ์—์„œ๋„ ํš๊ธฐ์ ์ธ ๋ณ€ํ™”๊ฐ€ ์˜ˆ์ƒ๋œ๋‹ค. ๊ธฐ์กด์— ์ธ๊ฐ„์˜ ๊ณ ์œ  ์˜์—ญ์œผ๋กœ ์—ฌ๊ฒจ์กŒ๋˜ ์˜ˆ์ˆ  ์ฐฝ์ž‘, ์ฝ˜ํ…์ธ  ์ œ์ž‘, ๋””์ž์ธ ๋ถ„์•ผ์—์„œ AI๊ฐ€ ํ•ต์‹ฌ ํ˜‘๋ ฅ์ž๋กœ ์ž๋ฆฌ์žก์„ ์ „๋ง์ด๋‹ค.", + "ํŠนํžˆ ์˜๋ฃŒ ๋ถ„์•ผ์—์„œ๋Š” AI๊ฐ€ ์งˆ๋ณ‘ ์ง„๋‹จ๊ณผ ์น˜๋ฃŒ ๊ณ„ํš ์ˆ˜๋ฆฝ์— ์ ๊ทน์ ์œผ๋กœ ํ™œ์šฉ๋  ๊ฒƒ์œผ๋กœ ์˜ˆ์ธก๋œ๋‹ค. AI๋Š” ๋ฐฉ๋Œ€ํ•œ ์˜๋ฃŒ ๋ฐ์ดํ„ฐ๋ฅผ ๋ถ„์„ํ•˜์—ฌ ๊ฐœ์ธ ๋งž์ถคํ˜• ์น˜๋ฃŒ๋ฒ•์„ ์ œ์‹œํ•˜๊ณ , ์˜๋ฃŒ์ง„์˜ ์˜์‚ฌ๊ฒฐ์ •์„ ํšจ๊ณผ์ ์œผ๋กœ ์ง€์›ํ•  ๊ฒƒ์œผ๋กœ ๊ธฐ๋Œ€๋œ๋‹ค.", + "๊ต์œก ๋ถ„์•ผ์—์„œ๋„ AI ๊ธฐ๋ฐ˜์˜ ๋งž์ถคํ˜• ํ•™์Šต ์‹œ์Šคํ…œ์ด ๋ณดํŽธํ™”๋  ์ „๋ง์ด๋‹ค. 
ํ•™์Šต์ž์˜ ์ดํ•ด๋„์™€ ์ง„๋„์— ๋”ฐ๋ผ ์ตœ์ ํ™”๋œ ์ปค๋ฆฌํ˜๋Ÿผ์„ ์ œ๊ณตํ•˜๊ณ , ์‹ค์‹œ๊ฐ„์œผ๋กœ ํ•™์Šต ์„ฑ๊ณผ๋ฅผ ๋ถ„์„ํ•˜์—ฌ ๊ฐœ์„ ์ ์„ ์ œ์‹œํ•˜๋Š” ๋“ฑ ๊ต์œก์˜ ์งˆ์  ํ–ฅ์ƒ์ด ๊ธฐ๋Œ€๋œ๋‹ค." + ] + }, + { + "title": "์–‘์ž์ปดํ“จํŒ…์˜ ์‚ฐ์—… ํ˜์‹  ์ฃผ๋„", + "content": [ + "2025๋…„์€ ์–‘์ž์ปดํ“จํŒ…์ด ์‹ค์šฉํ™” ๋‹จ๊ณ„์— ์ง„์ž…ํ•˜๋Š” ์›๋…„์ด ๋  ๊ฒƒ์œผ๋กœ ์ „๋ง๋œ๋‹ค. ํŠนํžˆ ๊ธˆ์œต ์‚ฐ์—…์—์„œ๋Š” ๋ณต์žกํ•œ ์œ„ํ—˜ ๋ถ„์„๊ณผ ํฌํŠธํด๋ฆฌ์˜ค ์ตœ์ ํ™”์— ์–‘์ž์ปดํ“จํŒ…์„ ํ™œ์šฉํ•˜์—ฌ ํˆฌ์ž ์ „๋žต์˜ ์ •ํ™•๋„๋ฅผ ๋†’์ผ ๊ฒƒ์œผ๋กœ ์˜ˆ์ƒ๋œ๋‹ค.", + "์ œ์•ฝ ์‚ฐ์—…์—์„œ๋Š” ์–‘์ž์ปดํ“จํ„ฐ๋ฅผ ํ™œ์šฉํ•œ ์‹ ์•ฝ ๊ฐœ๋ฐœ์ด ๊ฐ€์†ํ™”๋  ์ „๋ง์ด๋‹ค. ๋ถ„์ž ๊ตฌ์กฐ ์‹œ๋ฎฌ๋ ˆ์ด์…˜๊ณผ ์‹ ์•ฝ ํ›„๋ณด ๋ฌผ์งˆ ์Šคํฌ๋ฆฌ๋‹ ๊ณผ์ •์—์„œ ์–‘์ž์ปดํ“จํŒ…์˜ ๊ฐ•์ ์ด ๋ฐœํœ˜๋  ๊ฒƒ์œผ๋กœ ๊ธฐ๋Œ€๋œ๋‹ค.", + "๋ฌผ๋ฅ˜ ๋ถ„์•ผ์—์„œ๋„ ์–‘์ž์ปดํ“จํŒ…์„ ํ†ตํ•œ ์ตœ์ ํ™”๊ฐ€ ์‹คํ˜„๋  ์ „๋ง์ด๋‹ค. ๋ณต์žกํ•œ ๊ณต๊ธ‰๋ง ๊ด€๋ฆฌ์™€ ๋ฐฐ์†ก ๊ฒฝ๋กœ ์ตœ์ ํ™”์— ์–‘์ž์ปดํ“จํŒ…์„ ๋„์ž…ํ•จ์œผ๋กœ์จ ๋ฌผ๋ฅ˜ ๋น„์šฉ ์ ˆ๊ฐ๊ณผ ํšจ์œจ์„ฑ ํ–ฅ์ƒ์ด ๊ฐ€๋Šฅํ•ด์งˆ ๊ฒƒ์œผ๋กœ ์˜ˆ์ธก๋œ๋‹ค." + ] + }, + { + "title": "์ธ๊ฐ„-๊ธฐ๊ณ„ ํ˜‘์—…์˜ ์ƒˆ๋กœ์šด ํŒจ๋Ÿฌ๋‹ค์ž„", + "content": [ + "2025๋…„์—๋Š” AI์™€ ์ธ๊ฐ„์˜ ํ˜‘์—…์ด ์ผ์ƒํ™”๋˜๋ฉด์„œ ์—…๋ฌด ๋ฐฉ์‹์˜ ๊ทผ๋ณธ์ ์ธ ๋ณ€ํ™”๊ฐ€ ์˜ˆ์ƒ๋œ๋‹ค. ๋‹จ์ˆœ ๋ฐ˜๋ณต์ ์ธ ์—…๋ฌด๋Š” AI๊ฐ€ ๋‹ด๋‹นํ•˜๊ณ , ์ธ๊ฐ„์€ ์ „๋žต์  ์˜์‚ฌ๊ฒฐ์ •๊ณผ ์ฐฝ์˜์  ๋ฌธ์ œ ํ•ด๊ฒฐ์— ์ง‘์ค‘ํ•˜๋Š” ๋ฐฉ์‹์œผ๋กœ ์—…๋ฌด ๋ถ„๋‹ด์ด ์ด๋ฃจ์–ด์งˆ ๊ฒƒ์ด๋‹ค.", + "์ด๋Ÿฌํ•œ ๋ณ€ํ™”๋Š” ๋…ธ๋™์‹œ์žฅ์˜ ๊ตฌ์กฐ์  ๋ณ€ํ™”๋กœ ์ด์–ด์งˆ ์ „๋ง์ด๋‹ค. AI์™€ ํ˜‘์—…ํ•  ์ˆ˜ ์žˆ๋Š” ๋””์ง€ํ„ธ ์—ญ๋Ÿ‰์ด ํ•„์ˆ˜์ ์ธ ์ง๋ฌด ์—ญ๋Ÿ‰์œผ๋กœ ๋ถ€์ƒํ•˜๋ฉฐ, ์ƒˆ๋กœ์šด ํ˜•ํƒœ์˜ ์ง์—…์ด ๋“ฑ์žฅํ•  ๊ฒƒ์œผ๋กœ ์˜ˆ์ธก๋œ๋‹ค.", + "ํ•˜์ง€๋งŒ ์ด๋Ÿฌํ•œ ๋ณ€ํ™” ์†์—์„œ๋„ ์œค๋ฆฌ์  ํŒ๋‹จ๊ณผ ๊ฐ์„ฑ์  ์†Œํ†ต๊ณผ ๊ฐ™์€ ์ธ๊ฐ„ ๊ณ ์œ ์˜ ๊ฐ€์น˜๋Š” ๋”์šฑ ์ค‘์š”ํ•ด์งˆ ๊ฒƒ์œผ๋กœ ์ „๋ง๋œ๋‹ค. 
๊ธฐ์ˆ  ๋ฐœ์ „์ด ๊ฐ€์ ธ์˜ฌ ํ˜œํƒ์„ ์ตœ๋Œ€ํ™”ํ•˜๋ฉด์„œ๋„ ์ธ๊ฐ„ ์ค‘์‹ฌ์˜ ๊ฐ€์น˜๋ฅผ ์ง€์ผœ๋‚˜๊ฐ€๋Š” ๊ท ํ˜•์ด ์ค‘์š”ํ•œ ๊ณผ์ œ๋กœ ๋Œ€๋‘๋  ๊ฒƒ์ด๋‹ค." + ] + } + ], + "categories": [ + "๊ธฐ์ˆ ", + "๋ฏธ๋ž˜์ „๋ง", + "์‚ฐ์—…๋™ํ–ฅ" + ], + "entities": { + "people": [], + "organizations": [ + "AI ๊ธฐ์—…๋“ค", + "์ œ์•ฝํšŒ์‚ฌ๋“ค", + "๋ฌผ๋ฅ˜๊ธฐ์—…๋“ค" + ], + "groups": [ + "์˜๋ฃŒ์ง„", + "๊ต์œก์ž", + "๊ธฐ์ˆ ์ „๋ฌธ๊ฐ€" + ], + "countries": [ + "ํ•œ๊ตญ", + "๋ฏธ๊ตญ", + "์ค‘๊ตญ" + ], + "events": [ + "2025๋…„ ๊ธฐ์ˆ ํ˜์‹ ", + "์–‘์ž์ปดํ“จํŒ… ์ƒ์šฉํ™”", + "AI ํ˜๋ช…" + ] + }, + "source_keyword": "2025๋…„ ๊ธฐ์ˆ  ํŠธ๋ Œ๋“œ", + "source_count": 2 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/generated_article.json b/backup-services/ai-writer/backend/generated_article.json new file mode 100644 index 0000000..4046dbe --- /dev/null +++ b/backup-services/ai-writer/backend/generated_article.json @@ -0,0 +1,73 @@ +{ + "news_id": "ea9f3734-6a93-4ca7-8ebe-b85612e2fd0a", + "title": "์ •๋ถ€, ๋‚ด๋…„ AI ์‚ฐ์—…์— 10์กฐ์› ํˆฌ์ž...ํ•œ๊ตญ ๊ฒฝ์ œ ์ฒด์งˆ ๋Œ€์ „ํ™˜ ๋‚˜์„ ๋‹ค", + "created_at": "2025-09-13T01:09:43.892704", + "summary": "์ •๋ถ€๊ฐ€ 2025๋…„ ์ธ๊ณต์ง€๋Šฅ ์‚ฐ์—… ์œก์„ฑ์„ ์œ„ํ•ด 10์กฐ์› ๊ทœ๋ชจ์˜ ๋Œ€๊ทœ๋ชจ ํˆฌ์ž๋ฅผ ๋‹จํ–‰ํ•˜๋ฉฐ ๋””์ง€ํ„ธ ๊ฒฝ์ œ ์ „ํ™˜ ๊ฐ€์†ํ™”์— ๋‚˜์„ ๋‹ค", + "subtopics": [ + { + "title": "์ •๋ถ€์˜ AI ์‚ฐ์—… ์œก์„ฑ ์ฒญ์‚ฌ์ง„", + "content": [ + "์ •๋ถ€๊ฐ€ 2025๋…„ ์ธ๊ณต์ง€๋Šฅ(AI) ์‚ฐ์—… ์œก์„ฑ์„ ์œ„ํ•ด 10์กฐ์› ๊ทœ๋ชจ์˜ ํˆฌ์ž๋ฅผ ๋‹จํ–‰ํ•œ๋‹ค. ์ด๋Š” ํ•œ๊ตญ ๊ฒฝ์ œ์˜ ๋””์ง€ํ„ธ ์ „ํ™˜์„ ๊ฐ€์†ํ™”ํ•˜๊ณ  ๊ธ€๋กœ๋ฒŒ AI ๊ฐ•๊ตญ์œผ๋กœ ๋„์•ฝํ•˜๊ธฐ ์œ„ํ•œ ์ „๋žต์  ๊ฒฐ์ •์ด๋‹ค.", + "ํˆฌ์ž์˜ ์ฃผ์š” ๋ฐฉํ–ฅ์€ AI ๊ธฐ์ˆ  ๊ฐœ๋ฐœ, ์ธํ”„๋ผ ๊ตฌ์ถ•, ์ „๋ฌธ์ธ๋ ฅ ์–‘์„ฑ ๋“ฑ์œผ๋กœ, ํŠนํžˆ ๋ฐ˜๋„์ฒด์™€ ๊ฐ™์€ ํ•ต์‹ฌ ์‚ฐ์—…๊ณผ์˜ ์‹œ๋„ˆ์ง€ ์ฐฝ์ถœ์— ์ค‘์ ์„ ๋‘˜ ์˜ˆ์ •์ด๋‹ค." 
+ ] + }, + { + "title": "๋ฏผ๊ด€ ํ˜‘๋ ฅ ์ฒด๊ณ„ ๊ตฌ์ถ•", + "content": [ + "์ •๋ถ€๋Š” AI ์‚ฐ์—… ์œก์„ฑ์„ ์œ„ํ•ด ๋Œ€๊ธฐ์—…, ์Šคํƒ€ํŠธ์—…, ์—ฐ๊ตฌ๊ธฐ๊ด€ ๋“ฑ๊ณผ์˜ ํ˜‘๋ ฅ ์ฒด๊ณ„๋ฅผ ๊ฐ•ํ™”ํ•œ๋‹ค. ์†Œ๋ฒ„๋ฆฐAI๋ฅผ ๋น„๋กฏํ•œ ๊ตญ๋‚ด AI ๊ธฐ์—…๋“ค๊ณผ์˜ ํ˜‘๋ ฅ์„ ํ†ตํ•ด ์‹ค์งˆ์ ์ธ ์„ธ๊ณ„ 2์œ„ AI ๊ฐ•๊ตญ ๋„์•ฝ์„ ๋ชฉํ‘œ๋กœ ํ•˜๊ณ  ์žˆ๋‹ค.", + "ํŠนํžˆ AI ์ „๋ฌธ๊ฐ€ ๊ณต๋ชจ์™€ ์ „๋‹ด ์กฐ์ง ์‹ ์„ค ๋“ฑ์„ ํ†ตํ•ด ์ฒด๊ณ„์ ์ธ ์‚ฐ์—… ์œก์„ฑ ๊ธฐ๋ฐ˜์„ ๋งˆ๋ จํ•  ๊ณ„ํš์ด๋‹ค." + ] + }, + { + "title": "๊ธ€๋กœ๋ฒŒ ๊ฒฝ์Ÿ๋ ฅ ๊ฐ•ํ™” ์ „๋žต", + "content": [ + "์ •๋ถ€๋Š” ๊ตญ๋‚ด AI ๊ธฐ์—…๋“ค์˜ ๊ธ€๋กœ๋ฒŒ ๊ฒฝ์Ÿ๋ ฅ ๊ฐ•ํ™”๋ฅผ ์œ„ํ•ด ๊ธฐ์ˆ  ๊ฐœ๋ฐœ ์ง€์›, ํ•ด์™ธ ์‹œ์žฅ ์ง„์ถœ ์ง€์›, ๊ทœ์ œ ๊ฐœ์„  ๋“ฑ ๋‹ค๊ฐ์ ์ธ ์ง€์›์ฑ…์„ ๋งˆ๋ จํ•œ๋‹ค.", + "ํŠนํžˆ AI ์‚ฐ์—…์˜ ํ•ต์‹ฌ ์ธํ”„๋ผ์ธ ๋ฐ˜๋„์ฒด ๋ถ„์•ผ์—์„œ SKํ•˜์ด๋‹‰์Šค์˜ HBM4 ๊ฐœ๋ฐœ ์™„๋ฃŒ ๋“ฑ ๊ฐ€์‹œ์ ์ธ ์„ฑ๊ณผ๊ฐ€ ๋‚˜ํƒ€๋‚˜๊ณ  ์žˆ์–ด, ์ด๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ํ•œ ์‹œ๋„ˆ์ง€ ํšจ๊ณผ๊ฐ€ ๊ธฐ๋Œ€๋œ๋‹ค." + ] + } + ], + "categories": [ + "๊ฒฝ์ œ", + "๊ธฐ์ˆ ", + "์‚ฐ์—…์ •์ฑ…" + ], + "entities": { + "people": [ + "ํ•˜์ •์šฐ ์†Œ๋ฒ„๋ฆฐAI ๋Œ€ํ‘œ" + ], + "organizations": [ + "์†Œ๋ฒ„๋ฆฐAI", + "SKํ•˜์ด๋‹‰์Šค", + "๊ณผํ•™๊ธฐ์ˆ ์ •๋ณดํ†ต์‹ ๋ถ€" + ], + "groups": [ + "AI ๊ธฐ์—…", + "์Šคํƒ€ํŠธ์—…" + ], + "countries": [ + "๋Œ€ํ•œ๋ฏผ๊ตญ", + "๋ฏธ๊ตญ" + ], + "events": [ + { + "name": "2025๋…„ AI ์‚ฐ์—… ์œก์„ฑ ๊ณ„ํš ๋ฐœํ‘œ", + "date": "2025๋…„", + "location": "๋Œ€ํ•œ๋ฏผ๊ตญ" + } + ], + "keywords": [ + "์ธ๊ณต์ง€๋Šฅ", + "AI ์‚ฐ์—…", + "๋””์ง€ํ„ธ ์ „ํ™˜", + "10์กฐ์› ํˆฌ์ž", + "๋ฐ˜๋„์ฒด", + "HBM4", + "๊ธ€๋กœ๋ฒŒ ๊ฒฝ์Ÿ๋ ฅ", + "๋ฏผ๊ด€ํ˜‘๋ ฅ", + "์ „๋ฌธ์ธ๋ ฅ ์–‘์„ฑ", + "๊ธฐ์ˆ ๊ฐœ๋ฐœ" + ] + }, + "source_keyword": "์ธ๊ณต์ง€๋Šฅ", + "source_count": 5 +} \ No newline at end of file diff --git a/backup-services/ai-writer/backend/requirements.txt b/backup-services/ai-writer/backend/requirements.txt new file mode 100644 index 0000000..8696605 --- /dev/null +++ b/backup-services/ai-writer/backend/requirements.txt @@ -0,0 +1,9 @@ +fastapi==0.104.1 +uvicorn[standard]==0.24.0 
+httpx==0.25.2 +pydantic==2.5.0 +motor==3.1.1 +pymongo==4.3.3 +anthropic==0.39.0 +python-multipart==0.0.6 +redis[hiredis]==5.0.1 \ No newline at end of file diff --git a/backup-services/ai-writer/backend/test_ai_writer.py b/backup-services/ai-writer/backend/test_ai_writer.py new file mode 100755 index 0000000..3b45bbf --- /dev/null +++ b/backup-services/ai-writer/backend/test_ai_writer.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +""" +AI Writer Service Test +Claude API๋ฅผ ์‚ฌ์šฉํ•œ ์ „๋ฌธ์ ์ธ ๋‰ด์Šค ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ +""" +import asyncio +import httpx +import json +from datetime import datetime + +# Service URL +SERVICE_URL = "http://localhost:8019" + +async def test_article_generation(): + """์ธ๊ณต์ง€๋Šฅ ํ‚ค์›Œ๋“œ๋กœ ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ""" + async with httpx.AsyncClient(timeout=120.0) as client: + print("\n" + "="*70) + print(" AI Writer Service - ์ „๋ฌธ ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ ") + print("="*70) + + print("\n๐Ÿ“ฐ '์ธ๊ณต์ง€๋Šฅ' ํ‚ค์›Œ๋“œ๋กœ ์ „๋ฌธ ๊ธฐ์‚ฌ ์ƒ์„ฑ ์ค‘...") + print("-" * 50) + + # Generate article + response = await client.post( + f"{SERVICE_URL}/api/generate", + json={ + "keyword": "์ธ๊ณต์ง€๋Šฅ", + "limit": 5, + "google_results_per_title": 3, + "lang": "ko", + "country": "KR", + "style": "professional" + } + ) + + if response.status_code == 200: + article = response.json() + + print(f"\nโœ… ๊ธฐ์‚ฌ ์ƒ์„ฑ ์™„๋ฃŒ!") + print(f"\n๐Ÿ“Œ ๊ธฐ์‚ฌ ID: {article['news_id']}") + print(f"๐Ÿ“… ์ƒ์„ฑ ์‹œ๊ฐ„: {article['created_at']}") + print(f"\n๐Ÿ“ฐ ์ œ๋ชฉ: {article['title']}") + print(f"๐Ÿ“ ์š”์•ฝ: {article['summary']}") + + print(f"\n๐Ÿ” ์นดํ…Œ๊ณ ๋ฆฌ: {', '.join(article['categories'])}") + + # Print subtopics + print(f"\n๐Ÿ“š ์†Œ์ฃผ์ œ ({len(article['subtopics'])}๊ฐœ):") + for i, subtopic in enumerate(article['subtopics'], 1): + print(f"\n [{i}] {subtopic['title']}") + print(f" ๋ฌธ๋‹จ ์ˆ˜: {len(subtopic['content'])}๊ฐœ") + for j, paragraph in enumerate(subtopic['content'][:1], 1): # Show first paragraph only + print(f" ๋ฏธ๋ฆฌ๋ณด๊ธฐ: 
{paragraph[:150]}...") + + # Print entities + entities = article['entities'] + print(f"\n๐Ÿท๏ธ ์ถ”์ถœ๋œ ๊ฐœ์ฒด:") + if entities['people']: + print(f" ๐Ÿ‘ค ์ธ๋ฌผ: {', '.join(entities['people'])}") + if entities['organizations']: + print(f" ๐Ÿข ๊ธฐ๊ด€: {', '.join(entities['organizations'])}") + if entities['groups']: + print(f" ๐Ÿ‘ฅ ๋‹จ์ฒด: {', '.join(entities['groups'])}") + if entities['countries']: + print(f" ๐ŸŒ ๊ตญ๊ฐ€: {', '.join(entities['countries'])}") + if entities.get('events'): + events = entities['events'] + if events: + print(f" ๐Ÿ“… ์ด๋ฒคํŠธ ({len(events)}๊ฐœ):") + for evt in events[:3]: # ์ฒ˜์Œ 3๊ฐœ๋งŒ ํ‘œ์‹œ + if isinstance(evt, dict): + evt_str = f" - {evt.get('name', '')}" + if evt.get('date'): + evt_str += f" [{evt['date']}]" + if evt.get('location'): + evt_str += f" @{evt['location']}" + print(evt_str) + else: + # ์ด์ „ ํ˜•์‹ (๋ฌธ์ž์—ด) ์ง€์› + print(f" - {evt}") + if entities.get('keywords'): + keywords = entities['keywords'] + if keywords: + print(f" ๐Ÿ”‘ ํ‚ค์›Œ๋“œ: {', '.join(keywords[:5])}" + + ("..." 
if len(keywords) > 5 else "")) + + print(f"\n๐Ÿ“Š ์ฐธ์กฐ ์†Œ์Šค: {article.get('source_count', 0)}๊ฐœ") + + # Save full article to file + with open('generated_article.json', 'w', encoding='utf-8') as f: + json.dump(article, f, ensure_ascii=False, indent=2) + print(f"\n๐Ÿ’พ ์ „์ฒด ๊ธฐ์‚ฌ๊ฐ€ 'generated_article.json'์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + + else: + print(f"โŒ ์˜ค๋ฅ˜: {response.status_code}") + print(f" ์ƒ์„ธ: {response.text}") + +async def test_health_check(): + """์„œ๋น„์Šค ์ƒํƒœ ํ™•์ธ""" + async with httpx.AsyncClient() as client: + print("\n" + "="*60) + print("์„œ๋น„์Šค Health Check") + print("="*60) + + response = await client.get(f"{SERVICE_URL}/health") + if response.status_code == 200: + data = response.json() + print(f"โœ“ AI Writer ์„œ๋น„์Šค ์ƒํƒœ: {data.get('status', 'unknown')}") + if 'services' in data: + print(f" - News Aggregator: {data['services'].get('news_aggregator', 'unknown')}") + print(f" - MongoDB: {data['services'].get('mongodb', 'unknown')}") + print(f" - Claude API: {data['services'].get('claude_api', 'unknown')}") + if 'error' in data: + print(f" - Error: {data['error']}") + else: + print(f"โœ— Health check ์‹คํŒจ: {response.status_code}") + +async def test_batch_generation(): + """์—ฌ๋Ÿฌ ํ‚ค์›Œ๋“œ ์ผ๊ด„ ์ฒ˜๋ฆฌ ํ…Œ์ŠคํŠธ""" + async with httpx.AsyncClient(timeout=180.0) as client: + print("\n" + "="*60) + print("์ผ๊ด„ ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ") + print("="*60) + + keywords = ["AI ํ˜์‹ ", "๋””์ง€ํ„ธ ์ „ํ™˜", "์Šค๋งˆํŠธ์‹œํ‹ฐ"] + print(f"\nํ‚ค์›Œ๋“œ: {', '.join(keywords)}") + + response = await client.post( + f"{SERVICE_URL}/api/generate/batch", + json=keywords, + params={"style": "analytical"} + ) + + if response.status_code == 200: + data = response.json() + print(f"\nโœ… ์ฒ˜๋ฆฌ ์™„๋ฃŒ: {data['total_processed']}๊ฐœ") + + if data['success']: + print("\n์„ฑ๊ณตํ•œ ๊ธฐ์‚ฌ:") + for item in data['success']: + print(f" - {item['keyword']}: {item['title'][:50]}...") + + if data['errors']: + print("\n์‹คํŒจํ•œ ํ•ญ๋ชฉ:") + for item in 
data['errors']: + print(f" - {item['keyword']}: {item['error']}") + else: + print(f"โŒ ์˜ค๋ฅ˜: {response.status_code}") + +async def main(): + """๋ฉ”์ธ ํ…Œ์ŠคํŠธ ์‹คํ–‰""" + print("\n" + "="*70) + print(" AI Writer Service Test Suite ") + print(" RSS โ†’ Google Search โ†’ Claude AI ๊ธฐ์‚ฌ ์ƒ์„ฑ ") + print("="*70) + + # Run tests + await test_health_check() + await test_article_generation() + # await test_batch_generation() # Optional: batch test + + print("\n" + "="*70) + print(" ํ…Œ์ŠคํŠธ ์™„๋ฃŒ ") + print("="*70) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/backup-services/ai-writer/backend/test_prompt_generation.py b/backup-services/ai-writer/backend/test_prompt_generation.py new file mode 100644 index 0000000..12d4764 --- /dev/null +++ b/backup-services/ai-writer/backend/test_prompt_generation.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +""" +AI Writer Service - ํ”„๋กฌํ”„ํŠธ ๊ธฐ๋ฐ˜ ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ +๋‹ค์–‘ํ•œ ์Šคํƒ€์ผ๊ณผ ํ‚ค์›Œ๋“œ๋กœ ๊ธฐ์‚ฌ๋ฅผ ์ƒ์„ฑํ•˜๋Š” ํ…Œ์ŠคํŠธ +""" +import asyncio +import httpx +import json +from datetime import datetime + +# Service URL +SERVICE_URL = "http://localhost:8019" + +async def test_different_styles(): + """๋‹ค์–‘ํ•œ ์Šคํƒ€์ผ๋กœ ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ""" + + test_cases = [ + { + "keyword": "์ „๊ธฐ์ฐจ", + "style": "professional", + "description": "์ „ํ†ต์ ์ธ ๋‰ด์Šค ๊ธฐ์‚ฌ ์Šคํƒ€์ผ" + }, + { + "keyword": "์ „๊ธฐ์ฐจ", + "style": "analytical", + "description": "๋ถ„์„์ ์ด๊ณ  ์‹ฌ์ธต์ ์ธ ์Šคํƒ€์ผ" + }, + { + "keyword": "์ „๊ธฐ์ฐจ", + "style": "investigative", + "description": "ํƒ์‚ฌ๋ณด๋„ ์Šคํƒ€์ผ" + } + ] + + async with httpx.AsyncClient(timeout=180.0) as client: + for test_case in test_cases: + print("\n" + "="*70) + print(f" {test_case['description']} ํ…Œ์ŠคํŠธ") + print("="*70) + print(f"ํ‚ค์›Œ๋“œ: {test_case['keyword']}") + print(f"์Šคํƒ€์ผ: {test_case['style']}") + print("-" * 50) + + try: + response = await client.post( + f"{SERVICE_URL}/api/generate", + 
json={ + "keyword": test_case["keyword"], + "limit": 3, # RSS ํ•ญ๋ชฉ ์ˆ˜ ์ค„์—ฌ์„œ ๋น ๋ฅธ ํ…Œ์ŠคํŠธ + "google_results_per_title": 2, + "lang": "ko", + "country": "KR", + "style": test_case["style"] + } + ) + + if response.status_code == 200: + article = response.json() + print(f"\nโœ… ๊ธฐ์‚ฌ ์ƒ์„ฑ ์„ฑ๊ณต!") + print(f"๐Ÿ“ฐ ์ œ๋ชฉ: {article['title']}") + print(f"๐Ÿ“ ์š”์•ฝ: {article['summary']}") + print(f"๐Ÿ” ์นดํ…Œ๊ณ ๋ฆฌ: {', '.join(article['categories'])}") + print(f"๐Ÿ“š ์†Œ์ฃผ์ œ ์ˆ˜: {len(article['subtopics'])}") + + # ํ‚ค์›Œ๋“œ ์ถœ๋ ฅ + if 'entities' in article and 'keywords' in article['entities']: + keywords = article['entities']['keywords'] + print(f"๐Ÿ”‘ ํ‚ค์›Œ๋“œ ({len(keywords)}๊ฐœ): {', '.join(keywords[:5])}" + + ("..." if len(keywords) > 5 else "")) + + # ์ด๋ฒคํŠธ ์ •๋ณด ์ถœ๋ ฅ + if 'entities' in article and 'events' in article['entities']: + events = article['entities']['events'] + if events: + print(f"๐Ÿ“… ์ด๋ฒคํŠธ ({len(events)}๊ฐœ):") + for evt in events[:2]: # ์ฒ˜์Œ 2๊ฐœ๋งŒ ํ‘œ์‹œ + if isinstance(evt, dict): + evt_str = f" - {evt.get('name', '')}" + if evt.get('date'): + evt_str += f" [{evt['date']}]" + if evt.get('location'): + evt_str += f" @{evt['location']}" + print(evt_str) + + # ์ฒซ ๋ฒˆ์งธ ์†Œ์ฃผ์ œ์˜ ์ฒซ ๋ฌธ๋‹จ๋งŒ ์ถœ๋ ฅ + if article['subtopics']: + first_topic = article['subtopics'][0] + print(f"\n์ฒซ ๋ฒˆ์งธ ์†Œ์ฃผ์ œ: {first_topic['title']}") + if first_topic['content']: + print(f"๋ฏธ๋ฆฌ๋ณด๊ธฐ: {first_topic['content'][0][:200]}...") + + # ํŒŒ์ผ๋กœ ์ €์žฅ + filename = f"article_{test_case['keyword']}_{test_case['style']}.json" + with open(filename, 'w', encoding='utf-8') as f: + json.dump(article, f, ensure_ascii=False, indent=2) + print(f"\n๐Ÿ’พ '{filename}'์— ์ €์žฅ๋จ") + + else: + print(f"โŒ ์˜ค๋ฅ˜: {response.status_code}") + print(f"์ƒ์„ธ: {response.text}") + + except Exception as e: + print(f"โŒ ํ…Œ์ŠคํŠธ ์‹คํŒจ: {e}") + + # ๋‹ค์Œ ํ…Œ์ŠคํŠธ ์ „ ์ž ์‹œ ๋Œ€๊ธฐ + await asyncio.sleep(2) + +async def 
test_different_keywords(): + """๋‹ค์–‘ํ•œ ํ‚ค์›Œ๋“œ๋กœ ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ""" + + keywords = ["๋ธ”๋ก์ฒด์ธ", "๋ฉ”ํƒ€๋ฒ„์Šค", "์šฐ์ฃผ๊ฐœ๋ฐœ", "๊ธฐํ›„๋ณ€ํ™”", "K-POP"] + + async with httpx.AsyncClient(timeout=180.0) as client: + print("\n" + "="*70) + print(" ๋‹ค์–‘ํ•œ ํ‚ค์›Œ๋“œ ํ…Œ์ŠคํŠธ") + print("="*70) + + for keyword in keywords: + print(f"\n๐Ÿ” ํ‚ค์›Œ๋“œ: {keyword}") + print("-" * 30) + + try: + response = await client.post( + f"{SERVICE_URL}/api/generate", + json={ + "keyword": keyword, + "limit": 2, # ๋น ๋ฅธ ํ…Œ์ŠคํŠธ๋ฅผ ์œ„ํ•ด ์ค„์ž„ + "google_results_per_title": 2, + "lang": "ko", + "country": "KR", + "style": "professional" + } + ) + + if response.status_code == 200: + article = response.json() + print(f"โœ… ์„ฑ๊ณต: {article['title'][:50]}...") + print(f" ์นดํ…Œ๊ณ ๋ฆฌ: {', '.join(article['categories'][:3])}") + else: + print(f"โŒ ์‹คํŒจ: {response.status_code}") + + except Exception as e: + print(f"โŒ ์˜ค๋ฅ˜: {e}") + + await asyncio.sleep(1) + +async def test_custom_prompt(): + """์ปค์Šคํ…€ ํ”„๋กฌํ”„ํŠธ ํ…Œ์ŠคํŠธ - ์ง์ ‘ aggregated ๋ฐ์ดํ„ฐ ์ œ๊ณต""" + + # ๋ฏธ๋ฆฌ ์ˆ˜์ง‘๋œ ๋ฐ์ดํ„ฐ๋ฅผ ์‹œ๋ฎฌ๋ ˆ์ด์…˜ + custom_news_data = { + "keyword": "2025๋…„ ๊ธฐ์ˆ  ํŠธ๋ Œ๋“œ", + "news_items": [ + { + "rss_title": "AI์™€ ๋กœ๋ด‡์ด ๋ฐ”๊พธ๋Š” 2025๋…„ ์ผ์ƒ", + "google_results": [ + { + "title": "์ „๋ฌธ๊ฐ€๋“ค์ด ์˜ˆ์ธกํ•˜๋Š” 2025๋…„ AI ํ˜๋ช…", + "snippet": "2025๋…„ AI ๊ธฐ์ˆ ์ด ์ผ์ƒ์ƒํ™œ ์ „๋ฐ˜์„ ํ˜์‹ ํ•  ์ „๋ง...", + "full_content": { + "url": "https://example.com/ai-2025", + "content": "2025๋…„์—๋Š” AI๊ฐ€ ์˜๋ฃŒ, ๊ต์œก, ์—…๋ฌด ๋“ฑ ๋ชจ๋“  ๋ถ„์•ผ์—์„œ ์ธ๊ฐ„๊ณผ ํ˜‘์—…ํ•˜๋Š” ์‹œ๋Œ€๊ฐ€ ์—ด๋ฆด ๊ฒƒ์œผ๋กœ ์ „๋ง๋œ๋‹ค. ํŠนํžˆ ์ƒ์„ฑํ˜• AI์˜ ๋ฐœ์ „์œผ๋กœ ์ฐฝ์˜์  ์ž‘์—…์—์„œ๋„ AI์˜ ์—ญํ• ์ด ํฌ๊ฒŒ ํ™•๋Œ€๋  ๊ฒƒ์ด๋‹ค." 
+ } + } + ] + }, + { + "rss_title": "์–‘์ž์ปดํ“จํ„ฐ ์ƒ์šฉํ™” ์ž„๋ฐ•", + "google_results": [ + { + "title": "IBM, 2025๋…„ 1000ํ๋น„ํŠธ ์–‘์ž์ปดํ“จํ„ฐ ์ถœ์‹œ ์˜ˆ์ •", + "snippet": "IBM์ด 2025๋…„ ์ƒ์šฉ ์–‘์ž์ปดํ“จํ„ฐ ์ถœ์‹œ๋ฅผ ์•ž๋‘๊ณ ...", + "full_content": { + "url": "https://example.com/quantum-2025", + "content": "์–‘์ž์ปดํ“จํŒ…์ด ๋“œ๋””์–ด ์‹ค์šฉํ™” ๋‹จ๊ณ„์— ์ ‘์–ด๋“ค์—ˆ๋‹ค. 2025๋…„์—๋Š” ๊ธˆ์œต, ์ œ์•ฝ, ๋ฌผ๋ฅ˜ ๋“ฑ ๋‹ค์–‘ํ•œ ์‚ฐ์—…์—์„œ ์–‘์ž์ปดํ“จํ„ฐ๋ฅผ ํ™œ์šฉํ•œ ํ˜์‹ ์ด ์‹œ์ž‘๋  ์ „๋ง์ด๋‹ค." + } + } + ] + } + ] + } + + async with httpx.AsyncClient(timeout=180.0) as client: + print("\n" + "="*70) + print(" ์ปค์Šคํ…€ ๋ฐ์ดํ„ฐ๋กœ ๊ธฐ์‚ฌ ์ƒ์„ฑ") + print("="*70) + + for style in ["professional", "analytical"]: + print(f"\n์Šคํƒ€์ผ: {style}") + print("-" * 30) + + try: + response = await client.post( + f"{SERVICE_URL}/api/generate/from-aggregated", + json=custom_news_data, + params={"style": style} + ) + + if response.status_code == 200: + article = response.json() + print(f"โœ… ์ œ๋ชฉ: {article['title']}") + print(f" ์š”์•ฝ: {article['summary']}") + + # ์Šคํƒ€์ผ๋ณ„๋กœ ์ €์žฅ + filename = f"custom_article_{style}.json" + with open(filename, 'w', encoding='utf-8') as f: + json.dump(article, f, ensure_ascii=False, indent=2) + print(f" ๐Ÿ’พ '{filename}'์— ์ €์žฅ๋จ") + else: + print(f"โŒ ์‹คํŒจ: {response.text}") + + except Exception as e: + print(f"โŒ ์˜ค๋ฅ˜: {e}") + + await asyncio.sleep(2) + +async def main(): + """๋ฉ”์ธ ํ…Œ์ŠคํŠธ ์‹คํ–‰""" + print("\n" + "="*70) + print(" AI Writer ํ”„๋กฌํ”„ํŠธ ๊ธฐ๋ฐ˜ ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ") + print("="*70) + + # 1. ๋‹ค์–‘ํ•œ ์Šคํƒ€์ผ ํ…Œ์ŠคํŠธ + print("\n[1] ์Šคํƒ€์ผ๋ณ„ ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ") + await test_different_styles() + + # 2. ๋‹ค์–‘ํ•œ ํ‚ค์›Œ๋“œ ํ…Œ์ŠคํŠธ + print("\n[2] ํ‚ค์›Œ๋“œ๋ณ„ ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ") + await test_different_keywords() + + # 3. 
์ปค์Šคํ…€ ๋ฐ์ดํ„ฐ ํ…Œ์ŠคํŠธ + print("\n[3] ์ปค์Šคํ…€ ๋ฐ์ดํ„ฐ ๊ธฐ์‚ฌ ์ƒ์„ฑ ํ…Œ์ŠคํŠธ") + await test_custom_prompt() + + print("\n" + "="*70) + print(" ๋ชจ๋“  ํ…Œ์ŠคํŠธ ์™„๋ฃŒ!") + print("="*70) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/backup-services/ai-writer/worker/Dockerfile b/backup-services/ai-writer/worker/Dockerfile new file mode 100644 index 0000000..7869505 --- /dev/null +++ b/backup-services/ai-writer/worker/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Copy requirements +COPY backend/requirements.txt . + +# Install dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY backend/app /app + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV WORKER_COUNT=3 + +# Run worker +CMD ["python", "worker.py"] \ No newline at end of file diff --git a/services/google-search/README.md b/backup-services/google-search/README.md similarity index 100% rename from services/google-search/README.md rename to backup-services/google-search/README.md diff --git a/services/google-search/backend/.env.example b/backup-services/google-search/backend/.env.example similarity index 100% rename from services/google-search/backend/.env.example rename to backup-services/google-search/backend/.env.example diff --git a/services/google-search/backend/Dockerfile b/backup-services/google-search/backend/Dockerfile similarity index 100% rename from services/google-search/backend/Dockerfile rename to backup-services/google-search/backend/Dockerfile diff --git a/services/rss-feed/backend/app/__init__.py b/backup-services/google-search/backend/app/__init__.py similarity index 100% rename from services/rss-feed/backend/app/__init__.py rename to backup-services/google-search/backend/app/__init__.py diff --git a/services/google-search/backend/app/config.py b/backup-services/google-search/backend/app/config.py similarity index 100% rename from 
services/google-search/backend/app/config.py rename to backup-services/google-search/backend/app/config.py diff --git a/services/google-search/backend/app/main.py b/backup-services/google-search/backend/app/main.py similarity index 100% rename from services/google-search/backend/app/main.py rename to backup-services/google-search/backend/app/main.py diff --git a/services/google-search/backend/app/search_service.py b/backup-services/google-search/backend/app/search_service.py similarity index 100% rename from services/google-search/backend/app/search_service.py rename to backup-services/google-search/backend/app/search_service.py diff --git a/services/google-search/backend/requirements.txt b/backup-services/google-search/backend/requirements.txt similarity index 100% rename from services/google-search/backend/requirements.txt rename to backup-services/google-search/backend/requirements.txt diff --git a/backup-services/news-aggregator/backend/Dockerfile b/backup-services/news-aggregator/backend/Dockerfile new file mode 100644 index 0000000..a296111 --- /dev/null +++ b/backup-services/news-aggregator/backend/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . 
+ +# Run the application +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file diff --git a/backup-services/news-aggregator/backend/app/__init__.py b/backup-services/news-aggregator/backend/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backup-services/news-aggregator/backend/app/main.py b/backup-services/news-aggregator/backend/app/main.py new file mode 100644 index 0000000..625101f --- /dev/null +++ b/backup-services/news-aggregator/backend/app/main.py @@ -0,0 +1,365 @@ +""" +News Aggregator Service +RSS ํ”ผ๋“œ ์ œ๋ชฉ์„ ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰์œผ๋กœ ํ™•์žฅํ•˜๋Š” ํ†ตํ•ฉ ์„œ๋น„์Šค +""" +from fastapi import FastAPI, HTTPException, Query, BackgroundTasks +from fastapi.middleware.cors import CORSMiddleware +from typing import List, Optional, Dict, Any +from datetime import datetime +import httpx +import asyncio +from pydantic import BaseModel +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +app = FastAPI( + title="News Aggregator Service", + description="RSS ํ”ผ๋“œ์™€ ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰์„ ํ†ตํ•ฉํ•œ ๋‰ด์Šค ์ˆ˜์ง‘ ์„œ๋น„์Šค", + version="1.0.0" +) + +# CORS ์„ค์ • +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Configuration +RSS_SERVICE_URL = "http://rss-feed-backend:8000" +GOOGLE_SEARCH_SERVICE_URL = "http://google-search-backend:8000" + +# Response Models +class NewsItem(BaseModel): + """๋‰ด์Šค ํ•ญ๋ชฉ""" + rss_title: str + rss_link: Optional[str] = None + google_results: List[Dict[str, Any]] = [] + search_keyword: str + timestamp: datetime = None + +class AggregatedNews(BaseModel): + """ํ†ตํ•ฉ ๋‰ด์Šค ๊ฒฐ๊ณผ""" + keyword: str + rss_feed_url: str + total_rss_entries: int + processed_entries: int + news_items: List[NewsItem] + processing_time: float + +# HTTP Client +client = httpx.AsyncClient(timeout=30.0) + +@app.on_event("startup") 
+async def startup(): + """์„œ๋น„์Šค ์‹œ์ž‘""" + logger.info("News Aggregator Service starting...") + +@app.on_event("shutdown") +async def shutdown(): + """์„œ๋น„์Šค ์ข…๋ฃŒ""" + await client.aclose() + logger.info("News Aggregator Service stopped") + +@app.get("/") +async def root(): + return { + "service": "News Aggregator Service", + "version": "1.0.0", + "description": "RSS ํ”ผ๋“œ์™€ ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ ํ†ตํ•ฉ ์„œ๋น„์Šค", + "endpoints": { + "aggregate": "GET /api/aggregate", + "aggregate_by_location": "GET /api/aggregate/location", + "aggregate_by_topic": "GET /api/aggregate/topic", + "health": "GET /health" + } + } + +@app.get("/health") +async def health_check(): + """ํ—ฌ์Šค ์ฒดํฌ""" + try: + # Check RSS service + rss_response = await client.get(f"{RSS_SERVICE_URL}/health") + rss_healthy = rss_response.status_code == 200 + + # Check Google Search service + google_response = await client.get(f"{GOOGLE_SEARCH_SERVICE_URL}/health") + google_healthy = google_response.status_code == 200 + + return { + "status": "healthy" if (rss_healthy and google_healthy) else "degraded", + "services": { + "rss_feed": "healthy" if rss_healthy else "unhealthy", + "google_search": "healthy" if google_healthy else "unhealthy" + }, + "timestamp": datetime.now().isoformat() + } + except Exception as e: + return { + "status": "unhealthy", + "error": str(e), + "timestamp": datetime.now().isoformat() + } + +@app.get("/api/aggregate", response_model=AggregatedNews) +async def aggregate_news( + q: str = Query(..., description="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ"), + limit: int = Query(10, description="์ฒ˜๋ฆฌํ•  RSS ํ•ญ๋ชฉ ์ˆ˜", ge=1, le=50), + google_results_per_title: int = Query(5, description="๊ฐ ์ œ๋ชฉ๋‹น ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ˆ˜", ge=1, le=10), + lang: str = Query("ko", description="์–ธ์–ด ์ฝ”๋“œ"), + country: str = Query("KR", description="๊ตญ๊ฐ€ ์ฝ”๋“œ") +): + """ + ํ‚ค์›Œ๋“œ๋กœ RSS ํ”ผ๋“œ๋ฅผ ๊ฒ€์ƒ‰ํ•˜๊ณ , ๊ฐ ์ œ๋ชฉ์„ ๊ตฌ๊ธ€์—์„œ ์žฌ๊ฒ€์ƒ‰ + + 1. ํ‚ค์›Œ๋“œ๋กœ Google News RSS ํ”ผ๋“œ ๊ฐ€์ ธ์˜ค๊ธฐ + 2. 
RSS ํ”ผ๋“œ์˜ ๊ฐ ์ œ๋ชฉ์„ ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ + 3. ํ†ตํ•ฉ ๊ฒฐ๊ณผ ๋ฐ˜ํ™˜ + """ + start_time = datetime.now() + + try: + # Step 1: Get RSS feed from keyword + logger.info(f"Fetching RSS feed for keyword: {q}") + rss_response = await client.get( + f"{RSS_SERVICE_URL}/api/google-rss/search", + params={"q": q, "lang": lang, "country": country} + ) + rss_response.raise_for_status() + rss_data = rss_response.json() + + if not rss_data.get("success"): + raise HTTPException(status_code=500, detail=f"RSS ํ”ผ๋“œ ๊ฐ€์ ธ์˜ค๊ธฐ ์‹คํŒจ: {rss_data.get('error')}") + + # Step 2: Process each RSS entry with Google search + news_items = [] + entries = rss_data.get("entries", []) + + # If no entries field, fallback to sample_titles + if not entries: + titles = rss_data.get("sample_titles", [])[:limit] + entries = [{"title": title, "link": "", "published": ""} for title in titles] + else: + entries = entries[:limit] + + # Create tasks for parallel processing + search_tasks = [] + for entry in entries: + title = entry.get("title", "") + # Clean title for better search results + clean_title = title.split(" - ")[-1] if " - " in title else title + search_tasks.append( + search_google(clean_title, google_results_per_title, lang, country) + ) + + # Execute searches in parallel + logger.info(f"Searching Google for {len(search_tasks)} RSS entries") + search_results = await asyncio.gather(*search_tasks, return_exceptions=True) + + # Combine results + for i, entry in enumerate(entries): + google_results = [] + if not isinstance(search_results[i], Exception): + google_results = search_results[i] + + title = entry.get("title", "") + news_items.append(NewsItem( + rss_title=title, + rss_link=entry.get("link", ""), + google_results=google_results, + search_keyword=title.split(" - ")[-1] if " - " in title else title, + timestamp=datetime.now() + )) + + # Calculate processing time + processing_time = (datetime.now() - start_time).total_seconds() + + return AggregatedNews( + keyword=q, + 
rss_feed_url=rss_data.get("feed_url", ""), + total_rss_entries=rss_data.get("entry_count", 0), + processed_entries=len(news_items), + news_items=news_items, + processing_time=processing_time + ) + + except httpx.HTTPStatusError as e: + logger.error(f"HTTP error: {e}") + raise HTTPException(status_code=e.response.status_code, detail=str(e)) + except Exception as e: + logger.error(f"Error in aggregate_news: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +async def search_google(query: str, num_results: int, lang: str, country: str) -> List[Dict[str, Any]]: + """๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ ์„œ๋น„์Šค ํ˜ธ์ถœ - ์ „์ฒด ์ฝ˜ํ…์ธ  ํฌํ•จ""" + try: + # Full content API ์ง์ ‘ ํ˜ธ์ถœ + response = await client.get( + f"{GOOGLE_SEARCH_SERVICE_URL}/api/search/full", + params={ + "q": query, + "num": num_results, + "lang": lang, + "country": country + } + ) + response.raise_for_status() + data = response.json() + results = data.get("results", []) + + # full_content๊ฐ€ ์ด๋ฏธ ํฌํ•จ๋˜์–ด ์žˆ์œผ๋ฏ€๋กœ ๊ทธ๋Œ€๋กœ ๋ฐ˜ํ™˜ + logger.info(f"Google search for '{query}' returned {len(results)} results with full content") + + return results + except Exception as e: + logger.error(f"Google search error for '{query}': {e}") + # Fallback to basic search without full content + try: + response = await client.get( + f"{GOOGLE_SEARCH_SERVICE_URL}/api/search", + params={ + "q": query, + "num": num_results, + "lang": lang, + "country": country + } + ) + response.raise_for_status() + data = response.json() + return data.get("results", []) + except: + return [] + +@app.get("/api/aggregate/location", response_model=AggregatedNews) +async def aggregate_news_by_location( + location: str = Query(..., description="์ง€์—ญ๋ช… (์˜ˆ: Seoul, Tokyo)"), + limit: int = Query(10, description="์ฒ˜๋ฆฌํ•  RSS ํ•ญ๋ชฉ ์ˆ˜", ge=1, le=50), + google_results_per_title: int = Query(5, description="๊ฐ ์ œ๋ชฉ๋‹น ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ˆ˜", ge=1, le=10), + lang: str = Query("ko", description="์–ธ์–ด ์ฝ”๋“œ"), + country: str = 
Query("KR", description="๊ตญ๊ฐ€ ์ฝ”๋“œ") +): + """์ง€์—ญ ๊ธฐ๋ฐ˜ RSS ํ”ผ๋“œ๋ฅผ ๊ฐ€์ ธ์™€์„œ ๊ฐ ์ œ๋ชฉ์„ ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰""" + start_time = datetime.now() + + try: + # Get location-based RSS feed + logger.info(f"Fetching RSS feed for location: {location}") + rss_response = await client.get( + f"{RSS_SERVICE_URL}/api/google-rss/location", + params={"location": location, "lang": lang, "country": country} + ) + rss_response.raise_for_status() + rss_data = rss_response.json() + + if not rss_data.get("success"): + raise HTTPException(status_code=500, detail=f"RSS ํ”ผ๋“œ ๊ฐ€์ ธ์˜ค๊ธฐ ์‹คํŒจ: {rss_data.get('error')}") + + # Process titles + news_items = [] + titles = rss_data.get("sample_titles", [])[:limit] + + search_tasks = [] + for title in titles: + clean_title = title.split(" - ")[-1] if " - " in title else title + search_tasks.append( + search_google(clean_title, google_results_per_title, lang, country) + ) + + search_results = await asyncio.gather(*search_tasks, return_exceptions=True) + + for i, title in enumerate(titles): + google_results = [] + if not isinstance(search_results[i], Exception): + google_results = search_results[i] + + news_items.append(NewsItem( + rss_title=title, + google_results=google_results, + search_keyword=title.split(" - ")[-1] if " - " in title else title, + timestamp=datetime.now() + )) + + processing_time = (datetime.now() - start_time).total_seconds() + + return AggregatedNews( + keyword=f"Location: {location}", + rss_feed_url=rss_data.get("feed_url", ""), + total_rss_entries=rss_data.get("entry_count", 0), + processed_entries=len(news_items), + news_items=news_items, + processing_time=processing_time + ) + + except Exception as e: + logger.error(f"Error in aggregate_news_by_location: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/aggregate/topic", response_model=AggregatedNews) +async def aggregate_news_by_topic( + category: str = Query(..., description="์นดํ…Œ๊ณ ๋ฆฌ (TECHNOLOGY, BUSINESS, HEALTH ๋“ฑ)"), + 
limit: int = Query(10, description="์ฒ˜๋ฆฌํ•  RSS ํ•ญ๋ชฉ ์ˆ˜", ge=1, le=50), + google_results_per_title: int = Query(5, description="๊ฐ ์ œ๋ชฉ๋‹น ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ˆ˜", ge=1, le=10), + lang: str = Query("ko", description="์–ธ์–ด ์ฝ”๋“œ"), + country: str = Query("KR", description="๊ตญ๊ฐ€ ์ฝ”๋“œ") +): + """์ฃผ์ œ๋ณ„ RSS ํ”ผ๋“œ๋ฅผ ๊ฐ€์ ธ์™€์„œ ๊ฐ ์ œ๋ชฉ์„ ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰""" + start_time = datetime.now() + + try: + # Get topic-based RSS feed + logger.info(f"Fetching RSS feed for topic: {category}") + rss_response = await client.get( + f"{RSS_SERVICE_URL}/api/google-rss/topic", + params={"category": category, "lang": lang, "country": country} + ) + rss_response.raise_for_status() + rss_data = rss_response.json() + + if not rss_data.get("success"): + raise HTTPException(status_code=500, detail=f"RSS ํ”ผ๋“œ ๊ฐ€์ ธ์˜ค๊ธฐ ์‹คํŒจ: {rss_data.get('error')}") + + # Process titles + news_items = [] + titles = rss_data.get("sample_titles", [])[:limit] + + search_tasks = [] + for title in titles: + clean_title = title.split(" - ")[-1] if " - " in title else title + search_tasks.append( + search_google(clean_title, google_results_per_title, lang, country) + ) + + search_results = await asyncio.gather(*search_tasks, return_exceptions=True) + + for i, title in enumerate(titles): + google_results = [] + if not isinstance(search_results[i], Exception): + google_results = search_results[i] + + news_items.append(NewsItem( + rss_title=title, + google_results=google_results, + search_keyword=title.split(" - ")[-1] if " - " in title else title, + timestamp=datetime.now() + )) + + processing_time = (datetime.now() - start_time).total_seconds() + + return AggregatedNews( + keyword=f"Topic: {category}", + rss_feed_url=rss_data.get("feed_url", ""), + total_rss_entries=rss_data.get("entry_count", 0), + processed_entries=len(news_items), + news_items=news_items, + processing_time=processing_time + ) + + except Exception as e: + logger.error(f"Error in aggregate_news_by_topic: {e}") + raise 
HTTPException(status_code=500, detail=str(e)) \ No newline at end of file diff --git a/backup-services/news-aggregator/backend/requirements.txt b/backup-services/news-aggregator/backend/requirements.txt new file mode 100644 index 0000000..5881f23 --- /dev/null +++ b/backup-services/news-aggregator/backend/requirements.txt @@ -0,0 +1,5 @@ +fastapi==0.104.1 +uvicorn[standard]==0.24.0 +httpx==0.25.2 +pydantic==2.5.0 +python-multipart==0.0.6 \ No newline at end of file diff --git a/backup-services/news-aggregator/backend/test_aggregator.py b/backup-services/news-aggregator/backend/test_aggregator.py new file mode 100755 index 0000000..cffea4d --- /dev/null +++ b/backup-services/news-aggregator/backend/test_aggregator.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +""" +News Aggregator Service Test +RSS ํ”ผ๋“œ ์ œ๋ชฉ์„ ๊ตฌ๊ธ€ full content ๊ฒ€์ƒ‰์œผ๋กœ ํ™•์žฅํ•˜๋Š” ํ†ตํ•ฉ ํ…Œ์ŠคํŠธ +""" +import asyncio +import httpx +import json +from datetime import datetime +from typing import Dict, Any + +# Service URL +SERVICE_URL = "http://localhost:8018" + +async def test_aggregate_with_full_content(): + """ํ‚ค์›Œ๋“œ๋กœ RSS ํ”ผ๋“œ๋ฅผ ๊ฒ€์ƒ‰ํ•˜๊ณ  full content ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ ํ…Œ์ŠคํŠธ""" + async with httpx.AsyncClient(timeout=60.0) as client: + print("\n" + "="*60) + print("๋‰ด์Šค ํ†ตํ•ฉ ์„œ๋น„์Šค Full Content ํ…Œ์ŠคํŠธ") + print("="*60) + + # Test with keyword "์ธ๊ณต์ง€๋Šฅ" + print("\n1. 
ํ‚ค์›Œ๋“œ '์ธ๊ณต์ง€๋Šฅ'์œผ๋กœ RSS ํ”ผ๋“œ ๊ฒ€์ƒ‰ ๋ฐ ๊ตฌ๊ธ€ full content ๊ฒ€์ƒ‰") + print("-" * 40) + + response = await client.get( + f"{SERVICE_URL}/api/aggregate", + params={ + "q": "์ธ๊ณต์ง€๋Šฅ", + "limit": 3, # ํ…Œ์ŠคํŠธ์šฉ์œผ๋กœ 3๊ฐœ๋งŒ + "google_results_per_title": 2, # ๊ฐ ์ œ๋ชฉ๋‹น 2๊ฐœ ๊ตฌ๊ธ€ ๊ฒฐ๊ณผ + "lang": "ko", + "country": "KR" + } + ) + + if response.status_code == 200: + data = response.json() + print(f"โœ“ RSS ํ”ผ๋“œ URL: {data['rss_feed_url']}") + print(f"โœ“ ์ „์ฒด RSS ํ•ญ๋ชฉ ์ˆ˜: {data['total_rss_entries']}") + print(f"โœ“ ์ฒ˜๋ฆฌ๋œ ํ•ญ๋ชฉ ์ˆ˜: {data['processed_entries']}") + print(f"โœ“ ์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {data['processing_time']:.2f}์ดˆ") + + # Check each news item for full content + for i, item in enumerate(data['news_items'], 1): + print(f"\n [{i}] RSS ์ œ๋ชฉ: {item['rss_title'][:50]}...") + print(f" ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ: {item['search_keyword'][:50]}...") + print(f" ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ˆ˜: {len(item['google_results'])}") + + # Check if google results have full_content + for j, result in enumerate(item['google_results'], 1): + has_full_content = 'full_content' in result + if has_full_content: + full_content = result.get('full_content', '') + if isinstance(full_content, str): + content_length = len(full_content) + else: + content_length = len(str(full_content)) + else: + content_length = 0 + + print(f" - ๊ฒฐ๊ณผ {j}: {result.get('title', 'N/A')[:40]}...") + print(f" Full Content ํฌํ•จ: {'โœ“' if has_full_content else 'โœ—'}") + if has_full_content: + print(f" Content ๊ธธ์ด: {content_length:,} ๋ฌธ์ž") + # Show first 200 chars of content + if isinstance(result['full_content'], str): + preview = result['full_content'][:200].replace('\n', ' ') + print(f" ๋ฏธ๋ฆฌ๋ณด๊ธฐ: {preview}...") + else: + print(f" Content ํƒ€์ž…: {type(result['full_content'])}") + print(f" Content ๋ฐ์ดํ„ฐ: {str(result['full_content'])[:200]}...") + else: + print(f"โœ— ์˜ค๋ฅ˜: {response.status_code}") + print(f" ์ƒ์„ธ: {response.text}") + +async def 
test_aggregate_by_location(): + """์ง€์—ญ ๊ธฐ๋ฐ˜ RSS ํ”ผ๋“œ ๋ฐ full content ํ…Œ์ŠคํŠธ""" + async with httpx.AsyncClient(timeout=60.0) as client: + print("\n" + "="*60) + print("์ง€์—ญ ๊ธฐ๋ฐ˜ ๋‰ด์Šค ํ†ตํ•ฉ Full Content ํ…Œ์ŠคํŠธ") + print("="*60) + + print("\n2. ์ง€์—ญ 'Seoul'๋กœ RSS ํ”ผ๋“œ ๊ฒ€์ƒ‰ ๋ฐ ๊ตฌ๊ธ€ full content ๊ฒ€์ƒ‰") + print("-" * 40) + + response = await client.get( + f"{SERVICE_URL}/api/aggregate/location", + params={ + "location": "Seoul", + "limit": 2, + "google_results_per_title": 2, + "lang": "ko", + "country": "KR" + } + ) + + if response.status_code == 200: + data = response.json() + print(f"โœ“ ์ง€์—ญ: {data['keyword']}") + print(f"โœ“ RSS ํ”ผ๋“œ URL: {data['rss_feed_url']}") + print(f"โœ“ ์ฒ˜๋ฆฌ๋œ ํ•ญ๋ชฉ ์ˆ˜: {data['processed_entries']}") + + # Check full content availability + full_content_count = 0 + total_content_size = 0 + + for item in data['news_items']: + for result in item['google_results']: + if 'full_content' in result: + full_content_count += 1 + content = result['full_content'] + if isinstance(content, str): + total_content_size += len(content) + else: + total_content_size += len(str(content)) + + print(f"\n๐Ÿ“Š Full Content ํ†ต๊ณ„:") + print(f" - Full Content ํฌํ•จ ๊ฒฐ๊ณผ: {full_content_count}๊ฐœ") + print(f" - ์ „์ฒด Content ํฌ๊ธฐ: {total_content_size:,} ๋ฌธ์ž") + print(f" - ํ‰๊ท  Content ํฌ๊ธฐ: {total_content_size//max(full_content_count, 1):,} ๋ฌธ์ž") + else: + print(f"โœ— ์˜ค๋ฅ˜: {response.status_code}") + +async def test_aggregate_by_topic(): + """์ฃผ์ œ๋ณ„ RSS ํ”ผ๋“œ ๋ฐ full content ํ…Œ์ŠคํŠธ""" + async with httpx.AsyncClient(timeout=60.0) as client: + print("\n" + "="*60) + print("์ฃผ์ œ๋ณ„ ๋‰ด์Šค ํ†ตํ•ฉ Full Content ํ…Œ์ŠคํŠธ") + print("="*60) + + print("\n3. 
์ฃผ์ œ 'TECHNOLOGY'๋กœ RSS ํ”ผ๋“œ ๊ฒ€์ƒ‰ ๋ฐ ๊ตฌ๊ธ€ full content ๊ฒ€์ƒ‰") + print("-" * 40) + + response = await client.get( + f"{SERVICE_URL}/api/aggregate/topic", + params={ + "category": "TECHNOLOGY", + "limit": 2, + "google_results_per_title": 3, + "lang": "ko", + "country": "KR" + } + ) + + if response.status_code == 200: + data = response.json() + print(f"โœ“ ์ฃผ์ œ: {data['keyword']}") + print(f"โœ“ ์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {data['processing_time']:.2f}์ดˆ") + + # Analyze content quality for AI summarization + print("\n๐Ÿ“ AI ์š”์•ฝ์„ ์œ„ํ•œ Content ํ’ˆ์งˆ ๋ถ„์„:") + for i, item in enumerate(data['news_items'], 1): + print(f"\n ๋‰ด์Šค ํ•ญ๋ชฉ {i}:") + for j, result in enumerate(item['google_results'], 1): + if 'full_content' in result: + content = result['full_content'] + if isinstance(content, str): + # Check content quality indicators + has_paragraphs = '\n\n' in content or '

' in content + has_sufficient_length = len(content) > 500 + has_korean = any(ord(char) >= 0xAC00 and ord(char) <= 0xD7A3 for char in content[:min(100, len(content))]) + else: + content_str = str(content) + has_paragraphs = '\n\n' in content_str or '

' in content_str + has_sufficient_length = len(content_str) > 500 + has_korean = any(ord(char) >= 0xAC00 and ord(char) <= 0xD7A3 for char in content_str[:min(100, len(content_str))]) + + print(f" ๊ฒฐ๊ณผ {j} ํ’ˆ์งˆ ์ฒดํฌ:") + print(f" - ์ถฉ๋ถ„ํ•œ ๊ธธ์ด (>500์ž): {'โœ“' if has_sufficient_length else 'โœ—'}") + print(f" - ๋‹จ๋ฝ ๊ตฌ์กฐ ํฌํ•จ: {'โœ“' if has_paragraphs else 'โœ—'}") + print(f" - ํ•œ๊ตญ์–ด ์ฝ˜ํ…์ธ : {'โœ“' if has_korean else 'โœ—'}") + print(f" - AI ์š”์•ฝ ๊ฐ€๋Šฅ: {'โœ“' if (has_sufficient_length and has_paragraphs) else 'โœ—'}") + else: + print(f"โœ— ์˜ค๋ฅ˜: {response.status_code}") + +async def test_health_check(): + """์„œ๋น„์Šค ์ƒํƒœ ํ™•์ธ""" + async with httpx.AsyncClient() as client: + print("\n" + "="*60) + print("์„œ๋น„์Šค Health Check") + print("="*60) + + response = await client.get(f"{SERVICE_URL}/health") + if response.status_code == 200: + data = response.json() + print(f"โœ“ ํ†ตํ•ฉ ์„œ๋น„์Šค ์ƒํƒœ: {data['status']}") + print(f" - RSS ์„œ๋น„์Šค: {data['services']['rss_feed']}") + print(f" - Google ๊ฒ€์ƒ‰ ์„œ๋น„์Šค: {data['services']['google_search']}") + else: + print(f"โœ— Health check ์‹คํŒจ: {response.status_code}") + +async def main(): + """๋ฉ”์ธ ํ…Œ์ŠคํŠธ ์‹คํ–‰""" + print("\n" + "="*70) + print(" News Aggregator Full Content Integration Test ") + print(" RSS ํ”ผ๋“œ + Google Full Content ํ†ตํ•ฉ ํ…Œ์ŠคํŠธ ") + print("="*70) + + # Run tests + await test_health_check() + await test_aggregate_with_full_content() + await test_aggregate_by_location() + await test_aggregate_by_topic() + + print("\n" + "="*70) + print(" ํ…Œ์ŠคํŠธ ์™„๋ฃŒ - Full Content ํ†ตํ•ฉ ํ™•์ธ ") + print("="*70) + print("\nโœ… ๋ชจ๋“  ํ…Œ์ŠคํŠธ๊ฐ€ ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.") + print(" RSS ํ”ผ๋“œ ์ œ๋ชฉ์„ ๊ตฌ๊ธ€ full content๋กœ ๊ฒ€์ƒ‰ํ•˜๋Š” ๊ธฐ๋Šฅ์ด ์ •์ƒ ์ž‘๋™ํ•ฉ๋‹ˆ๋‹ค.") + print(" AI ์š”์•ฝ์„ ์œ„ํ•œ ์ถฉ๋ถ„ํ•œ ์ฝ˜ํ…์ธ ๊ฐ€ ์ˆ˜์ง‘๋˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค.") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git 
a/services/rss-feed/README.md b/backup-services/rss-feed/README.md similarity index 100% rename from services/rss-feed/README.md rename to backup-services/rss-feed/README.md diff --git a/services/rss-feed/backend/Dockerfile b/backup-services/rss-feed/backend/Dockerfile similarity index 100% rename from services/rss-feed/backend/Dockerfile rename to backup-services/rss-feed/backend/Dockerfile diff --git a/backup-services/rss-feed/backend/app/__init__.py b/backup-services/rss-feed/backend/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/services/rss-feed/backend/app/config.py b/backup-services/rss-feed/backend/app/config.py similarity index 100% rename from services/rss-feed/backend/app/config.py rename to backup-services/rss-feed/backend/app/config.py diff --git a/services/rss-feed/backend/app/feed_parser.py b/backup-services/rss-feed/backend/app/feed_parser.py similarity index 100% rename from services/rss-feed/backend/app/feed_parser.py rename to backup-services/rss-feed/backend/app/feed_parser.py diff --git a/backup-services/rss-feed/backend/app/google_rss.py b/backup-services/rss-feed/backend/app/google_rss.py new file mode 100644 index 0000000..b4fd24f --- /dev/null +++ b/backup-services/rss-feed/backend/app/google_rss.py @@ -0,0 +1,115 @@ +""" +Google News RSS Feed Generator +๊ตฌ๊ธ€ ๋‰ด์Šค RSS ํ”ผ๋“œ URL ์ƒ์„ฑ ๋ฐ ๊ตฌ๋… ์ง€์› +""" +from typing import Optional, List +from urllib.parse import quote_plus +from enum import Enum + +class GoogleNewsCategory(str, Enum): + """๊ตฌ๊ธ€ ๋‰ด์Šค ์นดํ…Œ๊ณ ๋ฆฌ""" + WORLD = "WORLD" + NATION = "NATION" + BUSINESS = "BUSINESS" + TECHNOLOGY = "TECHNOLOGY" + ENTERTAINMENT = "ENTERTAINMENT" + SPORTS = "SPORTS" + SCIENCE = "SCIENCE" + HEALTH = "HEALTH" + +class GoogleNewsRSS: + """Google News RSS ํ”ผ๋“œ URL ์ƒ์„ฑ๊ธฐ""" + + BASE_URL = "https://news.google.com/rss" + + @staticmethod + def search_feed(query: str, lang: str = "ko", country: str = "KR") -> str: + """ + ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰ RSS ํ”ผ๋“œ URL ์ƒ์„ฑ + 
+ Args: + query: ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ + lang: ์–ธ์–ด ์ฝ”๋“œ (ko, en, ja, zh-CN ๋“ฑ) + country: ๊ตญ๊ฐ€ ์ฝ”๋“œ (KR, US, JP, CN ๋“ฑ) + + Returns: + RSS ํ”ผ๋“œ URL + """ + encoded_query = quote_plus(query) + return f"{GoogleNewsRSS.BASE_URL}/search?q={encoded_query}&hl={lang}&gl={country}&ceid={country}:{lang}" + + @staticmethod + def topic_feed(category: GoogleNewsCategory, lang: str = "ko", country: str = "KR") -> str: + """ + ์นดํ…Œ๊ณ ๋ฆฌ๋ณ„ RSS ํ”ผ๋“œ URL ์ƒ์„ฑ + + Args: + category: ๋‰ด์Šค ์นดํ…Œ๊ณ ๋ฆฌ + lang: ์–ธ์–ด ์ฝ”๋“œ + country: ๊ตญ๊ฐ€ ์ฝ”๋“œ + + Returns: + RSS ํ”ผ๋“œ URL + """ + return f"{GoogleNewsRSS.BASE_URL}/headlines/section/topic/{category.value}?hl={lang}&gl={country}&ceid={country}:{lang}" + + @staticmethod + def location_feed(location: str, lang: str = "ko", country: str = "KR") -> str: + """ + ์ง€์—ญ ๋‰ด์Šค RSS ํ”ผ๋“œ URL ์ƒ์„ฑ + + Args: + location: ์ง€์—ญ๋ช… (์˜ˆ: Seoul, ์„œ์šธ, New York) + lang: ์–ธ์–ด ์ฝ”๋“œ + country: ๊ตญ๊ฐ€ ์ฝ”๋“œ + + Returns: + RSS ํ”ผ๋“œ URL + """ + encoded_location = quote_plus(location) + return f"{GoogleNewsRSS.BASE_URL}/headlines/section/geo/{encoded_location}?hl={lang}&gl={country}&ceid={country}:{lang}" + + @staticmethod + def trending_feed(lang: str = "ko", country: str = "KR") -> str: + """ + ํŠธ๋ Œ๋”ฉ ๋‰ด์Šค RSS ํ”ผ๋“œ URL ์ƒ์„ฑ + + Args: + lang: ์–ธ์–ด ์ฝ”๋“œ + country: ๊ตญ๊ฐ€ ์ฝ”๋“œ + + Returns: + RSS ํ”ผ๋“œ URL + """ + return f"{GoogleNewsRSS.BASE_URL}?hl={lang}&gl={country}&ceid={country}:{lang}" + + @staticmethod + def get_common_feeds() -> List[dict]: + """ + ์ž์ฃผ ์‚ฌ์šฉ๋˜๋Š” RSS ํ”ผ๋“œ ๋ชฉ๋ก ๋ฐ˜ํ™˜ + + Returns: + ํ”ผ๋“œ ์ •๋ณด ๋ฆฌ์ŠคํŠธ + """ + return [ + { + "title": "๊ตฌ๊ธ€ ๋‰ด์Šค - ํ•œ๊ตญ ํ—ค๋“œ๋ผ์ธ", + "url": GoogleNewsRSS.trending_feed("ko", "KR"), + "description": "ํ•œ๊ตญ ์ฃผ์š” ๋‰ด์Šค" + }, + { + "title": "๊ตฌ๊ธ€ ๋‰ด์Šค - ๊ธฐ์ˆ ", + "url": GoogleNewsRSS.topic_feed(GoogleNewsCategory.TECHNOLOGY, "ko", "KR"), + "description": "๊ธฐ์ˆ  ๊ด€๋ จ ๋‰ด์Šค" + }, + { + "title": "๊ตฌ๊ธ€ ๋‰ด์Šค - ๋น„์ฆˆ๋‹ˆ์Šค", 
+ "url": GoogleNewsRSS.topic_feed(GoogleNewsCategory.BUSINESS, "ko", "KR"), + "description": "๋น„์ฆˆ๋‹ˆ์Šค ๋‰ด์Šค" + }, + { + "title": "Google News - World", + "url": GoogleNewsRSS.topic_feed(GoogleNewsCategory.WORLD, "en", "US"), + "description": "World news in English" + } + ] \ No newline at end of file diff --git a/services/rss-feed/backend/app/main.py b/backup-services/rss-feed/backend/app/main.py similarity index 69% rename from services/rss-feed/backend/app/main.py rename to backup-services/rss-feed/backend/app/main.py index 823d620..b8f4fe2 100644 --- a/services/rss-feed/backend/app/main.py +++ b/backup-services/rss-feed/backend/app/main.py @@ -13,9 +13,10 @@ import json from .config import settings from .models import ( FeedSubscription, FeedEntry, CreateFeedRequest, - UpdateFeedRequest, FeedStatistics, FeedStatus + UpdateFeedRequest, FeedStatistics, FeedStatus, FeedCategory ) from .feed_parser import FeedParser +from .google_rss import GoogleNewsRSS, GoogleNewsCategory # Database connection db_client = None @@ -439,4 +440,157 @@ async def export_opml(): return { "opml": opml, "feed_count": len(feeds) - } \ No newline at end of file + } + +# Google News RSS Endpoints + +@app.get("/api/google-rss/search") +async def get_google_search_rss( + q: str = Query(..., description="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ"), + lang: str = Query("ko", description="์–ธ์–ด ์ฝ”๋“œ (ko, en, ja, zh-CN ๋“ฑ)"), + country: str = Query("KR", description="๊ตญ๊ฐ€ ์ฝ”๋“œ (KR, US, JP, CN ๋“ฑ)") +): + """Google News ๊ฒ€์ƒ‰ RSS ํ”ผ๋“œ URL ์ƒ์„ฑ""" + feed_url = GoogleNewsRSS.search_feed(q, lang, country) + + # ํ”ผ๋“œ ํŒŒ์‹ฑ ํ…Œ์ŠคํŠธ + result = await parser.parse_feed(feed_url) + + return { + "keyword": q, + "feed_url": feed_url, + "success": result["success"], + "feed_title": result["feed"].get("title", "Google News") if result["success"] else None, + "entry_count": len(result["entries"]) if result["success"] else 0, + "sample_titles": [entry.get("title", "") for entry in result["entries"][:5]] if 
result["success"] else [], + "entries": [ + { + "title": entry.get("title", ""), + "link": entry.get("link", ""), + "published": entry.get("published", ""), + "summary": entry.get("summary", "")[:200] if entry.get("summary") else "" + } for entry in result["entries"][:20] + ] if result["success"] else [], + "error": result.get("error") + } + +@app.get("/api/google-rss/topic") +async def get_google_topic_rss( + category: GoogleNewsCategory = Query(..., description="๋‰ด์Šค ์นดํ…Œ๊ณ ๋ฆฌ"), + lang: str = Query("ko", description="์–ธ์–ด ์ฝ”๋“œ"), + country: str = Query("KR", description="๊ตญ๊ฐ€ ์ฝ”๋“œ") +): + """Google News ์นดํ…Œ๊ณ ๋ฆฌ๋ณ„ RSS ํ”ผ๋“œ URL ์ƒ์„ฑ""" + feed_url = GoogleNewsRSS.topic_feed(category, lang, country) + + # ํ”ผ๋“œ ํŒŒ์‹ฑ ํ…Œ์ŠคํŠธ + result = await parser.parse_feed(feed_url) + + return { + "category": category, + "feed_url": feed_url, + "success": result["success"], + "feed_title": result["feed"].get("title", "Google News") if result["success"] else None, + "entry_count": len(result["entries"]) if result["success"] else 0, + "sample_titles": [entry.get("title", "") for entry in result["entries"][:5]] if result["success"] else [], + "error": result.get("error") + } + +@app.get("/api/google-rss/location") +async def get_google_location_rss( + location: str = Query(..., description="์ง€์—ญ๋ช… (์˜ˆ: Seoul, ์„œ์šธ, New York)"), + lang: str = Query("ko", description="์–ธ์–ด ์ฝ”๋“œ"), + country: str = Query("KR", description="๊ตญ๊ฐ€ ์ฝ”๋“œ") +): + """Google News ์ง€์—ญ ๋‰ด์Šค RSS ํ”ผ๋“œ URL ์ƒ์„ฑ""" + feed_url = GoogleNewsRSS.location_feed(location, lang, country) + + # ํ”ผ๋“œ ํŒŒ์‹ฑ ํ…Œ์ŠคํŠธ + result = await parser.parse_feed(feed_url) + + return { + "location": location, + "feed_url": feed_url, + "success": result["success"], + "feed_title": result["feed"].get("title", "Google News") if result["success"] else None, + "entry_count": len(result["entries"]) if result["success"] else 0, + "sample_titles": [entry.get("title", "") for entry in 
result["entries"][:5]] if result["success"] else [], + "error": result.get("error") + } + +@app.get("/api/google-rss/trending") +async def get_google_trending_rss( + lang: str = Query("ko", description="์–ธ์–ด ์ฝ”๋“œ"), + country: str = Query("KR", description="๊ตญ๊ฐ€ ์ฝ”๋“œ") +): + """Google News ํŠธ๋ Œ๋”ฉ RSS ํ”ผ๋“œ URL ์ƒ์„ฑ""" + feed_url = GoogleNewsRSS.trending_feed(lang, country) + + # ํ”ผ๋“œ ํŒŒ์‹ฑ ํ…Œ์ŠคํŠธ + result = await parser.parse_feed(feed_url) + + return { + "feed_url": feed_url, + "success": result["success"], + "feed_title": result["feed"].get("title", "Google News") if result["success"] else None, + "entry_count": len(result["entries"]) if result["success"] else 0, + "sample_titles": [entry.get("title", "") for entry in result["entries"][:5]] if result["success"] else [], + "error": result.get("error") + } + +@app.post("/api/google-rss/subscribe") +async def subscribe_google_rss( + q: Optional[str] = Query(None, description="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ"), + category: Optional[GoogleNewsCategory] = Query(None, description="์นดํ…Œ๊ณ ๋ฆฌ"), + location: Optional[str] = Query(None, description="์ง€์—ญ๋ช…"), + trending: bool = Query(False, description="ํŠธ๋ Œ๋”ฉ ๋‰ด์Šค"), + lang: str = Query("ko", description="์–ธ์–ด ์ฝ”๋“œ"), + country: str = Query("KR", description="๊ตญ๊ฐ€ ์ฝ”๋“œ"), + background_tasks: BackgroundTasks = ... 
+): + """Google News RSS ํ”ผ๋“œ ๊ตฌ๋…""" + # URL ์ƒ์„ฑ + if q: + feed_url = GoogleNewsRSS.search_feed(q, lang, country) + feed_title = f"Google News - {q}" + elif category: + feed_url = GoogleNewsRSS.topic_feed(category, lang, country) + feed_title = f"Google News - {category.value}" + elif location: + feed_url = GoogleNewsRSS.location_feed(location, lang, country) + feed_title = f"Google News - {location}" + elif trending: + feed_url = GoogleNewsRSS.trending_feed(lang, country) + feed_title = f"Google News - Trending ({country})" + else: + raise HTTPException(status_code=400, detail="๊ฒ€์ƒ‰์–ด, ์นดํ…Œ๊ณ ๋ฆฌ, ์ง€์—ญ ์ค‘ ํ•˜๋‚˜๋ฅผ ์ง€์ •ํ•ด์ฃผ์„ธ์š”") + + # ์ค‘๋ณต ํ™•์ธ + existing = await db.feeds.find_one({"url": feed_url}) + if existing: + raise HTTPException(status_code=400, detail="์ด๋ฏธ ๊ตฌ๋… ์ค‘์ธ ํ”ผ๋“œ์ž…๋‹ˆ๋‹ค") + + # ํ”ผ๋“œ ํŒŒ์‹ฑ + result = await parser.parse_feed(feed_url) + if not result["success"]: + raise HTTPException(status_code=400, detail=f"ํ”ผ๋“œ ํŒŒ์‹ฑ ์‹คํŒจ: {result['error']}") + + # ๊ตฌ๋… ์ƒ์„ฑ + feed = FeedSubscription( + title=feed_title, + url=feed_url, + description=result["feed"].get("description", "Google News Feed"), + category=FeedCategory.NEWS, + update_interval=900 # 15๋ถ„ + ) + + # DB ์ €์žฅ + feed_dict = feed.dict() + feed_dict["url"] = str(feed_dict["url"]) + result = await db.feeds.insert_one(feed_dict) + feed.id = str(result.inserted_id) + + # ๋ฐฑ๊ทธ๋ผ์šด๋“œ ์—…๋ฐ์ดํŠธ + background_tasks.add_task(update_feed, feed.id) + + return feed \ No newline at end of file diff --git a/services/rss-feed/backend/app/models.py b/backup-services/rss-feed/backend/app/models.py similarity index 100% rename from services/rss-feed/backend/app/models.py rename to backup-services/rss-feed/backend/app/models.py diff --git a/services/rss-feed/backend/requirements.txt b/backup-services/rss-feed/backend/requirements.txt similarity index 100% rename from services/rss-feed/backend/requirements.txt rename to 
# API Keys Backup - Created on 2025-01-13
# This file is a backup of important API keys. Keep it somewhere safe.
#
# SECURITY: real credentials must never be committed to the repository.
# The values that were previously checked in here are exposed in git history:
# revoke and rotate them at each provider, then keep the new values only in an
# untracked file (e.g. a git-ignored .env) or a secret manager.
# This tracked file is a template listing the variable names only.

# Claude API Key
CLAUDE_API_KEY=

# Google APIs
GOOGLE_API_KEY=
# Search engine id is an identifier, not a credential.
GOOGLE_SEARCH_ENGINE_ID=35bfbdb7b6f244569

# Translation (DeepL)
DEEPL_API_KEY=

# Image Generation (Replicate)
# NOTE(review): docker-compose.yml passes REPLICATE_API_KEY to the image
# generator while this file names the variable REPLICATE_API_TOKEN - confirm
# which name the service actually reads.
REPLICATE_API_TOKEN=

# Additional APIs (add as needed)
# SERPAPI_KEY=
# AI Writer Worker Service
  # NOTE(review): this commit also adds ai-writer under backup-services/;
  # confirm that the ./services/ai-writer build context still exists.
  ai-writer-worker:
    build:
      context: ./services/ai-writer
      dockerfile: worker/Dockerfile
    container_name: ${COMPOSE_PROJECT_NAME}_ai_writer_worker
    environment:
      # Secret is injected from the host environment / untracked .env file.
      # Never commit the literal key; the previously committed value must be
      # rotated because it is exposed in git history.
      - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
      - MONGODB_URL=mongodb://mongodb:27017
      - DB_NAME=ai_writer_db
      - REDIS_URL=redis://redis:6379
      - WORKER_COUNT=3
    depends_on:
      - mongodb
      - redis
      - ai-writer-backend
    networks:
      - site11_network
    restart: unless-stopped
# Pipeline Google Search Worker
  pipeline-google-search:
    build:
      context: ./services/pipeline
      dockerfile: google-search/Dockerfile
    container_name: ${COMPOSE_PROJECT_NAME}_pipeline_google_search
    restart: unless-stopped
    depends_on:
      - redis
    environment:
      - REDIS_URL=redis://redis:6379
      # API key is injected from the host environment / untracked .env file.
      # The previously committed literal must be rotated (exposed in history).
      - GOOGLE_API_KEY=${GOOGLE_API_KEY:-}
      # The engine id is not a credential; keep the old value as the default
      # so existing deployments behave the same when the variable is unset.
      - GOOGLE_SEARCH_ENGINE_ID=${GOOGLE_SEARCH_ENGINE_ID:-35bfbdb7b6f244569}
      - LOG_LEVEL=INFO
    volumes:
      - ./services/pipeline/shared:/app/shared:ro
    networks:
      - site11_network
# Pipeline Translator
  pipeline-translator:
    build:
      context: ./services/pipeline
      dockerfile: translator/Dockerfile
    container_name: ${COMPOSE_PROJECT_NAME}_pipeline_translator
    restart: unless-stopped
    depends_on:
      - redis
    environment:
      - REDIS_URL=redis://redis:6379
      # DeepL key is injected from the host environment / untracked .env file.
      # The previously committed literal must be rotated (exposed in history).
      - DEEPL_API_KEY=${DEEPL_API_KEY:-}
      - LOG_LEVEL=INFO
    volumes:
      - ./services/pipeline/shared:/app/shared:ro
    networks:
      - site11_network
์ด๋Š” ํ•œ๊ตญ ๊ฒฝ์ œ์˜ ์ฒด์งˆ ๊ฐœ์„ ๊ณผ ๋””์ง€ํ„ธ ์ „ํ™˜ ๊ฐ€์†ํ™”๋ฅผ ์œ„ํ•œ ์ „๋žต์  ๊ฒฐ์ •์œผ๋กœ ํ‰๊ฐ€๋ฐ›๊ณ  ์žˆ๋‹ค.", + "ํˆฌ์ž์˜ ์ฃผ์š” ๋ฐฉํ–ฅ์€ AI ๊ธฐ์ˆ  ๊ฐœ๋ฐœ, ์ธํ”„๋ผ ๊ตฌ์ถ•, ์ธ์žฌ ์–‘์„ฑ ๋“ฑ ๋‹ค๋ฐฉ๋ฉด์— ๊ฑธ์ณ์žˆ๋‹ค. ํŠนํžˆ ํ”ผ์ง€์ปฌ AI ์‹ค์ฆ๋‹จ์ง€ ์กฐ์„ฑ๊ณผ ๊ฐ™์€ ์‹ค์šฉ์  ํ”„๋กœ์ ํŠธ๋“ค์ด ํฌํ•จ๋˜์–ด ์žˆ์–ด ์‹ค์งˆ์ ์ธ ์‚ฐ์—… ๋ฐœ์ „ ํšจ๊ณผ๊ฐ€ ๊ธฐ๋Œ€๋œ๋‹ค.", + "๋Œ€ํ†ต๋ น์‹ค์€ AI ์ •์ฑ… ์ถ”์ง„์„ ์œ„ํ•œ ์ „๋‹ด ์กฐ์ง์„ ์‹ ์„คํ•˜๊ณ , ์ •๋ถ€์™€ ์—…๊ณ„ ๊ฐ„์˜ ๊ฐ€๊ต ์—ญํ• ์„ ์ˆ˜ํ–‰ํ•  ์˜ˆ์ •์ด๋‹ค. ์ด๋ฅผ ํ†ตํ•ด ๋ฏผ๊ด€ ํ˜‘๋ ฅ ์ฒด๊ณ„๋ฅผ ๊ฐ•ํ™”ํ•˜๊ณ  ์ •์ฑ… ์‹คํ–‰๋ ฅ์„ ๋†’์ผ ๊ณ„ํš์ด๋‹ค." + ] + }, + { + "title": "๊ตญ๋‚ด ๊ธฐ์—…๋“ค์˜ AI ๊ธฐ์ˆ  ๊ฒฝ์Ÿ๋ ฅ ๊ฐ•ํ™”", + "content": [ + "SKํ•˜์ด๋‹‰์Šค๋Š” AI ๋ฉ”๋ชจ๋ฆฌ ๋ถ„์•ผ์—์„œ HBM4 ์–‘์‚ฐ์„ ์‹œ์ž‘ํ•˜๋ฉฐ ๊ธ€๋กœ๋ฒŒ ๊ธฐ์ˆ  ๊ฒฝ์Ÿ์—์„œ ์šฐ์œ„๋ฅผ ์ ํ•˜๊ณ  ์žˆ๋‹ค. ํŠนํžˆ 10Gbps ์ด์ƒ์˜ ๋™์ž‘์†๋„ ๊ตฌํ˜„์— ์„ฑ๊ณตํ•˜๋ฉฐ ๊ธฐ์ˆ ๋ ฅ์„ ์ž…์ฆํ–ˆ๋‹ค.", + "๊ตญ๋‚ด ์ฃผ์š” ๊ธฐ์—…๋“ค์€ AI ๊ด€๋ จ ์—ฐ๊ตฌ๊ฐœ๋ฐœ ํˆฌ์ž๋ฅผ ํ™•๋Œ€ํ•˜๊ณ  ์žˆ์œผ๋ฉฐ, ํŠนํžˆ ๋ฐ˜๋„์ฒด, ๋กœ๋ด‡, ์†Œํ”„ํŠธ์›จ์–ด ๋ถ„์•ผ์—์„œ ๊ด„๋ชฉํ• ๋งŒํ•œ ์„ฑ๊ณผ๋ฅผ ๋ณด์ด๊ณ  ์žˆ๋‹ค.", + "์‚ฐ์—…๊ณ„๋Š” ์ •๋ถ€์˜ ๋Œ€๊ทœ๋ชจ ํˆฌ์ž ๊ณ„ํš์— ํ˜ธ์‘ํ•˜์—ฌ ์ž์ฒด์ ์ธ AI ํ˜์‹  ํ”„๋กœ๊ทธ๋žจ์„ ๊ฐ€์†ํ™”ํ•˜๊ณ  ์žˆ์œผ๋ฉฐ, ๊ธ€๋กœ๋ฒŒ ์‹œ์žฅ์—์„œ์˜ ๊ฒฝ์Ÿ๋ ฅ ๊ฐ•ํ™”๋ฅผ ์œ„ํ•ด ์ด๋ ฅ์„ ๊ธฐ์šธ์ด๊ณ  ์žˆ๋‹ค." + ] + }, + { + "title": "AI ๊ธฐ๋ณธ๋ฒ• ์‹œํ–‰๊ณผ ์ œ๋„์  ๊ธฐ๋ฐ˜ ๋งˆ๋ จ", + "content": [ + "์ •๋ถ€๋Š” AI ๊ธฐ๋ณธ๋ฒ• ์‹œํ–‰๋ น์„ ๊ณต๊ฐœํ•˜๋ฉฐ ์ธ๊ณต์ง€๋Šฅ ๋ฐœ์ „์„ ์œ„ํ•œ ์ œ๋„์  ๊ธฐ๋ฐ˜์„ ๋งˆ๋ จํ–ˆ๋‹ค. 
์ด๋ฅผ ํ†ตํ•ด AI ์‚ฐ์—… ๋ฐœ์ „์˜ ๋ฒ•์  ๊ทผ๊ฑฐ์™€ ์œค๋ฆฌ์  ๊ฐ€์ด๋“œ๋ผ์ธ์ด ํ™•๋ฆฝ๋  ์ „๋ง์ด๋‹ค.", + "์ƒˆ๋กœ์šด ๋ฒ•์ œ๋„๋Š” AI ๊ธฐ์ˆ ์˜ ์•ˆ์ „ํ•œ ๋ฐœ์ „๊ณผ ์œค๋ฆฌ์  ํ™œ์šฉ์„ ๋ณด์žฅํ•˜๋ฉด์„œ๋„, ๊ธฐ์—…๋“ค์˜ ํ˜์‹ ์„ ์ €ํ•ดํ•˜์ง€ ์•Š๋Š” ๊ท ํ˜•์žกํžŒ ์ ‘๊ทผ์„ ์ถ”๊ตฌํ•˜๊ณ  ์žˆ๋‹ค.", + "ํŠนํžˆ AI ๊ธฐ๋ณธ๋ฒ•์€ ๊ฐœ์ธ์ •๋ณด ๋ณดํ˜ธ, ์•Œ๊ณ ๋ฆฌ์ฆ˜ ํˆฌ๋ช…์„ฑ, ์ฑ…์ž„์„ฑ ๋“ฑ ์ฃผ์š” ์ด์Šˆ๋“ค์— ๋Œ€ํ•œ ๋ช…ํ™•ํ•œ ๊ธฐ์ค€์„ ์ œ์‹œํ•˜์—ฌ ์‚ฐ์—…๊ณ„์˜ ๋ถˆํ™•์‹ค์„ฑ์„ ํ•ด์†Œํ•  ๊ฒƒ์œผ๋กœ ๊ธฐ๋Œ€๋œ๋‹ค." + ] + } + ], + "categories": [ + "๊ธฐ์ˆ ", + "์ •์ฑ…", + "์‚ฐ์—…", + "๊ฒฝ์ œ" + ], + "entities": { + "people": [ + "๊ฐ•ํ›ˆ์‹", + "๋ฐฐ๊ฒฝํ›ˆ ๊ณผ๊ธฐ์ •ํ†ต๋ถ€ ์žฅ๊ด€" + ], + "organizations": [ + "SKํ•˜์ด๋‹‰์Šค", + "๋Œ€ํ†ต๋ น์‹ค", + "๊ณผํ•™๊ธฐ์ˆ ์ •๋ณดํ†ต์‹ ๋ถ€" + ], + "groups": [ + "AI ์‚ฐ์—…๊ณ„", + "๋ฐ˜๋„์ฒด ์—…๊ณ„" + ], + "countries": [ + "๋Œ€ํ•œ๋ฏผ๊ตญ", + "๋ฏธ๊ตญ" + ], + "events": [ + "AI ๊ธฐ๋ณธ๋ฒ• ์‹œํ–‰", + "์ •๋ถ€ 10์กฐ์› ํˆฌ์ž ๊ณ„ํš ๋ฐœํ‘œ" + ] + }, + "source_keyword": "์ธ๊ณต์ง€๋Šฅ", + "source_count": 5 +} \ No newline at end of file diff --git a/services/pipeline/Makefile b/services/pipeline/Makefile new file mode 100644 index 0000000..0158cf5 --- /dev/null +++ b/services/pipeline/Makefile @@ -0,0 +1,90 @@ +# Pipeline Makefile + +.PHONY: help build up down restart logs clean test monitor + +help: + @echo "Pipeline Management Commands:" + @echo " make build - Build all Docker images" + @echo " make up - Start all services" + @echo " make down - Stop all services" + @echo " make restart - Restart all services" + @echo " make logs - View logs for all services" + @echo " make clean - Clean up containers and volumes" + @echo " make monitor - Open monitor dashboard" + @echo " make test - Test pipeline with sample keyword" + +build: + docker-compose build + +up: + docker-compose up -d + +down: + docker-compose down + +restart: + docker-compose restart + +logs: + docker-compose logs -f + +clean: + docker-compose down -v + docker system prune -f + +monitor: + @echo 
"Opening monitor dashboard..." + @echo "Dashboard: http://localhost:8100" + @echo "API Docs: http://localhost:8100/docs" + +test: + @echo "Testing pipeline with sample keyword..." + curl -X POST http://localhost:8100/api/keywords \ + -H "Content-Type: application/json" \ + -d '{"keyword": "ํ…Œ์ŠคํŠธ", "schedule": "30min"}' + @echo "\nTriggering immediate processing..." + curl -X POST http://localhost:8100/api/trigger/ํ…Œ์ŠคํŠธ + +# Service-specific commands +scheduler-logs: + docker-compose logs -f scheduler + +rss-logs: + docker-compose logs -f rss-collector + +search-logs: + docker-compose logs -f google-search + +summarizer-logs: + docker-compose logs -f ai-summarizer + +assembly-logs: + docker-compose logs -f article-assembly + +monitor-logs: + docker-compose logs -f monitor + +# Database commands +redis-cli: + docker-compose exec redis redis-cli + +mongo-shell: + docker-compose exec mongodb mongosh -u admin -p password123 + +# Queue management +queue-status: + @echo "Checking queue status..." + docker-compose exec redis redis-cli --raw LLEN queue:keyword + docker-compose exec redis redis-cli --raw LLEN queue:rss + docker-compose exec redis redis-cli --raw LLEN queue:search + docker-compose exec redis redis-cli --raw LLEN queue:summarize + docker-compose exec redis redis-cli --raw LLEN queue:assembly + +queue-clear: + @echo "Clearing all queues..." + docker-compose exec redis redis-cli FLUSHDB + +# Health check +health: + @echo "Checking service health..." 
+ curl -s http://localhost:8100/api/health | python3 -m json.tool \ No newline at end of file diff --git a/services/pipeline/README.md b/services/pipeline/README.md new file mode 100644 index 0000000..e8bf455 --- /dev/null +++ b/services/pipeline/README.md @@ -0,0 +1,154 @@ +# News Pipeline System + +๋น„๋™๊ธฐ ํ ๊ธฐ๋ฐ˜ ๋‰ด์Šค ์ƒ์„ฑ ํŒŒ์ดํ”„๋ผ์ธ ์‹œ์Šคํ…œ + +## ์•„ํ‚คํ…์ฒ˜ + +``` +Scheduler โ†’ RSS Collector โ†’ Google Search โ†’ AI Summarizer โ†’ Article Assembly โ†’ MongoDB + โ†“ โ†“ โ†“ โ†“ โ†“ + Redis Queue Redis Queue Redis Queue Redis Queue Redis Queue +``` + +## ์„œ๋น„์Šค ๊ตฌ์„ฑ + +### 1. Scheduler +- 30๋ถ„๋งˆ๋‹ค ๋“ฑ๋ก๋œ ํ‚ค์›Œ๋“œ ์ฒ˜๋ฆฌ +- ์˜ค์ „ 7์‹œ, ๋‚ฎ 12์‹œ, ์ €๋… 6์‹œ ์šฐ์„  ์ฒ˜๋ฆฌ +- MongoDB์—์„œ ํ‚ค์›Œ๋“œ ๋กœ๋“œ ํ›„ ํ์— ์ž‘์—… ์ƒ์„ฑ + +### 2. RSS Collector +- RSS ํ”ผ๋“œ ์ˆ˜์ง‘ (Google News RSS) +- 7์ผ๊ฐ„ ์ค‘๋ณต ๋ฐฉ์ง€ (Redis Set) +- ํ‚ค์›Œ๋“œ ๊ด€๋ จ์„ฑ ํ•„ํ„ฐ๋ง + +### 3. Google Search +- RSS ์•„์ดํ…œ๋ณ„ ์ถ”๊ฐ€ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ˆ˜์ง‘ +- ์•„์ดํ…œ๋‹น ์ตœ๋Œ€ 3๊ฐœ ๊ฒฐ๊ณผ +- ์ž‘์—…๋‹น ์ตœ๋Œ€ 5๊ฐœ ์•„์ดํ…œ ์ฒ˜๋ฆฌ + +### 4. AI Summarizer +- Claude Haiku๋กœ ๋น ๋ฅธ ์š”์•ฝ ์ƒ์„ฑ +- 200์ž ์ด๋‚ด ํ•œ๊ตญ์–ด ์š”์•ฝ +- ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ์ง€์› (3 workers) + +### 5. Article Assembly +- Claude Sonnet์œผ๋กœ ์ข…ํ•ฉ ๊ธฐ์‚ฌ ์ž‘์„ฑ +- 1500์ž ์ด๋‚ด ์ „๋ฌธ ๊ธฐ์‚ฌ +- MongoDB ์ €์žฅ ๋ฐ ํ†ต๊ณ„ ์—…๋ฐ์ดํŠธ + +### 6. Monitor +- ์‹ค์‹œ๊ฐ„ ํŒŒ์ดํ”„๋ผ์ธ ๋ชจ๋‹ˆํ„ฐ๋ง +- ํ ์ƒํƒœ, ์›Œ์ปค ์ƒํƒœ ํ™•์ธ +- REST API ์ œ๊ณต (ํฌํŠธ 8100) + +## ์‹œ์ž‘ํ•˜๊ธฐ + +### 1. ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ์„ค์ • +```bash +# .env ํŒŒ์ผ ํ™•์ธ +CLAUDE_API_KEY=your_claude_api_key +GOOGLE_API_KEY=your_google_api_key +GOOGLE_SEARCH_ENGINE_ID=your_search_engine_id +``` + +### 2. ์„œ๋น„์Šค ์‹œ์ž‘ +```bash +cd pipeline +docker-compose up -d +``` + +### 3. 
๋ชจ๋‹ˆํ„ฐ๋ง +```bash +# ๋กœ๊ทธ ํ™•์ธ +docker-compose logs -f + +# ํŠน์ • ์„œ๋น„์Šค ๋กœ๊ทธ +docker-compose logs -f scheduler + +# ๋ชจ๋‹ˆํ„ฐ API +curl http://localhost:8100/api/stats +``` + +## API ์—”๋“œํฌ์ธํŠธ + +### Monitor API (ํฌํŠธ 8100) + +- `GET /api/stats` - ์ „์ฒด ํ†ต๊ณ„ +- `GET /api/queues/{queue_name}` - ํ ์ƒ์„ธ ์ •๋ณด +- `GET /api/keywords` - ํ‚ค์›Œ๋“œ ๋ชฉ๋ก +- `POST /api/keywords` - ํ‚ค์›Œ๋“œ ๋“ฑ๋ก +- `DELETE /api/keywords/{id}` - ํ‚ค์›Œ๋“œ ์‚ญ์ œ +- `GET /api/articles` - ๊ธฐ์‚ฌ ๋ชฉ๋ก +- `GET /api/articles/{id}` - ๊ธฐ์‚ฌ ์ƒ์„ธ +- `GET /api/workers` - ์›Œ์ปค ์ƒํƒœ +- `POST /api/trigger/{keyword}` - ์ˆ˜๋™ ์ฒ˜๋ฆฌ ํŠธ๋ฆฌ๊ฑฐ +- `GET /api/health` - ํ—ฌ์Šค ์ฒดํฌ + +## ํ‚ค์›Œ๋“œ ๋“ฑ๋ก ์˜ˆ์‹œ + +```bash +# ์ƒˆ ํ‚ค์›Œ๋“œ ๋“ฑ๋ก +curl -X POST http://localhost:8100/api/keywords \ + -H "Content-Type: application/json" \ + -d '{"keyword": "์ธ๊ณต์ง€๋Šฅ", "schedule": "30min"}' + +# ์ˆ˜๋™ ์ฒ˜๋ฆฌ ํŠธ๋ฆฌ๊ฑฐ +curl -X POST http://localhost:8100/api/trigger/์ธ๊ณต์ง€๋Šฅ +``` + +## ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค + +### MongoDB Collections +- `keywords` - ๋“ฑ๋ก๋œ ํ‚ค์›Œ๋“œ +- `articles` - ์ƒ์„ฑ๋œ ๊ธฐ์‚ฌ +- `keyword_stats` - ํ‚ค์›Œ๋“œ๋ณ„ ํ†ต๊ณ„ + +### Redis Keys +- `queue:*` - ์ž‘์—… ํ +- `processing:*` - ์ฒ˜๋ฆฌ ์ค‘ ์ž‘์—… +- `failed:*` - ์‹คํŒจํ•œ ์ž‘์—… +- `dedup:rss:*` - RSS ์ค‘๋ณต ๋ฐฉ์ง€ +- `workers:*:active` - ํ™œ์„ฑ ์›Œ์ปค + +## ํŠธ๋Ÿฌ๋ธ”์ŠˆํŒ… + +### ํ ์ดˆ๊ธฐํ™” +```bash +docker-compose exec redis redis-cli FLUSHDB +``` + +### ์›Œ์ปค ์žฌ์‹œ์ž‘ +```bash +docker-compose restart rss-collector +``` + +### ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค ์ ‘์† +```bash +# MongoDB +docker-compose exec mongodb mongosh -u admin -p password123 + +# Redis +docker-compose exec redis redis-cli +``` + +## ์Šค์ผ€์ผ๋ง + +์›Œ์ปค ์ˆ˜ ์กฐ์ •: +```yaml +# docker-compose.yml +ai-summarizer: + deploy: + replicas: 5 # ์›Œ์ปค ์ˆ˜ ์ฆ๊ฐ€ +``` + +## ๋ชจ๋‹ˆํ„ฐ๋ง ๋Œ€์‹œ๋ณด๋“œ + +๋ธŒ๋ผ์šฐ์ €์—์„œ http://localhost:8100 ์ ‘์†ํ•˜์—ฌ ํŒŒ์ดํ”„๋ผ์ธ ์ƒํƒœ ํ™•์ธ + +## ๋กœ๊ทธ ๋ ˆ๋ฒจ ์„ค์ • + +`.env` 
ํŒŒ์ผ์—์„œ ์กฐ์ •: +``` +LOG_LEVEL=DEBUG # INFO, WARNING, ERROR +``` \ No newline at end of file diff --git a/services/pipeline/ai-summarizer/Dockerfile b/services/pipeline/ai-summarizer/Dockerfile new file mode 100644 index 0000000..efdb0c6 --- /dev/null +++ b/services/pipeline/ai-summarizer/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# ์˜์กด์„ฑ ์„ค์น˜ +COPY ./ai-summarizer/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# ๊ณตํ†ต ๋ชจ๋“ˆ ๋ณต์‚ฌ +COPY ./shared /app/shared + +# AI Summarizer ์ฝ”๋“œ ๋ณต์‚ฌ +COPY ./ai-summarizer /app + +# ํ™˜๊ฒฝ๋ณ€์ˆ˜ +ENV PYTHONUNBUFFERED=1 + +# ์‹คํ–‰ +CMD ["python", "ai_summarizer.py"] \ No newline at end of file diff --git a/services/pipeline/ai-summarizer/ai_summarizer.py b/services/pipeline/ai-summarizer/ai_summarizer.py new file mode 100644 index 0000000..77209fb --- /dev/null +++ b/services/pipeline/ai-summarizer/ai_summarizer.py @@ -0,0 +1,161 @@ +""" +AI Summarizer Service +Claude API๋ฅผ ์‚ฌ์šฉํ•œ ๋‰ด์Šค ์š”์•ฝ ์„œ๋น„์Šค +""" +import asyncio +import logging +import os +import sys +from typing import List, Dict, Any +from anthropic import AsyncAnthropic + +# Import from shared module +from shared.models import PipelineJob, EnrichedItem, SummarizedItem +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class AISummarizerWorker: + def __init__(self): + self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + self.claude_api_key = os.getenv("CLAUDE_API_KEY") + self.claude_client = None + + async def start(self): + """์›Œ์ปค ์‹œ์ž‘""" + logger.info("Starting AI Summarizer Worker") + + # Redis ์—ฐ๊ฒฐ + await self.queue_manager.connect() + + # Claude ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™” + if self.claude_api_key: + self.claude_client = AsyncAnthropic(api_key=self.claude_api_key) + else: + logger.error("Claude API key not configured") + return + + # ๋ฉ”์ธ ์ฒ˜๋ฆฌ ๋ฃจํ”„ + 
async def process_job(self, job: PipelineJob):
    """Summarize every enriched item of a job and pass the job on.

    Reads ``job.data['enriched_items']``, builds one ``SummarizedItem`` per
    entry using ``self._generate_summary``, then either forwards the job to
    the ``'translation'`` queue or marks it failed when nothing was produced.
    Any exception is logged and the job is marked failed with the exception
    text; nothing is re-raised.

    Args:
        job: The pipeline job taken from the ``'ai_summarization'`` queue.
    """
    try:
        logger.info(f"Processing job {job.job_id} for AI summarization")

        results = []
        for raw_item in job.data.get('enriched_items', []):
            enriched = EnrichedItem(**raw_item)
            ai_text = await self._generate_summary(enriched)
            results.append(
                SummarizedItem(
                    enriched_item=enriched,
                    ai_summary=ai_text,
                    summary_language='ko',
                )
            )

            # Pause after each item to throttle calls to the API.
            await asyncio.sleep(1)

        if not results:
            logger.warning(f"No items summarized for job {job.job_id}")
            await self.queue_manager.mark_failed(
                'ai_summarization', job, "No items to summarize"
            )
            return

        logger.info(f"Summarized {len(results)} items")

        # Hand over to the next stage (translation).
        job.data['summarized_items'] = [entry.dict() for entry in results]
        job.stages_completed.append('ai_summarization')
        job.stage = 'translation'

        await self.queue_manager.enqueue('translation', job)
        await self.queue_manager.mark_completed('ai_summarization', job.job_id)

    except Exception as e:
        logger.error(f"Error processing job {job.job_id}: {e}")
        await self.queue_manager.mark_failed('ai_summarization', job, str(e))
+ for idx, result in enumerate(enriched_item.search_results[:3], 1): + content_parts.append(f"{idx}. {result.title}") + if result.snippet: + content_parts.append(f" {result.snippet}") + + content = "\n".join(content_parts) + + # Claude API ํ˜ธ์ถœ + prompt = f"""๋‹ค์Œ ๋‰ด์Šค ๋‚ด์šฉ์„ 200์ž ์ด๋‚ด๋กœ ํ•ต์‹ฌ๋งŒ ์š”์•ฝํ•ด์ฃผ์„ธ์š”. +์ค‘์š”ํ•œ ์‚ฌ์‹ค, ์ˆ˜์น˜, ์ธ๋ฌผ, ์กฐ์ง์„ ํฌํ•จํ•˜๊ณ  ๊ฐ๊ด€์ ์ธ ํ†ค์„ ์œ ์ง€ํ•˜์„ธ์š”. + +{content} + +์š”์•ฝ:""" + + response = await self.claude_client.messages.create( + model="claude-sonnet-4-20250514", # ์ตœ์‹  Sonnet ๋ชจ๋ธ + max_tokens=500, + temperature=0.3, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + summary = response.content[0].text.strip() + return summary + + except Exception as e: + logger.error(f"Error generating summary: {e}") + # ํด๋ฐฑ: ์›๋ณธ ์š”์•ฝ ์‚ฌ์šฉ + return enriched_item.rss_item.summary[:200] if enriched_item.rss_item.summary else enriched_item.rss_item.title + + async def stop(self): + """์›Œ์ปค ์ค‘์ง€""" + await self.queue_manager.disconnect() + logger.info("AI Summarizer Worker stopped") + +async def main(): + """๋ฉ”์ธ ํ•จ์ˆ˜""" + worker = AISummarizerWorker() + + try: + await worker.start() + except KeyboardInterrupt: + logger.info("Received interrupt signal") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/services/pipeline/ai-summarizer/requirements.txt b/services/pipeline/ai-summarizer/requirements.txt new file mode 100644 index 0000000..db8aa9c --- /dev/null +++ b/services/pipeline/ai-summarizer/requirements.txt @@ -0,0 +1,3 @@ +anthropic==0.50.0 +redis[hiredis]==5.0.1 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/article-assembly/Dockerfile b/services/pipeline/article-assembly/Dockerfile new file mode 100644 index 0000000..5929d7d --- /dev/null +++ b/services/pipeline/article-assembly/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# ์˜์กด์„ฑ 
์„ค์น˜ +COPY ./article-assembly/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# ๊ณตํ†ต ๋ชจ๋“ˆ ๋ณต์‚ฌ +COPY ./shared /app/shared + +# Article Assembly ์ฝ”๋“œ ๋ณต์‚ฌ +COPY ./article-assembly /app + +# ํ™˜๊ฒฝ๋ณ€์ˆ˜ +ENV PYTHONUNBUFFERED=1 + +# ์‹คํ–‰ +CMD ["python", "article_assembly.py"] \ No newline at end of file diff --git a/services/pipeline/article-assembly/article_assembly.py b/services/pipeline/article-assembly/article_assembly.py new file mode 100644 index 0000000..f798e99 --- /dev/null +++ b/services/pipeline/article-assembly/article_assembly.py @@ -0,0 +1,234 @@ +""" +Article Assembly Service +์ตœ์ข… ๊ธฐ์‚ฌ ์กฐ๋ฆฝ ๋ฐ MongoDB ์ €์žฅ ์„œ๋น„์Šค +""" +import asyncio +import logging +import os +import sys +import json +from datetime import datetime +from typing import List, Dict, Any +from anthropic import AsyncAnthropic +from motor.motor_asyncio import AsyncIOMotorClient + +# Import from shared module +from shared.models import PipelineJob, SummarizedItem, FinalArticle +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class ArticleAssemblyWorker: + def __init__(self): + self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + self.claude_api_key = os.getenv("CLAUDE_API_KEY") + self.claude_client = None + self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017") + self.db_name = os.getenv("DB_NAME", "pipeline_db") + self.db = None + + async def start(self): + """์›Œ์ปค ์‹œ์ž‘""" + logger.info("Starting Article Assembly Worker") + + # Redis ์—ฐ๊ฒฐ + await self.queue_manager.connect() + + # MongoDB ์—ฐ๊ฒฐ + client = AsyncIOMotorClient(self.mongodb_url) + self.db = client[self.db_name] + + # Claude ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™” + if self.claude_api_key: + self.claude_client = AsyncAnthropic(api_key=self.claude_api_key) + else: + logger.error("Claude API key not configured") + return + + # ๋ฉ”์ธ ์ฒ˜๋ฆฌ 
async def process_job(self, job: PipelineJob):
    """Assemble the final article for a job and store it in MongoDB.

    Reads ``job.data['summarized_items']``. When the list is empty the job is
    marked failed. Otherwise the article is produced by
    ``self._generate_final_article``, stamped with the elapsed processing
    time in seconds, inserted into ``self.db.articles``, the job is marked
    completed and the keyword statistics are updated. Any exception is logged
    and the job is marked failed with the exception text; nothing is
    re-raised.

    Args:
        job: The pipeline job taken from the ``'article_assembly'`` queue.
    """
    try:
        started_at = datetime.now()
        logger.info(f"Processing job {job.job_id} for article assembly")

        items = job.data.get('summarized_items', [])

        if items:
            # Build the final article.
            article = await self._generate_final_article(job, items)

            # Record how long assembly took.
            elapsed = datetime.now() - started_at
            article.processing_time = elapsed.total_seconds()

            # Persist to MongoDB.
            await self.db.articles.insert_one(article.dict())
            logger.info(f"Article {article.article_id} saved to MongoDB")

            # Mark the stage as done.
            job.stages_completed.append('article_assembly')
            await self.queue_manager.mark_completed('article_assembly', job.job_id)

            # Update per-keyword statistics.
            await self._update_statistics(job.keyword_id)
        else:
            logger.warning(f"No items to assemble for job {job.job_id}")
            await self.queue_manager.mark_failed(
                'article_assembly', job, "No items to assemble"
            )

    except Exception as e:
        logger.error(f"Error processing job {job.job_id}: {e}")
        await self.queue_manager.mark_failed('article_assembly', job, str(e))
"\n".join(items_text) + + # Claude๋กœ ์ข…ํ•ฉ ๊ธฐ์‚ฌ ์ž‘์„ฑ + prompt = f"""๋‹ค์Œ ๋‰ด์Šค ํ•ญ๋ชฉ๋“ค์„ ๋ฐ”ํƒ•์œผ๋กœ ์ข…ํ•ฉ์ ์ธ ๊ธฐ์‚ฌ๋ฅผ ์ž‘์„ฑํ•ด์ฃผ์„ธ์š”. + +ํ‚ค์›Œ๋“œ: {job.keyword} + +๋‰ด์Šค ํ•ญ๋ชฉ๋“ค: +{content} + +๋‹ค์Œ JSON ํ˜•์‹์œผ๋กœ ์ž‘์„ฑํ•ด์ฃผ์„ธ์š”: +{{ + "title": "์ข…ํ•ฉ ๊ธฐ์‚ฌ ์ œ๋ชฉ", + "content": "๊ธฐ์‚ฌ ๋ณธ๋ฌธ (1500์ž ์ด๋‚ด, ๋ฌธ๋‹จ ๊ตฌ๋ถ„)", + "summary": "ํ•œ ์ค„ ์š”์•ฝ (100์ž ์ด๋‚ด)", + "categories": ["์นดํ…Œ๊ณ ๋ฆฌ1", "์นดํ…Œ๊ณ ๋ฆฌ2"], + "tags": ["ํƒœ๊ทธ1", "ํƒœ๊ทธ2", "ํƒœ๊ทธ3"] +}} + +์š”๊ตฌ์‚ฌํ•ญ: +- ์ „๋ฌธ์ ์ด๊ณ  ๊ฐ๊ด€์ ์ธ ํ†ค +- ํ•ต์‹ฌ ์ •๋ณด์™€ ํŠธ๋ Œ๋“œ ํŒŒ์•… +- ์‹œ์‚ฌ์  ํฌํ•จ +- ํ•œ๊ตญ ๋…์ž ๋Œ€์ƒ""" + + try: + response = await self.claude_client.messages.create( + model="claude-sonnet-4-20250514", + max_tokens=3000, + temperature=0.7, + messages=[ + {"role": "user", "content": prompt} + ] + ) + + # JSON ํŒŒ์‹ฑ + content_text = response.content[0].text + json_start = content_text.find('{') + json_end = content_text.rfind('}') + 1 + + if json_start != -1 and json_end > json_start: + article_data = json.loads(content_text[json_start:json_end]) + else: + raise ValueError("No valid JSON in response") + + # FinalArticle ์ƒ์„ฑ + article = FinalArticle( + job_id=job.job_id, + keyword_id=job.keyword_id, + keyword=job.keyword, + title=article_data.get('title', f"{job.keyword} ์ข…ํ•ฉ ๋‰ด์Šค"), + content=article_data.get('content', ''), + summary=article_data.get('summary', ''), + source_items=[], # ๊ฐ„์†Œํ™” + images=[], # ์ด๋ฏธ์ง€๋Š” ๋ณ„๋„ ์„œ๋น„์Šค์—์„œ ์ฒ˜๋ฆฌ + categories=article_data.get('categories', []), + tags=article_data.get('tags', []), + pipeline_stages=job.stages_completed, + processing_time=0 # ๋‚˜์ค‘์— ์—…๋ฐ์ดํŠธ + ) + + return article + + except Exception as e: + logger.error(f"Error generating article: {e}") + # ํด๋ฐฑ ๊ธฐ์‚ฌ ์ƒ์„ฑ + return FinalArticle( + job_id=job.job_id, + keyword_id=job.keyword_id, + keyword=job.keyword, + title=f"{job.keyword} ๋‰ด์Šค ์š”์•ฝ - {datetime.now().strftime('%Y-%m-%d')}", + 
def fix_imports(filepath):
    """Rewrite legacy ``sys.path``-based shared-module imports in one file.

    Three legacy forms are migrated to ``from shared.<module> import ...``:

    1. The three-line form: marker comment, ``sys.path.append(...)`` of the
       sibling ``shared`` directory, then ``from models import ...`` or
       ``from queue_manager import ...``. The imported names are preserved.
       If the third line imports from any other module the whole match is
       left untouched instead of being deleted.
    2. The ``sys.path.append(...)`` line directly followed by
       ``from models import``.
    3. ``from queue_manager import`` at the start of a line (not the first
       line of the file).

    The file is rewritten in place only when something changed.

    Args:
        filepath: Path of the Python source file to fix.

    Returns:
        True if the file was modified and written back, False otherwise.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The marker comment below (and the service sources) contain non-ASCII
    # text, so never rely on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Pattern to match the old import style
    old_pattern = r"# ์ƒ์œ„ ๋””๋ ‰ํ† ๋ฆฌ์˜ shared ๋ชจ๋“ˆ import\nsys\.path\.append\(os\.path\.join\(os\.path\.dirname\(__file__\), '\.\.', 'shared'\)\)\nfrom ([\w, ]+) import ([\w, ]+)"

    # Replace with new import style
    def replace_imports(match):
        module = match.group(1).strip()
        items = match.group(2)

        if module not in ('models', 'queue_manager'):
            # Not one of the shared modules: keep the original text rather
            # than silently dropping the import statement.
            return match.group(0)

        # Keep every imported name, whichever shared module it comes from.
        return f"# Import from shared module\nfrom shared.{module} import {items}"

    # Apply the replacement
    new_content = re.sub(old_pattern, replace_imports, content)

    # Also handle simpler patterns
    new_content = new_content.replace(
        "sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'shared'))\nfrom models import",
        "from shared.models import"
    )
    new_content = new_content.replace(
        "\nfrom queue_manager import",
        "\nfrom shared.queue_manager import"
    )

    # Write back if changed
    if new_content != content:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(new_content)
        print(f"Fixed imports in {filepath}")
        return True
    return False
+RUN pip install --no-cache-dir -r requirements.txt + +# ๊ณตํ†ต ๋ชจ๋“ˆ ๋ณต์‚ฌ +COPY ./shared /app/shared + +# Google Search ์ฝ”๋“œ ๋ณต์‚ฌ +COPY ./google-search /app + +# ํ™˜๊ฒฝ๋ณ€์ˆ˜ +ENV PYTHONUNBUFFERED=1 + +# ์‹คํ–‰ +CMD ["python", "google_search.py"] \ No newline at end of file diff --git a/services/pipeline/google-search/google_search.py b/services/pipeline/google-search/google_search.py new file mode 100644 index 0000000..25c34a6 --- /dev/null +++ b/services/pipeline/google-search/google_search.py @@ -0,0 +1,153 @@ +""" +Google Search Service +Google ๊ฒ€์ƒ‰์œผ๋กœ RSS ํ•ญ๋ชฉ ๊ฐ•ํ™” +""" +import asyncio +import logging +import os +import sys +import json +from typing import List, Dict, Any +import aiohttp +from datetime import datetime + +# Import from shared module +from shared.models import PipelineJob, RSSItem, SearchResult, EnrichedItem +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class GoogleSearchWorker: + def __init__(self): + self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + self.google_api_key = os.getenv("GOOGLE_API_KEY") + self.search_engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID") + self.max_results_per_item = 3 + + async def start(self): + """์›Œ์ปค ์‹œ์ž‘""" + logger.info("Starting Google Search Worker") + + # Redis ์—ฐ๊ฒฐ + await self.queue_manager.connect() + + # ๋ฉ”์ธ ์ฒ˜๋ฆฌ ๋ฃจํ”„ + while True: + try: + # ํ์—์„œ ์ž‘์—… ๊ฐ€์ ธ์˜ค๊ธฐ + job = await self.queue_manager.dequeue('search_enrichment', timeout=5) + + if job: + await self.process_job(job) + + except Exception as e: + logger.error(f"Error in worker loop: {e}") + await asyncio.sleep(1) + + async def process_job(self, job: PipelineJob): + """๊ฒ€์ƒ‰ ๊ฐ•ํ™” ์ž‘์—… ์ฒ˜๋ฆฌ""" + try: + logger.info(f"Processing job {job.job_id} for search enrichment") + + rss_items = job.data.get('rss_items', []) + enriched_items = [] + + # ์ตœ๋Œ€ 5๊ฐœ ํ•ญ๋ชฉ๋งŒ ์ฒ˜๋ฆฌ (API ํ• 
๋‹น๋Ÿ‰ ๊ด€๋ฆฌ) + for item_data in rss_items[:5]: + rss_item = RSSItem(**item_data) + + # ์ œ๋ชฉ์œผ๋กœ Google ๊ฒ€์ƒ‰ + search_results = await self._search_google(rss_item.title) + + enriched_item = EnrichedItem( + rss_item=rss_item, + search_results=search_results + ) + enriched_items.append(enriched_item) + + # API ์†๋„ ์ œํ•œ + await asyncio.sleep(0.5) + + if enriched_items: + logger.info(f"Enriched {len(enriched_items)} items with search results") + + # ๋‹ค์Œ ๋‹จ๊ณ„๋กœ ์ „๋‹ฌ + job.data['enriched_items'] = [item.dict() for item in enriched_items] + job.stages_completed.append('search_enrichment') + job.stage = 'ai_summarization' + + await self.queue_manager.enqueue('ai_summarization', job) + await self.queue_manager.mark_completed('search_enrichment', job.job_id) + else: + logger.warning(f"No items enriched for job {job.job_id}") + await self.queue_manager.mark_failed( + 'search_enrichment', + job, + "No items to enrich" + ) + + except Exception as e: + logger.error(f"Error processing job {job.job_id}: {e}") + await self.queue_manager.mark_failed('search_enrichment', job, str(e)) + + async def _search_google(self, query: str) -> List[SearchResult]: + """Google Custom Search API ํ˜ธ์ถœ""" + results = [] + + if not self.google_api_key or not self.search_engine_id: + logger.warning("Google API credentials not configured") + return results + + try: + url = "https://www.googleapis.com/customsearch/v1" + params = { + "key": self.google_api_key, + "cx": self.search_engine_id, + "q": query, + "num": self.max_results_per_item, + "hl": "ko", + "gl": "kr" + } + + async with aiohttp.ClientSession() as session: + async with session.get(url, params=params, timeout=30) as response: + if response.status == 200: + data = await response.json() + + for item in data.get('items', []): + result = SearchResult( + title=item.get('title', ''), + link=item.get('link', ''), + snippet=item.get('snippet', ''), + source='google' + ) + results.append(result) + else: + logger.error(f"Google 
API error: {response.status}") + + except Exception as e: + logger.error(f"Error searching Google for '{query}': {e}") + + return results + + async def stop(self): + """์›Œ์ปค ์ค‘์ง€""" + await self.queue_manager.disconnect() + logger.info("Google Search Worker stopped") + +async def main(): + """๋ฉ”์ธ ํ•จ์ˆ˜""" + worker = GoogleSearchWorker() + + try: + await worker.start() + except KeyboardInterrupt: + logger.info("Received interrupt signal") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/services/pipeline/google-search/requirements.txt b/services/pipeline/google-search/requirements.txt new file mode 100644 index 0000000..0859816 --- /dev/null +++ b/services/pipeline/google-search/requirements.txt @@ -0,0 +1,3 @@ +aiohttp==3.9.1 +redis[hiredis]==5.0.1 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/image-generator/Dockerfile b/services/pipeline/image-generator/Dockerfile new file mode 100644 index 0000000..018dede --- /dev/null +++ b/services/pipeline/image-generator/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY ./image-generator/requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# Copy shared modules +COPY ./shared /app/shared + +# Copy application code +COPY ./image-generator /app + +CMD ["python", "image_generator.py"] \ No newline at end of file diff --git a/services/pipeline/image-generator/image_generator.py b/services/pipeline/image-generator/image_generator.py new file mode 100644 index 0000000..5af06c5 --- /dev/null +++ b/services/pipeline/image-generator/image_generator.py @@ -0,0 +1,225 @@ +""" +Image Generation Service +Replicate API๋ฅผ ์‚ฌ์šฉํ•œ ์ด๋ฏธ์ง€ ์ƒ์„ฑ ์„œ๋น„์Šค +""" +import asyncio +import logging +import os +import sys +import base64 +from typing import List, Dict, Any +import httpx +from io import BytesIO + +# Import from shared module +from shared.models import PipelineJob, TranslatedItem, GeneratedImageItem +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class ImageGeneratorWorker: + def __init__(self): + self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + self.replicate_api_key = os.getenv("REPLICATE_API_KEY") + self.replicate_api_url = "https://api.replicate.com/v1/predictions" + # Stable Diffusion ๋ชจ๋ธ ์‚ฌ์šฉ + self.model_version = "stability-ai/sdxl:39ed52f2a78e934b3ba6e2a89f5b1c712de7dfea535525255b1aa35c5565e08b" + + async def start(self): + """์›Œ์ปค ์‹œ์ž‘""" + logger.info("Starting Image Generator Worker") + + # Redis ์—ฐ๊ฒฐ + await self.queue_manager.connect() + + # API ํ‚ค ํ™•์ธ + if not self.replicate_api_key: + logger.warning("Replicate API key not configured - using placeholder images") + + # ๋ฉ”์ธ ์ฒ˜๋ฆฌ ๋ฃจํ”„ + while True: + try: + # ํ์—์„œ ์ž‘์—… ๊ฐ€์ ธ์˜ค๊ธฐ + job = await self.queue_manager.dequeue('image_generation', timeout=5) + + if job: + await self.process_job(job) + + except Exception as e: + logger.error(f"Error in worker loop: {e}") + await asyncio.sleep(1) + + async def process_job(self, job: 
PipelineJob): + """์ด๋ฏธ์ง€ ์ƒ์„ฑ ์ž‘์—… ์ฒ˜๋ฆฌ""" + try: + logger.info(f"Processing job {job.job_id} for image generation") + + translated_items = job.data.get('translated_items', []) + generated_items = [] + + # ์ตœ๋Œ€ 3๊ฐœ ์•„์ดํ…œ๋งŒ ์ด๋ฏธ์ง€ ์ƒ์„ฑ (API ๋น„์šฉ ์ ˆ๊ฐ) + for idx, item_data in enumerate(translated_items[:3]): + translated_item = TranslatedItem(**item_data) + + # ์ด๋ฏธ์ง€ ์ƒ์„ฑ์„ ์œ„ํ•œ ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ + prompt = self._create_image_prompt(translated_item) + + # ์ด๋ฏธ์ง€ ์ƒ์„ฑ + image_url = await self._generate_image(prompt) + + generated_item = GeneratedImageItem( + translated_item=translated_item, + image_url=image_url, + image_prompt=prompt + ) + generated_items.append(generated_item) + + # API ์†๋„ ์ œํ•œ + if self.replicate_api_key: + await asyncio.sleep(2) + + if generated_items: + logger.info(f"Generated images for {len(generated_items)} items") + + # ์™„๋ฃŒ๋œ ๋ฐ์ดํ„ฐ๋ฅผ job์— ์ €์žฅ + job.data['generated_items'] = [item.dict() for item in generated_items] + job.stages_completed.append('image_generation') + job.stage = 'completed' + + # ์ตœ์ข… ๊ธฐ์‚ฌ ์กฐ๋ฆฝ ๋‹จ๊ณ„๋กœ ์ „๋‹ฌ (์ด๋ฏธ article-assembly๋กœ ์ˆ˜์ •) + await self.queue_manager.enqueue('article_assembly', job) + await self.queue_manager.mark_completed('image_generation', job.job_id) + else: + logger.warning(f"No images generated for job {job.job_id}") + # ์ด๋ฏธ์ง€ ์ƒ์„ฑ ์‹คํŒจํ•ด๋„ ๋‹ค์Œ ๋‹จ๊ณ„๋กœ ์ง„ํ–‰ + job.stages_completed.append('image_generation') + await self.queue_manager.enqueue('article_assembly', job) + await self.queue_manager.mark_completed('image_generation', job.job_id) + + except Exception as e: + logger.error(f"Error processing job {job.job_id}: {e}") + # ์ด๋ฏธ์ง€ ์ƒ์„ฑ ์‹คํŒจํ•ด๋„ ๋‹ค์Œ ๋‹จ๊ณ„๋กœ ์ง„ํ–‰ + job.stages_completed.append('image_generation') + await self.queue_manager.enqueue('article_assembly', job) + await self.queue_manager.mark_completed('image_generation', job.job_id) + + def _create_image_prompt(self, translated_item: 
TranslatedItem) -> str: + """์ด๋ฏธ์ง€ ์ƒ์„ฑ์„ ์œ„ํ•œ ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ""" + # ์˜๋ฌธ ์ œ๋ชฉ๊ณผ ์š”์•ฝ์„ ๊ธฐ๋ฐ˜์œผ๋กœ ํ”„๋กฌํ”„ํŠธ ์ƒ์„ฑ + title = translated_item.translated_title or translated_item.summarized_item['enriched_item']['rss_item']['title'] + summary = translated_item.translated_summary or translated_item.summarized_item['ai_summary'] + + # ๋‰ด์Šค ๊ด€๋ จ ์ด๋ฏธ์ง€๋ฅผ ์œ„ํ•œ ํ”„๋กฌํ”„ํŠธ + prompt = f"News illustration for: {title[:100]}, professional, photorealistic, high quality, 4k" + + return prompt + + async def _generate_image(self, prompt: str) -> str: + """Replicate API๋ฅผ ์‚ฌ์šฉํ•œ ์ด๋ฏธ์ง€ ์ƒ์„ฑ""" + try: + if not self.replicate_api_key: + # API ํ‚ค๊ฐ€ ์—†์œผ๋ฉด ํ”Œ๋ ˆ์ด์Šคํ™€๋” ์ด๋ฏธ์ง€ URL ๋ฐ˜ํ™˜ + return "https://via.placeholder.com/800x600.png?text=News+Image" + + async with httpx.AsyncClient() as client: + # ์˜ˆ์ธก ์ƒ์„ฑ ์š”์ฒญ + response = await client.post( + self.replicate_api_url, + headers={ + "Authorization": f"Token {self.replicate_api_key}", + "Content-Type": "application/json" + }, + json={ + "version": self.model_version, + "input": { + "prompt": prompt, + "width": 768, + "height": 768, + "num_outputs": 1, + "scheduler": "K_EULER", + "num_inference_steps": 25, + "guidance_scale": 7.5, + "prompt_strength": 0.8, + "refine": "expert_ensemble_refiner", + "high_noise_frac": 0.8 + } + }, + timeout=60 + ) + + if response.status_code in [200, 201]: + result = response.json() + prediction_id = result.get('id') + + # ์˜ˆ์ธก ๊ฒฐ๊ณผ ํด๋ง + image_url = await self._poll_prediction(prediction_id) + return image_url + else: + logger.error(f"Replicate API error: {response.status_code}") + return "https://via.placeholder.com/800x600.png?text=Generation+Failed" + + except Exception as e: + logger.error(f"Error generating image: {e}") + return "https://via.placeholder.com/800x600.png?text=Error" + + async def _poll_prediction(self, prediction_id: str, max_attempts: int = 30) -> str: + """์˜ˆ์ธก ๊ฒฐ๊ณผ ํด๋ง""" + try: + async with 
httpx.AsyncClient() as client: + for attempt in range(max_attempts): + response = await client.get( + f"{self.replicate_api_url}/{prediction_id}", + headers={ + "Authorization": f"Token {self.replicate_api_key}" + }, + timeout=30 + ) + + if response.status_code == 200: + result = response.json() + status = result.get('status') + + if status == 'succeeded': + output = result.get('output') + if output and isinstance(output, list) and len(output) > 0: + return output[0] + else: + return "https://via.placeholder.com/800x600.png?text=No+Output" + elif status == 'failed': + logger.error(f"Prediction failed: {result.get('error')}") + return "https://via.placeholder.com/800x600.png?text=Failed" + + # ์•„์ง ์ฒ˜๋ฆฌ์ค‘์ด๋ฉด ๋Œ€๊ธฐ + await asyncio.sleep(2) + else: + logger.error(f"Error polling prediction: {response.status_code}") + return "https://via.placeholder.com/800x600.png?text=Poll+Error" + + # ์ตœ๋Œ€ ์‹œ๋„ ํšŸ์ˆ˜ ์ดˆ๊ณผ + return "https://via.placeholder.com/800x600.png?text=Timeout" + + except Exception as e: + logger.error(f"Error polling prediction: {e}") + return "https://via.placeholder.com/800x600.png?text=Poll+Exception" + + async def stop(self): + """์›Œ์ปค ์ค‘์ง€""" + await self.queue_manager.disconnect() + logger.info("Image Generator Worker stopped") + +async def main(): + """๋ฉ”์ธ ํ•จ์ˆ˜""" + worker = ImageGeneratorWorker() + + try: + await worker.start() + except KeyboardInterrupt: + logger.info("Received interrupt signal") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/services/pipeline/image-generator/requirements.txt b/services/pipeline/image-generator/requirements.txt new file mode 100644 index 0000000..fbd9665 --- /dev/null +++ b/services/pipeline/image-generator/requirements.txt @@ -0,0 +1,3 @@ +httpx==0.25.0 +redis[hiredis]==5.0.1 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/monitor/Dockerfile b/services/pipeline/monitor/Dockerfile new 
file mode 100644 index 0000000..cc6cd35 --- /dev/null +++ b/services/pipeline/monitor/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY ./monitor/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy shared modules +COPY ./shared /app/shared + +# Copy monitor code +COPY ./monitor /app + +# Environment variables +ENV PYTHONUNBUFFERED=1 + +# Expose port +EXPOSE 8000 + +# Run +CMD ["uvicorn", "monitor:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file diff --git a/services/pipeline/monitor/monitor.py b/services/pipeline/monitor/monitor.py new file mode 100644 index 0000000..9c4a73e --- /dev/null +++ b/services/pipeline/monitor/monitor.py @@ -0,0 +1,349 @@ +""" +Pipeline Monitor Service +ํŒŒ์ดํ”„๋ผ์ธ ์ƒํƒœ ๋ชจ๋‹ˆํ„ฐ๋ง ๋ฐ ๋Œ€์‹œ๋ณด๋“œ API +""" +import os +import sys +import logging +from datetime import datetime, timedelta +from typing import Dict, List, Any +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from motor.motor_asyncio import AsyncIOMotorClient +import redis.asyncio as redis + +# Import from shared module +from shared.models import KeywordSubscription, PipelineJob, FinalArticle + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +app = FastAPI(title="Pipeline Monitor", version="1.0.0") + +# CORS ์„ค์ • +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Global connections +redis_client = None +mongodb_client = None +db = None + +@app.on_event("startup") +async def startup_event(): + """์„œ๋ฒ„ ์‹œ์ž‘ ์‹œ ์—ฐ๊ฒฐ ์ดˆ๊ธฐํ™”""" + global redis_client, mongodb_client, db + + # Redis ์—ฐ๊ฒฐ + redis_url = os.getenv("REDIS_URL", "redis://redis:6379") + redis_client = await redis.from_url(redis_url, decode_responses=True) + + # MongoDB ์—ฐ๊ฒฐ + mongodb_url = os.getenv("MONGODB_URL", 
"mongodb://mongodb:27017") + mongodb_client = AsyncIOMotorClient(mongodb_url) + db = mongodb_client[os.getenv("DB_NAME", "pipeline_db")] + + logger.info("Pipeline Monitor started successfully") + +@app.on_event("shutdown") +async def shutdown_event(): + """์„œ๋ฒ„ ์ข…๋ฃŒ ์‹œ ์—ฐ๊ฒฐ ํ•ด์ œ""" + if redis_client: + await redis_client.close() + if mongodb_client: + mongodb_client.close() + +@app.get("/") +async def root(): + """ํ—ฌ์Šค ์ฒดํฌ""" + return {"status": "Pipeline Monitor is running"} + +@app.get("/api/stats") +async def get_stats(): + """์ „์ฒด ํŒŒ์ดํ”„๋ผ์ธ ํ†ต๊ณ„""" + try: + # ํ๋ณ„ ๋Œ€๊ธฐ ์ž‘์—… ์ˆ˜ + queue_stats = {} + queues = [ + "queue:keyword", + "queue:rss", + "queue:search", + "queue:summarize", + "queue:assembly" + ] + + for queue in queues: + length = await redis_client.llen(queue) + queue_stats[queue] = length + + # ์˜ค๋Š˜ ์ƒ์„ฑ๋œ ๊ธฐ์‚ฌ ์ˆ˜ + today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) + articles_today = await db.articles.count_documents({ + "created_at": {"$gte": today} + }) + + # ํ™œ์„ฑ ํ‚ค์›Œ๋“œ ์ˆ˜ + active_keywords = await db.keywords.count_documents({ + "is_active": True + }) + + # ์ด ๊ธฐ์‚ฌ ์ˆ˜ + total_articles = await db.articles.count_documents({}) + + return { + "queues": queue_stats, + "articles_today": articles_today, + "active_keywords": active_keywords, + "total_articles": total_articles, + "timestamp": datetime.now().isoformat() + } + + except Exception as e: + logger.error(f"Error getting stats: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/queues/{queue_name}") +async def get_queue_details(queue_name: str): + """ํŠน์ • ํ์˜ ์ƒ์„ธ ์ •๋ณด""" + try: + queue_key = f"queue:{queue_name}" + + # ํ ๊ธธ์ด + length = await redis_client.llen(queue_key) + + # ์ตœ๊ทผ 10๊ฐœ ์ž‘์—… ๋ฏธ๋ฆฌ๋ณด๊ธฐ + items = await redis_client.lrange(queue_key, 0, 9) + + # ์ฒ˜๋ฆฌ ์ค‘์ธ ์ž‘์—… + processing_key = f"processing:{queue_name}" + processing = await 
redis_client.smembers(processing_key) + + # ์‹คํŒจํ•œ ์ž‘์—… + failed_key = f"failed:{queue_name}" + failed_count = await redis_client.llen(failed_key) + + return { + "queue": queue_name, + "length": length, + "processing_count": len(processing), + "failed_count": failed_count, + "preview": items[:10], + "timestamp": datetime.now().isoformat() + } + + except Exception as e: + logger.error(f"Error getting queue details: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/keywords") +async def get_keywords(): + """๋“ฑ๋ก๋œ ํ‚ค์›Œ๋“œ ๋ชฉ๋ก""" + try: + keywords = [] + cursor = db.keywords.find({"is_active": True}) + + async for keyword in cursor: + # ํ•ด๋‹น ํ‚ค์›Œ๋“œ์˜ ์ตœ๊ทผ ๊ธฐ์‚ฌ + latest_article = await db.articles.find_one( + {"keyword_id": str(keyword["_id"])}, + sort=[("created_at", -1)] + ) + + keywords.append({ + "id": str(keyword["_id"]), + "keyword": keyword["keyword"], + "schedule": keyword.get("schedule", "30๋ถ„๋งˆ๋‹ค"), + "created_at": keyword.get("created_at"), + "last_article": latest_article["created_at"] if latest_article else None, + "article_count": await db.articles.count_documents( + {"keyword_id": str(keyword["_id"])} + ) + }) + + return keywords + + except Exception as e: + logger.error(f"Error getting keywords: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.post("/api/keywords") +async def add_keyword(keyword: str, schedule: str = "30min"): + """์ƒˆ ํ‚ค์›Œ๋“œ ๋“ฑ๋ก""" + try: + new_keyword = { + "keyword": keyword, + "schedule": schedule, + "is_active": True, + "created_at": datetime.now(), + "updated_at": datetime.now() + } + + result = await db.keywords.insert_one(new_keyword) + + return { + "id": str(result.inserted_id), + "keyword": keyword, + "message": "Keyword registered successfully" + } + + except Exception as e: + logger.error(f"Error adding keyword: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.delete("/api/keywords/{keyword_id}") +async def 
delete_keyword(keyword_id: str): + """ํ‚ค์›Œ๋“œ ๋น„ํ™œ์„ฑํ™”""" + try: + result = await db.keywords.update_one( + {"_id": keyword_id}, + {"$set": {"is_active": False, "updated_at": datetime.now()}} + ) + + if result.modified_count > 0: + return {"message": "Keyword deactivated successfully"} + else: + raise HTTPException(status_code=404, detail="Keyword not found") + + except Exception as e: + logger.error(f"Error deleting keyword: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/articles") +async def get_articles(limit: int = 10, skip: int = 0): + """์ตœ๊ทผ ์ƒ์„ฑ๋œ ๊ธฐ์‚ฌ ๋ชฉ๋ก""" + try: + articles = [] + cursor = db.articles.find().sort("created_at", -1).skip(skip).limit(limit) + + async for article in cursor: + articles.append({ + "id": str(article["_id"]), + "title": article["title"], + "keyword": article["keyword"], + "summary": article.get("summary", ""), + "created_at": article["created_at"], + "processing_time": article.get("processing_time", 0), + "pipeline_stages": article.get("pipeline_stages", []) + }) + + total = await db.articles.count_documents({}) + + return { + "articles": articles, + "total": total, + "limit": limit, + "skip": skip + } + + except Exception as e: + logger.error(f"Error getting articles: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/articles/{article_id}") +async def get_article(article_id: str): + """ํŠน์ • ๊ธฐ์‚ฌ ์ƒ์„ธ ์ •๋ณด""" + try: + article = await db.articles.find_one({"_id": article_id}) + + if not article: + raise HTTPException(status_code=404, detail="Article not found") + + return article + + except Exception as e: + logger.error(f"Error getting article: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/workers") +async def get_workers(): + """์›Œ์ปค ์ƒํƒœ ์ •๋ณด""" + try: + workers = {} + worker_types = [ + "scheduler", + "rss_collector", + "google_search", + "ai_summarizer", + "article_assembly" + ] + + for worker_type in 
worker_types: + active_key = f"workers:{worker_type}:active" + active_workers = await redis_client.smembers(active_key) + + workers[worker_type] = { + "active": len(active_workers), + "worker_ids": list(active_workers) + } + + return workers + + except Exception as e: + logger.error(f"Error getting workers: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.post("/api/trigger/{keyword}") +async def trigger_keyword_processing(keyword: str): + """์ˆ˜๋™์œผ๋กœ ํ‚ค์›Œ๋“œ ์ฒ˜๋ฆฌ ํŠธ๋ฆฌ๊ฑฐ""" + try: + # ํ‚ค์›Œ๋“œ ์ฐพ๊ธฐ + keyword_doc = await db.keywords.find_one({ + "keyword": keyword, + "is_active": True + }) + + if not keyword_doc: + raise HTTPException(status_code=404, detail="Keyword not found or inactive") + + # ์ž‘์—… ์ƒ์„ฑ + job = PipelineJob( + keyword_id=str(keyword_doc["_id"]), + keyword=keyword, + stage="keyword_processing", + created_at=datetime.now() + ) + + # ํ์— ์ถ”๊ฐ€ + await redis_client.rpush("queue:keyword", job.json()) + + return { + "message": f"Processing triggered for keyword: {keyword}", + "job_id": job.job_id + } + + except Exception as e: + logger.error(f"Error triggering keyword: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/api/health") +async def health_check(): + """์‹œ์Šคํ…œ ํ—ฌ์Šค ์ฒดํฌ""" + try: + # Redis ์ฒดํฌ + redis_status = await redis_client.ping() + + # MongoDB ์ฒดํฌ + mongodb_status = await db.command("ping") + + return { + "status": "healthy", + "redis": "connected" if redis_status else "disconnected", + "mongodb": "connected" if mongodb_status else "disconnected", + "timestamp": datetime.now().isoformat() + } + + except Exception as e: + return { + "status": "unhealthy", + "error": str(e), + "timestamp": datetime.now().isoformat() + } + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/services/pipeline/monitor/requirements.txt b/services/pipeline/monitor/requirements.txt new file mode 100644 index 
0000000..5728b55 --- /dev/null +++ b/services/pipeline/monitor/requirements.txt @@ -0,0 +1,6 @@ +fastapi==0.104.1 +uvicorn[standard]==0.24.0 +redis[hiredis]==5.0.1 +motor==3.1.1 +pymongo==4.3.3 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/rss-collector/Dockerfile b/services/pipeline/rss-collector/Dockerfile new file mode 100644 index 0000000..4565e1c --- /dev/null +++ b/services/pipeline/rss-collector/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# ์˜์กด์„ฑ ์„ค์น˜ +COPY ./rss-collector/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# ๊ณตํ†ต ๋ชจ๋“ˆ ๋ณต์‚ฌ +COPY ./shared /app/shared + +# RSS Collector ์ฝ”๋“œ ๋ณต์‚ฌ +COPY ./rss-collector /app + +# ํ™˜๊ฒฝ๋ณ€์ˆ˜ +ENV PYTHONUNBUFFERED=1 + +# ์‹คํ–‰ +CMD ["python", "rss_collector.py"] \ No newline at end of file diff --git a/services/pipeline/rss-collector/requirements.txt b/services/pipeline/rss-collector/requirements.txt new file mode 100644 index 0000000..8d21c7f --- /dev/null +++ b/services/pipeline/rss-collector/requirements.txt @@ -0,0 +1,4 @@ +feedparser==6.0.11 +aiohttp==3.9.1 +redis[hiredis]==5.0.1 +pydantic==2.5.0 \ No newline at end of file diff --git a/services/pipeline/rss-collector/rss_collector.py b/services/pipeline/rss-collector/rss_collector.py new file mode 100644 index 0000000..5601977 --- /dev/null +++ b/services/pipeline/rss-collector/rss_collector.py @@ -0,0 +1,192 @@ +""" +RSS Collector Service +RSS ํ”ผ๋“œ ์ˆ˜์ง‘ ๋ฐ ์ค‘๋ณต ์ œ๊ฑฐ ์„œ๋น„์Šค +""" +import asyncio +import logging +import os +import sys +import hashlib +from datetime import datetime +import feedparser +import aiohttp +import redis.asyncio as redis +from typing import List, Dict, Any + +# Import from shared module +from shared.models import PipelineJob, RSSItem, EnrichedItem +from shared.queue_manager import QueueManager + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class RSSCollectorWorker: + def __init__(self): + 
self.queue_manager = QueueManager( + redis_url=os.getenv("REDIS_URL", "redis://redis:6379") + ) + self.redis_client = None + self.redis_url = os.getenv("REDIS_URL", "redis://redis:6379") + self.dedup_ttl = 86400 * 7 # 7์ผ๊ฐ„ ์ค‘๋ณต ๋ฐฉ์ง€ + self.max_items_per_feed = 10 # ํ”ผ๋“œ๋‹น ์ตœ๋Œ€ ํ•ญ๋ชฉ ์ˆ˜ + + async def start(self): + """์›Œ์ปค ์‹œ์ž‘""" + logger.info("Starting RSS Collector Worker") + + # Redis ์—ฐ๊ฒฐ + await self.queue_manager.connect() + self.redis_client = await redis.from_url( + self.redis_url, + encoding="utf-8", + decode_responses=True + ) + + # ๋ฉ”์ธ ์ฒ˜๋ฆฌ ๋ฃจํ”„ + while True: + try: + # ํ์—์„œ ์ž‘์—… ๊ฐ€์ ธ์˜ค๊ธฐ (5์ดˆ ๋Œ€๊ธฐ) + job = await self.queue_manager.dequeue('rss_collection', timeout=5) + + if job: + await self.process_job(job) + + except Exception as e: + logger.error(f"Error in worker loop: {e}") + await asyncio.sleep(1) + + async def process_job(self, job: PipelineJob): + """RSS ์ˆ˜์ง‘ ์ž‘์—… ์ฒ˜๋ฆฌ""" + try: + logger.info(f"Processing job {job.job_id} for keyword '{job.keyword}'") + + keyword = job.data.get('keyword', '') + rss_feeds = job.data.get('rss_feeds', []) + + # ํ‚ค์›Œ๋“œ๊ฐ€ ํฌํ•จ๋œ RSS URL ์ƒ์„ฑ + processed_feeds = self._prepare_feeds(rss_feeds, keyword) + + all_items = [] + + for feed_url in processed_feeds: + try: + items = await self._fetch_rss_feed(feed_url, keyword) + all_items.extend(items) + except Exception as e: + logger.error(f"Error fetching feed {feed_url}: {e}") + + if all_items: + # ์ค‘๋ณต ์ œ๊ฑฐ + unique_items = await self._deduplicate_items(all_items, keyword) + + if unique_items: + logger.info(f"Collected {len(unique_items)} unique items for '{keyword}'") + + # ๋‹ค์Œ ๋‹จ๊ณ„๋กœ ์ „๋‹ฌ + job.data['rss_items'] = [item.dict() for item in unique_items] + job.stages_completed.append('rss_collection') + job.stage = 'search_enrichment' + + await self.queue_manager.enqueue('search_enrichment', job) + await self.queue_manager.mark_completed('rss_collection', job.job_id) + else: + logger.info(f"No new items 
found for '{keyword}'") + await self.queue_manager.mark_completed('rss_collection', job.job_id) + else: + logger.warning(f"No RSS items collected for '{keyword}'") + await self.queue_manager.mark_failed( + 'rss_collection', + job, + "No RSS items collected" + ) + + except Exception as e: + logger.error(f"Error processing job {job.job_id}: {e}") + await self.queue_manager.mark_failed('rss_collection', job, str(e)) + + def _prepare_feeds(self, feeds: List[str], keyword: str) -> List[str]: + """RSS ํ”ผ๋“œ URL ์ค€๋น„ (ํ‚ค์›Œ๋“œ ์น˜ํ™˜)""" + processed = [] + for feed in feeds: + if '{keyword}' in feed: + processed.append(feed.replace('{keyword}', keyword)) + else: + processed.append(feed) + return processed + + async def _fetch_rss_feed(self, feed_url: str, keyword: str) -> List[RSSItem]: + """RSS ํ”ผ๋“œ ๊ฐ€์ ธ์˜ค๊ธฐ""" + items = [] + + try: + async with aiohttp.ClientSession() as session: + async with session.get(feed_url, timeout=30) as response: + content = await response.text() + + # feedparser๋กœ ํŒŒ์‹ฑ + feed = feedparser.parse(content) + + for entry in feed.entries[:self.max_items_per_feed]: + # ํ‚ค์›Œ๋“œ ๊ด€๋ จ์„ฑ ์ฒดํฌ + title = entry.get('title', '') + summary = entry.get('summary', '') + + # ์ œ๋ชฉ์ด๋‚˜ ์š”์•ฝ์— ํ‚ค์›Œ๋“œ๊ฐ€ ํฌํ•จ๋œ ๊ฒฝ์šฐ๋งŒ + if keyword.lower() in title.lower() or keyword.lower() in summary.lower(): + item = RSSItem( + title=title, + link=entry.get('link', ''), + published=entry.get('published', ''), + summary=summary[:500] if summary else '', + source_feed=feed_url + ) + items.append(item) + + except Exception as e: + logger.error(f"Error fetching RSS feed {feed_url}: {e}") + + return items + + async def _deduplicate_items(self, items: List[RSSItem], keyword: str) -> List[RSSItem]: + """์ค‘๋ณต ํ•ญ๋ชฉ ์ œ๊ฑฐ""" + unique_items = [] + dedup_key = f"dedup:{keyword}" + + for item in items: + # ์ œ๋ชฉ ํ•ด์‹œ ์ƒ์„ฑ + item_hash = hashlib.md5( + f"{keyword}:{item.title}".encode() + ).hexdigest() + + # Redis Set์œผ๋กœ ์ค‘๋ณต ํ™•์ธ + is_new = 
await self.redis_client.sadd(dedup_key, item_hash) + + if is_new: + unique_items.append(item) + + # TTL ์„ค์ • + if unique_items: + await self.redis_client.expire(dedup_key, self.dedup_ttl) + + return unique_items + + async def stop(self): + """์›Œ์ปค ์ค‘์ง€""" + await self.queue_manager.disconnect() + if self.redis_client: + await self.redis_client.close() + logger.info("RSS Collector Worker stopped") + +async def main(): + """๋ฉ”์ธ ํ•จ์ˆ˜""" + worker = RSSCollectorWorker() + + try: + await worker.start() + except KeyboardInterrupt: + logger.info("Received interrupt signal") + finally: + await worker.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/services/pipeline/scheduler/Dockerfile b/services/pipeline/scheduler/Dockerfile new file mode 100644 index 0000000..a9626e7 --- /dev/null +++ b/services/pipeline/scheduler/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +WORKDIR /app + +# ์˜์กด์„ฑ ์„ค์น˜ +COPY ./scheduler/requirements.txt . 
class NewsScheduler:
    """Periodic trigger for the news pipeline.

    Loads active keyword subscriptions from MongoDB and turns each one into a
    ``PipelineJob`` on the ``rss_collection`` queue. Three kinds of schedules
    are registered in ``start()``: a regular sweep every
    ``PROCESS_INTERVAL_MINUTES``, priority sweeps at 07:00 / 12:00 / 18:00 and
    a daily statistics reset at midnight.
    """

    # Minutes between two regular keyword sweeps.
    PROCESS_INTERVAL_MINUTES = 30
    # Slack applied to the "due" cutoff. The sweep fires exactly every
    # PROCESS_INTERVAL_MINUTES, so comparing against an exact interval would
    # make a keyword stamped a few milliseconds "late" on the previous run
    # look not-yet-due and skip every other sweep.
    PROCESS_GRACE_SECONDS = 60

    def __init__(self):
        """Read connection settings from the environment; connect nothing yet."""
        self.scheduler = AsyncIOScheduler()
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "pipeline_db")
        self.mongo_client = None  # kept so stop() can close it
        self.db = None
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )

    async def start(self):
        """Connect to MongoDB/Redis, register all schedules and run one sweep."""
        logger.info("Starting News Pipeline Scheduler")

        # MongoDB connection
        self.mongo_client = AsyncIOMotorClient(self.mongodb_url)
        self.db = self.mongo_client[self.db_name]

        # Redis connection
        await self.queue_manager.connect()

        # Regular sweep over all active keywords.
        self.scheduler.add_job(
            self.process_keywords,
            'interval',
            minutes=self.PROCESS_INTERVAL_MINUTES,
            id='keyword_processor',
            name='Process Active Keywords'
        )

        # Extra sweeps for priority keywords at peak hours
        # (07:00 morning, 12:00 noon, 18:00 evening).
        for hour in (7, 12, 18):
            self.scheduler.add_job(
                self.process_priority_keywords,
                'cron',
                hour=hour,
                minute=0,
                id=f'priority_processor_{hour}',
                name=f'Process Priority Keywords at {hour}:00'
            )

        # Daily statistics reset at midnight.
        self.scheduler.add_job(
            self.reset_daily_stats,
            'cron',
            hour=0,
            minute=0,
            id='stats_reset',
            name='Reset Daily Statistics'
        )

        self.scheduler.start()
        logger.info("Scheduler started successfully")

        # Run one sweep immediately instead of waiting for the first tick.
        await self.process_keywords()

    async def process_keywords(self):
        """Create jobs for every active keyword that is due for processing.

        A keyword is due when it was never processed or when its
        ``last_processed`` is older than the sweep interval minus the grace
        period. ``last_processed`` is only advanced for keywords whose job was
        actually enqueued, so failed keywords are retried on the next sweep.
        Errors are logged, never raised (this runs as a scheduler callback).
        """
        try:
            logger.info("Processing active keywords")

            now = datetime.now()
            cutoff = (
                now
                - timedelta(minutes=self.PROCESS_INTERVAL_MINUTES)
                + timedelta(seconds=self.PROCESS_GRACE_SECONDS)
            )

            # Load due, active keywords from MongoDB.
            keywords = await self.db.keywords.find({
                "is_active": True,
                "$or": [
                    {"last_processed": {"$lt": cutoff}},
                    {"last_processed": None}
                ]
            }).to_list(None)

            logger.info(f"Found {len(keywords)} keywords to process")

            created = 0
            for keyword_doc in keywords:
                if not await self._create_job(keyword_doc):
                    # Leave last_processed untouched so the next sweep retries.
                    continue
                created += 1

                # Stamp the processing time; _id is always present and unique.
                await self.db.keywords.update_one(
                    {"_id": keyword_doc["_id"]},
                    {"$set": {"last_processed": now}}
                )

            logger.info(f"Created jobs for {created}/{len(keywords)} keywords")

        except Exception as e:
            logger.error(f"Error processing keywords: {e}")

    async def process_priority_keywords(self):
        """Create front-of-queue jobs for all active priority keywords.

        Unlike ``process_keywords`` this ignores and does not update
        ``last_processed``. Errors are logged, never raised.
        """
        try:
            logger.info("Processing priority keywords")

            keywords = await self.db.keywords.find({
                "is_active": True,
                "is_priority": True
            }).to_list(None)

            created = 0
            for keyword_doc in keywords:
                if await self._create_job(keyword_doc, priority=1):
                    created += 1

            logger.info(f"Created priority jobs for {created}/{len(keywords)} keywords")

        except Exception as e:
            logger.error(f"Error processing priority keywords: {e}")

    async def _create_job(self, keyword_doc: dict, priority: int = 0) -> bool:
        """Build a ``PipelineJob`` from a keyword document and enqueue it.

        Args:
            keyword_doc: Raw MongoDB document of a keyword subscription.
            priority: Passed through to the job and to ``enqueue``.

        Returns:
            True when the job was enqueued, False when validation or
            enqueueing failed (the error is logged, not raised).
        """
        try:
            # Validate/normalise the raw document.
            keyword = KeywordSubscription(**keyword_doc)

            job = PipelineJob(
                keyword_id=keyword.keyword_id,
                keyword=keyword.keyword,
                stage='rss_collection',
                stages_completed=[],
                priority=priority,
                data={
                    'keyword': keyword.keyword,
                    'language': keyword.language,
                    # Fall back to the built-in feeds when none are configured.
                    'rss_feeds': keyword.rss_feeds or self._get_default_rss_feeds(),
                    'categories': keyword.categories
                }
            )

            # Hand the job to the first pipeline stage.
            await self.queue_manager.enqueue(
                'rss_collection',
                job,
                priority=priority
            )

            logger.info(f"Created job {job.job_id} for keyword '{keyword.keyword}'")
            return True

        except Exception as e:
            logger.error(
                f"Error creating job for keyword {keyword_doc.get('keyword')!r}: {e}"
            )
            return False

    def _get_default_rss_feeds(self) -> list:
        """Return the default feed URLs; ``{keyword}`` is a placeholder."""
        return [
            "https://news.google.com/rss/search?q={keyword}&hl=ko&gl=KR&ceid=KR:ko",
            "https://trends.google.com/trends/trendingsearches/daily/rss?geo=KR",
            "https://www.mk.co.kr/rss/40300001/",  # Maeil Business Newspaper
            "https://www.hankyung.com/feed/all-news",  # Korea Economic Daily
            "https://www.zdnet.co.kr/news/news_rss.xml",  # ZDNet Korea
        ]

    async def reset_daily_stats(self):
        """Reset the daily statistics (not implemented yet; only logs)."""
        try:
            logger.info("Resetting daily statistics")
            # TODO: reset the Redis statistics counters.
        except Exception as e:
            logger.error(f"Error resetting stats: {e}")

    async def stop(self):
        """Shut down the scheduler and release the Redis/MongoDB connections."""
        # shutdown() raises if the scheduler never started (e.g. start()
        # failed while connecting), which would mask the original error.
        if self.scheduler.running:
            self.scheduler.shutdown()
        await self.queue_manager.disconnect()
        if self.mongo_client is not None:
            self.mongo_client.close()
            self.mongo_client = None
        logger.info("Scheduler stopped")
class KeywordSubscription(BaseModel):
    """A keyword the pipeline collects news for."""
    keyword_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    keyword: str
    language: str = "ko"
    # Cron expression: every 30 minutes. (The previous default
    # "0 */30 * * *" put the step in the hour field, which is not a
    # 30-minute schedule.)
    schedule: str = "*/30 * * * *"
    is_active: bool = True
    is_priority: bool = False
    last_processed: Optional[datetime] = None
    rss_feeds: List[str] = Field(default_factory=list)
    categories: List[str] = Field(default_factory=list)
    created_at: datetime = Field(default_factory=datetime.now)
    owner: Optional[str] = None


class PipelineJob(BaseModel):
    """One unit of work travelling through the pipeline stages."""
    job_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    keyword_id: str
    keyword: str
    stage: str  # current stage
    stages_completed: List[str] = Field(default_factory=list)
    data: Dict[str, Any] = Field(default_factory=dict)
    retry_count: int = 0
    max_retries: int = 3
    priority: int = 0
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)


class RSSItem(BaseModel):
    """A single entry taken from an RSS feed."""
    item_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    title: str
    link: str
    published: Optional[str] = None
    summary: Optional[str] = None
    source_feed: str


class SearchResult(BaseModel):
    """A single web search result."""
    title: str
    link: str
    snippet: Optional[str] = None
    source: str = "google"


class EnrichedItem(BaseModel):
    """An RSS item enriched with related search results."""
    rss_item: RSSItem
    search_results: List[SearchResult] = Field(default_factory=list)


class SummarizedItem(BaseModel):
    """An enriched item together with its AI-generated summary."""
    enriched_item: EnrichedItem
    ai_summary: str
    summary_language: str = "ko"


class LegacyTranslatedItem(BaseModel):
    """Earlier, fully typed translated-item shape (``title_en``/``summary_en``).

    This class used to be declared under the name ``TranslatedItem`` and was
    then silently shadowed by a second ``TranslatedItem`` further down the
    module. It is kept under a distinct name because ``ItemWithImage`` was
    (and still is) bound to this shape; the public name ``TranslatedItem``
    keeps resolving to the later definition, exactly as before.
    """
    summarized_item: SummarizedItem
    title_en: str
    summary_en: str


class ItemWithImage(BaseModel):
    """A translated item with a generated image attached."""
    # Bound to the legacy shape, which is what this field validated against
    # before the duplicate class name was resolved.
    translated_item: LegacyTranslatedItem
    image_url: str
    image_prompt: str


class FinalArticle(BaseModel):
    """The assembled article produced at the end of the pipeline."""
    article_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    job_id: str
    keyword_id: str
    keyword: str
    title: str
    content: str
    summary: str
    source_items: List[ItemWithImage]
    images: List[str]
    categories: List[str] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
    created_at: datetime = Field(default_factory=datetime.now)
    pipeline_stages: List[str]
    processing_time: float  # seconds


class TranslatedItem(BaseModel):
    """A translated item as exchanged between the queue workers.

    ``summarized_item`` is a plain dict (a dumped ``SummarizedItem``), not a
    model instance.
    """
    summarized_item: Dict[str, Any]  # SummarizedItem as dict
    translated_title: str
    translated_summary: str
    target_language: str = 'en'


class GeneratedImageItem(BaseModel):
    """A translated item with a generated image, in dict-based form."""
    translated_item: Dict[str, Any]  # TranslatedItem as dict
    image_url: str
    image_prompt: str


class QueueMessage(BaseModel):
    """Envelope stored in a Redis queue around a ``PipelineJob``."""
    message_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    queue_name: str
    job: PipelineJob
    timestamp: datetime = Field(default_factory=datetime.now)
    retry_count: int = 0
class QueueManager:
    """Redis-backed job queues shared by all pipeline services.

    Each logical queue is a Redis list holding JSON-serialised
    ``QueueMessage`` envelopes. Jobs that were dequeued but not yet
    acknowledged are tracked in the hash ``processing:<queue_name>``; counters
    live in the hashes ``stats:queues``, ``stats:completed`` and
    ``stats:failed``.
    """

    # Logical queue name -> Redis list key.
    QUEUES = {
        "keyword_processing": "queue:keyword",
        "rss_collection": "queue:rss",
        "search_enrichment": "queue:search",
        "ai_summarization": "queue:summarize",
        "translation": "queue:translate",
        "image_generation": "queue:image",
        "article_assembly": "queue:assembly",
        "failed": "queue:failed",
        "scheduled": "queue:scheduled"
    }

    def __init__(self, redis_url: str = "redis://redis:6379"):
        """Store the Redis URL; the connection is opened by ``connect()``."""
        self.redis_url = redis_url
        self.redis_client: Optional[redis.Redis] = None

    async def connect(self):
        """Open the Redis connection (no-op when already connected)."""
        if not self.redis_client:
            self.redis_client = await redis.from_url(
                self.redis_url,
                encoding="utf-8",
                decode_responses=True
            )
            logger.info("Connected to Redis")

    async def disconnect(self):
        """Close the Redis connection (no-op when not connected)."""
        if self.redis_client:
            # redis-py >= 5.0.1 deprecates close() on async clients in favour
            # of aclose(); fall back to close() on older versions.
            close = getattr(self.redis_client, "aclose", None) or self.redis_client.close
            await close()
            self.redis_client = None

    async def enqueue(self, queue_name: str, job: "PipelineJob", priority: int = 0) -> str:
        """Append a job to a queue.

        Args:
            queue_name: Logical queue name; unknown names map to
                ``queue:<queue_name>``.
            job: The job to enqueue.
            priority: Values > 0 push to the head of the list so the job is
                dequeued before regular jobs; otherwise it is appended.

        Returns:
            The job's ``job_id``.

        Raises:
            Exception: Any error is logged and re-raised.
        """
        try:
            queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")

            message = QueueMessage(
                queue_name=queue_name,
                job=job
            )
            # pydantic v2 API (``.json()`` is the deprecated v1 spelling).
            payload = message.model_dump_json()

            # Priority jobs jump the queue; regular jobs go to the tail.
            if priority > 0:
                await self.redis_client.lpush(queue_key, payload)
            else:
                await self.redis_client.rpush(queue_key, payload)

            # Update statistics.
            await self.redis_client.hincrby("stats:queues", queue_name, 1)

            logger.info(f"Job {job.job_id} enqueued to {queue_name}")
            return job.job_id

        except Exception as e:
            logger.error(f"Failed to enqueue job: {e}")
            raise

    async def dequeue(self, queue_name: str, timeout: int = 0) -> Optional["PipelineJob"]:
        """Take the next job from a queue and record it as "processing".

        Args:
            queue_name: Logical queue name.
            timeout: Seconds to block waiting for a job; 0 means do not block.

        Returns:
            The job, or None when the queue is empty, the wait timed out or
            an error occurred (errors are logged, not raised).
        """
        try:
            queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")

            if timeout > 0:
                result = await self.redis_client.blpop(queue_key, timeout=timeout)
                if not result:
                    return None
                # BLPOP returns a (key, value) pair.
                _, data = result
            else:
                data = await self.redis_client.lpop(queue_key)

            if not data:
                return None

            # pydantic v2 API (``parse_raw`` is the deprecated v1 spelling).
            message = QueueMessage.model_validate_json(data)

            # Track the job as in-flight until it is completed or failed.
            processing_key = f"processing:{queue_name}"
            await self.redis_client.hset(
                processing_key,
                message.job.job_id,
                message.model_dump_json()
            )

            return message.job

        except Exception as e:
            logger.error(f"Failed to dequeue job: {e}")
            return None

    async def mark_completed(self, queue_name: str, job_id: str):
        """Acknowledge a job: drop it from the processing hash and count it."""
        try:
            processing_key = f"processing:{queue_name}"
            await self.redis_client.hdel(processing_key, job_id)

            # Update statistics.
            await self.redis_client.hincrby("stats:completed", queue_name, 1)

            logger.info(f"Job {job_id} completed in {queue_name}")

        except Exception as e:
            logger.error(f"Failed to mark job as completed: {e}")

    async def mark_failed(self, queue_name: str, job: "PipelineJob", error: str):
        """Handle a failed job: retry it or move it to the ``failed`` queue.

        While ``job.retry_count < job.max_retries`` the job is re-enqueued on
        the same queue with an incremented retry counter. Afterwards the error
        text and the failing stage are stored in ``job.data`` and the job is
        moved to the ``failed`` queue. Errors raised here are logged only.
        """
        try:
            processing_key = f"processing:{queue_name}"
            await self.redis_client.hdel(processing_key, job.job_id)

            # The job is about to be mutated and re-serialised.
            job.updated_at = datetime.now()

            if job.retry_count < job.max_retries:
                job.retry_count += 1
                await self.enqueue(queue_name, job)
                logger.info(f"Job {job.job_id} requeued (retry {job.retry_count}/{job.max_retries})")
            else:
                # Retries exhausted: move to the failed queue.
                job.data["error"] = error
                job.data["failed_stage"] = queue_name
                await self.enqueue("failed", job)

                # Update statistics.
                await self.redis_client.hincrby("stats:failed", queue_name, 1)
                logger.error(f"Job {job.job_id} failed: {error}")

        except Exception as e:
            logger.error(f"Failed to mark job as failed: {e}")

    async def get_queue_stats(self) -> Dict[str, Any]:
        """Return pending/processing counts per queue plus the counters.

        Returns:
            ``{<queue>: {"pending": int, "processing": int}, ...,
            "completed": {...}, "failed": {...}}``, or ``{}`` on error.
        """
        try:
            stats = {}

            for name, key in self.QUEUES.items():
                stats[name] = {
                    "pending": await self.redis_client.llen(key),
                    "processing": await self.redis_client.hlen(f"processing:{name}"),
                }

            # Completed / failed counters.
            stats["completed"] = await self.redis_client.hgetall("stats:completed") or {}
            stats["failed"] = await self.redis_client.hgetall("stats:failed") or {}

            return stats

        except Exception as e:
            logger.error(f"Failed to get queue stats: {e}")
            return {}

    async def clear_queue(self, queue_name: str):
        """Delete a queue and its processing hash (intended for tests)."""
        queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")
        await self.redis_client.delete(queue_key)
        await self.redis_client.delete(f"processing:{queue_name}")
        logger.info(f"Queue {queue_name} cleared")
class TranslatorWorker:
    """Queue worker that translates summarized items with the DeepL API.

    Consumes jobs from the ``translation`` queue, translates each item's
    title and AI summary to English and forwards the job to the
    ``image_generation`` queue.
    """

    def __init__(self):
        """Read configuration from the environment; connect nothing yet."""
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )
        # The key must come from the environment. A secret must never be
        # baked into the source as a fallback; an empty value makes start()
        # refuse to run instead of silently using a shared credential.
        self.deepl_api_key = os.getenv("DEEPL_API_KEY", "")
        # Defaults to the DeepL Pro endpoint; overridable for other plans.
        self.deepl_api_url = os.getenv(
            "DEEPL_API_URL", "https://api.deepl.com/v2/translate"
        )

    async def start(self):
        """Connect to Redis and process translation jobs until cancelled.

        Returns immediately (after logging an error) when no DeepL API key is
        configured.
        """
        logger.info("Starting Translator Worker")

        # Redis connection
        await self.queue_manager.connect()

        # Refuse to run without credentials.
        if not self.deepl_api_key:
            logger.error("DeepL API key not configured")
            return

        # Main processing loop
        while True:
            try:
                # Block up to 5 seconds waiting for the next job.
                job = await self.queue_manager.dequeue('translation', timeout=5)

                if job:
                    await self.process_job(job)

            except Exception as e:
                logger.error(f"Error in worker loop: {e}")
                await asyncio.sleep(1)

    async def process_job(self, job: "PipelineJob"):
        """Translate every summarized item of a job and pass the job on.

        Reads ``job.data['summarized_items']`` and writes
        ``job.data['translated_items']``. On success the job is enqueued to
        ``image_generation`` and acknowledged; when nothing was translated or
        an error occurs the job is handed to ``mark_failed``.
        """
        try:
            logger.info(f"Processing job {job.job_id} for translation")

            summarized_items = job.data.get('summarized_items', [])
            translated_items = []

            # One HTTP client (and connection pool) for the whole job.
            async with httpx.AsyncClient() as client:
                for item_data in summarized_items:
                    summarized_item = SummarizedItem(**item_data)

                    # enriched_item is a model, so use attribute access
                    # (subscripting it raises TypeError).
                    translated_title = await self._translate_text(
                        summarized_item.enriched_item.rss_item.title,
                        target_lang='EN',
                        client=client
                    )

                    translated_summary = await self._translate_text(
                        summarized_item.ai_summary,
                        target_lang='EN',
                        client=client
                    )

                    translated_item = TranslatedItem(
                        # The field is typed Dict[str, Any]: pass a dict,
                        # not the model instance.
                        summarized_item=summarized_item.model_dump(),
                        translated_title=translated_title,
                        translated_summary=translated_summary,
                        target_language='en'
                    )
                    translated_items.append(translated_item)

                    # Throttle requests to respect the API rate limit.
                    await asyncio.sleep(0.5)

            if translated_items:
                logger.info(f"Translated {len(translated_items)} items")

                # Hand over to the next stage.
                job.data['translated_items'] = [
                    item.model_dump() for item in translated_items
                ]
                job.stages_completed.append('translation')
                job.stage = 'image_generation'

                await self.queue_manager.enqueue('image_generation', job)
                await self.queue_manager.mark_completed('translation', job.job_id)
            else:
                logger.warning(f"No items translated for job {job.job_id}")
                await self.queue_manager.mark_failed(
                    'translation',
                    job,
                    "No items to translate"
                )

        except Exception as e:
            logger.error(f"Error processing job {job.job_id}: {e}")
            await self.queue_manager.mark_failed('translation', job, str(e))

    async def _post_translation(self, client, text: str, target_lang: str):
        """Send one translation request to DeepL and return the raw response."""
        return await client.post(
            self.deepl_api_url,
            # Header authentication; the ``auth_key`` form field is the
            # legacy mechanism.
            headers={'Authorization': f'DeepL-Auth-Key {self.deepl_api_key}'},
            data={
                'text': text,
                'target_lang': target_lang,
                'source_lang': 'KO'
            },
            timeout=30
        )

    async def _translate_text(self, text: str, target_lang: str = 'EN', *, client=None) -> str:
        """Translate Korean *text* with the DeepL API.

        Args:
            text: Source text; empty input returns ``""`` without a request.
            target_lang: DeepL target language code.
            client: Optional ``httpx.AsyncClient`` to reuse; a temporary one
                is created when omitted.

        Returns:
            The translated text, or the original *text* when the request
            fails or returns a non-200 status (best effort; errors are
            logged, not raised).
        """
        if not text:
            return ""

        try:
            if client is None:
                async with httpx.AsyncClient() as own_client:
                    response = await self._post_translation(own_client, text, target_lang)
            else:
                response = await self._post_translation(client, text, target_lang)

            if response.status_code == 200:
                result = response.json()
                return result['translations'][0]['text']

            logger.error(f"DeepL API error: {response.status_code}")
            return text  # fall back to the original on failure

        except Exception as e:
            logger.error(f"Error translating text: {e}")
            return text  # fall back to the original on failure

    async def stop(self):
        """Stop the worker and disconnect from Redis."""
        await self.queue_manager.disconnect()
        logger.info("Translator Worker stopped")