feat: Implement async queue-based news pipeline with microservices

Major architectural transformation from synchronous to asynchronous processing:

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)

## Key Features
- Redis-based job queue with deduplication
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with site11_ prefix

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API: 3abbc796-2515-44a8-972d-22dcf27ab54a
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-13 19:22:14 +09:00
parent 1d90af7c3c
commit 070032006e
73 changed files with 5922 additions and 4 deletions
--- a/services/pipeline/google-search/Dockerfile
+++ b/services/pipeline/google-search/Dockerfile
@ -0,0 +1,19 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install dependencies first so this layer stays cached until requirements.txt changes.
+# NOTE: the COPY sources are relative to the build context, so the context must be
+# the directory that contains both google-search/ and shared/.
+COPY ./google-search/requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the shared modules (imported by the service as the `shared` package)
+COPY ./shared /app/shared
+
+# Copy the Google Search service code
+COPY ./google-search /app
+
+# Environment: disable Python output buffering so logs are emitted immediately
+ENV PYTHONUNBUFFERED=1
+
+# Run the worker
+CMD ["python", "google_search.py"]
--- a/services/pipeline/google-search/google_search.py
+++ b/services/pipeline/google-search/google_search.py
@ -0,0 +1,153 @@
+"""
+Google Search Service
+Enriches RSS items with Google search results.
+"""
+import asyncio
+import logging
+import os
+import sys
+import json
+from typing import List, Dict, Any
+import aiohttp
+from datetime import datetime
+
+# Import from shared module
+from shared.models import PipelineJob, RSSItem, SearchResult, EnrichedItem
+from shared.queue_manager import QueueManager
+
+# Configure root logging at INFO level on import and create this module's logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class GoogleSearchWorker:
+    """Queue worker that enriches RSS items with Google Custom Search results.
+
+    Jobs are read from the ``search_enrichment`` queue. For each job the
+    first few RSS items are searched by title, the results are stored under
+    ``job.data['enriched_items']`` and the job is forwarded to the
+    ``ai_summarization`` queue.
+    """
+
+    # Maximum number of RSS items searched per job (API quota management).
+    MAX_ITEMS_PER_JOB = 5
+    # Pause after every search request, in seconds (API rate limiting).
+    SEARCH_DELAY_SECONDS = 0.5
+    # Total time budget for a single Custom Search HTTP request, in seconds.
+    REQUEST_TIMEOUT_SECONDS = 30
+
+    def __init__(self):
+        """Read the Redis URL and Google API credentials from the environment."""
+        self.queue_manager = QueueManager(
+            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
+        )
+        # Either credential may be None; _search_google() then returns no results.
+        self.google_api_key = os.getenv("GOOGLE_API_KEY")
+        self.search_engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
+        self.max_results_per_item = 3
+
+    async def start(self):
+        """Connect the queue manager and process jobs in an endless loop.
+
+        Errors raised inside one iteration are logged and followed by a
+        one-second pause, so a single failure never terminates the worker.
+        """
+        logger.info("Starting Google Search Worker")
+
+        # Connect to Redis
+        await self.queue_manager.connect()
+
+        # Main processing loop
+        while True:
+            try:
+                # Fetch the next job; a falsy result means there is nothing to do
+                job = await self.queue_manager.dequeue('search_enrichment', timeout=5)
+
+                if job:
+                    await self.process_job(job)
+
+            except Exception as e:
+                # logger.exception keeps the traceback that logger.error dropped
+                logger.exception(f"Error in worker loop: {e}")
+                await asyncio.sleep(1)
+
+    async def process_job(self, job: PipelineJob):
+        """Enrich the RSS items of *job* and hand it to the next stage.
+
+        Args:
+            job: Pipeline job whose ``data['rss_items']`` holds the keyword
+                arguments for ``RSSItem``, one dict per item.
+
+        Any exception is caught, logged, and reported through
+        ``queue_manager.mark_failed`` instead of being propagated.
+        """
+        try:
+            logger.info(f"Processing job {job.job_id} for search enrichment")
+
+            # A missing key and an explicit None are both treated as "no items"
+            rss_items = job.data.get('rss_items') or []
+            enriched_items = []
+
+            # Only the first few items are processed (API quota management)
+            for item_data in rss_items[:self.MAX_ITEMS_PER_JOB]:
+                rss_item = RSSItem(**item_data)
+
+                # Search Google using the item title
+                search_results = await self._search_google(rss_item.title)
+
+                enriched_item = EnrichedItem(
+                    rss_item=rss_item,
+                    search_results=search_results
+                )
+                enriched_items.append(enriched_item)
+
+                # API rate limiting
+                await asyncio.sleep(self.SEARCH_DELAY_SECONDS)
+
+            if enriched_items:
+                logger.info(f"Enriched {len(enriched_items)} items with search results")
+
+                # Hand over to the next stage
+                job.data['enriched_items'] = [
+                    self._to_dict(item) for item in enriched_items
+                ]
+                job.stages_completed.append('search_enrichment')
+                job.stage = 'ai_summarization'
+
+                await self.queue_manager.enqueue('ai_summarization', job)
+                await self.queue_manager.mark_completed('search_enrichment', job.job_id)
+            else:
+                logger.warning(f"No items enriched for job {job.job_id}")
+                await self.queue_manager.mark_failed(
+                    'search_enrichment',
+                    job,
+                    "No items to enrich"
+                )
+
+        except Exception as e:
+            logger.exception(f"Error processing job {job.job_id}: {e}")
+            await self.queue_manager.mark_failed('search_enrichment', job, str(e))
+
+    @staticmethod
+    def _to_dict(model):
+        """Serialize *model* to a dict, preferring ``model_dump()`` when it exists.
+
+        ``.dict()`` is deprecated in Pydantic v2 in favour of ``model_dump()``;
+        the fallback keeps objects that only provide ``.dict()`` working.
+        """
+        dump = getattr(model, "model_dump", None)
+        return dump() if callable(dump) else model.dict()
+
+    async def _search_google(self, query: str) -> List[SearchResult]:
+        """Query the Google Custom Search API and return the matching results.
+
+        Args:
+            query: Search text sent as the ``q`` parameter.
+
+        Returns:
+            One ``SearchResult`` per returned item. The list is empty when
+            credentials are missing, the API answers with a non-200 status,
+            or the request raises (failures are logged, never propagated).
+        """
+        results = []
+
+        if not self.google_api_key or not self.search_engine_id:
+            logger.warning("Google API credentials not configured")
+            return results
+
+        try:
+            url = "https://www.googleapis.com/customsearch/v1"
+            params = {
+                "key": self.google_api_key,
+                "cx": self.search_engine_id,
+                "q": query,
+                "num": self.max_results_per_item,
+                # Korean interface language and Korea geolocation hints
+                "hl": "ko",
+                "gl": "kr"
+            }
+
+            # ClientTimeout is the documented way to set a timeout; a bare
+            # number is only accepted for backward compatibility.
+            timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT_SECONDS)
+
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                async with session.get(url, params=params) as response:
+                    if response.status == 200:
+                        data = await response.json()
+
+                        for item in data.get('items', []):
+                            result = SearchResult(
+                                title=item.get('title', ''),
+                                link=item.get('link', ''),
+                                snippet=item.get('snippet', ''),
+                                source='google'
+                            )
+                            results.append(result)
+                    else:
+                        logger.error(f"Google API error: {response.status}")
+
+        except Exception as e:
+            # Best effort: a failed search yields whatever was collected so far
+            logger.error(f"Error searching Google for '{query}': {e}")
+
+        return results
+
+    async def stop(self):
+        """Disconnect the queue manager and log that the worker has stopped."""
+        await self.queue_manager.disconnect()
+        logger.info("Google Search Worker stopped")
+
+async def main():
+    """Run the Google Search worker until interrupted, then shut it down.
+
+    ``worker.stop()`` always runs, whether the worker ends through an
+    interrupt, a cancellation, or an unexpected error.
+    """
+    worker = GoogleSearchWorker()
+
+    try:
+        await worker.start()
+    except KeyboardInterrupt:
+        logger.info("Received interrupt signal")
+    except asyncio.CancelledError:
+        # With asyncio.run() on Python 3.11+, Ctrl-C cancels the main task
+        # instead of raising KeyboardInterrupt inside it, so the interrupt
+        # is logged here. Re-raise so the cancellation is not swallowed.
+        logger.info("Received interrupt signal")
+        raise
+    finally:
+        await worker.stop()
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/services/pipeline/google-search/requirements.txt
+++ b/services/pipeline/google-search/requirements.txt
@ -0,0 +1,3 @@
+aiohttp==3.9.1
+redis[hiredis]==5.0.1
+pydantic==2.5.0