feat: Implement async queue-based news pipeline with microservices

Major architectural transformation from synchronous to asynchronous processing:

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)

## Key Features
- Redis-based job queue with deduplication
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with site11_ prefix

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API: 3abbc796-2515-44a8-972d-22dcf27ab54a
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2025-09-13 19:22:14 +09:00
parent 1d90af7c3c
commit 070032006e
73 changed files with 5922 additions and 4 deletions

View File

@ -0,0 +1,19 @@
FROM python:3.11-slim
WORKDIR /app
# 의존성 설치
COPY ./google-search/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# 공통 모듈 복사
COPY ./shared /app/shared
# Google Search 코드 복사
COPY ./google-search /app
# 환경변수
ENV PYTHONUNBUFFERED=1
# 실행
CMD ["python", "google_search.py"]

View File

@ -0,0 +1,153 @@
"""
Google Search Service
Google 검색으로 RSS 항목 강화
"""
import asyncio
import logging
import os
import sys
import json
from typing import List, Dict, Any
import aiohttp
from datetime import datetime
# Import from shared module
from shared.models import PipelineJob, RSSItem, SearchResult, EnrichedItem
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class GoogleSearchWorker:
def __init__(self):
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.google_api_key = os.getenv("GOOGLE_API_KEY")
self.search_engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
self.max_results_per_item = 3
async def start(self):
"""워커 시작"""
logger.info("Starting Google Search Worker")
# Redis 연결
await self.queue_manager.connect()
# 메인 처리 루프
while True:
try:
# 큐에서 작업 가져오기
job = await self.queue_manager.dequeue('search_enrichment', timeout=5)
if job:
await self.process_job(job)
except Exception as e:
logger.error(f"Error in worker loop: {e}")
await asyncio.sleep(1)
async def process_job(self, job: PipelineJob):
"""검색 강화 작업 처리"""
try:
logger.info(f"Processing job {job.job_id} for search enrichment")
rss_items = job.data.get('rss_items', [])
enriched_items = []
# 최대 5개 항목만 처리 (API 할당량 관리)
for item_data in rss_items[:5]:
rss_item = RSSItem(**item_data)
# 제목으로 Google 검색
search_results = await self._search_google(rss_item.title)
enriched_item = EnrichedItem(
rss_item=rss_item,
search_results=search_results
)
enriched_items.append(enriched_item)
# API 속도 제한
await asyncio.sleep(0.5)
if enriched_items:
logger.info(f"Enriched {len(enriched_items)} items with search results")
# 다음 단계로 전달
job.data['enriched_items'] = [item.dict() for item in enriched_items]
job.stages_completed.append('search_enrichment')
job.stage = 'ai_summarization'
await self.queue_manager.enqueue('ai_summarization', job)
await self.queue_manager.mark_completed('search_enrichment', job.job_id)
else:
logger.warning(f"No items enriched for job {job.job_id}")
await self.queue_manager.mark_failed(
'search_enrichment',
job,
"No items to enrich"
)
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")
await self.queue_manager.mark_failed('search_enrichment', job, str(e))
async def _search_google(self, query: str) -> List[SearchResult]:
"""Google Custom Search API 호출"""
results = []
if not self.google_api_key or not self.search_engine_id:
logger.warning("Google API credentials not configured")
return results
try:
url = "https://www.googleapis.com/customsearch/v1"
params = {
"key": self.google_api_key,
"cx": self.search_engine_id,
"q": query,
"num": self.max_results_per_item,
"hl": "ko",
"gl": "kr"
}
async with aiohttp.ClientSession() as session:
async with session.get(url, params=params, timeout=30) as response:
if response.status == 200:
data = await response.json()
for item in data.get('items', []):
result = SearchResult(
title=item.get('title', ''),
link=item.get('link', ''),
snippet=item.get('snippet', ''),
source='google'
)
results.append(result)
else:
logger.error(f"Google API error: {response.status}")
except Exception as e:
logger.error(f"Error searching Google for '{query}': {e}")
return results
async def stop(self):
"""워커 중지"""
await self.queue_manager.disconnect()
logger.info("Google Search Worker stopped")
async def main():
"""메인 함수"""
worker = GoogleSearchWorker()
try:
await worker.start()
except KeyboardInterrupt:
logger.info("Received interrupt signal")
finally:
await worker.stop()
if __name__ == "__main__":
asyncio.run(main())

View File

@ -0,0 +1,3 @@
aiohttp==3.9.1
redis[hiredis]==5.0.1
pydantic==2.5.0