feat: Implement async queue-based news pipeline with microservices
Major architectural transformation from synchronous to asynchronous processing:

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)

## Key Features
- Redis-based job queue with deduplication
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with site11_ prefix

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API: 3abbc796-2515-44a8-972d-22dcf27ab54a
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
1
services/pipeline/shared/__init__.py
Normal file
1
services/pipeline/shared/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
# Shared modules for pipeline services
|
||||
113
services/pipeline/shared/models.py
Normal file
113
services/pipeline/shared/models.py
Normal file
@ -0,0 +1,113 @@
|
||||
"""
|
||||
Pipeline Data Models
|
||||
파이프라인 전체에서 사용되는 공통 데이터 모델
|
||||
"""
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any, Optional
|
||||
from pydantic import BaseModel, Field
|
||||
import uuid
|
||||
|
||||
class KeywordSubscription(BaseModel):
    """Keyword subscription processed by the pipeline on a cron schedule.

    Attributes:
        keyword_id: Unique identifier; defaults to a random UUID4 string.
        keyword: The subscribed keyword (required).
        language: Language code for the subscription; defaults to ``"ko"``.
        schedule: Five-field cron expression; the default fires every
            30 minutes.
        is_active: Active flag; defaults to ``True``.
        is_priority: Priority flag; defaults to ``False``.
        last_processed: Timestamp of the last processing run, or ``None``.
        rss_feeds: List of strings, empty by default (presumably feed
            URLs -- confirm against the RSS collector).
        categories: List of category names, empty by default.
        created_at: Creation timestamp; defaults to ``datetime.now()``
            (naive local time).
        owner: Optional owner value; ``None`` by default.
    """

    keyword_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    keyword: str
    language: str = "ko"
    # The step belongs in the MINUTE field to fire every 30 minutes. The
    # previous default "0 */30 * * *" put "*/30" in the hour field (range
    # 0-23), which is not a 30-minute interval: it can match at most hour 0,
    # and parsers that validate step sizes may reject it outright.
    schedule: str = "*/30 * * * *"  # Cron expression (every 30 minutes)
    is_active: bool = True
    is_priority: bool = False
    last_processed: Optional[datetime] = None
    rss_feeds: List[str] = Field(default_factory=list)
    categories: List[str] = Field(default_factory=list)
    created_at: datetime = Field(default_factory=datetime.now)
    owner: Optional[str] = None
|
||||
|
||||
class PipelineJob(BaseModel):
    """A unit of work that travels through the pipeline queues.

    Attributes:
        job_id: Unique identifier; defaults to a random UUID4 string.
        keyword_id: Identifier of the keyword this job belongs to.
        keyword: The keyword text.
        stage: Name of the current stage.
        stages_completed: Names of stages already finished; empty by default.
        data: Free-form payload dictionary; empty by default.
        retry_count: Number of retries so far; starts at 0.
        max_retries: Retry limit; defaults to 3.
        priority: Integer priority; defaults to 0.
        created_at: Creation timestamp; defaults to ``datetime.now()``.
        updated_at: Update timestamp; defaults to ``datetime.now()``.
    """

    # Identity
    job_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    keyword_id: str
    keyword: str

    # Progress through the pipeline
    stage: str  # current stage
    stages_completed: List[str] = Field(default_factory=list)
    data: Dict[str, Any] = Field(default_factory=dict)

    # Retry and ordering controls
    retry_count: int = 0
    max_retries: int = 3
    priority: int = 0

    # Timestamps (naive local time, as produced by datetime.now)
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)
|
||||
|
||||
class RSSItem(BaseModel):
    """A single RSS feed item.

    Attributes:
        item_id: Unique identifier; defaults to a random UUID4 string.
        title: Item title.
        link: Item link.
        published: Publication value kept as a string, or ``None``.
        summary: Item summary text, or ``None``.
        source_feed: The feed this item came from.
    """

    item_id: str = Field(default_factory=lambda: str(uuid.uuid4()))

    # Required content fields
    title: str
    link: str

    # Optional fields; both default to None
    published: Optional[str] = None
    summary: Optional[str] = None

    source_feed: str
|
||||
|
||||
class SearchResult(BaseModel):
    """A single search result.

    Attributes:
        title: Result title.
        link: Result link.
        snippet: Optional snippet text; ``None`` by default.
        source: Name of the result source; defaults to ``"google"``.
    """

    title: str
    link: str
    snippet: Optional[str] = None
    source: str = "google"
|
||||
|
||||
class EnrichedItem(BaseModel):
    """An RSS item together with its search results.

    Attributes:
        rss_item: The underlying ``RSSItem``.
        search_results: ``SearchResult`` entries attached to the item;
            empty by default.
    """

    rss_item: RSSItem
    search_results: List[SearchResult] = Field(default_factory=list)
|
||||
|
||||
class SummarizedItem(BaseModel):
    """An enriched item with an AI-generated summary.

    Attributes:
        enriched_item: The ``EnrichedItem`` that was summarized.
        ai_summary: The summary text.
        summary_language: Language code of the summary; defaults to ``"ko"``.
    """

    enriched_item: EnrichedItem
    ai_summary: str
    summary_language: str = "ko"
|
||||
|
||||
class TranslatedItem(BaseModel):
    """A summarized item with English title and summary.

    NOTE(review): this module defines a second class named
    ``TranslatedItem`` further down, which rebinds the module-level name.
    Code importing ``TranslatedItem`` from this module gets the later
    definition, not this one -- confirm which schema is intended.

    Attributes:
        summarized_item: The ``SummarizedItem`` that was translated.
        title_en: English title.
        summary_en: English summary.
    """

    summarized_item: SummarizedItem
    title_en: str
    summary_en: str
|
||||
|
||||
class ItemWithImage(BaseModel):
    """A translated item with an attached image.

    Attributes:
        translated_item: The ``TranslatedItem`` the image belongs to.
        image_url: URL of the image.
        image_prompt: Prompt associated with the image.
    """

    translated_item: TranslatedItem
    image_url: str
    image_prompt: str
|
||||
|
||||
class FinalArticle(BaseModel):
    """The final assembled article.

    Attributes:
        article_id: Unique identifier; defaults to a random UUID4 string.
        job_id: Identifier of the job that produced the article.
        keyword_id: Identifier of the keyword.
        keyword: The keyword text.
        title: Article title.
        content: Article body.
        summary: Article summary.
        source_items: The ``ItemWithImage`` entries the article is built from.
        images: List of image strings (presumably URLs -- confirm with
            the assembly service).
        categories: Category names; empty by default.
        tags: Tag names; empty by default.
        created_at: Creation timestamp; defaults to ``datetime.now()``.
        pipeline_stages: Names of pipeline stages.
        processing_time: Processing time in seconds.
    """

    # Identity and provenance
    article_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    job_id: str
    keyword_id: str
    keyword: str

    # Article text
    title: str
    content: str
    summary: str

    # Source material and media
    source_items: List[ItemWithImage]
    images: List[str]

    # Classification
    categories: List[str] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)

    # Bookkeeping
    created_at: datetime = Field(default_factory=datetime.now)
    pipeline_stages: List[str]
    processing_time: float  # seconds
|
||||
|
||||
class TranslatedItem(BaseModel):
    """A translated item that carries its summarized item as a plain dict.

    NOTE(review): this is the second ``TranslatedItem`` class in this
    module. It rebinds the module-level name, so the earlier definition
    (with ``title_en`` / ``summary_en``) is no longer reachable by name,
    while ``ItemWithImage`` above was declared against that earlier class.
    Confirm which schema is intended and rename or remove the other.

    Attributes:
        summarized_item: A ``SummarizedItem`` represented as a dictionary.
        translated_title: Translated title.
        translated_summary: Translated summary.
        target_language: Target language code; defaults to ``'en'``.
    """

    summarized_item: Dict[str, Any]  # SummarizedItem as dict
    translated_title: str
    translated_summary: str
    target_language: str = 'en'
|
||||
|
||||
class GeneratedImageItem(BaseModel):
    """An item with a generated image, carrying its translated item as a dict.

    Attributes:
        translated_item: A ``TranslatedItem`` represented as a dictionary.
        image_url: URL of the image.
        image_prompt: Prompt associated with the image.
    """

    translated_item: Dict[str, Any]  # TranslatedItem as dict
    image_url: str
    image_prompt: str
|
||||
|
||||
class QueueMessage(BaseModel):
    """Envelope that wraps a ``PipelineJob`` for transport on a queue.

    Attributes:
        message_id: Unique identifier; defaults to a random UUID4 string.
        queue_name: Name of the queue the message is addressed to.
        job: The wrapped ``PipelineJob``.
        timestamp: Message timestamp; defaults to ``datetime.now()``.
        retry_count: Message-level retry counter; starts at 0.
    """

    message_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    queue_name: str
    job: PipelineJob

    # Envelope metadata
    timestamp: datetime = Field(default_factory=datetime.now)
    retry_count: int = 0
|
||||
173
services/pipeline/shared/queue_manager.py
Normal file
173
services/pipeline/shared/queue_manager.py
Normal file
@ -0,0 +1,173 @@
|
||||
"""
|
||||
Queue Manager
|
||||
Redis 기반 큐 관리 시스템
|
||||
"""
|
||||
import redis.asyncio as redis
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional, Dict, Any, List
|
||||
from datetime import datetime
|
||||
|
||||
from .models import PipelineJob, QueueMessage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class QueueManager:
    """Redis-backed job queue manager.

    Jobs are wrapped in ``QueueMessage`` envelopes and stored as JSON strings
    in Redis lists. A job handed out by ``dequeue`` is mirrored in the
    ``processing:<queue_name>`` hash until ``mark_completed`` or
    ``mark_failed`` removes it. Counters are kept in the ``stats:queues``,
    ``stats:completed`` and ``stats:failed`` hashes.

    ``connect`` must be awaited before any other method is used; until then
    ``redis_client`` is ``None``.
    """

    # Logical queue name -> Redis list key. Names not listed here fall back
    # to "queue:<name>" (see _queue_key).
    QUEUES = {
        "keyword_processing": "queue:keyword",
        "rss_collection": "queue:rss",
        "search_enrichment": "queue:search",
        "ai_summarization": "queue:summarize",
        "translation": "queue:translate",
        "image_generation": "queue:image",
        "article_assembly": "queue:assembly",
        "failed": "queue:failed",
        "scheduled": "queue:scheduled"
    }

    def __init__(self, redis_url: str = "redis://redis:6379"):
        """Store the Redis URL; no connection is opened here."""
        self.redis_url = redis_url
        self.redis_client: Optional[redis.Redis] = None

    def _queue_key(self, queue_name: str) -> str:
        """Return the Redis list key for a logical queue name."""
        return self.QUEUES.get(queue_name, f"queue:{queue_name}")

    async def connect(self):
        """Open the Redis connection if one is not already open."""
        if not self.redis_client:
            self.redis_client = await redis.from_url(
                self.redis_url,
                encoding="utf-8",
                decode_responses=True
            )
            logger.info("Connected to Redis")

    async def disconnect(self):
        """Close the Redis connection, if any, and forget the client."""
        if self.redis_client:
            await self.redis_client.close()
            self.redis_client = None

    async def enqueue(self, queue_name: str, job: PipelineJob, priority: int = 0) -> str:
        """Add a job to a queue.

        Args:
            queue_name: Logical queue name (see ``QUEUES``).
            job: The job to enqueue.
            priority: When greater than 0 the job is pushed to the head of
                the list (``dequeue`` pops from the head); otherwise it is
                appended to the tail.

        Returns:
            The ``job_id`` of the enqueued job.

        Raises:
            Exception: Any error from serialization or Redis is logged and
                re-raised.
        """
        try:
            queue_key = self._queue_key(queue_name)

            message = QueueMessage(
                queue_name=queue_name,
                job=job
            )
            # pydantic v2 API; .json() is the deprecated v1 spelling.
            payload = message.model_dump_json()

            # Head of the list for prioritized jobs, tail otherwise.
            if priority > 0:
                await self.redis_client.lpush(queue_key, payload)
            else:
                await self.redis_client.rpush(queue_key, payload)

            # Update statistics
            await self.redis_client.hincrby("stats:queues", queue_name, 1)

            logger.info(f"Job {job.job_id} enqueued to {queue_name}")
            return job.job_id

        except Exception as e:
            logger.error(f"Failed to enqueue job: {e}")
            raise

    async def dequeue(self, queue_name: str, timeout: int = 0) -> Optional[PipelineJob]:
        """Take the next job from a queue.

        Args:
            queue_name: Logical queue name (see ``QUEUES``).
            timeout: When greater than 0, wait with BLPOP using this timeout;
                otherwise do a single non-blocking LPOP.

        Returns:
            The dequeued ``PipelineJob``, or ``None`` when the queue is empty,
            the wait timed out, or any error occurred (errors are logged, not
            raised).
        """
        try:
            queue_key = self._queue_key(queue_name)

            if timeout > 0:
                result = await self.redis_client.blpop(queue_key, timeout=timeout)
                if result:
                    # BLPOP returns a (key, value) pair.
                    _, data = result
                else:
                    return None
            else:
                data = await self.redis_client.lpop(queue_key)

            if data:
                # pydantic v2 API; .parse_raw() is the deprecated v1 spelling.
                message = QueueMessage.model_validate_json(data)

                # Record the job as in progress.
                processing_key = f"processing:{queue_name}"
                await self.redis_client.hset(
                    processing_key,
                    message.job.job_id,
                    message.model_dump_json()
                )

                return message.job

            return None

        except Exception as e:
            logger.error(f"Failed to dequeue job: {e}")
            return None

    async def mark_completed(self, queue_name: str, job_id: str):
        """Remove a job from the processing hash and count it as completed.

        Errors are logged and swallowed.
        """
        try:
            processing_key = f"processing:{queue_name}"
            await self.redis_client.hdel(processing_key, job_id)

            # Update statistics
            await self.redis_client.hincrby("stats:completed", queue_name, 1)

            logger.info(f"Job {job_id} completed in {queue_name}")

        except Exception as e:
            logger.error(f"Failed to mark job as completed: {e}")

    async def mark_failed(self, queue_name: str, job: PipelineJob, error: str):
        """Handle a failed job: retry it or move it to the failed queue.

        The job is removed from the processing hash. While
        ``job.retry_count`` is below ``job.max_retries`` the counter is
        incremented and the job is re-enqueued on the same queue. Otherwise
        the error text and the failing queue name are stored in ``job.data``
        and the job is enqueued on the ``"failed"`` queue.

        Errors raised while doing this are logged and swallowed.
        """
        try:
            processing_key = f"processing:{queue_name}"
            await self.redis_client.hdel(processing_key, job.job_id)

            # Retry while the job still has attempts left.
            if job.retry_count < job.max_retries:
                job.retry_count += 1
                await self.enqueue(queue_name, job)
                logger.info(f"Job {job.job_id} requeued (retry {job.retry_count}/{job.max_retries})")
            else:
                # Move to the failed queue.
                job.data["error"] = error
                job.data["failed_stage"] = queue_name
                await self.enqueue("failed", job)

                # Update statistics
                await self.redis_client.hincrby("stats:failed", queue_name, 1)
                logger.error(f"Job {job.job_id} failed: {error}")

        except Exception as e:
            logger.error(f"Failed to mark job as failed: {e}")

    async def get_queue_stats(self) -> Dict[str, Any]:
        """Collect per-queue sizes plus completed/failed counters.

        Returns:
            A dict mapping each name in ``QUEUES`` to
            ``{"pending": <list length>, "processing": <hash size>}``, plus
            ``"completed"`` and ``"failed"`` entries holding the contents of
            the ``stats:completed`` / ``stats:failed`` hashes. An empty dict
            is returned if any error occurs (the error is logged).
        """
        try:
            stats = {}

            for name, key in self.QUEUES.items():
                stats[name] = {
                    "pending": await self.redis_client.llen(key),
                    "processing": await self.redis_client.hlen(f"processing:{name}"),
                }

            # Completed / failed counters.
            # NOTE(review): QUEUES also has a "failed" entry, so the second
            # assignment below replaces that queue's pending/processing
            # counts with the failure counters. Left unchanged because
            # callers may rely on the current shape.
            stats["completed"] = await self.redis_client.hgetall("stats:completed") or {}
            stats["failed"] = await self.redis_client.hgetall("stats:failed") or {}

            return stats

        except Exception as e:
            logger.error(f"Failed to get queue stats: {e}")
            return {}

    async def clear_queue(self, queue_name: str):
        """Delete a queue's list and its processing hash (for testing)."""
        queue_key = self._queue_key(queue_name)
        await self.redis_client.delete(queue_key)
        await self.redis_client.delete(f"processing:{queue_name}")
        logger.info(f"Queue {queue_name} cleared")
|
||||
5
services/pipeline/shared/requirements.txt
Normal file
5
services/pipeline/shared/requirements.txt
Normal file
@ -0,0 +1,5 @@
|
||||
redis[hiredis]==5.0.1
|
||||
motor==3.1.1
|
||||
pymongo==4.3.3
|
||||
pydantic==2.5.0
|
||||
python-dateutil==2.8.2
|
||||
Reference in New Issue
Block a user