Initial commit - cleaned repository

This commit is contained in:
jungwoo choi
2025-09-28 20:41:57 +09:00
commit e3c28f796a
188 changed files with 28102 additions and 0 deletions

View File

@ -0,0 +1 @@
# Shared modules for pipeline services

View File

@ -0,0 +1,159 @@
"""
Pipeline Data Models
파이프라인 전체에서 사용되는 공통 데이터 모델
"""
from datetime import datetime
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
import uuid
class KeywordSubscription(BaseModel):
    """Keyword subscription model.

    Describes one subscribed keyword together with its language, its
    processing schedule, and the RSS feeds / categories attached to it.
    """

    # Unique identifier; a fresh UUID4 string is generated when omitted.
    keyword_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    keyword: str
    language: str = "ko"
    # Cron expression in the standard five-field layout
    # (minute hour day-of-month month day-of-week). "*/30" in the minute
    # field fires every 30 minutes. The former default "0 */30 * * *" put the
    # step in the hour field, which only matches hour 0 (once a day).
    schedule: str = "*/30 * * * *"
    is_active: bool = True
    is_priority: bool = False
    # Timestamp of the most recent processing run; None until one is recorded.
    last_processed: Optional[datetime] = None
    rss_feeds: List[str] = Field(default_factory=list)
    categories: List[str] = Field(default_factory=list)
    # Naive local time taken at instantiation (datetime.now without tz).
    created_at: datetime = Field(default_factory=datetime.now)
    owner: Optional[str] = None
class PipelineJob(BaseModel):
    """Pipeline job model.

    A unit of work that travels through the pipeline for one keyword,
    carrying its current stage, the stages already finished, an arbitrary
    data payload, and retry bookkeeping.
    """

    # Unique identifier; a fresh UUID4 string is generated when omitted.
    job_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    keyword_id: str
    keyword: str
    # Name of the stage the job is currently in.
    stage: str
    stages_completed: List[str] = Field(default_factory=list)
    # Free-form payload carried with the job.
    data: Dict[str, Any] = Field(default_factory=dict)
    retry_count: int = Field(default=0)
    max_retries: int = Field(default=3)
    priority: int = Field(default=0)
    # Both timestamps default to naive local time at instantiation.
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)
class RSSItem(BaseModel):
    """RSS feed item.

    One entry read from an RSS feed, tagged with the feed it came from.
    """

    # Unique identifier; a fresh UUID4 string is generated when omitted.
    item_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    title: str
    link: str
    # RSS GUID, used for deduplication.
    guid: Optional[str] = Field(default=None)
    # Publication time kept as a plain string (not parsed here).
    published: Optional[str] = Field(default=None)
    summary: Optional[str] = Field(default=None)
    source_feed: str
class SearchResult(BaseModel):
    """Search result.

    A single hit returned by a search provider.
    """

    title: str
    link: str
    snippet: Optional[str] = Field(default=None)
    # Provider label; defaults to "google".
    source: str = Field(default="google")
class EnrichedItem(BaseModel):
    """Enriched news item.

    Pairs an RSS item with the search results collected for it.
    """

    rss_item: RSSItem
    # Empty list when no search results are attached.
    search_results: List[SearchResult] = Field(default_factory=list)
class SummarizedItem(BaseModel):
    """Summarized item.

    Wraps an enriched item together with its AI-generated summary.
    """

    enriched_item: EnrichedItem
    ai_summary: str
    # Language code of the summary; defaults to "ko".
    summary_language: str = Field(default="ko")
class TranslatedItem(BaseModel):
    """Translated item.

    Wraps a summarized item together with its English title and summary.
    """

    # NOTE(review): this module defines a second class named TranslatedItem
    # further down with different fields (translated_title,
    # translated_summary, target_language). That later definition rebinds the
    # module-level name, so importers of `TranslatedItem` get the later class.
    # Confirm which shape is intended.
    summarized_item: SummarizedItem
    title_en: str
    summary_en: str
class ItemWithImage(BaseModel):
    """Item with an image attached.

    Wraps a translated item together with the image URL and the prompt
    recorded for that image.
    """

    # NOTE(review): TranslatedItem is defined twice in this module; this
    # annotation presumably binds to the definition that precedes this class
    # (title_en / summary_en) -- confirm that is the intended shape.
    translated_item: TranslatedItem
    image_url: str
    image_prompt: str
class Subtopic(BaseModel):
    """Article subtopic.

    A titled section of an article.
    """

    title: str
    # Section body, one list entry per paragraph.
    content: List[str]
class Entities(BaseModel):
    """Named entities.

    Lists of entity names grouped by kind; every list defaults to empty.
    """

    people: List[str] = Field(default_factory=list)
    organizations: List[str] = Field(default_factory=list)
    groups: List[str] = Field(default_factory=list)
    countries: List[str] = Field(default_factory=list)
    events: List[str] = Field(default_factory=list)
class NewsReference(BaseModel):
    """News reference.

    Identifies a source news item by title, link and source name.
    """

    title: str
    link: str
    source: str
    # Publication time kept as a plain string (not parsed here).
    published: Optional[str] = Field(default=None)
class FinalArticle(BaseModel):
    """Final article -- intended to match the ai_writer_db.articles schema."""

    # Unique identifier; a fresh UUID4 string is generated when omitted.
    news_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    title: str
    # Stored as an ISO-8601 string (naive local time), not a datetime object.
    created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
    summary: str
    subtopics: List[Subtopic] = Field(default_factory=list)
    categories: List[str] = Field(default_factory=list)
    entities: Entities = Field(default_factory=Entities)
    source_keyword: str
    source_count: int = 1

    # Reference news information.
    references: List[NewsReference] = Field(default_factory=list)

    # Additional pipeline-related fields.
    job_id: Optional[str] = None
    keyword_id: Optional[str] = None
    pipeline_stages: List[str] = Field(default_factory=list)
    processing_time: Optional[float] = None

    # Multilingual support.
    language: str = 'ko'
    ref_news_id: Optional[str] = None

    # GUID used for RSS duplicate checks.
    rss_guid: Optional[str] = None

    # Image-related fields.
    image_prompt: Optional[str] = None
    images: List[str] = Field(default_factory=list)

    # Translation tracking.
    translated_languages: List[str] = Field(default_factory=list)
class TranslatedItem(BaseModel):
    """Translated item.

    Holds a summarized item (as a plain dict) with its translated title and
    summary and the language they were translated into.
    """

    # NOTE(review): a class with this same name is already defined earlier in
    # this module with different fields (title_en, summary_en); this
    # definition rebinds the module-level name. Confirm which one is intended.

    # SummarizedItem carried as a dict rather than as a model instance.
    summarized_item: Dict[str, Any]
    translated_title: str
    translated_summary: str
    target_language: str = Field(default='en')
class GeneratedImageItem(BaseModel):
    """Item with a generated image.

    Holds a translated item (as a plain dict) with the image URL and the
    prompt recorded for that image.
    """

    # TranslatedItem carried as a dict rather than as a model instance.
    translated_item: Dict[str, Any]
    image_url: str
    image_prompt: str
class QueueMessage(BaseModel):
    """Queue message.

    Envelope that carries a PipelineJob together with the name of the queue
    it belongs to.
    """

    # Unique identifier; a fresh UUID4 string is generated when omitted.
    message_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    queue_name: str
    job: PipelineJob
    # Naive local time taken when the message object is created.
    timestamp: datetime = Field(default_factory=datetime.now)
    retry_count: int = Field(default=0)
class Keyword(BaseModel):
    """Keyword model for the scheduler."""

    # Unique identifier; a fresh UUID4 string is generated when omitted.
    keyword_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    keyword: str
    # Run interval in minutes; the default is one hour.
    interval_minutes: int = 60
    is_active: bool = True
    last_run: Optional[datetime] = None
    next_run: Optional[datetime] = None
    # Both timestamps default to naive local time at instantiation.
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)
    # Custom RSS feeds.
    rss_feeds: List[str] = Field(default_factory=list)
    # Priority (higher values take precedence).
    priority: int = 0
    # Maximum number of articles per run.
    max_articles_per_run: int = 100

View File

@ -0,0 +1,176 @@
"""
Queue Manager
Redis 기반 큐 관리 시스템
"""
import redis.asyncio as redis
import json
import logging
from typing import Optional, Dict, Any, List
from datetime import datetime
from .models import PipelineJob, QueueMessage
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class QueueManager:
    """Redis-backed queue manager.

    Each logical queue is stored as a Redis list. While a job is being worked
    on, its message is kept in a per-queue hash ``processing:<queue_name>``
    keyed by job id. Counters are kept in the hashes ``stats:queues``,
    ``stats:completed`` and ``stats:failed``.

    ``connect()`` must be awaited before any queue operation is used.
    """

    # Logical queue name -> Redis list key. Names missing from this table
    # fall back to "queue:<name>" in the methods below.
    QUEUES = {
        "keyword_processing": "queue:keyword_processing",
        "rss_collection": "queue:rss_collection",
        "search_enrichment": "queue:search_enrichment",
        "google_search": "queue:google_search",
        "ai_article_generation": "queue:ai_article_generation",
        "image_generation": "queue:image_generation",
        "translation": "queue:translation",
        "failed": "queue:failed",
        "scheduled": "queue:scheduled"
    }

    def __init__(self, redis_url: str = "redis://redis:6379"):
        """Store the Redis URL; no connection is opened here."""
        self.redis_url = redis_url
        self.redis_client: Optional[redis.Redis] = None

    async def connect(self):
        """Open the Redis connection if one is not already held."""
        if not self.redis_client:
            self.redis_client = await redis.from_url(
                self.redis_url,
                encoding="utf-8",
                decode_responses=True
            )
            logger.info("Connected to Redis")

    async def disconnect(self):
        """Close the Redis connection, if any, and forget the client."""
        if self.redis_client:
            await self.redis_client.close()
            self.redis_client = None

    async def enqueue(self, queue_name: str, job: PipelineJob, priority: int = 0) -> str:
        """Add a job to a queue.

        Args:
            queue_name: Logical queue name.
            job: Job to wrap in a QueueMessage and push.
            priority: When greater than 0 the message is pushed to the head
                of the list, otherwise to the tail.

        Returns:
            The job's ``job_id``.

        Raises:
            Exception: Any error raised while pushing is logged and re-raised.
        """
        try:
            queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")
            message = QueueMessage(
                queue_name=queue_name,
                job=job
            )
            # pydantic v2 serializer; .json() is deprecated in v2.
            payload = message.model_dump_json()

            # dequeue() pops from the head, so a head push jumps the line.
            if priority > 0:
                await self.redis_client.lpush(queue_key, payload)
            else:
                await self.redis_client.rpush(queue_key, payload)

            # Update statistics.
            await self.redis_client.hincrby("stats:queues", queue_name, 1)

            logger.info("Job %s enqueued to %s", job.job_id, queue_name)
            return job.job_id
        except Exception as e:
            logger.error("Failed to enqueue job: %s", e)
            raise

    async def dequeue(self, queue_name: str, timeout: int = 0) -> Optional[PipelineJob]:
        """Take the next job from a queue.

        Args:
            queue_name: Logical queue name.
            timeout: When greater than 0, block for up to this many seconds
                (BLPOP); otherwise do a single non-blocking LPOP.

        Returns:
            The dequeued job, or None when the queue is empty, the blocking
            pop timed out, or an error occurred (errors are logged, not
            raised).
        """
        try:
            queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")
            logger.info("Attempting to dequeue from %s with timeout=%s", queue_key, timeout)

            if timeout > 0:
                result = await self.redis_client.blpop(queue_key, timeout)
                if result:
                    # BLPOP yields a (key, value) pair; only the value is needed.
                    _, data = result
                    logger.info("Dequeued item from %s", queue_key)
                else:
                    logger.debug("No item available in %s", queue_key)
                    return None
            else:
                data = await self.redis_client.lpop(queue_key)

            if data:
                # pydantic v2 parser; parse_raw() is deprecated in v2.
                message = QueueMessage.model_validate_json(data)

                # Record the message in the processing hash.
                processing_key = f"processing:{queue_name}"
                await self.redis_client.hset(
                    processing_key,
                    message.job.job_id,
                    message.model_dump_json()
                )
                return message.job

            return None
        except Exception as e:
            # Best-effort contract: callers get None. Keep the traceback so a
            # popped-but-unparseable message can still be diagnosed.
            logger.exception("Failed to dequeue job: %s", e)
            return None

    async def mark_completed(self, queue_name: str, job_id: str):
        """Mark a job as completed.

        Removes the job from the queue's processing hash and increments the
        completed counter. Errors are logged, not raised.
        """
        try:
            processing_key = f"processing:{queue_name}"
            await self.redis_client.hdel(processing_key, job_id)

            # Update statistics.
            await self.redis_client.hincrby("stats:completed", queue_name, 1)

            logger.info("Job %s completed in %s", job_id, queue_name)
        except Exception as e:
            logger.exception("Failed to mark job as completed: %s", e)

    async def mark_failed(self, queue_name: str, job: PipelineJob, error: str):
        """Handle a failed job.

        Removes the job from the processing hash. While retries remain, the
        retry counter is bumped and the job is re-enqueued on the same queue;
        otherwise the error and failing stage are recorded in ``job.data``
        and the job is moved to the "failed" queue. Errors raised here are
        logged, not raised.

        Args:
            queue_name: Queue the job was being processed from.
            job: The failed job; mutated in place.
            error: Error description stored on permanent failure.
        """
        try:
            # Removal stays ahead of the re-enqueue: a worker could otherwise
            # pick the job up and register it before this hdel runs.
            processing_key = f"processing:{queue_name}"
            await self.redis_client.hdel(processing_key, job.job_id)

            # Check whether retries remain.
            if job.retry_count < job.max_retries:
                job.retry_count += 1
                await self.enqueue(queue_name, job)
                logger.info(
                    "Job %s requeued (retry %s/%s)",
                    job.job_id, job.retry_count, job.max_retries
                )
            else:
                # Move to the failed queue.
                job.data["error"] = error
                job.data["failed_stage"] = queue_name
                await self.enqueue("failed", job)

                # Update statistics.
                await self.redis_client.hincrby("stats:failed", queue_name, 1)
                logger.error("Job %s failed: %s", job.job_id, error)
        except Exception as e:
            logger.exception("Failed to mark job as failed: %s", e)

    async def get_queue_stats(self) -> Dict[str, Any]:
        """Return queue statistics.

        Returns:
            A dict with one entry per name in QUEUES holding ``pending``
            (list length) and ``processing`` (processing-hash size), plus
            ``completed`` and ``failed`` counter hashes. An empty dict is
            returned when an error occurs (the error is logged).
        """
        try:
            stats = {}
            for name, key in self.QUEUES.items():
                stats[name] = {
                    "pending": await self.redis_client.llen(key),
                    "processing": await self.redis_client.hlen(f"processing:{name}"),
                }

            # Completed / failed counters. These assignments come after the
            # loop, so the "failed" key replaces the per-queue entry that the
            # loop stored for the queue named "failed".
            stats["completed"] = await self.redis_client.hgetall("stats:completed") or {}
            stats["failed"] = await self.redis_client.hgetall("stats:failed") or {}

            return stats
        except Exception as e:
            logger.exception("Failed to get queue stats: %s", e)
            return {}

    async def clear_queue(self, queue_name: str):
        """Reset a queue (for tests): delete its list and processing hash."""
        queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")
        await self.redis_client.delete(queue_key)
        await self.redis_client.delete(f"processing:{queue_name}")
        logger.info("Queue %s cleared", queue_name)

View File

@ -0,0 +1,5 @@
redis[hiredis]==5.0.1
motor==3.1.1
pymongo==4.3.3
pydantic==2.5.0
python-dateutil==2.8.2