feat: Implement async queue-based news pipeline with microservices

Major architectural transformation from synchronous to asynchronous, queue-based processing (the stage chain is sketched after the service list below):

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)
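
Jobs flow through one Redis queue per stage. Only the `rss_collection` stage name is confirmed by the scheduler code in this commit; the other stage names below are illustrative assumptions:

```python
# Hypothetical stage chain; only 'rss_collection' appears in the
# scheduler code in this commit -- the remaining names are assumed.
PIPELINE_STAGES = [
    "rss_collection",     # pipeline-rss-collector
    "google_search",      # pipeline-google-search (assumed)
    "ai_summarization",   # pipeline-ai-summarizer (assumed)
    "translation",        # pipeline-translator (assumed)
    "image_generation",   # pipeline-image-generator (assumed)
    "article_assembly",   # pipeline-article-assembly (assumed)
]
```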

## Key Features
- Redis-based job queue with deduplication (see the sketch after this list)
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with site11_ prefix
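
A minimal sketch of how queue-level deduplication could work with the 7-day TTL, assuming a `redis.asyncio` client; the key names and the `enqueue_once` function are illustrative, not the actual QueueManager API:

```python
# Minimal dedup sketch; key layout and function name are assumptions.
import json
import redis.asyncio as redis

async def enqueue_once(r: redis.Redis, stage: str, job_id: str, payload: dict) -> bool:
    """Push a job onto a stage queue unless it was seen in the last 7 days."""
    dedup_key = f"dedup:{stage}:{job_id}"
    # SET with NX and a 7-day TTL returns None if the key already exists.
    if not await r.set(dedup_key, "1", nx=True, ex=7 * 24 * 3600):
        return False  # duplicate within the TTL window; skip
    await r.rpush(f"queue:{stage}", json.dumps(payload))
    return True
```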

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API key: 3abbc796-2515-44a8-972d-22dcf27ab54a
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication
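
A hedged sketch of how downstream services might consume this configuration; the environment variable names are assumptions (only the values appear in this commit), and keeping the DeepL key out of version control would be preferable:

```python
# Hedged sketch; env var names are assumptions, not from this commit.
import os

CLAUDE_MODEL = os.getenv("CLAUDE_MODEL", "claude-sonnet-4-20250514")
DEEPL_API_KEY = os.environ["DEEPL_API_KEY"]  # keep the real key out of commits
DEDUP_TTL_SECONDS = int(os.getenv("DEDUP_TTL_DAYS", "7")) * 24 * 3600  # 7-day window
```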

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
commit 070032006e (parent 1d90af7c3c)
Author: jungwoo choi
Date: 2025-09-13 19:22:14 +09:00
73 changed files with 5922 additions and 4 deletions


@@ -0,0 +1,203 @@
"""
News Pipeline Scheduler
뉴스 파이프라인 스케줄러 서비스
"""
import asyncio
import logging
import os
import sys
from datetime import datetime, timedelta
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from motor.motor_asyncio import AsyncIOMotorClient
# Import from shared module
from shared.models import KeywordSubscription, PipelineJob
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NewsScheduler:
def __init__(self):
self.scheduler = AsyncIOScheduler()
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "pipeline_db")
self.db = None
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
async def start(self):
"""스케줄러 시작"""
logger.info("Starting News Pipeline Scheduler")
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# Redis 연결
await self.queue_manager.connect()
# 기본 스케줄 설정
# 매 30분마다 실행
self.scheduler.add_job(
self.process_keywords,
'interval',
minutes=30,
id='keyword_processor',
name='Process Active Keywords'
)
# 특정 시간대 강화 스케줄 (아침 7시, 점심 12시, 저녁 6시)
for hour in [7, 12, 18]:
self.scheduler.add_job(
self.process_priority_keywords,
'cron',
hour=hour,
minute=0,
id=f'priority_processor_{hour}',
name=f'Process Priority Keywords at {hour}:00'
)
# 매일 자정 통계 초기화
self.scheduler.add_job(
self.reset_daily_stats,
'cron',
hour=0,
minute=0,
id='stats_reset',
name='Reset Daily Statistics'
)
self.scheduler.start()
logger.info("Scheduler started successfully")
# 시작 즉시 한 번 실행
await self.process_keywords()
async def process_keywords(self):
"""활성 키워드 처리"""
try:
logger.info("Processing active keywords")
# MongoDB에서 활성 키워드 로드
now = datetime.now()
thirty_minutes_ago = now - timedelta(minutes=30)
keywords = await self.db.keywords.find({
"is_active": True,
"$or": [
{"last_processed": {"$lt": thirty_minutes_ago}},
{"last_processed": None}
]
}).to_list(None)
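            # Note: {"last_processed": None} also matches documents where the
            # field is missing entirely, so newly added keywords are picked up
            # on the next cycle without any backfill step.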
logger.info(f"Found {len(keywords)} keywords to process")
for keyword_doc in keywords:
await self._create_job(keyword_doc)
# 처리 시간 업데이트
await self.db.keywords.update_one(
{"keyword_id": keyword_doc['keyword_id']},
{"$set": {"last_processed": now}}
)
logger.info(f"Created jobs for {len(keywords)} keywords")
except Exception as e:
logger.error(f"Error processing keywords: {e}")
async def process_priority_keywords(self):
"""우선순위 키워드 처리"""
try:
logger.info("Processing priority keywords")
keywords = await self.db.keywords.find({
"is_active": True,
"is_priority": True
}).to_list(None)
for keyword_doc in keywords:
await self._create_job(keyword_doc, priority=1)
logger.info(f"Created priority jobs for {len(keywords)} keywords")
except Exception as e:
logger.error(f"Error processing priority keywords: {e}")
async def _create_job(self, keyword_doc: dict, priority: int = 0):
"""파이프라인 작업 생성"""
try:
# KeywordSubscription 모델로 변환
keyword = KeywordSubscription(**keyword_doc)
# PipelineJob 생성
job = PipelineJob(
keyword_id=keyword.keyword_id,
keyword=keyword.keyword,
stage='rss_collection',
stages_completed=[],
priority=priority,
data={
'keyword': keyword.keyword,
'language': keyword.language,
'rss_feeds': keyword.rss_feeds or self._get_default_rss_feeds(),
'categories': keyword.categories
}
)
# 첫 번째 큐에 추가
await self.queue_manager.enqueue(
'rss_collection',
job,
priority=priority
)
logger.info(f"Created job {job.job_id} for keyword '{keyword.keyword}'")
except Exception as e:
logger.error(f"Error creating job for keyword: {e}")
def _get_default_rss_feeds(self) -> list:
"""기본 RSS 피드 목록"""
return [
"https://news.google.com/rss/search?q={keyword}&hl=ko&gl=KR&ceid=KR:ko",
"https://trends.google.com/trends/trendingsearches/daily/rss?geo=KR",
"https://www.mk.co.kr/rss/40300001/", # 매일경제
"https://www.hankyung.com/feed/all-news", # 한국경제
"https://www.zdnet.co.kr/news/news_rss.xml", # ZDNet Korea
]
async def reset_daily_stats(self):
"""일일 통계 초기화"""
try:
logger.info("Resetting daily statistics")
# Redis 통계 초기화
# 구현 필요
pass
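            # A possible implementation, assuming QueueManager exposes its
            # Redis client as `self.queue_manager.redis` and daily counters
            # live under "stats:daily:*" (both assumptions, not in this commit):
            #     async for key in self.queue_manager.redis.scan_iter("stats:daily:*"):
            #         await self.queue_manager.redis.delete(key)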
        except Exception as e:
            logger.error(f"Error resetting stats: {e}")

    async def stop(self):
        """Stop the scheduler."""
        self.scheduler.shutdown()
        await self.queue_manager.disconnect()
        logger.info("Scheduler stopped")


async def main():
    """Main entry point."""
    scheduler = NewsScheduler()
    try:
        await scheduler.start()
        # Keep running; scheduled jobs fire on the event loop
        while True:
            await asyncio.sleep(60)
    except KeyboardInterrupt:
        logger.info("Received interrupt signal")
    finally:
        await scheduler.stop()


if __name__ == "__main__":
    asyncio.run(main())