Files
site11/services/pipeline/scheduler/scheduler.py
jungwoo choi eeaa9dcb4b feat: Implement automated keyword-based news pipeline scheduler
- Add multi-threaded keyword scheduler for periodic news collection
- Create Keyword Manager API for CRUD operations and monitoring
- Implement automatic pipeline triggering (RSS → Google → AI → Translation)
- Add thread status monitoring and dynamic keyword management
- Support priority-based execution and configurable intervals
- Add comprehensive scheduler documentation guide
- Default keywords: AI, 테크놀로지, 경제, 블록체인

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-15 17:09:22 +09:00

203 lines
6.6 KiB
Python

"""
News Pipeline Scheduler
뉴스 파이프라인 스케줄러 서비스
"""
import asyncio
import logging
import os
import sys
from datetime import datetime, timedelta
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from motor.motor_asyncio import AsyncIOMotorClient
# Import from shared module
from shared.models import KeywordSubscription, PipelineJob
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class NewsScheduler:
def __init__(self):
self.scheduler = AsyncIOScheduler()
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
async def start(self):
"""스케줄러 시작"""
logger.info("Starting News Pipeline Scheduler")
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# Redis 연결
await self.queue_manager.connect()
# 기본 스케줄 설정
# 매 30분마다 실행
self.scheduler.add_job(
self.process_keywords,
'interval',
minutes=30,
id='keyword_processor',
name='Process Active Keywords'
)
# 특정 시간대 강화 스케줄 (아침 7시, 점심 12시, 저녁 6시)
for hour in [7, 12, 18]:
self.scheduler.add_job(
self.process_priority_keywords,
'cron',
hour=hour,
minute=0,
id=f'priority_processor_{hour}',
name=f'Process Priority Keywords at {hour}:00'
)
# 매일 자정 통계 초기화
self.scheduler.add_job(
self.reset_daily_stats,
'cron',
hour=0,
minute=0,
id='stats_reset',
name='Reset Daily Statistics'
)
self.scheduler.start()
logger.info("Scheduler started successfully")
# 시작 즉시 한 번 실행
await self.process_keywords()
async def process_keywords(self):
"""활성 키워드 처리"""
try:
logger.info("Processing active keywords")
# MongoDB에서 활성 키워드 로드
now = datetime.now()
thirty_minutes_ago = now - timedelta(minutes=30)
keywords = await self.db.keywords.find({
"is_active": True,
"$or": [
{"last_processed": {"$lt": thirty_minutes_ago}},
{"last_processed": None}
]
}).to_list(None)
logger.info(f"Found {len(keywords)} keywords to process")
for keyword_doc in keywords:
await self._create_job(keyword_doc)
# 처리 시간 업데이트
await self.db.keywords.update_one(
{"keyword_id": keyword_doc['keyword_id']},
{"$set": {"last_processed": now}}
)
logger.info(f"Created jobs for {len(keywords)} keywords")
except Exception as e:
logger.error(f"Error processing keywords: {e}")
async def process_priority_keywords(self):
"""우선순위 키워드 처리"""
try:
logger.info("Processing priority keywords")
keywords = await self.db.keywords.find({
"is_active": True,
"is_priority": True
}).to_list(None)
for keyword_doc in keywords:
await self._create_job(keyword_doc, priority=1)
logger.info(f"Created priority jobs for {len(keywords)} keywords")
except Exception as e:
logger.error(f"Error processing priority keywords: {e}")
async def _create_job(self, keyword_doc: dict, priority: int = 0):
"""파이프라인 작업 생성"""
try:
# KeywordSubscription 모델로 변환
keyword = KeywordSubscription(**keyword_doc)
# PipelineJob 생성
job = PipelineJob(
keyword_id=keyword.keyword_id,
keyword=keyword.keyword,
stage='rss_collection',
stages_completed=[],
priority=priority,
data={
'keyword': keyword.keyword,
'language': keyword.language,
'rss_feeds': keyword.rss_feeds or self._get_default_rss_feeds(),
'categories': keyword.categories
}
)
# 첫 번째 큐에 추가
await self.queue_manager.enqueue(
'rss_collection',
job,
priority=priority
)
logger.info(f"Created job {job.job_id} for keyword '{keyword.keyword}'")
except Exception as e:
logger.error(f"Error creating job for keyword: {e}")
def _get_default_rss_feeds(self) -> list:
"""기본 RSS 피드 목록"""
return [
"https://news.google.com/rss/search?q={keyword}&hl=ko&gl=KR&ceid=KR:ko",
"https://trends.google.com/trends/trendingsearches/daily/rss?geo=KR",
"https://www.mk.co.kr/rss/40300001/", # 매일경제
"https://www.hankyung.com/feed/all-news", # 한국경제
"https://www.zdnet.co.kr/news/news_rss.xml", # ZDNet Korea
]
async def reset_daily_stats(self):
"""일일 통계 초기화"""
try:
logger.info("Resetting daily statistics")
# Redis 통계 초기화
# 구현 필요
pass
except Exception as e:
logger.error(f"Error resetting stats: {e}")
async def stop(self):
"""스케줄러 중지"""
self.scheduler.shutdown()
await self.queue_manager.disconnect()
logger.info("Scheduler stopped")
async def main():
"""메인 함수"""
scheduler = NewsScheduler()
try:
await scheduler.start()
# 계속 실행
while True:
await asyncio.sleep(60)
except KeyboardInterrupt:
logger.info("Received interrupt signal")
finally:
await scheduler.stop()
if __name__ == "__main__":
asyncio.run(main())