- Add multi-threaded keyword scheduler for periodic news collection - Create Keyword Manager API for CRUD operations and monitoring - Implement automatic pipeline triggering (RSS → Google → AI → Translation) - Add thread status monitoring and dynamic keyword management - Support priority-based execution and configurable intervals - Add comprehensive scheduler documentation guide - Default keywords: AI, 테크놀로지, 경제, 블록체인 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
203 lines
6.6 KiB
Python
203 lines
6.6 KiB
Python
"""
|
|
News Pipeline Scheduler
|
|
뉴스 파이프라인 스케줄러 서비스
|
|
"""
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import sys
|
|
from datetime import datetime, timedelta
|
|
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
|
from motor.motor_asyncio import AsyncIOMotorClient
|
|
|
|
# Import from shared module
|
|
from shared.models import KeywordSubscription, PipelineJob
|
|
from shared.queue_manager import QueueManager
|
|
|
|
# Module-wide logging: INFO level, logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
class NewsScheduler:
    """Periodic scheduler that turns active keyword subscriptions into pipeline jobs.

    Loads keyword documents from MongoDB, builds ``PipelineJob`` objects and
    enqueues them on the Redis-backed queue so downstream workers can run the
    pipeline stages (starting with RSS collection).
    """

    def __init__(self):
        self.scheduler = AsyncIOScheduler()
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "ai_writer_db")
        # MongoDB client/database handles; created in start(), closed in stop()
        self.client = None
        self.db = None
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )

    async def start(self):
        """Connect to MongoDB/Redis, register the schedules and start them.

        Also triggers one immediate keyword sweep so the service does useful
        work right away instead of waiting for the first interval.
        """
        logger.info("Starting News Pipeline Scheduler")

        # MongoDB connection — kept on self so stop() can close the pool
        self.client = AsyncIOMotorClient(self.mongodb_url)
        self.db = self.client[self.db_name]

        # Redis connection
        await self.queue_manager.connect()

        # Base schedule: sweep active keywords every 30 minutes
        self.scheduler.add_job(
            self.process_keywords,
            'interval',
            minutes=30,
            id='keyword_processor',
            name='Process Active Keywords'
        )

        # Extra runs for priority keywords at peak hours (07:00, 12:00, 18:00)
        for hour in [7, 12, 18]:
            self.scheduler.add_job(
                self.process_priority_keywords,
                'cron',
                hour=hour,
                minute=0,
                id=f'priority_processor_{hour}',
                name=f'Process Priority Keywords at {hour}:00'
            )

        # Reset daily statistics at midnight
        self.scheduler.add_job(
            self.reset_daily_stats,
            'cron',
            hour=0,
            minute=0,
            id='stats_reset',
            name='Reset Daily Statistics'
        )

        self.scheduler.start()
        logger.info("Scheduler started successfully")

        # Run once immediately on startup
        await self.process_keywords()

    async def process_keywords(self):
        """Create jobs for every active keyword not processed in the last 30 minutes."""
        try:
            logger.info("Processing active keywords")

            # NOTE(review): naive local time — assumes stored last_processed
            # values use the same convention; confirm before switching to UTC.
            now = datetime.now()
            thirty_minutes_ago = now - timedelta(minutes=30)

            # {"last_processed": None} matches both explicit nulls and
            # documents where the field is missing (MongoDB null semantics).
            keywords = await self.db.keywords.find({
                "is_active": True,
                "$or": [
                    {"last_processed": {"$lt": thirty_minutes_ago}},
                    {"last_processed": None}
                ]
            }).to_list(None)

            logger.info(f"Found {len(keywords)} keywords to process")

            for keyword_doc in keywords:
                await self._create_job(keyword_doc)

                # Record processing time so the next sweep skips this keyword
                await self.db.keywords.update_one(
                    {"keyword_id": keyword_doc['keyword_id']},
                    {"$set": {"last_processed": now}}
                )

            logger.info(f"Created jobs for {len(keywords)} keywords")

        except Exception as e:
            logger.error(f"Error processing keywords: {e}")

    async def process_priority_keywords(self):
        """Create high-priority jobs for all active priority keywords."""
        try:
            logger.info("Processing priority keywords")

            keywords = await self.db.keywords.find({
                "is_active": True,
                "is_priority": True
            }).to_list(None)

            for keyword_doc in keywords:
                await self._create_job(keyword_doc, priority=1)

            logger.info(f"Created priority jobs for {len(keywords)} keywords")

        except Exception as e:
            logger.error(f"Error processing priority keywords: {e}")

    async def _create_job(self, keyword_doc: dict, priority: int = 0):
        """Build a ``PipelineJob`` from a keyword document and enqueue it.

        Args:
            keyword_doc: raw MongoDB document for the keyword subscription.
            priority: queue priority passed through to the queue manager
                (0 = normal, 1 = priority run).
        """
        try:
            # Validate/normalize the raw document via the shared model
            keyword = KeywordSubscription(**keyword_doc)

            # The pipeline always starts at the RSS collection stage
            job = PipelineJob(
                keyword_id=keyword.keyword_id,
                keyword=keyword.keyword,
                stage='rss_collection',
                stages_completed=[],
                priority=priority,
                data={
                    'keyword': keyword.keyword,
                    'language': keyword.language,
                    'rss_feeds': keyword.rss_feeds or self._get_default_rss_feeds(),
                    'categories': keyword.categories
                }
            )

            # Push onto the first queue of the pipeline
            await self.queue_manager.enqueue(
                'rss_collection',
                job,
                priority=priority
            )

            logger.info(f"Created job {job.job_id} for keyword '{keyword.keyword}'")

        except Exception as e:
            logger.error(f"Error creating job for keyword: {e}")

    def _get_default_rss_feeds(self) -> list:
        """Return default RSS feed URLs used when a keyword has none configured."""
        return [
            "https://news.google.com/rss/search?q={keyword}&hl=ko&gl=KR&ceid=KR:ko",
            "https://trends.google.com/trends/trendingsearches/daily/rss?geo=KR",
            "https://www.mk.co.kr/rss/40300001/",  # Maeil Business Newspaper
            "https://www.hankyung.com/feed/all-news",  # The Korea Economic Daily
            "https://www.zdnet.co.kr/news/news_rss.xml",  # ZDNet Korea
        ]

    async def reset_daily_stats(self):
        """Reset daily statistics (placeholder — not implemented yet)."""
        try:
            logger.info("Resetting daily statistics")
            # TODO: clear the daily counters kept in Redis
            pass
        except Exception as e:
            logger.error(f"Error resetting stats: {e}")

    async def stop(self):
        """Shut down the scheduler and close all external connections."""
        self.scheduler.shutdown()
        await self.queue_manager.disconnect()
        # Close the MongoDB connection pool opened in start()
        if self.client is not None:
            self.client.close()
        logger.info("Scheduler stopped")
|
|
|
|
async def main():
    """Entry point: run the scheduler until the process is interrupted.

    Note: under ``asyncio.run()`` a Ctrl-C cancels the main task, so the
    interrupt surfaces inside this coroutine as ``asyncio.CancelledError``
    rather than ``KeyboardInterrupt`` — both are handled here so shutdown
    is logged and cleanup always runs.
    """
    scheduler = NewsScheduler()

    try:
        await scheduler.start()
        # Keep the event loop alive; APScheduler jobs run on it
        while True:
            await asyncio.sleep(60)
    except (KeyboardInterrupt, asyncio.CancelledError):
        logger.info("Received interrupt signal")
    finally:
        # Always release scheduler/queue/database resources
        await scheduler.stop()
|
|
|
|
if __name__ == "__main__":
    # Run the async entry point on a fresh event loop
    asyncio.run(main())