feat: Implement automated keyword-based news pipeline scheduler

- Add multi-threaded keyword scheduler for periodic news collection
- Create Keyword Manager API for CRUD operations and monitoring
- Implement automatic pipeline triggering (RSS → Google → AI → Translation)
- Add thread status monitoring and dynamic keyword management
- Support priority-based execution and configurable intervals
- Add comprehensive scheduler documentation guide
- Default keywords: AI, 테크놀로지, 경제, 블록체인

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2025-09-15 17:09:22 +09:00
parent 070032006e
commit eeaa9dcb4b
39 changed files with 3472 additions and 759 deletions

View File

@ -8,9 +8,11 @@ import os
import sys
from typing import List, Dict, Any
import httpx
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
# Import from shared module
from shared.models import PipelineJob, SummarizedItem, TranslatedItem
from shared.models import PipelineJob, FinalArticle
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
@ -24,93 +26,167 @@ class TranslatorWorker:
self.deepl_api_key = os.getenv("DEEPL_API_KEY", "3abbc796-2515-44a8-972d-22dcf27ab54a")
# DeepL Pro API 엔드포인트 사용
self.deepl_api_url = "https://api.deepl.com/v2/translate"
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
async def start(self):
    """Start the worker and consume the 'translation' queue.

    Connects the queue manager and MongoDB, then loops forever: each job
    dequeued from the 'translation' queue is handed to ``process_job``.
    Returns early, after logging an error, when no DeepL API key is set.
    """
    logger.info("Starting Translator Worker")

    # Validate the DeepL API key before opening any connection, so a
    # misconfigured worker returns without leaving Redis/MongoDB
    # connections behind.
    # NOTE(review): the visible assignment of deepl_api_key passes a
    # fallback value to os.getenv, so this guard only fires for an
    # explicitly empty key - confirm that is intended.
    if not self.deepl_api_key:
        logger.error("DeepL API key not configured")
        return

    # Redis connection (through the queue manager)
    await self.queue_manager.connect()

    # MongoDB connection
    mongo_client = AsyncIOMotorClient(self.mongodb_url)
    self.db = mongo_client[self.db_name]

    # Main processing loop
    while True:
        try:
            # Fetch the next job from the queue
            job = await self.queue_manager.dequeue('translation', timeout=5)
            if job:
                await self.process_job(job)
        except Exception as e:
            # Keep the worker alive; log with traceback and back off briefly.
            logger.exception(f"Error in worker loop: {e}")
            await asyncio.sleep(1)
async def process_job(self, job: PipelineJob):
    """Create the English version of a Korean article and store it.

    Looks up the article in ``articles_ko`` by ``job.data['news_id']``,
    translates its title, summary, subtopics and categories to English
    with ``_translate_text``, inserts the result into ``articles_en``
    under the same ``news_id``, tags the Korean original with the
    'translation' stage and marks the job completed.

    Args:
        job: Pipeline job whose ``data`` is expected to carry ``news_id``.

    A missing ``news_id``, a missing article, or any exception raised
    while processing is reported through ``queue_manager.mark_failed``.
    """
    try:
        logger.info(f"Processing job {job.job_id} for translation")

        # The job must reference the Korean article to translate.
        news_id = job.data.get('news_id')
        if not news_id:
            logger.error(f"No news_id in job {job.job_id}")
            await self.queue_manager.mark_failed('translation', job, "No news_id")
            return

        # Fetch the Korean article from MongoDB (articles_ko).
        korean_article = await self.db.articles_ko.find_one({"news_id": news_id})
        if not korean_article:
            logger.error(f"Article {news_id} not found in MongoDB")
            await self.queue_manager.mark_failed('translation', job, "Article not found")
            return

        # Function-scope import: the module-level import only brings in
        # PipelineJob and FinalArticle.
        from shared.models import Entities, NewsReference, Subtopic

        # Translate title and summary to English.
        translated_title = await self._translate_text(
            korean_article.get('title', ''),
            target_lang='EN'
        )
        translated_summary = await self._translate_text(
            korean_article.get('summary', ''),
            target_lang='EN'
        )

        # Translate subtopics: the title plus every content paragraph.
        translated_subtopics = []
        for subtopic in korean_article.get('subtopics', []):
            translated_subtopic_title = await self._translate_text(
                subtopic.get('title', ''),
                target_lang='EN'
            )

            translated_content_list = []
            for content_para in subtopic.get('content', []):
                translated_para = await self._translate_text(
                    content_para,
                    target_lang='EN'
                )
                translated_content_list.append(translated_para)
                await asyncio.sleep(0.2)  # API rate limiting

            translated_subtopics.append(Subtopic(
                title=translated_subtopic_title,
                content=translated_content_list
            ))

        # Translate categories.
        translated_categories = []
        for category in korean_article.get('categories', []):
            translated_cat = await self._translate_text(category, target_lang='EN')
            translated_categories.append(translated_cat)
            await asyncio.sleep(0.2)  # API rate limiting

        # Entities are copied as-is (names of people and organizations
        # are not translated). `or {}` tolerates a stored null value.
        entities_data = korean_article.get('entities') or {}
        translated_entities = Entities(
            people=entities_data.get('people', []),
            organizations=entities_data.get('organizations', []),
            groups=entities_data.get('groups', []),
            countries=entities_data.get('countries', []),
            events=entities_data.get('events', [])
        )

        # References are carried over untranslated.
        references = [
            NewsReference(**ref_data)
            for ref_data in korean_article.get('references', [])
        ]

        # Build the English article; it shares the original's news_id,
        # so no ref_news_id is needed.
        english_article = FinalArticle(
            news_id=news_id,
            title=translated_title,
            summary=translated_summary,
            subtopics=translated_subtopics,
            categories=translated_categories,
            entities=translated_entities,
            source_keyword=job.keyword,
            source_count=korean_article.get('source_count', 1),
            references=references,
            job_id=job.job_id,
            keyword_id=job.keyword_id,
            pipeline_stages=[*job.stages_completed, 'translation'],
            processing_time=korean_article.get('processing_time', 0),
            language='en',
            ref_news_id=None
        )

        # Store the English version in MongoDB (articles_en).
        result = await self.db.articles_en.insert_one(english_article.model_dump())
        english_article_id = str(result.inserted_id)
        logger.info(f"English article saved with _id: {english_article_id}, news_id: {news_id}, language: en")

        # Mark the original Korean article as translated.
        await self.db.articles_ko.update_one(
            {"news_id": news_id},
            {
                "$addToSet": {
                    "pipeline_stages": "translation"
                }
            }
        )

        # Mark the job as completed.
        job.stages_completed.append('translation')
        await self.queue_manager.mark_completed('translation', job.job_id)
        logger.info(f"Translation completed for job {job.job_id}")

    except Exception as e:
        # Log with traceback, then report the failure to the queue.
        logger.exception(f"Error processing job {job.job_id}: {e}")
        await self.queue_manager.mark_failed('translation', job, str(e))
async def _translate_text(self, text: str, target_lang: str = 'EN') -> str:
"""Translate text using the DeepL API.

Returns an empty string for empty input. On an HTTP 200 response the
text of the first entry in the response's 'translations' list is
returned. On any other status code, or if an exception is raised, the
error is logged and the original text is returned unchanged.

NOTE(review): target_lang is presumably sent in the request payload,
which is elided from this view - confirm against the full file.
"""
try:
# Nothing to translate: skip the API call entirely.
if not text:
return ""
async with httpx.AsyncClient() as client:
response = await client.post(
self.deepl_api_url,
# (diff hunk boundary below: the request arguments between these lines are not shown)
@ -122,18 +198,18 @@ class TranslatorWorker:
},
timeout=30
)
if response.status_code == 200:
result = response.json()
return result['translations'][0]['text']
else:
logger.error(f"DeepL API error: {response.status_code}")
return text # On translation failure, return the original text
except Exception as e:
logger.error(f"Error translating text: {e}")
return text # On translation failure, return the original text
async def stop(self):
    """Stop the worker and release its connections.

    Disconnects the queue manager, then closes the MongoDB client behind
    ``self.db`` if one was opened. The client is closed even when the
    queue disconnect raises.
    """
    try:
        await self.queue_manager.disconnect()
    finally:
        # `start` keeps only the database handle, so reach the client
        # through it. Compare with None explicitly: database objects do
        # not support truth-value testing.
        if self.db is not None:
            self.db.client.close()
            self.db = None
@ -142,7 +218,7 @@ class TranslatorWorker:
async def main():
"""메인 함수"""
worker = TranslatorWorker()
try:
await worker.start()
except KeyboardInterrupt: