Files
site11/services/pipeline/translator/multi_translator.py
jungwoo choi eeaa9dcb4b feat: Implement automated keyword-based news pipeline scheduler
- Add multi-threaded keyword scheduler for periodic news collection
- Create Keyword Manager API for CRUD operations and monitoring
- Implement automatic pipeline triggering (RSS → Google → AI → Translation)
- Add thread status monitoring and dynamic keyword management
- Support priority-based execution and configurable intervals
- Add comprehensive scheduler documentation guide
- Default keywords: AI, 테크놀로지, 경제, 블록체인

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-15 17:09:22 +09:00

320 lines
12 KiB
Python

"""
Multi-Language Translation Service
Multilingual translation service - configuration-driven support for multiple target languages
"""
import asyncio
import logging
import os
import sys
import json
from typing import List, Dict, Any
import httpx
import redis.asyncio as redis
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
# Import from shared module
from shared.models import PipelineJob, FinalArticle
from shared.queue_manager import QueueManager
# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module; used by everything below.
logger = logging.getLogger(__name__)
class MultiLanguageTranslator:
    """Queue worker that translates source articles into every enabled language.

    Jobs are taken from the ``translation`` queue. For each job the source
    article is read from MongoDB, its text fields are translated through the
    DeepL HTTP API once per enabled language, and each translation is stored
    in that language's collection.
    """

    def __init__(self):
        """Configure the worker from environment variables.

        Nothing is connected here: ``start()`` connects the queue manager and
        creates the MongoDB client.
        """
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )
        # Credentials come from the environment only. Never put a fallback key
        # in source: it leaks the secret and makes the "key not configured"
        # guard in start() unreachable.
        self.deepl_api_key = os.getenv("DEEPL_API_KEY", "")
        self.deepl_api_url = "https://api.deepl.com/v2/translate"
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "ai_writer_db")
        # Kept so stop() can close the client that start() creates.
        self.mongo_client = None
        self.db = None
        # Shared HTTP client, created lazily on the first translation request.
        self._http_client = None
        self.languages_config = None
        self.config_path = "/app/config/languages.json"

    @staticmethod
    def _default_config() -> Dict[str, Any]:
        """Return a fresh copy of the fallback configuration (English only)."""
        return {
            "enabled_languages": [
                {
                    "code": "en",
                    "name": "English",
                    "deepl_code": "EN",
                    "collection": "articles_en",
                    "enabled": True
                }
            ],
            "source_language": {
                "code": "ko",
                "name": "Korean",
                "collection": "articles_ko"
            },
            "translation_settings": {
                "batch_size": 5,
                "delay_between_languages": 2.0,
                "delay_between_articles": 0.5,
                "max_retries": 3
            }
        }

    async def load_config(self):
        """Load the language configuration.

        Reads the JSON file at ``self.config_path`` when it exists; otherwise
        falls back to the built-in English-only configuration.

        Raises:
            Exception: any error raised while reading or parsing the file is
                logged and re-raised.
        """
        try:
            if os.path.exists(self.config_path):
                with open(self.config_path, 'r', encoding='utf-8') as f:
                    self.languages_config = json.load(f)
            else:
                # Default configuration (English only).
                self.languages_config = self._default_config()
            logger.info(f"Loaded language config: {len(self.get_enabled_languages())} languages enabled")
        except Exception as e:
            logger.error(f"Error loading config: {e}")
            raise

    def get_enabled_languages(self) -> List[Dict]:
        """Return the configured languages whose ``enabled`` flag is truthy.

        Returns an empty list when no configuration has been loaded yet.
        """
        config = self.languages_config or {}
        return [lang for lang in config.get("enabled_languages", []) if lang["enabled"]]

    def _deepl_headers(self) -> Dict[str, str]:
        """Build the DeepL authentication header.

        The key is sent in the ``Authorization`` header rather than as an
        ``auth_key`` form field, which keeps the secret out of request bodies.
        """
        return {"Authorization": f"DeepL-Auth-Key {self.deepl_api_key}"}

    def _get_http_client(self):
        """Return the shared ``httpx.AsyncClient``, creating it on first use."""
        client = getattr(self, "_http_client", None)
        if client is None:
            # One client for the worker's lifetime so connections are reused
            # across the many small requests needed for a single article.
            client = httpx.AsyncClient()
            self._http_client = client
        return client

    async def start(self):
        """Run the worker: load config, connect, then process jobs forever.

        Returns early (after logging an error) when no DeepL API key is
        configured. Errors raised inside the processing loop are logged and
        followed by a one-second pause; they do not stop the worker.
        """
        logger.info("Starting Multi-Language Translator Worker")
        # Load the language configuration.
        await self.load_config()
        # Connect to Redis.
        await self.queue_manager.connect()
        # Connect to MongoDB; keep the client so stop() can close it.
        self.mongo_client = AsyncIOMotorClient(self.mongodb_url)
        self.db = self.mongo_client[self.db_name]
        # Refuse to run without a DeepL API key.
        if not self.deepl_api_key:
            logger.error("DeepL API key not configured")
            return
        # Main processing loop.
        while True:
            try:
                # Fetch the next job from the queue.
                job = await self.queue_manager.dequeue('translation', timeout=5)
                if job:
                    await self.process_job(job)
            except Exception as e:
                logger.error(f"Error in worker loop: {e}")
                # NOTE(review): SOURCE lost its indentation; the pause is
                # assumed to belong to the error path (dequeue already waits
                # up to 5 s when the queue is empty) - confirm.
                await asyncio.sleep(1)

    async def process_job(self, job: "PipelineJob"):
        """Translate the job's article into every enabled language.

        The job is marked failed when it carries no ``news_id``, when the
        source article cannot be found, or when an unexpected error escapes.
        A failure for one language is logged and does not prevent the other
        languages from being processed; the job is still marked completed.

        Args:
            job: pipeline job whose ``data`` holds the ``news_id`` to translate.
        """
        try:
            logger.info(f"Processing job {job.job_id} for multi-language translation")
            # Identify the source article.
            news_id = job.data.get('news_id')
            if not news_id:
                logger.error(f"No news_id in job {job.job_id}")
                await self.queue_manager.mark_failed('translation', job, "No news_id")
                return
            # Fetch the article from the source-language collection.
            source_collection = self.languages_config["source_language"]["collection"]
            korean_article = await self.db[source_collection].find_one({"news_id": news_id})
            if not korean_article:
                logger.error(f"Article {news_id} not found in {source_collection}")
                await self.queue_manager.mark_failed('translation', job, "Article not found")
                return
            # Translate into every enabled language.
            settings = self.languages_config.get("translation_settings") or {}
            language_delay = settings.get("delay_between_languages")
            failed_languages = []
            for lang_config in self.get_enabled_languages():
                try:
                    logger.info(f"Translating article {news_id} to {lang_config['name']}")
                    # Skip languages that already have a stored translation.
                    existing = await self.db[lang_config["collection"]].find_one({"news_id": news_id})
                    if existing:
                        logger.info(f"Article {news_id} already translated to {lang_config['code']}")
                        continue
                    # Perform the translation.
                    await self.translate_article(
                        korean_article,
                        lang_config,
                        job
                    )
                    # Pause between languages.
                    if language_delay:
                        await asyncio.sleep(language_delay)
                except Exception as e:
                    logger.error(f"Error translating to {lang_config['code']}: {e}")
                    failed_languages.append(lang_config.get('code'))
            # Surface partial failures in one place instead of only per language.
            if failed_languages:
                logger.warning(f"Translation of article {news_id} failed for languages: {failed_languages}")
            logger.info(f"Translation pipeline completed for news_id: {news_id}")
            # Mark the stage as done.
            job.stages_completed.append('translation')
            await self.queue_manager.mark_completed('translation', job.job_id)
            logger.info(f"Multi-language translation completed for job {job.job_id}")
        except Exception as e:
            logger.error(f"Error processing job {job.job_id}: {e}")
            await self.queue_manager.mark_failed('translation', job, str(e))

    async def translate_article(self, korean_article: Dict, lang_config: Dict, job: "PipelineJob"):
        """Translate one article into one language and store the result.

        Title, summary, subtopic titles/paragraphs and categories are
        translated; entities, references and the remaining metadata are
        copied from the source article. The translated article is inserted
        into the language's collection and the language code is added to the
        source article's ``translated_languages`` set.

        NOTE(review): ``_translate_text`` returns the untranslated input when
        DeepL fails, so a stored "translation" may contain source-language
        text - confirm this best-effort behaviour is intended.

        Args:
            korean_article: source article document read from MongoDB.
            lang_config: one entry of the ``enabled_languages`` configuration.
            job: pipeline job that triggered the translation.

        Raises:
            Exception: any error is logged and re-raised to the caller.
        """
        try:
            from shared.models import Entities, NewsReference, Subtopic

            target_lang = lang_config["deepl_code"]
            settings = self.languages_config.get("translation_settings") or {}
            paragraph_delay = settings.get("delay_between_articles")

            # Title
            translated_title = await self._translate_text(
                korean_article.get('title', ''),
                target_lang=target_lang
            )
            # Summary
            translated_summary = await self._translate_text(
                korean_article.get('summary', ''),
                target_lang=target_lang
            )
            # Subtopics
            translated_subtopics = []
            for subtopic in korean_article.get('subtopics', []):
                translated_subtopic_title = await self._translate_text(
                    subtopic.get('title', ''),
                    target_lang=target_lang
                )
                translated_content_list = []
                for content_para in subtopic.get('content', []):
                    translated_para = await self._translate_text(
                        content_para,
                        target_lang=target_lang
                    )
                    translated_content_list.append(translated_para)
                    # API rate limiting.
                    # NOTE(review): SOURCE lost its indentation; the pause is
                    # assumed to apply after every paragraph - confirm.
                    if paragraph_delay:
                        await asyncio.sleep(paragraph_delay)
                translated_subtopics.append(Subtopic(
                    title=translated_subtopic_title,
                    content=translated_content_list
                ))
            # Categories
            translated_categories = []
            for category in korean_article.get('categories', []):
                translated_categories.append(
                    await self._translate_text(category, target_lang=target_lang)
                )
            # Entities and references are kept as in the source article.
            entities_data = korean_article.get('entities', {})
            translated_entities = Entities(**entities_data) if entities_data else Entities()
            references = [
                NewsReference(**ref_data)
                for ref_data in korean_article.get('references', [])
            ]
            # Build the translated article.
            translated_article = FinalArticle(
                news_id=korean_article.get('news_id'),  # same news_id as the source
                title=translated_title,
                summary=translated_summary,
                subtopics=translated_subtopics,
                categories=translated_categories,
                entities=translated_entities,
                source_keyword=getattr(job, 'keyword', korean_article.get('source_keyword')),
                source_count=korean_article.get('source_count', 1),
                references=references,
                job_id=job.job_id,
                keyword_id=getattr(job, 'keyword_id', None),
                pipeline_stages=korean_article.get('pipeline_stages', []) + ['translation'],
                processing_time=korean_article.get('processing_time', 0),
                language=lang_config["code"],
                ref_news_id=None,  # not needed: the news_id is shared
                rss_guid=korean_article.get('rss_guid'),  # keep the RSS GUID
                image_prompt=korean_article.get('image_prompt'),  # keep the image prompt
                images=korean_article.get('images', []),  # keep the image URL list
                translated_languages=korean_article.get('translated_languages', [])  # keep the language list
            )
            # Store in MongoDB.
            collection_name = lang_config["collection"]
            result = await self.db[collection_name].insert_one(translated_article.model_dump())
            logger.info(f"Article saved to {collection_name} with _id: {result.inserted_id}, language: {lang_config['code']}")
            # Record on the source article that this language is done.
            await self.db[self.languages_config["source_language"]["collection"]].update_one(
                {"news_id": korean_article.get('news_id')},
                {
                    "$addToSet": {
                        "translated_languages": lang_config["code"]
                    }
                }
            )
        except Exception as e:
            logger.error(f"Error translating article to {lang_config['code']}: {e}")
            raise

    async def _translate_text(self, text: str, target_lang: str = 'EN') -> str:
        """Translate ``text`` from Korean with the DeepL API.

        Transport errors, HTTP 429 and HTTP 5xx responses are retried with a
        short exponential backoff. The total number of attempts is the
        configured ``translation_settings.max_retries`` (at least one attempt;
        one attempt when the setting is absent or invalid).

        Args:
            text: source text; empty or falsy input yields ``""``.
            target_lang: DeepL target language code.

        Returns:
            The translated text, or the original ``text`` when translation
            fails (best-effort fallback).
        """
        if not text:
            return ""
        settings = (self.languages_config or {}).get("translation_settings") or {}
        try:
            attempts = max(1, int(settings.get("max_retries") or 1))
        except (TypeError, ValueError):
            attempts = 1
        try:
            client = self._get_http_client()
            for attempt in range(1, attempts + 1):
                retryable = False
                try:
                    response = await client.post(
                        self.deepl_api_url,
                        headers=self._deepl_headers(),
                        data={
                            'text': text,
                            'target_lang': target_lang,
                            'source_lang': 'KO'
                        },
                        timeout=30
                    )
                except httpx.TransportError as e:
                    # Timeouts and connection problems are usually transient.
                    logger.error(f"Error translating text: {e}")
                    retryable = True
                else:
                    if response.status_code == 200:
                        result = response.json()
                        return result['translations'][0]['text']
                    logger.error(f"DeepL API error: {response.status_code}")
                    # Rate limiting and server-side errors are worth retrying;
                    # other client errors will not succeed on a retry.
                    retryable = response.status_code == 429 or response.status_code >= 500
                if not retryable or attempt == attempts:
                    break
                await asyncio.sleep(min(2 ** (attempt - 1), 8))
        except Exception as e:
            logger.error(f"Error translating text: {e}")
        return text  # fall back to the original text when translation fails

    async def stop(self):
        """Stop the worker and release the connections it holds."""
        try:
            await self.queue_manager.disconnect()
        finally:
            # Release the resources created in start() / _get_http_client()
            # even when the queue disconnect fails.
            http_client = getattr(self, "_http_client", None)
            if http_client is not None:
                await http_client.aclose()
                self._http_client = None
            mongo_client = getattr(self, "mongo_client", None)
            if mongo_client is not None:
                mongo_client.close()
                self.mongo_client = None
                self.db = None
        logger.info("Multi-Language Translator Worker stopped")
async def main():
    """Entry point: run the translator worker and always shut it down."""
    translator = MultiLanguageTranslator()
    try:
        try:
            await translator.start()
        except KeyboardInterrupt:
            logger.info("Received interrupt signal")
    finally:
        # Runs on normal return, interrupt, and any other error alike.
        await translator.stop()


if __name__ == "__main__":
    asyncio.run(main())