Files
site11/services/pipeline/translator/language_sync.py
jungwoo choi eeaa9dcb4b feat: Implement automated keyword-based news pipeline scheduler
- Add multi-threaded keyword scheduler for periodic news collection
- Create Keyword Manager API for CRUD operations and monitoring
- Implement automatic pipeline triggering (RSS → Google → AI → Translation)
- Add thread status monitoring and dynamic keyword management
- Support priority-based execution and configurable intervals
- Add comprehensive scheduler documentation guide
- Default keywords: AI, 테크놀로지, 경제, 블록체인

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-15 17:09:22 +09:00

329 lines
12 KiB
Python

"""
Language Sync Service
Background service that translates existing articles into newly added languages.
"""
import asyncio
import logging
import os
import sys
import json
from typing import List, Dict, Any
import httpx
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
# Add parent directory to path for shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Import from shared module
from shared.models import FinalArticle, Subtopic, Entities, NewsReference
# Configure the root logger at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class LanguageSyncService:
    """Back-fill translations of existing articles into each enabled language.

    The language configuration names one source collection and, per enabled
    language, one target collection. The service finds every ``news_id`` that
    exists in the source collection but not in a target collection,
    translates the article through the DeepL API and stores the result in
    the target collection.
    """

    def __init__(self):
        """Read settings from environment variables, with local defaults."""
        # SECURITY: the API key is read from the environment only. A key must
        # never be committed as a fallback literal; any key that was ever
        # hard-coded in this file should be treated as leaked and rotated.
        self.deepl_api_key = os.getenv("DEEPL_API_KEY", "")
        if not self.deepl_api_key:
            logger.warning("DEEPL_API_KEY is not set; DeepL requests will be rejected")
        self.deepl_api_url = "https://api.deepl.com/v2/translate"
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "ai_writer_db")
        self.db = None
        self._client = None
        self.languages_config = None
        self.config_path = "/app/config/languages.json"
        self.sync_batch_size = 10
        # Seconds to pause between languages and between batches.
        self.sync_delay = 2.0

    def _connect(self) -> bool:
        """Open the MongoDB client if none is open.

        Returns:
            True when this call created the client (the caller then owns it
            and should close it), False when an open client was reused.
        """
        if getattr(self, "_client", None) is not None:
            return False
        self._client = AsyncIOMotorClient(self.mongodb_url)
        self.db = self._client[self.db_name]
        return True

    def _disconnect(self) -> None:
        """Close the MongoDB client, if any, and drop the database handle."""
        client = getattr(self, "_client", None)
        if client is not None:
            client.close()
        self._client = None
        self.db = None

    def _enabled_languages(self):
        """Return the configured language entries whose ``enabled`` flag is truthy."""
        return [
            lang for lang in self.languages_config["enabled_languages"]
            if lang.get("enabled")
        ]

    async def _fetch_news_ids(self, collection_name: str) -> set:
        """Return the set of ``news_id`` values stored in ``collection_name``.

        Documents without a ``news_id`` are skipped so that one malformed
        document cannot abort the sync of a whole language.
        """
        docs = await self.db[collection_name].find(
            {},
            {"news_id": 1}
        ).to_list(None)
        return {doc["news_id"] for doc in docs if doc.get("news_id") is not None}

    async def load_config(self):
        """Load the language configuration JSON from ``self.config_path``.

        Raises:
            FileNotFoundError: if the config file does not exist.
            Exception: any read or JSON parsing error, after logging it.
        """
        try:
            if not os.path.exists(self.config_path):
                raise FileNotFoundError(f"Config file not found: {self.config_path}")
            with open(self.config_path, 'r', encoding='utf-8') as f:
                self.languages_config = json.load(f)
            logger.info("Loaded language config")
        except Exception as e:
            logger.error(f"Error loading config: {e}")
            raise

    async def start(self):
        """Run the background sync loop until the task is cancelled.

        Loads the config, connects to MongoDB, then runs a sync pass every
        10 minutes. The MongoDB client is closed when the loop exits.
        """
        logger.info("Starting Language Sync Service")
        await self.load_config()
        owns_client = self._connect()
        try:
            while True:
                try:
                    await self.sync_missing_translations()
                    await asyncio.sleep(600)  # wait 10 minutes between passes
                except Exception as e:
                    logger.error(f"Error in sync loop: {e}")
                    await asyncio.sleep(60)  # retry after 1 minute on error
        finally:
            if owns_client:
                self._disconnect()

    async def sync_missing_translations(self):
        """Sync missing translations for every enabled language.

        Errors are logged and swallowed so a failing pass does not stop the
        caller's loop.
        """
        try:
            enabled_languages = self._enabled_languages()
            if not enabled_languages:
                logger.info("No enabled languages for sync")
                return
            # Collection that holds the source-language articles.
            source_collection = self.languages_config["source_language"]["collection"]
            for lang_config in enabled_languages:
                await self.sync_language(source_collection, lang_config)
                await asyncio.sleep(self.sync_delay)
        except Exception as e:
            logger.error(f"Error in sync_missing_translations: {e}")

    async def sync_language(self, source_collection: str, lang_config: Dict):
        """Translate every article missing from one language's collection.

        Args:
            source_collection: name of the collection holding source articles.
            lang_config: language entry from the config; ``collection`` and
                ``code`` are read here, ``name`` is used for log messages.

        A failure on one article is logged and the loop moves on to the next;
        any other error is logged and ends this language's sync.
        """
        try:
            target_collection = lang_config["collection"]
            code = lang_config["code"]
            name = lang_config.get("name", code)

            # Articles present in the source collection but absent from the target.
            source_ids = await self._fetch_news_ids(source_collection)
            translated_ids = await self._fetch_news_ids(target_collection)
            missing_list = list(source_ids - translated_ids)
            if not missing_list:
                logger.info(f"No missing translations for {name}")
                return
            logger.info(f"Found {len(missing_list)} missing translations for {name}")

            # Process in batches, pausing between them.
            for i in range(0, len(missing_list), self.sync_batch_size):
                batch = missing_list[i:i + self.sync_batch_size]
                for news_id in batch:
                    try:
                        korean_article = await self.db[source_collection].find_one(
                            {"news_id": news_id}
                        )
                        if not korean_article:
                            continue
                        await self.translate_and_save(korean_article, lang_config)
                        logger.info(f"Synced article {news_id} to {code}")
                        # Pause between articles to respect the API rate limit.
                        await asyncio.sleep(0.5)
                    except Exception as e:
                        logger.error(f"Error translating {news_id} to {code}: {e}")
                        continue
                # No pause is needed after the final batch.
                if i + self.sync_batch_size < len(missing_list):
                    await asyncio.sleep(self.sync_delay)
        except Exception as e:
            logger.error(f"Error syncing language {lang_config.get('code')}: {e}")

    async def translate_and_save(self, korean_article: Dict, lang_config: Dict):
        """Translate one article and insert it into the language's collection.

        Title, summary, subtopic titles and paragraphs, and categories are
        translated. Entities and references are copied from the original.
        After the insert, the language code is added to the source article's
        ``translated_languages`` set.

        Args:
            korean_article: source article document.
            lang_config: language entry; ``deepl_code``, ``code`` and
                ``collection`` are read.

        Raises:
            Exception: any translation, validation or database error, after
                logging it. Nothing is inserted if translation fails.
        """
        try:
            target_lang = lang_config["deepl_code"]

            translated_title = await self._translate_text(
                korean_article.get('title', ''),
                target_lang=target_lang
            )
            translated_summary = await self._translate_text(
                korean_article.get('summary', ''),
                target_lang=target_lang
            )

            translated_subtopics = []
            for subtopic in korean_article.get('subtopics', []):
                translated_subtopic_title = await self._translate_text(
                    subtopic.get('title', ''),
                    target_lang=target_lang
                )
                translated_content_list = []
                for content_para in subtopic.get('content', []):
                    translated_content_list.append(
                        await self._translate_text(content_para, target_lang=target_lang)
                    )
                translated_subtopics.append(Subtopic(
                    title=translated_subtopic_title,
                    content=translated_content_list
                ))

            translated_categories = []
            for category in korean_article.get('categories', []):
                translated_categories.append(
                    await self._translate_text(category, target_lang=target_lang)
                )

            # Entities and references are kept as in the original article.
            entities_data = korean_article.get('entities', {})
            translated_entities = Entities(**entities_data) if entities_data else Entities()
            references = [
                NewsReference(**ref_data)
                for ref_data in korean_article.get('references', [])
            ]

            translated_article = FinalArticle(
                news_id=korean_article.get('news_id'),
                title=translated_title,
                summary=translated_summary,
                subtopics=translated_subtopics,
                categories=translated_categories,
                entities=translated_entities,
                source_keyword=korean_article.get('source_keyword'),
                source_count=korean_article.get('source_count', 1),
                references=references,
                job_id=korean_article.get('job_id'),
                keyword_id=korean_article.get('keyword_id'),
                # "or []" guards against a stored null value.
                pipeline_stages=(korean_article.get('pipeline_stages') or []) + ['sync_translation'],
                processing_time=korean_article.get('processing_time', 0),
                language=lang_config["code"],
                ref_news_id=None,
                rss_guid=korean_article.get('rss_guid'),  # keep the RSS GUID
                image_prompt=korean_article.get('image_prompt'),  # keep the image prompt
                images=korean_article.get('images', []),  # keep the image URL list
                translated_languages=korean_article.get('translated_languages', [])  # keep the language list
            )

            collection_name = lang_config["collection"]
            result = await self.db[collection_name].insert_one(translated_article.model_dump())

            # Mark the source article as translated into this language.
            await self.db[self.languages_config["source_language"]["collection"]].update_one(
                {"news_id": korean_article.get('news_id')},
                {
                    "$addToSet": {
                        "translated_languages": lang_config["code"]
                    }
                }
            )
            logger.info(f"Synced article to {collection_name}: {result.inserted_id}")
        except Exception as e:
            logger.error(f"Error in translate_and_save: {e}")
            raise

    async def _translate_text(self, text: str, target_lang: str = 'EN') -> str:
        """Translate Korean ``text`` into ``target_lang`` with the DeepL API.

        Args:
            text: Korean source text; empty or ``None`` yields ``""`` without
                calling the API.
            target_lang: DeepL target language code.

        Returns:
            The translated text.

        Raises:
            RuntimeError: if DeepL answers with a status other than 200.
            Exception: any transport or response-parsing error, after logging.

        Failures are raised rather than answered with the original text, so
        an untranslated article is never stored as a finished translation.
        """
        if not text:
            return ""
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    self.deepl_api_url,
                    # Header-based auth keeps the key out of the request body.
                    headers={'Authorization': f'DeepL-Auth-Key {self.deepl_api_key}'},
                    data={
                        'text': text,
                        'target_lang': target_lang,
                        'source_lang': 'KO'
                    },
                    timeout=30
                )
            if response.status_code != 200:
                raise RuntimeError(f"DeepL API error: {response.status_code}")
            return response.json()['translations'][0]['text']
        except Exception as e:
            logger.error(f"Error translating text: {e}")
            raise

    async def manual_sync(self, language_code: str = None):
        """Run a single sync pass on demand.

        Args:
            language_code: code of the one language to sync; ``None`` syncs
                every enabled language.

        The MongoDB client is closed afterwards when this call opened it.
        """
        logger.info(f"Manual sync requested for language: {language_code or 'all'}")
        await self.load_config()
        owns_client = self._connect()
        try:
            if not language_code:
                await self.sync_missing_translations()
                return
            # Sync only the requested language, which must be enabled.
            lang_config = next(
                (lang for lang in self._enabled_languages()
                 if lang.get("code") == language_code),
                None
            )
            if lang_config:
                source_collection = self.languages_config["source_language"]["collection"]
                await self.sync_language(source_collection, lang_config)
            else:
                logger.error(f"Language {language_code} not found or not enabled")
        finally:
            if owns_client:
                self._disconnect()
async def main():
    """Dispatch on the command line: one-off manual sync or background service.

    With no arguments the long-running service is started. ``sync [code]``
    runs a single manual sync, optionally limited to one language code.
    Any other first argument is logged as an unknown command.
    """
    service = LanguageSyncService()
    cli_args = sys.argv[1:]

    if not cli_args:
        # No arguments: run as the background service.
        try:
            await service.start()
        except KeyboardInterrupt:
            logger.info("Received interrupt signal")
        return

    if cli_args[0] != "sync":
        logger.error(f"Unknown command: {sys.argv[1]}")
        return

    # Manual sync mode; the language code is optional.
    language = cli_args[1] if len(cli_args) > 1 else None
    await service.manual_sync(language)
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # On Ctrl-C, asyncio.run() (Python 3.11+) cancels the main task and
        # then raises KeyboardInterrupt to its caller, so it has to be caught
        # here to exit without a traceback.
        logger.info("Received interrupt signal")