feat: Implement automated keyword-based news pipeline scheduler

- Add multi-threaded keyword scheduler for periodic news collection
- Create Keyword Manager API for CRUD operations and monitoring
- Implement automatic pipeline triggering (RSS → Google → AI → Translation)
- Add thread status monitoring and dynamic keyword management
- Support priority-based execution and configurable intervals
- Add comprehensive scheduler documentation guide
- Default keywords: AI, 테크놀로지, 경제, 블록체인

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2025-09-15 17:09:22 +09:00
parent 070032006e
commit eeaa9dcb4b
39 changed files with 3472 additions and 759 deletions

View File

@ -3,17 +3,17 @@ FROM python:3.11-slim
WORKDIR /app
# 의존성 설치
COPY ./ai-summarizer/requirements.txt .
COPY ./ai-article-generator/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# 공통 모듈 복사
COPY ./shared /app/shared
# AI Summarizer 코드 복사
COPY ./ai-summarizer /app
# AI Article Generator 코드 복사
COPY ./ai-article-generator /app
# 환경변수
ENV PYTHONUNBUFFERED=1
# 실행
CMD ["python", "ai_summarizer.py"]
CMD ["python", "ai_article_generator.py"]

View File

@ -0,0 +1,300 @@
"""
AI Article Generator Service
Claude API를 사용한 뉴스 기사 생성 서비스
"""
import asyncio
import logging
import os
import sys
import json
from datetime import datetime
from typing import List, Dict, Any
from anthropic import AsyncAnthropic
from motor.motor_asyncio import AsyncIOMotorClient
# Import from shared module
from shared.models import PipelineJob, EnrichedItem, FinalArticle, Subtopic, Entities, NewsReference
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class AIArticleGeneratorWorker:
    """Queue worker that turns one enriched RSS item into a full Korean
    news article via the Claude API and stores it in MongoDB.

    Pipeline position: search_enrichment -> ai_article_generation -> image_generation.
    """

    def __init__(self):
        # Redis-backed job queue shared by all pipeline services
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )
        self.claude_api_key = os.getenv("CLAUDE_API_KEY")
        self.claude_client = None  # AsyncAnthropic, created in start()
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "ai_writer_db")  # uses ai_writer_db
        self.db = None  # Motor database handle, set in start()

    async def start(self):
        """Start the worker: connect to Redis/MongoDB, init Claude, loop forever."""
        logger.info("Starting AI Article Generator Worker")
        # Connect to Redis
        await self.queue_manager.connect()
        # Connect to MongoDB
        client = AsyncIOMotorClient(self.mongodb_url)
        self.db = client[self.db_name]
        # Initialise the Claude client; without a key the worker cannot run
        if self.claude_api_key:
            self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
        else:
            logger.error("Claude API key not configured")
            return
        # Main processing loop
        while True:
            try:
                # Fetch the next job from the queue (blocks up to 5s)
                job = await self.queue_manager.dequeue('ai_article_generation', timeout=5)
                if job:
                    await self.process_job(job)
            except Exception as e:
                logger.error(f"Error in worker loop: {e}")
                await asyncio.sleep(1)

    async def process_job(self, job: PipelineJob):
        """Handle one AI article-generation job for a single RSS item."""
        try:
            start_time = datetime.now()
            logger.info(f"Processing job {job.job_id} for AI article generation")
            # Single enriched item
            enriched_item_data = job.data.get('enriched_item')
            if not enriched_item_data:
                # Backwards compatibility with older jobs carrying a list
                enriched_items = job.data.get('enriched_items', [])
                if enriched_items:
                    enriched_item_data = enriched_items[0]
                else:
                    logger.warning(f"No enriched item in job {job.job_id}")
                    await self.queue_manager.mark_failed(
                        'ai_article_generation',
                        job,
                        "No enriched item to process"
                    )
                    return
            enriched_item = EnrichedItem(**enriched_item_data)
            # Generate the article
            article = await self._generate_article(job, enriched_item)
            # Record processing time
            processing_time = (datetime.now() - start_time).total_seconds()
            article.processing_time = processing_time
            # Persist to MongoDB (ai_writer_db.articles_ko)
            result = await self.db.articles_ko.insert_one(article.model_dump())
            mongodb_id = str(result.inserted_id)
            logger.info(f"Article {article.news_id} saved to MongoDB with _id: {mongodb_id}")
            # Hand off to the next stage (image generation)
            job.data['news_id'] = article.news_id
            job.data['mongodb_id'] = mongodb_id
            job.stages_completed.append('ai_article_generation')
            job.stage = 'image_generation'
            await self.queue_manager.enqueue('image_generation', job)
            await self.queue_manager.mark_completed('ai_article_generation', job.job_id)
        except Exception as e:
            logger.error(f"Error processing job {job.job_id}: {e}")
            await self.queue_manager.mark_failed('ai_article_generation', job, str(e))

    async def _generate_article(self, job: PipelineJob, enriched_item: EnrichedItem) -> FinalArticle:
        """Generate an article with Claude; fall back to a minimal article on any error."""
        # RSS item info
        rss_item = enriched_item.rss_item
        search_results = enriched_item.search_results
        # Prepare search-result text (up to 10 entries)
        search_text = ""
        if search_results:
            search_text = "\n관련 검색 결과:\n"
            for idx, result in enumerate(search_results[:10], 1):
                search_text += f"{idx}. {result.title}\n"
                if result.snippet:
                    search_text += f" {result.snippet}\n"
        # Prompt asking Claude for a structured JSON article (Korean)
        prompt = f"""다음 뉴스 정보를 바탕으로 상세한 기사를 작성해주세요.
키워드: {job.keyword}
뉴스 정보:
제목: {rss_item.title}
요약: {rss_item.summary or '내용 없음'}
링크: {rss_item.link}
{search_text}
다음 JSON 형식으로 작성해주세요:
{{
"title": "기사 제목 (50자 이내)",
"summary": "한 줄 요약 (100자 이내)",
"subtopics": [
{{
"title": "소제목1",
"content": ["문단1", "문단2", "문단3"]
}},
{{
"title": "소제목2",
"content": ["문단1", "문단2"]
}},
{{
"title": "소제목3",
"content": ["문단1", "문단2"]
}}
],
"categories": ["카테고리1", "카테고리2"],
"entities": {{
"people": ["인물1", "인물2"],
"organizations": ["조직1", "조직2"],
"groups": ["그룹1"],
"countries": ["국가1"],
"events": ["이벤트1"]
}}
}}
요구사항:
- 3개의 소제목로 구성
- 각 소제목별로 2-3개 문단
- 전문적이고 객관적인 톤
- 한국어로 작성
- 실제 정보를 바탕으로 구체적으로 작성"""
        try:
            response = await self.claude_client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=4000,
                temperature=0.7,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # Extract the JSON object from the model's free-text response
            content_text = response.content[0].text
            json_start = content_text.find('{')
            json_end = content_text.rfind('}') + 1
            if json_start != -1 and json_end > json_start:
                article_data = json.loads(content_text[json_start:json_end])
            else:
                raise ValueError("No valid JSON in response")
            # Build Subtopic objects
            subtopics = []
            for subtopic_data in article_data.get('subtopics', []):
                subtopics.append(Subtopic(
                    title=subtopic_data.get('title', ''),
                    content=subtopic_data.get('content', [])
                ))
            # Build the Entities object
            entities_data = article_data.get('entities', {})
            entities = Entities(
                people=entities_data.get('people', []),
                organizations=entities_data.get('organizations', []),
                groups=entities_data.get('groups', []),
                countries=entities_data.get('countries', []),
                events=entities_data.get('events', [])
            )
            # Build references
            references = []
            # The RSS original comes first
            references.append(NewsReference(
                title=rss_item.title,
                link=rss_item.link,
                source=rss_item.source_feed,
                published=rss_item.published
            ))
            # Add search-result references (up to 9 — 10 total with the RSS original)
            for search_result in search_results[:9]:  # top 9 only
                references.append(NewsReference(
                    title=search_result.title,
                    link=search_result.link,
                    source=search_result.source,
                    published=None
                ))
            # Build the FinalArticle (ai_writer_db.articles schema)
            article = FinalArticle(
                title=article_data.get('title', rss_item.title),
                summary=article_data.get('summary', ''),
                subtopics=subtopics,
                categories=article_data.get('categories', []),
                entities=entities,
                source_keyword=job.keyword,
                source_count=len(references),
                references=references,
                job_id=job.job_id,
                keyword_id=job.keyword_id,
                pipeline_stages=job.stages_completed.copy(),
                language='ko',
                rss_guid=rss_item.guid  # store the RSS GUID
            )
            return article
        except Exception as e:
            logger.error(f"Error generating article: {e}")
            # Fallback article built from the raw RSS item
            fallback_references = [NewsReference(
                title=rss_item.title,
                link=rss_item.link,
                source=rss_item.source_feed,
                published=rss_item.published
            )]
            return FinalArticle(
                title=rss_item.title,
                summary=rss_item.summary[:100] if rss_item.summary else '',
                subtopics=[
                    Subtopic(
                        title="주요 내용",
                        content=[rss_item.summary or rss_item.title]
                    )
                ],
                categories=['자동생성'],
                entities=Entities(),
                source_keyword=job.keyword,
                source_count=1,
                references=fallback_references,
                job_id=job.job_id,
                keyword_id=job.keyword_id,
                pipeline_stages=job.stages_completed.copy(),
                language='ko',
                rss_guid=rss_item.guid  # store the RSS GUID
            )

    async def stop(self):
        """Stop the worker and release the queue connection."""
        await self.queue_manager.disconnect()
        logger.info("AI Article Generator Worker stopped")
async def main():
    """Process entry point: run the article-generator worker until interrupted."""
    article_worker = AIArticleGeneratorWorker()
    try:
        await article_worker.start()
    except KeyboardInterrupt:
        # Ctrl-C: fall through to cleanup
        logger.info("Received interrupt signal")
    finally:
        await article_worker.stop()


if __name__ == "__main__":
    asyncio.run(main())

View File

@ -1,5 +1,5 @@
anthropic==0.50.0
motor==3.1.1
pymongo==4.3.3
redis[hiredis]==5.0.1
pydantic==2.5.0
pydantic==2.5.0
motor==3.1.1
pymongo==4.3.3

View File

@ -1,161 +0,0 @@
"""
AI Summarizer Service
Claude API를 사용한 뉴스 요약 서비스
"""
import asyncio
import logging
import os
import sys
from typing import List, Dict, Any
from anthropic import AsyncAnthropic
# Import from shared module
from shared.models import PipelineJob, EnrichedItem, SummarizedItem
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class AISummarizerWorker:
    """Queue worker that summarizes enriched RSS items with the Claude API.

    Pipeline position: search_enrichment -> ai_summarization -> translation.
    """

    def __init__(self):
        # Redis-backed job queue shared by all pipeline services
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )
        self.claude_api_key = os.getenv("CLAUDE_API_KEY")
        self.claude_client = None  # AsyncAnthropic, created in start()

    async def start(self):
        """Start the worker: connect to Redis, init Claude, then loop forever."""
        logger.info("Starting AI Summarizer Worker")
        # Connect to Redis
        await self.queue_manager.connect()
        # Initialise the Claude client; without a key the worker cannot run
        if self.claude_api_key:
            self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
        else:
            logger.error("Claude API key not configured")
            return
        # Main processing loop
        while True:
            try:
                # Fetch the next job from the queue (blocks up to 5s)
                job = await self.queue_manager.dequeue('ai_summarization', timeout=5)
                if job:
                    await self.process_job(job)
            except Exception as e:
                logger.error(f"Error in worker loop: {e}")
                await asyncio.sleep(1)

    async def process_job(self, job: PipelineJob):
        """Summarize every enriched item in the job and forward to translation."""
        try:
            logger.info(f"Processing job {job.job_id} for AI summarization")
            enriched_items = job.data.get('enriched_items', [])
            summarized_items = []
            for item_data in enriched_items:
                enriched_item = EnrichedItem(**item_data)
                # Generate the AI summary
                summary = await self._generate_summary(enriched_item)
                summarized_item = SummarizedItem(
                    enriched_item=enriched_item,
                    ai_summary=summary,
                    summary_language='ko'
                )
                summarized_items.append(summarized_item)
                # Throttle API calls
                await asyncio.sleep(1)
            if summarized_items:
                logger.info(f"Summarized {len(summarized_items)} items")
                # Hand off to the next stage (translation)
                job.data['summarized_items'] = [item.dict() for item in summarized_items]
                job.stages_completed.append('ai_summarization')
                job.stage = 'translation'
                await self.queue_manager.enqueue('translation', job)
                await self.queue_manager.mark_completed('ai_summarization', job.job_id)
            else:
                logger.warning(f"No items summarized for job {job.job_id}")
                await self.queue_manager.mark_failed(
                    'ai_summarization',
                    job,
                    "No items to summarize"
                )
        except Exception as e:
            logger.error(f"Error processing job {job.job_id}: {e}")
            await self.queue_manager.mark_failed('ai_summarization', job, str(e))

    async def _generate_summary(self, enriched_item: EnrichedItem) -> str:
        """Produce a <=200-char Korean summary; fall back to the RSS summary on error."""
        try:
            # Prepare the content
            content_parts = [
                f"제목: {enriched_item.rss_item.title}",
                f"요약: {enriched_item.rss_item.summary or '없음'}"
            ]
            # Append search results (top 3)
            if enriched_item.search_results:
                content_parts.append("\n관련 검색 결과:")
                for idx, result in enumerate(enriched_item.search_results[:3], 1):
                    content_parts.append(f"{idx}. {result.title}")
                    if result.snippet:
                        content_parts.append(f" {result.snippet}")
            content = "\n".join(content_parts)
            # Call the Claude API
            prompt = f"""다음 뉴스 내용을 200자 이내로 핵심만 요약해주세요.
중요한 사실, 수치, 인물, 조직을 포함하고 객관적인 톤을 유지하세요.
{content}
요약:"""
            response = await self.claude_client.messages.create(
                model="claude-sonnet-4-20250514",  # latest Sonnet model
                max_tokens=500,
                temperature=0.3,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            summary = response.content[0].text.strip()
            return summary
        except Exception as e:
            logger.error(f"Error generating summary: {e}")
            # Fallback: use the original RSS summary (or the title)
            return enriched_item.rss_item.summary[:200] if enriched_item.rss_item.summary else enriched_item.rss_item.title

    async def stop(self):
        """Stop the worker and release the queue connection."""
        await self.queue_manager.disconnect()
        logger.info("AI Summarizer Worker stopped")
async def main():
    """Process entry point: run the summarizer worker until interrupted."""
    summarizer_worker = AISummarizerWorker()
    try:
        await summarizer_worker.start()
    except KeyboardInterrupt:
        # Ctrl-C: fall through to cleanup
        logger.info("Received interrupt signal")
    finally:
        await summarizer_worker.stop()


if __name__ == "__main__":
    asyncio.run(main())

View File

@ -1,3 +0,0 @@
anthropic==0.50.0
redis[hiredis]==5.0.1
pydantic==2.5.0

View File

@ -1,19 +0,0 @@
FROM python:3.11-slim
WORKDIR /app
# 의존성 설치
COPY ./article-assembly/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# 공통 모듈 복사
COPY ./shared /app/shared
# Article Assembly 코드 복사
COPY ./article-assembly /app
# 환경변수
ENV PYTHONUNBUFFERED=1
# 실행
CMD ["python", "article_assembly.py"]

View File

@ -1,234 +0,0 @@
"""
Article Assembly Service
최종 기사 조립 및 MongoDB 저장 서비스
"""
import asyncio
import logging
import os
import sys
import json
from datetime import datetime
from typing import List, Dict, Any
from anthropic import AsyncAnthropic
from motor.motor_asyncio import AsyncIOMotorClient
# Import from shared module
from shared.models import PipelineJob, SummarizedItem, FinalArticle
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ArticleAssemblyWorker:
    """Queue worker that assembles summarized items into one final article.

    Uses the Claude API to write a combined Korean article, stores it in
    MongoDB, and updates per-keyword generation statistics.
    """

    def __init__(self):
        # Redis-backed job queue shared by all pipeline services
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )
        self.claude_api_key = os.getenv("CLAUDE_API_KEY")
        self.claude_client = None  # AsyncAnthropic, created in start()
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "pipeline_db")
        self.db = None  # Motor database handle, set in start()

    async def start(self):
        """Start the worker: connect to Redis/MongoDB, init Claude, loop forever."""
        logger.info("Starting Article Assembly Worker")
        # Connect to Redis
        await self.queue_manager.connect()
        # Connect to MongoDB
        client = AsyncIOMotorClient(self.mongodb_url)
        self.db = client[self.db_name]
        # Initialise the Claude client; without a key the worker cannot run
        if self.claude_api_key:
            self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
        else:
            logger.error("Claude API key not configured")
            return
        # Main processing loop
        while True:
            try:
                # Fetch the next job from the queue (blocks up to 5s)
                job = await self.queue_manager.dequeue('article_assembly', timeout=5)
                if job:
                    await self.process_job(job)
            except Exception as e:
                logger.error(f"Error in worker loop: {e}")
                await asyncio.sleep(1)

    async def process_job(self, job: PipelineJob):
        """Assemble the final article for one job and persist it."""
        try:
            start_time = datetime.now()
            logger.info(f"Processing job {job.job_id} for article assembly")
            summarized_items = job.data.get('summarized_items', [])
            if not summarized_items:
                logger.warning(f"No items to assemble for job {job.job_id}")
                await self.queue_manager.mark_failed(
                    'article_assembly',
                    job,
                    "No items to assemble"
                )
                return
            # Generate the final article
            article = await self._generate_final_article(job, summarized_items)
            # Record processing time
            processing_time = (datetime.now() - start_time).total_seconds()
            article.processing_time = processing_time
            # Persist to MongoDB
            await self.db.articles.insert_one(article.dict())
            logger.info(f"Article {article.article_id} saved to MongoDB")
            # Mark the stage complete
            job.stages_completed.append('article_assembly')
            await self.queue_manager.mark_completed('article_assembly', job.job_id)
            # Update per-keyword statistics
            await self._update_statistics(job.keyword_id)
        except Exception as e:
            logger.error(f"Error processing job {job.job_id}: {e}")
            await self.queue_manager.mark_failed('article_assembly', job, str(e))

    async def _generate_final_article(
        self,
        job: PipelineJob,
        summarized_items: List[Dict]
    ) -> FinalArticle:
        """Write a combined article with Claude; fall back to a stub on error."""
        # Prepare the per-item text
        items_text = []
        for idx, item_data in enumerate(summarized_items, 1):
            item = SummarizedItem(**item_data)
            items_text.append(f"""
[뉴스 {idx}]
제목: {item.enriched_item['rss_item']['title']}
요약: {item.ai_summary}
출처: {item.enriched_item['rss_item']['link']}
""")
        content = "\n".join(items_text)
        # Prompt asking Claude for a combined JSON article (Korean)
        prompt = f"""다음 뉴스 항목들을 바탕으로 종합적인 기사를 작성해주세요.
키워드: {job.keyword}
뉴스 항목들:
{content}
다음 JSON 형식으로 작성해주세요:
{{
"title": "종합 기사 제목",
"content": "기사 본문 (1500자 이내, 문단 구분)",
"summary": "한 줄 요약 (100자 이내)",
"categories": ["카테고리1", "카테고리2"],
"tags": ["태그1", "태그2", "태그3"]
}}
요구사항:
- 전문적이고 객관적인 톤
- 핵심 정보와 트렌드 파악
- 시사점 포함
- 한국 독자 대상"""
        try:
            response = await self.claude_client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=3000,
                temperature=0.7,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # Extract the JSON object from the model's free-text response
            content_text = response.content[0].text
            json_start = content_text.find('{')
            json_end = content_text.rfind('}') + 1
            if json_start != -1 and json_end > json_start:
                article_data = json.loads(content_text[json_start:json_end])
            else:
                raise ValueError("No valid JSON in response")
            # Build the FinalArticle
            article = FinalArticle(
                job_id=job.job_id,
                keyword_id=job.keyword_id,
                keyword=job.keyword,
                title=article_data.get('title', f"{job.keyword} 종합 뉴스"),
                content=article_data.get('content', ''),
                summary=article_data.get('summary', ''),
                source_items=[],  # simplified
                images=[],  # images are handled by a separate service
                categories=article_data.get('categories', []),
                tags=article_data.get('tags', []),
                pipeline_stages=job.stages_completed,
                processing_time=0  # updated later by process_job
            )
            return article
        except Exception as e:
            logger.error(f"Error generating article: {e}")
            # Fallback article built from the concatenated item text
            return FinalArticle(
                job_id=job.job_id,
                keyword_id=job.keyword_id,
                keyword=job.keyword,
                title=f"{job.keyword} 뉴스 요약 - {datetime.now().strftime('%Y-%m-%d')}",
                content=content,
                summary=f"{job.keyword} 관련 {len(summarized_items)}개 뉴스 요약",
                source_items=[],
                images=[],
                categories=['자동생성'],
                tags=[job.keyword],
                pipeline_stages=job.stages_completed,
                processing_time=0
            )

    async def _update_statistics(self, keyword_id: str):
        """Upsert per-keyword counters (articles generated, last generation time)."""
        try:
            await self.db.keyword_stats.update_one(
                {"keyword_id": keyword_id},
                {
                    "$inc": {"articles_generated": 1},
                    "$set": {"last_generated": datetime.now()}
                },
                upsert=True
            )
        except Exception as e:
            logger.error(f"Error updating statistics: {e}")

    async def stop(self):
        """Stop the worker and release the queue connection."""
        await self.queue_manager.disconnect()
        logger.info("Article Assembly Worker stopped")
async def main():
    """Process entry point: run the assembly worker until interrupted."""
    assembly_worker = ArticleAssemblyWorker()
    try:
        await assembly_worker.start()
    except KeyboardInterrupt:
        # Ctrl-C: fall through to cleanup
        logger.info("Received interrupt signal")
    finally:
        await assembly_worker.stop()


if __name__ == "__main__":
    asyncio.run(main())

View File

@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""키워드 데이터베이스 확인 스크립트"""
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
async def check_keywords():
    """Print a human-readable dump of all keyword documents.

    Developer diagnostic: connects to a local MongoDB, lists every document
    in ai_writer_db.keywords, and shows its schedule/priority metadata.

    Fix: the original called client.close() only on the success path, so any
    missing key or lookup error leaked the connection — close in `finally`.
    """
    client = AsyncIOMotorClient("mongodb://localhost:27017")
    try:
        db = client.ai_writer_db
        # Fetch all keyword documents at once (diagnostic tool, small collection)
        keywords = await db.keywords.find().to_list(None)
        print(f"\n=== 등록된 키워드: {len(keywords)}개 ===\n")
        for kw in keywords:
            print(f"키워드: {kw['keyword']}")
            print(f" - ID: {kw['keyword_id']}")
            print(f" - 간격: {kw['interval_minutes']}")
            print(f" - 활성화: {kw['is_active']}")
            print(f" - 우선순위: {kw['priority']}")
            print(f" - RSS 피드: {len(kw.get('rss_feeds', []))}")
            if kw.get('last_run'):
                print(f" - 마지막 실행: {kw['last_run']}")
            if kw.get('next_run'):
                next_run = kw['next_run']
                # NOTE(review): assumes next_run is a naive local datetime,
                # matching datetime.now() — confirm how the scheduler writes it
                remaining = (next_run - datetime.now()).total_seconds() / 60
                print(f" - 다음 실행: {next_run} ({remaining:.1f}분 후)")
            print()
    finally:
        # Always release the connection, even if a lookup/print raises
        client.close()


if __name__ == "__main__":
    asyncio.run(check_keywords())

View File

@ -0,0 +1,85 @@
{
"enabled_languages": [
{
"code": "en",
"name": "English",
"deepl_code": "EN",
"collection": "articles_en",
"enabled": true
},
{
"code": "zh-CN",
"name": "Chinese (Simplified)",
"deepl_code": "ZH",
"collection": "articles_zh_cn",
"enabled": false
},
{
"code": "zh-TW",
"name": "Chinese (Traditional)",
"deepl_code": "ZH-HANT",
"collection": "articles_zh_tw",
"enabled": false
},
{
"code": "ja",
"name": "Japanese",
"deepl_code": "JA",
"collection": "articles_ja",
"enabled": false
},
{
"code": "fr",
"name": "French",
"deepl_code": "FR",
"collection": "articles_fr",
"enabled": false
},
{
"code": "de",
"name": "German",
"deepl_code": "DE",
"collection": "articles_de",
"enabled": false
},
{
"code": "es",
"name": "Spanish",
"deepl_code": "ES",
"collection": "articles_es",
"enabled": false
},
{
"code": "pt",
"name": "Portuguese",
"deepl_code": "PT",
"collection": "articles_pt",
"enabled": false
},
{
"code": "ru",
"name": "Russian",
"deepl_code": "RU",
"collection": "articles_ru",
"enabled": false
},
{
"code": "it",
"name": "Italian",
"deepl_code": "IT",
"collection": "articles_it",
"enabled": false
}
],
"source_language": {
"code": "ko",
"name": "Korean",
"collection": "articles_ko"
},
"translation_settings": {
"batch_size": 5,
"delay_between_languages": 2.0,
"delay_between_articles": 0.5,
"max_retries": 3
}
}

View File

@ -48,46 +48,45 @@ class GoogleSearchWorker:
await asyncio.sleep(1)
async def process_job(self, job: PipelineJob):
"""검색 강화 작업 처리"""
"""검색 강화 작업 처리 - 단일 RSS 아이템"""
try:
logger.info(f"Processing job {job.job_id} for search enrichment")
rss_items = job.data.get('rss_items', [])
enriched_items = []
# 최대 5개 항목만 처리 (API 할당량 관리)
for item_data in rss_items[:5]:
rss_item = RSSItem(**item_data)
# 제목으로 Google 검색
search_results = await self._search_google(rss_item.title)
enriched_item = EnrichedItem(
rss_item=rss_item,
search_results=search_results
)
enriched_items.append(enriched_item)
# API 속도 제한
await asyncio.sleep(0.5)
if enriched_items:
logger.info(f"Enriched {len(enriched_items)} items with search results")
# 다음 단계로 전달
job.data['enriched_items'] = [item.dict() for item in enriched_items]
job.stages_completed.append('search_enrichment')
job.stage = 'ai_summarization'
await self.queue_manager.enqueue('ai_summarization', job)
await self.queue_manager.mark_completed('search_enrichment', job.job_id)
else:
logger.warning(f"No items enriched for job {job.job_id}")
await self.queue_manager.mark_failed(
'search_enrichment',
job,
"No items to enrich"
)
# 단일 RSS 아이템 처리
rss_item_data = job.data.get('rss_item')
if not rss_item_data:
# 이전 버전 호환성 - 여러 아이템 처리
rss_items = job.data.get('rss_items', [])
if rss_items:
rss_item_data = rss_items[0] # 첫 번째 아이템만 처리
else:
logger.warning(f"No RSS item in job {job.job_id}")
await self.queue_manager.mark_failed(
'search_enrichment',
job,
"No RSS item to process"
)
return
rss_item = RSSItem(**rss_item_data)
# 제목으로 Google 검색
search_results = await self._search_google(rss_item.title)
enriched_item = EnrichedItem(
rss_item=rss_item,
search_results=search_results
)
logger.info(f"Enriched item with {len(search_results)} search results")
# 다음 단계로 전달 - 단일 enriched item
job.data['enriched_item'] = enriched_item.dict()
job.stages_completed.append('search_enrichment')
job.stage = 'ai_article_generation'
await self.queue_manager.enqueue('ai_article_generation', job)
await self.queue_manager.mark_completed('search_enrichment', job.job_id)
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")

View File

@ -10,9 +10,11 @@ import base64
from typing import List, Dict, Any
import httpx
from io import BytesIO
from motor.motor_asyncio import AsyncIOMotorClient
from bson import ObjectId
# Import from shared module
from shared.models import PipelineJob, TranslatedItem, GeneratedImageItem
from shared.models import PipelineJob
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
@ -23,107 +25,136 @@ class ImageGeneratorWorker:
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.replicate_api_key = os.getenv("REPLICATE_API_KEY")
self.replicate_api_key = os.getenv("REPLICATE_API_TOKEN")
self.replicate_api_url = "https://api.replicate.com/v1/predictions"
# Stable Diffusion 모델 사용
self.model_version = "stability-ai/sdxl:39ed52f2a78e934b3ba6e2a89f5b1c712de7dfea535525255b1aa35c5565e08b"
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
async def start(self):
"""워커 시작"""
logger.info("Starting Image Generator Worker")
# Redis 연결
await self.queue_manager.connect()
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# API 키 확인
if not self.replicate_api_key:
logger.warning("Replicate API key not configured - using placeholder images")
# 메인 처리 루프
while True:
try:
# 큐에서 작업 가져오기
job = await self.queue_manager.dequeue('image_generation', timeout=5)
if job:
await self.process_job(job)
except Exception as e:
logger.error(f"Error in worker loop: {e}")
await asyncio.sleep(1)
async def process_job(self, job: PipelineJob):
"""이미지 생성 작업 처리"""
"""이미지 생성 및 MongoDB 업데이트"""
try:
logger.info(f"Processing job {job.job_id} for image generation")
translated_items = job.data.get('translated_items', [])
generated_items = []
# 최대 3개 아이템만 이미지 생성 (API 비용 절감)
for idx, item_data in enumerate(translated_items[:3]):
translated_item = TranslatedItem(**item_data)
# 이미지 생성을 위한 프롬프트 생성
prompt = self._create_image_prompt(translated_item)
# 이미지 생성
# MongoDB에서 기사 정보 가져오기
news_id = job.data.get('news_id')
mongodb_id = job.data.get('mongodb_id')
if not news_id:
logger.error(f"No news_id in job {job.job_id}")
await self.queue_manager.mark_failed('image_generation', job, "No news_id")
return
# MongoDB에서 한국어 기사 조회 (articles_ko)
article = await self.db.articles_ko.find_one({"news_id": news_id})
if not article:
logger.error(f"Article {news_id} not found in MongoDB")
await self.queue_manager.mark_failed('image_generation', job, "Article not found")
return
# 이미지 생성을 위한 프롬프트 생성 (한국어 기사 기반)
prompt = self._create_image_prompt_from_article(article)
# 이미지 생성 (최대 3개)
image_urls = []
for i in range(min(3, 1)): # 테스트를 위해 1개만 생성
image_url = await self._generate_image(prompt)
generated_item = GeneratedImageItem(
translated_item=translated_item,
image_url=image_url,
image_prompt=prompt
)
generated_items.append(generated_item)
image_urls.append(image_url)
# API 속도 제한
if self.replicate_api_key:
if self.replicate_api_key and i < 2:
await asyncio.sleep(2)
if generated_items:
logger.info(f"Generated images for {len(generated_items)} items")
# 완료된 데이터를 job에 저장
job.data['generated_items'] = [item.dict() for item in generated_items]
job.stages_completed.append('image_generation')
job.stage = 'completed'
# 최종 기사 조립 단계로 전달 (이미 article-assembly로 수정)
await self.queue_manager.enqueue('article_assembly', job)
await self.queue_manager.mark_completed('image_generation', job.job_id)
else:
logger.warning(f"No images generated for job {job.job_id}")
# 이미지 생성 실패해도 다음 단계로 진행
job.stages_completed.append('image_generation')
await self.queue_manager.enqueue('article_assembly', job)
await self.queue_manager.mark_completed('image_generation', job.job_id)
# MongoDB 업데이트 (이미지 추가 - articles_ko)
await self.db.articles_ko.update_one(
{"news_id": news_id},
{
"$set": {
"images": image_urls,
"image_prompt": prompt
},
"$addToSet": {
"pipeline_stages": "image_generation"
}
}
)
logger.info(f"Updated article {news_id} with {len(image_urls)} images")
# 다음 단계로 전달 (번역)
job.stages_completed.append('image_generation')
job.stage = 'translation'
await self.queue_manager.enqueue('translation', job)
await self.queue_manager.mark_completed('image_generation', job.job_id)
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")
# 이미지 생성 실패해도 다음 단계로 진행
job.stages_completed.append('image_generation')
await self.queue_manager.enqueue('article_assembly', job)
await self.queue_manager.mark_completed('image_generation', job.job_id)
def _create_image_prompt(self, translated_item: TranslatedItem) -> str:
"""이미지 생성을 위한 프롬프트 생성"""
# 영문 제목과 요약을 기반으로 프롬프트 생성
title = translated_item.translated_title or translated_item.summarized_item['enriched_item']['rss_item']['title']
summary = translated_item.translated_summary or translated_item.summarized_item['ai_summary']
await self.queue_manager.mark_failed('image_generation', job, str(e))
def _create_image_prompt_from_article(self, article: Dict) -> str:
"""기사로부터 이미지 프롬프트 생성"""
# 키워드와 제목을 기반으로 프롬프트 생성
keyword = article.get('keyword', '')
title = article.get('title', '')
categories = article.get('categories', [])
# 카테고리 맵핑 (한글 -> 영어)
category_map = {
'기술': 'technology',
'경제': 'business',
'정치': 'politics',
'교육': 'education',
'사회': 'society',
'문화': 'culture',
'과학': 'science'
}
eng_categories = [category_map.get(cat, cat) for cat in categories]
category_str = ', '.join(eng_categories[:2]) if eng_categories else 'news'
# 뉴스 관련 이미지를 위한 프롬프트
prompt = f"News illustration for: {title[:100]}, professional, photorealistic, high quality, 4k"
prompt = f"News illustration for {keyword} {category_str}, professional, modern, clean design, high quality, 4k, no text"
return prompt
async def _generate_image(self, prompt: str) -> str:
"""Replicate API를 사용한 이미지 생성"""
try:
if not self.replicate_api_key:
# API 키가 없으면 플레이스홀더 이미지 URL 반환
return "https://via.placeholder.com/800x600.png?text=News+Image"
async with httpx.AsyncClient() as client:
# 예측 생성 요청
response = await client.post(
@ -149,22 +180,22 @@ class ImageGeneratorWorker:
},
timeout=60
)
if response.status_code in [200, 201]:
result = response.json()
prediction_id = result.get('id')
# 예측 결과 폴링
image_url = await self._poll_prediction(prediction_id)
return image_url
else:
logger.error(f"Replicate API error: {response.status_code}")
return "https://via.placeholder.com/800x600.png?text=Generation+Failed"
except Exception as e:
logger.error(f"Error generating image: {e}")
return "https://via.placeholder.com/800x600.png?text=Error"
async def _poll_prediction(self, prediction_id: str, max_attempts: int = 30) -> str:
"""예측 결과 폴링"""
try:
@ -177,11 +208,11 @@ class ImageGeneratorWorker:
},
timeout=30
)
if response.status_code == 200:
result = response.json()
status = result.get('status')
if status == 'succeeded':
output = result.get('output')
if output and isinstance(output, list) and len(output) > 0:
@ -191,20 +222,20 @@ class ImageGeneratorWorker:
elif status == 'failed':
logger.error(f"Prediction failed: {result.get('error')}")
return "https://via.placeholder.com/800x600.png?text=Failed"
# 아직 처리중이면 대기
await asyncio.sleep(2)
else:
logger.error(f"Error polling prediction: {response.status_code}")
return "https://via.placeholder.com/800x600.png?text=Poll+Error"
# 최대 시도 횟수 초과
return "https://via.placeholder.com/800x600.png?text=Timeout"
except Exception as e:
logger.error(f"Error polling prediction: {e}")
return "https://via.placeholder.com/800x600.png?text=Poll+Exception"
async def stop(self):
"""워커 중지"""
await self.queue_manager.disconnect()
@ -213,7 +244,7 @@ class ImageGeneratorWorker:
async def main():
"""메인 함수"""
worker = ImageGeneratorWorker()
try:
await worker.start()
except KeyboardInterrupt:

View File

@ -1,3 +1,5 @@
httpx==0.25.0
redis[hiredis]==5.0.1
pydantic==2.5.0
pydantic==2.5.0
motor==3.1.1
pymongo==4.3.3

View File

@ -46,7 +46,7 @@ async def startup_event():
# MongoDB 연결
mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
mongodb_client = AsyncIOMotorClient(mongodb_url)
db = mongodb_client[os.getenv("DB_NAME", "pipeline_db")]
db = mongodb_client[os.getenv("DB_NAME", "ai_writer_db")]
logger.info("Pipeline Monitor started successfully")

View File

@ -1,4 +1,5 @@
feedparser==6.0.11
aiohttp==3.9.1
redis[hiredis]==5.0.1
pydantic==2.5.0
pydantic==2.5.0
motor==3.6.0

View File

@ -11,6 +11,7 @@ from datetime import datetime
import feedparser
import aiohttp
import redis.asyncio as redis
from motor.motor_asyncio import AsyncIOMotorClient
from typing import List, Dict, Any
# Import from shared module
@ -27,13 +28,16 @@ class RSSCollectorWorker:
)
self.redis_client = None
self.redis_url = os.getenv("REDIS_URL", "redis://redis:6379")
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
self.dedup_ttl = 86400 * 7 # 7일간 중복 방지
self.max_items_per_feed = 10 # 피드당 최대 항목 수
self.max_items_per_feed = 100 # 피드당 최대 항목 수 (Google News는 최대 100개)
async def start(self):
"""워커 시작"""
logger.info("Starting RSS Collector Worker")
# Redis 연결
await self.queue_manager.connect()
self.redis_client = await redis.from_url(
@ -41,6 +45,10 @@ class RSSCollectorWorker:
encoding="utf-8",
decode_responses=True
)
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# 메인 처리 루프
while True:
@ -60,9 +68,20 @@ class RSSCollectorWorker:
try:
logger.info(f"Processing job {job.job_id} for keyword '{job.keyword}'")
keyword = job.data.get('keyword', '')
keyword = job.keyword # keyword는 job의 직접 속성
rss_feeds = job.data.get('rss_feeds', [])
# RSS 피드가 없으면 기본 피드 사용
if not rss_feeds:
# 기본 RSS 피드 추가 (Google News RSS)
rss_feeds = [
f"https://news.google.com/rss/search?q={keyword}&hl=en-US&gl=US&ceid=US:en",
f"https://news.google.com/rss/search?q={keyword}&hl=ko&gl=KR&ceid=KR:ko",
"https://feeds.bbci.co.uk/news/technology/rss.xml",
"https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"
]
logger.info(f"Using default RSS feeds for keyword: {keyword}")
# 키워드가 포함된 RSS URL 생성
processed_feeds = self._prepare_feeds(rss_feeds, keyword)
@ -78,25 +97,53 @@ class RSSCollectorWorker:
if all_items:
# 중복 제거
unique_items = await self._deduplicate_items(all_items, keyword)
if unique_items:
logger.info(f"Collected {len(unique_items)} unique items for '{keyword}'")
# 다음 단계로 전달
job.data['rss_items'] = [item.dict() for item in unique_items]
job.stages_completed.append('rss_collection')
job.stage = 'search_enrichment'
await self.queue_manager.enqueue('search_enrichment', job)
# 각 RSS 아이템별로 개별 job 생성하여 다음 단계로 전달
# 시간 지연을 추가하여 API 호출 분산 (초기값: 1초, 점진적으로 조정 가능)
enqueue_delay = float(os.getenv("RSS_ENQUEUE_DELAY", "1.0"))
for idx, item in enumerate(unique_items):
# 각 아이템별로 새로운 job 생성
item_job = PipelineJob(
keyword_id=f"{job.keyword_id}_{idx}",
keyword=job.keyword,
stage='search_enrichment',
data={
'rss_item': item.dict(), # 단일 아이템
'original_job_id': job.job_id,
'item_index': idx,
'total_items': len(unique_items),
'item_hash': hashlib.md5(
f"{keyword}:guid:{item.guid}".encode() if item.guid
else f"{keyword}:title:{item.title}:link:{item.link}".encode()
).hexdigest() # GUID 또는 title+link 해시
},
stages_completed=['rss_collection']
)
# 개별 아이템을 다음 단계로 전달
await self.queue_manager.enqueue('search_enrichment', item_job)
logger.info(f"Enqueued item {idx+1}/{len(unique_items)} for keyword '{keyword}'")
# 다음 아이템 enqueue 전에 지연 추가 (마지막 아이템 제외)
if idx < len(unique_items) - 1:
await asyncio.sleep(enqueue_delay)
logger.debug(f"Waiting {enqueue_delay}s before next item...")
# 원본 job 완료 처리
await self.queue_manager.mark_completed('rss_collection', job.job_id)
logger.info(f"Completed RSS collection for job {job.job_id}: {len(unique_items)} items processed")
else:
logger.info(f"No new items found for '{keyword}'")
logger.info(f"No new items found for '{keyword}' after deduplication")
await self.queue_manager.mark_completed('rss_collection', job.job_id)
else:
logger.warning(f"No RSS items collected for '{keyword}'")
await self.queue_manager.mark_failed(
'rss_collection',
job,
'rss_collection',
job,
"No RSS items collected"
)
@ -126,21 +173,34 @@ class RSSCollectorWorker:
# feedparser로 파싱
feed = feedparser.parse(content)
logger.info(f"Found {len(feed.entries)} entries in feed {feed_url}")
for entry in feed.entries[:self.max_items_per_feed]:
# 키워드 관련성 체크
title = entry.get('title', '')
summary = entry.get('summary', '')
# 제목이나 요약에 키워드가 포함된 경우
if keyword.lower() in title.lower() or keyword.lower() in summary.lower():
# 대소문자 무시하고 키워드 매칭 (영문의 경우)
title_lower = title.lower() if keyword.isascii() else title
summary_lower = summary.lower() if keyword.isascii() else summary
keyword_lower = keyword.lower() if keyword.isascii() else keyword
# 제목이나 요약에 키워드가 포함된 경우
# Google News RSS는 이미 키워드 검색 결과이므로 모든 항목 포함
if "news.google.com" in feed_url or keyword_lower in title_lower or keyword_lower in summary_lower:
# GUID 추출 (Google RSS에서 일반적으로 사용)
guid = entry.get('id', entry.get('guid', ''))
item = RSSItem(
title=title,
link=entry.get('link', ''),
guid=guid, # GUID 추가
published=entry.get('published', ''),
summary=summary[:500] if summary else '',
source_feed=feed_url
)
items.append(item)
logger.debug(f"Added item: {title[:50]}... (guid: {guid[:30] if guid else 'no-guid'})")
except Exception as e:
logger.error(f"Error fetching RSS feed {feed_url}: {e}")
@ -148,26 +208,44 @@ class RSSCollectorWorker:
return items
async def _deduplicate_items(self, items: List[RSSItem], keyword: str) -> List[RSSItem]:
"""중복 항목 제거"""
"""중복 항목 제거 - GUID 또는 링크 기준으로만 중복 체크"""
unique_items = []
dedup_key = f"dedup:{keyword}"
seen_guids = set() # 현재 배치에서 본 GUID
seen_links = set() # 현재 배치에서 본 링크
for item in items:
# 제목 해시 생성
item_hash = hashlib.md5(
f"{keyword}:{item.title}".encode()
).hexdigest()
# Redis Set으로 중복 확인
is_new = await self.redis_client.sadd(dedup_key, item_hash)
if is_new:
unique_items.append(item)
# TTL 설정
if unique_items:
await self.redis_client.expire(dedup_key, self.dedup_ttl)
# GUID가 있는 경우 GUID로 중복 체크
if item.guid:
if item.guid in seen_guids:
logger.debug(f"Duplicate GUID in batch: {item.guid[:30]}")
continue
# MongoDB에서 이미 처리된 기사인지 확인
existing_article = await self.db.articles_ko.find_one({"rss_guid": item.guid})
if existing_article:
logger.info(f"Article with GUID {item.guid[:30]} already processed, skipping")
continue
seen_guids.add(item.guid)
else:
# GUID가 없으면 링크로 중복 체크
if item.link in seen_links:
logger.debug(f"Duplicate link in batch: {item.link[:50]}")
continue
# MongoDB에서 링크로 중복 확인 (references 필드에서 검색)
existing_article = await self.db.articles_ko.find_one({"references.link": item.link})
if existing_article:
logger.info(f"Article with link {item.link[:50]} already processed, skipping")
continue
seen_links.add(item.link)
unique_items.append(item)
logger.debug(f"New item added: {item.title[:50]}...")
logger.info(f"Deduplication result: {len(unique_items)} new items out of {len(items)} total")
return unique_items
async def stop(self):

View File

@ -2,18 +2,15 @@ FROM python:3.11-slim
WORKDIR /app
# 의존성 설치
# Install dependencies
COPY ./scheduler/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# 공통 모듈 복사
# Copy shared module
COPY ./shared /app/shared
# 스케줄러 코드 복사
# Copy scheduler code
COPY ./scheduler /app
# 환경변수
ENV PYTHONUNBUFFERED=1
# 실행
CMD ["python", "scheduler.py"]
# Run scheduler
CMD ["python", "keyword_scheduler.py"]

View File

@ -0,0 +1,336 @@
"""
Keyword Manager API
키워드를 추가/수정/삭제하는 관리 API
"""
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime, timedelta
from motor.motor_asyncio import AsyncIOMotorClient
import uvicorn
import os
import sys
import uuid
# Import from shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import Keyword
app = FastAPI(title="Keyword Manager API")
# MongoDB 연결
mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
db_name = os.getenv("DB_NAME", "ai_writer_db")
@app.on_event("startup")
async def startup_event():
"""앱 시작 시 MongoDB 연결"""
app.mongodb_client = AsyncIOMotorClient(mongodb_url)
app.db = app.mongodb_client[db_name]
@app.on_event("shutdown")
async def shutdown_event():
"""앱 종료 시 연결 해제"""
app.mongodb_client.close()
class KeywordCreate(BaseModel):
    """Request body for creating a new keyword."""
    keyword: str                     # search term that drives the pipeline
    interval_minutes: int = 60       # how often the keyword is executed
    priority: int = 0                # higher values are scheduled first
    rss_feeds: List[str] = []        # optional explicit RSS feed URLs
    max_articles_per_run: int = 100  # cap on articles per execution
    is_active: bool = True           # inactive keywords are not scheduled
class KeywordUpdate(BaseModel):
    """Request body for partially updating a keyword.

    None for any field means "leave unchanged".
    """
    interval_minutes: Optional[int] = None
    priority: Optional[int] = None
    rss_feeds: Optional[List[str]] = None
    max_articles_per_run: Optional[int] = None
    is_active: Optional[bool] = None
@app.get("/")
async def root():
"""API 상태 확인"""
return {"status": "Keyword Manager API is running"}
@app.get("/threads/status")
async def get_threads_status():
"""모든 스레드 상태 조회"""
try:
# MongoDB에서 키워드 정보와 함께 상태 반환
cursor = app.db.keywords.find()
keywords = await cursor.to_list(None)
threads_status = []
for kw in keywords:
status = {
"keyword": kw.get("keyword"),
"keyword_id": kw.get("keyword_id"),
"is_active": kw.get("is_active"),
"interval_minutes": kw.get("interval_minutes"),
"priority": kw.get("priority"),
"last_run": kw.get("last_run").isoformat() if kw.get("last_run") else None,
"next_run": kw.get("next_run").isoformat() if kw.get("next_run") else None,
"thread_status": "active" if kw.get("is_active") else "inactive"
}
# 다음 실행까지 남은 시간 계산
if kw.get("next_run"):
remaining = (kw.get("next_run") - datetime.now()).total_seconds()
if remaining > 0:
status["minutes_until_next_run"] = round(remaining / 60, 1)
else:
status["minutes_until_next_run"] = 0
status["thread_status"] = "pending_execution"
threads_status.append(status)
# 우선순위 순으로 정렬
threads_status.sort(key=lambda x: x.get("priority", 0), reverse=True)
return {
"total_threads": len(threads_status),
"active_threads": sum(1 for t in threads_status if t.get("is_active")),
"threads": threads_status
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/keywords")
async def list_keywords():
"""모든 키워드 조회"""
try:
cursor = app.db.keywords.find()
keywords = await cursor.to_list(None)
# 각 키워드 정보 정리
result = []
for kw in keywords:
result.append({
"keyword_id": kw.get("keyword_id"),
"keyword": kw.get("keyword"),
"interval_minutes": kw.get("interval_minutes"),
"priority": kw.get("priority"),
"is_active": kw.get("is_active"),
"last_run": kw.get("last_run").isoformat() if kw.get("last_run") else None,
"next_run": kw.get("next_run").isoformat() if kw.get("next_run") else None,
"rss_feeds": kw.get("rss_feeds", []),
"max_articles_per_run": kw.get("max_articles_per_run", 100)
})
return {
"total": len(result),
"keywords": result
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/keywords/{keyword_text}")
async def get_keyword(keyword_text: str):
"""특정 키워드 조회"""
try:
keyword = await app.db.keywords.find_one({"keyword": keyword_text})
if not keyword:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
return {
"keyword_id": keyword.get("keyword_id"),
"keyword": keyword.get("keyword"),
"interval_minutes": keyword.get("interval_minutes"),
"priority": keyword.get("priority"),
"is_active": keyword.get("is_active"),
"last_run": keyword.get("last_run").isoformat() if keyword.get("last_run") else None,
"next_run": keyword.get("next_run").isoformat() if keyword.get("next_run") else None,
"rss_feeds": keyword.get("rss_feeds", []),
"max_articles_per_run": keyword.get("max_articles_per_run", 100)
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/keywords")
async def create_keyword(keyword_data: KeywordCreate):
"""새 키워드 생성"""
try:
# 중복 체크
existing = await app.db.keywords.find_one({"keyword": keyword_data.keyword})
if existing:
raise HTTPException(status_code=400, detail=f"Keyword '{keyword_data.keyword}' already exists")
# 새 키워드 생성
keyword = Keyword(
keyword_id=str(uuid.uuid4()),
keyword=keyword_data.keyword,
interval_minutes=keyword_data.interval_minutes,
priority=keyword_data.priority,
rss_feeds=keyword_data.rss_feeds,
max_articles_per_run=keyword_data.max_articles_per_run,
is_active=keyword_data.is_active,
next_run=datetime.now() + timedelta(minutes=1), # 1분 후 첫 실행
created_at=datetime.now(),
updated_at=datetime.now()
)
await app.db.keywords.insert_one(keyword.model_dump())
return {
"message": f"Keyword '{keyword_data.keyword}' created successfully",
"keyword_id": keyword.keyword_id,
"note": "The scheduler will automatically detect and start processing this keyword within 30 seconds"
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.put("/keywords/{keyword_text}")
async def update_keyword(keyword_text: str, update_data: KeywordUpdate):
"""키워드 업데이트"""
try:
# 키워드 존재 확인
existing = await app.db.keywords.find_one({"keyword": keyword_text})
if not existing:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
# 업데이트 데이터 준비
update_dict = {}
if update_data.interval_minutes is not None:
update_dict["interval_minutes"] = update_data.interval_minutes
if update_data.priority is not None:
update_dict["priority"] = update_data.priority
if update_data.rss_feeds is not None:
update_dict["rss_feeds"] = update_data.rss_feeds
if update_data.max_articles_per_run is not None:
update_dict["max_articles_per_run"] = update_data.max_articles_per_run
if update_data.is_active is not None:
update_dict["is_active"] = update_data.is_active
if update_dict:
update_dict["updated_at"] = datetime.now()
# 만약 interval이 변경되면 next_run도 재계산
if "interval_minutes" in update_dict:
update_dict["next_run"] = datetime.now() + timedelta(minutes=update_dict["interval_minutes"])
result = await app.db.keywords.update_one(
{"keyword": keyword_text},
{"$set": update_dict}
)
if result.modified_count > 0:
action_note = ""
if update_data.is_active is False:
action_note = "The scheduler will stop the thread for this keyword within 30 seconds."
elif update_data.is_active is True and not existing.get("is_active"):
action_note = "The scheduler will start a new thread for this keyword within 30 seconds."
return {
"message": f"Keyword '{keyword_text}' updated successfully",
"updated_fields": list(update_dict.keys()),
"note": action_note
}
else:
return {"message": "No changes made"}
else:
return {"message": "No update data provided"}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.delete("/keywords/{keyword_text}")
async def delete_keyword(keyword_text: str):
"""키워드 삭제"""
try:
# 키워드 존재 확인
existing = await app.db.keywords.find_one({"keyword": keyword_text})
if not existing:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
# 삭제
result = await app.db.keywords.delete_one({"keyword": keyword_text})
if result.deleted_count > 0:
return {
"message": f"Keyword '{keyword_text}' deleted successfully",
"note": "The scheduler will stop the thread for this keyword within 30 seconds"
}
else:
raise HTTPException(status_code=500, detail="Failed to delete keyword")
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/keywords/{keyword_text}/activate")
async def activate_keyword(keyword_text: str):
"""키워드 활성화"""
try:
result = await app.db.keywords.update_one(
{"keyword": keyword_text},
{"$set": {"is_active": True, "updated_at": datetime.now()}}
)
if result.matched_count == 0:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
return {
"message": f"Keyword '{keyword_text}' activated",
"note": "The scheduler will start processing this keyword within 30 seconds"
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/keywords/{keyword_text}/deactivate")
async def deactivate_keyword(keyword_text: str):
"""키워드 비활성화"""
try:
result = await app.db.keywords.update_one(
{"keyword": keyword_text},
{"$set": {"is_active": False, "updated_at": datetime.now()}}
)
if result.matched_count == 0:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
return {
"message": f"Keyword '{keyword_text}' deactivated",
"note": "The scheduler will stop processing this keyword within 30 seconds"
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/keywords/{keyword_text}/trigger")
async def trigger_keyword(keyword_text: str):
"""키워드 즉시 실행 트리거"""
try:
# next_run을 현재 시간으로 설정하여 즉시 실행되도록 함
result = await app.db.keywords.update_one(
{"keyword": keyword_text},
{"$set": {"next_run": datetime.now(), "updated_at": datetime.now()}}
)
if result.matched_count == 0:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
return {
"message": f"Keyword '{keyword_text}' triggered for immediate execution",
"note": "The scheduler will execute this keyword within the next minute"
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
port = int(os.getenv("API_PORT", "8100"))
uvicorn.run(app, host="0.0.0.0", port=port)

View File

@ -0,0 +1,245 @@
"""
Keyword Scheduler Service
데이터베이스에 등록된 키워드를 주기적으로 실행하는 스케줄러
"""
import asyncio
import logging
import os
import sys
from datetime import datetime, timedelta
from motor.motor_asyncio import AsyncIOMotorClient
from typing import List, Optional
import uuid
# Import from shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import Keyword, PipelineJob
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class KeywordScheduler:
    """Polls the `keywords` collection and enqueues a pipeline job for every
    active keyword whose `next_run` has passed."""

    def __init__(self):
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "ai_writer_db")
        self.db = None  # set in start() once MongoDB is connected
        # How often (seconds) the scheduler looks for due keywords.
        self.check_interval = int(os.getenv("SCHEDULER_CHECK_INTERVAL", "60"))
        # Default execution interval (minutes) for keywords added without one.
        self.default_interval = int(os.getenv("DEFAULT_KEYWORD_INTERVAL", "60"))

    async def start(self):
        """Connect to Redis/MongoDB, seed default keywords, and loop forever."""
        logger.info("Starting Keyword Scheduler")
        # Redis connection (job queue).
        await self.queue_manager.connect()
        # MongoDB connection.
        client = AsyncIOMotorClient(self.mongodb_url)
        self.db = client[self.db_name]
        # Seed defaults on first run.
        await self.initialize_keywords()
        # Main loop.
        while True:
            try:
                await self.check_and_execute_keywords()
                await asyncio.sleep(self.check_interval)
            except Exception as e:
                logger.error(f"Error in scheduler loop: {e}")
                await asyncio.sleep(10)

    async def initialize_keywords(self):
        """Create the default keyword set when the collection is empty."""
        try:
            count = await self.db.keywords.count_documents({})
            if count == 0:
                logger.info("No keywords found. Creating default keywords...")
                default_keywords = [
                    {
                        "keyword": "AI",
                        "interval_minutes": 60,
                        "is_active": True,
                        "priority": 1,
                        "rss_feeds": []
                    },
                    {
                        "keyword": "경제",
                        "interval_minutes": 120,
                        "is_active": True,
                        "priority": 0,
                        "rss_feeds": []
                    },
                    {
                        "keyword": "테크놀로지",
                        "interval_minutes": 60,
                        "is_active": True,
                        "priority": 1,
                        "rss_feeds": []
                    }
                ]
                for kw_data in default_keywords:
                    keyword = Keyword(**kw_data)
                    # First execution five minutes after seeding.
                    keyword.next_run = datetime.now() + timedelta(minutes=5)
                    # model_dump() is the pydantic v2 API; .dict() is deprecated
                    # and inconsistent with the rest of the code base.
                    await self.db.keywords.insert_one(keyword.model_dump())
                    logger.info(f"Created keyword: {keyword.keyword}")
            else:
                # Only log "found" when keywords actually existed; previously
                # this logged "Found 0 keywords" right after seeding defaults.
                logger.info(f"Found {count} keywords in database")
        except Exception as e:
            logger.error(f"Error initializing keywords: {e}")

    async def check_and_execute_keywords(self):
        """Find active keywords that are due and execute them by priority."""
        try:
            now = datetime.now()
            # Active keywords whose next_run has passed (or was never set).
            query = {
                "is_active": True,
                "$or": [
                    {"next_run": {"$lte": now}},
                    {"next_run": None}  # never scheduled yet
                ]
            }
            # Highest priority first.
            cursor = self.db.keywords.find(query).sort("priority", -1)
            keywords = await cursor.to_list(None)
            for keyword_data in keywords:
                keyword = Keyword(**keyword_data)
                await self.execute_keyword(keyword)
        except Exception as e:
            logger.error(f"Error checking keywords: {e}")

    async def execute_keyword(self, keyword: Keyword):
        """Enqueue an RSS-collection job for `keyword` and reschedule it."""
        try:
            logger.info(f"Executing keyword: {keyword.keyword}")
            job = PipelineJob(
                keyword_id=keyword.keyword_id,
                keyword=keyword.keyword,
                stage='rss_collection',
                data={
                    'rss_feeds': keyword.rss_feeds if keyword.rss_feeds else [],
                    'max_articles': keyword.max_articles_per_run,
                    'scheduled': True
                },
                priority=keyword.priority
            )
            await self.queue_manager.enqueue('rss_collection', job)
            logger.info(f"Enqueued job for keyword '{keyword.keyword}' with job_id: {job.job_id}")

            # Record the run and schedule the next one.
            update_data = {
                "last_run": datetime.now(),
                "next_run": datetime.now() + timedelta(minutes=keyword.interval_minutes),
                "updated_at": datetime.now()
            }
            await self.db.keywords.update_one(
                {"keyword_id": keyword.keyword_id},
                {"$set": update_data}
            )
            logger.info(f"Updated keyword '{keyword.keyword}' - next run at {update_data['next_run']}")
        except Exception as e:
            logger.error(f"Error executing keyword {keyword.keyword}: {e}")

    async def add_keyword(self, keyword_text: str, interval_minutes: int = None,
                          rss_feeds: List[str] = None, priority: int = 0):
        """Insert a new keyword unless it already exists.

        Returns the created Keyword, or None on duplicate or error.
        """
        try:
            if await self.db.keywords.find_one({"keyword": keyword_text}):
                logger.warning(f"Keyword '{keyword_text}' already exists")
                return None
            keyword = Keyword(
                keyword=keyword_text,
                interval_minutes=interval_minutes or self.default_interval,
                rss_feeds=rss_feeds or [],
                priority=priority,
                next_run=datetime.now() + timedelta(minutes=1)  # first run in 1 minute
            )
            await self.db.keywords.insert_one(keyword.model_dump())
            logger.info(f"Added new keyword: {keyword_text}")
            return keyword
        except Exception as e:
            logger.error(f"Error adding keyword: {e}")
            return None

    async def update_keyword(self, keyword_id: str, **kwargs):
        """Apply the non-None fields in kwargs to the keyword document.

        Returns True when a document was actually modified.
        """
        try:
            update_data = {k: v for k, v in kwargs.items() if v is not None}
            update_data["updated_at"] = datetime.now()
            result = await self.db.keywords.update_one(
                {"keyword_id": keyword_id},
                {"$set": update_data}
            )
            if result.modified_count > 0:
                logger.info(f"Updated keyword {keyword_id}")
                return True
            return False
        except Exception as e:
            logger.error(f"Error updating keyword: {e}")
            return False

    async def delete_keyword(self, keyword_id: str):
        """Delete the keyword document; returns True when one was removed."""
        try:
            result = await self.db.keywords.delete_one({"keyword_id": keyword_id})
            if result.deleted_count > 0:
                logger.info(f"Deleted keyword {keyword_id}")
                return True
            return False
        except Exception as e:
            logger.error(f"Error deleting keyword: {e}")
            return False

    async def stop(self):
        """Disconnect from the queue and shut the scheduler down."""
        await self.queue_manager.disconnect()
        logger.info("Keyword Scheduler stopped")
async def main():
    """Entry point: run the scheduler until interrupted, then clean up."""
    scheduler = KeywordScheduler()
    try:
        await scheduler.start()
    except KeyboardInterrupt:
        logger.info("Received interrupt signal")
    finally:
        await scheduler.stop()

if __name__ == "__main__":
    asyncio.run(main())

View File

@ -0,0 +1,361 @@
"""
Multi-threaded Keyword Scheduler Service
하나의 프로세스에서 여러 스레드로 키워드를 관리하는 스케줄러
"""
import asyncio
import logging
import os
import sys
from datetime import datetime, timedelta
from motor.motor_asyncio import AsyncIOMotorClient
from typing import Dict
import threading
import time
# Import from shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import Keyword, PipelineJob
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# 전역 변수로 스케줄러 인스턴스 참조 저장
scheduler_instance = None
class KeywordThread(threading.Thread):
    """Worker thread that owns the schedule for a single keyword.

    Each thread runs its own asyncio event loop, polls the keyword's MongoDB
    document, and enqueues an RSS-collection pipeline job whenever the
    keyword is due.
    """

    def __init__(self, keyword_text: str, mongodb_url: str, db_name: str, redis_url: str):
        super().__init__(name=f"Thread-{keyword_text}")
        self.keyword_text = keyword_text
        self.mongodb_url = mongodb_url
        self.db_name = db_name
        self.redis_url = redis_url
        self.running = True          # cleared by stop() to end the loop
        self.keyword = None          # Keyword model, loaded from MongoDB
        self.status = "initializing"
        self.last_execution = None
        self.execution_count = 0
        self.error_count = 0
        self.last_error = None

    def run(self):
        """Thread entry point: drive the async scheduler on a private loop."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(self.run_scheduler())
        finally:
            loop.close()

    async def run_scheduler(self):
        """Connect to Redis/MongoDB and loop until stop() is called."""
        # Redis connection (job queue).
        self.queue_manager = QueueManager(redis_url=self.redis_url)
        await self.queue_manager.connect()

        # MongoDB connection.
        client = AsyncIOMotorClient(self.mongodb_url)
        self.db = client[self.db_name]

        logger.info(f"[{self.keyword_text}] Thread started")

        await self.load_keyword()
        if not self.keyword:
            logger.error(f"[{self.keyword_text}] Failed to load keyword")
            return

        while self.running:
            try:
                # Re-read the document so external edits (interval, priority,
                # is_active, a manual trigger rewriting next_run) take effect.
                await self.reload_keyword()

                if not self.keyword.is_active:
                    self.status = "inactive"
                    logger.info(f"[{self.keyword_text}] Keyword is inactive, sleeping...")
                    await asyncio.sleep(60)
                    continue

                now = datetime.now()
                # A missing next_run previously never fired (`if next_run and
                # ...`); treat it as due immediately so such keywords are not
                # stuck forever.
                if self.keyword.next_run is None or self.keyword.next_run <= now:
                    self.status = "executing"
                    await self.execute_keyword()

                # Always re-check within a minute.  Sleeping the whole
                # interval here (as before) made stop(), deactivation and the
                # API's "trigger within a minute" promise wait until the full
                # interval expired; the execution cadence is still governed by
                # next_run, which execute_keyword() pushes forward.
                self.status = "waiting"
                await asyncio.sleep(60)
            except Exception as e:
                self.error_count += 1
                self.last_error = str(e)
                self.status = "error"
                logger.error(f"[{self.keyword_text}] Error in thread loop: {e}")
                await asyncio.sleep(60)

        await self.queue_manager.disconnect()
        logger.info(f"[{self.keyword_text}] Thread stopped")

    async def load_keyword(self):
        """Initial load of the keyword document into self.keyword."""
        try:
            keyword_doc = await self.db.keywords.find_one({"keyword": self.keyword_text})
            if keyword_doc:
                self.keyword = Keyword(**keyword_doc)
                logger.info(f"[{self.keyword_text}] Loaded keyword")
        except Exception as e:
            logger.error(f"[{self.keyword_text}] Error loading keyword: {e}")

    async def reload_keyword(self):
        """Refresh self.keyword from MongoDB; keeps the old value on error."""
        try:
            keyword_doc = await self.db.keywords.find_one({"keyword": self.keyword_text})
            if keyword_doc:
                self.keyword = Keyword(**keyword_doc)
        except Exception as e:
            logger.error(f"[{self.keyword_text}] Error reloading keyword: {e}")

    async def execute_keyword(self):
        """Enqueue an RSS-collection job and push next_run forward."""
        try:
            logger.info(f"[{self.keyword_text}] Executing keyword")
            job = PipelineJob(
                keyword_id=self.keyword.keyword_id,
                keyword=self.keyword.keyword,
                stage='rss_collection',
                data={
                    'rss_feeds': self.keyword.rss_feeds if self.keyword.rss_feeds else [],
                    'max_articles': self.keyword.max_articles_per_run,
                    'scheduled': True,
                    'thread_name': self.name
                },
                priority=self.keyword.priority
            )
            await self.queue_manager.enqueue('rss_collection', job)
            logger.info(f"[{self.keyword_text}] Enqueued job {job.job_id}")

            # Record the run and schedule the next one.
            update_data = {
                "last_run": datetime.now(),
                "next_run": datetime.now() + timedelta(minutes=self.keyword.interval_minutes),
                "updated_at": datetime.now()
            }
            await self.db.keywords.update_one(
                {"keyword_id": self.keyword.keyword_id},
                {"$set": update_data}
            )
            self.last_execution = datetime.now()
            self.execution_count += 1
            logger.info(f"[{self.keyword_text}] Next run at {update_data['next_run']}")
        except Exception as e:
            self.error_count += 1
            self.last_error = str(e)
            logger.error(f"[{self.keyword_text}] Error executing keyword: {e}")

    def stop(self):
        """Signal the scheduler loop to exit; takes effect within a minute."""
        self.running = False
        self.status = "stopped"

    def get_status(self):
        """Snapshot of this thread's state for monitoring endpoints."""
        return {
            "keyword": self.keyword_text,
            "thread_name": self.name,
            "status": self.status,
            "is_alive": self.is_alive(),
            "execution_count": self.execution_count,
            "last_execution": self.last_execution.isoformat() if self.last_execution else None,
            "error_count": self.error_count,
            "last_error": self.last_error,
            "next_run": self.keyword.next_run.isoformat() if self.keyword and self.keyword.next_run else None
        }
class MultiThreadScheduler:
    """Supervises one KeywordThread per active keyword."""

    def __init__(self):
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "ai_writer_db")
        self.redis_url = os.getenv("REDIS_URL", "redis://redis:6379")
        # Live worker threads keyed by keyword text (annotation quoted: the
        # class may not be defined yet when this attribute annotation is
        # evaluated at runtime).
        self.threads: Dict[str, "KeywordThread"] = {}
        self.running = True
        # Expose the singleton so other code in this process (e.g. a status
        # endpoint) can reach the running scheduler.
        global scheduler_instance
        scheduler_instance = self

    async def start(self):
        """Connect to MongoDB, seed defaults, start threads, then supervise."""
        logger.info("Starting Multi-threaded Keyword Scheduler")
        client = AsyncIOMotorClient(self.mongodb_url)
        self.db = client[self.db_name]

        await self.initialize_keywords()
        await self.load_and_start_threads()

        # Supervision loop: reconcile threads with the DB every 30 seconds.
        while self.running:
            try:
                await self.check_new_keywords()
                await asyncio.sleep(30)
            except Exception as e:
                logger.error(f"Error in main loop: {e}")
                await asyncio.sleep(30)

    async def initialize_keywords(self):
        """Seed the default keyword set when the collection is empty."""
        try:
            count = await self.db.keywords.count_documents({})
            if count == 0:
                logger.info("No keywords found. Creating default keywords...")
                default_keywords = [
                    {
                        "keyword": "AI",
                        "interval_minutes": 60,
                        "is_active": True,
                        "priority": 1,
                        "rss_feeds": [],
                        "next_run": datetime.now() + timedelta(minutes=1)
                    },
                    {
                        "keyword": "경제",
                        "interval_minutes": 120,
                        "is_active": True,
                        "priority": 0,
                        "rss_feeds": [],
                        "next_run": datetime.now() + timedelta(minutes=1)
                    },
                    {
                        "keyword": "테크놀로지",
                        "interval_minutes": 60,
                        "is_active": True,
                        "priority": 1,
                        "rss_feeds": [],
                        "next_run": datetime.now() + timedelta(minutes=1)
                    }
                ]
                for kw_data in default_keywords:
                    keyword = Keyword(**kw_data)
                    await self.db.keywords.insert_one(keyword.model_dump())
                    logger.info(f"Created keyword: {keyword.keyword}")
            else:
                # Only log "found" when keywords already existed; previously
                # this logged "Found 0 keywords" right after seeding defaults.
                logger.info(f"Found {count} keywords in database")
        except Exception as e:
            logger.error(f"Error initializing keywords: {e}")

    async def load_and_start_threads(self):
        """Start one worker thread for every active keyword."""
        try:
            cursor = self.db.keywords.find({"is_active": True})
            keywords = await cursor.to_list(None)
            for keyword_doc in keywords:
                keyword = Keyword(**keyword_doc)
                if keyword.keyword not in self.threads:
                    self.start_keyword_thread(keyword.keyword)
            logger.info(f"Started {len(self.threads)} keyword threads")
        except Exception as e:
            logger.error(f"Error loading keywords: {e}")

    def start_keyword_thread(self, keyword_text: str):
        """Spawn and register a worker thread for `keyword_text` (idempotent)."""
        if keyword_text not in self.threads:
            thread = KeywordThread(
                keyword_text=keyword_text,
                mongodb_url=self.mongodb_url,
                db_name=self.db_name,
                redis_url=self.redis_url
            )
            thread.start()
            self.threads[keyword_text] = thread
            logger.info(f"Started thread for keyword: {keyword_text}")

    async def check_new_keywords(self):
        """Reconcile the thread pool with the active keywords in MongoDB."""
        try:
            cursor = self.db.keywords.find({"is_active": True})
            active_keywords = await cursor.to_list(None)
            active_keyword_texts = {kw['keyword'] for kw in active_keywords}

            # Drop threads that died (e.g. after an unhandled error escaped
            # run()).  Previously a dead thread stayed registered forever, so
            # its keyword silently stopped being scheduled; removing it here
            # lets the loop below restart it.
            for keyword_text, thread in list(self.threads.items()):
                if not thread.is_alive():
                    logger.warning(f"Thread for keyword '{keyword_text}' died; restarting")
                    del self.threads[keyword_text]

            # Start threads for keywords that are active but have no thread.
            for keyword_text in active_keyword_texts:
                if keyword_text not in self.threads:
                    self.start_keyword_thread(keyword_text)

            # Stop threads whose keyword was deactivated or deleted.
            for keyword_text in list(self.threads.keys()):
                if keyword_text not in active_keyword_texts:
                    thread = self.threads[keyword_text]
                    thread.stop()
                    del self.threads[keyword_text]
                    logger.info(f"Stopped thread for keyword: {keyword_text}")
        except Exception as e:
            logger.error(f"Error checking new keywords: {e}")

    def stop(self):
        """Signal every worker to stop and wait briefly for them to exit."""
        self.running = False
        for thread in self.threads.values():
            thread.stop()
        # Wait (bounded) for the workers to finish their current cycle.
        for thread in self.threads.values():
            thread.join(timeout=5)
        logger.info("Multi-threaded Keyword Scheduler stopped")

    def get_threads_status(self):
        """Status snapshots of all worker threads, for monitoring."""
        return [thread.get_status() for thread in self.threads.values()]
async def main():
    """Entry point: run the multi-threaded scheduler until interrupted."""
    scheduler = MultiThreadScheduler()
    try:
        await scheduler.start()
    except KeyboardInterrupt:
        logger.info("Received interrupt signal")
    finally:
        scheduler.stop()

if __name__ == "__main__":
    asyncio.run(main())

View File

@ -1,5 +1,5 @@
apscheduler==3.10.4
motor==3.1.1
pymongo==4.3.3
motor==3.6.0
redis[hiredis]==5.0.1
pydantic==2.5.0
pydantic==2.5.0
fastapi==0.104.1
uvicorn==0.24.0

View File

@ -21,7 +21,7 @@ class NewsScheduler:
def __init__(self):
self.scheduler = AsyncIOScheduler()
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "pipeline_db")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")

View File

@ -0,0 +1,173 @@
"""
Single Keyword Scheduler Service
단일 키워드를 전담하는 스케줄러
"""
import asyncio
import logging
import os
import sys
from datetime import datetime, timedelta
from motor.motor_asyncio import AsyncIOMotorClient
import uuid
# Import from shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import Keyword, PipelineJob
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SingleKeywordScheduler:
def __init__(self):
    self.queue_manager = QueueManager(
        redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
    )
    self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
    self.db_name = os.getenv("DB_NAME", "ai_writer_db")
    # The single keyword this process is dedicated to (required).
    self.keyword_text = os.getenv("KEYWORD")
    self.interval_minutes = int(os.getenv("INTERVAL_MINUTES", "60"))
    self.db = None       # set once the MongoDB connection is opened
    self.keyword = None  # Keyword model, loaded or created on start()
async def start(self):
"""스케줄러 시작"""
if not self.keyword_text:
logger.error("KEYWORD environment variable is required")
return
logger.info(f"Starting Single Keyword Scheduler for '{self.keyword_text}'")
# Redis 연결
await self.queue_manager.connect()
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# 키워드 초기화 또는 로드
await self.initialize_keyword()
if not self.keyword:
logger.error(f"Failed to initialize keyword '{self.keyword_text}'")
return
# 메인 루프 - 이 키워드만 처리
while True:
try:
await self.check_and_execute()
# 다음 실행까지 대기
sleep_seconds = self.keyword.interval_minutes * 60
logger.info(f"Sleeping for {self.keyword.interval_minutes} minutes until next execution")
await asyncio.sleep(sleep_seconds)
except Exception as e:
logger.error(f"Error in scheduler loop: {e}")
await asyncio.sleep(60) # 에러 발생시 1분 후 재시도
async def initialize_keyword(self):
"""키워드 초기화 또는 로드"""
try:
# 기존 키워드 찾기
keyword_doc = await self.db.keywords.find_one({"keyword": self.keyword_text})
if keyword_doc:
self.keyword = Keyword(**keyword_doc)
logger.info(f"Loaded existing keyword: {self.keyword_text}")
else:
# 새 키워드 생성
self.keyword = Keyword(
keyword=self.keyword_text,
interval_minutes=self.interval_minutes,
is_active=True,
priority=int(os.getenv("PRIORITY", "0")),
rss_feeds=os.getenv("RSS_FEEDS", "").split(",") if os.getenv("RSS_FEEDS") else [],
max_articles_per_run=int(os.getenv("MAX_ARTICLES", "100"))
)
await self.db.keywords.insert_one(self.keyword.model_dump())
logger.info(f"Created new keyword: {self.keyword_text}")
except Exception as e:
logger.error(f"Error initializing keyword: {e}")
async def check_and_execute(self):
"""키워드 실행 체크 및 실행"""
try:
# 최신 키워드 정보 다시 로드
keyword_doc = await self.db.keywords.find_one({"keyword": self.keyword_text})
if not keyword_doc:
logger.error(f"Keyword '{self.keyword_text}' not found in database")
return
self.keyword = Keyword(**keyword_doc)
# 비활성화된 경우 스킵
if not self.keyword.is_active:
logger.info(f"Keyword '{self.keyword_text}' is inactive, skipping")
return
# 실행
await self.execute_keyword()
except Exception as e:
logger.error(f"Error checking keyword: {e}")
async def execute_keyword(self):
"""키워드 실행"""
try:
logger.info(f"Executing keyword: {self.keyword.keyword}")
# PipelineJob 생성
job = PipelineJob(
keyword_id=self.keyword.keyword_id,
keyword=self.keyword.keyword,
stage='rss_collection',
data={
'rss_feeds': self.keyword.rss_feeds if self.keyword.rss_feeds else [],
'max_articles': self.keyword.max_articles_per_run,
'scheduled': True,
'scheduler_instance': f"single-{self.keyword_text}"
},
priority=self.keyword.priority
)
# 큐에 작업 추가
await self.queue_manager.enqueue('rss_collection', job)
logger.info(f"Enqueued job for keyword '{self.keyword.keyword}' with job_id: {job.job_id}")
# 키워드 업데이트
update_data = {
"last_run": datetime.now(),
"next_run": datetime.now() + timedelta(minutes=self.keyword.interval_minutes),
"updated_at": datetime.now()
}
await self.db.keywords.update_one(
{"keyword_id": self.keyword.keyword_id},
{"$set": update_data}
)
logger.info(f"Updated keyword '{self.keyword.keyword}' - next run at {update_data['next_run']}")
except Exception as e:
logger.error(f"Error executing keyword {self.keyword.keyword}: {e}")
async def stop(self):
"""스케줄러 중지"""
await self.queue_manager.disconnect()
logger.info(f"Single Keyword Scheduler for '{self.keyword_text}' stopped")
async def main():
    """Entry point: run the single-keyword scheduler until interrupted."""
    svc = SingleKeywordScheduler()
    try:
        await svc.start()
    except KeyboardInterrupt:
        logger.info("Received interrupt signal")
    finally:
        await svc.stop()


if __name__ == "__main__":
    asyncio.run(main())

View File

@ -40,6 +40,7 @@ class RSSItem(BaseModel):
item_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
title: str
link: str
guid: Optional[str] = None # RSS GUID for deduplication
published: Optional[str] = None
summary: Optional[str] = None
source_feed: str
@ -74,22 +75,54 @@ class ItemWithImage(BaseModel):
image_url: str
image_prompt: str
class FinalArticle(BaseModel):
"""최종 기사"""
article_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
job_id: str
keyword_id: str
keyword: str
class Subtopic(BaseModel):
"""기사 소주제"""
title: str
content: str
content: List[str] # 문단별 내용
class Entities(BaseModel):
"""개체명"""
people: List[str] = Field(default_factory=list)
organizations: List[str] = Field(default_factory=list)
groups: List[str] = Field(default_factory=list)
countries: List[str] = Field(default_factory=list)
events: List[str] = Field(default_factory=list)
class NewsReference(BaseModel):
"""뉴스 레퍼런스"""
title: str
link: str
source: str
published: Optional[str] = None
class FinalArticle(BaseModel):
"""최종 기사 - ai_writer_db.articles 스키마와 일치"""
news_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
title: str
created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
summary: str
source_items: List[ItemWithImage]
images: List[str]
subtopics: List[Subtopic] = Field(default_factory=list)
categories: List[str] = Field(default_factory=list)
tags: List[str] = Field(default_factory=list)
created_at: datetime = Field(default_factory=datetime.now)
pipeline_stages: List[str]
processing_time: float # seconds
entities: Entities = Field(default_factory=Entities)
source_keyword: str
source_count: int = 1
# 레퍼런스 뉴스 정보
references: List[NewsReference] = Field(default_factory=list)
# 파이프라인 관련 추가 필드
job_id: Optional[str] = None
keyword_id: Optional[str] = None
pipeline_stages: List[str] = Field(default_factory=list)
processing_time: Optional[float] = None
# 다국어 지원
language: str = 'ko'
ref_news_id: Optional[str] = None
# RSS 중복 체크용 GUID
rss_guid: Optional[str] = None
# 이미지 관련 필드
image_prompt: Optional[str] = None
images: List[str] = Field(default_factory=list)
# 번역 추적
translated_languages: List[str] = Field(default_factory=list)
class TranslatedItem(BaseModel):
"""번역된 아이템"""
@ -110,4 +143,17 @@ class QueueMessage(BaseModel):
queue_name: str
job: PipelineJob
timestamp: datetime = Field(default_factory=datetime.now)
retry_count: int = 0
retry_count: int = 0
class Keyword(BaseModel):
    """Keyword model used by the scheduler services.

    One document per scheduled keyword; stored in the `keywords` MongoDB
    collection and re-read by schedulers before each run.
    """
    keyword_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # stable unique id
    keyword: str  # the search/collection keyword itself
    interval_minutes: int = Field(default=60)  # run interval; default 1 hour
    is_active: bool = Field(default=True)  # inactive keywords are skipped by schedulers
    last_run: Optional[datetime] = None  # set after each successful enqueue
    next_run: Optional[datetime] = None  # informational; computed as last_run + interval
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)
    rss_feeds: List[str] = Field(default_factory=list)  # custom RSS feeds (empty = defaults)
    priority: int = Field(default=0)  # queue priority (higher runs first)
    max_articles_per_run: int = Field(default=100)  # cap on articles collected per run

View File

@ -16,13 +16,13 @@ class QueueManager:
"""Redis 기반 큐 매니저"""
QUEUES = {
"keyword_processing": "queue:keyword",
"rss_collection": "queue:rss",
"search_enrichment": "queue:search",
"ai_summarization": "queue:summarize",
"translation": "queue:translate",
"image_generation": "queue:image",
"article_assembly": "queue:assembly",
"keyword_processing": "queue:keyword_processing",
"rss_collection": "queue:rss_collection",
"search_enrichment": "queue:search_enrichment",
"google_search": "queue:google_search",
"ai_article_generation": "queue:ai_article_generation",
"image_generation": "queue:image_generation",
"translation": "queue:translation",
"failed": "queue:failed",
"scheduled": "queue:scheduled"
}
@ -77,12 +77,15 @@ class QueueManager:
"""큐에서 작업 가져오기"""
try:
queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")
logger.info(f"Attempting to dequeue from {queue_key} with timeout={timeout}")
if timeout > 0:
result = await self.redis_client.blpop(queue_key, timeout=timeout)
result = await self.redis_client.blpop(queue_key, timeout)
if result:
_, data = result
logger.info(f"Dequeued item from {queue_key}")
else:
logger.debug(f"No item available in {queue_key}")
return None
else:
data = await self.redis_client.lpop(queue_key)

View File

@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""
Simple pipeline test - direct queue injection
"""
import asyncio
import json
import redis.asyncio as redis
from datetime import datetime
import uuid
async def test():
    """Inject one hand-built RSS-collection job straight into the Redis queue."""
    # Connect to Redis
    client = await redis.from_url("redis://redis:6379", decode_responses=True)

    # Build the pipeline job payload
    job = {
        "job_id": str(uuid.uuid4()),
        "keyword_id": str(uuid.uuid4()),
        "keyword": "전기차",
        "stage": "rss_collection",
        "stages_completed": [],
        "data": {
            "rss_feeds": [
                "https://news.google.com/rss/search?q=전기차&hl=ko&gl=KR&ceid=KR:ko"
            ],
            "categories": ["technology", "automotive"]
        },
        "priority": 1,
        "retry_count": 0,
        "max_retries": 3,
        "created_at": datetime.now().isoformat(),
        "updated_at": datetime.now().isoformat()
    }

    # Wrap the job in the QueueMessage envelope the workers expect
    envelope = {
        "message_id": str(uuid.uuid4()),
        "queue_name": "rss_collection",
        "job": job,
        "timestamp": datetime.now().isoformat()
    }

    # Push onto the queue and report
    await client.lpush("queue:rss_collection", json.dumps(envelope))
    print(f"✅ Job {job['job_id']} added to queue:rss_collection")

    depth = await client.llen("queue:rss_collection")
    print(f"📊 Queue length: {depth}")

    await client.aclose()


if __name__ == "__main__":
    asyncio.run(test())

View File

@ -0,0 +1,57 @@
#!/usr/bin/env python3
"""
Direct dequeue test
"""
import asyncio
import redis.asyncio as redis
import json
async def test_dequeue():
    """Peek at and dequeue one message from queue:rss_collection, printing it.

    Diagnostic script: connects to Redis, reports the queue depth, previews
    the head of the queue, then attempts a blocking pop with a 5-second
    timeout and prints the parsed message/job identifiers.
    """
    # Connect to Redis
    redis_client = await redis.from_url(
        "redis://redis:6379",
        encoding="utf-8",
        decode_responses=True
    )
    print("Connected to Redis")
    # Check queue length
    length = await redis_client.llen("queue:rss_collection")
    print(f"Queue length: {length}")
    if length > 0:
        # Peek at the first item without removing it
        item = await redis_client.lrange("queue:rss_collection", 0, 0)
        print(f"First item preview: {item[0][:200]}...")
        # Try blpop with timeout
        print("Trying blpop with timeout=5...")
        result = await redis_client.blpop("queue:rss_collection", 5)
        if result:
            queue, data = result
            print(f"Successfully dequeued from {queue}")
            print(f"Data: {data[:200]}...")
            # Parse the QueueMessage envelope and report its identifiers
            try:
                message = json.loads(data)
                print(f"Message ID: {message.get('message_id')}")
                print(f"Queue Name: {message.get('queue_name')}")
                if 'job' in message:
                    job = message['job']
                    print(f"Job ID: {job.get('job_id')}")
                    print(f"Keyword: {job.get('keyword')}")
            except Exception as e:
                print(f"Failed to parse message: {e}")
        else:
            print("blpop timed out - no result")
    else:
        print("Queue is empty")
    # aclose() is the non-deprecated close for redis.asyncio clients
    # (close() is a deprecated alias in redis-py 5.x; this also matches
    # the sibling test scripts).
    await redis_client.aclose()


if __name__ == "__main__":
    asyncio.run(test_dequeue())

View File

@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""
Pipeline Test Script
파이프라인 전체 플로우를 테스트하는 스크립트
"""
import asyncio
import json
from datetime import datetime
from motor.motor_asyncio import AsyncIOMotorClient
import redis.asyncio as redis
from shared.models import KeywordSubscription, PipelineJob
async def test_pipeline():
    """End-to-end pipeline smoke test.

    Registers a test keyword subscription in MongoDB, injects an
    RSS-collection job directly into Redis (bypassing the scheduler), then
    polls the stage queues and finally checks MongoDB for generated articles.
    """
    # Connect to MongoDB
    mongo_client = AsyncIOMotorClient("mongodb://mongodb:27017")
    db = mongo_client.pipeline
    # Connect to Redis
    redis_client = redis.Redis(host='redis', port=6379, decode_responses=True)
    # 1. Register the test keyword
    test_keyword = KeywordSubscription(
        keyword="전기차",
        language="ko",
        schedule="*/1 * * * *",  # every minute (test only)
        is_active=True,
        is_priority=True,
        rss_feeds=[
            "https://news.google.com/rss/search?q=전기차&hl=ko&gl=KR&ceid=KR:ko",
            "https://news.google.com/rss/search?q=electric+vehicle&hl=en&gl=US&ceid=US:en"
        ],
        categories=["technology", "automotive", "environment"],
        owner="test_user"
    )
    # Upsert into MongoDB.
    # model_dump() is the Pydantic v2 API (requirements pin pydantic==2.5.0;
    # .dict() is deprecated there).
    await db.keyword_subscriptions.replace_one(
        {"keyword": test_keyword.keyword},
        test_keyword.model_dump(),
        upsert=True
    )
    print(f"✅ 키워드 '{test_keyword.keyword}' 추가 완료")
    # 2. Trigger the pipeline immediately (bypassing the scheduler)
    job = PipelineJob(
        keyword_id=test_keyword.keyword_id,
        keyword=test_keyword.keyword,
        stage="rss_collection",
        data={
            "rss_feeds": test_keyword.rss_feeds,
            "categories": test_keyword.categories
        },
        priority=1 if test_keyword.is_priority else 0
    )
    # Push straight onto the Redis queue (wrapped as a QueueMessage)
    from shared.queue_manager import QueueMessage
    message = QueueMessage(
        queue_name="rss_collection",
        job=job
    )
    # model_dump_json() replaces the deprecated .json() in Pydantic v2
    await redis_client.lpush("queue:rss_collection", message.model_dump_json())
    print(f"✅ 작업을 RSS Collection 큐에 추가: {job.job_id}")
    # 3. Monitor pipeline progress
    print("\n📊 파이프라인 실행 모니터링 중...")
    print("각 단계별 로그를 확인하려면 다음 명령을 실행하세요:")
    print("  docker-compose logs -f pipeline-rss-collector")
    print("  docker-compose logs -f pipeline-google-search")
    print("  docker-compose logs -f pipeline-ai-summarizer")
    print("  docker-compose logs -f pipeline-translator")
    print("  docker-compose logs -f pipeline-image-generator")
    print("  docker-compose logs -f pipeline-article-assembly")
    # Poll queue depths for ~50 seconds
    for i in range(10):
        await asyncio.sleep(5)
        # Depth of each stage queue
        queues = [
            "queue:rss_collection",
            "queue:google_search",
            "queue:ai_summarization",
            "queue:translation",
            "queue:image_generation",
            "queue:article_assembly"
        ]
        print(f"\n[{datetime.now().strftime('%H:%M:%S')}] 큐 상태:")
        for queue in queues:
            length = await redis_client.llen(queue)
            if length > 0:
                print(f"  {queue}: {length} 작업 대기 중")
    # 4. Check the final result in MongoDB
    print("\n📄 MongoDB에서 생성된 기사 확인 중...")
    articles = await db.articles.find({"keyword": test_keyword.keyword}).to_list(length=5)
    if articles:
        print(f"{len(articles)}개의 기사 생성 완료!")
        for article in articles:
            print(f"\n제목: {article.get('title', 'N/A')}")
            print(f"ID: {article.get('article_id', 'N/A')}")
            print(f"생성 시간: {article.get('created_at', 'N/A')}")
            print(f"처리 시간: {article.get('processing_time', 'N/A')}")
            print(f"이미지 수: {len(article.get('images', []))}")
    else:
        print("⚠️ 아직 기사가 생성되지 않았습니다. 조금 더 기다려주세요.")
    # Close connections (aclose() replaces the deprecated close() in redis-py 5.x)
    await redis_client.aclose()
    mongo_client.close()


if __name__ == "__main__":
    print("🚀 파이프라인 테스트 시작")
    asyncio.run(test_pipeline())

View File

@ -0,0 +1,56 @@
#!/usr/bin/env python3
"""
스타크래프트 키워드로 파이프라인 테스트
"""
import asyncio
import sys
import os
sys.path.append(os.path.dirname(__file__))
from shared.queue_manager import QueueManager
from shared.models import PipelineJob
async def test_starcraft_pipeline():
    """Kick off the pipeline for the 'StarCraft' keyword and report queue state."""
    # Initialize the queue manager
    qm = QueueManager(redis_url="redis://redis:6379")
    await qm.connect()
    try:
        # Build the pipeline job for this keyword
        job = PipelineJob(
            keyword_id="test_starcraft_001",
            keyword="스타크래프트",
            stage="rss_collection",
            data={}
        )
        print(f"🚀 스타크래프트 파이프라인 작업 시작")
        print(f"   작업 ID: {job.job_id}")
        print(f"   키워드: {job.keyword}")
        print(f"   키워드 ID: {job.keyword_id}")

        # Enqueue onto the RSS-collection stage
        await qm.enqueue('rss_collection', job)
        print(f"✅ 작업이 rss_collection 큐에 추가되었습니다")

        # Report non-empty queues
        stats = await qm.get_queue_stats()
        print(f"\n📊 현재 큐 상태:")
        for queue_name, stat in stats.items():
            if queue_name in ['completed', 'failed']:
                continue
            pending = stat.get('pending', 0)
            processing = stat.get('processing', 0)
            if pending > 0 or processing > 0:
                print(f"  {queue_name}: 대기={pending}, 처리중={processing}")

        print(f"\n⏳ 파이프라인 실행을 모니터링하세요:")
        print(f"   docker logs site11_pipeline_rss_collector --tail 20 -f")
        print(f"   python3 check_mongodb.py")
    finally:
        await qm.disconnect()


if __name__ == "__main__":
    asyncio.run(test_starcraft_pipeline())

View File

@ -0,0 +1,54 @@
"""
파이프라인 테스트 작업 제출 스크립트
"""
import redis
import json
from datetime import datetime
import uuid
import sys
def submit_test_job(keyword='나스닥'):
    """Submit one test job for *keyword* to the RSS-collection queue."""
    # Connect to Redis
    conn = redis.Redis(host='localhost', port=6379, decode_responses=True)

    # Build the test job
    job_id = str(uuid.uuid4())
    keyword_id = f'test_{job_id[:8]}'
    job_payload = {
        'job_id': job_id,
        'keyword_id': keyword_id,
        'keyword': keyword,
        'created_at': datetime.now().isoformat(),
        'stage': 'rss_collection',
        'stages_completed': [],
        'data': {}
    }

    # Wrap it in the QueueMessage envelope
    envelope = {
        'message_id': str(uuid.uuid4()),
        'queue_name': 'rss_collection',
        'job': job_payload,
        'timestamp': datetime.now().isoformat(),
        'attempts': 0
    }

    # Enqueue (rpush — matches priority=0 ordering)
    conn.rpush('queue:rss_collection', json.dumps(envelope))
    print(f'✅ 파이프라인 시작: job_id={job_id}')
    print(f'✅ 키워드: {keyword}')
    print(f'✅ RSS Collection 큐에 작업 추가 완료')

    # Report the resulting queue depth
    depth = conn.llen('queue:rss_collection')
    print(f'✅ 현재 큐 길이: {depth}')

    conn.close()


if __name__ == "__main__":
    keyword = sys.argv[1] if len(sys.argv) > 1 else '나스닥'
    submit_test_job(keyword)

View File

@ -9,7 +9,11 @@ RUN pip install --no-cache-dir -r requirements.txt
# Copy shared modules
COPY ./shared /app/shared
# Copy config directory
COPY ./config /app/config
# Copy application code
COPY ./translator /app
CMD ["python", "translator.py"]
# Use multi_translator.py as the main service
CMD ["python", "multi_translator.py"]

View File

@ -0,0 +1,329 @@
"""
Language Sync Service
기존 기사를 새로운 언어로 번역하는 백그라운드 서비스
"""
import asyncio
import logging
import os
import sys
import json
from typing import List, Dict, Any
import httpx
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
# Add parent directory to path for shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Import from shared module
from shared.models import FinalArticle, Subtopic, Entities, NewsReference
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class LanguageSyncService:
    """Background service that back-fills translations of existing articles.

    Compares the source-language article collection against every enabled
    target-language collection and translates any articles missing there via
    the DeepL API. Runs either as a periodic daemon (start, every 10 minutes)
    or as a one-shot job (manual_sync).
    """

    def __init__(self):
        # SECURITY NOTE(review): the fallback embeds a real-looking API key in
        # source control; it should be removed so the service fails fast when
        # DEEPL_API_KEY is not configured, and the key should be rotated.
        self.deepl_api_key = os.getenv("DEEPL_API_KEY", "3abbc796-2515-44a8-972d-22dcf27ab54a")
        self.deepl_api_url = "https://api.deepl.com/v2/translate"
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "ai_writer_db")
        self.db = None                # Motor database handle, set in start()/manual_sync()
        self.languages_config = None  # parsed languages.json, set in load_config()
        self.config_path = "/app/config/languages.json"
        self.sync_batch_size = 10     # articles translated per batch
        self.sync_delay = 2.0         # seconds between languages / between batches

    async def load_config(self):
        """Load the language configuration file; raise if it cannot be read."""
        try:
            if os.path.exists(self.config_path):
                with open(self.config_path, 'r', encoding='utf-8') as f:
                    self.languages_config = json.load(f)
                logger.info(f"Loaded language config")
            else:
                raise FileNotFoundError(f"Config file not found: {self.config_path}")
        except Exception as e:
            logger.error(f"Error loading config: {e}")
            raise

    async def start(self):
        """Start the background sync daemon.

        Loads config, connects to MongoDB, then loops forever checking for
        missing translations every 10 minutes (retrying after 1 minute on
        error). Never returns on its own.
        """
        logger.info("Starting Language Sync Service")
        # Load language configuration
        await self.load_config()
        # Connect to MongoDB
        client = AsyncIOMotorClient(self.mongodb_url)
        self.db = client[self.db_name]
        # Periodic sync check (every 10 minutes)
        while True:
            try:
                await self.sync_missing_translations()
                await asyncio.sleep(600)  # wait 10 minutes
            except Exception as e:
                logger.error(f"Error in sync loop: {e}")
                await asyncio.sleep(60)  # retry 1 minute after an error

    async def sync_missing_translations(self):
        """Sync every enabled language against the source collection."""
        try:
            # Languages currently enabled in config
            enabled_languages = [
                lang for lang in self.languages_config["enabled_languages"]
                if lang["enabled"]
            ]
            if not enabled_languages:
                logger.info("No enabled languages for sync")
                return
            # Source-language collection name
            source_collection = self.languages_config["source_language"]["collection"]
            for lang_config in enabled_languages:
                await self.sync_language(source_collection, lang_config)
                await asyncio.sleep(self.sync_delay)
        except Exception as e:
            logger.error(f"Error in sync_missing_translations: {e}")

    async def sync_language(self, source_collection: str, lang_config: Dict):
        """Translate all articles missing from one target language's collection.

        NOTE(review): loads every news_id from both collections into memory to
        diff them — fine for small datasets, but worth an aggregation at scale.
        """
        try:
            target_collection = lang_config["collection"]
            # Find untranslated articles:
            # those present in the source collection but absent from the target
            source_articles = await self.db[source_collection].find(
                {},
                {"news_id": 1}
            ).to_list(None)
            source_ids = {article["news_id"] for article in source_articles}
            translated_articles = await self.db[target_collection].find(
                {},
                {"news_id": 1}
            ).to_list(None)
            translated_ids = {article["news_id"] for article in translated_articles}
            # news_ids still missing in the target language
            missing_ids = source_ids - translated_ids
            if not missing_ids:
                logger.info(f"No missing translations for {lang_config['name']}")
                return
            logger.info(f"Found {len(missing_ids)} missing translations for {lang_config['name']}")
            # Process in batches
            missing_list = list(missing_ids)
            for i in range(0, len(missing_list), self.sync_batch_size):
                batch = missing_list[i:i+self.sync_batch_size]
                for news_id in batch:
                    try:
                        # Fetch the source article
                        korean_article = await self.db[source_collection].find_one(
                            {"news_id": news_id}
                        )
                        if not korean_article:
                            continue
                        # Translate and store it
                        await self.translate_and_save(
                            korean_article,
                            lang_config
                        )
                        logger.info(f"Synced article {news_id} to {lang_config['code']}")
                        # API rate limiting
                        await asyncio.sleep(0.5)
                    except Exception as e:
                        logger.error(f"Error translating {news_id} to {lang_config['code']}: {e}")
                        continue
                # Delay between batches
                if i + self.sync_batch_size < len(missing_list):
                    await asyncio.sleep(self.sync_delay)
        except Exception as e:
            logger.error(f"Error syncing language {lang_config['code']}: {e}")

    async def translate_and_save(self, korean_article: Dict, lang_config: Dict):
        """Translate one article and insert it into the target collection.

        Translates title, summary, subtopics and categories; entities and
        references are copied unchanged. Also marks the source article's
        translated_languages list. Re-raises on failure so the caller can log
        and continue with the next article.
        """
        try:
            # Translate the title
            translated_title = await self._translate_text(
                korean_article.get('title', ''),
                target_lang=lang_config["deepl_code"]
            )
            # Translate the summary
            translated_summary = await self._translate_text(
                korean_article.get('summary', ''),
                target_lang=lang_config["deepl_code"]
            )
            # Translate each subtopic (title + per-paragraph content)
            translated_subtopics = []
            for subtopic in korean_article.get('subtopics', []):
                translated_subtopic_title = await self._translate_text(
                    subtopic.get('title', ''),
                    target_lang=lang_config["deepl_code"]
                )
                translated_content_list = []
                for content_para in subtopic.get('content', []):
                    translated_para = await self._translate_text(
                        content_para,
                        target_lang=lang_config["deepl_code"]
                    )
                    translated_content_list.append(translated_para)
                translated_subtopics.append(Subtopic(
                    title=translated_subtopic_title,
                    content=translated_content_list
                ))
            # Translate categories
            translated_categories = []
            for category in korean_article.get('categories', []):
                translated_cat = await self._translate_text(
                    category,
                    target_lang=lang_config["deepl_code"]
                )
                translated_categories.append(translated_cat)
            # Entities and references are kept as-is (names/links untranslated)
            entities_data = korean_article.get('entities', {})
            translated_entities = Entities(**entities_data) if entities_data else Entities()
            references = []
            for ref_data in korean_article.get('references', []):
                references.append(NewsReference(**ref_data))
            # Assemble the translated article (same news_id as the source)
            translated_article = FinalArticle(
                news_id=korean_article.get('news_id'),
                title=translated_title,
                summary=translated_summary,
                subtopics=translated_subtopics,
                categories=translated_categories,
                entities=translated_entities,
                source_keyword=korean_article.get('source_keyword'),
                source_count=korean_article.get('source_count', 1),
                references=references,
                job_id=korean_article.get('job_id'),
                keyword_id=korean_article.get('keyword_id'),
                pipeline_stages=korean_article.get('pipeline_stages', []) + ['sync_translation'],
                processing_time=korean_article.get('processing_time', 0),
                language=lang_config["code"],
                ref_news_id=None,
                rss_guid=korean_article.get('rss_guid'),  # keep RSS GUID
                image_prompt=korean_article.get('image_prompt'),  # keep image prompt
                images=korean_article.get('images', []),  # keep image URL list
                translated_languages=korean_article.get('translated_languages', [])  # keep language list
            )
            # Save to the target-language collection
            collection_name = lang_config["collection"]
            result = await self.db[collection_name].insert_one(translated_article.model_dump())
            # Mark the source article as translated into this language
            await self.db[self.languages_config["source_language"]["collection"]].update_one(
                {"news_id": korean_article.get('news_id')},
                {
                    "$addToSet": {
                        "translated_languages": lang_config["code"]
                    }
                }
            )
            logger.info(f"Synced article to {collection_name}: {result.inserted_id}")
        except Exception as e:
            logger.error(f"Error in translate_and_save: {e}")
            raise

    async def _translate_text(self, text: str, target_lang: str = 'EN') -> str:
        """Translate *text* from Korean via the DeepL API.

        Returns the original text unchanged on empty input, API error, or
        exception — translation failures are best-effort, never fatal.
        """
        try:
            if not text:
                return ""
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    self.deepl_api_url,
                    data={
                        'auth_key': self.deepl_api_key,
                        'text': text,
                        'target_lang': target_lang,
                        'source_lang': 'KO'
                    },
                    timeout=30
                )
                if response.status_code == 200:
                    result = response.json()
                    return result['translations'][0]['text']
                else:
                    logger.error(f"DeepL API error: {response.status_code}")
                    return text
        except Exception as e:
            logger.error(f"Error translating text: {e}")
            return text

    async def manual_sync(self, language_code: str = None):
        """Run a one-shot sync, optionally restricted to one language code."""
        logger.info(f"Manual sync requested for language: {language_code or 'all'}")
        await self.load_config()
        client = AsyncIOMotorClient(self.mongodb_url)
        self.db = client[self.db_name]
        if language_code:
            # Sync only the requested language (must be enabled in config)
            lang_config = next(
                (lang for lang in self.languages_config["enabled_languages"]
                 if lang["code"] == language_code and lang["enabled"]),
                None
            )
            if lang_config:
                source_collection = self.languages_config["source_language"]["collection"]
                await self.sync_language(source_collection, lang_config)
            else:
                logger.error(f"Language {language_code} not found or not enabled")
        else:
            # Sync every enabled language
            await self.sync_missing_translations()
async def main():
    """Entry point: one-shot sync when invoked as `sync [lang]`, daemon otherwise."""
    service = LanguageSyncService()
    args = sys.argv[1:]
    if not args:
        # No CLI arguments: run as a long-lived background service.
        try:
            await service.start()
        except KeyboardInterrupt:
            logger.info("Received interrupt signal")
        return
    if args[0] == "sync":
        # Manual sync mode, optionally restricted to one language code.
        language = args[1] if len(args) > 1 else None
        await service.manual_sync(language)
    else:
        logger.error(f"Unknown command: {sys.argv[1]}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@ -0,0 +1,320 @@
"""
Multi-Language Translation Service
다국어 번역 서비스 - 설정 기반 다중 언어 지원
"""
import asyncio
import logging
import os
import sys
import json
from typing import List, Dict, Any
import httpx
import redis.asyncio as redis
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
# Import from shared module
from shared.models import PipelineJob, FinalArticle
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MultiLanguageTranslator:
def __init__(self):
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.deepl_api_key = os.getenv("DEEPL_API_KEY", "3abbc796-2515-44a8-972d-22dcf27ab54a")
self.deepl_api_url = "https://api.deepl.com/v2/translate"
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
self.languages_config = None
self.config_path = "/app/config/languages.json"
async def load_config(self):
"""언어 설정 파일 로드"""
try:
if os.path.exists(self.config_path):
with open(self.config_path, 'r', encoding='utf-8') as f:
self.languages_config = json.load(f)
else:
# 기본 설정 (영어만)
self.languages_config = {
"enabled_languages": [
{
"code": "en",
"name": "English",
"deepl_code": "EN",
"collection": "articles_en",
"enabled": True
}
],
"source_language": {
"code": "ko",
"name": "Korean",
"collection": "articles_ko"
},
"translation_settings": {
"batch_size": 5,
"delay_between_languages": 2.0,
"delay_between_articles": 0.5,
"max_retries": 3
}
}
logger.info(f"Loaded language config: {len(self.get_enabled_languages())} languages enabled")
except Exception as e:
logger.error(f"Error loading config: {e}")
raise
def get_enabled_languages(self) -> List[Dict]:
"""활성화된 언어 목록 반환"""
return [lang for lang in self.languages_config["enabled_languages"] if lang["enabled"]]
async def start(self):
"""워커 시작"""
logger.info("Starting Multi-Language Translator Worker")
# 설정 로드
await self.load_config()
# Redis 연결
await self.queue_manager.connect()
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# DeepL API 키 확인
if not self.deepl_api_key:
logger.error("DeepL API key not configured")
return
# 메인 처리 루프
while True:
try:
# 큐에서 작업 가져오기
job = await self.queue_manager.dequeue('translation', timeout=5)
if job:
await self.process_job(job)
except Exception as e:
logger.error(f"Error in worker loop: {e}")
await asyncio.sleep(1)
async def process_job(self, job: PipelineJob):
"""모든 활성 언어로 번역"""
try:
logger.info(f"Processing job {job.job_id} for multi-language translation")
# MongoDB에서 한국어 기사 가져오기
news_id = job.data.get('news_id')
if not news_id:
logger.error(f"No news_id in job {job.job_id}")
await self.queue_manager.mark_failed('translation', job, "No news_id")
return
# 원본 컬렉션에서 기사 조회
source_collection = self.languages_config["source_language"]["collection"]
korean_article = await self.db[source_collection].find_one({"news_id": news_id})
if not korean_article:
logger.error(f"Article {news_id} not found in {source_collection}")
await self.queue_manager.mark_failed('translation', job, "Article not found")
return
# 활성화된 모든 언어로 번역
enabled_languages = self.get_enabled_languages()
settings = self.languages_config["translation_settings"]
for lang_config in enabled_languages:
try:
logger.info(f"Translating article {news_id} to {lang_config['name']}")
# 이미 번역되었는지 확인
existing = await self.db[lang_config["collection"]].find_one({"news_id": news_id})
if existing:
logger.info(f"Article {news_id} already translated to {lang_config['code']}")
continue
# 번역 수행
await self.translate_article(
korean_article,
lang_config,
job
)
# 언어 간 지연
if settings.get("delay_between_languages"):
await asyncio.sleep(settings["delay_between_languages"])
except Exception as e:
logger.error(f"Error translating to {lang_config['code']}: {e}")
continue
# 파이프라인 완료 로그
logger.info(f"Translation pipeline completed for news_id: {news_id}")
# 완료 표시
job.stages_completed.append('translation')
await self.queue_manager.mark_completed('translation', job.job_id)
logger.info(f"Multi-language translation completed for job {job.job_id}")
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")
await self.queue_manager.mark_failed('translation', job, str(e))
async def translate_article(self, korean_article: Dict, lang_config: Dict, job: PipelineJob):
"""특정 언어로 기사 번역"""
try:
# 제목 번역
translated_title = await self._translate_text(
korean_article.get('title', ''),
target_lang=lang_config["deepl_code"]
)
# 요약 번역
translated_summary = await self._translate_text(
korean_article.get('summary', ''),
target_lang=lang_config["deepl_code"]
)
# Subtopics 번역
from shared.models import Subtopic
translated_subtopics = []
for subtopic in korean_article.get('subtopics', []):
translated_subtopic_title = await self._translate_text(
subtopic.get('title', ''),
target_lang=lang_config["deepl_code"]
)
translated_content_list = []
for content_para in subtopic.get('content', []):
translated_para = await self._translate_text(
content_para,
target_lang=lang_config["deepl_code"]
)
translated_content_list.append(translated_para)
# API 속도 제한
settings = self.languages_config["translation_settings"]
if settings.get("delay_between_articles"):
await asyncio.sleep(settings["delay_between_articles"])
translated_subtopics.append(Subtopic(
title=translated_subtopic_title,
content=translated_content_list
))
# 카테고리 번역
translated_categories = []
for category in korean_article.get('categories', []):
translated_cat = await self._translate_text(
category,
target_lang=lang_config["deepl_code"]
)
translated_categories.append(translated_cat)
# Entities와 References는 원본 유지
from shared.models import Entities, NewsReference
entities_data = korean_article.get('entities', {})
translated_entities = Entities(**entities_data) if entities_data else Entities()
references = []
for ref_data in korean_article.get('references', []):
references.append(NewsReference(**ref_data))
# 번역된 기사 생성
translated_article = FinalArticle(
news_id=korean_article.get('news_id'), # 같은 news_id 사용
title=translated_title,
summary=translated_summary,
subtopics=translated_subtopics,
categories=translated_categories,
entities=translated_entities,
source_keyword=job.keyword if hasattr(job, 'keyword') else korean_article.get('source_keyword'),
source_count=korean_article.get('source_count', 1),
references=references,
job_id=job.job_id,
keyword_id=job.keyword_id if hasattr(job, 'keyword_id') else None,
pipeline_stages=korean_article.get('pipeline_stages', []) + ['translation'],
processing_time=korean_article.get('processing_time', 0),
language=lang_config["code"],
ref_news_id=None, # 같은 news_id 사용하므로 불필요
rss_guid=korean_article.get('rss_guid'), # RSS GUID 유지
image_prompt=korean_article.get('image_prompt'), # 이미지 프롬프트 유지
images=korean_article.get('images', []), # 이미지 URL 리스트 유지
translated_languages=korean_article.get('translated_languages', []) # 번역 언어 목록 유지
)
# MongoDB에 저장
collection_name = lang_config["collection"]
result = await self.db[collection_name].insert_one(translated_article.model_dump())
logger.info(f"Article saved to {collection_name} with _id: {result.inserted_id}, language: {lang_config['code']}")
# 원본 기사에 번역 완료 표시
await self.db[self.languages_config["source_language"]["collection"]].update_one(
{"news_id": korean_article.get('news_id')},
{
"$addToSet": {
"translated_languages": lang_config["code"]
}
}
)
except Exception as e:
logger.error(f"Error translating article to {lang_config['code']}: {e}")
raise
async def _translate_text(self, text: str, target_lang: str = 'EN') -> str:
"""DeepL API를 사용한 텍스트 번역"""
try:
if not text:
return ""
async with httpx.AsyncClient() as client:
response = await client.post(
self.deepl_api_url,
data={
'auth_key': self.deepl_api_key,
'text': text,
'target_lang': target_lang,
'source_lang': 'KO'
},
timeout=30
)
if response.status_code == 200:
result = response.json()
return result['translations'][0]['text']
else:
logger.error(f"DeepL API error: {response.status_code}")
return text # 번역 실패시 원본 반환
except Exception as e:
logger.error(f"Error translating text: {e}")
return text # 번역 실패시 원본 반환
    async def stop(self):
        """Stop the worker: close the queue (Redis) connection and log shutdown."""
        await self.queue_manager.disconnect()
        logger.info("Multi-Language Translator Worker stopped")
async def main():
    """Entry point: run the translator worker until interrupted, then clean up."""
    translator = MultiLanguageTranslator()
    try:
        await translator.start()
    except KeyboardInterrupt:
        # Ctrl-C: note the interrupt and fall through to cleanup.
        logger.info("Received interrupt signal")
    finally:
        # Always release the queue connection, even if start() raised.
        await translator.stop()
if __name__ == "__main__":
asyncio.run(main())

View File

@ -1,3 +1,5 @@
httpx==0.25.0
redis[hiredis]==5.0.1
pydantic==2.5.0
pydantic==2.5.0
motor==3.1.1
pymongo==4.3.3

View File

@ -8,9 +8,11 @@ import os
import sys
from typing import List, Dict, Any
import httpx
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
# Import from shared module
from shared.models import PipelineJob, SummarizedItem, TranslatedItem
from shared.models import PipelineJob, FinalArticle
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
@ -24,93 +26,167 @@ class TranslatorWorker:
self.deepl_api_key = os.getenv("DEEPL_API_KEY", "3abbc796-2515-44a8-972d-22dcf27ab54a")
# DeepL Pro API 엔드포인트 사용
self.deepl_api_url = "https://api.deepl.com/v2/translate"
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
async def start(self):
"""워커 시작"""
logger.info("Starting Translator Worker")
# Redis 연결
await self.queue_manager.connect()
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# DeepL API 키 확인
if not self.deepl_api_key:
logger.error("DeepL API key not configured")
return
# 메인 처리 루프
while True:
try:
# 큐에서 작업 가져오기
job = await self.queue_manager.dequeue('translation', timeout=5)
if job:
await self.process_job(job)
except Exception as e:
logger.error(f"Error in worker loop: {e}")
await asyncio.sleep(1)
async def process_job(self, job: PipelineJob):
"""번역 작업 처리"""
"""영어 버전 기사 생성 및 저장"""
try:
logger.info(f"Processing job {job.job_id} for translation")
summarized_items = job.data.get('summarized_items', [])
translated_items = []
for item_data in summarized_items:
summarized_item = SummarizedItem(**item_data)
# 제목과 요약 번역
translated_title = await self._translate_text(
summarized_item.enriched_item['rss_item']['title'],
# MongoDB에서 한국어 기사 가져오기
news_id = job.data.get('news_id')
if not news_id:
logger.error(f"No news_id in job {job.job_id}")
await self.queue_manager.mark_failed('translation', job, "No news_id")
return
# MongoDB에서 한국어 기사 조회 (articles_ko)
korean_article = await self.db.articles_ko.find_one({"news_id": news_id})
if not korean_article:
logger.error(f"Article {news_id} not found in MongoDB")
await self.queue_manager.mark_failed('translation', job, "Article not found")
return
# 영어로 번역
translated_title = await self._translate_text(
korean_article.get('title', ''),
target_lang='EN'
)
translated_summary = await self._translate_text(
korean_article.get('summary', ''),
target_lang='EN'
)
# Subtopics 번역
from shared.models import Subtopic
translated_subtopics = []
for subtopic in korean_article.get('subtopics', []):
translated_subtopic_title = await self._translate_text(
subtopic.get('title', ''),
target_lang='EN'
)
translated_summary = await self._translate_text(
summarized_item.ai_summary,
target_lang='EN'
)
translated_item = TranslatedItem(
summarized_item=summarized_item,
translated_title=translated_title,
translated_summary=translated_summary,
target_language='en'
)
translated_items.append(translated_item)
# API 속도 제한
await asyncio.sleep(0.5)
if translated_items:
logger.info(f"Translated {len(translated_items)} items")
# 다음 단계로 전달
job.data['translated_items'] = [item.dict() for item in translated_items]
job.stages_completed.append('translation')
job.stage = 'image_generation'
await self.queue_manager.enqueue('image_generation', job)
await self.queue_manager.mark_completed('translation', job.job_id)
else:
logger.warning(f"No items translated for job {job.job_id}")
await self.queue_manager.mark_failed(
'translation',
job,
"No items to translate"
)
translated_content_list = []
for content_para in subtopic.get('content', []):
translated_para = await self._translate_text(
content_para,
target_lang='EN'
)
translated_content_list.append(translated_para)
await asyncio.sleep(0.2) # API 속도 제한
translated_subtopics.append(Subtopic(
title=translated_subtopic_title,
content=translated_content_list
))
# 카테고리 번역
translated_categories = []
for category in korean_article.get('categories', []):
translated_cat = await self._translate_text(category, target_lang='EN')
translated_categories.append(translated_cat)
await asyncio.sleep(0.2) # API 속도 제한
# Entities 번역 (선택적)
from shared.models import Entities
entities_data = korean_article.get('entities', {})
translated_entities = Entities(
people=entities_data.get('people', []), # 인명은 번역하지 않음
organizations=entities_data.get('organizations', []), # 조직명은 번역하지 않음
groups=entities_data.get('groups', []),
countries=entities_data.get('countries', []),
events=entities_data.get('events', [])
)
# 레퍼런스 가져오기 (번역하지 않음)
from shared.models import NewsReference
references = []
for ref_data in korean_article.get('references', []):
references.append(NewsReference(**ref_data))
# 영어 버전 기사 생성 - 같은 news_id 사용
english_article = FinalArticle(
news_id=news_id, # 원본과 같은 news_id 사용
title=translated_title,
summary=translated_summary,
subtopics=translated_subtopics,
categories=translated_categories,
entities=translated_entities,
source_keyword=job.keyword,
source_count=korean_article.get('source_count', 1),
references=references, # 원본 레퍼런스 그대로 사용
job_id=job.job_id,
keyword_id=job.keyword_id,
pipeline_stages=job.stages_completed.copy() + ['translation'],
processing_time=korean_article.get('processing_time', 0),
language='en', # 영어
ref_news_id=None # 같은 news_id를 사용하므로 ref 불필요
)
# MongoDB에 영어 버전 저장 (articles_en)
result = await self.db.articles_en.insert_one(english_article.model_dump())
english_article_id = str(result.inserted_id)
logger.info(f"English article saved with _id: {english_article_id}, news_id: {news_id}, language: en")
# 원본 한국어 기사 업데이트 - 번역 완료 표시
await self.db.articles_ko.update_one(
{"news_id": news_id},
{
"$addToSet": {
"pipeline_stages": "translation"
}
}
)
# 완료 표시
job.stages_completed.append('translation')
await self.queue_manager.mark_completed('translation', job.job_id)
logger.info(f"Translation completed for job {job.job_id}")
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")
await self.queue_manager.mark_failed('translation', job, str(e))
async def _translate_text(self, text: str, target_lang: str = 'EN') -> str:
"""DeepL API를 사용한 텍스트 번역"""
try:
if not text:
return ""
async with httpx.AsyncClient() as client:
response = await client.post(
self.deepl_api_url,
@ -122,18 +198,18 @@ class TranslatorWorker:
},
timeout=30
)
if response.status_code == 200:
result = response.json()
return result['translations'][0]['text']
else:
logger.error(f"DeepL API error: {response.status_code}")
return text # 번역 실패시 원본 반환
except Exception as e:
logger.error(f"Error translating text: {e}")
return text # 번역 실패시 원본 반환
async def stop(self):
"""워커 중지"""
await self.queue_manager.disconnect()
@ -142,7 +218,7 @@ class TranslatorWorker:
async def main():
"""메인 함수"""
worker = TranslatorWorker()
try:
await worker.start()
except KeyboardInterrupt: