feat: Implement automated keyword-based news pipeline scheduler

- Add multi-threaded keyword scheduler for periodic news collection
- Create Keyword Manager API for CRUD operations and monitoring
- Implement automatic pipeline triggering (RSS → Google → AI → Translation)
- Add thread status monitoring and dynamic keyword management
- Support priority-based execution and configurable intervals
- Add comprehensive scheduler documentation guide
- Default keywords: AI, 테크놀로지, 경제, 블록체인

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
jungwoo choi
2025-09-15 17:09:22 +09:00
parent 070032006e
commit eeaa9dcb4b
39 changed files with 3472 additions and 759 deletions

check_mongodb.py Normal file
View File

@ -0,0 +1,38 @@
import pymongo
from datetime import datetime
import json
# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["ai_writer_db"]
collection = db["articles"]
# Fetch the two most recently created articles
articles = collection.find().sort("created_at", -1).limit(2)
for article in articles:
    print("=" * 80)
    print(f"Article ID: {article['article_id']}")
    print(f"Keyword: {article['keyword']}")
    print(f"Title: {article['title']}")
    print(f"Summary: {article['summary']}")
    print(f"Processing time: {article['processing_time']:.2f}s")
    print(f"Created at: {article['created_at']}")
    print(f"Pipeline stages: {', '.join(article['pipeline_stages'])}")
    print(f"Categories: {', '.join(article['categories'])}")
    print(f"Tags: {', '.join(article['tags'])}")
    print(f"\nContent (first 500 chars):\n{article['content'][:500]}...")
    print("=" * 80)
    print()
# Save the latest article to a JSON file
with open('generated_article.json', 'w', encoding='utf-8') as f:
    # Re-query the most recent article
    latest = collection.find_one(sort=[("created_at", -1)])
    if latest:
        # Convert ObjectId to string
        latest['_id'] = str(latest['_id'])
        # Convert datetime objects to strings
        latest['created_at'] = latest['created_at'].isoformat()
        json.dump(latest, f, ensure_ascii=False, indent=2)
        print("✅ Saved the latest article to generated_article.json")

View File

@ -0,0 +1,85 @@
version: '3.8'
services:
# Dedicated per-keyword scheduler (keyword: AI)
pipeline-scheduler-ai:
build:
context: ./services/pipeline
dockerfile: scheduler/Dockerfile
container_name: ${COMPOSE_PROJECT_NAME}_scheduler_ai
restart: unless-stopped
depends_on:
- redis
- mongodb
- pipeline-rss-collector
environment:
- REDIS_URL=redis://redis:6379
- MONGODB_URL=mongodb://mongodb:27017
- DB_NAME=ai_writer_db
- KEYWORD=AI
- INTERVAL_MINUTES=60
- PRIORITY=1
- MAX_ARTICLES=100
- LOG_LEVEL=INFO
command: python single_keyword_scheduler.py
volumes:
- ./services/pipeline/shared:/app/shared:ro
networks:
- site11_network
# Dedicated per-keyword scheduler (keyword: 경제)
pipeline-scheduler-economy:
build:
context: ./services/pipeline
dockerfile: scheduler/Dockerfile
container_name: ${COMPOSE_PROJECT_NAME}_scheduler_economy
restart: unless-stopped
depends_on:
- redis
- mongodb
- pipeline-rss-collector
environment:
- REDIS_URL=redis://redis:6379
- MONGODB_URL=mongodb://mongodb:27017
- DB_NAME=ai_writer_db
- KEYWORD=경제
- INTERVAL_MINUTES=120
- PRIORITY=0
- MAX_ARTICLES=100
- LOG_LEVEL=INFO
command: python single_keyword_scheduler.py
volumes:
- ./services/pipeline/shared:/app/shared:ro
networks:
- site11_network
# Dedicated per-keyword scheduler (keyword: 테크놀로지)
pipeline-scheduler-tech:
build:
context: ./services/pipeline
dockerfile: scheduler/Dockerfile
container_name: ${COMPOSE_PROJECT_NAME}_scheduler_tech
restart: unless-stopped
depends_on:
- redis
- mongodb
- pipeline-rss-collector
environment:
- REDIS_URL=redis://redis:6379
- MONGODB_URL=mongodb://mongodb:27017
- DB_NAME=ai_writer_db
- KEYWORD=테크놀로지
- INTERVAL_MINUTES=60
- PRIORITY=1
- MAX_ARTICLES=100
- LOG_LEVEL=INFO
command: python single_keyword_scheduler.py
volumes:
- ./services/pipeline/shared:/app/shared:ro
networks:
- site11_network
networks:
site11_network:
external: true
name: site11_network

View File

@ -481,7 +481,7 @@ services:
restart: unless-stopped
# ============ Pipeline Services ============
# Pipeline Scheduler Service
# Pipeline Multi-threaded Scheduler Service
pipeline-scheduler:
build:
context: ./services/pipeline
@ -494,8 +494,30 @@ services:
environment:
- REDIS_URL=redis://redis:6379
- MONGODB_URL=mongodb://mongodb:27017
- DB_NAME=pipeline_db
- DB_NAME=ai_writer_db
- LOG_LEVEL=INFO
command: python multi_thread_scheduler.py
volumes:
- ./services/pipeline/shared:/app/shared:ro
networks:
- site11_network
# Keyword Manager API Service
keyword-manager:
build:
context: ./services/pipeline
dockerfile: scheduler/Dockerfile
container_name: ${COMPOSE_PROJECT_NAME}_keyword_manager
restart: unless-stopped
depends_on:
- mongodb
environment:
- MONGODB_URL=mongodb://mongodb:27017
- DB_NAME=ai_writer_db
- API_PORT=8100
command: python keyword_manager.py
ports:
- "8100:8100"
volumes:
- ./services/pipeline/shared:/app/shared:ro
networks:
@ -513,6 +535,7 @@ services:
environment:
- REDIS_URL=redis://redis:6379
- LOG_LEVEL=INFO
- RSS_ENQUEUE_DELAY=1.0
volumes:
- ./services/pipeline/shared:/app/shared:ro
networks:
@ -523,7 +546,6 @@ services:
build:
context: ./services/pipeline
dockerfile: google-search/Dockerfile
container_name: ${COMPOSE_PROJECT_NAME}_pipeline_google_search
restart: unless-stopped
depends_on:
- redis
@ -537,30 +559,11 @@ services:
networks:
- site11_network
# Pipeline AI Summarizer Worker
pipeline-ai-summarizer:
# Pipeline AI Article Generator Worker
pipeline-ai-article-generator:
build:
context: ./services/pipeline
dockerfile: ai-summarizer/Dockerfile
container_name: ${COMPOSE_PROJECT_NAME}_pipeline_ai_summarizer
restart: unless-stopped
depends_on:
- redis
environment:
- REDIS_URL=redis://redis:6379
- CLAUDE_API_KEY=${CLAUDE_API_KEY}
- LOG_LEVEL=INFO
volumes:
- ./services/pipeline/shared:/app/shared:ro
networks:
- site11_network
# Pipeline Article Assembly Worker
pipeline-article-assembly:
build:
context: ./services/pipeline
dockerfile: article-assembly/Dockerfile
container_name: ${COMPOSE_PROJECT_NAME}_pipeline_article_assembly
dockerfile: ai-article-generator/Dockerfile
restart: unless-stopped
depends_on:
- redis
@ -568,7 +571,7 @@ services:
environment:
- REDIS_URL=redis://redis:6379
- MONGODB_URL=mongodb://mongodb:27017
- DB_NAME=pipeline_db
- DB_NAME=ai_writer_db
- CLAUDE_API_KEY=${CLAUDE_API_KEY}
- LOG_LEVEL=INFO
volumes:
@ -576,6 +579,7 @@ services:
networks:
- site11_network
# Pipeline Monitor (optional dashboard)
pipeline-monitor:
build:
@ -591,7 +595,7 @@ services:
environment:
- REDIS_URL=redis://redis:6379
- MONGODB_URL=mongodb://mongodb:27017
- DB_NAME=pipeline_db
- DB_NAME=ai_writer_db
- LOG_LEVEL=INFO
volumes:
- ./services/pipeline/shared:/app/shared:ro
@ -603,12 +607,14 @@ services:
build:
context: ./services/pipeline
dockerfile: translator/Dockerfile
container_name: ${COMPOSE_PROJECT_NAME}_pipeline_translator
restart: unless-stopped
depends_on:
- redis
- mongodb
environment:
- REDIS_URL=redis://redis:6379
- MONGODB_URL=mongodb://mongodb:27017
- DB_NAME=ai_writer_db
- DEEPL_API_KEY=${DEEPL_API_KEY}
- LOG_LEVEL=INFO
volumes:
@ -616,6 +622,27 @@ services:
networks:
- site11_network
# Pipeline Language Sync Service
pipeline-language-sync:
build:
context: ./services/pipeline
dockerfile: translator/Dockerfile
container_name: ${COMPOSE_PROJECT_NAME}_pipeline_language_sync
restart: unless-stopped
depends_on:
- mongodb
environment:
- MONGODB_URL=mongodb://mongodb:27017
- DB_NAME=ai_writer_db
- DEEPL_API_KEY=${DEEPL_API_KEY}
- LOG_LEVEL=INFO
command: ["python", "language_sync.py"]
volumes:
- ./services/pipeline/shared:/app/shared:ro
- ./services/pipeline/config:/app/config:ro
networks:
- site11_network
# Pipeline Image Generator
pipeline-image-generator:
build:
@ -625,15 +652,20 @@ services:
restart: unless-stopped
depends_on:
- redis
- mongodb
env_file:
- ./services/pipeline/.env
environment:
- REDIS_URL=redis://redis:6379
- REPLICATE_API_KEY=${REPLICATE_API_KEY:-}
- MONGODB_URL=mongodb://mongodb:27017
- DB_NAME=ai_writer_db
- LOG_LEVEL=INFO
volumes:
- ./services/pipeline/shared:/app/shared:ro
networks:
- site11_network
networks:
site11_network:
driver: bridge

View File

@ -0,0 +1,238 @@
# Pipeline Scheduler Guide
## Overview
The Pipeline Scheduler runs registered keywords on a schedule, automatically collecting news and generating AI-written articles.
## Architecture
### 1. Components
#### 1.1 Multi-Thread Scheduler (pipeline-scheduler)
- **Role**: Manages one thread per keyword and runs them periodically
- **Characteristics**:
  - Runs multiple threads concurrently in a single Docker container
  - One independent thread per keyword
  - Checks for new keywords every 30 seconds and manages threads accordingly
- **Location**: `services/pipeline/scheduler/multi_thread_scheduler.py`
#### 1.2 Keyword Manager API (keyword-manager)
- **Role**: Keyword CRUD and thread monitoring
- **Port**: 8100
- **Key endpoints**:
  - `GET /threads/status` - Get the status of all threads
  - `GET /keywords` - List all keywords
  - `POST /keywords` - Add a new keyword
  - `PUT /keywords/{keyword}` - Update a keyword
  - `DELETE /keywords/{keyword}` - Delete a keyword
  - `POST /keywords/{keyword}/activate` - Activate a keyword
  - `POST /keywords/{keyword}/deactivate` - Deactivate a keyword
  - `POST /keywords/{keyword}/trigger` - Run immediately
- **Location**: `services/pipeline/scheduler/keyword_manager.py`
### 2. Data Model
```python
class Keyword:
    keyword_id: str            # UUID
    keyword: str               # Search keyword
    interval_minutes: int      # Run interval (minutes)
    is_active: bool            # Active flag
    priority: int              # Priority (higher runs first)
    rss_feeds: List[str]       # RSS feed URLs
    max_articles_per_run: int  # Max articles per run
    last_run: datetime         # Time of the last run
    next_run: datetime         # Scheduled time of the next run
```
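As a usage sketch, this is roughly the document that the `POST /keywords` handler (shown later in this commit) builds and stores; `register_keyword` itself is an illustrative helper, not part of the codebase:
```python
import uuid
from datetime import datetime, timedelta

from shared.models import Keyword

async def register_keyword(db, text: str, interval: int = 60, priority: int = 0) -> str:
    """Insert a keyword document; the scheduler picks it up within ~30 seconds."""
    kw = Keyword(
        keyword_id=str(uuid.uuid4()),
        keyword=text,
        interval_minutes=interval,
        priority=priority,
        rss_feeds=[],
        max_articles_per_run=100,
        is_active=True,
        next_run=datetime.now() + timedelta(minutes=1),  # first run after one minute
        created_at=datetime.now(),
        updated_at=datetime.now(),
    )
    await db.keywords.insert_one(kw.model_dump())
    return kw.keyword_id
```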
## Usage
### 1. Starting the Services
```bash
# Start the scheduler and the manager
docker-compose up -d pipeline-scheduler keyword-manager
# Tail the logs
docker-compose logs -f pipeline-scheduler
```
### 2. Managing Keywords
#### 2.1 Adding a Keyword
```bash
curl -X POST http://localhost:8100/keywords \
-H "Content-Type: application/json" \
-d '{
"keyword": "딥러닝",
"interval_minutes": 60,
"priority": 1,
"rss_feeds": [],
"max_articles_per_run": 100,
"is_active": true
}'
```
#### 2.2 Updating a Keyword
```bash
curl -X PUT http://localhost:8100/keywords/딥러닝 \
-H "Content-Type: application/json" \
-d '{
"interval_minutes": 30,
"priority": 2
}'
```
#### 2.3 Activating/Deactivating a Keyword
```bash
# Activate
curl -X POST http://localhost:8100/keywords/딥러닝/activate
# Deactivate
curl -X POST http://localhost:8100/keywords/딥러닝/deactivate
```
#### 2.4 Triggering an Immediate Run
```bash
curl -X POST http://localhost:8100/keywords/딥러닝/trigger
```
#### 2.5 Deleting a Keyword
```bash
curl -X DELETE http://localhost:8100/keywords/딥러닝
```
### 3. Monitoring
#### 3.1 Checking Thread Status
```bash
curl http://localhost:8100/threads/status | python3 -m json.tool
```
Example response:
```json
{
"total_threads": 4,
"active_threads": 4,
"threads": [
{
"keyword": "블록체인",
"keyword_id": "5c7ac9a9-c56f-4878-94ec-adb13f105c8a",
"is_active": true,
"interval_minutes": 30,
"priority": 2,
"last_run": "2025-09-15T08:05:58.807000",
"next_run": "2025-09-15T08:35:58.807000",
"thread_status": "active",
"minutes_until_next_run": 25.3
}
]
}
```
#### 3.2 Listing Keywords
```bash
curl http://localhost:8100/keywords | python3 -m json.tool
```
## How It Works
### 1. Execution Flow
1. **Keyword thread startup**
   - Active keywords are loaded when the scheduler starts
   - One independent thread is created per keyword
2. **Periodic execution**
   - Each thread runs at its configured interval
   - On each run it creates a PipelineJob and pushes it onto the Redis queue
   - The pipeline then proceeds automatically: RSS collection → Google search → AI article generation → translation
3. **Dynamic thread management**
   - The scheduler checks for new keywords every 30 seconds
   - A thread is created automatically when a keyword is added
   - Threads are stopped automatically when a keyword is deactivated or deleted
### 2. Priority Handling
- Higher-priority keywords are processed first
- Jobs are ordered by priority in the Redis queue (see the sketch below)
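The QueueManager internals are not part of this diff; as a hedged illustration, priority ordering on Redis could be implemented with a sorted set, where the `queue:` key naming and JSON encoding below are assumptions rather than the actual implementation:
```python
import json

import redis.asyncio as redis

async def enqueue_with_priority(r: redis.Redis, queue: str, job: dict, priority: int) -> None:
    # Higher priority maps to a lower score, so ZPOPMIN pops it first.
    await r.zadd(f"queue:{queue}", {json.dumps(job): -priority})

async def dequeue_highest(r: redis.Redis, queue: str) -> dict | None:
    popped = await r.zpopmin(f"queue:{queue}", count=1)
    if not popped:
        return None
    member, _score = popped[0]
    return json.loads(member)
```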
### 3. Error Handling
- Each thread handles its own errors independently
- On error, the thread waits one minute and retries
- error_count and last_error are tracked per thread (see the combined sketch below)
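Putting the flow, priority, and retry behavior together, here is a simplified sketch of one keyword thread. The real logic lives in `multi_thread_scheduler.py`; the injected `enqueue_job` callable stands in for the PipelineJob-creation code and is an assumption of this sketch:
```python
import threading
import time
from datetime import datetime, timedelta
from typing import Callable

def keyword_worker(kw: dict, enqueue_job: Callable[[dict], None],
                   stop_event: threading.Event) -> None:
    """One thread per keyword: wait until next_run, enqueue a job, reschedule."""
    while not stop_event.is_set():
        now = datetime.now()
        if kw.get("next_run") and now >= kw["next_run"]:
            try:
                enqueue_job(kw)  # builds a PipelineJob and pushes it onto the rss_collection queue
                kw["last_run"] = now
                kw["next_run"] = now + timedelta(minutes=kw["interval_minutes"])
            except Exception as exc:  # each thread handles its own errors
                kw["error_count"] = kw.get("error_count", 0) + 1
                kw["last_error"] = str(exc)
                time.sleep(60)  # wait one minute, then retry
                continue
        time.sleep(5)  # short poll; also lets /trigger (next_run = now) take effect
```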
## Currently Configured Keywords
| Keyword | Interval | Priority | Status |
|--------|-----------|----------|------|
| 블록체인 | 30 min | 2 | Active |
| AI | 60 min | 1 | Active |
| 테크놀로지 | 60 min | 1 | Active |
| 경제 | 60 min | 0 | Active |
## Notes
1. **Thread management**
   - Keyword additions and deletions are picked up automatically within 30 seconds
   - Thread status can be checked in real time via the keyword-manager API
2. **Run interval**
   - Minimum interval: no hard limit (30 minutes or more is recommended)
   - Changing interval_minutes automatically recalculates the next run time
3. **Duplicate prevention**
   - The same keyword cannot be registered twice
   - Duplicate URLs are filtered automatically during RSS collection
## Troubleshooting
### Threads fail to start
```bash
# Restart the scheduler
docker-compose restart pipeline-scheduler
# Check the logs
docker-compose logs --tail=50 pipeline-scheduler
```
### A keyword is not being executed
```bash
# Check the keyword's status
curl http://localhost:8100/keywords/<keyword> | python3 -m json.tool
# Trigger an immediate run
curl -X POST http://localhost:8100/keywords/<keyword>/trigger
```
### MongoDB connection errors
```bash
# Check MongoDB status
docker-compose ps mongodb
# Restart MongoDB
docker-compose restart mongodb
```
## File Layout
```
services/pipeline/scheduler/
├── multi_thread_scheduler.py    # Multi-threaded scheduler
├── keyword_manager.py           # Keyword management API
├── single_keyword_scheduler.py  # (deprecated) single-keyword scheduler
├── requirements.txt             # Python dependencies
└── Dockerfile                   # Docker image definition
services/pipeline/shared/
└── models.py                    # Keyword and PipelineJob model definitions
```
## Future Improvements
1. **Web dashboard**: real-time monitoring UI
2. **Alerting**: email/Slack notifications on errors
3. **Statistics**: per-keyword collection statistics and analysis
4. **Schedule templates**: save and reload frequently used configurations
5. **Backup/restore**: back up and restore keyword configurations

View File

@ -1,63 +1,36 @@
{
"news_id": "dda43a2b-8478-4bd8-be74-32ab2618a7dd",
"title": "2025년 대한민국 AI 전환 가속화...정부 10조원 투자 계획 발표",
"created_at": "2025-09-12T19:33:52.388833",
"summary": "정부가 내년 인공지능 분야에 10조원 투자 계획을 발표하며 국가 AI 경쟁력 강화에 나서, 산업 전반의 디지털 전환 가속화 전망",
"subtopics": [
{
"title": "정부의 대규모 AI 투자 계획",
"content": [
"정부가 2025년 인공지능 분야에 10조원 규모의 대규모 투자를 단행할 계획을 발표했다. 이는 한국 경제의 체질 개선과 디지털 전환 가속화를 위한 전략적 결정으로 평가받고 있다.",
"투자의 주요 방향은 AI 기술 개발, 인프라 구축, 인재 양성 등 다방면에 걸쳐있다. 특히 피지컬 AI 실증단지 조성과 같은 실용적 프로젝트들이 포함되어 있어 실질적인 산업 발전 효과가 기대된다.",
"대통령실은 AI 정책 추진을 위한 전담 조직을 신설하고, 정부와 업계 간의 가교 역할을 수행할 예정이다. 이를 통해 민관 협력 체계를 강화하고 정책 실행력을 높일 계획이다."
]
},
{
"title": "국내 기업들의 AI 기술 경쟁력 강화",
"content": [
"SK하이닉스는 AI 메모리 분야에서 HBM4 양산을 시작하며 글로벌 기술 경쟁에서 우위를 점하고 있다. 특히 10Gbps 이상의 동작속도 구현에 성공하며 기술력을 입증했다.",
"국내 주요 기업들은 AI 관련 연구개발 투자를 확대하고 있으며, 특히 반도체, 로봇, 소프트웨어 분야에서 괄목할만한 성과를 보이고 있다.",
"산업계는 정부의 대규모 투자 계획에 호응하여 자체적인 AI 혁신 프로그램을 가속화하고 있으며, 글로벌 시장에서의 경쟁력 강화를 위해 총력을 기울이고 있다."
]
},
{
"title": "AI 기본법 시행과 제도적 기반 마련",
"content": [
"정부는 AI 기본법 시행령을 공개하며 인공지능 발전을 위한 제도적 기반을 마련했다. 이를 통해 AI 산업 발전의 법적 근거와 윤리적 가이드라인이 확립될 전망이다.",
"새로운 법제도는 AI 기술의 안전한 발전과 윤리적 활용을 보장하면서도, 기업들의 혁신을 저해하지 않는 균형잡힌 접근을 추구하고 있다.",
"특히 AI 기본법은 개인정보 보호, 알고리즘 투명성, 책임성 등 주요 이슈들에 대한 명확한 기준을 제시하여 산업계의 불확실성을 해소할 것으로 기대된다."
]
}
"_id": "68c5798162bde7a1947d35a7",
"article_id": "17ee889c-a01e-4791-9f51-9336074c842b",
"job_id": "616c1c65-6b43-42a8-98e0-9547208106c8",
"keyword_id": "test_starcraft_001",
"keyword": "스타크래프트",
"title": "Is StarCraft a new tipping point after 15 years?",
"content": "**Nexon secures new StarCraft development rights***.\n\nSouth Korean gaming company Nexon has announced that it has won the rights to develop a new StarCraft game, beating out Blizzard in a bidding war. This is expected to mark a new turning point for the StarCraft IP. Nexon also secured the distribution rights to Blizzard's mobile game called Overwatch 3.\n\n**StarCraft 2, still the benchmark for RTS games***.\n\nFifteen years after its release, StarCraft 2 is still considered the pinnacle of the real-time strategy (RTS) gaming genre. According to a recently published analysis, today's RTS games are still being developed with the same standards of polish and playability that StarCraft 2 set. Blizzard recently released the 5.0.14 PTR update for StarCraft 2, demonstrating its continued support for the game.\n\n**Breaking records on the esports stage\n\nAt the Esports World Cup 2025, Joel \"Serral\" Larsson of Finland won the StarCraft 2 category, setting a new viewership record. This shows that StarCraft 2 still has a strong appeal to gamers around the world.\n\n**The failure of Stormgate, a classic for sequels\n\nOn the other hand, the spiritual successor to StarCraft, Stormgate, created by ex-Blizzard developers, has been criticized as a flop. The developers controversially blamed gamers for its failure. This shows how difficult it is to replicate StarCraft's winning formula.\n\n**Current events and outlook\n\nNexon's acquisition of StarCraft development rights is significant for the Korean gaming industry. It increases the likelihood of new StarCraft titles being developed in Korea, the home of the franchise. However, it will be a challenge to create a game that surpasses StarCraft 2, which has been a standard in the industry for 15 years.",
"summary": "Nexon's acquisition of the new StarCraft development rights marks a turning point for the StarCraft IP, which has been a benchmark in RTS gaming for 15 years.",
"source_items": [],
"images": [
"https://replicate.delivery/xezq/tYOKIl3SG35WPBVBg3ipefCOLUJPm9FCXwyOOrppS1WztyUVA/out-0.png"
],
"categories": [
"기술",
"정책",
"산업",
"경제"
"游戏",
"Esports"
],
"entities": {
"people": [
"강훈식",
"배경훈 과기정통부 장관"
"tags": [
"StarCraft",
"Nexon",
"RTS games",
"Blizzard",
"Esports"
],
"organizations": [
"SK하이닉스",
"대통령실",
"과학기술정보통신부"
"created_at": "2025-09-13T14:02:41.857000",
"pipeline_stages": [
"rss_collection",
"search_enrichment",
"ai_article_generation",
"image_generation",
"translation"
],
"groups": [
"AI 산업계",
"반도체 업계"
],
"countries": [
"대한민국",
"미국"
],
"events": [
"AI 기본법 시행",
"정부 10조원 투자 계획 발표"
]
},
"source_keyword": "인공지능",
"source_count": 5
"processing_time": 19.447839,
"language": "en",
"ref_news_id": "20a6bb85-8c61-41db-82fc-d52e0d88204d"
}

View File

@ -3,17 +3,17 @@ FROM python:3.11-slim
WORKDIR /app
# Install dependencies
COPY ./ai-summarizer/requirements.txt .
COPY ./ai-article-generator/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy shared module
COPY ./shared /app/shared
# Copy AI Summarizer code
COPY ./ai-summarizer /app
# Copy AI Article Generator code
COPY ./ai-article-generator /app
# Environment variables
ENV PYTHONUNBUFFERED=1
# Run
CMD ["python", "ai_summarizer.py"]
CMD ["python", "ai_article_generator.py"]

View File

@ -0,0 +1,300 @@
"""
AI Article Generator Service
News article generation service powered by the Claude API
"""
import asyncio
import logging
import os
import sys
import json
from datetime import datetime
from typing import List, Dict, Any
from anthropic import AsyncAnthropic
from motor.motor_asyncio import AsyncIOMotorClient
# Import from shared module
from shared.models import PipelineJob, EnrichedItem, FinalArticle, Subtopic, Entities, NewsReference
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class AIArticleGeneratorWorker:
def __init__(self):
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.claude_api_key = os.getenv("CLAUDE_API_KEY")
self.claude_client = None
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")  # uses ai_writer_db
self.db = None
    async def start(self):
        """Start the worker."""
        logger.info("Starting AI Article Generator Worker")
        # Connect to Redis
        await self.queue_manager.connect()
        # Connect to MongoDB
        client = AsyncIOMotorClient(self.mongodb_url)
        self.db = client[self.db_name]
        # Initialize the Claude client
        if self.claude_api_key:
            self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
        else:
            logger.error("Claude API key not configured")
            return
        # Main processing loop
        while True:
            try:
                # Fetch a job from the queue
job = await self.queue_manager.dequeue('ai_article_generation', timeout=5)
if job:
await self.process_job(job)
except Exception as e:
logger.error(f"Error in worker loop: {e}")
await asyncio.sleep(1)
    async def process_job(self, job: PipelineJob):
        """Handle an AI article generation job for a single RSS item."""
        try:
            start_time = datetime.now()
            logger.info(f"Processing job {job.job_id} for AI article generation")
            # Process a single enriched item
            enriched_item_data = job.data.get('enriched_item')
            if not enriched_item_data:
                # Backward compatibility with the previous job format
enriched_items = job.data.get('enriched_items', [])
if enriched_items:
enriched_item_data = enriched_items[0]
else:
logger.warning(f"No enriched item in job {job.job_id}")
await self.queue_manager.mark_failed(
'ai_article_generation',
job,
"No enriched item to process"
)
return
enriched_item = EnrichedItem(**enriched_item_data)
            # Generate the article
            article = await self._generate_article(job, enriched_item)
            # Compute processing time
            processing_time = (datetime.now() - start_time).total_seconds()
            article.processing_time = processing_time
            # Save to MongoDB (ai_writer_db.articles_ko)
            result = await self.db.articles_ko.insert_one(article.model_dump())
            mongodb_id = str(result.inserted_id)
            logger.info(f"Article {article.news_id} saved to MongoDB with _id: {mongodb_id}")
            # Hand off to the next stage (image generation)
job.data['news_id'] = article.news_id
job.data['mongodb_id'] = mongodb_id
job.stages_completed.append('ai_article_generation')
job.stage = 'image_generation'
await self.queue_manager.enqueue('image_generation', job)
await self.queue_manager.mark_completed('ai_article_generation', job.job_id)
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")
await self.queue_manager.mark_failed('ai_article_generation', job, str(e))
    async def _generate_article(self, job: PipelineJob, enriched_item: EnrichedItem) -> FinalArticle:
        """Generate an article with Claude."""
        # RSS item info
        rss_item = enriched_item.rss_item
        search_results = enriched_item.search_results
        # Prepare the search-result text (up to 10 results)
search_text = ""
if search_results:
search_text = "\n관련 검색 결과:\n"
for idx, result in enumerate(search_results[:10], 1):
search_text += f"{idx}. {result.title}\n"
if result.snippet:
search_text += f" {result.snippet}\n"
        # Draft the article with Claude
prompt = f"""다음 뉴스 정보를 바탕으로 상세한 기사를 작성해주세요.
키워드: {job.keyword}
뉴스 정보:
제목: {rss_item.title}
요약: {rss_item.summary or '내용 없음'}
링크: {rss_item.link}
{search_text}
다음 JSON 형식으로 작성해주세요:
{{
"title": "기사 제목 (50자 이내)",
"summary": "한 줄 요약 (100자 이내)",
"subtopics": [
{{
"title": "소제목1",
"content": ["문단1", "문단2", "문단3"]
}},
{{
"title": "소제목2",
"content": ["문단1", "문단2"]
}},
{{
"title": "소제목3",
"content": ["문단1", "문단2"]
}}
],
"categories": ["카테고리1", "카테고리2"],
"entities": {{
"people": ["인물1", "인물2"],
"organizations": ["조직1", "조직2"],
"groups": ["그룹1"],
"countries": ["국가1"],
"events": ["이벤트1"]
}}
}}
요구사항:
- 3개의 소제목로 구성
- 각 소제목별로 2-3개 문단
- 전문적이고 객관적인 톤
- 한국어로 작성
- 실제 정보를 바탕으로 구체적으로 작성"""
try:
response = await self.claude_client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=4000,
temperature=0.7,
messages=[
{"role": "user", "content": prompt}
]
)
            # Parse the JSON response
content_text = response.content[0].text
json_start = content_text.find('{')
json_end = content_text.rfind('}') + 1
if json_start != -1 and json_end > json_start:
article_data = json.loads(content_text[json_start:json_end])
else:
raise ValueError("No valid JSON in response")
            # Build Subtopic objects
subtopics = []
for subtopic_data in article_data.get('subtopics', []):
subtopics.append(Subtopic(
title=subtopic_data.get('title', ''),
content=subtopic_data.get('content', [])
))
            # Build the Entities object
entities_data = article_data.get('entities', {})
entities = Entities(
people=entities_data.get('people', []),
organizations=entities_data.get('organizations', []),
groups=entities_data.get('groups', []),
countries=entities_data.get('countries', []),
events=entities_data.get('events', [])
)
            # Build the reference list
            references = []
            # Add the original RSS item
references.append(NewsReference(
title=rss_item.title,
link=rss_item.link,
source=rss_item.source_feed,
published=rss_item.published
))
            # Add search-result references (up to 9; 10 total with the RSS original)
            for search_result in search_results[:9]:  # top 9 results
references.append(NewsReference(
title=search_result.title,
link=search_result.link,
source=search_result.source,
published=None
))
            # Build the FinalArticle (ai_writer_db.articles schema)
article = FinalArticle(
title=article_data.get('title', rss_item.title),
summary=article_data.get('summary', ''),
subtopics=subtopics,
categories=article_data.get('categories', []),
entities=entities,
source_keyword=job.keyword,
source_count=len(references),
references=references,
job_id=job.job_id,
keyword_id=job.keyword_id,
pipeline_stages=job.stages_completed.copy(),
language='ko',
                rss_guid=rss_item.guid  # store the RSS GUID
)
return article
except Exception as e:
logger.error(f"Error generating article: {e}")
            # Build a fallback article
fallback_references = [NewsReference(
title=rss_item.title,
link=rss_item.link,
source=rss_item.source_feed,
published=rss_item.published
)]
return FinalArticle(
title=rss_item.title,
summary=rss_item.summary[:100] if rss_item.summary else '',
subtopics=[
Subtopic(
title="주요 내용",
content=[rss_item.summary or rss_item.title]
)
],
categories=['자동생성'],
entities=Entities(),
source_keyword=job.keyword,
source_count=1,
references=fallback_references,
job_id=job.job_id,
keyword_id=job.keyword_id,
pipeline_stages=job.stages_completed.copy(),
language='ko',
                rss_guid=rss_item.guid  # store the RSS GUID
)
    async def stop(self):
        """Stop the worker."""
await self.queue_manager.disconnect()
logger.info("AI Article Generator Worker stopped")
async def main():
    """Main entry point."""
worker = AIArticleGeneratorWorker()
try:
await worker.start()
except KeyboardInterrupt:
logger.info("Received interrupt signal")
finally:
await worker.stop()
if __name__ == "__main__":
asyncio.run(main())

View File

@ -1,5 +1,5 @@
anthropic==0.50.0
motor==3.1.1
pymongo==4.3.3
redis[hiredis]==5.0.1
pydantic==2.5.0
motor==3.1.1
pymongo==4.3.3

View File

@ -1,161 +0,0 @@
"""
AI Summarizer Service
News summarization service powered by the Claude API
"""
import asyncio
import logging
import os
import sys
from typing import List, Dict, Any
from anthropic import AsyncAnthropic
# Import from shared module
from shared.models import PipelineJob, EnrichedItem, SummarizedItem
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class AISummarizerWorker:
def __init__(self):
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.claude_api_key = os.getenv("CLAUDE_API_KEY")
self.claude_client = None
    async def start(self):
        """Start the worker."""
        logger.info("Starting AI Summarizer Worker")
        # Connect to Redis
        await self.queue_manager.connect()
        # Initialize the Claude client
        if self.claude_api_key:
            self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
        else:
            logger.error("Claude API key not configured")
            return
        # Main processing loop
        while True:
            try:
                # Fetch a job from the queue
                job = await self.queue_manager.dequeue('ai_summarization', timeout=5)
if job:
await self.process_job(job)
except Exception as e:
logger.error(f"Error in worker loop: {e}")
await asyncio.sleep(1)
    async def process_job(self, job: PipelineJob):
        """Handle an AI summarization job."""
        try:
            logger.info(f"Processing job {job.job_id} for AI summarization")
            enriched_items = job.data.get('enriched_items', [])
            summarized_items = []
            for item_data in enriched_items:
                enriched_item = EnrichedItem(**item_data)
                # Generate the AI summary
summary = await self._generate_summary(enriched_item)
summarized_item = SummarizedItem(
enriched_item=enriched_item,
ai_summary=summary,
summary_language='ko'
)
summarized_items.append(summarized_item)
                # API rate limiting
await asyncio.sleep(1)
if summarized_items:
logger.info(f"Summarized {len(summarized_items)} items")
                # Hand off to the next stage (translation)
job.data['summarized_items'] = [item.dict() for item in summarized_items]
job.stages_completed.append('ai_summarization')
job.stage = 'translation'
await self.queue_manager.enqueue('translation', job)
await self.queue_manager.mark_completed('ai_summarization', job.job_id)
else:
logger.warning(f"No items summarized for job {job.job_id}")
await self.queue_manager.mark_failed(
'ai_summarization',
job,
"No items to summarize"
)
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")
await self.queue_manager.mark_failed('ai_summarization', job, str(e))
    async def _generate_summary(self, enriched_item: EnrichedItem) -> str:
        """Generate a summary with Claude."""
        try:
            # Prepare the content
content_parts = [
f"제목: {enriched_item.rss_item.title}",
f"요약: {enriched_item.rss_item.summary or '없음'}"
]
            # Append search results
if enriched_item.search_results:
content_parts.append("\n관련 검색 결과:")
for idx, result in enumerate(enriched_item.search_results[:3], 1):
content_parts.append(f"{idx}. {result.title}")
if result.snippet:
content_parts.append(f" {result.snippet}")
content = "\n".join(content_parts)
            # Call the Claude API
prompt = f"""다음 뉴스 내용을 200자 이내로 핵심만 요약해주세요.
중요한 사실, 수치, 인물, 조직을 포함하고 객관적인 톤을 유지하세요.
{content}
요약:"""
response = await self.claude_client.messages.create(
                model="claude-sonnet-4-20250514",  # latest Sonnet model
max_tokens=500,
temperature=0.3,
messages=[
{"role": "user", "content": prompt}
]
)
summary = response.content[0].text.strip()
return summary
except Exception as e:
logger.error(f"Error generating summary: {e}")
            # Fallback: use the original summary
return enriched_item.rss_item.summary[:200] if enriched_item.rss_item.summary else enriched_item.rss_item.title
    async def stop(self):
        """Stop the worker."""
await self.queue_manager.disconnect()
logger.info("AI Summarizer Worker stopped")
async def main():
    """Main entry point."""
worker = AISummarizerWorker()
try:
await worker.start()
except KeyboardInterrupt:
logger.info("Received interrupt signal")
finally:
await worker.stop()
if __name__ == "__main__":
asyncio.run(main())

View File

@ -1,3 +0,0 @@
anthropic==0.50.0
redis[hiredis]==5.0.1
pydantic==2.5.0

View File

@ -1,19 +0,0 @@
FROM python:3.11-slim
WORKDIR /app
# Install dependencies
COPY ./article-assembly/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy shared module
COPY ./shared /app/shared
# Copy Article Assembly code
COPY ./article-assembly /app
# Environment variables
ENV PYTHONUNBUFFERED=1
# Run
CMD ["python", "article_assembly.py"]

View File

@ -1,234 +0,0 @@
"""
Article Assembly Service
Final article assembly and MongoDB persistence service
"""
import asyncio
import logging
import os
import sys
import json
from datetime import datetime
from typing import List, Dict, Any
from anthropic import AsyncAnthropic
from motor.motor_asyncio import AsyncIOMotorClient
# Import from shared module
from shared.models import PipelineJob, SummarizedItem, FinalArticle
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ArticleAssemblyWorker:
def __init__(self):
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.claude_api_key = os.getenv("CLAUDE_API_KEY")
self.claude_client = None
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "pipeline_db")
self.db = None
    async def start(self):
        """Start the worker."""
        logger.info("Starting Article Assembly Worker")
        # Connect to Redis
        await self.queue_manager.connect()
        # Connect to MongoDB
        client = AsyncIOMotorClient(self.mongodb_url)
        self.db = client[self.db_name]
        # Initialize the Claude client
        if self.claude_api_key:
            self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
        else:
            logger.error("Claude API key not configured")
            return
        # Main processing loop
        while True:
            try:
                # Fetch a job from the queue
                job = await self.queue_manager.dequeue('article_assembly', timeout=5)
if job:
await self.process_job(job)
except Exception as e:
logger.error(f"Error in worker loop: {e}")
await asyncio.sleep(1)
    async def process_job(self, job: PipelineJob):
        """Handle a final article assembly job."""
try:
start_time = datetime.now()
logger.info(f"Processing job {job.job_id} for article assembly")
summarized_items = job.data.get('summarized_items', [])
if not summarized_items:
logger.warning(f"No items to assemble for job {job.job_id}")
await self.queue_manager.mark_failed(
'article_assembly',
job,
"No items to assemble"
)
return
            # Generate the final article
            article = await self._generate_final_article(job, summarized_items)
            # Compute processing time
            processing_time = (datetime.now() - start_time).total_seconds()
            article.processing_time = processing_time
            # Save to MongoDB
            await self.db.articles.insert_one(article.dict())
            logger.info(f"Article {article.article_id} saved to MongoDB")
            # Mark completed
            job.stages_completed.append('article_assembly')
            await self.queue_manager.mark_completed('article_assembly', job.job_id)
            # Update statistics
            await self._update_statistics(job.keyword_id)
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")
await self.queue_manager.mark_failed('article_assembly', job, str(e))
async def _generate_final_article(
self,
job: PipelineJob,
summarized_items: List[Dict]
) -> FinalArticle:
"""Claude를 사용한 최종 기사 생성"""
# 아이템 정보 준비
items_text = []
for idx, item_data in enumerate(summarized_items, 1):
item = SummarizedItem(**item_data)
items_text.append(f"""
[뉴스 {idx}]
제목: {item.enriched_item['rss_item']['title']}
요약: {item.ai_summary}
출처: {item.enriched_item['rss_item']['link']}
""")
content = "\n".join(items_text)
        # Draft the combined article with Claude
prompt = f"""다음 뉴스 항목들을 바탕으로 종합적인 기사를 작성해주세요.
키워드: {job.keyword}
뉴스 항목들:
{content}
다음 JSON 형식으로 작성해주세요:
{{
"title": "종합 기사 제목",
"content": "기사 본문 (1500자 이내, 문단 구분)",
"summary": "한 줄 요약 (100자 이내)",
"categories": ["카테고리1", "카테고리2"],
"tags": ["태그1", "태그2", "태그3"]
}}
요구사항:
- 전문적이고 객관적인 톤
- 핵심 정보와 트렌드 파악
- 시사점 포함
- 한국 독자 대상"""
try:
response = await self.claude_client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=3000,
temperature=0.7,
messages=[
{"role": "user", "content": prompt}
]
)
            # Parse the JSON response
content_text = response.content[0].text
json_start = content_text.find('{')
json_end = content_text.rfind('}') + 1
if json_start != -1 and json_end > json_start:
article_data = json.loads(content_text[json_start:json_end])
else:
raise ValueError("No valid JSON in response")
            # Build the FinalArticle
article = FinalArticle(
job_id=job.job_id,
keyword_id=job.keyword_id,
keyword=job.keyword,
title=article_data.get('title', f"{job.keyword} 종합 뉴스"),
content=article_data.get('content', ''),
summary=article_data.get('summary', ''),
                source_items=[],  # simplified
                images=[],  # images are handled by a separate service
                categories=article_data.get('categories', []),
                tags=article_data.get('tags', []),
                pipeline_stages=job.stages_completed,
                processing_time=0  # updated later
)
return article
except Exception as e:
logger.error(f"Error generating article: {e}")
            # Build a fallback article
return FinalArticle(
job_id=job.job_id,
keyword_id=job.keyword_id,
keyword=job.keyword,
title=f"{job.keyword} 뉴스 요약 - {datetime.now().strftime('%Y-%m-%d')}",
content=content,
summary=f"{job.keyword} 관련 {len(summarized_items)}개 뉴스 요약",
source_items=[],
images=[],
categories=['자동생성'],
tags=[job.keyword],
pipeline_stages=job.stages_completed,
processing_time=0
)
    async def _update_statistics(self, keyword_id: str):
        """Update per-keyword statistics."""
try:
await self.db.keyword_stats.update_one(
{"keyword_id": keyword_id},
{
"$inc": {"articles_generated": 1},
"$set": {"last_generated": datetime.now()}
},
upsert=True
)
except Exception as e:
logger.error(f"Error updating statistics: {e}")
    async def stop(self):
        """Stop the worker."""
await self.queue_manager.disconnect()
logger.info("Article Assembly Worker stopped")
async def main():
    """Main entry point."""
worker = ArticleAssemblyWorker()
try:
await worker.start()
except KeyboardInterrupt:
logger.info("Received interrupt signal")
finally:
await worker.stop()
if __name__ == "__main__":
asyncio.run(main())

View File

@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""키워드 데이터베이스 확인 스크립트"""
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
async def check_keywords():
client = AsyncIOMotorClient("mongodb://localhost:27017")
db = client.ai_writer_db
    # Fetch keywords
    keywords = await db.keywords.find().to_list(None)
    print(f"\n=== Registered keywords: {len(keywords)} ===\n")
    for kw in keywords:
        print(f"Keyword: {kw['keyword']}")
        print(f"  - ID: {kw['keyword_id']}")
        print(f"  - Interval: {kw['interval_minutes']} min")
        print(f"  - Active: {kw['is_active']}")
        print(f"  - Priority: {kw['priority']}")
        print(f"  - RSS feeds: {len(kw.get('rss_feeds', []))}")
        if kw.get('last_run'):
            print(f"  - Last run: {kw['last_run']}")
        if kw.get('next_run'):
            next_run = kw['next_run']
            remaining = (next_run - datetime.now()).total_seconds() / 60
            print(f"  - Next run: {next_run} (in {remaining:.1f} min)")
        print()
client.close()
if __name__ == "__main__":
asyncio.run(check_keywords())

View File

@ -0,0 +1,85 @@
{
"enabled_languages": [
{
"code": "en",
"name": "English",
"deepl_code": "EN",
"collection": "articles_en",
"enabled": true
},
{
"code": "zh-CN",
"name": "Chinese (Simplified)",
"deepl_code": "ZH",
"collection": "articles_zh_cn",
"enabled": false
},
{
"code": "zh-TW",
"name": "Chinese (Traditional)",
"deepl_code": "ZH-HANT",
"collection": "articles_zh_tw",
"enabled": false
},
{
"code": "ja",
"name": "Japanese",
"deepl_code": "JA",
"collection": "articles_ja",
"enabled": false
},
{
"code": "fr",
"name": "French",
"deepl_code": "FR",
"collection": "articles_fr",
"enabled": false
},
{
"code": "de",
"name": "German",
"deepl_code": "DE",
"collection": "articles_de",
"enabled": false
},
{
"code": "es",
"name": "Spanish",
"deepl_code": "ES",
"collection": "articles_es",
"enabled": false
},
{
"code": "pt",
"name": "Portuguese",
"deepl_code": "PT",
"collection": "articles_pt",
"enabled": false
},
{
"code": "ru",
"name": "Russian",
"deepl_code": "RU",
"collection": "articles_ru",
"enabled": false
},
{
"code": "it",
"name": "Italian",
"deepl_code": "IT",
"collection": "articles_it",
"enabled": false
}
],
"source_language": {
"code": "ko",
"name": "Korean",
"collection": "articles_ko"
},
"translation_settings": {
"batch_size": 5,
"delay_between_languages": 2.0,
"delay_between_articles": 0.5,
"max_retries": 3
}
}

View File

@ -48,16 +48,27 @@ class GoogleSearchWorker:
await asyncio.sleep(1)
async def process_job(self, job: PipelineJob):
"""검색 강화 작업 처리"""
"""검색 강화 작업 처리 - 단일 RSS 아이템"""
try:
logger.info(f"Processing job {job.job_id} for search enrichment")
# 단일 RSS 아이템 처리
rss_item_data = job.data.get('rss_item')
if not rss_item_data:
# 이전 버전 호환성 - 여러 아이템 처리
rss_items = job.data.get('rss_items', [])
enriched_items = []
if rss_items:
rss_item_data = rss_items[0] # 첫 번째 아이템만 처리
else:
logger.warning(f"No RSS item in job {job.job_id}")
await self.queue_manager.mark_failed(
'search_enrichment',
job,
"No RSS item to process"
)
return
            # Process at most 5 items (API quota management)
            for item_data in rss_items[:5]:
                rss_item = RSSItem(**item_data)
            rss_item = RSSItem(**rss_item_data)
            # Search Google using the item title
search_results = await self._search_google(rss_item.title)
@ -66,28 +77,16 @@ class GoogleSearchWorker:
rss_item=rss_item,
search_results=search_results
)
enriched_items.append(enriched_item)
                # API rate limiting
await asyncio.sleep(0.5)
logger.info(f"Enriched item with {len(search_results)} search results")
if enriched_items:
logger.info(f"Enriched {len(enriched_items)} items with search results")
            # Hand off to the next stage
            job.data['enriched_items'] = [item.dict() for item in enriched_items]
            # Hand off to the next stage: a single enriched item
job.data['enriched_item'] = enriched_item.dict()
job.stages_completed.append('search_enrichment')
job.stage = 'ai_summarization'
job.stage = 'ai_article_generation'
await self.queue_manager.enqueue('ai_summarization', job)
await self.queue_manager.enqueue('ai_article_generation', job)
await self.queue_manager.mark_completed('search_enrichment', job.job_id)
else:
logger.warning(f"No items enriched for job {job.job_id}")
await self.queue_manager.mark_failed(
'search_enrichment',
job,
"No items to enrich"
)
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")

View File

@ -10,9 +10,11 @@ import base64
from typing import List, Dict, Any
import httpx
from io import BytesIO
from motor.motor_asyncio import AsyncIOMotorClient
from bson import ObjectId
# Import from shared module
from shared.models import PipelineJob, TranslatedItem, GeneratedImageItem
from shared.models import PipelineJob
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
@ -23,10 +25,13 @@ class ImageGeneratorWorker:
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.replicate_api_key = os.getenv("REPLICATE_API_KEY")
self.replicate_api_key = os.getenv("REPLICATE_API_TOKEN")
self.replicate_api_url = "https://api.replicate.com/v1/predictions"
        # Use a Stable Diffusion model
self.model_version = "stability-ai/sdxl:39ed52f2a78e934b3ba6e2a89f5b1c712de7dfea535525255b1aa35c5565e08b"
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
async def start(self):
"""워커 시작"""
@ -35,6 +40,10 @@ class ImageGeneratorWorker:
        # Connect to Redis
await self.queue_manager.connect()
        # Connect to MongoDB
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
        # Check the API key
if not self.replicate_api_key:
logger.warning("Replicate API key not configured - using placeholder images")
@ -53,67 +62,89 @@ class ImageGeneratorWorker:
await asyncio.sleep(1)
async def process_job(self, job: PipelineJob):
"""이미지 생성 작업 처리"""
"""이미지 생성 및 MongoDB 업데이트"""
try:
logger.info(f"Processing job {job.job_id} for image generation")
translated_items = job.data.get('translated_items', [])
generated_items = []
            # Fetch article info from MongoDB
news_id = job.data.get('news_id')
mongodb_id = job.data.get('mongodb_id')
            # Generate images for at most 3 items (API cost savings)
for idx, item_data in enumerate(translated_items[:3]):
translated_item = TranslatedItem(**item_data)
if not news_id:
logger.error(f"No news_id in job {job.job_id}")
await self.queue_manager.mark_failed('image_generation', job, "No news_id")
return
                # Build the image generation prompt
prompt = self._create_image_prompt(translated_item)
            # Look up the Korean article in MongoDB (articles_ko)
article = await self.db.articles_ko.find_one({"news_id": news_id})
if not article:
logger.error(f"Article {news_id} not found in MongoDB")
await self.queue_manager.mark_failed('image_generation', job, "Article not found")
return
                # Generate the image
            # Build the image prompt (based on the Korean article)
            prompt = self._create_image_prompt_from_article(article)
            # Generate images (up to 3)
            image_urls = []
            for i in range(min(3, 1)):  # min(3, 1) == 1: generate a single image while testing
image_url = await self._generate_image(prompt)
generated_item = GeneratedImageItem(
translated_item=translated_item,
image_url=image_url,
image_prompt=prompt
)
generated_items.append(generated_item)
image_urls.append(image_url)
                # API rate limiting
if self.replicate_api_key:
if self.replicate_api_key and i < 2:
await asyncio.sleep(2)
if generated_items:
logger.info(f"Generated images for {len(generated_items)} items")
            # Update MongoDB (attach images to articles_ko)
await self.db.articles_ko.update_one(
{"news_id": news_id},
{
"$set": {
"images": image_urls,
"image_prompt": prompt
},
"$addToSet": {
"pipeline_stages": "image_generation"
}
}
)
            # Store the completed data on the job
job.data['generated_items'] = [item.dict() for item in generated_items]
job.stages_completed.append('image_generation')
job.stage = 'completed'
logger.info(f"Updated article {news_id} with {len(image_urls)} images")
            # Hand off to the final assembly stage (already rerouted to article-assembly)
await self.queue_manager.enqueue('article_assembly', job)
await self.queue_manager.mark_completed('image_generation', job.job_id)
else:
logger.warning(f"No images generated for job {job.job_id}")
                # Proceed to the next stage even if image generation fails
            # Hand off to the next stage (translation)
job.stages_completed.append('image_generation')
await self.queue_manager.enqueue('article_assembly', job)
job.stage = 'translation'
await self.queue_manager.enqueue('translation', job)
await self.queue_manager.mark_completed('image_generation', job.job_id)
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")
            # Proceed to the next stage even if image generation fails
job.stages_completed.append('image_generation')
await self.queue_manager.enqueue('article_assembly', job)
await self.queue_manager.mark_completed('image_generation', job.job_id)
await self.queue_manager.mark_failed('image_generation', job, str(e))
    def _create_image_prompt(self, translated_item: TranslatedItem) -> str:
        """Build a prompt for image generation."""
        # Build the prompt from the English title and summary
        title = translated_item.translated_title or translated_item.summarized_item['enriched_item']['rss_item']['title']
        summary = translated_item.translated_summary or translated_item.summarized_item['ai_summary']
    def _create_image_prompt_from_article(self, article: Dict) -> str:
        """Build an image prompt from an article."""
        # Build the prompt from the keyword and title
keyword = article.get('keyword', '')
title = article.get('title', '')
categories = article.get('categories', [])
        # Category mapping (Korean -> English)
category_map = {
'기술': 'technology',
'경제': 'business',
'정치': 'politics',
'교육': 'education',
'사회': 'society',
'문화': 'culture',
'과학': 'science'
}
eng_categories = [category_map.get(cat, cat) for cat in categories]
category_str = ', '.join(eng_categories[:2]) if eng_categories else 'news'
        # Prompt for a news-style image
prompt = f"News illustration for: {title[:100]}, professional, photorealistic, high quality, 4k"
prompt = f"News illustration for {keyword} {category_str}, professional, modern, clean design, high quality, 4k, no text"
return prompt

View File

@ -1,3 +1,5 @@
httpx==0.25.0
redis[hiredis]==5.0.1
pydantic==2.5.0
motor==3.1.1
pymongo==4.3.3

View File

@ -46,7 +46,7 @@ async def startup_event():
# MongoDB 연결
mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
mongodb_client = AsyncIOMotorClient(mongodb_url)
db = mongodb_client[os.getenv("DB_NAME", "pipeline_db")]
db = mongodb_client[os.getenv("DB_NAME", "ai_writer_db")]
logger.info("Pipeline Monitor started successfully")

View File

@ -2,3 +2,4 @@ feedparser==6.0.11
aiohttp==3.9.1
redis[hiredis]==5.0.1
pydantic==2.5.0
motor==3.6.0

View File

@ -11,6 +11,7 @@ from datetime import datetime
import feedparser
import aiohttp
import redis.asyncio as redis
from motor.motor_asyncio import AsyncIOMotorClient
from typing import List, Dict, Any
# Import from shared module
@ -27,8 +28,11 @@ class RSSCollectorWorker:
)
self.redis_client = None
self.redis_url = os.getenv("REDIS_URL", "redis://redis:6379")
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
        self.dedup_ttl = 86400 * 7  # 7-day dedup window
        self.max_items_per_feed = 10  # max items per feed
        self.max_items_per_feed = 100  # max items per feed (Google News returns up to 100)
async def start(self):
"""워커 시작"""
@ -42,6 +46,10 @@ class RSSCollectorWorker:
decode_responses=True
)
        # Connect to MongoDB
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
        # Main processing loop
while True:
try:
@ -60,9 +68,20 @@ class RSSCollectorWorker:
try:
logger.info(f"Processing job {job.job_id} for keyword '{job.keyword}'")
            keyword = job.data.get('keyword', '')
            keyword = job.keyword  # keyword is a direct attribute of the job
            rss_feeds = job.data.get('rss_feeds', [])
            # If no RSS feeds are configured, fall back to defaults
            if not rss_feeds:
                # Default RSS feeds (Google News RSS)
rss_feeds = [
f"https://news.google.com/rss/search?q={keyword}&hl=en-US&gl=US&ceid=US:en",
f"https://news.google.com/rss/search?q={keyword}&hl=ko&gl=KR&ceid=KR:ko",
"https://feeds.bbci.co.uk/news/technology/rss.xml",
"https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"
]
logger.info(f"Using default RSS feeds for keyword: {keyword}")
            # Build RSS URLs containing the keyword
processed_feeds = self._prepare_feeds(rss_feeds, keyword)
@ -82,15 +101,43 @@ class RSSCollectorWorker:
if unique_items:
logger.info(f"Collected {len(unique_items)} unique items for '{keyword}'")
                # Hand off to the next stage
job.data['rss_items'] = [item.dict() for item in unique_items]
job.stages_completed.append('rss_collection')
job.stage = 'search_enrichment'
                # Create one job per RSS item and hand each to the next stage.
                # A delay spreads out the API calls (default 1 s; tunable)
enqueue_delay = float(os.getenv("RSS_ENQUEUE_DELAY", "1.0"))
await self.queue_manager.enqueue('search_enrichment', job)
for idx, item in enumerate(unique_items):
                    # Create a new job for this item
item_job = PipelineJob(
keyword_id=f"{job.keyword_id}_{idx}",
keyword=job.keyword,
stage='search_enrichment',
data={
                            'rss_item': item.dict(),  # single item
'original_job_id': job.job_id,
'item_index': idx,
'total_items': len(unique_items),
'item_hash': hashlib.md5(
f"{keyword}:guid:{item.guid}".encode() if item.guid
else f"{keyword}:title:{item.title}:link:{item.link}".encode()
                            ).hexdigest()  # hash of the GUID, or of title+link
},
stages_completed=['rss_collection']
)
                    # Hand the individual item to the next stage
await self.queue_manager.enqueue('search_enrichment', item_job)
logger.info(f"Enqueued item {idx+1}/{len(unique_items)} for keyword '{keyword}'")
                    # Delay before enqueueing the next item (skip after the last one)
if idx < len(unique_items) - 1:
await asyncio.sleep(enqueue_delay)
logger.debug(f"Waiting {enqueue_delay}s before next item...")
                # Mark the original job completed
await self.queue_manager.mark_completed('rss_collection', job.job_id)
logger.info(f"Completed RSS collection for job {job.job_id}: {len(unique_items)} items processed")
else:
logger.info(f"No new items found for '{keyword}'")
logger.info(f"No new items found for '{keyword}' after deduplication")
await self.queue_manager.mark_completed('rss_collection', job.job_id)
else:
logger.warning(f"No RSS items collected for '{keyword}'")
@ -126,21 +173,34 @@ class RSSCollectorWorker:
                # Parse with feedparser
feed = feedparser.parse(content)
logger.info(f"Found {len(feed.entries)} entries in feed {feed_url}")
for entry in feed.entries[:self.max_items_per_feed]:
                    # Keyword relevance check
title = entry.get('title', '')
summary = entry.get('summary', '')
                    # Include the entry if the keyword appears in the title or summary
                    if keyword.lower() in title.lower() or keyword.lower() in summary.lower():
                    # Case-insensitive keyword matching (for ASCII keywords)
                    title_lower = title.lower() if keyword.isascii() else title
                    summary_lower = summary.lower() if keyword.isascii() else summary
                    keyword_lower = keyword.lower() if keyword.isascii() else keyword
                    # Include the entry if the keyword appears in the title or summary.
                    # Google News RSS is already a keyword search, so include every entry
if "news.google.com" in feed_url or keyword_lower in title_lower or keyword_lower in summary_lower:
                        # Extract the GUID (commonly present in Google RSS)
guid = entry.get('id', entry.get('guid', ''))
item = RSSItem(
title=title,
link=entry.get('link', ''),
                            guid=guid,  # include the GUID
published=entry.get('published', ''),
summary=summary[:500] if summary else '',
source_feed=feed_url
)
items.append(item)
logger.debug(f"Added item: {title[:50]}... (guid: {guid[:30] if guid else 'no-guid'})")
except Exception as e:
logger.error(f"Error fetching RSS feed {feed_url}: {e}")
@ -148,25 +208,43 @@ class RSSCollectorWorker:
return items
async def _deduplicate_items(self, items: List[RSSItem], keyword: str) -> List[RSSItem]:
"""중복 항목 제거"""
"""중복 항목 제거 - GUID 또는 링크 기준으로만 중복 체크"""
unique_items = []
dedup_key = f"dedup:{keyword}"
        seen_guids = set()  # GUIDs seen in the current batch
        seen_links = set()  # links seen in the current batch
for item in items:
            # Build a title hash
            item_hash = hashlib.md5(
                f"{keyword}:{item.title}".encode()
            ).hexdigest()
            # If a GUID exists, dedup by the GUID
if item.guid:
if item.guid in seen_guids:
logger.debug(f"Duplicate GUID in batch: {item.guid[:30]}")
continue
            # Check for duplicates via a Redis set
            is_new = await self.redis_client.sadd(dedup_key, item_hash)
            # Check MongoDB for an already-processed article
existing_article = await self.db.articles_ko.find_one({"rss_guid": item.guid})
if existing_article:
logger.info(f"Article with GUID {item.guid[:30]} already processed, skipping")
continue
seen_guids.add(item.guid)
else:
                # No GUID: dedup by the link
if item.link in seen_links:
logger.debug(f"Duplicate link in batch: {item.link[:50]}")
continue
                # Check MongoDB by link (searching the references field)
existing_article = await self.db.articles_ko.find_one({"references.link": item.link})
if existing_article:
logger.info(f"Article with link {item.link[:50]} already processed, skipping")
continue
seen_links.add(item.link)
if is_new:
unique_items.append(item)
logger.debug(f"New item added: {item.title[:50]}...")
        # Set the TTL
if unique_items:
await self.redis_client.expire(dedup_key, self.dedup_ttl)
logger.info(f"Deduplication result: {len(unique_items)} new items out of {len(items)} total")
return unique_items

View File

@ -2,18 +2,15 @@ FROM python:3.11-slim
WORKDIR /app
# 의존성 설치
# Install dependencies
COPY ./scheduler/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# 공통 모듈 복사
# Copy shared module
COPY ./shared /app/shared
# 스케줄러 코드 복사
# Copy scheduler code
COPY ./scheduler /app
# 환경변수
ENV PYTHONUNBUFFERED=1
# 실행
CMD ["python", "scheduler.py"]
# Run scheduler
CMD ["python", "keyword_scheduler.py"]

View File

@ -0,0 +1,336 @@
"""
Keyword Manager API
Management API for creating, updating, and deleting keywords
"""
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime, timedelta
from motor.motor_asyncio import AsyncIOMotorClient
import uvicorn
import os
import sys
import uuid
# Import from shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import Keyword
app = FastAPI(title="Keyword Manager API")
# MongoDB connection
mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
db_name = os.getenv("DB_NAME", "ai_writer_db")
@app.on_event("startup")
async def startup_event():
"""앱 시작 시 MongoDB 연결"""
app.mongodb_client = AsyncIOMotorClient(mongodb_url)
app.db = app.mongodb_client[db_name]
@app.on_event("shutdown")
async def shutdown_event():
"""앱 종료 시 연결 해제"""
app.mongodb_client.close()
class KeywordCreate(BaseModel):
"""키워드 생성 요청 모델"""
keyword: str
interval_minutes: int = 60
priority: int = 0
rss_feeds: List[str] = []
max_articles_per_run: int = 100
is_active: bool = True
class KeywordUpdate(BaseModel):
"""키워드 업데이트 요청 모델"""
interval_minutes: Optional[int] = None
priority: Optional[int] = None
rss_feeds: Optional[List[str]] = None
max_articles_per_run: Optional[int] = None
is_active: Optional[bool] = None
@app.get("/")
async def root():
"""API 상태 확인"""
return {"status": "Keyword Manager API is running"}
@app.get("/threads/status")
async def get_threads_status():
"""모든 스레드 상태 조회"""
try:
# MongoDB에서 키워드 정보와 함께 상태 반환
cursor = app.db.keywords.find()
keywords = await cursor.to_list(None)
threads_status = []
for kw in keywords:
status = {
"keyword": kw.get("keyword"),
"keyword_id": kw.get("keyword_id"),
"is_active": kw.get("is_active"),
"interval_minutes": kw.get("interval_minutes"),
"priority": kw.get("priority"),
"last_run": kw.get("last_run").isoformat() if kw.get("last_run") else None,
"next_run": kw.get("next_run").isoformat() if kw.get("next_run") else None,
"thread_status": "active" if kw.get("is_active") else "inactive"
}
            # Compute the time remaining until the next run
if kw.get("next_run"):
remaining = (kw.get("next_run") - datetime.now()).total_seconds()
if remaining > 0:
status["minutes_until_next_run"] = round(remaining / 60, 1)
else:
status["minutes_until_next_run"] = 0
status["thread_status"] = "pending_execution"
threads_status.append(status)
        # Sort by priority
threads_status.sort(key=lambda x: x.get("priority", 0), reverse=True)
return {
"total_threads": len(threads_status),
"active_threads": sum(1 for t in threads_status if t.get("is_active")),
"threads": threads_status
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/keywords")
async def list_keywords():
"""모든 키워드 조회"""
try:
cursor = app.db.keywords.find()
keywords = await cursor.to_list(None)
        # Normalize each keyword's fields
result = []
for kw in keywords:
result.append({
"keyword_id": kw.get("keyword_id"),
"keyword": kw.get("keyword"),
"interval_minutes": kw.get("interval_minutes"),
"priority": kw.get("priority"),
"is_active": kw.get("is_active"),
"last_run": kw.get("last_run").isoformat() if kw.get("last_run") else None,
"next_run": kw.get("next_run").isoformat() if kw.get("next_run") else None,
"rss_feeds": kw.get("rss_feeds", []),
"max_articles_per_run": kw.get("max_articles_per_run", 100)
})
return {
"total": len(result),
"keywords": result
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/keywords/{keyword_text}")
async def get_keyword(keyword_text: str):
"""특정 키워드 조회"""
try:
keyword = await app.db.keywords.find_one({"keyword": keyword_text})
if not keyword:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
return {
"keyword_id": keyword.get("keyword_id"),
"keyword": keyword.get("keyword"),
"interval_minutes": keyword.get("interval_minutes"),
"priority": keyword.get("priority"),
"is_active": keyword.get("is_active"),
"last_run": keyword.get("last_run").isoformat() if keyword.get("last_run") else None,
"next_run": keyword.get("next_run").isoformat() if keyword.get("next_run") else None,
"rss_feeds": keyword.get("rss_feeds", []),
"max_articles_per_run": keyword.get("max_articles_per_run", 100)
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/keywords")
async def create_keyword(keyword_data: KeywordCreate):
"""새 키워드 생성"""
try:
# 중복 체크
existing = await app.db.keywords.find_one({"keyword": keyword_data.keyword})
if existing:
raise HTTPException(status_code=400, detail=f"Keyword '{keyword_data.keyword}' already exists")
        # Build the new keyword
keyword = Keyword(
keyword_id=str(uuid.uuid4()),
keyword=keyword_data.keyword,
interval_minutes=keyword_data.interval_minutes,
priority=keyword_data.priority,
rss_feeds=keyword_data.rss_feeds,
max_articles_per_run=keyword_data.max_articles_per_run,
is_active=keyword_data.is_active,
            next_run=datetime.now() + timedelta(minutes=1),  # first run after one minute
created_at=datetime.now(),
updated_at=datetime.now()
)
await app.db.keywords.insert_one(keyword.model_dump())
return {
"message": f"Keyword '{keyword_data.keyword}' created successfully",
"keyword_id": keyword.keyword_id,
"note": "The scheduler will automatically detect and start processing this keyword within 30 seconds"
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.put("/keywords/{keyword_text}")
async def update_keyword(keyword_text: str, update_data: KeywordUpdate):
"""키워드 업데이트"""
try:
# 키워드 존재 확인
existing = await app.db.keywords.find_one({"keyword": keyword_text})
if not existing:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
        # Prepare the update payload
update_dict = {}
if update_data.interval_minutes is not None:
update_dict["interval_minutes"] = update_data.interval_minutes
if update_data.priority is not None:
update_dict["priority"] = update_data.priority
if update_data.rss_feeds is not None:
update_dict["rss_feeds"] = update_data.rss_feeds
if update_data.max_articles_per_run is not None:
update_dict["max_articles_per_run"] = update_data.max_articles_per_run
if update_data.is_active is not None:
update_dict["is_active"] = update_data.is_active
if update_dict:
update_dict["updated_at"] = datetime.now()
# If the interval changed, recompute next_run as well
if "interval_minutes" in update_dict:
update_dict["next_run"] = datetime.now() + timedelta(minutes=update_dict["interval_minutes"])
result = await app.db.keywords.update_one(
{"keyword": keyword_text},
{"$set": update_dict}
)
if result.modified_count > 0:
action_note = ""
if update_data.is_active is False:
action_note = "The scheduler will stop the thread for this keyword within 30 seconds."
elif update_data.is_active is True and not existing.get("is_active"):
action_note = "The scheduler will start a new thread for this keyword within 30 seconds."
return {
"message": f"Keyword '{keyword_text}' updated successfully",
"updated_fields": list(update_dict.keys()),
"note": action_note
}
else:
return {"message": "No changes made"}
else:
return {"message": "No update data provided"}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.delete("/keywords/{keyword_text}")
async def delete_keyword(keyword_text: str):
"""키워드 삭제"""
try:
# 키워드 존재 확인
existing = await app.db.keywords.find_one({"keyword": keyword_text})
if not existing:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
# 삭제
result = await app.db.keywords.delete_one({"keyword": keyword_text})
if result.deleted_count > 0:
return {
"message": f"Keyword '{keyword_text}' deleted successfully",
"note": "The scheduler will stop the thread for this keyword within 30 seconds"
}
else:
raise HTTPException(status_code=500, detail="Failed to delete keyword")
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/keywords/{keyword_text}/activate")
async def activate_keyword(keyword_text: str):
"""키워드 활성화"""
try:
result = await app.db.keywords.update_one(
{"keyword": keyword_text},
{"$set": {"is_active": True, "updated_at": datetime.now()}}
)
if result.matched_count == 0:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
return {
"message": f"Keyword '{keyword_text}' activated",
"note": "The scheduler will start processing this keyword within 30 seconds"
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/keywords/{keyword_text}/deactivate")
async def deactivate_keyword(keyword_text: str):
"""키워드 비활성화"""
try:
result = await app.db.keywords.update_one(
{"keyword": keyword_text},
{"$set": {"is_active": False, "updated_at": datetime.now()}}
)
if result.matched_count == 0:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
return {
"message": f"Keyword '{keyword_text}' deactivated",
"note": "The scheduler will stop processing this keyword within 30 seconds"
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/keywords/{keyword_text}/trigger")
async def trigger_keyword(keyword_text: str):
"""키워드 즉시 실행 트리거"""
try:
# next_run을 현재 시간으로 설정하여 즉시 실행되도록 함
result = await app.db.keywords.update_one(
{"keyword": keyword_text},
{"$set": {"next_run": datetime.now(), "updated_at": datetime.now()}}
)
if result.matched_count == 0:
raise HTTPException(status_code=404, detail=f"Keyword '{keyword_text}' not found")
return {
"message": f"Keyword '{keyword_text}' triggered for immediate execution",
"note": "The scheduler will execute this keyword within the next minute"
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
port = int(os.getenv("API_PORT", "8100"))
uvicorn.run(app, host="0.0.0.0", port=port)
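
For reference, a minimal client sketch for the endpoints above, assuming the API is reachable at http://localhost:8100 (the API_PORT default) and that KeywordCreate accepts the same fields the create_keyword handler reads:

import httpx

BASE = "http://localhost:8100"  # assumption: local deployment on the default port

with httpx.Client(base_url=BASE, timeout=10) as client:
    # Create a keyword; a scheduler thread should pick it up within ~30 seconds.
    created = client.post("/keywords", json={
        "keyword": "반도체",  # hypothetical example keyword
        "interval_minutes": 90,
        "priority": 1,
        "rss_feeds": [],
        "max_articles_per_run": 50,
        "is_active": True,
    })
    print(created.json())
    # Skip the wait for next_run and execute now.
    print(client.post("/keywords/반도체/trigger").json())
    # Per-keyword thread monitoring.
    print(client.get("/threads").json())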


@ -0,0 +1,245 @@
"""
Keyword Scheduler Service
Scheduler that periodically runs keywords registered in the database
"""
import asyncio
import logging
import os
import sys
from datetime import datetime, timedelta
from motor.motor_asyncio import AsyncIOMotorClient
from typing import List, Optional
import uuid
# Import from shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import Keyword, PipelineJob
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class KeywordScheduler:
def __init__(self):
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
self.check_interval = int(os.getenv("SCHEDULER_CHECK_INTERVAL", "60"))  # check every minute
self.default_interval = int(os.getenv("DEFAULT_KEYWORD_INTERVAL", "60"))  # default interval: one hour
async def start(self):
"""스케줄러 시작"""
logger.info("Starting Keyword Scheduler")
# Redis 연결
await self.queue_manager.connect()
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# 초기 키워드 설정
await self.initialize_keywords()
# 메인 루프
while True:
try:
await self.check_and_execute_keywords()
await asyncio.sleep(self.check_interval)
except Exception as e:
logger.error(f"Error in scheduler loop: {e}")
await asyncio.sleep(10)
async def initialize_keywords(self):
"""초기 키워드 설정 (없으면 생성)"""
try:
# keywords 컬렉션 확인
count = await self.db.keywords.count_documents({})
if count == 0:
logger.info("No keywords found. Creating default keywords...")
# 기본 키워드 생성
default_keywords = [
{
"keyword": "AI",
"interval_minutes": 60,
"is_active": True,
"priority": 1,
"rss_feeds": []
},
{
"keyword": "경제",
"interval_minutes": 120,
"is_active": True,
"priority": 0,
"rss_feeds": []
},
{
"keyword": "테크놀로지",
"interval_minutes": 60,
"is_active": True,
"priority": 1,
"rss_feeds": []
}
]
for kw_data in default_keywords:
keyword = Keyword(**kw_data)
# Set the next run time
keyword.next_run = datetime.now() + timedelta(minutes=5)  # first run five minutes from now
await self.db.keywords.insert_one(keyword.model_dump())
logger.info(f"Created keyword: {keyword.keyword}")
logger.info(f"Found {count} keywords in database")
except Exception as e:
logger.error(f"Error initializing keywords: {e}")
async def check_and_execute_keywords(self):
"""실행할 키워드 체크 및 실행"""
try:
# 현재 시간
now = datetime.now()
# 실행할 키워드 조회 (활성화되고 next_run이 현재 시간 이전인 것)
query = {
"is_active": True,
"$or": [
{"next_run": {"$lte": now}},
{"next_run": None} # next_run이 설정되지 않은 경우
]
}
# Sort by priority
cursor = self.db.keywords.find(query).sort("priority", -1)
keywords = await cursor.to_list(None)
for keyword_data in keywords:
keyword = Keyword(**keyword_data)
await self.execute_keyword(keyword)
except Exception as e:
logger.error(f"Error checking keywords: {e}")
async def execute_keyword(self, keyword: Keyword):
"""키워드 실행"""
try:
logger.info(f"Executing keyword: {keyword.keyword}")
# PipelineJob 생성
job = PipelineJob(
keyword_id=keyword.keyword_id,
keyword=keyword.keyword,
stage='rss_collection',
data={
'rss_feeds': keyword.rss_feeds if keyword.rss_feeds else [],
'max_articles': keyword.max_articles_per_run,
'scheduled': True
},
priority=keyword.priority
)
# Enqueue the job
await self.queue_manager.enqueue('rss_collection', job)
logger.info(f"Enqueued job for keyword '{keyword.keyword}' with job_id: {job.job_id}")
# Update the keyword document
update_data = {
"last_run": datetime.now(),
"next_run": datetime.now() + timedelta(minutes=keyword.interval_minutes),
"updated_at": datetime.now()
}
await self.db.keywords.update_one(
{"keyword_id": keyword.keyword_id},
{"$set": update_data}
)
logger.info(f"Updated keyword '{keyword.keyword}' - next run at {update_data['next_run']}")
except Exception as e:
logger.error(f"Error executing keyword {keyword.keyword}: {e}")
async def add_keyword(self, keyword_text: str, interval_minutes: int = None,
rss_feeds: List[str] = None, priority: int = 0):
"""새 키워드 추가"""
try:
# 중복 체크
existing = await self.db.keywords.find_one({"keyword": keyword_text})
if existing:
logger.warning(f"Keyword '{keyword_text}' already exists")
return None
# 새 키워드 생성
keyword = Keyword(
keyword=keyword_text,
interval_minutes=interval_minutes or self.default_interval,
rss_feeds=rss_feeds or [],
priority=priority,
next_run=datetime.now() + timedelta(minutes=1)  # first run one minute from now
)
result = await self.db.keywords.insert_one(keyword.model_dump())
logger.info(f"Added new keyword: {keyword_text}")
return keyword
except Exception as e:
logger.error(f"Error adding keyword: {e}")
return None
async def update_keyword(self, keyword_id: str, **kwargs):
"""키워드 업데이트"""
try:
# 업데이트할 필드
update_data = {k: v for k, v in kwargs.items() if v is not None}
update_data["updated_at"] = datetime.now()
result = await self.db.keywords.update_one(
{"keyword_id": keyword_id},
{"$set": update_data}
)
if result.modified_count > 0:
logger.info(f"Updated keyword {keyword_id}")
return True
return False
except Exception as e:
logger.error(f"Error updating keyword: {e}")
return False
async def delete_keyword(self, keyword_id: str):
"""키워드 삭제"""
try:
result = await self.db.keywords.delete_one({"keyword_id": keyword_id})
if result.deleted_count > 0:
logger.info(f"Deleted keyword {keyword_id}")
return True
return False
except Exception as e:
logger.error(f"Error deleting keyword: {e}")
return False
async def stop(self):
"""스케줄러 중지"""
await self.queue_manager.disconnect()
logger.info("Keyword Scheduler stopped")
async def main():
"""메인 함수"""
scheduler = KeywordScheduler()
try:
await scheduler.start()
except KeyboardInterrupt:
logger.info("Received interrupt signal")
finally:
await scheduler.stop()
if __name__ == "__main__":
asyncio.run(main())
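
A side note on the due-check semantics above, as a minimal predicate sketch (assuming the naive local datetimes used throughout these services): a keyword is due when it is active and its next_run is unset or in the past.

from datetime import datetime
from typing import Optional

def is_due(is_active: bool, next_run: Optional[datetime], now: Optional[datetime] = None) -> bool:
    # Mirrors the Mongo filter: is_active AND (next_run <= now OR next_run is None).
    now = now or datetime.now()
    return is_active and (next_run is None or next_run <= now)

Because execute_keyword computes the new next_run from datetime.now() after enqueueing, intervals drift by each run's processing time; for feed polling that is usually acceptable.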


@ -0,0 +1,361 @@
"""
Multi-threaded Keyword Scheduler Service
Scheduler that manages keywords on multiple threads within a single process
"""
import asyncio
import logging
import os
import sys
from datetime import datetime, timedelta
from motor.motor_asyncio import AsyncIOMotorClient
from typing import Dict
import threading
import time
# Import from shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import Keyword, PipelineJob
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Module-level reference to the scheduler instance
scheduler_instance = None
class KeywordThread(threading.Thread):
"""개별 키워드를 관리하는 스레드"""
def __init__(self, keyword_text: str, mongodb_url: str, db_name: str, redis_url: str):
super().__init__(name=f"Thread-{keyword_text}")
self.keyword_text = keyword_text
self.mongodb_url = mongodb_url
self.db_name = db_name
self.redis_url = redis_url
self.running = True
self.keyword = None
self.status = "initializing"
self.last_execution = None
self.execution_count = 0
self.error_count = 0
self.last_error = None
def run(self):
"""스레드 실행"""
# 새로운 이벤트 루프 생성
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(self.run_scheduler())
finally:
loop.close()
async def run_scheduler(self):
"""비동기 스케줄러 실행"""
# Redis 연결
self.queue_manager = QueueManager(redis_url=self.redis_url)
await self.queue_manager.connect()
# Connect to MongoDB
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
logger.info(f"[{self.keyword_text}] Thread started")
# Load the keyword
await self.load_keyword()
if not self.keyword:
logger.error(f"[{self.keyword_text}] Failed to load keyword")
return
# Main loop
while self.running:
try:
# Refresh keyword state
await self.reload_keyword()
if not self.keyword.is_active:
self.status = "inactive"
logger.info(f"[{self.keyword_text}] Keyword is inactive, sleeping...")
await asyncio.sleep(60)
continue
# Check whether it is time to run
now = datetime.now()
if self.keyword.next_run is None or self.keyword.next_run <= now:  # a missing next_run counts as due, matching the query in keyword_scheduler
self.status = "executing"
await self.execute_keyword()
# Wait until the next scheduled run
sleep_seconds = self.keyword.interval_minutes * 60
self.status = "waiting"
else:
# Check again in one minute
sleep_seconds = 60
self.status = "waiting"
await asyncio.sleep(sleep_seconds)
except Exception as e:
self.error_count += 1
self.last_error = str(e)
self.status = "error"
logger.error(f"[{self.keyword_text}] Error in thread loop: {e}")
await asyncio.sleep(60)
await self.queue_manager.disconnect()
logger.info(f"[{self.keyword_text}] Thread stopped")
async def load_keyword(self):
"""키워드 초기 로드"""
try:
keyword_doc = await self.db.keywords.find_one({"keyword": self.keyword_text})
if keyword_doc:
self.keyword = Keyword(**keyword_doc)
logger.info(f"[{self.keyword_text}] Loaded keyword")
except Exception as e:
logger.error(f"[{self.keyword_text}] Error loading keyword: {e}")
async def reload_keyword(self):
"""키워드 정보 재로드"""
try:
keyword_doc = await self.db.keywords.find_one({"keyword": self.keyword_text})
if keyword_doc:
self.keyword = Keyword(**keyword_doc)
except Exception as e:
logger.error(f"[{self.keyword_text}] Error reloading keyword: {e}")
async def execute_keyword(self):
"""키워드 실행"""
try:
logger.info(f"[{self.keyword_text}] Executing keyword")
# PipelineJob 생성
job = PipelineJob(
keyword_id=self.keyword.keyword_id,
keyword=self.keyword.keyword,
stage='rss_collection',
data={
'rss_feeds': self.keyword.rss_feeds if self.keyword.rss_feeds else [],
'max_articles': self.keyword.max_articles_per_run,
'scheduled': True,
'thread_name': self.name
},
priority=self.keyword.priority
)
# Enqueue the job
await self.queue_manager.enqueue('rss_collection', job)
logger.info(f"[{self.keyword_text}] Enqueued job {job.job_id}")
# Update the keyword document
update_data = {
"last_run": datetime.now(),
"next_run": datetime.now() + timedelta(minutes=self.keyword.interval_minutes),
"updated_at": datetime.now()
}
await self.db.keywords.update_one(
{"keyword_id": self.keyword.keyword_id},
{"$set": update_data}
)
self.last_execution = datetime.now()
self.execution_count += 1
logger.info(f"[{self.keyword_text}] Next run at {update_data['next_run']}")
except Exception as e:
self.error_count += 1
self.last_error = str(e)
logger.error(f"[{self.keyword_text}] Error executing keyword: {e}")
def stop(self):
"""스레드 중지"""
self.running = False
self.status = "stopped"
def get_status(self):
"""스레드 상태 반환"""
return {
"keyword": self.keyword_text,
"thread_name": self.name,
"status": self.status,
"is_alive": self.is_alive(),
"execution_count": self.execution_count,
"last_execution": self.last_execution.isoformat() if self.last_execution else None,
"error_count": self.error_count,
"last_error": self.last_error,
"next_run": self.keyword.next_run.isoformat() if self.keyword and self.keyword.next_run else None
}
class MultiThreadScheduler:
"""멀티스레드 키워드 스케줄러"""
def __init__(self):
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.redis_url = os.getenv("REDIS_URL", "redis://redis:6379")
self.threads: Dict[str, KeywordThread] = {}
self.running = True
# Store the singleton instance in the module-level variable
global scheduler_instance
scheduler_instance = self
async def start(self):
"""스케줄러 시작"""
logger.info("Starting Multi-threaded Keyword Scheduler")
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# 초기 키워드 설정
await self.initialize_keywords()
# 키워드 로드 및 스레드 시작
await self.load_and_start_threads()
# 메인 루프 - 새로운 키워드 체크
while self.running:
try:
await self.check_new_keywords()
await asyncio.sleep(30)  # check for new keywords every 30 seconds
except Exception as e:
logger.error(f"Error in main loop: {e}")
await asyncio.sleep(30)
async def initialize_keywords(self):
"""초기 키워드 설정 (없으면 생성)"""
try:
count = await self.db.keywords.count_documents({})
if count == 0:
logger.info("No keywords found. Creating default keywords...")
default_keywords = [
{
"keyword": "AI",
"interval_minutes": 60,
"is_active": True,
"priority": 1,
"rss_feeds": [],
"next_run": datetime.now() + timedelta(minutes=1)
},
{
"keyword": "경제",
"interval_minutes": 120,
"is_active": True,
"priority": 0,
"rss_feeds": [],
"next_run": datetime.now() + timedelta(minutes=1)
},
{
"keyword": "테크놀로지",
"interval_minutes": 60,
"is_active": True,
"priority": 1,
"rss_feeds": [],
"next_run": datetime.now() + timedelta(minutes=1)
}
]
for kw_data in default_keywords:
keyword = Keyword(**kw_data)
await self.db.keywords.insert_one(keyword.model_dump())
logger.info(f"Created keyword: {keyword.keyword}")
logger.info(f"Found {count} keywords in database")
except Exception as e:
logger.error(f"Error initializing keywords: {e}")
async def load_and_start_threads(self):
"""키워드 로드 및 스레드 시작"""
try:
# 활성 키워드 조회
cursor = self.db.keywords.find({"is_active": True})
keywords = await cursor.to_list(None)
for keyword_doc in keywords:
keyword = Keyword(**keyword_doc)
if keyword.keyword not in self.threads:
self.start_keyword_thread(keyword.keyword)
logger.info(f"Started {len(self.threads)} keyword threads")
except Exception as e:
logger.error(f"Error loading keywords: {e}")
def start_keyword_thread(self, keyword_text: str):
"""키워드 스레드 시작"""
if keyword_text not in self.threads:
thread = KeywordThread(
keyword_text=keyword_text,
mongodb_url=self.mongodb_url,
db_name=self.db_name,
redis_url=self.redis_url
)
thread.start()
self.threads[keyword_text] = thread
logger.info(f"Started thread for keyword: {keyword_text}")
async def check_new_keywords(self):
"""새로운 키워드 체크 및 스레드 관리"""
try:
# 현재 활성 키워드 조회
cursor = self.db.keywords.find({"is_active": True})
active_keywords = await cursor.to_list(None)
active_keyword_texts = {kw['keyword'] for kw in active_keywords}
# Start threads for new keywords
for keyword_text in active_keyword_texts:
if keyword_text not in self.threads:
self.start_keyword_thread(keyword_text)
# Stop threads for deactivated keywords
for keyword_text in list(self.threads.keys()):
if keyword_text not in active_keyword_texts:
thread = self.threads[keyword_text]
thread.stop()
del self.threads[keyword_text]
logger.info(f"Stopped thread for keyword: {keyword_text}")
except Exception as e:
logger.error(f"Error checking new keywords: {e}")
def stop(self):
"""모든 스레드 중지"""
self.running = False
for thread in self.threads.values():
thread.stop()
# Wait for every thread to finish
for thread in self.threads.values():
thread.join(timeout=5)
logger.info("Multi-threaded Keyword Scheduler stopped")
def get_threads_status(self):
"""모든 스레드 상태 반환"""
status_list = []
for thread in self.threads.values():
status_list.append(thread.get_status())
return status_list
async def main():
"""메인 함수"""
scheduler = MultiThreadScheduler()
try:
await scheduler.start()
except KeyboardInterrupt:
logger.info("Received interrupt signal")
finally:
scheduler.stop()
if __name__ == "__main__":
asyncio.run(main())
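
Since scheduler_instance is module-global, status introspection such as the /threads endpoint in the manager API only works when the API shares this process; a minimal sketch, with the module name assumed:

import multi_thread_scheduler as mts  # assumption: this file's module name

def threads_snapshot():
    # Empty until MultiThreadScheduler() has been constructed in this process;
    # a scheduler running in another container is not visible here.
    if mts.scheduler_instance is None:
        return []
    return mts.scheduler_instance.get_threads_status()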


@ -1,5 +1,5 @@
apscheduler==3.10.4
motor==3.1.1
pymongo==4.3.3
motor==3.6.0
redis[hiredis]==5.0.1
pydantic==2.5.0
fastapi==0.104.1
uvicorn==0.24.0


@ -21,7 +21,7 @@ class NewsScheduler:
def __init__(self):
self.scheduler = AsyncIOScheduler()
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "pipeline_db")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")


@ -0,0 +1,173 @@
"""
Single Keyword Scheduler Service
Scheduler dedicated to a single keyword
"""
import asyncio
import logging
import os
import sys
from datetime import datetime, timedelta
from motor.motor_asyncio import AsyncIOMotorClient
import uuid
# Import from shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from shared.models import Keyword, PipelineJob
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SingleKeywordScheduler:
def __init__(self):
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.keyword_text = os.getenv("KEYWORD")  # the keyword is supplied via an environment variable
self.interval_minutes = int(os.getenv("INTERVAL_MINUTES", "60"))
self.db = None
self.keyword = None
async def start(self):
"""스케줄러 시작"""
if not self.keyword_text:
logger.error("KEYWORD environment variable is required")
return
logger.info(f"Starting Single Keyword Scheduler for '{self.keyword_text}'")
# Connect to Redis
await self.queue_manager.connect()
# Connect to MongoDB
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# Initialize or load the keyword
await self.initialize_keyword()
if not self.keyword:
logger.error(f"Failed to initialize keyword '{self.keyword_text}'")
return
# Main loop - handle only this keyword
while True:
try:
await self.check_and_execute()
# Wait until the next execution
sleep_seconds = self.keyword.interval_minutes * 60
logger.info(f"Sleeping for {self.keyword.interval_minutes} minutes until next execution")
await asyncio.sleep(sleep_seconds)
except Exception as e:
logger.error(f"Error in scheduler loop: {e}")
await asyncio.sleep(60)  # retry one minute after an error
async def initialize_keyword(self):
"""키워드 초기화 또는 로드"""
try:
# 기존 키워드 찾기
keyword_doc = await self.db.keywords.find_one({"keyword": self.keyword_text})
if keyword_doc:
self.keyword = Keyword(**keyword_doc)
logger.info(f"Loaded existing keyword: {self.keyword_text}")
else:
# Create a new keyword
self.keyword = Keyword(
keyword=self.keyword_text,
interval_minutes=self.interval_minutes,
is_active=True,
priority=int(os.getenv("PRIORITY", "0")),
rss_feeds=os.getenv("RSS_FEEDS", "").split(",") if os.getenv("RSS_FEEDS") else [],
max_articles_per_run=int(os.getenv("MAX_ARTICLES", "100"))
)
await self.db.keywords.insert_one(self.keyword.model_dump())
logger.info(f"Created new keyword: {self.keyword_text}")
except Exception as e:
logger.error(f"Error initializing keyword: {e}")
async def check_and_execute(self):
"""키워드 실행 체크 및 실행"""
try:
# 최신 키워드 정보 다시 로드
keyword_doc = await self.db.keywords.find_one({"keyword": self.keyword_text})
if not keyword_doc:
logger.error(f"Keyword '{self.keyword_text}' not found in database")
return
self.keyword = Keyword(**keyword_doc)
# Skip if deactivated
if not self.keyword.is_active:
logger.info(f"Keyword '{self.keyword_text}' is inactive, skipping")
return
# Execute
await self.execute_keyword()
except Exception as e:
logger.error(f"Error checking keyword: {e}")
async def execute_keyword(self):
"""키워드 실행"""
try:
logger.info(f"Executing keyword: {self.keyword.keyword}")
# PipelineJob 생성
job = PipelineJob(
keyword_id=self.keyword.keyword_id,
keyword=self.keyword.keyword,
stage='rss_collection',
data={
'rss_feeds': self.keyword.rss_feeds if self.keyword.rss_feeds else [],
'max_articles': self.keyword.max_articles_per_run,
'scheduled': True,
'scheduler_instance': f"single-{self.keyword_text}"
},
priority=self.keyword.priority
)
# Enqueue the job
await self.queue_manager.enqueue('rss_collection', job)
logger.info(f"Enqueued job for keyword '{self.keyword.keyword}' with job_id: {job.job_id}")
# Update the keyword document
update_data = {
"last_run": datetime.now(),
"next_run": datetime.now() + timedelta(minutes=self.keyword.interval_minutes),
"updated_at": datetime.now()
}
await self.db.keywords.update_one(
{"keyword_id": self.keyword.keyword_id},
{"$set": update_data}
)
logger.info(f"Updated keyword '{self.keyword.keyword}' - next run at {update_data['next_run']}")
except Exception as e:
logger.error(f"Error executing keyword {self.keyword.keyword}: {e}")
async def stop(self):
"""스케줄러 중지"""
await self.queue_manager.disconnect()
logger.info(f"Single Keyword Scheduler for '{self.keyword_text}' stopped")
async def main():
"""메인 함수"""
scheduler = SingleKeywordScheduler()
try:
await scheduler.start()
except KeyboardInterrupt:
logger.info("Received interrupt signal")
finally:
await scheduler.stop()
if __name__ == "__main__":
asyncio.run(main())
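
A minimal local-run sketch for this service, assuming Redis and MongoDB are reachable on localhost and mirroring the environment variables the compose file passes to each container:

import asyncio
import os

# Assumed local endpoints; inside compose these point at the redis/mongodb services.
os.environ.setdefault("KEYWORD", "AI")
os.environ.setdefault("INTERVAL_MINUTES", "60")
os.environ.setdefault("REDIS_URL", "redis://localhost:6379")
os.environ.setdefault("MONGODB_URL", "mongodb://localhost:27017")

from single_keyword_scheduler import SingleKeywordScheduler  # assumption: module name

asyncio.run(SingleKeywordScheduler().start())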


@ -40,6 +40,7 @@ class RSSItem(BaseModel):
item_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
title: str
link: str
guid: Optional[str] = None # RSS GUID for deduplication
published: Optional[str] = None
summary: Optional[str] = None
source_feed: str
@ -74,22 +75,54 @@ class ItemWithImage(BaseModel):
image_url: str
image_prompt: str
class FinalArticle(BaseModel):
"""최종 기사"""
article_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
job_id: str
keyword_id: str
keyword: str
class Subtopic(BaseModel):
"""기사 소주제"""
title: str
content: str
content: List[str]  # one entry per paragraph
class Entities(BaseModel):
"""개체명"""
people: List[str] = Field(default_factory=list)
organizations: List[str] = Field(default_factory=list)
groups: List[str] = Field(default_factory=list)
countries: List[str] = Field(default_factory=list)
events: List[str] = Field(default_factory=list)
class NewsReference(BaseModel):
"""뉴스 레퍼런스"""
title: str
link: str
source: str
published: Optional[str] = None
class FinalArticle(BaseModel):
"""최종 기사 - ai_writer_db.articles 스키마와 일치"""
news_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
title: str
created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
summary: str
source_items: List[ItemWithImage]
images: List[str]
subtopics: List[Subtopic] = Field(default_factory=list)
categories: List[str] = Field(default_factory=list)
tags: List[str] = Field(default_factory=list)
created_at: datetime = Field(default_factory=datetime.now)
pipeline_stages: List[str]
processing_time: float # seconds
entities: Entities = Field(default_factory=Entities)
source_keyword: str
source_count: int = 1
# Referenced news items
references: List[NewsReference] = Field(default_factory=list)
# Additional pipeline fields
job_id: Optional[str] = None
keyword_id: Optional[str] = None
pipeline_stages: List[str] = Field(default_factory=list)
processing_time: Optional[float] = None
# Multilingual support
language: str = 'ko'
ref_news_id: Optional[str] = None
# GUID for RSS deduplication
rss_guid: Optional[str] = None
# Image-related fields
image_prompt: Optional[str] = None
images: List[str] = Field(default_factory=list)
# Translation tracking
translated_languages: List[str] = Field(default_factory=list)
class TranslatedItem(BaseModel):
"""번역된 아이템"""
@ -111,3 +144,16 @@ class QueueMessage(BaseModel):
job: PipelineJob
timestamp: datetime = Field(default_factory=datetime.now)
retry_count: int = 0
class Keyword(BaseModel):
"""스케줄러용 키워드 모델"""
keyword_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
keyword: str
interval_minutes: int = Field(default=60)  # default: one hour
is_active: bool = Field(default=True)
last_run: Optional[datetime] = None
next_run: Optional[datetime] = None
created_at: datetime = Field(default_factory=datetime.now)
updated_at: datetime = Field(default_factory=datetime.now)
rss_feeds: List[str] = Field(default_factory=list)  # custom RSS feeds
priority: int = Field(default=0)  # priority (higher runs first)
max_articles_per_run: int = Field(default=100)  # max articles per run
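
A small usage sketch for the Keyword model above (pydantic v2): model_dump() produces the plain dict the schedulers insert into Mongo, and documents read back re-validate the same way.

from datetime import datetime, timedelta

kw = Keyword(
    keyword="블록체인",
    interval_minutes=120,
    priority=1,
    next_run=datetime.now() + timedelta(minutes=1),
)
doc = kw.model_dump()      # plain dict, ready for insert_one()
restored = Keyword(**doc)  # round-trips cleanly from a Mongo document
assert restored.keyword_id == kw.keyword_id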


@ -16,13 +16,13 @@ class QueueManager:
"""Redis 기반 큐 매니저"""
QUEUES = {
"keyword_processing": "queue:keyword",
"rss_collection": "queue:rss",
"search_enrichment": "queue:search",
"ai_summarization": "queue:summarize",
"translation": "queue:translate",
"image_generation": "queue:image",
"article_assembly": "queue:assembly",
"keyword_processing": "queue:keyword_processing",
"rss_collection": "queue:rss_collection",
"search_enrichment": "queue:search_enrichment",
"google_search": "queue:google_search",
"ai_article_generation": "queue:ai_article_generation",
"image_generation": "queue:image_generation",
"translation": "queue:translation",
"failed": "queue:failed",
"scheduled": "queue:scheduled"
}
@ -77,12 +77,15 @@ class QueueManager:
"""큐에서 작업 가져오기"""
try:
queue_key = self.QUEUES.get(queue_name, f"queue:{queue_name}")
logger.info(f"Attempting to dequeue from {queue_key} with timeout={timeout}")
if timeout > 0:
result = await self.redis_client.blpop(queue_key, timeout=timeout)
result = await self.redis_client.blpop(queue_key, timeout)
if result:
_, data = result
logger.info(f"Dequeued item from {queue_key}")
else:
logger.debug(f"No item available in {queue_key}")
return None
else:
data = await self.redis_client.lpop(queue_key)
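
One consequence of the rename above: every registered queue now follows the queue:{name} pattern, so the fallback in dequeue() resolves unregistered names consistently too. A small sketch of the resolution rule:

QUEUES = {"rss_collection": "queue:rss_collection"}  # excerpt of the mapping above

def queue_key(name: str) -> str:
    # Registered names hit the mapping; anything else falls back to the same pattern.
    return QUEUES.get(name, f"queue:{name}")

assert queue_key("rss_collection") == "queue:rss_collection"
assert queue_key("ad_hoc") == "queue:ad_hoc"  # fallback path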


@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""
Simple pipeline test - direct queue injection
"""
import asyncio
import json
import redis.asyncio as redis
from datetime import datetime
import uuid
async def test():
# Connect to Redis
r = await redis.from_url("redis://redis:6379", decode_responses=True)
# Build a job
job = {
"job_id": str(uuid.uuid4()),
"keyword_id": str(uuid.uuid4()),
"keyword": "전기차",
"stage": "rss_collection",
"stages_completed": [],
"data": {
"rss_feeds": [
"https://news.google.com/rss/search?q=전기차&hl=ko&gl=KR&ceid=KR:ko"
],
"categories": ["technology", "automotive"]
},
"priority": 1,
"retry_count": 0,
"max_retries": 3,
"created_at": datetime.now().isoformat(),
"updated_at": datetime.now().isoformat()
}
# Wrap it in the QueueMessage format
message = {
"message_id": str(uuid.uuid4()),
"queue_name": "rss_collection",
"job": job,
"timestamp": datetime.now().isoformat()
}
# Push it onto the queue
await r.lpush("queue:rss_collection", json.dumps(message))
print(f"✅ Job {job['job_id']} added to queue:rss_collection")
# Check queue state
length = await r.llen("queue:rss_collection")
print(f"📊 Queue length: {length}")
await r.aclose()
if __name__ == "__main__":
asyncio.run(test())


@ -0,0 +1,57 @@
#!/usr/bin/env python3
"""
Direct dequeue test
"""
import asyncio
import redis.asyncio as redis
import json
async def test_dequeue():
"""Test dequeue directly"""
# Connect to Redis
redis_client = await redis.from_url(
"redis://redis:6379",
encoding="utf-8",
decode_responses=True
)
print("Connected to Redis")
# Check queue length
length = await redis_client.llen("queue:rss_collection")
print(f"Queue length: {length}")
if length > 0:
# Get the first item
item = await redis_client.lrange("queue:rss_collection", 0, 0)
print(f"First item preview: {item[0][:200]}...")
# Try blpop with timeout
print("Trying blpop with timeout=5...")
result = await redis_client.blpop("queue:rss_collection", 5)
if result:
queue, data = result
print(f"Successfully dequeued from {queue}")
print(f"Data: {data[:200]}...")
# Parse the message
try:
message = json.loads(data)
print(f"Message ID: {message.get('message_id')}")
print(f"Queue Name: {message.get('queue_name')}")
if 'job' in message:
job = message['job']
print(f"Job ID: {job.get('job_id')}")
print(f"Keyword: {job.get('keyword')}")
except Exception as e:
print(f"Failed to parse message: {e}")
else:
print("blpop timed out - no result")
else:
print("Queue is empty")
await redis_client.close()
if __name__ == "__main__":
asyncio.run(test_dequeue())


@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""
Pipeline Test Script
Script that exercises the full pipeline flow
"""
import asyncio
import json
from datetime import datetime
from motor.motor_asyncio import AsyncIOMotorClient
import redis.asyncio as redis
from shared.models import KeywordSubscription, PipelineJob
async def test_pipeline():
"""파이프라인 테스트"""
# MongoDB 연결
mongo_client = AsyncIOMotorClient("mongodb://mongodb:27017")
db = mongo_client.pipeline
# Connect to Redis
redis_client = redis.Redis(host='redis', port=6379, decode_responses=True)
# 1. Add a test keyword
test_keyword = KeywordSubscription(
keyword="전기차",
language="ko",
schedule="*/1 * * * *", # 1분마다 (테스트용)
is_active=True,
is_priority=True,
rss_feeds=[
"https://news.google.com/rss/search?q=전기차&hl=ko&gl=KR&ceid=KR:ko",
"https://news.google.com/rss/search?q=electric+vehicle&hl=en&gl=US&ceid=US:en"
],
categories=["technology", "automotive", "environment"],
owner="test_user"
)
# Save to MongoDB
await db.keyword_subscriptions.replace_one(
{"keyword": test_keyword.keyword},
test_keyword.model_dump(),
upsert=True
)
print(f"✅ 키워드 '{test_keyword.keyword}' 추가 완료")
# 2. 즉시 파이프라인 트리거 (스케줄러를 거치지 않고 직접)
job = PipelineJob(
keyword_id=test_keyword.keyword_id,
keyword=test_keyword.keyword,
stage="rss_collection",
data={
"rss_feeds": test_keyword.rss_feeds,
"categories": test_keyword.categories
},
priority=1 if test_keyword.is_priority else 0
)
# Push directly onto the Redis queue (wrapped as a QueueMessage)
from shared.queue_manager import QueueMessage
message = QueueMessage(
queue_name="rss_collection",
job=job
)
await redis_client.lpush("queue:rss_collection", message.json())
print(f"✅ 작업을 RSS Collection 큐에 추가: {job.job_id}")
# 3. 파이프라인 상태 모니터링
print("\n📊 파이프라인 실행 모니터링 중...")
print("각 단계별 로그를 확인하려면 다음 명령을 실행하세요:")
print(" docker-compose logs -f pipeline-rss-collector")
print(" docker-compose logs -f pipeline-google-search")
print(" docker-compose logs -f pipeline-ai-summarizer")
print(" docker-compose logs -f pipeline-translator")
print(" docker-compose logs -f pipeline-image-generator")
print(" docker-compose logs -f pipeline-article-assembly")
# Check queue state
for i in range(10):
await asyncio.sleep(5)
# Check the length of each queue
queues = [
"queue:rss_collection",
"queue:google_search",
"queue:ai_summarization",
"queue:translation",
"queue:image_generation",
"queue:article_assembly"
]
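# Note: after the queue_manager rename, queue:ai_summarization and
# queue:article_assembly may no longer be the active key names (see
# queue:ai_article_generation above); stale keys here simply report 0.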
print(f"\n[{datetime.now().strftime('%H:%M:%S')}] 큐 상태:")
for queue in queues:
length = await redis_client.llen(queue)
if length > 0:
print(f" {queue}: {length} 작업 대기 중")
# 4. Check the final results
print("\n📄 Checking articles generated in MongoDB...")
articles = await db.articles.find({"keyword": test_keyword.keyword}).to_list(length=5)
if articles:
print(f"{len(articles)}개의 기사 생성 완료!")
for article in articles:
print(f"\n제목: {article.get('title', 'N/A')}")
print(f"ID: {article.get('article_id', 'N/A')}")
print(f"생성 시간: {article.get('created_at', 'N/A')}")
print(f"처리 시간: {article.get('processing_time', 'N/A')}")
print(f"이미지 수: {len(article.get('images', []))}")
else:
print("⚠️ 아직 기사가 생성되지 않았습니다. 조금 더 기다려주세요.")
# 연결 종료
await redis_client.close()
mongo_client.close()
if __name__ == "__main__":
print("🚀 파이프라인 테스트 시작")
asyncio.run(test_pipeline())


@ -0,0 +1,56 @@
#!/usr/bin/env python3
"""
Pipeline test using the 스타크래프트 (StarCraft) keyword
"""
import asyncio
import sys
import os
sys.path.append(os.path.dirname(__file__))
from shared.queue_manager import QueueManager
from shared.models import PipelineJob
async def test_starcraft_pipeline():
"""스타크래프트 키워드로 파이프라인 테스트"""
# Queue manager 초기화
queue_manager = QueueManager(redis_url="redis://redis:6379")
await queue_manager.connect()
try:
# Create the pipeline job
job = PipelineJob(
keyword_id="test_starcraft_001",
keyword="스타크래프트",
stage="rss_collection",
data={}
)
print(f"🚀 스타크래프트 파이프라인 작업 시작")
print(f" 작업 ID: {job.job_id}")
print(f" 키워드: {job.keyword}")
print(f" 키워드 ID: {job.keyword_id}")
# RSS 수집 큐에 작업 추가
await queue_manager.enqueue('rss_collection', job)
print(f"✅ 작업이 rss_collection 큐에 추가되었습니다")
# 큐 상태 확인
stats = await queue_manager.get_queue_stats()
print(f"\n📊 현재 큐 상태:")
for queue_name, stat in stats.items():
if queue_name not in ['completed', 'failed']:
pending = stat.get('pending', 0)
processing = stat.get('processing', 0)
if pending > 0 or processing > 0:
print(f" {queue_name}: 대기={pending}, 처리중={processing}")
print(f"\n⏳ 파이프라인 실행을 모니터링하세요:")
print(f" docker logs site11_pipeline_rss_collector --tail 20 -f")
print(f" python3 check_mongodb.py")
finally:
await queue_manager.disconnect()
if __name__ == "__main__":
asyncio.run(test_starcraft_pipeline())


@ -0,0 +1,54 @@
"""
Script for submitting pipeline test jobs
"""
import redis
import json
from datetime import datetime
import uuid
import sys
def submit_test_job(keyword='나스닥'):
# Connect to Redis
redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True)
# Build a test job
job_id = str(uuid.uuid4())
keyword_id = f'test_{job_id[:8]}'
job_data = {
'job_id': job_id,
'keyword_id': keyword_id,
'keyword': keyword,
'created_at': datetime.now().isoformat(),
'stage': 'rss_collection',
'stages_completed': [],
'data': {}
}
# Build the QueueMessage wrapper
queue_message = {
'message_id': str(uuid.uuid4()),
'queue_name': 'rss_collection',
'job': job_data,
'timestamp': datetime.now().isoformat(),
'attempts': 0
}
# Enqueue the job (rpush appends to the tail; used when priority=0)
redis_client.rpush('queue:rss_collection', json.dumps(queue_message))
print(f'✅ Pipeline started: job_id={job_id}')
print(f'✅ Keyword: {keyword}')
print(f'✅ Job added to the RSS Collection queue')
# Check queue state
queue_len = redis_client.llen('queue:rss_collection')
print(f'✅ Current queue length: {queue_len}')
redis_client.close()
if __name__ == "__main__":
if len(sys.argv) > 1:
keyword = sys.argv[1]
else:
keyword = '나스닥'
submit_test_job(keyword)
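
Why rpush here while other test scripts use lpush: the workers blpop from the head of the list, so rpush appends in FIFO order while lpush jumps the queue, which appears to be how higher-priority jobs are favored in this codebase. A quick sketch of the difference:

import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)
r.delete("queue:demo")
r.rpush("queue:demo", "first", "second")  # FIFO: appended at the tail
r.lpush("queue:demo", "urgent")           # inserted at the head, served next
assert r.blpop("queue:demo", timeout=1)[1] == "urgent"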


@ -9,7 +9,11 @@ RUN pip install --no-cache-dir -r requirements.txt
# Copy shared modules
COPY ./shared /app/shared
# Copy config directory
COPY ./config /app/config
# Copy application code
COPY ./translator /app
CMD ["python", "translator.py"]
# Use multi_translator.py as the main service
CMD ["python", "multi_translator.py"]


@ -0,0 +1,329 @@
"""
Language Sync Service
Background service that translates existing articles into newly enabled languages
"""
import asyncio
import logging
import os
import sys
import json
from typing import List, Dict, Any
import httpx
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
# Add parent directory to path for shared module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Import from shared module
from shared.models import FinalArticle, Subtopic, Entities, NewsReference
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class LanguageSyncService:
def __init__(self):
self.deepl_api_key = os.getenv("DEEPL_API_KEY", "3abbc796-2515-44a8-972d-22dcf27ab54a")
self.deepl_api_url = "https://api.deepl.com/v2/translate"
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
self.languages_config = None
self.config_path = "/app/config/languages.json"
self.sync_batch_size = 10
self.sync_delay = 2.0  # delay between languages
async def load_config(self):
"""언어 설정 파일 로드"""
try:
if os.path.exists(self.config_path):
with open(self.config_path, 'r', encoding='utf-8') as f:
self.languages_config = json.load(f)
logger.info(f"Loaded language config")
else:
raise FileNotFoundError(f"Config file not found: {self.config_path}")
except Exception as e:
logger.error(f"Error loading config: {e}")
raise
async def start(self):
"""백그라운드 싱크 서비스 시작"""
logger.info("Starting Language Sync Service")
# 설정 로드
await self.load_config()
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# Periodically check for missing translations (every 10 minutes)
while True:
try:
await self.sync_missing_translations()
await asyncio.sleep(600)  # wait 10 minutes
except Exception as e:
logger.error(f"Error in sync loop: {e}")
await asyncio.sleep(60)  # retry one minute after an error
async def sync_missing_translations(self):
"""누락된 번역 싱크"""
try:
# 활성화된 언어 목록
enabled_languages = [
lang for lang in self.languages_config["enabled_languages"]
if lang["enabled"]
]
if not enabled_languages:
logger.info("No enabled languages for sync")
return
# Source-language collection
source_collection = self.languages_config["source_language"]["collection"]
for lang_config in enabled_languages:
await self.sync_language(source_collection, lang_config)
await asyncio.sleep(self.sync_delay)
except Exception as e:
logger.error(f"Error in sync_missing_translations: {e}")
async def sync_language(self, source_collection: str, lang_config: Dict):
"""특정 언어로 누락된 기사 번역"""
try:
target_collection = lang_config["collection"]
# 번역되지 않은 기사 찾기
# 원본에는 있지만 대상 컬렉션에는 없는 기사
source_articles = await self.db[source_collection].find(
{},
{"news_id": 1}
).to_list(None)
source_ids = {article["news_id"] for article in source_articles}
translated_articles = await self.db[target_collection].find(
{},
{"news_id": 1}
).to_list(None)
translated_ids = {article["news_id"] for article in translated_articles}
# Missing news_ids
missing_ids = source_ids - translated_ids
if not missing_ids:
logger.info(f"No missing translations for {lang_config['name']}")
return
logger.info(f"Found {len(missing_ids)} missing translations for {lang_config['name']}")
# Process in batches
missing_list = list(missing_ids)
for i in range(0, len(missing_list), self.sync_batch_size):
batch = missing_list[i:i+self.sync_batch_size]
for news_id in batch:
try:
# Fetch the source article
korean_article = await self.db[source_collection].find_one(
{"news_id": news_id}
)
if not korean_article:
continue
# Perform the translation
await self.translate_and_save(
korean_article,
lang_config
)
logger.info(f"Synced article {news_id} to {lang_config['code']}")
# API rate limiting
await asyncio.sleep(0.5)
except Exception as e:
logger.error(f"Error translating {news_id} to {lang_config['code']}: {e}")
continue
# Delay between batches
if i + self.sync_batch_size < len(missing_list):
await asyncio.sleep(self.sync_delay)
except Exception as e:
logger.error(f"Error syncing language {lang_config['code']}: {e}")
async def translate_and_save(self, korean_article: Dict, lang_config: Dict):
"""기사 번역 및 저장"""
try:
# 제목 번역
translated_title = await self._translate_text(
korean_article.get('title', ''),
target_lang=lang_config["deepl_code"]
)
# Translate the summary
translated_summary = await self._translate_text(
korean_article.get('summary', ''),
target_lang=lang_config["deepl_code"]
)
# Translate the subtopics
translated_subtopics = []
for subtopic in korean_article.get('subtopics', []):
translated_subtopic_title = await self._translate_text(
subtopic.get('title', ''),
target_lang=lang_config["deepl_code"]
)
translated_content_list = []
for content_para in subtopic.get('content', []):
translated_para = await self._translate_text(
content_para,
target_lang=lang_config["deepl_code"]
)
translated_content_list.append(translated_para)
translated_subtopics.append(Subtopic(
title=translated_subtopic_title,
content=translated_content_list
))
# Translate the categories
translated_categories = []
for category in korean_article.get('categories', []):
translated_cat = await self._translate_text(
category,
target_lang=lang_config["deepl_code"]
)
translated_categories.append(translated_cat)
# Keep entities and references unchanged
entities_data = korean_article.get('entities', {})
translated_entities = Entities(**entities_data) if entities_data else Entities()
references = []
for ref_data in korean_article.get('references', []):
references.append(NewsReference(**ref_data))
# Build the translated article
translated_article = FinalArticle(
news_id=korean_article.get('news_id'),
title=translated_title,
summary=translated_summary,
subtopics=translated_subtopics,
categories=translated_categories,
entities=translated_entities,
source_keyword=korean_article.get('source_keyword'),
source_count=korean_article.get('source_count', 1),
references=references,
job_id=korean_article.get('job_id'),
keyword_id=korean_article.get('keyword_id'),
pipeline_stages=korean_article.get('pipeline_stages', []) + ['sync_translation'],
processing_time=korean_article.get('processing_time', 0),
language=lang_config["code"],
ref_news_id=None,
rss_guid=korean_article.get('rss_guid'),  # keep the RSS GUID
image_prompt=korean_article.get('image_prompt'),  # keep the image prompt
images=korean_article.get('images', []),  # keep the image URL list
translated_languages=korean_article.get('translated_languages', [])  # keep the translated-language list
)
# Save to MongoDB
collection_name = lang_config["collection"]
result = await self.db[collection_name].insert_one(translated_article.model_dump())
# Mark the source article as translated
await self.db[self.languages_config["source_language"]["collection"]].update_one(
{"news_id": korean_article.get('news_id')},
{
"$addToSet": {
"translated_languages": lang_config["code"]
}
}
)
logger.info(f"Synced article to {collection_name}: {result.inserted_id}")
except Exception as e:
logger.error(f"Error in translate_and_save: {e}")
raise
async def _translate_text(self, text: str, target_lang: str = 'EN') -> str:
"""DeepL API를 사용한 텍스트 번역"""
try:
if not text:
return ""
async with httpx.AsyncClient() as client:
response = await client.post(
self.deepl_api_url,
data={
'auth_key': self.deepl_api_key,
'text': text,
'target_lang': target_lang,
'source_lang': 'KO'
},
timeout=30
)
if response.status_code == 200:
result = response.json()
return result['translations'][0]['text']
else:
logger.error(f"DeepL API error: {response.status_code}")
return text
except Exception as e:
logger.error(f"Error translating text: {e}")
return text
async def manual_sync(self, language_code: str = None):
"""수동 싱크 실행"""
logger.info(f"Manual sync requested for language: {language_code or 'all'}")
await self.load_config()
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
if language_code:
# Sync a single language
lang_config = next(
(lang for lang in self.languages_config["enabled_languages"]
if lang["code"] == language_code and lang["enabled"]),
None
)
if lang_config:
source_collection = self.languages_config["source_language"]["collection"]
await self.sync_language(source_collection, lang_config)
else:
logger.error(f"Language {language_code} not found or not enabled")
else:
# Sync every enabled language
await self.sync_missing_translations()
async def main():
"""메인 함수"""
service = LanguageSyncService()
# 명령줄 인수 확인
if len(sys.argv) > 1:
if sys.argv[1] == "sync":
# Manual sync mode
language = sys.argv[2] if len(sys.argv) > 2 else None
await service.manual_sync(language)
else:
logger.error(f"Unknown command: {sys.argv[1]}")
else:
# Background service mode
try:
await service.start()
except KeyboardInterrupt:
logger.info("Received interrupt signal")
if __name__ == "__main__":
asyncio.run(main())
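
A manual-sync sketch mirroring the CLI branch in main() above (python language_sync.py sync en); the module name is an assumption.

import asyncio
from language_sync import LanguageSyncService  # assumption: module name

asyncio.run(LanguageSyncService().manual_sync("en"))  # one language
# asyncio.run(LanguageSyncService().manual_sync())    # or every enabled language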


@ -0,0 +1,320 @@
"""
Multi-Language Translation Service
Multi-language translation service with config-driven language support
"""
import asyncio
import logging
import os
import sys
import json
from typing import List, Dict, Any
import httpx
import redis.asyncio as redis
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
# Import from shared module
from shared.models import PipelineJob, FinalArticle
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MultiLanguageTranslator:
def __init__(self):
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.deepl_api_key = os.getenv("DEEPL_API_KEY", "")  # do not hardcode API keys; supply via the environment
self.deepl_api_url = "https://api.deepl.com/v2/translate"
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
self.languages_config = None
self.config_path = "/app/config/languages.json"
async def load_config(self):
"""언어 설정 파일 로드"""
try:
if os.path.exists(self.config_path):
with open(self.config_path, 'r', encoding='utf-8') as f:
self.languages_config = json.load(f)
else:
# Default configuration (English only)
self.languages_config = {
"enabled_languages": [
{
"code": "en",
"name": "English",
"deepl_code": "EN",
"collection": "articles_en",
"enabled": True
}
],
"source_language": {
"code": "ko",
"name": "Korean",
"collection": "articles_ko"
},
"translation_settings": {
"batch_size": 5,
"delay_between_languages": 2.0,
"delay_between_articles": 0.5,
"max_retries": 3
}
}
logger.info(f"Loaded language config: {len(self.get_enabled_languages())} languages enabled")
except Exception as e:
logger.error(f"Error loading config: {e}")
raise
def get_enabled_languages(self) -> List[Dict]:
"""활성화된 언어 목록 반환"""
return [lang for lang in self.languages_config["enabled_languages"] if lang["enabled"]]
async def start(self):
"""워커 시작"""
logger.info("Starting Multi-Language Translator Worker")
# 설정 로드
await self.load_config()
# Redis 연결
await self.queue_manager.connect()
# MongoDB 연결
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# Verify the DeepL API key
if not self.deepl_api_key:
logger.error("DeepL API key not configured")
return
# Main processing loop
while True:
try:
# Fetch a job from the queue
job = await self.queue_manager.dequeue('translation', timeout=5)
if job:
await self.process_job(job)
except Exception as e:
logger.error(f"Error in worker loop: {e}")
await asyncio.sleep(1)
async def process_job(self, job: PipelineJob):
"""모든 활성 언어로 번역"""
try:
logger.info(f"Processing job {job.job_id} for multi-language translation")
# MongoDB에서 한국어 기사 가져오기
news_id = job.data.get('news_id')
if not news_id:
logger.error(f"No news_id in job {job.job_id}")
await self.queue_manager.mark_failed('translation', job, "No news_id")
return
# Look up the article in the source collection
source_collection = self.languages_config["source_language"]["collection"]
korean_article = await self.db[source_collection].find_one({"news_id": news_id})
if not korean_article:
logger.error(f"Article {news_id} not found in {source_collection}")
await self.queue_manager.mark_failed('translation', job, "Article not found")
return
# Translate into every enabled language
enabled_languages = self.get_enabled_languages()
settings = self.languages_config["translation_settings"]
for lang_config in enabled_languages:
try:
logger.info(f"Translating article {news_id} to {lang_config['name']}")
# Skip if it is already translated
existing = await self.db[lang_config["collection"]].find_one({"news_id": news_id})
if existing:
logger.info(f"Article {news_id} already translated to {lang_config['code']}")
continue
# Perform the translation
await self.translate_article(
korean_article,
lang_config,
job
)
# Delay between languages
if settings.get("delay_between_languages"):
await asyncio.sleep(settings["delay_between_languages"])
except Exception as e:
logger.error(f"Error translating to {lang_config['code']}: {e}")
continue
# Log pipeline completion
logger.info(f"Translation pipeline completed for news_id: {news_id}")
# Mark the job as completed
job.stages_completed.append('translation')
await self.queue_manager.mark_completed('translation', job.job_id)
logger.info(f"Multi-language translation completed for job {job.job_id}")
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")
await self.queue_manager.mark_failed('translation', job, str(e))
async def translate_article(self, korean_article: Dict, lang_config: Dict, job: PipelineJob):
"""특정 언어로 기사 번역"""
try:
# 제목 번역
translated_title = await self._translate_text(
korean_article.get('title', ''),
target_lang=lang_config["deepl_code"]
)
# Translate the summary
translated_summary = await self._translate_text(
korean_article.get('summary', ''),
target_lang=lang_config["deepl_code"]
)
# Translate the subtopics
from shared.models import Subtopic
translated_subtopics = []
for subtopic in korean_article.get('subtopics', []):
translated_subtopic_title = await self._translate_text(
subtopic.get('title', ''),
target_lang=lang_config["deepl_code"]
)
translated_content_list = []
for content_para in subtopic.get('content', []):
translated_para = await self._translate_text(
content_para,
target_lang=lang_config["deepl_code"]
)
translated_content_list.append(translated_para)
# API rate limiting
settings = self.languages_config["translation_settings"]
if settings.get("delay_between_articles"):
await asyncio.sleep(settings["delay_between_articles"])
translated_subtopics.append(Subtopic(
title=translated_subtopic_title,
content=translated_content_list
))
# Translate the categories
translated_categories = []
for category in korean_article.get('categories', []):
translated_cat = await self._translate_text(
category,
target_lang=lang_config["deepl_code"]
)
translated_categories.append(translated_cat)
# Keep entities and references unchanged
from shared.models import Entities, NewsReference
entities_data = korean_article.get('entities', {})
translated_entities = Entities(**entities_data) if entities_data else Entities()
references = []
for ref_data in korean_article.get('references', []):
references.append(NewsReference(**ref_data))
# Build the translated article
translated_article = FinalArticle(
news_id=korean_article.get('news_id'),  # reuse the same news_id
title=translated_title,
summary=translated_summary,
subtopics=translated_subtopics,
categories=translated_categories,
entities=translated_entities,
source_keyword=job.keyword if hasattr(job, 'keyword') else korean_article.get('source_keyword'),
source_count=korean_article.get('source_count', 1),
references=references,
job_id=job.job_id,
keyword_id=job.keyword_id if hasattr(job, 'keyword_id') else None,
pipeline_stages=korean_article.get('pipeline_stages', []) + ['translation'],
processing_time=korean_article.get('processing_time', 0),
language=lang_config["code"],
ref_news_id=None,  # not needed because the same news_id is reused
rss_guid=korean_article.get('rss_guid'),  # keep the RSS GUID
image_prompt=korean_article.get('image_prompt'),  # keep the image prompt
images=korean_article.get('images', []),  # keep the image URL list
translated_languages=korean_article.get('translated_languages', [])  # keep the translated-language list
)
# Save to MongoDB
collection_name = lang_config["collection"]
result = await self.db[collection_name].insert_one(translated_article.model_dump())
logger.info(f"Article saved to {collection_name} with _id: {result.inserted_id}, language: {lang_config['code']}")
# Mark the source article as translated
await self.db[self.languages_config["source_language"]["collection"]].update_one(
{"news_id": korean_article.get('news_id')},
{
"$addToSet": {
"translated_languages": lang_config["code"]
}
}
)
except Exception as e:
logger.error(f"Error translating article to {lang_config['code']}: {e}")
raise
async def _translate_text(self, text: str, target_lang: str = 'EN') -> str:
"""DeepL API를 사용한 텍스트 번역"""
try:
if not text:
return ""
async with httpx.AsyncClient() as client:
response = await client.post(
self.deepl_api_url,
data={
'auth_key': self.deepl_api_key,
'text': text,
'target_lang': target_lang,
'source_lang': 'KO'
},
timeout=30
)
if response.status_code == 200:
result = response.json()
return result['translations'][0]['text']
else:
logger.error(f"DeepL API error: {response.status_code}")
return text  # return the original text on failure
except Exception as e:
logger.error(f"Error translating text: {e}")
return text  # return the original text on failure
async def stop(self):
"""워커 중지"""
await self.queue_manager.disconnect()
logger.info("Multi-Language Translator Worker stopped")
async def main():
"""메인 함수"""
worker = MultiLanguageTranslator()
try:
await worker.start()
except KeyboardInterrupt:
logger.info("Received interrupt signal")
finally:
await worker.stop()
if __name__ == "__main__":
asyncio.run(main())
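
A minimal sketch of feeding this worker directly, assuming the article already exists in the source collection: process_job only requires news_id in job.data.

import asyncio
from shared.models import PipelineJob
from shared.queue_manager import QueueManager

async def enqueue_translation(news_id: str) -> None:
    qm = QueueManager(redis_url="redis://redis:6379")
    await qm.connect()
    job = PipelineJob(
        keyword_id="manual",  # hypothetical values for a hand-fed job
        keyword="manual",
        stage="translation",
        data={"news_id": news_id},
    )
    await qm.enqueue("translation", job)
    await qm.disconnect()

asyncio.run(enqueue_translation("<news_id>"))  # placeholder id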


@ -1,3 +1,5 @@
httpx==0.25.0
redis[hiredis]==5.0.1
pydantic==2.5.0
motor==3.1.1
pymongo==4.3.3


@ -8,9 +8,11 @@ import os
import sys
from typing import List, Dict, Any
import httpx
from motor.motor_asyncio import AsyncIOMotorClient
from datetime import datetime
# Import from shared module
from shared.models import PipelineJob, SummarizedItem, TranslatedItem
from shared.models import PipelineJob, FinalArticle
from shared.queue_manager import QueueManager
logging.basicConfig(level=logging.INFO)
@ -24,6 +26,9 @@ class TranslatorWorker:
self.deepl_api_key = os.getenv("DEEPL_API_KEY", "")  # do not hardcode API keys; supply via the environment
# Use the DeepL Pro API endpoint
self.deepl_api_url = "https://api.deepl.com/v2/translate"
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "ai_writer_db")
self.db = None
async def start(self):
"""워커 시작"""
@ -32,6 +37,10 @@ class TranslatorWorker:
# Connect to Redis
await self.queue_manager.connect()
# Connect to MongoDB
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
# Verify the DeepL API key
if not self.deepl_api_key:
logger.error("DeepL API key not configured")
@ -51,55 +60,122 @@ class TranslatorWorker:
await asyncio.sleep(1)
async def process_job(self, job: PipelineJob):
"""번역 작업 처리"""
"""영어 버전 기사 생성 및 저장"""
try:
logger.info(f"Processing job {job.job_id} for translation")
summarized_items = job.data.get('summarized_items', [])
translated_items = []
# Fetch the Korean article from MongoDB
news_id = job.data.get('news_id')
if not news_id:
logger.error(f"No news_id in job {job.job_id}")
await self.queue_manager.mark_failed('translation', job, "No news_id")
return
for item_data in summarized_items:
summarized_item = SummarizedItem(**item_data)
# Look up the Korean article in MongoDB (articles_ko)
korean_article = await self.db.articles_ko.find_one({"news_id": news_id})
if not korean_article:
logger.error(f"Article {news_id} not found in MongoDB")
await self.queue_manager.mark_failed('translation', job, "Article not found")
return
# Translate the title and summary
# Translate into English
translated_title = await self._translate_text(
summarized_item.enriched_item['rss_item']['title'],
korean_article.get('title', ''),
target_lang='EN'
)
translated_summary = await self._translate_text(
summarized_item.ai_summary,
korean_article.get('summary', ''),
target_lang='EN'
)
translated_item = TranslatedItem(
summarized_item=summarized_item,
translated_title=translated_title,
translated_summary=translated_summary,
target_language='en'
# Translate the subtopics
from shared.models import Subtopic
translated_subtopics = []
for subtopic in korean_article.get('subtopics', []):
translated_subtopic_title = await self._translate_text(
subtopic.get('title', ''),
target_lang='EN'
)
translated_items.append(translated_item)
# API rate limiting
await asyncio.sleep(0.5)
translated_content_list = []
for content_para in subtopic.get('content', []):
translated_para = await self._translate_text(
content_para,
target_lang='EN'
)
translated_content_list.append(translated_para)
await asyncio.sleep(0.2)  # API rate limiting
if translated_items:
logger.info(f"Translated {len(translated_items)} items")
translated_subtopics.append(Subtopic(
title=translated_subtopic_title,
content=translated_content_list
))
# Pass to the next stage
job.data['translated_items'] = [item.dict() for item in translated_items]
# Translate the categories
translated_categories = []
for category in korean_article.get('categories', []):
translated_cat = await self._translate_text(category, target_lang='EN')
translated_categories.append(translated_cat)
await asyncio.sleep(0.2)  # API rate limiting
# Translate entities (optional)
from shared.models import Entities
entities_data = korean_article.get('entities', {})
translated_entities = Entities(
people=entities_data.get('people', []),  # person names are not translated
organizations=entities_data.get('organizations', []),  # organization names are not translated
groups=entities_data.get('groups', []),
countries=entities_data.get('countries', []),
events=entities_data.get('events', [])
)
# Carry over the references (not translated)
from shared.models import NewsReference
references = []
for ref_data in korean_article.get('references', []):
references.append(NewsReference(**ref_data))
# Build the English article - reuse the same news_id
english_article = FinalArticle(
news_id=news_id,  # same news_id as the source
title=translated_title,
summary=translated_summary,
subtopics=translated_subtopics,
categories=translated_categories,
entities=translated_entities,
source_keyword=job.keyword,
source_count=korean_article.get('source_count', 1),
references=references,  # reuse the source references as-is
job_id=job.job_id,
keyword_id=job.keyword_id,
pipeline_stages=job.stages_completed.copy() + ['translation'],
processing_time=korean_article.get('processing_time', 0),
language='en',  # English
ref_news_id=None  # not needed because the same news_id is reused
)
# Save the English version to MongoDB (articles_en)
result = await self.db.articles_en.insert_one(english_article.model_dump())
english_article_id = str(result.inserted_id)
logger.info(f"English article saved with _id: {english_article_id}, news_id: {news_id}, language: en")
# Update the source Korean article - mark translation complete
await self.db.articles_ko.update_one(
{"news_id": news_id},
{
"$addToSet": {
"pipeline_stages": "translation"
}
}
)
# Mark as completed
job.stages_completed.append('translation')
job.stage = 'image_generation'
await self.queue_manager.enqueue('image_generation', job)
await self.queue_manager.mark_completed('translation', job.job_id)
else:
logger.warning(f"No items translated for job {job.job_id}")
await self.queue_manager.mark_failed(
'translation',
job,
"No items to translate"
)
logger.info(f"Translation completed for job {job.job_id}")
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")