Major architectural transformation from synchronous to asynchronous processing: ## Pipeline Services (8 microservices) - pipeline-scheduler: APScheduler for 30-minute periodic job triggers - pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL) - pipeline-google-search: Content enrichment via Google Search API - pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514) - pipeline-translator: Translation using DeepL Pro API - pipeline-image-generator: Image generation with Replicate API (Stable Diffusion) - pipeline-article-assembly: Final article assembly and MongoDB storage - pipeline-monitor: Real-time monitoring dashboard (port 8100) ## Key Features - Redis-based job queue with deduplication - Asynchronous processing with Python asyncio - Shared models and queue manager for inter-service communication - Docker containerization for all services - Container names standardized with site11_ prefix ## Removed Services - Moved to backup: google-search, rss-feed, news-aggregator, ai-writer ## Configuration - DeepL Pro API key: [REDACTED — a live credential was committed here; rotate the key and load it from the environment, never from version control] - Claude Model: claude-sonnet-4-20250514 - Redis Queue TTL: 7 days for deduplication 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
234 lines
7.9 KiB
Python
234 lines
7.9 KiB
Python
"""
|
|
Article Assembly Service
|
|
최종 기사 조립 및 MongoDB 저장 서비스
|
|
"""
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import sys
|
|
import json
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any
|
|
from anthropic import AsyncAnthropic
|
|
from motor.motor_asyncio import AsyncIOMotorClient
|
|
|
|
# Import from shared module
|
|
from shared.models import PipelineJob, SummarizedItem, FinalArticle
|
|
from shared.queue_manager import QueueManager
|
|
|
|
# Module-wide logging setup; INFO level matches the other pipeline services.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
class ArticleAssemblyWorker:
    """Assemble final articles from summarized items and store them in MongoDB.

    Consumes jobs from the 'article_assembly' Redis queue, asks Claude to
    compose one combined article from the per-item AI summaries, persists the
    result to the ``articles`` collection, and updates per-keyword statistics.
    """

    def __init__(self):
        # Redis-backed queue manager used for inter-service job passing.
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )
        self.claude_api_key = os.getenv("CLAUDE_API_KEY")
        self.claude_client = None
        # Model id is configurable via env; the default preserves the value
        # that was previously hard-coded at the call site.
        self.claude_model = os.getenv("CLAUDE_MODEL", "claude-sonnet-4-20250514")
        self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        self.db_name = os.getenv("DB_NAME", "pipeline_db")
        # Keep the Motor client itself so stop() can close the connection
        # (previously it was a local in start() and the connection leaked).
        self.mongo_client = None
        self.db = None

    async def start(self):
        """Connect to Redis, MongoDB and Claude, then run the main loop.

        Returns early (without entering the loop) when no Claude API key is
        configured; the loop itself runs until the task is cancelled or the
        process is interrupted.
        """
        logger.info("Starting Article Assembly Worker")

        # Connect to Redis.
        await self.queue_manager.connect()

        # Connect to MongoDB; retain the client for a clean shutdown.
        self.mongo_client = AsyncIOMotorClient(self.mongodb_url)
        self.db = self.mongo_client[self.db_name]

        # Initialize the Claude client; without a key the worker cannot run.
        if self.claude_api_key:
            self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
        else:
            logger.error("Claude API key not configured")
            return

        # Main processing loop: block briefly on the queue, handle each job.
        while True:
            try:
                job = await self.queue_manager.dequeue('article_assembly', timeout=5)

                if job:
                    await self.process_job(job)

            except Exception as e:
                logger.error(f"Error in worker loop: {e}")
                # Brief back-off so a persistent failure cannot spin the CPU.
                await asyncio.sleep(1)

    async def process_job(self, job: PipelineJob):
        """Assemble and persist the final article for a single job.

        Marks the job failed when there is nothing to assemble or when any
        step raises; otherwise appends 'article_assembly' to the completed
        stages and marks the job done on the queue.
        """
        try:
            start_time = datetime.now()
            logger.info(f"Processing job {job.job_id} for article assembly")

            summarized_items = job.data.get('summarized_items', [])

            if not summarized_items:
                logger.warning(f"No items to assemble for job {job.job_id}")
                await self.queue_manager.mark_failed(
                    'article_assembly',
                    job,
                    "No items to assemble"
                )
                return

            # Generate the final article (falls back internally on API errors).
            article = await self._generate_final_article(job, summarized_items)

            # Record the wall-clock processing time on the article.
            article.processing_time = (datetime.now() - start_time).total_seconds()

            # Persist to MongoDB.
            await self.db.articles.insert_one(article.dict())

            logger.info(f"Article {article.article_id} saved to MongoDB")

            # Mark this pipeline stage as completed.
            job.stages_completed.append('article_assembly')
            await self.queue_manager.mark_completed('article_assembly', job.job_id)

            # Update per-keyword statistics (best effort; logs on failure).
            await self._update_statistics(job.keyword_id)

        except Exception as e:
            logger.error(f"Error processing job {job.job_id}: {e}")
            await self.queue_manager.mark_failed('article_assembly', job, str(e))

    async def _generate_final_article(
        self,
        job: PipelineJob,
        summarized_items: List[Dict]
    ) -> FinalArticle:
        """Compose the final article with Claude.

        Builds a Korean-language prompt from the summarized items, requests a
        JSON-formatted article, and parses it into a FinalArticle. On any
        failure (API error, unparsable response) a plain fallback article
        containing the raw summaries is returned instead of raising.
        """

        # Prepare one text section per summarized news item.
        items_text = []
        for idx, item_data in enumerate(summarized_items, 1):
            item = SummarizedItem(**item_data)
            items_text.append(f"""
[뉴스 {idx}]
제목: {item.enriched_item['rss_item']['title']}
요약: {item.ai_summary}
출처: {item.enriched_item['rss_item']['link']}
""")

        content = "\n".join(items_text)

        # Ask Claude for a combined article in a strict JSON shape.
        prompt = f"""다음 뉴스 항목들을 바탕으로 종합적인 기사를 작성해주세요.

키워드: {job.keyword}

뉴스 항목들:
{content}

다음 JSON 형식으로 작성해주세요:
{{
    "title": "종합 기사 제목",
    "content": "기사 본문 (1500자 이내, 문단 구분)",
    "summary": "한 줄 요약 (100자 이내)",
    "categories": ["카테고리1", "카테고리2"],
    "tags": ["태그1", "태그2", "태그3"]
}}

요구사항:
- 전문적이고 객관적인 톤
- 핵심 정보와 트렌드 파악
- 시사점 포함
- 한국 독자 대상"""

        try:
            response = await self.claude_client.messages.create(
                model=self.claude_model,
                max_tokens=3000,
                temperature=0.7,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            # Extract the first {...} span from the response and parse it.
            content_text = response.content[0].text
            json_start = content_text.find('{')
            json_end = content_text.rfind('}') + 1

            if json_start != -1 and json_end > json_start:
                article_data = json.loads(content_text[json_start:json_end])
            else:
                raise ValueError("No valid JSON in response")

            # Build the FinalArticle from the parsed fields, with defaults
            # for anything the model omitted.
            article = FinalArticle(
                job_id=job.job_id,
                keyword_id=job.keyword_id,
                keyword=job.keyword,
                title=article_data.get('title', f"{job.keyword} 종합 뉴스"),
                content=article_data.get('content', ''),
                summary=article_data.get('summary', ''),
                source_items=[],  # simplified; sources kept out of the article
                images=[],  # images are attached by a separate service
                categories=article_data.get('categories', []),
                tags=article_data.get('tags', []),
                pipeline_stages=job.stages_completed,
                processing_time=0  # updated later by process_job()
            )

            return article

        except Exception as e:
            logger.error(f"Error generating article: {e}")
            # Fallback: publish the raw concatenated summaries instead.
            return FinalArticle(
                job_id=job.job_id,
                keyword_id=job.keyword_id,
                keyword=job.keyword,
                title=f"{job.keyword} 뉴스 요약 - {datetime.now().strftime('%Y-%m-%d')}",
                content=content,
                summary=f"{job.keyword} 관련 {len(summarized_items)}개 뉴스 요약",
                source_items=[],
                images=[],
                categories=['자동생성'],
                tags=[job.keyword],
                pipeline_stages=job.stages_completed,
                processing_time=0
            )

    async def _update_statistics(self, keyword_id: str):
        """Increment the per-keyword article counter (upserting the doc)."""
        try:
            await self.db.keyword_stats.update_one(
                {"keyword_id": keyword_id},
                {
                    "$inc": {"articles_generated": 1},
                    "$set": {"last_generated": datetime.now()}
                },
                upsert=True
            )
        except Exception as e:
            # Statistics are best-effort; never fail the job over them.
            logger.error(f"Error updating statistics: {e}")

    async def stop(self):
        """Disconnect from Redis and close the MongoDB connection."""
        await self.queue_manager.disconnect()
        # Bug fix: close the Motor client that start() used to leak.
        if self.mongo_client is not None:
            self.mongo_client.close()
        logger.info("Article Assembly Worker stopped")
|
|
|
|
async def main():
    """Entry point: run the article assembly worker until interrupted."""
    worker = ArticleAssemblyWorker()
    try:
        await worker.start()
    except KeyboardInterrupt:
        logger.info("Received interrupt signal")
    finally:
        # Always release connections, whether we exited cleanly or not.
        await worker.stop()


if __name__ == "__main__":
    asyncio.run(main())