feat: Implement async queue-based news pipeline with microservices

Major architectural transformation from synchronous to asynchronous processing (the queue hand-off between stages is sketched after the service list):

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)
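
Every service follows the same worker shape: pop a job from its stage queue, process it, and hand the result to the next stage's queue. A minimal sketch of that hand-off, reusing the shared `QueueManager` calls visible in the worker below (`dequeue`, `mark_completed`, and `mark_failed` mirror the actual code; `enqueue(queue, job)` is an assumed method):

```python
from shared.models import PipelineJob
from shared.queue_manager import QueueManager


async def run_stage(stage: str, next_stage: str, handler) -> None:
    """Generic worker loop: dequeue, process, enqueue for the next stage."""
    qm = QueueManager(redis_url="redis://redis:6379")
    await qm.connect()
    while True:
        job = await qm.dequeue(stage, timeout=5)
        if not job:
            continue
        try:
            job.data = await handler(job)      # stage-specific work
            job.stages_completed.append(stage)
            await qm.mark_completed(stage, job.job_id)
            await qm.enqueue(next_stage, job)  # hand off to the next service (assumed method)
        except Exception as exc:
            await qm.mark_failed(stage, job, str(exc))
```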

## Key Features
- Redis-based job queue with deduplication (see the sketch after this list)
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with the site11_ prefix
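
A minimal sketch of how the 7-day deduplication window could be enforced, assuming redis-py's asyncio client; the `dedup:<url>` key scheme is illustrative, not taken from the code:

```python
import redis.asyncio as redis

DEDUP_TTL = 7 * 24 * 60 * 60  # 7 days, matching the queue TTL


async def is_new_item(r: redis.Redis, url: str) -> bool:
    """Return True only the first time a URL is seen within the TTL window.

    SET NX EX is atomic, so two concurrent collectors cannot both claim
    the same item. The dedup:<url> key scheme is an assumption.
    """
    return await r.set(f"dedup:{url}", "1", nx=True, ex=DEDUP_TTL) is not None
```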

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API: 3abbc796-2515-44a8-972d-22dcf27ab54a
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication
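
The workers read these values from environment variables rather than hard-coding them; a minimal sketch of the expected environment, where the Redis/Mongo/Claude names come from the worker code below and the DeepL/Replicate names are assumptions:

```python
import os

REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379")           # from worker code
MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")  # from worker code
DB_NAME = os.getenv("DB_NAME", "pipeline_db")                      # from worker code
CLAUDE_API_KEY = os.getenv("CLAUDE_API_KEY")                       # from worker code
DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")                         # assumed variable name
REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")             # assumed variable name
```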

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: jungwoo choi
Date:   2025-09-13 19:22:14 +09:00
Parent: 1d90af7c3c
Commit: 070032006e
73 changed files with 5922 additions and 4 deletions


@@ -0,0 +1,234 @@
"""
Article Assembly Service
최종 기사 조립 및 MongoDB 저장 서비스
"""
import asyncio
import logging
import os
import json
from datetime import datetime
from typing import Dict, List

from anthropic import AsyncAnthropic
from motor.motor_asyncio import AsyncIOMotorClient

# Import from shared module
from shared.models import PipelineJob, SummarizedItem, FinalArticle
from shared.queue_manager import QueueManager

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ArticleAssemblyWorker:
def __init__(self):
self.queue_manager = QueueManager(
redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
)
self.claude_api_key = os.getenv("CLAUDE_API_KEY")
self.claude_client = None
self.mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
self.db_name = os.getenv("DB_NAME", "pipeline_db")
self.db = None

    async def start(self):
        """Start the worker"""
logger.info("Starting Article Assembly Worker")
        # Connect to Redis
await self.queue_manager.connect()
        # Connect to MongoDB
client = AsyncIOMotorClient(self.mongodb_url)
self.db = client[self.db_name]
        # Initialize the Claude client
if self.claude_api_key:
self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
else:
logger.error("Claude API key not configured")
return
        # Main processing loop
while True:
try:
                # Fetch the next job from the queue
job = await self.queue_manager.dequeue('article_assembly', timeout=5)
if job:
await self.process_job(job)
except Exception as e:
logger.error(f"Error in worker loop: {e}")
await asyncio.sleep(1)

    async def process_job(self, job: PipelineJob):
        """Process a final article assembly job"""
try:
start_time = datetime.now()
logger.info(f"Processing job {job.job_id} for article assembly")
summarized_items = job.data.get('summarized_items', [])
if not summarized_items:
logger.warning(f"No items to assemble for job {job.job_id}")
await self.queue_manager.mark_failed(
'article_assembly',
job,
"No items to assemble"
)
return
            # Generate the final article
article = await self._generate_final_article(job, summarized_items)
            # Compute processing time
processing_time = (datetime.now() - start_time).total_seconds()
article.processing_time = processing_time
            # Persist to MongoDB
await self.db.articles.insert_one(article.dict())
logger.info(f"Article {article.article_id} saved to MongoDB")
            # Mark the stage as completed
job.stages_completed.append('article_assembly')
await self.queue_manager.mark_completed('article_assembly', job.job_id)
            # Update per-keyword statistics
await self._update_statistics(job.keyword_id)
except Exception as e:
logger.error(f"Error processing job {job.job_id}: {e}")
await self.queue_manager.mark_failed('article_assembly', job, str(e))

    async def _generate_final_article(
self,
job: PipelineJob,
summarized_items: List[Dict]
) -> FinalArticle:
"""Claude를 사용한 최종 기사 생성"""
# 아이템 정보 준비
items_text = []
for idx, item_data in enumerate(summarized_items, 1):
item = SummarizedItem(**item_data)
items_text.append(f"""
[뉴스 {idx}]
제목: {item.enriched_item['rss_item']['title']}
요약: {item.ai_summary}
출처: {item.enriched_item['rss_item']['link']}
""")
content = "\n".join(items_text)
        # Ask Claude to compose a combined article as JSON (prompt is in Korean for Korean readers)
prompt = f"""다음 뉴스 항목들을 바탕으로 종합적인 기사를 작성해주세요.
키워드: {job.keyword}
뉴스 항목들:
{content}
다음 JSON 형식으로 작성해주세요:
{{
"title": "종합 기사 제목",
"content": "기사 본문 (1500자 이내, 문단 구분)",
"summary": "한 줄 요약 (100자 이내)",
"categories": ["카테고리1", "카테고리2"],
"tags": ["태그1", "태그2", "태그3"]
}}
요구사항:
- 전문적이고 객관적인 톤
- 핵심 정보와 트렌드 파악
- 시사점 포함
- 한국 독자 대상"""
try:
response = await self.claude_client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=3000,
temperature=0.7,
messages=[
{"role": "user", "content": prompt}
]
)
            # Extract the first JSON object from the response text
content_text = response.content[0].text
json_start = content_text.find('{')
json_end = content_text.rfind('}') + 1
if json_start != -1 and json_end > json_start:
article_data = json.loads(content_text[json_start:json_end])
else:
raise ValueError("No valid JSON in response")
            # Build the FinalArticle
article = FinalArticle(
job_id=job.job_id,
keyword_id=job.keyword_id,
keyword=job.keyword,
title=article_data.get('title', f"{job.keyword} 종합 뉴스"),
content=article_data.get('content', ''),
summary=article_data.get('summary', ''),
                source_items=[],  # simplified: source references omitted
                images=[],  # images are handled by a separate service
categories=article_data.get('categories', []),
tags=article_data.get('tags', []),
pipeline_stages=job.stages_completed,
                processing_time=0  # updated after assembly completes
)
return article
except Exception as e:
logger.error(f"Error generating article: {e}")
            # Fall back to a plain concatenated article
return FinalArticle(
job_id=job.job_id,
keyword_id=job.keyword_id,
keyword=job.keyword,
title=f"{job.keyword} 뉴스 요약 - {datetime.now().strftime('%Y-%m-%d')}",
content=content,
summary=f"{job.keyword} 관련 {len(summarized_items)}개 뉴스 요약",
source_items=[],
images=[],
categories=['자동생성'],
tags=[job.keyword],
pipeline_stages=job.stages_completed,
processing_time=0
)

    async def _update_statistics(self, keyword_id: str):
        """Update per-keyword statistics"""
try:
await self.db.keyword_stats.update_one(
{"keyword_id": keyword_id},
{
"$inc": {"articles_generated": 1},
"$set": {"last_generated": datetime.now()}
},
upsert=True
)
except Exception as e:
logger.error(f"Error updating statistics: {e}")

    async def stop(self):
        """Stop the worker"""
await self.queue_manager.disconnect()
logger.info("Article Assembly Worker stopped")


async def main():
    """Entry point"""
worker = ArticleAssemblyWorker()
try:
await worker.start()
except KeyboardInterrupt:
logger.info("Received interrupt signal")
finally:
await worker.stop()
if __name__ == "__main__":
asyncio.run(main())