feat: Implement async queue-based news pipeline with microservices

Major architectural transformation from synchronous to asynchronous processing:

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)

## Key Features
- Redis-based job queue with deduplication
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with site11_ prefix

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API key: [REDACTED — never commit credentials to version control; rotate this key and load it from an environment variable or secrets manager]
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2025-09-13 19:22:14 +09:00
parent 1d90af7c3c
commit 070032006e
73 changed files with 5922 additions and 4 deletions

View File

@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
News Aggregator Service Test
RSS 피드 제목을 구글 full content 검색으로 확장하는 통합 테스트
"""
import asyncio
import httpx
import json
from datetime import datetime
from typing import Dict, Any
# Service URL
SERVICE_URL = "http://localhost:8018"
async def test_aggregate_with_full_content():
    """Search RSS feeds by keyword and verify Google full-content enrichment.

    Calls GET /api/aggregate with the keyword "인공지능" (artificial
    intelligence) and, for each returned news item, checks whether every
    Google search result carries a ``full_content`` field, printing a
    length summary and a short preview for manual inspection.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        print("\n" + "=" * 60)
        print("뉴스 통합 서비스 Full Content 테스트")
        print("=" * 60)

        # Scenario 1: keyword-based aggregation.
        print("\n1. 키워드 '인공지능'으로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)
        response = await client.get(
            f"{SERVICE_URL}/api/aggregate",
            params={
                "q": "인공지능",
                "limit": 3,  # keep the test run small
                "google_results_per_title": 2,  # two Google results per RSS title
                "lang": "ko",
                "country": "KR",
            },
        )

        if response.status_code == 200:
            data = response.json()
            print(f"✓ RSS 피드 URL: {data['rss_feed_url']}")
            print(f"✓ 전체 RSS 항목 수: {data['total_rss_entries']}")
            print(f"✓ 처리된 항목 수: {data['processed_entries']}")
            print(f"✓ 처리 시간: {data['processing_time']:.2f}")

            # Check each news item for full content availability.
            for i, item in enumerate(data['news_items'], 1):
                print(f"\n [{i}] RSS 제목: {item['rss_title'][:50]}...")
                print(f" 검색 키워드: {item['search_keyword'][:50]}...")
                print(f" 구글 검색 결과 수: {len(item['google_results'])}")

                for j, result in enumerate(item['google_results'], 1):
                    has_full_content = 'full_content' in result
                    if has_full_content:
                        full_content = result.get('full_content', '')
                        # full_content may arrive as a non-string payload
                        # (e.g. dict/list); measure its string form then.
                        if isinstance(full_content, str):
                            content_length = len(full_content)
                        else:
                            content_length = len(str(full_content))
                    else:
                        content_length = 0
                    print(f" - 결과 {j}: {result.get('title', 'N/A')[:40]}...")
                    # BUG FIX: both branches of this conditional were empty
                    # strings, so the marker never showed; use ✓/✗ like the
                    # rest of this script's output.
                    print(f" Full Content 포함: {'✓' if has_full_content else '✗'}")
                    if has_full_content:
                        print(f" Content 길이: {content_length:,} 문자")
                        # Show the first 200 characters of the content.
                        if isinstance(result['full_content'], str):
                            preview = result['full_content'][:200].replace('\n', ' ')
                            print(f" 미리보기: {preview}...")
                        else:
                            print(f" Content 타입: {type(result['full_content'])}")
                            print(f" Content 데이터: {str(result['full_content'])[:200]}...")
        else:
            print(f"✗ 오류: {response.status_code}")
            print(f" 상세: {response.text}")
async def test_aggregate_by_location():
    """Exercise the location-based aggregation endpoint and report full-content stats.

    Calls GET /api/aggregate/location for "Seoul" and tallies how many
    Google results include ``full_content``, along with total and average
    content size in characters.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        divider = "=" * 60
        print("\n" + divider)
        print("지역 기반 뉴스 통합 Full Content 테스트")
        print(divider)

        print("\n2. 지역 'Seoul'로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)
        query = {
            "location": "Seoul",
            "limit": 2,
            "google_results_per_title": 2,
            "lang": "ko",
            "country": "KR",
        }
        response = await client.get(f"{SERVICE_URL}/api/aggregate/location", params=query)

        if response.status_code != 200:
            print(f"✗ 오류: {response.status_code}")
            return

        data = response.json()
        print(f"✓ 지역: {data['keyword']}")
        print(f"✓ RSS 피드 URL: {data['rss_feed_url']}")
        print(f"✓ 처리된 항목 수: {data['processed_entries']}")

        # Tally how many Google results carry full content and how large it is.
        full_content_count = 0
        total_content_size = 0
        for item in data['news_items']:
            for result in item['google_results']:
                if 'full_content' not in result:
                    continue
                full_content_count += 1
                content = result['full_content']
                # Non-string payloads are measured via their string form.
                total_content_size += len(content if isinstance(content, str) else str(content))

        print(f"\n📊 Full Content 통계:")
        print(f" - Full Content 포함 결과: {full_content_count}")
        print(f" - 전체 Content 크기: {total_content_size:,} 문자")
        print(f" - 평균 Content 크기: {total_content_size//max(full_content_count, 1):,} 문자")
async def test_aggregate_by_topic():
    """Aggregate news by topic and analyze content quality for AI summarization.

    Calls GET /api/aggregate/topic with category "TECHNOLOGY" and, for every
    Google result carrying ``full_content``, prints quality indicators
    (sufficient length, paragraph structure, Korean text) relevant to the
    downstream AI summarization step.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        print("\n" + "=" * 60)
        print("주제별 뉴스 통합 Full Content 테스트")
        print("=" * 60)

        # Scenario 3: topic-category-based aggregation.
        print("\n3. 주제 'TECHNOLOGY'로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)
        response = await client.get(
            f"{SERVICE_URL}/api/aggregate/topic",
            params={
                "category": "TECHNOLOGY",
                "limit": 2,
                "google_results_per_title": 3,
                "lang": "ko",
                "country": "KR",
            },
        )

        if response.status_code == 200:
            data = response.json()
            print(f"✓ 주제: {data['keyword']}")
            print(f"✓ 처리 시간: {data['processing_time']:.2f}")

            # Analyze whether collected content is usable for AI summarization.
            print("\n📝 AI 요약을 위한 Content 품질 분석:")
            for i, item in enumerate(data['news_items'], 1):
                print(f"\n 뉴스 항목 {i}:")
                for j, result in enumerate(item['google_results'], 1):
                    if 'full_content' in result:
                        content = result['full_content']
                        # Normalize non-string payloads before the quality
                        # checks (same checks the original ran on str(content)).
                        if not isinstance(content, str):
                            content = str(content)
                        has_paragraphs = '\n\n' in content or '</p>' in content
                        has_sufficient_length = len(content) > 500
                        # Hangul syllables occupy U+AC00..U+D7A3; sample only
                        # the first 100 characters for speed.
                        has_korean = any(
                            0xAC00 <= ord(char) <= 0xD7A3
                            for char in content[:min(100, len(content))]
                        )
                        print(f" 결과 {j} 품질 체크:")
                        # BUG FIX: these conditionals previously had empty
                        # strings in BOTH branches, so no pass/fail marker was
                        # ever printed; restore ✓/✗ markers.
                        print(f" - 충분한 길이 (>500자): {'✓' if has_sufficient_length else '✗'}")
                        print(f" - 단락 구조 포함: {'✓' if has_paragraphs else '✗'}")
                        print(f" - 한국어 콘텐츠: {'✓' if has_korean else '✗'}")
                        print(f" - AI 요약 가능: {'✓' if (has_sufficient_length and has_paragraphs) else '✗'}")
        else:
            print(f"✗ 오류: {response.status_code}")
async def test_health_check():
    """Verify the aggregator service and its upstream dependencies report healthy."""
    async with httpx.AsyncClient() as client:
        divider = "=" * 60
        print("\n" + divider)
        print("서비스 Health Check")
        print(divider)

        resp = await client.get(f"{SERVICE_URL}/health")
        if resp.status_code != 200:
            print(f"✗ Health check 실패: {resp.status_code}")
            return

        payload = resp.json()
        print(f"✓ 통합 서비스 상태: {payload['status']}")
        print(f" - RSS 서비스: {payload['services']['rss_feed']}")
        print(f" - Google 검색 서비스: {payload['services']['google_search']}")
async def main():
    """Run the full integration-test suite in order and print a summary."""
    banner = "=" * 70
    print("\n" + banner)
    print(" News Aggregator Full Content Integration Test ")
    print(" RSS 피드 + Google Full Content 통합 테스트 ")
    print(banner)

    # Health check first, then the three aggregation scenarios.
    for scenario in (
        test_health_check,
        test_aggregate_with_full_content,
        test_aggregate_by_location,
        test_aggregate_by_topic,
    ):
        await scenario()

    print("\n" + banner)
    print(" 테스트 완료 - Full Content 통합 확인 ")
    print(banner)
    print("\n✅ 모든 테스트가 완료되었습니다.")
    print(" RSS 피드 제목을 구글 full content로 검색하는 기능이 정상 작동합니다.")
    print(" AI 요약을 위한 충분한 콘텐츠가 수집되고 있습니다.")


if __name__ == "__main__":
    asyncio.run(main())