feat: Implement async queue-based news pipeline with microservices

Major architectural transformation from synchronous to asynchronous processing:

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)

## Key Features
- Redis-based job queue with deduplication
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with site11_ prefix

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API key: [REDACTED — never commit credentials to version control; rotate this key and load it from an environment variable or secrets manager]
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2025-09-13 19:22:14 +09:00
parent 1d90af7c3c
commit 070032006e
73 changed files with 5922 additions and 4 deletions

View File

@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
News Aggregator Service Test
RSS 피드 제목을 구글 full content 검색으로 확장하는 통합 테스트
"""
import asyncio
import httpx
import json
from datetime import datetime
from typing import Dict, Any
# Service URL
SERVICE_URL = "http://localhost:8018"
async def test_aggregate_with_full_content():
    """Search RSS feeds by keyword and verify Google full-content enrichment.

    Calls GET /api/aggregate with the keyword "인공지능" (artificial
    intelligence) and, for each returned news item, checks whether every
    Google search result carries a ``full_content`` field, printing a
    length summary and a short preview for manual inspection.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        print("\n" + "=" * 60)
        print("뉴스 통합 서비스 Full Content 테스트")
        print("=" * 60)

        # Scenario 1: keyword-based aggregation.
        print("\n1. 키워드 '인공지능'으로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)
        response = await client.get(
            f"{SERVICE_URL}/api/aggregate",
            params={
                "q": "인공지능",
                "limit": 3,  # keep the test run small
                "google_results_per_title": 2,  # two Google results per RSS title
                "lang": "ko",
                "country": "KR",
            },
        )

        if response.status_code == 200:
            data = response.json()
            print(f"✓ RSS 피드 URL: {data['rss_feed_url']}")
            print(f"✓ 전체 RSS 항목 수: {data['total_rss_entries']}")
            print(f"✓ 처리된 항목 수: {data['processed_entries']}")
            print(f"✓ 처리 시간: {data['processing_time']:.2f}")

            # Check each news item for full content availability.
            for i, item in enumerate(data['news_items'], 1):
                print(f"\n [{i}] RSS 제목: {item['rss_title'][:50]}...")
                print(f" 검색 키워드: {item['search_keyword'][:50]}...")
                print(f" 구글 검색 결과 수: {len(item['google_results'])}")

                for j, result in enumerate(item['google_results'], 1):
                    has_full_content = 'full_content' in result
                    if has_full_content:
                        full_content = result.get('full_content', '')
                        # full_content may arrive as a non-string payload
                        # (e.g. dict/list); measure its string form then.
                        if isinstance(full_content, str):
                            content_length = len(full_content)
                        else:
                            content_length = len(str(full_content))
                    else:
                        content_length = 0
                    print(f" - 결과 {j}: {result.get('title', 'N/A')[:40]}...")
                    # BUG FIX: both branches of this conditional were empty
                    # strings, so the marker never showed; use ✓/✗ like the
                    # rest of this script's output.
                    print(f" Full Content 포함: {'✓' if has_full_content else '✗'}")
                    if has_full_content:
                        print(f" Content 길이: {content_length:,} 문자")
                        # Show the first 200 characters of the content.
                        if isinstance(result['full_content'], str):
                            preview = result['full_content'][:200].replace('\n', ' ')
                            print(f" 미리보기: {preview}...")
                        else:
                            print(f" Content 타입: {type(result['full_content'])}")
                            print(f" Content 데이터: {str(result['full_content'])[:200]}...")
        else:
            print(f"✗ 오류: {response.status_code}")
            print(f" 상세: {response.text}")
async def test_aggregate_by_location():
    """Exercise the location-based aggregation endpoint and report full-content stats.

    Calls GET /api/aggregate/location for "Seoul" and tallies how many
    Google results include ``full_content``, along with total and average
    content size in characters.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        divider = "=" * 60
        print("\n" + divider)
        print("지역 기반 뉴스 통합 Full Content 테스트")
        print(divider)

        print("\n2. 지역 'Seoul'로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)
        query = {
            "location": "Seoul",
            "limit": 2,
            "google_results_per_title": 2,
            "lang": "ko",
            "country": "KR",
        }
        response = await client.get(f"{SERVICE_URL}/api/aggregate/location", params=query)

        if response.status_code != 200:
            print(f"✗ 오류: {response.status_code}")
            return

        data = response.json()
        print(f"✓ 지역: {data['keyword']}")
        print(f"✓ RSS 피드 URL: {data['rss_feed_url']}")
        print(f"✓ 처리된 항목 수: {data['processed_entries']}")

        # Tally how many Google results carry full content and how large it is.
        full_content_count = 0
        total_content_size = 0
        for item in data['news_items']:
            for result in item['google_results']:
                if 'full_content' not in result:
                    continue
                full_content_count += 1
                content = result['full_content']
                # Non-string payloads are measured via their string form.
                total_content_size += len(content if isinstance(content, str) else str(content))

        print(f"\n📊 Full Content 통계:")
        print(f" - Full Content 포함 결과: {full_content_count}")
        print(f" - 전체 Content 크기: {total_content_size:,} 문자")
        print(f" - 평균 Content 크기: {total_content_size//max(full_content_count, 1):,} 문자")
async def test_aggregate_by_topic():
    """Aggregate news by topic and analyze content quality for AI summarization.

    Calls GET /api/aggregate/topic with category "TECHNOLOGY" and, for every
    Google result carrying ``full_content``, prints quality indicators
    (sufficient length, paragraph structure, Korean text) relevant to the
    downstream AI summarization step.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        print("\n" + "=" * 60)
        print("주제별 뉴스 통합 Full Content 테스트")
        print("=" * 60)

        # Scenario 3: topic-category-based aggregation.
        print("\n3. 주제 'TECHNOLOGY'로 RSS 피드 검색 및 구글 full content 검색")
        print("-" * 40)
        response = await client.get(
            f"{SERVICE_URL}/api/aggregate/topic",
            params={
                "category": "TECHNOLOGY",
                "limit": 2,
                "google_results_per_title": 3,
                "lang": "ko",
                "country": "KR",
            },
        )

        if response.status_code == 200:
            data = response.json()
            print(f"✓ 주제: {data['keyword']}")
            print(f"✓ 처리 시간: {data['processing_time']:.2f}")

            # Analyze whether collected content is usable for AI summarization.
            print("\n📝 AI 요약을 위한 Content 품질 분석:")
            for i, item in enumerate(data['news_items'], 1):
                print(f"\n 뉴스 항목 {i}:")
                for j, result in enumerate(item['google_results'], 1):
                    if 'full_content' in result:
                        content = result['full_content']
                        # Normalize non-string payloads before the quality
                        # checks (same checks the original ran on str(content)).
                        if not isinstance(content, str):
                            content = str(content)
                        has_paragraphs = '\n\n' in content or '</p>' in content
                        has_sufficient_length = len(content) > 500
                        # Hangul syllables occupy U+AC00..U+D7A3; sample only
                        # the first 100 characters for speed.
                        has_korean = any(
                            0xAC00 <= ord(char) <= 0xD7A3
                            for char in content[:min(100, len(content))]
                        )
                        print(f" 결과 {j} 품질 체크:")
                        # BUG FIX: these conditionals previously had empty
                        # strings in BOTH branches, so no pass/fail marker was
                        # ever printed; restore ✓/✗ markers.
                        print(f" - 충분한 길이 (>500자): {'✓' if has_sufficient_length else '✗'}")
                        print(f" - 단락 구조 포함: {'✓' if has_paragraphs else '✗'}")
                        print(f" - 한국어 콘텐츠: {'✓' if has_korean else '✗'}")
                        print(f" - AI 요약 가능: {'✓' if (has_sufficient_length and has_paragraphs) else '✗'}")
        else:
            print(f"✗ 오류: {response.status_code}")
async def test_health_check():
    """Verify the aggregator service and its upstream dependencies report healthy."""
    async with httpx.AsyncClient() as client:
        divider = "=" * 60
        print("\n" + divider)
        print("서비스 Health Check")
        print(divider)

        resp = await client.get(f"{SERVICE_URL}/health")
        if resp.status_code != 200:
            print(f"✗ Health check 실패: {resp.status_code}")
            return

        payload = resp.json()
        print(f"✓ 통합 서비스 상태: {payload['status']}")
        print(f" - RSS 서비스: {payload['services']['rss_feed']}")
        print(f" - Google 검색 서비스: {payload['services']['google_search']}")
async def main():
    """Run the full integration-test suite in order and print a summary."""
    banner = "=" * 70
    print("\n" + banner)
    print(" News Aggregator Full Content Integration Test ")
    print(" RSS 피드 + Google Full Content 통합 테스트 ")
    print(banner)

    # Health check first, then the three aggregation scenarios.
    for scenario in (
        test_health_check,
        test_aggregate_with_full_content,
        test_aggregate_by_location,
        test_aggregate_by_topic,
    ):
        await scenario()

    print("\n" + banner)
    print(" 테스트 완료 - Full Content 통합 확인 ")
    print(banner)
    print("\n✅ 모든 테스트가 완료되었습니다.")
    print(" RSS 피드 제목을 구글 full content로 검색하는 기능이 정상 작동합니다.")
    print(" AI 요약을 위한 충분한 콘텐츠가 수집되고 있습니다.")


if __name__ == "__main__":
    asyncio.run(main())