feat: Implement async queue-based news pipeline with microservices
Major architectural transformation from synchronous to asynchronous processing:

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)

## Key Features
- Redis-based job queue with deduplication
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with site11_ prefix

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API: 3abbc796-2515-44a8-972d-22dcf27ab54a
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
19
services/pipeline/google-search/Dockerfile
Normal file
19
services/pipeline/google-search/Dockerfile
Normal file
@ -0,0 +1,19 @@
|
||||
# Image for the Google Search enrichment worker.
# The COPY sources below (./google-search, ./shared) imply the build context is
# the directory that contains both folders — TODO confirm against the compose file.
FROM python:3.11-slim

WORKDIR /app

# Install dependencies (copied before the code so this layer stays cached
# until requirements.txt changes)
COPY ./google-search/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the shared modules
COPY ./shared /app/shared

# Copy the Google Search service code
COPY ./google-search /app

# Environment variables (unbuffered stdout/stderr so logs show up immediately)
ENV PYTHONUNBUFFERED=1

# NOTE(review): no USER instruction, so the worker runs as root — consider a
# dedicated non-root user.

# Run the worker (exec form: python is the container's main process)
CMD ["python", "google_search.py"]
|
||||
153
services/pipeline/google-search/google_search.py
Normal file
153
services/pipeline/google-search/google_search.py
Normal file
@ -0,0 +1,153 @@
|
||||
"""
Google Search Service

Enriches RSS items with Google search results.
"""
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from typing import List, Dict, Any
|
||||
import aiohttp
|
||||
from datetime import datetime
|
||||
|
||||
# Import from shared module
|
||||
from shared.models import PipelineJob, RSSItem, SearchResult, EnrichedItem
|
||||
from shared.queue_manager import QueueManager
|
||||
|
||||
# Configure root logging at INFO level at import time, then create the
# module-level logger used throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
||||
|
||||
class GoogleSearchWorker:
    """Queue worker that enriches RSS items with Google Custom Search results.

    Jobs are dequeued from the ``search_enrichment`` queue. For each job, at
    most ``MAX_ITEMS_PER_JOB`` RSS items are searched by title; the results
    are stored under ``job.data['enriched_items']`` and the job is enqueued
    on ``ai_summarization``.

    Configuration is read from the environment: ``REDIS_URL``,
    ``GOOGLE_API_KEY`` and ``GOOGLE_SEARCH_ENGINE_ID``.
    """

    # Queue this worker consumes from, and the queue it forwards jobs to.
    QUEUE_NAME = 'search_enrichment'
    NEXT_QUEUE_NAME = 'ai_summarization'
    # Only the first N RSS items of a job are searched (API quota management).
    MAX_ITEMS_PER_JOB = 5
    # Pause between two consecutive searches (API rate limiting), in seconds.
    SEARCH_DELAY_SECONDS = 0.5
    # Total time budget for one Custom Search HTTP request, in seconds.
    REQUEST_TIMEOUT_SECONDS = 30
    # Timeout value passed to QueueManager.dequeue().
    DEQUEUE_TIMEOUT = 5
    # Pause after an unexpected error in the main loop, in seconds.
    ERROR_BACKOFF_SECONDS = 1

    def __init__(self):
        """Create the queue manager and read the Google API settings."""
        self.queue_manager = QueueManager(
            redis_url=os.getenv("REDIS_URL", "redis://redis:6379")
        )
        self.google_api_key = os.getenv("GOOGLE_API_KEY")
        self.search_engine_id = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
        self.max_results_per_item = 3

    async def start(self):
        """Connect the queue manager and process jobs until cancelled.

        Errors raised while dequeuing or processing are logged with their
        traceback and followed by a short pause, so a single failure does not
        stop the loop. Task cancellation is not caught by ``except Exception``
        and therefore ends the loop.
        """
        logger.info("Starting Google Search Worker")

        # Connect the queue manager
        await self.queue_manager.connect()

        # Main processing loop
        while True:
            try:
                # Fetch the next job from the queue
                job = await self.queue_manager.dequeue(
                    self.QUEUE_NAME, timeout=self.DEQUEUE_TIMEOUT
                )

                if job:
                    await self.process_job(job)

            except Exception as e:
                logger.exception(f"Error in worker loop: {e}")
                await asyncio.sleep(self.ERROR_BACKOFF_SECONDS)

    async def process_job(self, job: PipelineJob):
        """Enrich the RSS items of *job* and hand it to the next stage.

        Items that cannot be turned into an ``RSSItem`` are skipped with a
        warning instead of failing the whole job. If at least one item was
        enriched, the job is enqueued on ``ai_summarization`` and
        ``mark_completed`` is called; otherwise, or on any unexpected error,
        ``mark_failed`` is called.

        Args:
            job: Job whose ``data['rss_items']`` holds the raw RSS item dicts.
        """
        try:
            logger.info(f"Processing job {job.job_id} for search enrichment")

            # ``or []`` also covers an explicit ``None`` stored under the key.
            rss_items = job.data.get('rss_items') or []
            enriched_items = []

            for item_data in rss_items[:self.MAX_ITEMS_PER_JOB]:
                try:
                    rss_item = RSSItem(**item_data)
                except (TypeError, ValueError) as e:
                    # TypeError: item_data is not a mapping; ValueError is the
                    # base class of pydantic's ValidationError.
                    logger.warning(
                        f"Skipping invalid RSS item in job {job.job_id}: {e}"
                    )
                    continue

                # Rate limiting: pause between searches only. Every search
                # appends one enriched item, so a non-empty list means a
                # search has already been made.
                if enriched_items:
                    await asyncio.sleep(self.SEARCH_DELAY_SECONDS)

                # Search Google using the item's title
                search_results = await self._search_google(rss_item.title)

                enriched_items.append(
                    EnrichedItem(
                        rss_item=rss_item,
                        search_results=search_results
                    )
                )

            if enriched_items:
                logger.info(f"Enriched {len(enriched_items)} items with search results")

                # Hand over to the next stage
                job.data['enriched_items'] = [
                    self._to_dict(item) for item in enriched_items
                ]
                job.stages_completed.append(self.QUEUE_NAME)
                job.stage = self.NEXT_QUEUE_NAME

                await self.queue_manager.enqueue(self.NEXT_QUEUE_NAME, job)
                await self.queue_manager.mark_completed(self.QUEUE_NAME, job.job_id)
            else:
                logger.warning(f"No items enriched for job {job.job_id}")
                await self.queue_manager.mark_failed(
                    self.QUEUE_NAME,
                    job,
                    "No items to enrich"
                )

        except Exception as e:
            logger.exception(f"Error processing job {job.job_id}: {e}")
            await self.queue_manager.mark_failed(self.QUEUE_NAME, job, str(e))

    @staticmethod
    def _to_dict(model) -> Dict[str, Any]:
        """Return *model* as a dict, preferring pydantic v2's ``model_dump``.

        Falls back to ``dict()`` for models that do not provide
        ``model_dump``.
        """
        dump = getattr(model, 'model_dump', None)
        return dump() if callable(dump) else model.dict()

    async def _search_google(self, query: str) -> List[SearchResult]:
        """Query the Google Custom Search API for *query*.

        Args:
            query: Search text (the RSS item title).

        Returns:
            The parsed results. The list is empty when the query is blank,
            the credentials are not configured, the API answers with a
            non-200 status, or the request raises.
        """
        results = []

        # A blank query cannot produce results; skip the API call entirely.
        if not query or not query.strip():
            return results

        if not self.google_api_key or not self.search_engine_id:
            logger.warning("Google API credentials not configured")
            return results

        try:
            url = "https://www.googleapis.com/customsearch/v1"
            params = {
                "key": self.google_api_key,
                "cx": self.search_engine_id,
                "q": query,
                "num": self.max_results_per_item,
                "hl": "ko",
                "gl": "kr"
            }
            # aiohttp expects a ClientTimeout object rather than a bare number.
            timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT_SECONDS)

            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, params=params) as response:
                    if response.status == 200:
                        data = await response.json()

                        # 'items' may be absent when nothing matched.
                        for item in data.get('items') or []:
                            results.append(
                                SearchResult(
                                    title=item.get('title', ''),
                                    link=item.get('link', ''),
                                    snippet=item.get('snippet', ''),
                                    source='google'
                                )
                            )
                    else:
                        logger.error(f"Google API error: {response.status}")

        except Exception as e:
            # Best effort: a failed search yields whatever was collected so far.
            logger.error(f"Error searching Google for '{query}': {e}")

        return results

    async def stop(self):
        """Disconnect the queue manager and log that the worker stopped."""
        await self.queue_manager.disconnect()
        logger.info("Google Search Worker stopped")
|
||||
|
||||
async def main():
    """Run the worker until it is interrupted, then release its resources.

    SIGINT and SIGTERM cancel the worker task, so ``worker.stop()`` also runs
    when the process is asked to terminate (for example when the container is
    stopped) and not only on Ctrl-C.
    """
    import signal  # local import: only needed for the shutdown wiring below

    worker = GoogleSearchWorker()
    worker_task = asyncio.ensure_future(worker.start())

    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        try:
            loop.add_signal_handler(sig, worker_task.cancel)
        except (NotImplementedError, RuntimeError):
            # Signal handlers are unavailable on this platform or thread;
            # fall back to the default interrupt behaviour.
            pass

    try:
        await worker_task
    except (KeyboardInterrupt, asyncio.CancelledError):
        logger.info("Received interrupt signal")
    finally:
        await worker.stop()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
3
services/pipeline/google-search/requirements.txt
Normal file
3
services/pipeline/google-search/requirements.txt
Normal file
@ -0,0 +1,3 @@
|
||||
aiohttp==3.9.1
|
||||
redis[hiredis]==5.0.1
|
||||
pydantic==2.5.0
|
||||
Reference in New Issue
Block a user