- FastAPI 백엔드 (audio-studio-api) - Next.js 프론트엔드 (audio-studio-ui) - Qwen3-TTS 엔진 (audio-studio-tts) - MusicGen 서비스 (audio-studio-musicgen) - Docker Compose 개발/운영 환경 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
8.2 KiB
8.2 KiB
모니터링 및 로깅 (Monitoring & Logging)
이 프로젝트의 모니터링 및 로깅 패턴입니다.
Python 로깅
기본 설정
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
로깅 패턴
# Informational logs
# (literals without placeholders are plain strings -- no f-prefix needed)
logger.info("Starting Wikipedia Enrichment Worker")
logger.info(f"Processing job {job.job_id} for Wikipedia enrichment")
logger.info(f"Found {len(image_urls)} image(s) for '{name}' (logo preferred)")
# Warning logs (non-fatal errors)
logger.warning(f"Biocode registration failed (non-critical): {e}")
logger.warning(f"Failed to get logo for '{title}': {e}")
# Error logs
logger.error(f"Error processing job {job.job_id}: {e}")
logger.error("Claude API key not configured")
# Debug logs
logger.debug(f"Selected candidate '{candidate.get('title')}' with score: {best_score}")
구조화된 로깅
import json
def log_structured(level: str, message: str, **kwargs):
    """Print one log record to stdout as a single line of JSON.

    Args:
        level: Severity label stored under the ``"level"`` key (e.g. ``"INFO"``).
        message: Human-readable text stored under the ``"message"`` key.
        **kwargs: Extra fields merged into the record. They are unpacked
            last, so a ``timestamp=...`` keyword replaces the generated
            timestamp.
    """
    # Local import keeps the helper self-contained wherever it is pasted.
    from datetime import datetime

    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "level": level,
        "message": message,
        **kwargs,
    }
    # ensure_ascii=False keeps non-ASCII text (e.g. Korean) readable instead
    # of \uXXXX escapes. default=str stringifies values json cannot encode
    # (datetime, ObjectId, ...) so a logging call never raises TypeError.
    print(json.dumps(log_entry, ensure_ascii=False, default=str))
# 사용 예시
log_structured("INFO", "Article processed",
job_id=job.job_id,
processing_time=processing_time,
people_count=len(enriched_people),
orgs_count=len(enriched_orgs)
)
Docker 로그
로그 확인
# 전체 로그
docker-compose logs -f
# 특정 서비스 로그
docker-compose logs -f news-wikipedia-enrichment
# 최근 100줄만
docker-compose logs --tail=100 news-article-generator
# 시간 범위 지정
docker-compose logs --since 2024-01-15T10:00:00 news-wikipedia-enrichment
로그 드라이버 설정
# docker-compose.yml
services:
news-wikipedia-enrichment:
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
Prometheus 설정
docker-compose.yml
services:
prometheus:
image: prom/prometheus:latest
container_name: {프로젝트}-prometheus
restart: unless-stopped
ports:
- "9090:9090"
volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- {프로젝트}_prometheus_data:/prometheus
networks:
- {프로젝트}-network
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.enable-lifecycle'
prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'fastapi-services'
static_configs:
- targets:
- 'base-auth:8000'
- 'base-image:8000'
- 'news-user-service:8000'
metrics_path: '/metrics'
- job_name: 'redis'
static_configs:
- targets: ['redis-exporter:9121']
- job_name: 'mongodb'
static_configs:
- targets: ['mongodb-exporter:9216']
FastAPI 메트릭 노출
from prometheus_client import Counter, Histogram, generate_latest
from fastapi import Response
# Metric definitions

# Counter of handled HTTP requests, labelled by method, endpoint and status.
REQUEST_COUNT = Counter(
    name='http_requests_total',
    documentation='Total HTTP requests',
    labelnames=['method', 'endpoint', 'status'],
)

# Histogram of request handling time in seconds, labelled by method and endpoint.
REQUEST_LATENCY = Histogram(
    name='http_request_duration_seconds',
    documentation='HTTP request latency',
    labelnames=['method', 'endpoint'],
)
@app.get("/metrics")
async def metrics():
    """Serve the current metrics in the Prometheus text exposition format.

    Returns:
        A ``Response`` whose body is the output of ``generate_latest()`` and
        whose content type is the one published by ``prometheus_client``.
    """
    # Local import: prometheus_client is already a dependency of this module.
    from prometheus_client import CONTENT_TYPE_LATEST

    # Use the library's own content type (it carries the exposition format
    # version and charset) instead of a bare "text/plain".
    return Response(
        content=generate_latest(),
        media_type=CONTENT_TYPE_LATEST,
    )
@app.middleware("http")
async def track_metrics(request: Request, call_next):
    """Record a request count and a latency observation for every request.

    Args:
        request: The incoming HTTP request.
        call_next: Callable that passes the request on and returns the response.

    Returns:
        The response produced by ``call_next``, unchanged.

    Raises:
        Whatever ``call_next`` raises; the request is still recorded, with
        status 500, before the exception propagates.
    """
    # perf_counter is monotonic, so a wall-clock adjustment cannot skew durations.
    start_time = time.perf_counter()
    # Assume failure until a response is actually produced.
    status_code = 500
    try:
        response = await call_next(request)
        status_code = response.status_code
        return response
    finally:
        duration = time.perf_counter() - start_time
        # Prefer the matched route template (e.g. "/jobs/{job_id}") so path
        # parameters do not create one label value per distinct URL.
        # NOTE(review): relies on the framework storing the matched route
        # under scope["route"]; falls back to the raw path when it is absent.
        route = request.scope.get("route")
        endpoint = getattr(route, "path", None) or request.url.path
        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=endpoint,
            status=status_code,
        ).inc()
        REQUEST_LATENCY.labels(
            method=request.method,
            endpoint=endpoint,
        ).observe(duration)
Grafana 설정
docker-compose.yml
services:
  grafana:
    image: grafana/grafana:latest
    container_name: {프로젝트}-grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - {프로젝트}_grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # Never commit the admin password: read it from the environment (.env
      # or shell). The ":?" form makes Compose fail fast when it is unset.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - {프로젝트}-network
데이터소스 프로비저닝
# grafana/provisioning/datasources/datasources.yml
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: false
대시보드 예시 (JSON)
{
"dashboard": {
"title": "News Pipeline Monitoring",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{endpoint}}"
}
]
},
{
"title": "Request Latency (p95)",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
"legendFormat": "{{endpoint}}"
}
]
}
]
}
}
헬스체크
FastAPI 헬스체크 엔드포인트
@app.get("/health")
async def health_check():
    """Health-check endpoint.

    Pings MongoDB and Redis in that order and reports each result under
    ``"checks"``. Responds 200 when both succeed and 503 when either fails.
    """
    # Taken before the probes run, so it marks when the check started.
    started_at = datetime.now().isoformat()

    # (name, zero-argument callable returning an awaitable), in report order.
    probes = (
        ("mongodb", lambda: db.command("ping")),
        ("redis", lambda: redis.ping()),
    )

    results = {}
    overall = "healthy"
    for name, probe in probes:
        try:
            await probe()
        except Exception as e:
            # One failing dependency marks the whole service unhealthy.
            results[name] = f"unhealthy: {e}"
            overall = "unhealthy"
        else:
            results[name] = "healthy"

    payload = {
        "status": overall,
        "timestamp": started_at,
        "checks": results,
    }
    if overall == "healthy":
        status_code = 200
    else:
        status_code = 503
    return JSONResponse(content=payload, status_code=status_code)
Docker 헬스체크
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
워커 하트비트
Redis 기반 하트비트
class QueueManager:
    """Worker liveness tracking through expiring Redis keys.

    Reads ``self.redis``, which this excerpt never assigns -- presumably an
    asyncio Redis client set up elsewhere (TODO confirm).
    """

    # Every heartbeat key is this prefix followed by the worker name.
    HEARTBEAT_KEY_PREFIX = "worker:heartbeat:"
    # The key expires after this many seconds unless it is refreshed.
    HEARTBEAT_TTL_SECONDS = 60
    # Delay between refreshes; shorter than the TTL so one missed beat is tolerated.
    HEARTBEAT_INTERVAL_SECONDS = 30

    async def start_heartbeat(self, worker_name: str):
        """Start a background task that keeps this worker's heartbeat key alive.

        Args:
            worker_name: Name appended to the key prefix to form the Redis key.
        """
        key = f"{self.HEARTBEAT_KEY_PREFIX}{worker_name}"

        async def heartbeat_loop():
            while True:
                try:
                    await self.redis.setex(
                        key,
                        self.HEARTBEAT_TTL_SECONDS,
                        datetime.now().isoformat(),
                    )
                except Exception as e:
                    logger.error(f"Heartbeat error: {e}")
                # Sleep after failures too, so a Redis outage does not turn
                # into a tight retry loop that floods the log.
                await asyncio.sleep(self.HEARTBEAT_INTERVAL_SECONDS)

        # The event loop only holds weak references to tasks, so keep a
        # strong one until the task finishes or it may be garbage collected.
        if not hasattr(self, "_heartbeat_tasks"):
            self._heartbeat_tasks = set()
        task = asyncio.create_task(heartbeat_loop())
        self._heartbeat_tasks.add(task)
        task.add_done_callback(self._heartbeat_tasks.discard)

    async def get_active_workers(self) -> List[str]:
        """Return the names of workers whose heartbeat key currently exists.

        Returns:
            Worker names, in the order the Redis client returned their keys.
        """
        prefix = self.HEARTBEAT_KEY_PREFIX
        # NOTE(review): KEYS walks the whole keyspace; consider SCAN if the
        # client in use provides it and the database is large.
        keys = await self.redis.keys(f"{prefix}*")
        workers = []
        for key in keys:
            # Clients return bytes or str depending on their decode settings.
            text = key.decode() if isinstance(key, bytes) else key
            # Strip the prefix rather than splitting on ":", so worker names
            # that contain a colon are returned whole.
            workers.append(text[len(prefix):])
        return workers
알림 설정 (Alertmanager)
alertmanager.yml
global:
slack_api_url: 'https://hooks.slack.com/services/xxx'
route:
receiver: 'slack-notifications'
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
receivers:
- name: 'slack-notifications'
slack_configs:
- channel: '#alerts'
send_resolved: true
title: '{{ .GroupLabels.alertname }}'
text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
알림 규칙
# prometheus/rules/alerts.yml
groups:
  - name: service-alerts
    rules:
      - alert: HighErrorRate
        # Fires per series when 5xx responses exceed 0.1 per second over 5m.
        # This is an absolute rate, not a share of total traffic.
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: "High error rate detected"
      - alert: WorkerDown
        # A target whose scrape fails still has an `up` series, with value 0.
        # absent() would only fire if the job had no `up` series at all.
        expr: up{job="fastapi-services"} == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "Worker service is down"