# 모니터링 및 로깅 (Monitoring & Logging)
|
|
|
|
이 프로젝트의 모니터링 및 로깅 패턴입니다.
|
|
|
|
## Python 로깅
|
|
|
|
### 기본 설정
|
|
```python
|
|
import logging
|
|
|
|
# Configure the root logger once at start-up; INFO and above are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after the module that defines it.
logger = logging.getLogger(__name__)
|
|
```
|
|
|
|
### 로깅 패턴
|
|
```python
|
|
# Informational logs
# (plain string: an f-prefix without placeholders is just noise)
logger.info("Starting Wikipedia Enrichment Worker")
logger.info(f"Processing job {job.job_id} for Wikipedia enrichment")
logger.info(f"Found {len(image_urls)} image(s) for '{name}' (logo preferred)")

# Warning logs (non-fatal errors)
logger.warning(f"Biocode registration failed (non-critical): {e}")
logger.warning(f"Failed to get logo for '{title}': {e}")

# Error logs
logger.error(f"Error processing job {job.job_id}: {e}")
logger.error("Claude API key not configured")

# Debug logs
logger.debug(f"Selected candidate '{candidate.get('title')}' with score: {best_score}")
|
|
```
|
|
|
|
### 구조화된 로깅
|
|
```python
|
|
import json
|
|
|
|
def log_structured(level: str, message: str, **kwargs):
    """Print one log record to stdout as a single-line JSON object.

    Args:
        level: Severity label, stored verbatim under the "level" key.
        message: Log text, stored under the "message" key.
        **kwargs: Extra fields merged into the record. They are merged last,
            so a ``timestamp`` keyword replaces the generated timestamp.
    """
    # Imported here so the function works wherever it is pasted.
    from datetime import datetime

    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "level": level,
        "message": message,
        **kwargs,
    }
    # ensure_ascii=False keeps non-ASCII (e.g. Korean) text readable in the
    # log stream. default=str is a deliberate fallback: a log call must not
    # raise just because a context value (datetime, ObjectId, ...) is not
    # JSON-native, so such values are written as their str() form.
    print(json.dumps(log_entry, ensure_ascii=False, default=str))
|
|
|
|
# Usage example: context values are passed as keyword arguments and become
# top-level keys of the JSON record.
log_structured(
    "INFO",
    "Article processed",
    job_id=job.job_id,
    processing_time=processing_time,
    people_count=len(enriched_people),
    orgs_count=len(enriched_orgs),
)
|
|
```
|
|
|
|
## Docker 로그
|
|
|
|
### 로그 확인
|
|
```bash
|
|
# 전체 로그
docker-compose logs -f

# 특정 서비스 로그
docker-compose logs -f news-wikipedia-enrichment

# 최근 100줄만
docker-compose logs --tail=100 news-article-generator

# 시간 범위 지정
docker-compose logs --since 2024-01-15T10:00:00 news-wikipedia-enrichment
|
|
```
|
|
|
|
### 로그 드라이버 설정
|
|
```yaml
|
|
# docker-compose.yml
services:
  news-wikipedia-enrichment:
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
|
|
```
|
|
|
|
## Prometheus 설정
|
|
|
|
### docker-compose.yml
|
|
```yaml
|
|
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: {프로젝트}-prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - {프로젝트}_prometheus_data:/prometheus
    networks:
      - {프로젝트}-network
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.enable-lifecycle'
|
|
```
|
|
|
|
### prometheus.yml
|
|
```yaml
|
|
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'fastapi-services'
    static_configs:
      - targets:
          - 'base-auth:8000'
          - 'base-image:8000'
          - 'news-user-service:8000'
    metrics_path: '/metrics'

  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']

  - job_name: 'mongodb'
    static_configs:
      - targets: ['mongodb-exporter:9216']
|
|
```
|
|
|
|
### FastAPI 메트릭 노출
|
|
```python
|
|
import time

from fastapi import Request, Response
from prometheus_client import CONTENT_TYPE_LATEST, Counter, Histogram, generate_latest

# NOTE: `app` is the service's FastAPI instance and must already exist where
# this snippet is pasted; it is not created here.

# Metric definitions
REQUEST_COUNT = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status'],
)

REQUEST_LATENCY = Histogram(
    'http_request_duration_seconds',
    'HTTP request latency',
    ['method', 'endpoint'],
)


@app.get("/metrics")
async def metrics():
    """Serve the current metrics in the Prometheus text exposition format."""
    # CONTENT_TYPE_LATEST carries the exposition-format version and charset,
    # which a bare "text/plain" omits.
    return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST)


@app.middleware("http")
async def track_metrics(request: Request, call_next):
    """Count each handled request and record how long it took.

    Metrics are recorded only when ``call_next`` returns a response; if it
    raises, the exception propagates and nothing is recorded.
    """
    # perf_counter is monotonic, so the measured duration is not affected by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    response = await call_next(request)
    duration = time.perf_counter() - start_time

    # NOTE(review): request.url.path is the raw path, so routes with
    # identifiers in the path produce one time series per distinct value.
    # Consider labelling with the route template if cardinality grows.
    REQUEST_COUNT.labels(
        method=request.method,
        endpoint=request.url.path,
        status=response.status_code,
    ).inc()

    REQUEST_LATENCY.labels(
        method=request.method,
        endpoint=request.url.path,
    ).observe(duration)

    return response
|
|
```
|
|
|
|
## Grafana 설정
|
|
|
|
### docker-compose.yml
|
|
```yaml
|
|
services:
  grafana:
    image: grafana/grafana:latest
    container_name: {프로젝트}-grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - {프로젝트}_grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # Read from the shell environment or an untracked .env file; compose
      # refuses to start if it is missing. Never commit a real password.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - {프로젝트}-network
|
|
```
|
|
|
|
### 데이터소스 프로비저닝
|
|
```yaml
|
|
# grafana/provisioning/datasources/datasources.yml
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
|
|
```
|
|
|
|
### 대시보드 예시 (JSON)
|
|
```json
|
|
{
  "dashboard": {
    "title": "News Pipeline Monitoring",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "{{method}} {{endpoint}}"
          }
        ]
      },
      {
        "title": "Request Latency (p95)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "{{endpoint}}"
          }
        ]
      }
    ]
  }
}
|
|
```
|
|
|
|
## 헬스체크
|
|
|
|
### FastAPI 헬스체크 엔드포인트
|
|
```python
|
|
@app.get("/health")
async def health_check():
    """Report overall service health and the state of each dependency.

    Pings MongoDB and then Redis. Responds with HTTP 200 when both succeed;
    otherwise responds with HTTP 503 and records the failure text under the
    failing dependency's name.
    """
    checks = {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "checks": {},
    }

    # Each probe is wrapped in a lambda so that even looking up the client
    # happens inside the try block below. Order here is the order reported.
    probes = (
        ("mongodb", lambda: db.command("ping")),
        ("redis", lambda: redis.ping()),
    )
    for name, probe in probes:
        try:
            await probe()
            checks["checks"][name] = "healthy"
        except Exception as e:
            # One failing dependency marks the whole service unhealthy, but
            # the remaining probes still run so the report is complete.
            checks["checks"][name] = f"unhealthy: {e}"
            checks["status"] = "unhealthy"

    status_code = 200 if checks["status"] == "healthy" else 503
    return JSONResponse(content=checks, status_code=status_code)
|
|
```
|
|
|
|
### Docker 헬스체크
|
|
```yaml
|
|
healthcheck:
  test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
  interval: 30s
  timeout: 10s
  retries: 3
  start_period: 40s
|
|
```
|
|
|
|
## 워커 하트비트
|
|
|
|
### Redis 기반 하트비트
|
|
```python
|
|
class QueueManager:
    """Worker liveness tracking on top of Redis keys that expire.

    Expects ``self.redis`` to be an async Redis client providing ``setex``
    and ``keys``; it is assigned outside the code shown here.
    """

    # Key namespace shared by the heartbeat writer and the reader.
    _HEARTBEAT_PREFIX = "worker:heartbeat:"

    async def start_heartbeat(self, worker_name: str, *, interval: float = 30, ttl: int = 60):
        """Start a background task that keeps this worker's heartbeat key alive.

        Args:
            worker_name: Name appended to the heartbeat key prefix.
            interval: Seconds between refreshes (default 30).
            ttl: Expiry of the key in seconds (default 60). Keep it larger
                than ``interval`` so the key survives between refreshes.
        """
        key = f"{self._HEARTBEAT_PREFIX}{worker_name}"

        async def heartbeat_loop():
            while True:
                try:
                    await self.redis.setex(key, ttl, datetime.now().isoformat())
                except Exception as e:
                    logger.error(f"Heartbeat error: {e}")
                # Sleep outside the try block: when Redis is failing the loop
                # retries at the normal cadence instead of spinning and
                # flooding the log.
                await asyncio.sleep(interval)

        # The event loop keeps only weak references to tasks, so hold a strong
        # one here or the heartbeat may be garbage-collected mid-run.
        task = asyncio.create_task(heartbeat_loop())
        tasks = getattr(self, "_heartbeat_tasks", None)
        if tasks is None:
            tasks = self._heartbeat_tasks = set()
        tasks.add(task)
        task.add_done_callback(tasks.discard)

    async def get_active_workers(self) -> List[str]:
        """Return the names of workers whose heartbeat key currently exists."""
        prefix = self._HEARTBEAT_PREFIX
        # NOTE(review): KEYS scans the whole keyspace; consider an incremental
        # SCAN if the Redis instance holds many keys.
        keys = await self.redis.keys(f"{prefix}*")

        workers = []
        for key in keys:
            # Keys are bytes unless the client decodes responses itself.
            text = key.decode() if isinstance(key, bytes) else key
            # Strip the known prefix rather than splitting on ":" so that a
            # worker name containing ":" is returned intact.
            workers.append(text[len(prefix):])
        return workers
|
|
```
|
|
|
|
## 알림 설정 (Alertmanager)
|
|
|
|
### alertmanager.yml
|
|
```yaml
|
|
global:
  slack_api_url: 'https://hooks.slack.com/services/xxx'

route:
  receiver: 'slack-notifications'
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

receivers:
  - name: 'slack-notifications'
    slack_configs:
      - channel: '#alerts'
        send_resolved: true
        title: '{{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
|
|
```
|
|
|
|
### 알림 규칙
|
|
```yaml
|
|
# prometheus/rules/alerts.yml
groups:
  - name: service-alerts
    rules:
      - alert: HighErrorRate
        # Per-series rate of 5xx responses, in requests per second.
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: "High error rate detected"

      - alert: WorkerDown
        # `up == 0` fires when a configured target fails its scrape.
        # `absent(...)` alone only fires when the job has no `up` series at
        # all, so it is kept as a second condition, not the only one.
        expr: up{job="fastapi-services"} == 0 or absent(up{job="fastapi-services"})
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "Worker service is down"
|
|
```
|