Initial commit: 프로젝트 초기 구성
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
361
.claude/skills/monitoring-logging.md
Normal file
361
.claude/skills/monitoring-logging.md
Normal file
@@ -0,0 +1,361 @@
|
||||
# 모니터링 및 로깅 (Monitoring & Logging)
|
||||
|
||||
이 프로젝트의 모니터링 및 로깅 패턴입니다.
|
||||
|
||||
## Python 로깅
|
||||
|
||||
### 기본 설정
|
||||
```python
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
```
|
||||
|
||||
### 로깅 패턴
|
||||
```python
|
||||
# Informational logs
# (plain string: an f-prefix without placeholders is a no-op and is flagged by linters)
logger.info("Starting Wikipedia Enrichment Worker")
logger.info(f"Processing job {job.job_id} for Wikipedia enrichment")
logger.info(f"Found {len(image_urls)} image(s) for '{name}' (logo preferred)")

# Warning logs (non-fatal errors)
logger.warning(f"Biocode registration failed (non-critical): {e}")
logger.warning(f"Failed to get logo for '{title}': {e}")

# Error logs
logger.error(f"Error processing job {job.job_id}: {e}")
logger.error("Claude API key not configured")

# Debug logs
logger.debug(f"Selected candidate '{candidate.get('title')}' with score: {best_score}")
|
||||
```
|
||||
|
||||
### 구조화된 로깅
|
||||
```python
|
||||
import json
|
||||
|
||||
def log_structured(level: str, message: str, **kwargs):
    """Emit one structured log record as a single JSON line on stdout.

    Args:
        level: Severity label stored under the ``"level"`` key (e.g. ``"INFO"``).
        message: Human-readable text stored under the ``"message"`` key.
        **kwargs: Extra context fields merged into the record. They are merged
            last, so a ``timestamp`` keyword replaces the generated one.
    """
    # Imported locally: the surrounding snippet only imports ``json``, so the
    # helper would otherwise fail with NameError on ``datetime``.
    from datetime import datetime

    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "level": level,
        "message": message,
        **kwargs,
    }
    # default=str is deliberate: a context value json cannot encode natively
    # (date, ObjectId, ...) is stringified instead of raising inside logging.
    print(json.dumps(log_entry, default=str))
|
||||
|
||||
# 사용 예시
|
||||
log_structured("INFO", "Article processed",
|
||||
job_id=job.job_id,
|
||||
processing_time=processing_time,
|
||||
people_count=len(enriched_people),
|
||||
orgs_count=len(enriched_orgs)
|
||||
)
|
||||
```
|
||||
|
||||
## Docker 로그
|
||||
|
||||
### 로그 확인
|
||||
```bash
|
||||
# 전체 로그
|
||||
docker-compose logs -f
|
||||
|
||||
# 특정 서비스 로그
|
||||
docker-compose logs -f news-wikipedia-enrichment
|
||||
|
||||
# 최근 100줄만
|
||||
docker-compose logs --tail=100 news-article-generator
|
||||
|
||||
# 시간 범위 지정
|
||||
docker-compose logs --since 2024-01-15T10:00:00 news-wikipedia-enrichment
|
||||
```
|
||||
|
||||
### 로그 드라이버 설정
|
||||
```yaml
|
||||
# docker-compose.yml
|
||||
services:
|
||||
news-wikipedia-enrichment:
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "10m"
|
||||
max-file: "3"
|
||||
```
|
||||
|
||||
## Prometheus 설정
|
||||
|
||||
### docker-compose.yml
|
||||
```yaml
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: {프로젝트}-prometheus
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "9090:9090"
|
||||
volumes:
|
||||
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- {프로젝트}_prometheus_data:/prometheus
|
||||
networks:
|
||||
- {프로젝트}-network
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--web.enable-lifecycle'
|
||||
```
|
||||
|
||||
### prometheus.yml
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
- job_name: 'fastapi-services'
|
||||
static_configs:
|
||||
- targets:
|
||||
- 'base-auth:8000'
|
||||
- 'base-image:8000'
|
||||
- 'news-user-service:8000'
|
||||
metrics_path: '/metrics'
|
||||
|
||||
- job_name: 'redis'
|
||||
static_configs:
|
||||
- targets: ['redis-exporter:9121']
|
||||
|
||||
- job_name: 'mongodb'
|
||||
static_configs:
|
||||
- targets: ['mongodb-exporter:9216']
|
||||
```
|
||||
|
||||
### FastAPI 메트릭 노출
|
||||
```python
|
||||
from prometheus_client import Counter, Histogram, generate_latest
|
||||
from fastapi import Response
|
||||
|
||||
# 메트릭 정의
|
||||
REQUEST_COUNT = Counter(
|
||||
'http_requests_total',
|
||||
'Total HTTP requests',
|
||||
['method', 'endpoint', 'status']
|
||||
)
|
||||
|
||||
REQUEST_LATENCY = Histogram(
|
||||
'http_request_duration_seconds',
|
||||
'HTTP request latency',
|
||||
['method', 'endpoint']
|
||||
)
|
||||
|
||||
@app.get("/metrics")
async def metrics():
    """Serve the registered Prometheus metrics in the text exposition format.

    Returns:
        A ``Response`` whose body is the output of ``generate_latest()`` and
        whose content type is the one published by ``prometheus_client``.
    """
    # Local import: the snippet's import line only brings in Counter,
    # Histogram and generate_latest.
    from prometheus_client import CONTENT_TYPE_LATEST

    # A bare "text/plain" drops the exposition-format version parameter that
    # the client library advertises for this payload.
    return Response(
        content=generate_latest(),
        media_type=CONTENT_TYPE_LATEST,
    )
|
||||
|
||||
@app.middleware("http")
async def track_metrics(request: Request, call_next):
    """Record a request count and a latency sample for every HTTP request.

    Args:
        request: Incoming request; its method and path become metric labels.
        call_next: Callable that forwards the request to the rest of the app.

    Returns:
        The response produced by ``call_next``, unchanged.

    Raises:
        Exception: Anything ``call_next`` raises is re-raised after the failed
            request has been recorded with status 500.
    """
    # Local import: ``time`` is not imported anywhere in the snippet.
    import time

    # perf_counter is monotonic, so the measured duration cannot go negative
    # or jump when the wall clock is adjusted.
    start_time = time.perf_counter()

    def record(status_code):
        duration = time.perf_counter() - start_time
        # Prefer the matched route template (e.g. "/users/{id}") so that path
        # parameters do not create one time series per concrete URL.
        # NOTE(review): assumes the router stores the matched route under
        # scope["route"]; when it is missing (e.g. unmatched paths) the raw
        # path is used, exactly as before.
        route = request.scope.get("route")
        endpoint = getattr(route, "path", request.url.path)

        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=endpoint,
            status=status_code,
        ).inc()
        REQUEST_LATENCY.labels(
            method=request.method,
            endpoint=endpoint,
        ).observe(duration)

    try:
        response = await call_next(request)
    except Exception:
        # Without this, requests that blow up downstream would never be
        # counted and the 5xx rate would under-report.
        record(500)
        raise

    record(response.status_code)
    return response
|
||||
```
|
||||
|
||||
## Grafana 설정
|
||||
|
||||
### docker-compose.yml
|
||||
```yaml
|
||||
services:
  grafana:
    image: grafana/grafana:latest
    container_name: {프로젝트}-grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - {프로젝트}_grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # Take the admin password from the shell environment or .env file rather
      # than committing a literal; compose aborts with this message when the
      # variable is unset or empty.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD}
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - {프로젝트}-network
|
||||
```
|
||||
|
||||
### 데이터소스 프로비저닝
|
||||
```yaml
|
||||
# grafana/provisioning/datasources/datasources.yml
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
```
|
||||
|
||||
### 대시보드 예시 (JSON)
|
||||
```json
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "News Pipeline Monitoring",
|
||||
"panels": [
|
||||
{
|
||||
"title": "Request Rate",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(http_requests_total[5m])",
|
||||
"legendFormat": "{{method}} {{endpoint}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Request Latency (p95)",
|
||||
"type": "graph",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "{{endpoint}}"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 헬스체크
|
||||
|
||||
### FastAPI 헬스체크 엔드포인트
|
||||
```python
|
||||
@app.get("/health")
async def health_check():
    """Report overall service health with a per-dependency breakdown.

    MongoDB and Redis are pinged in turn. The JSON body carries an overall
    ``status``, a ``timestamp`` and one entry per dependency under ``checks``.

    Returns:
        ``JSONResponse`` with HTTP 200 when every ping succeeded, 503 otherwise.
    """
    report = {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "checks": {},
    }

    # Each probe is wrapped in a lambda so that both the name lookup and the
    # call happen inside the try block below.
    probes = (
        ("mongodb", lambda: db.command("ping")),
        ("redis", lambda: redis.ping()),
    )
    for dependency, probe in probes:
        try:
            await probe()
        except Exception as exc:
            report["checks"][dependency] = f"unhealthy: {exc}"
            report["status"] = "unhealthy"
        else:
            report["checks"][dependency] = "healthy"

    http_status = 503 if report["status"] != "healthy" else 200
    return JSONResponse(content=report, status_code=http_status)
|
||||
```
|
||||
|
||||
### Docker 헬스체크
|
||||
```yaml
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
```
|
||||
|
||||
## 워커 하트비트
|
||||
|
||||
### Redis 기반 하트비트
|
||||
```python
|
||||
class QueueManager:
    """Worker liveness tracking backed by expiring Redis keys.

    NOTE(review): only the heartbeat methods are visible here; ``self.redis``
    is assumed to be an async Redis client assigned elsewhere - confirm.
    """

    async def start_heartbeat(self, worker_name: str):
        """Start a background task that keeps this worker's heartbeat key alive.

        The key ``worker:heartbeat:<worker_name>`` is written with a 60-second
        TTL and rewritten every 30 seconds with the current ISO timestamp.

        Args:
            worker_name: Identifier used as the suffix of the heartbeat key.

        Returns:
            The ``asyncio.Task`` running the loop. Callers may ignore it, or
            cancel it to stop the heartbeat.
        """
        async def heartbeat_loop():
            while True:
                try:
                    await self.redis.setex(
                        f"worker:heartbeat:{worker_name}",
                        60,  # 60-second TTL
                        datetime.now().isoformat(),
                    )
                except Exception as e:
                    logger.error(f"Heartbeat error: {e}")
                # Sleep outside the try: if Redis is failing, the next attempt
                # still waits 30 s instead of spinning and flooding the log.
                await asyncio.sleep(30)  # refresh every 30 seconds

        task = asyncio.create_task(heartbeat_loop())

        # The event loop holds only a weak reference to tasks; keep a strong
        # one so the heartbeat cannot be garbage-collected mid-flight.
        tasks = getattr(self, "_heartbeat_tasks", None)
        if tasks is None:
            tasks = self._heartbeat_tasks = set()
        tasks.add(task)
        task.add_done_callback(tasks.discard)
        return task

    async def get_active_workers(self) -> List[str]:
        """Return the names of workers that currently have a heartbeat key.

        Returns:
            Worker names, i.e. each matching key with the
            ``worker:heartbeat:`` prefix removed.
        """
        prefix = "worker:heartbeat:"
        keys = await self.redis.keys(f"{prefix}*")

        workers = []
        for key in keys:
            # Clients return bytes or str depending on their decode settings.
            text = key.decode() if isinstance(key, bytes) else key
            # Strip the known prefix rather than splitting on ":" so worker
            # names that themselves contain ":" come back intact.
            workers.append(text[len(prefix):])
        return workers
|
||||
```
|
||||
|
||||
## 알림 설정 (Alertmanager)
|
||||
|
||||
### alertmanager.yml
|
||||
```yaml
|
||||
global:
|
||||
slack_api_url: 'https://hooks.slack.com/services/xxx'
|
||||
|
||||
route:
|
||||
receiver: 'slack-notifications'
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
|
||||
receivers:
|
||||
- name: 'slack-notifications'
|
||||
slack_configs:
|
||||
- channel: '#alerts'
|
||||
send_resolved: true
|
||||
title: '{{ .GroupLabels.alertname }}'
|
||||
text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
|
||||
```
|
||||
|
||||
### 알림 규칙
|
||||
```yaml
|
||||
# prometheus/rules/alerts.yml
groups:
  - name: service-alerts
    rules:
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: "High error rate detected"

      # A target whose scrape fails reports up == 0; absent() on its own only
      # fires when no "up" series exists for the job at all, so both cases
      # are covered here.
      - alert: WorkerDown
        expr: up{job="fastapi-services"} == 0 or absent(up{job="fastapi-services"})
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "Worker service is down"
|
||||
```
|
||||
Reference in New Issue
Block a user