Files
todos/.claude/skills/monitoring-logging.md
jungwoo choi 993ef9640e Initial commit: 프로젝트 초기 구성
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 06:00:38 +09:00

8.2 KiB

모니터링 및 로깅 (Monitoring & Logging)

이 프로젝트의 모니터링 및 로깅 패턴입니다.

Python 로깅

기본 설정

import logging

# Module-level logger named after the importing module.
logger = logging.getLogger(__name__)

# Configure the root logger so records at INFO and above are emitted.
logging.basicConfig(level=logging.INFO)

로깅 패턴

# Pass values as %-style arguments instead of pre-formatting with f-strings:
# the message is only built when the record is actually emitted, and the
# constant template lets log aggregators group identical events.

# Informational logs
logger.info("Starting Wikipedia Enrichment Worker")
logger.info("Processing job %s for Wikipedia enrichment", job.job_id)
logger.info("Found %d image(s) for '%s' (logo preferred)", len(image_urls), name)

# Warning logs (non-fatal errors)
logger.warning("Biocode registration failed (non-critical): %s", e)
logger.warning("Failed to get logo for '%s': %s", title, e)

# Error logs
logger.error("Error processing job %s: %s", job.job_id, e)
logger.error("Claude API key not configured")

# Debug logs
logger.debug(
    "Selected candidate '%s' with score: %s", candidate.get("title"), best_score
)

구조화된 로깅

import json
from datetime import datetime


def log_structured(level: str, message: str, **kwargs):
    """Print one log record to stdout as a single line of JSON.

    Args:
        level: Severity label stored verbatim under the "level" key.
        message: Human-readable text stored under the "message" key.
        **kwargs: Extra fields merged into the record. They are merged last,
            so a field named "timestamp" replaces the generated one.
    """
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "level": level,
        "message": message,
        **kwargs,
    }
    # ensure_ascii=False keeps non-ASCII text (e.g. Korean messages) readable
    # instead of \uXXXX escapes. default=str is deliberate: a logging helper
    # must not raise because a field (datetime, ObjectId, ...) is not JSON
    # serializable, so such values are written as their string form.
    print(json.dumps(log_entry, ensure_ascii=False, default=str))

# Usage example: collect the per-job fields first, then emit them as one record.
article_fields = {
    "job_id": job.job_id,
    "processing_time": processing_time,
    "people_count": len(enriched_people),
    "orgs_count": len(enriched_orgs),
}
log_structured("INFO", "Article processed", **article_fields)

Docker 로그

로그 확인

# Follow the logs of every service
docker-compose logs -f

# Follow the logs of a single service
docker-compose logs -f news-wikipedia-enrichment

# Show only the last 100 lines
docker-compose logs --tail=100 news-article-generator

# Show logs starting from a given timestamp
docker-compose logs --since 2024-01-15T10:00:00 news-wikipedia-enrichment

로그 드라이버 설정

# docker-compose.yml
# Per-service log settings for the json-file driver.
services:
  news-wikipedia-enrichment:
    logging:
      driver: "json-file"
      options:
        # Size cap for a single log file.
        max-size: "10m"
        # Number of log files kept for the container.
        max-file: "3"

Prometheus 설정

docker-compose.yml

# `{프로젝트}` is a placeholder for the project name; substitute it before use.
services:
  prometheus:
    # NOTE(review): `latest` is a moving tag; pin a version for reproducible deploys.
    image: prom/prometheus:latest
    container_name: {프로젝트}-prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      # Configuration and alert rules are mounted read-only; the rules directory
      # must be visible inside the container for any rule file to be loaded.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/rules:/etc/prometheus/rules:ro
      # Named volume holding the time-series database.
      - {프로젝트}_prometheus_data:/prometheus
    networks:
      - {프로젝트}-network
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Enables the HTTP reload/quit endpoints; do not expose port 9090 to
      # untrusted networks while this flag is set.
      - '--web.enable-lifecycle'

prometheus.yml

global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often alerting rules are evaluated

# Alert rule files. Without this section no alert rule is ever evaluated.
# The directory must be mounted into the Prometheus container at this path.
rule_files:
  - /etc/prometheus/rules/*.yml

# Where firing alerts are sent.
# NOTE(review): assumes an Alertmanager container reachable as `alertmanager`
# on the same network, listening on its default port 9093 — confirm.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

scrape_configs:
  # Prometheus scraping itself.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Application services exposing /metrics.
  - job_name: 'fastapi-services'
    static_configs:
      - targets:
        - 'base-auth:8000'
        - 'base-image:8000'
        - 'news-user-service:8000'
    metrics_path: '/metrics'

  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']

  - job_name: 'mongodb'
    static_configs:
      - targets: ['mongodb-exporter:9216']

FastAPI 메트릭 노출

import time

from fastapi import Request, Response
from prometheus_client import CONTENT_TYPE_LATEST, Counter, Histogram, generate_latest

# Metric definitions
# Request counter, labelled by HTTP method, endpoint and response status.
REQUEST_COUNT = Counter(
    name='http_requests_total',
    documentation='Total HTTP requests',
    labelnames=['method', 'endpoint', 'status'],
)

# Request duration histogram, labelled by HTTP method and endpoint.
REQUEST_LATENCY = Histogram(
    name='http_request_duration_seconds',
    documentation='HTTP request latency',
    labelnames=['method', 'endpoint'],
)

@app.get("/metrics")
async def metrics():
    """Expose the collected metrics in the Prometheus text exposition format.

    Returns:
        A Response whose body is the output of ``generate_latest()``.
    """
    return Response(
        content=generate_latest(),
        # Use the content type published by the client library (it carries the
        # exposition format version and charset) rather than bare "text/plain".
        media_type=CONTENT_TYPE_LATEST,
    )

@app.middleware("http")
async def track_metrics(request: Request, call_next):
    """Record a request count and a latency observation for every HTTP request.

    Args:
        request: The incoming request.
        call_next: Callable that forwards the request to the rest of the app.

    Returns:
        The response produced by ``call_next``, unchanged.
    """
    # perf_counter is monotonic, so system clock adjustments cannot produce
    # negative or inflated durations.
    start_time = time.perf_counter()
    # Assume failure until a response exists: if call_next raises, the request
    # is still recorded (as a 500) instead of vanishing from the metrics.
    status_code = 500
    try:
        response = await call_next(request)
        status_code = response.status_code
        return response
    finally:
        duration = time.perf_counter() - start_time

        # Prefer the matched route template (e.g. "/items/{item_id}") over the
        # raw path so path parameters do not create one time series per ID.
        # NOTE(review): relies on the router storing the matched route under
        # scope["route"]; confirm for the framework version in use. When it is
        # missing (e.g. a 404) the raw path is used, as before.
        route = request.scope.get("route")
        endpoint = getattr(route, "path", None) or request.url.path

        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=endpoint,
            status=status_code
        ).inc()

        REQUEST_LATENCY.labels(
            method=request.method,
            endpoint=endpoint
        ).observe(duration)

Grafana 설정

docker-compose.yml

# `{프로젝트}` is a placeholder for the project name; substitute it before use.
services:
  grafana:
    # NOTE(review): `latest` is a moving tag; pin a version for reproducible deploys.
    image: grafana/grafana:latest
    container_name: {프로젝트}-grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - {프로젝트}_grafana_data:/var/lib/grafana
      # Provisioning files are only read by Grafana, so mount them read-only.
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      # Never commit the admin password. Compose reads it from the environment
      # (or a .env file) and refuses to start if it is unset or empty.
      - "GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - {프로젝트}-network

데이터소스 프로비저닝

# grafana/provisioning/datasources/datasources.yml
# Registers Prometheus as a provisioned Grafana data source.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    # Host name and port match the Prometheus compose service.
    url: http://prometheus:9090
    isDefault: true
    editable: false

대시보드 예시 (JSON)

{
  "dashboard": {
    "title": "News Pipeline Monitoring",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum by (method, endpoint) (rate(http_requests_total[5m]))",
            "legendFormat": "{{method}} {{endpoint}}"
          }
        ]
      },
      {
        "title": "Request Latency (p95)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m])))",
            "legendFormat": "{{endpoint}}"
          }
        ]
      }
    ]
  }
}

헬스체크

FastAPI 헬스체크 엔드포인트

@app.get("/health")
async def health_check():
    """Report service health based on MongoDB and Redis reachability.

    Each dependency is pinged with a bounded wait so that a hung dependency
    cannot make the endpoint itself hang.

    Returns:
        JSONResponse with keys "status", "timestamp" and "checks"; HTTP 200
        when every check is healthy, 503 otherwise.
    """
    import asyncio  # local import keeps this snippet self-contained

    # Upper bound per dependency. Two sequential probes stay below the 10s
    # timeout used by the container health check in this guide.
    probe_timeout = 3.0

    # Dependency name -> callable returning the awaitable ping.
    probes = {
        "mongodb": lambda: db.command("ping"),
        "redis": lambda: redis.ping(),
    }

    checks = {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "checks": {}
    }

    for name, probe in probes.items():
        try:
            await asyncio.wait_for(probe(), timeout=probe_timeout)
            checks["checks"][name] = "healthy"
        except Exception as e:
            # Any failure marks the service unhealthy. A timeout has an empty
            # message, so fall back to the exception class name.
            checks["checks"][name] = f"unhealthy: {str(e) or type(e).__name__}"
            checks["status"] = "unhealthy"

    status_code = 200 if checks["status"] == "healthy" else 503
    return JSONResponse(content=checks, status_code=status_code)

Docker 헬스체크

# Container health probe: HTTP GET against the service's /health endpoint.
# NOTE(review): assumes curl is installed in the image — confirm.
healthcheck:
  # curl -f exits non-zero on HTTP error responses, failing the probe.
  test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
  # Time between probes.
  interval: 30s
  # A probe that runs longer than this counts as failed.
  timeout: 10s
  # Consecutive failures before the container is marked unhealthy.
  retries: 3
  # Startup grace period during which failures do not count toward retries.
  start_period: 40s

워커 하트비트

Redis 기반 하트비트

class QueueManager:
    """Worker liveness tracking backed by expiring Redis keys.

    Each worker periodically rewrites ``worker:heartbeat:<name>`` with a TTL;
    a worker counts as active while its key exists.
    """

    # Key prefix shared by the writer and the reader.
    HEARTBEAT_PREFIX = "worker:heartbeat:"
    # The key expires after this many seconds without a refresh.
    HEARTBEAT_TTL_SECONDS = 60
    # Refresh period; half the TTL, so one missed refresh does not expire the key.
    HEARTBEAT_INTERVAL_SECONDS = 30

    async def start_heartbeat(self, worker_name: str):
        """Start a background task that keeps the worker's heartbeat key alive.

        Args:
            worker_name: Name appended to the heartbeat key prefix.
        """
        async def heartbeat_loop():
            while True:
                try:
                    await self.redis.setex(
                        f"{self.HEARTBEAT_PREFIX}{worker_name}",
                        self.HEARTBEAT_TTL_SECONDS,
                        datetime.now().isoformat(),
                    )
                except Exception as e:
                    # Best effort: report and retry on the next tick.
                    logger.error(f"Heartbeat error: {e}")
                # Sleep outside the try block so a failing Redis call is retried
                # once per interval instead of in a tight loop.
                await asyncio.sleep(self.HEARTBEAT_INTERVAL_SECONDS)

        task = asyncio.create_task(heartbeat_loop())
        # The event loop only keeps weak references to tasks; hold a strong one
        # so the heartbeat cannot be garbage-collected while running.
        tasks = getattr(self, "_heartbeat_tasks", None)
        if tasks is None:
            tasks = self._heartbeat_tasks = set()
        tasks.add(task)
        task.add_done_callback(tasks.discard)

    async def get_active_workers(self) -> List[str]:
        """Return the sorted names of workers whose heartbeat key exists."""
        prefix = self.HEARTBEAT_PREFIX
        names = set()
        # SCAN iterates incrementally; KEYS blocks Redis while it walks the
        # whole keyspace. SCAN may repeat a key, hence the set.
        async for key in self.redis.scan_iter(match=f"{prefix}*"):
            # Keys are bytes unless the client decodes responses itself.
            if isinstance(key, bytes):
                key = key.decode()
            # Strip the prefix rather than splitting on ":" so worker names
            # that contain ":" are returned intact.
            names.add(key[len(prefix):])
        return sorted(names)

알림 설정 (Alertmanager)

alertmanager.yml

global:
  # Placeholder webhook URL; replace with the real Slack incoming-webhook URL.
  # NOTE(review): the real URL is a secret — keep it out of version control.
  slack_api_url: 'https://hooks.slack.com/services/xxx'

route:
  # Every alert goes to the single Slack receiver below.
  receiver: 'slack-notifications'
  # Wait before sending the first notification for a new alert group.
  group_wait: 30s
  # Wait before notifying about new alerts added to an existing group.
  group_interval: 5m
  # Wait before re-sending a notification for alerts that are still firing.
  repeat_interval: 4h

receivers:
  - name: 'slack-notifications'
    slack_configs:
      - channel: '#alerts'
        # Also notify when an alert resolves.
        send_resolved: true
        title: '{{ .GroupLabels.alertname }}'
        # Concatenates the `description` annotation of every alert in the group.
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

알림 규칙

# prometheus/rules/alerts.yml
groups:
  - name: service-alerts
    rules:
      # Fires when any single series of 5xx responses exceeds 0.1 requests/s
      # (5-minute rate) continuously for 5 minutes.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: "High error rate detected"

      # Fires per target when a scrape of a fastapi-services instance fails.
      # A failed scrape sets `up` to 0 but the series still exists, so
      # absent(up{...}) would never fire for a down instance; compare to 0.
      - alert: WorkerDown
        expr: up{job="fastapi-services"} == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "Worker service is down"