# 모니터링 및 로깅 (Monitoring & Logging)

이 프로젝트의 모니터링 및 로깅 패턴입니다.

## Python 로깅

### 기본 설정

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
```

### 로깅 패턴

```python
# 정보성 로그
logger.info(f"Starting Wikipedia Enrichment Worker")
logger.info(f"Processing job {job.job_id} for Wikipedia enrichment")
logger.info(f"Found {len(image_urls)} image(s) for '{name}' (logo preferred)")

# 경고 로그 (비치명적 오류)
logger.warning(f"Biocode registration failed (non-critical): {e}")
logger.warning(f"Failed to get logo for '{title}': {e}")

# 에러 로그
logger.error(f"Error processing job {job.job_id}: {e}")
logger.error(f"Claude API key not configured")

# 디버그 로그
logger.debug(f"Selected candidate '{candidate.get('title')}' with score: {best_score}")
```

### 구조화된 로깅

```python
import json

def log_structured(level: str, message: str, **kwargs):
    """구조화된 JSON 로깅"""
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "level": level,
        "message": message,
        **kwargs
    }
    print(json.dumps(log_entry))

# 사용 예시
log_structured("INFO", "Article processed",
    job_id=job.job_id,
    processing_time=processing_time,
    people_count=len(enriched_people),
    orgs_count=len(enriched_orgs)
)
```

## Docker 로그

### 로그 확인

```bash
# 전체 로그
docker-compose logs -f

# 특정 서비스 로그
docker-compose logs -f news-wikipedia-enrichment

# 최근 100줄만
docker-compose logs --tail=100 news-article-generator

# 시간 범위 지정
docker-compose logs --since 2024-01-15T10:00:00 news-wikipedia-enrichment
```

### 로그 드라이버 설정

```yaml
# docker-compose.yml
services:
  news-wikipedia-enrichment:
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
```

## Prometheus 설정

### docker-compose.yml

```yaml
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: {프로젝트}-prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - {프로젝트}_prometheus_data:/prometheus
    networks:
      - {프로젝트}-network
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.enable-lifecycle'
```

### prometheus.yml

```yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'fastapi-services'
    static_configs:
      - targets:
          - 'base-auth:8000'
          - 'base-image:8000'
          - 'news-user-service:8000'
    metrics_path: '/metrics'

  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']

  - job_name: 'mongodb'
    static_configs:
      - targets: ['mongodb-exporter:9216']
```

### FastAPI 메트릭 노출

```python
from prometheus_client import Counter, Histogram, generate_latest
from fastapi import Response

# 메트릭 정의
REQUEST_COUNT = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status']
)

REQUEST_LATENCY = Histogram(
    'http_request_duration_seconds',
    'HTTP request latency',
    ['method', 'endpoint']
)

@app.get("/metrics")
async def metrics():
    return Response(
        content=generate_latest(),
        media_type="text/plain"
    )

@app.middleware("http")
async def track_metrics(request: Request, call_next):
    start_time = time.time()
    response = await call_next(request)
    duration = time.time() - start_time

    REQUEST_COUNT.labels(
        method=request.method,
        endpoint=request.url.path,
        status=response.status_code
    ).inc()

    REQUEST_LATENCY.labels(
        method=request.method,
        endpoint=request.url.path
    ).observe(duration)

    return response
```

## Grafana 설정

### docker-compose.yml

```yaml
services:
  grafana:
    image: grafana/grafana:latest
    container_name: {프로젝트}-grafana
    restart: unless-stopped
    ports:
      - "3000:3000"
    volumes:
      - {프로젝트}_grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin123
      - GF_USERS_ALLOW_SIGN_UP=false
    networks:
      - {프로젝트}-network
```

### 데이터소스 프로비저닝

```yaml
# grafana/provisioning/datasources/datasources.yml
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
```

### 대시보드 예시 (JSON)

```json
{
  "dashboard": {
    "title": "News Pipeline Monitoring",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "{{method}} {{endpoint}}"
          }
        ]
      },
      {
        "title": "Request Latency (p95)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "{{endpoint}}"
          }
        ]
      }
    ]
  }
}
```

## 헬스체크

### FastAPI 헬스체크 엔드포인트

```python
@app.get("/health")
async def health_check():
    """헬스체크 엔드포인트"""
    checks = {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "checks": {}
    }

    # MongoDB 체크
    try:
        await db.command("ping")
        checks["checks"]["mongodb"] = "healthy"
    except Exception as e:
        checks["checks"]["mongodb"] = f"unhealthy: {e}"
        checks["status"] = "unhealthy"

    # Redis 체크
    try:
        await redis.ping()
        checks["checks"]["redis"] = "healthy"
    except Exception as e:
        checks["checks"]["redis"] = f"unhealthy: {e}"
        checks["status"] = "unhealthy"

    status_code = 200 if checks["status"] == "healthy" else 503
    return JSONResponse(content=checks, status_code=status_code)
```

### Docker 헬스체크

```yaml
healthcheck:
  test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
  interval: 30s
  timeout: 10s
  retries: 3
  start_period: 40s
```

## 워커 하트비트

### Redis 기반 하트비트

```python
class QueueManager:
    async def start_heartbeat(self, worker_name: str):
        """워커 하트비트 시작"""
        async def heartbeat_loop():
            while True:
                try:
                    await self.redis.setex(
                        f"worker:heartbeat:{worker_name}",
                        60,  # 60초 TTL
                        datetime.now().isoformat()
                    )
                    await asyncio.sleep(30)  # 30초마다 갱신
                except Exception as e:
                    logger.error(f"Heartbeat error: {e}")

        asyncio.create_task(heartbeat_loop())

    async def get_active_workers(self) -> List[str]:
        """활성 워커 목록 조회"""
        keys = await self.redis.keys("worker:heartbeat:*")
        return [key.decode().split(":")[-1] for key in keys]
```

## 알림 설정 (Alertmanager)

### alertmanager.yml

```yaml
global:
  slack_api_url: 'https://hooks.slack.com/services/xxx'

route:
  receiver: 'slack-notifications'
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

receivers:
  - name: 'slack-notifications'
    slack_configs:
      - channel: '#alerts'
        send_resolved: true
        title: '{{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
```

### 알림 규칙

```yaml
# prometheus/rules/alerts.yml
groups:
  - name: service-alerts
    rules:
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: "High error rate detected"

      - alert: WorkerDown
        expr: absent(up{job="fastapi-services"})
        for: 1m
        labels:
          severity: warning
        annotations:
          description: "Worker service is down"
```