"""
Inspection orchestration service.

Manages the full inspection lifecycle:
- URL validation and fetching
- Parallel execution of 4 checker engines
- Progress tracking via Redis
- Result aggregation and storage in MongoDB
"""

import asyncio
import json
import logging
import re
import time
import uuid
from datetime import datetime, timezone
from typing import Awaitable, Callable, Optional

import httpx
from motor.motor_asyncio import AsyncIOMotorDatabase
from redis.asyncio import Redis

from app.core.config import get_settings
from app.core.redis import (
    set_inspection_status,
    update_category_progress,
    publish_event,
    cache_result,
)
from app.engines.html_css import HtmlCssChecker
from app.engines.accessibility import AccessibilityChecker
from app.engines.seo import SeoChecker
from app.engines.performance_security import PerformanceSecurityChecker
from app.models.schemas import (
    CategoryResult,
    InspectionResult,
    IssueSummary,
    Severity,
    calculate_grade,
    calculate_overall_score,
)

logger = logging.getLogger(__name__)

# Async callback signature used by the checker engines:
# callback(category, progress, current_step).
ProgressCallback = Callable[[str, int, str], Awaitable[None]]

# Category keys, in the same order as the checkers built by _create_checkers().
_CATEGORY_NAMES = ("html_css", "accessibility", "seo", "performance_security")

# Maximum number of stored inspection records per URL.
_MAX_HISTORY_PER_URL = 100

# Strong references to running background inspection tasks. The event loop
# only keeps weak references to tasks, so a task whose handle is dropped can
# be garbage-collected before it finishes. The set lives at module level so
# it outlives short-lived service instances.
_background_tasks: set = set()


async def _noop_progress(category: str, progress: int, current_step: str) -> None:
    """Progress callback that ignores every update."""
    return None


class InspectionService:
    """Inspection orchestration service."""

    def __init__(self, db: AsyncIOMotorDatabase, redis: Redis):
        self.db = db
        self.redis = redis

    async def start_inspection(
        self,
        url: str,
        accessibility_standard: str = "wcag_2.1_aa",
    ) -> str:
        """
        Start an inspection and return the inspection_id.

        1. Validate URL accessibility (timeout from settings.URL_FETCH_TIMEOUT)
        2. Generate inspection_id (UUID v4)
        3. Initialize progress state in Redis
        4. Launch background inspection task

        Raises:
            httpx.HTTPError: If the URL cannot be fetched or returns an
                error status (propagated from _fetch_url).
        """
        settings = get_settings()

        # 1. Fetch URL to verify accessibility
        response = await self._fetch_url(url, timeout=settings.URL_FETCH_TIMEOUT)

        # 2. Generate inspection_id
        inspection_id = str(uuid.uuid4())

        # 3. Initialize Redis state
        await self._init_progress(inspection_id, url)

        # 4. Run inspection as background task. Keep a strong reference until
        #    the task completes so it cannot be garbage-collected mid-run.
        task = asyncio.create_task(
            self._run_inspection(inspection_id, url, response, accessibility_standard)
        )
        _background_tasks.add(task)
        task.add_done_callback(_background_tasks.discard)

        return inspection_id

    async def run_inspection_inline(
        self,
        url: str,
        inspection_id: Optional[str] = None,
        progress_callback: Optional[ProgressCallback] = None,
        accessibility_standard: str = "wcag_2.1_aa",
    ) -> tuple[str, dict]:
        """
        Run a full inspection inline (awaited by the caller) and return the result.

        Shares its core logic (_execute_checks) with the background wrapper
        _run_inspection, but publishes no status/SSE events of its own.

        Args:
            url: Target URL to inspect.
            inspection_id: Optional pre-generated ID. If None, a new UUID is generated.
            progress_callback: Optional async callback(category, progress, current_step).
                               If None, progress is not reported.
            accessibility_standard: Accessibility standard to use for inspection.

        Returns:
            (inspection_id, result_dict) where result_dict is the document that
            was inserted into MongoDB.

        Raises:
            Exception: On fetch failure or unrecoverable errors.
        """
        settings = get_settings()

        if inspection_id is None:
            inspection_id = str(uuid.uuid4())

        # Fetch URL
        response = await self._fetch_url(url, timeout=settings.URL_FETCH_TIMEOUT)

        doc, overall_score, duration = await self._execute_checks(
            inspection_id=inspection_id,
            url=url,
            html_content=response.text,
            headers=dict(response.headers),
            progress_callback=progress_callback or _noop_progress,
            accessibility_standard=accessibility_standard,
        )

        logger.info(
            "Inspection %s completed (inline): score=%d, duration=%.1fs, standard=%s",
            inspection_id, overall_score, duration, accessibility_standard,
        )

        return inspection_id, doc

    def _create_checkers(
        self,
        progress_callback: ProgressCallback,
        accessibility_standard: str,
    ) -> list:
        """Build the 4 checker engines, ordered to match _CATEGORY_NAMES."""
        return [
            HtmlCssChecker(progress_callback=progress_callback),
            AccessibilityChecker(
                progress_callback=progress_callback,
                standard=accessibility_standard,
            ),
            SeoChecker(progress_callback=progress_callback),
            PerformanceSecurityChecker(progress_callback=progress_callback),
        ]

    async def _execute_checks(
        self,
        inspection_id: str,
        url: str,
        html_content: str,
        headers: dict,
        progress_callback: ProgressCallback,
        accessibility_standard: str,
        publish_category_events: bool = False,
    ) -> tuple:
        """
        Run all checkers on already-fetched content, aggregate and persist the result.

        Args:
            inspection_id: ID under which the result is stored and cached.
            url: Inspected URL.
            html_content: Response body passed to every checker.
            headers: Response headers passed to every checker.
            progress_callback: Async callback handed to each checker.
            accessibility_standard: Standard passed to the accessibility checker.
            publish_category_events: When True, publish a "category_complete"
                event per category after all checkers have finished.

        Returns:
            (doc, overall_score, duration_seconds) where doc is the dict that
            was inserted into MongoDB.
        """
        settings = get_settings()
        # Monotonic clock: the duration must not be affected by wall-clock jumps.
        started = time.monotonic()
        created_at = datetime.now(timezone.utc)

        checkers = self._create_checkers(progress_callback, accessibility_standard)

        # Parallel execution with per-category timeout
        results = await asyncio.gather(
            *(
                asyncio.wait_for(
                    checker.check(url, html_content, headers),
                    timeout=settings.CATEGORY_TIMEOUT,
                )
                for checker in checkers
            ),
            return_exceptions=True,
        )

        # Process results (handle timeouts/errors per category)
        categories = {}
        for cat_name, result in zip(_CATEGORY_NAMES, results):
            # BaseException, not Exception: gather(return_exceptions=True) can
            # also hand back CancelledError, which is not an Exception subclass.
            if isinstance(result, BaseException):
                logger.error(
                    "Category %s failed for inspection %s: %s",
                    cat_name, inspection_id, str(result),
                )
                result = CategoryResult(
                    score=0,
                    grade="F",
                    total_issues=0,
                    issues=[],
                )
            categories[cat_name] = result

            if publish_category_events:
                # A failed category is reported with its fallback result (0 / 0).
                await publish_event(inspection_id, {
                    "event_type": "category_complete",
                    "inspection_id": inspection_id,
                    "category": cat_name,
                    "score": result.score,
                    "total_issues": result.total_issues,
                })

        # Calculate overall score
        overall_score = calculate_overall_score(categories)
        grade = calculate_grade(overall_score)
        duration = round(time.monotonic() - started, 1)

        # Build summary
        cat_results = list(categories.values())
        summary = IssueSummary(
            total_issues=sum(c.total_issues for c in cat_results),
            critical=sum(c.critical for c in cat_results),
            major=sum(c.major for c in cat_results),
            minor=sum(c.minor for c in cat_results),
            info=sum(c.info for c in cat_results),
        )

        # Build inspection result
        inspection_result = InspectionResult(
            inspection_id=inspection_id,
            url=url,
            status="completed",
            created_at=created_at,
            completed_at=datetime.now(timezone.utc),
            duration_seconds=duration,
            overall_score=overall_score,
            grade=grade,
            categories=categories,
            summary=summary,
            accessibility_standard=accessibility_standard,
        )

        # Store in MongoDB
        doc = inspection_result.model_dump(mode="json")
        await self.db.inspections.insert_one(doc)

        # Enforce URL history limit
        await self._enforce_history_limit(url, max_count=_MAX_HISTORY_PER_URL)

        # Cache in Redis. insert_one() adds an "_id" ObjectId to `doc` in
        # place; cache a copy without it so the cached payload matches what
        # get_inspection() caches (it projects "_id" away) and contains no
        # ObjectId, which is not JSON-serializable.
        cacheable = {key: value for key, value in doc.items() if key != "_id"}
        await cache_result(inspection_id, cacheable)

        return doc, overall_score, duration

    async def _run_inspection(
        self,
        inspection_id: str,
        url: str,
        response: httpx.Response,
        accessibility_standard: str = "wcag_2.1_aa",
    ) -> None:
        """
        Execute 4 category checks in parallel and store results.

        Background-task wrapper around _execute_checks() that reuses the
        response already fetched by start_inspection() and adds SSE/Redis
        progress tracking, completion and error events. Never raises: any
        failure is logged, published as an "error" event and stored as an
        error record.
        """
        try:
            # Progress callback that publishes to Redis + SSE
            async def progress_callback(category: str, progress: int, current_step: str):
                await self._update_progress(inspection_id, category, progress, current_step)

            doc, overall_score, duration = await self._execute_checks(
                inspection_id=inspection_id,
                url=url,
                html_content=response.text,
                headers=dict(response.headers),
                progress_callback=progress_callback,
                accessibility_standard=accessibility_standard,
                publish_category_events=True,
            )

            # Mark as completed
            await set_inspection_status(inspection_id, "completed")

            # Publish complete event
            await publish_event(inspection_id, {
                "event_type": "complete",
                "inspection_id": inspection_id,
                "status": "completed",
                "overall_score": overall_score,
                "redirect_url": f"/inspections/{inspection_id}",
            })

            logger.info(
                "Inspection %s completed: score=%d, duration=%.1fs, standard=%s",
                inspection_id, overall_score, duration, accessibility_standard,
            )

        except Exception as exc:
            logger.error(
                "Inspection %s failed: %s", inspection_id, str(exc), exc_info=True
            )

            # Best-effort notification: a Redis failure here must not prevent
            # the error record below from being stored.
            try:
                await set_inspection_status(inspection_id, "error")
                await publish_event(inspection_id, {
                    "event_type": "error",
                    "inspection_id": inspection_id,
                    "status": "error",
                    "message": "검사 중 오류가 발생했습니다",
                })
            except Exception:
                logger.exception(
                    "Failed to publish error state for inspection %s", inspection_id
                )

            # Store error record in MongoDB
            # NOTE(review): created_at is stored as a datetime here, while
            # completed records store the JSON-mode dump of the model —
            # confirm that sorting on created_at behaves as intended.
            error_doc = {
                "inspection_id": inspection_id,
                "url": url,
                "status": "error",
                "created_at": datetime.now(timezone.utc),
                "error_message": str(exc)[:500],
                "overall_score": 0,
                "grade": "F",
                "categories": {},
                "summary": {
                    "total_issues": 0,
                    "critical": 0,
                    "major": 0,
                    "minor": 0,
                    "info": 0,
                },
            }
            try:
                await self.db.inspections.insert_one(error_doc)
            except Exception:
                # This runs inside a fire-and-forget task; nobody would
                # retrieve a raised exception, so log it instead.
                logger.exception(
                    "Failed to store error record for inspection %s", inspection_id
                )

    async def _fetch_url(self, url: str, timeout: int = 10) -> httpx.Response:
        """
        Fetch URL content with timeout, following redirects.

        Raises:
            httpx.HTTPStatusError: If the final response has a 4xx/5xx status.
            httpx.HTTPError: On timeouts and other transport failures.
        """
        # NOTE(review): verify=False disables TLS certificate verification.
        # Presumably deliberate so that sites with invalid certificates can
        # still be inspected — confirm this is acceptable.
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=httpx.Timeout(float(timeout)),
            verify=False,
        ) as client:
            response = await client.get(url, headers={
                "User-Agent": "WebInspector/1.0 (Inspection Bot)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
            })
            response.raise_for_status()
            return response

    async def _init_progress(self, inspection_id: str, url: str) -> None:
        """Initialize inspection progress in Redis (`url` is currently unused)."""
        await set_inspection_status(inspection_id, "running")

        # Initialize all category progresses
        for cat in _CATEGORY_NAMES:
            await update_category_progress(inspection_id, cat, 0, "대기 중...")

    async def _update_progress(
        self, inspection_id: str, category: str, progress: int, current_step: str
    ) -> None:
        """Update category progress and publish SSE event."""
        await update_category_progress(inspection_id, category, progress, current_step)

        # Build full progress state
        progress_data = await self._build_progress_event(
            inspection_id, category, progress, current_step
        )
        await publish_event(inspection_id, progress_data)

    async def _build_progress_event(
        self, inspection_id: str, updated_category: str, progress: int, current_step: str
    ) -> dict:
        """
        Build progress event data including all categories.

        The just-updated category uses the values passed in; every other
        category is read from the stored progress state, falling back to
        "pending" / 0 when nothing is stored.
        """
        from app.core.redis import get_current_progress

        raw = await get_current_progress(inspection_id)

        categories = {}
        for cat in _CATEGORY_NAMES:
            if cat == updated_category:
                # Just-updated values take precedence over the stored state.
                cat_progress = progress
                cat_step = current_step
                cat_status = "completed" if progress >= 100 else "running"
            elif raw:
                cat_progress = int(raw.get(f"{cat}_progress", 0))
                cat_step = raw.get(f"{cat}_step", "")
                cat_status = raw.get(f"{cat}_status", "pending")
            else:
                cat_progress = 0
                cat_step = ""
                cat_status = "pending"

            categories[cat] = {
                "status": cat_status,
                "progress": cat_progress,
                "current_step": cat_step,
            }

        # Overall progress is the mean of the per-category percentages.
        total_progress = sum(c["progress"] for c in categories.values())
        overall_progress = round(total_progress / len(categories))

        return {
            "event_type": "progress",
            "inspection_id": inspection_id,
            "status": "running",
            "overall_progress": overall_progress,
            "categories": categories,
        }

    async def _enforce_history_limit(self, url: str, max_count: int = 100) -> None:
        """Delete oldest inspection records if URL exceeds max_count."""
        count = await self.db.inspections.count_documents({"url": url})
        if count <= max_count:
            return

        excess = count - max_count
        oldest = self.db.inspections.find(
            {"url": url}
        ).sort("created_at", 1).limit(excess)

        ids_to_delete = [doc["_id"] async for doc in oldest]
        if not ids_to_delete:
            return

        await self.db.inspections.delete_many({"_id": {"$in": ids_to_delete}})
        logger.info(
            "Deleted %d oldest inspections for URL %s",
            len(ids_to_delete), url,
        )

    async def get_inspection(self, inspection_id: str) -> Optional[dict]:
        """Get inspection result by ID (cache-first). Returns None if not found."""
        from app.core.redis import get_cached_result, cache_result

        # Try cache first
        cached = await get_cached_result(inspection_id)
        if cached:
            return cached

        # Fetch from MongoDB
        doc = await self.db.inspections.find_one(
            {"inspection_id": inspection_id},
            {"_id": 0},
        )
        if not doc:
            return None

        await cache_result(inspection_id, doc)
        return doc

    async def get_issues(
        self,
        inspection_id: str,
        category: Optional[str] = None,
        severity: Optional[str] = None,
    ) -> Optional[dict]:
        """
        Get filtered issues for an inspection.

        Args:
            inspection_id: Inspection to read.
            category: Category key to keep; None or "all" keeps every category.
            severity: Severity to keep; None or "all" keeps every severity.

        Returns:
            Dict with the matching issues sorted by severity
            (critical, major, minor, info, then anything else), or None if
            the inspection does not exist.
        """
        doc = await self.get_inspection(inspection_id)
        if not doc:
            return None

        filter_category = bool(category) and category != "all"
        filter_severity = bool(severity) and severity != "all"

        all_issues = []
        for cat_name, cat_data in doc.get("categories", {}).items():
            if filter_category and cat_name != category:
                continue
            for issue in cat_data.get("issues", []):
                if filter_severity and issue.get("severity") != severity:
                    continue
                all_issues.append(issue)

        # Sort by severity priority (stable, so per-category order is kept)
        severity_order = {"critical": 0, "major": 1, "minor": 2, "info": 3}
        all_issues.sort(key=lambda x: severity_order.get(x.get("severity", "info"), 4))

        return {
            "inspection_id": inspection_id,
            "total": len(all_issues),
            "filters": {
                "category": category or "all",
                "severity": severity or "all",
            },
            "issues": all_issues,
        }

    async def get_inspection_list(
        self,
        page: int = 1,
        limit: int = 20,
        url_filter: Optional[str] = None,
        sort: str = "-created_at",
    ) -> dict:
        """
        Get paginated inspection list.

        Args:
            page: 1-based page number; values below 1 are treated as 1.
            limit: Page size, clamped to the range 1..100.
            url_filter: Optional case-insensitive substring to match in the URL.
            sort: Field name to sort by; a leading "-" means descending.

        Returns:
            Dict with "items", "total", "page", "limit" and "total_pages".
        """
        # Clamp pagination: a non-positive limit would divide by zero below,
        # and a page below 1 would produce a negative skip.
        limit = max(1, min(limit, 100))
        page = max(1, page)
        skip = (page - 1) * limit

        # Build query
        query = {}
        if url_filter:
            # Escape the user-supplied text so URL characters such as "." and
            # "?" match literally and an invalid pattern cannot raise.
            query["url"] = {"$regex": re.escape(url_filter), "$options": "i"}

        # Sort direction
        if sort.startswith("-"):
            sort_field = sort[1:]
            sort_dir = -1
        else:
            sort_field = sort
            sort_dir = 1

        # Count total
        total = await self.db.inspections.count_documents(query)

        # Fetch items
        cursor = self.db.inspections.find(
            query,
            {
                "_id": 0,
                "inspection_id": 1,
                "url": 1,
                "created_at": 1,
                "overall_score": 1,
                "grade": 1,
                "summary.total_issues": 1,
            },
        ).sort(sort_field, sort_dir).skip(skip).limit(limit)

        items = [
            {
                "inspection_id": doc.get("inspection_id"),
                "url": doc.get("url"),
                "created_at": doc.get("created_at"),
                "overall_score": doc.get("overall_score", 0),
                "grade": doc.get("grade", "F"),
                "total_issues": doc.get("summary", {}).get("total_issues", 0),
            }
            async for doc in cursor
        ]

        total_pages = max(1, -(-total // limit))  # Ceiling division

        return {
            "items": items,
            "total": total,
            "page": page,
            "limit": limit,
            "total_pages": total_pages,
        }

    async def get_trend(self, url: str, limit: int = 10) -> dict:
        """
        Get trend data for a specific URL.

        Returns the `limit` most recent completed inspections, ordered from
        oldest to newest.
        """
        # Sort newest-first so that limit() keeps the most recent inspections;
        # an ascending sort would keep only the oldest ones.
        cursor = self.db.inspections.find(
            {"url": url, "status": "completed"},
            {
                "_id": 0,
                "inspection_id": 1,
                "created_at": 1,
                "overall_score": 1,
                "categories.html_css.score": 1,
                "categories.accessibility.score": 1,
                "categories.seo.score": 1,
                "categories.performance_security.score": 1,
            },
        ).sort("created_at", -1).limit(limit)

        data_points = []
        async for doc in cursor:
            cats = doc.get("categories", {})
            point = {
                "inspection_id": doc.get("inspection_id"),
                "created_at": doc.get("created_at"),
                "overall_score": doc.get("overall_score", 0),
            }
            for cat_name in _CATEGORY_NAMES:
                point[cat_name] = cats.get(cat_name, {}).get("score", 0)
            data_points.append(point)

        # Restore chronological (oldest -> newest) order for the trend.
        data_points.reverse()

        return {
            "url": url,
            "data_points": data_points,
        }