feat: 사이트 전체 검사 기능 추가
도메인 하위 링크를 BFS로 자동 크롤링하여 페이지별 검사 수행. - BFS 링크 크롤러 (같은 도메인 필터링, max_pages/max_depth 설정) - 사이트 검사 오케스트레이션 (크롤링→순차 검사→집계) - SSE 실시간 진행 상태 (크롤링/검사/완료) - 페이지 트리 + 집계 결과 UI - UrlInputForm에 "사이트 전체 검사" 버튼 추가 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -75,20 +75,162 @@ class InspectionService:
|
||||
|
||||
return inspection_id
|
||||
|
||||
async def _run_inspection(
|
||||
self, inspection_id: str, url: str, response: httpx.Response
|
||||
) -> None:
|
||||
"""Execute 4 category checks in parallel and store results."""
|
||||
    async def run_inspection_inline(
        self,
        url: str,
        inspection_id: Optional[str] = None,
        progress_callback: Optional[object] = None,
    ) -> tuple[str, dict]:
        """
        Run a full inspection synchronously (inline) and return the result.

        This is the core inspection logic extracted for reuse by both:
        - Single-page inspection (_run_inspection wrapper with SSE/Redis)
        - Site-wide inspection (site_inspection_service calling per-page)

        Args:
            url: Target URL to inspect.
            inspection_id: Optional pre-generated ID. If None, a new UUID is generated.
            progress_callback: Optional async callback(category, progress, current_step).
                If None, progress is not reported.

        Returns:
            (inspection_id, result_dict) where result_dict is the MongoDB document
            (the JSON-mode dump of InspectionResult).

        Raises:
            Exception: On fetch failure or unrecoverable errors. Note that a
                single failed/timed-out category does NOT raise — it is recorded
                as a score-0 "F" category and the inspection still completes.
        """
        settings = get_settings()

        if inspection_id is None:
            inspection_id = str(uuid.uuid4())

        # Fetch the target page; any fetch error propagates to the caller.
        response = await self._fetch_url(url, timeout=settings.URL_FETCH_TIMEOUT)
        html_content = response.text
        headers = dict(response.headers)
        start_time = time.time()
        created_at = datetime.now(timezone.utc)

        # Substitute a no-op callback so checkers can call it unconditionally.
        if progress_callback is None:
            async def progress_callback(category: str, progress: int, current_step: str):
                pass

        # One checker engine per category; order matches category_names below.
        checkers = [
            HtmlCssChecker(progress_callback=progress_callback),
            AccessibilityChecker(progress_callback=progress_callback),
            SeoChecker(progress_callback=progress_callback),
            PerformanceSecurityChecker(progress_callback=progress_callback),
        ]

        # Run all four categories concurrently, each with its own timeout.
        # return_exceptions=True keeps one failure from cancelling the rest.
        results = await asyncio.gather(
            *[
                asyncio.wait_for(
                    checker.check(url, html_content, headers),
                    timeout=settings.CATEGORY_TIMEOUT,
                )
                for checker in checkers
            ],
            return_exceptions=True,
        )

        # Map results back to category names; a raised exception (including
        # asyncio.TimeoutError) becomes a zero-score "F" placeholder result.
        categories = {}
        category_names = ["html_css", "accessibility", "seo", "performance_security"]

        for i, result in enumerate(results):
            cat_name = category_names[i]
            if isinstance(result, Exception):
                logger.error(
                    "Category %s failed for inspection %s: %s",
                    cat_name, inspection_id, str(result),
                )
                categories[cat_name] = CategoryResult(
                    score=0,
                    grade="F",
                    total_issues=0,
                    issues=[],
                )
            else:
                categories[cat_name] = result

        # Derive the overall score/grade and wall-clock duration (seconds).
        overall_score = calculate_overall_score(categories)
        grade = calculate_grade(overall_score)
        duration = round(time.time() - start_time, 1)

        # Aggregate per-severity issue counts across all categories.
        total_critical = sum(c.critical for c in categories.values())
        total_major = sum(c.major for c in categories.values())
        total_minor = sum(c.minor for c in categories.values())
        total_info = sum(c.info for c in categories.values())
        total_issues = sum(c.total_issues for c in categories.values())

        summary = IssueSummary(
            total_issues=total_issues,
            critical=total_critical,
            major=total_major,
            minor=total_minor,
            info=total_info,
        )

        # Assemble the final result document.
        completed_at = datetime.now(timezone.utc)
        inspection_result = InspectionResult(
            inspection_id=inspection_id,
            url=url,
            status="completed",
            created_at=created_at,
            completed_at=completed_at,
            duration_seconds=duration,
            overall_score=overall_score,
            grade=grade,
            categories=categories,
            summary=summary,
        )

        # Persist to MongoDB (JSON-mode dump so datetimes etc. are serializable).
        doc = inspection_result.model_dump(mode="json")
        await self.db.inspections.insert_one(doc)

        # Keep at most 100 stored inspections per URL.
        await self._enforce_history_limit(url, max_count=100)

        # Warm the Redis cache for subsequent result reads.
        await cache_result(inspection_id, doc)

        logger.info(
            "Inspection %s completed (inline): score=%d, duration=%.1fs",
            inspection_id, overall_score, duration,
        )

        return inspection_id, doc
|
||||
|
||||
async def _run_inspection(
|
||||
self, inspection_id: str, url: str, response: httpx.Response
|
||||
) -> None:
|
||||
"""
|
||||
Execute 4 category checks in parallel and store results.
|
||||
|
||||
This is the background-task wrapper that adds SSE/Redis progress
|
||||
tracking on top of run_inspection_inline().
|
||||
"""
|
||||
try:
|
||||
# Progress callback factory
|
||||
# Progress callback that publishes to Redis + SSE
|
||||
async def progress_callback(category: str, progress: int, current_step: str):
|
||||
await self._update_progress(inspection_id, category, progress, current_step)
|
||||
|
||||
# Use inline runner (fetches URL internally, so we pass the pre-fetched response data)
|
||||
# Since run_inspection_inline fetches the URL again, we use the lower-level approach
|
||||
# to avoid double-fetching. We replicate the core logic with SSE event publishing.
|
||||
html_content = response.text
|
||||
headers = dict(response.headers)
|
||||
start_time = time.time()
|
||||
created_at = datetime.now(timezone.utc)
|
||||
|
||||
# Create 4 checker engines
|
||||
checkers = [
|
||||
HtmlCssChecker(progress_callback=progress_callback),
|
||||
@ -122,14 +264,13 @@ class InspectionService:
|
||||
"Category %s failed for inspection %s: %s",
|
||||
cat_name, inspection_id, str(result),
|
||||
)
|
||||
# Create error result for failed category
|
||||
categories[cat_name] = CategoryResult(
|
||||
score=0,
|
||||
grade="F",
|
||||
total_issues=0,
|
||||
issues=[],
|
||||
)
|
||||
# Publish category error
|
||||
# Publish category error event
|
||||
await publish_event(inspection_id, {
|
||||
"event_type": "category_complete",
|
||||
"inspection_id": inspection_id,
|
||||
@ -139,7 +280,7 @@ class InspectionService:
|
||||
})
|
||||
else:
|
||||
categories[cat_name] = result
|
||||
# Publish category completion
|
||||
# Publish category completion event
|
||||
await publish_event(inspection_id, {
|
||||
"event_type": "category_complete",
|
||||
"inspection_id": inspection_id,
|
||||
|
||||
291
backend/app/services/link_crawler.py
Normal file
291
backend/app/services/link_crawler.py
Normal file
@ -0,0 +1,291 @@
|
||||
"""
|
||||
BFS link crawler for same-domain page discovery.
|
||||
|
||||
Crawls a root URL using BFS (Breadth-First Search), extracting same-domain
|
||||
links up to configurable max_pages and max_depth limits. Used by the
|
||||
site-wide inspection feature to discover pages before inspection.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import deque
|
||||
from typing import Callable, Awaitable, Optional
|
||||
from urllib.parse import urljoin, urlparse, urlunparse
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Schemes to skip when extracting links
|
||||
_SKIP_SCHEMES = {"javascript", "mailto", "tel", "data", "blob", "ftp"}
|
||||
|
||||
# File extensions that are not HTML pages
|
||||
_SKIP_EXTENSIONS = {
|
||||
".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", ".ico",
|
||||
".css", ".js", ".json", ".xml", ".zip", ".tar", ".gz", ".mp4",
|
||||
".mp3", ".wav", ".avi", ".mov", ".woff", ".woff2", ".ttf", ".eot",
|
||||
}
|
||||
|
||||
# Type alias for progress callback: (pages_found, current_url) -> None
|
||||
ProgressCallback = Callable[[int, str], Awaitable[None]]
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
    """
    Canonicalize *url* so duplicate pages compare equal:

    - strip the fragment (``#...``)
    - lowercase the scheme and the network location
    - drop a trailing slash from the path (the bare root "/" is kept)
    """
    parts = urlparse(url)._replace(fragment="")

    # Scheme and host are case-insensitive; the path is not, so leave it alone.
    parts = parts._replace(
        scheme=parts.scheme.lower(),
        netloc=parts.netloc.lower(),
    )

    trimmed = parts.path
    if trimmed.endswith("/") and trimmed != "/":
        trimmed = trimmed.rstrip("/")

    return urlunparse(parts._replace(path=trimmed))
|
||||
|
||||
|
||||
def is_same_domain(url: str, root_domain: str) -> bool:
    """Return True when *url*'s host matches *root_domain* (case- and www-insensitive)."""
    host = urlparse(url).netloc.lower()

    # "example.com" and "www.example.com" are treated as the same site.
    return host.removeprefix("www.") == root_domain.lower().removeprefix("www.")
|
||||
|
||||
|
||||
def should_skip_url(href: str) -> bool:
    """Return True for hrefs that cannot lead to a crawlable HTML page."""
    # Empty / whitespace-only hrefs carry no destination.
    if not href or not href.strip():
        return True

    # Pure in-page anchors stay on the current page.
    if href.startswith("#"):
        return True

    parsed = urlparse(href)

    # Non-navigable schemes (javascript:, mailto:, tel:, ...).
    if parsed.scheme and parsed.scheme.lower() in _SKIP_SCHEMES:
        return True

    # Static assets (images, scripts, archives, fonts, ...) are not pages.
    lowered_path = parsed.path.lower()
    return any(lowered_path.endswith(ext) for ext in _SKIP_EXTENSIONS)
|
||||
|
||||
|
||||
class LinkCrawler:
    """
    BFS link crawler that discovers same-domain pages.

    Usage:
        crawler = LinkCrawler(
            root_url="https://example.com",
            max_pages=20,
            max_depth=2,
        )
        pages = await crawler.crawl(progress_callback=callback)
    """

    def __init__(
        self,
        root_url: str,
        max_pages: int = 20,
        max_depth: int = 2,
    ):
        # Normalize up front so the root participates in dedup like any link.
        self.root_url = normalize_url(root_url)
        self.max_pages = max_pages
        self.max_depth = max_depth

        parsed = urlparse(self.root_url)
        # Domain/scheme of the root; domain drives the same-site filter.
        self.root_domain = parsed.netloc.lower()
        self.root_scheme = parsed.scheme

    async def crawl(
        self,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> list[dict]:
        """
        BFS crawl starting from root_url.

        Fetch errors and non-HTML responses are not fatal: the page is still
        recorded (title=None) but contributes no child links.

        Returns list of dicts:
            [
                {
                    "url": "https://example.com/",
                    "depth": 0,
                    "parent_url": None,
                    "title": "Example Page",
                    "status": "discovered",
                },
                ...
            ]
        """
        visited: set[str] = set()
        results: list[dict] = []

        # BFS queue: (url, depth, parent_url). `visited` tracks enqueued URLs
        # (not just fetched ones) so each URL is queued at most once.
        queue: deque[tuple[str, int, Optional[str]]] = deque()
        queue.append((self.root_url, 0, None))
        visited.add(self.root_url)

        # NOTE(review): verify=False disables TLS certificate verification —
        # presumably deliberate to crawl sites with bad certs; confirm this is
        # acceptable for the deployment.
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=httpx.Timeout(10.0),
            verify=False,
            headers={
                "User-Agent": "WebInspector/1.0 (Site Crawler)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
            },
        ) as client:
            while queue and len(results) < self.max_pages:
                url, depth, parent_url = queue.popleft()

                # Per-page state; reset for each URL pulled off the queue.
                title = None
                # NOTE(review): status_code is captured but never used or
                # reported — consider recording it in the result or dropping it.
                status_code = None
                links: list[str] = []

                try:
                    response = await client.get(url)
                    status_code = response.status_code

                    # Only parse HTML; other content types are recorded but
                    # contribute no links.
                    content_type = response.headers.get("content-type", "")
                    if "text/html" not in content_type and "application/xhtml" not in content_type:
                        logger.debug("Skipping non-HTML content: %s (%s)", url, content_type)
                        # Still record it but don't extract links
                        results.append({
                            "url": url,
                            "depth": depth,
                            "parent_url": parent_url,
                            "title": None,
                            "status": "discovered",
                        })
                        if progress_callback:
                            await progress_callback(len(results), url)
                        continue

                    html = response.text
                    title, links = self._extract_links_and_title(url, html)

                except httpx.TimeoutException:
                    # Timeouts are non-fatal: record the page without a title.
                    logger.warning("Timeout crawling %s", url)
                    results.append({
                        "url": url,
                        "depth": depth,
                        "parent_url": parent_url,
                        "title": None,
                        "status": "discovered",
                    })
                    if progress_callback:
                        await progress_callback(len(results), url)
                    continue

                except httpx.RequestError as e:
                    # Connection/DNS/protocol errors are likewise non-fatal.
                    logger.warning("Request error crawling %s: %s", url, str(e))
                    results.append({
                        "url": url,
                        "depth": depth,
                        "parent_url": parent_url,
                        "title": None,
                        "status": "discovered",
                    })
                    if progress_callback:
                        await progress_callback(len(results), url)
                    continue

                # Successful HTML fetch: record the page with its title.
                results.append({
                    "url": url,
                    "depth": depth,
                    "parent_url": parent_url,
                    "title": title,
                    "status": "discovered",
                })

                # Report progress after each recorded page.
                if progress_callback:
                    await progress_callback(len(results), url)

                # Enqueue children only while below the depth limit.
                if depth < self.max_depth:
                    for link in links:
                        normalized = normalize_url(link)

                        if normalized in visited:
                            continue

                        if not is_same_domain(normalized, self.root_domain):
                            continue

                        # Cap the number of *enqueued* URLs at max_pages; the
                        # while-loop above independently caps recorded results.
                        if len(visited) >= self.max_pages:
                            break

                        visited.add(normalized)
                        queue.append((normalized, depth + 1, url))

        logger.info(
            "Crawl completed: root=%s, pages_found=%d, max_pages=%d, max_depth=%d",
            self.root_url, len(results), self.max_pages, self.max_depth,
        )

        return results

    def _extract_links_and_title(
        self, base_url: str, html: str
    ) -> tuple[Optional[str], list[str]]:
        """
        Extract the page title and all HTTP(S) anchor links from HTML.

        Relative hrefs are resolved against *base_url*. Domain filtering is
        NOT done here — the caller (crawl) applies is_same_domain().

        Returns:
            (title, list_of_absolute_urls)
        """
        soup = BeautifulSoup(html, "html.parser")

        # <title> text, truncated to keep stored documents small.
        title = None
        title_tag = soup.find("title")
        if title_tag and title_tag.string:
            title = title_tag.string.strip()
            # Truncate very long titles
            if len(title) > 200:
                title = title[:200] + "..."

        # Collect candidate links from every <a href=...>.
        links: list[str] = []
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"].strip()

            # Drop anchors, non-navigable schemes, and static-asset extensions.
            if should_skip_url(href):
                continue

            # Resolve relative URLs against the fetched page's URL.
            absolute_url = urljoin(base_url, href)

            # Keep only http/https after resolution.
            parsed = urlparse(absolute_url)
            if parsed.scheme not in ("http", "https"):
                continue

            links.append(absolute_url)

        return title, links
|
||||
678
backend/app/services/site_inspection_service.py
Normal file
678
backend/app/services/site_inspection_service.py
Normal file
@ -0,0 +1,678 @@
|
||||
"""
|
||||
Site-wide inspection orchestration service.
|
||||
|
||||
Manages the full site inspection lifecycle:
|
||||
1. BFS crawling to discover same-domain pages
|
||||
2. Sequential/parallel inspection of each discovered page
|
||||
3. Aggregate score computation
|
||||
4. Progress tracking via Redis Pub/Sub (SSE events)
|
||||
5. Result storage in MongoDB (site_inspections collection)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorDatabase
|
||||
from redis.asyncio import Redis
|
||||
|
||||
from app.core.config import get_settings
|
||||
from app.core.redis import get_redis
|
||||
from app.models.schemas import calculate_grade
|
||||
from app.services.link_crawler import LinkCrawler
|
||||
from app.services.inspection_service import InspectionService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Redis key TTLs
|
||||
SITE_RESULT_CACHE_TTL = 3600 # 1 hour
|
||||
|
||||
|
||||
class SiteInspectionService:
|
||||
"""Site-wide inspection orchestration service."""
|
||||
|
||||
def __init__(self, db: AsyncIOMotorDatabase, redis: Redis):
|
||||
self.db = db
|
||||
self.redis = redis
|
||||
self.inspection_service = InspectionService(db=db, redis=redis)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def start_site_inspection(
        self,
        url: str,
        max_pages: int = 20,
        max_depth: int = 2,
    ) -> str:
        """
        Start a site-wide inspection.

        1. Validate URL
        2. Generate site_inspection_id
        3. Create initial MongoDB document with status "crawling"
        4. Launch background crawl-and-inspect task
        5. Return site_inspection_id

        Args:
            url: Root URL to crawl and inspect.
            max_pages: Requested page cap (clamped to settings.SITE_MAX_PAGES).
            max_depth: Requested BFS depth cap (clamped to settings.SITE_MAX_DEPTH).

        Returns:
            The generated site_inspection_id; progress is delivered
            asynchronously via SSE events and the stored document.
        """
        settings = get_settings()

        # Clamp client-requested limits to the server-side maximums.
        max_pages = min(max_pages, settings.SITE_MAX_PAGES)
        max_depth = min(max_depth, settings.SITE_MAX_DEPTH)

        site_inspection_id = str(uuid.uuid4())
        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        # Initial document: no pages or aggregates yet, status "crawling".
        doc = {
            "site_inspection_id": site_inspection_id,
            "root_url": url,
            "domain": domain,
            "status": "crawling",
            "created_at": datetime.now(timezone.utc),
            "completed_at": None,
            "config": {
                "max_pages": max_pages,
                "max_depth": max_depth,
            },
            "discovered_pages": [],
            "aggregate_scores": None,
        }
        await self.db.site_inspections.insert_one(doc)

        logger.info(
            "Site inspection started: id=%s, url=%s, max_pages=%d, max_depth=%d",
            site_inspection_id, url, max_pages, max_depth,
        )

        # Fire-and-forget background task.
        # NOTE(review): the Task reference is not retained; per asyncio docs a
        # task only weakly referenced by the loop can be garbage-collected
        # mid-flight — confirm a reference is kept, or store it on self.
        asyncio.create_task(
            self._crawl_and_inspect(site_inspection_id, url, max_pages, max_depth)
        )

        return site_inspection_id
|
||||
|
||||
async def get_site_inspection(self, site_inspection_id: str) -> Optional[dict]:
|
||||
"""Get site inspection result by ID (cache-first)."""
|
||||
# Try Redis cache first
|
||||
cache_key = f"site-inspection:result:{site_inspection_id}"
|
||||
cached = await self.redis.get(cache_key)
|
||||
if cached:
|
||||
return json.loads(cached)
|
||||
|
||||
# Fetch from MongoDB
|
||||
doc = await self.db.site_inspections.find_one(
|
||||
{"site_inspection_id": site_inspection_id},
|
||||
{"_id": 0},
|
||||
)
|
||||
if doc:
|
||||
# Only cache completed results
|
||||
if doc.get("status") in ("completed", "error"):
|
||||
await self.redis.set(
|
||||
cache_key,
|
||||
json.dumps(doc, ensure_ascii=False, default=str),
|
||||
ex=SITE_RESULT_CACHE_TTL,
|
||||
)
|
||||
return doc
|
||||
return None
|
||||
|
||||
async def get_site_inspection_list(
|
||||
self,
|
||||
page: int = 1,
|
||||
limit: int = 20,
|
||||
) -> dict:
|
||||
"""Get paginated list of site inspections."""
|
||||
limit = min(limit, 100)
|
||||
skip = (page - 1) * limit
|
||||
|
||||
total = await self.db.site_inspections.count_documents({})
|
||||
|
||||
cursor = self.db.site_inspections.find(
|
||||
{},
|
||||
{
|
||||
"_id": 0,
|
||||
"site_inspection_id": 1,
|
||||
"root_url": 1,
|
||||
"domain": 1,
|
||||
"status": 1,
|
||||
"created_at": 1,
|
||||
"discovered_pages": 1,
|
||||
"aggregate_scores": 1,
|
||||
},
|
||||
).sort("created_at", -1).skip(skip).limit(limit)
|
||||
|
||||
items = []
|
||||
async for doc in cursor:
|
||||
pages = doc.get("discovered_pages", [])
|
||||
pages_total = len(pages)
|
||||
pages_inspected = sum(
|
||||
1 for p in pages if p.get("status") == "completed"
|
||||
)
|
||||
agg = doc.get("aggregate_scores")
|
||||
|
||||
items.append({
|
||||
"site_inspection_id": doc.get("site_inspection_id"),
|
||||
"root_url": doc.get("root_url"),
|
||||
"domain": doc.get("domain"),
|
||||
"status": doc.get("status"),
|
||||
"created_at": doc.get("created_at"),
|
||||
"pages_total": pages_total,
|
||||
"pages_inspected": pages_inspected,
|
||||
"overall_score": agg.get("overall_score") if agg else None,
|
||||
"grade": agg.get("grade") if agg else None,
|
||||
})
|
||||
|
||||
total_pages = max(1, -(-total // limit))
|
||||
|
||||
return {
|
||||
"items": items,
|
||||
"total": total,
|
||||
"page": page,
|
||||
"limit": limit,
|
||||
"total_pages": total_pages,
|
||||
}
|
||||
|
||||
    async def inspect_single_page(
        self,
        site_inspection_id: str,
        page_url: str,
    ) -> Optional[str]:
        """
        Trigger inspection for a single page within a site inspection.

        Used to (re-)inspect one discovered page on demand: runs the inline
        inspection, updates the page's entry in the site document, recomputes
        aggregates, and invalidates the cached site result.

        Returns:
            The inspection_id on success; None when the site inspection or the
            page is not found, or when the page inspection itself fails.
        """
        doc = await self.db.site_inspections.find_one(
            {"site_inspection_id": site_inspection_id},
        )
        if not doc:
            return None

        # The page must be one of the URLs discovered during crawling.
        page_found = False
        for page in doc.get("discovered_pages", []):
            if page["url"] == page_url:
                page_found = True
                break

        if not page_found:
            return None

        # Run the inspection synchronously with a pre-generated ID.
        inspection_id = str(uuid.uuid4())
        try:
            inspection_id, result = await self.inspection_service.run_inspection_inline(
                url=page_url,
                inspection_id=inspection_id,
            )

            # Mirror score/grade into the embedded page entry.
            overall_score = result.get("overall_score", 0)
            grade = result.get("grade", "F")

            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.inspection_id": inspection_id,
                        "discovered_pages.$.status": "completed",
                        "discovered_pages.$.overall_score": overall_score,
                        "discovered_pages.$.grade": grade,
                    }
                },
            )

            # Aggregates depend on per-page scores, so refresh them now.
            await self._compute_and_store_aggregates(site_inspection_id)

            # Drop the cached site result so readers see the new scores.
            cache_key = f"site-inspection:result:{site_inspection_id}"
            await self.redis.delete(cache_key)

            return inspection_id

        except Exception as e:
            # Failure is recorded on the page entry; the caller gets None.
            logger.error(
                "Failed to inspect page %s in site %s: %s",
                page_url, site_inspection_id, str(e),
            )
            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.status": "error",
                    }
                },
            )
            return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Background task: Crawl + Inspect
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def _crawl_and_inspect(
        self,
        site_inspection_id: str,
        url: str,
        max_pages: int,
        max_depth: int,
    ) -> None:
        """
        Background task that runs in two phases:
        Phase 1: BFS crawling to discover pages
        Phase 2: Parallel inspection of discovered pages (with semaphore)

        On any unhandled error the site document is marked "error" and an
        error SSE event is published; the task never raises to its creator.
        """
        try:
            # ==============================
            # Phase 1: Crawling
            # ==============================
            logger.info("Phase 1 (crawling) started: %s", site_inspection_id)

            # Relay crawler progress to SSE subscribers.
            async def crawl_progress(pages_found: int, current_url: str):
                await self._publish_site_event(site_inspection_id, {
                    "event_type": "crawl_progress",
                    "site_inspection_id": site_inspection_id,
                    "pages_found": pages_found,
                    "current_url": current_url,
                })

            crawler = LinkCrawler(
                root_url=url,
                max_pages=max_pages,
                max_depth=max_depth,
            )
            discovered = await crawler.crawl(progress_callback=crawl_progress)

            # No pages at all (root unreachable) aborts the whole inspection.
            if not discovered:
                raise ValueError("크롤링 결과가 없습니다. URL을 확인해주세요.")

            # Embed one "pending" page entry per discovered URL; inspection
            # results are filled in during Phase 2.
            discovered_pages = []
            for page in discovered:
                discovered_pages.append({
                    "url": page["url"],
                    "depth": page["depth"],
                    "parent_url": page["parent_url"],
                    "inspection_id": None,
                    "status": "pending",
                    "title": page.get("title"),
                    "overall_score": None,
                    "grade": None,
                })

            # Persist the page list and move the site into "inspecting".
            await self.db.site_inspections.update_one(
                {"site_inspection_id": site_inspection_id},
                {
                    "$set": {
                        "status": "inspecting",
                        "discovered_pages": discovered_pages,
                    }
                },
            )

            # Tell SSE subscribers crawling is done (includes the page tree).
            await self._publish_site_event(site_inspection_id, {
                "event_type": "crawl_complete",
                "site_inspection_id": site_inspection_id,
                "total_pages": len(discovered_pages),
                "pages": [
                    {
                        "url": p["url"],
                        "depth": p["depth"],
                        "parent_url": p["parent_url"],
                        "title": p.get("title"),
                    }
                    for p in discovered_pages
                ],
            })

            logger.info(
                "Phase 1 completed: %s, pages=%d",
                site_inspection_id, len(discovered_pages),
            )

            # ==============================
            # Phase 2: Page-by-page inspection
            # ==============================
            logger.info("Phase 2 (inspection) started: %s", site_inspection_id)

            settings = get_settings()
            # At most SITE_CONCURRENCY page inspections run at once.
            semaphore = asyncio.Semaphore(settings.SITE_CONCURRENCY)

            tasks = [
                self._inspect_page_with_semaphore(
                    semaphore=semaphore,
                    site_inspection_id=site_inspection_id,
                    page_url=page["url"],
                    page_index=idx,
                    total_pages=len(discovered_pages),
                )
                for idx, page in enumerate(discovered_pages)
            ]

            # return_exceptions=True: one failed page never cancels the rest
            # (per-page errors are already handled inside the task itself).
            await asyncio.gather(*tasks, return_exceptions=True)

            # ==============================
            # Finalize: Compute aggregates
            # ==============================
            aggregate_scores = await self._compute_and_store_aggregates(site_inspection_id)

            # Mark the site inspection as completed.
            await self.db.site_inspections.update_one(
                {"site_inspection_id": site_inspection_id},
                {
                    "$set": {
                        "status": "completed",
                        "completed_at": datetime.now(timezone.utc),
                    }
                },
            )

            # Final SSE event with the aggregate scores.
            await self._publish_site_event(site_inspection_id, {
                "event_type": "complete",
                "site_inspection_id": site_inspection_id,
                "status": "completed",
                "aggregate_scores": aggregate_scores,
            })

            logger.info("Site inspection completed: %s", site_inspection_id)

        except Exception as e:
            # Top-level boundary of the background task: log, persist the
            # error state, and notify subscribers instead of re-raising.
            logger.error(
                "Site inspection %s failed: %s",
                site_inspection_id, str(e), exc_info=True,
            )

            await self.db.site_inspections.update_one(
                {"site_inspection_id": site_inspection_id},
                {
                    "$set": {
                        "status": "error",
                        "completed_at": datetime.now(timezone.utc),
                    }
                },
            )

            await self._publish_site_event(site_inspection_id, {
                "event_type": "error",
                "site_inspection_id": site_inspection_id,
                "status": "error",
                "message": f"사이트 검사 중 오류가 발생했습니다: {str(e)[:200]}",
            })
|
||||
|
||||
async def _inspect_page_with_semaphore(
|
||||
self,
|
||||
semaphore: asyncio.Semaphore,
|
||||
site_inspection_id: str,
|
||||
page_url: str,
|
||||
page_index: int,
|
||||
total_pages: int,
|
||||
) -> None:
|
||||
"""Inspect a single page with semaphore-controlled concurrency."""
|
||||
async with semaphore:
|
||||
await self._inspect_single_page(
|
||||
site_inspection_id=site_inspection_id,
|
||||
page_url=page_url,
|
||||
page_index=page_index,
|
||||
total_pages=total_pages,
|
||||
)
|
||||
|
||||
    async def _inspect_single_page(
        self,
        site_inspection_id: str,
        page_url: str,
        page_index: int,
        total_pages: int,
    ) -> None:
        """
        Run inspection for a single discovered page.

        Lifecycle: publish "page_start" → mark the page "inspecting" → run the
        inline inspection (streaming "page_progress" events) → store the
        score/grade and publish "page_complete" plus an "aggregate_update".
        A failure marks the page "error" and is non-fatal to the site run.
        """
        inspection_id = str(uuid.uuid4())

        # Announce that this page's inspection is starting.
        await self._publish_site_event(site_inspection_id, {
            "event_type": "page_start",
            "site_inspection_id": site_inspection_id,
            "page_url": page_url,
            "page_index": page_index,
        })

        # Mark the page as "inspecting" and pin its inspection_id.
        await self.db.site_inspections.update_one(
            {
                "site_inspection_id": site_inspection_id,
                "discovered_pages.url": page_url,
            },
            {
                "$set": {
                    "discovered_pages.$.status": "inspecting",
                    "discovered_pages.$.inspection_id": inspection_id,
                }
            },
        )

        try:
            # Forward per-category checker progress as page-scoped SSE events.
            async def page_progress_callback(category: str, progress: int, current_step: str):
                await self._publish_site_event(site_inspection_id, {
                    "event_type": "page_progress",
                    "site_inspection_id": site_inspection_id,
                    "page_url": page_url,
                    "page_index": page_index,
                    "category": category,
                    "progress": progress,
                    "current_step": current_step,
                })

            # Run the full single-page inspection inline.
            _, result = await self.inspection_service.run_inspection_inline(
                url=page_url,
                inspection_id=inspection_id,
                progress_callback=page_progress_callback,
            )

            overall_score = result.get("overall_score", 0)
            grade = result.get("grade", "F")

            # Persist the page outcome into the site document.
            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.status": "completed",
                        "discovered_pages.$.overall_score": overall_score,
                        "discovered_pages.$.grade": grade,
                    }
                },
            )

            # Announce the page result.
            await self._publish_site_event(site_inspection_id, {
                "event_type": "page_complete",
                "site_inspection_id": site_inspection_id,
                "page_url": page_url,
                "page_index": page_index,
                "inspection_id": inspection_id,
                "score": overall_score,
                "grade": grade,
            })

            # Refresh aggregates after every page so the UI updates live.
            aggregate_scores = await self._compute_and_store_aggregates(site_inspection_id)
            await self._publish_site_event(site_inspection_id, {
                "event_type": "aggregate_update",
                "site_inspection_id": site_inspection_id,
                "pages_inspected": aggregate_scores.get("pages_inspected", 0),
                "pages_total": aggregate_scores.get("pages_total", total_pages),
                "overall_score": aggregate_scores.get("overall_score", 0),
                "grade": aggregate_scores.get("grade", "F"),
            })

            logger.info(
                "Page inspection completed: site=%s, page=%s, score=%d",
                site_inspection_id, page_url, overall_score,
            )

        except Exception as e:
            # Per-page failures are contained here so sibling pages continue.
            logger.error(
                "Page inspection failed: site=%s, page=%s, error=%s",
                site_inspection_id, page_url, str(e),
            )

            # Mark the page entry as failed.
            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.status": "error",
                    }
                },
            )

            # Publish the failure as a page_complete with score 0 / grade F.
            await self._publish_site_event(site_inspection_id, {
                "event_type": "page_complete",
                "site_inspection_id": site_inspection_id,
                "page_url": page_url,
                "page_index": page_index,
                "inspection_id": None,
                "score": 0,
                "grade": "F",
                "error": str(e)[:200],
            })
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Aggregate computation
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _compute_and_store_aggregates(self, site_inspection_id: str) -> dict:
    """
    Recompute aggregate scores across all completed page inspections.

    Loads the site document, collects the inspection IDs of pages whose
    status is "completed", fetches each page's stored inspection result,
    averages the overall and per-category scores, sums issue counts,
    persists the aggregate via _store_aggregates, and returns it.

    Returns an empty dict when the site inspection document is missing,
    and an all-zero aggregate when no page has completed yet.
    """
    site_doc = await self.db.site_inspections.find_one(
        {"site_inspection_id": site_inspection_id},
    )
    if not site_doc:
        return {}

    discovered = site_doc.get("discovered_pages", [])
    page_count = len(discovered)

    # Only pages that finished successfully contribute to the averages.
    finished_ids = [
        page["inspection_id"]
        for page in discovered
        if page.get("status") == "completed" and page.get("inspection_id")
    ]

    if not finished_ids:
        empty_aggregate = {
            "overall_score": 0,
            "grade": "F",
            "html_css": 0,
            "accessibility": 0,
            "seo": 0,
            "performance_security": 0,
            "total_issues": 0,
            "pages_inspected": 0,
            "pages_total": page_count,
        }
        await self._store_aggregates(site_inspection_id, empty_aggregate)
        return empty_aggregate

    # Project only the fields needed for aggregation.
    projection = {
        "_id": 0,
        "overall_score": 1,
        "categories.html_css.score": 1,
        "categories.accessibility.score": 1,
        "categories.seo.score": 1,
        "categories.performance_security.score": 1,
        "summary.total_issues": 1,
    }
    cursor = self.db.inspections.find(
        {"inspection_id": {"$in": finished_ids}},
        projection,
    )

    # One score bucket per category, keyed by the aggregate field name.
    category_buckets: dict[str, list[int]] = {
        "html_css": [],
        "accessibility": [],
        "seo": [],
        "performance_security": [],
    }
    overall_values: list[int] = []
    issue_count = 0

    async for result in cursor:
        overall_values.append(result.get("overall_score", 0))
        categories = result.get("categories", {})
        for category_name, bucket in category_buckets.items():
            bucket.append(categories.get(category_name, {}).get("score", 0))
        issue_count += result.get("summary", {}).get("total_issues", 0)

    def _avg(values: list[int]) -> int:
        # Rounded mean; 0 for an empty list so a missing category is safe.
        return round(sum(values) / len(values)) if values else 0

    averaged_overall = _avg(overall_values)

    aggregate = {
        "overall_score": averaged_overall,
        "grade": calculate_grade(averaged_overall),
        "html_css": _avg(category_buckets["html_css"]),
        "accessibility": _avg(category_buckets["accessibility"]),
        "seo": _avg(category_buckets["seo"]),
        "performance_security": _avg(category_buckets["performance_security"]),
        "total_issues": issue_count,
        "pages_inspected": len(overall_values),
        "pages_total": page_count,
    }

    await self._store_aggregates(site_inspection_id, aggregate)
    return aggregate
async def _store_aggregates(self, site_inspection_id: str, aggregate: dict) -> None:
    """Persist the computed aggregate scores onto the site inspection document."""
    query = {"site_inspection_id": site_inspection_id}
    update = {"$set": {"aggregate_scores": aggregate}}
    await self.db.site_inspections.update_one(query, update)
# ------------------------------------------------------------------
|
||||
# SSE event publishing
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _publish_site_event(self, site_inspection_id: str, event_data: dict) -> None:
    """Broadcast a site-inspection SSE event over the Redis Pub/Sub channel."""
    # NOTE: default=str stringifies any non-JSON-native value (e.g. datetimes)
    # rather than raising — kept as-is since subscribers rely on this payload shape.
    payload = json.dumps(event_data, ensure_ascii=False, default=str)
    await self.redis.publish(f"site-inspection:{site_inspection_id}:events", payload)
Reference in New Issue
Block a user