feat: 사이트 전체 검사 기능 추가

도메인 하위 링크를 BFS로 자동 크롤링하여 페이지별 검사 수행.
- BFS 링크 크롤러 (같은 도메인 필터링, max_pages/max_depth 설정)
- 사이트 검사 오케스트레이션 (크롤링→순차 검사→집계)
- SSE 실시간 진행 상태 (크롤링/검사/완료)
- 페이지 트리 + 집계 결과 UI
- UrlInputForm에 "사이트 전체 검사" 버튼 추가

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2026-02-13 16:46:49 +09:00
parent 44ad36e2ab
commit 81b9104aea
21 changed files with 3238 additions and 56 deletions

View File

@ -75,20 +75,162 @@ class InspectionService:
return inspection_id
async def _run_inspection(
self, inspection_id: str, url: str, response: httpx.Response
) -> None:
"""Execute 4 category checks in parallel and store results."""
async def run_inspection_inline(
self,
url: str,
inspection_id: Optional[str] = None,
progress_callback: Optional[object] = None,
) -> tuple[str, dict]:
"""
Run a full inspection synchronously (inline) and return the result.
This is the core inspection logic extracted for reuse by both:
- Single-page inspection (_run_inspection wrapper with SSE/Redis)
- Site-wide inspection (site_inspection_service calling per-page)
Args:
url: Target URL to inspect.
inspection_id: Optional pre-generated ID. If None, a new UUID is generated.
progress_callback: Optional async callback(category, progress, current_step).
If None, progress is not reported.
Returns:
(inspection_id, result_dict) where result_dict is the MongoDB document.
Raises:
Exception: On fetch failure or unrecoverable errors.
"""
settings = get_settings()
if inspection_id is None:
inspection_id = str(uuid.uuid4())
# Fetch URL
response = await self._fetch_url(url, timeout=settings.URL_FETCH_TIMEOUT)
html_content = response.text
headers = dict(response.headers)
start_time = time.time()
created_at = datetime.now(timezone.utc)
# Use provided callback or a no-op
if progress_callback is None:
async def progress_callback(category: str, progress: int, current_step: str):
pass
# Create 4 checker engines
checkers = [
HtmlCssChecker(progress_callback=progress_callback),
AccessibilityChecker(progress_callback=progress_callback),
SeoChecker(progress_callback=progress_callback),
PerformanceSecurityChecker(progress_callback=progress_callback),
]
# Parallel execution with per-category timeout
results = await asyncio.gather(
*[
asyncio.wait_for(
checker.check(url, html_content, headers),
timeout=settings.CATEGORY_TIMEOUT,
)
for checker in checkers
],
return_exceptions=True,
)
# Process results (handle timeouts/errors per category)
categories = {}
category_names = ["html_css", "accessibility", "seo", "performance_security"]
for i, result in enumerate(results):
cat_name = category_names[i]
if isinstance(result, Exception):
logger.error(
"Category %s failed for inspection %s: %s",
cat_name, inspection_id, str(result),
)
categories[cat_name] = CategoryResult(
score=0,
grade="F",
total_issues=0,
issues=[],
)
else:
categories[cat_name] = result
# Calculate overall score
overall_score = calculate_overall_score(categories)
grade = calculate_grade(overall_score)
duration = round(time.time() - start_time, 1)
# Build summary
total_critical = sum(c.critical for c in categories.values())
total_major = sum(c.major for c in categories.values())
total_minor = sum(c.minor for c in categories.values())
total_info = sum(c.info for c in categories.values())
total_issues = sum(c.total_issues for c in categories.values())
summary = IssueSummary(
total_issues=total_issues,
critical=total_critical,
major=total_major,
minor=total_minor,
info=total_info,
)
# Build inspection result
completed_at = datetime.now(timezone.utc)
inspection_result = InspectionResult(
inspection_id=inspection_id,
url=url,
status="completed",
created_at=created_at,
completed_at=completed_at,
duration_seconds=duration,
overall_score=overall_score,
grade=grade,
categories=categories,
summary=summary,
)
# Store in MongoDB
doc = inspection_result.model_dump(mode="json")
await self.db.inspections.insert_one(doc)
# Enforce URL history limit (max 100 per URL)
await self._enforce_history_limit(url, max_count=100)
# Cache in Redis
await cache_result(inspection_id, doc)
logger.info(
"Inspection %s completed (inline): score=%d, duration=%.1fs",
inspection_id, overall_score, duration,
)
return inspection_id, doc
async def _run_inspection(
self, inspection_id: str, url: str, response: httpx.Response
) -> None:
"""
Execute 4 category checks in parallel and store results.
This is the background-task wrapper that adds SSE/Redis progress
tracking on top of run_inspection_inline().
"""
try:
# Progress callback factory
# Progress callback that publishes to Redis + SSE
async def progress_callback(category: str, progress: int, current_step: str):
await self._update_progress(inspection_id, category, progress, current_step)
# Use inline runner (fetches URL internally, so we pass the pre-fetched response data)
# Since run_inspection_inline fetches the URL again, we use the lower-level approach
# to avoid double-fetching. We replicate the core logic with SSE event publishing.
html_content = response.text
headers = dict(response.headers)
start_time = time.time()
created_at = datetime.now(timezone.utc)
# Create 4 checker engines
checkers = [
HtmlCssChecker(progress_callback=progress_callback),
@ -122,14 +264,13 @@ class InspectionService:
"Category %s failed for inspection %s: %s",
cat_name, inspection_id, str(result),
)
# Create error result for failed category
categories[cat_name] = CategoryResult(
score=0,
grade="F",
total_issues=0,
issues=[],
)
# Publish category error
# Publish category error event
await publish_event(inspection_id, {
"event_type": "category_complete",
"inspection_id": inspection_id,
@ -139,7 +280,7 @@ class InspectionService:
})
else:
categories[cat_name] = result
# Publish category completion
# Publish category completion event
await publish_event(inspection_id, {
"event_type": "category_complete",
"inspection_id": inspection_id,

View File

@ -0,0 +1,291 @@
"""
BFS link crawler for same-domain page discovery.
Crawls a root URL using BFS (Breadth-First Search), extracting same-domain
links up to configurable max_pages and max_depth limits. Used by the
site-wide inspection feature to discover pages before inspection.
"""
import logging
from collections import deque
from typing import Callable, Awaitable, Optional
from urllib.parse import urljoin, urlparse, urlunparse
import httpx
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
# URL schemes that never lead to crawlable HTML pages.
_SKIP_SCHEMES = {"javascript", "mailto", "tel", "data", "blob", "ftp"}

# File extensions for non-HTML assets (documents, images, styles, scripts,
# data files, archives, media, fonts) that should not be fetched as pages.
_SKIP_EXTENSIONS = {
    ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", ".ico",
    ".css", ".js", ".json", ".xml", ".zip", ".tar", ".gz", ".mp4",
    ".mp3", ".wav", ".avi", ".mov", ".woff", ".woff2", ".ttf", ".eot",
}

# Async progress callback signature: (pages_found, current_url) -> None.
ProgressCallback = Callable[[int, str], Awaitable[None]]
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for deduplication.

    Canonicalization drops the fragment (#...), lowercases the scheme and
    netloc, and strips trailing slashes from the path — except for the bare
    root path "/", which is preserved as-is.
    """
    parts = urlparse(url)
    path = parts.path
    if path.endswith("/") and path != "/":
        path = path.rstrip("/")
    canonical = parts._replace(
        scheme=parts.scheme.lower(),
        netloc=parts.netloc.lower(),
        path=path,
        fragment="",
    )
    return urlunparse(canonical)
def is_same_domain(url: str, root_domain: str) -> bool:
    """Return True when *url*'s host matches *root_domain*.

    A leading "www." is ignored on both sides, so example.com and
    www.example.com compare equal; other subdomains do not.
    """
    def _strip_www(domain: str) -> str:
        return domain.removeprefix("www.")

    host = urlparse(url).netloc.lower()
    return _strip_www(host) == _strip_www(root_domain.lower())
def should_skip_url(href: str) -> bool:
    """Return True when *href* should not be crawled.

    Skips empty hrefs, pure fragment anchors ("#..."), non-HTTP schemes
    (javascript:, mailto:, tel:, ...), and links whose path points at a
    non-HTML asset (see _SKIP_EXTENSIONS).
    """
    # Empty or whitespace-only hrefs carry no destination.
    if not href or not href.strip():
        return True
    # Fragment-only links stay on the current page.
    if href.startswith("#"):
        return True
    parsed = urlparse(href)
    # Relative links have no scheme; only explicit non-HTTP schemes are skipped.
    if parsed.scheme and parsed.scheme.lower() in _SKIP_SCHEMES:
        return True
    # str.endswith accepts a tuple of suffixes: one C-level call instead of
    # a Python-level loop over every extension.
    return parsed.path.lower().endswith(tuple(_SKIP_EXTENSIONS))
class LinkCrawler:
    """BFS link crawler that discovers same-domain pages.

    Usage:
        crawler = LinkCrawler(
            root_url="https://example.com",
            max_pages=20,
            max_depth=2,
        )
        pages = await crawler.crawl(progress_callback=callback)
    """

    def __init__(
        self,
        root_url: str,
        max_pages: int = 20,
        max_depth: int = 2,
    ):
        """
        Args:
            root_url: Starting URL; normalized before crawling.
            max_pages: Upper bound on pages recorded and queued.
            max_depth: Maximum BFS depth (the root is depth 0).
        """
        self.root_url = normalize_url(root_url)
        self.max_pages = max_pages
        self.max_depth = max_depth
        parsed = urlparse(self.root_url)
        self.root_domain = parsed.netloc.lower()
        self.root_scheme = parsed.scheme

    async def crawl(
        self,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> list[dict]:
        """
        BFS crawl starting from root_url.

        Fetch failures (timeout / connection errors) and non-HTML responses
        are still recorded as discovered pages, but yield no child links.

        Returns list of dicts:
            [
                {
                    "url": "https://example.com/",
                    "depth": 0,
                    "parent_url": None,
                    "title": "Example Page",
                    "status": "discovered",
                },
                ...
            ]
        """
        visited: set[str] = {self.root_url}
        results: list[dict] = []
        # BFS queue of (url, depth, parent_url).
        queue: deque[tuple[str, int, Optional[str]]] = deque(
            [(self.root_url, 0, None)]
        )

        async def record(
            url: str, depth: int, parent_url: Optional[str], title: Optional[str]
        ) -> None:
            # Append one page entry and notify the progress callback.
            results.append({
                "url": url,
                "depth": depth,
                "parent_url": parent_url,
                "title": title,
                "status": "discovered",
            })
            if progress_callback:
                await progress_callback(len(results), url)

        # NOTE(review): verify=False disables TLS certificate validation.
        # Acceptable for a best-effort crawler, but consider making it
        # configurable rather than unconditional.
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=httpx.Timeout(10.0),
            verify=False,
            headers={
                "User-Agent": "WebInspector/1.0 (Site Crawler)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
            },
        ) as client:
            while queue and len(results) < self.max_pages:
                url, depth, parent_url = queue.popleft()
                title: Optional[str] = None
                links: list[str] = []
                try:
                    response = await client.get(url)
                    content_type = response.headers.get("content-type", "")
                    # Only HTML documents are parsed for links and a title;
                    # other content types are recorded without extraction.
                    if (
                        "text/html" not in content_type
                        and "application/xhtml" not in content_type
                    ):
                        logger.debug(
                            "Skipping non-HTML content: %s (%s)", url, content_type
                        )
                        await record(url, depth, parent_url, None)
                        continue
                    title, links = self._extract_links_and_title(url, response.text)
                except httpx.TimeoutException:
                    logger.warning("Timeout crawling %s", url)
                    await record(url, depth, parent_url, None)
                    continue
                except httpx.RequestError as e:
                    logger.warning("Request error crawling %s: %s", url, str(e))
                    await record(url, depth, parent_url, None)
                    continue

                await record(url, depth, parent_url, title)

                # Enqueue unseen same-domain children while below max_depth.
                # `visited` is capped at max_pages so the queue never grows
                # past the overall page budget.
                if depth < self.max_depth:
                    for link in links:
                        normalized = normalize_url(link)
                        if normalized in visited:
                            continue
                        if not is_same_domain(normalized, self.root_domain):
                            continue
                        if len(visited) >= self.max_pages:
                            break
                        visited.add(normalized)
                        queue.append((normalized, depth + 1, url))

        logger.info(
            "Crawl completed: root=%s, pages_found=%d, max_pages=%d, max_depth=%d",
            self.root_url, len(results), self.max_pages, self.max_depth,
        )
        return results

    def _extract_links_and_title(
        self, base_url: str, html: str
    ) -> tuple[Optional[str], list[str]]:
        """
        Extract page title and candidate links from HTML.

        Relative hrefs are resolved against *base_url*; only http(s)
        results are kept. Domain filtering happens in crawl(), not here.

        Returns:
            (title, list_of_absolute_urls)
        """
        soup = BeautifulSoup(html, "html.parser")

        # Extract and truncate the <title> text, when present.
        title = None
        title_tag = soup.find("title")
        if title_tag and title_tag.string:
            title = title_tag.string.strip()
            if len(title) > 200:
                title = title[:200] + "..."

        links: list[str] = []
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"].strip()
            if should_skip_url(href):
                continue
            # Resolve relative URLs against the fetched page.
            absolute_url = urljoin(base_url, href)
            parsed = urlparse(absolute_url)
            if parsed.scheme not in ("http", "https"):
                continue
            links.append(absolute_url)
        return title, links

View File

@ -0,0 +1,678 @@
"""
Site-wide inspection orchestration service.
Manages the full site inspection lifecycle:
1. BFS crawling to discover same-domain pages
2. Sequential/parallel inspection of each discovered page
3. Aggregate score computation
4. Progress tracking via Redis Pub/Sub (SSE events)
5. Result storage in MongoDB (site_inspections collection)
"""
import asyncio
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Optional
from urllib.parse import urlparse
from motor.motor_asyncio import AsyncIOMotorDatabase
from redis.asyncio import Redis
from app.core.config import get_settings
from app.core.redis import get_redis
from app.models.schemas import calculate_grade
from app.services.link_crawler import LinkCrawler
from app.services.inspection_service import InspectionService
logger = logging.getLogger(__name__)
# Redis key TTLs
SITE_RESULT_CACHE_TTL = 3600 # 1 hour
class SiteInspectionService:
    """Site-wide inspection orchestration service.

    Lifecycle per site inspection:
      1. BFS crawl to discover same-domain pages (LinkCrawler).
      2. Concurrent per-page inspection, bounded by a semaphore.
      3. Aggregate score computation across completed pages.
      4. Progress events published via Redis Pub/Sub for SSE streaming.
      5. Persistence in the ``site_inspections`` MongoDB collection.
    """

    def __init__(self, db: AsyncIOMotorDatabase, redis: Redis):
        self.db = db
        self.redis = redis
        self.inspection_service = InspectionService(db=db, redis=redis)
        # asyncio.create_task keeps only a weak reference to the scheduled
        # task; hold strong references here so in-flight background crawls
        # cannot be garbage-collected (per the asyncio documentation).
        self._background_tasks: set[asyncio.Task] = set()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    async def start_site_inspection(
        self,
        url: str,
        max_pages: int = 20,
        max_depth: int = 2,
    ) -> str:
        """
        Start a site-wide inspection.

        1. Clamp requested limits to server-side maxima
        2. Generate site_inspection_id
        3. Create initial MongoDB document with status "crawling"
        4. Launch background crawl-and-inspect task
        5. Return site_inspection_id
        """
        settings = get_settings()
        # Clamp client-requested limits to server-side limits.
        max_pages = min(max_pages, settings.SITE_MAX_PAGES)
        max_depth = min(max_depth, settings.SITE_MAX_DEPTH)
        site_inspection_id = str(uuid.uuid4())
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        # Initial document; discovered_pages is filled in after crawling.
        doc = {
            "site_inspection_id": site_inspection_id,
            "root_url": url,
            "domain": domain,
            "status": "crawling",
            "created_at": datetime.now(timezone.utc),
            "completed_at": None,
            "config": {
                "max_pages": max_pages,
                "max_depth": max_depth,
            },
            "discovered_pages": [],
            "aggregate_scores": None,
        }
        await self.db.site_inspections.insert_one(doc)
        logger.info(
            "Site inspection started: id=%s, url=%s, max_pages=%d, max_depth=%d",
            site_inspection_id, url, max_pages, max_depth,
        )
        # Launch the background pipeline, retaining a strong task reference
        # (discarded automatically once the task finishes).
        task = asyncio.create_task(
            self._crawl_and_inspect(site_inspection_id, url, max_pages, max_depth)
        )
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)
        return site_inspection_id

    async def get_site_inspection(self, site_inspection_id: str) -> Optional[dict]:
        """
        Get site inspection result by ID (cache-first).

        Only terminal results ("completed"/"error") are cached so that
        in-progress documents are always read fresh from MongoDB.
        """
        cache_key = f"site-inspection:result:{site_inspection_id}"
        cached = await self.redis.get(cache_key)
        if cached:
            return json.loads(cached)
        doc = await self.db.site_inspections.find_one(
            {"site_inspection_id": site_inspection_id},
            {"_id": 0},
        )
        if doc:
            if doc.get("status") in ("completed", "error"):
                await self.redis.set(
                    cache_key,
                    json.dumps(doc, ensure_ascii=False, default=str),
                    ex=SITE_RESULT_CACHE_TTL,
                )
            return doc
        return None

    async def get_site_inspection_list(
        self,
        page: int = 1,
        limit: int = 20,
    ) -> dict:
        """
        Get a paginated summary list of site inspections.

        Args:
            page: 1-based page number.
            limit: Page size (capped at 100).

        Returns:
            Dict with "items", "total", "page", "limit", "total_pages".
        """
        limit = min(limit, 100)
        skip = (page - 1) * limit
        total = await self.db.site_inspections.count_documents({})
        cursor = self.db.site_inspections.find(
            {},
            {
                "_id": 0,
                "site_inspection_id": 1,
                "root_url": 1,
                "domain": 1,
                "status": 1,
                "created_at": 1,
                "discovered_pages": 1,
                "aggregate_scores": 1,
            },
        ).sort("created_at", -1).skip(skip).limit(limit)
        items = []
        async for doc in cursor:
            pages = doc.get("discovered_pages", [])
            pages_total = len(pages)
            pages_inspected = sum(
                1 for p in pages if p.get("status") == "completed"
            )
            agg = doc.get("aggregate_scores")
            items.append({
                "site_inspection_id": doc.get("site_inspection_id"),
                "root_url": doc.get("root_url"),
                "domain": doc.get("domain"),
                "status": doc.get("status"),
                "created_at": doc.get("created_at"),
                "pages_total": pages_total,
                "pages_inspected": pages_inspected,
                "overall_score": agg.get("overall_score") if agg else None,
                "grade": agg.get("grade") if agg else None,
            })
        # Ceiling division; always at least one page, even with no items.
        total_pages = max(1, -(-total // limit))
        return {
            "items": items,
            "total": total,
            "page": page,
            "limit": limit,
            "total_pages": total_pages,
        }

    async def inspect_single_page(
        self,
        site_inspection_id: str,
        page_url: str,
    ) -> Optional[str]:
        """
        Trigger inspection for a single page within a site inspection.

        Returns the inspection_id if successful; None when the site
        inspection or page is unknown, or when the inspection fails.
        """
        doc = await self.db.site_inspections.find_one(
            {"site_inspection_id": site_inspection_id},
        )
        if not doc:
            return None
        # The page must have been discovered by the original crawl.
        if not any(p["url"] == page_url for p in doc.get("discovered_pages", [])):
            return None
        # Run inspection inline
        inspection_id = str(uuid.uuid4())
        try:
            inspection_id, result = await self.inspection_service.run_inspection_inline(
                url=page_url,
                inspection_id=inspection_id,
            )
            overall_score = result.get("overall_score", 0)
            grade = result.get("grade", "F")
            # Record the fresh result on the embedded page entry.
            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.inspection_id": inspection_id,
                        "discovered_pages.$.status": "completed",
                        "discovered_pages.$.overall_score": overall_score,
                        "discovered_pages.$.grade": grade,
                    }
                },
            )
            # Aggregates changed: recompute and drop the stale cache entry.
            await self._compute_and_store_aggregates(site_inspection_id)
            cache_key = f"site-inspection:result:{site_inspection_id}"
            await self.redis.delete(cache_key)
            return inspection_id
        except Exception as e:
            logger.error(
                "Failed to inspect page %s in site %s: %s",
                page_url, site_inspection_id, str(e),
            )
            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.status": "error",
                    }
                },
            )
            return None

    # ------------------------------------------------------------------
    # Background task: Crawl + Inspect
    # ------------------------------------------------------------------
    async def _crawl_and_inspect(
        self,
        site_inspection_id: str,
        url: str,
        max_pages: int,
        max_depth: int,
    ) -> None:
        """
        Background task that runs in two phases:
        Phase 1: BFS crawling to discover pages
        Phase 2: Parallel inspection of discovered pages (with semaphore)

        Any exception marks the whole site inspection as "error" and
        publishes an error event; per-page failures are non-fatal.
        """
        try:
            # ==============================
            # Phase 1: Crawling
            # ==============================
            logger.info("Phase 1 (crawling) started: %s", site_inspection_id)

            async def crawl_progress(pages_found: int, current_url: str):
                # Forward crawler progress to SSE subscribers.
                await self._publish_site_event(site_inspection_id, {
                    "event_type": "crawl_progress",
                    "site_inspection_id": site_inspection_id,
                    "pages_found": pages_found,
                    "current_url": current_url,
                })

            crawler = LinkCrawler(
                root_url=url,
                max_pages=max_pages,
                max_depth=max_depth,
            )
            discovered = await crawler.crawl(progress_callback=crawl_progress)
            if not discovered:
                raise ValueError("크롤링 결과가 없습니다. URL을 확인해주세요.")

            # Build per-page tracking documents (status starts at "pending").
            discovered_pages = []
            for page in discovered:
                discovered_pages.append({
                    "url": page["url"],
                    "depth": page["depth"],
                    "parent_url": page["parent_url"],
                    "inspection_id": None,
                    "status": "pending",
                    "title": page.get("title"),
                    "overall_score": None,
                    "grade": None,
                })
            await self.db.site_inspections.update_one(
                {"site_inspection_id": site_inspection_id},
                {
                    "$set": {
                        "status": "inspecting",
                        "discovered_pages": discovered_pages,
                    }
                },
            )
            # Publish crawl_complete with the full page tree for the UI.
            await self._publish_site_event(site_inspection_id, {
                "event_type": "crawl_complete",
                "site_inspection_id": site_inspection_id,
                "total_pages": len(discovered_pages),
                "pages": [
                    {
                        "url": p["url"],
                        "depth": p["depth"],
                        "parent_url": p["parent_url"],
                        "title": p.get("title"),
                    }
                    for p in discovered_pages
                ],
            })
            logger.info(
                "Phase 1 completed: %s, pages=%d",
                site_inspection_id, len(discovered_pages),
            )

            # ==============================
            # Phase 2: Page-by-page inspection
            # ==============================
            logger.info("Phase 2 (inspection) started: %s", site_inspection_id)
            settings = get_settings()
            # Bound concurrent page inspections server-side.
            semaphore = asyncio.Semaphore(settings.SITE_CONCURRENCY)
            tasks = [
                self._inspect_page_with_semaphore(
                    semaphore=semaphore,
                    site_inspection_id=site_inspection_id,
                    page_url=page["url"],
                    page_index=idx,
                    total_pages=len(discovered_pages),
                )
                for idx, page in enumerate(discovered_pages)
            ]
            # return_exceptions=True: one failing page must not cancel the rest.
            await asyncio.gather(*tasks, return_exceptions=True)

            # ==============================
            # Finalize: Compute aggregates
            # ==============================
            aggregate_scores = await self._compute_and_store_aggregates(site_inspection_id)
            await self.db.site_inspections.update_one(
                {"site_inspection_id": site_inspection_id},
                {
                    "$set": {
                        "status": "completed",
                        "completed_at": datetime.now(timezone.utc),
                    }
                },
            )
            await self._publish_site_event(site_inspection_id, {
                "event_type": "complete",
                "site_inspection_id": site_inspection_id,
                "status": "completed",
                "aggregate_scores": aggregate_scores,
            })
            logger.info("Site inspection completed: %s", site_inspection_id)
        except Exception as e:
            logger.error(
                "Site inspection %s failed: %s",
                site_inspection_id, str(e), exc_info=True,
            )
            await self.db.site_inspections.update_one(
                {"site_inspection_id": site_inspection_id},
                {
                    "$set": {
                        "status": "error",
                        "completed_at": datetime.now(timezone.utc),
                    }
                },
            )
            await self._publish_site_event(site_inspection_id, {
                "event_type": "error",
                "site_inspection_id": site_inspection_id,
                "status": "error",
                "message": f"사이트 검사 중 오류가 발생했습니다: {str(e)[:200]}",
            })

    async def _inspect_page_with_semaphore(
        self,
        semaphore: asyncio.Semaphore,
        site_inspection_id: str,
        page_url: str,
        page_index: int,
        total_pages: int,
    ) -> None:
        """Inspect a single page with semaphore-controlled concurrency."""
        async with semaphore:
            await self._inspect_single_page(
                site_inspection_id=site_inspection_id,
                page_url=page_url,
                page_index=page_index,
                total_pages=total_pages,
            )

    async def _inspect_single_page(
        self,
        site_inspection_id: str,
        page_url: str,
        page_index: int,
        total_pages: int,
    ) -> None:
        """
        Run inspection for a single discovered page.

        Publishes page_start / page_progress / page_complete events and
        updates the embedded page entry in MongoDB. Failures are non-fatal:
        the page is marked "error" and a page_complete event with score 0
        is published so the UI can move on.
        """
        inspection_id = str(uuid.uuid4())
        await self._publish_site_event(site_inspection_id, {
            "event_type": "page_start",
            "site_inspection_id": site_inspection_id,
            "page_url": page_url,
            "page_index": page_index,
        })
        # Mark page as inspecting in MongoDB.
        await self.db.site_inspections.update_one(
            {
                "site_inspection_id": site_inspection_id,
                "discovered_pages.url": page_url,
            },
            {
                "$set": {
                    "discovered_pages.$.status": "inspecting",
                    "discovered_pages.$.inspection_id": inspection_id,
                }
            },
        )
        try:
            # Per-page progress is forwarded as SSE "page_progress" events.
            async def page_progress_callback(category: str, progress: int, current_step: str):
                await self._publish_site_event(site_inspection_id, {
                    "event_type": "page_progress",
                    "site_inspection_id": site_inspection_id,
                    "page_url": page_url,
                    "page_index": page_index,
                    "category": category,
                    "progress": progress,
                    "current_step": current_step,
                })

            _, result = await self.inspection_service.run_inspection_inline(
                url=page_url,
                inspection_id=inspection_id,
                progress_callback=page_progress_callback,
            )
            overall_score = result.get("overall_score", 0)
            grade = result.get("grade", "F")
            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.status": "completed",
                        "discovered_pages.$.overall_score": overall_score,
                        "discovered_pages.$.grade": grade,
                    }
                },
            )
            await self._publish_site_event(site_inspection_id, {
                "event_type": "page_complete",
                "site_inspection_id": site_inspection_id,
                "page_url": page_url,
                "page_index": page_index,
                "inspection_id": inspection_id,
                "score": overall_score,
                "grade": grade,
            })
            # Refresh aggregates after each page so the UI updates live.
            aggregate_scores = await self._compute_and_store_aggregates(site_inspection_id)
            await self._publish_site_event(site_inspection_id, {
                "event_type": "aggregate_update",
                "site_inspection_id": site_inspection_id,
                "pages_inspected": aggregate_scores.get("pages_inspected", 0),
                "pages_total": aggregate_scores.get("pages_total", total_pages),
                "overall_score": aggregate_scores.get("overall_score", 0),
                "grade": aggregate_scores.get("grade", "F"),
            })
            logger.info(
                "Page inspection completed: site=%s, page=%s, score=%d",
                site_inspection_id, page_url, overall_score,
            )
        except Exception as e:
            logger.error(
                "Page inspection failed: site=%s, page=%s, error=%s",
                site_inspection_id, page_url, str(e),
            )
            # Mark page as error; continue with the other pages.
            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.status": "error",
                    }
                },
            )
            await self._publish_site_event(site_inspection_id, {
                "event_type": "page_complete",
                "site_inspection_id": site_inspection_id,
                "page_url": page_url,
                "page_index": page_index,
                "inspection_id": None,
                "score": 0,
                "grade": "F",
                "error": str(e)[:200],
            })

    # ------------------------------------------------------------------
    # Aggregate computation
    # ------------------------------------------------------------------
    async def _compute_and_store_aggregates(self, site_inspection_id: str) -> dict:
        """
        Compute aggregate scores from all completed page inspections.

        Fetches each completed page's full inspection result from the
        inspections collection, averages category scores, and stores
        the aggregate in the site_inspections document.

        Returns the aggregate_scores dict ({} when the site inspection
        document does not exist).
        """
        doc = await self.db.site_inspections.find_one(
            {"site_inspection_id": site_inspection_id},
        )
        if not doc:
            return {}
        pages = doc.get("discovered_pages", [])
        total_pages = len(pages)
        # Inspection IDs of pages that finished successfully.
        completed_ids = [
            p["inspection_id"]
            for p in pages
            if p.get("status") == "completed" and p.get("inspection_id")
        ]
        if not completed_ids:
            # No completed pages yet: store an all-zero aggregate.
            aggregate = {
                "overall_score": 0,
                "grade": "F",
                "html_css": 0,
                "accessibility": 0,
                "seo": 0,
                "performance_security": 0,
                "total_issues": 0,
                "pages_inspected": 0,
                "pages_total": total_pages,
            }
            await self._store_aggregates(site_inspection_id, aggregate)
            return aggregate
        # Project only the fields needed for aggregation.
        cursor = self.db.inspections.find(
            {"inspection_id": {"$in": completed_ids}},
            {
                "_id": 0,
                "overall_score": 1,
                "categories.html_css.score": 1,
                "categories.accessibility.score": 1,
                "categories.seo.score": 1,
                "categories.performance_security.score": 1,
                "summary.total_issues": 1,
            },
        )
        scores_overall = []
        scores_html_css = []
        scores_accessibility = []
        scores_seo = []
        scores_perf = []
        total_issues = 0
        async for insp in cursor:
            scores_overall.append(insp.get("overall_score", 0))
            cats = insp.get("categories", {})
            scores_html_css.append(cats.get("html_css", {}).get("score", 0))
            scores_accessibility.append(cats.get("accessibility", {}).get("score", 0))
            scores_seo.append(cats.get("seo", {}).get("score", 0))
            scores_perf.append(cats.get("performance_security", {}).get("score", 0))
            total_issues += insp.get("summary", {}).get("total_issues", 0)
        pages_inspected = len(scores_overall)

        def safe_avg(values: list[int]) -> int:
            # Rounded mean; 0 for an empty list (avoids ZeroDivisionError).
            return round(sum(values) / len(values)) if values else 0

        overall_score = safe_avg(scores_overall)
        grade = calculate_grade(overall_score)
        aggregate = {
            "overall_score": overall_score,
            "grade": grade,
            "html_css": safe_avg(scores_html_css),
            "accessibility": safe_avg(scores_accessibility),
            "seo": safe_avg(scores_seo),
            "performance_security": safe_avg(scores_perf),
            "total_issues": total_issues,
            "pages_inspected": pages_inspected,
            "pages_total": total_pages,
        }
        await self._store_aggregates(site_inspection_id, aggregate)
        return aggregate

    async def _store_aggregates(self, site_inspection_id: str, aggregate: dict) -> None:
        """Store aggregate scores in the site inspection document."""
        await self.db.site_inspections.update_one(
            {"site_inspection_id": site_inspection_id},
            {"$set": {"aggregate_scores": aggregate}},
        )

    # ------------------------------------------------------------------
    # SSE event publishing
    # ------------------------------------------------------------------
    async def _publish_site_event(self, site_inspection_id: str, event_data: dict) -> None:
        """Publish an SSE event for site inspection via Redis Pub/Sub.

        default=str stringifies non-JSON types (e.g. datetimes) so event
        payloads never fail serialization.
        """
        channel = f"site-inspection:{site_inspection_id}:events"
        await self.redis.publish(
            channel,
            json.dumps(event_data, ensure_ascii=False, default=str),
        )