feat: 사이트 전체 검사 기능 추가
도메인 하위 링크를 BFS로 자동 크롤링하여 페이지별 검사 수행. - BFS 링크 크롤러 (같은 도메인 필터링, max_pages/max_depth 설정) - 사이트 검사 오케스트레이션 (크롤링→순차 검사→집계) - SSE 실시간 진행 상태 (크롤링/검사/완료) - 페이지 트리 + 집계 결과 UI - UrlInputForm에 "사이트 전체 검사" 버튼 추가 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -75,20 +75,162 @@ class InspectionService:
|
||||
|
||||
return inspection_id
|
||||
|
||||
async def _run_inspection(
|
||||
self, inspection_id: str, url: str, response: httpx.Response
|
||||
) -> None:
|
||||
"""Execute 4 category checks in parallel and store results."""
|
||||
    async def run_inspection_inline(
        self,
        url: str,
        inspection_id: Optional[str] = None,
        progress_callback: Optional[object] = None,
    ) -> tuple[str, dict]:
        """
        Run a full inspection synchronously (inline) and return the result.

        This is the core inspection logic extracted for reuse by both:
        - Single-page inspection (_run_inspection wrapper with SSE/Redis)
        - Site-wide inspection (site_inspection_service calling per-page)

        Args:
            url: Target URL to inspect.
            inspection_id: Optional pre-generated ID. If None, a new UUID is generated.
            progress_callback: Optional async callback(category, progress, current_step).
                If None, progress is not reported.

        Returns:
            (inspection_id, result_dict) where result_dict is the MongoDB document
            (the JSON-mode dump of InspectionResult).

        Raises:
            Exception: On fetch failure or unrecoverable errors. Note that a
                single failed/timed-out category does NOT raise — it is recorded
                as a score-0 "F" category and the inspection still completes.
        """
        settings = get_settings()

        if inspection_id is None:
            inspection_id = str(uuid.uuid4())

        # Fetch the target page; any fetch error propagates to the caller.
        response = await self._fetch_url(url, timeout=settings.URL_FETCH_TIMEOUT)
        html_content = response.text
        headers = dict(response.headers)
        start_time = time.time()
        created_at = datetime.now(timezone.utc)

        # Substitute a no-op callback so checkers can call it unconditionally.
        if progress_callback is None:
            async def progress_callback(category: str, progress: int, current_step: str):
                pass

        # One checker engine per category; order matches category_names below.
        checkers = [
            HtmlCssChecker(progress_callback=progress_callback),
            AccessibilityChecker(progress_callback=progress_callback),
            SeoChecker(progress_callback=progress_callback),
            PerformanceSecurityChecker(progress_callback=progress_callback),
        ]

        # Run all four categories concurrently, each with its own timeout.
        # return_exceptions=True keeps one failure from cancelling the rest.
        results = await asyncio.gather(
            *[
                asyncio.wait_for(
                    checker.check(url, html_content, headers),
                    timeout=settings.CATEGORY_TIMEOUT,
                )
                for checker in checkers
            ],
            return_exceptions=True,
        )

        # Map results back to category names; a raised exception (including
        # asyncio.TimeoutError) becomes a zero-score "F" placeholder result.
        categories = {}
        category_names = ["html_css", "accessibility", "seo", "performance_security"]

        for i, result in enumerate(results):
            cat_name = category_names[i]
            if isinstance(result, Exception):
                logger.error(
                    "Category %s failed for inspection %s: %s",
                    cat_name, inspection_id, str(result),
                )
                categories[cat_name] = CategoryResult(
                    score=0,
                    grade="F",
                    total_issues=0,
                    issues=[],
                )
            else:
                categories[cat_name] = result

        # Derive the overall score/grade and wall-clock duration (seconds).
        overall_score = calculate_overall_score(categories)
        grade = calculate_grade(overall_score)
        duration = round(time.time() - start_time, 1)

        # Aggregate per-severity issue counts across all categories.
        total_critical = sum(c.critical for c in categories.values())
        total_major = sum(c.major for c in categories.values())
        total_minor = sum(c.minor for c in categories.values())
        total_info = sum(c.info for c in categories.values())
        total_issues = sum(c.total_issues for c in categories.values())

        summary = IssueSummary(
            total_issues=total_issues,
            critical=total_critical,
            major=total_major,
            minor=total_minor,
            info=total_info,
        )

        # Assemble the final result document.
        completed_at = datetime.now(timezone.utc)
        inspection_result = InspectionResult(
            inspection_id=inspection_id,
            url=url,
            status="completed",
            created_at=created_at,
            completed_at=completed_at,
            duration_seconds=duration,
            overall_score=overall_score,
            grade=grade,
            categories=categories,
            summary=summary,
        )

        # Persist to MongoDB (JSON-mode dump so datetimes etc. are serializable).
        doc = inspection_result.model_dump(mode="json")
        await self.db.inspections.insert_one(doc)

        # Keep at most 100 stored inspections per URL.
        await self._enforce_history_limit(url, max_count=100)

        # Warm the Redis cache for subsequent result reads.
        await cache_result(inspection_id, doc)

        logger.info(
            "Inspection %s completed (inline): score=%d, duration=%.1fs",
            inspection_id, overall_score, duration,
        )

        return inspection_id, doc
|
||||
|
||||
async def _run_inspection(
|
||||
self, inspection_id: str, url: str, response: httpx.Response
|
||||
) -> None:
|
||||
"""
|
||||
Execute 4 category checks in parallel and store results.
|
||||
|
||||
This is the background-task wrapper that adds SSE/Redis progress
|
||||
tracking on top of run_inspection_inline().
|
||||
"""
|
||||
try:
|
||||
# Progress callback factory
|
||||
# Progress callback that publishes to Redis + SSE
|
||||
async def progress_callback(category: str, progress: int, current_step: str):
|
||||
await self._update_progress(inspection_id, category, progress, current_step)
|
||||
|
||||
# Use inline runner (fetches URL internally, so we pass the pre-fetched response data)
|
||||
# Since run_inspection_inline fetches the URL again, we use the lower-level approach
|
||||
# to avoid double-fetching. We replicate the core logic with SSE event publishing.
|
||||
html_content = response.text
|
||||
headers = dict(response.headers)
|
||||
start_time = time.time()
|
||||
created_at = datetime.now(timezone.utc)
|
||||
|
||||
# Create 4 checker engines
|
||||
checkers = [
|
||||
HtmlCssChecker(progress_callback=progress_callback),
|
||||
@ -122,14 +264,13 @@ class InspectionService:
|
||||
"Category %s failed for inspection %s: %s",
|
||||
cat_name, inspection_id, str(result),
|
||||
)
|
||||
# Create error result for failed category
|
||||
categories[cat_name] = CategoryResult(
|
||||
score=0,
|
||||
grade="F",
|
||||
total_issues=0,
|
||||
issues=[],
|
||||
)
|
||||
# Publish category error
|
||||
# Publish category error event
|
||||
await publish_event(inspection_id, {
|
||||
"event_type": "category_complete",
|
||||
"inspection_id": inspection_id,
|
||||
@ -139,7 +280,7 @@ class InspectionService:
|
||||
})
|
||||
else:
|
||||
categories[cat_name] = result
|
||||
# Publish category completion
|
||||
# Publish category completion event
|
||||
await publish_event(inspection_id, {
|
||||
"event_type": "category_complete",
|
||||
"inspection_id": inspection_id,
|
||||
|
||||
291
backend/app/services/link_crawler.py
Normal file
291
backend/app/services/link_crawler.py
Normal file
@ -0,0 +1,291 @@
|
||||
"""
|
||||
BFS link crawler for same-domain page discovery.
|
||||
|
||||
Crawls a root URL using BFS (Breadth-First Search), extracting same-domain
|
||||
links up to configurable max_pages and max_depth limits. Used by the
|
||||
site-wide inspection feature to discover pages before inspection.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import deque
|
||||
from typing import Callable, Awaitable, Optional
|
||||
from urllib.parse import urljoin, urlparse, urlunparse
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Schemes to skip when extracting links
|
||||
_SKIP_SCHEMES = {"javascript", "mailto", "tel", "data", "blob", "ftp"}
|
||||
|
||||
# File extensions that are not HTML pages
|
||||
_SKIP_EXTENSIONS = {
|
||||
".pdf", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", ".ico",
|
||||
".css", ".js", ".json", ".xml", ".zip", ".tar", ".gz", ".mp4",
|
||||
".mp3", ".wav", ".avi", ".mov", ".woff", ".woff2", ".ttf", ".eot",
|
||||
}
|
||||
|
||||
# Type alias for progress callback: (pages_found, current_url) -> None
|
||||
ProgressCallback = Callable[[int, str], Awaitable[None]]
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
    """
    Canonicalize *url* so duplicate pages compare equal:

    - strip the fragment (``#...``)
    - lowercase the scheme and the network location
    - drop a trailing slash from the path (the bare root "/" is kept)
    """
    parts = urlparse(url)._replace(fragment="")

    # Scheme and host are case-insensitive; the path is not, so leave it alone.
    parts = parts._replace(
        scheme=parts.scheme.lower(),
        netloc=parts.netloc.lower(),
    )

    trimmed = parts.path
    if trimmed.endswith("/") and trimmed != "/":
        trimmed = trimmed.rstrip("/")

    return urlunparse(parts._replace(path=trimmed))
|
||||
|
||||
|
||||
def is_same_domain(url: str, root_domain: str) -> bool:
    """Return True when *url*'s host matches *root_domain* (case- and www-insensitive)."""
    host = urlparse(url).netloc.lower()

    # "example.com" and "www.example.com" are treated as the same site.
    return host.removeprefix("www.") == root_domain.lower().removeprefix("www.")
|
||||
|
||||
|
||||
def should_skip_url(href: str) -> bool:
    """Return True for hrefs that cannot lead to a crawlable HTML page."""
    # Empty / whitespace-only hrefs carry no destination.
    if not href or not href.strip():
        return True

    # Pure in-page anchors stay on the current page.
    if href.startswith("#"):
        return True

    parsed = urlparse(href)

    # Non-navigable schemes (javascript:, mailto:, tel:, ...).
    if parsed.scheme and parsed.scheme.lower() in _SKIP_SCHEMES:
        return True

    # Static assets (images, scripts, archives, fonts, ...) are not pages.
    lowered_path = parsed.path.lower()
    return any(lowered_path.endswith(ext) for ext in _SKIP_EXTENSIONS)
|
||||
|
||||
|
||||
class LinkCrawler:
    """
    BFS link crawler that discovers same-domain pages.

    Usage:
        crawler = LinkCrawler(
            root_url="https://example.com",
            max_pages=20,
            max_depth=2,
        )
        pages = await crawler.crawl(progress_callback=callback)
    """

    def __init__(
        self,
        root_url: str,
        max_pages: int = 20,
        max_depth: int = 2,
    ):
        # Normalize up front so the root participates in dedup like any link.
        self.root_url = normalize_url(root_url)
        self.max_pages = max_pages
        self.max_depth = max_depth

        parsed = urlparse(self.root_url)
        # Domain/scheme of the root; domain drives the same-site filter.
        self.root_domain = parsed.netloc.lower()
        self.root_scheme = parsed.scheme

    async def crawl(
        self,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> list[dict]:
        """
        BFS crawl starting from root_url.

        Fetch errors and non-HTML responses are not fatal: the page is still
        recorded (title=None) but contributes no child links.

        Returns list of dicts:
            [
                {
                    "url": "https://example.com/",
                    "depth": 0,
                    "parent_url": None,
                    "title": "Example Page",
                    "status": "discovered",
                },
                ...
            ]
        """
        visited: set[str] = set()
        results: list[dict] = []

        # BFS queue: (url, depth, parent_url). `visited` tracks enqueued URLs
        # (not just fetched ones) so each URL is queued at most once.
        queue: deque[tuple[str, int, Optional[str]]] = deque()
        queue.append((self.root_url, 0, None))
        visited.add(self.root_url)

        # NOTE(review): verify=False disables TLS certificate verification —
        # presumably deliberate to crawl sites with bad certs; confirm this is
        # acceptable for the deployment.
        async with httpx.AsyncClient(
            follow_redirects=True,
            timeout=httpx.Timeout(10.0),
            verify=False,
            headers={
                "User-Agent": "WebInspector/1.0 (Site Crawler)",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
            },
        ) as client:
            while queue and len(results) < self.max_pages:
                url, depth, parent_url = queue.popleft()

                # Per-page state; reset for each URL pulled off the queue.
                title = None
                # NOTE(review): status_code is captured but never used or
                # reported — consider recording it in the result or dropping it.
                status_code = None
                links: list[str] = []

                try:
                    response = await client.get(url)
                    status_code = response.status_code

                    # Only parse HTML; other content types are recorded but
                    # contribute no links.
                    content_type = response.headers.get("content-type", "")
                    if "text/html" not in content_type and "application/xhtml" not in content_type:
                        logger.debug("Skipping non-HTML content: %s (%s)", url, content_type)
                        # Still record it but don't extract links
                        results.append({
                            "url": url,
                            "depth": depth,
                            "parent_url": parent_url,
                            "title": None,
                            "status": "discovered",
                        })
                        if progress_callback:
                            await progress_callback(len(results), url)
                        continue

                    html = response.text
                    title, links = self._extract_links_and_title(url, html)

                except httpx.TimeoutException:
                    # Timeouts are non-fatal: record the page without a title.
                    logger.warning("Timeout crawling %s", url)
                    results.append({
                        "url": url,
                        "depth": depth,
                        "parent_url": parent_url,
                        "title": None,
                        "status": "discovered",
                    })
                    if progress_callback:
                        await progress_callback(len(results), url)
                    continue

                except httpx.RequestError as e:
                    # Connection/DNS/protocol errors are likewise non-fatal.
                    logger.warning("Request error crawling %s: %s", url, str(e))
                    results.append({
                        "url": url,
                        "depth": depth,
                        "parent_url": parent_url,
                        "title": None,
                        "status": "discovered",
                    })
                    if progress_callback:
                        await progress_callback(len(results), url)
                    continue

                # Successful HTML fetch: record the page with its title.
                results.append({
                    "url": url,
                    "depth": depth,
                    "parent_url": parent_url,
                    "title": title,
                    "status": "discovered",
                })

                # Report progress after each recorded page.
                if progress_callback:
                    await progress_callback(len(results), url)

                # Enqueue children only while below the depth limit.
                if depth < self.max_depth:
                    for link in links:
                        normalized = normalize_url(link)

                        if normalized in visited:
                            continue

                        if not is_same_domain(normalized, self.root_domain):
                            continue

                        # Cap the number of *enqueued* URLs at max_pages; the
                        # while-loop above independently caps recorded results.
                        if len(visited) >= self.max_pages:
                            break

                        visited.add(normalized)
                        queue.append((normalized, depth + 1, url))

        logger.info(
            "Crawl completed: root=%s, pages_found=%d, max_pages=%d, max_depth=%d",
            self.root_url, len(results), self.max_pages, self.max_depth,
        )

        return results

    def _extract_links_and_title(
        self, base_url: str, html: str
    ) -> tuple[Optional[str], list[str]]:
        """
        Extract the page title and all HTTP(S) anchor links from HTML.

        Relative hrefs are resolved against *base_url*. Domain filtering is
        NOT done here — the caller (crawl) applies is_same_domain().

        Returns:
            (title, list_of_absolute_urls)
        """
        soup = BeautifulSoup(html, "html.parser")

        # <title> text, truncated to keep stored documents small.
        title = None
        title_tag = soup.find("title")
        if title_tag and title_tag.string:
            title = title_tag.string.strip()
            # Truncate very long titles
            if len(title) > 200:
                title = title[:200] + "..."

        # Collect candidate links from every <a href=...>.
        links: list[str] = []
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"].strip()

            # Drop anchors, non-navigable schemes, and static-asset extensions.
            if should_skip_url(href):
                continue

            # Resolve relative URLs against the fetched page's URL.
            absolute_url = urljoin(base_url, href)

            # Keep only http/https after resolution.
            parsed = urlparse(absolute_url)
            if parsed.scheme not in ("http", "https"):
                continue

            links.append(absolute_url)

        return title, links
|
||||
678
backend/app/services/site_inspection_service.py
Normal file
678
backend/app/services/site_inspection_service.py
Normal file
@ -0,0 +1,678 @@
|
||||
"""
|
||||
Site-wide inspection orchestration service.
|
||||
|
||||
Manages the full site inspection lifecycle:
|
||||
1. BFS crawling to discover same-domain pages
|
||||
2. Sequential/parallel inspection of each discovered page
|
||||
3. Aggregate score computation
|
||||
4. Progress tracking via Redis Pub/Sub (SSE events)
|
||||
5. Result storage in MongoDB (site_inspections collection)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorDatabase
|
||||
from redis.asyncio import Redis
|
||||
|
||||
from app.core.config import get_settings
|
||||
from app.core.redis import get_redis
|
||||
from app.models.schemas import calculate_grade
|
||||
from app.services.link_crawler import LinkCrawler
|
||||
from app.services.inspection_service import InspectionService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Redis key TTLs
|
||||
SITE_RESULT_CACHE_TTL = 3600 # 1 hour
|
||||
|
||||
|
||||
class SiteInspectionService:
|
||||
"""Site-wide inspection orchestration service."""
|
||||
|
||||
def __init__(self, db: AsyncIOMotorDatabase, redis: Redis):
|
||||
self.db = db
|
||||
self.redis = redis
|
||||
self.inspection_service = InspectionService(db=db, redis=redis)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def start_site_inspection(
        self,
        url: str,
        max_pages: int = 20,
        max_depth: int = 2,
    ) -> str:
        """
        Start a site-wide inspection.

        1. Validate URL
        2. Generate site_inspection_id
        3. Create initial MongoDB document with status "crawling"
        4. Launch background crawl-and-inspect task
        5. Return site_inspection_id

        Args:
            url: Root URL to crawl and inspect.
            max_pages: Requested page cap (clamped to settings.SITE_MAX_PAGES).
            max_depth: Requested BFS depth cap (clamped to settings.SITE_MAX_DEPTH).

        Returns:
            The generated site_inspection_id; progress is delivered
            asynchronously via SSE events and the stored document.
        """
        settings = get_settings()

        # Clamp client-requested limits to the server-side maximums.
        max_pages = min(max_pages, settings.SITE_MAX_PAGES)
        max_depth = min(max_depth, settings.SITE_MAX_DEPTH)

        site_inspection_id = str(uuid.uuid4())
        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        # Initial document: no pages or aggregates yet, status "crawling".
        doc = {
            "site_inspection_id": site_inspection_id,
            "root_url": url,
            "domain": domain,
            "status": "crawling",
            "created_at": datetime.now(timezone.utc),
            "completed_at": None,
            "config": {
                "max_pages": max_pages,
                "max_depth": max_depth,
            },
            "discovered_pages": [],
            "aggregate_scores": None,
        }
        await self.db.site_inspections.insert_one(doc)

        logger.info(
            "Site inspection started: id=%s, url=%s, max_pages=%d, max_depth=%d",
            site_inspection_id, url, max_pages, max_depth,
        )

        # Fire-and-forget background task.
        # NOTE(review): the Task reference is not retained; per asyncio docs a
        # task only weakly referenced by the loop can be garbage-collected
        # mid-flight — confirm a reference is kept, or store it on self.
        asyncio.create_task(
            self._crawl_and_inspect(site_inspection_id, url, max_pages, max_depth)
        )

        return site_inspection_id
|
||||
|
||||
async def get_site_inspection(self, site_inspection_id: str) -> Optional[dict]:
|
||||
"""Get site inspection result by ID (cache-first)."""
|
||||
# Try Redis cache first
|
||||
cache_key = f"site-inspection:result:{site_inspection_id}"
|
||||
cached = await self.redis.get(cache_key)
|
||||
if cached:
|
||||
return json.loads(cached)
|
||||
|
||||
# Fetch from MongoDB
|
||||
doc = await self.db.site_inspections.find_one(
|
||||
{"site_inspection_id": site_inspection_id},
|
||||
{"_id": 0},
|
||||
)
|
||||
if doc:
|
||||
# Only cache completed results
|
||||
if doc.get("status") in ("completed", "error"):
|
||||
await self.redis.set(
|
||||
cache_key,
|
||||
json.dumps(doc, ensure_ascii=False, default=str),
|
||||
ex=SITE_RESULT_CACHE_TTL,
|
||||
)
|
||||
return doc
|
||||
return None
|
||||
|
||||
async def get_site_inspection_list(
|
||||
self,
|
||||
page: int = 1,
|
||||
limit: int = 20,
|
||||
) -> dict:
|
||||
"""Get paginated list of site inspections."""
|
||||
limit = min(limit, 100)
|
||||
skip = (page - 1) * limit
|
||||
|
||||
total = await self.db.site_inspections.count_documents({})
|
||||
|
||||
cursor = self.db.site_inspections.find(
|
||||
{},
|
||||
{
|
||||
"_id": 0,
|
||||
"site_inspection_id": 1,
|
||||
"root_url": 1,
|
||||
"domain": 1,
|
||||
"status": 1,
|
||||
"created_at": 1,
|
||||
"discovered_pages": 1,
|
||||
"aggregate_scores": 1,
|
||||
},
|
||||
).sort("created_at", -1).skip(skip).limit(limit)
|
||||
|
||||
items = []
|
||||
async for doc in cursor:
|
||||
pages = doc.get("discovered_pages", [])
|
||||
pages_total = len(pages)
|
||||
pages_inspected = sum(
|
||||
1 for p in pages if p.get("status") == "completed"
|
||||
)
|
||||
agg = doc.get("aggregate_scores")
|
||||
|
||||
items.append({
|
||||
"site_inspection_id": doc.get("site_inspection_id"),
|
||||
"root_url": doc.get("root_url"),
|
||||
"domain": doc.get("domain"),
|
||||
"status": doc.get("status"),
|
||||
"created_at": doc.get("created_at"),
|
||||
"pages_total": pages_total,
|
||||
"pages_inspected": pages_inspected,
|
||||
"overall_score": agg.get("overall_score") if agg else None,
|
||||
"grade": agg.get("grade") if agg else None,
|
||||
})
|
||||
|
||||
total_pages = max(1, -(-total // limit))
|
||||
|
||||
return {
|
||||
"items": items,
|
||||
"total": total,
|
||||
"page": page,
|
||||
"limit": limit,
|
||||
"total_pages": total_pages,
|
||||
}
|
||||
|
||||
    async def inspect_single_page(
        self,
        site_inspection_id: str,
        page_url: str,
    ) -> Optional[str]:
        """
        Trigger inspection for a single page within a site inspection.

        Used to (re-)inspect one discovered page on demand: runs the inline
        inspection, updates the page's entry in the site document, recomputes
        aggregates, and invalidates the cached site result.

        Returns:
            The inspection_id on success; None when the site inspection or the
            page is not found, or when the page inspection itself fails.
        """
        doc = await self.db.site_inspections.find_one(
            {"site_inspection_id": site_inspection_id},
        )
        if not doc:
            return None

        # The page must be one of the URLs discovered during crawling.
        page_found = False
        for page in doc.get("discovered_pages", []):
            if page["url"] == page_url:
                page_found = True
                break

        if not page_found:
            return None

        # Run the inspection synchronously with a pre-generated ID.
        inspection_id = str(uuid.uuid4())
        try:
            inspection_id, result = await self.inspection_service.run_inspection_inline(
                url=page_url,
                inspection_id=inspection_id,
            )

            # Mirror score/grade into the embedded page entry.
            overall_score = result.get("overall_score", 0)
            grade = result.get("grade", "F")

            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.inspection_id": inspection_id,
                        "discovered_pages.$.status": "completed",
                        "discovered_pages.$.overall_score": overall_score,
                        "discovered_pages.$.grade": grade,
                    }
                },
            )

            # Aggregates depend on per-page scores, so refresh them now.
            await self._compute_and_store_aggregates(site_inspection_id)

            # Drop the cached site result so readers see the new scores.
            cache_key = f"site-inspection:result:{site_inspection_id}"
            await self.redis.delete(cache_key)

            return inspection_id

        except Exception as e:
            # Failure is recorded on the page entry; the caller gets None.
            logger.error(
                "Failed to inspect page %s in site %s: %s",
                page_url, site_inspection_id, str(e),
            )
            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.status": "error",
                    }
                },
            )
            return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Background task: Crawl + Inspect
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def _crawl_and_inspect(
        self,
        site_inspection_id: str,
        url: str,
        max_pages: int,
        max_depth: int,
    ) -> None:
        """
        Background task that runs in two phases:
        Phase 1: BFS crawling to discover pages
        Phase 2: Parallel inspection of discovered pages (with semaphore)

        On any unhandled error the site document is marked "error" and an
        error SSE event is published; the task never raises to its creator.
        """
        try:
            # ==============================
            # Phase 1: Crawling
            # ==============================
            logger.info("Phase 1 (crawling) started: %s", site_inspection_id)

            # Relay crawler progress to SSE subscribers.
            async def crawl_progress(pages_found: int, current_url: str):
                await self._publish_site_event(site_inspection_id, {
                    "event_type": "crawl_progress",
                    "site_inspection_id": site_inspection_id,
                    "pages_found": pages_found,
                    "current_url": current_url,
                })

            crawler = LinkCrawler(
                root_url=url,
                max_pages=max_pages,
                max_depth=max_depth,
            )
            discovered = await crawler.crawl(progress_callback=crawl_progress)

            # No pages at all (root unreachable) aborts the whole inspection.
            if not discovered:
                raise ValueError("크롤링 결과가 없습니다. URL을 확인해주세요.")

            # Embed one "pending" page entry per discovered URL; inspection
            # results are filled in during Phase 2.
            discovered_pages = []
            for page in discovered:
                discovered_pages.append({
                    "url": page["url"],
                    "depth": page["depth"],
                    "parent_url": page["parent_url"],
                    "inspection_id": None,
                    "status": "pending",
                    "title": page.get("title"),
                    "overall_score": None,
                    "grade": None,
                })

            # Persist the page list and move the site into "inspecting".
            await self.db.site_inspections.update_one(
                {"site_inspection_id": site_inspection_id},
                {
                    "$set": {
                        "status": "inspecting",
                        "discovered_pages": discovered_pages,
                    }
                },
            )

            # Tell SSE subscribers crawling is done (includes the page tree).
            await self._publish_site_event(site_inspection_id, {
                "event_type": "crawl_complete",
                "site_inspection_id": site_inspection_id,
                "total_pages": len(discovered_pages),
                "pages": [
                    {
                        "url": p["url"],
                        "depth": p["depth"],
                        "parent_url": p["parent_url"],
                        "title": p.get("title"),
                    }
                    for p in discovered_pages
                ],
            })

            logger.info(
                "Phase 1 completed: %s, pages=%d",
                site_inspection_id, len(discovered_pages),
            )

            # ==============================
            # Phase 2: Page-by-page inspection
            # ==============================
            logger.info("Phase 2 (inspection) started: %s", site_inspection_id)

            settings = get_settings()
            # At most SITE_CONCURRENCY page inspections run at once.
            semaphore = asyncio.Semaphore(settings.SITE_CONCURRENCY)

            tasks = [
                self._inspect_page_with_semaphore(
                    semaphore=semaphore,
                    site_inspection_id=site_inspection_id,
                    page_url=page["url"],
                    page_index=idx,
                    total_pages=len(discovered_pages),
                )
                for idx, page in enumerate(discovered_pages)
            ]

            # return_exceptions=True: one failed page never cancels the rest
            # (per-page errors are already handled inside the task itself).
            await asyncio.gather(*tasks, return_exceptions=True)

            # ==============================
            # Finalize: Compute aggregates
            # ==============================
            aggregate_scores = await self._compute_and_store_aggregates(site_inspection_id)

            # Mark the site inspection as completed.
            await self.db.site_inspections.update_one(
                {"site_inspection_id": site_inspection_id},
                {
                    "$set": {
                        "status": "completed",
                        "completed_at": datetime.now(timezone.utc),
                    }
                },
            )

            # Final SSE event with the aggregate scores.
            await self._publish_site_event(site_inspection_id, {
                "event_type": "complete",
                "site_inspection_id": site_inspection_id,
                "status": "completed",
                "aggregate_scores": aggregate_scores,
            })

            logger.info("Site inspection completed: %s", site_inspection_id)

        except Exception as e:
            # Top-level boundary of the background task: log, persist the
            # error state, and notify subscribers instead of re-raising.
            logger.error(
                "Site inspection %s failed: %s",
                site_inspection_id, str(e), exc_info=True,
            )

            await self.db.site_inspections.update_one(
                {"site_inspection_id": site_inspection_id},
                {
                    "$set": {
                        "status": "error",
                        "completed_at": datetime.now(timezone.utc),
                    }
                },
            )

            await self._publish_site_event(site_inspection_id, {
                "event_type": "error",
                "site_inspection_id": site_inspection_id,
                "status": "error",
                "message": f"사이트 검사 중 오류가 발생했습니다: {str(e)[:200]}",
            })
|
||||
|
||||
async def _inspect_page_with_semaphore(
|
||||
self,
|
||||
semaphore: asyncio.Semaphore,
|
||||
site_inspection_id: str,
|
||||
page_url: str,
|
||||
page_index: int,
|
||||
total_pages: int,
|
||||
) -> None:
|
||||
"""Inspect a single page with semaphore-controlled concurrency."""
|
||||
async with semaphore:
|
||||
await self._inspect_single_page(
|
||||
site_inspection_id=site_inspection_id,
|
||||
page_url=page_url,
|
||||
page_index=page_index,
|
||||
total_pages=total_pages,
|
||||
)
|
||||
|
||||
    async def _inspect_single_page(
        self,
        site_inspection_id: str,
        page_url: str,
        page_index: int,
        total_pages: int,
    ) -> None:
        """
        Run inspection for a single discovered page.

        Lifecycle: publish "page_start" → mark the page "inspecting" → run the
        inline inspection (streaming "page_progress" events) → store the
        score/grade and publish "page_complete" plus an "aggregate_update".
        A failure marks the page "error" and is non-fatal to the site run.
        """
        inspection_id = str(uuid.uuid4())

        # Announce that this page's inspection is starting.
        await self._publish_site_event(site_inspection_id, {
            "event_type": "page_start",
            "site_inspection_id": site_inspection_id,
            "page_url": page_url,
            "page_index": page_index,
        })

        # Mark the page as "inspecting" and pin its inspection_id.
        await self.db.site_inspections.update_one(
            {
                "site_inspection_id": site_inspection_id,
                "discovered_pages.url": page_url,
            },
            {
                "$set": {
                    "discovered_pages.$.status": "inspecting",
                    "discovered_pages.$.inspection_id": inspection_id,
                }
            },
        )

        try:
            # Forward per-category checker progress as page-scoped SSE events.
            async def page_progress_callback(category: str, progress: int, current_step: str):
                await self._publish_site_event(site_inspection_id, {
                    "event_type": "page_progress",
                    "site_inspection_id": site_inspection_id,
                    "page_url": page_url,
                    "page_index": page_index,
                    "category": category,
                    "progress": progress,
                    "current_step": current_step,
                })

            # Run the full single-page inspection inline.
            _, result = await self.inspection_service.run_inspection_inline(
                url=page_url,
                inspection_id=inspection_id,
                progress_callback=page_progress_callback,
            )

            overall_score = result.get("overall_score", 0)
            grade = result.get("grade", "F")

            # Persist the page outcome into the site document.
            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.status": "completed",
                        "discovered_pages.$.overall_score": overall_score,
                        "discovered_pages.$.grade": grade,
                    }
                },
            )

            # Announce the page result.
            await self._publish_site_event(site_inspection_id, {
                "event_type": "page_complete",
                "site_inspection_id": site_inspection_id,
                "page_url": page_url,
                "page_index": page_index,
                "inspection_id": inspection_id,
                "score": overall_score,
                "grade": grade,
            })

            # Refresh aggregates after every page so the UI updates live.
            aggregate_scores = await self._compute_and_store_aggregates(site_inspection_id)
            await self._publish_site_event(site_inspection_id, {
                "event_type": "aggregate_update",
                "site_inspection_id": site_inspection_id,
                "pages_inspected": aggregate_scores.get("pages_inspected", 0),
                "pages_total": aggregate_scores.get("pages_total", total_pages),
                "overall_score": aggregate_scores.get("overall_score", 0),
                "grade": aggregate_scores.get("grade", "F"),
            })

            logger.info(
                "Page inspection completed: site=%s, page=%s, score=%d",
                site_inspection_id, page_url, overall_score,
            )

        except Exception as e:
            # Per-page failures are contained here so sibling pages continue.
            logger.error(
                "Page inspection failed: site=%s, page=%s, error=%s",
                site_inspection_id, page_url, str(e),
            )

            # Mark the page entry as failed.
            await self.db.site_inspections.update_one(
                {
                    "site_inspection_id": site_inspection_id,
                    "discovered_pages.url": page_url,
                },
                {
                    "$set": {
                        "discovered_pages.$.status": "error",
                    }
                },
            )

            # Publish the failure as a page_complete with score 0 / grade F.
            await self._publish_site_event(site_inspection_id, {
                "event_type": "page_complete",
                "site_inspection_id": site_inspection_id,
                "page_url": page_url,
                "page_index": page_index,
                "inspection_id": None,
                "score": 0,
                "grade": "F",
                "error": str(e)[:200],
            })
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Aggregate computation
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _compute_and_store_aggregates(self, site_inspection_id: str) -> dict:
    """
    Recompute aggregate scores across all completed page inspections.

    Loads the site document, collects the inspection IDs of pages whose
    status is "completed", fetches each page's stored inspection result,
    averages the overall and per-category scores, sums issue counts,
    persists the aggregate via _store_aggregates, and returns it.

    Returns an empty dict when the site inspection document is missing,
    and an all-zero aggregate when no page has completed yet.
    """
    site_doc = await self.db.site_inspections.find_one(
        {"site_inspection_id": site_inspection_id},
    )
    if not site_doc:
        return {}

    discovered = site_doc.get("discovered_pages", [])
    page_count = len(discovered)

    # Only pages that finished successfully contribute to the averages.
    finished_ids = [
        page["inspection_id"]
        for page in discovered
        if page.get("status") == "completed" and page.get("inspection_id")
    ]

    if not finished_ids:
        empty_aggregate = {
            "overall_score": 0,
            "grade": "F",
            "html_css": 0,
            "accessibility": 0,
            "seo": 0,
            "performance_security": 0,
            "total_issues": 0,
            "pages_inspected": 0,
            "pages_total": page_count,
        }
        await self._store_aggregates(site_inspection_id, empty_aggregate)
        return empty_aggregate

    # Project only the fields needed for aggregation.
    projection = {
        "_id": 0,
        "overall_score": 1,
        "categories.html_css.score": 1,
        "categories.accessibility.score": 1,
        "categories.seo.score": 1,
        "categories.performance_security.score": 1,
        "summary.total_issues": 1,
    }
    cursor = self.db.inspections.find(
        {"inspection_id": {"$in": finished_ids}},
        projection,
    )

    # One score bucket per category, keyed by the aggregate field name.
    category_buckets: dict[str, list[int]] = {
        "html_css": [],
        "accessibility": [],
        "seo": [],
        "performance_security": [],
    }
    overall_values: list[int] = []
    issue_count = 0

    async for result in cursor:
        overall_values.append(result.get("overall_score", 0))
        categories = result.get("categories", {})
        for category_name, bucket in category_buckets.items():
            bucket.append(categories.get(category_name, {}).get("score", 0))
        issue_count += result.get("summary", {}).get("total_issues", 0)

    def _avg(values: list[int]) -> int:
        # Rounded mean; 0 for an empty list so a missing category is safe.
        return round(sum(values) / len(values)) if values else 0

    averaged_overall = _avg(overall_values)

    aggregate = {
        "overall_score": averaged_overall,
        "grade": calculate_grade(averaged_overall),
        "html_css": _avg(category_buckets["html_css"]),
        "accessibility": _avg(category_buckets["accessibility"]),
        "seo": _avg(category_buckets["seo"]),
        "performance_security": _avg(category_buckets["performance_security"]),
        "total_issues": issue_count,
        "pages_inspected": len(overall_values),
        "pages_total": page_count,
    }

    await self._store_aggregates(site_inspection_id, aggregate)
    return aggregate
async def _store_aggregates(self, site_inspection_id: str, aggregate: dict) -> None:
    """Persist the computed aggregate scores onto the site inspection document."""
    query = {"site_inspection_id": site_inspection_id}
    update = {"$set": {"aggregate_scores": aggregate}}
    await self.db.site_inspections.update_one(query, update)
# ------------------------------------------------------------------
|
||||
# SSE event publishing
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _publish_site_event(self, site_inspection_id: str, event_data: dict) -> None:
    """Broadcast a site-inspection SSE event over the Redis Pub/Sub channel."""
    # NOTE: default=str stringifies any non-JSON-native value (e.g. datetimes)
    # rather than raising — kept as-is since subscribers rely on this payload shape.
    payload = json.dumps(event_data, ensure_ascii=False, default=str)
    await self.redis.publish(f"site-inspection:{site_inspection_id}:events", payload)
Reference in New Issue
Block a user