diff --git a/backend/app/services/batch_inspection_service.py b/backend/app/services/batch_inspection_service.py index fd656d1..4b91d4b 100644 --- a/backend/app/services/batch_inspection_service.py +++ b/backend/app/services/batch_inspection_service.py @@ -63,9 +63,13 @@ class BatchInspectionService: batch_inspection_id = str(uuid.uuid4()) - # Build discovered_pages documents + # Build discovered_pages documents (중복 URL 제거) discovered_pages = [] + seen_urls: set[str] = set() for url in urls: + if url in seen_urls: + continue + seen_urls.add(url) discovered_pages.append({ "url": url, "depth": 0, @@ -76,6 +80,8 @@ class BatchInspectionService: "overall_score": None, "grade": None, }) + # 중복 제거된 URL 목록으로 갱신 (preserve first-seen order; list(set) is unordered/nondeterministic) + urls = [page["url"] for page in discovered_pages] # Create initial document doc = { diff --git a/backend/app/services/link_crawler.py b/backend/app/services/link_crawler.py index bfc388f..2253e1f 100644 --- a/backend/app/services/link_crawler.py +++ b/backend/app/services/link_crawler.py @@ -39,16 +39,21 @@ def normalize_url(url: str) -> str: - Remove fragment (#...) - Remove trailing slash (except for root path) - Lowercase scheme and netloc + - Strip www. prefix for consistent deduplication + - Remove common tracking query parameters """ parsed = urlparse(url) # Remove fragment normalized = parsed._replace(fragment="") - # Lowercase scheme and netloc + # Lowercase scheme and netloc, strip www.
+ netloc = normalized.netloc.lower() + if netloc.startswith("www."): + netloc = netloc[4:] normalized = normalized._replace( scheme=normalized.scheme.lower(), - netloc=normalized.netloc.lower(), + netloc=netloc, ) # Remove trailing slash (but keep "/" for root path) @@ -57,6 +62,17 @@ def normalize_url(url: str) -> str: path = path.rstrip("/") normalized = normalized._replace(path=path) + # Remove common tracking query parameters + if normalized.query: + from urllib.parse import parse_qs, urlencode + _TRACKING_PARAMS = { + "utm_source", "utm_medium", "utm_campaign", "utm_term", + "utm_content", "ref", "fbclid", "gclid", "mc_cid", "mc_eid", + } + params = parse_qs(normalized.query, keep_blank_values=True) + filtered = {k: v for k, v in params.items() if k.lower() not in _TRACKING_PARAMS} + normalized = normalized._replace(query=urlencode(filtered, doseq=True)) + return urlunparse(normalized) diff --git a/backend/app/services/site_inspection_service.py b/backend/app/services/site_inspection_service.py index 149e918..91eb0b2 100644 --- a/backend/app/services/site_inspection_service.py +++ b/backend/app/services/site_inspection_service.py @@ -302,9 +302,13 @@ class SiteInspectionService: if not discovered: raise ValueError("크롤링 결과가 없습니다. 
URL을 확인해주세요.") - # Build discovered_pages documents + # Build discovered_pages documents (중복 URL 제거) discovered_pages = [] + seen_urls: set[str] = set() for page in discovered: + if page["url"] in seen_urls: + continue + seen_urls.add(page["url"]) discovered_pages.append({ "url": page["url"], "depth": page["depth"], @@ -444,14 +448,6 @@ class SiteInspectionService: """Run inspection for a single discovered page.""" inspection_id = str(uuid.uuid4()) - # Publish page_start event - await self._publish_site_event(site_inspection_id, { - "event_type": "page_start", - "site_inspection_id": site_inspection_id, - "page_url": page_url, - "page_index": page_index, - }) - # Mark page as inspecting in MongoDB await self.db.site_inspections.update_one( {