fix: 크롤링 중복 URL 제거 + URL 정규화 강화
- normalize_url: www. prefix 제거, UTM 등 트래킹 파라미터 제거
- site inspection: 크롤링 후 검사 전 중복 URL 필터링
- batch inspection: 업로드 URL 목록 중복 제거

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -63,9 +63,13 @@ class BatchInspectionService:
|
||||
|
||||
batch_inspection_id = str(uuid.uuid4())
|
||||
|
||||
# Build discovered_pages documents
|
||||
# Build discovered_pages documents (중복 URL 제거)
|
||||
discovered_pages = []
|
||||
seen_urls: set[str] = set()
|
||||
for url in urls:
|
||||
if url in seen_urls:
|
||||
continue
|
||||
seen_urls.add(url)
|
||||
discovered_pages.append({
|
||||
"url": url,
|
||||
"depth": 0,
|
||||
@ -76,6 +80,8 @@ class BatchInspectionService:
|
||||
"overall_score": None,
|
||||
"grade": None,
|
||||
})
|
||||
# 중복 제거된 URL 목록으로 갱신
|
||||
urls = list(seen_urls)
|
||||
|
||||
# Create initial document
|
||||
doc = {
|
||||
|
||||
@ -39,16 +39,21 @@ def normalize_url(url: str) -> str:
|
||||
- Remove fragment (#...)
|
||||
- Remove trailing slash (except for root path)
|
||||
- Lowercase scheme and netloc
|
||||
- Strip www. prefix for consistent deduplication
|
||||
- Remove common tracking query parameters
|
||||
"""
|
||||
parsed = urlparse(url)
|
||||
|
||||
# Remove fragment
|
||||
normalized = parsed._replace(fragment="")
|
||||
|
||||
# Lowercase scheme and netloc
|
||||
# Lowercase scheme and netloc, strip www.
|
||||
netloc = normalized.netloc.lower()
|
||||
if netloc.startswith("www."):
|
||||
netloc = netloc[4:]
|
||||
normalized = normalized._replace(
|
||||
scheme=normalized.scheme.lower(),
|
||||
netloc=normalized.netloc.lower(),
|
||||
netloc=netloc,
|
||||
)
|
||||
|
||||
# Remove trailing slash (but keep "/" for root path)
|
||||
@ -57,6 +62,17 @@ def normalize_url(url: str) -> str:
|
||||
path = path.rstrip("/")
|
||||
normalized = normalized._replace(path=path)
|
||||
|
||||
# Remove common tracking query parameters
|
||||
if normalized.query:
|
||||
from urllib.parse import parse_qs, urlencode
|
||||
_TRACKING_PARAMS = {
|
||||
"utm_source", "utm_medium", "utm_campaign", "utm_term",
|
||||
"utm_content", "ref", "fbclid", "gclid", "mc_cid", "mc_eid",
|
||||
}
|
||||
params = parse_qs(normalized.query, keep_blank_values=True)
|
||||
filtered = {k: v for k, v in params.items() if k.lower() not in _TRACKING_PARAMS}
|
||||
normalized = normalized._replace(query=urlencode(filtered, doseq=True))
|
||||
|
||||
return urlunparse(normalized)
|
||||
|
||||
|
||||
|
||||
@ -302,9 +302,13 @@ class SiteInspectionService:
|
||||
if not discovered:
|
||||
raise ValueError("크롤링 결과가 없습니다. URL을 확인해주세요.")
|
||||
|
||||
# Build discovered_pages documents
|
||||
# Build discovered_pages documents (중복 URL 제거)
|
||||
discovered_pages = []
|
||||
seen_urls: set[str] = set()
|
||||
for page in discovered:
|
||||
if page["url"] in seen_urls:
|
||||
continue
|
||||
seen_urls.add(page["url"])
|
||||
discovered_pages.append({
|
||||
"url": page["url"],
|
||||
"depth": page["depth"],
|
||||
@ -444,14 +448,6 @@ class SiteInspectionService:
|
||||
"""Run inspection for a single discovered page."""
|
||||
inspection_id = str(uuid.uuid4())
|
||||
|
||||
# Publish page_start event
|
||||
await self._publish_site_event(site_inspection_id, {
|
||||
"event_type": "page_start",
|
||||
"site_inspection_id": site_inspection_id,
|
||||
"page_url": page_url,
|
||||
"page_index": page_index,
|
||||
})
|
||||
|
||||
# Mark page as inspecting in MongoDB
|
||||
await self.db.site_inspections.update_one(
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user