fix: 크롤링 중복 URL 제거 + URL 정규화 강화

- normalize_url: www. prefix 제거, UTM 등 트래킹 파라미터 제거
- site inspection: 크롤링 후 검사 전 중복 URL 필터링
- batch inspection: 업로드 URL 목록 중복 제거

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-14 07:26:00 +09:00
parent 9f58485a87
commit 645ec56bd1
3 changed files with 30 additions and 12 deletions
--- a/backend/app/services/link_crawler.py
+++ b/backend/app/services/link_crawler.py
@@ -39,16 +39,21 @@ def normalize_url(url: str) -> str:
    - Remove fragment (#...)
    - Remove trailing slash (except for root path)
    - Lowercase scheme and netloc
+    - Strip www. prefix for consistent deduplication
+    - Remove common tracking query parameters
    """
    parsed = urlparse(url)

    # Remove fragment
    normalized = parsed._replace(fragment="")

-    # Lowercase scheme and netloc
+    # Lowercase scheme and netloc, strip www.
+    netloc = normalized.netloc.lower()
+    if netloc.startswith("www."):
+        netloc = netloc[4:]
    normalized = normalized._replace(
        scheme=normalized.scheme.lower(),
-        netloc=normalized.netloc.lower(),
+        netloc=netloc,
    )

    # Remove trailing slash (but keep "/" for root path)
@@ -57,6 +62,17 @@ def normalize_url(url: str) -> str:
        path = path.rstrip("/")
    normalized = normalized._replace(path=path)

+    # Remove common tracking query parameters
+    if normalized.query:
+        from urllib.parse import parse_qs, urlencode
+        _TRACKING_PARAMS = {
+            "utm_source", "utm_medium", "utm_campaign", "utm_term",
+            "utm_content", "ref", "fbclid", "gclid", "mc_cid", "mc_eid",
+        }
+        params = parse_qs(normalized.query, keep_blank_values=True)
+        filtered = {k: v for k, v in params.items() if k.lower() not in _TRACKING_PARAMS}
+        normalized = normalized._replace(query=urlencode(filtered, doseq=True))
+
    return urlunparse(normalized)