fix: 크롤링 중복 URL 제거 + URL 정규화 강화

- normalize_url: www. prefix 제거, UTM 등 트래킹 파라미터 제거
- site inspection: 크롤링 후 검사 전 중복 URL 필터링
- batch inspection: 업로드 URL 목록 중복 제거

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: jungwoo choi
Date: 2026-02-14 07:26:00 +09:00
parent 9f58485a87
commit 645ec56bd1
3 changed files with 30 additions and 12 deletions

View File

@ -39,16 +39,21 @@ def normalize_url(url: str) -> str:
- Remove fragment (#...)
- Remove trailing slash (except for root path)
- Lowercase scheme and netloc
- Strip www. prefix for consistent deduplication
- Remove common tracking query parameters
"""
parsed = urlparse(url)
# Remove fragment
normalized = parsed._replace(fragment="")
# Lowercase scheme and netloc
# Lowercase scheme and netloc, strip www.
netloc = normalized.netloc.lower()
if netloc.startswith("www."):
netloc = netloc[4:]
normalized = normalized._replace(
scheme=normalized.scheme.lower(),
netloc=normalized.netloc.lower(),
netloc=netloc,
)
# Remove trailing slash (but keep "/" for root path)
@ -57,6 +62,17 @@ def normalize_url(url: str) -> str:
path = path.rstrip("/")
normalized = normalized._replace(path=path)
# Remove common tracking query parameters
if normalized.query:
from urllib.parse import parse_qs, urlencode
_TRACKING_PARAMS = {
"utm_source", "utm_medium", "utm_campaign", "utm_term",
"utm_content", "ref", "fbclid", "gclid", "mc_cid", "mc_eid",
}
params = parse_qs(normalized.query, keep_blank_values=True)
filtered = {k: v for k, v in params.items() if k.lower() not in _TRACKING_PARAMS}
normalized = normalized._replace(query=urlencode(filtered, doseq=True))
return urlunparse(normalized)