fix: 쿼리 파라미터 다르면 유니크 URL로 판단 — 트래킹 파라미터 제거 로직 삭제
모든 쿼리 파라미터를 보존하여 파라미터가 다른 URL은 별도 페이지로 취급.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -40,7 +40,7 @@ def normalize_url(url: str) -> str:
|
|||||||
- Remove trailing slash (except for root path)
|
- Remove trailing slash (except for root path)
|
||||||
- Lowercase scheme and netloc
|
- Lowercase scheme and netloc
|
||||||
- Strip www. prefix for consistent deduplication
|
- Strip www. prefix for consistent deduplication
|
||||||
- Remove common tracking query parameters
|
- Query parameters are preserved as-is (different params = different page)
|
||||||
"""
|
"""
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
|
|
||||||
@@ -62,17 +62,6 @@ def normalize_url(url: str) -> str:
|
|||||||
path = path.rstrip("/")
|
path = path.rstrip("/")
|
||||||
normalized = normalized._replace(path=path)
|
normalized = normalized._replace(path=path)
|
||||||
|
|
||||||
# Remove common tracking query parameters
|
|
||||||
if normalized.query:
|
|
||||||
from urllib.parse import parse_qs, urlencode
|
|
||||||
_TRACKING_PARAMS = {
|
|
||||||
"utm_source", "utm_medium", "utm_campaign", "utm_term",
|
|
||||||
"utm_content", "ref", "fbclid", "gclid", "mc_cid", "mc_eid",
|
|
||||||
}
|
|
||||||
params = parse_qs(normalized.query, keep_blank_values=True)
|
|
||||||
filtered = {k: v for k, v in params.items() if k.lower() not in _TRACKING_PARAMS}
|
|
||||||
normalized = normalized._replace(query=urlencode(filtered, doseq=True))
|
|
||||||
|
|
||||||
return urlunparse(normalized)
|
return urlunparse(normalized)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user