diff --git a/backend/app/services/link_crawler.py b/backend/app/services/link_crawler.py index 8138f53..868c258 100644 --- a/backend/app/services/link_crawler.py +++ b/backend/app/services/link_crawler.py @@ -40,7 +40,7 @@ def normalize_url(url: str) -> str: - Remove trailing slash (except for root path) - Lowercase scheme and netloc - Strip www. prefix for consistent deduplication - - Remove common tracking query parameters + - Query parameters are preserved as-is (different params = different page) """ parsed = urlparse(url) @@ -62,17 +62,6 @@ def normalize_url(url: str) -> str: path = path.rstrip("/") normalized = normalized._replace(path=path) - # Remove common tracking query parameters - if normalized.query: - from urllib.parse import parse_qs, urlencode - _TRACKING_PARAMS = { - "utm_source", "utm_medium", "utm_campaign", "utm_term", - "utm_content", "ref", "fbclid", "gclid", "mc_cid", "mc_eid", - } - params = parse_qs(normalized.query, keep_blank_values=True) - filtered = {k: v for k, v in params.items() if k.lower() not in _TRACKING_PARAMS} - normalized = normalized._replace(query=urlencode(filtered, doseq=True)) - return urlunparse(normalized)