From 645ec56bd10a7bc9a34dea02c4659b8bf49b8166 Mon Sep 17 00:00:00 2001
From: jungwoo choi
Date: Sat, 14 Feb 2026 07:26:00 +0900
Subject: [PATCH] =?UTF-8?q?fix:=20=ED=81=AC=EB=A1=A4=EB=A7=81=20=EC=A4=91?=
 =?UTF-8?q?=EB=B3=B5=20URL=20=EC=A0=9C=EA=B1=B0=20+=20URL=20=EC=A0=95?=
 =?UTF-8?q?=EA=B7=9C=ED=99=94=20=EA=B0=95=ED=99=94?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- normalize_url: www. prefix 제거, UTM 등 트래킹 파라미터 제거
- site inspection: 크롤링 후 검사 전 중복 URL 필터링
- batch inspection: 업로드 URL 목록 중복 제거

Co-Authored-By: Claude Opus 4.6
---
 .../app/services/batch_inspection_service.py |  8 +++++++-
 backend/app/services/link_crawler.py         | 20 ++++++++++++++++++--
 .../app/services/site_inspection_service.py  | 14 +++++---------
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/backend/app/services/batch_inspection_service.py b/backend/app/services/batch_inspection_service.py
index fd656d1..4b91d4b 100644
--- a/backend/app/services/batch_inspection_service.py
+++ b/backend/app/services/batch_inspection_service.py
@@ -63,9 +63,13 @@ class BatchInspectionService:
 
         batch_inspection_id = str(uuid.uuid4())
 
-        # Build discovered_pages documents
+        # Build discovered_pages documents (중복 URL 제거)
         discovered_pages = []
+        seen_urls: set[str] = set()
         for url in urls:
+            if url in seen_urls:
+                continue
+            seen_urls.add(url)
             discovered_pages.append({
                 "url": url,
                 "depth": 0,
@@ -76,6 +80,8 @@ class BatchInspectionService:
                 "overall_score": None,
                 "grade": None,
             })
+        # 중복 제거된 URL 목록으로 갱신
+        urls = list(seen_urls)
 
         # Create initial document
         doc = {
diff --git a/backend/app/services/link_crawler.py b/backend/app/services/link_crawler.py
index bfc388f..2253e1f 100644
--- a/backend/app/services/link_crawler.py
+++ b/backend/app/services/link_crawler.py
@@ -39,16 +39,21 @@
     - Remove fragment (#...)
     - Remove trailing slash (except for root path)
     - Lowercase scheme and netloc
+    - Strip www. prefix for consistent deduplication
+    - Remove common tracking query parameters
     """
     parsed = urlparse(url)
 
     # Remove fragment
     normalized = parsed._replace(fragment="")
 
-    # Lowercase scheme and netloc
+    # Lowercase scheme and netloc, strip www.
+    netloc = normalized.netloc.lower()
+    if netloc.startswith("www."):
+        netloc = netloc[4:]
     normalized = normalized._replace(
         scheme=normalized.scheme.lower(),
-        netloc=normalized.netloc.lower(),
+        netloc=netloc,
     )
 
     # Remove trailing slash (but keep "/" for root path)
@@ -57,6 +62,17 @@ def normalize_url(url: str) -> str:
         path = path.rstrip("/")
     normalized = normalized._replace(path=path)
 
+    # Remove common tracking query parameters
+    if normalized.query:
+        from urllib.parse import parse_qs, urlencode
+        _TRACKING_PARAMS = {
+            "utm_source", "utm_medium", "utm_campaign", "utm_term",
+            "utm_content", "ref", "fbclid", "gclid", "mc_cid", "mc_eid",
+        }
+        params = parse_qs(normalized.query, keep_blank_values=True)
+        filtered = {k: v for k, v in params.items() if k.lower() not in _TRACKING_PARAMS}
+        normalized = normalized._replace(query=urlencode(filtered, doseq=True))
+
     return urlunparse(normalized)
 
 
diff --git a/backend/app/services/site_inspection_service.py b/backend/app/services/site_inspection_service.py
index 149e918..91eb0b2 100644
--- a/backend/app/services/site_inspection_service.py
+++ b/backend/app/services/site_inspection_service.py
@@ -302,9 +302,13 @@ class SiteInspectionService:
         if not discovered:
             raise ValueError("크롤링 결과가 없습니다. URL을 확인해주세요.")
 
-        # Build discovered_pages documents
+        # Build discovered_pages documents (중복 URL 제거)
        discovered_pages = []
+        seen_urls: set[str] = set()
         for page in discovered:
+            if page["url"] in seen_urls:
+                continue
+            seen_urls.add(page["url"])
             discovered_pages.append({
                 "url": page["url"],
                 "depth": page["depth"],
@@ -444,14 +448,6 @@ class SiteInspectionService:
         """Run inspection for a single discovered page."""
         inspection_id = str(uuid.uuid4())
 
-        # Publish page_start event
-        await self._publish_site_event(site_inspection_id, {
-            "event_type": "page_start",
-            "site_inspection_id": site_inspection_id,
-            "page_url": page_url,
-            "page_index": page_index,
-        })
-
         # Mark page as inspecting in MongoDB
         await self.db.site_inspections.update_one(
             {