fix: 크롤링 중복 URL 제거 + URL 정규화 강화

- normalize_url: www. prefix 제거, UTM 등 트래킹 파라미터 제거
- site inspection: 크롤링 후 검사 전 중복 URL 필터링
- batch inspection: 업로드 URL 목록 중복 제거

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2026-02-14 07:26:00 +09:00
parent 9f58485a87
commit 645ec56bd1
3 changed files with 30 additions and 12 deletions

View File

@@ -302,9 +302,13 @@ class SiteInspectionService:
if not discovered:
raise ValueError("크롤링 결과가 없습니다. URL을 확인해주세요.")
# Build discovered_pages documents
# Build discovered_pages documents (중복 URL 제거)
discovered_pages = []
seen_urls: set[str] = set()
for page in discovered:
if page["url"] in seen_urls:
continue
seen_urls.add(page["url"])
discovered_pages.append({
"url": page["url"],
"depth": page["depth"],
@@ -444,14 +448,6 @@ class SiteInspectionService:
"""Run inspection for a single discovered page."""
inspection_id = str(uuid.uuid4())
# Publish page_start event
await self._publish_site_event(site_inspection_id, {
"event_type": "page_start",
"site_inspection_id": site_inspection_id,
"page_url": page_url,
"page_index": page_index,
})
# Mark page as inspecting in MongoDB
await self.db.site_inspections.update_one(
{