fix: 크롤링 중복 URL 제거 + URL 정규화 강화
- normalize_url: www. prefix 제거, UTM 등 트래킹 파라미터 제거
- site inspection: 크롤링 후 검사 전 중복 URL 필터링
- batch inspection: 업로드 URL 목록 중복 제거

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -63,9 +63,13 @@ class BatchInspectionService:
|
||||
|
||||
batch_inspection_id = str(uuid.uuid4())
|
||||
|
||||
# Build discovered_pages documents
|
||||
# Build discovered_pages documents (중복 URL 제거)
|
||||
discovered_pages = []
|
||||
seen_urls: set[str] = set()
|
||||
for url in urls:
|
||||
if url in seen_urls:
|
||||
continue
|
||||
seen_urls.add(url)
|
||||
discovered_pages.append({
|
||||
"url": url,
|
||||
"depth": 0,
|
||||
@@ -76,6 +80,8 @@ class BatchInspectionService:
|
||||
"overall_score": None,
|
||||
"grade": None,
|
||||
})
|
||||
# 중복 제거된 URL 목록으로 갱신
|
||||
urls = list(seen_urls)
|
||||
|
||||
# Create initial document
|
||||
doc = {
|
||||
|
||||
Reference in New Issue
Block a user