fix: 크롤링 중복 URL 제거 + URL 정규화 강화

- normalize_url: www. prefix 제거, UTM 등 트래킹 파라미터 제거
- site inspection: 크롤링 후 검사 전 중복 URL 필터링
- batch inspection: 업로드 URL 목록 중복 제거

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@ -63,9 +63,13 @@ class BatchInspectionService:
|
|||||||
|
|
||||||
batch_inspection_id = str(uuid.uuid4())
|
batch_inspection_id = str(uuid.uuid4())
|
||||||
|
|
||||||
# Build discovered_pages documents
|
# Build discovered_pages documents (중복 URL 제거)
|
||||||
discovered_pages = []
|
discovered_pages = []
|
||||||
|
seen_urls: set[str] = set()
|
||||||
for url in urls:
|
for url in urls:
|
||||||
|
if url in seen_urls:
|
||||||
|
continue
|
||||||
|
seen_urls.add(url)
|
||||||
discovered_pages.append({
|
discovered_pages.append({
|
||||||
"url": url,
|
"url": url,
|
||||||
"depth": 0,
|
"depth": 0,
|
||||||
@ -76,6 +80,8 @@ class BatchInspectionService:
|
|||||||
"overall_score": None,
|
"overall_score": None,
|
||||||
"grade": None,
|
"grade": None,
|
||||||
})
|
})
|
||||||
|
# 중복 제거된 URL 목록으로 갱신
|
||||||
|
urls = list(seen_urls)
|
||||||
|
|
||||||
# Create initial document
|
# Create initial document
|
||||||
doc = {
|
doc = {
|
||||||
|
|||||||
@ -39,16 +39,21 @@ def normalize_url(url: str) -> str:
|
|||||||
- Remove fragment (#...)
|
- Remove fragment (#...)
|
||||||
- Remove trailing slash (except for root path)
|
- Remove trailing slash (except for root path)
|
||||||
- Lowercase scheme and netloc
|
- Lowercase scheme and netloc
|
||||||
|
- Strip www. prefix for consistent deduplication
|
||||||
|
- Remove common tracking query parameters
|
||||||
"""
|
"""
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
|
|
||||||
# Remove fragment
|
# Remove fragment
|
||||||
normalized = parsed._replace(fragment="")
|
normalized = parsed._replace(fragment="")
|
||||||
|
|
||||||
# Lowercase scheme and netloc
|
# Lowercase scheme and netloc, strip www.
|
||||||
|
netloc = normalized.netloc.lower()
|
||||||
|
if netloc.startswith("www."):
|
||||||
|
netloc = netloc[4:]
|
||||||
normalized = normalized._replace(
|
normalized = normalized._replace(
|
||||||
scheme=normalized.scheme.lower(),
|
scheme=normalized.scheme.lower(),
|
||||||
netloc=normalized.netloc.lower(),
|
netloc=netloc,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Remove trailing slash (but keep "/" for root path)
|
# Remove trailing slash (but keep "/" for root path)
|
||||||
@ -57,6 +62,17 @@ def normalize_url(url: str) -> str:
|
|||||||
path = path.rstrip("/")
|
path = path.rstrip("/")
|
||||||
normalized = normalized._replace(path=path)
|
normalized = normalized._replace(path=path)
|
||||||
|
|
||||||
|
# Remove common tracking query parameters
|
||||||
|
if normalized.query:
|
||||||
|
from urllib.parse import parse_qs, urlencode
|
||||||
|
_TRACKING_PARAMS = {
|
||||||
|
"utm_source", "utm_medium", "utm_campaign", "utm_term",
|
||||||
|
"utm_content", "ref", "fbclid", "gclid", "mc_cid", "mc_eid",
|
||||||
|
}
|
||||||
|
params = parse_qs(normalized.query, keep_blank_values=True)
|
||||||
|
filtered = {k: v for k, v in params.items() if k.lower() not in _TRACKING_PARAMS}
|
||||||
|
normalized = normalized._replace(query=urlencode(filtered, doseq=True))
|
||||||
|
|
||||||
return urlunparse(normalized)
|
return urlunparse(normalized)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -302,9 +302,13 @@ class SiteInspectionService:
|
|||||||
if not discovered:
|
if not discovered:
|
||||||
raise ValueError("크롤링 결과가 없습니다. URL을 확인해주세요.")
|
raise ValueError("크롤링 결과가 없습니다. URL을 확인해주세요.")
|
||||||
|
|
||||||
# Build discovered_pages documents
|
# Build discovered_pages documents (중복 URL 제거)
|
||||||
discovered_pages = []
|
discovered_pages = []
|
||||||
|
seen_urls: set[str] = set()
|
||||||
for page in discovered:
|
for page in discovered:
|
||||||
|
if page["url"] in seen_urls:
|
||||||
|
continue
|
||||||
|
seen_urls.add(page["url"])
|
||||||
discovered_pages.append({
|
discovered_pages.append({
|
||||||
"url": page["url"],
|
"url": page["url"],
|
||||||
"depth": page["depth"],
|
"depth": page["depth"],
|
||||||
@ -444,14 +448,6 @@ class SiteInspectionService:
|
|||||||
"""Run inspection for a single discovered page."""
|
"""Run inspection for a single discovered page."""
|
||||||
inspection_id = str(uuid.uuid4())
|
inspection_id = str(uuid.uuid4())
|
||||||
|
|
||||||
# Publish page_start event
|
|
||||||
await self._publish_site_event(site_inspection_id, {
|
|
||||||
"event_type": "page_start",
|
|
||||||
"site_inspection_id": site_inspection_id,
|
|
||||||
"page_url": page_url,
|
|
||||||
"page_index": page_index,
|
|
||||||
})
|
|
||||||
|
|
||||||
# Mark page as inspecting in MongoDB
|
# Mark page as inspecting in MongoDB
|
||||||
await self.db.site_inspections.update_one(
|
await self.db.site_inspections.update_one(
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user