fix: 크롤러 리다이렉트 중복 URL 제거 — max_pages를 유니크 URL 기준으로 카운트

리다이렉트 후 최종 URL을 정규화하여 visited에 추가, 이미 방문한 URL로 리다이렉트되면 스킵.
결과에는 최종 URL을 사용하여 중복 제거.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2026-02-14 07:53:54 +09:00
parent 1d7544cdfe
commit 816b31e0d4

View File

@@ -188,24 +188,35 @@ class LinkCrawler:
response = await client.get(url) response = await client.get(url)
status_code = response.status_code status_code = response.status_code
# 리다이렉트 후 최종 URL을 정규화하여 중복 체크
final_url = normalize_url(str(response.url))
if final_url != url and final_url in visited:
logger.debug("Redirect target already visited: %s%s", url, final_url)
continue
if final_url != url:
visited.add(final_url)
# 결과에는 최종 URL 사용
effective_url = final_url
# Only parse HTML content # Only parse HTML content
content_type = response.headers.get("content-type", "") content_type = response.headers.get("content-type", "")
if "text/html" not in content_type and "application/xhtml" not in content_type: if "text/html" not in content_type and "application/xhtml" not in content_type:
logger.debug("Skipping non-HTML content: %s (%s)", url, content_type) logger.debug("Skipping non-HTML content: %s (%s)", effective_url, content_type)
# Still record it but don't extract links # Still record it but don't extract links
results.append({ results.append({
"url": url, "url": effective_url,
"depth": depth, "depth": depth,
"parent_url": parent_url, "parent_url": parent_url,
"title": None, "title": None,
"status": "discovered", "status": "discovered",
}) })
if progress_callback: if progress_callback:
await progress_callback(len(results), url) await progress_callback(len(results), effective_url)
continue continue
html = response.text html = response.text
title, links = self._extract_links_and_title(url, html) title, links = self._extract_links_and_title(effective_url, html)
except httpx.TimeoutException: except httpx.TimeoutException:
logger.warning("Timeout crawling %s", url) logger.warning("Timeout crawling %s", url)
@@ -233,9 +244,9 @@ class LinkCrawler:
await progress_callback(len(results), url) await progress_callback(len(results), url)
continue continue
# Record this page # Record this page (최종 URL 사용)
results.append({ results.append({
"url": url, "url": effective_url,
"depth": depth, "depth": depth,
"parent_url": parent_url, "parent_url": parent_url,
"title": title, "title": title,
@@ -244,7 +255,7 @@ class LinkCrawler:
# Notify progress # Notify progress
if progress_callback: if progress_callback:
await progress_callback(len(results), url) await progress_callback(len(results), effective_url)
# Only enqueue child links if we haven't reached max_depth # Only enqueue child links if we haven't reached max_depth
if depth < self.max_depth: if depth < self.max_depth: