fix: deduplicate redirected URLs in the crawler; count max_pages by unique URLs
After a redirect, normalize the final URL and add it to visited; skip when a request redirects to an already-visited URL. Use the final URL in the results so duplicates are removed.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
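For context: the diff below calls a normalize_url helper that already exists in the codebase and is not shown in this commit. As a rough sketch of what such a canonicalizer typically does (the rules below, lowercased scheme and host, default ports dropped, fragments and trailing slashes stripped, are assumptions for illustration, not this repo's actual implementation):

from urllib.parse import urlsplit, urlunsplit

def normalize_url(url: str) -> str:
    # Hypothetical sketch: canonicalize a URL so a redirect target compares
    # equal to an already-visited entry. The repo's real helper may differ.
    parts = urlsplit(url)
    scheme = parts.scheme.lower()
    netloc = parts.netloc.lower()
    # Drop default ports so http://example.com:80/ == http://example.com/
    if (scheme, parts.port) in (("http", 80), ("https", 443)):
        netloc = netloc.rsplit(":", 1)[0]
    # Fragments never reach the server, so ignore them for dedup purposes
    path = parts.path.rstrip("/") or "/"
    return urlunsplit((scheme, netloc, path, parts.query, ""))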
@@ -188,24 +188,35 @@ class LinkCrawler:
                 response = await client.get(url)
                 status_code = response.status_code
 
+                # Normalize the final URL after the redirect and check for duplicates
+                final_url = normalize_url(str(response.url))
+                if final_url != url and final_url in visited:
+                    logger.debug("Redirect target already visited: %s → %s", url, final_url)
+                    continue
+                if final_url != url:
+                    visited.add(final_url)
+
+                # Use the final URL in the results
+                effective_url = final_url
+
                 # Only parse HTML content
                 content_type = response.headers.get("content-type", "")
                 if "text/html" not in content_type and "application/xhtml" not in content_type:
-                    logger.debug("Skipping non-HTML content: %s (%s)", url, content_type)
+                    logger.debug("Skipping non-HTML content: %s (%s)", effective_url, content_type)
                     # Still record it but don't extract links
                     results.append({
-                        "url": url,
+                        "url": effective_url,
                         "depth": depth,
                         "parent_url": parent_url,
                         "title": None,
                         "status": "discovered",
                     })
                     if progress_callback:
-                        await progress_callback(len(results), url)
+                        await progress_callback(len(results), effective_url)
                     continue
 
                 html = response.text
-                title, links = self._extract_links_and_title(url, html)
+                title, links = self._extract_links_and_title(effective_url, html)
 
             except httpx.TimeoutException:
                 logger.warning("Timeout crawling %s", url)
@@ -233,9 +244,9 @@ class LinkCrawler:
                     await progress_callback(len(results), url)
                 continue
 
-            # Record this page
+            # Record this page (use the final URL)
             results.append({
-                "url": url,
+                "url": effective_url,
                 "depth": depth,
                 "parent_url": parent_url,
                 "title": title,
@@ -244,7 +255,7 @@ class LinkCrawler:
 
             # Notify progress
             if progress_callback:
-                await progress_callback(len(results), url)
+                await progress_callback(len(results), effective_url)
 
             # Only enqueue child links if we haven't reached max_depth
             if depth < self.max_depth:
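The net effect: when two queued pages redirect to the same canonical target, only the first is recorded, so max_pages counts unique final URLs. A self-contained toy run of the rule the diff adds (the stand-in normalize_url here is deliberately trivial; see the fuller sketch above):

def normalize_url(u: str) -> str:
    # Trivial stand-in for the real helper, for this toy only
    return u.rstrip("/")

visited: set[str] = set()
results: list[dict] = []

def record(url: str, final: str) -> None:
    # Mirrors the diff: skip a redirect whose target was already visited,
    # otherwise mark the final URL visited and record it in the results.
    final_url = normalize_url(final)
    if final_url != url and final_url in visited:
        return  # duplicate redirect target, nothing recorded
    if final_url != url:
        visited.add(final_url)
    results.append({"url": final_url})

record("http://example.com/a", "http://example.com/canonical")  # recorded
record("http://example.com/b", "http://example.com/canonical")  # skipped
assert [r["url"] for r in results] == ["http://example.com/canonical"]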