diff --git a/backend/app/services/link_crawler.py b/backend/app/services/link_crawler.py
index 2253e1f..8138f53 100644
--- a/backend/app/services/link_crawler.py
+++ b/backend/app/services/link_crawler.py
@@ -188,24 +188,35 @@ class LinkCrawler:
                 response = await client.get(url)
                 status_code = response.status_code
 
+                # 리다이렉트 후 최종 URL을 정규화하여 중복 체크
+                final_url = normalize_url(str(response.url))
+                if final_url != url and final_url in visited:
+                    logger.debug("Redirect target already visited: %s → %s", url, final_url)
+                    continue
+                if final_url != url:
+                    visited.add(final_url)
+
+                # 결과에는 최종 URL 사용
+                effective_url = final_url
+
                 # Only parse HTML content
                 content_type = response.headers.get("content-type", "")
                 if "text/html" not in content_type and "application/xhtml" not in content_type:
-                    logger.debug("Skipping non-HTML content: %s (%s)", url, content_type)
+                    logger.debug("Skipping non-HTML content: %s (%s)", effective_url, content_type)
                     # Still record it but don't extract links
                     results.append({
-                        "url": url,
+                        "url": effective_url,
                         "depth": depth,
                         "parent_url": parent_url,
                         "title": None,
                         "status": "discovered",
                     })
                     if progress_callback:
-                        await progress_callback(len(results), url)
+                        await progress_callback(len(results), effective_url)
                     continue
 
                 html = response.text
-                title, links = self._extract_links_and_title(url, html)
+                title, links = self._extract_links_and_title(effective_url, html)
 
             except httpx.TimeoutException:
                 logger.warning("Timeout crawling %s", url)
@@ -233,9 +244,9 @@ class LinkCrawler:
                     await progress_callback(len(results), url)
                 continue
 
-            # Record this page
+            # Record this page (최종 URL 사용)
             results.append({
-                "url": url,
+                "url": effective_url,
                 "depth": depth,
                 "parent_url": parent_url,
                 "title": title,
@@ -244,7 +255,7 @@ class LinkCrawler:
 
             # Notify progress
             if progress_callback:
-                await progress_callback(len(results), url)
+                await progress_callback(len(results), effective_url)
 
             # Only enqueue child links if we haven't reached max_depth
             if depth < self.max_depth: