From 816b31e0d471be4c0042e32d4413348d2d7d155b Mon Sep 17 00:00:00 2001
From: jungwoo choi <jungwoochoi@MacBook-Pro-2.local>
Date: Sat, 14 Feb 2026 07:53:54 +0900
Subject: [PATCH] =?UTF-8?q?fix:=20=ED=81=AC=EB=A1=A4=EB=9F=AC=20=EB=A6=AC?=
 =?UTF-8?q?=EB=8B=A4=EC=9D=B4=EB=A0=89=ED=8A=B8=20=EC=A4=91=EB=B3=B5=20URL?=
 =?UTF-8?q?=20=EC=A0=9C=EA=B1=B0=20=E2=80=94=20max=5Fpages=EB=A5=BC=20?=
 =?UTF-8?q?=EC=9C=A0=EB=8B=88=ED=81=AC=20URL=20=EA=B8=B0=EC=A4=80=EC=9C=BC?=
 =?UTF-8?q?=EB=A1=9C=20=EC=B9=B4=EC=9A=B4=ED=8A=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

리다이렉트 후의 최종 URL을 정규화하여 visited에 추가하고, 이미 방문한 URL로 리다이렉트된 경우에는 해당 페이지를 건너뜁니다.
결과에는 요청 URL 대신 최종 URL을 기록하여 중복을 제거하므로, max_pages는 유니크 URL 기준으로 카운트됩니다.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 backend/app/services/link_crawler.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/backend/app/services/link_crawler.py b/backend/app/services/link_crawler.py
index 2253e1f..8138f53 100644
--- a/backend/app/services/link_crawler.py
+++ b/backend/app/services/link_crawler.py
@@ -188,24 +188,35 @@ class LinkCrawler:
                     response = await client.get(url)
                     status_code = response.status_code
 
+                    # 리다이렉트 후 최종 URL을 정규화하여 중복 체크
+                    final_url = normalize_url(str(response.url))
+                    if final_url != url and final_url in visited:
+                        logger.debug("Redirect target already visited: %s → %s", url, final_url)
+                        continue
+                    if final_url != url:
+                        visited.add(final_url)
+
+                    # 결과에는 최종 URL 사용
+                    effective_url = final_url
+
                     # Only parse HTML content
                     content_type = response.headers.get("content-type", "")
                     if "text/html" not in content_type and "application/xhtml" not in content_type:
-                        logger.debug("Skipping non-HTML content: %s (%s)", url, content_type)
+                        logger.debug("Skipping non-HTML content: %s (%s)", effective_url, content_type)
                         # Still record it but don't extract links
                         results.append({
-                            "url": url,
+                            "url": effective_url,
                             "depth": depth,
                             "parent_url": parent_url,
                             "title": None,
                             "status": "discovered",
                         })
                         if progress_callback:
-                            await progress_callback(len(results), url)
+                            await progress_callback(len(results), effective_url)
                         continue
 
                     html = response.text
-                    title, links = self._extract_links_and_title(url, html)
+                    title, links = self._extract_links_and_title(effective_url, html)
 
                 except httpx.TimeoutException:
                     logger.warning("Timeout crawling %s", url)
@@ -233,9 +244,9 @@ class LinkCrawler:
                         await progress_callback(len(results), url)
                     continue
 
-                # Record this page
+                # Record this page (최종 URL 사용)
                 results.append({
-                    "url": url,
+                    "url": effective_url,
                     "depth": depth,
                     "parent_url": parent_url,
                     "title": title,
@@ -244,7 +255,7 @@ class LinkCrawler:
 
                 # Notify progress
                 if progress_callback:
-                    await progress_callback(len(results), url)
+                    await progress_callback(len(results), effective_url)
 
                 # Only enqueue child links if we haven't reached max_depth
                 if depth < self.max_depth: