fix: deduplicate redirected URLs in the crawler; count max_pages by unique URLs
After a redirect, normalize the final URL and add it to visited; skip when a request redirects to an already-visited URL. Use the final URL in the results so duplicates are removed.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
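For context: the diff below calls a normalize_url helper that already exists in the codebase and is not shown in this commit. As a rough sketch of what such a canonicalizer typically does (the rules below, lowercased scheme and host, default ports dropped, fragments and trailing slashes stripped, are assumptions for illustration, not this repo's actual implementation):

from urllib.parse import urlsplit, urlunsplit

def normalize_url(url: str) -> str:
    # Hypothetical sketch: canonicalize a URL so a redirect target compares
    # equal to an already-visited entry. The repo's real helper may differ.
    parts = urlsplit(url)
    scheme = parts.scheme.lower()
    netloc = parts.netloc.lower()
    # Drop default ports so http://example.com:80/ == http://example.com/
    if (scheme, parts.port) in (("http", 80), ("https", 443)):
        netloc = netloc.rsplit(":", 1)[0]
    # Fragments never reach the server, so ignore them for dedup purposes
    path = parts.path.rstrip("/") or "/"
    return urlunsplit((scheme, netloc, path, parts.query, ""))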
@@ -188,24 +188,35 @@ class LinkCrawler:
                 response = await client.get(url)
                 status_code = response.status_code
 
+                # Normalize the final URL after the redirect and check for duplicates
+                final_url = normalize_url(str(response.url))
+                if final_url != url and final_url in visited:
+                    logger.debug("Redirect target already visited: %s → %s", url, final_url)
+                    continue
+                if final_url != url:
+                    visited.add(final_url)
+
+                # Use the final URL in the results
+                effective_url = final_url
+
                 # Only parse HTML content
                 content_type = response.headers.get("content-type", "")
                 if "text/html" not in content_type and "application/xhtml" not in content_type:
-                    logger.debug("Skipping non-HTML content: %s (%s)", url, content_type)
+                    logger.debug("Skipping non-HTML content: %s (%s)", effective_url, content_type)
                     # Still record it but don't extract links
                     results.append({
-                        "url": url,
+                        "url": effective_url,
                         "depth": depth,
                         "parent_url": parent_url,
                         "title": None,
                         "status": "discovered",
                     })
                     if progress_callback:
-                        await progress_callback(len(results), url)
+                        await progress_callback(len(results), effective_url)
                     continue
 
                 html = response.text
-                title, links = self._extract_links_and_title(url, html)
+                title, links = self._extract_links_and_title(effective_url, html)
 
             except httpx.TimeoutException:
                 logger.warning("Timeout crawling %s", url)
@@ -233,9 +244,9 @@ class LinkCrawler:
                     await progress_callback(len(results), url)
                 continue
 
-            # Record this page
+            # Record this page (use the final URL)
             results.append({
-                "url": url,
+                "url": effective_url,
                 "depth": depth,
                 "parent_url": parent_url,
                 "title": title,
@@ -244,7 +255,7 @@ class LinkCrawler:
 
             # Notify progress
             if progress_callback:
-                await progress_callback(len(results), url)
+                await progress_callback(len(results), effective_url)
 
             # Only enqueue child links if we haven't reached max_depth
             if depth < self.max_depth:
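The net effect: when two queued pages redirect to the same canonical target, only the first is recorded, so max_pages counts unique final URLs. A self-contained toy run of the rule the diff adds (the stand-in normalize_url here is deliberately trivial; see the fuller sketch above):

def normalize_url(u: str) -> str:
    # Trivial stand-in for the real helper, for this toy only
    return u.rstrip("/")

visited: set[str] = set()
results: list[dict] = []

def record(url: str, final: str) -> None:
    # Mirrors the diff: skip a redirect whose target was already visited,
    # otherwise mark the final URL visited and record it in the results.
    final_url = normalize_url(final)
    if final_url != url and final_url in visited:
        return  # duplicate redirect target, nothing recorded
    if final_url != url:
        visited.add(final_url)
    results.append({"url": final_url})

record("http://example.com/a", "http://example.com/canonical")  # recorded
record("http://example.com/b", "http://example.com/canonical")  # skipped
assert [r["url"] for r in results] == ["http://example.com/canonical"]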