Files
jungwoo choi 44ad36e2ab refactor: 4개 검사 엔진을 YAML 기반 표준 규칙으로 리팩토링
- YAML 규칙 파일 4개 신규 생성 (html_css, accessibility, seo, performance_security)
  W3C, WCAG 2.0/2.1/2.2, OWASP, Google Search Essentials 공식 표준 기반
- rules/__init__.py: YAML 로더 + 캐싱 + 리로드 모듈
- html_css.py: 30개 폐기 요소, 100+개 폐기 속성을 YAML에서 동적 로드
- accessibility.py: WCAG 버전 선택 지원 (wcag_version 파라미터)
- seo.py: title/description 길이, OG 필수 태그 등 임계값 YAML 로드
- performance_security.py: COOP/COEP/CORP 검사 추가, 정보 노출 헤더 검사 추가,
  TTFB/페이지 크기 임계값 YAML 로드
- PyYAML 의존성 추가

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 15:49:57 +09:00

409 lines
17 KiB
Python

"""
SEO Optimization Checker Engine (F-004).
Checks meta tags, OG tags, robots.txt, sitemap.xml, structured data, etc.
"""
import re
import json
import logging
from urllib.parse import urlparse, urljoin
from typing import Any, Optional
import httpx
from bs4 import BeautifulSoup
from app.engines.base import BaseChecker
from app.models.schemas import CategoryResult, Issue
from app.rules import get_rules
logger = logging.getLogger(__name__)
class SeoChecker(BaseChecker):
    """Checker engine for on-page SEO signals (F-004).

    Rule metadata and numeric thresholds are loaded once from the "seo"
    YAML rule set at construction time and consulted by the individual
    sub-checks.
    """

    def __init__(self, **kwargs):
        """Initialize the base checker and cache the SEO rule set."""
        super().__init__(**kwargs)
        # Loaded once per instance; checks read values via _get_threshold().
        self._rules_data = get_rules("seo")
def _get_seo_rule(self, rule_id: str) -> dict[str, Any]:
    """Return the YAML rule entry whose ``id`` equals *rule_id*, or ``{}``."""
    candidates = (
        entry
        for entry in self._rules_data.get("rules", [])
        if entry.get("id") == rule_id
    )
    # First match wins; an empty dict keeps downstream .get() chains safe.
    return next(candidates, {})
def _get_threshold(self, rule_id: str, key: str, default: Any = None) -> Any:
    """Fetch threshold *key* from a rule's ``details`` mapping, else *default*."""
    # _get_seo_rule returns {} for unknown ids, so the chain never raises.
    return self._get_seo_rule(rule_id).get("details", {}).get(key, default)
@property
def category_name(self) -> str:
    """Result-category key this checker reports under."""
    return "seo"
async def check(self, url: str, html_content: str, headers: dict) -> CategoryResult:
    """Run every SEO sub-check against the page and build the category result.

    Progress is reported between stages; the score is computed by
    deduction from the collected issues.
    """
    soup = BeautifulSoup(html_content, "html5lib")
    issues: list[Issue] = []
    meta_info: dict = {}

    await self.update_progress(10, "title 태그 검사 중...")
    issues.extend(self._check_title(soup, meta_info))

    await self.update_progress(20, "meta description 검사 중...")
    issues.extend(self._check_meta_description(soup, meta_info))
    issues.extend(self._check_meta_keywords(soup, meta_info))

    await self.update_progress(30, "OG 태그 검사 중...")
    issues.extend(self._check_og_tags(soup))
    issues.extend(self._check_twitter_card(soup))

    await self.update_progress(40, "canonical URL 검사 중...")
    issues.extend(self._check_canonical(soup))

    await self.update_progress(50, "robots.txt 확인 중...")
    issues.extend(await self._check_robots_txt(url, meta_info))

    await self.update_progress(60, "sitemap.xml 확인 중...")
    issues.extend(await self._check_sitemap(url, meta_info))

    await self.update_progress(70, "H1 태그 검사 중...")
    issues.extend(self._check_h1(soup))

    await self.update_progress(80, "구조화 데이터 검사 중...")
    issues.extend(self._check_structured_data(soup, html_content, meta_info))

    await self.update_progress(90, "기타 항목 검사 중...")
    issues.extend(self._check_favicon(soup))
    issues.extend(self._check_viewport(soup))
    issues.extend(self._check_url_structure(url))
    issues.extend(self._check_img_alt_seo(soup))

    score = self._calculate_score_by_deduction(issues)
    await self.update_progress(100, "완료")
    return self._build_result(
        category="seo",
        score=score,
        issues=issues,
        meta_info=meta_info,
    )
def _check_title(self, soup: BeautifulSoup, meta_info: dict) -> list[Issue]:
    """S-01: Check title tag existence and length.

    Length bounds come from the "seo-title-tag" YAML rule (defaults 10/60).
    Records meta_info["title"] and meta_info["title_length"].
    """
    issues = []
    title = soup.find("title")
    min_len = self._get_threshold("seo-title-tag", "min_length", 10)
    max_len = self._get_threshold("seo-title-tag", "max_length", 60)
    if title is None or not title.string or title.string.strip() == "":
        meta_info["title"] = None
        meta_info["title_length"] = 0
        issues.append(self._create_issue(
            code="S-01",
            severity="critical",
            message="<title> 태그가 없거나 비어있습니다",
            # BUG FIX: suggestions previously hard-coded a corrupted
            # "26,094" character limit; interpolate the YAML max_len.
            suggestion=f"검색 결과에 표시될 {min_len}-{max_len}자 길이의 페이지 제목을 설정하세요",
        ))
        return issues
    title_text = title.string.strip()
    title_len = len(title_text)
    meta_info["title"] = title_text
    meta_info["title_length"] = title_len
    if title_len < min_len:
        issues.append(self._create_issue(
            code="S-01",
            severity="critical",
            message=f"title이 너무 짧습니다 ({title_len}자, 권장 {min_len}-{max_len}자)",
            element=f"<title>{title_text}</title>",
            suggestion=f"검색 결과에 효과적으로 표시되도록 {min_len}자 이상의 제목을 작성하세요",
        ))
    elif title_len > max_len:
        issues.append(self._create_issue(
            code="S-01",
            severity="minor",
            message=f"title이 너무 깁니다 ({title_len}자, 권장 {min_len}-{max_len}자)",
            element=f"<title>{title_text[:50]}...</title>",
            suggestion=f"검색 결과에서 잘리지 않도록 {max_len}자 이내로 제목을 줄이세요",
        ))
    return issues
def _check_meta_description(self, soup: BeautifulSoup, meta_info: dict) -> list[Issue]:
    """S-02: Check meta description existence and length.

    Length bounds come from the "seo-meta-description" YAML rule
    (defaults 50/160). Records meta_info["description"] and
    meta_info["description_length"].
    """
    issues = []
    desc = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
    min_len = self._get_threshold("seo-meta-description", "min_length", 50)
    max_len = self._get_threshold("seo-meta-description", "max_length", 160)
    if desc is None or not desc.get("content"):
        meta_info["description"] = None
        meta_info["description_length"] = 0
        issues.append(self._create_issue(
            code="S-02",
            severity="major",
            message="meta description이 없습니다",
            # BUG FIX: suggestions previously hard-coded a corrupted
            # "26,094" character limit; interpolate the YAML max_len.
            suggestion=f'<meta name="description" content="페이지 설명">을 추가하세요 ({min_len}-{max_len}자 권장)',
        ))
        return issues
    content = desc["content"].strip()
    content_len = len(content)
    meta_info["description"] = content
    meta_info["description_length"] = content_len
    if content_len < min_len:
        issues.append(self._create_issue(
            code="S-02",
            severity="major",
            message=f"meta description이 너무 짧습니다 ({content_len}자, 권장 {min_len}-{max_len}자)",
            suggestion=f"검색 결과에서 페이지를 효과적으로 설명하도록 {min_len}자 이상으로 작성하세요",
        ))
    elif content_len > max_len:
        issues.append(self._create_issue(
            code="S-02",
            severity="minor",
            message=f"meta description이 너무 깁니다 ({content_len}자, 권장 {min_len}-{max_len}자)",
            suggestion=f"검색 결과에서 잘리지 않도록 {max_len}자 이내로 줄이세요",
        ))
    return issues
def _check_meta_keywords(self, soup: BeautifulSoup, meta_info: dict) -> list[Issue]:
    """S-03: Check meta keywords (informational only)."""
    tag = soup.find("meta", attrs={"name": re.compile(r"^keywords$", re.I)})
    has_keywords = tag is not None and bool(tag.get("content"))
    meta_info["has_keywords"] = has_keywords
    if has_keywords:
        return []
    # Reported at "info" only — modern search engines ignore this tag.
    return [self._create_issue(
        code="S-03",
        severity="info",
        message="meta keywords가 없습니다 (현재 대부분의 검색엔진에서 무시됨)",
        suggestion="meta keywords는 SEO에 큰 영향이 없지만, 참고용으로 추가할 수 있습니다",
    )]
def _check_og_tags(self, soup: BeautifulSoup) -> list[Issue]:
    """S-04: Check Open Graph tags from YAML rule definitions.

    Falls back to a built-in required-tag list when the YAML rule
    provides none.
    """
    rule = self._get_seo_rule("seo-open-graph")
    configured = rule.get("details", {}).get("required_tags", [])
    if configured:
        wanted = [entry["property"] for entry in configured]
    else:
        wanted = ["og:title", "og:description", "og:image"]
    missing = [
        prop
        for prop in wanted
        if (tag := soup.find("meta", attrs={"property": prop})) is None
        or not tag.get("content")
    ]
    if not missing:
        return []
    return [self._create_issue(
        code="S-04",
        severity="major",
        message=f"Open Graph 태그가 누락되었습니다: {', '.join(missing)}",
        suggestion=f'누락된 OG 태그를 추가하세요. 예: <meta property="{missing[0]}" content="">',
    )]
def _check_twitter_card(self, soup: BeautifulSoup) -> list[Issue]:
    """S-05: Check Twitter Card tags (card or title tag counts as present)."""
    has_any = (
        soup.find("meta", attrs={"name": "twitter:card"}) is not None
        or soup.find("meta", attrs={"name": "twitter:title"}) is not None
    )
    if has_any:
        return []
    return [self._create_issue(
        code="S-05",
        severity="minor",
        message="Twitter Card 태그가 없습니다",
        suggestion='<meta name="twitter:card" content="summary_large_image">를 추가하세요',
    )]
def _check_canonical(self, soup: BeautifulSoup) -> list[Issue]:
    """S-06: Check that a canonical URL is declared with a non-empty href."""
    link = soup.find("link", attrs={"rel": "canonical"})
    if link is not None and link.get("href"):
        return []
    return [self._create_issue(
        code="S-06",
        severity="major",
        message="canonical URL이 설정되지 않았습니다",
        suggestion='<link rel="canonical" href="현재페이지URL">을 추가하여 중복 콘텐츠 문제를 방지하세요',
    )]
async def _check_robots_txt(self, url: str, meta_info: dict) -> list[Issue]:
    """S-07: Check robots.txt accessibility at the site root.

    Best-effort: any network/TLS failure is logged and reported as
    inaccessible rather than raised. Records meta_info["has_robots_txt"].
    """
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    try:
        # NOTE(review): verify=False skips TLS validation — presumably
        # deliberate so sites with broken certs can still be scanned; confirm.
        async with httpx.AsyncClient(timeout=httpx.Timeout(5.0), verify=False) as client:
            resp = await client.get(robots_url)
    except Exception as e:
        logger.warning("robots.txt check failed for %s: %s", url, str(e))
        meta_info["has_robots_txt"] = False
        return [self._create_issue(
            code="S-07",
            severity="major",
            message="robots.txt에 접근할 수 없습니다",
            suggestion="검색엔진 크롤링을 제어하기 위해 /robots.txt 파일을 생성하세요",
        )]
    if resp.status_code == 200:
        meta_info["has_robots_txt"] = True
        return []
    meta_info["has_robots_txt"] = False
    return [self._create_issue(
        code="S-07",
        severity="major",
        message=f"robots.txt에 접근할 수 없습니다 (HTTP {resp.status_code})",
        suggestion="검색엔진 크롤링을 제어하기 위해 /robots.txt 파일을 생성하세요",
    )]
async def _check_sitemap(self, url: str, meta_info: dict) -> list[Issue]:
    """S-08: Check sitemap.xml accessibility at the site root.

    Best-effort: any network/TLS failure is logged and reported as
    inaccessible rather than raised. Records meta_info["has_sitemap"].
    """
    parsed = urlparse(url)
    sitemap_url = f"{parsed.scheme}://{parsed.netloc}/sitemap.xml"
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(5.0), verify=False) as client:
            resp = await client.get(sitemap_url)
    except Exception as e:
        logger.warning("sitemap.xml check failed for %s: %s", url, str(e))
        meta_info["has_sitemap"] = False
        return [self._create_issue(
            code="S-08",
            severity="major",
            message="sitemap.xml에 접근할 수 없습니다",
            suggestion="검색엔진이 사이트 구조를 이해할 수 있도록 /sitemap.xml을 생성하세요",
        )]
    if resp.status_code == 200:
        meta_info["has_sitemap"] = True
        return []
    meta_info["has_sitemap"] = False
    return [self._create_issue(
        code="S-08",
        severity="major",
        message=f"sitemap.xml에 접근할 수 없습니다 (HTTP {resp.status_code})",
        suggestion="검색엔진이 사이트 구조를 이해할 수 있도록 /sitemap.xml을 생성하세요",
    )]
def _check_h1(self, soup: BeautifulSoup) -> list[Issue]:
    """S-09: Check H1 tag existence and uniqueness (exactly one expected)."""
    headings = soup.find_all("h1")
    count = len(headings)
    if count == 0:
        return [self._create_issue(
            code="S-09",
            severity="critical",
            message="H1 태그가 없습니다",
            suggestion="페이지의 주요 제목을 <h1> 태그로 추가하세요",
        )]
    if count > 1:
        return [self._create_issue(
            code="S-09",
            severity="critical",
            message=f"H1 태그가 {count}개 발견되었습니다 (1개 권장)",
            element=self._truncate_element(str(headings[0])),
            suggestion="페이지당 H1 태그는 1개만 사용하세요",
        )]
    return []
def _check_structured_data(self, soup: BeautifulSoup, html_content: str, meta_info: dict) -> list[Issue]:
    """S-10: Check for structured data (JSON-LD, Microdata, RDFa).

    Records the detected format labels in
    meta_info["structured_data_types"]. The *html_content* argument is
    unused here but kept for signature compatibility with check().
    """
    detectors = [
        ("JSON-LD", {"name": "script", "attrs": {"type": "application/ld+json"}}),
        ("Microdata", {"attrs": {"itemscope": True}}),
        ("RDFa", {"attrs": {"typeof": True}}),
    ]
    found = [
        label
        for label, query in detectors
        if soup.find_all(query.get("name"), attrs=query["attrs"])
    ]
    meta_info["structured_data_types"] = found
    if found:
        return []
    return [self._create_issue(
        code="S-10",
        severity="minor",
        message="구조화 데이터(JSON-LD, Microdata, RDFa)가 없습니다",
        suggestion='<script type="application/ld+json">을 사용하여 구조화 데이터를 추가하세요',
    )]
def _check_favicon(self, soup: BeautifulSoup) -> list[Issue]:
    """S-11: Check favicon existence (any <link rel> containing "icon")."""
    icon_link = soup.find("link", attrs={"rel": re.compile(r"icon", re.I)})
    if icon_link is not None:
        return []
    return [self._create_issue(
        code="S-11",
        severity="minor",
        message="favicon이 설정되지 않았습니다",
        suggestion='<link rel="icon" href="/favicon.ico">를 추가하세요',
    )]
def _check_viewport(self, soup: BeautifulSoup) -> list[Issue]:
    """S-12: Check viewport meta tag for mobile friendliness."""
    viewport_tag = soup.find("meta", attrs={"name": re.compile(r"^viewport$", re.I)})
    if viewport_tag is not None:
        return []
    return [self._create_issue(
        code="S-12",
        severity="major",
        message="viewport meta 태그가 없습니다 (모바일 친화성 부족)",
        suggestion='<meta name="viewport" content="width=device-width, initial-scale=1.0">을 추가하세요',
    )]
def _check_url_structure(self, url: str) -> list[Issue]:
    """S-13: Check the URL path for SEO-unfriendly special characters.

    Letters, digits, slash, hyphen, underscore, and dot are allowed;
    more than 3 other characters triggers a minor issue.
    """
    path = urlparse(url).path
    unusual = re.findall(r"[^a-zA-Z0-9/\-_.]", path)
    if len(unusual) <= 3:
        return []
    return [self._create_issue(
        code="S-13",
        severity="minor",
        message=f"URL에 특수 문자가 많습니다 ({len(unusual)}개)",
        suggestion="URL은 영문, 숫자, 하이픈(-)을 사용하여 깔끔하게 구성하세요",
    )]
def _check_img_alt_seo(self, soup: BeautifulSoup) -> list[Issue]:
    """S-14: Check image alt attributes from an SEO perspective.

    Only images with no alt attribute at all are flagged; an explicitly
    empty alt="" is valid (decorative image convention).
    """
    images = soup.find_all("img")
    if not images:
        return []
    # Simplified from `not img.get("alt") and img.get("alt") != ""`,
    # which is exactly "alt attribute absent" (None); behavior unchanged.
    missing_alt = [img for img in images if img.get("alt") is None]
    if not missing_alt:
        return []
    return [self._create_issue(
        code="S-14",
        severity="major",
        message=f"alt 속성이 없는 이미지가 {len(missing_alt)}개 발견되었습니다",
        # missing_alt is guaranteed non-empty here, so the former
        # `if missing_alt else None` guard was dead code.
        element=self._truncate_element(str(missing_alt[0])),
        suggestion="검색엔진이 이미지를 이해할 수 있도록 모든 이미지에 설명적인 alt 속성을 추가하세요",
    )]
@staticmethod
def _truncate_element(element_str: str, max_len: int = 200) -> str:
    """Clip *element_str* to *max_len* characters, appending "..." if clipped."""
    if len(element_str) <= max_len:
        return element_str
    return element_str[:max_len] + "..."