Files
jungwoo choi 44ad36e2ab refactor: 4개 검사 엔진을 YAML 기반 표준 규칙으로 리팩토링
- YAML 규칙 파일 4개 신규 생성 (html_css, accessibility, seo, performance_security)
  W3C, WCAG 2.0/2.1/2.2, OWASP, Google Search Essentials 공식 표준 기반
- rules/__init__.py: YAML 로더 + 캐싱 + 리로드 모듈
- html_css.py: 30개 폐기 요소, 100+개 폐기 속성을 YAML에서 동적 로드
- accessibility.py: WCAG 버전 선택 지원 (wcag_version 파라미터)
- seo.py: title/description 길이, OG 필수 태그 등 임계값 YAML 로드
- performance_security.py: COOP/COEP/CORP 검사 추가, 정보 노출 헤더 검사 추가,
  TTFB/페이지 크기 임계값 YAML 로드
- PyYAML 의존성 추가

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 15:49:57 +09:00

409 lines
17 KiB
Python

"""
SEO Optimization Checker Engine (F-004).
Checks meta tags, OG tags, robots.txt, sitemap.xml, structured data, etc.
"""
import re
import json
import logging
from urllib.parse import urlparse, urljoin
from typing import Any, Optional
import httpx
from bs4 import BeautifulSoup
from app.engines.base import BaseChecker
from app.models.schemas import CategoryResult, Issue
from app.rules import get_rules
logger = logging.getLogger(__name__)
class SeoChecker(BaseChecker):
    """Checker engine for on-page SEO signals (F-004).

    Rule metadata and numeric thresholds are loaded once from the "seo"
    YAML rule set at construction time and consulted by the individual
    sub-checks.
    """

    def __init__(self, **kwargs):
        """Initialize the base checker and cache the SEO rule set."""
        super().__init__(**kwargs)
        # Loaded once per instance; checks read values via _get_threshold().
        self._rules_data = get_rules("seo")
def _get_seo_rule(self, rule_id: str) -> dict[str, Any]:
    """Return the YAML rule entry whose ``id`` equals *rule_id*, or ``{}``."""
    candidates = (
        entry
        for entry in self._rules_data.get("rules", [])
        if entry.get("id") == rule_id
    )
    # First match wins; an empty dict keeps downstream .get() chains safe.
    return next(candidates, {})
def _get_threshold(self, rule_id: str, key: str, default: Any = None) -> Any:
    """Fetch threshold *key* from a rule's ``details`` mapping, else *default*."""
    # _get_seo_rule returns {} for unknown ids, so the chain never raises.
    return self._get_seo_rule(rule_id).get("details", {}).get(key, default)
@property
def category_name(self) -> str:
    """Result-category key this checker reports under."""
    return "seo"
async def check(self, url: str, html_content: str, headers: dict) -> CategoryResult:
    """Run every SEO sub-check against the page and build the category result.

    Progress is reported between stages; the score is computed by
    deduction from the collected issues.
    """
    soup = BeautifulSoup(html_content, "html5lib")
    issues: list[Issue] = []
    meta_info: dict = {}

    await self.update_progress(10, "title 태그 검사 중...")
    issues.extend(self._check_title(soup, meta_info))

    await self.update_progress(20, "meta description 검사 중...")
    issues.extend(self._check_meta_description(soup, meta_info))
    issues.extend(self._check_meta_keywords(soup, meta_info))

    await self.update_progress(30, "OG 태그 검사 중...")
    issues.extend(self._check_og_tags(soup))
    issues.extend(self._check_twitter_card(soup))

    await self.update_progress(40, "canonical URL 검사 중...")
    issues.extend(self._check_canonical(soup))

    await self.update_progress(50, "robots.txt 확인 중...")
    issues.extend(await self._check_robots_txt(url, meta_info))

    await self.update_progress(60, "sitemap.xml 확인 중...")
    issues.extend(await self._check_sitemap(url, meta_info))

    await self.update_progress(70, "H1 태그 검사 중...")
    issues.extend(self._check_h1(soup))

    await self.update_progress(80, "구조화 데이터 검사 중...")
    issues.extend(self._check_structured_data(soup, html_content, meta_info))

    await self.update_progress(90, "기타 항목 검사 중...")
    issues.extend(self._check_favicon(soup))
    issues.extend(self._check_viewport(soup))
    issues.extend(self._check_url_structure(url))
    issues.extend(self._check_img_alt_seo(soup))

    score = self._calculate_score_by_deduction(issues)
    await self.update_progress(100, "완료")
    return self._build_result(
        category="seo",
        score=score,
        issues=issues,
        meta_info=meta_info,
    )
def _check_title(self, soup: BeautifulSoup, meta_info: dict) -> list[Issue]:
    """S-01: Check title tag existence and length.

    Length bounds come from the "seo-title-tag" YAML rule (defaults 10/60).
    Records meta_info["title"] and meta_info["title_length"].
    """
    issues = []
    title = soup.find("title")
    min_len = self._get_threshold("seo-title-tag", "min_length", 10)
    max_len = self._get_threshold("seo-title-tag", "max_length", 60)
    if title is None or not title.string or title.string.strip() == "":
        meta_info["title"] = None
        meta_info["title_length"] = 0
        issues.append(self._create_issue(
            code="S-01",
            severity="critical",
            message="<title> 태그가 없거나 비어있습니다",
            # BUG FIX: suggestions previously hard-coded a corrupted
            # "26,094" character limit; interpolate the YAML max_len.
            suggestion=f"검색 결과에 표시될 {min_len}-{max_len}자 길이의 페이지 제목을 설정하세요",
        ))
        return issues
    title_text = title.string.strip()
    title_len = len(title_text)
    meta_info["title"] = title_text
    meta_info["title_length"] = title_len
    if title_len < min_len:
        issues.append(self._create_issue(
            code="S-01",
            severity="critical",
            message=f"title이 너무 짧습니다 ({title_len}자, 권장 {min_len}-{max_len}자)",
            element=f"<title>{title_text}</title>",
            suggestion=f"검색 결과에 효과적으로 표시되도록 {min_len}자 이상의 제목을 작성하세요",
        ))
    elif title_len > max_len:
        issues.append(self._create_issue(
            code="S-01",
            severity="minor",
            message=f"title이 너무 깁니다 ({title_len}자, 권장 {min_len}-{max_len}자)",
            element=f"<title>{title_text[:50]}...</title>",
            suggestion=f"검색 결과에서 잘리지 않도록 {max_len}자 이내로 제목을 줄이세요",
        ))
    return issues
def _check_meta_description(self, soup: BeautifulSoup, meta_info: dict) -> list[Issue]:
    """S-02: Check meta description existence and length.

    Length bounds come from the "seo-meta-description" YAML rule
    (defaults 50/160). Records meta_info["description"] and
    meta_info["description_length"].
    """
    issues = []
    desc = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
    min_len = self._get_threshold("seo-meta-description", "min_length", 50)
    max_len = self._get_threshold("seo-meta-description", "max_length", 160)
    if desc is None or not desc.get("content"):
        meta_info["description"] = None
        meta_info["description_length"] = 0
        issues.append(self._create_issue(
            code="S-02",
            severity="major",
            message="meta description이 없습니다",
            # BUG FIX: suggestions previously hard-coded a corrupted
            # "26,094" character limit; interpolate the YAML max_len.
            suggestion=f'<meta name="description" content="페이지 설명">을 추가하세요 ({min_len}-{max_len}자 권장)',
        ))
        return issues
    content = desc["content"].strip()
    content_len = len(content)
    meta_info["description"] = content
    meta_info["description_length"] = content_len
    if content_len < min_len:
        issues.append(self._create_issue(
            code="S-02",
            severity="major",
            message=f"meta description이 너무 짧습니다 ({content_len}자, 권장 {min_len}-{max_len}자)",
            suggestion=f"검색 결과에서 페이지를 효과적으로 설명하도록 {min_len}자 이상으로 작성하세요",
        ))
    elif content_len > max_len:
        issues.append(self._create_issue(
            code="S-02",
            severity="minor",
            message=f"meta description이 너무 깁니다 ({content_len}자, 권장 {min_len}-{max_len}자)",
            suggestion=f"검색 결과에서 잘리지 않도록 {max_len}자 이내로 줄이세요",
        ))
    return issues
def _check_meta_keywords(self, soup: BeautifulSoup, meta_info: dict) -> list[Issue]:
    """S-03: Check meta keywords (informational only)."""
    tag = soup.find("meta", attrs={"name": re.compile(r"^keywords$", re.I)})
    has_keywords = tag is not None and bool(tag.get("content"))
    meta_info["has_keywords"] = has_keywords
    if has_keywords:
        return []
    # Reported at "info" only — modern search engines ignore this tag.
    return [self._create_issue(
        code="S-03",
        severity="info",
        message="meta keywords가 없습니다 (현재 대부분의 검색엔진에서 무시됨)",
        suggestion="meta keywords는 SEO에 큰 영향이 없지만, 참고용으로 추가할 수 있습니다",
    )]
def _check_og_tags(self, soup: BeautifulSoup) -> list[Issue]:
    """S-04: Check Open Graph tags from YAML rule definitions.

    Falls back to a built-in required-tag list when the YAML rule
    provides none.
    """
    rule = self._get_seo_rule("seo-open-graph")
    configured = rule.get("details", {}).get("required_tags", [])
    if configured:
        wanted = [entry["property"] for entry in configured]
    else:
        wanted = ["og:title", "og:description", "og:image"]
    missing = [
        prop
        for prop in wanted
        if (tag := soup.find("meta", attrs={"property": prop})) is None
        or not tag.get("content")
    ]
    if not missing:
        return []
    return [self._create_issue(
        code="S-04",
        severity="major",
        message=f"Open Graph 태그가 누락되었습니다: {', '.join(missing)}",
        suggestion=f'누락된 OG 태그를 추가하세요. 예: <meta property="{missing[0]}" content="">',
    )]
def _check_twitter_card(self, soup: BeautifulSoup) -> list[Issue]:
    """S-05: Check Twitter Card tags (card or title tag counts as present)."""
    has_any = (
        soup.find("meta", attrs={"name": "twitter:card"}) is not None
        or soup.find("meta", attrs={"name": "twitter:title"}) is not None
    )
    if has_any:
        return []
    return [self._create_issue(
        code="S-05",
        severity="minor",
        message="Twitter Card 태그가 없습니다",
        suggestion='<meta name="twitter:card" content="summary_large_image">를 추가하세요',
    )]
def _check_canonical(self, soup: BeautifulSoup) -> list[Issue]:
    """S-06: Check that a canonical URL is declared with a non-empty href."""
    link = soup.find("link", attrs={"rel": "canonical"})
    if link is not None and link.get("href"):
        return []
    return [self._create_issue(
        code="S-06",
        severity="major",
        message="canonical URL이 설정되지 않았습니다",
        suggestion='<link rel="canonical" href="현재페이지URL">을 추가하여 중복 콘텐츠 문제를 방지하세요',
    )]
async def _check_robots_txt(self, url: str, meta_info: dict) -> list[Issue]:
    """S-07: Check robots.txt accessibility at the site root.

    Best-effort: any network/TLS failure is logged and reported as
    inaccessible rather than raised. Records meta_info["has_robots_txt"].
    """
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    try:
        # NOTE(review): verify=False skips TLS validation — presumably
        # deliberate so sites with broken certs can still be scanned; confirm.
        async with httpx.AsyncClient(timeout=httpx.Timeout(5.0), verify=False) as client:
            resp = await client.get(robots_url)
    except Exception as e:
        logger.warning("robots.txt check failed for %s: %s", url, str(e))
        meta_info["has_robots_txt"] = False
        return [self._create_issue(
            code="S-07",
            severity="major",
            message="robots.txt에 접근할 수 없습니다",
            suggestion="검색엔진 크롤링을 제어하기 위해 /robots.txt 파일을 생성하세요",
        )]
    if resp.status_code == 200:
        meta_info["has_robots_txt"] = True
        return []
    meta_info["has_robots_txt"] = False
    return [self._create_issue(
        code="S-07",
        severity="major",
        message=f"robots.txt에 접근할 수 없습니다 (HTTP {resp.status_code})",
        suggestion="검색엔진 크롤링을 제어하기 위해 /robots.txt 파일을 생성하세요",
    )]
async def _check_sitemap(self, url: str, meta_info: dict) -> list[Issue]:
    """S-08: Check sitemap.xml accessibility at the site root.

    Best-effort: any network/TLS failure is logged and reported as
    inaccessible rather than raised. Records meta_info["has_sitemap"].
    """
    parsed = urlparse(url)
    sitemap_url = f"{parsed.scheme}://{parsed.netloc}/sitemap.xml"
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(5.0), verify=False) as client:
            resp = await client.get(sitemap_url)
    except Exception as e:
        logger.warning("sitemap.xml check failed for %s: %s", url, str(e))
        meta_info["has_sitemap"] = False
        return [self._create_issue(
            code="S-08",
            severity="major",
            message="sitemap.xml에 접근할 수 없습니다",
            suggestion="검색엔진이 사이트 구조를 이해할 수 있도록 /sitemap.xml을 생성하세요",
        )]
    if resp.status_code == 200:
        meta_info["has_sitemap"] = True
        return []
    meta_info["has_sitemap"] = False
    return [self._create_issue(
        code="S-08",
        severity="major",
        message=f"sitemap.xml에 접근할 수 없습니다 (HTTP {resp.status_code})",
        suggestion="검색엔진이 사이트 구조를 이해할 수 있도록 /sitemap.xml을 생성하세요",
    )]
def _check_h1(self, soup: BeautifulSoup) -> list[Issue]:
    """S-09: Check H1 tag existence and uniqueness (exactly one expected)."""
    headings = soup.find_all("h1")
    count = len(headings)
    if count == 0:
        return [self._create_issue(
            code="S-09",
            severity="critical",
            message="H1 태그가 없습니다",
            suggestion="페이지의 주요 제목을 <h1> 태그로 추가하세요",
        )]
    if count > 1:
        return [self._create_issue(
            code="S-09",
            severity="critical",
            message=f"H1 태그가 {count}개 발견되었습니다 (1개 권장)",
            element=self._truncate_element(str(headings[0])),
            suggestion="페이지당 H1 태그는 1개만 사용하세요",
        )]
    return []
def _check_structured_data(self, soup: BeautifulSoup, html_content: str, meta_info: dict) -> list[Issue]:
    """S-10: Check for structured data (JSON-LD, Microdata, RDFa).

    Records the detected format labels in
    meta_info["structured_data_types"]. The *html_content* argument is
    unused here but kept for signature compatibility with check().
    """
    detectors = [
        ("JSON-LD", {"name": "script", "attrs": {"type": "application/ld+json"}}),
        ("Microdata", {"attrs": {"itemscope": True}}),
        ("RDFa", {"attrs": {"typeof": True}}),
    ]
    found = [
        label
        for label, query in detectors
        if soup.find_all(query.get("name"), attrs=query["attrs"])
    ]
    meta_info["structured_data_types"] = found
    if found:
        return []
    return [self._create_issue(
        code="S-10",
        severity="minor",
        message="구조화 데이터(JSON-LD, Microdata, RDFa)가 없습니다",
        suggestion='<script type="application/ld+json">을 사용하여 구조화 데이터를 추가하세요',
    )]
def _check_favicon(self, soup: BeautifulSoup) -> list[Issue]:
    """S-11: Check favicon existence (any <link rel> containing "icon")."""
    icon_link = soup.find("link", attrs={"rel": re.compile(r"icon", re.I)})
    if icon_link is not None:
        return []
    return [self._create_issue(
        code="S-11",
        severity="minor",
        message="favicon이 설정되지 않았습니다",
        suggestion='<link rel="icon" href="/favicon.ico">를 추가하세요',
    )]
def _check_viewport(self, soup: BeautifulSoup) -> list[Issue]:
    """S-12: Check viewport meta tag for mobile friendliness."""
    viewport_tag = soup.find("meta", attrs={"name": re.compile(r"^viewport$", re.I)})
    if viewport_tag is not None:
        return []
    return [self._create_issue(
        code="S-12",
        severity="major",
        message="viewport meta 태그가 없습니다 (모바일 친화성 부족)",
        suggestion='<meta name="viewport" content="width=device-width, initial-scale=1.0">을 추가하세요',
    )]
def _check_url_structure(self, url: str) -> list[Issue]:
    """S-13: Check the URL path for SEO-unfriendly special characters.

    Letters, digits, slash, hyphen, underscore, and dot are allowed;
    more than 3 other characters triggers a minor issue.
    """
    path = urlparse(url).path
    unusual = re.findall(r"[^a-zA-Z0-9/\-_.]", path)
    if len(unusual) <= 3:
        return []
    return [self._create_issue(
        code="S-13",
        severity="minor",
        message=f"URL에 특수 문자가 많습니다 ({len(unusual)}개)",
        suggestion="URL은 영문, 숫자, 하이픈(-)을 사용하여 깔끔하게 구성하세요",
    )]
def _check_img_alt_seo(self, soup: BeautifulSoup) -> list[Issue]:
    """S-14: Check image alt attributes from an SEO perspective.

    Only images with no alt attribute at all are flagged; an explicitly
    empty alt="" is valid (decorative image convention).
    """
    images = soup.find_all("img")
    if not images:
        return []
    # Simplified from `not img.get("alt") and img.get("alt") != ""`,
    # which is exactly "alt attribute absent" (None); behavior unchanged.
    missing_alt = [img for img in images if img.get("alt") is None]
    if not missing_alt:
        return []
    return [self._create_issue(
        code="S-14",
        severity="major",
        message=f"alt 속성이 없는 이미지가 {len(missing_alt)}개 발견되었습니다",
        # missing_alt is guaranteed non-empty here, so the former
        # `if missing_alt else None` guard was dead code.
        element=self._truncate_element(str(missing_alt[0])),
        suggestion="검색엔진이 이미지를 이해할 수 있도록 모든 이미지에 설명적인 alt 속성을 추가하세요",
    )]
@staticmethod
def _truncate_element(element_str: str, max_len: int = 200) -> str:
    """Clip *element_str* to *max_len* characters, appending "..." if clipped."""
    if len(element_str) <= max_len:
        return element_str
    return element_str[:max_len] + "..."