Initial commit - cleaned repository

This commit is contained in:
jungwoo choi
2025-09-28 20:41:57 +09:00
commit e3c28f796a
188 changed files with 28102 additions and 0 deletions

View File

@ -0,0 +1,21 @@
# Google Custom Search API Configuration
# Get your API key from: https://console.cloud.google.com/apis/credentials
GOOGLE_API_KEY=
# Get your Search Engine ID from: https://programmablesearchengine.google.com/
GOOGLE_SEARCH_ENGINE_ID=
# Alternative: SerpAPI Configuration
# Get your API key from: https://serpapi.com/
SERPAPI_KEY=
# Redis Configuration (hostname matches the docker-compose service name)
REDIS_HOST=redis
REDIS_PORT=6379
# Dedicated DB index so flushing this service's cache can't touch other apps
REDIS_DB=2
# Search Settings
DEFAULT_LANGUAGE=ko
DEFAULT_COUNTRY=kr
# Cache lifetime in seconds (3600 = 1 hour)
CACHE_TTL=3600
MAX_RESULTS=10

View File

@ -0,0 +1,10 @@
# Slim Python base keeps the image small; version pinned to match dev.
FROM python:3.11-slim
WORKDIR /app
# Copy requirements alone first so the pip layer is cached across code edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# NOTE(review): --reload is a development flag (watches files, single worker);
# drop it for production deployments.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]

View File

@ -0,0 +1,30 @@
from pydantic_settings import BaseSettings
from typing import Optional
class Settings(BaseSettings):
    """Runtime configuration, loaded from environment variables / .env.

    Field names map to upper-cased env vars (pydantic-settings default),
    e.g. ``google_api_key`` <- ``GOOGLE_API_KEY``.
    """

    # Google Custom Search API credentials (primary backend)
    google_api_key: Optional[str] = None
    google_search_engine_id: Optional[str] = None

    # SerpAPI key (secondary backend)
    serpapi_key: Optional[str] = None

    # Redis cache connection
    redis_host: str = "redis"
    redis_port: int = 6379
    redis_db: int = 2
    cache_ttl: int = 3600  # seconds (1 hour)

    # Search defaults
    max_results: int = 10
    default_language: str = "ko"
    default_country: str = "kr"

    # Service metadata
    service_name: str = "Google Search Service"
    debug: bool = True

    class Config:
        env_file = ".env"


# Shared singleton imported throughout the service.
settings = Settings()

View File

@ -0,0 +1,188 @@
from fastapi import FastAPI, Query, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from typing import Optional
from datetime import datetime
from contextlib import asynccontextmanager
from .search_service import GoogleSearchService
from .config import settings
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Log around the application's lifetime: startup before yield, shutdown after."""
    print("Google Search Service starting...")
    yield  # application serves requests while suspended here
    print("Google Search Service stopping...")
# FastAPI application object; lifespan hooks log start/stop.
app = FastAPI(
    title="Google Search Service",
    description="구글 검색 결과를 수신하는 서비스",
    version="1.0.0",
    lifespan=lifespan
)
# CORS: wide open (any origin/method/header). Acceptable for an internal
# service; tighten allow_origins before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Single shared search-service instance (owns the Redis cache connection).
search_service = GoogleSearchService()
@app.get("/")
async def root():
    """Service metadata plus a directory of every available endpoint."""
    return {
        "service": "Google Search Service",
        "version": "1.0.0",
        "timestamp": datetime.now().isoformat(),
        "endpoints": {
            "search": "/api/search?q=keyword",
            "custom_search": "/api/search/custom?q=keyword",
            "serpapi_search": "/api/search/serpapi?q=keyword",
            "scraping_search": "/api/search/scraping?q=keyword",
            # FIX: these routes exist (see /api/search/full and
            # /api/clear-cache handlers) but were missing from the directory.
            "full_search": "/api/search/full?q=keyword",
            "clear_cache": "/api/clear-cache",
            "trending": "/api/trending",
            "health": "/health"
        }
    }
@app.get("/health")
async def health_check():
    """Liveness probe: unconditionally healthy, with the current timestamp."""
    return {
        "status": "healthy",
        "service": "google-search",
        "timestamp": datetime.now().isoformat(),
    }
@app.get("/api/search")
async def search(
    q: str = Query(..., description="검색 키워드"),
    num: int = Query(10, description="결과 개수", ge=1, le=20),
    lang: Optional[str] = Query(None, description="언어 코드 (ko, en 등)"),
    country: Optional[str] = Query(None, description="국가 코드 (kr, us 등)"),
    date_restrict: Optional[str] = Query(None, description="날짜 제한 (d7=일주일, m1=한달, m3=3개월, y1=1년)"),
    sort_by_date: bool = Query(False, description="최신순 정렬")
):
    """Search Google with the best available backend.

    Backends are tried in order of reliability:
    1. Google Custom Search API (when both credentials are configured)
    2. SerpAPI (when its key is configured)
    3. Web scraping as a last-resort fallback
    """
    # Preferred: official Custom Search API (needs key AND engine id).
    if settings.google_api_key and settings.google_search_engine_id:
        result = await search_service.search_with_custom_api(
            q, num, lang, country, date_restrict, sort_by_date
        )
        if not result.get("error"):
            result["method"] = "google_custom_search"
            return result
    # Second choice: SerpAPI.
    if settings.serpapi_key:
        result = await search_service.search_with_serpapi(q, num, lang, country)
        if not result.get("error"):
            result["method"] = "serpapi"
            return result
    # Fallback: scrape the public results page.
    result = await search_service.search_with_scraping(q, num, lang)
    result["method"] = "web_scraping"
    result["warning"] = "API 키가 설정되지 않아 웹 스크래핑을 사용합니다. 제한적이고 불안정할 수 있습니다."
    return result
@app.get("/api/search/custom")
async def search_custom(
    q: str = Query(..., description="검색 키워드"),
    num: int = Query(10, description="결과 개수", ge=1, le=10),
    lang: Optional[str] = Query(None, description="언어 코드"),
    country: Optional[str] = Query(None, description="국가 코드")
):
    """Search via the Google Custom Search API only (503 when unconfigured)."""
    configured = settings.google_api_key and settings.google_search_engine_id
    if not configured:
        raise HTTPException(
            status_code=503,
            detail="Google Custom Search API credentials not configured"
        )
    result = await search_service.search_with_custom_api(q, num, lang, country)
    if result.get("error"):
        raise HTTPException(status_code=500, detail=result["error"])
    return result
@app.get("/api/search/serpapi")
async def search_serpapi(
    q: str = Query(..., description="검색 키워드"),
    num: int = Query(10, description="결과 개수", ge=1, le=50),
    lang: Optional[str] = Query(None, description="언어 코드"),
    country: Optional[str] = Query(None, description="국가 코드")
):
    """Search via SerpAPI only (503 when no API key is configured)."""
    if not settings.serpapi_key:
        raise HTTPException(
            status_code=503,
            detail="SerpAPI key not configured"
        )
    result = await search_service.search_with_serpapi(q, num, lang, country)
    if result.get("error"):
        raise HTTPException(status_code=500, detail=result["error"])
    return result
@app.get("/api/search/scraping")
async def search_scraping(
    q: str = Query(..., description="검색 키워드"),
    num: int = Query(10, description="결과 개수", ge=1, le=20),
    lang: Optional[str] = Query(None, description="언어 코드")
):
    """Search by scraping Google's public results page (limited, may break)."""
    result = await search_service.search_with_scraping(q, num, lang)
    if result.get("error"):
        raise HTTPException(status_code=500, detail=result["error"])
    result["warning"] = "웹 스크래핑은 제한적이고 불안정할 수 있습니다"
    return result
@app.get("/api/search/full")
async def search_with_full_content(
    q: str = Query(..., description="검색 키워드"),
    num: int = Query(5, description="결과 개수", ge=1, le=10),
    lang: Optional[str] = Query(None, description="언어 코드 (ko, en 등)"),
    country: Optional[str] = Query(None, description="국가 코드 (kr, us 등)")
):
    """Search, then download each result page's full text.

    Warning: this fetches every hit's page, so it can be slow.
    """
    result = await search_service.search_with_full_content(q, num, lang, country)
    if result.get("error"):
        raise HTTPException(status_code=500, detail=result["error"])
    return result
@app.get("/api/trending")
async def get_trending(
    country: Optional[str] = Query(None, description="국가 코드 (kr, us 등)")
):
    """Return real-time trending search terms for a country."""
    result = await search_service.get_trending_searches(country)
    if result.get("error"):
        raise HTTPException(status_code=500, detail=result["error"])
    return result
@app.post("/api/clear-cache")
async def clear_cache():
    """Flush every cached entry in this service's dedicated Redis DB."""
    try:
        search_service.redis_client.flushdb()
    except Exception as e:
        # Surface Redis failures as a 500 with the underlying message.
        raise HTTPException(status_code=500, detail=str(e))
    return {
        "status": "success",
        "message": "캐시가 초기화되었습니다"
    }

View File

@ -0,0 +1,540 @@
import httpx
import json
import redis
from typing import List, Dict, Optional
from datetime import datetime
import hashlib
from bs4 import BeautifulSoup
from .config import settings
class GoogleSearchService:
    """Google search client with Redis-backed response caching."""

    def __init__(self):
        # Shared Redis connection used as the search-result cache.
        # decode_responses=True makes GET return str (not bytes), so cached
        # JSON can be passed straight to json.loads.
        self.redis_client = redis.Redis(
            host=settings.redis_host,
            port=settings.redis_port,
            db=settings.redis_db,
            decode_responses=True
        )
def _get_cache_key(self, query: str, **kwargs) -> str:
"""캐시 키 생성"""
cache_data = f"{query}_{kwargs}"
return f"google_search:{hashlib.md5(cache_data.encode()).hexdigest()}"
async def search_with_custom_api(
    self,
    query: str,
    num_results: int = 10,
    language: str = None,
    country: str = None,
    date_restrict: str = None,
    sort_by_date: bool = False
) -> Dict:
    """Search via the Google Custom Search JSON API, with Redis caching.

    Args:
        query: search terms.
        num_results: desired result count. The API caps each request at
            10 items, so up to two requests are issued (max 20 results).
        language / country: "hl" / "gl" parameters; default to settings.
        date_restrict: age filter, e.g. "d7" (week), "m1" (month), "y1" (year).
        sort_by_date: request newest-first ordering (API "sort=date").

    Returns:
        {"query", "total_results", "search_time", "results", "timestamp"},
        or {"error": ..., "results": []} when credentials are missing or
        the first request fails.
    """
    if not settings.google_api_key or not settings.google_search_engine_id:
        return {
            "error": "Google API credentials not configured",
            "results": []
        }
    # BUGFIX: the cache key must include every parameter that changes the
    # result set. date_restrict and sort_by_date were previously omitted,
    # so date-filtered/sorted queries returned stale unfiltered cache hits.
    cache_key = self._get_cache_key(
        query,
        num=num_results,
        lang=language,
        country=country,
        date_restrict=date_restrict,
        sort_by_date=sort_by_date,
    )
    cached = self.redis_client.get(cache_key)
    if cached:
        return json.loads(cached)
    url = "https://www.googleapis.com/customsearch/v1"
    all_results = []
    total_results_info = None
    # The API returns at most 10 items per call; issue up to 2 calls.
    num_requests = min((num_results + 9) // 10, 2)
    async with httpx.AsyncClient() as client:
        for page in range(num_requests):
            start_index = page * 10 + 1  # "start" is 1-based
            current_num = min(10, num_results - page * 10)
            params = {
                "key": settings.google_api_key,
                "cx": settings.google_search_engine_id,
                "q": query,
                "num": current_num,
                "start": start_index,
                "hl": language or settings.default_language,
                "gl": country or settings.default_country
            }
            if date_restrict:
                params["dateRestrict"] = date_restrict
            if sort_by_date:
                params["sort"] = "date"
            try:
                response = await client.get(url, params=params)
                response.raise_for_status()
                data = response.json()
                # Aggregate info comes only from the first page.
                if page == 0:
                    total_results_info = {
                        "total_results": data.get("searchInformation", {}).get("totalResults"),
                        "search_time": data.get("searchInformation", {}).get("searchTime"),
                        "query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms")
                    }
                for item in data.get("items", []):
                    all_results.append({
                        "title": item.get("title"),
                        "link": item.get("link"),
                        "snippet": item.get("snippet"),
                        "display_link": item.get("displayLink"),
                        "thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None
                    })
            except Exception as e:
                # A failing first page is a hard error; a failing second
                # page degrades gracefully to the first page's results.
                if page == 0:
                    return {
                        "error": str(e),
                        "results": []
                    }
                break
    results = {
        "query": total_results_info.get("query") if total_results_info else query,
        "total_results": total_results_info.get("total_results") if total_results_info else "0",
        "search_time": total_results_info.get("search_time") if total_results_info else 0,
        "results": all_results[:num_results],  # trim to the requested count
        "timestamp": datetime.utcnow().isoformat()
    }
    self.redis_client.setex(
        cache_key,
        settings.cache_ttl,
        json.dumps(results)
    )
    return results
async def search_with_serpapi(
    self,
    query: str,
    num_results: int = 10,
    language: str = None,
    country: str = None
) -> Dict:
    """Run a Google search through SerpAPI (paid third-party service).

    Results are cached in Redis for settings.cache_ttl seconds; error
    responses are returned as {"error": ..., "results": []} and not cached.
    """
    if not settings.serpapi_key:
        return {
            "error": "SerpAPI key not configured",
            "results": []
        }
    cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country)
    hit = self.redis_client.get(cache_key)
    if hit:
        return json.loads(hit)
    from serpapi import GoogleSearch
    try:
        engine = GoogleSearch({
            "q": query,
            "api_key": settings.serpapi_key,
            "num": num_results,
            "hl": language or settings.default_language,
            "gl": country or settings.default_country
        })
        formatted = self._format_serpapi_results(engine.get_dict())
        self.redis_client.setex(
            cache_key,
            settings.cache_ttl,
            json.dumps(formatted)
        )
        return formatted
    except Exception as e:
        return {
            "error": str(e),
            "results": []
        }
async def search_with_scraping(
    self,
    query: str,
    num_results: int = 10,
    language: str = None
) -> Dict:
    """Scrape Google's public results page (fragile; last-resort backend).

    Returns the same envelope as the API-backed methods, or
    {"error": ..., "results": []} when the request fails.
    """
    cache_key = self._get_cache_key(query, num=num_results, lang=language)
    cached = self.redis_client.get(cache_key)
    if cached:
        return json.loads(cached)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    params = {
        "q": query,
        "num": num_results,
        "hl": language or settings.default_language
    }
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(
                "https://www.google.com/search",
                params=params,
                headers=headers,
                follow_redirects=True
            )
            # BUGFIX: without this check a blocked/captcha response
            # (e.g. 429/503) was parsed as an empty result list and then
            # cached for cache_ttl, poisoning subsequent identical queries.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            results = self._parse_google_html(soup)
            formatted_results = {
                "query": query,
                "total_results": len(results),
                "results": results,
                "timestamp": datetime.utcnow().isoformat()
            }
            self.redis_client.setex(
                cache_key,
                settings.cache_ttl,
                json.dumps(formatted_results)
            )
            return formatted_results
        except Exception as e:
            return {
                "error": str(e),
                "results": []
            }
def _format_google_results(self, data: Dict) -> Dict:
"""Google API 결과 포맷팅"""
results = []
for item in data.get("items", []):
results.append({
"title": item.get("title"),
"link": item.get("link"),
"snippet": item.get("snippet"),
"display_link": item.get("displayLink"),
"thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None
})
return {
"query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms"),
"total_results": data.get("searchInformation", {}).get("totalResults"),
"search_time": data.get("searchInformation", {}).get("searchTime"),
"results": results,
"timestamp": datetime.utcnow().isoformat()
}
def _format_serpapi_results(self, data: Dict) -> Dict:
"""SerpAPI 결과 포맷팅"""
results = []
for item in data.get("organic_results", []):
results.append({
"title": item.get("title"),
"link": item.get("link"),
"snippet": item.get("snippet"),
"position": item.get("position"),
"thumbnail": item.get("thumbnail"),
"date": item.get("date")
})
# 관련 검색어
related_searches = [
item.get("query") for item in data.get("related_searches", [])
]
return {
"query": data.get("search_parameters", {}).get("q"),
"total_results": data.get("search_information", {}).get("total_results"),
"search_time": data.get("search_information", {}).get("time_taken_displayed"),
"results": results,
"related_searches": related_searches,
"timestamp": datetime.utcnow().isoformat()
}
def _parse_google_html(self, soup: BeautifulSoup) -> List[Dict]:
    """Extract organic result entries from a scraped Google results page.

    NOTE(review): relies on Google's markup (div.g containers, span.st /
    div.s snippets), which Google changes frequently — verify selectors
    still match before depending on this.
    """
    results = []
    for container in soup.find_all('div', class_='g'):
        anchors = container.find_all('a')
        if not anchors:
            continue
        link = anchors[0].get('href', '')
        title_elem = container.find('h3')
        if not (title_elem and link):
            continue
        snippet_elem = container.find('span', class_='st') or container.find('div', class_='s')
        results.append({
            "title": title_elem.get_text(),
            "link": link,
            "snippet": snippet_elem.get_text() if snippet_elem else ""
        })
    return results
async def fetch_page_content(self, url: str) -> Dict:
    """Download *url* and extract its main readable text.

    Tries progressively broader containers in a fixed order —
    <article>, then <main>, then content-ish <div>s, then <body> —
    and returns up to 5000 chars of whitespace-normalized text plus
    the page title and meta description.

    Returns:
        {"url", "title", "description", "content", "content_length",
         "success": True} on success, or
        {"url", "error", "success": False} on any failure.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(url, headers=headers, follow_redirects=True)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Drop non-content tags before extracting any text.
            for script in soup(["script", "style", "nav", "header", "footer"]):
                script.decompose()
            # Main-content extraction, most-specific container first.
            main_content = None
            # 1. <article> tag
            article = soup.find('article')
            if article:
                main_content = article.get_text()
            # 2. <main> tag
            if not main_content:
                main = soup.find('main')
                if main:
                    main_content = main.get_text()
            # 3. divs whose class name hints at content/article/post
            if not main_content:
                content_divs = soup.find_all('div', class_=lambda x: x and ('content' in x.lower() or 'article' in x.lower() or 'post' in x.lower()))
                if content_divs:
                    main_content = ' '.join([div.get_text() for div in content_divs[:3]])
            # 4. fall back to the whole body (or the whole document)
            if not main_content:
                body = soup.find('body')
                if body:
                    main_content = body.get_text()
                else:
                    main_content = soup.get_text()
            # Collapse all whitespace runs into single spaces.
            main_content = ' '.join(main_content.split())
            # Page title
            title = soup.find('title')
            title_text = title.get_text() if title else ""
            # Meta description
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            description = meta_desc.get('content', '') if meta_desc else ""
            return {
                "url": url,
                "title": title_text,
                "description": description,
                "content": main_content[:5000],  # cap at 5000 chars
                "content_length": len(main_content),
                "success": True
            }
    except Exception as e:
        return {
            "url": url,
            "error": str(e),
            "success": False
        }
async def search_with_extended_snippet(
    self,
    query: str,
    num_results: int = 10,
    language: str = None,
    country: str = None
) -> Dict:
    """Search, then enrich each result's snippet from the target page.

    For every hit, the page is fetched (5 s timeout, all pages in
    parallel); the meta/og description is prepended and up to 500
    chars of body text appended, with the merged snippet capped at
    1000 chars. A failed fetch keeps the original snippet and records
    the error under "fetch_error".
    """
    # Run the plain Custom Search first.
    search_results = await self.search_with_custom_api(
        query, num_results, language, country
    )
    if "error" in search_results:
        return search_results
    # Fetch an extended snippet for each result.
    import asyncio
    async def fetch_extended_snippet(result):
        """Fetch one page and build the extended snippet for *result*."""
        enhanced_result = result.copy()
        if result.get("link"):
            try:
                headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                }
                async with httpx.AsyncClient(timeout=5.0) as client:
                    response = await client.get(result["link"], headers=headers, follow_redirects=True)
                    response.raise_for_status()
                    soup = BeautifulSoup(response.text, 'html.parser')
                    # Meta description (fall back to og:description).
                    meta_desc = soup.find('meta', attrs={'name': 'description'})
                    if not meta_desc:
                        meta_desc = soup.find('meta', attrs={'property': 'og:description'})
                    description = meta_desc.get('content', '') if meta_desc else ""
                    # Strip non-content tags before reading body text.
                    for script in soup(["script", "style"]):
                        script.decompose()
                    # Take the first sufficiently long text block,
                    # preferring <article>, then <main>, then any <div>.
                    text_content = ""
                    for tag in ['article', 'main', 'div']:
                        elements = soup.find_all(tag)
                        for elem in elements:
                            text = elem.get_text().strip()
                            if len(text) > 200:  # only meaningful text
                                text_content = ' '.join(text.split())[:1000]
                                break
                        if text_content:
                            break
                    # Merge with the original snippet: description first,
                    # then pad with body text only while under 500 chars.
                    extended_snippet = result.get("snippet", "")
                    if description and description not in extended_snippet:
                        extended_snippet = description + " ... " + extended_snippet
                    if text_content and len(extended_snippet) < 500:
                        extended_snippet = extended_snippet + " ... " + text_content[:500-len(extended_snippet)]
                    enhanced_result["snippet"] = extended_snippet[:1000]  # hard cap: 1000 chars
                    enhanced_result["extended"] = True
            except Exception as e:
                # On failure keep the original snippet and report why.
                enhanced_result["extended"] = False
                enhanced_result["fetch_error"] = str(e)
        return enhanced_result
    # Process all pages in parallel.
    tasks = [fetch_extended_snippet(result) for result in search_results.get("results", [])]
    enhanced_results = await asyncio.gather(*tasks)
    return {
        **search_results,
        "results": enhanced_results,
        "snippet_extended": True
    }
async def search_with_full_content(
    self,
    query: str,
    num_results: int = 5,
    language: str = None,
    country: str = None
) -> Dict:
    """Search, then fetch the full page content for every hit.

    Pages are fetched sequentially via fetch_page_content, so this is
    noticeably slower than a plain search.
    """
    # Plain Custom Search first; propagate its error envelope as-is.
    base = await self.search_with_custom_api(
        query, num_results, language, country
    )
    if "error" in base:
        return base
    enriched = []
    for hit in base.get("results", [])[:num_results]:
        entry = hit.copy()
        link = hit.get("link")
        if link:
            entry["full_content"] = await self.fetch_page_content(link)
        enriched.append(entry)
    return {
        **base,
        "results": enriched,
        "content_fetched": True
    }
async def get_trending_searches(self, country: str = None) -> Dict:
    """Fetch daily trending searches from the unofficial Google Trends API.

    Args:
        country: two-letter country code; defaults to settings.default_country.

    Returns:
        {"country", "trending" (up to 10 entries, each with up to 3
        related articles), "timestamp"}, or {"error": ..., "trending": []}
        on failure.
    """
    url = "https://trends.google.com/trends/api/dailytrends"
    params = {
        "geo": country or settings.default_country.upper()
    }
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(url, params=params)
            # BUGFIX: surface HTTP errors instead of trying to parse an
            # error page as JSON.
            response.raise_for_status()
            # The endpoint prefixes its JSON with an anti-hijacking
            # sentinel (")]}',\n"). FIX: strip everything before the first
            # "{" instead of assuming the prefix is exactly 6 bytes.
            body = response.text
            brace = body.find("{")
            data = json.loads(body[brace:] if brace >= 0 else body)
            trending = []
            for date_data in data.get("default", {}).get("trendingSearchesDays", []):
                for search in date_data.get("trendingSearches", []):
                    trending.append({
                        "title": search.get("title", {}).get("query"),
                        "traffic": search.get("formattedTraffic"),
                        "articles": [
                            {
                                "title": article.get("title"),
                                "url": article.get("url"),
                                "source": article.get("source")
                            }
                            for article in search.get("articles", [])[:3]
                        ]
                    })
            return {
                "country": country or settings.default_country,
                "trending": trending[:10],
                "timestamp": datetime.utcnow().isoformat()
            }
        except Exception as e:
            return {
                "error": str(e),
                "trending": []
            }

View File

@ -0,0 +1,9 @@
fastapi==0.109.0
uvicorn[standard]==0.27.0
httpx==0.26.0
pydantic==2.5.3
pydantic-settings==2.1.0
google-api-python-client==2.108.0
beautifulsoup4==4.12.2
redis==5.0.1
# FIX: the code does `from serpapi import GoogleSearch`, which is provided
# by the google-search-results package; the `serpapi` package (0.1.5) is
# the newer official client that exposes serpapi.Client instead.
google-search-results==2.4.2