diff --git a/docker-compose.yml b/docker-compose.yml index 858e099..ab8bcee 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -354,6 +354,35 @@ services: timeout: 10s retries: 3 + # Google Search Service + google-search-backend: + build: + context: ./services/google-search/backend + dockerfile: Dockerfile + container_name: ${COMPOSE_PROJECT_NAME}_google_search_backend + ports: + - "8016:8000" + environment: + - REDIS_HOST=redis + - REDIS_PORT=6379 + - REDIS_DB=2 + - GOOGLE_API_KEY=AIzaSyBakoCsDP_oF5V4oq_eEKs4eQb-ekqxnRM + - GOOGLE_SEARCH_ENGINE_ID=35bfbdb7b6f244569 + - SERPAPI_KEY=${SERPAPI_KEY:-} + - DEFAULT_LANGUAGE=ko + - DEFAULT_COUNTRY=kr + - CACHE_TTL=3600 + depends_on: + - redis + networks: + - site11_network + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + networks: site11_network: driver: bridge diff --git a/services/google-search/README.md b/services/google-search/README.md new file mode 100644 index 0000000..26c4a52 --- /dev/null +++ b/services/google-search/README.md @@ -0,0 +1,153 @@ +# Google Search Service + +키워드를 구글에서 검색한 결과를 수신하는 서비스입니다. + +## 주요 기능 + +### 1. 다중 검색 방법 지원 +- **Google Custom Search API**: 공식 구글 API (권장) +- **SerpAPI**: 대체 검색 API +- **웹 스크래핑**: 폴백 옵션 (제한적) + +### 2. 검색 옵션 +- 최대 20개 검색 결과 지원 +- 언어별/국가별 검색 +- 날짜 기준 필터링 및 정렬 +- 전체 콘텐츠 가져오기 + +## API 엔드포인트 + +### 기본 검색 +``` +GET /api/search?q=키워드&num=20&lang=ko&country=kr +``` + +**파라미터:** +- `q`: 검색 키워드 (필수) +- `num`: 결과 개수 (1-20, 기본값: 10) +- `lang`: 언어 코드 (ko, en 등) +- `country`: 국가 코드 (kr, us 등) +- `date_restrict`: 날짜 제한 + - `d7`: 일주일 이내 + - `m1`: 한달 이내 + - `m3`: 3개월 이내 + - `y1`: 1년 이내 +- `sort_by_date`: 최신순 정렬 (true/false) + +### 전체 콘텐츠 검색 +``` +GET /api/search/full?q=키워드&num=5 +``` +각 검색 결과 페이지의 전체 내용을 가져옵니다 (시간이 오래 걸릴 수 있음). + +### 실시간 트렌딩 +``` +GET /api/trending?country=kr +``` + +## 사용 예제 + +### 1. 
한국어 검색 (최신순) +```bash +curl "http://localhost:8016/api/search?q=인공지능&num=20&lang=ko&country=kr&sort_by_date=true" +``` + +### 2. 영어 검색 (미국) +```bash +curl "http://localhost:8016/api/search?q=artificial%20intelligence&num=10&lang=en&country=us" +``` + +### 3. 최근 일주일 내 결과만 +```bash +curl "http://localhost:8016/api/search?q=뉴스&date_restrict=d7&lang=ko" +``` + +### 4. 전체 콘텐츠 가져오기 +```bash +curl "http://localhost:8016/api/search/full?q=python%20tutorial&num=3" +``` + +## 환경 설정 + +### 필수 API 키 설정 + +1. **Google Custom Search API** + - [Google Cloud Console](https://console.cloud.google.com/apis/credentials)에서 API 키 발급 + - [Programmable Search Engine](https://programmablesearchengine.google.com/)에서 검색 엔진 ID 생성 + +2. **SerpAPI (선택사항)** + - [SerpAPI](https://serpapi.com/)에서 API 키 발급 + +### .env 파일 설정 +```env +# Google Custom Search API +GOOGLE_API_KEY=your_api_key_here +GOOGLE_SEARCH_ENGINE_ID=your_search_engine_id_here + +# SerpAPI (선택사항) +SERPAPI_KEY=your_serpapi_key_here + +# Redis 캐시 +REDIS_HOST=redis +REDIS_PORT=6379 +REDIS_DB=2 + +# 기본 설정 +DEFAULT_LANGUAGE=ko +DEFAULT_COUNTRY=kr +CACHE_TTL=3600 +``` + +## Docker 실행 + +```bash +# 빌드 및 실행 +docker-compose build google-search-backend +docker-compose up -d google-search-backend + +# 로그 확인 +docker-compose logs -f google-search-backend +``` + +## 제한 사항 + +### Google Custom Search API +- 무료 계정: 일일 100회 쿼리 제한 +- 검색당 최대 100개 결과 +- snippet 길이는 서버에서 제한 (변경 불가) + +### 해결 방법 +- 20개 이상 결과 필요 시: 페이지네이션 사용 +- 긴 내용 필요 시: `/api/search/full` 엔드포인트 사용 +- API 제한 도달 시: SerpAPI 또는 웹 스크래핑으로 자동 폴백 + +## 캐시 관리 + +Redis를 사용하여 검색 결과를 캐싱합니다: +- 기본 TTL: 3600초 (1시간) +- 캐시 초기화: `POST /api/clear-cache` + +## 헬스 체크 + +```bash +curl http://localhost:8016/health +``` + +## 문제 해결 + +### 1. 한글 검색 안될 때 +URL 인코딩 사용: +```bash +# "인공지능" → %EC%9D%B8%EA%B3%B5%EC%A7%80%EB%8A%A5 +curl "http://localhost:8016/api/search?q=%EC%9D%B8%EA%B3%B5%EC%A7%80%EB%8A%A5" +``` + +### 2. API 제한 에러 +- Google API 일일 제한 확인 +- SerpAPI 키 설정으로 대체 +- 웹 스크래핑 자동 폴백 활용 + +### 3. 
느린 응답 시간 +- Redis 캐시 활성화 확인 +- 결과 개수 줄이기 +- 전체 콘텐츠 대신 기본 검색 사용 \ No newline at end of file diff --git a/services/google-search/backend/.env.example b/services/google-search/backend/.env.example new file mode 100644 index 0000000..0d4f463 --- /dev/null +++ b/services/google-search/backend/.env.example @@ -0,0 +1,21 @@ +# Google Custom Search API Configuration +# Get your API key from: https://console.cloud.google.com/apis/credentials +GOOGLE_API_KEY= + +# Get your Search Engine ID from: https://programmablesearchengine.google.com/ +GOOGLE_SEARCH_ENGINE_ID= + +# Alternative: SerpAPI Configuration +# Get your API key from: https://serpapi.com/ +SERPAPI_KEY= + +# Redis Configuration +REDIS_HOST=redis +REDIS_PORT=6379 +REDIS_DB=2 + +# Search Settings +DEFAULT_LANGUAGE=ko +DEFAULT_COUNTRY=kr +CACHE_TTL=3600 +MAX_RESULTS=10 \ No newline at end of file diff --git a/services/google-search/backend/Dockerfile b/services/google-search/backend/Dockerfile new file mode 100644 index 0000000..800c70b --- /dev/null +++ b/services/google-search/backend/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.11-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . 
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
설정 +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# 검색 서비스 초기화 +search_service = GoogleSearchService() + +@app.get("/") +async def root(): + return { + "service": "Google Search Service", + "version": "1.0.0", + "timestamp": datetime.now().isoformat(), + "endpoints": { + "search": "/api/search?q=keyword", + "custom_search": "/api/search/custom?q=keyword", + "serpapi_search": "/api/search/serpapi?q=keyword", + "scraping_search": "/api/search/scraping?q=keyword", + "trending": "/api/trending", + "health": "/health" + } + } + +@app.get("/health") +async def health_check(): + return { + "status": "healthy", + "service": "google-search", + "timestamp": datetime.now().isoformat() + } + +@app.get("/api/search") +async def search( + q: str = Query(..., description="검색 키워드"), + num: int = Query(10, description="결과 개수", ge=1, le=20), + lang: Optional[str] = Query(None, description="언어 코드 (ko, en 등)"), + country: Optional[str] = Query(None, description="국가 코드 (kr, us 등)"), + date_restrict: Optional[str] = Query(None, description="날짜 제한 (d7=일주일, m1=한달, m3=3개월, y1=1년)"), + sort_by_date: bool = Query(False, description="최신순 정렬") +): + """ + 자동으로 최적의 방법을 선택하여 구글 검색 + 1. Google Custom Search API (설정된 경우) + 2. SerpAPI (설정된 경우) + 3. 
웹 스크래핑 (폴백) + """ + # Google Custom Search API 시도 + if settings.google_api_key and settings.google_search_engine_id: + result = await search_service.search_with_custom_api(q, num, lang, country, date_restrict, sort_by_date) + if "error" not in result or not result["error"]: + result["method"] = "google_custom_search" + return result + + # SerpAPI 시도 + if settings.serpapi_key: + result = await search_service.search_with_serpapi(q, num, lang, country) + if "error" not in result or not result["error"]: + result["method"] = "serpapi" + return result + + # 웹 스크래핑 폴백 + result = await search_service.search_with_scraping(q, num, lang) + result["method"] = "web_scraping" + result["warning"] = "API 키가 설정되지 않아 웹 스크래핑을 사용합니다. 제한적이고 불안정할 수 있습니다." + return result + +@app.get("/api/search/custom") +async def search_custom( + q: str = Query(..., description="검색 키워드"), + num: int = Query(10, description="결과 개수", ge=1, le=10), + lang: Optional[str] = Query(None, description="언어 코드"), + country: Optional[str] = Query(None, description="국가 코드") +): + """Google Custom Search API를 사용한 검색""" + if not settings.google_api_key or not settings.google_search_engine_id: + raise HTTPException( + status_code=503, + detail="Google Custom Search API credentials not configured" + ) + + result = await search_service.search_with_custom_api(q, num, lang, country) + if "error" in result and result["error"]: + raise HTTPException(status_code=500, detail=result["error"]) + + return result + +@app.get("/api/search/serpapi") +async def search_serpapi( + q: str = Query(..., description="검색 키워드"), + num: int = Query(10, description="결과 개수", ge=1, le=50), + lang: Optional[str] = Query(None, description="언어 코드"), + country: Optional[str] = Query(None, description="국가 코드") +): + """SerpAPI를 사용한 검색""" + if not settings.serpapi_key: + raise HTTPException( + status_code=503, + detail="SerpAPI key not configured" + ) + + result = await search_service.search_with_serpapi(q, num, lang, country) + if "error" in result 
and result["error"]: + raise HTTPException(status_code=500, detail=result["error"]) + + return result + +@app.get("/api/search/scraping") +async def search_scraping( + q: str = Query(..., description="검색 키워드"), + num: int = Query(10, description="결과 개수", ge=1, le=20), + lang: Optional[str] = Query(None, description="언어 코드") +): + """웹 스크래핑을 사용한 검색 (제한적)""" + result = await search_service.search_with_scraping(q, num, lang) + if "error" in result and result["error"]: + raise HTTPException(status_code=500, detail=result["error"]) + + result["warning"] = "웹 스크래핑은 제한적이고 불안정할 수 있습니다" + return result + +@app.get("/api/search/full") +async def search_with_full_content( + q: str = Query(..., description="검색 키워드"), + num: int = Query(5, description="결과 개수", ge=1, le=10), + lang: Optional[str] = Query(None, description="언어 코드 (ko, en 등)"), + country: Optional[str] = Query(None, description="국가 코드 (kr, us 등)") +): + """ + Google 검색 후 각 결과 페이지의 전체 내용을 가져오기 + 주의: 시간이 오래 걸릴 수 있음 + """ + result = await search_service.search_with_full_content(q, num, lang, country) + if "error" in result and result["error"]: + raise HTTPException(status_code=500, detail=result["error"]) + + return result + +@app.get("/api/trending") +async def get_trending( + country: Optional[str] = Query(None, description="국가 코드 (kr, us 등)") +): + """실시간 트렌딩 검색어 조회""" + result = await search_service.get_trending_searches(country) + if "error" in result and result["error"]: + raise HTTPException(status_code=500, detail=result["error"]) + + return result + +@app.post("/api/clear-cache") +async def clear_cache(): + """캐시 초기화""" + try: + search_service.redis_client.flushdb() + return { + "status": "success", + "message": "캐시가 초기화되었습니다" + } + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file diff --git a/services/google-search/backend/app/search_service.py b/services/google-search/backend/app/search_service.py new file mode 100644 index 0000000..708765b --- 
/dev/null +++ b/services/google-search/backend/app/search_service.py @@ -0,0 +1,540 @@ +import httpx +import json +import redis +from typing import List, Dict, Optional +from datetime import datetime +import hashlib +from bs4 import BeautifulSoup +from .config import settings + +class GoogleSearchService: + def __init__(self): + # Redis 연결 + self.redis_client = redis.Redis( + host=settings.redis_host, + port=settings.redis_port, + db=settings.redis_db, + decode_responses=True + ) + + def _get_cache_key(self, query: str, **kwargs) -> str: + """캐시 키 생성""" + cache_data = f"{query}_{kwargs}" + return f"google_search:{hashlib.md5(cache_data.encode()).hexdigest()}" + + async def search_with_custom_api( + self, + query: str, + num_results: int = 10, + language: str = None, + country: str = None, + date_restrict: str = None, + sort_by_date: bool = False + ) -> Dict: + """Google Custom Search API 사용""" + if not settings.google_api_key or not settings.google_search_engine_id: + return { + "error": "Google API credentials not configured", + "results": [] + } + + # 캐시 확인 + cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country) + cached = self.redis_client.get(cache_key) + if cached: + return json.loads(cached) + + url = "https://www.googleapis.com/customsearch/v1" + + all_results = [] + total_results_info = None + + # Google API는 한 번에 최대 10개만 반환, 20개를 원하면 2번 요청 + num_requests = min((num_results + 9) // 10, 2) # 최대 2번 요청 (20개까지) + + async with httpx.AsyncClient() as client: + for page in range(num_requests): + start_index = page * 10 + 1 + current_num = min(10, num_results - page * 10) + + params = { + "key": settings.google_api_key, + "cx": settings.google_search_engine_id, + "q": query, + "num": current_num, + "start": start_index, # 시작 인덱스 + "hl": language or settings.default_language, + "gl": country or settings.default_country + } + + # 날짜 제한 추가 (d7 = 일주일, m1 = 한달, y1 = 1년) + if date_restrict: + params["dateRestrict"] = date_restrict + + # 
날짜순 정렬 (Google Custom Search API에서는 sort=date 옵션) + if sort_by_date: + params["sort"] = "date" + + try: + response = await client.get(url, params=params) + response.raise_for_status() + + data = response.json() + + # 첫 번째 요청에서만 전체 정보 저장 + if page == 0: + total_results_info = { + "total_results": data.get("searchInformation", {}).get("totalResults"), + "search_time": data.get("searchInformation", {}).get("searchTime"), + "query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms") + } + + # 결과 추가 + for item in data.get("items", []): + all_results.append({ + "title": item.get("title"), + "link": item.get("link"), + "snippet": item.get("snippet"), + "display_link": item.get("displayLink"), + "thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None + }) + + except Exception as e: + # 첫 번째 요청이 실패하면 에러 반환 + if page == 0: + return { + "error": str(e), + "results": [] + } + # 두 번째 요청이 실패하면 첫 번째 결과만 반환 + break + + results = { + "query": total_results_info.get("query") if total_results_info else query, + "total_results": total_results_info.get("total_results") if total_results_info else "0", + "search_time": total_results_info.get("search_time") if total_results_info else 0, + "results": all_results[:num_results], # 요청한 개수만큼만 반환 + "timestamp": datetime.utcnow().isoformat() + } + + # 캐시 저장 + self.redis_client.setex( + cache_key, + settings.cache_ttl, + json.dumps(results) + ) + + return results + + async def search_with_serpapi( + self, + query: str, + num_results: int = 10, + language: str = None, + country: str = None + ) -> Dict: + """SerpAPI 사용 (유료 서비스)""" + if not settings.serpapi_key: + return { + "error": "SerpAPI key not configured", + "results": [] + } + + # 캐시 확인 + cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country) + cached = self.redis_client.get(cache_key) + if cached: + return json.loads(cached) + + from serpapi import GoogleSearch + + params = { + "q": query, 
+ "api_key": settings.serpapi_key, + "num": num_results, + "hl": language or settings.default_language, + "gl": country or settings.default_country + } + + try: + search = GoogleSearch(params) + results = search.get_dict() + + formatted_results = self._format_serpapi_results(results) + + # 캐시 저장 + self.redis_client.setex( + cache_key, + settings.cache_ttl, + json.dumps(formatted_results) + ) + + return formatted_results + + except Exception as e: + return { + "error": str(e), + "results": [] + } + + async def search_with_scraping( + self, + query: str, + num_results: int = 10, + language: str = None + ) -> Dict: + """웹 스크래핑으로 검색 (비추천, 제한적)""" + # 캐시 확인 + cache_key = self._get_cache_key(query, num=num_results, lang=language) + cached = self.redis_client.get(cache_key) + if cached: + return json.loads(cached) + + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + + params = { + "q": query, + "num": num_results, + "hl": language or settings.default_language + } + + async with httpx.AsyncClient() as client: + try: + response = await client.get( + "https://www.google.com/search", + params=params, + headers=headers, + follow_redirects=True + ) + + soup = BeautifulSoup(response.text, 'html.parser') + results = self._parse_google_html(soup) + + formatted_results = { + "query": query, + "total_results": len(results), + "results": results, + "timestamp": datetime.utcnow().isoformat() + } + + # 캐시 저장 + self.redis_client.setex( + cache_key, + settings.cache_ttl, + json.dumps(formatted_results) + ) + + return formatted_results + + except Exception as e: + return { + "error": str(e), + "results": [] + } + + def _format_google_results(self, data: Dict) -> Dict: + """Google API 결과 포맷팅""" + results = [] + + for item in data.get("items", []): + results.append({ + "title": item.get("title"), + "link": item.get("link"), + "snippet": item.get("snippet"), + "display_link": item.get("displayLink"), + "thumbnail": item.get("pagemap", 
{}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None + }) + + return { + "query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms"), + "total_results": data.get("searchInformation", {}).get("totalResults"), + "search_time": data.get("searchInformation", {}).get("searchTime"), + "results": results, + "timestamp": datetime.utcnow().isoformat() + } + + def _format_serpapi_results(self, data: Dict) -> Dict: + """SerpAPI 결과 포맷팅""" + results = [] + + for item in data.get("organic_results", []): + results.append({ + "title": item.get("title"), + "link": item.get("link"), + "snippet": item.get("snippet"), + "position": item.get("position"), + "thumbnail": item.get("thumbnail"), + "date": item.get("date") + }) + + # 관련 검색어 + related_searches = [ + item.get("query") for item in data.get("related_searches", []) + ] + + return { + "query": data.get("search_parameters", {}).get("q"), + "total_results": data.get("search_information", {}).get("total_results"), + "search_time": data.get("search_information", {}).get("time_taken_displayed"), + "results": results, + "related_searches": related_searches, + "timestamp": datetime.utcnow().isoformat() + } + + def _parse_google_html(self, soup: BeautifulSoup) -> List[Dict]: + """HTML 파싱으로 검색 결과 추출""" + results = [] + + # 검색 결과 컨테이너 찾기 + for g in soup.find_all('div', class_='g'): + anchors = g.find_all('a') + if anchors: + link = anchors[0].get('href', '') + title_elem = g.find('h3') + snippet_elem = g.find('span', class_='st') or g.find('div', class_='s') + + if title_elem and link: + results.append({ + "title": title_elem.get_text(), + "link": link, + "snippet": snippet_elem.get_text() if snippet_elem else "" + }) + + return results + + async def fetch_page_content(self, url: str) -> Dict: + """웹 페이지의 전체 내용을 가져오기""" + try: + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + + async with httpx.AsyncClient(timeout=10.0) as client: + response = await 
client.get(url, headers=headers, follow_redirects=True) + response.raise_for_status() + + soup = BeautifulSoup(response.text, 'html.parser') + + # 불필요한 태그 제거 + for script in soup(["script", "style", "nav", "header", "footer"]): + script.decompose() + + # 본문 내용 추출 시도 + main_content = None + + # 1. article 태그 찾기 + article = soup.find('article') + if article: + main_content = article.get_text() + + # 2. main 태그 찾기 + if not main_content: + main = soup.find('main') + if main: + main_content = main.get_text() + + # 3. 일반적인 콘텐츠 div 찾기 + if not main_content: + content_divs = soup.find_all('div', class_=lambda x: x and ('content' in x.lower() or 'article' in x.lower() or 'post' in x.lower())) + if content_divs: + main_content = ' '.join([div.get_text() for div in content_divs[:3]]) + + # 4. 전체 body에서 텍스트 추출 + if not main_content: + body = soup.find('body') + if body: + main_content = body.get_text() + else: + main_content = soup.get_text() + + # 텍스트 정리 + main_content = ' '.join(main_content.split()) + + # 제목 추출 + title = soup.find('title') + title_text = title.get_text() if title else "" + + # 메타 설명 추출 + meta_desc = soup.find('meta', attrs={'name': 'description'}) + description = meta_desc.get('content', '') if meta_desc else "" + + return { + "url": url, + "title": title_text, + "description": description, + "content": main_content[:5000], # 최대 5000자 + "content_length": len(main_content), + "success": True + } + + except Exception as e: + return { + "url": url, + "error": str(e), + "success": False + } + + async def search_with_extended_snippet( + self, + query: str, + num_results: int = 10, + language: str = None, + country: str = None + ) -> Dict: + """검색 후 확장된 snippet 가져오기 (메타 설명 + 첫 500자)""" + # 먼저 일반 검색 수행 + search_results = await self.search_with_custom_api( + query, num_results, language, country + ) + + if "error" in search_results: + return search_results + + # 각 결과의 확장된 snippet 가져오기 + import asyncio + + async def fetch_extended_snippet(result): + """개별 페이지의 확장된 
snippet 가져오기""" + enhanced_result = result.copy() + + if result.get("link"): + try: + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + + async with httpx.AsyncClient(timeout=5.0) as client: + response = await client.get(result["link"], headers=headers, follow_redirects=True) + response.raise_for_status() + + soup = BeautifulSoup(response.text, 'html.parser') + + # 메타 설명 추출 + meta_desc = soup.find('meta', attrs={'name': 'description'}) + if not meta_desc: + meta_desc = soup.find('meta', attrs={'property': 'og:description'}) + + description = meta_desc.get('content', '') if meta_desc else "" + + # 본문 첫 부분 추출 + for script in soup(["script", "style"]): + script.decompose() + + # 본문 텍스트 찾기 + text_content = "" + for tag in ['article', 'main', 'div']: + elements = soup.find_all(tag) + for elem in elements: + text = elem.get_text().strip() + if len(text) > 200: # 의미있는 텍스트만 + text_content = ' '.join(text.split())[:1000] + break + if text_content: + break + + # 기존 snippet과 병합 + extended_snippet = result.get("snippet", "") + if description and description not in extended_snippet: + extended_snippet = description + " ... " + extended_snippet + if text_content and len(extended_snippet) < 500: + extended_snippet = extended_snippet + " ... 
" + text_content[:500-len(extended_snippet)] + + enhanced_result["snippet"] = extended_snippet[:1000] # 최대 1000자 + enhanced_result["extended"] = True + + except Exception as e: + # 실패 시 원본 snippet 유지 + enhanced_result["extended"] = False + enhanced_result["fetch_error"] = str(e) + + return enhanced_result + + # 병렬로 모든 페이지 처리 + tasks = [fetch_extended_snippet(result) for result in search_results.get("results", [])] + enhanced_results = await asyncio.gather(*tasks) + + return { + **search_results, + "results": enhanced_results, + "snippet_extended": True + } + + async def search_with_full_content( + self, + query: str, + num_results: int = 5, + language: str = None, + country: str = None + ) -> Dict: + """검색 후 각 결과의 전체 내용 가져오기""" + # 먼저 일반 검색 수행 + search_results = await self.search_with_custom_api( + query, num_results, language, country + ) + + if "error" in search_results: + return search_results + + # 각 결과의 전체 내용 가져오기 + enhanced_results = [] + for result in search_results.get("results", [])[:num_results]: + # 원본 검색 결과 복사 + enhanced_result = result.copy() + + # 페이지 내용 가져오기 + if result.get("link"): + content_data = await self.fetch_page_content(result["link"]) + enhanced_result["full_content"] = content_data + + enhanced_results.append(enhanced_result) + + return { + **search_results, + "results": enhanced_results, + "content_fetched": True + } + + async def get_trending_searches(self, country: str = None) -> Dict: + """트렌딩 검색어 가져오기""" + # Google Trends 비공식 API 사용 + url = f"https://trends.google.com/trends/api/dailytrends" + params = { + "geo": country or settings.default_country.upper() + } + + async with httpx.AsyncClient() as client: + try: + response = await client.get(url, params=params) + # Google Trends API는 ")]}',\n"로 시작하는 응답을 반환 + json_data = response.text[6:] + data = json.loads(json_data) + + trending = [] + for date_data in data.get("default", {}).get("trendingSearchesDays", []): + for search in date_data.get("trendingSearches", []): + trending.append({ 
+ "title": search.get("title", {}).get("query"), + "traffic": search.get("formattedTraffic"), + "articles": [ + { + "title": article.get("title"), + "url": article.get("url"), + "source": article.get("source") + } + for article in search.get("articles", [])[:3] + ] + }) + + return { + "country": country or settings.default_country, + "trending": trending[:10], + "timestamp": datetime.utcnow().isoformat() + } + + except Exception as e: + return { + "error": str(e), + "trending": [] + } \ No newline at end of file diff --git a/services/google-search/backend/requirements.txt b/services/google-search/backend/requirements.txt new file mode 100644 index 0000000..36fd2a3 --- /dev/null +++ b/services/google-search/backend/requirements.txt @@ -0,0 +1,9 @@ +fastapi==0.109.0 +uvicorn[standard]==0.27.0 +httpx==0.26.0 +pydantic==2.5.3 +pydantic-settings==2.1.0 +google-api-python-client==2.108.0 +beautifulsoup4==4.12.2 +redis==5.0.1 +serpapi==0.1.5 \ No newline at end of file