From 67cadb9abaff00ce939f588967cc1e53bc88b0fb Mon Sep 17 00:00:00 2001 From: jungwoo choi Date: Fri, 12 Sep 2025 11:22:50 +0900 Subject: [PATCH] Add Google Search service with advanced features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Implement multi-method search (Custom Search API, SerpAPI, web scraping) - Support up to 20 results with pagination - Add date filtering and sorting capabilities - Include full content fetching option - Add country/language specific search support - Implement Redis caching for performance - Create comprehensive documentation ๐Ÿค– Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- docker-compose.yml | 29 + services/google-search/README.md | 153 +++++ services/google-search/backend/.env.example | 21 + services/google-search/backend/Dockerfile | 10 + .../google-search/backend/app/__init__.py | 0 services/google-search/backend/app/config.py | 30 + services/google-search/backend/app/main.py | 188 ++++++ .../backend/app/search_service.py | 540 ++++++++++++++++++ .../google-search/backend/requirements.txt | 9 + 9 files changed, 980 insertions(+) create mode 100644 services/google-search/README.md create mode 100644 services/google-search/backend/.env.example create mode 100644 services/google-search/backend/Dockerfile create mode 100644 services/google-search/backend/app/__init__.py create mode 100644 services/google-search/backend/app/config.py create mode 100644 services/google-search/backend/app/main.py create mode 100644 services/google-search/backend/app/search_service.py create mode 100644 services/google-search/backend/requirements.txt diff --git a/docker-compose.yml b/docker-compose.yml index 858e099..ab8bcee 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -354,6 +354,35 @@ services: timeout: 10s retries: 3 + # Google Search Service + google-search-backend: + build: + context: ./services/google-search/backend + dockerfile: Dockerfile + 
container_name: ${COMPOSE_PROJECT_NAME}_google_search_backend + ports: + - "8016:8000" + environment: + - REDIS_HOST=redis + - REDIS_PORT=6379 + - REDIS_DB=2 + - GOOGLE_API_KEY=${GOOGLE_API_KEY:-} + - GOOGLE_SEARCH_ENGINE_ID=${GOOGLE_SEARCH_ENGINE_ID:-} + - SERPAPI_KEY=${SERPAPI_KEY:-} + - DEFAULT_LANGUAGE=ko + - DEFAULT_COUNTRY=kr + - CACHE_TTL=3600 + depends_on: + - redis + networks: + - site11_network + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + networks: site11_network: driver: bridge diff --git a/services/google-search/README.md b/services/google-search/README.md new file mode 100644 index 0000000..26c4a52 --- /dev/null +++ b/services/google-search/README.md @@ -0,0 +1,153 @@ +# Google Search Service + +ํ‚ค์›Œ๋“œ๋ฅผ ๊ตฌ๊ธ€์—์„œ ๊ฒ€์ƒ‰ํ•œ ๊ฒฐ๊ณผ๋ฅผ ์ˆ˜์‹ ํ•˜๋Š” ์„œ๋น„์Šค์ž…๋‹ˆ๋‹ค. + +## ์ฃผ์š” ๊ธฐ๋Šฅ + +### 1. ๋‹ค์ค‘ ๊ฒ€์ƒ‰ ๋ฐฉ๋ฒ• ์ง€์› +- **Google Custom Search API**: ๊ณต์‹ ๊ตฌ๊ธ€ API (๊ถŒ์žฅ) +- **SerpAPI**: ๋Œ€์ฒด ๊ฒ€์ƒ‰ API +- **์›น ์Šคํฌ๋ž˜ํ•‘**: ํด๋ฐฑ ์˜ต์…˜ (์ œํ•œ์ ) + +### 2. ๊ฒ€์ƒ‰ ์˜ต์…˜ +- ์ตœ๋Œ€ 20๊ฐœ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ง€์› +- ์–ธ์–ด๋ณ„/๊ตญ๊ฐ€๋ณ„ ๊ฒ€์ƒ‰ +- ๋‚ ์งœ ๊ธฐ์ค€ ํ•„ํ„ฐ๋ง ๋ฐ ์ •๋ ฌ +- ์ „์ฒด ์ฝ˜ํ…์ธ  ๊ฐ€์ ธ์˜ค๊ธฐ + +## API ์—”๋“œํฌ์ธํŠธ + +### ๊ธฐ๋ณธ ๊ฒ€์ƒ‰ +``` +GET /api/search?q=ํ‚ค์›Œ๋“œ&num=20&lang=ko&country=kr +``` + +**ํŒŒ๋ผ๋ฏธํ„ฐ:** +- `q`: ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ (ํ•„์ˆ˜) +- `num`: ๊ฒฐ๊ณผ ๊ฐœ์ˆ˜ (1-20, ๊ธฐ๋ณธ๊ฐ’: 10) +- `lang`: ์–ธ์–ด ์ฝ”๋“œ (ko, en ๋“ฑ) +- `country`: ๊ตญ๊ฐ€ ์ฝ”๋“œ (kr, us ๋“ฑ) +- `date_restrict`: ๋‚ ์งœ ์ œํ•œ + - `d7`: ์ผ์ฃผ์ผ ์ด๋‚ด + - `m1`: ํ•œ๋‹ฌ ์ด๋‚ด + - `m3`: 3๊ฐœ์›” ์ด๋‚ด + - `y1`: 1๋…„ ์ด๋‚ด +- `sort_by_date`: ์ตœ์‹ ์ˆœ ์ •๋ ฌ (true/false) + +### ์ „์ฒด ์ฝ˜ํ…์ธ  ๊ฒ€์ƒ‰ +``` +GET /api/search/full?q=ํ‚ค์›Œ๋“œ&num=5 +``` +๊ฐ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ํŽ˜์ด์ง€์˜ ์ „์ฒด ๋‚ด์šฉ์„ ๊ฐ€์ ธ์˜ต๋‹ˆ๋‹ค (์‹œ๊ฐ„์ด ์˜ค๋ž˜ ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Œ). 
+ +### ์‹ค์‹œ๊ฐ„ ํŠธ๋ Œ๋”ฉ +``` +GET /api/trending?country=kr +``` + +## ์‚ฌ์šฉ ์˜ˆ์ œ + +### 1. ํ•œ๊ตญ์–ด ๊ฒ€์ƒ‰ (์ตœ์‹ ์ˆœ) +```bash +curl "http://localhost:8016/api/search?q=์ธ๊ณต์ง€๋Šฅ&num=20&lang=ko&country=kr&sort_by_date=true" +``` + +### 2. ์˜์–ด ๊ฒ€์ƒ‰ (๋ฏธ๊ตญ) +```bash +curl "http://localhost:8016/api/search?q=artificial%20intelligence&num=10&lang=en&country=us" +``` + +### 3. ์ตœ๊ทผ ์ผ์ฃผ์ผ ๋‚ด ๊ฒฐ๊ณผ๋งŒ +```bash +curl "http://localhost:8016/api/search?q=๋‰ด์Šค&date_restrict=d7&lang=ko" +``` + +### 4. ์ „์ฒด ์ฝ˜ํ…์ธ  ๊ฐ€์ ธ์˜ค๊ธฐ +```bash +curl "http://localhost:8016/api/search/full?q=python%20tutorial&num=3" +``` + +## ํ™˜๊ฒฝ ์„ค์ • + +### ํ•„์ˆ˜ API ํ‚ค ์„ค์ • + +1. **Google Custom Search API** + - [Google Cloud Console](https://console.cloud.google.com/apis/credentials)์—์„œ API ํ‚ค ๋ฐœ๊ธ‰ + - [Programmable Search Engine](https://programmablesearchengine.google.com/)์—์„œ ๊ฒ€์ƒ‰ ์—”์ง„ ID ์ƒ์„ฑ + +2. **SerpAPI (์„ ํƒ์‚ฌํ•ญ)** + - [SerpAPI](https://serpapi.com/)์—์„œ API ํ‚ค ๋ฐœ๊ธ‰ + +### .env ํŒŒ์ผ ์„ค์ • +```env +# Google Custom Search API +GOOGLE_API_KEY=your_api_key_here +GOOGLE_SEARCH_ENGINE_ID=your_search_engine_id_here + +# SerpAPI (์„ ํƒ์‚ฌํ•ญ) +SERPAPI_KEY=your_serpapi_key_here + +# Redis ์บ์‹œ +REDIS_HOST=redis +REDIS_PORT=6379 +REDIS_DB=2 + +# ๊ธฐ๋ณธ ์„ค์ • +DEFAULT_LANGUAGE=ko +DEFAULT_COUNTRY=kr +CACHE_TTL=3600 +``` + +## Docker ์‹คํ–‰ + +```bash +# ๋นŒ๋“œ ๋ฐ ์‹คํ–‰ +docker-compose build google-search-backend +docker-compose up -d google-search-backend + +# ๋กœ๊ทธ ํ™•์ธ +docker-compose logs -f google-search-backend +``` + +## ์ œํ•œ ์‚ฌํ•ญ + +### Google Custom Search API +- ๋ฌด๋ฃŒ ๊ณ„์ •: ์ผ์ผ 100ํšŒ ์ฟผ๋ฆฌ ์ œํ•œ +- ๊ฒ€์ƒ‰๋‹น ์ตœ๋Œ€ 100๊ฐœ ๊ฒฐ๊ณผ +- snippet ๊ธธ์ด๋Š” ์„œ๋ฒ„์—์„œ ์ œํ•œ (๋ณ€๊ฒฝ ๋ถˆ๊ฐ€) + +### ํ•ด๊ฒฐ ๋ฐฉ๋ฒ• +- 20๊ฐœ ์ด์ƒ ๊ฒฐ๊ณผ ํ•„์š” ์‹œ: ํŽ˜์ด์ง€๋„ค์ด์…˜ ์‚ฌ์šฉ +- ๊ธด ๋‚ด์šฉ ํ•„์š” ์‹œ: `/api/search/full` ์—”๋“œํฌ์ธํŠธ ์‚ฌ์šฉ +- API ์ œํ•œ ๋„๋‹ฌ ์‹œ: SerpAPI ๋˜๋Š” ์›น ์Šคํฌ๋ž˜ํ•‘์œผ๋กœ ์ž๋™ 
ํด๋ฐฑ + +## ์บ์‹œ ๊ด€๋ฆฌ + +Redis๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์บ์‹ฑํ•ฉ๋‹ˆ๋‹ค: +- ๊ธฐ๋ณธ TTL: 3600์ดˆ (1์‹œ๊ฐ„) +- ์บ์‹œ ์ดˆ๊ธฐํ™”: `POST /api/clear-cache` + +## ํ—ฌ์Šค ์ฒดํฌ + +```bash +curl http://localhost:8016/health +``` + +## ๋ฌธ์ œ ํ•ด๊ฒฐ + +### 1. ํ•œ๊ธ€ ๊ฒ€์ƒ‰ ์•ˆ๋  ๋•Œ +URL ์ธ์ฝ”๋”ฉ ์‚ฌ์šฉ: +```bash +# "์ธ๊ณต์ง€๋Šฅ" โ†’ %EC%9D%B8%EA%B3%B5%EC%A7%80%EB%8A%A5 +curl "http://localhost:8016/api/search?q=%EC%9D%B8%EA%B3%B5%EC%A7%80%EB%8A%A5" +``` + +### 2. API ์ œํ•œ ์—๋Ÿฌ +- Google API ์ผ์ผ ์ œํ•œ ํ™•์ธ +- SerpAPI ํ‚ค ์„ค์ •์œผ๋กœ ๋Œ€์ฒด +- ์›น ์Šคํฌ๋ž˜ํ•‘ ์ž๋™ ํด๋ฐฑ ํ™œ์šฉ + +### 3. ๋А๋ฆฐ ์‘๋‹ต ์‹œ๊ฐ„ +- Redis ์บ์‹œ ํ™œ์„ฑํ™” ํ™•์ธ +- ๊ฒฐ๊ณผ ๊ฐœ์ˆ˜ ์ค„์ด๊ธฐ +- ์ „์ฒด ์ฝ˜ํ…์ธ  ๋Œ€์‹  ๊ธฐ๋ณธ ๊ฒ€์ƒ‰ ์‚ฌ์šฉ \ No newline at end of file diff --git a/services/google-search/backend/.env.example b/services/google-search/backend/.env.example new file mode 100644 index 0000000..0d4f463 --- /dev/null +++ b/services/google-search/backend/.env.example @@ -0,0 +1,21 @@ +# Google Custom Search API Configuration +# Get your API key from: https://console.cloud.google.com/apis/credentials +GOOGLE_API_KEY= + +# Get your Search Engine ID from: https://programmablesearchengine.google.com/ +GOOGLE_SEARCH_ENGINE_ID= + +# Alternative: SerpAPI Configuration +# Get your API key from: https://serpapi.com/ +SERPAPI_KEY= + +# Redis Configuration +REDIS_HOST=redis +REDIS_PORT=6379 +REDIS_DB=2 + +# Search Settings +DEFAULT_LANGUAGE=ko +DEFAULT_COUNTRY=kr +CACHE_TTL=3600 +MAX_RESULTS=10 \ No newline at end of file diff --git a/services/google-search/backend/Dockerfile b/services/google-search/backend/Dockerfile new file mode 100644 index 0000000..800c70b --- /dev/null +++ b/services/google-search/backend/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.11-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . 
+ +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file diff --git a/services/google-search/backend/app/__init__.py b/services/google-search/backend/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/services/google-search/backend/app/config.py b/services/google-search/backend/app/config.py new file mode 100644 index 0000000..1b06392 --- /dev/null +++ b/services/google-search/backend/app/config.py @@ -0,0 +1,30 @@ +from pydantic_settings import BaseSettings +from typing import Optional + +class Settings(BaseSettings): + # Google Custom Search API ์„ค์ • + google_api_key: Optional[str] = None + google_search_engine_id: Optional[str] = None + + # SerpAPI ์„ค์ • (๋Œ€์•ˆ) + serpapi_key: Optional[str] = None + + # Redis ์บ์‹ฑ ์„ค์ • + redis_host: str = "redis" + redis_port: int = 6379 + redis_db: int = 2 + cache_ttl: int = 3600 # 1์‹œ๊ฐ„ + + # ๊ฒ€์ƒ‰ ์„ค์ • + max_results: int = 10 + default_language: str = "ko" + default_country: str = "kr" + + # ์„œ๋น„์Šค ์„ค์ • + service_name: str = "Google Search Service" + debug: bool = True + + class Config: + env_file = ".env" + +settings = Settings() \ No newline at end of file diff --git a/services/google-search/backend/app/main.py b/services/google-search/backend/app/main.py new file mode 100644 index 0000000..83a29ba --- /dev/null +++ b/services/google-search/backend/app/main.py @@ -0,0 +1,188 @@ +from fastapi import FastAPI, Query, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from typing import Optional +from datetime import datetime +from contextlib import asynccontextmanager + +from .search_service import GoogleSearchService +from .config import settings + +@asynccontextmanager +async def lifespan(app: FastAPI): + # ์‹œ์ž‘ ์‹œ + print("Google Search Service starting...") + yield + # ์ข…๋ฃŒ ์‹œ + print("Google Search Service stopping...") + +app = FastAPI( + title="Google Search Service", + description="๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ 
๊ฒฐ๊ณผ๋ฅผ ์ˆ˜์‹ ํ•˜๋Š” ์„œ๋น„์Šค", + version="1.0.0", + lifespan=lifespan +) + +# CORS ์„ค์ • +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# ๊ฒ€์ƒ‰ ์„œ๋น„์Šค ์ดˆ๊ธฐํ™” +search_service = GoogleSearchService() + +@app.get("/") +async def root(): + return { + "service": "Google Search Service", + "version": "1.0.0", + "timestamp": datetime.now().isoformat(), + "endpoints": { + "search": "/api/search?q=keyword", + "custom_search": "/api/search/custom?q=keyword", + "serpapi_search": "/api/search/serpapi?q=keyword", + "scraping_search": "/api/search/scraping?q=keyword", + "trending": "/api/trending", + "health": "/health" + } + } + +@app.get("/health") +async def health_check(): + return { + "status": "healthy", + "service": "google-search", + "timestamp": datetime.now().isoformat() + } + +@app.get("/api/search") +async def search( + q: str = Query(..., description="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ"), + num: int = Query(10, description="๊ฒฐ๊ณผ ๊ฐœ์ˆ˜", ge=1, le=20), + lang: Optional[str] = Query(None, description="์–ธ์–ด ์ฝ”๋“œ (ko, en ๋“ฑ)"), + country: Optional[str] = Query(None, description="๊ตญ๊ฐ€ ์ฝ”๋“œ (kr, us ๋“ฑ)"), + date_restrict: Optional[str] = Query(None, description="๋‚ ์งœ ์ œํ•œ (d7=์ผ์ฃผ์ผ, m1=ํ•œ๋‹ฌ, m3=3๊ฐœ์›”, y1=1๋…„)"), + sort_by_date: bool = Query(False, description="์ตœ์‹ ์ˆœ ์ •๋ ฌ") +): + """ + ์ž๋™์œผ๋กœ ์ตœ์ ์˜ ๋ฐฉ๋ฒ•์„ ์„ ํƒํ•˜์—ฌ ๊ตฌ๊ธ€ ๊ฒ€์ƒ‰ + 1. Google Custom Search API (์„ค์ •๋œ ๊ฒฝ์šฐ) + 2. SerpAPI (์„ค์ •๋œ ๊ฒฝ์šฐ) + 3. 
์›น ์Šคํฌ๋ž˜ํ•‘ (ํด๋ฐฑ) + """ + # Google Custom Search API ์‹œ๋„ + if settings.google_api_key and settings.google_search_engine_id: + result = await search_service.search_with_custom_api(q, num, lang, country, date_restrict, sort_by_date) + if "error" not in result or not result["error"]: + result["method"] = "google_custom_search" + return result + + # SerpAPI ์‹œ๋„ + if settings.serpapi_key: + result = await search_service.search_with_serpapi(q, num, lang, country) + if "error" not in result or not result["error"]: + result["method"] = "serpapi" + return result + + # ์›น ์Šคํฌ๋ž˜ํ•‘ ํด๋ฐฑ + result = await search_service.search_with_scraping(q, num, lang) + result["method"] = "web_scraping" + result["warning"] = "API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•„ ์›น ์Šคํฌ๋ž˜ํ•‘์„ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค. ์ œํ•œ์ ์ด๊ณ  ๋ถˆ์•ˆ์ •ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค." + return result + +@app.get("/api/search/custom") +async def search_custom( + q: str = Query(..., description="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ"), + num: int = Query(10, description="๊ฒฐ๊ณผ ๊ฐœ์ˆ˜", ge=1, le=10), + lang: Optional[str] = Query(None, description="์–ธ์–ด ์ฝ”๋“œ"), + country: Optional[str] = Query(None, description="๊ตญ๊ฐ€ ์ฝ”๋“œ") +): + """Google Custom Search API๋ฅผ ์‚ฌ์šฉํ•œ ๊ฒ€์ƒ‰""" + if not settings.google_api_key or not settings.google_search_engine_id: + raise HTTPException( + status_code=503, + detail="Google Custom Search API credentials not configured" + ) + + result = await search_service.search_with_custom_api(q, num, lang, country) + if "error" in result and result["error"]: + raise HTTPException(status_code=500, detail=result["error"]) + + return result + +@app.get("/api/search/serpapi") +async def search_serpapi( + q: str = Query(..., description="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ"), + num: int = Query(10, description="๊ฒฐ๊ณผ ๊ฐœ์ˆ˜", ge=1, le=50), + lang: Optional[str] = Query(None, description="์–ธ์–ด ์ฝ”๋“œ"), + country: Optional[str] = Query(None, description="๊ตญ๊ฐ€ ์ฝ”๋“œ") +): + """SerpAPI๋ฅผ ์‚ฌ์šฉํ•œ ๊ฒ€์ƒ‰""" + if not 
settings.serpapi_key: + raise HTTPException( + status_code=503, + detail="SerpAPI key not configured" + ) + + result = await search_service.search_with_serpapi(q, num, lang, country) + if "error" in result and result["error"]: + raise HTTPException(status_code=500, detail=result["error"]) + + return result + +@app.get("/api/search/scraping") +async def search_scraping( + q: str = Query(..., description="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ"), + num: int = Query(10, description="๊ฒฐ๊ณผ ๊ฐœ์ˆ˜", ge=1, le=20), + lang: Optional[str] = Query(None, description="์–ธ์–ด ์ฝ”๋“œ") +): + """์›น ์Šคํฌ๋ž˜ํ•‘์„ ์‚ฌ์šฉํ•œ ๊ฒ€์ƒ‰ (์ œํ•œ์ )""" + result = await search_service.search_with_scraping(q, num, lang) + if "error" in result and result["error"]: + raise HTTPException(status_code=500, detail=result["error"]) + + result["warning"] = "์›น ์Šคํฌ๋ž˜ํ•‘์€ ์ œํ•œ์ ์ด๊ณ  ๋ถˆ์•ˆ์ •ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค" + return result + +@app.get("/api/search/full") +async def search_with_full_content( + q: str = Query(..., description="๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ"), + num: int = Query(5, description="๊ฒฐ๊ณผ ๊ฐœ์ˆ˜", ge=1, le=10), + lang: Optional[str] = Query(None, description="์–ธ์–ด ์ฝ”๋“œ (ko, en ๋“ฑ)"), + country: Optional[str] = Query(None, description="๊ตญ๊ฐ€ ์ฝ”๋“œ (kr, us ๋“ฑ)") +): + """ + Google ๊ฒ€์ƒ‰ ํ›„ ๊ฐ ๊ฒฐ๊ณผ ํŽ˜์ด์ง€์˜ ์ „์ฒด ๋‚ด์šฉ์„ ๊ฐ€์ ธ์˜ค๊ธฐ + ์ฃผ์˜: ์‹œ๊ฐ„์ด ์˜ค๋ž˜ ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Œ + """ + result = await search_service.search_with_full_content(q, num, lang, country) + if "error" in result and result["error"]: + raise HTTPException(status_code=500, detail=result["error"]) + + return result + +@app.get("/api/trending") +async def get_trending( + country: Optional[str] = Query(None, description="๊ตญ๊ฐ€ ์ฝ”๋“œ (kr, us ๋“ฑ)") +): + """์‹ค์‹œ๊ฐ„ ํŠธ๋ Œ๋”ฉ ๊ฒ€์ƒ‰์–ด ์กฐํšŒ""" + result = await search_service.get_trending_searches(country) + if "error" in result and result["error"]: + raise HTTPException(status_code=500, detail=result["error"]) + + return result + +@app.post("/api/clear-cache") 
+async def clear_cache(): + """์บ์‹œ ์ดˆ๊ธฐํ™”""" + try: + search_service.redis_client.flushdb() + return { + "status": "success", + "message": "์บ์‹œ๊ฐ€ ์ดˆ๊ธฐํ™”๋˜์—ˆ์Šต๋‹ˆ๋‹ค" + } + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file diff --git a/services/google-search/backend/app/search_service.py b/services/google-search/backend/app/search_service.py new file mode 100644 index 0000000..708765b --- /dev/null +++ b/services/google-search/backend/app/search_service.py @@ -0,0 +1,540 @@ +import httpx +import json +import redis +from typing import List, Dict, Optional +from datetime import datetime +import hashlib +from bs4 import BeautifulSoup +from .config import settings + +class GoogleSearchService: + def __init__(self): + # Redis ์—ฐ๊ฒฐ + self.redis_client = redis.Redis( + host=settings.redis_host, + port=settings.redis_port, + db=settings.redis_db, + decode_responses=True + ) + + def _get_cache_key(self, query: str, **kwargs) -> str: + """์บ์‹œ ํ‚ค ์ƒ์„ฑ""" + cache_data = f"{query}_{kwargs}" + return f"google_search:{hashlib.md5(cache_data.encode()).hexdigest()}" + + async def search_with_custom_api( + self, + query: str, + num_results: int = 10, + language: str = None, + country: str = None, + date_restrict: str = None, + sort_by_date: bool = False + ) -> Dict: + """Google Custom Search API ์‚ฌ์šฉ""" + if not settings.google_api_key or not settings.google_search_engine_id: + return { + "error": "Google API credentials not configured", + "results": [] + } + + # ์บ์‹œ ํ™•์ธ + cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country) + cached = self.redis_client.get(cache_key) + if cached: + return json.loads(cached) + + url = "https://www.googleapis.com/customsearch/v1" + + all_results = [] + total_results_info = None + + # Google API๋Š” ํ•œ ๋ฒˆ์— ์ตœ๋Œ€ 10๊ฐœ๋งŒ ๋ฐ˜ํ™˜, 20๊ฐœ๋ฅผ ์›ํ•˜๋ฉด 2๋ฒˆ ์š”์ฒญ + num_requests = min((num_results + 9) // 10, 2) # ์ตœ๋Œ€ 2๋ฒˆ ์š”์ฒญ 
(20๊ฐœ๊นŒ์ง€) + + async with httpx.AsyncClient() as client: + for page in range(num_requests): + start_index = page * 10 + 1 + current_num = min(10, num_results - page * 10) + + params = { + "key": settings.google_api_key, + "cx": settings.google_search_engine_id, + "q": query, + "num": current_num, + "start": start_index, # ์‹œ์ž‘ ์ธ๋ฑ์Šค + "hl": language or settings.default_language, + "gl": country or settings.default_country + } + + # ๋‚ ์งœ ์ œํ•œ ์ถ”๊ฐ€ (d7 = ์ผ์ฃผ์ผ, m1 = ํ•œ๋‹ฌ, y1 = 1๋…„) + if date_restrict: + params["dateRestrict"] = date_restrict + + # ๋‚ ์งœ์ˆœ ์ •๋ ฌ (Google Custom Search API์—์„œ๋Š” sort=date ์˜ต์…˜) + if sort_by_date: + params["sort"] = "date" + + try: + response = await client.get(url, params=params) + response.raise_for_status() + + data = response.json() + + # ์ฒซ ๋ฒˆ์งธ ์š”์ฒญ์—์„œ๋งŒ ์ „์ฒด ์ •๋ณด ์ €์žฅ + if page == 0: + total_results_info = { + "total_results": data.get("searchInformation", {}).get("totalResults"), + "search_time": data.get("searchInformation", {}).get("searchTime"), + "query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms") + } + + # ๊ฒฐ๊ณผ ์ถ”๊ฐ€ + for item in data.get("items", []): + all_results.append({ + "title": item.get("title"), + "link": item.get("link"), + "snippet": item.get("snippet"), + "display_link": item.get("displayLink"), + "thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None + }) + + except Exception as e: + # ์ฒซ ๋ฒˆ์งธ ์š”์ฒญ์ด ์‹คํŒจํ•˜๋ฉด ์—๋Ÿฌ ๋ฐ˜ํ™˜ + if page == 0: + return { + "error": str(e), + "results": [] + } + # ๋‘ ๋ฒˆ์งธ ์š”์ฒญ์ด ์‹คํŒจํ•˜๋ฉด ์ฒซ ๋ฒˆ์งธ ๊ฒฐ๊ณผ๋งŒ ๋ฐ˜ํ™˜ + break + + results = { + "query": total_results_info.get("query") if total_results_info else query, + "total_results": total_results_info.get("total_results") if total_results_info else "0", + "search_time": total_results_info.get("search_time") if total_results_info else 0, + "results": all_results[:num_results], # ์š”์ฒญํ•œ 
๊ฐœ์ˆ˜๋งŒํผ๋งŒ ๋ฐ˜ํ™˜ + "timestamp": datetime.utcnow().isoformat() + } + + # ์บ์‹œ ์ €์žฅ + self.redis_client.setex( + cache_key, + settings.cache_ttl, + json.dumps(results) + ) + + return results + + async def search_with_serpapi( + self, + query: str, + num_results: int = 10, + language: str = None, + country: str = None + ) -> Dict: + """SerpAPI ์‚ฌ์šฉ (์œ ๋ฃŒ ์„œ๋น„์Šค)""" + if not settings.serpapi_key: + return { + "error": "SerpAPI key not configured", + "results": [] + } + + # ์บ์‹œ ํ™•์ธ + cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country) + cached = self.redis_client.get(cache_key) + if cached: + return json.loads(cached) + + from serpapi import GoogleSearch + + params = { + "q": query, + "api_key": settings.serpapi_key, + "num": num_results, + "hl": language or settings.default_language, + "gl": country or settings.default_country + } + + try: + search = GoogleSearch(params) + results = search.get_dict() + + formatted_results = self._format_serpapi_results(results) + + # ์บ์‹œ ์ €์žฅ + self.redis_client.setex( + cache_key, + settings.cache_ttl, + json.dumps(formatted_results) + ) + + return formatted_results + + except Exception as e: + return { + "error": str(e), + "results": [] + } + + async def search_with_scraping( + self, + query: str, + num_results: int = 10, + language: str = None + ) -> Dict: + """์›น ์Šคํฌ๋ž˜ํ•‘์œผ๋กœ ๊ฒ€์ƒ‰ (๋น„์ถ”์ฒœ, ์ œํ•œ์ )""" + # ์บ์‹œ ํ™•์ธ + cache_key = self._get_cache_key(query, num=num_results, lang=language) + cached = self.redis_client.get(cache_key) + if cached: + return json.loads(cached) + + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + + params = { + "q": query, + "num": num_results, + "hl": language or settings.default_language + } + + async with httpx.AsyncClient() as client: + try: + response = await client.get( + "https://www.google.com/search", + params=params, + headers=headers, + follow_redirects=True + ) + + 
soup = BeautifulSoup(response.text, 'html.parser') + results = self._parse_google_html(soup) + + formatted_results = { + "query": query, + "total_results": len(results), + "results": results, + "timestamp": datetime.utcnow().isoformat() + } + + # ์บ์‹œ ์ €์žฅ + self.redis_client.setex( + cache_key, + settings.cache_ttl, + json.dumps(formatted_results) + ) + + return formatted_results + + except Exception as e: + return { + "error": str(e), + "results": [] + } + + def _format_google_results(self, data: Dict) -> Dict: + """Google API ๊ฒฐ๊ณผ ํฌ๋งทํŒ…""" + results = [] + + for item in data.get("items", []): + results.append({ + "title": item.get("title"), + "link": item.get("link"), + "snippet": item.get("snippet"), + "display_link": item.get("displayLink"), + "thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None + }) + + return { + "query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms"), + "total_results": data.get("searchInformation", {}).get("totalResults"), + "search_time": data.get("searchInformation", {}).get("searchTime"), + "results": results, + "timestamp": datetime.utcnow().isoformat() + } + + def _format_serpapi_results(self, data: Dict) -> Dict: + """SerpAPI ๊ฒฐ๊ณผ ํฌ๋งทํŒ…""" + results = [] + + for item in data.get("organic_results", []): + results.append({ + "title": item.get("title"), + "link": item.get("link"), + "snippet": item.get("snippet"), + "position": item.get("position"), + "thumbnail": item.get("thumbnail"), + "date": item.get("date") + }) + + # ๊ด€๋ จ ๊ฒ€์ƒ‰์–ด + related_searches = [ + item.get("query") for item in data.get("related_searches", []) + ] + + return { + "query": data.get("search_parameters", {}).get("q"), + "total_results": data.get("search_information", {}).get("total_results"), + "search_time": data.get("search_information", {}).get("time_taken_displayed"), + "results": results, + "related_searches": related_searches, + "timestamp": 
datetime.utcnow().isoformat() + } + + def _parse_google_html(self, soup: BeautifulSoup) -> List[Dict]: + """HTML ํŒŒ์‹ฑ์œผ๋กœ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ถ”์ถœ""" + results = [] + + # ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ปจํ…Œ์ด๋„ˆ ์ฐพ๊ธฐ + for g in soup.find_all('div', class_='g'): + anchors = g.find_all('a') + if anchors: + link = anchors[0].get('href', '') + title_elem = g.find('h3') + snippet_elem = g.find('span', class_='st') or g.find('div', class_='s') + + if title_elem and link: + results.append({ + "title": title_elem.get_text(), + "link": link, + "snippet": snippet_elem.get_text() if snippet_elem else "" + }) + + return results + + async def fetch_page_content(self, url: str) -> Dict: + """์›น ํŽ˜์ด์ง€์˜ ์ „์ฒด ๋‚ด์šฉ์„ ๊ฐ€์ ธ์˜ค๊ธฐ""" + try: + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + + async with httpx.AsyncClient(timeout=10.0) as client: + response = await client.get(url, headers=headers, follow_redirects=True) + response.raise_for_status() + + soup = BeautifulSoup(response.text, 'html.parser') + + # ๋ถˆํ•„์š”ํ•œ ํƒœ๊ทธ ์ œ๊ฑฐ + for script in soup(["script", "style", "nav", "header", "footer"]): + script.decompose() + + # ๋ณธ๋ฌธ ๋‚ด์šฉ ์ถ”์ถœ ์‹œ๋„ + main_content = None + + # 1. article ํƒœ๊ทธ ์ฐพ๊ธฐ + article = soup.find('article') + if article: + main_content = article.get_text() + + # 2. main ํƒœ๊ทธ ์ฐพ๊ธฐ + if not main_content: + main = soup.find('main') + if main: + main_content = main.get_text() + + # 3. ์ผ๋ฐ˜์ ์ธ ์ฝ˜ํ…์ธ  div ์ฐพ๊ธฐ + if not main_content: + content_divs = soup.find_all('div', class_=lambda x: x and ('content' in x.lower() or 'article' in x.lower() or 'post' in x.lower())) + if content_divs: + main_content = ' '.join([div.get_text() for div in content_divs[:3]]) + + # 4. 
์ „์ฒด body์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ + if not main_content: + body = soup.find('body') + if body: + main_content = body.get_text() + else: + main_content = soup.get_text() + + # ํ…์ŠคํŠธ ์ •๋ฆฌ + main_content = ' '.join(main_content.split()) + + # ์ œ๋ชฉ ์ถ”์ถœ + title = soup.find('title') + title_text = title.get_text() if title else "" + + # ๋ฉ”ํƒ€ ์„ค๋ช… ์ถ”์ถœ + meta_desc = soup.find('meta', attrs={'name': 'description'}) + description = meta_desc.get('content', '') if meta_desc else "" + + return { + "url": url, + "title": title_text, + "description": description, + "content": main_content[:5000], # ์ตœ๋Œ€ 5000์ž + "content_length": len(main_content), + "success": True + } + + except Exception as e: + return { + "url": url, + "error": str(e), + "success": False + } + + async def search_with_extended_snippet( + self, + query: str, + num_results: int = 10, + language: str = None, + country: str = None + ) -> Dict: + """๊ฒ€์ƒ‰ ํ›„ ํ™•์žฅ๋œ snippet ๊ฐ€์ ธ์˜ค๊ธฐ (๋ฉ”ํƒ€ ์„ค๋ช… + ์ฒซ 500์ž)""" + # ๋จผ์ € ์ผ๋ฐ˜ ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ + search_results = await self.search_with_custom_api( + query, num_results, language, country + ) + + if "error" in search_results: + return search_results + + # ๊ฐ ๊ฒฐ๊ณผ์˜ ํ™•์žฅ๋œ snippet ๊ฐ€์ ธ์˜ค๊ธฐ + import asyncio + + async def fetch_extended_snippet(result): + """๊ฐœ๋ณ„ ํŽ˜์ด์ง€์˜ ํ™•์žฅ๋œ snippet ๊ฐ€์ ธ์˜ค๊ธฐ""" + enhanced_result = result.copy() + + if result.get("link"): + try: + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + + async with httpx.AsyncClient(timeout=5.0) as client: + response = await client.get(result["link"], headers=headers, follow_redirects=True) + response.raise_for_status() + + soup = BeautifulSoup(response.text, 'html.parser') + + # ๋ฉ”ํƒ€ ์„ค๋ช… ์ถ”์ถœ + meta_desc = soup.find('meta', attrs={'name': 'description'}) + if not meta_desc: + meta_desc = soup.find('meta', attrs={'property': 'og:description'}) + + description = meta_desc.get('content', '') if 
meta_desc else "" + + # ๋ณธ๋ฌธ ์ฒซ ๋ถ€๋ถ„ ์ถ”์ถœ + for script in soup(["script", "style"]): + script.decompose() + + # ๋ณธ๋ฌธ ํ…์ŠคํŠธ ์ฐพ๊ธฐ + text_content = "" + for tag in ['article', 'main', 'div']: + elements = soup.find_all(tag) + for elem in elements: + text = elem.get_text().strip() + if len(text) > 200: # ์˜๋ฏธ์žˆ๋Š” ํ…์ŠคํŠธ๋งŒ + text_content = ' '.join(text.split())[:1000] + break + if text_content: + break + + # ๊ธฐ์กด snippet๊ณผ ๋ณ‘ํ•ฉ + extended_snippet = result.get("snippet", "") + if description and description not in extended_snippet: + extended_snippet = description + " ... " + extended_snippet + if text_content and len(extended_snippet) < 500: + extended_snippet = extended_snippet + " ... " + text_content[:500-len(extended_snippet)] + + enhanced_result["snippet"] = extended_snippet[:1000] # ์ตœ๋Œ€ 1000์ž + enhanced_result["extended"] = True + + except Exception as e: + # ์‹คํŒจ ์‹œ ์›๋ณธ snippet ์œ ์ง€ + enhanced_result["extended"] = False + enhanced_result["fetch_error"] = str(e) + + return enhanced_result + + # ๋ณ‘๋ ฌ๋กœ ๋ชจ๋“  ํŽ˜์ด์ง€ ์ฒ˜๋ฆฌ + tasks = [fetch_extended_snippet(result) for result in search_results.get("results", [])] + enhanced_results = await asyncio.gather(*tasks) + + return { + **search_results, + "results": enhanced_results, + "snippet_extended": True + } + + async def search_with_full_content( + self, + query: str, + num_results: int = 5, + language: str = None, + country: str = None + ) -> Dict: + """๊ฒ€์ƒ‰ ํ›„ ๊ฐ ๊ฒฐ๊ณผ์˜ ์ „์ฒด ๋‚ด์šฉ ๊ฐ€์ ธ์˜ค๊ธฐ""" + # ๋จผ์ € ์ผ๋ฐ˜ ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ + search_results = await self.search_with_custom_api( + query, num_results, language, country + ) + + if "error" in search_results: + return search_results + + # ๊ฐ ๊ฒฐ๊ณผ์˜ ์ „์ฒด ๋‚ด์šฉ ๊ฐ€์ ธ์˜ค๊ธฐ + enhanced_results = [] + for result in search_results.get("results", [])[:num_results]: + # ์›๋ณธ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ๋ณต์‚ฌ + enhanced_result = result.copy() + + # ํŽ˜์ด์ง€ ๋‚ด์šฉ ๊ฐ€์ ธ์˜ค๊ธฐ + if result.get("link"): + content_data = 
await self.fetch_page_content(result["link"]) + enhanced_result["full_content"] = content_data + + enhanced_results.append(enhanced_result) + + return { + **search_results, + "results": enhanced_results, + "content_fetched": True + } + + async def get_trending_searches(self, country: str = None) -> Dict: + """ํŠธ๋ Œ๋”ฉ ๊ฒ€์ƒ‰์–ด ๊ฐ€์ ธ์˜ค๊ธฐ""" + # Google Trends ๋น„๊ณต์‹ API ์‚ฌ์šฉ + url = f"https://trends.google.com/trends/api/dailytrends" + params = { + "geo": country or settings.default_country.upper() + } + + async with httpx.AsyncClient() as client: + try: + response = await client.get(url, params=params) + # Google Trends API๋Š” ")]}',\n"๋กœ ์‹œ์ž‘ํ•˜๋Š” ์‘๋‹ต์„ ๋ฐ˜ํ™˜ + json_data = response.text[6:] + data = json.loads(json_data) + + trending = [] + for date_data in data.get("default", {}).get("trendingSearchesDays", []): + for search in date_data.get("trendingSearches", []): + trending.append({ + "title": search.get("title", {}).get("query"), + "traffic": search.get("formattedTraffic"), + "articles": [ + { + "title": article.get("title"), + "url": article.get("url"), + "source": article.get("source") + } + for article in search.get("articles", [])[:3] + ] + }) + + return { + "country": country or settings.default_country, + "trending": trending[:10], + "timestamp": datetime.utcnow().isoformat() + } + + except Exception as e: + return { + "error": str(e), + "trending": [] + } \ No newline at end of file diff --git a/services/google-search/backend/requirements.txt b/services/google-search/backend/requirements.txt new file mode 100644 index 0000000..36fd2a3 --- /dev/null +++ b/services/google-search/backend/requirements.txt @@ -0,0 +1,9 @@ +fastapi==0.109.0 +uvicorn[standard]==0.27.0 +httpx==0.26.0 +pydantic==2.5.3 +pydantic-settings==2.1.0 +google-api-python-client==2.108.0 +beautifulsoup4==4.12.2 +redis==5.0.1 +google-search-results==2.4.2 \ No newline at end of file