import httpx
import json
import redis
from typing import List, Dict, Optional
from datetime import datetime
import hashlib
from bs4 import BeautifulSoup

from .config import settings


class GoogleSearchService:
    def __init__(self):
        # Redis connection
        self.redis_client = redis.Redis(
            host=settings.redis_host,
            port=settings.redis_port,
            db=settings.redis_db,
            decode_responses=True
        )

    def _get_cache_key(self, query: str, **kwargs) -> str:
        """Build a cache key."""
        cache_data = f"{query}_{kwargs}"
        return f"google_search:{hashlib.md5(cache_data.encode()).hexdigest()}"

    async def search_with_custom_api(
        self,
        query: str,
        num_results: int = 10,
        language: str = None,
        country: str = None,
        date_restrict: str = None,
        sort_by_date: bool = False
    ) -> Dict:
        """Search using the Google Custom Search API."""
        if not settings.google_api_key or not settings.google_search_engine_id:
            return {
                "error": "Google API credentials not configured",
                "results": []
            }

        # Check the cache first
        cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country)
        cached = self.redis_client.get(cache_key)
        if cached:
            return json.loads(cached)

        url = "https://www.googleapis.com/customsearch/v1"
        all_results = []
        total_results_info = None

        # The Google API returns at most 10 items per request; fetching 20 requires 2 requests
        num_requests = min((num_results + 9) // 10, 2)  # at most 2 requests (up to 20 results)

        async with httpx.AsyncClient() as client:
            for page in range(num_requests):
                start_index = page * 10 + 1
                current_num = min(10, num_results - page * 10)

                params = {
                    "key": settings.google_api_key,
                    "cx": settings.google_search_engine_id,
                    "q": query,
                    "num": current_num,
                    "start": start_index,  # start index
                    "hl": language or settings.default_language,
                    "gl": country or settings.default_country
                }

                # Optional date restriction (d7 = one week, m1 = one month, y1 = one year)
                if date_restrict:
                    params["dateRestrict"] = date_restrict

                # Sort by date (the Custom Search API uses the sort=date option)
                if sort_by_date:
                    params["sort"] = "date"

                try:
                    response = await client.get(url, params=params)
                    response.raise_for_status()
                    data = response.json()

                    # Store the overall search info only from the first request
                    if page == 0:
                        total_results_info = {
                            "total_results": data.get("searchInformation", {}).get("totalResults"),
                            "search_time": data.get("searchInformation", {}).get("searchTime"),
                            "query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms")
                        }

                    # Append the results from this page
                    for item in data.get("items", []):
                        all_results.append({
                            "title": item.get("title"),
                            "link": item.get("link"),
                            "snippet": item.get("snippet"),
                            "display_link": item.get("displayLink"),
                            "thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None
                        })
                except Exception as e:
                    # If the first request fails, return the error
                    if page == 0:
                        return {
                            "error": str(e),
                            "results": []
                        }
                    # If the second request fails, return only the first page of results
                    break

        results = {
            "query": total_results_info.get("query") if total_results_info else query,
            "total_results": total_results_info.get("total_results") if total_results_info else "0",
            "search_time": total_results_info.get("search_time") if total_results_info else 0,
            "results": all_results[:num_results],  # return only as many results as requested
            "timestamp": datetime.utcnow().isoformat()
        }

        # Store in the cache
        self.redis_client.setex(
            cache_key,
            settings.cache_ttl,
            json.dumps(results)
        )

        return results

    async def search_with_serpapi(
        self,
        query: str,
        num_results: int = 10,
        language: str = None,
        country: str = None
    ) -> Dict:
        """Search using SerpAPI (paid service)."""
        if not settings.serpapi_key:
            return {
                "error": "SerpAPI key not configured",
                "results": []
            }

        # Check the cache first
        cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country)
        cached = self.redis_client.get(cache_key)
        if cached:
            return json.loads(cached)

        from serpapi import GoogleSearch

        params = {
            "q": query,
            "api_key": settings.serpapi_key,
            "num": num_results,
            "hl": language or settings.default_language,
            "gl": country or settings.default_country
        }

        try:
            search = GoogleSearch(params)
            results = search.get_dict()

            formatted_results = self._format_serpapi_results(results)

            # Store in the cache
            self.redis_client.setex(
                cache_key,
                settings.cache_ttl,
                json.dumps(formatted_results)
            )

            return formatted_results
        except Exception as e:
            return {
                "error": str(e),
                "results": []
            }

    async def search_with_scraping(
        self,
        query: str,
        num_results: int = 10,
        language: str = None
    ) -> Dict:
        """Search by scraping Google (not recommended; unreliable and rate-limited)."""
        # Check the cache first
        cache_key = self._get_cache_key(query, num=num_results, lang=language)
        cached = self.redis_client.get(cache_key)
        if cached:
            return json.loads(cached)

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }

        params = {
            "q": query,
            "num": num_results,
            "hl": language or settings.default_language
        }

        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(
                    "https://www.google.com/search",
                    params=params,
                    headers=headers,
                    follow_redirects=True
                )

                soup = BeautifulSoup(response.text, 'html.parser')
                results = self._parse_google_html(soup)

                formatted_results = {
                    "query": query,
                    "total_results": len(results),
                    "results": results,
                    "timestamp": datetime.utcnow().isoformat()
                }

                # Store in the cache
                self.redis_client.setex(
                    cache_key,
                    settings.cache_ttl,
                    json.dumps(formatted_results)
                )

                return formatted_results
            except Exception as e:
                return {
                    "error": str(e),
                    "results": []
                }

    def _format_google_results(self, data: Dict) -> Dict:
        """Format raw Google Custom Search API results."""
        results = []
        for item in data.get("items", []):
            results.append({
                "title": item.get("title"),
                "link": item.get("link"),
                "snippet": item.get("snippet"),
                "display_link": item.get("displayLink"),
                "thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None
            })

        return {
            "query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms"),
            "total_results": data.get("searchInformation", {}).get("totalResults"),
            "search_time": data.get("searchInformation", {}).get("searchTime"),
            "results": results,
            "timestamp": datetime.utcnow().isoformat()
        }

    def _format_serpapi_results(self, data: Dict) -> Dict:
        """Format SerpAPI results."""
        results = []
        for item in data.get("organic_results", []):
            results.append({
                "title": item.get("title"),
                "link": item.get("link"),
                "snippet": item.get("snippet"),
                "position": item.get("position"),
                "thumbnail": item.get("thumbnail"),
                "date": item.get("date")
            })

        # Related searches
        related_searches = [
            item.get("query")
            for item in data.get("related_searches", [])
        ]

        return {
            "query": data.get("search_parameters", {}).get("q"),
            "total_results": data.get("search_information", {}).get("total_results"),
            "search_time": data.get("search_information", {}).get("time_taken_displayed"),
            "results": results,
            "related_searches": related_searches,
            "timestamp": datetime.utcnow().isoformat()
        }

    def _parse_google_html(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract search results from scraped Google HTML."""
        results = []

        # Find result containers
        for g in soup.find_all('div', class_='g'):
            anchors = g.find_all('a')
            if anchors:
                link = anchors[0].get('href', '')
                title_elem = g.find('h3')
                snippet_elem = g.find('span', class_='st') or g.find('div', class_='s')

                if title_elem and link:
                    results.append({
                        "title": title_elem.get_text(),
                        "link": link,
                        "snippet": snippet_elem.get_text() if snippet_elem else ""
                    })

        return results

    async def fetch_page_content(self, url: str) -> Dict:
        """Fetch the full content of a web page."""
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }

            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(url, headers=headers, follow_redirects=True)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')

                # Remove unwanted tags
                for script in soup(["script", "style", "nav", "header", "footer"]):
                    script.decompose()

                # Try to extract the main content
                main_content = None

                # 1. Look for an <article> tag
                article = soup.find('article')
                if article:
                    main_content = article.get_text()

                # 2. Look for a <main> tag
                if not main_content:
                    main = soup.find('main')
                    if main:
                        main_content = main.get_text()

                # 3. Look for common content divs
                if not main_content:
                    content_divs = soup.find_all(
                        'div',
                        class_=lambda x: x and ('content' in x.lower() or 'article' in x.lower() or 'post' in x.lower())
                    )
                    if content_divs:
                        main_content = ' '.join([div.get_text() for div in content_divs[:3]])

                # 4. Fall back to the full body text
                if not main_content:
                    body = soup.find('body')
                    if body:
                        main_content = body.get_text()
                    else:
                        main_content = soup.get_text()

                # Normalize whitespace
                main_content = ' '.join(main_content.split())

                # Extract the title
                title = soup.find('title')
                title_text = title.get_text() if title else ""

                # Extract the meta description
                meta_desc = soup.find('meta', attrs={'name': 'description'})
                description = meta_desc.get('content', '') if meta_desc else ""

                return {
                    "url": url,
                    "title": title_text,
                    "description": description,
                    "content": main_content[:5000],  # at most 5000 characters
                    "content_length": len(main_content),
                    "success": True
                }
        except Exception as e:
            return {
                "url": url,
                "error": str(e),
                "success": False
            }

    async def search_with_extended_snippet(
        self,
        query: str,
        num_results: int = 10,
        language: str = None,
        country: str = None
    ) -> Dict:
        """Search, then fetch an extended snippet for each result (meta description + first ~500 characters)."""
        # Run the regular search first
        search_results = await self.search_with_custom_api(
            query, num_results, language, country
        )

        if "error" in search_results:
            return search_results

        # Fetch an extended snippet for each result
        import asyncio

        async def fetch_extended_snippet(result):
            """Fetch the extended snippet for a single page."""
            enhanced_result = result.copy()

            if result.get("link"):
                try:
                    headers = {
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                    }
                    async with httpx.AsyncClient(timeout=5.0) as client:
                        response = await client.get(result["link"], headers=headers, follow_redirects=True)
                        response.raise_for_status()

                        soup = BeautifulSoup(response.text, 'html.parser')

                        # Extract the meta description
                        meta_desc = soup.find('meta', attrs={'name': 'description'})
                        if not meta_desc:
                            meta_desc = soup.find('meta', attrs={'property': 'og:description'})
                        description = meta_desc.get('content', '') if meta_desc else ""

                        # Extract the beginning of the body text
                        for script in soup(["script", "style"]):
                            script.decompose()

                        # Find body text
                        text_content = ""
                        for tag in ['article', 'main', 'div']:
                            elements = soup.find_all(tag)
                            for elem in elements:
                                text = elem.get_text().strip()
                                if len(text) > 200:  # keep only meaningful text
                                    text_content = ' '.join(text.split())[:1000]
                                    break
                            if text_content:
                                break

                        # Merge with the original snippet
                        extended_snippet = result.get("snippet", "")
                        if description and description not in extended_snippet:
                            extended_snippet = description + " ... " + extended_snippet
                        if text_content and len(extended_snippet) < 500:
                            extended_snippet = extended_snippet + " ... " + text_content[:500 - len(extended_snippet)]

                        enhanced_result["snippet"] = extended_snippet[:1000]  # at most 1000 characters
                        enhanced_result["extended"] = True
                except Exception as e:
                    # Keep the original snippet on failure
                    enhanced_result["extended"] = False
                    enhanced_result["fetch_error"] = str(e)

            return enhanced_result

        # Process all pages in parallel
        tasks = [fetch_extended_snippet(result) for result in search_results.get("results", [])]
        enhanced_results = await asyncio.gather(*tasks)

        return {
            **search_results,
            "results": enhanced_results,
            "snippet_extended": True
        }

    async def search_with_full_content(
        self,
        query: str,
        num_results: int = 5,
        language: str = None,
        country: str = None
    ) -> Dict:
        """Search, then fetch the full content of each result."""
        # Run the regular search first
        search_results = await self.search_with_custom_api(
            query, num_results, language, country
        )

        if "error" in search_results:
            return search_results

        # Fetch the full content of each result
        enhanced_results = []
        for result in search_results.get("results", [])[:num_results]:
            # Copy the original search result
            enhanced_result = result.copy()

            # Fetch the page content
            if result.get("link"):
                content_data = await self.fetch_page_content(result["link"])
                enhanced_result["full_content"] = content_data

            enhanced_results.append(enhanced_result)

        return {
            **search_results,
            "results": enhanced_results,
            "content_fetched": True
        }

    async def get_trending_searches(self, country: str = None) -> Dict:
        """Fetch trending searches."""
        # Uses the unofficial Google Trends API
        url = "https://trends.google.com/trends/api/dailytrends"
        params = {
            "geo": country or settings.default_country.upper()
        }

        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(url, params=params)
                # The Google Trends API prefixes its response with ")]}',\n"
                json_data = response.text[6:]
                data = json.loads(json_data)

                trending = []
                for date_data in data.get("default", {}).get("trendingSearchesDays", []):
                    for search in date_data.get("trendingSearches", []):
                        trending.append({
                            "title": search.get("title", {}).get("query"),
                            "traffic": search.get("formattedTraffic"),
                            "articles": [
                                {
                                    "title": article.get("title"),
                                    "url": article.get("url"),
                                    "source": article.get("source")
                                }
                                for article in search.get("articles", [])[:3]
                            ]
                        })

                return {
                    "country": country or settings.default_country,
                    "trending": trending[:10],
                    "timestamp": datetime.utcnow().isoformat()
                }
            except Exception as e:
                return {
                    "error": str(e),
                    "trending": []
                }
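

# Usage sketch (illustrative only): this assumes the surrounding package supplies the
# `settings` values referenced above (google_api_key, google_search_engine_id, redis_*,
# cache_ttl) and that the module is executed in a context where the relative import of
# `.config` resolves, e.g. via `python -m <package>.google_search`. It is not part of
# the service itself.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        service = GoogleSearchService()
        # Plain Custom Search API call; swap in search_with_extended_snippet or
        # search_with_full_content to see the enriched variants.
        results = await service.search_with_custom_api("python asyncio tutorial", num_results=5)
        for item in results.get("results", []):
            print(item.get("title"), "-", item.get("link"))

    asyncio.run(_demo())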