Files
site11/backup-services/google-search/backend/app/search_service.py
jungwoo choi 070032006e feat: Implement async queue-based news pipeline with microservices
Major architectural transformation from synchronous to asynchronous processing:

## Pipeline Services (8 microservices)
- pipeline-scheduler: APScheduler for 30-minute periodic job triggers (see the sketch after this list)
- pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL)
- pipeline-google-search: Content enrichment via Google Search API
- pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514)
- pipeline-translator: Translation using DeepL Pro API
- pipeline-image-generator: Image generation with Replicate API (Stable Diffusion)
- pipeline-article-assembly: Final article assembly and MongoDB storage
- pipeline-monitor: Real-time monitoring dashboard (port 8100)
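
A minimal sketch of how the scheduler's 30-minute trigger might be wired to the shared Redis queue. APScheduler's `AsyncIOScheduler` and interval trigger are real APIs; the `enqueue_collection_job` helper, queue name, and job payload are illustrative assumptions, not the actual pipeline code.

```python
# Hypothetical sketch: periodic trigger for the RSS collection stage.
import asyncio
import json
from datetime import datetime

import redis.asyncio as aioredis
from apscheduler.schedulers.asyncio import AsyncIOScheduler


async def enqueue_collection_job(redis_client: aioredis.Redis) -> None:
    """Push a collection job onto the (assumed) rss-collector queue."""
    job = {"type": "rss_collect", "requested_at": datetime.utcnow().isoformat()}
    await redis_client.lpush("queue:rss-collector", json.dumps(job))


async def main() -> None:
    redis_client = aioredis.Redis(host="redis", port=6379, decode_responses=True)
    scheduler = AsyncIOScheduler()
    # Fire every 30 minutes, matching the scheduler service described above.
    scheduler.add_job(enqueue_collection_job, "interval", minutes=30, args=[redis_client])
    scheduler.start()
    await asyncio.Event().wait()  # keep the service running


if __name__ == "__main__":
    asyncio.run(main())
```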

## Key Features
- Redis-based job queue with deduplication (see the sketch after this list)
- Asynchronous processing with Python asyncio
- Shared models and queue manager for inter-service communication
- Docker containerization for all services
- Container names standardized with site11_ prefix
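
A minimal sketch of how queue-level deduplication with the 7-day TTL might work. `SET` with `nx=True` and `ex=<seconds>` is a standard redis-py call; the key layout, queue name, and `enqueue_once` helper are assumptions rather than the shared queue manager's actual interface.

```python
# Hypothetical sketch: enqueue with deduplication via a Redis "seen" key.
import hashlib
import json

import redis

DEDUP_TTL = 7 * 24 * 60 * 60  # 7 days, matching the dedup TTL described above


def enqueue_once(client: redis.Redis, queue: str, job: dict) -> bool:
    """Enqueue the job unless an identical one was seen within the TTL."""
    digest = hashlib.md5(json.dumps(job, sort_keys=True).encode()).hexdigest()
    seen_key = f"dedup:{queue}:{digest}"
    # SET NX succeeds only for the first writer; later duplicates are skipped.
    if client.set(seen_key, "1", nx=True, ex=DEDUP_TTL):
        client.lpush(f"queue:{queue}", json.dumps(job))
        return True
    return False
```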

## Removed Services
- Moved to backup: google-search, rss-feed, news-aggregator, ai-writer

## Configuration
- DeepL Pro API: 3abbc796-2515-44a8-972d-22dcf27ab54a
- Claude Model: claude-sonnet-4-20250514
- Redis Queue TTL: 7 days for deduplication
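
The service code below imports a `settings` object from `.config`. A plausible sketch of such a module is shown here using pydantic's `BaseSettings`; field names mirror the attributes referenced in `search_service.py`, while the defaults are assumptions, and secrets such as API keys would be read from environment variables.

```python
# Hypothetical sketch of the config module that search_service.py imports.
# pydantic v1 import path; pydantic v2 moves BaseSettings to pydantic-settings.
from pydantic import BaseSettings


class Settings(BaseSettings):
    redis_host: str = "redis"
    redis_port: int = 6379
    redis_db: int = 0
    cache_ttl: int = 3600          # seconds to keep cached search results (assumed default)
    google_api_key: str = ""       # supplied via environment variables
    google_search_engine_id: str = ""
    serpapi_key: str = ""
    default_language: str = "ko"
    default_country: str = "kr"


settings = Settings()
```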

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-13 19:22:14 +09:00

540 lines · 21 KiB · Python

import httpx
import json
import redis
from typing import List, Dict, Optional
from datetime import datetime
import hashlib
from bs4 import BeautifulSoup
from .config import settings


class GoogleSearchService:
    def __init__(self):
        # Redis connection
        self.redis_client = redis.Redis(
            host=settings.redis_host,
            port=settings.redis_port,
            db=settings.redis_db,
            decode_responses=True
        )

    def _get_cache_key(self, query: str, **kwargs) -> str:
        """Build a cache key from the query and search options."""
        cache_data = f"{query}_{kwargs}"
        return f"google_search:{hashlib.md5(cache_data.encode()).hexdigest()}"

    async def search_with_custom_api(
        self,
        query: str,
        num_results: int = 10,
        language: str = None,
        country: str = None,
        date_restrict: str = None,
        sort_by_date: bool = False
    ) -> Dict:
        """Search using the Google Custom Search API."""
        if not settings.google_api_key or not settings.google_search_engine_id:
            return {
                "error": "Google API credentials not configured",
                "results": []
            }

        # Check the cache first
        cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country)
        cached = self.redis_client.get(cache_key)
        if cached:
            return json.loads(cached)

        url = "https://www.googleapis.com/customsearch/v1"
        all_results = []
        total_results_info = None
        # The Google API returns at most 10 items per request; 20 results require 2 requests
        num_requests = min((num_results + 9) // 10, 2)  # at most 2 requests (up to 20 results)

        async with httpx.AsyncClient() as client:
            for page in range(num_requests):
                start_index = page * 10 + 1
                current_num = min(10, num_results - page * 10)
                params = {
                    "key": settings.google_api_key,
                    "cx": settings.google_search_engine_id,
                    "q": query,
                    "num": current_num,
                    "start": start_index,  # start index of this page
                    "hl": language or settings.default_language,
                    "gl": country or settings.default_country
                }
                # Restrict by date (d7 = one week, m1 = one month, y1 = one year)
                if date_restrict:
                    params["dateRestrict"] = date_restrict
                # Sort by date (the Custom Search API uses the sort=date option)
                if sort_by_date:
                    params["sort"] = "date"
                try:
                    response = await client.get(url, params=params)
                    response.raise_for_status()
                    data = response.json()
                    # Record overall search information from the first request only
                    if page == 0:
                        total_results_info = {
                            "total_results": data.get("searchInformation", {}).get("totalResults"),
                            "search_time": data.get("searchInformation", {}).get("searchTime"),
                            "query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms")
                        }
                    # Collect the results
                    for item in data.get("items", []):
                        all_results.append({
                            "title": item.get("title"),
                            "link": item.get("link"),
                            "snippet": item.get("snippet"),
                            "display_link": item.get("displayLink"),
                            "thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None
                        })
                except Exception as e:
                    # If the first request fails, return the error
                    if page == 0:
                        return {
                            "error": str(e),
                            "results": []
                        }
                    # If the second request fails, return only the first page of results
                    break

        results = {
            "query": total_results_info.get("query") if total_results_info else query,
            "total_results": total_results_info.get("total_results") if total_results_info else "0",
            "search_time": total_results_info.get("search_time") if total_results_info else 0,
            "results": all_results[:num_results],  # return only the requested number of results
            "timestamp": datetime.utcnow().isoformat()
        }
        # Save to the cache
        self.redis_client.setex(
            cache_key,
            settings.cache_ttl,
            json.dumps(results)
        )
        return results

    async def search_with_serpapi(
        self,
        query: str,
        num_results: int = 10,
        language: str = None,
        country: str = None
    ) -> Dict:
        """Search using SerpAPI (paid service)."""
        if not settings.serpapi_key:
            return {
                "error": "SerpAPI key not configured",
                "results": []
            }

        # Check the cache first
        cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country)
        cached = self.redis_client.get(cache_key)
        if cached:
            return json.loads(cached)

        from serpapi import GoogleSearch

        params = {
            "q": query,
            "api_key": settings.serpapi_key,
            "num": num_results,
            "hl": language or settings.default_language,
            "gl": country or settings.default_country
        }
        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            formatted_results = self._format_serpapi_results(results)
            # Save to the cache
            self.redis_client.setex(
                cache_key,
                settings.cache_ttl,
                json.dumps(formatted_results)
            )
            return formatted_results
        except Exception as e:
            return {
                "error": str(e),
                "results": []
            }

    async def search_with_scraping(
        self,
        query: str,
        num_results: int = 10,
        language: str = None
    ) -> Dict:
        """Search by scraping Google's results page (not recommended; limited)."""
        # Check the cache first
        cache_key = self._get_cache_key(query, num=num_results, lang=language)
        cached = self.redis_client.get(cache_key)
        if cached:
            return json.loads(cached)

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        params = {
            "q": query,
            "num": num_results,
            "hl": language or settings.default_language
        }
        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(
                    "https://www.google.com/search",
                    params=params,
                    headers=headers,
                    follow_redirects=True
                )
                soup = BeautifulSoup(response.text, 'html.parser')
                results = self._parse_google_html(soup)
                formatted_results = {
                    "query": query,
                    "total_results": len(results),
                    "results": results,
                    "timestamp": datetime.utcnow().isoformat()
                }
                # Save to the cache
                self.redis_client.setex(
                    cache_key,
                    settings.cache_ttl,
                    json.dumps(formatted_results)
                )
                return formatted_results
            except Exception as e:
                return {
                    "error": str(e),
                    "results": []
                }

    def _format_google_results(self, data: Dict) -> Dict:
        """Format Google Custom Search API results."""
        results = []
        for item in data.get("items", []):
            results.append({
                "title": item.get("title"),
                "link": item.get("link"),
                "snippet": item.get("snippet"),
                "display_link": item.get("displayLink"),
                "thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None
            })
        return {
            "query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms"),
            "total_results": data.get("searchInformation", {}).get("totalResults"),
            "search_time": data.get("searchInformation", {}).get("searchTime"),
            "results": results,
            "timestamp": datetime.utcnow().isoformat()
        }

    def _format_serpapi_results(self, data: Dict) -> Dict:
        """Format SerpAPI results."""
        results = []
        for item in data.get("organic_results", []):
            results.append({
                "title": item.get("title"),
                "link": item.get("link"),
                "snippet": item.get("snippet"),
                "position": item.get("position"),
                "thumbnail": item.get("thumbnail"),
                "date": item.get("date")
            })
        # Related searches
        related_searches = [
            item.get("query") for item in data.get("related_searches", [])
        ]
        return {
            "query": data.get("search_parameters", {}).get("q"),
            "total_results": data.get("search_information", {}).get("total_results"),
            "search_time": data.get("search_information", {}).get("time_taken_displayed"),
            "results": results,
            "related_searches": related_searches,
            "timestamp": datetime.utcnow().isoformat()
        }

    def _parse_google_html(self, soup: BeautifulSoup) -> List[Dict]:
        """Extract search results by parsing the HTML."""
        results = []
        # Find each search-result container
        for g in soup.find_all('div', class_='g'):
            anchors = g.find_all('a')
            if anchors:
                link = anchors[0].get('href', '')
                title_elem = g.find('h3')
                snippet_elem = g.find('span', class_='st') or g.find('div', class_='s')
                if title_elem and link:
                    results.append({
                        "title": title_elem.get_text(),
                        "link": link,
                        "snippet": snippet_elem.get_text() if snippet_elem else ""
                    })
        return results

    async def fetch_page_content(self, url: str) -> Dict:
        """Fetch the full content of a web page."""
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(url, headers=headers, follow_redirects=True)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Remove tags that are not part of the content
                for script in soup(["script", "style", "nav", "header", "footer"]):
                    script.decompose()

                # Try to extract the main content
                main_content = None
                # 1. Look for an article tag
                article = soup.find('article')
                if article:
                    main_content = article.get_text()
                # 2. Look for a main tag
                if not main_content:
                    main = soup.find('main')
                    if main:
                        main_content = main.get_text()
                # 3. Look for common content divs
                if not main_content:
                    content_divs = soup.find_all('div', class_=lambda x: x and ('content' in x.lower() or 'article' in x.lower() or 'post' in x.lower()))
                    if content_divs:
                        main_content = ' '.join([div.get_text() for div in content_divs[:3]])
                # 4. Fall back to the full body text
                if not main_content:
                    body = soup.find('body')
                    if body:
                        main_content = body.get_text()
                    else:
                        main_content = soup.get_text()

                # Clean up the text
                main_content = ' '.join(main_content.split())

                # Extract the title
                title = soup.find('title')
                title_text = title.get_text() if title else ""

                # Extract the meta description
                meta_desc = soup.find('meta', attrs={'name': 'description'})
                description = meta_desc.get('content', '') if meta_desc else ""

                return {
                    "url": url,
                    "title": title_text,
                    "description": description,
                    "content": main_content[:5000],  # at most 5000 characters
                    "content_length": len(main_content),
                    "success": True
                }
        except Exception as e:
            return {
                "url": url,
                "error": str(e),
                "success": False
            }

    async def search_with_extended_snippet(
        self,
        query: str,
        num_results: int = 10,
        language: str = None,
        country: str = None
    ) -> Dict:
        """Search, then fetch an extended snippet for each result (meta description plus the first part of the page)."""
        # Run the regular search first
        search_results = await self.search_with_custom_api(
            query, num_results, language, country
        )
        if "error" in search_results:
            return search_results

        # Fetch an extended snippet for each result
        import asyncio

        async def fetch_extended_snippet(result):
            """Fetch the extended snippet for a single page."""
            enhanced_result = result.copy()
            if result.get("link"):
                try:
                    headers = {
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                    }
                    async with httpx.AsyncClient(timeout=5.0) as client:
                        response = await client.get(result["link"], headers=headers, follow_redirects=True)
                        response.raise_for_status()
                        soup = BeautifulSoup(response.text, 'html.parser')

                        # Extract the meta description
                        meta_desc = soup.find('meta', attrs={'name': 'description'})
                        if not meta_desc:
                            meta_desc = soup.find('meta', attrs={'property': 'og:description'})
                        description = meta_desc.get('content', '') if meta_desc else ""

                        # Extract the beginning of the body text
                        for script in soup(["script", "style"]):
                            script.decompose()

                        # Find the main text
                        text_content = ""
                        for tag in ['article', 'main', 'div']:
                            elements = soup.find_all(tag)
                            for elem in elements:
                                text = elem.get_text().strip()
                                if len(text) > 200:  # only keep meaningful text
                                    text_content = ' '.join(text.split())[:1000]
                                    break
                            if text_content:
                                break

                        # Merge with the existing snippet
                        extended_snippet = result.get("snippet", "")
                        if description and description not in extended_snippet:
                            extended_snippet = description + " ... " + extended_snippet
                        if text_content and len(extended_snippet) < 500:
                            extended_snippet = extended_snippet + " ... " + text_content[:500 - len(extended_snippet)]
                        enhanced_result["snippet"] = extended_snippet[:1000]  # at most 1000 characters
                        enhanced_result["extended"] = True
                except Exception as e:
                    # On failure, keep the original snippet
                    enhanced_result["extended"] = False
                    enhanced_result["fetch_error"] = str(e)
            return enhanced_result

        # Process all pages in parallel
        tasks = [fetch_extended_snippet(result) for result in search_results.get("results", [])]
        enhanced_results = await asyncio.gather(*tasks)

        return {
            **search_results,
            "results": enhanced_results,
            "snippet_extended": True
        }

    async def search_with_full_content(
        self,
        query: str,
        num_results: int = 5,
        language: str = None,
        country: str = None
    ) -> Dict:
        """Search, then fetch the full content of each result."""
        # Run the regular search first
        search_results = await self.search_with_custom_api(
            query, num_results, language, country
        )
        if "error" in search_results:
            return search_results

        # Fetch the full content for each result
        enhanced_results = []
        for result in search_results.get("results", [])[:num_results]:
            # Copy the original search result
            enhanced_result = result.copy()
            # Fetch the page content
            if result.get("link"):
                content_data = await self.fetch_page_content(result["link"])
                enhanced_result["full_content"] = content_data
            enhanced_results.append(enhanced_result)

        return {
            **search_results,
            "results": enhanced_results,
            "content_fetched": True
        }

    async def get_trending_searches(self, country: str = None) -> Dict:
        """Fetch trending search terms via the unofficial Google Trends API."""
        url = "https://trends.google.com/trends/api/dailytrends"
        params = {
            "geo": country or settings.default_country.upper()
        }
        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(url, params=params)
                # The Google Trends API prefixes its response with ")]}',\n"
                json_data = response.text[6:]
                data = json.loads(json_data)

                trending = []
                for date_data in data.get("default", {}).get("trendingSearchesDays", []):
                    for search in date_data.get("trendingSearches", []):
                        trending.append({
                            "title": search.get("title", {}).get("query"),
                            "traffic": search.get("formattedTraffic"),
                            "articles": [
                                {
                                    "title": article.get("title"),
                                    "url": article.get("url"),
                                    "source": article.get("source")
                                }
                                for article in search.get("articles", [])[:3]
                            ]
                        })
                return {
                    "country": country or settings.default_country,
                    "trending": trending[:10],
                    "timestamp": datetime.utcnow().isoformat()
                }
            except Exception as e:
                return {
                    "error": str(e),
                    "trending": []
                }
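
A possible usage sketch, not part of the file above: how the enrichment stage might call this service. The import path follows the `backend/app/` layout shown in the file header and is an assumption, as is the example query; running it requires a reachable Redis instance and Google API credentials in the settings.

```python
# Hypothetical usage of GoogleSearchService from another pipeline component.
import asyncio

from app.search_service import GoogleSearchService  # assumed import path


async def demo() -> None:
    service = GoogleSearchService()  # connects to Redis via settings
    results = await service.search_with_extended_snippet("AI news", num_results=5)
    for item in results.get("results", []):
        print(item["title"], item["link"])


if __name__ == "__main__":
    asyncio.run(demo())
```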