Major architectural transformation from synchronous to asynchronous processing: ## Pipeline Services (8 microservices) - pipeline-scheduler: APScheduler for 30-minute periodic job triggers - pipeline-rss-collector: RSS feed collection with deduplication (7-day TTL) - pipeline-google-search: Content enrichment via Google Search API - pipeline-ai-summarizer: AI summarization using Claude API (claude-sonnet-4-20250514) - pipeline-translator: Translation using DeepL Pro API - pipeline-image-generator: Image generation with Replicate API (Stable Diffusion) - pipeline-article-assembly: Final article assembly and MongoDB storage - pipeline-monitor: Real-time monitoring dashboard (port 8100) ## Key Features - Redis-based job queue with deduplication - Asynchronous processing with Python asyncio - Shared models and queue manager for inter-service communication - Docker containerization for all services - Container names standardized with site11_ prefix ## Removed Services - Moved to backup: google-search, rss-feed, news-aggregator, ai-writer ## Configuration - DeepL Pro API: [REDACTED — credential removed; keys must be supplied via environment configuration, never committed] - Claude Model: claude-sonnet-4-20250514 - Redis Queue TTL: 7 days for deduplication 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
540 lines
21 KiB
Python
540 lines
21 KiB
Python
import httpx
|
|
import json
|
|
import redis
|
|
from typing import List, Dict, Optional
|
|
from datetime import datetime
|
|
import hashlib
|
|
from bs4 import BeautifulSoup
|
|
from .config import settings
|
|
|
|
class GoogleSearchService:
    """Google search client supporting three backends — the Custom Search API,
    SerpAPI, and raw HTML scraping — with Redis-backed TTL caching of
    responses plus helpers for fetching and enriching result page content."""

    def __init__(self):
        # Redis connection, used purely as a response cache (see _get_cache_key).
        # NOTE(review): this is the synchronous redis client, called from async
        # methods; each cache get/setex briefly blocks the event loop — confirm
        # that is acceptable for this service's load.
        self.redis_client = redis.Redis(
            host=settings.redis_host,
            port=settings.redis_port,
            db=settings.redis_db,
            decode_responses=True
        )
|
|
|
|
def _get_cache_key(self, query: str, **kwargs) -> str:
|
|
"""캐시 키 생성"""
|
|
cache_data = f"{query}_{kwargs}"
|
|
return f"google_search:{hashlib.md5(cache_data.encode()).hexdigest()}"
|
|
|
|
async def search_with_custom_api(
|
|
self,
|
|
query: str,
|
|
num_results: int = 10,
|
|
language: str = None,
|
|
country: str = None,
|
|
date_restrict: str = None,
|
|
sort_by_date: bool = False
|
|
) -> Dict:
|
|
"""Google Custom Search API 사용"""
|
|
if not settings.google_api_key or not settings.google_search_engine_id:
|
|
return {
|
|
"error": "Google API credentials not configured",
|
|
"results": []
|
|
}
|
|
|
|
# 캐시 확인
|
|
cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country)
|
|
cached = self.redis_client.get(cache_key)
|
|
if cached:
|
|
return json.loads(cached)
|
|
|
|
url = "https://www.googleapis.com/customsearch/v1"
|
|
|
|
all_results = []
|
|
total_results_info = None
|
|
|
|
# Google API는 한 번에 최대 10개만 반환, 20개를 원하면 2번 요청
|
|
num_requests = min((num_results + 9) // 10, 2) # 최대 2번 요청 (20개까지)
|
|
|
|
async with httpx.AsyncClient() as client:
|
|
for page in range(num_requests):
|
|
start_index = page * 10 + 1
|
|
current_num = min(10, num_results - page * 10)
|
|
|
|
params = {
|
|
"key": settings.google_api_key,
|
|
"cx": settings.google_search_engine_id,
|
|
"q": query,
|
|
"num": current_num,
|
|
"start": start_index, # 시작 인덱스
|
|
"hl": language or settings.default_language,
|
|
"gl": country or settings.default_country
|
|
}
|
|
|
|
# 날짜 제한 추가 (d7 = 일주일, m1 = 한달, y1 = 1년)
|
|
if date_restrict:
|
|
params["dateRestrict"] = date_restrict
|
|
|
|
# 날짜순 정렬 (Google Custom Search API에서는 sort=date 옵션)
|
|
if sort_by_date:
|
|
params["sort"] = "date"
|
|
|
|
try:
|
|
response = await client.get(url, params=params)
|
|
response.raise_for_status()
|
|
|
|
data = response.json()
|
|
|
|
# 첫 번째 요청에서만 전체 정보 저장
|
|
if page == 0:
|
|
total_results_info = {
|
|
"total_results": data.get("searchInformation", {}).get("totalResults"),
|
|
"search_time": data.get("searchInformation", {}).get("searchTime"),
|
|
"query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms")
|
|
}
|
|
|
|
# 결과 추가
|
|
for item in data.get("items", []):
|
|
all_results.append({
|
|
"title": item.get("title"),
|
|
"link": item.get("link"),
|
|
"snippet": item.get("snippet"),
|
|
"display_link": item.get("displayLink"),
|
|
"thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None
|
|
})
|
|
|
|
except Exception as e:
|
|
# 첫 번째 요청이 실패하면 에러 반환
|
|
if page == 0:
|
|
return {
|
|
"error": str(e),
|
|
"results": []
|
|
}
|
|
# 두 번째 요청이 실패하면 첫 번째 결과만 반환
|
|
break
|
|
|
|
results = {
|
|
"query": total_results_info.get("query") if total_results_info else query,
|
|
"total_results": total_results_info.get("total_results") if total_results_info else "0",
|
|
"search_time": total_results_info.get("search_time") if total_results_info else 0,
|
|
"results": all_results[:num_results], # 요청한 개수만큼만 반환
|
|
"timestamp": datetime.utcnow().isoformat()
|
|
}
|
|
|
|
# 캐시 저장
|
|
self.redis_client.setex(
|
|
cache_key,
|
|
settings.cache_ttl,
|
|
json.dumps(results)
|
|
)
|
|
|
|
return results
|
|
|
|
async def search_with_serpapi(
|
|
self,
|
|
query: str,
|
|
num_results: int = 10,
|
|
language: str = None,
|
|
country: str = None
|
|
) -> Dict:
|
|
"""SerpAPI 사용 (유료 서비스)"""
|
|
if not settings.serpapi_key:
|
|
return {
|
|
"error": "SerpAPI key not configured",
|
|
"results": []
|
|
}
|
|
|
|
# 캐시 확인
|
|
cache_key = self._get_cache_key(query, num=num_results, lang=language, country=country)
|
|
cached = self.redis_client.get(cache_key)
|
|
if cached:
|
|
return json.loads(cached)
|
|
|
|
from serpapi import GoogleSearch
|
|
|
|
params = {
|
|
"q": query,
|
|
"api_key": settings.serpapi_key,
|
|
"num": num_results,
|
|
"hl": language or settings.default_language,
|
|
"gl": country or settings.default_country
|
|
}
|
|
|
|
try:
|
|
search = GoogleSearch(params)
|
|
results = search.get_dict()
|
|
|
|
formatted_results = self._format_serpapi_results(results)
|
|
|
|
# 캐시 저장
|
|
self.redis_client.setex(
|
|
cache_key,
|
|
settings.cache_ttl,
|
|
json.dumps(formatted_results)
|
|
)
|
|
|
|
return formatted_results
|
|
|
|
except Exception as e:
|
|
return {
|
|
"error": str(e),
|
|
"results": []
|
|
}
|
|
|
|
async def search_with_scraping(
|
|
self,
|
|
query: str,
|
|
num_results: int = 10,
|
|
language: str = None
|
|
) -> Dict:
|
|
"""웹 스크래핑으로 검색 (비추천, 제한적)"""
|
|
# 캐시 확인
|
|
cache_key = self._get_cache_key(query, num=num_results, lang=language)
|
|
cached = self.redis_client.get(cache_key)
|
|
if cached:
|
|
return json.loads(cached)
|
|
|
|
headers = {
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
|
}
|
|
|
|
params = {
|
|
"q": query,
|
|
"num": num_results,
|
|
"hl": language or settings.default_language
|
|
}
|
|
|
|
async with httpx.AsyncClient() as client:
|
|
try:
|
|
response = await client.get(
|
|
"https://www.google.com/search",
|
|
params=params,
|
|
headers=headers,
|
|
follow_redirects=True
|
|
)
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
results = self._parse_google_html(soup)
|
|
|
|
formatted_results = {
|
|
"query": query,
|
|
"total_results": len(results),
|
|
"results": results,
|
|
"timestamp": datetime.utcnow().isoformat()
|
|
}
|
|
|
|
# 캐시 저장
|
|
self.redis_client.setex(
|
|
cache_key,
|
|
settings.cache_ttl,
|
|
json.dumps(formatted_results)
|
|
)
|
|
|
|
return formatted_results
|
|
|
|
except Exception as e:
|
|
return {
|
|
"error": str(e),
|
|
"results": []
|
|
}
|
|
|
|
def _format_google_results(self, data: Dict) -> Dict:
|
|
"""Google API 결과 포맷팅"""
|
|
results = []
|
|
|
|
for item in data.get("items", []):
|
|
results.append({
|
|
"title": item.get("title"),
|
|
"link": item.get("link"),
|
|
"snippet": item.get("snippet"),
|
|
"display_link": item.get("displayLink"),
|
|
"thumbnail": item.get("pagemap", {}).get("cse_thumbnail", [{}])[0].get("src") if "pagemap" in item else None
|
|
})
|
|
|
|
return {
|
|
"query": data.get("queries", {}).get("request", [{}])[0].get("searchTerms"),
|
|
"total_results": data.get("searchInformation", {}).get("totalResults"),
|
|
"search_time": data.get("searchInformation", {}).get("searchTime"),
|
|
"results": results,
|
|
"timestamp": datetime.utcnow().isoformat()
|
|
}
|
|
|
|
def _format_serpapi_results(self, data: Dict) -> Dict:
|
|
"""SerpAPI 결과 포맷팅"""
|
|
results = []
|
|
|
|
for item in data.get("organic_results", []):
|
|
results.append({
|
|
"title": item.get("title"),
|
|
"link": item.get("link"),
|
|
"snippet": item.get("snippet"),
|
|
"position": item.get("position"),
|
|
"thumbnail": item.get("thumbnail"),
|
|
"date": item.get("date")
|
|
})
|
|
|
|
# 관련 검색어
|
|
related_searches = [
|
|
item.get("query") for item in data.get("related_searches", [])
|
|
]
|
|
|
|
return {
|
|
"query": data.get("search_parameters", {}).get("q"),
|
|
"total_results": data.get("search_information", {}).get("total_results"),
|
|
"search_time": data.get("search_information", {}).get("time_taken_displayed"),
|
|
"results": results,
|
|
"related_searches": related_searches,
|
|
"timestamp": datetime.utcnow().isoformat()
|
|
}
|
|
|
|
def _parse_google_html(self, soup: BeautifulSoup) -> List[Dict]:
|
|
"""HTML 파싱으로 검색 결과 추출"""
|
|
results = []
|
|
|
|
# 검색 결과 컨테이너 찾기
|
|
for g in soup.find_all('div', class_='g'):
|
|
anchors = g.find_all('a')
|
|
if anchors:
|
|
link = anchors[0].get('href', '')
|
|
title_elem = g.find('h3')
|
|
snippet_elem = g.find('span', class_='st') or g.find('div', class_='s')
|
|
|
|
if title_elem and link:
|
|
results.append({
|
|
"title": title_elem.get_text(),
|
|
"link": link,
|
|
"snippet": snippet_elem.get_text() if snippet_elem else ""
|
|
})
|
|
|
|
return results
|
|
|
|
    async def fetch_page_content(self, url: str) -> Dict:
        """Download *url* and extract its main textual content.

        Tries, in order: <article>, <main>, divs whose class name mentions
        content/article/post, then the whole <body>. Returns a dict with
        url / title / description / content (first 5000 chars) /
        content_length and ``success=True``, or
        ``{"url", "error", "success": False}`` on any failure (network error,
        non-2xx status, parse error).
        """
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }

            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(url, headers=headers, follow_redirects=True)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')

                # Remove boilerplate tags before text extraction
                for script in soup(["script", "style", "nav", "header", "footer"]):
                    script.decompose()

                # Try to locate the main content
                main_content = None

                # 1. <article> tag
                article = soup.find('article')
                if article:
                    main_content = article.get_text()

                # 2. <main> tag
                if not main_content:
                    main = soup.find('main')
                    if main:
                        main_content = main.get_text()

                # 3. Common content-ish divs (class contains content/article/post);
                #    join at most the first three matches.
                if not main_content:
                    content_divs = soup.find_all('div', class_=lambda x: x and ('content' in x.lower() or 'article' in x.lower() or 'post' in x.lower()))
                    if content_divs:
                        main_content = ' '.join([div.get_text() for div in content_divs[:3]])

                # 4. Fall back to the whole body (or whole document) text
                if not main_content:
                    body = soup.find('body')
                    if body:
                        main_content = body.get_text()
                    else:
                        main_content = soup.get_text()

                # Collapse all whitespace runs to single spaces
                main_content = ' '.join(main_content.split())

                # Page title
                title = soup.find('title')
                title_text = title.get_text() if title else ""

                # Meta description
                meta_desc = soup.find('meta', attrs={'name': 'description'})
                description = meta_desc.get('content', '') if meta_desc else ""

                return {
                    "url": url,
                    "title": title_text,
                    "description": description,
                    "content": main_content[:5000],  # cap at 5000 chars
                    "content_length": len(main_content),
                    "success": True
                }

        except Exception as e:
            return {
                "url": url,
                "error": str(e),
                "success": False
            }
|
|
|
|
    async def search_with_extended_snippet(
        self,
        query: str,
        num_results: int = 10,
        language: str = None,
        country: str = None
    ) -> Dict:
        """Search, then enrich each result's snippet by fetching the page
        (meta description + the first substantial chunk of body text).

        Delegates the search to ``search_with_custom_api`` and then fetches
        every result page concurrently. Each result gains ``extended`` (bool);
        failed fetches keep the original snippet and add ``fetch_error``.
        The returned dict is the search result with ``results`` replaced and
        ``snippet_extended: True`` added.
        """
        # Run the normal API search first
        search_results = await self.search_with_custom_api(
            query, num_results, language, country
        )

        if "error" in search_results:
            return search_results

        # Fetch an extended snippet for each result
        import asyncio

        async def fetch_extended_snippet(result):
            """Fetch an extended snippet for a single result page."""
            enhanced_result = result.copy()

            if result.get("link"):
                try:
                    headers = {
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                    }

                    async with httpx.AsyncClient(timeout=5.0) as client:
                        response = await client.get(result["link"], headers=headers, follow_redirects=True)
                        response.raise_for_status()

                        soup = BeautifulSoup(response.text, 'html.parser')

                        # Meta description, falling back to OpenGraph
                        meta_desc = soup.find('meta', attrs={'name': 'description'})
                        if not meta_desc:
                            meta_desc = soup.find('meta', attrs={'property': 'og:description'})

                        description = meta_desc.get('content', '') if meta_desc else ""

                        # Strip script/style before extracting body text
                        for script in soup(["script", "style"]):
                            script.decompose()

                        # Find the first substantial block of body text
                        text_content = ""
                        for tag in ['article', 'main', 'div']:
                            elements = soup.find_all(tag)
                            for elem in elements:
                                text = elem.get_text().strip()
                                if len(text) > 200:  # meaningful text only
                                    text_content = ' '.join(text.split())[:1000]
                                    break
                            if text_content:
                                break

                        # Merge with the original snippet: prepend the meta
                        # description if new, then pad with body text up to 500.
                        extended_snippet = result.get("snippet", "")
                        if description and description not in extended_snippet:
                            extended_snippet = description + " ... " + extended_snippet
                        if text_content and len(extended_snippet) < 500:
                            extended_snippet = extended_snippet + " ... " + text_content[:500-len(extended_snippet)]

                        enhanced_result["snippet"] = extended_snippet[:1000]  # cap at 1000 chars
                        enhanced_result["extended"] = True

                except Exception as e:
                    # On failure, keep the original snippet
                    enhanced_result["extended"] = False
                    enhanced_result["fetch_error"] = str(e)

            return enhanced_result

        # Process all result pages in parallel (order is preserved by gather)
        tasks = [fetch_extended_snippet(result) for result in search_results.get("results", [])]
        enhanced_results = await asyncio.gather(*tasks)

        return {
            **search_results,
            "results": enhanced_results,
            "snippet_extended": True
        }
|
|
|
|
async def search_with_full_content(
|
|
self,
|
|
query: str,
|
|
num_results: int = 5,
|
|
language: str = None,
|
|
country: str = None
|
|
) -> Dict:
|
|
"""검색 후 각 결과의 전체 내용 가져오기"""
|
|
# 먼저 일반 검색 수행
|
|
search_results = await self.search_with_custom_api(
|
|
query, num_results, language, country
|
|
)
|
|
|
|
if "error" in search_results:
|
|
return search_results
|
|
|
|
# 각 결과의 전체 내용 가져오기
|
|
enhanced_results = []
|
|
for result in search_results.get("results", [])[:num_results]:
|
|
# 원본 검색 결과 복사
|
|
enhanced_result = result.copy()
|
|
|
|
# 페이지 내용 가져오기
|
|
if result.get("link"):
|
|
content_data = await self.fetch_page_content(result["link"])
|
|
enhanced_result["full_content"] = content_data
|
|
|
|
enhanced_results.append(enhanced_result)
|
|
|
|
return {
|
|
**search_results,
|
|
"results": enhanced_results,
|
|
"content_fetched": True
|
|
}
|
|
|
|
async def get_trending_searches(self, country: str = None) -> Dict:
|
|
"""트렌딩 검색어 가져오기"""
|
|
# Google Trends 비공식 API 사용
|
|
url = f"https://trends.google.com/trends/api/dailytrends"
|
|
params = {
|
|
"geo": country or settings.default_country.upper()
|
|
}
|
|
|
|
async with httpx.AsyncClient() as client:
|
|
try:
|
|
response = await client.get(url, params=params)
|
|
# Google Trends API는 ")]}',\n"로 시작하는 응답을 반환
|
|
json_data = response.text[6:]
|
|
data = json.loads(json_data)
|
|
|
|
trending = []
|
|
for date_data in data.get("default", {}).get("trendingSearchesDays", []):
|
|
for search in date_data.get("trendingSearches", []):
|
|
trending.append({
|
|
"title": search.get("title", {}).get("query"),
|
|
"traffic": search.get("formattedTraffic"),
|
|
"articles": [
|
|
{
|
|
"title": article.get("title"),
|
|
"url": article.get("url"),
|
|
"source": article.get("source")
|
|
}
|
|
for article in search.get("articles", [])[:3]
|
|
]
|
|
})
|
|
|
|
return {
|
|
"country": country or settings.default_country,
|
|
"trending": trending[:10],
|
|
"timestamp": datetime.utcnow().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
return {
|
|
"error": str(e),
|
|
"trending": []
|
|
} |