Files
site11/services/search/backend/solr_client.py
jungwoo choi dd165454f0 feat: Add Step 13 - Search System with Apache Solr and Data Persistence
- Implemented search service with Apache Solr instead of Elasticsearch
- Added full-text search, faceted search, and autocomplete capabilities
- Created data indexer for synchronizing data from MongoDB/Kafka to Solr
- Configured external volume mounts for all data services:
  - MongoDB, Redis, Kafka, Zookeeper, MinIO, Solr
  - All data now persists in ./data/ directory
- Added comprehensive search API endpoints
- Created documentation for data persistence and backup strategies

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-11 20:27:02 +09:00

303 lines
10 KiB
Python

"""
Apache Solr client for search operations
"""
import json
import logging
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import pysolr
logger = logging.getLogger(__name__)
class SolrClient:
    """Client wrapper around a single Apache Solr core.

    Offers indexing, full-text search, autocomplete, similarity lookup,
    deletion and maintenance operations on top of ``pysolr``.
    """

    def __init__(self, solr_url: str = "http://solr:8983/solr", core_name: str = "site11"):
        """Build the core URL and connect immediately (pings the core)."""
        self.core_name = core_name
        self.solr_url = f"{solr_url}/{core_name}"
        self.solr = None
        self.connect()
def connect(self):
    """Create the pysolr client and verify the core answers a ping.

    Raises:
        Exception: re-raised from pysolr after logging when the core
            is unreachable.
    """
    try:
        # always_commit keeps the index visible immediately after writes.
        self.solr = pysolr.Solr(self.solr_url, always_commit=True, timeout=10)
        self.solr.ping()  # fail fast if the core is down
        logger.info(f"Connected to Solr at {self.solr_url}")
    except Exception as err:
        logger.error(f"Failed to connect to Solr: {err}")
        raise
def index_document(self, document: Dict[str, Any], doc_type: str = None) -> bool:
    """Index a single document into Solr.

    Mutates *document* in place: stamps ``doc_type`` (when given), a
    synthetic ``id`` (when missing) and an ``indexed_at`` timestamp.

    Args:
        document: Field dict to index.
        doc_type: Optional logical type used to tag the doc and prefix its id.

    Returns:
        True on success, False on failure (the error is logged).
    """
    try:
        if doc_type:
            document["doc_type"] = doc_type
        if "id" not in document:
            # Derive a stable id from the Mongo _id. Only prefix with the
            # doc type when one was supplied — the previous code emitted a
            # literal "None_" prefix when doc_type was omitted.
            base = document.get("_id", "")
            document["id"] = f"{doc_type}_{base}" if doc_type else f"{base}"
        # Solr date fields require UTC ISO-8601 with a trailing 'Z';
        # utcnow().isoformat() omitted it (and utcnow() is deprecated).
        document["indexed_at"] = datetime.now(timezone.utc).strftime(
            "%Y-%m-%dT%H:%M:%SZ"
        )
        self.solr.add([document])
        logger.info(f"Indexed document: {document.get('id')}")
        return True
    except Exception as e:
        logger.error(f"Failed to index document: {e}")
        return False
def bulk_index(self, documents: List[Dict[str, Any]], doc_type: str = None) -> int:
    """Bulk index multiple documents in a single Solr add call.

    Each document is mutated in place with ``doc_type`` (when given), a
    synthetic ``id`` (when missing) and a shared ``indexed_at`` stamp.

    Args:
        documents: Documents to index.
        doc_type: Optional logical type used to tag and prefix ids.

    Returns:
        Number of documents submitted, or 0 on failure (error is logged).
    """
    try:
        # One timestamp for the whole batch — Solr date fields require
        # UTC ISO-8601 with a trailing 'Z' (utcnow().isoformat() lacked it).
        stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        for doc in documents:
            if doc_type:
                doc["doc_type"] = doc_type
            if "id" not in doc:
                # Same id scheme as index_document; no "None_" prefix when
                # doc_type is omitted.
                base = doc.get("_id", "")
                doc["id"] = f"{doc_type}_{base}" if doc_type else f"{base}"
            doc["indexed_at"] = stamp
        self.solr.add(documents)
        indexed = len(documents)
        logger.info(f"Bulk indexed {indexed} documents")
        return indexed
    except Exception as e:
        logger.error(f"Failed to bulk index: {e}")
        return 0
def search(self, query: str, **kwargs) -> Dict[str, Any]:
    """Run a full-text edismax query against the core.

    Args:
        query: Search query string.
        **kwargs: Optional overrides —
            fq: filter queries,
            fl: fields to return (default ``*,score``),
            start: starting offset (default 0),
            rows: page size (default 10),
            sort: sort order,
            facet: enable faceting,
            facet_field: fields to facet on
                (default ``['doc_type', 'tags', 'status']``).

    Returns:
        Dict with 'total', 'documents', 'facets' and 'highlighting';
        on failure 'total' is 0 and an 'error' message is included.
    """
    try:
        params = {
            'q': query,
            'start': kwargs.get('start', 0),
            'rows': kwargs.get('rows', 10),
            'fl': kwargs.get('fl', '*,score'),
            'defType': 'edismax',
            'qf': 'title^3 content^2 tags description name',  # field boosts
            'mm': '2<-25%',  # minimum-should-match
            'hl': 'true',  # snippet highlighting
            'hl.fl': 'title,content,description',
            'hl.simple.pre': '<mark>',
            'hl.simple.post': '</mark>',
        }
        # Optional pass-through parameters.
        if 'fq' in kwargs:
            params['fq'] = kwargs['fq']
        if 'sort' in kwargs:
            params['sort'] = kwargs['sort']
        if kwargs.get('facet'):
            params['facet'] = 'true'
            params['facet.field'] = kwargs.get(
                'facet_field', ['doc_type', 'tags', 'status']
            )
            params['facet.mincount'] = 1

        results = self.solr.search(**params)

        response = {
            'total': results.hits,
            'documents': list(results.docs),
            'facets': {},
            'highlighting': {},
        }
        facet_data = getattr(results, 'facets', None)
        if facet_data and 'facet_fields' in facet_data:
            # Solr flattens facet counts as [value, count, value, count, ...]
            for field, flat in facet_data['facet_fields'].items():
                response['facets'][field] = [
                    {'value': v, 'count': c}
                    for v, c in zip(flat[::2], flat[1::2])
                ]
        if hasattr(results, 'highlighting'):
            response['highlighting'] = results.highlighting
        return response
    except Exception as e:
        logger.error(f"Search failed: {e}")
        return {'total': 0, 'documents': [], 'error': str(e)}
def suggest(self, prefix: str, field: str = "suggest", limit: int = 10) -> List[str]:
    """Return up to *limit* distinct autocomplete values for *prefix*.

    Uses a simple wildcard query (``field:prefix*``) rather than a
    dedicated suggester component.

    Args:
        prefix: Leading characters typed by the user.
        field: Field to match and read values from.
        limit: Maximum number of suggestions.

    Returns:
        Ordered, de-duplicated suggestions; empty list on error.
    """
    try:
        results = self.solr.search(
            q=f'{field}:{prefix}*',
            fl=field,
            rows=limit,
            start=0,
        )
        collected = []
        for doc in results.docs:
            if field not in doc:
                continue
            value = doc[field]
            # Field may be single- or multi-valued.
            collected.extend(value if isinstance(value, list) else [value])
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(collected))[:limit]
    except Exception as e:
        logger.error(f"Suggest failed: {e}")
        return []
def more_like_this(self, doc_id: str, mlt_fields: List[str] = None, rows: int = 5) -> List[Dict]:
    """Return documents similar to *doc_id* via Solr's MoreLikeThis.

    Args:
        doc_id: Solr id of the seed document.
        mlt_fields: Fields to compare on; defaults to
            title/content/tags/description.
        rows: Maximum number of similar documents to return.

    Returns:
        Similar documents; empty list when the seed is missing, no MLT
        section is present, or an error occurred.
    """
    try:
        fields = mlt_fields if mlt_fields else ['title', 'content', 'tags', 'description']
        results = self.solr.search(
            q=f'id:{doc_id}',
            mlt='true',
            fl='*,score',
            **{
                'mlt.fl': ','.join(fields),
                'mlt.mindf': 1,
                'mlt.mintf': 1,
                'mlt.count': rows,
            },
        )
        # pysolr exposes the moreLikeThis response section keyed by seed id
        # (assumes the pysolr Results object carries it — TODO confirm).
        if results.docs and hasattr(results, 'moreLikeThis'):
            similar = results.moreLikeThis.get(doc_id, {})
            if 'docs' in similar:
                return similar['docs']
        return []
    except Exception as e:
        logger.error(f"More like this failed: {e}")
        return []
def delete_document(self, doc_id: str) -> bool:
    """Delete one document by its Solr id.

    Returns:
        True when the delete was accepted, False on error (logged).
    """
    try:
        self.solr.delete(id=doc_id)
        logger.info(f"Deleted document: {doc_id}")
        return True
    except Exception as err:
        logger.error(f"Failed to delete document: {err}")
        return False
def delete_by_query(self, query: str) -> bool:
    """Delete every document matching a Solr query string.

    Returns:
        True when the delete was accepted, False on error (logged).
    """
    try:
        self.solr.delete(q=query)
        logger.info(f"Deleted documents matching: {query}")
        return True
    except Exception as err:
        logger.error(f"Failed to delete by query: {err}")
        return False
def clear_index(self) -> bool:
    """Remove every document from the core (match-all delete).

    Returns:
        True when the wipe was accepted, False on error (logged).
    """
    try:
        self.solr.delete(q='*:*')
        logger.info("Cleared all documents from index")
        return True
    except Exception as err:
        logger.error(f"Failed to clear index: {err}")
        return False
def get_stats(self) -> Dict[str, Any]:
    """Collect index statistics: total docs plus per-facet breakdowns.

    Issues two queries: a match-all count, then a faceted match-all for
    the ``doc_type`` and ``status`` fields.

    Returns:
        Dict with 'total_documents', 'doc_types' and 'status_counts';
        on failure a dict containing only an 'error' message.
    """
    try:
        totals = self.solr.search(q='*:*', rows=0)
        faceted = self.solr.search(
            q='*:*',
            rows=0,
            facet='true',
            **{'facet.field': ['doc_type', 'status']},
        )
        stats = {
            'total_documents': totals.hits,
            'doc_types': {},
            'status_counts': {},
        }
        facet_data = getattr(faceted, 'facets', None)
        if facet_data and 'facet_fields' in facet_data:
            fields = facet_data['facet_fields']
            # Solr flattens facet counts as [value, count, value, count, ...]
            for target, name in (('doc_types', 'doc_type'),
                                 ('status_counts', 'status')):
                flat = fields.get(name, [])
                stats[target] = dict(zip(flat[::2], flat[1::2]))
        return stats
    except Exception as e:
        logger.error(f"Failed to get stats: {e}")
        return {'error': str(e)}
def optimize_index(self) -> bool:
    """Ask Solr to optimize (merge/compact) the index.

    Returns:
        True when the optimize call succeeded, False on error (logged).
    """
    try:
        self.solr.optimize()
        logger.info("Index optimized")
        return True
    except Exception as err:
        logger.error(f"Failed to optimize index: {err}")
        return False