Initial commit - cleaned repository
This commit is contained in:
303
services/search/backend/solr_client.py
Normal file
303
services/search/backend/solr_client.py
Normal file
@ -0,0 +1,303 @@
|
||||
"""
|
||||
Apache Solr client for search operations
|
||||
"""
|
||||
import json
import logging
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import pysolr
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SolrClient:
    """Thin wrapper around a pysolr connection for a single Solr core."""

    def __init__(self, solr_url: str = "http://solr:8983/solr", core_name: str = "site11"):
        """Build the core URL and open the connection right away.

        Args:
            solr_url: Base URL of the Solr instance (no core suffix).
            core_name: Name of the core this client targets.
        """
        self.core_name = core_name
        self.solr_url = f"{solr_url}/{core_name}"
        # Populated by connect(); kept as an attribute so a failed
        # connection leaves a well-defined (None) state.
        self.solr = None
        self.connect()
|
||||
|
||||
def connect(self):
|
||||
"""Connect to Solr instance"""
|
||||
try:
|
||||
self.solr = pysolr.Solr(
|
||||
self.solr_url,
|
||||
always_commit=True,
|
||||
timeout=10
|
||||
)
|
||||
# Test connection
|
||||
self.solr.ping()
|
||||
logger.info(f"Connected to Solr at {self.solr_url}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to connect to Solr: {e}")
|
||||
raise
|
||||
|
||||
def index_document(self, document: Dict[str, Any], doc_type: str = None) -> bool:
|
||||
"""Index a single document"""
|
||||
try:
|
||||
# Add metadata
|
||||
if doc_type:
|
||||
document["doc_type"] = doc_type
|
||||
|
||||
if "id" not in document:
|
||||
document["id"] = f"{doc_type}_{document.get('_id', '')}"
|
||||
|
||||
# Add indexing timestamp
|
||||
document["indexed_at"] = datetime.utcnow().isoformat()
|
||||
|
||||
# Index the document
|
||||
self.solr.add([document])
|
||||
logger.info(f"Indexed document: {document.get('id')}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to index document: {e}")
|
||||
return False
|
||||
|
||||
def bulk_index(self, documents: List[Dict[str, Any]], doc_type: str = None) -> int:
|
||||
"""Bulk index multiple documents"""
|
||||
try:
|
||||
indexed = 0
|
||||
for doc in documents:
|
||||
if doc_type:
|
||||
doc["doc_type"] = doc_type
|
||||
|
||||
if "id" not in doc:
|
||||
doc["id"] = f"{doc_type}_{doc.get('_id', '')}"
|
||||
|
||||
doc["indexed_at"] = datetime.utcnow().isoformat()
|
||||
|
||||
self.solr.add(documents)
|
||||
indexed = len(documents)
|
||||
logger.info(f"Bulk indexed {indexed} documents")
|
||||
return indexed
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to bulk index: {e}")
|
||||
return 0
|
||||
|
||||
def search(self, query: str, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Search documents
|
||||
|
||||
Args:
|
||||
query: Search query string
|
||||
**kwargs: Additional search parameters
|
||||
- fq: Filter queries
|
||||
- fl: Fields to return
|
||||
- start: Starting offset
|
||||
- rows: Number of rows
|
||||
- sort: Sort order
|
||||
- facet: Enable faceting
|
||||
- facet.field: Fields to facet on
|
||||
"""
|
||||
try:
|
||||
# Default parameters
|
||||
params = {
|
||||
'q': query,
|
||||
'start': kwargs.get('start', 0),
|
||||
'rows': kwargs.get('rows', 10),
|
||||
'fl': kwargs.get('fl', '*,score'),
|
||||
'defType': 'edismax',
|
||||
'qf': 'title^3 content^2 tags description name', # Boost fields
|
||||
'mm': '2<-25%', # Minimum match
|
||||
'hl': 'true', # Highlighting
|
||||
'hl.fl': 'title,content,description',
|
||||
'hl.simple.pre': '<mark>',
|
||||
'hl.simple.post': '</mark>'
|
||||
}
|
||||
|
||||
# Add filter queries
|
||||
if 'fq' in kwargs:
|
||||
params['fq'] = kwargs['fq']
|
||||
|
||||
# Add sorting
|
||||
if 'sort' in kwargs:
|
||||
params['sort'] = kwargs['sort']
|
||||
|
||||
# Add faceting
|
||||
if kwargs.get('facet'):
|
||||
params.update({
|
||||
'facet': 'true',
|
||||
'facet.field': kwargs.get('facet_field', ['doc_type', 'tags', 'status']),
|
||||
'facet.mincount': 1
|
||||
})
|
||||
|
||||
# Execute search
|
||||
results = self.solr.search(**params)
|
||||
|
||||
# Format response
|
||||
response = {
|
||||
'total': results.hits,
|
||||
'documents': [],
|
||||
'facets': {},
|
||||
'highlighting': {}
|
||||
}
|
||||
|
||||
# Add documents
|
||||
for doc in results.docs:
|
||||
response['documents'].append(doc)
|
||||
|
||||
# Add facets if available
|
||||
if hasattr(results, 'facets') and results.facets:
|
||||
if 'facet_fields' in results.facets:
|
||||
for field, values in results.facets['facet_fields'].items():
|
||||
response['facets'][field] = [
|
||||
{'value': values[i], 'count': values[i+1]}
|
||||
for i in range(0, len(values), 2)
|
||||
]
|
||||
|
||||
# Add highlighting if available
|
||||
if hasattr(results, 'highlighting'):
|
||||
response['highlighting'] = results.highlighting
|
||||
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Search failed: {e}")
|
||||
return {'total': 0, 'documents': [], 'error': str(e)}
|
||||
|
||||
def suggest(self, prefix: str, field: str = "suggest", limit: int = 10) -> List[str]:
|
||||
"""Get autocomplete suggestions"""
|
||||
try:
|
||||
params = {
|
||||
'q': f'{field}:{prefix}*',
|
||||
'fl': field,
|
||||
'rows': limit,
|
||||
'start': 0
|
||||
}
|
||||
|
||||
results = self.solr.search(**params)
|
||||
suggestions = []
|
||||
|
||||
for doc in results.docs:
|
||||
if field in doc:
|
||||
value = doc[field]
|
||||
if isinstance(value, list):
|
||||
suggestions.extend(value)
|
||||
else:
|
||||
suggestions.append(value)
|
||||
|
||||
# Remove duplicates and limit
|
||||
seen = set()
|
||||
unique_suggestions = []
|
||||
for s in suggestions:
|
||||
if s not in seen:
|
||||
seen.add(s)
|
||||
unique_suggestions.append(s)
|
||||
if len(unique_suggestions) >= limit:
|
||||
break
|
||||
|
||||
return unique_suggestions
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Suggest failed: {e}")
|
||||
return []
|
||||
|
||||
def more_like_this(self, doc_id: str, mlt_fields: List[str] = None, rows: int = 5) -> List[Dict]:
|
||||
"""Find similar documents"""
|
||||
try:
|
||||
if not mlt_fields:
|
||||
mlt_fields = ['title', 'content', 'tags', 'description']
|
||||
|
||||
params = {
|
||||
'q': f'id:{doc_id}',
|
||||
'mlt': 'true',
|
||||
'mlt.fl': ','.join(mlt_fields),
|
||||
'mlt.mindf': 1,
|
||||
'mlt.mintf': 1,
|
||||
'mlt.count': rows,
|
||||
'fl': '*,score'
|
||||
}
|
||||
|
||||
results = self.solr.search(**params)
|
||||
|
||||
if results.docs:
|
||||
# The MLT results are in the moreLikeThis section
|
||||
if hasattr(results, 'moreLikeThis'):
|
||||
mlt_results = results.moreLikeThis.get(doc_id, {})
|
||||
if 'docs' in mlt_results:
|
||||
return mlt_results['docs']
|
||||
|
||||
return []
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"More like this failed: {e}")
|
||||
return []
|
||||
|
||||
def delete_document(self, doc_id: str) -> bool:
|
||||
"""Delete a document by ID"""
|
||||
try:
|
||||
self.solr.delete(id=doc_id)
|
||||
logger.info(f"Deleted document: {doc_id}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete document: {e}")
|
||||
return False
|
||||
|
||||
def delete_by_query(self, query: str) -> bool:
|
||||
"""Delete documents matching a query"""
|
||||
try:
|
||||
self.solr.delete(q=query)
|
||||
logger.info(f"Deleted documents matching: {query}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete by query: {e}")
|
||||
return False
|
||||
|
||||
def clear_index(self) -> bool:
|
||||
"""Clear all documents from index"""
|
||||
try:
|
||||
self.solr.delete(q='*:*')
|
||||
logger.info("Cleared all documents from index")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to clear index: {e}")
|
||||
return False
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
"""Get index statistics"""
|
||||
try:
|
||||
# Get document count
|
||||
results = self.solr.search(q='*:*', rows=0)
|
||||
|
||||
# Get facet counts for doc_type
|
||||
facet_results = self.solr.search(
|
||||
q='*:*',
|
||||
rows=0,
|
||||
facet='true',
|
||||
**{'facet.field': ['doc_type', 'status']}
|
||||
)
|
||||
|
||||
stats = {
|
||||
'total_documents': results.hits,
|
||||
'doc_types': {},
|
||||
'status_counts': {}
|
||||
}
|
||||
|
||||
if hasattr(facet_results, 'facets') and facet_results.facets:
|
||||
if 'facet_fields' in facet_results.facets:
|
||||
# Parse doc_type facets
|
||||
doc_type_facets = facet_results.facets['facet_fields'].get('doc_type', [])
|
||||
for i in range(0, len(doc_type_facets), 2):
|
||||
stats['doc_types'][doc_type_facets[i]] = doc_type_facets[i+1]
|
||||
|
||||
# Parse status facets
|
||||
status_facets = facet_results.facets['facet_fields'].get('status', [])
|
||||
for i in range(0, len(status_facets), 2):
|
||||
stats['status_counts'][status_facets[i]] = status_facets[i+1]
|
||||
|
||||
return stats
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get stats: {e}")
|
||||
return {'error': str(e)}
|
||||
|
||||
def optimize_index(self) -> bool:
|
||||
"""Optimize the Solr index"""
|
||||
try:
|
||||
self.solr.optimize()
|
||||
logger.info("Index optimized")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to optimize index: {e}")
|
||||
return False
|
||||
Reference in New Issue
Block a user