"""Apache Solr client for search operations."""
import json
import logging
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import pysolr
logger = logging.getLogger(__name__)


class SolrClient:
    """Client for a single Apache Solr core.

    Wraps pysolr with helpers for indexing, edismax search (with
    highlighting and optional faceting), autocomplete suggestions,
    more-like-this lookups, deletion and index statistics.

    Error-handling convention: every public method except connect()
    catches all exceptions, logs them and returns a safe fallback value
    (False / 0 / empty collection / error dict) so callers never have to
    wrap calls in try/except. connect() re-raises so that startup
    failures are loud instead of deferred to first use.
    """

    def __init__(self, solr_url: str = "http://solr:8983/solr", core_name: str = "site11"):
        """Build the core URL and connect immediately.

        Args:
            solr_url: Base Solr URL, without the core segment.
            core_name: Core to target; appended to *solr_url*.

        Raises:
            Exception: propagated from connect() when Solr is unreachable.
        """
        self.solr_url = f"{solr_url}/{core_name}"
        self.core_name = core_name
        self.solr = None
        self.connect()

    def connect(self) -> None:
        """Create the pysolr client and verify connectivity with a ping.

        Raises:
            Exception: whatever pysolr raises when the core is down.
        """
        try:
            self.solr = pysolr.Solr(
                self.solr_url,
                always_commit=True,  # every add/delete is committed immediately
                timeout=10,
            )
            self.solr.ping()  # fail fast instead of on first query
            logger.info("Connected to Solr at %s", self.solr_url)
        except Exception as e:
            logger.error("Failed to connect to Solr: %s", e)
            raise

    def _prepare(self, document: Dict[str, Any], doc_type: Optional[str]) -> Dict[str, Any]:
        """Stamp doc_type, a stable id and the indexing timestamp in place.

        Fix: the generated id falls back to a "doc" prefix when doc_type
        is None, instead of producing ids like "None_42".
        """
        if doc_type:
            document["doc_type"] = doc_type
        if "id" not in document:
            document["id"] = f"{doc_type or 'doc'}_{document.get('_id', '')}"
        # Naive UTC ISO timestamp (no offset suffix) -- same wire format
        # as the deprecated datetime.utcnow() call this replaces.
        document["indexed_at"] = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        return document

    @staticmethod
    def _facet_pairs(values: List[Any]) -> List[tuple]:
        """Turn Solr's flat [value, count, value, count, ...] facet list into pairs."""
        return [(values[i], values[i + 1]) for i in range(0, len(values), 2)]

    def index_document(self, document: Dict[str, Any], doc_type: str = None) -> bool:
        """Index a single document after stamping metadata on it.

        Args:
            document: Document fields; mutated in place with id/doc_type/indexed_at.
            doc_type: Optional document type, also used as the id prefix.

        Returns:
            True on success, False on failure (error is logged).
        """
        try:
            self._prepare(document, doc_type)
            self.solr.add([document])
            logger.info("Indexed document: %s", document.get("id"))
            return True
        except Exception as e:
            logger.error("Failed to index document: %s", e)
            return False

    def bulk_index(self, documents: List[Dict[str, Any]], doc_type: str = None) -> int:
        """Index many documents in one Solr round trip.

        Args:
            documents: Documents to index; each is mutated in place.
            doc_type: Optional type applied to every document.

        Returns:
            Number of documents indexed, or 0 on failure.
        """
        try:
            for doc in documents:
                self._prepare(doc, doc_type)
            self.solr.add(documents)
            logger.info("Bulk indexed %d documents", len(documents))
            return len(documents)
        except Exception as e:
            logger.error("Failed to bulk index: %s", e)
            return 0

    def search(self, query: str, **kwargs) -> Dict[str, Any]:
        """Search documents with edismax, highlighting and optional facets.

        Args:
            query: Search query string.
            **kwargs: Additional search parameters:
                fq: Filter queries.
                fl: Fields to return (default '*,score').
                start: Starting offset (default 0).
                rows: Page size (default 10).
                sort: Sort order.
                facet: Truthy to enable faceting.
                facet_field: Fields to facet on (default doc_type/tags/status).

        Returns:
            Dict with 'total', 'documents', 'facets', 'highlighting';
            on failure {'total': 0, 'documents': [], 'error': str}.
        """
        try:
            params = {
                'q': query,
                'start': kwargs.get('start', 0),
                'rows': kwargs.get('rows', 10),
                'fl': kwargs.get('fl', '*,score'),
                'defType': 'edismax',
                'qf': 'title^3 content^2 tags description name',  # field boosts
                'mm': '2<-25%',  # minimum-should-match
                'hl': 'true',  # highlighting
                'hl.fl': 'title,content,description',
                'hl.simple.pre': '<mark>',
                'hl.simple.post': '</mark>',
            }
            if 'fq' in kwargs:
                params['fq'] = kwargs['fq']
            if 'sort' in kwargs:
                params['sort'] = kwargs['sort']
            if kwargs.get('facet'):
                params.update({
                    'facet': 'true',
                    'facet.field': kwargs.get('facet_field', ['doc_type', 'tags', 'status']),
                    'facet.mincount': 1,
                })

            results = self.solr.search(**params)

            response: Dict[str, Any] = {
                'total': results.hits,
                'documents': list(results.docs),
                'facets': {},
                'highlighting': {},
            }

            if hasattr(results, 'facets') and results.facets:
                for field, values in results.facets.get('facet_fields', {}).items():
                    response['facets'][field] = [
                        {'value': v, 'count': c} for v, c in self._facet_pairs(values)
                    ]

            if hasattr(results, 'highlighting'):
                response['highlighting'] = results.highlighting

            return response
        except Exception as e:
            logger.error("Search failed: %s", e)
            return {'total': 0, 'documents': [], 'error': str(e)}

    def suggest(self, prefix: str, field: str = "suggest", limit: int = 10) -> List[str]:
        """Return up to *limit* unique autocomplete values starting with *prefix*.

        NOTE(review): *prefix* is interpolated into the Solr query
        unescaped; Solr special characters in user input should be
        sanitized upstream -- confirm against callers.
        """
        try:
            results = self.solr.search(**{
                'q': f'{field}:{prefix}*',
                'fl': field,
                'rows': limit,
                'start': 0,
            })

            candidates: List[str] = []
            for doc in results.docs:
                if field in doc:
                    value = doc[field]
                    if isinstance(value, list):  # multi-valued field
                        candidates.extend(value)
                    else:
                        candidates.append(value)

            # Order-preserving de-duplication, capped at `limit`.
            seen = set()
            unique: List[str] = []
            for item in candidates:
                if item not in seen:
                    seen.add(item)
                    unique.append(item)
                    if len(unique) >= limit:
                        break
            return unique
        except Exception as e:
            logger.error("Suggest failed: %s", e)
            return []

    def more_like_this(self, doc_id: str, mlt_fields: List[str] = None, rows: int = 5) -> List[Dict]:
        """Find documents similar to *doc_id* via Solr's MoreLikeThis.

        Args:
            doc_id: id of the seed document.
            mlt_fields: Fields to compare (default title/content/tags/description).
            rows: Maximum number of similar documents.

        Returns:
            Similar documents, or [] when none are found or on error.
        """
        try:
            if not mlt_fields:
                mlt_fields = ['title', 'content', 'tags', 'description']

            params = {
                'q': f'id:{doc_id}',
                'mlt': 'true',
                'mlt.fl': ','.join(mlt_fields),
                'mlt.mindf': 1,  # allow terms that appear in only one doc
                'mlt.mintf': 1,
                'mlt.count': rows,
                'fl': '*,score',
            }

            results = self.solr.search(**params)

            # MLT hits live in a per-seed-id section of the response.
            if results.docs and hasattr(results, 'moreLikeThis'):
                return results.moreLikeThis.get(doc_id, {}).get('docs', [])
            return []
        except Exception as e:
            logger.error("More like this failed: %s", e)
            return []

    def delete_document(self, doc_id: str) -> bool:
        """Delete a single document by id; True on success, False on failure."""
        try:
            self.solr.delete(id=doc_id)
            logger.info("Deleted document: %s", doc_id)
            return True
        except Exception as e:
            logger.error("Failed to delete document: %s", e)
            return False

    def delete_by_query(self, query: str) -> bool:
        """Delete every document matching *query*; True on success."""
        try:
            self.solr.delete(q=query)
            logger.info("Deleted documents matching: %s", query)
            return True
        except Exception as e:
            logger.error("Failed to delete by query: %s", e)
            return False

    def clear_index(self) -> bool:
        """Delete ALL documents from the core; True on success."""
        try:
            self.solr.delete(q='*:*')
            logger.info("Cleared all documents from index")
            return True
        except Exception as e:
            logger.error("Failed to clear index: %s", e)
            return False

    def get_stats(self) -> Dict[str, Any]:
        """Return total document count plus per-doc_type and per-status counts.

        Returns:
            {'total_documents', 'doc_types', 'status_counts'} on success,
            {'error': str} on failure.
        """
        try:
            totals = self.solr.search(q='*:*', rows=0)

            facet_results = self.solr.search(
                q='*:*',
                rows=0,
                facet='true',
                # Dotted parameter name must go through a kwargs dict.
                **{'facet.field': ['doc_type', 'status']},
            )

            stats: Dict[str, Any] = {
                'total_documents': totals.hits,
                'doc_types': {},
                'status_counts': {},
            }

            if hasattr(facet_results, 'facets') and facet_results.facets:
                facet_fields = facet_results.facets.get('facet_fields', {})
                stats['doc_types'] = dict(self._facet_pairs(facet_fields.get('doc_type', [])))
                stats['status_counts'] = dict(self._facet_pairs(facet_fields.get('status', [])))

            return stats
        except Exception as e:
            logger.error("Failed to get stats: %s", e)
            return {'error': str(e)}

    def optimize_index(self) -> bool:
        """Run a Solr index optimize (segment merge); True on success."""
        try:
            self.solr.optimize()
            logger.info("Index optimized")
            return True
        except Exception as e:
            logger.error("Failed to optimize index: %s", e)
            return False