"""
Apache Solr client for search operations
"""
import pysolr
import logging
from typing import Dict, List, Any, Optional
from datetime import datetime, timezone
import json

logger = logging.getLogger(__name__)

# Characters with special meaning to Solr's standard (Lucene) query parser.
_SOLR_SPECIAL_CHARS = frozenset('+-&|!(){}[]^"~*?:\\/')


def _escape_solr_term(value: Any) -> str:
    """Backslash-escape query-syntax characters and whitespace in *value*.

    Lets caller-supplied text be embedded in a ``field:term`` clause without
    being interpreted as query operators or split into several clauses.
    """
    return "".join(
        "\\" + ch if ch in _SOLR_SPECIAL_CHARS or ch.isspace() else ch
        for ch in str(value)
    )


def _pair_facet_counts(flat: Any) -> List[tuple]:
    """Turn a flat ``[value, count, value, count, ...]`` list into pairs.

    A trailing unpaired element (malformed response) is dropped instead of
    raising ``IndexError``.
    """
    flat = list(flat or [])
    return list(zip(flat[::2], flat[1::2]))


def _utc_timestamp() -> str:
    """Return the current UTC time as a naive ISO-8601 string.

    Produces the same text as the deprecated ``datetime.utcnow().isoformat()``
    (no UTC offset suffix).
    """
    # NOTE(review): if `indexed_at` is mapped to a Solr date field, Solr may
    # require a trailing 'Z' - confirm against the core's schema.
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


class SolrClient:
    """Thin wrapper around ``pysolr.Solr`` for one core.

    Public methods (except ``connect``) catch all exceptions, log them and
    return a failure value (``False``, ``0``, ``[]`` or a dict with an
    ``error`` key) instead of raising.
    """

    def __init__(self, solr_url: str = "http://solr:8983/solr", core_name: str = "site11"):
        """Build the core URL and connect immediately.

        Args:
            solr_url: Base Solr URL (without the core name).
            core_name: Name of the core to operate on.

        Raises:
            Exception: Whatever ``connect`` raises when Solr is unreachable.
        """
        # Strip a trailing slash so the joined URL never contains "//".
        self.solr_url = f"{solr_url.rstrip('/')}/{core_name}"
        self.core_name = core_name
        self.solr: Optional[pysolr.Solr] = None
        self.connect()

    def connect(self):
        """Connect to Solr instance and verify it with a ping.

        Raises:
            Exception: Re-raised after logging if the client cannot be
                created or the ping fails.
        """
        try:
            self.solr = pysolr.Solr(
                self.solr_url,
                always_commit=True,
                timeout=10
            )
            # Test connection
            self.solr.ping()
            logger.info("Connected to Solr at %s", self.solr_url)
        except Exception as e:
            logger.error("Failed to connect to Solr: %s", e)
            raise

    @staticmethod
    def _prepare_document(document: Dict[str, Any], doc_type: Optional[str]) -> Dict[str, Any]:
        """Add ``doc_type``, ``id`` and ``indexed_at`` to *document* in place.

        ``doc_type`` is only set when given; a missing ``id`` is generated as
        ``"<doc_type>_<_id>"``; ``indexed_at`` is always overwritten.
        """
        if doc_type:
            document["doc_type"] = doc_type
        if "id" not in document:
            # NOTE(review): without a doc_type the generated id starts with
            # "None_"; the format is kept unchanged so ids stay stable -
            # confirm this is intended.
            document["id"] = f"{doc_type}_{document.get('_id', '')}"
        document["indexed_at"] = _utc_timestamp()
        return document

    def index_document(self, document: Dict[str, Any], doc_type: Optional[str] = None) -> bool:
        """Index a single document.

        The passed dict is modified in place (see ``_prepare_document``).

        Args:
            document: Document fields to index.
            doc_type: Optional value stored in the ``doc_type`` field.

        Returns:
            True on success, False if preparation or the add request failed.
        """
        try:
            self._prepare_document(document, doc_type)
            self.solr.add([document])
            logger.info("Indexed document: %s", document.get('id'))
            return True
        except Exception as e:
            logger.error("Failed to index document: %s", e)
            return False

    def bulk_index(self, documents: List[Dict[str, Any]], doc_type: Optional[str] = None) -> int:
        """Bulk index multiple documents in one add request.

        Each dict is modified in place (see ``_prepare_document``).

        Args:
            documents: Documents to index.
            doc_type: Optional value stored in each document's ``doc_type``.

        Returns:
            Number of documents sent, or 0 on failure or for an empty batch.
        """
        if not documents:
            # Nothing to send; avoid a pointless request.
            return 0
        try:
            for doc in documents:
                self._prepare_document(doc, doc_type)
            self.solr.add(documents)
            indexed = len(documents)
            logger.info("Bulk indexed %s documents", indexed)
            return indexed
        except Exception as e:
            logger.error("Failed to bulk index: %s", e)
            return 0

    def search(self, query: str, **kwargs) -> Dict[str, Any]:
        """
        Search documents

        Args:
            query: Search query string
            **kwargs: Additional search parameters
                - fq: Filter queries
                - fl: Fields to return
                - start: Starting offset
                - rows: Number of rows
                - sort: Sort order
                - facet: Enable faceting
                - facet_field / facet.field: Fields to facet on
                  (``facet_field`` wins when both are given)

        Returns:
            Dict with ``total``, ``documents``, ``facets`` and
            ``highlighting``; on failure the same keys with empty values
            plus ``error``.
        """
        try:
            # Default parameters
            params = {
                'q': query,
                'start': kwargs.get('start', 0),
                'rows': kwargs.get('rows', 10),
                'fl': kwargs.get('fl', '*,score'),
                'defType': 'edismax',
                'qf': 'title^3 content^2 tags description name',  # Boost fields
                'mm': '2<-25%',  # Minimum match
                'hl': 'true',  # Highlighting
                'hl.fl': 'title,content,description',
                'hl.simple.pre': '',
                'hl.simple.post': ''
            }

            # Add filter queries
            if 'fq' in kwargs:
                params['fq'] = kwargs['fq']

            # Add sorting
            if 'sort' in kwargs:
                params['sort'] = kwargs['sort']

            # Add faceting
            if kwargs.get('facet'):
                default_fields = ['doc_type', 'tags', 'status']
                # Accept the documented dotted key as well as the
                # identifier-friendly spelling.
                facet_fields = kwargs.get(
                    'facet_field', kwargs.get('facet.field', default_fields)
                )
                params.update({
                    'facet': 'true',
                    'facet.field': facet_fields,
                    'facet.mincount': 1
                })

            # Execute search
            results = self.solr.search(**params)

            # Format response
            response = {
                'total': results.hits,
                'documents': list(results.docs),
                'facets': {},
                'highlighting': {}
            }

            # Add facets if available
            facet_data = getattr(results, 'facets', None) or {}
            for field, values in facet_data.get('facet_fields', {}).items():
                response['facets'][field] = [
                    {'value': value, 'count': count}
                    for value, count in _pair_facet_counts(values)
                ]

            # Add highlighting if available
            if hasattr(results, 'highlighting'):
                response['highlighting'] = results.highlighting

            return response

        except Exception as e:
            logger.error("Search failed: %s", e)
            # Same keys as the success path so callers can index blindly.
            return {
                'total': 0,
                'documents': [],
                'facets': {},
                'highlighting': {},
                'error': str(e)
            }

    def suggest(self, prefix: str, field: str = "suggest", limit: int = 10) -> List[str]:
        """Get autocomplete suggestions via a prefix query on *field*.

        Args:
            prefix: Text typed so far; query-syntax characters are escaped.
            field: Field to match against and read suggestions from.
            limit: Maximum number of rows fetched and suggestions returned.

        Returns:
            Up to *limit* distinct values in result order, or [] on failure.
        """
        try:
            params = {
                # Escape the prefix so user input cannot alter the query.
                'q': f'{field}:{_escape_solr_term(prefix)}*',
                'fl': field,
                'rows': limit,
                'start': 0
            }

            results = self.solr.search(**params)

            suggestions = []
            for doc in results.docs:
                if field in doc:
                    value = doc[field]
                    if isinstance(value, list):
                        suggestions.extend(value)
                    else:
                        suggestions.append(value)

            # Remove duplicates (keeping first occurrence) and limit
            seen = set()
            unique_suggestions = []
            for s in suggestions:
                if s not in seen:
                    seen.add(s)
                    unique_suggestions.append(s)
                    if len(unique_suggestions) >= limit:
                        break

            return unique_suggestions

        except Exception as e:
            logger.error("Suggest failed: %s", e)
            return []

    def more_like_this(self, doc_id: str, mlt_fields: Optional[List[str]] = None, rows: int = 5) -> List[Dict]:
        """Find similar documents using the MoreLikeThis search component.

        Args:
            doc_id: Id of the reference document.
            mlt_fields: Fields used for similarity; defaults to title,
                content, tags and description.
            rows: Maximum number of similar documents requested.

        Returns:
            The similar documents, or [] when the reference document is not
            found, has no matches, or the request fails.
        """
        try:
            if not mlt_fields:
                mlt_fields = ['title', 'content', 'tags', 'description']

            params = {
                # Escape the id so values containing ':' etc. still parse.
                'q': f'id:{_escape_solr_term(doc_id)}',
                'mlt': 'true',
                'mlt.fl': ','.join(mlt_fields),
                'mlt.mindf': 1,
                'mlt.mintf': 1,
                'mlt.count': rows,
                'fl': '*,score'
            }

            results = self.solr.search(**params)
            if not results.docs:
                return []

            # pysolr's Results does not expose the MLT section as an
            # attribute; it is only present in the decoded raw response.
            mlt_section = getattr(results, 'moreLikeThis', None)
            if mlt_section is None:
                raw_response = getattr(results, 'raw_response', None) or {}
                mlt_section = raw_response.get('moreLikeThis', {})

            # Depending on the Solr version / json.nl setting the section is
            # either a map or a flat [id, result, id, result, ...] list.
            if isinstance(mlt_section, (list, tuple)):
                mlt_section = dict(zip(mlt_section[::2], mlt_section[1::2]))

            mlt_results = mlt_section.get(doc_id) or {}
            return mlt_results.get('docs', [])

        except Exception as e:
            logger.error("More like this failed: %s", e)
            return []

    def delete_document(self, doc_id: str) -> bool:
        """Delete a document by ID.

        Returns:
            True on success, False if the delete request failed.
        """
        try:
            self.solr.delete(id=doc_id)
            logger.info("Deleted document: %s", doc_id)
            return True
        except Exception as e:
            logger.error("Failed to delete document: %s", e)
            return False

    def delete_by_query(self, query: str) -> bool:
        """Delete documents matching a query.

        The query is passed to Solr unmodified.

        Returns:
            True on success, False if the delete request failed.
        """
        try:
            self.solr.delete(q=query)
            logger.info("Deleted documents matching: %s", query)
            return True
        except Exception as e:
            logger.error("Failed to delete by query: %s", e)
            return False

    def clear_index(self) -> bool:
        """Clear all documents from index.

        Returns:
            True on success, False if the delete request failed.
        """
        try:
            self.solr.delete(q='*:*')
            logger.info("Cleared all documents from index")
            return True
        except Exception as e:
            logger.error("Failed to clear index: %s", e)
            return False

    def get_stats(self) -> Dict[str, Any]:
        """Get index statistics.

        Returns:
            Dict with ``total_documents``, ``doc_types`` and
            ``status_counts`` (value -> count), or ``{'error': ...}`` on
            failure.
        """
        try:
            # One rows=0 facet query yields both the total hit count and
            # the per-field counts.
            results = self.solr.search(
                q='*:*',
                rows=0,
                facet='true',
                **{'facet.field': ['doc_type', 'status']}
            )

            facet_data = getattr(results, 'facets', None) or {}
            facet_fields = facet_data.get('facet_fields', {})

            return {
                'total_documents': results.hits,
                'doc_types': dict(_pair_facet_counts(facet_fields.get('doc_type', []))),
                'status_counts': dict(_pair_facet_counts(facet_fields.get('status', [])))
            }

        except Exception as e:
            logger.error("Failed to get stats: %s", e)
            return {'error': str(e)}

    def optimize_index(self) -> bool:
        """Optimize the Solr index.

        Returns:
            True on success, False if the optimize request failed.
        """
        try:
            self.solr.optimize()
            logger.info("Index optimized")
            return True
        except Exception as e:
            logger.error("Failed to optimize index: %s", e)
            return False