"""Apache Solr client for search operations."""
import json
import logging
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import pysolr
logger = logging.getLogger(__name__)


class SolrClient:
    """Client for a single Apache Solr core.

    Wraps pysolr with helpers for indexing, edismax search (with
    highlighting and optional faceting), autocomplete suggestions,
    more-like-this lookups, deletion and index statistics.

    Error-handling convention: every public method except connect()
    catches all exceptions, logs them and returns a safe fallback value
    (False / 0 / empty collection / error dict) so callers never have to
    wrap calls in try/except. connect() re-raises so that startup
    failures are loud instead of deferred to first use.
    """

    def __init__(self, solr_url: str = "http://solr:8983/solr", core_name: str = "site11"):
        """Build the core URL and connect immediately.

        Args:
            solr_url: Base Solr URL, without the core segment.
            core_name: Core to target; appended to *solr_url*.

        Raises:
            Exception: propagated from connect() when Solr is unreachable.
        """
        self.solr_url = f"{solr_url}/{core_name}"
        self.core_name = core_name
        self.solr = None
        self.connect()

    def connect(self) -> None:
        """Create the pysolr client and verify connectivity with a ping.

        Raises:
            Exception: whatever pysolr raises when the core is down.
        """
        try:
            self.solr = pysolr.Solr(
                self.solr_url,
                always_commit=True,  # every add/delete is committed immediately
                timeout=10,
            )
            self.solr.ping()  # fail fast instead of on first query
            logger.info("Connected to Solr at %s", self.solr_url)
        except Exception as e:
            logger.error("Failed to connect to Solr: %s", e)
            raise

    def _prepare(self, document: Dict[str, Any], doc_type: Optional[str]) -> Dict[str, Any]:
        """Stamp doc_type, a stable id and the indexing timestamp in place.

        Fix: the generated id falls back to a "doc" prefix when doc_type
        is None, instead of producing ids like "None_42".
        """
        if doc_type:
            document["doc_type"] = doc_type
        if "id" not in document:
            document["id"] = f"{doc_type or 'doc'}_{document.get('_id', '')}"
        # Naive UTC ISO timestamp (no offset suffix) -- same wire format
        # as the deprecated datetime.utcnow() call this replaces.
        document["indexed_at"] = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        return document

    @staticmethod
    def _facet_pairs(values: List[Any]) -> List[tuple]:
        """Turn Solr's flat [value, count, value, count, ...] facet list into pairs."""
        return [(values[i], values[i + 1]) for i in range(0, len(values), 2)]

    def index_document(self, document: Dict[str, Any], doc_type: str = None) -> bool:
        """Index a single document after stamping metadata on it.

        Args:
            document: Document fields; mutated in place with id/doc_type/indexed_at.
            doc_type: Optional document type, also used as the id prefix.

        Returns:
            True on success, False on failure (error is logged).
        """
        try:
            self._prepare(document, doc_type)
            self.solr.add([document])
            logger.info("Indexed document: %s", document.get("id"))
            return True
        except Exception as e:
            logger.error("Failed to index document: %s", e)
            return False

    def bulk_index(self, documents: List[Dict[str, Any]], doc_type: str = None) -> int:
        """Index many documents in one Solr round trip.

        Args:
            documents: Documents to index; each is mutated in place.
            doc_type: Optional type applied to every document.

        Returns:
            Number of documents indexed, or 0 on failure.
        """
        try:
            for doc in documents:
                self._prepare(doc, doc_type)
            self.solr.add(documents)
            logger.info("Bulk indexed %d documents", len(documents))
            return len(documents)
        except Exception as e:
            logger.error("Failed to bulk index: %s", e)
            return 0

    def search(self, query: str, **kwargs) -> Dict[str, Any]:
        """Search documents with edismax, highlighting and optional facets.

        Args:
            query: Search query string.
            **kwargs: Additional search parameters:
                fq: Filter queries.
                fl: Fields to return (default '*,score').
                start: Starting offset (default 0).
                rows: Page size (default 10).
                sort: Sort order.
                facet: Truthy to enable faceting.
                facet_field: Fields to facet on (default doc_type/tags/status).

        Returns:
            Dict with 'total', 'documents', 'facets', 'highlighting';
            on failure {'total': 0, 'documents': [], 'error': str}.
        """
        try:
            params = {
                'q': query,
                'start': kwargs.get('start', 0),
                'rows': kwargs.get('rows', 10),
                'fl': kwargs.get('fl', '*,score'),
                'defType': 'edismax',
                'qf': 'title^3 content^2 tags description name',  # field boosts
                'mm': '2<-25%',  # minimum-should-match
                'hl': 'true',  # highlighting
                'hl.fl': 'title,content,description',
                'hl.simple.pre': '<mark>',
                'hl.simple.post': '</mark>',
            }
            if 'fq' in kwargs:
                params['fq'] = kwargs['fq']
            if 'sort' in kwargs:
                params['sort'] = kwargs['sort']
            if kwargs.get('facet'):
                params.update({
                    'facet': 'true',
                    'facet.field': kwargs.get('facet_field', ['doc_type', 'tags', 'status']),
                    'facet.mincount': 1,
                })

            results = self.solr.search(**params)

            response: Dict[str, Any] = {
                'total': results.hits,
                'documents': list(results.docs),
                'facets': {},
                'highlighting': {},
            }

            if hasattr(results, 'facets') and results.facets:
                for field, values in results.facets.get('facet_fields', {}).items():
                    response['facets'][field] = [
                        {'value': v, 'count': c} for v, c in self._facet_pairs(values)
                    ]

            if hasattr(results, 'highlighting'):
                response['highlighting'] = results.highlighting

            return response
        except Exception as e:
            logger.error("Search failed: %s", e)
            return {'total': 0, 'documents': [], 'error': str(e)}

    def suggest(self, prefix: str, field: str = "suggest", limit: int = 10) -> List[str]:
        """Return up to *limit* unique autocomplete values starting with *prefix*.

        NOTE(review): *prefix* is interpolated into the Solr query
        unescaped; Solr special characters in user input should be
        sanitized upstream -- confirm against callers.
        """
        try:
            results = self.solr.search(**{
                'q': f'{field}:{prefix}*',
                'fl': field,
                'rows': limit,
                'start': 0,
            })

            candidates: List[str] = []
            for doc in results.docs:
                if field in doc:
                    value = doc[field]
                    if isinstance(value, list):  # multi-valued field
                        candidates.extend(value)
                    else:
                        candidates.append(value)

            # Order-preserving de-duplication, capped at `limit`.
            seen = set()
            unique: List[str] = []
            for item in candidates:
                if item not in seen:
                    seen.add(item)
                    unique.append(item)
                    if len(unique) >= limit:
                        break
            return unique
        except Exception as e:
            logger.error("Suggest failed: %s", e)
            return []

    def more_like_this(self, doc_id: str, mlt_fields: List[str] = None, rows: int = 5) -> List[Dict]:
        """Find documents similar to *doc_id* via Solr's MoreLikeThis.

        Args:
            doc_id: id of the seed document.
            mlt_fields: Fields to compare (default title/content/tags/description).
            rows: Maximum number of similar documents.

        Returns:
            Similar documents, or [] when none are found or on error.
        """
        try:
            if not mlt_fields:
                mlt_fields = ['title', 'content', 'tags', 'description']

            params = {
                'q': f'id:{doc_id}',
                'mlt': 'true',
                'mlt.fl': ','.join(mlt_fields),
                'mlt.mindf': 1,  # allow terms that appear in only one doc
                'mlt.mintf': 1,
                'mlt.count': rows,
                'fl': '*,score',
            }

            results = self.solr.search(**params)

            # MLT hits live in a per-seed-id section of the response.
            if results.docs and hasattr(results, 'moreLikeThis'):
                return results.moreLikeThis.get(doc_id, {}).get('docs', [])
            return []
        except Exception as e:
            logger.error("More like this failed: %s", e)
            return []

    def delete_document(self, doc_id: str) -> bool:
        """Delete a single document by id; True on success, False on failure."""
        try:
            self.solr.delete(id=doc_id)
            logger.info("Deleted document: %s", doc_id)
            return True
        except Exception as e:
            logger.error("Failed to delete document: %s", e)
            return False

    def delete_by_query(self, query: str) -> bool:
        """Delete every document matching *query*; True on success."""
        try:
            self.solr.delete(q=query)
            logger.info("Deleted documents matching: %s", query)
            return True
        except Exception as e:
            logger.error("Failed to delete by query: %s", e)
            return False

    def clear_index(self) -> bool:
        """Delete ALL documents from the core; True on success."""
        try:
            self.solr.delete(q='*:*')
            logger.info("Cleared all documents from index")
            return True
        except Exception as e:
            logger.error("Failed to clear index: %s", e)
            return False

    def get_stats(self) -> Dict[str, Any]:
        """Return total document count plus per-doc_type and per-status counts.

        Returns:
            {'total_documents', 'doc_types', 'status_counts'} on success,
            {'error': str} on failure.
        """
        try:
            totals = self.solr.search(q='*:*', rows=0)

            facet_results = self.solr.search(
                q='*:*',
                rows=0,
                facet='true',
                # Dotted parameter name must go through a kwargs dict.
                **{'facet.field': ['doc_type', 'status']},
            )

            stats: Dict[str, Any] = {
                'total_documents': totals.hits,
                'doc_types': {},
                'status_counts': {},
            }

            if hasattr(facet_results, 'facets') and facet_results.facets:
                facet_fields = facet_results.facets.get('facet_fields', {})
                stats['doc_types'] = dict(self._facet_pairs(facet_fields.get('doc_type', [])))
                stats['status_counts'] = dict(self._facet_pairs(facet_fields.get('status', [])))

            return stats
        except Exception as e:
            logger.error("Failed to get stats: %s", e)
            return {'error': str(e)}

    def optimize_index(self) -> bool:
        """Run a Solr index optimize (segment merge); True on success."""
        try:
            self.solr.optimize()
            logger.info("Index optimized")
            return True
        except Exception as e:
            logger.error("Failed to optimize index: %s", e)
            return False