Initial commit - cleaned repository

This commit is contained in:
jungwoo choi
2025-09-28 20:41:57 +09:00
commit e3c28f796a
188 changed files with 28102 additions and 0 deletions

View File

@ -0,0 +1,303 @@
"""
Apache Solr client for search operations
"""
import json
import logging
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import pysolr
logger = logging.getLogger(__name__)
class SolrClient:
def __init__(self, solr_url: str = "http://solr:8983/solr", core_name: str = "site11"):
self.solr_url = f"{solr_url}/{core_name}"
self.core_name = core_name
self.solr = None
self.connect()
def connect(self):
"""Connect to Solr instance"""
try:
self.solr = pysolr.Solr(
self.solr_url,
always_commit=True,
timeout=10
)
# Test connection
self.solr.ping()
logger.info(f"Connected to Solr at {self.solr_url}")
except Exception as e:
logger.error(f"Failed to connect to Solr: {e}")
raise
def index_document(self, document: Dict[str, Any], doc_type: str = None) -> bool:
"""Index a single document"""
try:
# Add metadata
if doc_type:
document["doc_type"] = doc_type
if "id" not in document:
document["id"] = f"{doc_type}_{document.get('_id', '')}"
# Add indexing timestamp
document["indexed_at"] = datetime.utcnow().isoformat()
# Index the document
self.solr.add([document])
logger.info(f"Indexed document: {document.get('id')}")
return True
except Exception as e:
logger.error(f"Failed to index document: {e}")
return False
def bulk_index(self, documents: List[Dict[str, Any]], doc_type: str = None) -> int:
"""Bulk index multiple documents"""
try:
indexed = 0
for doc in documents:
if doc_type:
doc["doc_type"] = doc_type
if "id" not in doc:
doc["id"] = f"{doc_type}_{doc.get('_id', '')}"
doc["indexed_at"] = datetime.utcnow().isoformat()
self.solr.add(documents)
indexed = len(documents)
logger.info(f"Bulk indexed {indexed} documents")
return indexed
except Exception as e:
logger.error(f"Failed to bulk index: {e}")
return 0
def search(self, query: str, **kwargs) -> Dict[str, Any]:
"""
Search documents
Args:
query: Search query string
**kwargs: Additional search parameters
- fq: Filter queries
- fl: Fields to return
- start: Starting offset
- rows: Number of rows
- sort: Sort order
- facet: Enable faceting
- facet.field: Fields to facet on
"""
try:
# Default parameters
params = {
'q': query,
'start': kwargs.get('start', 0),
'rows': kwargs.get('rows', 10),
'fl': kwargs.get('fl', '*,score'),
'defType': 'edismax',
'qf': 'title^3 content^2 tags description name', # Boost fields
'mm': '2<-25%', # Minimum match
'hl': 'true', # Highlighting
'hl.fl': 'title,content,description',
'hl.simple.pre': '<mark>',
'hl.simple.post': '</mark>'
}
# Add filter queries
if 'fq' in kwargs:
params['fq'] = kwargs['fq']
# Add sorting
if 'sort' in kwargs:
params['sort'] = kwargs['sort']
# Add faceting
if kwargs.get('facet'):
params.update({
'facet': 'true',
'facet.field': kwargs.get('facet_field', ['doc_type', 'tags', 'status']),
'facet.mincount': 1
})
# Execute search
results = self.solr.search(**params)
# Format response
response = {
'total': results.hits,
'documents': [],
'facets': {},
'highlighting': {}
}
# Add documents
for doc in results.docs:
response['documents'].append(doc)
# Add facets if available
if hasattr(results, 'facets') and results.facets:
if 'facet_fields' in results.facets:
for field, values in results.facets['facet_fields'].items():
response['facets'][field] = [
{'value': values[i], 'count': values[i+1]}
for i in range(0, len(values), 2)
]
# Add highlighting if available
if hasattr(results, 'highlighting'):
response['highlighting'] = results.highlighting
return response
except Exception as e:
logger.error(f"Search failed: {e}")
return {'total': 0, 'documents': [], 'error': str(e)}
def suggest(self, prefix: str, field: str = "suggest", limit: int = 10) -> List[str]:
"""Get autocomplete suggestions"""
try:
params = {
'q': f'{field}:{prefix}*',
'fl': field,
'rows': limit,
'start': 0
}
results = self.solr.search(**params)
suggestions = []
for doc in results.docs:
if field in doc:
value = doc[field]
if isinstance(value, list):
suggestions.extend(value)
else:
suggestions.append(value)
# Remove duplicates and limit
seen = set()
unique_suggestions = []
for s in suggestions:
if s not in seen:
seen.add(s)
unique_suggestions.append(s)
if len(unique_suggestions) >= limit:
break
return unique_suggestions
except Exception as e:
logger.error(f"Suggest failed: {e}")
return []
def more_like_this(self, doc_id: str, mlt_fields: List[str] = None, rows: int = 5) -> List[Dict]:
"""Find similar documents"""
try:
if not mlt_fields:
mlt_fields = ['title', 'content', 'tags', 'description']
params = {
'q': f'id:{doc_id}',
'mlt': 'true',
'mlt.fl': ','.join(mlt_fields),
'mlt.mindf': 1,
'mlt.mintf': 1,
'mlt.count': rows,
'fl': '*,score'
}
results = self.solr.search(**params)
if results.docs:
# The MLT results are in the moreLikeThis section
if hasattr(results, 'moreLikeThis'):
mlt_results = results.moreLikeThis.get(doc_id, {})
if 'docs' in mlt_results:
return mlt_results['docs']
return []
except Exception as e:
logger.error(f"More like this failed: {e}")
return []
def delete_document(self, doc_id: str) -> bool:
"""Delete a document by ID"""
try:
self.solr.delete(id=doc_id)
logger.info(f"Deleted document: {doc_id}")
return True
except Exception as e:
logger.error(f"Failed to delete document: {e}")
return False
def delete_by_query(self, query: str) -> bool:
"""Delete documents matching a query"""
try:
self.solr.delete(q=query)
logger.info(f"Deleted documents matching: {query}")
return True
except Exception as e:
logger.error(f"Failed to delete by query: {e}")
return False
def clear_index(self) -> bool:
"""Clear all documents from index"""
try:
self.solr.delete(q='*:*')
logger.info("Cleared all documents from index")
return True
except Exception as e:
logger.error(f"Failed to clear index: {e}")
return False
def get_stats(self) -> Dict[str, Any]:
"""Get index statistics"""
try:
# Get document count
results = self.solr.search(q='*:*', rows=0)
# Get facet counts for doc_type
facet_results = self.solr.search(
q='*:*',
rows=0,
facet='true',
**{'facet.field': ['doc_type', 'status']}
)
stats = {
'total_documents': results.hits,
'doc_types': {},
'status_counts': {}
}
if hasattr(facet_results, 'facets') and facet_results.facets:
if 'facet_fields' in facet_results.facets:
# Parse doc_type facets
doc_type_facets = facet_results.facets['facet_fields'].get('doc_type', [])
for i in range(0, len(doc_type_facets), 2):
stats['doc_types'][doc_type_facets[i]] = doc_type_facets[i+1]
# Parse status facets
status_facets = facet_results.facets['facet_fields'].get('status', [])
for i in range(0, len(status_facets), 2):
stats['status_counts'][status_facets[i]] = status_facets[i+1]
return stats
except Exception as e:
logger.error(f"Failed to get stats: {e}")
return {'error': str(e)}
def optimize_index(self) -> bool:
"""Optimize the Solr index"""
try:
self.solr.optimize()
logger.info("Index optimized")
return True
except Exception as e:
logger.error(f"Failed to optimize index: {e}")
return False