Initial commit - cleaned repository

This commit is contained in:
jungwoo choi
2025-09-28 20:41:57 +09:00
commit e3c28f796a
188 changed files with 28102 additions and 0 deletions

View File

@ -0,0 +1,303 @@
"""
Apache Solr client for search operations
"""
import json
import logging
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import pysolr
logger = logging.getLogger(__name__)
class SolrClient:
def __init__(self, solr_url: str = "http://solr:8983/solr", core_name: str = "site11"):
self.solr_url = f"{solr_url}/{core_name}"
self.core_name = core_name
self.solr = None
self.connect()
def connect(self):
"""Connect to Solr instance"""
try:
self.solr = pysolr.Solr(
self.solr_url,
always_commit=True,
timeout=10
)
# Test connection
self.solr.ping()
logger.info(f"Connected to Solr at {self.solr_url}")
except Exception as e:
logger.error(f"Failed to connect to Solr: {e}")
raise
def index_document(self, document: Dict[str, Any], doc_type: str = None) -> bool:
"""Index a single document"""
try:
# Add metadata
if doc_type:
document["doc_type"] = doc_type
if "id" not in document:
document["id"] = f"{doc_type}_{document.get('_id', '')}"
# Add indexing timestamp
document["indexed_at"] = datetime.utcnow().isoformat()
# Index the document
self.solr.add([document])
logger.info(f"Indexed document: {document.get('id')}")
return True
except Exception as e:
logger.error(f"Failed to index document: {e}")
return False
def bulk_index(self, documents: List[Dict[str, Any]], doc_type: str = None) -> int:
"""Bulk index multiple documents"""
try:
indexed = 0
for doc in documents:
if doc_type:
doc["doc_type"] = doc_type
if "id" not in doc:
doc["id"] = f"{doc_type}_{doc.get('_id', '')}"
doc["indexed_at"] = datetime.utcnow().isoformat()
self.solr.add(documents)
indexed = len(documents)
logger.info(f"Bulk indexed {indexed} documents")
return indexed
except Exception as e:
logger.error(f"Failed to bulk index: {e}")
return 0
def search(self, query: str, **kwargs) -> Dict[str, Any]:
"""
Search documents
Args:
query: Search query string
**kwargs: Additional search parameters
- fq: Filter queries
- fl: Fields to return
- start: Starting offset
- rows: Number of rows
- sort: Sort order
- facet: Enable faceting
- facet.field: Fields to facet on
"""
try:
# Default parameters
params = {
'q': query,
'start': kwargs.get('start', 0),
'rows': kwargs.get('rows', 10),
'fl': kwargs.get('fl', '*,score'),
'defType': 'edismax',
'qf': 'title^3 content^2 tags description name', # Boost fields
'mm': '2<-25%', # Minimum match
'hl': 'true', # Highlighting
'hl.fl': 'title,content,description',
'hl.simple.pre': '<mark>',
'hl.simple.post': '</mark>'
}
# Add filter queries
if 'fq' in kwargs:
params['fq'] = kwargs['fq']
# Add sorting
if 'sort' in kwargs:
params['sort'] = kwargs['sort']
# Add faceting
if kwargs.get('facet'):
params.update({
'facet': 'true',
'facet.field': kwargs.get('facet_field', ['doc_type', 'tags', 'status']),
'facet.mincount': 1
})
# Execute search
results = self.solr.search(**params)
# Format response
response = {
'total': results.hits,
'documents': [],
'facets': {},
'highlighting': {}
}
# Add documents
for doc in results.docs:
response['documents'].append(doc)
# Add facets if available
if hasattr(results, 'facets') and results.facets:
if 'facet_fields' in results.facets:
for field, values in results.facets['facet_fields'].items():
response['facets'][field] = [
{'value': values[i], 'count': values[i+1]}
for i in range(0, len(values), 2)
]
# Add highlighting if available
if hasattr(results, 'highlighting'):
response['highlighting'] = results.highlighting
return response
except Exception as e:
logger.error(f"Search failed: {e}")
return {'total': 0, 'documents': [], 'error': str(e)}
def suggest(self, prefix: str, field: str = "suggest", limit: int = 10) -> List[str]:
"""Get autocomplete suggestions"""
try:
params = {
'q': f'{field}:{prefix}*',
'fl': field,
'rows': limit,
'start': 0
}
results = self.solr.search(**params)
suggestions = []
for doc in results.docs:
if field in doc:
value = doc[field]
if isinstance(value, list):
suggestions.extend(value)
else:
suggestions.append(value)
# Remove duplicates and limit
seen = set()
unique_suggestions = []
for s in suggestions:
if s not in seen:
seen.add(s)
unique_suggestions.append(s)
if len(unique_suggestions) >= limit:
break
return unique_suggestions
except Exception as e:
logger.error(f"Suggest failed: {e}")
return []
def more_like_this(self, doc_id: str, mlt_fields: List[str] = None, rows: int = 5) -> List[Dict]:
"""Find similar documents"""
try:
if not mlt_fields:
mlt_fields = ['title', 'content', 'tags', 'description']
params = {
'q': f'id:{doc_id}',
'mlt': 'true',
'mlt.fl': ','.join(mlt_fields),
'mlt.mindf': 1,
'mlt.mintf': 1,
'mlt.count': rows,
'fl': '*,score'
}
results = self.solr.search(**params)
if results.docs:
# The MLT results are in the moreLikeThis section
if hasattr(results, 'moreLikeThis'):
mlt_results = results.moreLikeThis.get(doc_id, {})
if 'docs' in mlt_results:
return mlt_results['docs']
return []
except Exception as e:
logger.error(f"More like this failed: {e}")
return []
def delete_document(self, doc_id: str) -> bool:
"""Delete a document by ID"""
try:
self.solr.delete(id=doc_id)
logger.info(f"Deleted document: {doc_id}")
return True
except Exception as e:
logger.error(f"Failed to delete document: {e}")
return False
def delete_by_query(self, query: str) -> bool:
"""Delete documents matching a query"""
try:
self.solr.delete(q=query)
logger.info(f"Deleted documents matching: {query}")
return True
except Exception as e:
logger.error(f"Failed to delete by query: {e}")
return False
def clear_index(self) -> bool:
"""Clear all documents from index"""
try:
self.solr.delete(q='*:*')
logger.info("Cleared all documents from index")
return True
except Exception as e:
logger.error(f"Failed to clear index: {e}")
return False
def get_stats(self) -> Dict[str, Any]:
"""Get index statistics"""
try:
# Get document count
results = self.solr.search(q='*:*', rows=0)
# Get facet counts for doc_type
facet_results = self.solr.search(
q='*:*',
rows=0,
facet='true',
**{'facet.field': ['doc_type', 'status']}
)
stats = {
'total_documents': results.hits,
'doc_types': {},
'status_counts': {}
}
if hasattr(facet_results, 'facets') and facet_results.facets:
if 'facet_fields' in facet_results.facets:
# Parse doc_type facets
doc_type_facets = facet_results.facets['facet_fields'].get('doc_type', [])
for i in range(0, len(doc_type_facets), 2):
stats['doc_types'][doc_type_facets[i]] = doc_type_facets[i+1]
# Parse status facets
status_facets = facet_results.facets['facet_fields'].get('status', [])
for i in range(0, len(status_facets), 2):
stats['status_counts'][status_facets[i]] = status_facets[i+1]
return stats
except Exception as e:
logger.error(f"Failed to get stats: {e}")
return {'error': str(e)}
def optimize_index(self) -> bool:
"""Optimize the Solr index"""
try:
self.solr.optimize()
logger.info("Index optimized")
return True
except Exception as e:
logger.error(f"Failed to optimize index: {e}")
return False