Initial commit - cleaned repository
This commit is contained in:
21
services/search/backend/Dockerfile
Normal file
21
services/search/backend/Dockerfile
Normal file
@ -0,0 +1,21 @@
|
||||
FROM python:3.11-slim

WORKDIR /app

# Install build tooling needed by some Python wheels (e.g. C extensions);
# skip recommended packages and clean the apt cache to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching: the dependency layer is only
# rebuilt when requirements.txt changes, not on every source edit.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create necessary directories
RUN mkdir -p /app/logs

# Run the application.
# Fix: dropped the development-only --reload flag — it spawns a filesystem
# watcher and restarts workers on file changes, which is wasted overhead
# (and a footgun) inside an immutable container image.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
286
services/search/backend/indexer.py
Normal file
286
services/search/backend/indexer.py
Normal file
@ -0,0 +1,286 @@
|
||||
"""
|
||||
Data indexer for synchronizing data from other services to Solr
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Dict, Any, List
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
from aiokafka import AIOKafkaConsumer
|
||||
import json
|
||||
from solr_client import SolrClient
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DataIndexer:
    """Synchronizes documents from MongoDB and Kafka event streams into Solr.

    Two ingestion paths run concurrently once :meth:`start` is called:

    * a Kafka consumer that applies incremental ``user_events``/``file_events``/
      ``content_events`` messages as they arrive, and
    * a periodic full sync (every 5 minutes) that re-reads MongoDB.
    """

    def __init__(self, solr_client: SolrClient, mongodb_url: str, kafka_servers: str):
        # Injected Solr wrapper used for all index writes/deletes.
        self.solr = solr_client
        self.mongodb_url = mongodb_url
        self.kafka_servers = kafka_servers
        # Lazily created in start() / _init_kafka_consumer(); None until then.
        self.mongo_client = None
        self.kafka_consumer = None
        # Cooperative shutdown flag checked by the background loops.
        self.running = False

    async def start(self) -> None:
        """Start the indexer"""
        try:
            # Connect to MongoDB
            self.mongo_client = AsyncIOMotorClient(self.mongodb_url)

            # Initialize Kafka consumer
            await self._init_kafka_consumer()

            # Start background tasks
            self.running = True
            # NOTE(review): the task handles are not retained, so these tasks
            # cannot be awaited/cancelled individually later; shutdown relies
            # on the `running` flag instead.
            asyncio.create_task(self._consume_kafka_events())
            asyncio.create_task(self._periodic_sync())

            logger.info("Data indexer started")

        except Exception as e:
            # NOTE(review): startup failures are logged and swallowed — the
            # caller cannot tell the indexer is dead. Confirm this best-effort
            # behavior is intended.
            logger.error(f"Failed to start indexer: {e}")

    async def stop(self) -> None:
        """Stop the indexer"""
        # Flip the flag first so the background loops exit on their next check.
        self.running = False

        if self.kafka_consumer:
            await self.kafka_consumer.stop()

        if self.mongo_client:
            self.mongo_client.close()

        logger.info("Data indexer stopped")

    async def _init_kafka_consumer(self) -> None:
        """Initialize Kafka consumer"""
        try:
            self.kafka_consumer = AIOKafkaConsumer(
                'user_events',
                'file_events',
                'content_events',
                bootstrap_servers=self.kafka_servers,
                # Messages are JSON-encoded UTF-8 payloads.
                value_deserializer=lambda m: json.loads(m.decode('utf-8')),
                group_id='search_indexer',
                # 'latest': events published while the service was down are
                # skipped; the periodic full sync is the catch-up mechanism.
                auto_offset_reset='latest'
            )
            await self.kafka_consumer.start()
            logger.info("Kafka consumer initialized")

        except Exception as e:
            # Degrade gracefully: without Kafka the service still works via
            # the periodic MongoDB sync.
            logger.warning(f"Kafka consumer initialization failed: {e}")
            self.kafka_consumer = None

    async def _consume_kafka_events(self) -> None:
        """Consume events from Kafka and index them"""
        if not self.kafka_consumer:
            return

        while self.running:
            try:
                async for msg in self.kafka_consumer:
                    await self._handle_kafka_event(msg.topic, msg.value)

            except Exception as e:
                # Back off briefly before re-entering the consume loop.
                logger.error(f"Kafka consumption error: {e}")
                await asyncio.sleep(5)

    async def _handle_kafka_event(self, topic: str, event: Dict[str, Any]) -> None:
        """Handle a Kafka event"""
        try:
            # Expected envelope: {"type": <event_type>, "data": {...}}
            event_type = event.get('type')
            data = event.get('data', {})

            # Dispatch by topic; unknown topics are silently ignored.
            if topic == 'user_events':
                await self._index_user_event(event_type, data)
            elif topic == 'file_events':
                await self._index_file_event(event_type, data)
            elif topic == 'content_events':
                await self._index_content_event(event_type, data)

        except Exception as e:
            # A bad event must not kill the consumer loop.
            logger.error(f"Failed to handle event: {e}")

    async def _index_user_event(self, event_type: str, data: Dict) -> None:
        """Index user-related events"""
        if event_type == 'user_created' or event_type == 'user_updated':
            # Solr ids are namespaced by doc type to avoid collisions across
            # entity kinds sharing one core.
            user_doc = {
                'id': f"user_{data.get('user_id')}",
                'doc_type': 'user',
                'user_id': data.get('user_id'),
                'username': data.get('username'),
                'email': data.get('email'),
                'name': data.get('name', ''),
                'bio': data.get('bio', ''),
                'tags': data.get('tags', []),
                'created_at': data.get('created_at'),
                'updated_at': datetime.utcnow().isoformat()
            }
            self.solr.index_document(user_doc)

        elif event_type == 'user_deleted':
            self.solr.delete_document(f"user_{data.get('user_id')}")

    async def _index_file_event(self, event_type: str, data: Dict) -> None:
        """Index file-related events"""
        if event_type == 'file_uploaded':
            file_doc = {
                'id': f"file_{data.get('file_id')}",
                'doc_type': 'file',
                'file_id': data.get('file_id'),
                'filename': data.get('filename'),
                'content_type': data.get('content_type'),
                'size': data.get('size'),
                'user_id': data.get('user_id'),
                'tags': data.get('tags', []),
                'description': data.get('description', ''),
                'created_at': data.get('created_at'),
                'updated_at': datetime.utcnow().isoformat()
            }
            self.solr.index_document(file_doc)

        elif event_type == 'file_deleted':
            self.solr.delete_document(f"file_{data.get('file_id')}")

    async def _index_content_event(self, event_type: str, data: Dict) -> None:
        """Index content-related events"""
        if event_type in ['content_created', 'content_updated']:
            content_doc = {
                'id': f"content_{data.get('content_id')}",
                'doc_type': 'content',
                'content_id': data.get('content_id'),
                'title': data.get('title'),
                'content': data.get('content', ''),
                'summary': data.get('summary', ''),
                'author_id': data.get('author_id'),
                'tags': data.get('tags', []),
                'category': data.get('category'),
                'status': data.get('status', 'draft'),
                'created_at': data.get('created_at'),
                'updated_at': datetime.utcnow().isoformat()
            }
            self.solr.index_document(content_doc)

        elif event_type == 'content_deleted':
            self.solr.delete_document(f"content_{data.get('content_id')}")

    async def _periodic_sync(self) -> None:
        """Periodically sync data from MongoDB"""
        while self.running:
            try:
                # Sync every 5 minutes (sleep first, so startup's explicit
                # sync_all_data call isn't immediately duplicated).
                await asyncio.sleep(300)
                await self.sync_all_data()

            except Exception as e:
                logger.error(f"Periodic sync error: {e}")

    async def sync_all_data(self) -> None:
        """Sync all data from MongoDB to Solr"""
        try:
            logger.info("Starting full data sync")

            # Sync users
            await self._sync_users()

            # Sync files
            await self._sync_files()

            # NOTE(review): content documents are consumed from Kafka but have
            # no MongoDB bulk-sync here — content missed while the service was
            # down is never recovered. Confirm whether that is intended.

            # Optimize index after bulk sync
            self.solr.optimize_index()

            logger.info("Full data sync completed")

        except Exception as e:
            logger.error(f"Full sync failed: {e}")

    async def _sync_users(self) -> None:
        """Sync users from MongoDB"""
        try:
            db = self.mongo_client['users_db']
            collection = db['users']

            users = []
            # Only live users: soft-deleted rows carry a non-null deleted_at.
            async for user in collection.find({'deleted_at': None}):
                user_doc = {
                    'id': f"user_{str(user['_id'])}",
                    'doc_type': 'user',
                    'user_id': str(user['_id']),
                    'username': user.get('username'),
                    'email': user.get('email'),
                    'name': user.get('name', ''),
                    'bio': user.get('bio', ''),
                    'tags': user.get('tags', []),
                    'created_at': user.get('created_at').isoformat() if user.get('created_at') else None,
                    'updated_at': datetime.utcnow().isoformat()
                }
                users.append(user_doc)

                # Bulk index every 100 documents
                if len(users) >= 100:
                    self.solr.bulk_index(users, 'user')
                    users = []

            # Index remaining users
            if users:
                self.solr.bulk_index(users, 'user')

            logger.info(f"Synced users to Solr")

        except Exception as e:
            logger.error(f"Failed to sync users: {e}")

    async def _sync_files(self) -> None:
        """Sync files from MongoDB"""
        try:
            db = self.mongo_client['files_db']
            collection = db['file_metadata']

            files = []
            async for file in collection.find({'deleted_at': None}):
                file_doc = {
                    'id': f"file_{str(file['_id'])}",
                    'doc_type': 'file',
                    'file_id': str(file['_id']),
                    'filename': file.get('filename'),
                    'original_name': file.get('original_name'),
                    'content_type': file.get('content_type'),
                    'size': file.get('size'),
                    'user_id': file.get('user_id'),
                    # assumes `tags` is stored as a mapping whose keys are the
                    # tag names — TODO confirm against the files service schema
                    'tags': list(file.get('tags', {}).keys()),
                    'description': file.get('metadata', {}).get('description', ''),
                    'created_at': file.get('created_at').isoformat() if file.get('created_at') else None,
                    'updated_at': datetime.utcnow().isoformat()
                }
                files.append(file_doc)

                # Bulk index every 100 documents
                if len(files) >= 100:
                    self.solr.bulk_index(files, 'file')
                    files = []

            # Index remaining files
            if files:
                self.solr.bulk_index(files, 'file')

            logger.info(f"Synced files to Solr")

        except Exception as e:
            logger.error(f"Failed to sync files: {e}")

    async def reindex_collection(self, collection_name: str, doc_type: str) -> None:
        """Reindex a specific collection"""
        try:
            # Delete existing documents of this type, then rebuild from MongoDB.
            self.solr.delete_by_query(f'doc_type:{doc_type}')

            # Sync the collection
            if collection_name == 'users':
                await self._sync_users()
            elif collection_name == 'files':
                await self._sync_files()
            # NOTE(review): any other collection name deletes the doc_type but
            # reindexes nothing — data loss for e.g. 'content'. Verify callers.

            logger.info(f"Reindexed {collection_name}")

        except Exception as e:
            logger.error(f"Failed to reindex {collection_name}: {e}")
|
||||
362
services/search/backend/main.py
Normal file
362
services/search/backend/main.py
Normal file
@ -0,0 +1,362 @@
|
||||
"""
|
||||
Search Service with Apache Solr
|
||||
"""
|
||||
from fastapi import FastAPI, Query, HTTPException
|
||||
from fastapi.responses import JSONResponse
|
||||
from contextlib import asynccontextmanager
|
||||
import logging
|
||||
import os
|
||||
from typing import Optional, List, Dict, Any
|
||||
from datetime import datetime
|
||||
from solr_client import SolrClient
|
||||
from indexer import DataIndexer
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Global instances
# Populated during startup by lifespan(); remain None until then, and stay
# None if Solr never becomes reachable (endpoints then return 503).
solr_client = None
data_indexer = None
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application lifecycle.

    Startup: wait for Solr (up to ~60s), then start the MongoDB/Kafka data
    indexer and kick off an initial full sync in the background.
    Shutdown: stop the indexer.
    """
    global solr_client, data_indexer

    # Startup
    logger.info("Starting Search Service...")

    # Wait for Solr to be ready
    solr_url = os.getenv("SOLR_URL", "http://solr:8983/solr")
    max_retries = 30

    # SolrClient.__init__ pings Solr and raises if unreachable, so each
    # failed attempt leaves solr_client as None; retry every 2 seconds.
    for i in range(max_retries):
        try:
            solr_client = SolrClient(solr_url=solr_url, core_name="site11")
            logger.info("Connected to Solr")
            break
        except Exception as e:
            logger.warning(f"Waiting for Solr... ({i+1}/{max_retries})")
            await asyncio.sleep(2)

    # If all retries failed the service still comes up, but in a degraded
    # state: every endpoint guards on solr_client and returns 503.
    if solr_client:
        # Initialize data indexer
        mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
        kafka_servers = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "kafka:9092")

        data_indexer = DataIndexer(solr_client, mongodb_url, kafka_servers)
        await data_indexer.start()

        # Initial data sync (runs in the background; does not block startup)
        asyncio.create_task(data_indexer.sync_all_data())

    yield

    # Shutdown
    if data_indexer:
        await data_indexer.stop()

    logger.info("Search Service stopped")
|
||||
|
||||
# Application instance; lifespan() handles Solr connection and indexer
# startup/shutdown around the serving period.
app = FastAPI(
    title="Search Service",
    description="Full-text search with Apache Solr",
    version="1.0.0",
    lifespan=lifespan
)
|
||||
|
||||
@app.get("/health")
async def health_check():
    """Liveness probe: reports service identity and whether the Solr client
    was successfully initialized at startup."""
    payload = {
        "status": "healthy",
        "service": "search",
        "timestamp": datetime.utcnow().isoformat(),
        "solr_connected": solr_client is not None,
    }
    return payload
|
||||
|
||||
@app.get("/api/search")
async def search(
    q: str = Query(..., description="Search query"),
    doc_type: Optional[str] = Query(None, description="Filter by document type"),
    start: int = Query(0, ge=0, description="Starting offset"),
    rows: int = Query(10, ge=1, le=100, description="Number of results"),
    sort: Optional[str] = Query(None, description="Sort order (e.g., 'created_at desc')"),
    facet: bool = Query(False, description="Enable faceting"),
    facet_field: Optional[List[str]] = Query(None, description="Fields to facet on")
):
    """
    Search documents across all indexed content.

    Translates the query parameters into SolrClient.search kwargs and returns
    documents plus optional facets/highlighting. Returns 503 while Solr is
    unavailable and 500 on search errors.
    """
    if not solr_client:
        raise HTTPException(status_code=503, detail="Search service unavailable")

    try:
        # Build filter query (fq restricts without affecting relevance scoring)
        fq = []
        if doc_type:
            fq.append(f"doc_type:{doc_type}")

        # Prepare search parameters
        search_params = {
            'start': start,
            'rows': rows,
            'facet': facet
        }

        if fq:
            search_params['fq'] = fq

        if sort:
            search_params['sort'] = sort

        if facet_field:
            search_params['facet_field'] = facet_field

        # Execute search
        results = solr_client.search(q, **search_params)

        # SolrClient.search always returns 'total' and 'documents'; facets and
        # highlighting are only present when enabled, hence .get defaults.
        return {
            "query": q,
            "total": results['total'],
            "start": start,
            "rows": rows,
            "documents": results['documents'],
            "facets": results.get('facets', {}),
            "highlighting": results.get('highlighting', {})
        }

    except Exception as e:
        logger.error(f"Search failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.get("/api/search/suggest")
async def suggest(
    q: str = Query(..., min_length=1, description="Query prefix"),
    field: str = Query("title", description="Field to search in"),
    limit: int = Query(10, ge=1, le=50, description="Maximum suggestions")
):
    """Return autocomplete suggestions for a query prefix.

    503 while Solr is unavailable, 500 on lookup failure.
    """
    if not solr_client:
        raise HTTPException(status_code=503, detail="Search service unavailable")

    try:
        matches = solr_client.suggest(q, field, limit)
    except Exception as e:
        logger.error(f"Suggest failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    return {
        "query": q,
        "suggestions": matches,
        "count": len(matches),
    }
|
||||
|
||||
@app.get("/api/search/similar/{doc_id}")
async def find_similar(
    doc_id: str,
    rows: int = Query(5, ge=1, le=20, description="Number of similar documents")
):
    """Find documents similar to the given document (Solr MoreLikeThis).

    503 while Solr is unavailable, 500 on lookup failure.
    """
    if not solr_client:
        raise HTTPException(status_code=503, detail="Search service unavailable")

    try:
        neighbors = solr_client.more_like_this(doc_id, rows=rows)
    except Exception as e:
        logger.error(f"Similar search failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    return {
        "source_document": doc_id,
        "similar_documents": neighbors,
        "count": len(neighbors),
    }
|
||||
|
||||
@app.post("/api/search/index")
async def index_document(document: Dict[str, Any]):
    """
    Index a single document.

    The document's 'doc_type' field (default 'general') selects the type
    namespace. Returns 503 while Solr is unavailable, 500 on failure.
    """
    if not solr_client:
        raise HTTPException(status_code=503, detail="Search service unavailable")

    try:
        doc_type = document.get('doc_type', 'general')
        success = solr_client.index_document(document, doc_type)

        if success:
            return {
                "status": "success",
                "message": "Document indexed",
                "document_id": document.get('id')
            }
        else:
            raise HTTPException(status_code=500, detail="Failed to index document")

    except HTTPException:
        # Bug fix: previously the generic handler below caught the
        # HTTPException raised just above and re-wrapped it, mangling its
        # detail message. Re-raise it untouched for FastAPI to handle.
        raise
    except Exception as e:
        logger.error(f"Indexing failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/api/search/bulk-index")
async def bulk_index(documents: List[Dict[str, Any]]):
    """Bulk index multiple documents in a single Solr add.

    503 while Solr is unavailable, 500 on failure.
    """
    if not solr_client:
        raise HTTPException(status_code=503, detail="Search service unavailable")

    try:
        count = solr_client.bulk_index(documents)
    except Exception as e:
        logger.error(f"Bulk indexing failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    return {
        "status": "success",
        "message": f"Indexed {count} documents",
        "count": count,
    }
|
||||
|
||||
@app.delete("/api/search/document/{doc_id}")
async def delete_document(doc_id: str):
    """
    Delete a document from the index by its Solr id.

    Returns 503 while Solr is unavailable, 500 on failure.
    """
    if not solr_client:
        raise HTTPException(status_code=503, detail="Search service unavailable")

    try:
        success = solr_client.delete_document(doc_id)

        if success:
            return {
                "status": "success",
                "message": "Document deleted",
                "document_id": doc_id
            }
        else:
            raise HTTPException(status_code=500, detail="Failed to delete document")

    except HTTPException:
        # Bug fix: the generic handler below used to catch the HTTPException
        # raised just above and re-wrap it, mangling its detail message.
        raise
    except Exception as e:
        logger.error(f"Deletion failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.get("/api/search/stats")
async def get_stats():
    """Return search index statistics (document counts by type/status).

    503 while Solr is unavailable, 500 on failure.
    """
    if not solr_client:
        raise HTTPException(status_code=503, detail="Search service unavailable")

    try:
        index_stats = solr_client.get_stats()
    except Exception as e:
        logger.error(f"Failed to get stats: {e}")
        raise HTTPException(status_code=500, detail=str(e))

    return {
        "status": "success",
        "statistics": index_stats,
        "timestamp": datetime.utcnow().isoformat(),
    }
|
||||
|
||||
@app.post("/api/search/reindex/{collection}")
async def reindex_collection(
    collection: str,
    doc_type: Optional[str] = Query(None, description="Document type for the collection")
):
    """
    Reindex a specific collection.

    Fire-and-forget: schedules DataIndexer.reindex_collection as a background
    task and returns immediately. If doc_type is not supplied it is derived
    from the collection name. Returns 503 while the indexer is unavailable.
    """
    if not data_indexer:
        raise HTTPException(status_code=503, detail="Indexer service unavailable")

    try:
        if not doc_type:
            # Map collection to doc_type; unknown collections fall back to
            # using the collection name itself as the doc_type.
            doc_type_map = {
                'users': 'user',
                'files': 'file',
                'content': 'content'
            }
            doc_type = doc_type_map.get(collection, collection)

        # Runs in the background; errors are logged inside the indexer and
        # are not reflected in this response.
        asyncio.create_task(data_indexer.reindex_collection(collection, doc_type))

        return {
            "status": "success",
            "message": f"Reindexing {collection} started",
            "collection": collection,
            "doc_type": doc_type
        }

    except Exception as e:
        logger.error(f"Reindex failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/api/search/optimize")
async def optimize_index():
    """
    Optimize the search index (merges Solr segments).

    Returns 503 while Solr is unavailable, 500 on failure.
    """
    if not solr_client:
        raise HTTPException(status_code=503, detail="Search service unavailable")

    try:
        success = solr_client.optimize_index()

        if success:
            return {
                "status": "success",
                "message": "Index optimization started"
            }
        else:
            raise HTTPException(status_code=500, detail="Failed to optimize index")

    except HTTPException:
        # Bug fix: the generic handler below used to catch the HTTPException
        # raised just above and re-wrap it, mangling its detail message.
        raise
    except Exception as e:
        logger.error(f"Optimization failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/api/search/clear")
async def clear_index():
    """
    Clear all documents from the index (DANGER!).

    NOTE(review): this destructive endpoint has no authentication or
    confirmation guard visible here — verify it is protected upstream.
    Returns 503 while Solr is unavailable, 500 on failure.
    """
    if not solr_client:
        raise HTTPException(status_code=503, detail="Search service unavailable")

    try:
        success = solr_client.clear_index()

        if success:
            return {
                "status": "success",
                "message": "Index cleared",
                "warning": "All documents have been deleted!"
            }
        else:
            raise HTTPException(status_code=500, detail="Failed to clear index")

    except HTTPException:
        # Bug fix: the generic handler below used to catch the HTTPException
        # raised just above and re-wrap it, mangling its detail message.
        raise
    except Exception as e:
        logger.error(f"Clear index failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
if __name__ == "__main__":
    # Local development entry point; in the container image uvicorn is
    # launched via the Dockerfile CMD instead.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
10
services/search/backend/requirements.txt
Normal file
10
services/search/backend/requirements.txt
Normal file
@ -0,0 +1,10 @@
|
||||
fastapi==0.109.0
|
||||
uvicorn[standard]==0.27.0
|
||||
pydantic==2.5.3
|
||||
python-dotenv==1.0.0
|
||||
pysolr==3.9.0
|
||||
httpx==0.25.2
|
||||
motor==3.5.1
|
||||
pymongo==4.6.1
|
||||
aiokafka==0.10.0
|
||||
redis==5.0.1
|
||||
303
services/search/backend/solr_client.py
Normal file
303
services/search/backend/solr_client.py
Normal file
@ -0,0 +1,303 @@
|
||||
"""
|
||||
Apache Solr client for search operations
|
||||
"""
|
||||
import pysolr
|
||||
import logging
|
||||
from typing import Dict, List, Any, Optional
|
||||
from datetime import datetime
|
||||
import json
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SolrClient:
    """Thin synchronous wrapper around a single pysolr core.

    All write operations commit immediately (always_commit=True) and all
    methods swallow errors into a logged failure value (False / 0 / [] /
    error dict) rather than raising — except connect(), which re-raises so
    construction fails fast.
    """

    def __init__(self, solr_url: str = "http://solr:8983/solr", core_name: str = "site11"):
        # Full core URL, e.g. http://solr:8983/solr/site11
        self.solr_url = f"{solr_url}/{core_name}"
        self.core_name = core_name
        self.solr = None
        # Connects (and pings) eagerly; raises if Solr is unreachable.
        self.connect()

    def connect(self):
        """Connect to Solr instance"""
        try:
            self.solr = pysolr.Solr(
                self.solr_url,
                # Commit on every add/delete — simple but slow for bulk loads.
                always_commit=True,
                timeout=10
            )
            # Test connection
            self.solr.ping()
            logger.info(f"Connected to Solr at {self.solr_url}")
        except Exception as e:
            # Re-raise so callers (e.g. the startup retry loop) can react.
            logger.error(f"Failed to connect to Solr: {e}")
            raise

    def index_document(self, document: Dict[str, Any], doc_type: str = None) -> bool:
        """Index a single document.

        Mutates `document` in place (adds doc_type, id, indexed_at).
        Returns True on success, False on failure.
        """
        try:
            # Add metadata
            if doc_type:
                document["doc_type"] = doc_type

            if "id" not in document:
                # Fallback id namespaced by doc_type; relies on a Mongo-style
                # '_id' being present in the document.
                document["id"] = f"{doc_type}_{document.get('_id', '')}"

            # Add indexing timestamp
            document["indexed_at"] = datetime.utcnow().isoformat()

            # Index the document
            self.solr.add([document])
            logger.info(f"Indexed document: {document.get('id')}")
            return True

        except Exception as e:
            logger.error(f"Failed to index document: {e}")
            return False

    def bulk_index(self, documents: List[Dict[str, Any]], doc_type: str = None) -> int:
        """Bulk index multiple documents.

        Mutates each document in place, then performs a single add call.
        Returns the number of documents sent, or 0 on failure.
        """
        try:
            indexed = 0
            # Stamp metadata on every document before the single add() below.
            for doc in documents:
                if doc_type:
                    doc["doc_type"] = doc_type

                if "id" not in doc:
                    doc["id"] = f"{doc_type}_{doc.get('_id', '')}"

                doc["indexed_at"] = datetime.utcnow().isoformat()

            self.solr.add(documents)
            indexed = len(documents)
            logger.info(f"Bulk indexed {indexed} documents")
            return indexed

        except Exception as e:
            logger.error(f"Failed to bulk index: {e}")
            return 0

    def search(self, query: str, **kwargs) -> Dict[str, Any]:
        """
        Search documents

        Args:
            query: Search query string
            **kwargs: Additional search parameters
                - fq: Filter queries
                - fl: Fields to return
                - start: Starting offset
                - rows: Number of rows
                - sort: Sort order
                - facet: Enable faceting
                - facet_field: Fields to facet on

        Returns:
            Dict with 'total', 'documents', and (when available) 'facets' and
            'highlighting'; on failure, {'total': 0, 'documents': [], 'error': ...}.
        """
        try:
            # Default parameters
            params = {
                'q': query,
                'start': kwargs.get('start', 0),
                'rows': kwargs.get('rows', 10),
                'fl': kwargs.get('fl', '*,score'),
                'defType': 'edismax',
                'qf': 'title^3 content^2 tags description name',  # Boost fields
                'mm': '2<-25%',  # Minimum match
                'hl': 'true',  # Highlighting
                'hl.fl': 'title,content,description',
                'hl.simple.pre': '<mark>',
                'hl.simple.post': '</mark>'
            }

            # Add filter queries
            if 'fq' in kwargs:
                params['fq'] = kwargs['fq']

            # Add sorting
            if 'sort' in kwargs:
                params['sort'] = kwargs['sort']

            # Add faceting
            if kwargs.get('facet'):
                params.update({
                    'facet': 'true',
                    'facet.field': kwargs.get('facet_field', ['doc_type', 'tags', 'status']),
                    'facet.mincount': 1
                })

            # Execute search. NOTE: dotted keys ('hl.fl', 'facet.field', ...)
            # are passed via ** into pysolr's **kwargs, which CPython permits
            # for non-identifier keys.
            results = self.solr.search(**params)

            # Format response
            response = {
                'total': results.hits,
                'documents': [],
                'facets': {},
                'highlighting': {}
            }

            # Add documents
            for doc in results.docs:
                response['documents'].append(doc)

            # Add facets if available; Solr returns facet_fields as a flat
            # [value, count, value, count, ...] list, unflattened here.
            if hasattr(results, 'facets') and results.facets:
                if 'facet_fields' in results.facets:
                    for field, values in results.facets['facet_fields'].items():
                        response['facets'][field] = [
                            {'value': values[i], 'count': values[i+1]}
                            for i in range(0, len(values), 2)
                        ]

            # Add highlighting if available
            if hasattr(results, 'highlighting'):
                response['highlighting'] = results.highlighting

            return response

        except Exception as e:
            logger.error(f"Search failed: {e}")
            return {'total': 0, 'documents': [], 'error': str(e)}

    def suggest(self, prefix: str, field: str = "suggest", limit: int = 10) -> List[str]:
        """Get autocomplete suggestions.

        Implemented as a prefix wildcard query on `field`, not a Solr
        suggester component. Returns at most `limit` unique field values.
        """
        try:
            params = {
                'q': f'{field}:{prefix}*',
                'fl': field,
                'rows': limit,
                'start': 0
            }

            results = self.solr.search(**params)
            suggestions = []

            for doc in results.docs:
                if field in doc:
                    value = doc[field]
                    # Multi-valued fields come back as lists.
                    if isinstance(value, list):
                        suggestions.extend(value)
                    else:
                        suggestions.append(value)

            # Remove duplicates and limit (order-preserving de-dup)
            seen = set()
            unique_suggestions = []
            for s in suggestions:
                if s not in seen:
                    seen.add(s)
                    unique_suggestions.append(s)
                    if len(unique_suggestions) >= limit:
                        break

            return unique_suggestions

        except Exception as e:
            logger.error(f"Suggest failed: {e}")
            return []

    def more_like_this(self, doc_id: str, mlt_fields: List[str] = None, rows: int = 5) -> List[Dict]:
        """Find similar documents via Solr's MoreLikeThis component.

        Returns a list of similar documents, or [] when the source document
        is missing, MLT results are absent, or the query fails.
        """
        try:
            if not mlt_fields:
                mlt_fields = ['title', 'content', 'tags', 'description']

            params = {
                'q': f'id:{doc_id}',
                'mlt': 'true',
                'mlt.fl': ','.join(mlt_fields),
                # Low thresholds so small indexes still produce matches.
                'mlt.mindf': 1,
                'mlt.mintf': 1,
                'mlt.count': rows,
                'fl': '*,score'
            }

            results = self.solr.search(**params)

            if results.docs:
                # The MLT results are in the moreLikeThis section, keyed by
                # the source document id.
                if hasattr(results, 'moreLikeThis'):
                    mlt_results = results.moreLikeThis.get(doc_id, {})
                    if 'docs' in mlt_results:
                        return mlt_results['docs']

            return []

        except Exception as e:
            logger.error(f"More like this failed: {e}")
            return []

    def delete_document(self, doc_id: str) -> bool:
        """Delete a document by ID. Returns True on success."""
        try:
            self.solr.delete(id=doc_id)
            logger.info(f"Deleted document: {doc_id}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete document: {e}")
            return False

    def delete_by_query(self, query: str) -> bool:
        """Delete documents matching a query. Returns True on success."""
        try:
            self.solr.delete(q=query)
            logger.info(f"Deleted documents matching: {query}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete by query: {e}")
            return False

    def clear_index(self) -> bool:
        """Clear all documents from index (match-all delete). True on success."""
        try:
            self.solr.delete(q='*:*')
            logger.info("Cleared all documents from index")
            return True
        except Exception as e:
            logger.error(f"Failed to clear index: {e}")
            return False

    def get_stats(self) -> Dict[str, Any]:
        """Get index statistics: total docs plus counts by doc_type and status.

        On failure returns {'error': <message>}.
        """
        try:
            # Get document count (rows=0: only the hit count is needed)
            results = self.solr.search(q='*:*', rows=0)

            # Get facet counts for doc_type
            facet_results = self.solr.search(
                q='*:*',
                rows=0,
                facet='true',
                # Dotted key must go through a dict, not a keyword argument.
                **{'facet.field': ['doc_type', 'status']}
            )

            stats = {
                'total_documents': results.hits,
                'doc_types': {},
                'status_counts': {}
            }

            if hasattr(facet_results, 'facets') and facet_results.facets:
                if 'facet_fields' in facet_results.facets:
                    # Parse doc_type facets (flat [value, count, ...] list)
                    doc_type_facets = facet_results.facets['facet_fields'].get('doc_type', [])
                    for i in range(0, len(doc_type_facets), 2):
                        stats['doc_types'][doc_type_facets[i]] = doc_type_facets[i+1]

                    # Parse status facets
                    status_facets = facet_results.facets['facet_fields'].get('status', [])
                    for i in range(0, len(status_facets), 2):
                        stats['status_counts'][status_facets[i]] = status_facets[i+1]

            return stats

        except Exception as e:
            logger.error(f"Failed to get stats: {e}")
            return {'error': str(e)}

    def optimize_index(self) -> bool:
        """Optimize the Solr index (segment merge). True on success."""
        try:
            self.solr.optimize()
            logger.info("Index optimized")
            return True
        except Exception as e:
            logger.error(f"Failed to optimize index: {e}")
            return False
|
||||
292
services/search/backend/test_search.py
Normal file
292
services/search/backend/test_search.py
Normal file
@ -0,0 +1,292 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for Search Service with Apache Solr
|
||||
"""
|
||||
import asyncio
|
||||
import httpx
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
BASE_URL = "http://localhost:8015"
|
||||
|
||||
async def test_search_api():
    """Exercise every search API endpoint end to end.

    Steps: health check, single + bulk indexing, basic/filtered/faceted
    search, autocomplete, more-like-this, highlighting, stats, a complex
    query, and finally document deletion with verification.
    """
    async with httpx.AsyncClient() as client:
        print("\n🔍 Testing Search Service API...")

        # Test health check
        print("\n1. Testing health check...")
        response = await client.get(f"{BASE_URL}/health")
        print(f"Health check: {response.json()}")

        # Test index sample documents
        print("\n2. Indexing sample documents...")

        # One shared, timezone-aware timestamp for all sample docs.
        # datetime.utcnow() is deprecated (and naive) as of Python 3.12.
        now = datetime.now(timezone.utc).isoformat()

        # Index user document
        user_doc = {
            "id": "user_test_001",
            "doc_type": "user",
            "user_id": "test_001",
            "username": "john_doe",
            "email": "john@example.com",
            "name": "John Doe",
            "bio": "Software developer passionate about Python and microservices",
            "tags": ["python", "developer", "backend"],
            "created_at": now
        }

        response = await client.post(f"{BASE_URL}/api/search/index", json=user_doc)
        print(f"Indexed user: {response.json()}")

        # Index file documents
        file_docs = [
            {
                "id": "file_test_001",
                "doc_type": "file",
                "file_id": "test_file_001",
                "filename": "architecture_diagram.png",
                "content_type": "image/png",
                "size": 1024000,
                "user_id": "test_001",
                "tags": ["architecture", "design", "documentation"],
                "description": "System architecture diagram showing microservices",
                "created_at": now
            },
            {
                "id": "file_test_002",
                "doc_type": "file",
                "file_id": "test_file_002",
                "filename": "user_manual.pdf",
                "content_type": "application/pdf",
                "size": 2048000,
                "user_id": "test_001",
                "tags": ["documentation", "manual", "guide"],
                "description": "Complete user manual for the application",
                "created_at": now
            }
        ]

        response = await client.post(f"{BASE_URL}/api/search/bulk-index", json=file_docs)
        print(f"Bulk indexed files: {response.json()}")

        # Index content documents
        content_docs = [
            {
                "id": "content_test_001",
                "doc_type": "content",
                "content_id": "test_content_001",
                "title": "Getting Started with Microservices",
                "content": "Microservices architecture is a method of developing software applications as a suite of independently deployable services.",
                "summary": "Introduction to microservices architecture patterns",
                "author_id": "test_001",
                "tags": ["microservices", "architecture", "tutorial"],
                "category": "technology",
                "status": "published",
                "created_at": now
            },
            {
                "id": "content_test_002",
                "doc_type": "content",
                "content_id": "test_content_002",
                "title": "Python Best Practices",
                "content": "Learn the best practices for writing clean, maintainable Python code including PEP 8 style guide.",
                "summary": "Essential Python coding standards and practices",
                "author_id": "test_001",
                "tags": ["python", "programming", "best-practices"],
                "category": "programming",
                "status": "published",
                "created_at": now
            }
        ]

        response = await client.post(f"{BASE_URL}/api/search/bulk-index", json=content_docs)
        print(f"Bulk indexed content: {response.json()}")

        # Give Solr a moment to commit the new documents
        await asyncio.sleep(2)

        # Test basic search
        print("\n3. Testing basic search...")
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={"q": "microservices"}
        )
        results = response.json()
        print(f"Search for 'microservices': Found {results['total']} results")
        if results['documents']:
            print(f"First result: {results['documents'][0].get('title', results['documents'][0].get('filename', 'N/A'))}")

        # Test search with filters
        print("\n4. Testing filtered search...")
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={
                "q": "*:*",
                "doc_type": "file",
                "rows": 5
            }
        )
        results = response.json()
        print(f"Files search: Found {results['total']} files")

        # Test faceted search
        print("\n5. Testing faceted search...")
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={
                "q": "*:*",
                "facet": "true",
                "facet_field": ["doc_type", "tags", "category", "status"]
            }
        )
        results = response.json()
        print(f"Facets: {json.dumps(results['facets'], indent=2)}")

        # Test autocomplete/suggest
        print("\n6. Testing autocomplete...")
        response = await client.get(
            f"{BASE_URL}/api/search/suggest",
            params={
                "q": "micro",
                "field": "title",
                "limit": 5
            }
        )
        suggestions = response.json()
        print(f"Suggestions for 'micro': {suggestions['suggestions']}")

        # Test similar documents
        print("\n7. Testing similar documents...")
        response = await client.get(f"{BASE_URL}/api/search/similar/content_test_001")
        if response.status_code == 200:
            similar = response.json()
            print(f"Found {similar['count']} similar documents")
        else:
            print(f"Similar search: {response.status_code}")

        # Test search with highlighting
        print("\n8. Testing search with highlighting...")
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={"q": "Python"}
        )
        results = response.json()
        # .get(): don't crash when the response omits the highlighting key
        if results.get('highlighting'):
            print(f"Highlighting results: {len(results['highlighting'])} documents highlighted")

        # Test search statistics
        print("\n9. Testing search statistics...")
        response = await client.get(f"{BASE_URL}/api/search/stats")
        if response.status_code == 200:
            stats = response.json()
            print(f"Index stats: {stats['statistics']}")

        # Test complex query
        print("\n10. Testing complex query...")
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={
                "q": "architecture OR python",
                "doc_type": "content",
                "sort": "created_at desc",
                "rows": 10
            }
        )
        results = response.json()
        print(f"Complex query: Found {results['total']} results")

        # Test delete document
        print("\n11. Testing document deletion...")
        response = await client.delete(f"{BASE_URL}/api/search/document/content_test_002")
        if response.status_code == 200:
            print(f"Deleted document: {response.json()}")

        # Verify deletion
        await asyncio.sleep(1)
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={"q": "id:content_test_002"}
        )
        results = response.json()
        print(f"Verify deletion: Found {results['total']} results (should be 0)")
async def test_performance():
    """Bulk-index 100 synthetic documents, then time a handful of
    representative queries against the search endpoint."""
    import time  # local to keep this benchmark helper self-contained

    print("\n\n⚡ Testing Search Performance...")

    async with httpx.AsyncClient(timeout=30.0) as client:
        # Index many documents
        print("Indexing 100 test documents...")
        # Timezone-aware timestamp; datetime.utcnow() is deprecated in 3.12.
        now = datetime.now(timezone.utc).isoformat()
        docs = [
            {
                "id": f"perf_test_{i}",
                "doc_type": "content",
                "title": f"Test Document {i}",
                "content": f"This is test content for document {i} with various keywords like search, Solr, Python, microservices",
                "tags": [f"tag{i%10}", f"category{i%5}"],
                "created_at": now
            }
            for i in range(100)
        ]

        response = await client.post(f"{BASE_URL}/api/search/bulk-index", json=docs)
        print(f"Indexed {response.json().get('count', 0)} documents")

        # Wait for indexing
        await asyncio.sleep(2)

        # Test search speed
        print("\nTesting search response times...")
        queries = ["search", "Python", "document", "test", "microservices"]
        for query in queries:
            start = time.time()
            response = await client.get(
                f"{BASE_URL}/api/search",
                params={"q": query, "rows": 20}
            )
            elapsed = time.time() - start
            results = response.json()
            print(f"Query '{query}': {results['total']} results in {elapsed:.3f}s")
async def test_reindex():
    """Exercise the reindex-from-MongoDB and index-optimize endpoints."""
    print("\n\n🔄 Testing Reindex Functionality...")

    async with httpx.AsyncClient() as client:
        # Kick off a background reindex of the users collection.
        print("Triggering reindex for users collection...")
        reindex_resp = await client.post(
            f"{BASE_URL}/api/search/reindex/users",
            params={"doc_type": "user"}
        )
        if reindex_resp.status_code == 200:
            print(f"Reindex started: {reindex_resp.json()}")
        else:
            print(f"Reindex failed: {reindex_resp.status_code}")

        # Ask the service to optimize the Solr index.
        print("\nTesting index optimization...")
        optimize_resp = await client.post(f"{BASE_URL}/api/search/optimize")
        if optimize_resp.status_code == 200:
            print(f"Optimization: {optimize_resp.json()}")
async def main():
    """Run the full search-service test suite and report start/end times."""
    banner = "=" * 60
    print(banner)
    print("SEARCH SERVICE TEST SUITE (Apache Solr)")
    print(banner)
    print(f"Started at: {datetime.now().isoformat()}")

    # Run the three test groups in sequence.
    await test_search_api()
    await test_performance()
    await test_reindex()

    print("\n" + banner)
    print("✅ All search tests completed!")
    print(f"Finished at: {datetime.now().isoformat()}")
    print(banner)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user