feat: Add Step 13 - Search System with Apache Solr and Data Persistence
- Implemented search service with Apache Solr instead of Elasticsearch - Added full-text search, faceted search, and autocomplete capabilities - Created data indexer for synchronizing data from MongoDB/Kafka to Solr - Configured external volume mounts for all data services: - MongoDB, Redis, Kafka, Zookeeper, MinIO, Solr - All data now persists in ./data/ directory - Added comprehensive search API endpoints - Created documentation for data persistence and backup strategies 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@ -64,4 +64,4 @@ temp/
|
|||||||
*.pem
|
*.pem
|
||||||
*.key
|
*.key
|
||||||
*.crt
|
*.crt
|
||||||
secrets/
|
secrets/data/
|
||||||
|
|||||||
@ -76,7 +76,7 @@ services:
|
|||||||
- CONVERT_TO_WEBP=true
|
- CONVERT_TO_WEBP=true
|
||||||
volumes:
|
volumes:
|
||||||
- ./services/images/backend:/app
|
- ./services/images/backend:/app
|
||||||
- images_cache:/app/cache
|
- ./data/images-cache:/app/cache
|
||||||
networks:
|
networks:
|
||||||
- site11_network
|
- site11_network
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
@ -118,8 +118,8 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "${MONGODB_PORT}:27017"
|
- "${MONGODB_PORT}:27017"
|
||||||
volumes:
|
volumes:
|
||||||
- mongodb_data:/data/db
|
- ./data/mongodb:/data/db
|
||||||
- mongodb_config:/data/configdb
|
- ./data/mongodb/configdb:/data/configdb
|
||||||
networks:
|
networks:
|
||||||
- site11_network
|
- site11_network
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
@ -135,7 +135,7 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "${REDIS_PORT}:6379"
|
- "${REDIS_PORT}:6379"
|
||||||
volumes:
|
volumes:
|
||||||
- redis_data:/data
|
- ./data/redis:/data
|
||||||
networks:
|
networks:
|
||||||
- site11_network
|
- site11_network
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
@ -154,8 +154,8 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "${KAFKA_ZOOKEEPER_PORT}:2181"
|
- "${KAFKA_ZOOKEEPER_PORT}:2181"
|
||||||
volumes:
|
volumes:
|
||||||
- zookeeper_data:/var/lib/zookeeper/data
|
- ./data/zookeeper/data:/var/lib/zookeeper/data
|
||||||
- zookeeper_logs:/var/lib/zookeeper/log
|
- ./data/zookeeper/logs:/var/lib/zookeeper/log
|
||||||
networks:
|
networks:
|
||||||
- site11_network
|
- site11_network
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
@ -181,7 +181,7 @@ services:
|
|||||||
KAFKA_JMX_HOSTNAME: localhost
|
KAFKA_JMX_HOSTNAME: localhost
|
||||||
KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'true'
|
KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'true'
|
||||||
volumes:
|
volumes:
|
||||||
- kafka_data:/var/lib/kafka/data
|
- ./data/kafka:/var/lib/kafka/data
|
||||||
networks:
|
networks:
|
||||||
- site11_network
|
- site11_network
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
@ -234,7 +234,7 @@ services:
|
|||||||
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
- MINIO_ROOT_USER=${MINIO_ROOT_USER:-minioadmin}
|
||||||
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
|
- MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-minioadmin}
|
||||||
volumes:
|
volumes:
|
||||||
- minio_data:/data
|
- ./data/minio:/data
|
||||||
command: server /data --console-address ":9001"
|
command: server /data --console-address ":9001"
|
||||||
networks:
|
networks:
|
||||||
- site11_network
|
- site11_network
|
||||||
@ -264,7 +264,7 @@ services:
|
|||||||
- MINIO_SECURE=false
|
- MINIO_SECURE=false
|
||||||
volumes:
|
volumes:
|
||||||
- ./services/files/backend:/app
|
- ./services/files/backend:/app
|
||||||
- files_temp:/tmp
|
- ./data/files-temp:/tmp
|
||||||
networks:
|
networks:
|
||||||
- site11_network
|
- site11_network
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
@ -277,6 +277,57 @@ services:
|
|||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 3
|
||||||
|
|
||||||
|
# Apache Solr Search Engine
|
||||||
|
solr:
|
||||||
|
image: solr:9.4
|
||||||
|
container_name: ${COMPOSE_PROJECT_NAME}_solr
|
||||||
|
ports:
|
||||||
|
- "8983:8983"
|
||||||
|
volumes:
|
||||||
|
- ./data/solr:/var/solr
|
||||||
|
- ./services/search/solr-config:/opt/solr/server/solr/configsets/site11_config
|
||||||
|
command:
|
||||||
|
- solr-precreate
|
||||||
|
- site11
|
||||||
|
- /opt/solr/server/solr/configsets/site11_config
|
||||||
|
networks:
|
||||||
|
- site11_network
|
||||||
|
restart: unless-stopped
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8983/solr/site11/admin/ping"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
# Search Service
|
||||||
|
search-backend:
|
||||||
|
build:
|
||||||
|
context: ./services/search/backend
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
container_name: ${COMPOSE_PROJECT_NAME}_search_backend
|
||||||
|
ports:
|
||||||
|
- "8015:8000"
|
||||||
|
environment:
|
||||||
|
- ENV=${ENV}
|
||||||
|
- PORT=8000
|
||||||
|
- SOLR_URL=http://solr:8983/solr
|
||||||
|
- MONGODB_URL=${MONGODB_URL}
|
||||||
|
- KAFKA_BOOTSTRAP_SERVERS=${KAFKA_BOOTSTRAP_SERVERS}
|
||||||
|
volumes:
|
||||||
|
- ./services/search/backend:/app
|
||||||
|
networks:
|
||||||
|
- site11_network
|
||||||
|
restart: unless-stopped
|
||||||
|
depends_on:
|
||||||
|
- solr
|
||||||
|
- mongodb
|
||||||
|
- kafka
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
# Statistics Service
|
# Statistics Service
|
||||||
statistics-backend:
|
statistics-backend:
|
||||||
build:
|
build:
|
||||||
@ -308,13 +359,15 @@ networks:
|
|||||||
driver: bridge
|
driver: bridge
|
||||||
name: site11_network
|
name: site11_network
|
||||||
|
|
||||||
volumes:
|
# Named volumes are replaced with bind mounts in ./data/ directory
|
||||||
mongodb_data:
|
# volumes:
|
||||||
mongodb_config:
|
# mongodb_data:
|
||||||
redis_data:
|
# mongodb_config:
|
||||||
images_cache:
|
# redis_data:
|
||||||
zookeeper_data:
|
# images_cache:
|
||||||
zookeeper_logs:
|
# zookeeper_data:
|
||||||
kafka_data:
|
# zookeeper_logs:
|
||||||
minio_data:
|
# kafka_data:
|
||||||
files_temp:
|
# minio_data:
|
||||||
|
# files_temp:
|
||||||
|
# solr_data:
|
||||||
140
docs/DATA_PERSISTENCE.md
Normal file
140
docs/DATA_PERSISTENCE.md
Normal file
@ -0,0 +1,140 @@
|
|||||||
|
# Data Persistence Configuration
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
All data services are configured to use bind mounts to local directories for data persistence. This ensures data survives container restarts and rebuilds.
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
```
|
||||||
|
data/
|
||||||
|
├── mongodb/ # MongoDB database files
|
||||||
|
├── redis/ # Redis persistence files
|
||||||
|
├── kafka/ # Kafka log data
|
||||||
|
├── zookeeper/ # Zookeeper data and logs
|
||||||
|
│ ├── data/
|
||||||
|
│ └── logs/
|
||||||
|
├── minio/ # MinIO object storage
|
||||||
|
├── solr/ # Solr search index
|
||||||
|
├── files-temp/ # Temporary file storage
|
||||||
|
└── images-cache/ # Image processing cache
|
||||||
|
```
|
||||||
|
|
||||||
|
## Volume Mappings
|
||||||
|
|
||||||
|
### MongoDB
|
||||||
|
- `./data/mongodb:/data/db` - Database files
|
||||||
|
- `./data/mongodb/configdb:/data/configdb` - Configuration database
|
||||||
|
|
||||||
|
### Redis
|
||||||
|
- `./data/redis:/data` - RDB snapshots and AOF logs
|
||||||
|
|
||||||
|
### Kafka
|
||||||
|
- `./data/kafka:/var/lib/kafka/data` - Message logs
|
||||||
|
|
||||||
|
### Zookeeper
|
||||||
|
- `./data/zookeeper/data:/var/lib/zookeeper/data` - Coordination data
|
||||||
|
- `./data/zookeeper/logs:/var/lib/zookeeper/log` - Transaction logs
|
||||||
|
|
||||||
|
### MinIO
|
||||||
|
- `./data/minio:/data` - Object storage buckets
|
||||||
|
|
||||||
|
### Solr
|
||||||
|
- `./data/solr:/var/solr` - Search index and configuration
|
||||||
|
|
||||||
|
### Application Caches
|
||||||
|
- `./data/files-temp:/tmp` - Temporary file processing
|
||||||
|
- `./data/images-cache:/app/cache` - Processed image cache
|
||||||
|
|
||||||
|
## Backup and Restore
|
||||||
|
|
||||||
|
### Backup All Data
|
||||||
|
```bash
|
||||||
|
# Stop services
|
||||||
|
docker-compose down
|
||||||
|
|
||||||
|
# Create backup
|
||||||
|
tar -czf backup-$(date +%Y%m%d).tar.gz data/
|
||||||
|
|
||||||
|
# Restart services
|
||||||
|
docker-compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
### Restore Data
|
||||||
|
```bash
|
||||||
|
# Stop services
|
||||||
|
docker-compose down
|
||||||
|
|
||||||
|
# Extract backup
|
||||||
|
tar -xzf backup-YYYYMMDD.tar.gz
|
||||||
|
|
||||||
|
# Restart services
|
||||||
|
docker-compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
### Individual Service Backups
|
||||||
|
|
||||||
|
#### MongoDB Backup
|
||||||
|
```bash
|
||||||
|
docker exec site11_mongodb mongodump --out /data/db/backup
|
||||||
|
tar -czf mongodb-backup.tar.gz data/mongodb/backup/
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Redis Backup
|
||||||
|
```bash
|
||||||
|
docker exec site11_redis redis-cli BGSAVE
|
||||||
|
# Wait for completion
|
||||||
|
cp data/redis/dump.rdb redis-backup-$(date +%Y%m%d).rdb
|
||||||
|
```
|
||||||
|
|
||||||
|
## Permissions
|
||||||
|
Ensure proper permissions for data directories:
|
||||||
|
```bash
|
||||||
|
# Set appropriate permissions
|
||||||
|
chmod -R 755 data/
|
||||||
|
```
|
||||||
|
|
||||||
|
## Disk Space Monitoring
|
||||||
|
Monitor disk usage regularly:
|
||||||
|
```bash
|
||||||
|
# Check data directory size
|
||||||
|
du -sh data/*
|
||||||
|
|
||||||
|
# Check individual services
|
||||||
|
du -sh data/mongodb
|
||||||
|
du -sh data/minio
|
||||||
|
du -sh data/kafka
|
||||||
|
```
|
||||||
|
|
||||||
|
## Clean Up Old Data
|
||||||
|
|
||||||
|
### Clear Kafka Logs (older than 7 days)
|
||||||
|
```bash
|
||||||
|
docker exec site11_kafka kafka-log-dirs.sh --describe --bootstrap-server localhost:9092
|
||||||
|
```
|
||||||
|
|
||||||
|
### Clear Image Cache
|
||||||
|
```bash
|
||||||
|
rm -rf data/images-cache/*
|
||||||
|
```
|
||||||
|
|
||||||
|
### Clear Temporary Files
|
||||||
|
```bash
|
||||||
|
rm -rf data/files-temp/*
|
||||||
|
```
|
||||||
|
|
||||||
|
## Migration from Docker Volumes
|
||||||
|
If migrating from named Docker volumes to bind mounts:
|
||||||
|
|
||||||
|
1. Export data from Docker volumes:
|
||||||
|
```bash
|
||||||
|
docker run --rm -v site11_mongodb_data:/source -v $(pwd)/data/mongodb:/dest alpine cp -av /source/. /dest/
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Update docker-compose.yml (already done)
|
||||||
|
|
||||||
|
3. Restart services with new configuration
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
- The `data/` directory is excluded from git via .gitignore
|
||||||
|
- Ensure sufficient disk space for data growth
|
||||||
|
- Consider setting up automated backups for production
|
||||||
|
- Monitor disk I/O performance for database services
|
||||||
21
services/search/backend/Dockerfile
Normal file
21
services/search/backend/Dockerfile
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Install system dependencies
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
gcc \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Copy requirements first for better caching
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# Copy application code
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Create necessary directories
|
||||||
|
RUN mkdir -p /app/logs
|
||||||
|
|
||||||
|
# Run the application
|
||||||
|
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
|
||||||
286
services/search/backend/indexer.py
Normal file
286
services/search/backend/indexer.py
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
"""
|
||||||
|
Data indexer for synchronizing data from other services to Solr
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Any, List
|
||||||
|
from motor.motor_asyncio import AsyncIOMotorClient
|
||||||
|
from aiokafka import AIOKafkaConsumer
|
||||||
|
import json
|
||||||
|
from solr_client import SolrClient
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class DataIndexer:
|
||||||
|
def __init__(self, solr_client: SolrClient, mongodb_url: str, kafka_servers: str):
|
||||||
|
self.solr = solr_client
|
||||||
|
self.mongodb_url = mongodb_url
|
||||||
|
self.kafka_servers = kafka_servers
|
||||||
|
self.mongo_client = None
|
||||||
|
self.kafka_consumer = None
|
||||||
|
self.running = False
|
||||||
|
|
||||||
|
async def start(self):
|
||||||
|
"""Start the indexer"""
|
||||||
|
try:
|
||||||
|
# Connect to MongoDB
|
||||||
|
self.mongo_client = AsyncIOMotorClient(self.mongodb_url)
|
||||||
|
|
||||||
|
# Initialize Kafka consumer
|
||||||
|
await self._init_kafka_consumer()
|
||||||
|
|
||||||
|
# Start background tasks
|
||||||
|
self.running = True
|
||||||
|
asyncio.create_task(self._consume_kafka_events())
|
||||||
|
asyncio.create_task(self._periodic_sync())
|
||||||
|
|
||||||
|
logger.info("Data indexer started")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to start indexer: {e}")
|
||||||
|
|
||||||
|
async def stop(self):
|
||||||
|
"""Stop the indexer"""
|
||||||
|
self.running = False
|
||||||
|
|
||||||
|
if self.kafka_consumer:
|
||||||
|
await self.kafka_consumer.stop()
|
||||||
|
|
||||||
|
if self.mongo_client:
|
||||||
|
self.mongo_client.close()
|
||||||
|
|
||||||
|
logger.info("Data indexer stopped")
|
||||||
|
|
||||||
|
async def _init_kafka_consumer(self):
|
||||||
|
"""Initialize Kafka consumer"""
|
||||||
|
try:
|
||||||
|
self.kafka_consumer = AIOKafkaConsumer(
|
||||||
|
'user_events',
|
||||||
|
'file_events',
|
||||||
|
'content_events',
|
||||||
|
bootstrap_servers=self.kafka_servers,
|
||||||
|
value_deserializer=lambda m: json.loads(m.decode('utf-8')),
|
||||||
|
group_id='search_indexer',
|
||||||
|
auto_offset_reset='latest'
|
||||||
|
)
|
||||||
|
await self.kafka_consumer.start()
|
||||||
|
logger.info("Kafka consumer initialized")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Kafka consumer initialization failed: {e}")
|
||||||
|
self.kafka_consumer = None
|
||||||
|
|
||||||
|
async def _consume_kafka_events(self):
|
||||||
|
"""Consume events from Kafka and index them"""
|
||||||
|
if not self.kafka_consumer:
|
||||||
|
return
|
||||||
|
|
||||||
|
while self.running:
|
||||||
|
try:
|
||||||
|
async for msg in self.kafka_consumer:
|
||||||
|
await self._handle_kafka_event(msg.topic, msg.value)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Kafka consumption error: {e}")
|
||||||
|
await asyncio.sleep(5)
|
||||||
|
|
||||||
|
async def _handle_kafka_event(self, topic: str, event: Dict[str, Any]):
|
||||||
|
"""Handle a Kafka event"""
|
||||||
|
try:
|
||||||
|
event_type = event.get('type')
|
||||||
|
data = event.get('data', {})
|
||||||
|
|
||||||
|
if topic == 'user_events':
|
||||||
|
await self._index_user_event(event_type, data)
|
||||||
|
elif topic == 'file_events':
|
||||||
|
await self._index_file_event(event_type, data)
|
||||||
|
elif topic == 'content_events':
|
||||||
|
await self._index_content_event(event_type, data)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to handle event: {e}")
|
||||||
|
|
||||||
|
async def _index_user_event(self, event_type: str, data: Dict):
|
||||||
|
"""Index user-related events"""
|
||||||
|
if event_type == 'user_created' or event_type == 'user_updated':
|
||||||
|
user_doc = {
|
||||||
|
'id': f"user_{data.get('user_id')}",
|
||||||
|
'doc_type': 'user',
|
||||||
|
'user_id': data.get('user_id'),
|
||||||
|
'username': data.get('username'),
|
||||||
|
'email': data.get('email'),
|
||||||
|
'name': data.get('name', ''),
|
||||||
|
'bio': data.get('bio', ''),
|
||||||
|
'tags': data.get('tags', []),
|
||||||
|
'created_at': data.get('created_at'),
|
||||||
|
'updated_at': datetime.utcnow().isoformat()
|
||||||
|
}
|
||||||
|
self.solr.index_document(user_doc)
|
||||||
|
|
||||||
|
elif event_type == 'user_deleted':
|
||||||
|
self.solr.delete_document(f"user_{data.get('user_id')}")
|
||||||
|
|
||||||
|
async def _index_file_event(self, event_type: str, data: Dict):
|
||||||
|
"""Index file-related events"""
|
||||||
|
if event_type == 'file_uploaded':
|
||||||
|
file_doc = {
|
||||||
|
'id': f"file_{data.get('file_id')}",
|
||||||
|
'doc_type': 'file',
|
||||||
|
'file_id': data.get('file_id'),
|
||||||
|
'filename': data.get('filename'),
|
||||||
|
'content_type': data.get('content_type'),
|
||||||
|
'size': data.get('size'),
|
||||||
|
'user_id': data.get('user_id'),
|
||||||
|
'tags': data.get('tags', []),
|
||||||
|
'description': data.get('description', ''),
|
||||||
|
'created_at': data.get('created_at'),
|
||||||
|
'updated_at': datetime.utcnow().isoformat()
|
||||||
|
}
|
||||||
|
self.solr.index_document(file_doc)
|
||||||
|
|
||||||
|
elif event_type == 'file_deleted':
|
||||||
|
self.solr.delete_document(f"file_{data.get('file_id')}")
|
||||||
|
|
||||||
|
async def _index_content_event(self, event_type: str, data: Dict):
|
||||||
|
"""Index content-related events"""
|
||||||
|
if event_type in ['content_created', 'content_updated']:
|
||||||
|
content_doc = {
|
||||||
|
'id': f"content_{data.get('content_id')}",
|
||||||
|
'doc_type': 'content',
|
||||||
|
'content_id': data.get('content_id'),
|
||||||
|
'title': data.get('title'),
|
||||||
|
'content': data.get('content', ''),
|
||||||
|
'summary': data.get('summary', ''),
|
||||||
|
'author_id': data.get('author_id'),
|
||||||
|
'tags': data.get('tags', []),
|
||||||
|
'category': data.get('category'),
|
||||||
|
'status': data.get('status', 'draft'),
|
||||||
|
'created_at': data.get('created_at'),
|
||||||
|
'updated_at': datetime.utcnow().isoformat()
|
||||||
|
}
|
||||||
|
self.solr.index_document(content_doc)
|
||||||
|
|
||||||
|
elif event_type == 'content_deleted':
|
||||||
|
self.solr.delete_document(f"content_{data.get('content_id')}")
|
||||||
|
|
||||||
|
async def _periodic_sync(self):
|
||||||
|
"""Periodically sync data from MongoDB"""
|
||||||
|
while self.running:
|
||||||
|
try:
|
||||||
|
# Sync every 5 minutes
|
||||||
|
await asyncio.sleep(300)
|
||||||
|
await self.sync_all_data()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Periodic sync error: {e}")
|
||||||
|
|
||||||
|
async def sync_all_data(self):
|
||||||
|
"""Sync all data from MongoDB to Solr"""
|
||||||
|
try:
|
||||||
|
logger.info("Starting full data sync")
|
||||||
|
|
||||||
|
# Sync users
|
||||||
|
await self._sync_users()
|
||||||
|
|
||||||
|
# Sync files
|
||||||
|
await self._sync_files()
|
||||||
|
|
||||||
|
# Optimize index after bulk sync
|
||||||
|
self.solr.optimize_index()
|
||||||
|
|
||||||
|
logger.info("Full data sync completed")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Full sync failed: {e}")
|
||||||
|
|
||||||
|
async def _sync_users(self):
|
||||||
|
"""Sync users from MongoDB"""
|
||||||
|
try:
|
||||||
|
db = self.mongo_client['users_db']
|
||||||
|
collection = db['users']
|
||||||
|
|
||||||
|
users = []
|
||||||
|
async for user in collection.find({'deleted_at': None}):
|
||||||
|
user_doc = {
|
||||||
|
'id': f"user_{str(user['_id'])}",
|
||||||
|
'doc_type': 'user',
|
||||||
|
'user_id': str(user['_id']),
|
||||||
|
'username': user.get('username'),
|
||||||
|
'email': user.get('email'),
|
||||||
|
'name': user.get('name', ''),
|
||||||
|
'bio': user.get('bio', ''),
|
||||||
|
'tags': user.get('tags', []),
|
||||||
|
'created_at': user.get('created_at').isoformat() if user.get('created_at') else None,
|
||||||
|
'updated_at': datetime.utcnow().isoformat()
|
||||||
|
}
|
||||||
|
users.append(user_doc)
|
||||||
|
|
||||||
|
# Bulk index every 100 documents
|
||||||
|
if len(users) >= 100:
|
||||||
|
self.solr.bulk_index(users, 'user')
|
||||||
|
users = []
|
||||||
|
|
||||||
|
# Index remaining users
|
||||||
|
if users:
|
||||||
|
self.solr.bulk_index(users, 'user')
|
||||||
|
|
||||||
|
logger.info(f"Synced users to Solr")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to sync users: {e}")
|
||||||
|
|
||||||
|
async def _sync_files(self):
|
||||||
|
"""Sync files from MongoDB"""
|
||||||
|
try:
|
||||||
|
db = self.mongo_client['files_db']
|
||||||
|
collection = db['file_metadata']
|
||||||
|
|
||||||
|
files = []
|
||||||
|
async for file in collection.find({'deleted_at': None}):
|
||||||
|
file_doc = {
|
||||||
|
'id': f"file_{str(file['_id'])}",
|
||||||
|
'doc_type': 'file',
|
||||||
|
'file_id': str(file['_id']),
|
||||||
|
'filename': file.get('filename'),
|
||||||
|
'original_name': file.get('original_name'),
|
||||||
|
'content_type': file.get('content_type'),
|
||||||
|
'size': file.get('size'),
|
||||||
|
'user_id': file.get('user_id'),
|
||||||
|
'tags': list(file.get('tags', {}).keys()),
|
||||||
|
'description': file.get('metadata', {}).get('description', ''),
|
||||||
|
'created_at': file.get('created_at').isoformat() if file.get('created_at') else None,
|
||||||
|
'updated_at': datetime.utcnow().isoformat()
|
||||||
|
}
|
||||||
|
files.append(file_doc)
|
||||||
|
|
||||||
|
# Bulk index every 100 documents
|
||||||
|
if len(files) >= 100:
|
||||||
|
self.solr.bulk_index(files, 'file')
|
||||||
|
files = []
|
||||||
|
|
||||||
|
# Index remaining files
|
||||||
|
if files:
|
||||||
|
self.solr.bulk_index(files, 'file')
|
||||||
|
|
||||||
|
logger.info(f"Synced files to Solr")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to sync files: {e}")
|
||||||
|
|
||||||
|
async def reindex_collection(self, collection_name: str, doc_type: str):
|
||||||
|
"""Reindex a specific collection"""
|
||||||
|
try:
|
||||||
|
# Delete existing documents of this type
|
||||||
|
self.solr.delete_by_query(f'doc_type:{doc_type}')
|
||||||
|
|
||||||
|
# Sync the collection
|
||||||
|
if collection_name == 'users':
|
||||||
|
await self._sync_users()
|
||||||
|
elif collection_name == 'files':
|
||||||
|
await self._sync_files()
|
||||||
|
|
||||||
|
logger.info(f"Reindexed {collection_name}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to reindex {collection_name}: {e}")
|
||||||
362
services/search/backend/main.py
Normal file
362
services/search/backend/main.py
Normal file
@ -0,0 +1,362 @@
|
|||||||
|
"""
|
||||||
|
Search Service with Apache Solr
|
||||||
|
"""
|
||||||
|
from fastapi import FastAPI, Query, HTTPException
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Optional, List, Dict, Any
|
||||||
|
from datetime import datetime
|
||||||
|
from solr_client import SolrClient
|
||||||
|
from indexer import DataIndexer
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Global instances
|
||||||
|
solr_client = None
|
||||||
|
data_indexer = None
|
||||||
|
|
||||||
|
@asynccontextmanager
|
||||||
|
async def lifespan(app: FastAPI):
|
||||||
|
"""Manage application lifecycle"""
|
||||||
|
global solr_client, data_indexer
|
||||||
|
|
||||||
|
# Startup
|
||||||
|
logger.info("Starting Search Service...")
|
||||||
|
|
||||||
|
# Wait for Solr to be ready
|
||||||
|
solr_url = os.getenv("SOLR_URL", "http://solr:8983/solr")
|
||||||
|
max_retries = 30
|
||||||
|
|
||||||
|
for i in range(max_retries):
|
||||||
|
try:
|
||||||
|
solr_client = SolrClient(solr_url=solr_url, core_name="site11")
|
||||||
|
logger.info("Connected to Solr")
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Waiting for Solr... ({i+1}/{max_retries})")
|
||||||
|
await asyncio.sleep(2)
|
||||||
|
|
||||||
|
if solr_client:
|
||||||
|
# Initialize data indexer
|
||||||
|
mongodb_url = os.getenv("MONGODB_URL", "mongodb://mongodb:27017")
|
||||||
|
kafka_servers = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "kafka:9092")
|
||||||
|
|
||||||
|
data_indexer = DataIndexer(solr_client, mongodb_url, kafka_servers)
|
||||||
|
await data_indexer.start()
|
||||||
|
|
||||||
|
# Initial data sync
|
||||||
|
asyncio.create_task(data_indexer.sync_all_data())
|
||||||
|
|
||||||
|
yield
|
||||||
|
|
||||||
|
# Shutdown
|
||||||
|
if data_indexer:
|
||||||
|
await data_indexer.stop()
|
||||||
|
|
||||||
|
logger.info("Search Service stopped")
|
||||||
|
|
||||||
|
app = FastAPI(
|
||||||
|
title="Search Service",
|
||||||
|
description="Full-text search with Apache Solr",
|
||||||
|
version="1.0.0",
|
||||||
|
lifespan=lifespan
|
||||||
|
)
|
||||||
|
|
||||||
|
@app.get("/health")
|
||||||
|
async def health_check():
|
||||||
|
"""Health check endpoint"""
|
||||||
|
return {
|
||||||
|
"status": "healthy",
|
||||||
|
"service": "search",
|
||||||
|
"timestamp": datetime.utcnow().isoformat(),
|
||||||
|
"solr_connected": solr_client is not None
|
||||||
|
}
|
||||||
|
|
||||||
|
@app.get("/api/search")
|
||||||
|
async def search(
|
||||||
|
q: str = Query(..., description="Search query"),
|
||||||
|
doc_type: Optional[str] = Query(None, description="Filter by document type"),
|
||||||
|
start: int = Query(0, ge=0, description="Starting offset"),
|
||||||
|
rows: int = Query(10, ge=1, le=100, description="Number of results"),
|
||||||
|
sort: Optional[str] = Query(None, description="Sort order (e.g., 'created_at desc')"),
|
||||||
|
facet: bool = Query(False, description="Enable faceting"),
|
||||||
|
facet_field: Optional[List[str]] = Query(None, description="Fields to facet on")
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Search documents across all indexed content
|
||||||
|
"""
|
||||||
|
if not solr_client:
|
||||||
|
raise HTTPException(status_code=503, detail="Search service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Build filter query
|
||||||
|
fq = []
|
||||||
|
if doc_type:
|
||||||
|
fq.append(f"doc_type:{doc_type}")
|
||||||
|
|
||||||
|
# Prepare search parameters
|
||||||
|
search_params = {
|
||||||
|
'start': start,
|
||||||
|
'rows': rows,
|
||||||
|
'facet': facet
|
||||||
|
}
|
||||||
|
|
||||||
|
if fq:
|
||||||
|
search_params['fq'] = fq
|
||||||
|
|
||||||
|
if sort:
|
||||||
|
search_params['sort'] = sort
|
||||||
|
|
||||||
|
if facet_field:
|
||||||
|
search_params['facet_field'] = facet_field
|
||||||
|
|
||||||
|
# Execute search
|
||||||
|
results = solr_client.search(q, **search_params)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"query": q,
|
||||||
|
"total": results['total'],
|
||||||
|
"start": start,
|
||||||
|
"rows": rows,
|
||||||
|
"documents": results['documents'],
|
||||||
|
"facets": results.get('facets', {}),
|
||||||
|
"highlighting": results.get('highlighting', {})
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Search failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
@app.get("/api/search/suggest")
|
||||||
|
async def suggest(
|
||||||
|
q: str = Query(..., min_length=1, description="Query prefix"),
|
||||||
|
field: str = Query("title", description="Field to search in"),
|
||||||
|
limit: int = Query(10, ge=1, le=50, description="Maximum suggestions")
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Get autocomplete suggestions
|
||||||
|
"""
|
||||||
|
if not solr_client:
|
||||||
|
raise HTTPException(status_code=503, detail="Search service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
suggestions = solr_client.suggest(q, field, limit)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"query": q,
|
||||||
|
"suggestions": suggestions,
|
||||||
|
"count": len(suggestions)
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Suggest failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
@app.get("/api/search/similar/{doc_id}")
|
||||||
|
async def find_similar(
|
||||||
|
doc_id: str,
|
||||||
|
rows: int = Query(5, ge=1, le=20, description="Number of similar documents")
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Find documents similar to the given document
|
||||||
|
"""
|
||||||
|
if not solr_client:
|
||||||
|
raise HTTPException(status_code=503, detail="Search service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
similar_docs = solr_client.more_like_this(doc_id, rows=rows)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"source_document": doc_id,
|
||||||
|
"similar_documents": similar_docs,
|
||||||
|
"count": len(similar_docs)
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Similar search failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
@app.post("/api/search/index")
|
||||||
|
async def index_document(document: Dict[str, Any]):
|
||||||
|
"""
|
||||||
|
Index a single document
|
||||||
|
"""
|
||||||
|
if not solr_client:
|
||||||
|
raise HTTPException(status_code=503, detail="Search service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
doc_type = document.get('doc_type', 'general')
|
||||||
|
success = solr_client.index_document(document, doc_type)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"message": "Document indexed",
|
||||||
|
"document_id": document.get('id')
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=500, detail="Failed to index document")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Indexing failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
@app.post("/api/search/bulk-index")
|
||||||
|
async def bulk_index(documents: List[Dict[str, Any]]):
|
||||||
|
"""
|
||||||
|
Bulk index multiple documents
|
||||||
|
"""
|
||||||
|
if not solr_client:
|
||||||
|
raise HTTPException(status_code=503, detail="Search service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
indexed = solr_client.bulk_index(documents)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"message": f"Indexed {indexed} documents",
|
||||||
|
"count": indexed
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Bulk indexing failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
@app.delete("/api/search/document/{doc_id}")
|
||||||
|
async def delete_document(doc_id: str):
|
||||||
|
"""
|
||||||
|
Delete a document from the index
|
||||||
|
"""
|
||||||
|
if not solr_client:
|
||||||
|
raise HTTPException(status_code=503, detail="Search service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
success = solr_client.delete_document(doc_id)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"message": "Document deleted",
|
||||||
|
"document_id": doc_id
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=500, detail="Failed to delete document")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Deletion failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
@app.get("/api/search/stats")
|
||||||
|
async def get_stats():
|
||||||
|
"""
|
||||||
|
Get search index statistics
|
||||||
|
"""
|
||||||
|
if not solr_client:
|
||||||
|
raise HTTPException(status_code=503, detail="Search service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
stats = solr_client.get_stats()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"statistics": stats,
|
||||||
|
"timestamp": datetime.utcnow().isoformat()
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to get stats: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
@app.post("/api/search/reindex/{collection}")
|
||||||
|
async def reindex_collection(
|
||||||
|
collection: str,
|
||||||
|
doc_type: Optional[str] = Query(None, description="Document type for the collection")
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Reindex a specific collection
|
||||||
|
"""
|
||||||
|
if not data_indexer:
|
||||||
|
raise HTTPException(status_code=503, detail="Indexer service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not doc_type:
|
||||||
|
# Map collection to doc_type
|
||||||
|
doc_type_map = {
|
||||||
|
'users': 'user',
|
||||||
|
'files': 'file',
|
||||||
|
'content': 'content'
|
||||||
|
}
|
||||||
|
doc_type = doc_type_map.get(collection, collection)
|
||||||
|
|
||||||
|
asyncio.create_task(data_indexer.reindex_collection(collection, doc_type))
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"message": f"Reindexing {collection} started",
|
||||||
|
"collection": collection,
|
||||||
|
"doc_type": doc_type
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Reindex failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
@app.post("/api/search/optimize")
|
||||||
|
async def optimize_index():
|
||||||
|
"""
|
||||||
|
Optimize the search index
|
||||||
|
"""
|
||||||
|
if not solr_client:
|
||||||
|
raise HTTPException(status_code=503, detail="Search service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
success = solr_client.optimize_index()
|
||||||
|
|
||||||
|
if success:
|
||||||
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"message": "Index optimization started"
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=500, detail="Failed to optimize index")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Optimization failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
@app.post("/api/search/clear")
|
||||||
|
async def clear_index():
|
||||||
|
"""
|
||||||
|
Clear all documents from the index (DANGER!)
|
||||||
|
"""
|
||||||
|
if not solr_client:
|
||||||
|
raise HTTPException(status_code=503, detail="Search service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
success = solr_client.clear_index()
|
||||||
|
|
||||||
|
if success:
|
||||||
|
return {
|
||||||
|
"status": "success",
|
||||||
|
"message": "Index cleared",
|
||||||
|
"warning": "All documents have been deleted!"
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=500, detail="Failed to clear index")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Clear index failed: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import uvicorn
|
||||||
|
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||||
10
services/search/backend/requirements.txt
Normal file
10
services/search/backend/requirements.txt
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
fastapi==0.109.0
|
||||||
|
uvicorn[standard]==0.27.0
|
||||||
|
pydantic==2.5.3
|
||||||
|
python-dotenv==1.0.0
|
||||||
|
pysolr==3.9.0
|
||||||
|
httpx==0.25.2
|
||||||
|
motor==3.5.1
|
||||||
|
pymongo==4.6.1
|
||||||
|
aiokafka==0.10.0
|
||||||
|
redis==5.0.1
|
||||||
303
services/search/backend/solr_client.py
Normal file
303
services/search/backend/solr_client.py
Normal file
@ -0,0 +1,303 @@
|
|||||||
|
"""
|
||||||
|
Apache Solr client for search operations
|
||||||
|
"""
|
||||||
|
import pysolr
|
||||||
|
import logging
|
||||||
|
from typing import Dict, List, Any, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
import json
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class SolrClient:
    """Thin wrapper around pysolr for a single core.

    Provides indexing (single and bulk), edismax search with facets and
    highlighting, autocomplete suggestions, more-like-this, deletion, and
    maintenance operations (stats, optimize, clear). All operations log and
    swallow errors, returning a failure value instead of raising — except
    connect(), which re-raises so startup fails loudly.
    """

    def __init__(self, solr_url: str = "http://solr:8983/solr", core_name: str = "site11"):
        """Build the core URL and connect immediately (raises on failure)."""
        self.solr_url = f"{solr_url}/{core_name}"
        self.core_name = core_name
        self.solr = None
        self.connect()

    def connect(self):
        """Connect to Solr instance and verify it with a ping. Raises on failure."""
        try:
            self.solr = pysolr.Solr(
                self.solr_url,
                always_commit=True,  # every add/delete is committed immediately
                timeout=10
            )
            # Test connection
            self.solr.ping()
            logger.info(f"Connected to Solr at {self.solr_url}")
        except Exception as e:
            logger.error(f"Failed to connect to Solr: {e}")
            raise

    @staticmethod
    def _escape(term: str) -> str:
        """Backslash-escape Lucene/Solr query special characters in a term.

        Prevents user-supplied values from being interpreted as query syntax
        (e.g. ':', '*', '(') when interpolated into a query string. Escaping
        single characters is conservative but safe for the two-character
        operators '&&' and '||' as well.
        """
        specials = set('+-&|!(){}[]^"~*?:/\\')
        return "".join(f"\\{ch}" if ch in specials else ch for ch in str(term))

    def index_document(self, document: Dict[str, Any], doc_type: str = None) -> bool:
        """Index a single document in place (mutates it) and return success.

        Adds doc_type (when given), a generated id (when missing), and an
        indexed_at UTC timestamp before sending the document to Solr.
        """
        try:
            # Add metadata
            if doc_type:
                document["doc_type"] = doc_type

            if "id" not in document:
                # Bug fix: without a doc_type the old code produced ids like
                # "None_<_id>"; fall back to the raw _id in that case.
                suffix = document.get('_id', '')
                document["id"] = f"{doc_type}_{suffix}" if doc_type else str(suffix)

            # Add indexing timestamp
            document["indexed_at"] = datetime.utcnow().isoformat()

            # Index the document
            self.solr.add([document])
            logger.info(f"Indexed document: {document.get('id')}")
            return True

        except Exception as e:
            logger.error(f"Failed to index document: {e}")
            return False

    def bulk_index(self, documents: List[Dict[str, Any]], doc_type: str = None) -> int:
        """Bulk index documents; return the number indexed (0 on failure).

        Note: mutates the passed dicts in place (doc_type / id / indexed_at),
        same as index_document.
        """
        try:
            for doc in documents:
                if doc_type:
                    doc["doc_type"] = doc_type

                if "id" not in doc:
                    # Same id-generation fallback as index_document.
                    suffix = doc.get('_id', '')
                    doc["id"] = f"{doc_type}_{suffix}" if doc_type else str(suffix)

                doc["indexed_at"] = datetime.utcnow().isoformat()

            self.solr.add(documents)
            indexed = len(documents)
            logger.info(f"Bulk indexed {indexed} documents")
            return indexed

        except Exception as e:
            logger.error(f"Failed to bulk index: {e}")
            return 0

    def search(self, query: str, **kwargs) -> Dict[str, Any]:
        """
        Search documents using the edismax parser with highlighting enabled.

        Args:
            query: Search query string
            **kwargs: Additional search parameters
                - fq: Filter queries
                - fl: Fields to return
                - start: Starting offset
                - rows: Number of rows
                - sort: Sort order
                - facet: Enable faceting
                - facet_field: Fields to facet on

        Returns:
            Dict with 'total', 'documents', 'facets', 'highlighting'.
            On failure: {'total': 0, 'documents': [], 'error': <message>}.
        """
        try:
            # Default parameters
            params = {
                'q': query,
                'start': kwargs.get('start', 0),
                'rows': kwargs.get('rows', 10),
                'fl': kwargs.get('fl', '*,score'),
                'defType': 'edismax',
                'qf': 'title^3 content^2 tags description name',  # Boost fields
                'mm': '2<-25%',  # Minimum match
                'hl': 'true',  # Highlighting
                'hl.fl': 'title,content,description',
                'hl.simple.pre': '<mark>',
                'hl.simple.post': '</mark>'
            }

            # Add filter queries
            if 'fq' in kwargs:
                params['fq'] = kwargs['fq']

            # Add sorting
            if 'sort' in kwargs:
                params['sort'] = kwargs['sort']

            # Add faceting
            if kwargs.get('facet'):
                params.update({
                    'facet': 'true',
                    'facet.field': kwargs.get('facet_field', ['doc_type', 'tags', 'status']),
                    'facet.mincount': 1
                })

            # Execute search
            results = self.solr.search(**params)

            # Format response
            response = {
                'total': results.hits,
                'documents': [],
                'facets': {},
                'highlighting': {}
            }

            # Add documents
            for doc in results.docs:
                response['documents'].append(doc)

            # Add facets if available. Solr returns each facet field as a flat
            # [value, count, value, count, ...] list; pair them up here.
            if hasattr(results, 'facets') and results.facets:
                if 'facet_fields' in results.facets:
                    for field, values in results.facets['facet_fields'].items():
                        response['facets'][field] = [
                            {'value': values[i], 'count': values[i + 1]}
                            for i in range(0, len(values), 2)
                        ]

            # Add highlighting if available
            if hasattr(results, 'highlighting'):
                response['highlighting'] = results.highlighting

            return response

        except Exception as e:
            logger.error(f"Search failed: {e}")
            return {'total': 0, 'documents': [], 'error': str(e)}

    def suggest(self, prefix: str, field: str = "suggest", limit: int = 10) -> List[str]:
        """Get autocomplete suggestions: prefix-match on `field`, deduplicated
        (order-preserving) and capped at `limit`. Returns [] on failure."""
        try:
            # Bug fix: escape the user-supplied prefix so query specials
            # (':', '*', '(' ...) cannot break or hijack the Solr query.
            params = {
                'q': f'{field}:{self._escape(prefix)}*',
                'fl': field,
                'rows': limit,
                'start': 0
            }

            results = self.solr.search(**params)
            suggestions = []

            for doc in results.docs:
                if field in doc:
                    value = doc[field]
                    if isinstance(value, list):
                        suggestions.extend(value)
                    else:
                        suggestions.append(value)

            # Remove duplicates and limit
            seen = set()
            unique_suggestions = []
            for s in suggestions:
                if s not in seen:
                    seen.add(s)
                    unique_suggestions.append(s)
                    if len(unique_suggestions) >= limit:
                        break

            return unique_suggestions

        except Exception as e:
            logger.error(f"Suggest failed: {e}")
            return []

    def more_like_this(self, doc_id: str, mlt_fields: List[str] = None, rows: int = 5) -> List[Dict]:
        """Find documents similar to `doc_id` via Solr MoreLikeThis.

        Returns the list of similar docs, or [] when none / on failure.
        """
        try:
            if not mlt_fields:
                mlt_fields = ['title', 'content', 'tags', 'description']

            params = {
                # Bug fix: escape doc_id (ids may contain ':' or other specials).
                'q': f'id:{self._escape(doc_id)}',
                'mlt': 'true',
                'mlt.fl': ','.join(mlt_fields),
                'mlt.mindf': 1,
                'mlt.mintf': 1,
                'mlt.count': rows,
                'fl': '*,score'
            }

            results = self.solr.search(**params)

            if results.docs:
                # The MLT results are in the moreLikeThis section, keyed by the
                # raw (unescaped) document id.
                if hasattr(results, 'moreLikeThis'):
                    mlt_results = results.moreLikeThis.get(doc_id, {})
                    if 'docs' in mlt_results:
                        return mlt_results['docs']

            return []

        except Exception as e:
            logger.error(f"More like this failed: {e}")
            return []

    def delete_document(self, doc_id: str) -> bool:
        """Delete a document by ID; return True on success."""
        try:
            self.solr.delete(id=doc_id)
            logger.info(f"Deleted document: {doc_id}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete document: {e}")
            return False

    def delete_by_query(self, query: str) -> bool:
        """Delete all documents matching a Solr query; return True on success."""
        try:
            self.solr.delete(q=query)
            logger.info(f"Deleted documents matching: {query}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete by query: {e}")
            return False

    def clear_index(self) -> bool:
        """Delete every document in the index ('*:*'); return True on success."""
        try:
            self.solr.delete(q='*:*')
            logger.info("Cleared all documents from index")
            return True
        except Exception as e:
            logger.error(f"Failed to clear index: {e}")
            return False

    def get_stats(self) -> Dict[str, Any]:
        """Return index statistics: total document count plus per-doc_type and
        per-status counts. On failure returns {'error': <message>}."""
        try:
            # Get document count
            results = self.solr.search(q='*:*', rows=0)

            # Get facet counts for doc_type and status
            facet_results = self.solr.search(
                q='*:*',
                rows=0,
                facet='true',
                **{'facet.field': ['doc_type', 'status']}
            )

            stats = {
                'total_documents': results.hits,
                'doc_types': {},
                'status_counts': {}
            }

            if hasattr(facet_results, 'facets') and facet_results.facets:
                if 'facet_fields' in facet_results.facets:
                    # Parse doc_type facets (flat [value, count, ...] list)
                    doc_type_facets = facet_results.facets['facet_fields'].get('doc_type', [])
                    for i in range(0, len(doc_type_facets), 2):
                        stats['doc_types'][doc_type_facets[i]] = doc_type_facets[i + 1]

                    # Parse status facets
                    status_facets = facet_results.facets['facet_fields'].get('status', [])
                    for i in range(0, len(status_facets), 2):
                        stats['status_counts'][status_facets[i]] = status_facets[i + 1]

            return stats

        except Exception as e:
            logger.error(f"Failed to get stats: {e}")
            return {'error': str(e)}

    def optimize_index(self) -> bool:
        """Trigger a Solr index optimize; return True on success."""
        try:
            self.solr.optimize()
            logger.info("Index optimized")
            return True
        except Exception as e:
            logger.error(f"Failed to optimize index: {e}")
            return False
|
||||||
292
services/search/backend/test_search.py
Normal file
292
services/search/backend/test_search.py
Normal file
@ -0,0 +1,292 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for Search Service with Apache Solr
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import httpx
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
BASE_URL = "http://localhost:8015"
|
||||||
|
|
||||||
|
async def test_search_api():
    """Exercise every search API endpoint end-to-end against a running
    service at BASE_URL: health, index, bulk-index, basic / filtered /
    faceted search, suggest, similar, highlighting, stats, complex query,
    and delete-with-verify. Prints results; asserts nothing.

    NOTE(review): requires the search service (and Solr behind it) to be up;
    short sleeps below give Solr's soft-commit a chance to make new docs
    visible — presumably matched to the autoSoftCommit interval, confirm.
    """
    async with httpx.AsyncClient() as client:
        print("\n🔍 Testing Search Service API...")

        # Test health check
        print("\n1. Testing health check...")
        response = await client.get(f"{BASE_URL}/health")
        print(f"Health check: {response.json()}")

        # Test index sample documents
        print("\n2. Indexing sample documents...")

        # Index user document
        user_doc = {
            "id": "user_test_001",
            "doc_type": "user",
            "user_id": "test_001",
            "username": "john_doe",
            "email": "john@example.com",
            "name": "John Doe",
            "bio": "Software developer passionate about Python and microservices",
            "tags": ["python", "developer", "backend"],
            "created_at": datetime.utcnow().isoformat()
        }

        response = await client.post(f"{BASE_URL}/api/search/index", json=user_doc)
        print(f"Indexed user: {response.json()}")

        # Index file documents
        file_docs = [
            {
                "id": "file_test_001",
                "doc_type": "file",
                "file_id": "test_file_001",
                "filename": "architecture_diagram.png",
                "content_type": "image/png",
                "size": 1024000,
                "user_id": "test_001",
                "tags": ["architecture", "design", "documentation"],
                "description": "System architecture diagram showing microservices",
                "created_at": datetime.utcnow().isoformat()
            },
            {
                "id": "file_test_002",
                "doc_type": "file",
                "file_id": "test_file_002",
                "filename": "user_manual.pdf",
                "content_type": "application/pdf",
                "size": 2048000,
                "user_id": "test_001",
                "tags": ["documentation", "manual", "guide"],
                "description": "Complete user manual for the application",
                "created_at": datetime.utcnow().isoformat()
            }
        ]

        response = await client.post(f"{BASE_URL}/api/search/bulk-index", json=file_docs)
        print(f"Bulk indexed files: {response.json()}")

        # Index content documents
        content_docs = [
            {
                "id": "content_test_001",
                "doc_type": "content",
                "content_id": "test_content_001",
                "title": "Getting Started with Microservices",
                "content": "Microservices architecture is a method of developing software applications as a suite of independently deployable services.",
                "summary": "Introduction to microservices architecture patterns",
                "author_id": "test_001",
                "tags": ["microservices", "architecture", "tutorial"],
                "category": "technology",
                "status": "published",
                "created_at": datetime.utcnow().isoformat()
            },
            {
                "id": "content_test_002",
                "doc_type": "content",
                "content_id": "test_content_002",
                "title": "Python Best Practices",
                "content": "Learn the best practices for writing clean, maintainable Python code including PEP 8 style guide.",
                "summary": "Essential Python coding standards and practices",
                "author_id": "test_001",
                "tags": ["python", "programming", "best-practices"],
                "category": "programming",
                "status": "published",
                "created_at": datetime.utcnow().isoformat()
            }
        ]

        response = await client.post(f"{BASE_URL}/api/search/bulk-index", json=content_docs)
        print(f"Bulk indexed content: {response.json()}")

        # Wait for indexing (give Solr time to make the docs searchable)
        await asyncio.sleep(2)

        # Test basic search
        print("\n3. Testing basic search...")
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={"q": "microservices"}
        )
        results = response.json()
        print(f"Search for 'microservices': Found {results['total']} results")
        if results['documents']:
            # First hit may be a content doc (title) or a file doc (filename).
            print(f"First result: {results['documents'][0].get('title', results['documents'][0].get('filename', 'N/A'))}")

        # Test search with filters
        print("\n4. Testing filtered search...")
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={
                "q": "*:*",
                "doc_type": "file",
                "rows": 5
            }
        )
        results = response.json()
        print(f"Files search: Found {results['total']} files")

        # Test faceted search
        print("\n5. Testing faceted search...")
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={
                "q": "*:*",
                "facet": "true",
                "facet_field": ["doc_type", "tags", "category", "status"]
            }
        )
        results = response.json()
        print(f"Facets: {json.dumps(results['facets'], indent=2)}")

        # Test autocomplete/suggest
        print("\n6. Testing autocomplete...")
        response = await client.get(
            f"{BASE_URL}/api/search/suggest",
            params={
                "q": "micro",
                "field": "title",
                "limit": 5
            }
        )
        suggestions = response.json()
        print(f"Suggestions for 'micro': {suggestions['suggestions']}")

        # Test similar documents (more-like-this)
        print("\n7. Testing similar documents...")
        response = await client.get(f"{BASE_URL}/api/search/similar/content_test_001")
        if response.status_code == 200:
            similar = response.json()
            print(f"Found {similar['count']} similar documents")
        else:
            print(f"Similar search: {response.status_code}")

        # Test search with highlighting
        print("\n8. Testing search with highlighting...")
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={"q": "Python"}
        )
        results = response.json()
        if results['highlighting']:
            print(f"Highlighting results: {len(results['highlighting'])} documents highlighted")

        # Test search statistics
        print("\n9. Testing search statistics...")
        response = await client.get(f"{BASE_URL}/api/search/stats")
        if response.status_code == 200:
            stats = response.json()
            print(f"Index stats: {stats['statistics']}")

        # Test complex query (boolean operators + filter + sort)
        print("\n10. Testing complex query...")
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={
                "q": "architecture OR python",
                "doc_type": "content",
                "sort": "created_at desc",
                "rows": 10
            }
        )
        results = response.json()
        print(f"Complex query: Found {results['total']} results")

        # Test delete document
        print("\n11. Testing document deletion...")
        response = await client.delete(f"{BASE_URL}/api/search/document/content_test_002")
        if response.status_code == 200:
            print(f"Deleted document: {response.json()}")

        # Verify deletion (brief wait so the delete is visible to searchers)
        await asyncio.sleep(1)
        response = await client.get(
            f"{BASE_URL}/api/search",
            params={"q": "id:content_test_002"}
        )
        results = response.json()
        print(f"Verify deletion: Found {results['total']} results (should be 0)")
|
||||||
|
|
||||||
|
async def test_performance():
    """Bulk-index 100 synthetic documents, then time five representative
    queries against the running service and print the elapsed times."""
    print("\n\n⚡ Testing Search Performance...")

    async with httpx.AsyncClient(timeout=30.0) as client:
        # Build and index a batch of synthetic content documents.
        print("Indexing 100 test documents...")
        docs = [
            {
                "id": f"perf_test_{i}",
                "doc_type": "content",
                "title": f"Test Document {i}",
                "content": f"This is test content for document {i} with various keywords like search, Solr, Python, microservices",
                "tags": [f"tag{i%10}", f"category{i%5}"],
                "created_at": datetime.utcnow().isoformat()
            }
            for i in range(100)
        ]

        response = await client.post(f"{BASE_URL}/api/search/bulk-index", json=docs)
        print(f"Indexed {response.json().get('count', 0)} documents")

        # Wait for indexing
        await asyncio.sleep(2)

        # Time a handful of representative queries.
        print("\nTesting search response times...")
        import time

        for query in ["search", "Python", "document", "test", "microservices"]:
            start = time.time()
            response = await client.get(
                f"{BASE_URL}/api/search",
                params={"q": query, "rows": 20}
            )
            elapsed = time.time() - start
            results = response.json()
            print(f"Query '{query}': {results['total']} results in {elapsed:.3f}s")
|
||||||
|
|
||||||
|
async def test_reindex():
    """Trigger a reindex of the users collection and an index optimization,
    printing each endpoint's response."""
    print("\n\n🔄 Testing Reindex Functionality...")

    async with httpx.AsyncClient() as client:
        # Trigger reindex for users collection
        print("Triggering reindex for users collection...")
        resp = await client.post(
            f"{BASE_URL}/api/search/reindex/users",
            params={"doc_type": "user"}
        )
        if resp.status_code == 200:
            print(f"Reindex started: {resp.json()}")
        else:
            print(f"Reindex failed: {resp.status_code}")

        # Test index optimization
        print("\nTesting index optimization...")
        resp = await client.post(f"{BASE_URL}/api/search/optimize")
        if resp.status_code == 200:
            print(f"Optimization: {resp.json()}")
|
||||||
|
|
||||||
|
async def main():
    """Run the full search-service test suite with start/finish banners."""
    rule = "=" * 60
    print(rule)
    print("SEARCH SERVICE TEST SUITE (Apache Solr)")
    print(rule)
    print(f"Started at: {datetime.now().isoformat()}")

    # Run each sub-suite in order.
    for suite in (test_search_api, test_performance, test_reindex):
        await suite()

    print("\n" + rule)
    print("✅ All search tests completed!")
    print(f"Finished at: {datetime.now().isoformat()}")
    print(rule)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
105
services/search/solr-config/conf/managed-schema.xml
Normal file
105
services/search/solr-config/conf/managed-schema.xml
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<schema name="site11" version="1.6">
|
||||||
|
<!-- Field Types -->
|
||||||
|
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
|
||||||
|
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
|
||||||
|
<fieldType name="int" class="solr.IntPointField" omitNorms="true"/>
|
||||||
|
<fieldType name="long" class="solr.LongPointField" omitNorms="true"/>
|
||||||
|
<fieldType name="float" class="solr.FloatPointField" omitNorms="true"/>
|
||||||
|
<fieldType name="double" class="solr.DoublePointField" omitNorms="true"/>
|
||||||
|
<fieldType name="date" class="solr.DatePointField" omitNorms="true"/>
|
||||||
|
|
||||||
|
<!-- Text field with analysis -->
|
||||||
|
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
<analyzer type="index">
|
||||||
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
<filter class="solr.EdgeNGramFilterFactory" minGramSize="2" maxGramSize="15"/>
|
||||||
|
</analyzer>
|
||||||
|
<analyzer type="query">
|
||||||
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||||
|
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
<!-- Text field for exact matching -->
|
||||||
|
<fieldType name="text_exact" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
<!-- Autocomplete/Suggest field -->
|
||||||
|
<fieldType name="text_suggest" class="solr.TextField" positionIncrementGap="100">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||||
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
|
<filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="20"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
|
<!-- Fields -->
|
||||||
|
<field name="id" type="string" indexed="true" stored="true" required="true"/>
|
||||||
|
<field name="_version_" type="long" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
<!-- Document type and metadata -->
|
||||||
|
<field name="doc_type" type="string" indexed="true" stored="true" docValues="true"/>
|
||||||
|
<field name="indexed_at" type="date" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
<!-- Common fields across document types -->
|
||||||
|
<field name="title" type="text_general" indexed="true" stored="true" termVectors="true"/>
|
||||||
|
<field name="content" type="text_general" indexed="true" stored="true" termVectors="true"/>
|
||||||
|
<field name="description" type="text_general" indexed="true" stored="true"/>
|
||||||
|
<field name="summary" type="text_general" indexed="true" stored="true"/>
|
||||||
|
<field name="tags" type="string" indexed="true" stored="true" multiValued="true" docValues="true"/>
|
||||||
|
<field name="category" type="string" indexed="true" stored="true" docValues="true"/>
|
||||||
|
<field name="status" type="string" indexed="true" stored="true" docValues="true"/>
|
||||||
|
|
||||||
|
<!-- User-specific fields -->
|
||||||
|
<field name="user_id" type="string" indexed="true" stored="true"/>
|
||||||
|
<field name="username" type="text_exact" indexed="true" stored="true"/>
|
||||||
|
<field name="email" type="text_exact" indexed="true" stored="true"/>
|
||||||
|
<field name="name" type="text_general" indexed="true" stored="true"/>
|
||||||
|
<field name="bio" type="text_general" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
<!-- File-specific fields -->
|
||||||
|
<field name="file_id" type="string" indexed="true" stored="true"/>
|
||||||
|
<field name="filename" type="text_general" indexed="true" stored="true"/>
|
||||||
|
<field name="original_name" type="text_general" indexed="true" stored="true"/>
|
||||||
|
<field name="content_type" type="string" indexed="true" stored="true" docValues="true"/>
|
||||||
|
<field name="size" type="long" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
<!-- Content-specific fields -->
|
||||||
|
<field name="content_id" type="string" indexed="true" stored="true"/>
|
||||||
|
<field name="author_id" type="string" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
<!-- Dates -->
|
||||||
|
<field name="created_at" type="date" indexed="true" stored="true"/>
|
||||||
|
<field name="updated_at" type="date" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
<!-- Suggest field for autocomplete -->
|
||||||
|
<field name="suggest" type="text_suggest" indexed="true" stored="false" multiValued="true"/>
|
||||||
|
|
||||||
|
<!-- Copy fields for better search -->
|
||||||
|
<copyField source="title" dest="suggest"/>
|
||||||
|
<copyField source="name" dest="suggest"/>
|
||||||
|
<copyField source="filename" dest="suggest"/>
|
||||||
|
<copyField source="tags" dest="suggest"/>
|
||||||
|
|
||||||
|
<!-- Dynamic fields -->
|
||||||
|
<dynamicField name="*_i" type="int" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
|
||||||
|
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
|
||||||
|
|
||||||
|
<!-- Unique Key -->
|
||||||
|
<uniqueKey>id</uniqueKey>
|
||||||
|
</schema>
|
||||||
154
services/search/solr-config/conf/solrconfig.xml
Normal file
154
services/search/solr-config/conf/solrconfig.xml
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8" ?>
|
||||||
|
<config>
|
||||||
|
<luceneMatchVersion>9.4.0</luceneMatchVersion>
|
||||||
|
|
||||||
|
<!-- Data Directory -->
|
||||||
|
<dataDir>${solr.data.dir:}</dataDir>
|
||||||
|
|
||||||
|
<!-- Index Config -->
|
||||||
|
<indexConfig>
|
||||||
|
<ramBufferSizeMB>100</ramBufferSizeMB>
|
||||||
|
<maxBufferedDocs>1000</maxBufferedDocs>
|
||||||
|
<mergePolicyFactory class="org.apache.solr.index.TieredMergePolicyFactory">
|
||||||
|
<int name="maxMergeAtOnce">10</int>
|
||||||
|
<int name="segmentsPerTier">10</int>
|
||||||
|
</mergePolicyFactory>
|
||||||
|
</indexConfig>
|
||||||
|
|
||||||
|
<!-- Update Handler -->
|
||||||
|
<updateHandler class="solr.DirectUpdateHandler2">
|
||||||
|
<updateLog>
|
||||||
|
<str name="dir">${solr.ulog.dir:}</str>
|
||||||
|
<int name="numVersionBuckets">${solr.ulog.numVersionBuckets:65536}</int>
|
||||||
|
</updateLog>
|
||||||
|
<autoCommit>
|
||||||
|
<maxTime>${solr.autoCommit.maxTime:15000}</maxTime>
|
||||||
|
<openSearcher>false</openSearcher>
|
||||||
|
</autoCommit>
|
||||||
|
<autoSoftCommit>
|
||||||
|
<maxTime>${solr.autoSoftCommit.maxTime:1000}</maxTime>
|
||||||
|
</autoSoftCommit>
|
||||||
|
</updateHandler>
|
||||||
|
|
||||||
|
<!-- Query Settings -->
|
||||||
|
<query>
|
||||||
|
<maxBooleanClauses>1024</maxBooleanClauses>
|
||||||
|
<filterCache class="solr.CaffeineCache" size="512" initialSize="512" autowarmCount="0"/>
|
||||||
|
<queryResultCache class="solr.CaffeineCache" size="512" initialSize="512" autowarmCount="0"/>
|
||||||
|
<documentCache class="solr.CaffeineCache" size="512" initialSize="512" autowarmCount="0"/>
|
||||||
|
<enableLazyFieldLoading>true</enableLazyFieldLoading>
|
||||||
|
<queryResultWindowSize>20</queryResultWindowSize>
|
||||||
|
<queryResultMaxDocsCached>200</queryResultMaxDocsCached>
|
||||||
|
</query>
|
||||||
|
|
||||||
|
<!-- Request Dispatcher -->
|
||||||
|
<requestDispatcher>
|
||||||
|
<requestParsers enableRemoteStreaming="true" multipartUploadLimitInKB="2048000"
|
||||||
|
formdataUploadLimitInKB="2048" addHttpRequestToContext="false"/>
|
||||||
|
<httpCaching never304="true"/>
|
||||||
|
</requestDispatcher>
|
||||||
|
|
||||||
|
<!-- Request Handlers -->
|
||||||
|
|
||||||
|
<!-- Standard search handler -->
|
||||||
|
<requestHandler name="/select" class="solr.SearchHandler">
|
||||||
|
<lst name="defaults">
|
||||||
|
<str name="echoParams">explicit</str>
|
||||||
|
<int name="rows">10</int>
|
||||||
|
<str name="df">content</str>
|
||||||
|
<str name="q.op">OR</str>
|
||||||
|
<str name="defType">edismax</str>
|
||||||
|
<str name="qf">
|
||||||
|
title^3.0 name^2.5 content^2.0 description^1.5 summary^1.5
|
||||||
|
filename^1.5 tags^1.2 category username email bio
|
||||||
|
</str>
|
||||||
|
<str name="pf">
|
||||||
|
title^4.0 name^3.0 content^2.5 description^2.0
|
||||||
|
</str>
|
||||||
|
<str name="mm">2<-25%</str>
|
||||||
|
<str name="hl">true</str>
|
||||||
|
<str name="hl.fl">title,content,description,summary</str>
|
||||||
|
<str name="hl.simple.pre"><mark></str>
|
||||||
|
<str name="hl.simple.post"></mark></str>
|
||||||
|
<str name="facet">true</str>
|
||||||
|
<str name="facet.mincount">1</str>
|
||||||
|
</lst>
|
||||||
|
</requestHandler>
|
||||||
|
|
||||||
|
<!-- Update handler -->
|
||||||
|
<requestHandler name="/update" class="solr.UpdateRequestHandler"/>
|
||||||
|
|
||||||
|
<!-- Get handler -->
|
||||||
|
<requestHandler name="/get" class="solr.RealTimeGetHandler">
|
||||||
|
<lst name="defaults">
|
||||||
|
<str name="omitHeader">true</str>
|
||||||
|
</lst>
|
||||||
|
</requestHandler>
|
||||||
|
|
||||||
|
<!-- Admin handlers -->
|
||||||
|
<requestHandler name="/admin/ping" class="solr.PingRequestHandler">
|
||||||
|
<lst name="invariants">
|
||||||
|
<str name="q">solrpingquery</str>
|
||||||
|
</lst>
|
||||||
|
<lst name="defaults">
|
||||||
|
<str name="echoParams">all</str>
|
||||||
|
</lst>
|
||||||
|
</requestHandler>
|
||||||
|
|
||||||
|
<!-- Suggest/Autocomplete handler -->
|
||||||
|
<requestHandler name="/suggest" class="solr.SearchHandler">
|
||||||
|
<lst name="defaults">
|
||||||
|
<str name="suggest">true</str>
|
||||||
|
<str name="suggest.count">10</str>
|
||||||
|
<str name="suggest.dictionary">suggest</str>
|
||||||
|
</lst>
|
||||||
|
<arr name="components">
|
||||||
|
<str>suggest</str>
|
||||||
|
</arr>
|
||||||
|
</requestHandler>
|
||||||
|
|
||||||
|
<!-- Spell check component -->
|
||||||
|
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">
|
||||||
|
<str name="queryAnalyzerFieldType">text_general</str>
|
||||||
|
<lst name="spellchecker">
|
||||||
|
<str name="name">default</str>
|
||||||
|
<str name="field">content</str>
|
||||||
|
<str name="classname">solr.DirectSolrSpellChecker</str>
|
||||||
|
<str name="distanceMeasure">internal</str>
|
||||||
|
<float name="accuracy">0.5</float>
|
||||||
|
<int name="maxEdits">2</int>
|
||||||
|
<int name="minPrefix">1</int>
|
||||||
|
<int name="maxInspections">5</int>
|
||||||
|
<int name="minQueryLength">4</int>
|
||||||
|
<float name="maxQueryFrequency">0.01</float>
|
||||||
|
</lst>
|
||||||
|
</searchComponent>
|
||||||
|
|
||||||
|
<!-- Suggest component -->
|
||||||
|
<searchComponent name="suggest" class="solr.SuggestComponent">
|
||||||
|
<lst name="suggester">
|
||||||
|
<str name="name">suggest</str>
|
||||||
|
<str name="lookupImpl">FuzzyLookupFactory</str>
|
||||||
|
<str name="dictionaryImpl">DocumentDictionaryFactory</str>
|
||||||
|
<str name="field">suggest</str>
|
||||||
|
<str name="suggestAnalyzerFieldType">text_suggest</str>
|
||||||
|
<str name="buildOnStartup">false</str>
|
||||||
|
</lst>
|
||||||
|
</searchComponent>
|
||||||
|
|
||||||
|
<!-- More Like This handler -->
|
||||||
|
<requestHandler name="/mlt" class="solr.MoreLikeThisHandler">
|
||||||
|
<lst name="defaults">
|
||||||
|
<str name="mlt.fl">title,content,description,tags</str>
|
||||||
|
<int name="mlt.mindf">1</int>
|
||||||
|
<int name="mlt.mintf">1</int>
|
||||||
|
<int name="mlt.count">10</int>
|
||||||
|
</lst>
|
||||||
|
</requestHandler>
|
||||||
|
|
||||||
|
<!-- Schema handler -->
|
||||||
|
<requestHandler name="/schema" class="solr.SchemaHandler"/>
|
||||||
|
|
||||||
|
<!-- Config handler -->
|
||||||
|
<requestHandler name="/config" class="solr.ConfigHandler"/>
|
||||||
|
</config>
|
||||||
Reference in New Issue
Block a user