Initial commit - cleaned repository
This commit is contained in:
247
services/files/backend/file_processor.py
Normal file
247
services/files/backend/file_processor.py
Normal file
@ -0,0 +1,247 @@
|
||||
"""
|
||||
File Processor for handling file uploads and processing
|
||||
"""
|
||||
import hashlib
|
||||
import mimetypes
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, Optional
|
||||
import logging
|
||||
import uuid
|
||||
from fastapi import UploadFile
|
||||
from models import FileType, FileStatus
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class FileProcessor:
    """Coordinates file uploads: object storage, metadata, and thumbnails."""

    def __init__(self, minio_client, metadata_manager, thumbnail_generator):
        """Keep references to the collaborating services.

        Args:
            minio_client: Object-storage client used for uploads and
                presigned URL generation.
            metadata_manager: Persistence layer for file metadata records.
            thumbnail_generator: Service that renders image thumbnails.
        """
        self.minio_client = minio_client
        self.metadata_manager = metadata_manager
        self.thumbnail_generator = thumbnail_generator
|
||||
|
||||
def _determine_file_type(self, content_type: str) -> FileType:
|
||||
"""Determine file type from content type"""
|
||||
if content_type.startswith('image/'):
|
||||
return FileType.IMAGE
|
||||
elif content_type.startswith('video/'):
|
||||
return FileType.VIDEO
|
||||
elif content_type.startswith('audio/'):
|
||||
return FileType.AUDIO
|
||||
elif content_type in ['application/pdf', 'application/msword',
|
||||
'application/vnd.openxmlformats-officedocument',
|
||||
'text/plain', 'text/html', 'text/csv']:
|
||||
return FileType.DOCUMENT
|
||||
elif content_type in ['application/zip', 'application/x-rar-compressed',
|
||||
'application/x-tar', 'application/gzip']:
|
||||
return FileType.ARCHIVE
|
||||
else:
|
||||
return FileType.OTHER
|
||||
|
||||
def _calculate_file_hash(self, file_data: bytes) -> str:
|
||||
"""Calculate SHA256 hash of file data"""
|
||||
return hashlib.sha256(file_data).hexdigest()
|
||||
|
||||
async def process_upload(self, file: UploadFile, user_id: str,
|
||||
bucket: str = "default",
|
||||
public: bool = False,
|
||||
generate_thumbnail: bool = True,
|
||||
tags: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
||||
"""Process file upload"""
|
||||
try:
|
||||
# Read file data
|
||||
file_data = await file.read()
|
||||
file_size = len(file_data)
|
||||
|
||||
# Get content type
|
||||
content_type = file.content_type or mimetypes.guess_type(file.filename)[0] or 'application/octet-stream'
|
||||
|
||||
# Generate file ID and object name
|
||||
file_id = str(uuid.uuid4())
|
||||
timestamp = datetime.now().strftime('%Y%m%d')
|
||||
file_extension = file.filename.split('.')[-1] if '.' in file.filename else ''
|
||||
object_name = f"{timestamp}/{user_id}/{file_id}.{file_extension}" if file_extension else f"{timestamp}/{user_id}/{file_id}"
|
||||
|
||||
# Calculate file hash
|
||||
file_hash = self._calculate_file_hash(file_data)
|
||||
|
||||
# Check for duplicates
|
||||
duplicates = await self.metadata_manager.find_duplicate_files(file_hash)
|
||||
if duplicates and not public: # Allow duplicates for public files
|
||||
# Return existing file info
|
||||
existing = duplicates[0]
|
||||
logger.info(f"Duplicate file detected: {existing['id']}")
|
||||
return {
|
||||
"file_id": existing["id"],
|
||||
"filename": existing["filename"],
|
||||
"size": existing["size"],
|
||||
"content_type": existing["content_type"],
|
||||
"file_type": existing["file_type"],
|
||||
"bucket": existing["bucket"],
|
||||
"public": existing["public"],
|
||||
"has_thumbnail": existing.get("has_thumbnail", False),
|
||||
"thumbnail_url": existing.get("thumbnail_url"),
|
||||
"created_at": existing["created_at"],
|
||||
"duplicate": True
|
||||
}
|
||||
|
||||
# Upload to MinIO
|
||||
upload_result = await self.minio_client.upload_file(
|
||||
bucket=bucket,
|
||||
object_name=object_name,
|
||||
file_data=file_data,
|
||||
content_type=content_type,
|
||||
metadata={
|
||||
"user_id": user_id,
|
||||
"original_name": file.filename,
|
||||
"upload_date": datetime.now().isoformat()
|
||||
}
|
||||
)
|
||||
|
||||
# Determine file type
|
||||
file_type = self._determine_file_type(content_type)
|
||||
|
||||
# Generate thumbnail if applicable
|
||||
has_thumbnail = False
|
||||
thumbnail_url = None
|
||||
|
||||
if generate_thumbnail and file_type == FileType.IMAGE:
|
||||
thumbnail_data = await self.thumbnail_generator.generate_thumbnail(
|
||||
file_data=file_data,
|
||||
content_type=content_type
|
||||
)
|
||||
|
||||
if thumbnail_data:
|
||||
has_thumbnail = True
|
||||
# Generate multiple sizes
|
||||
await self.thumbnail_generator.generate_multiple_sizes(
|
||||
file_data=file_data,
|
||||
content_type=content_type,
|
||||
file_id=file_id
|
||||
)
|
||||
|
||||
if public:
|
||||
thumbnail_url = await self.minio_client.generate_presigned_download_url(
|
||||
bucket="thumbnails",
|
||||
object_name=f"thumbnails/{file_id}_medium.jpg",
|
||||
expires_in=86400 * 30 # 30 days
|
||||
)
|
||||
|
||||
# Create metadata
|
||||
metadata = {
|
||||
"id": file_id,
|
||||
"filename": file.filename,
|
||||
"original_name": file.filename,
|
||||
"size": file_size,
|
||||
"content_type": content_type,
|
||||
"file_type": file_type.value,
|
||||
"bucket": bucket,
|
||||
"object_name": object_name,
|
||||
"user_id": user_id,
|
||||
"hash": file_hash,
|
||||
"public": public,
|
||||
"has_thumbnail": has_thumbnail,
|
||||
"thumbnail_url": thumbnail_url,
|
||||
"tags": tags or {},
|
||||
"metadata": {
|
||||
"etag": upload_result.get("etag"),
|
||||
"version_id": upload_result.get("version_id")
|
||||
}
|
||||
}
|
||||
|
||||
# Save metadata to database
|
||||
await self.metadata_manager.create_file_metadata(metadata)
|
||||
|
||||
# Generate download URL if public
|
||||
download_url = None
|
||||
if public:
|
||||
download_url = await self.minio_client.generate_presigned_download_url(
|
||||
bucket=bucket,
|
||||
object_name=object_name,
|
||||
expires_in=86400 * 30 # 30 days
|
||||
)
|
||||
|
||||
logger.info(f"File uploaded successfully: {file_id}")
|
||||
|
||||
return {
|
||||
"file_id": file_id,
|
||||
"filename": file.filename,
|
||||
"size": file_size,
|
||||
"content_type": content_type,
|
||||
"file_type": file_type.value,
|
||||
"bucket": bucket,
|
||||
"public": public,
|
||||
"has_thumbnail": has_thumbnail,
|
||||
"thumbnail_url": thumbnail_url,
|
||||
"download_url": download_url,
|
||||
"created_at": datetime.now()
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"File processing error: {e}")
|
||||
raise
|
||||
|
||||
async def process_large_file(self, file: UploadFile, user_id: str,
|
||||
bucket: str = "default",
|
||||
chunk_size: int = 1024 * 1024 * 5) -> Dict[str, Any]:
|
||||
"""Process large file upload in chunks"""
|
||||
try:
|
||||
file_id = str(uuid.uuid4())
|
||||
timestamp = datetime.now().strftime('%Y%m%d')
|
||||
file_extension = file.filename.split('.')[-1] if '.' in file.filename else ''
|
||||
object_name = f"{timestamp}/{user_id}/{file_id}.{file_extension}"
|
||||
|
||||
# Initialize multipart upload
|
||||
hasher = hashlib.sha256()
|
||||
total_size = 0
|
||||
|
||||
# Process file in chunks
|
||||
chunks = []
|
||||
while True:
|
||||
chunk = await file.read(chunk_size)
|
||||
if not chunk:
|
||||
break
|
||||
|
||||
chunks.append(chunk)
|
||||
hasher.update(chunk)
|
||||
total_size += len(chunk)
|
||||
|
||||
# Combine chunks and upload
|
||||
file_data = b''.join(chunks)
|
||||
file_hash = hasher.hexdigest()
|
||||
|
||||
# Upload to MinIO
|
||||
content_type = file.content_type or 'application/octet-stream'
|
||||
await self.minio_client.upload_file(
|
||||
bucket=bucket,
|
||||
object_name=object_name,
|
||||
file_data=file_data,
|
||||
content_type=content_type
|
||||
)
|
||||
|
||||
# Create metadata
|
||||
metadata = {
|
||||
"id": file_id,
|
||||
"filename": file.filename,
|
||||
"original_name": file.filename,
|
||||
"size": total_size,
|
||||
"content_type": content_type,
|
||||
"file_type": self._determine_file_type(content_type).value,
|
||||
"bucket": bucket,
|
||||
"object_name": object_name,
|
||||
"user_id": user_id,
|
||||
"hash": file_hash,
|
||||
"public": False,
|
||||
"has_thumbnail": False
|
||||
}
|
||||
|
||||
await self.metadata_manager.create_file_metadata(metadata)
|
||||
|
||||
return {
|
||||
"file_id": file_id,
|
||||
"filename": file.filename,
|
||||
"size": total_size,
|
||||
"message": "Large file uploaded successfully"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Large file processing error: {e}")
|
||||
raise
|
||||
Reference in New Issue
Block a user