"""
|
|
File Processor for handling file uploads and processing
|
|
"""
|
|
import hashlib
|
|
import mimetypes
|
|
from datetime import datetime
|
|
from typing import Dict, Any, Optional
|
|
import logging
|
|
import uuid
|
|
from fastapi import UploadFile
|
|
from models import FileType, FileStatus
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class FileProcessor:
    """Coordinates file uploads: object storage, deduplication, thumbnails
    and metadata persistence.

    Collaborators (injected; interfaces assumed from usage here — confirm
    against their definitions):
      * minio_client        -- async object-storage client
                               (``upload_file``, ``generate_presigned_download_url``)
      * metadata_manager    -- async metadata store
                               (``find_duplicate_files``, ``create_file_metadata``)
      * thumbnail_generator -- async image-thumbnail generator
                               (``generate_thumbnail``, ``generate_multiple_sizes``)
    """

    # Presigned download/thumbnail URLs for public files stay valid 30 days.
    _PUBLIC_URL_TTL_SECONDS = 86400 * 30

    def __init__(self, minio_client, metadata_manager, thumbnail_generator):
        self.minio_client = minio_client
        self.metadata_manager = metadata_manager
        self.thumbnail_generator = thumbnail_generator

    def _determine_file_type(self, content_type: str) -> "FileType":
        """Map a MIME content type onto the coarse ``FileType`` enum."""
        if content_type.startswith('image/'):
            return FileType.IMAGE
        if content_type.startswith('video/'):
            return FileType.VIDEO
        if content_type.startswith('audio/'):
            return FileType.AUDIO
        if content_type in ('application/pdf', 'application/msword',
                            'application/vnd.openxmlformats-officedocument',
                            'text/plain', 'text/html', 'text/csv'):
            return FileType.DOCUMENT
        if content_type in ('application/zip', 'application/x-rar-compressed',
                            'application/x-tar', 'application/gzip'):
            return FileType.ARCHIVE
        return FileType.OTHER

    def _calculate_file_hash(self, file_data: bytes) -> str:
        """Return the hex SHA-256 digest of *file_data* (content-dedup key)."""
        return hashlib.sha256(file_data).hexdigest()

    def _build_object_name(self, user_id: str, file_id: str,
                           filename: Optional[str]) -> str:
        """Build the storage key ``<YYYYMMDD>/<user_id>/<file_id>[.<ext>]``.

        The extension is taken from *filename* when present; no trailing dot
        is produced for extensionless (or missing) filenames.
        """
        timestamp = datetime.now().strftime('%Y%m%d')
        base = f"{timestamp}/{user_id}/{file_id}"
        if filename and '.' in filename:
            extension = filename.rsplit('.', 1)[-1]
            if extension:
                return f"{base}.{extension}"
        return base

    async def process_upload(self, file: "UploadFile", user_id: str,
                             bucket: str = "default",
                             public: bool = False,
                             generate_thumbnail: bool = True,
                             tags: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Store an uploaded file and persist its metadata.

        Reads the whole upload into memory, deduplicates *private* files by
        content hash (public files may duplicate), uploads to MinIO,
        optionally generates image thumbnails, and records metadata.

        Returns a dict describing the stored file — or the pre-existing one,
        flagged ``"duplicate": True``.
        Raises: re-raises any storage/metadata/thumbnail error after logging.
        """
        try:
            file_data = await file.read()
            file_size = len(file_data)

            # Prefer the declared content type; fall back to extension-based
            # guessing (filename may be None on UploadFile), then a generic
            # binary default.
            content_type = (file.content_type
                            or mimetypes.guess_type(file.filename or '')[0]
                            or 'application/octet-stream')

            file_id = str(uuid.uuid4())
            object_name = self._build_object_name(user_id, file_id, file.filename)

            file_hash = self._calculate_file_hash(file_data)

            # Dedup by content hash; public files are allowed to duplicate.
            duplicates = await self.metadata_manager.find_duplicate_files(file_hash)
            if duplicates and not public:
                existing = duplicates[0]
                logger.info("Duplicate file detected: %s", existing['id'])
                return {
                    "file_id": existing["id"],
                    "filename": existing["filename"],
                    "size": existing["size"],
                    "content_type": existing["content_type"],
                    "file_type": existing["file_type"],
                    "bucket": existing["bucket"],
                    "public": existing["public"],
                    "has_thumbnail": existing.get("has_thumbnail", False),
                    "thumbnail_url": existing.get("thumbnail_url"),
                    "created_at": existing["created_at"],
                    "duplicate": True
                }

            upload_result = await self.minio_client.upload_file(
                bucket=bucket,
                object_name=object_name,
                file_data=file_data,
                content_type=content_type,
                metadata={
                    "user_id": user_id,
                    "original_name": file.filename,
                    "upload_date": datetime.now().isoformat()
                }
            )

            file_type = self._determine_file_type(content_type)

            # Thumbnails are only attempted for images, and a public URL is
            # only minted once a thumbnail actually exists.
            has_thumbnail = False
            thumbnail_url = None
            if generate_thumbnail and file_type == FileType.IMAGE:
                thumbnail_data = await self.thumbnail_generator.generate_thumbnail(
                    file_data=file_data,
                    content_type=content_type
                )
                if thumbnail_data:
                    has_thumbnail = True
                    await self.thumbnail_generator.generate_multiple_sizes(
                        file_data=file_data,
                        content_type=content_type,
                        file_id=file_id
                    )
                    if public:
                        thumbnail_url = await self.minio_client.generate_presigned_download_url(
                            bucket="thumbnails",
                            object_name=f"thumbnails/{file_id}_medium.jpg",
                            expires_in=self._PUBLIC_URL_TTL_SECONDS
                        )

            metadata = {
                "id": file_id,
                "filename": file.filename,
                "original_name": file.filename,
                "size": file_size,
                "content_type": content_type,
                "file_type": file_type.value,
                "bucket": bucket,
                "object_name": object_name,
                "user_id": user_id,
                "hash": file_hash,
                "public": public,
                "has_thumbnail": has_thumbnail,
                "thumbnail_url": thumbnail_url,
                "tags": tags or {},
                "metadata": {
                    "etag": upload_result.get("etag"),
                    "version_id": upload_result.get("version_id")
                }
            }

            await self.metadata_manager.create_file_metadata(metadata)

            # Public files get a long-lived presigned download URL up front.
            download_url = None
            if public:
                download_url = await self.minio_client.generate_presigned_download_url(
                    bucket=bucket,
                    object_name=object_name,
                    expires_in=self._PUBLIC_URL_TTL_SECONDS
                )

            logger.info("File uploaded successfully: %s", file_id)

            return {
                "file_id": file_id,
                "filename": file.filename,
                "size": file_size,
                "content_type": content_type,
                "file_type": file_type.value,
                "bucket": bucket,
                "public": public,
                "has_thumbnail": has_thumbnail,
                "thumbnail_url": thumbnail_url,
                "download_url": download_url,
                "created_at": datetime.now()
            }

        except Exception as e:
            # logger.exception preserves the traceback; %s args are lazy.
            logger.exception("File processing error: %s", e)
            raise

    async def process_large_file(self, file: "UploadFile", user_id: str,
                                 bucket: str = "default",
                                 chunk_size: int = 1024 * 1024 * 5) -> Dict[str, Any]:
        """Upload a large file, reading it in *chunk_size* byte chunks.

        Hashes incrementally while reading. Unlike :meth:`process_upload`,
        this path skips deduplication, thumbnails and presigned URLs, and
        always stores the file as private.

        Raises: re-raises any storage/metadata error after logging.
        """
        try:
            file_id = str(uuid.uuid4())
            # Shared helper avoids the trailing-dot key the old inline code
            # produced for extensionless filenames.
            object_name = self._build_object_name(user_id, file_id, file.filename)

            hasher = hashlib.sha256()
            total_size = 0

            # NOTE(review): the payload is still re-assembled in memory
            # because the MinIO wrapper takes the full bytes; a streaming /
            # multipart upload API would be needed for true O(chunk) memory.
            buffer = bytearray()
            while True:
                chunk = await file.read(chunk_size)
                if not chunk:
                    break
                buffer.extend(chunk)
                hasher.update(chunk)
                total_size += len(chunk)

            file_data = bytes(buffer)
            file_hash = hasher.hexdigest()

            content_type = file.content_type or 'application/octet-stream'
            await self.minio_client.upload_file(
                bucket=bucket,
                object_name=object_name,
                file_data=file_data,
                content_type=content_type
            )

            metadata = {
                "id": file_id,
                "filename": file.filename,
                "original_name": file.filename,
                "size": total_size,
                "content_type": content_type,
                "file_type": self._determine_file_type(content_type).value,
                "bucket": bucket,
                "object_name": object_name,
                "user_id": user_id,
                "hash": file_hash,
                "public": False,
                "has_thumbnail": False
            }

            await self.metadata_manager.create_file_metadata(metadata)

            return {
                "file_id": file_id,
                "filename": file.filename,
                "size": total_size,
                "message": "Large file uploaded successfully"
            }

        except Exception as e:
            logger.exception("Large file processing error: %s", e)
            raise