"""
File Processor for handling file uploads and processing
"""

import hashlib
import mimetypes
from datetime import datetime
from typing import Dict, Any, Optional
import logging
import uuid

from fastapi import UploadFile

from models import FileType, FileStatus

logger = logging.getLogger(__name__)

# Exact MIME types classified as documents.
_DOCUMENT_MIME_TYPES = frozenset({
    'application/pdf',
    'application/msword',
    'text/plain',
    'text/html',
    'text/csv',
})

# Office Open XML types all share this prefix, e.g.
# "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
# so they must be matched by prefix rather than by exact value.
_DOCUMENT_MIME_PREFIXES = ('application/vnd.openxmlformats-officedocument',)

# Exact MIME types classified as archives.
_ARCHIVE_MIME_TYPES = frozenset({
    'application/zip',
    'application/x-rar-compressed',
    'application/x-tar',
    'application/gzip',
})


class FileProcessor:
    """Handle uploads: store the bytes, record metadata, build thumbnails.

    The three collaborators are injected and only used through the calls
    visible in this class (``upload_file``,
    ``generate_presigned_download_url``, ``find_duplicate_files``,
    ``create_file_metadata``, ``generate_thumbnail``,
    ``generate_multiple_sizes``).
    """

    def __init__(self, minio_client, metadata_manager, thumbnail_generator):
        """Store the object-storage, metadata and thumbnail collaborators."""
        self.minio_client = minio_client
        self.metadata_manager = metadata_manager
        self.thumbnail_generator = thumbnail_generator

    def _determine_file_type(self, content_type: str) -> FileType:
        """Determine file type from content type.

        The value is normalised first: parameters such as
        ``; charset=utf-8`` are dropped and the type is lower-cased, since
        MIME type names are case-insensitive. An empty or missing content
        type maps to ``FileType.OTHER``.
        """
        mime = (content_type or '').split(';', 1)[0].strip().lower()

        if mime.startswith('image/'):
            return FileType.IMAGE
        if mime.startswith('video/'):
            return FileType.VIDEO
        if mime.startswith('audio/'):
            return FileType.AUDIO
        if mime in _DOCUMENT_MIME_TYPES or mime.startswith(_DOCUMENT_MIME_PREFIXES):
            return FileType.DOCUMENT
        if mime in _ARCHIVE_MIME_TYPES:
            return FileType.ARCHIVE
        return FileType.OTHER

    def _calculate_file_hash(self, file_data: bytes) -> str:
        """Calculate SHA256 hash of file data, as a hex string."""
        return hashlib.sha256(file_data).hexdigest()

    @staticmethod
    def _build_object_name(filename: Optional[str], user_id: str, file_id: str) -> str:
        """Build the storage key ``<YYYYMMDD>/<user_id>/<file_id>[.<ext>]``.

        The extension is whatever follows the last ``.`` of *filename*.
        It is omitted (no trailing dot) when the filename is missing, has
        no dot, ends with a dot, or when the candidate extension contains
        a path separator. In the last case the dot belonged to a directory
        part of a client-supplied name, not to a real extension.
        """
        name = filename or ''
        extension = name.rsplit('.', 1)[-1] if '.' in name else ''
        if '/' in extension or '\\' in extension:
            extension = ''

        timestamp = datetime.now().strftime('%Y%m%d')
        base = f"{timestamp}/{user_id}/{file_id}"
        return f"{base}.{extension}" if extension else base

    async def process_upload(
        self,
        file: UploadFile,
        user_id: str,
        bucket: str = "default",
        public: bool = False,
        generate_thumbnail: bool = True,
        tags: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Process file upload.

        Reads the whole upload into memory, stores it through the MinIO
        client, optionally generates thumbnails for images, and saves a
        metadata record.

        Args:
            file: The incoming upload. Its filename may be missing.
            user_id: Owner id; also used as part of the object name.
            bucket: Target bucket for the uploaded object.
            public: When true, duplicate detection is skipped and
                presigned download/thumbnail URLs are generated.
            generate_thumbnail: Generate thumbnails when the file is an
                image.
            tags: Optional tags stored with the metadata (defaults to {}).

        Returns:
            A dict describing the stored file. When a file with the same
            hash already exists and ``public`` is false, the existing
            file's info is returned with ``"duplicate": True`` and nothing
            is uploaded.

        Raises:
            Exception: Any error from reading, storage, thumbnailing or
                the metadata layer is logged and re-raised unchanged.
        """
        try:
            # Read file data
            file_data = await file.read()
            file_size = len(file_data)

            # Get content type. Only guess from the filename when there is
            # one: mimetypes.guess_type() raises TypeError on None.
            filename = file.filename
            guessed_type = mimetypes.guess_type(filename)[0] if filename else None
            content_type = file.content_type or guessed_type or 'application/octet-stream'

            # Generate file ID and object name
            file_id = str(uuid.uuid4())
            object_name = self._build_object_name(filename, user_id, file_id)

            # Calculate file hash
            file_hash = self._calculate_file_hash(file_data)

            # Check for duplicates
            duplicates = await self.metadata_manager.find_duplicate_files(file_hash)
            if duplicates and not public:  # Allow duplicates for public files
                # Return existing file info
                existing = duplicates[0]
                logger.info("Duplicate file detected: %s", existing['id'])
                return {
                    "file_id": existing["id"],
                    "filename": existing["filename"],
                    "size": existing["size"],
                    "content_type": existing["content_type"],
                    "file_type": existing["file_type"],
                    "bucket": existing["bucket"],
                    "public": existing["public"],
                    "has_thumbnail": existing.get("has_thumbnail", False),
                    "thumbnail_url": existing.get("thumbnail_url"),
                    "created_at": existing["created_at"],
                    "duplicate": True,
                }

            # Upload to MinIO
            upload_result = await self.minio_client.upload_file(
                bucket=bucket,
                object_name=object_name,
                file_data=file_data,
                content_type=content_type,
                metadata={
                    "user_id": user_id,
                    "original_name": file.filename,
                    "upload_date": datetime.now().isoformat(),
                },
            )
            # NOTE(review): if a later step fails, the uploaded object is
            # left in storage without a metadata record. Consider cleaning
            # it up if the client exposes a delete operation.

            # Determine file type
            file_type = self._determine_file_type(content_type)

            # Generate thumbnail if applicable
            has_thumbnail = False
            thumbnail_url = None

            if generate_thumbnail and file_type == FileType.IMAGE:
                # The result is only used as a success check here.
                thumbnail_data = await self.thumbnail_generator.generate_thumbnail(
                    file_data=file_data,
                    content_type=content_type,
                )

                if thumbnail_data:
                    has_thumbnail = True

                    # Generate multiple sizes
                    await self.thumbnail_generator.generate_multiple_sizes(
                        file_data=file_data,
                        content_type=content_type,
                        file_id=file_id,
                    )

                    if public:
                        # NOTE(review): presumably generate_multiple_sizes
                        # stores "<file_id>_medium.jpg" under this key;
                        # verify. Also confirm the client accepts a 30-day
                        # expiry; S3-style presigned URLs are commonly
                        # capped at 7 days.
                        thumbnail_url = await self.minio_client.generate_presigned_download_url(
                            bucket="thumbnails",
                            object_name=f"thumbnails/{file_id}_medium.jpg",
                            expires_in=86400 * 30,  # 30 days
                        )

            # Create metadata
            metadata = {
                "id": file_id,
                "filename": file.filename,
                "original_name": file.filename,
                "size": file_size,
                "content_type": content_type,
                "file_type": file_type.value,
                "bucket": bucket,
                "object_name": object_name,
                "user_id": user_id,
                "hash": file_hash,
                "public": public,
                "has_thumbnail": has_thumbnail,
                "thumbnail_url": thumbnail_url,
                "tags": tags or {},
                "metadata": {
                    "etag": upload_result.get("etag"),
                    "version_id": upload_result.get("version_id"),
                },
            }

            # Save metadata to database
            await self.metadata_manager.create_file_metadata(metadata)

            # Generate download URL if public
            download_url = None
            if public:
                download_url = await self.minio_client.generate_presigned_download_url(
                    bucket=bucket,
                    object_name=object_name,
                    expires_in=86400 * 30,  # 30 days
                )

            logger.info("File uploaded successfully: %s", file_id)

            return {
                "file_id": file_id,
                "filename": file.filename,
                "size": file_size,
                "content_type": content_type,
                "file_type": file_type.value,
                "bucket": bucket,
                "public": public,
                "has_thumbnail": has_thumbnail,
                "thumbnail_url": thumbnail_url,
                "download_url": download_url,
                "created_at": datetime.now(),
            }

        except Exception as e:
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception("File processing error: %s", e)
            raise

    async def process_large_file(
        self,
        file: UploadFile,
        user_id: str,
        bucket: str = "default",
        chunk_size: int = 1024 * 1024 * 5,
    ) -> Dict[str, Any]:
        """Process large file upload in chunks.

        The upload is read ``chunk_size`` bytes at a time while the SHA256
        hash and total size are updated incrementally. The chunks are then
        joined and stored with a single ``upload_file`` call, so the whole
        payload is still held in memory.

        Args:
            file: The incoming upload. Its filename may be missing.
            user_id: Owner id; also used as part of the object name.
            bucket: Target bucket for the uploaded object.
            chunk_size: Bytes per read. A non-positive value reads the
                rest of the file in one call; a literal ``read(0)`` would
                return no data and store an empty object.

        Returns:
            A dict with ``file_id``, ``filename``, ``size`` and a success
            ``message``.

        Raises:
            Exception: Any error from reading, storage or the metadata
                layer is logged and re-raised unchanged.
        """
        try:
            file_id = str(uuid.uuid4())
            # Same naming rule as process_upload: no trailing "." when the
            # filename has no extension.
            object_name = self._build_object_name(file.filename, user_id, file_id)

            hasher = hashlib.sha256()
            total_size = 0
            read_size = chunk_size if chunk_size > 0 else -1

            # Process file in chunks
            chunks = []
            while True:
                chunk = await file.read(read_size)
                if not chunk:
                    break

                chunks.append(chunk)
                hasher.update(chunk)
                total_size += len(chunk)

            # Combine chunks and upload
            file_data = b''.join(chunks)
            file_hash = hasher.hexdigest()

            # Upload to MinIO
            content_type = file.content_type or 'application/octet-stream'
            await self.minio_client.upload_file(
                bucket=bucket,
                object_name=object_name,
                file_data=file_data,
                content_type=content_type,
            )

            # Create metadata
            metadata = {
                "id": file_id,
                "filename": file.filename,
                "original_name": file.filename,
                "size": total_size,
                "content_type": content_type,
                "file_type": self._determine_file_type(content_type).value,
                "bucket": bucket,
                "object_name": object_name,
                "user_id": user_id,
                "hash": file_hash,
                "public": False,
                "has_thumbnail": False,
            }

            await self.metadata_manager.create_file_metadata(metadata)

            return {
                "file_id": file_id,
                "filename": file.filename,
                "size": total_size,
                "message": "Large file uploaded successfully",
            }

        except Exception as e:
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception("Large file processing error: %s", e)
            raise