Files
site11/services/files/backend/file_processor.py
jungwoo choi 3c485e05c9 feat: Implement Step 12 - File System with MinIO S3 Storage
Completed File Management Service with S3-compatible object storage:

Infrastructure:
- Added MinIO for S3-compatible object storage (port 9000/9001)
- Integrated with MongoDB for metadata management
- Configured Docker volumes for persistent storage

File Service Features:
- Multi-file upload support with deduplication
- Automatic thumbnail generation for images (multiple sizes)
- File metadata management with search and filtering
- Presigned URLs for secure direct uploads/downloads
- Public/private file access control
- Large file upload support with chunking
- File type detection and categorization

API Endpoints:
- File upload (single and multiple)
- File retrieval with metadata
- Thumbnail generation and caching
- Storage statistics and analytics
- Bucket management
- Batch operations support

Technical Improvements:
- Fixed Pydantic v2.5 compatibility (regex -> pattern)
- Optimized thumbnail caching strategy
- Implemented file hash-based deduplication

Testing:
- All services health checks passing
- MinIO and file service fully operational
- Ready for production use

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-11 19:10:37 +09:00

247 lines
9.7 KiB
Python

"""
File Processor for handling file uploads and processing
"""
import hashlib
import mimetypes
from datetime import datetime
from typing import Dict, Any, Optional
import logging
import uuid
from fastapi import UploadFile
from models import FileType, FileStatus
logger = logging.getLogger(__name__)
class FileProcessor:
    """Processes file uploads: object storage, metadata persistence, and thumbnails.

    Collaborators are injected so the processor stays storage-agnostic:

    - ``minio_client``: async S3/MinIO wrapper (``upload_file``,
      ``generate_presigned_download_url``)
    - ``metadata_manager``: async metadata store (``find_duplicate_files``,
      ``create_file_metadata``)
    - ``thumbnail_generator``: async image thumbnailer (``generate_thumbnail``,
      ``generate_multiple_sizes``)
    """

    def __init__(self, minio_client, metadata_manager, thumbnail_generator):
        self.minio_client = minio_client
        self.metadata_manager = metadata_manager
        self.thumbnail_generator = thumbnail_generator

    def _determine_file_type(self, content_type: str) -> FileType:
        """Map a MIME content type to a coarse :class:`FileType` category."""
        if content_type.startswith('image/'):
            return FileType.IMAGE
        if content_type.startswith('video/'):
            return FileType.VIDEO
        if content_type.startswith('audio/'):
            return FileType.AUDIO
        # OOXML types (docx/xlsx/pptx) all share this vendor prefix, but the
        # full MIME strings (e.g. "...wordprocessingml.document") would never
        # match an exact-equality check — match the prefix instead.
        if content_type.startswith('application/vnd.openxmlformats-officedocument'):
            return FileType.DOCUMENT
        if content_type in ('application/pdf', 'application/msword',
                            'text/plain', 'text/html', 'text/csv'):
            return FileType.DOCUMENT
        if content_type in ('application/zip', 'application/x-rar-compressed',
                            'application/x-tar', 'application/gzip'):
            return FileType.ARCHIVE
        return FileType.OTHER

    def _calculate_file_hash(self, file_data: bytes) -> str:
        """Return the SHA-256 hex digest of the raw bytes (deduplication key)."""
        return hashlib.sha256(file_data).hexdigest()

    def _build_object_name(self, user_id: str, file_id: str, filename: str) -> str:
        """Build the storage key ``<YYYYMMDD>/<user_id>/<file_id>[.<ext>]``.

        The extension, if any, is taken from the original filename.  Files
        without an extension get no trailing dot.
        """
        timestamp = datetime.now().strftime('%Y%m%d')
        extension = filename.rsplit('.', 1)[-1] if '.' in filename else ''
        base = f"{timestamp}/{user_id}/{file_id}"
        return f"{base}.{extension}" if extension else base

    async def process_upload(self, file: UploadFile, user_id: str,
                             bucket: str = "default",
                             public: bool = False,
                             generate_thumbnail: bool = True,
                             tags: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Store an uploaded file and return its descriptor dict.

        Steps: read the payload, deduplicate by content hash (private files
        only), upload to object storage, optionally generate image
        thumbnails, persist metadata, and pre-sign a 30-day download URL for
        public files.

        Args:
            file: the incoming upload (read fully into memory).
            user_id: owner; becomes part of the storage key.
            bucket: target bucket name.
            public: if True, skip dedup and pre-sign download/thumbnail URLs.
            generate_thumbnail: create thumbnails for image uploads.
            tags: optional free-form tags stored with the metadata.

        Returns:
            A dict describing the stored (or pre-existing duplicate) file.

        Raises:
            Exception: any storage/metadata error is logged and re-raised.
        """
        try:
            file_data = await file.read()
            file_size = len(file_data)
            # UploadFile.filename may be None — fall back to an empty name so
            # extension parsing and MIME guessing below cannot raise.
            filename = file.filename or ""
            # Client-supplied content type, then a guess from the extension,
            # then the generic binary fallback.
            content_type = (file.content_type
                            or mimetypes.guess_type(filename)[0]
                            or 'application/octet-stream')

            file_id = str(uuid.uuid4())
            object_name = self._build_object_name(user_id, file_id, filename)

            # Content-hash dedup: identical private uploads reuse the stored
            # copy instead of writing a second object.
            file_hash = self._calculate_file_hash(file_data)
            duplicates = await self.metadata_manager.find_duplicate_files(file_hash)
            if duplicates and not public:  # public files are always stored anew
                existing = duplicates[0]
                logger.info(f"Duplicate file detected: {existing['id']}")
                return {
                    "file_id": existing["id"],
                    "filename": existing["filename"],
                    "size": existing["size"],
                    "content_type": existing["content_type"],
                    "file_type": existing["file_type"],
                    "bucket": existing["bucket"],
                    "public": existing["public"],
                    "has_thumbnail": existing.get("has_thumbnail", False),
                    "thumbnail_url": existing.get("thumbnail_url"),
                    "created_at": existing["created_at"],
                    "duplicate": True
                }

            # Upload the payload to object storage.
            upload_result = await self.minio_client.upload_file(
                bucket=bucket,
                object_name=object_name,
                file_data=file_data,
                content_type=content_type,
                metadata={
                    "user_id": user_id,
                    "original_name": filename,
                    "upload_date": datetime.now().isoformat()
                }
            )

            file_type = self._determine_file_type(content_type)

            # Thumbnails are only generated for images; multiple sizes are
            # rendered, and public files get a pre-signed URL to the medium one.
            has_thumbnail = False
            thumbnail_url = None
            if generate_thumbnail and file_type == FileType.IMAGE:
                thumbnail_data = await self.thumbnail_generator.generate_thumbnail(
                    file_data=file_data,
                    content_type=content_type
                )
                if thumbnail_data:
                    has_thumbnail = True
                    await self.thumbnail_generator.generate_multiple_sizes(
                        file_data=file_data,
                        content_type=content_type,
                        file_id=file_id
                    )
                    if public:
                        thumbnail_url = await self.minio_client.generate_presigned_download_url(
                            bucket="thumbnails",
                            object_name=f"thumbnails/{file_id}_medium.jpg",
                            expires_in=86400 * 30  # 30 days
                        )

            # Persist the metadata record.
            metadata = {
                "id": file_id,
                "filename": filename,
                "original_name": filename,
                "size": file_size,
                "content_type": content_type,
                "file_type": file_type.value,
                "bucket": bucket,
                "object_name": object_name,
                "user_id": user_id,
                "hash": file_hash,
                "public": public,
                "has_thumbnail": has_thumbnail,
                "thumbnail_url": thumbnail_url,
                "tags": tags or {},
                "metadata": {
                    "etag": upload_result.get("etag"),
                    "version_id": upload_result.get("version_id")
                }
            }
            await self.metadata_manager.create_file_metadata(metadata)

            # Public files additionally get a pre-signed download URL.
            download_url = None
            if public:
                download_url = await self.minio_client.generate_presigned_download_url(
                    bucket=bucket,
                    object_name=object_name,
                    expires_in=86400 * 30  # 30 days
                )

            logger.info(f"File uploaded successfully: {file_id}")
            return {
                "file_id": file_id,
                "filename": filename,
                "size": file_size,
                "content_type": content_type,
                "file_type": file_type.value,
                "bucket": bucket,
                "public": public,
                "has_thumbnail": has_thumbnail,
                "thumbnail_url": thumbnail_url,
                "download_url": download_url,
                "created_at": datetime.now()
            }
        except Exception as e:
            logger.error(f"File processing error: {e}")
            raise

    async def process_large_file(self, file: UploadFile, user_id: str,
                                 bucket: str = "default",
                                 chunk_size: int = 1024 * 1024 * 5) -> Dict[str, Any]:
        """Upload a large file, reading it in ``chunk_size``-byte chunks.

        The content hash is computed incrementally while reading.

        NOTE(review): the chunks are still concatenated into one in-memory
        buffer before upload, so peak memory equals the full file size; a
        true multipart upload would need support from ``minio_client`` —
        confirm before relying on this for very large files.
        """
        try:
            file_id = str(uuid.uuid4())
            filename = file.filename or ""
            # Shared key builder: extensionless files no longer produce a
            # trailing dot (the old code emitted "<id>." in that case).
            object_name = self._build_object_name(user_id, file_id, filename)

            # Stream the upload body, hashing as we go.
            hasher = hashlib.sha256()
            total_size = 0
            chunks = []
            while True:
                chunk = await file.read(chunk_size)
                if not chunk:
                    break
                chunks.append(chunk)
                hasher.update(chunk)
                total_size += len(chunk)

            file_data = b''.join(chunks)
            file_hash = hasher.hexdigest()

            content_type = file.content_type or 'application/octet-stream'
            await self.minio_client.upload_file(
                bucket=bucket,
                object_name=object_name,
                file_data=file_data,
                content_type=content_type
            )

            metadata = {
                "id": file_id,
                "filename": filename,
                "original_name": filename,
                "size": total_size,
                "content_type": content_type,
                "file_type": self._determine_file_type(content_type).value,
                "bucket": bucket,
                "object_name": object_name,
                "user_id": user_id,
                "hash": file_hash,
                "public": False,
                "has_thumbnail": False
            }
            await self.metadata_manager.create_file_metadata(metadata)

            return {
                "file_id": file_id,
                "filename": filename,
                "size": total_size,
                "message": "Large file uploaded successfully"
            }
        except Exception as e:
            logger.error(f"Large file processing error: {e}")
            raise