# drama-studio/audio-studio-tts/app/main.py

"""Audio Studio TTS Engine

Qwen3-TTS 기반 음성 합성 API 서버
"""

import logging
from contextlib import asynccontextmanager
from typing import Optional, List

from fastapi import FastAPI, HTTPException, UploadFile, File, Form
from fastapi.responses import Response, JSONResponse
from pydantic import BaseModel, Field

from app.services.qwen_tts import tts_service, PRESET_SPEAKERS, LANGUAGE_MAP

# Logging configuration: INFO level, records formatted as
# "<time> - <logger name> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


# ========================================
# Pydantic 모델
# ========================================

class SynthesizeRequest(BaseModel):
    """Request body for basic (preset-speaker) TTS synthesis.

    Only ``text`` is required; it must be 1 to 5000 characters long.
    All other fields fall back to the defaults declared below.
    """
    # Text to synthesize.
    text: str = Field(..., min_length=1, max_length=5000, description="합성할 텍스트")
    # Name of the preset speaker.
    speaker: str = Field("Chelsie", description="프리셋 스피커 이름")
    # Language code (ko, en, ja, ...).
    language: str = Field("ko", description="언어 코드 (ko, en, ja 등)")
    # Optional emotion/style instruction.
    instruct: Optional[str] = Field(None, description="감정/스타일 지시")


class VoiceDesignRequest(BaseModel):
    """Request body for Voice Design synthesis.

    ``text`` (1 to 5000 characters) and ``instruct`` (at least 10
    characters) are required; ``language`` defaults to ``"ko"``.
    """
    # Text to synthesize.
    text: str = Field(..., min_length=1, max_length=5000, description="합성할 텍스트")
    # Voice-design prompt describing the desired voice.
    instruct: str = Field(..., min_length=10, description="음성 디자인 프롬프트")
    # Language code.
    language: str = Field("ko", description="언어 코드")


class HealthResponse(BaseModel):
    """Health-check response body (the response model of ``GET /health``)."""
    # Service status string; the /health handler sets it to "healthy".
    status: str
    # Filled from tts_service.is_initialized().
    initialized: bool
    # Filled from tts_service.get_loaded_models().
    loaded_models: List[str]
    # Filled from tts_service.device.
    device: str


class SpeakersResponse(BaseModel):
    """Speaker-list response body (the response model of ``GET /speakers``)."""
    # Filled from tts_service.get_preset_speakers().
    speakers: List[str]


class LanguagesResponse(BaseModel):
    """Language-list response body (the response model of ``GET /languages``)."""
    # Filled from tts_service.get_supported_languages().
    # NOTE(review): presumably maps language codes to model language names,
    # like LANGUAGE_MAP — confirm against the service implementation.
    languages: dict


# ========================================
# 앱 생명주기
# ========================================

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run application startup and shutdown work.

    On startup, asks the TTS service to initialize with the ``"custom"``
    model preloaded. A failure is logged together with its traceback but is
    deliberately not re-raised, so the server still starts. On shutdown,
    only a log line is written.

    Args:
        app: The FastAPI application this lifespan is attached to (not used
            in the body).

    Yields:
        None. Control returns to the framework while the app is serving.
    """
    logger.info("TTS 엔진 시작...")
    try:
        await tts_service.initialize(preload_models=["custom"])
    except Exception as e:
        # Best-effort startup: keep the server running even if initialization
        # fails (the original note says lazy loading is attempted later).
        # logger.exception keeps the traceback that logger.error dropped.
        logger.exception("TTS 엔진 초기화 실패: %s", e)
    else:
        logger.info("TTS 엔진 준비 완료")

    yield

    # Shutdown: nothing to release here, just record the event.
    logger.info("TTS 엔진 종료")


# ========================================
# FastAPI 앱
# ========================================

# Application instance; startup and shutdown are handled by `lifespan`.
app = FastAPI(
    lifespan=lifespan,
    title="Audio Studio TTS Engine",
    version="0.1.0",
    description="Qwen3-TTS 기반 음성 합성 API",
)


# ========================================
# API 엔드포인트
# ========================================

@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health-check endpoint.

    Returns a fixed ``"healthy"`` status together with the TTS service's
    initialization flag, loaded model list and device.
    """
    is_ready = tts_service.is_initialized()
    models = tts_service.get_loaded_models()
    device_name = tts_service.device
    return HealthResponse(
        status="healthy",
        initialized=is_ready,
        loaded_models=models,
        device=device_name,
    )


@app.get("/speakers", response_model=SpeakersResponse)
async def get_speakers():
    """Return the list of preset speakers reported by the TTS service."""
    preset_names = tts_service.get_preset_speakers()
    return SpeakersResponse(speakers=preset_names)


@app.get("/languages", response_model=LanguagesResponse)
async def get_languages():
    """Return the supported languages reported by the TTS service."""
    supported = tts_service.get_supported_languages()
    return LanguagesResponse(languages=supported)


@app.post("/synthesize")
async def synthesize(request: SynthesizeRequest):
    """Synthesize speech with a preset voice.

    Converts the request text to speech via the service's CustomVoice
    synthesis call and returns the audio as a WAV attachment. The sample
    rate is reported in the ``X-Sample-Rate`` header.

    Raises:
        HTTPException: 400 if the speaker or language is not supported;
            500 if synthesis fails for any other reason.
    """
    # Validate up front so the try block below only guards the synthesis.
    if request.speaker not in PRESET_SPEAKERS:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid speaker. Available: {PRESET_SPEAKERS}"
        )

    if request.language not in LANGUAGE_MAP:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid language. Available: {list(LANGUAGE_MAP.keys())}"
        )

    try:
        audio_bytes, sr = await tts_service.synthesize_custom(
            text=request.text,
            speaker=request.speaker,
            language=request.language,
            instruct=request.instruct,
        )

        return Response(
            content=audio_bytes,
            media_type="audio/wav",
            headers={
                "X-Sample-Rate": str(sr),
                "Content-Disposition": 'attachment; filename="output.wav"',
            }
        )

    except HTTPException:
        # Let HTTP errors raised further down pass through unchanged.
        raise
    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the cause.
        logger.exception("TTS 합성 실패: %s", e)
        raise HTTPException(
            status_code=500, detail=f"TTS synthesis failed: {str(e)}"
        ) from e


# Uploads smaller than this many bytes are rejected as reference audio.
_MIN_REF_AUDIO_BYTES = 1000


@app.post("/voice-clone")
async def voice_clone(
    # Same 1..5000 character bounds as the JSON synthesis endpoints.
    text: str = Form(..., min_length=1, max_length=5000, description="합성할 텍스트"),
    ref_text: str = Form(..., description="레퍼런스 오디오의 트랜스크립트"),
    language: str = Form(default="ko", description="언어 코드"),
    ref_audio: UploadFile = File(..., description="레퍼런스 오디오 파일"),
):
    """Synthesize speech with a cloned voice.

    Clones the voice from the uploaded reference audio and synthesizes the
    new text with it. Reference audio of 3 seconds or longer is recommended.
    The result is returned as a WAV attachment with the sample rate in the
    ``X-Sample-Rate`` header.

    Args:
        text: Text to synthesize (1 to 5000 characters).
        ref_text: Transcript of the reference audio.
        language: Language code; must be a key of ``LANGUAGE_MAP``.
        ref_audio: Uploaded reference audio file.

    Raises:
        HTTPException: 400 if the language is not supported or the
            reference audio is too small; 500 if cloning fails otherwise.
    """
    if language not in LANGUAGE_MAP:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid language. Available: {list(LANGUAGE_MAP.keys())}"
        )

    try:
        # NOTE(review): the whole upload is read into memory with no upper
        # size bound — consider enforcing a maximum.
        audio_content = await ref_audio.read()
        if len(audio_content) < _MIN_REF_AUDIO_BYTES:
            raise HTTPException(
                status_code=400,
                detail="Reference audio is too small"
            )

        audio_bytes, sr = await tts_service.synthesize_clone(
            text=text,
            ref_audio=audio_content,
            ref_text=ref_text,
            language=language,
        )

        return Response(
            content=audio_bytes,
            media_type="audio/wav",
            headers={
                "X-Sample-Rate": str(sr),
                "Content-Disposition": 'attachment; filename="cloned.wav"',
            }
        )

    except HTTPException:
        # Let the 400 above (and any HTTP error from below) pass through.
        raise
    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the cause.
        logger.exception("Voice Clone 실패: %s", e)
        raise HTTPException(
            status_code=500, detail=f"Voice clone failed: {str(e)}"
        ) from e


@app.post("/voice-design")
async def voice_design(request: VoiceDesignRequest):
    """Synthesize speech with a designed voice.

    Generates a new voice from the text prompt in ``request.instruct``
    (for example, "a man in his 30s with a soft, calm voice") and
    synthesizes the request text with it. The result is returned as a WAV
    attachment with the sample rate in the ``X-Sample-Rate`` header.

    Raises:
        HTTPException: 400 if the language is not supported; 500 if
            synthesis fails for any other reason.
    """
    # Validate up front so the try block below only guards the synthesis.
    if request.language not in LANGUAGE_MAP:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid language. Available: {list(LANGUAGE_MAP.keys())}"
        )

    try:
        audio_bytes, sr = await tts_service.synthesize_design(
            text=request.text,
            instruct=request.instruct,
            language=request.language,
        )

        return Response(
            content=audio_bytes,
            media_type="audio/wav",
            headers={
                "X-Sample-Rate": str(sr),
                "Content-Disposition": 'attachment; filename="designed.wav"',
            }
        )

    except HTTPException:
        # Let HTTP errors raised further down pass through unchanged.
        raise
    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the cause.
        logger.exception("Voice Design 실패: %s", e)
        raise HTTPException(
            status_code=500, detail=f"Voice design failed: {str(e)}"
        ) from e


@app.post("/load-model")
async def load_model(model_type: str):
    """Load a specific model (administrative endpoint).

    Args:
        model_type: One of ``custom``, ``base`` or ``design``.

    Returns:
        JSON with ``status``, the requested ``model_type`` and the service's
        current ``loaded_models`` list.

    Raises:
        HTTPException: 400 if ``model_type`` is not a known type; 500 if
            loading fails.
    """
    # NOTE(review): no authentication is visible in this file for this
    # administrative endpoint — confirm it is protected upstream.
    valid_types = ["custom", "base", "design"]
    if model_type not in valid_types:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid model type. Available: {valid_types}"
        )

    try:
        # NOTE(review): this calls a private method of the service; a public
        # loader would be preferable if the service offers one.
        await tts_service._load_model(model_type)
        return JSONResponse({
            "status": "loaded",
            "model_type": model_type,
            "loaded_models": tts_service.get_loaded_models(),
        })
    except Exception as e:
        # logger.exception records the traceback; `from e` keeps the cause.
        logger.exception("모델 로드 실패: %s", e)
        raise HTTPException(
            status_code=500, detail=f"Model load failed: {str(e)}"
        ) from e