# drama-studio/audio-studio-api/app/services/tts_client.py

"""TTS 엔진 클라이언트

audio-studio-tts 서비스와 통신
"""

import os
import logging
from typing import Optional, Tuple, List

import httpx

logger = logging.getLogger(__name__)


class TTSClient:
    """HTTP client for the TTS engine (the audio-studio-tts service).

    Every call opens a short-lived ``httpx.AsyncClient``, sends a single
    request to ``base_url`` and raises ``httpx.HTTPStatusError`` when the
    engine answers with a 4xx/5xx status. The synthesis methods return the raw
    response body together with the sample rate announced in the
    ``X-Sample-Rate`` response header.
    """

    # Sample rate assumed when the response has no usable X-Sample-Rate header.
    DEFAULT_SAMPLE_RATE = 24000

    def __init__(self):
        """Read the engine URL from ``TTS_ENGINE_URL`` and configure timeouts."""
        self.base_url = os.getenv("TTS_ENGINE_URL", "http://localhost:8001")
        # Synthesis can be slow: allow 120 s for read/write/pool operations but
        # only 10 s to establish the connection.
        self.timeout = httpx.Timeout(120.0, connect=10.0)

    def _url(self, path: str) -> str:
        """Join ``base_url`` and ``path`` with exactly one slash between them."""
        # A configured URL with a trailing slash must not produce "//health".
        return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}"

    async def _request(self, method: str, path: str, **kwargs) -> "httpx.Response":
        """Send one request to the engine and return the successful response.

        Args:
            method: HTTP method name, e.g. ``"GET"`` or ``"POST"``.
            path: Endpoint path relative to ``base_url``.
            **kwargs: Forwarded to ``httpx.AsyncClient.request`` (``json``,
                ``data``, ``files``, ...).

        Raises:
            httpx.HTTPStatusError: If the engine answers with a 4xx/5xx status.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            response = await client.request(method, self._url(path), **kwargs)
            response.raise_for_status()
            return response

    @classmethod
    def _sample_rate_from(cls, response) -> int:
        """Extract the sample rate from the ``X-Sample-Rate`` response header.

        Falls back to ``DEFAULT_SAMPLE_RATE`` when the header is missing,
        not an integer, or not positive, so that a malformed header does not
        discard audio that was already synthesized.
        """
        raw = response.headers.get("X-Sample-Rate")
        if raw is None:
            return cls.DEFAULT_SAMPLE_RATE
        try:
            rate = int(raw)
        except (TypeError, ValueError):
            rate = 0
        if rate <= 0:
            logging.getLogger(__name__).warning(
                "Invalid X-Sample-Rate header %r; falling back to %d",
                raw,
                cls.DEFAULT_SAMPLE_RATE,
            )
            return cls.DEFAULT_SAMPLE_RATE
        return rate

    def _audio_result(self, response) -> Tuple[bytes, int]:
        """Convert a synthesis response into ``(audio_bytes, sample_rate)``."""
        return response.content, self._sample_rate_from(response)

    async def health_check(self) -> dict:
        """Query the engine's ``/health`` endpoint and return its JSON body."""
        response = await self._request("GET", "/health")
        return response.json()

    async def get_speakers(self) -> List[str]:
        """Return the ``speakers`` entry of the ``/speakers`` response."""
        response = await self._request("GET", "/speakers")
        return response.json()["speakers"]

    async def get_languages(self) -> dict:
        """Return the ``languages`` entry of the ``/languages`` response."""
        response = await self._request("GET", "/languages")
        return response.json()["languages"]

    async def synthesize(
        self,
        text: str,
        speaker: str = "Chelsie",
        language: str = "ko",
        instruct: Optional[str] = None,
    ) -> Tuple[bytes, int]:
        """Synthesize speech with a preset speaker voice.

        Args:
            text: Text to synthesize.
            speaker: Preset speaker name.
            language: Language code sent to the engine.
            instruct: Optional instruction string; omitted from the request
                when empty or ``None``.

        Returns:
            (audio_bytes, sample_rate)
        """
        payload = {
            "text": text,
            "speaker": speaker,
            "language": language,
        }
        if instruct:
            payload["instruct"] = instruct

        response = await self._request("POST", "/synthesize", json=payload)
        return self._audio_result(response)

    async def voice_clone(
        self,
        text: str,
        ref_audio: bytes,
        ref_text: str,
        language: str = "ko",
    ) -> Tuple[bytes, int]:
        """Synthesize speech by cloning the voice of a reference recording.

        Args:
            text: Text to synthesize.
            ref_audio: Reference audio bytes, uploaded as ``reference.wav``
                with content type ``audio/wav``.
            ref_text: Text associated with the reference audio.
            language: Language code sent to the engine.

        Returns:
            (audio_bytes, sample_rate)
        """
        # Sent as multipart/form-data: the audio as a file part, the rest as
        # ordinary form fields.
        files = {"ref_audio": ("reference.wav", ref_audio, "audio/wav")}
        data = {
            "text": text,
            "ref_text": ref_text,
            "language": language,
        }

        response = await self._request(
            "POST", "/voice-clone", files=files, data=data
        )
        return self._audio_result(response)

    async def voice_design(
        self,
        text: str,
        instruct: str,
        language: str = "ko",
    ) -> Tuple[bytes, int]:
        """Synthesize speech with a voice described by an instruction.

        Args:
            text: Text to synthesize.
            instruct: Instruction string sent to the engine.
            language: Language code sent to the engine.

        Returns:
            (audio_bytes, sample_rate)
        """
        payload = {
            "text": text,
            "instruct": instruct,
            "language": language,
        }

        response = await self._request("POST", "/voice-design", json=payload)
        return self._audio_result(response)


# Singleton instance: constructed once when this module is imported and
# intended to be shared by everything that imports it.
tts_client = TTSClient()