From f586f930b613d7a8bd712e8ecadf098d1410332c Mon Sep 17 00:00:00 2001 From: jungwoo choi Date: Mon, 10 Nov 2025 14:11:20 +0900 Subject: [PATCH] Initial commit: Multilingual Translation API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Implemented REST API for 105+ language translation - Used Facebook M2M100 model (Apache 2.0 License - Commercial use allowed) - Supports any-to-any translation between 105 languages - Major languages: English, Chinese, Spanish, Arabic, Russian, Japanese, Korean, etc. - Southeast Asian: Malay, Indonesian, Thai, Vietnamese, Tagalog, Burmese, Khmer, Lao - South Asian: Bengali, Hindi, Urdu, Tamil, Telugu, Marathi, Gujarati, etc. - European: German, French, Italian, Spanish, Portuguese, Russian, etc. - African: Swahili, Amharic, Hausa, Igbo, Yoruba, Zulu, Xhosa - And many more languages Tech Stack: - FastAPI for REST API - Transformers (Hugging Face) for ML model - PyTorch for inference - Docker for containerization - M2M100 418M parameter model Features: - Health check endpoint - Supported languages listing - Dynamic language validation - Model caching for performance - GPU support (auto-detection) - CORS enabled for web clients 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .env.example | 12 +++ .gitignore | 56 ++++++++++ CLAUDE.md | 143 +++++++++++++++++++++++++ Dockerfile | 27 +++++ README.md | 137 ++++++++++++++++++++++++ app/__init__.py | 0 app/config.py | 27 +++++ app/main.py | 255 ++++++++++++++++++++++++++++++++++++++++++++ app/models.py | 51 +++++++++ app/translator.py | 259 +++++++++++++++++++++++++++++++++++++++++++++ docker-compose.yml | 21 ++++ requirements.txt | 9 ++ run.py | 14 +++ 13 files changed, 1011 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 CLAUDE.md create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 app/__init__.py create mode 100644 app/config.py 
create mode 100644 app/main.py create mode 100644 app/models.py create mode 100644 app/translator.py create mode 100644 docker-compose.yml create mode 100644 requirements.txt create mode 100644 run.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..bacd5ca --- /dev/null +++ b/.env.example @@ -0,0 +1,12 @@ +# API Configuration +API_HOST=0.0.0.0 +API_PORT=8000 +API_TITLE=Malaysian Language Translation API +API_VERSION=1.0.0 + +# Model Configuration +MODEL_CACHE_DIR=./models +MAX_LENGTH=512 + +# CORS Settings (JSON array of origins, e.g. ["https://example.com"]) +ALLOWED_ORIGINS=["*"] diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..74e76b3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,56 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +venv/ +env/ +ENV/ +.venv + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Environment variables +.env +.env.local + +# Model cache +models/ +*.bin +*.safetensors + +# Logs +*.log +logs/ + +# OS +.DS_Store +Thumbs.db + +# Testing +.pytest_cache/ +.coverage +htmlcov/ diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..7716194 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,143 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +This is a multilingual translation API service built with FastAPI and Hugging Face Transformers. It provides any-to-any translation between the language codes registered in `app/translator.py` (Malay and English among them) using the Facebook M2M100 model (`facebook/m2m100_418M`). 
+ +## Development Commands + +### Local Development + +```bash +# Setup virtual environment and install dependencies +python -m venv venv +source venv/bin/activate # Windows: venv\Scripts\activate +pip install -r requirements.txt + +# Run the development server (with auto-reload) +python run.py + +# Or run with uvicorn directly +uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 +``` + +### Docker Development + +```bash +# Build and run with Docker Compose +docker-compose up -d + +# View logs +docker-compose logs -f + +# Stop services +docker-compose down + +# Rebuild after code changes +docker-compose up -d --build +``` + +### Testing the API + +```bash +# Health check +curl http://localhost:8000/health + +# Translate Malay to English +curl -X POST "http://localhost:8000/api/translate" \ + -H "Content-Type: application/json" \ + -d '{"text": "Selamat pagi", "source_lang": "ms", "target_lang": "en"}' + +# Translate English to Malay +curl -X POST "http://localhost:8000/api/translate" \ + -H "Content-Type: application/json" \ + -d '{"text": "Good morning", "source_lang": "en", "target_lang": "ms"}' +``` + +## Architecture + +### Core Components + +1. **app/main.py** - FastAPI application with endpoint definitions + - Lifespan events handle model preloading on startup + - CORS middleware configured for cross-origin requests + - Four endpoints: root (`/`), health (`/health`), translate (`/api/translate`), supported languages (`/api/supported-languages`) + +2. **app/translator.py** - Translation service singleton + - Manages loading and in-memory caching of the M2M100 model and tokenizer + - Automatically detects and uses GPU if available (CUDA) + - Supports lazy loading - models are loaded on first use or preloaded at startup + - A single model, `facebook/m2m100_418M`, serves every language pair; the target language is selected with a forced BOS token + +3. 
**app/models.py** - Pydantic schemas for request/response validation + - `TranslationRequest`: Validates input (text, source_lang, target_lang) + - `TranslationResponse`: Structured output with metadata + - Language codes are plain strings (2-5 characters) that a validator lower-cases and strips; there is no `LanguageCode` enum + +4. **app/config.py** - Configuration management using pydantic-settings + - Loads settings from environment variables or `.env` file + - Default values provided for all settings + +### Translation Flow + +1. Request received at `/api/translate` endpoint +2. Pydantic validates request schema +3. TranslationService looks up both language codes in `lang_codes` (an unknown code raises `ValueError`, returned as HTTP 400) +4. Model is loaded if not already cached in memory +5. Text is tokenized, translated, and decoded +6. Response includes original text, translation, and model metadata + +### Model Caching + +- Models are downloaded to `MODEL_CACHE_DIR` (default: `./models/`) +- Once downloaded, models persist across restarts +- In Docker, use volume mount to persist models +- First startup (or first translation request, if preloading failed) may be slow due to model download (the M2M100 418M checkpoint is roughly 2 GB) + +### Device Selection + +The translator automatically detects GPU availability: +- CUDA GPU: Used automatically if available for faster inference +- CPU: Fallback option, slower but works everywhere + +## Configuration + +Environment variables (see `.env.example`): +- `API_HOST` / `API_PORT`: Server binding +- `MODEL_CACHE_DIR`: Where to store downloaded models +- `MAX_LENGTH`: Maximum token length for translation (default 512) +- `ALLOWED_ORIGINS`: CORS allowed origins (parsed by pydantic-settings as a JSON array, e.g. `["*"]`) + +## Common Tasks + +### Adding New Language Pairs + +To add support for additional languages: + +1. Check that the language is one of those supported by M2M100 (see https://huggingface.co/facebook/m2m100_418M) +2. Update `app/translator.py` - Add the language code to the `lang_codes` mapping in `TranslationService.__init__` +3. No model mapping or schema change is needed - `_get_model_info()` always uses `facebook/m2m100_418M`, and `app/models.py` accepts any 2-5 character code +4. 
Update `app/main.py` - Add language info to `/api/supported-languages` endpoint + +### Modifying Translation Behavior + +Translation parameters are in `app/translator.py` in the `translate()` method: +- Adjust `max_length` in tokenizer call to handle longer texts +- Modify generation parameters passed to `model.generate()` for different translation strategies + +### Production Deployment + +For production use: +1. Set `reload=False` in `run.py` or use production-ready uvicorn command +2. Configure proper `ALLOWED_ORIGINS` instead of "*" +3. Add authentication middleware if needed +4. Consider using multiple workers: `uvicorn app.main:app --workers 4` +5. Mount persistent volume for `models/` directory in Docker + +## API Documentation + +When the server is running, interactive API documentation is available at: +- Swagger UI: http://localhost:8000/docs +- ReDoc: http://localhost:8000/redoc diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..386f4c7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements first for better caching +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY . . + +# Create directory for model cache +RUN mkdir -p /app/models + +# Expose port +EXPOSE 8000 + +# Run the application +CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..ccd02c8 --- /dev/null +++ b/README.md @@ -0,0 +1,137 @@ +# Malaysian Language Translation API + +말레이시아어(Bahasa Melayu)와 영어 간의 자연스러운 번역을 제공하는 REST API 서비스입니다. 
+ +## 주요 기능 + +- 말레이시아어·영어를 포함한 다국어 간 번역 (any-to-any) +- Facebook M2M100(`facebook/m2m100_418M`) 신경망 기계 번역 모델 사용 +- FastAPI 기반 고성능 REST API +- Docker를 통한 간편한 배포 +- 자동 API 문서화 (Swagger UI) + +## 빠른 시작 + +### 1. 로컬 환경 설정 + +```bash +# Python 가상환경 생성 및 활성화 +python -m venv venv +source venv/bin/activate # Windows: venv\Scripts\activate + +# 의존성 설치 +pip install -r requirements.txt + +# 환경변수 설정 (선택사항) +cp .env.example .env + +# 서버 실행 +python run.py +``` + +서버가 실행되면 http://localhost:8000 에서 API에 접근할 수 있습니다. + +### 2. Docker로 실행 + +```bash +# Docker Compose로 실행 (호스트에서는 8001 포트로 노출됩니다) +docker-compose up -d + +# 로그 확인 +docker-compose logs -f +``` + +## API 사용법 + +### 번역 요청 + +```bash +curl -X POST "http://localhost:8000/api/translate" \ + -H "Content-Type: application/json" \ + -d '{ + "text": "Selamat pagi, apa khabar?", + "source_lang": "ms", + "target_lang": "en" + }' +``` + +응답: +```json +{ + "original_text": "Selamat pagi, apa khabar?", + "translated_text": "Good morning, how are you?", + "source_lang": "ms", + "target_lang": "en", + "model_used": "facebook/m2m100_418M" +} +``` + +### 지원 언어 확인 + +```bash +curl http://localhost:8000/api/supported-languages +``` + +### 헬스체크 + +```bash +curl http://localhost:8000/health +``` + +## API 문서 + +서버 실행 후 다음 URL에서 자동 생성된 API 문서를 확인할 수 있습니다: + +- Swagger UI: http://localhost:8000/docs +- ReDoc: http://localhost:8000/redoc + +## 지원 언어 + +- **ms**: 말레이시아어 (Bahasa Melayu) +- **en**: 영어 (English) 외 다수 — 전체 목록은 `/api/supported-languages` 에서 확인할 수 있습니다 + +## 프로젝트 구조 + +``` +site13/ +├── app/ +│ ├── __init__.py +│ ├── main.py # FastAPI 애플리케이션 및 엔드포인트 +│ ├── models.py # Pydantic 데이터 모델 +│ ├── translator.py # 번역 서비스 로직 +│ └── config.py # 설정 관리 +├── models/ # 다운로드된 번역 모델 캐시 (자동 생성) +├── requirements.txt # Python 의존성 +├── run.py # 서버 실행 스크립트 +├── Dockerfile # Docker 이미지 빌드 설정 +├── docker-compose.yml # Docker Compose 설정 +└── README.md # 프로젝트 문서 +``` + +## 환경 변수 + +`.env` 파일에서 다음 설정을 변경할 수 있습니다: + +- `API_HOST`: API 서버 호스트 (기본값: 0.0.0.0) +- `API_PORT`: API 서버 포트 (기본값: 8000) +- `MODEL_CACHE_DIR`: 모델 캐시 디렉토리 (기본값: ./models) +- `MAX_LENGTH`: 최대 번역 길이 
(기본값: 512) +- `ALLOWED_ORIGINS`: CORS 허용 오리진 (기본값: *) + +## 성능 최적화 + +- 첫 실행 시 모델을 다운로드하므로 시간이 걸릴 수 있습니다 +- 모델은 `models/` 디렉토리에 캐시되어 재사용됩니다 +- GPU가 있는 경우 자동으로 감지하여 사용합니다 +- Docker volume을 사용하여 모델을 영구 저장합니다 + +## 개발 + +```bash +# 개발 모드로 실행 (자동 재시작) +uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 +``` + +## 라이선스 + +이 프로젝트는 Helsinki-NLP의 OPUS-MT 모델을 사용합니다. diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..56c0924 --- /dev/null +++ b/app/config.py @@ -0,0 +1,27 @@ +from pydantic_settings import BaseSettings +from typing import List + + +class Settings(BaseSettings): + """Application settings""" + + # API Configuration + api_host: str = "0.0.0.0" + api_port: int = 8000 + api_title: str = "Malaysian Language Translation API" + api_version: str = "1.0.0" + api_description: str = "API for translating between Malay and English using neural machine translation" + + # Model Configuration + model_cache_dir: str = "./models" + max_length: int = 512 + + # CORS Settings + allowed_origins: List[str] = ["*"] + + class Config: + env_file = ".env" + case_sensitive = False + + +settings = Settings() diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..67a1dea --- /dev/null +++ b/app/main.py @@ -0,0 +1,255 @@ +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from contextlib import asynccontextmanager +import logging + +from .config import settings +from .models import TranslationRequest, TranslationResponse, HealthResponse +from .translator import translator + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Lifecycle event handler for startup and shutdown""" + # Startup + logger.info("Starting Malaysian 
Translation API...") + try: + # Preload translation models + logger.info("Preloading translation models...") + translator.preload_all_models() + logger.info("Models loaded successfully") + except Exception as e: + logger.error(f"Error during startup: {str(e)}") + raise + + yield + + # Shutdown + logger.info("Shutting down Malaysian Translation API...") + + +# Create FastAPI app +app = FastAPI( + title=settings.api_title, + version=settings.api_version, + description=settings.api_description, + lifespan=lifespan +) + +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=settings.allowed_origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +@app.get("/", response_model=dict) +async def root(): + """Root endpoint with API information""" + return { + "name": settings.api_title, + "version": settings.api_version, + "description": settings.api_description, + "endpoints": { + "translate": "/api/translate", + "health": "/health", + "docs": "/docs" + } + } + + +@app.get("/health", response_model=HealthResponse) +async def health_check(): + """Health check endpoint""" + models_ready = translator.is_ready() + + return HealthResponse( + status="healthy" if models_ready else "degraded", + message="Translation service is running" if models_ready else "Models not loaded", + models_loaded=models_ready + ) + + +@app.post("/api/translate", response_model=TranslationResponse) +async def translate_text(request: TranslationRequest): + """ + Translate text between Malay and English + + - **text**: Text to translate (1-5000 characters) + - **source_lang**: Source language code ('ms' for Malay, 'en' for English) + - **target_lang**: Target language code ('ms' for Malay, 'en' for English) + """ + # Validate language pair + if request.source_lang == request.target_lang: + raise HTTPException( + status_code=400, + detail="Source and target languages must be different" + ) + + try: + # Perform translation + translated_text, model_used 
= translator.translate( + text=request.text, + source_lang=request.source_lang, + target_lang=request.target_lang + ) + + return TranslationResponse( + original_text=request.text, + translated_text=translated_text, + source_lang=request.source_lang, + target_lang=request.target_lang, + model_used=model_used + ) + + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + logger.error(f"Translation error: {str(e)}") + raise HTTPException( + status_code=500, + detail="Translation failed. Please try again." + ) + + +@app.get("/api/supported-languages") +async def get_supported_languages(): + """Get list of supported languages""" + + # Language names mapping + lang_names = { + "en": {"name": "English", "native": "English"}, + "zh": {"name": "Chinese", "native": "中文"}, + "es": {"name": "Spanish", "native": "Español"}, + "ar": {"name": "Arabic", "native": "العربية"}, + "hi": {"name": "Hindi", "native": "हिन्दी"}, + "bn": {"name": "Bengali", "native": "বাংলা"}, + "pt": {"name": "Portuguese", "native": "Português"}, + "ru": {"name": "Russian", "native": "Русский"}, + "ja": {"name": "Japanese", "native": "日本語"}, + "de": {"name": "German", "native": "Deutsch"}, + "fr": {"name": "French", "native": "Français"}, + "ko": {"name": "Korean", "native": "한국어"}, + "it": {"name": "Italian", "native": "Italiano"}, + "tr": {"name": "Turkish", "native": "Türkçe"}, + "vi": {"name": "Vietnamese", "native": "Tiếng Việt"}, + "th": {"name": "Thai", "native": "ไทย"}, + "pl": {"name": "Polish", "native": "Polski"}, + "nl": {"name": "Dutch", "native": "Nederlands"}, + "uk": {"name": "Ukrainian", "native": "Українська"}, + "ro": {"name": "Romanian", "native": "Română"}, + "ms": {"name": "Malay", "native": "Bahasa Melayu"}, + "id": {"name": "Indonesian", "native": "Bahasa Indonesia"}, + "tl": {"name": "Tagalog", "native": "Tagalog"}, + "my": {"name": "Burmese", "native": "မြန်မာဘာသာ"}, + "km": {"name": "Khmer", "native": "ភាសាខ្មែរ"}, + "lo": 
{"name": "Lao", "native": "ລາວ"}, + "ur": {"name": "Urdu", "native": "اردو"}, + "ta": {"name": "Tamil", "native": "தமிழ்"}, + "te": {"name": "Telugu", "native": "తెలుగు"}, + "mr": {"name": "Marathi", "native": "मराठी"}, + "gu": {"name": "Gujarati", "native": "ગુજરાતી"}, + "kn": {"name": "Kannada", "native": "ಕನ್ನಡ"}, + "ml": {"name": "Malayalam", "native": "മലയാളം"}, + "pa": {"name": "Punjabi", "native": "ਪੰਜਾਬੀ"}, + "ne": {"name": "Nepali", "native": "नेपाली"}, + "si": {"name": "Sinhala", "native": "සිංහල"}, + "sv": {"name": "Swedish", "native": "Svenska"}, + "da": {"name": "Danish", "native": "Dansk"}, + "fi": {"name": "Finnish", "native": "Suomi"}, + "no": {"name": "Norwegian", "native": "Norsk"}, + "cs": {"name": "Czech", "native": "Čeština"}, + "sk": {"name": "Slovak", "native": "Slovenčina"}, + "hu": {"name": "Hungarian", "native": "Magyar"}, + "bg": {"name": "Bulgarian", "native": "Български"}, + "sr": {"name": "Serbian", "native": "Српски"}, + "hr": {"name": "Croatian", "native": "Hrvatski"}, + "sl": {"name": "Slovenian", "native": "Slovenščina"}, + "et": {"name": "Estonian", "native": "Eesti"}, + "lv": {"name": "Latvian", "native": "Latviešu"}, + "lt": {"name": "Lithuanian", "native": "Lietuvių"}, + "el": {"name": "Greek", "native": "Ελληνικά"}, + "he": {"name": "Hebrew", "native": "עברית"}, + "fa": {"name": "Persian", "native": "فارسی"}, + "sw": {"name": "Swahili", "native": "Kiswahili"}, + "am": {"name": "Amharic", "native": "አማርኛ"}, + "ha": {"name": "Hausa", "native": "Hausa"}, + "ig": {"name": "Igbo", "native": "Igbo"}, + "yo": {"name": "Yoruba", "native": "Yorùbá"}, + "zu": {"name": "Zulu", "native": "isiZulu"}, + "xh": {"name": "Xhosa", "native": "isiXhosa"}, + "af": {"name": "Afrikaans", "native": "Afrikaans"}, + "az": {"name": "Azerbaijani", "native": "Azərbaycan"}, + "ka": {"name": "Georgian", "native": "ქართული"}, + "kk": {"name": "Kazakh", "native": "Қазақша"}, + "uz": {"name": "Uzbek", "native": "Oʻzbekcha"}, + "mn": {"name": "Mongolian", 
"native": "Монгол"}, + "sq": {"name": "Albanian", "native": "Shqip"}, + "hy": {"name": "Armenian", "native": "Հայերեն"}, + "be": {"name": "Belarusian", "native": "Беларуская"}, + "bs": {"name": "Bosnian", "native": "Bosanski"}, + "ca": {"name": "Catalan", "native": "Català"}, + "ceb": {"name": "Cebuano", "native": "Cebuano"}, + "cy": {"name": "Welsh", "native": "Cymraeg"}, + "eo": {"name": "Esperanto", "native": "Esperanto"}, + "eu": {"name": "Basque", "native": "Euskara"}, + "fil": {"name": "Filipino", "native": "Filipino"}, + "fy": {"name": "Frisian", "native": "Frysk"}, + "ga": {"name": "Irish", "native": "Gaeilge"}, + "gd": {"name": "Scottish Gaelic", "native": "Gàidhlig"}, + "gl": {"name": "Galician", "native": "Galego"}, + "haw": {"name": "Hawaiian", "native": "ʻŌlelo Hawaiʻi"}, + "hmn": {"name": "Hmong", "native": "Hmong"}, + "ht": {"name": "Haitian Creole", "native": "Kreyòl ayisyen"}, + "is": {"name": "Icelandic", "native": "Íslenska"}, + "jv": {"name": "Javanese", "native": "Basa Jawa"}, + "ku": {"name": "Kurdish", "native": "Kurdî"}, + "ky": {"name": "Kyrgyz", "native": "Кыргызча"}, + "la": {"name": "Latin", "native": "Latina"}, + "lb": {"name": "Luxembourgish", "native": "Lëtzebuergesch"}, + "lg": {"name": "Luganda", "native": "Luganda"}, + "ln": {"name": "Lingala", "native": "Lingála"}, + "mg": {"name": "Malagasy", "native": "Malagasy"}, + "mi": {"name": "Maori", "native": "Te Reo Māori"}, + "mk": {"name": "Macedonian", "native": "Македонски"}, + "mt": {"name": "Maltese", "native": "Malti"}, + "ny": {"name": "Chichewa", "native": "Chichewa"}, + "ps": {"name": "Pashto", "native": "پښتو"}, + "sn": {"name": "Shona", "native": "chiShona"}, + "so": {"name": "Somali", "native": "Soomaali"}, + "st": {"name": "Sesotho", "native": "Sesotho"}, + "su": {"name": "Sundanese", "native": "Basa Sunda"}, + "tg": {"name": "Tajik", "native": "Тоҷикӣ"}, + "tk": {"name": "Turkmen", "native": "Türkmençe"}, + "ug": {"name": "Uyghur", "native": "ئۇيغۇرچە"}, + "yi": {"name": 
"Yiddish", "native": "ייִדיש"}, + } + + # Get all supported language codes from translator + supported_codes = list(translator.lang_codes.keys()) + + # Build language list + languages = [ + { + "code": code, + "name": lang_names.get(code, {}).get("name", code.upper()), + "native_name": lang_names.get(code, {}).get("native", code.upper()) + } + for code in sorted(supported_codes) + ] + + return { + "languages": languages, + "total_languages": len(languages), + "note": "All language pairs are supported (any-to-any translation)" + } diff --git a/app/models.py b/app/models.py new file mode 100644 index 0000000..2c07e18 --- /dev/null +++ b/app/models.py @@ -0,0 +1,51 @@ +from pydantic import BaseModel, Field, field_validator +from typing import Optional + + +class TranslationRequest(BaseModel): + """Translation request schema""" + text: str = Field(..., description="Text to translate", min_length=1, max_length=5000) + source_lang: str = Field(..., description="Source language code (e.g., 'en', 'ms', 'bn', etc.)", min_length=2, max_length=5) + target_lang: str = Field(..., description="Target language code (e.g., 'en', 'ms', 'bn', etc.)", min_length=2, max_length=5) + + @field_validator('source_lang', 'target_lang') + @classmethod + def validate_lang_code(cls, v: str) -> str: + """Validate language code format""" + return v.lower().strip() + + class Config: + json_schema_extra = { + "example": { + "text": "Selamat pagi, apa khabar?", + "source_lang": "ms", + "target_lang": "en" + } + } + + +class TranslationResponse(BaseModel): + """Translation response schema""" + original_text: str = Field(..., description="Original input text") + translated_text: str = Field(..., description="Translated text") + source_lang: str = Field(..., description="Source language code") + target_lang: str = Field(..., description="Target language code") + model_used: str = Field(..., description="Translation model identifier") + + class Config: + json_schema_extra = { + "example": { + 
"original_text": "Selamat pagi, apa khabar?", + "translated_text": "Good morning, how are you?", + "source_lang": "ms", + "target_lang": "en", + "model_used": "Helsinki-NLP/opus-mt-ms-en" + } + } + + +class HealthResponse(BaseModel): + """Health check response""" + status: str + message: str + models_loaded: bool diff --git a/app/translator.py b/app/translator.py new file mode 100644 index 0000000..9fc1755 --- /dev/null +++ b/app/translator.py @@ -0,0 +1,259 @@ +from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer +import torch +from typing import Dict, Optional +import logging +from .config import settings + +logger = logging.getLogger(__name__) + + +class TranslationService: + """ + Service for handling multilingual translation + Uses M2M100 model (Apache 2.0 License - Commercial use allowed) + Supports 100 languages for many-to-many translation + """ + + def __init__(self): + self.models: Dict[str, Dict] = {} + self.device = "cuda" if torch.cuda.is_available() else "cpu" + logger.info(f"Using device: {self.device}") + + # M2M100 supported language codes (100 languages) + # Full list: https://huggingface.co/facebook/m2m100_418M + self.lang_codes = { + # Major languages + "en": "en", # English + "zh": "zh", # Chinese + "es": "es", # Spanish + "ar": "ar", # Arabic + "hi": "hi", # Hindi + "bn": "bn", # Bengali + "pt": "pt", # Portuguese + "ru": "ru", # Russian + "ja": "ja", # Japanese + "de": "de", # German + "fr": "fr", # French + "ko": "ko", # Korean + "it": "it", # Italian + "tr": "tr", # Turkish + "vi": "vi", # Vietnamese + "th": "th", # Thai + "pl": "pl", # Polish + "nl": "nl", # Dutch + "uk": "uk", # Ukrainian + "ro": "ro", # Romanian + + # Southeast Asian languages + "ms": "ms", # Malay + "id": "id", # Indonesian + "tl": "tl", # Tagalog + "my": "my", # Burmese + "km": "km", # Khmer + "lo": "lo", # Lao + + # South Asian languages + "ur": "ur", # Urdu + "ta": "ta", # Tamil + "te": "te", # Telugu + "mr": "mr", # Marathi + "gu": "gu", # 
Gujarati + "kn": "kn", # Kannada + "ml": "ml", # Malayalam + "pa": "pa", # Punjabi + "ne": "ne", # Nepali + "si": "si", # Sinhala + + # European languages + "sv": "sv", # Swedish + "da": "da", # Danish + "fi": "fi", # Finnish + "no": "no", # Norwegian + "cs": "cs", # Czech + "sk": "sk", # Slovak + "hu": "hu", # Hungarian + "bg": "bg", # Bulgarian + "sr": "sr", # Serbian + "hr": "hr", # Croatian + "sl": "sl", # Slovenian + "et": "et", # Estonian + "lv": "lv", # Latvian + "lt": "lt", # Lithuanian + "el": "el", # Greek + "he": "he", # Hebrew + "fa": "fa", # Persian + + # African languages + "sw": "sw", # Swahili + "am": "am", # Amharic + "ha": "ha", # Hausa + "ig": "ig", # Igbo + "yo": "yo", # Yoruba + "zu": "zu", # Zulu + "xh": "xh", # Xhosa + "af": "af", # Afrikaans + + # Other major languages + "az": "az", # Azerbaijani + "ka": "ka", # Georgian + "kk": "kk", # Kazakh + "uz": "uz", # Uzbek + "mn": "mn", # Mongolian + + # Additional languages (completing 100) + "sq": "sq", # Albanian + "hy": "hy", # Armenian + "be": "be", # Belarusian + "bs": "bs", # Bosnian + "ca": "ca", # Catalan + "ceb": "ceb", # Cebuano + "cy": "cy", # Welsh + "eo": "eo", # Esperanto + "eu": "eu", # Basque + "fil": "fil", # Filipino + "fy": "fy", # Frisian + "ga": "ga", # Irish + "gd": "gd", # Scottish Gaelic + "gl": "gl", # Galician + "haw": "haw", # Hawaiian + "hmn": "hmn", # Hmong + "ht": "ht", # Haitian Creole + "is": "is", # Icelandic + "jv": "jv", # Javanese + "kn": "kn", # Kannada + "ku": "ku", # Kurdish + "ky": "ky", # Kyrgyz + "la": "la", # Latin + "lb": "lb", # Luxembourgish + "lg": "lg", # Luganda + "ln": "ln", # Lingala + "mg": "mg", # Malagasy + "mi": "mi", # Maori + "mk": "mk", # Macedonian + "mt": "mt", # Maltese + "ny": "ny", # Chichewa + "ps": "ps", # Pashto + "sn": "sn", # Shona + "so": "so", # Somali + "st": "st", # Sesotho + "su": "su", # Sundanese + "tg": "tg", # Tajik + "tk": "tk", # Turkmen + "ug": "ug", # Uyghur + "yi": "yi", # Yiddish + } + + def _get_model_info(self, 
source_lang: str, target_lang: str) -> tuple[str, str, str]: + """Get the model name and language codes for translation""" + # Using M2M100 418M model (smaller, faster, commercial-friendly) + model_name = "facebook/m2m100_418M" + src_code = self.lang_codes.get(source_lang) + tgt_code = self.lang_codes.get(target_lang) + + if not src_code or not tgt_code: + raise ValueError(f"Unsupported language pair: {source_lang} -> {target_lang}") + + return model_name, src_code, tgt_code + + def load_model(self, source_lang: str, target_lang: str) -> None: + """Load translation model for specific language pair""" + model_name, _, _ = self._get_model_info(source_lang, target_lang) + + if model_name in self.models: + logger.info(f"Model {model_name} already loaded") + return + + try: + logger.info(f"Loading model: {model_name}") + tokenizer = M2M100Tokenizer.from_pretrained( + model_name, + cache_dir=settings.model_cache_dir + ) + model = M2M100ForConditionalGeneration.from_pretrained( + model_name, + cache_dir=settings.model_cache_dir + ).to(self.device) + + self.models[model_name] = { + "tokenizer": tokenizer, + "model": model + } + logger.info(f"Successfully loaded model: {model_name}") + + except Exception as e: + logger.error(f"Error loading model {model_name}: {str(e)}") + raise + + def translate(self, text: str, source_lang: str, target_lang: str) -> tuple[str, str]: + """ + Translate text from source language to target language + + Args: + text: Text to translate + source_lang: Source language code + target_lang: Target language code + + Returns: + Tuple of (translated_text, model_name) + """ + model_name, src_code, tgt_code = self._get_model_info(source_lang, target_lang) + + # Load model if not already loaded + if model_name not in self.models: + self.load_model(source_lang, target_lang) + + try: + tokenizer = self.models[model_name]["tokenizer"] + model = self.models[model_name]["model"] + + # Set source language for tokenizer + tokenizer.src_lang = src_code + + # 
Tokenize input + inputs = tokenizer( + text, + return_tensors="pt", + padding=True, + truncation=True, + max_length=settings.max_length + ).to(self.device) + + # Generate translation - M2M100 uses target language token + generated_tokens = tokenizer.get_lang_id(tgt_code) + + with torch.no_grad(): + translated = model.generate( + **inputs, + forced_bos_token_id=generated_tokens, + max_length=settings.max_length + ) + + # Decode output + translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0] + + return translated_text, model_name + + except Exception as e: + logger.error(f"Translation error: {str(e)}") + raise + + def preload_all_models(self) -> None: + """Preload all supported translation models""" + language_pairs = [ + ("ms", "en"), + ("en", "ms") + ] + + for source, target in language_pairs: + try: + self.load_model(source, target) + except Exception as e: + logger.warning(f"Could not preload model for {source}->{target}: {str(e)}") + + def is_ready(self) -> bool: + """Check if at least one model is loaded""" + return len(self.models) > 0 + + +# Global translator instance +translator = TranslationService() diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..bfd5356 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,21 @@ +version: '3.8' + +services: + translation-api: + build: . 
+ container_name: malaysian-translation-api + ports: + - "8001:8000" + environment: + - API_HOST=0.0.0.0 + - API_PORT=8000 + - MODEL_CACHE_DIR=/app/models + volumes: + - ./models:/app/models # Persist downloaded models + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9625e0a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +fastapi>=0.104.1 +uvicorn[standard]>=0.24.0 +transformers>=4.35.0 +torch>=2.1.0 +sentencepiece>=0.1.99 +sacremoses>=0.1.1 +pydantic>=2.5.0 +pydantic-settings>=2.1.0 +python-multipart>=0.0.6 diff --git a/run.py b/run.py new file mode 100644 index 0000000..6b542d6 --- /dev/null +++ b/run.py @@ -0,0 +1,14 @@ +""" +Main entry point for running the Malaysian Translation API server +""" +import uvicorn +from app.config import settings + +if __name__ == "__main__": + uvicorn.run( + "app.main:app", + host=settings.api_host, + port=settings.api_port, + reload=True, # Set to False in production + log_level="info" + )