Files
multilingual-translation/app/main.py
jungwoo choi 578be1fd55 Add complete NLLB-200 support with all 204 FLORES-200 languages
Updated dual model system to fully support both M2M100 and NLLB-200:

**NLLB-200 Model (204 languages)**
- Added all 204 FLORES-200 language codes to nllb200_lang_codes dictionary
- Updated language code mappings with FLORES-200 format (xxx_Yyyy)
- Added 24+ NLLB-exclusive languages including:
  - Southeast Asian: Acehnese, Balinese, Banjar, Buginese, Minangkabau
  - South Asian: Assamese, Awadhi, Bhojpuri, Chhattisgarhi, Magahi, Maithili, Meitei, Odia, Santali
  - African: Akan, Bambara, Bemba, Chokwe, Dyula, Fon, Kikuyu, Kimbundu, Kongo, Luba-Kasai, Luo, Mossi, Nuer
  - Arabic dialects: Mesopotamian, Najdi, Moroccan, Egyptian, Tunisian, South/North Levantine
  - European regional: Asturian, Friulian, Latgalian, Ligurian, Limburgish, Lombard, Norwegian Nynorsk/Bokmål, Occitan, Sardinian, Sicilian, Silesian, Venetian
  - Other: Dzongkha, Fijian, Guarani, Kabyle, Kabuverdianu, Papiamento, Quechua, Samoan, Sango, Shan, Tamasheq, Tibetan, Tok Pisin

**Updated Files**
- app/translator.py: Complete NLLB-200 language mappings (204 languages)
- app/main.py: Added display names for all 204+ language codes
- README.md: Updated with dual model system, NLLB-200 details, license info
- CLAUDE.md: Updated developer documentation with model architecture

**Testing**
- Verified M2M100: 105 languages working 
- Verified NLLB-200: 204 languages working 
- Tested NLLB-exclusive languages (Bemba, Fon, etc.) 

**License Information**
- M2M100: Apache 2.0 - Commercial use allowed
- NLLB-200: CC-BY-NC 4.0 - Non-commercial only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-11 16:19:50 +09:00

392 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import logging
from .config import settings
from .models import TranslationRequest, TranslationResponse, HealthResponse
from .translator import translator
# Root logging setup: INFO threshold, records rendered as
# "<timestamp> - <logger name> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by the lifespan hook and the request handlers.
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifecycle event handler for startup and shutdown.

    On startup, every translation model is preloaded through
    ``translator.preload_all_models()``. If preloading fails, the error is
    logged together with its traceback and re-raised to the caller. Control
    is then yielded for the lifetime of the application; shutdown only logs
    a message.

    Args:
        app: The FastAPI application this handler is attached to. It is not
            used inside the body.

    Raises:
        Exception: Whatever ``translator.preload_all_models()`` raises.
    """
    # Startup
    logger.info("Starting Multilingual Translation API...")
    logger.info("Preloading translation models...")
    try:
        # Keep the try body down to the single call that is expected to fail.
        translator.preload_all_models()
    except Exception as e:
        # logger.exception records the traceback, which a plain error() call
        # followed by a bare raise would otherwise leave out of the log.
        logger.exception("Error during startup: %s", e)
        raise
    logger.info("Models loaded successfully")

    yield

    # Shutdown
    logger.info("Shutting down Multilingual Translation API...")
# Application instance: metadata is read from settings, and startup/shutdown
# work is delegated to the lifespan context manager defined above.
app = FastAPI(
    lifespan=lifespan,
    title=settings.api_title,
    version=settings.api_version,
    description=settings.api_description,
)

# Cross-origin access: any method and any header, for the configured origins.
# NOTE(review): credentials are enabled here, so confirm that
# settings.allowed_origins is an explicit origin list rather than a wildcard.
_cors_options = {
    "allow_origins": settings.allowed_origins,
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_options)
@app.get("/", response_model=dict)
async def root():
    """Root endpoint with API information"""
    # Paths of the main routes exposed by this service.
    endpoint_paths = {
        "translate": "/api/translate",
        "health": "/health",
        "docs": "/docs",
    }
    # Title, version and description are read from the application settings.
    api_info = {
        "name": settings.api_title,
        "version": settings.api_version,
        "description": settings.api_description,
        "endpoints": endpoint_paths,
    }
    return api_info
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint"""
    # Ask the translator once; the same value drives all three response fields.
    ready = translator.is_ready()
    if ready:
        status, message = "healthy", "Translation service is running"
    else:
        status, message = "degraded", "Models not loaded"
    return HealthResponse(status=status, message=message, models_loaded=ready)
@app.post("/api/translate", response_model=TranslationResponse)
async def translate_text(request: TranslationRequest):
    """
    Translate text between languages using the requested translation model
    - **text**: Text to translate (1-5000 characters)
    - **source_lang**: Source language code (e.g., 'en', 'ko', 'ms', 'bn', 'ja', 'zh', etc.)
    - **target_lang**: Target language code (e.g., 'en', 'ko', 'ms', 'bn', 'ja', 'zh', etc.)
    - **model**: Translation model to use; the model actually used is reported back in `model_used`
    Supports any-to-any translation between the languages of the chosen model. See /api/supported-languages for full list.
    """
    # Error contract of this handler:
    #   400 - source and target language are identical
    #   400 - a ValueError is raised while translating or building the response
    #   500 - any other failure (logged with traceback, generic message returned)

    # Validate language pair
    if request.source_lang == request.target_lang:
        raise HTTPException(
            status_code=400,
            detail="Source and target languages must be different"
        )

    try:
        # Perform translation. The translator returns the translated text and
        # the identifier of the model it used.
        # NOTE(review): this is a synchronous call inside an async handler; if
        # it is CPU-bound it will hold the event loop for its whole duration.
        translated_text, model_used = translator.translate(
            text=request.text,
            source_lang=request.source_lang,
            target_lang=request.target_lang,
            model_type=request.model
        )
        # Built inside the try block on purpose, so that errors raised while
        # constructing the response keep being mapped by the handlers below.
        return TranslationResponse(
            original_text=request.text,
            translated_text=translated_text,
            source_lang=request.source_lang,
            target_lang=request.target_lang,
            model_used=model_used
        )
    except ValueError as e:
        # A ValueError's message is passed through to the client as the detail.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # Keep the traceback in the server log; the client only sees a
        # generic message.
        logger.exception("Translation error: %s", e)
        raise HTTPException(
            status_code=500,
            detail="Translation failed. Please try again."
        ) from e
# Display names for language codes, keyed by the code the translator reports.
# Built once at import time instead of on every request. Codes missing from
# this table fall back to their upper-cased code in the response.
_LANG_NAMES = {
    "en": {"name": "English", "native": "English"},
    "zh": {"name": "Chinese", "native": "中文"},
    "es": {"name": "Spanish", "native": "Español"},
    "ar": {"name": "Arabic", "native": "العربية"},
    "hi": {"name": "Hindi", "native": "हिन्दी"},
    "bn": {"name": "Bengali", "native": "বাংলা"},
    "pt": {"name": "Portuguese", "native": "Português"},
    "ru": {"name": "Russian", "native": "Русский"},
    "ja": {"name": "Japanese", "native": "日本語"},
    "de": {"name": "German", "native": "Deutsch"},
    "fr": {"name": "French", "native": "Français"},
    "ko": {"name": "Korean", "native": "한국어"},
    "it": {"name": "Italian", "native": "Italiano"},
    "tr": {"name": "Turkish", "native": "Türkçe"},
    "vi": {"name": "Vietnamese", "native": "Tiếng Việt"},
    "th": {"name": "Thai", "native": "ไทย"},
    "pl": {"name": "Polish", "native": "Polski"},
    "nl": {"name": "Dutch", "native": "Nederlands"},
    "uk": {"name": "Ukrainian", "native": "Українська"},
    "ro": {"name": "Romanian", "native": "Română"},
    "ms": {"name": "Malay", "native": "Bahasa Melayu"},
    "id": {"name": "Indonesian", "native": "Bahasa Indonesia"},
    "tl": {"name": "Tagalog", "native": "Tagalog"},
    "my": {"name": "Burmese", "native": "မြန်မာဘာသာ"},
    "km": {"name": "Khmer", "native": "ភាសាខ្មែរ"},
    "lo": {"name": "Lao", "native": "ລາວ"},
    "ur": {"name": "Urdu", "native": "اردو"},
    "ta": {"name": "Tamil", "native": "தமிழ்"},
    "te": {"name": "Telugu", "native": "తెలుగు"},
    "mr": {"name": "Marathi", "native": "मराठी"},
    "gu": {"name": "Gujarati", "native": "ગુજરાતી"},
    "kn": {"name": "Kannada", "native": "ಕನ್ನಡ"},
    "ml": {"name": "Malayalam", "native": "മലയാളം"},
    "pa": {"name": "Punjabi", "native": "ਪੰਜਾਬੀ"},
    "ne": {"name": "Nepali", "native": "नेपाली"},
    "si": {"name": "Sinhala", "native": "සිංහල"},
    "sv": {"name": "Swedish", "native": "Svenska"},
    "da": {"name": "Danish", "native": "Dansk"},
    "fi": {"name": "Finnish", "native": "Suomi"},
    "no": {"name": "Norwegian", "native": "Norsk"},
    "cs": {"name": "Czech", "native": "Čeština"},
    "sk": {"name": "Slovak", "native": "Slovenčina"},
    "hu": {"name": "Hungarian", "native": "Magyar"},
    "bg": {"name": "Bulgarian", "native": "Български"},
    "sr": {"name": "Serbian", "native": "Српски"},
    "hr": {"name": "Croatian", "native": "Hrvatski"},
    "sl": {"name": "Slovenian", "native": "Slovenščina"},
    "et": {"name": "Estonian", "native": "Eesti"},
    "lv": {"name": "Latvian", "native": "Latviešu"},
    "lt": {"name": "Lithuanian", "native": "Lietuvių"},
    "el": {"name": "Greek", "native": "Ελληνικά"},
    "he": {"name": "Hebrew", "native": "עברית"},
    "fa": {"name": "Persian", "native": "فارسی"},
    "sw": {"name": "Swahili", "native": "Kiswahili"},
    "am": {"name": "Amharic", "native": "አማርኛ"},
    "ha": {"name": "Hausa", "native": "Hausa"},
    "ig": {"name": "Igbo", "native": "Igbo"},
    "yo": {"name": "Yoruba", "native": "Yorùbá"},
    "zu": {"name": "Zulu", "native": "isiZulu"},
    "xh": {"name": "Xhosa", "native": "isiXhosa"},
    "af": {"name": "Afrikaans", "native": "Afrikaans"},
    "az": {"name": "Azerbaijani", "native": "Azərbaycan"},
    "ka": {"name": "Georgian", "native": "ქართული"},
    "kk": {"name": "Kazakh", "native": "Қазақша"},
    "uz": {"name": "Uzbek", "native": "Oʻzbekcha"},
    "mn": {"name": "Mongolian", "native": "Монгол"},
    "sq": {"name": "Albanian", "native": "Shqip"},
    "hy": {"name": "Armenian", "native": "Հայերեն"},
    "be": {"name": "Belarusian", "native": "Беларуская"},
    "bs": {"name": "Bosnian", "native": "Bosanski"},
    "ca": {"name": "Catalan", "native": "Català"},
    "ceb": {"name": "Cebuano", "native": "Cebuano"},
    "cy": {"name": "Welsh", "native": "Cymraeg"},
    "eo": {"name": "Esperanto", "native": "Esperanto"},
    "eu": {"name": "Basque", "native": "Euskara"},
    "fil": {"name": "Filipino", "native": "Filipino"},
    "fy": {"name": "Frisian", "native": "Frysk"},
    "ga": {"name": "Irish", "native": "Gaeilge"},
    "gd": {"name": "Scottish Gaelic", "native": "Gàidhlig"},
    "gl": {"name": "Galician", "native": "Galego"},
    "haw": {"name": "Hawaiian", "native": "ʻŌlelo Hawaiʻi"},
    "hmn": {"name": "Hmong", "native": "Hmong"},
    "ht": {"name": "Haitian Creole", "native": "Kreyòl ayisyen"},
    "is": {"name": "Icelandic", "native": "Íslenska"},
    "jv": {"name": "Javanese", "native": "Basa Jawa"},
    "ku": {"name": "Kurdish", "native": "Kurdî"},
    "ky": {"name": "Kyrgyz", "native": "Кыргызча"},
    "la": {"name": "Latin", "native": "Latina"},
    "lb": {"name": "Luxembourgish", "native": "Lëtzebuergesch"},
    "lg": {"name": "Luganda", "native": "Luganda"},
    "ln": {"name": "Lingala", "native": "Lingála"},
    "mg": {"name": "Malagasy", "native": "Malagasy"},
    "mi": {"name": "Maori", "native": "Te Reo Māori"},
    "mk": {"name": "Macedonian", "native": "Македонски"},
    "mt": {"name": "Maltese", "native": "Malti"},
    "ny": {"name": "Chichewa", "native": "Chichewa"},
    "ps": {"name": "Pashto", "native": "پښتو"},
    "sn": {"name": "Shona", "native": "chiShona"},
    "so": {"name": "Somali", "native": "Soomaali"},
    "st": {"name": "Sesotho", "native": "Sesotho"},
    "su": {"name": "Sundanese", "native": "Basa Sunda"},
    "tg": {"name": "Tajik", "native": "Тоҷикӣ"},
    "tk": {"name": "Turkmen", "native": "Türkmençe"},
    "ug": {"name": "Uyghur", "native": "ئۇيغۇرچە"},
    "yi": {"name": "Yiddish", "native": "ייִדיש"},
    # Additional NLLB-200 exclusive languages
    "ace_arab": {"name": "Acehnese (Arabic script)", "native": "أتشيه"},
    "ace": {"name": "Acehnese", "native": "Acèh"},
    "acm": {"name": "Mesopotamian Arabic", "native": "عراقي"},
    "acq": {"name": "Ta'izzi-Adeni Arabic", "native": "تعزية-عدنية"},
    "aeb": {"name": "Tunisian Arabic", "native": "تونسي"},
    "ajp": {"name": "South Levantine Arabic", "native": "شامي"},
    "aka": {"name": "Akan", "native": "Akan"},
    "apc": {"name": "North Levantine Arabic", "native": "شامي شمالي"},
    "ar_latn": {"name": "Arabic (Latin script)", "native": "Arabic (Latin)"},
    "ars": {"name": "Najdi Arabic", "native": "نجدي"},
    "ary": {"name": "Moroccan Arabic", "native": "الدارجة"},
    "arz": {"name": "Egyptian Arabic", "native": "مصري"},
    "as": {"name": "Assamese", "native": "অসমীয়া"},
    "ast": {"name": "Asturian", "native": "Asturianu"},
    "awa": {"name": "Awadhi", "native": "अवधी"},
    "ayr": {"name": "Central Aymara", "native": "Aymar aru"},
    "azb": {"name": "South Azerbaijani", "native": "تۆرکجه"},
    "ba": {"name": "Bashkir", "native": "Башҡортса"},
    "bam": {"name": "Bambara", "native": "Bamanankan"},
    "ban": {"name": "Balinese", "native": "Basa Bali"},
    "bem": {"name": "Bemba", "native": "Ichibemba"},
    "bho": {"name": "Bhojpuri", "native": "भोजपुरी"},
    "bjn_arab": {"name": "Banjar (Arabic script)", "native": "بنجر"},
    "bjn": {"name": "Banjar", "native": "Bahasa Banjar"},
    "bo": {"name": "Tibetan", "native": "བོད་སྐད་"},
    "bug": {"name": "Buginese", "native": "Basa Ugi"},
    "crh": {"name": "Crimean Tatar", "native": "Qırımtatar tili"},
    "cjk": {"name": "Chokwe", "native": "Chokwe"},
    "ckb": {"name": "Central Kurdish", "native": "کوردیی ناوەندی"},
    "dik": {"name": "Southwestern Dinka", "native": "Thuɔŋjäŋ"},
    "dyu": {"name": "Dyula", "native": "Jula"},
    "dz": {"name": "Dzongkha", "native": "རྫོང་ཁ"},
    "ee": {"name": "Ewe", "native": "Eʋegbe"},
    "fo": {"name": "Faroese", "native": "Føroyskt"},
    "fj": {"name": "Fijian", "native": "Na Vosa Vakaviti"},
    "fon": {"name": "Fon", "native": "Fɔngbe"},
    "fur": {"name": "Friulian", "native": "Furlan"},
    "fuv": {"name": "Nigerian Fulfulde", "native": "Fulfulde"},
    "om": {"name": "West Central Oromo", "native": "Oromoo"},
    # NOTE(review): this native name looks truncated (the usual endonym is
    # presumably "Avañe'ẽ") - confirm before changing the emitted value.
    "gn": {"name": "Guarani", "native": "Avañe'"},
    "hne": {"name": "Chhattisgarhi", "native": "छत्तीसगढ़ी"},
    "ilo": {"name": "Iloko", "native": "Ilokano"},
    "kab": {"name": "Kabyle", "native": "Taqbaylit"},
    "kac": {"name": "Jingpho", "native": "Jinghpaw"},
    "kam": {"name": "Kamba", "native": "Kikamba"},
    "ks": {"name": "Kashmiri", "native": "کٲشُر"},
    "ks_deva": {"name": "Kashmiri (Devanagari)", "native": "कॉशुर"},
    # NOTE(review): this native name looks truncated (presumably "Kabɩyɛ") -
    # confirm before changing the emitted value.
    "kbp": {"name": "Kabiyè", "native": "Kabɩ"},
    "kea": {"name": "Kabuverdianu", "native": "Kabuverdianu"},
    "ki": {"name": "Kikuyu", "native": "Gĩkũyũ"},
    "rw": {"name": "Kinyarwanda", "native": "Ikinyarwanda"},
    "kmb": {"name": "Kimbundu", "native": "Kimbundu"},
    "knc_arab": {"name": "Kanuri (Arabic script)", "native": "كانوري"},
    "knc": {"name": "Kanuri", "native": "Kanuri"},
    "kg": {"name": "Kongo", "native": "Kikongo"},
    "lij": {"name": "Ligurian", "native": "Ligure"},
    "li": {"name": "Limburgish", "native": "Limburgs"},
    "lmo": {"name": "Lombard", "native": "Lombard"},
    "ltg": {"name": "Latgalian", "native": "Latgalīšu"},
    "lua": {"name": "Luba-Kasai", "native": "Tshiluba"},
    "luo": {"name": "Luo", "native": "Dholuo"},
    "lus": {"name": "Mizo", "native": "Mizo ṭawng"},
    "mag": {"name": "Magahi", "native": "मगही"},
    "mai": {"name": "Maithili", "native": "मैथिली"},
    "min_arab": {"name": "Minangkabau (Arabic)", "native": "مينڠكاباو"},
    "min": {"name": "Minangkabau", "native": "Baso Minangkabau"},
    "mni": {"name": "Meitei", "native": "মৈতৈলোন্"},
    "mos": {"name": "Mossi", "native": "Mooré"},
    "nn": {"name": "Norwegian Nynorsk", "native": "Nynorsk"},
    "nb": {"name": "Norwegian Bokmål", "native": "Bokmål"},
    "nso": {"name": "Northern Sotho", "native": "Sesotho sa Leboa"},
    "nus": {"name": "Nuer", "native": "Thok Naath"},
    "oc": {"name": "Occitan", "native": "Occitan"},
    "or": {"name": "Odia", "native": "ଓଡ଼ିଆ"},
    "pag": {"name": "Pangasinan", "native": "Pangasinan"},
    "pap": {"name": "Papiamento", "native": "Papiamentu"},
    "prs": {"name": "Dari", "native": "دری"},
    "qu": {"name": "Ayacucho Quechua", "native": "Chanka Qhichwa"},
    "rn": {"name": "Rundi", "native": "Ikirundi"},
    "sg": {"name": "Sango", "native": "Sängö"},
    "sa": {"name": "Sanskrit", "native": "संस्कृतम्"},
    "sat": {"name": "Santali", "native": "ᱥᱟᱱᱛᱟᱲᱤ"},
    "scn": {"name": "Sicilian", "native": "Sicilianu"},
    "shn": {"name": "Shan", "native": "လိၵ်ႈတႆး"},
    "sm": {"name": "Samoan", "native": "Gagana Sāmoa"},
    "sd": {"name": "Sindhi", "native": "سنڌي"},
    "sc": {"name": "Sardinian", "native": "Sardu"},
    "ss": {"name": "Swazi", "native": "SiSwati"},
    "szl": {"name": "Silesian", "native": "Ślōnski"},
    "taq": {"name": "Tamasheq", "native": "Tamasheq"},
    "taq_tfng": {"name": "Tamasheq (Tifinagh)", "native": "ⵜⴰⵎⴰⵛⴰⵆ"},
    "tt": {"name": "Tatar", "native": "Татарча"},
    "ti": {"name": "Tigrinya", "native": "ትግርኛ"},
    "tpi": {"name": "Tok Pisin", "native": "Tok Pisin"},
    "tn": {"name": "Tswana", "native": "Setswana"},
    "ts": {"name": "Tsonga", "native": "Xitsonga"},
    "tum": {"name": "Tumbuka", "native": "Chitumbuka"},
    "tw": {"name": "Twi", "native": "Twi"},
    "tzm": {"name": "Central Atlas Tamazight", "native": "ⵜⴰⵎⴰⵣⵉⵖⵜ"},
    "umb": {"name": "Umbundu", "native": "Umbundu"},
    "vec": {"name": "Venetian", "native": "Vèneto"},
    "war": {"name": "Waray", "native": "Winaray"},
    "wo": {"name": "Wolof", "native": "Wolof"},
    "yue": {"name": "Cantonese", "native": "粵語"},
    "zh_hant": {"name": "Chinese (Traditional)", "native": "繁體中文"},
}

# Static metadata per model; its keys double as the set of accepted values
# for the ``model`` query parameter.
_MODEL_INFO = {
    "m2m100": {
        "name": "M2M100",
        "languages": 105,
        "license": "Apache 2.0",
        "commercial_use": True,
        "model_id": "facebook/m2m100_418M"
    },
    "nllb200": {
        "name": "NLLB-200 (FLORES-200)",
        "languages": 204,
        "license": "CC-BY-NC 4.0",
        "commercial_use": False,
        "model_id": "facebook/nllb-200-distilled-600M",
        "note": "Includes multiple script variants for some languages"
    }
}


def _language_entry(code):
    """Build the response item for one language code, with a single table lookup."""
    names = _LANG_NAMES.get(code, {})
    # Codes without a display entry are shown as their upper-cased code.
    fallback = code.upper()
    return {
        "code": code,
        "name": names.get("name", fallback),
        "native_name": names.get("native", fallback)
    }


@app.get("/api/supported-languages")
async def get_supported_languages(model: str = "m2m100"):
    """
    Get list of supported languages for specified model
    - **model**: Model type ('m2m100' or 'nllb200')
    """
    # Returns a dict with the model metadata, the language entries sorted by
    # code, their count and a fixed note. Unknown models are rejected with 400.
    if model not in _MODEL_INFO:
        raise HTTPException(status_code=400, detail="Invalid model. Choose 'm2m100' or 'nllb200'")

    # Get all supported language codes from translator based on model type
    supported_codes = sorted(translator.get_supported_languages(model).keys())

    # Build language list
    languages = [_language_entry(code) for code in supported_codes]

    return {
        # Shallow copy so callers can never mutate the shared module constant.
        "model": dict(_MODEL_INFO[model]),
        "languages": languages,
        "total_languages": len(languages),
        "note": "All language pairs are supported (any-to-any translation)"
    }