Add dual model support: M2M100 and NLLB-200

- Added optional 'model' parameter to translation request (default: m2m100)
- M2M100: 105 languages, Apache 2.0 License (commercial OK)
- NLLB-200: 200 languages, CC-BY-NC 4.0 License (non-commercial only)
- Updated /api/translate endpoint to accept model selection
- Updated /api/supported-languages to accept a 'model' parameter (default: m2m100) and list the languages supported by that model
- Added comprehensive language name mappings for all NLLB-200 languages
- Both models can be used independently with automatic model loading
- The /api/supported-languages response now includes model information (name, language count, license, commercial-use status, and model ID)

Example usage:
- Default (M2M100): {"text": "Hello", "source_lang": "en", "target_lang": "ko"}
- NLLB-200: {"text": "Hello", "source_lang": "en", "target_lang": "ko", "model": "nllb200"}

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2025-11-11 15:57:00 +09:00
parent 228f6c38e5
commit 28e26d19b6
3 changed files with 434 additions and 61 deletions

View File

@ -103,7 +103,8 @@ async def translate_text(request: TranslationRequest):
translated_text, model_used = translator.translate(
text=request.text,
source_lang=request.source_lang,
target_lang=request.target_lang
target_lang=request.target_lang,
model_type=request.model
)
return TranslationResponse(
@ -125,8 +126,15 @@ async def translate_text(request: TranslationRequest):
@app.get("/api/supported-languages")
async def get_supported_languages():
"""Get list of supported languages"""
async def get_supported_languages(model: str = "m2m100"):
"""
Get list of supported languages for specified model
- **model**: Model type ('m2m100' or 'nllb200')
"""
if model not in ["m2m100", "nllb200"]:
raise HTTPException(status_code=400, detail="Invalid model. Choose 'm2m100' or 'nllb200'")
# Language names mapping
lang_names = {
@ -235,10 +243,97 @@ async def get_supported_languages():
"tk": {"name": "Turkmen", "native": "Türkmençe"},
"ug": {"name": "Uyghur", "native": "ئۇيغۇرچە"},
"yi": {"name": "Yiddish", "native": "ייִדיש"},
# Additional NLLB-200 exclusive languages
"ace": {"name": "Acehnese", "native": "Acèh"},
"acm": {"name": "Mesopotamian Arabic", "native": "عراقي"},
"acq": {"name": "Ta'izzi-Adeni Arabic", "native": "تعزية-عدنية"},
"aeb": {"name": "Tunisian Arabic", "native": "تونسي"},
"ajp": {"name": "South Levantine Arabic", "native": "شامي"},
"als": {"name": "Tosk Albanian", "native": "Toskë"},
"ars": {"name": "Najdi Arabic", "native": "نجدي"},
"ary": {"name": "Moroccan Arabic", "native": "الدارجة"},
"arz": {"name": "Egyptian Arabic", "native": "مصري"},
"asm": {"name": "Assamese", "native": "অসমীয়া"},
"ast": {"name": "Asturian", "native": "Asturianu"},
"awa": {"name": "Awadhi", "native": "अवधी"},
"ayr": {"name": "Central Aymara", "native": "Aymar aru"},
"azb": {"name": "South Azerbaijani", "native": "تۆرکجه"},
"bak": {"name": "Bashkir", "native": "Башҡортса"},
"bam": {"name": "Bambara", "native": "Bamanankan"},
"ban": {"name": "Balinese", "native": "Basa Bali"},
"bho": {"name": "Bhojpuri", "native": "भोजपुरी"},
"bjn": {"name": "Banjar", "native": "Bahasa Banjar"},
"bod": {"name": "Tibetan", "native": "བོད་སྐད་"},
"bug": {"name": "Buginese", "native": "Basa Ugi"},
"crh": {"name": "Crimean Tatar", "native": "Qırımtatar tili"},
"cjk": {"name": "Chokwe", "native": "Chokwe"},
"ckb": {"name": "Central Kurdish", "native": "کوردیی ناوەندی"},
"dik": {"name": "Southwestern Dinka", "native": "Thuɔŋjäŋ"},
"dyu": {"name": "Dyula", "native": "Jula"},
"dzo": {"name": "Dzongkha", "native": "རྫོང་ཁ"},
"fur": {"name": "Friulian", "native": "Furlan"},
"fuv": {"name": "Nigerian Fulfulde", "native": "Fulfulde"},
"gaz": {"name": "West Central Oromo", "native": "Oromoo"},
"grn": {"name": "Guarani", "native": "Avañe'"},
"hne": {"name": "Chhattisgarhi", "native": "छत्तीसगढ़ी"},
"ilo": {"name": "Iloko", "native": "Ilokano"},
"kab": {"name": "Kabyle", "native": "Taqbaylit"},
"kac": {"name": "Jingpho", "native": "Jinghpaw"},
"kam": {"name": "Kamba", "native": "Kikamba"},
"kas": {"name": "Kashmiri", "native": "कॉशुर"},
"kea": {"name": "Kabuverdianu", "native": "Kabuverdianu"},
"khk": {"name": "Halh Mongolian", "native": "Монгол хэл"},
"kin": {"name": "Kinyarwanda", "native": "Ikinyarwanda"},
"lij": {"name": "Ligurian", "native": "Ligure"},
"lim": {"name": "Limburgish", "native": "Limburgs"},
"lin": {"name": "Lingala", "native": "Lingála"},
"lmo": {"name": "Lombard", "native": "Lombard"},
"ltg": {"name": "Latgalian", "native": "Latgalīšu"},
"luo": {"name": "Luo", "native": "Dholuo"},
"lus": {"name": "Mizo", "native": "Mizo ṭawng"},
"mag": {"name": "Magahi", "native": "मगही"},
"mai": {"name": "Maithili", "native": "मैथिली"},
"min": {"name": "Minangkabau", "native": "Baso Minangkabau"},
"mni": {"name": "Meitei", "native": "মৈতৈলোন্"},
"mos": {"name": "Mossi", "native": "Mooré"},
"mri": {"name": "Maori", "native": "Te Reo Māori"},
"nus": {"name": "Nuer", "native": "Thok Naath"},
"ory": {"name": "Odia", "native": "ଓଡ଼ିଆ"},
"pag": {"name": "Pangasinan", "native": "Pangasinan"},
"pap": {"name": "Papiamento", "native": "Papiamentu"},
"prs": {"name": "Dari", "native": "دری"},
"quy": {"name": "Ayacucho Quechua", "native": "Chanka Qhichwa"},
"run": {"name": "Rundi", "native": "Ikirundi"},
"sag": {"name": "Sango", "native": "Sängö"},
"san": {"name": "Sanskrit", "native": "संस्कृतम्"},
"sat": {"name": "Santali", "native": "ᱥᱟᱱᱛᱟᱲᱤ"},
"scn": {"name": "Sicilian", "native": "Sicilianu"},
"shn": {"name": "Shan", "native": "လိၵ်ႈတႆး"},
"srd": {"name": "Sardinian", "native": "Sardu"},
"szl": {"name": "Silesian", "native": "Ślōnski"},
"taq": {"name": "Tamasheq", "native": "Tamasheq"},
"tat": {"name": "Tatar", "native": "Татарча"},
"tir": {"name": "Tigrinya", "native": "ትግርኛ"},
"tpi": {"name": "Tok Pisin", "native": "Tok Pisin"},
"tsn": {"name": "Tswana", "native": "Setswana"},
"tso": {"name": "Tsonga", "native": "Xitsonga"},
"tum": {"name": "Tumbuka", "native": "Chitumbuka"},
"twi": {"name": "Twi", "native": "Twi"},
"tzm": {"name": "Central Atlas Tamazight", "native": "ⵜⴰⵎⴰⵣⵉⵖⵜ"},
"uig": {"name": "Uyghur", "native": "ئۇيغۇرچە"},
"vec": {"name": "Venetian", "native": "Vèneto"},
"war": {"name": "Waray", "native": "Winaray"},
"wol": {"name": "Wolof", "native": "Wolof"},
"xho": {"name": "Xhosa", "native": "isiXhosa"},
"ydd": {"name": "Eastern Yiddish", "native": "ייִדיש"},
"yor": {"name": "Yoruba", "native": "Yorùbá"},
"yue": {"name": "Cantonese", "native": "粵語"},
"zho_hant": {"name": "Chinese (Traditional)", "native": "繁體中文"},
}
# Get all supported language codes from translator
supported_codes = list(translator.lang_codes.keys())
# Get all supported language codes from translator based on model type
supported_codes = list(translator.get_supported_languages(model).keys())
# Build language list
languages = [
@ -250,7 +345,25 @@ async def get_supported_languages():
for code in sorted(supported_codes)
]
model_info = {
"m2m100": {
"name": "M2M100",
"languages": 105,
"license": "Apache 2.0",
"commercial_use": True,
"model_id": "facebook/m2m100_418M"
},
"nllb200": {
"name": "NLLB-200",
"languages": 200,
"license": "CC-BY-NC 4.0",
"commercial_use": False,
"model_id": "facebook/nllb-200-distilled-600M"
}
}
return {
"model": model_info[model],
"languages": languages,
"total_languages": len(languages),
"note": "All language pairs are supported (any-to-any translation)"