Add complete NLLB-200 support with all 204 FLORES-200 languages

Updated dual model system to fully support both M2M100 and NLLB-200:

**NLLB-200 Model (204 languages)**
- Added all 204 FLORES-200 language codes to nllb200_lang_codes dictionary
- Updated language code mappings to use the FLORES-200 format (xxx_Yyyy)
- Added 24+ NLLB-exclusive languages, including:
  - Southeast Asian: Acehnese, Balinese, Banjar, Buginese, Minangkabau
  - South Asian: Assamese, Awadhi, Bhojpuri, Chhattisgarhi, Magahi, Maithili, Meitei, Odia, Santali
  - African: Akan, Bambara, Bemba, Chokwe, Dyula, Fon, Kikuyu, Kimbundu, Kongo, Luba-Kasai, Luo, Mossi, Nuer
  - Arabic dialects: Mesopotamian, Najdi, Moroccan, Egyptian, Tunisian, South/North Levantine
  - European regional: Asturian, Friulian, Latgalian, Ligurian, Limburgish, Lombard, Norwegian Nynorsk/Bokmål, Occitan, Sardinian, Sicilian, Silesian, Venetian
  - Other: Dzongkha, Fijian, Guarani, Kabyle, Kabuverdianu, Papiamento, Quechua, Samoan, Sango, Shan, Tamasheq, Tibetan, Tok Pisin

**Updated Files**
- app/translator.py: Complete NLLB-200 language mappings (204 languages)
- app/main.py: Added display names for all 204+ language codes
- README.md: Updated with dual model system, NLLB-200 details, license info
- CLAUDE.md: Updated developer documentation with model architecture

**Testing**
- Verified M2M100: 105 languages working 
- Verified NLLB-200: 204 languages working 
- Tested NLLB-exclusive languages (Bemba, Fon, etc.) 

**License Information**
- M2M100: MIT License - Commercial use allowed
- NLLB-200: CC-BY-NC 4.0 - Non-commercial only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2025-11-11 16:19:50 +09:00
parent 5a99d081ab
commit 578be1fd55
4 changed files with 387 additions and 250 deletions

View File

@ -245,91 +245,111 @@ async def get_supported_languages(model: str = "m2m100"):
"yi": {"name": "Yiddish", "native": "ייִדיש"},
# Additional NLLB-200 exclusive languages
"ace_arab": {"name": "Acehnese (Arabic script)", "native": "أتشيه"},
"ace": {"name": "Acehnese", "native": "Acèh"},
"acm": {"name": "Mesopotamian Arabic", "native": "عراقي"},
"acq": {"name": "Ta'izzi-Adeni Arabic", "native": "تعزية-عدنية"},
"aeb": {"name": "Tunisian Arabic", "native": "تونسي"},
"ajp": {"name": "South Levantine Arabic", "native": "شامي"},
"als": {"name": "Tosk Albanian", "native": "Toskë"},
"aka": {"name": "Akan", "native": "Akan"},
"apc": {"name": "North Levantine Arabic", "native": "شامي شمالي"},
"ar_latn": {"name": "Arabic (Latin script)", "native": "Arabic (Latin)"},
"ars": {"name": "Najdi Arabic", "native": "نجدي"},
"ary": {"name": "Moroccan Arabic", "native": "الدارجة"},
"arz": {"name": "Egyptian Arabic", "native": "مصري"},
"asm": {"name": "Assamese", "native": "অসমীয়া"},
"as": {"name": "Assamese", "native": "অসমীয়া"},
"ast": {"name": "Asturian", "native": "Asturianu"},
"awa": {"name": "Awadhi", "native": "अवधी"},
"ayr": {"name": "Central Aymara", "native": "Aymar aru"},
"azb": {"name": "South Azerbaijani", "native": "تۆرکجه"},
"bak": {"name": "Bashkir", "native": "Башҡортса"},
"ba": {"name": "Bashkir", "native": "Башҡортса"},
"bam": {"name": "Bambara", "native": "Bamanankan"},
"ban": {"name": "Balinese", "native": "Basa Bali"},
"bem": {"name": "Bemba", "native": "Ichibemba"},
"bho": {"name": "Bhojpuri", "native": "भोजपुरी"},
"bjn_arab": {"name": "Banjar (Arabic script)", "native": "بنجر"},
"bjn": {"name": "Banjar", "native": "Bahasa Banjar"},
"bod": {"name": "Tibetan", "native": "བོད་སྐད་"},
"bo": {"name": "Tibetan", "native": "བོད་སྐད་"},
"bug": {"name": "Buginese", "native": "Basa Ugi"},
"crh": {"name": "Crimean Tatar", "native": "Qırımtatar tili"},
"cjk": {"name": "Chokwe", "native": "Chokwe"},
"ckb": {"name": "Central Kurdish", "native": "کوردیی ناوەندی"},
"dik": {"name": "Southwestern Dinka", "native": "Thuɔŋjäŋ"},
"dyu": {"name": "Dyula", "native": "Jula"},
"dzo": {"name": "Dzongkha", "native": "རྫོང་ཁ"},
"dz": {"name": "Dzongkha", "native": "རྫོང་ཁ"},
"ee": {"name": "Ewe", "native": "Eʋegbe"},
"fo": {"name": "Faroese", "native": "Føroyskt"},
"fj": {"name": "Fijian", "native": "Na Vosa Vakaviti"},
"fon": {"name": "Fon", "native": "Fɔngbe"},
"fur": {"name": "Friulian", "native": "Furlan"},
"fuv": {"name": "Nigerian Fulfulde", "native": "Fulfulde"},
"gaz": {"name": "West Central Oromo", "native": "Oromoo"},
"grn": {"name": "Guarani", "native": "Avañe'"},
"om": {"name": "West Central Oromo", "native": "Oromoo"},
"gn": {"name": "Guarani", "native": "Avañe'"},
"hne": {"name": "Chhattisgarhi", "native": "छत्तीसगढ़ी"},
"ilo": {"name": "Iloko", "native": "Ilokano"},
"kab": {"name": "Kabyle", "native": "Taqbaylit"},
"kac": {"name": "Jingpho", "native": "Jinghpaw"},
"kam": {"name": "Kamba", "native": "Kikamba"},
"kas": {"name": "Kashmiri", "native": "कॉशुर"},
"ks": {"name": "Kashmiri", "native": "کٲشُر"},
"ks_deva": {"name": "Kashmiri (Devanagari)", "native": "कॉशुर"},
"kbp": {"name": "Kabiyè", "native": "Kabɩ"},
"kea": {"name": "Kabuverdianu", "native": "Kabuverdianu"},
"khk": {"name": "Halh Mongolian", "native": "Монгол хэл"},
"kin": {"name": "Kinyarwanda", "native": "Ikinyarwanda"},
"ki": {"name": "Kikuyu", "native": "Gĩkũyũ"},
"rw": {"name": "Kinyarwanda", "native": "Ikinyarwanda"},
"kmb": {"name": "Kimbundu", "native": "Kimbundu"},
"knc_arab": {"name": "Kanuri (Arabic script)", "native": "كانوري"},
"knc": {"name": "Kanuri", "native": "Kanuri"},
"kg": {"name": "Kongo", "native": "Kikongo"},
"lij": {"name": "Ligurian", "native": "Ligure"},
"lim": {"name": "Limburgish", "native": "Limburgs"},
"lin": {"name": "Lingala", "native": "Lingála"},
"li": {"name": "Limburgish", "native": "Limburgs"},
"lmo": {"name": "Lombard", "native": "Lombard"},
"ltg": {"name": "Latgalian", "native": "Latgalīšu"},
"lua": {"name": "Luba-Kasai", "native": "Tshiluba"},
"luo": {"name": "Luo", "native": "Dholuo"},
"lus": {"name": "Mizo", "native": "Mizo ṭawng"},
"mag": {"name": "Magahi", "native": "मगही"},
"mai": {"name": "Maithili", "native": "मैथिली"},
"min_arab": {"name": "Minangkabau (Arabic)", "native": "مينڠكاباو"},
"min": {"name": "Minangkabau", "native": "Baso Minangkabau"},
"mni": {"name": "Meitei", "native": "মৈতৈলোন্"},
"mos": {"name": "Mossi", "native": "Mooré"},
"mri": {"name": "Maori", "native": "Te Reo Māori"},
"nn": {"name": "Norwegian Nynorsk", "native": "Nynorsk"},
"nb": {"name": "Norwegian Bokmål", "native": "Bokmål"},
"nso": {"name": "Northern Sotho", "native": "Sesotho sa Leboa"},
"nus": {"name": "Nuer", "native": "Thok Naath"},
"ory": {"name": "Odia", "native": "ଓଡ଼ିଆ"},
"oc": {"name": "Occitan", "native": "Occitan"},
"or": {"name": "Odia", "native": "ଓଡ଼ିଆ"},
"pag": {"name": "Pangasinan", "native": "Pangasinan"},
"pap": {"name": "Papiamento", "native": "Papiamentu"},
"prs": {"name": "Dari", "native": "دری"},
"quy": {"name": "Ayacucho Quechua", "native": "Chanka Qhichwa"},
"run": {"name": "Rundi", "native": "Ikirundi"},
"sag": {"name": "Sango", "native": "Sängö"},
"san": {"name": "Sanskrit", "native": "संस्कृतम्"},
"qu": {"name": "Ayacucho Quechua", "native": "Chanka Qhichwa"},
"rn": {"name": "Rundi", "native": "Ikirundi"},
"sg": {"name": "Sango", "native": "Sängö"},
"sa": {"name": "Sanskrit", "native": "संस्कृतम्"},
"sat": {"name": "Santali", "native": "ᱥᱟᱱᱛᱟᱲᱤ"},
"scn": {"name": "Sicilian", "native": "Sicilianu"},
"shn": {"name": "Shan", "native": "လိၵ်ႈတႆး"},
"srd": {"name": "Sardinian", "native": "Sardu"},
"sm": {"name": "Samoan", "native": "Gagana Sāmoa"},
"sd": {"name": "Sindhi", "native": "سنڌي"},
"sc": {"name": "Sardinian", "native": "Sardu"},
"ss": {"name": "Swazi", "native": "SiSwati"},
"szl": {"name": "Silesian", "native": "Ślōnski"},
"taq": {"name": "Tamasheq", "native": "Tamasheq"},
"tat": {"name": "Tatar", "native": "Татарча"},
"tir": {"name": "Tigrinya", "native": "ትግርኛ"},
"taq_tfng": {"name": "Tamasheq (Tifinagh)", "native": "ⵜⴰⵎⴰⵛⴰⵆ"},
"tt": {"name": "Tatar", "native": "Татарча"},
"ti": {"name": "Tigrinya", "native": "ትግርኛ"},
"tpi": {"name": "Tok Pisin", "native": "Tok Pisin"},
"tsn": {"name": "Tswana", "native": "Setswana"},
"tso": {"name": "Tsonga", "native": "Xitsonga"},
"tn": {"name": "Tswana", "native": "Setswana"},
"ts": {"name": "Tsonga", "native": "Xitsonga"},
"tum": {"name": "Tumbuka", "native": "Chitumbuka"},
"twi": {"name": "Twi", "native": "Twi"},
"tw": {"name": "Twi", "native": "Twi"},
"tzm": {"name": "Central Atlas Tamazight", "native": "ⵜⴰⵎⴰⵣⵉⵖⵜ"},
"uig": {"name": "Uyghur", "native": "ئۇيغۇرچە"},
"umb": {"name": "Umbundu", "native": "Umbundu"},
"vec": {"name": "Venetian", "native": "Vèneto"},
"war": {"name": "Waray", "native": "Winaray"},
"wol": {"name": "Wolof", "native": "Wolof"},
"xho": {"name": "Xhosa", "native": "isiXhosa"},
"ydd": {"name": "Eastern Yiddish", "native": "ייִדיש"},
"yor": {"name": "Yoruba", "native": "Yorùbá"},
"wo": {"name": "Wolof", "native": "Wolof"},
"yue": {"name": "Cantonese", "native": "粵語"},
"zho_hant": {"name": "Chinese (Traditional)", "native": "繁體中文"},
"zh_hant": {"name": "Chinese (Traditional)", "native": "繁體中文"},
}
# Get all supported language codes from translator based on model type
@ -354,11 +374,12 @@ async def get_supported_languages(model: str = "m2m100"):
"model_id": "facebook/m2m100_418M"
},
"nllb200": {
"name": "NLLB-200",
"languages": 200,
"name": "NLLB-200 (FLORES-200)",
"languages": 204,
"license": "CC-BY-NC 4.0",
"commercial_use": False,
"model_id": "facebook/nllb-200-distilled-600M"
"model_id": "facebook/nllb-200-distilled-600M",
"note": "Includes multiple script variants for some languages"
}
}