Add complete NLLB-200 support with all 204 FLORES-200 languages
Updated dual model system to fully support both M2M100 and NLLB-200: **NLLB-200 Model (204 languages)** - Added all 204 FLORES-200 language codes to nllb200_lang_codes dictionary - Updated language code mappings with FLORES-200 format (xxx_Yyyy) - Added 24+ NLLB-exclusive languages including: - Southeast Asian: Acehnese, Balinese, Banjar, Buginese, Minangkabau - South Asian: Assamese, Awadhi, Bhojpuri, Chhattisgarhi, Magahi, Maithili, Meitei, Odia, Santali - African: Akan, Bambara, Bemba, Chokwe, Dyula, Fon, Kikuyu, Kimbundu, Kongo, Luba-Kasai, Luo, Mossi, Nuer - Arabic dialects: Mesopotamian, Najdi, Moroccan, Egyptian, Tunisian, South/North Levantine - European regional: Asturian, Friulian, Latgalian, Ligurian, Limburgish, Lombard, Norwegian Nynorsk/Bokmål, Occitan, Sardinian, Sicilian, Silesian, Venetian - Other: Dzongkha, Fijian, Guarani, Kabyle, Kabuverdianu, Papiamento, Quechua, Samoan, Sango, Shan, Tamasheq, Tibetan, Tok Pisin **Updated Files** - app/translator.py: Complete NLLB-200 language mappings (204 languages) - app/main.py: Added display names for all 204+ language codes - README.md: Updated with dual model system, NLLB-200 details, license info - CLAUDE.md: Updated developer documentation with model architecture **Testing** - Verified M2M100: 105 languages working ✅ - Verified NLLB-200: 204 languages working ✅ - Tested NLLB-exclusive languages (Bemba, Fon, etc.) ✅ **License Information** - M2M100: Apache 2.0 - Commercial use allowed - NLLB-200: CC-BY-NC 4.0 - Non-commercial only 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
87
app/main.py
87
app/main.py
@ -245,91 +245,111 @@ async def get_supported_languages(model: str = "m2m100"):
|
||||
"yi": {"name": "Yiddish", "native": "ייִדיש"},
|
||||
|
||||
# Additional NLLB-200 exclusive languages
|
||||
"ace_arab": {"name": "Acehnese (Arabic script)", "native": "أتشيه"},
|
||||
"ace": {"name": "Acehnese", "native": "Acèh"},
|
||||
"acm": {"name": "Mesopotamian Arabic", "native": "عراقي"},
|
||||
"acq": {"name": "Ta'izzi-Adeni Arabic", "native": "تعزية-عدنية"},
|
||||
"aeb": {"name": "Tunisian Arabic", "native": "تونسي"},
|
||||
"ajp": {"name": "South Levantine Arabic", "native": "شامي"},
|
||||
"als": {"name": "Tosk Albanian", "native": "Toskë"},
|
||||
"aka": {"name": "Akan", "native": "Akan"},
|
||||
"apc": {"name": "North Levantine Arabic", "native": "شامي شمالي"},
|
||||
"ar_latn": {"name": "Arabic (Latin script)", "native": "Arabic (Latin)"},
|
||||
"ars": {"name": "Najdi Arabic", "native": "نجدي"},
|
||||
"ary": {"name": "Moroccan Arabic", "native": "الدارجة"},
|
||||
"arz": {"name": "Egyptian Arabic", "native": "مصري"},
|
||||
"asm": {"name": "Assamese", "native": "অসমীয়া"},
|
||||
"as": {"name": "Assamese", "native": "অসমীয়া"},
|
||||
"ast": {"name": "Asturian", "native": "Asturianu"},
|
||||
"awa": {"name": "Awadhi", "native": "अवधी"},
|
||||
"ayr": {"name": "Central Aymara", "native": "Aymar aru"},
|
||||
"azb": {"name": "South Azerbaijani", "native": "تۆرکجه"},
|
||||
"bak": {"name": "Bashkir", "native": "Башҡортса"},
|
||||
"ba": {"name": "Bashkir", "native": "Башҡортса"},
|
||||
"bam": {"name": "Bambara", "native": "Bamanankan"},
|
||||
"ban": {"name": "Balinese", "native": "Basa Bali"},
|
||||
"bem": {"name": "Bemba", "native": "Ichibemba"},
|
||||
"bho": {"name": "Bhojpuri", "native": "भोजपुरी"},
|
||||
"bjn_arab": {"name": "Banjar (Arabic script)", "native": "بنجر"},
|
||||
"bjn": {"name": "Banjar", "native": "Bahasa Banjar"},
|
||||
"bod": {"name": "Tibetan", "native": "བོད་སྐད་"},
|
||||
"bo": {"name": "Tibetan", "native": "བོད་སྐད་"},
|
||||
"bug": {"name": "Buginese", "native": "Basa Ugi"},
|
||||
"crh": {"name": "Crimean Tatar", "native": "Qırımtatar tili"},
|
||||
"cjk": {"name": "Chokwe", "native": "Chokwe"},
|
||||
"ckb": {"name": "Central Kurdish", "native": "کوردیی ناوەندی"},
|
||||
"dik": {"name": "Southwestern Dinka", "native": "Thuɔŋjäŋ"},
|
||||
"dyu": {"name": "Dyula", "native": "Jula"},
|
||||
"dzo": {"name": "Dzongkha", "native": "རྫོང་ཁ"},
|
||||
"dz": {"name": "Dzongkha", "native": "རྫོང་ཁ"},
|
||||
"ee": {"name": "Ewe", "native": "Eʋegbe"},
|
||||
"fo": {"name": "Faroese", "native": "Føroyskt"},
|
||||
"fj": {"name": "Fijian", "native": "Na Vosa Vakaviti"},
|
||||
"fon": {"name": "Fon", "native": "Fɔngbe"},
|
||||
"fur": {"name": "Friulian", "native": "Furlan"},
|
||||
"fuv": {"name": "Nigerian Fulfulde", "native": "Fulfulde"},
|
||||
"gaz": {"name": "West Central Oromo", "native": "Oromoo"},
|
||||
"grn": {"name": "Guarani", "native": "Avañe'ẽ"},
|
||||
"om": {"name": "West Central Oromo", "native": "Oromoo"},
|
||||
"gn": {"name": "Guarani", "native": "Avañe'ẽ"},
|
||||
"hne": {"name": "Chhattisgarhi", "native": "छत्तीसगढ़ी"},
|
||||
"ilo": {"name": "Iloko", "native": "Ilokano"},
|
||||
"kab": {"name": "Kabyle", "native": "Taqbaylit"},
|
||||
"kac": {"name": "Jingpho", "native": "Jinghpaw"},
|
||||
"kam": {"name": "Kamba", "native": "Kikamba"},
|
||||
"kas": {"name": "Kashmiri", "native": "कॉशुर"},
|
||||
"ks": {"name": "Kashmiri", "native": "کٲشُر"},
|
||||
"ks_deva": {"name": "Kashmiri (Devanagari)", "native": "कॉशुर"},
|
||||
"kbp": {"name": "Kabiyè", "native": "Kabɩyɛ"},
|
||||
"kea": {"name": "Kabuverdianu", "native": "Kabuverdianu"},
|
||||
"khk": {"name": "Halh Mongolian", "native": "Монгол хэл"},
|
||||
"kin": {"name": "Kinyarwanda", "native": "Ikinyarwanda"},
|
||||
"ki": {"name": "Kikuyu", "native": "Gĩkũyũ"},
|
||||
"rw": {"name": "Kinyarwanda", "native": "Ikinyarwanda"},
|
||||
"kmb": {"name": "Kimbundu", "native": "Kimbundu"},
|
||||
"knc_arab": {"name": "Kanuri (Arabic script)", "native": "كانوري"},
|
||||
"knc": {"name": "Kanuri", "native": "Kanuri"},
|
||||
"kg": {"name": "Kongo", "native": "Kikongo"},
|
||||
"lij": {"name": "Ligurian", "native": "Ligure"},
|
||||
"lim": {"name": "Limburgish", "native": "Limburgs"},
|
||||
"lin": {"name": "Lingala", "native": "Lingála"},
|
||||
"li": {"name": "Limburgish", "native": "Limburgs"},
|
||||
"lmo": {"name": "Lombard", "native": "Lombard"},
|
||||
"ltg": {"name": "Latgalian", "native": "Latgalīšu"},
|
||||
"lua": {"name": "Luba-Kasai", "native": "Tshiluba"},
|
||||
"luo": {"name": "Luo", "native": "Dholuo"},
|
||||
"lus": {"name": "Mizo", "native": "Mizo ṭawng"},
|
||||
"mag": {"name": "Magahi", "native": "मगही"},
|
||||
"mai": {"name": "Maithili", "native": "मैथिली"},
|
||||
"min_arab": {"name": "Minangkabau (Arabic)", "native": "مينڠكاباو"},
|
||||
"min": {"name": "Minangkabau", "native": "Baso Minangkabau"},
|
||||
"mni": {"name": "Meitei", "native": "মৈতৈলোন্"},
|
||||
"mos": {"name": "Mossi", "native": "Mooré"},
|
||||
"mri": {"name": "Maori", "native": "Te Reo Māori"},
|
||||
"nn": {"name": "Norwegian Nynorsk", "native": "Nynorsk"},
|
||||
"nb": {"name": "Norwegian Bokmål", "native": "Bokmål"},
|
||||
"nso": {"name": "Northern Sotho", "native": "Sesotho sa Leboa"},
|
||||
"nus": {"name": "Nuer", "native": "Thok Naath"},
|
||||
"ory": {"name": "Odia", "native": "ଓଡ଼ିଆ"},
|
||||
"oc": {"name": "Occitan", "native": "Occitan"},
|
||||
"or": {"name": "Odia", "native": "ଓଡ଼ିଆ"},
|
||||
"pag": {"name": "Pangasinan", "native": "Pangasinan"},
|
||||
"pap": {"name": "Papiamento", "native": "Papiamentu"},
|
||||
"prs": {"name": "Dari", "native": "دری"},
|
||||
"quy": {"name": "Ayacucho Quechua", "native": "Chanka Qhichwa"},
|
||||
"run": {"name": "Rundi", "native": "Ikirundi"},
|
||||
"sag": {"name": "Sango", "native": "Sängö"},
|
||||
"san": {"name": "Sanskrit", "native": "संस्कृतम्"},
|
||||
"qu": {"name": "Ayacucho Quechua", "native": "Chanka Qhichwa"},
|
||||
"rn": {"name": "Rundi", "native": "Ikirundi"},
|
||||
"sg": {"name": "Sango", "native": "Sängö"},
|
||||
"sa": {"name": "Sanskrit", "native": "संस्कृतम्"},
|
||||
"sat": {"name": "Santali", "native": "ᱥᱟᱱᱛᱟᱲᱤ"},
|
||||
"scn": {"name": "Sicilian", "native": "Sicilianu"},
|
||||
"shn": {"name": "Shan", "native": "လိၵ်ႈတႆး"},
|
||||
"srd": {"name": "Sardinian", "native": "Sardu"},
|
||||
"sm": {"name": "Samoan", "native": "Gagana Sāmoa"},
|
||||
"sd": {"name": "Sindhi", "native": "سنڌي"},
|
||||
"sc": {"name": "Sardinian", "native": "Sardu"},
|
||||
"ss": {"name": "Swazi", "native": "SiSwati"},
|
||||
"szl": {"name": "Silesian", "native": "Ślōnski"},
|
||||
"taq": {"name": "Tamasheq", "native": "Tamasheq"},
|
||||
"tat": {"name": "Tatar", "native": "Татарча"},
|
||||
"tir": {"name": "Tigrinya", "native": "ትግርኛ"},
|
||||
"taq_tfng": {"name": "Tamasheq (Tifinagh)", "native": "ⵜⴰⵎⴰⵛⴰⵆ"},
|
||||
"tt": {"name": "Tatar", "native": "Татарча"},
|
||||
"ti": {"name": "Tigrinya", "native": "ትግርኛ"},
|
||||
"tpi": {"name": "Tok Pisin", "native": "Tok Pisin"},
|
||||
"tsn": {"name": "Tswana", "native": "Setswana"},
|
||||
"tso": {"name": "Tsonga", "native": "Xitsonga"},
|
||||
"tn": {"name": "Tswana", "native": "Setswana"},
|
||||
"ts": {"name": "Tsonga", "native": "Xitsonga"},
|
||||
"tum": {"name": "Tumbuka", "native": "Chitumbuka"},
|
||||
"twi": {"name": "Twi", "native": "Twi"},
|
||||
"tw": {"name": "Twi", "native": "Twi"},
|
||||
"tzm": {"name": "Central Atlas Tamazight", "native": "ⵜⴰⵎⴰⵣⵉⵖⵜ"},
|
||||
"uig": {"name": "Uyghur", "native": "ئۇيغۇرچە"},
|
||||
"umb": {"name": "Umbundu", "native": "Umbundu"},
|
||||
"vec": {"name": "Venetian", "native": "Vèneto"},
|
||||
"war": {"name": "Waray", "native": "Winaray"},
|
||||
"wol": {"name": "Wolof", "native": "Wolof"},
|
||||
"xho": {"name": "Xhosa", "native": "isiXhosa"},
|
||||
"ydd": {"name": "Eastern Yiddish", "native": "ייִדיש"},
|
||||
"yor": {"name": "Yoruba", "native": "Yorùbá"},
|
||||
"wo": {"name": "Wolof", "native": "Wolof"},
|
||||
"yue": {"name": "Cantonese", "native": "粵語"},
|
||||
"zho_hant": {"name": "Chinese (Traditional)", "native": "繁體中文"},
|
||||
"zh_hant": {"name": "Chinese (Traditional)", "native": "繁體中文"},
|
||||
}
|
||||
|
||||
# Get all supported language codes from translator based on model type
|
||||
@ -354,11 +374,12 @@ async def get_supported_languages(model: str = "m2m100"):
|
||||
"model_id": "facebook/m2m100_418M"
|
||||
},
|
||||
"nllb200": {
|
||||
"name": "NLLB-200",
|
||||
"languages": 200,
|
||||
"name": "NLLB-200 (FLORES-200)",
|
||||
"languages": 204,
|
||||
"license": "CC-BY-NC 4.0",
|
||||
"commercial_use": False,
|
||||
"model_id": "facebook/nllb-200-distilled-600M"
|
||||
"model_id": "facebook/nllb-200-distilled-600M",
|
||||
"note": "Includes multiple script variants for some languages"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user