Add complete NLLB-200 support with all 204 FLORES-200 languages

Updated dual model system to fully support both M2M100 and NLLB-200:

**NLLB-200 Model (204 languages)**
- Added all 204 FLORES-200 language codes to nllb200_lang_codes dictionary
- Updated language code mappings with FLORES-200 format (xxx_Yyyy)
- Added 24+ NLLB-exclusive languages including:
  - Southeast Asian: Acehnese, Balinese, Banjar, Buginese, Minangkabau
  - South Asian: Assamese, Awadhi, Bhojpuri, Chhattisgarhi, Magahi, Maithili, Meitei, Odia, Santali
  - African: Akan, Bambara, Bemba, Chokwe, Dyula, Fon, Kikuyu, Kimbundu, Kongo, Luba-Kasai, Luo, Mossi, Nuer
  - Arabic dialects: Mesopotamian, Najdi, Moroccan, Egyptian, Tunisian, South/North Levantine
  - European regional: Asturian, Friulian, Latgalian, Ligurian, Limburgish, Lombard, Norwegian Nynorsk/Bokmål, Occitan, Sardinian, Sicilian, Silesian, Venetian
  - Other: Dzongkha, Fijian, Guarani, Kabyle, Kabuverdianu, Papiamento, Quechua, Samoan, Sango, Shan, Tamasheq, Tibetan, Tok Pisin

**Updated Files**
- app/translator.py: Complete NLLB-200 language mappings (204 languages)
- app/main.py: Added display names for all 204+ language codes
- README.md: Updated with dual model system, NLLB-200 details, license info
- CLAUDE.md: Updated developer documentation with model architecture

**Testing**
- Verified M2M100: 105 languages working
- Verified NLLB-200: 204 languages working
- Tested NLLB-exclusive languages (Bemba, Fon, etc.)

**License Information**
- M2M100: Apache 2.0 - Commercial use allowed
- NLLB-200: CC-BY-NC 4.0 - Non-commercial only

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2025-11-11 16:19:50 +09:00
parent 5a99d081ab
commit 578be1fd55
4 changed files with 387 additions and 250 deletions

View File

@ -143,203 +143,214 @@ class TranslationService:
"yi": "yi", # Yiddish
}
# NLLB-200 supported language codes (200 languages)
# NLLB uses different format: xxx_Yyyy (language_Script)
# Full list: https://huggingface.co/facebook/nllb-200-distilled-600M
# NLLB-200 supported language codes (204 languages from FLORES-200)
# NLLB uses format: xxx_Yyyy (language_Script)
# Full list: https://github.com/facebookresearch/flores/blob/main/flores200/README.md
self.nllb200_lang_codes = {
# Major languages
"en": "eng_Latn", # English
"zh": "zho_Hans", # Chinese (Simplified)
"es": "spa_Latn", # Spanish
"ar": "arb_Arab", # Arabic (Standard)
"hi": "hin_Deva", # Hindi
"bn": "ben_Beng", # Bengali
"pt": "por_Latn", # Portuguese
"ru": "rus_Cyrl", # Russian
"ja": "jpn_Jpan", # Japanese
"de": "deu_Latn", # German
"fr": "fra_Latn", # French
"ko": "kor_Hang", # Korean
"it": "ita_Latn", # Italian
"tr": "tur_Latn", # Turkish
"vi": "vie_Latn", # Vietnamese
"th": "tha_Thai", # Thai
"pl": "pol_Latn", # Polish
"nl": "nld_Latn", # Dutch
"uk": "ukr_Cyrl", # Ukrainian
"ro": "ron_Latn", # Romanian
# Southeast Asian languages
"ms": "zsm_Latn", # Malay (Standard)
"id": "ind_Latn", # Indonesian
"tl": "tgl_Latn", # Tagalog
"my": "mya_Mymr", # Burmese
"km": "khm_Khmr", # Khmer
"lo": "lao_Laoo", # Lao
# South Asian languages
"ur": "urd_Arab", # Urdu
"ta": "tam_Taml", # Tamil
"te": "tel_Telu", # Telugu
"mr": "mar_Deva", # Marathi
"gu": "guj_Gujr", # Gujarati
"kn": "kan_Knda", # Kannada
"ml": "mal_Mlym", # Malayalam
"pa": "pan_Guru", # Punjabi
"ne": "npi_Deva", # Nepali
"si": "sin_Sinh", # Sinhala
# European languages
"sv": "swe_Latn", # Swedish
"da": "dan_Latn", # Danish
"fi": "fin_Latn", # Finnish
"no": "nob_Latn", # Norwegian (Bokmål)
"cs": "ces_Latn", # Czech
"sk": "slk_Latn", # Slovak
"hu": "hun_Latn", # Hungarian
"bg": "bul_Cyrl", # Bulgarian
"sr": "srp_Cyrl", # Serbian
"hr": "hrv_Latn", # Croatian
"sl": "slv_Latn", # Slovenian
"et": "est_Latn", # Estonian
"lv": "lvs_Latn", # Latvian
"lt": "lit_Latn", # Lithuanian
"el": "ell_Grek", # Greek
"he": "heb_Hebr", # Hebrew
"fa": "pes_Arab", # Persian
# African languages
"sw": "swh_Latn", # Swahili
"am": "amh_Ethi", # Amharic
"ha": "hau_Latn", # Hausa
"ig": "ibo_Latn", # Igbo
"yo": "yor_Latn", # Yoruba
"zu": "zul_Latn", # Zulu
"xh": "xho_Latn", # Xhosa
"af": "afr_Latn", # Afrikaans
"sn": "sna_Latn", # Shona
"so": "som_Latn", # Somali
# Other languages
"az": "azj_Latn", # Azerbaijani (North)
"ka": "kat_Geor", # Georgian
"kk": "kaz_Cyrl", # Kazakh
"uz": "uzn_Latn", # Uzbek (Northern)
"mn": "khk_Cyrl", # Mongolian (Halh)
"sq": "als_Latn", # Albanian
"hy": "hye_Armn", # Armenian
"be": "bel_Cyrl", # Belarusian
"bs": "bos_Latn", # Bosnian
"ca": "cat_Latn", # Catalan
"ceb": "ceb_Latn", # Cebuano
"cy": "cym_Latn", # Welsh
"eo": "epo_Latn", # Esperanto
"eu": "eus_Latn", # Basque
"gl": "glg_Latn", # Galician
"is": "isl_Latn", # Icelandic
"jv": "jav_Latn", # Javanese
"ku": "kmr_Latn", # Kurdish (Kurmanji)
"ky": "kir_Cyrl", # Kyrgyz
"la": "lat_Latn", # Latin
"lb": "ltz_Latn", # Luxembourgish
"lg": "lug_Latn", # Luganda
"mg": "plt_Latn", # Malagasy
"mk": "mkd_Cyrl", # Macedonian
"mt": "mlt_Latn", # Maltese
"ny": "nya_Latn", # Chichewa
"ps": "pbt_Arab", # Pashto (Southern)
"st": "sot_Latn", # Sesotho
"su": "sun_Latn", # Sundanese
"tg": "tgk_Cyrl", # Tajik
"tk": "tuk_Latn", # Turkmen
"ug": "uig_Arab", # Uyghur
# Additional NLLB-200 exclusive languages (examples, 95 more)
"ace_arab": "ace_Arab", # Acehnese (Arabic script)
"ace": "ace_Latn", # Acehnese
"acm": "acm_Arab", # Mesopotamian Arabic
"acq": "acq_Arab", # Ta'izzi-Adeni Arabic
"aeb": "aeb_Arab", # Tunisian Arabic
"af": "afr_Latn", # Afrikaans
"ajp": "ajp_Arab", # South Levantine Arabic
"als": "als_Latn", # Tosk Albanian
"aka": "aka_Latn", # Akan
"am": "amh_Ethi", # Amharic
"apc": "apc_Arab", # North Levantine Arabic
"ar": "arb_Arab", # Arabic (Standard)
"ar_latn": "arb_Latn", # Arabic (Latin script)
"ars": "ars_Arab", # Najdi Arabic
"ary": "ary_Arab", # Moroccan Arabic
"arz": "arz_Arab", # Egyptian Arabic
"asm": "asm_Beng", # Assamese
"as": "asm_Beng", # Assamese
"ast": "ast_Latn", # Asturian
"awa": "awa_Deva", # Awadhi
"ayr": "ayr_Latn", # Central Aymara
"azb": "azb_Arab", # South Azerbaijani
"bak": "bak_Cyrl", # Bashkir
"az": "azj_Latn", # Azerbaijani
"ba": "bak_Cyrl", # Bashkir
"bam": "bam_Latn", # Bambara
"ban": "ban_Latn", # Balinese
"be": "bel_Cyrl", # Belarusian
"bem": "bem_Latn", # Bemba
"bn": "ben_Beng", # Bengali
"bho": "bho_Deva", # Bhojpuri
"bjn_arab": "bjn_Arab", # Banjar (Arabic script)
"bjn": "bjn_Latn", # Banjar
"bod": "bod_Tibt", # Tibetan
"bo": "bod_Tibt", # Tibetan
"bs": "bos_Latn", # Bosnian
"bug": "bug_Latn", # Buginese
"crh": "crh_Latn", # Crimean Tatar
"bg": "bul_Cyrl", # Bulgarian
"ca": "cat_Latn", # Catalan
"ceb": "ceb_Latn", # Cebuano
"cs": "ces_Latn", # Czech
"cjk": "cjk_Latn", # Chokwe
"ckb": "ckb_Arab", # Central Kurdish
"crh": "crh_Latn", # Crimean Tatar
"cy": "cym_Latn", # Welsh
"da": "dan_Latn", # Danish
"de": "deu_Latn", # German
"dik": "dik_Latn", # Southwestern Dinka
"dyu": "dyu_Latn", # Dyula
"dzo": "dzo_Tibt", # Dzongkha
"dz": "dzo_Tibt", # Dzongkha
"el": "ell_Grek", # Greek
"en": "eng_Latn", # English
"eo": "epo_Latn", # Esperanto
"et": "est_Latn", # Estonian
"eu": "eus_Latn", # Basque
"ee": "ewe_Latn", # Ewe
"fo": "fao_Latn", # Faroese
"fj": "fij_Latn", # Fijian
"fi": "fin_Latn", # Finnish
"fon": "fon_Latn", # Fon
"fr": "fra_Latn", # French
"fur": "fur_Latn", # Friulian
"fuv": "fuv_Latn", # Nigerian Fulfulde
"gaz": "gaz_Latn", # West Central Oromo
"grn": "grn_Latn", # Guarani
"om": "gaz_Latn", # West Central Oromo
"gd": "gla_Latn", # Scottish Gaelic
"ga": "gle_Latn", # Irish
"gl": "glg_Latn", # Galician
"gn": "grn_Latn", # Guarani
"gu": "guj_Gujr", # Gujarati
"ht": "hat_Latn", # Haitian Creole
"ha": "hau_Latn", # Hausa
"he": "heb_Hebr", # Hebrew
"hi": "hin_Deva", # Hindi
"hne": "hne_Deva", # Chhattisgarhi
"hr": "hrv_Latn", # Croatian
"hu": "hun_Latn", # Hungarian
"hy": "hye_Armn", # Armenian
"ig": "ibo_Latn", # Igbo
"ilo": "ilo_Latn", # Iloko
"id": "ind_Latn", # Indonesian
"is": "isl_Latn", # Icelandic
"it": "ita_Latn", # Italian
"jv": "jav_Latn", # Javanese
"ja": "jpn_Jpan", # Japanese
"kab": "kab_Latn", # Kabyle
"kac": "kac_Latn", # Jingpho
"kam": "kam_Latn", # Kamba
"kas": "kas_Arab", # Kashmiri
"kn": "kan_Knda", # Kannada
"ks": "kas_Arab", # Kashmiri (Arabic)
"ks_deva": "kas_Deva", # Kashmiri (Devanagari)
"ka": "kat_Geor", # Georgian
"kk": "kaz_Cyrl", # Kazakh
"kbp": "kbp_Latn", # Kabiyè
"kea": "kea_Latn", # Kabuverdianu
"khk": "khk_Cyrl", # Halh Mongolian
"kin": "kin_Latn", # Kinyarwanda
"mn": "khk_Cyrl", # Mongolian (Halh)
"km": "khm_Khmr", # Khmer
"ki": "kik_Latn", # Kikuyu
"rw": "kin_Latn", # Kinyarwanda
"ky": "kir_Cyrl", # Kyrgyz
"kmb": "kmb_Latn", # Kimbundu
"ku": "kmr_Latn", # Kurdish (Kurmanji)
"knc_arab": "knc_Arab", # Kanuri (Arabic script)
"knc": "knc_Latn", # Kanuri
"kg": "kon_Latn", # Kongo
"ko": "kor_Hang", # Korean
"lo": "lao_Laoo", # Lao
"lij": "lij_Latn", # Ligurian
"lim": "lim_Latn", # Limburgish
"lin": "lin_Latn", # Lingala
"li": "lim_Latn", # Limburgish
"ln": "lin_Latn", # Lingala
"lt": "lit_Latn", # Lithuanian
"lmo": "lmo_Latn", # Lombard
"ltg": "ltg_Latn", # Latgalian
"lb": "ltz_Latn", # Luxembourgish
"lua": "lua_Latn", # Luba-Kasai
"lg": "lug_Latn", # Luganda
"luo": "luo_Latn", # Luo
"lus": "lus_Latn", # Mizo
"lv": "lvs_Latn", # Latvian
"mag": "mag_Deva", # Magahi
"mai": "mai_Deva", # Maithili
"ml": "mal_Mlym", # Malayalam
"mr": "mar_Deva", # Marathi
"min_arab": "min_Arab", # Minangkabau (Arabic)
"min": "min_Latn", # Minangkabau
"mk": "mkd_Cyrl", # Macedonian
"mt": "mlt_Latn", # Maltese
"mni": "mni_Beng", # Meitei
"mos": "mos_Latn", # Mossi
"mri": "mri_Latn", # Maori
"mi": "mri_Latn", # Maori
"my": "mya_Mymr", # Burmese
"nl": "nld_Latn", # Dutch
"nn": "nno_Latn", # Norwegian Nynorsk
"nb": "nob_Latn", # Norwegian Bokmål
"ne": "npi_Deva", # Nepali
"nso": "nso_Latn", # Northern Sotho
"nus": "nus_Latn", # Nuer
"ory": "ory_Orya", # Odia
"ny": "nya_Latn", # Chichewa
"oc": "oci_Latn", # Occitan
"or": "ory_Orya", # Odia
"pag": "pag_Latn", # Pangasinan
"pa": "pan_Guru", # Punjabi
"pap": "pap_Latn", # Papiamento
"ps": "pbt_Arab", # Pashto (Southern)
"fa": "pes_Arab", # Persian
"mg": "plt_Latn", # Malagasy
"pl": "pol_Latn", # Polish
"pt": "por_Latn", # Portuguese
"prs": "prs_Arab", # Dari
"quy": "quy_Latn", # Ayacucho Quechua
"run": "run_Latn", # Rundi
"sag": "sag_Latn", # Sango
"san": "san_Deva", # Sanskrit
"sat": "sat_Beng", # Santali
"qu": "quy_Latn", # Ayacucho Quechua
"ro": "ron_Latn", # Romanian
"rn": "run_Latn", # Rundi
"ru": "rus_Cyrl", # Russian
"sg": "sag_Latn", # Sango
"sa": "san_Deva", # Sanskrit
"sat": "sat_Olck", # Santali
"scn": "scn_Latn", # Sicilian
"shn": "shn_Mymr", # Shan
"srd": "srd_Latn", # Sardinian
"si": "sin_Sinh", # Sinhala
"sk": "slk_Latn", # Slovak
"sl": "slv_Latn", # Slovenian
"sm": "smo_Latn", # Samoan
"sn": "sna_Latn", # Shona
"sd": "snd_Arab", # Sindhi
"so": "som_Latn", # Somali
"st": "sot_Latn", # Sesotho
"es": "spa_Latn", # Spanish
"sq": "als_Latn", # Albanian (Tosk)
"sc": "srd_Latn", # Sardinian
"sr": "srp_Cyrl", # Serbian
"ss": "ssw_Latn", # Swazi
"su": "sun_Latn", # Sundanese
"sv": "swe_Latn", # Swedish
"sw": "swh_Latn", # Swahili
"szl": "szl_Latn", # Silesian
"taq": "taq_Latn", # Tamasheq
"tat": "tat_Cyrl", # Tatar
"tir": "tir_Ethi", # Tigrinya
"ta": "tam_Taml", # Tamil
"taq": "taq_Latn", # Tamasheq (Latin)
"taq_tfng": "taq_Tfng", # Tamasheq (Tifinagh)
"tt": "tat_Cyrl", # Tatar
"te": "tel_Telu", # Telugu
"tg": "tgk_Cyrl", # Tajik
"tl": "tgl_Latn", # Tagalog
"th": "tha_Thai", # Thai
"ti": "tir_Ethi", # Tigrinya
"tpi": "tpi_Latn", # Tok Pisin
"tsn": "tsn_Latn", # Tswana
"tso": "tso_Latn", # Tsonga
"tn": "tsn_Latn", # Tswana
"ts": "tso_Latn", # Tsonga
"tk": "tuk_Latn", # Turkmen
"tum": "tum_Latn", # Tumbuka
"twi": "twi_Latn", # Twi
"tr": "tur_Latn", # Turkish
"tw": "twi_Latn", # Twi
"tzm": "tzm_Tfng", # Central Atlas Tamazight
"uig": "uig_Arab", # Uyghur
"ug": "uig_Arab", # Uyghur
"uk": "ukr_Cyrl", # Ukrainian
"umb": "umb_Latn", # Umbundu
"ur": "urd_Arab", # Urdu
"uz": "uzn_Latn", # Uzbek (Northern)
"vec": "vec_Latn", # Venetian
"vi": "vie_Latn", # Vietnamese
"war": "war_Latn", # Waray
"wol": "wol_Latn", # Wolof
"xho": "xho_Latn", # Xhosa
"ydd": "ydd_Hebr", # Eastern Yiddish
"yor": "yor_Latn", # Yoruba
"wo": "wol_Latn", # Wolof
"xh": "xho_Latn", # Xhosa
"yi": "ydd_Hebr", # Eastern Yiddish
"yo": "yor_Latn", # Yoruba
"yue": "yue_Hant", # Cantonese
"zho_hant": "zho_Hant", # Chinese (Traditional)
"zh": "zho_Hans", # Chinese (Simplified)
"zh_hant": "zho_Hant", # Chinese (Traditional)
"ms": "zsm_Latn", # Malay (Standard)
"zu": "zul_Latn", # Zulu
}
def get_supported_languages(self, model_type: str = "m2m100") -> Dict[str, str]: