from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
import torch
from typing import Dict
import logging

from .config import settings

logger = logging.getLogger(__name__)


class TranslationService:
    """
    Service for handling multilingual translation.

    Supports two models:
    - M2M100 (100 languages, Apache 2.0 License - Commercial use allowed)
    - NLLB-200 (200 languages, CC-BY-NC License - Non-commercial only)
    """

    def __init__(self):
        self.models: Dict[str, Dict] = {}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        # M2M100 supported language codes (the model covers 100 languages)
        # Full list: https://huggingface.co/facebook/m2m100_418M
        self.m2m100_lang_codes = {
            # Major languages
            "en": "en",  # English
            "zh": "zh",  # Chinese
            "es": "es",  # Spanish
            "ar": "ar",  # Arabic
            "hi": "hi",  # Hindi
            "bn": "bn",  # Bengali
            "pt": "pt",  # Portuguese
            "ru": "ru",  # Russian
            "ja": "ja",  # Japanese
            "de": "de",  # German
            "fr": "fr",  # French
            "ko": "ko",  # Korean
            "it": "it",  # Italian
            "tr": "tr",  # Turkish
            "vi": "vi",  # Vietnamese
            "th": "th",  # Thai
            "pl": "pl",  # Polish
            "nl": "nl",  # Dutch
            "uk": "uk",  # Ukrainian
            "ro": "ro",  # Romanian
            # Southeast Asian languages
            "ms": "ms",  # Malay
            "id": "id",  # Indonesian
            "tl": "tl",  # Tagalog
            "my": "my",  # Burmese
            "km": "km",  # Khmer
            "lo": "lo",  # Lao
            # South Asian languages
            "ur": "ur",  # Urdu
            "ta": "ta",  # Tamil
            "te": "te",  # Telugu
            "mr": "mr",  # Marathi
            "gu": "gu",  # Gujarati
            "kn": "kn",  # Kannada
            "ml": "ml",  # Malayalam
            "pa": "pa",  # Punjabi
            "ne": "ne",  # Nepali
            "si": "si",  # Sinhala
            # European languages
            "sv": "sv",  # Swedish
            "da": "da",  # Danish
            "fi": "fi",  # Finnish
            "no": "no",  # Norwegian
            "cs": "cs",  # Czech
            "sk": "sk",  # Slovak
            "hu": "hu",  # Hungarian
            "bg": "bg",  # Bulgarian
            "sr": "sr",  # Serbian
            "hr": "hr",  # Croatian
            "sl": "sl",  # Slovenian
            "et": "et",  # Estonian
            "lv": "lv",  # Latvian
            "lt": "lt",  # Lithuanian
            "el": "el",  # Greek
            "he": "he",  # Hebrew
            "fa": "fa",  # Persian
            # African languages
            "sw": "sw",  # Swahili
            "am": "am",  # Amharic
            "ha": "ha",  # Hausa
            "ig": "ig",  # Igbo
            "yo": "yo",  # Yoruba
            "zu": "zu",  # Zulu
            "xh": "xh",  # Xhosa
            "af": "af",  # Afrikaans
            # Other major languages
            "az": "az",  # Azerbaijani
            "ka": "ka",  # Georgian
            "kk": "kk",  # Kazakh
            "uz": "uz",  # Uzbek
            "mn": "mn",  # Mongolian
            # Additional languages
            "sq": "sq",  # Albanian
            "hy": "hy",  # Armenian
            "be": "be",  # Belarusian
            "bs": "bs",  # Bosnian
            "ca": "ca",  # Catalan
            "ceb": "ceb",  # Cebuano
            "cy": "cy",  # Welsh
            "eo": "eo",  # Esperanto
            "eu": "eu",  # Basque
            "fil": "fil",  # Filipino
            "fy": "fy",  # Frisian
            "ga": "ga",  # Irish
            "gd": "gd",  # Scottish Gaelic
            "gl": "gl",  # Galician
            "haw": "haw",  # Hawaiian
            "hmn": "hmn",  # Hmong
            "ht": "ht",  # Haitian Creole
            "is": "is",  # Icelandic
            "jv": "jv",  # Javanese
            "ku": "ku",  # Kurdish
            "ky": "ky",  # Kyrgyz
            "la": "la",  # Latin
            "lb": "lb",  # Luxembourgish
            "lg": "lg",  # Luganda
            "ln": "ln",  # Lingala
            "mg": "mg",  # Malagasy
            "mi": "mi",  # Maori
            "mk": "mk",  # Macedonian
            "mt": "mt",  # Maltese
            "ny": "ny",  # Chichewa
            "ps": "ps",  # Pashto
            "sn": "sn",  # Shona
            "so": "so",  # Somali
            "st": "st",  # Sesotho
            "su": "su",  # Sundanese
            "tg": "tg",  # Tajik
            "tk": "tk",  # Turkmen
            "ug": "ug",  # Uyghur
            "yi": "yi",  # Yiddish
        }

        # NLLB-200 supported language codes (200 languages)
        # NLLB uses a different code format: xxx_Yyyy (language_Script)
        # Full list: https://huggingface.co/facebook/nllb-200-distilled-600M
        self.nllb200_lang_codes = {
            # Major languages
            "en": "eng_Latn",  # English
            "zh": "zho_Hans",  # Chinese (Simplified)
"es": "spa_Latn", # Spanish "ar": "arb_Arab", # Arabic (Standard) "hi": "hin_Deva", # Hindi "bn": "ben_Beng", # Bengali "pt": "por_Latn", # Portuguese "ru": "rus_Cyrl", # Russian "ja": "jpn_Jpan", # Japanese "de": "deu_Latn", # German "fr": "fra_Latn", # French "ko": "kor_Hang", # Korean "it": "ita_Latn", # Italian "tr": "tur_Latn", # Turkish "vi": "vie_Latn", # Vietnamese "th": "tha_Thai", # Thai "pl": "pol_Latn", # Polish "nl": "nld_Latn", # Dutch "uk": "ukr_Cyrl", # Ukrainian "ro": "ron_Latn", # Romanian # Southeast Asian languages "ms": "zsm_Latn", # Malay (Standard) "id": "ind_Latn", # Indonesian "tl": "tgl_Latn", # Tagalog "my": "mya_Mymr", # Burmese "km": "khm_Khmr", # Khmer "lo": "lao_Laoo", # Lao # South Asian languages "ur": "urd_Arab", # Urdu "ta": "tam_Taml", # Tamil "te": "tel_Telu", # Telugu "mr": "mar_Deva", # Marathi "gu": "guj_Gujr", # Gujarati "kn": "kan_Knda", # Kannada "ml": "mal_Mlym", # Malayalam "pa": "pan_Guru", # Punjabi "ne": "npi_Deva", # Nepali "si": "sin_Sinh", # Sinhala # European languages "sv": "swe_Latn", # Swedish "da": "dan_Latn", # Danish "fi": "fin_Latn", # Finnish "no": "nob_Latn", # Norwegian (Bokmål) "cs": "ces_Latn", # Czech "sk": "slk_Latn", # Slovak "hu": "hun_Latn", # Hungarian "bg": "bul_Cyrl", # Bulgarian "sr": "srp_Cyrl", # Serbian "hr": "hrv_Latn", # Croatian "sl": "slv_Latn", # Slovenian "et": "est_Latn", # Estonian "lv": "lvs_Latn", # Latvian "lt": "lit_Latn", # Lithuanian "el": "ell_Grek", # Greek "he": "heb_Hebr", # Hebrew "fa": "pes_Arab", # Persian # African languages "sw": "swh_Latn", # Swahili "am": "amh_Ethi", # Amharic "ha": "hau_Latn", # Hausa "ig": "ibo_Latn", # Igbo "yo": "yor_Latn", # Yoruba "zu": "zul_Latn", # Zulu "xh": "xho_Latn", # Xhosa "af": "afr_Latn", # Afrikaans "sn": "sna_Latn", # Shona "so": "som_Latn", # Somali # Other languages "az": "azj_Latn", # Azerbaijani (North) "ka": "kat_Geor", # Georgian "kk": "kaz_Cyrl", # Kazakh "uz": "uzn_Latn", # Uzbek (Northern) "mn": "khk_Cyrl", # Mongolian (Halh) "sq": "als_Latn", # Albanian "hy": "hye_Armn", # Armenian "be": "bel_Cyrl", # Belarusian "bs": "bos_Latn", # Bosnian "ca": "cat_Latn", # Catalan "ceb": "ceb_Latn", # Cebuano "cy": "cym_Latn", # Welsh "eo": "epo_Latn", # Esperanto "eu": "eus_Latn", # Basque "gl": "glg_Latn", # Galician "is": "isl_Latn", # Icelandic "jv": "jav_Latn", # Javanese "ku": "kmr_Latn", # Kurdish (Kurmanji) "ky": "kir_Cyrl", # Kyrgyz "la": "lat_Latn", # Latin "lb": "ltz_Latn", # Luxembourgish "lg": "lug_Latn", # Luganda "mg": "plt_Latn", # Malagasy "mk": "mkd_Cyrl", # Macedonian "mt": "mlt_Latn", # Maltese "ny": "nya_Latn", # Chichewa "ps": "pbt_Arab", # Pashto (Southern) "st": "sot_Latn", # Sesotho "su": "sun_Latn", # Sundanese "tg": "tgk_Cyrl", # Tajik "tk": "tuk_Latn", # Turkmen "ug": "uig_Arab", # Uyghur # Additional NLLB-200 exclusive languages (examples, 95 more) "ace": "ace_Latn", # Acehnese "acm": "acm_Arab", # Mesopotamian Arabic "acq": "acq_Arab", # Ta'izzi-Adeni Arabic "aeb": "aeb_Arab", # Tunisian Arabic "ajp": "ajp_Arab", # South Levantine Arabic "als": "als_Latn", # Tosk Albanian "ars": "ars_Arab", # Najdi Arabic "ary": "ary_Arab", # Moroccan Arabic "arz": "arz_Arab", # Egyptian Arabic "asm": "asm_Beng", # Assamese "ast": "ast_Latn", # Asturian "awa": "awa_Deva", # Awadhi "ayr": "ayr_Latn", # Central Aymara "azb": "azb_Arab", # South Azerbaijani "bak": "bak_Cyrl", # Bashkir "bam": "bam_Latn", # Bambara "ban": "ban_Latn", # Balinese "bho": "bho_Deva", # Bhojpuri "bjn": "bjn_Latn", # Banjar "bod": "bod_Tibt", # Tibetan "bug": "bug_Latn", # 
Buginese "crh": "crh_Latn", # Crimean Tatar "cjk": "cjk_Latn", # Chokwe "ckb": "ckb_Arab", # Central Kurdish "dik": "dik_Latn", # Southwestern Dinka "dyu": "dyu_Latn", # Dyula "dzo": "dzo_Tibt", # Dzongkha "fur": "fur_Latn", # Friulian "fuv": "fuv_Latn", # Nigerian Fulfulde "gaz": "gaz_Latn", # West Central Oromo "grn": "grn_Latn", # Guarani "hne": "hne_Deva", # Chhattisgarhi "ilo": "ilo_Latn", # Iloko "kab": "kab_Latn", # Kabyle "kac": "kac_Latn", # Jingpho "kam": "kam_Latn", # Kamba "kas": "kas_Arab", # Kashmiri "kea": "kea_Latn", # Kabuverdianu "khk": "khk_Cyrl", # Halh Mongolian "kin": "kin_Latn", # Kinyarwanda "lij": "lij_Latn", # Ligurian "lim": "lim_Latn", # Limburgish "lin": "lin_Latn", # Lingala "lmo": "lmo_Latn", # Lombard "ltg": "ltg_Latn", # Latgalian "luo": "luo_Latn", # Luo "lus": "lus_Latn", # Mizo "mag": "mag_Deva", # Magahi "mai": "mai_Deva", # Maithili "min": "min_Latn", # Minangkabau "mni": "mni_Beng", # Meitei "mos": "mos_Latn", # Mossi "mri": "mri_Latn", # Maori "nus": "nus_Latn", # Nuer "ory": "ory_Orya", # Odia "pag": "pag_Latn", # Pangasinan "pap": "pap_Latn", # Papiamento "prs": "prs_Arab", # Dari "quy": "quy_Latn", # Ayacucho Quechua "run": "run_Latn", # Rundi "sag": "sag_Latn", # Sango "san": "san_Deva", # Sanskrit "sat": "sat_Beng", # Santali "scn": "scn_Latn", # Sicilian "shn": "shn_Mymr", # Shan "srd": "srd_Latn", # Sardinian "szl": "szl_Latn", # Silesian "taq": "taq_Latn", # Tamasheq "tat": "tat_Cyrl", # Tatar "tir": "tir_Ethi", # Tigrinya "tpi": "tpi_Latn", # Tok Pisin "tsn": "tsn_Latn", # Tswana "tso": "tso_Latn", # Tsonga "tum": "tum_Latn", # Tumbuka "twi": "twi_Latn", # Twi "tzm": "tzm_Tfng", # Central Atlas Tamazight "uig": "uig_Arab", # Uyghur "vec": "vec_Latn", # Venetian "war": "war_Latn", # Waray "wol": "wol_Latn", # Wolof "xho": "xho_Latn", # Xhosa "ydd": "ydd_Hebr", # Eastern Yiddish "yor": "yor_Latn", # Yoruba "yue": "yue_Hant", # Cantonese "zho_hant": "zho_Hant", # Chinese (Traditional) } def get_supported_languages(self, model_type: str = "m2m100") -> Dict[str, str]: """Get supported language codes for the specified model""" if model_type == "m2m100": return self.m2m100_lang_codes elif model_type == "nllb200": return self.nllb200_lang_codes else: raise ValueError(f"Unknown model type: {model_type}") def _get_model_info(self, source_lang: str, target_lang: str, model_type: str) -> tuple[str, str, str, str]: """Get the model name and language codes for translation""" if model_type == "m2m100": model_name = "facebook/m2m100_418M" lang_codes = self.m2m100_lang_codes elif model_type == "nllb200": model_name = "facebook/nllb-200-distilled-600M" lang_codes = self.nllb200_lang_codes else: raise ValueError(f"Unknown model type: {model_type}") src_code = lang_codes.get(source_lang) tgt_code = lang_codes.get(target_lang) if not src_code: raise ValueError(f"Source language '{source_lang}' not supported by {model_type}. Use /api/supported-languages?model={model_type} to see available languages.") if not tgt_code: raise ValueError(f"Target language '{target_lang}' not supported by {model_type}. 
    def load_model(self, source_lang: str, target_lang: str, model_type: str = "m2m100") -> None:
        """Load the translation model for a specific language pair"""
        model_name, _, _, _ = self._get_model_info(source_lang, target_lang, model_type)

        if model_name in self.models:
            logger.info(f"Model {model_name} already loaded")
            return

        try:
            logger.info(f"Loading model: {model_name}")
            # model_type has already been validated by _get_model_info
            if model_type == "m2m100":
                tokenizer = M2M100Tokenizer.from_pretrained(
                    model_name, cache_dir=settings.model_cache_dir
                )
                model = M2M100ForConditionalGeneration.from_pretrained(
                    model_name, cache_dir=settings.model_cache_dir
                ).to(self.device)
            else:  # nllb200
                tokenizer = AutoTokenizer.from_pretrained(
                    model_name, cache_dir=settings.model_cache_dir
                )
                model = AutoModelForSeq2SeqLM.from_pretrained(
                    model_name, cache_dir=settings.model_cache_dir
                ).to(self.device)

            self.models[model_name] = {
                "tokenizer": tokenizer,
                "model": model,
                "type": model_type,
            }
            logger.info(f"Successfully loaded model: {model_name}")
        except Exception as e:
            logger.error(f"Error loading model {model_name}: {str(e)}")
            raise

    def translate(
        self, text: str, source_lang: str, target_lang: str, model_type: str = "m2m100"
    ) -> tuple[str, str]:
        """
        Translate text from the source language to the target language

        Args:
            text: Text to translate
            source_lang: Source language code
            target_lang: Target language code
            model_type: Model to use ('m2m100' or 'nllb200')

        Returns:
            Tuple of (translated_text, model_name)
        """
        model_name, src_code, tgt_code, _ = self._get_model_info(
            source_lang, target_lang, model_type
        )

        # Load model if not already loaded
        if model_name not in self.models:
            self.load_model(source_lang, target_lang, model_type)

        try:
            tokenizer = self.models[model_name]["tokenizer"]
            model = self.models[model_name]["model"]

            # Both tokenizers take the source language via the src_lang attribute
            tokenizer.src_lang = src_code
            inputs = tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=settings.max_length,
            ).to(self.device)

            # The target language is enforced by forcing its language token as
            # the first decoder token (forced_bos_token_id)
            if model_type == "m2m100":
                forced_bos_token_id = tokenizer.get_lang_id(tgt_code)
            else:
                # NLLB language codes are tokens in the vocabulary;
                # convert_tokens_to_ids also works on transformers versions
                # where tokenizer.lang_code_to_id is no longer available
                forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_code)

            with torch.no_grad():
                translated = model.generate(
                    **inputs,
                    forced_bos_token_id=forced_bos_token_id,
                    max_length=settings.max_length,
                )

            # Decode output
            translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
            return translated_text, model_name
        except Exception as e:
            logger.error(f"Translation error: {str(e)}")
            raise

    def preload_all_models(self) -> None:
        """Preload commonly used translation models"""
        # Only preload M2M100 by default (commercial-friendly). Both directions
        # resolve to the same checkpoint, so the second entry is a cache hit.
        language_pairs = [
            ("ms", "en", "m2m100"),
            ("en", "ms", "m2m100"),
        ]
        for source, target, model_type in language_pairs:
            try:
                self.load_model(source, target, model_type)
            except Exception as e:
                logger.warning(
                    f"Could not preload model for {source}->{target} ({model_type}): {str(e)}"
                )

    def is_ready(self) -> bool:
        """Check if at least one model is loaded"""
        return len(self.models) > 0


# Global translator instance
translator = TranslationService()
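

# --- Usage sketch (illustrative, not part of the service) -------------------
# A minimal smoke test, assuming the package is importable (the relative
# `.config` import means this must run via `python -m <package>.<module>`,
# where the module path is deployment-specific) and that the M2M100 weights
# can be downloaded into settings.model_cache_dir. The sample text and
# language pair are arbitrary choices, not part of the service contract.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample = "Hello, how are you today?"
    result, model_used = translator.translate(
        text=sample,
        source_lang="en",
        target_lang="ms",
        model_type="m2m100",
    )
    print(f"[{model_used}] {sample!r} -> {result!r}")
    # The same call with model_type="nllb200" would exercise the NLLB-200
    # path instead (non-commercial license; see the class docstring).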