From 5a99d081ab82d4f82b55b41559e2929e3e75967c Mon Sep 17 00:00:00 2001 From: jungwoo choi Date: Tue, 11 Nov 2025 16:02:32 +0900 Subject: [PATCH] Fix NLLB-200 tokenizer and add .dockerignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fixed NLLB-200 tokenizer forced_bos_token_id issue - Changed from lang_code_to_id to convert_tokens_to_ids - Added .dockerignore to exclude models directory from Docker build - Prevents disk space issues during build - Models are loaded at runtime via volume mount - Both M2M100 and NLLB-200 models tested and working 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .dockerignore | 42 ++++++++++++++++++++++++++++++++++++++++++ app/translator.py | 3 ++- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0629895 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,42 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg-info/ +dist/ +build/ + +# Virtual environments +venv/ +env/ +ENV/ + +# Models cache (will be mounted as volume) +models/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Git +.git/ +.gitignore + +# Documentation +README.md +CLAUDE.md +*.md + +# Environment +.env +.env.local +.env.*.local + +# Docker +.dockerignore +Dockerfile +docker-compose.yml diff --git a/app/translator.py b/app/translator.py index 2c3e515..78c4274 100644 --- a/app/translator.py +++ b/app/translator.py @@ -473,7 +473,8 @@ class TranslationService: ).to(self.device) # Generate translation - NLLB uses forced_bos_token_id - forced_bos_token_id = tokenizer.lang_code_to_id[tgt_code] + # Convert language code to token ID + forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_code) with torch.no_grad(): translated = model.generate(