From 5a99d081ab82d4f82b55b41559e2929e3e75967c Mon Sep 17 00:00:00 2001
From: jungwoo choi <jungwoochoi@10-10-0-111.maas>
Date: Tue, 11 Nov 2025 16:02:32 +0900
Subject: [PATCH] Fix NLLB-200 tokenizer and add .dockerignore
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fixed the forced_bos_token_id lookup for the NLLB-200 tokenizer
  - Replaced tokenizer.lang_code_to_id[tgt_code] with
    tokenizer.convert_tokens_to_ids(tgt_code)
- Added .dockerignore to exclude models directory from Docker build
  - Prevents disk space issues during build
  - Models are loaded at runtime via volume mount
- Both the M2M100 and NLLB-200 models were tested and work with this change

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .dockerignore     | 42 ++++++++++++++++++++++++++++++++++++++++++
 app/translator.py |  3 ++-
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 .dockerignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..0629895
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,42 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.egg-info/
+dist/
+build/
+
+# Virtual environments
+venv/
+env/
+ENV/
+
+# Models cache (will be mounted as volume)
+models/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Git
+.git/
+.gitignore
+
+# Documentation
+README.md
+CLAUDE.md
+*.md
+
+# Environment
+.env
+.env.local
+.env.*.local
+
+# Docker
+.dockerignore
+Dockerfile
+docker-compose.yml
diff --git a/app/translator.py b/app/translator.py
index 2c3e515..78c4274 100644
--- a/app/translator.py
+++ b/app/translator.py
@@ -473,7 +473,8 @@ class TranslationService:
                 ).to(self.device)
 
                 # Generate translation - NLLB uses forced_bos_token_id
-                forced_bos_token_id = tokenizer.lang_code_to_id[tgt_code]
+                # Convert language code to token ID
+                forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_code)
 
                 with torch.no_grad():
                     translated = model.generate(