freedmand · Mirochill · May 27, 2026
diff --git a/src/semantra/models.py b/src/semantra/models.py
@@ -1,3 +1,4 @@
+import codecs
 import os
 from abc import ABC, abstractmethod
 
@@ -137,7 +138,19 @@ def get_token_length(self, tokens) -> int:
         return len(tokens)
 
     def get_text_chunks(self, _: str, tokens) -> "list[str]":
-        return [self.tokenizer.decode([token]) for token in tokens]
+        """Return one text chunk per token, decoding UTF-8 across tokens."""
+        # BPE token boundaries can split multibyte UTF-8 characters, so bytes
+        # go through one incremental decoder; errors="replace" keeps truncated
+        # or malformed input from raising UnicodeDecodeError under "strict".
+        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
+        chunks = [
+            decoder.decode(self.tokenizer.decode_single_token_bytes(token), final=False)
+            for token in tokens
+        ]
+        # Flush bytes still buffered; none can be pending when chunks is empty.
+        if chunks:
+            chunks[-1] += decoder.decode(b"", final=True)
+        return chunks
 
     def embed(self, tokens, offsets, _is_query=False) -> "list[list[float]]":
         texts = [tokens[i:j] for i, j in offsets]