From ebb4e7e70e7150ef5bacc2bea2925835286eda2b Mon Sep 17 00:00:00 2001
From: Miro <200482516+Mirochill@users.noreply.github.com>
Date: Wed, 27 May 2026 11:15:07 +0200
Subject: [PATCH] Preserve OpenAI token text chunks

BPE token boundaries can fall inside a multibyte UTF-8 character, so
decoding each token on its own yields replacement characters. Decode the
raw token bytes through one incremental UTF-8 decoder instead, keeping
one chunk per token. The decoder uses errors="replace" so malformed or
truncated byte sequences still never raise.
---
Note: the post-image blob hash on the "index" line below predates the
errors="replace" revision of this patch; only the pre-image hash is
needed to apply it.

 src/semantra/models.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/semantra/models.py b/src/semantra/models.py
index 5ff77b1..df08d2b 100644
--- a/src/semantra/models.py
+++ b/src/semantra/models.py
@@ -1,3 +1,4 @@
+import codecs
 import os
 from abc import ABC, abstractmethod
 
@@ -137,7 +138,26 @@ def get_token_length(self, tokens) -> int:
         return len(tokens)
 
     def get_text_chunks(self, _: str, tokens) -> "list[str]":
-        return [self.tokenizer.decode([token]) for token in tokens]
+        """Return one decoded text chunk per token.
+
+        BPE token boundaries can split multibyte UTF-8 characters, so the
+        token bytes are fed through a single incremental decoder: a character
+        spanning several tokens is emitted with the token that completes it,
+        and the earlier tokens of that character yield empty strings.
+        """
+        # errors="replace" keeps this as lenient as the per-token decode() it
+        # replaces: bad bytes become U+FFFD instead of UnicodeDecodeError.
+        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
+        chunks = [
+            decoder.decode(self.tokenizer.decode_single_token_bytes(token), final=False)
+            for token in tokens
+        ]
+        # Flush bytes still buffered at the end (a truncated character). A
+        # non-empty remainder implies at least one token was decoded.
+        remainder = decoder.decode(b"", final=True)
+        if remainder:
+            chunks[-1] += remainder
+        return chunks
 
     def embed(self, tokens, offsets, _is_query=False) -> "list[list[float]]":
         texts = [tokens[i:j] for i, j in offsets]