Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion src/semantra/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import codecs
import os
from abc import ABC, abstractmethod

Expand Down Expand Up @@ -137,7 +138,19 @@ def get_token_length(self, tokens) -> int:
return len(tokens)

def get_text_chunks(self, _: str, tokens) -> "list[str]":
    """Decode ``tokens`` into one text chunk per token.

    BPE token boundaries can fall in the middle of a multibyte UTF-8
    character, so decoding each token in isolation would corrupt such
    characters. The token bytes are instead fed through a single
    incremental UTF-8 decoder: a token that ends mid-character yields an
    empty (or shorter) chunk and the completed character is emitted with
    the token that supplies its final byte.

    Args:
        _: Unused; kept for interface compatibility.
        tokens: Iterable of token ids accepted by
            ``self.tokenizer.decode_single_token_bytes``.

    Returns:
        A list of strings, one entry per token, whose concatenation is
        the decoded text. Undecodable or truncated byte sequences are
        rendered as U+FFFD instead of raising.
    """
    # errors="replace" keeps this total: with the default "strict" handler a
    # token slice that stops mid-character (or contains an invalid byte)
    # would raise UnicodeDecodeError on the final flush.
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    token_bytes = self.tokenizer.decode_single_token_bytes
    chunks = [decoder.decode(token_bytes(token), final=False) for token in tokens]

    # Flush bytes still buffered from an incomplete trailing character.
    tail = decoder.decode(b"", final=True)
    if tail:
        if chunks:
            chunks[-1] += tail
        else:
            chunks.append(tail)
    return chunks

def embed(self, tokens, offsets, _is_query=False) -> "list[list[float]]":
texts = [tokens[i:j] for i, j in offsets]
Expand Down