From 399df3dbaa7ebd6664f4a8ed613746177574a16d Mon Sep 17 00:00:00 2001 From: Miro <200482516+Mirochill@users.noreply.github.com> Date: Wed, 27 May 2026 11:00:02 +0200 Subject: [PATCH] Fix short document search scores --- client/src/components/SearchResults.svelte | 2 ++ src/semantra/semantra.py | 23 ++++++++++++++++++++++ src/semantra/util.py | 23 +++++++++++++++------- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/client/src/components/SearchResults.svelte b/client/src/components/SearchResults.svelte index 9189f29..9b3eceb 100644 --- a/client/src/components/SearchResults.svelte +++ b/client/src/components/SearchResults.svelte @@ -34,6 +34,8 @@ } function getScore(searchResults: SearchResult[]): number { + if (searchResults.length === 0) return 0; + let total = 0; for (const searchResult of searchResults) { total += searchResult.distance; diff --git a/src/semantra/semantra.py b/src/semantra/semantra.py index 9636117..64d5489 100644 --- a/src/semantra/semantra.py +++ b/src/semantra/semantra.py @@ -54,6 +54,21 @@ def get_text_content(md5, filename, semantra_dir, force, silent, encoding): return Content(rawtext, filename) +def filter_empty_offsets(offsets, text_chunks): + return [ + [ + offset + for offset in sub_offsets + if len(join_text_chunks(text_chunks[offset[0] : offset[1]])) > 0 + ] + for sub_offsets in offsets + ] + + +def count_offset_tokens(offsets): + return sum(end - start for sub_offsets in offsets for start, end in sub_offsets) + + TRANSFORMER_POOL_DEFAULT = 15000 @@ -172,6 +187,8 @@ def process( offsets, num_embedding_tokens, ) = get_offsets(num_tokens, windows) + offsets = filter_empty_offsets(offsets, text_chunks) + num_embedding_tokens = count_offset_tokens(offsets) # Full config contains additional details full_config = { @@ -694,6 +711,8 @@ def query(): results = [] for doc in documents.values(): + if doc.num_embeddings == 0: + continue embeddings = doc.embeddings # Get kNN with cosine similarity @@ -734,6 +753,8 @@ def querysvm(): embedding = model.embed_queries_and_preferences(queries, preferences, documents) results = [] for doc in documents.values(): + if doc.num_embeddings == 0: + continue embeddings = doc.embeddings x = np.concatenate([embeddings, embedding[None, ...]]) @@ -786,6 +807,8 @@ def queryann(): results = [] for doc in documents.values(): + if doc.num_embeddings == 0: + continue embedding_db = doc.embedding_db text_chunks = doc.text_chunks offsets = doc.offsets[0] diff --git a/src/semantra/util.py b/src/semantra/util.py index 3a5d145..0f1654b 100644 --- a/src/semantra/util.py +++ b/src/semantra/util.py @@ -148,10 +148,11 @@ def get_offsets(doc_size, windows): else: x = rewind - while x < doc_size: - x -= rewind - sub_offsets.append([x, min(x + size, doc_size)]) - num_tokens += min(x + size, doc_size) - x + while x < doc_size or (doc_size > 0 and len(sub_offsets) == 0): + x = max(0, x - rewind) + end = min(x + size, doc_size) + sub_offsets.append([x, end]) + num_tokens += end - x x += size offsets.append(sub_offsets) @@ -161,12 +162,20 @@ def get_offsets(doc_size, windows): def sort_results(results, reverse): # Get average distance per result - avg_distances = [] + scored_results = [] for result in results: - avg_distances.append(np.mean([item["distance"] for item in result[1]])) + distances = [item["distance"] for item in result[1]] + if len(distances) == 0: + continue + scored_results.append((np.mean(distances), result)) # Sort results by average distance return { - "results": [x for _, x in sorted(zip(avg_distances, results), reverse=reverse)], + "results": [ + result + for _, result in sorted( + scored_results, key=lambda scored: scored[0], reverse=reverse + ) + ], "sort": "desc" if reverse else "asc", }