Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions client/src/components/SearchResults.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
}

function getScore(searchResults: SearchResult[]): number {
if (searchResults.length === 0) return 0;

let total = 0;
for (const searchResult of searchResults) {
total += searchResult.distance;
Expand Down
23 changes: 23 additions & 0 deletions src/semantra/semantra.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,21 @@ def get_text_content(md5, filename, semantra_dir, force, silent, encoding):
return Content(rawtext, filename)


def filter_empty_offsets(offsets, text_chunks):
    """Drop offset windows whose joined chunk text is empty.

    Args:
        offsets: List of offset lists; each inner entry is a pair whose
            first two items are the start and end indices into
            ``text_chunks``.
        text_chunks: Sliceable sequence of chunks that is passed, slice by
            slice, to ``join_text_chunks``.

    Returns:
        A new list with the same outer structure as ``offsets`` in which
        every window whose joined text has length 0 has been removed.
    """
    filtered = []
    for sub_offsets in offsets:
        kept = []
        for offset in sub_offsets:
            start, end = offset[0], offset[1]
            # Keep the window only when it covers some actual text.
            if len(join_text_chunks(text_chunks[start:end])) > 0:
                kept.append(offset)
        filtered.append(kept)
    return filtered


def count_offset_tokens(offsets):
    """Return the total span length covered by all offset windows.

    Args:
        offsets: Iterable of offset lists, each holding ``(start, end)``
            pairs.

    Returns:
        The sum of ``end - start`` over every pair; 0 when there are none.
    """
    return sum(
        end - start
        for sub_offsets in offsets
        for start, end in sub_offsets
    )


# NOTE(review): presumably the default pool size used for the transformer
# model; the unit and the consumer are not visible in this excerpt - confirm.
TRANSFORMER_POOL_DEFAULT = 15000


Expand Down Expand Up @@ -172,6 +187,8 @@ def process(
offsets,
num_embedding_tokens,
) = get_offsets(num_tokens, windows)
offsets = filter_empty_offsets(offsets, text_chunks)
num_embedding_tokens = count_offset_tokens(offsets)

# Full config contains additional details
full_config = {
Expand Down Expand Up @@ -694,6 +711,8 @@ def query():

results = []
for doc in documents.values():
if doc.num_embeddings == 0:
continue
embeddings = doc.embeddings

# Get kNN with cosine similarity
Expand Down Expand Up @@ -734,6 +753,8 @@ def querysvm():
embedding = model.embed_queries_and_preferences(queries, preferences, documents)
results = []
for doc in documents.values():
if doc.num_embeddings == 0:
continue
embeddings = doc.embeddings

x = np.concatenate([embeddings, embedding[None, ...]])
Expand Down Expand Up @@ -786,6 +807,8 @@ def queryann():

results = []
for doc in documents.values():
if doc.num_embeddings == 0:
continue
embedding_db = doc.embedding_db
text_chunks = doc.text_chunks
offsets = doc.offsets[0]
Expand Down
23 changes: 16 additions & 7 deletions src/semantra/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,10 +148,11 @@ def get_offsets(doc_size, windows):
else:
x = rewind

while x < doc_size:
x -= rewind
sub_offsets.append([x, min(x + size, doc_size)])
num_tokens += min(x + size, doc_size) - x
while x < doc_size or (doc_size > 0 and len(sub_offsets) == 0):
x = max(0, x - rewind)
end = min(x + size, doc_size)
sub_offsets.append([x, end])
num_tokens += end - x
x += size

offsets.append(sub_offsets)
Expand All @@ -161,12 +162,20 @@ def get_offsets(doc_size, windows):

def sort_results(results, reverse):
    """Order results by the mean distance of their items.

    Args:
        results: Iterable of result entries; ``result[1]`` must be a list of
            dicts that each carry a ``"distance"`` value.
        reverse: When true, sort from the largest mean distance to the
            smallest; otherwise smallest first.

    Returns:
        A dict with ``"results"`` (the sorted entries) and ``"sort"``
        (``"desc"`` if ``reverse`` else ``"asc"``). Entries without any
        items are left out.
    """
    # Get average distance per result
    scored_results = []
    for result in results:
        distances = [item["distance"] for item in result[1]]
        if not distances:
            # The mean of an empty list is undefined (NaN), so such an
            # entry cannot be ranked.
            continue
        scored_results.append((np.mean(distances), result))

    # Sort results by average distance. Sorting on the score alone means
    # ties never fall back to comparing the result payloads themselves.
    ranked = sorted(
        scored_results, key=lambda scored: scored[0], reverse=reverse
    )
    return {
        "results": [result for _, result in ranked],
        "sort": "desc" if reverse else "asc",
    }