From 6f298a156b9393e13197bea3fba669a0768bc409 Mon Sep 17 00:00:00 2001
From: Miro <200482516+Mirochill@users.noreply.github.com>
Date: Wed, 27 May 2026 11:06:31 +0200
Subject: [PATCH] Support directory inputs

---
 README.md                | 10 ++++++++--
 src/semantra/semantra.py | 19 +++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 06f052b..64d38d0 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ pip install semantra
 
 ## Usage
 
-Semantra operates on collections of documents — text or PDF files — stored on your local computer.
+Semantra operates on collections of documents — text or PDF files, or directories containing documents — stored on your local computer.
 
 At its simplest, you can run Semantra over a single document by running:
 
@@ -67,6 +67,12 @@ You can run Semantra over multiple documents, too:
 semantra report.pdf book.txt
 ```
 
+Directories are expanded recursively, so you can also run Semantra over a folder of documents:
+
+```sh
+semantra notes/
+```
+
 Semantra will take some time to process the input documents. This is a one-time operation per document (subsequent runs over the same document collection will be near instantaneous).
 
 Once processing is complete, Semantra will launch a local webserver, by default at [localhost:8080](http://localhost:8080). On this web page, you can interactively query the passed in documents semantically.
@@ -116,7 +122,7 @@ Another difference is that Semantra will not necessarily find exact text matches
 ## Command-line reference
 
 ```sh
-semantra [OPTIONS] [FILENAME(S)]...
+semantra [OPTIONS] [FILENAME(S) OR DIRECTORY(S)]...
 ```
 
 ## Options
diff --git a/src/semantra/semantra.py b/src/semantra/semantra.py
index 9636117..5118f1a 100644
--- a/src/semantra/semantra.py
+++ b/src/semantra/semantra.py
@@ -54,6 +54,21 @@ def get_text_content(md5, filename, semantra_dir, force, silent, encoding):
     return Content(rawtext, filename)
 
 
+def expand_input_filenames(filenames):
+    expanded_filenames = []
+    for filename in filenames:
+        if os.path.isdir(filename):
+            for root, dirnames, child_filenames in os.walk(filename):
+                dirnames.sort()
+                for child_filename in sorted(child_filenames):
+                    child_path = os.path.join(root, child_filename)
+                    if os.path.isfile(child_path):
+                        expanded_filenames.append(child_path)
+        else:
+            expanded_filenames.append(filename)
+    return tuple(expanded_filenames)
+
+
 TRANSFORMER_POOL_DEFAULT = 15000
 
 
@@ -580,6 +595,10 @@ def main(
     if filename is None or len(filename) == 0:
         raise click.UsageError("Must provide a filename to process/query")
 
+    filename = expand_input_filenames(filename)
+    if len(filename) == 0:
+        raise click.UsageError("No files found to process/query")
+
     processed_windows = list(process_windows(windows))
 
     if transformer_model is not None: