From 6f298a156b9393e13197bea3fba669a0768bc409 Mon Sep 17 00:00:00 2001
From: Miro <200482516+Mirochill@users.noreply.github.com>
Date: Wed, 27 May 2026 11:06:31 +0200
Subject: [PATCH] Support directory inputs

---
 README.md                | 10 ++++++++--
 src/semantra/semantra.py | 19 +++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 06f052b..64d38d0 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ pip install semantra
 
 ## Usage
 
-Semantra operates on collections of documents — text or PDF files — stored on your local computer.
+Semantra operates on collections of documents — text or PDF files, or directories containing such files — stored on your local computer.
 
 At its simplest, you can run Semantra over a single document by running:
 
@@ -67,6 +67,12 @@ You can run Semantra over multiple documents, too:
 semantra report.pdf book.txt
 ```
 
+Directories are expanded recursively (every file in the directory and its subdirectories is included), so you can also run Semantra over a folder of documents:
+
+```sh
+semantra notes/
+```
+
 Semantra will take some time to process the input documents. This is a one-time operation per document (subsequent runs over the same document collection will be near instantaneous).
 
 Once processing is complete, Semantra will launch a local webserver, by default at [localhost:8080](http://localhost:8080). On this web page, you can interactively query the passed in documents semantically.
@@ -116,7 +122,7 @@ Another difference is that Semantra will not necessarily find exact text matches
 ## Command-line reference
 
 ```sh
-semantra [OPTIONS] [FILENAME(S)]...
+semantra [OPTIONS] [FILENAME(S) OR DIRECTORY(IES)]...
 ```
 
 ## Options
diff --git a/src/semantra/semantra.py b/src/semantra/semantra.py
index 9636117..5118f1a 100644
--- a/src/semantra/semantra.py
+++ b/src/semantra/semantra.py
@@ -54,6 +54,21 @@ def get_text_content(md5, filename, semantra_dir, force, silent, encoding):
         return Content(rawtext, filename)
 
 
+def expand_input_filenames(filenames):  # Replace each directory argument with the files found beneath it.
+    expanded_filenames = []
+    for filename in filenames:
+        if os.path.isdir(filename):
+            for root, dirnames, child_filenames in os.walk(filename):
+                dirnames.sort()  # in-place sort: os.walk then descends into subdirectories in sorted order
+                for child_filename in sorted(child_filenames):
+                    child_path = os.path.join(root, child_filename)
+                    if os.path.isfile(child_path):  # keep only entries that resolve to a regular file
+                        expanded_filenames.append(child_path)
+        else:
+            expanded_filenames.append(filename)  # non-directory arguments pass through unchanged, in input order
+    return tuple(expanded_filenames)  # result is an immutable tuple, possibly empty
+
+
 TRANSFORMER_POOL_DEFAULT = 15000
 
 
@@ -580,6 +595,10 @@ def main(
     if filename is None or len(filename) == 0:
         raise click.UsageError("Must provide a filename to process/query")
 
+    filename = expand_input_filenames(filename)
+    if len(filename) == 0:
+        raise click.UsageError("No files found to process/query")
+
     processed_windows = list(process_windows(windows))
 
     if transformer_model is not None: