From 1cab012db992f7fb697b1be6c424fb345336342a Mon Sep 17 00:00:00 2001
From: Rohan Sharma
Date: Wed, 17 Jun 2026 11:35:38 -0700
Subject: [PATCH] feat(core): entity-level code map via sem-core (#315)

Adds an optional entity-level code map: for each source file, extract its
structural entities (functions, classes, methods) with line ranges and
signatures, and expose them to templates both per-file (FileEntry.entities)
and as a top-level `code_map`. The default markdown template renders a
compact outline before the file contents, so a prompt can lead with
structure instead of relying on the LLM to infer it from full files.

Design:
- Behind the `entity-map` Cargo feature (off by default). sem-core pulls in
  tree-sitter grammars for ~28 languages, so users who don't want the map
  pay no build cost.
- sem-core (the published crate, not the sem CLI) is offline and carries no
  telemetry, so enabling this does not change code2prompt's privacy
  posture. It stays fully air-gapped.
- New `--entity-map` CLI flag and `Code2PromptConfig.entity_map`.
- One parser registry per worker thread (thread_local) so grammar
  registration is amortized across files in the rayon pipeline.

Tested: unit tests for Rust and Python extraction (feature-gated); full
suite passes with and without the feature; verified end to end on a mixed
Python/Rust sample.
--- Cargo.lock | 525 +++++++++++++++++- crates/code2prompt-core/Cargo.toml | 8 + crates/code2prompt-core/src/configuration.rs | 8 + .../src/default_template_md.hbs | 11 + crates/code2prompt-core/src/entity_map.rs | 128 +++++ crates/code2prompt-core/src/lib.rs | 1 + crates/code2prompt-core/src/path.rs | 15 + crates/code2prompt-core/src/session.rs | 30 + crates/code2prompt-core/tests/analysis.rs | 1 + crates/code2prompt-core/tests/sort_test.rs | 13 + crates/code2prompt/Cargo.toml | 3 + crates/code2prompt/src/args.rs | 6 + crates/code2prompt/src/config.rs | 1 + 13 files changed, 739 insertions(+), 11 deletions(-) create mode 100644 crates/code2prompt-core/src/entity_map.rs diff --git a/Cargo.lock b/Cargo.lock index 20fe46ce..ba06ecad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -384,7 +384,7 @@ dependencies = [ "crossterm", "dirs", "env_logger", - "git2", + "git2 0.21.0", "handlebars", "ignore", "indicatif", @@ -402,7 +402,7 @@ dependencies = [ "terminal_size", "tiktoken-rs", "tokio", - "toml", + "toml 1.1.2+spec-1.1.0", "tui-tree-widget", "unicode-width", "walkdir", @@ -432,7 +432,7 @@ dependencies = [ "derive_builder", "encoding_rs", "env_logger", - "git2", + "git2 0.21.0", "globset", "handlebars", "ignore", @@ -443,12 +443,13 @@ dependencies = [ "rayon", "regex", "rstest", + "sem-core", "serde", "serde_json", "tempfile", "termtree 1.0.0", "tiktoken-rs", - "toml", + "toml 1.1.2+spec-1.1.0", ] [[package]] @@ -1166,6 +1167,21 @@ dependencies = [ "wasip3", ] +[[package]] +name = "git2" +version = "0.20.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b88256088d75a56f8ecfa070513a775dd9107f6530ef14919dac831af9cfe2b" +dependencies = [ + "bitflags 2.13.0", + "libc", + "libgit2-sys", + "log", + "openssl-probe", + "openssl-sys", + "url", +] + [[package]] name = "git2" version = "0.21.0" @@ -1609,6 +1625,7 @@ checksum = "005d6ae6eac1912906073e069f7db60b1fa98e052a68227824afe3e3a1c59ca2" dependencies = [ "cc", "libc", + "libssh2-sys", 
"libz-sys", "openssl-sys", "pkg-config", @@ -1629,6 +1646,20 @@ dependencies = [ "libc", ] +[[package]] +name = "libssh2-sys" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "220e4f05ad4a218192533b300327f5150e809b54c4ec83b5a1d91833601811b9" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", +] + [[package]] name = "libz-sys" version = "1.1.29" @@ -2267,7 +2298,7 @@ version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ - "toml_edit", + "toml_edit 0.25.12+spec-1.1.0", ] [[package]] @@ -2656,6 +2687,56 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sem-core" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "504ea57d3e4d7ae1832ba02e1491d116e895e58117fc620b5186e9a87c953193" +dependencies = [ + "git2 0.20.4", + "rayon", + "regex", + "rustc-hash", + "serde", + "serde_json", + "serde_yaml", + "thiserror 2.0.18", + "toml 0.8.23", + "tree-sitter", + "tree-sitter-bash", + "tree-sitter-c", + "tree-sitter-c-sharp", + "tree-sitter-clojure-orchard", + "tree-sitter-cpp", + "tree-sitter-d", + "tree-sitter-dart", + "tree-sitter-elixir", + "tree-sitter-elm", + "tree-sitter-embedded-template", + "tree-sitter-fortran", + "tree-sitter-go", + "tree-sitter-haskell", + "tree-sitter-hcl", + "tree-sitter-htmlx-svelte", + "tree-sitter-java", + "tree-sitter-javascript", + "tree-sitter-kotlin-ng", + "tree-sitter-nix", + "tree-sitter-ocaml", + "tree-sitter-php", + "tree-sitter-python", + "tree-sitter-ruby", + "tree-sitter-rust", + "tree-sitter-scala", + "tree-sitter-sequel", + "tree-sitter-swift", + "tree-sitter-typescript", + "tree-sitter-xml", + "tree-sitter-zig", + "ts-parser-perl", + 
"xxhash-rust", +] + [[package]] name = "semver" version = "1.0.28" @@ -2698,6 +2779,7 @@ version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" dependencies = [ + "indexmap", "itoa", "memchr", "serde", @@ -2705,6 +2787,15 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_spanned" version = "1.1.1" @@ -2714,6 +2805,19 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "sha2" version = "0.10.9" @@ -2808,6 +2912,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + [[package]] name = "strsim" version = "0.11.1" @@ -3109,6 +3219,18 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned 0.6.9", + "toml_datetime 0.6.11", + "toml_edit 0.22.27", +] + [[package]] name = "toml" version = "1.1.2+spec-1.1.0" @@ -3117,11 +3239,20 @@ checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee" 
dependencies = [ "indexmap", "serde_core", - "serde_spanned", - "toml_datetime", + "serde_spanned 1.1.1", + "toml_datetime 1.1.1+spec-1.1.0", "toml_parser", "toml_writer", - "winnow", + "winnow 1.0.3", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", ] [[package]] @@ -3133,6 +3264,20 @@ dependencies = [ "serde_core", ] +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned 0.6.9", + "toml_datetime 0.6.11", + "toml_write", + "winnow 0.7.15", +] + [[package]] name = "toml_edit" version = "0.25.12+spec-1.1.0" @@ -3140,9 +3285,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2153edc6955a6c354fad8f5efd38b6a8769bdccf9fe50f8e1329f81b0baa5d7" dependencies = [ "indexmap", - "toml_datetime", + "toml_datetime 1.1.1+spec-1.1.0", "toml_parser", - "winnow", + "winnow 1.0.3", ] [[package]] @@ -3151,15 +3296,342 @@ version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ - "winnow", + "winnow 1.0.3", ] +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "toml_writer" version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" +[[package]] +name = "tree-sitter" +version = "0.26.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4dab76d0b724ba557954125188cf0633a1ca43199ced82d95c7b9c32cc3de1f3" +dependencies = [ + "cc", + "regex", + "regex-syntax", + "serde_json", + "streaming-iterator", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-bash" +version = "0.23.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "329a4d48623ac337d42b1df84e81a1c9dbb2946907c102ca72db158c1964a52e" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-c" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afd2b1bf1585dc2ef6d69e87d01db8adb059006649dd5f96f31aa789ee6e9c71" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-c-sharp" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1aac67f1ad71de1d6d39708d34811081c26dfa495658de6c14c34200849357c" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-clojure-orchard" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e2db28a1ab22649790656936325bdc69e992c38006258694ea39a7620e784d" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-cpp" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df2196ea9d47b4ab4a31b9297eaa5a5d19a0b121dceb9f118f6790ad0ab94743" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-d" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66c6fa0082e20747a92c5c863cadb7775fbe344f212837f03fc0f7187adf6fd1" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-dart" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "325dd1e24ee9ee21111e9c43680ae7d6010aaa9f282b048a99b9c7163c1cf553" +dependencies = [ + "cc", 
+ "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-elixir" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66dd064a762ed95bfc29857fa3cb7403bb1e5cb88112de0f6341b7e47284ba40" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-elm" +version = "5.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23840259bfc74d3fc7638047002703bac8624f4969fd73226d7ed516a1b91e9c" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-embedded-template" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "833d528e8fcb4e49ddb04d4d6450ddb8ac08f282a58fec94ce981c9c5dbf7e3a" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-fortran" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce58ab374a2cc3a2ff8a5dab2e5230530dbfcb439475afa75233f59d1d115b40" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-go" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b13d476345220dbe600147dd444165c5791bf85ef53e28acbedd46112ee18431" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-haskell" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "977c51e504548cba13fc27cb5a2edab2124cf6716a1934915d07ab99523b05a4" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-hcl" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a7b2cc3d7121553b84309fab9d11b3ff3d420403eef9ae50f9fd1cd9d9cf012" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-htmlx-svelte" +version = "0.1.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3278db98d220cb32e1d82510036740786c817afc1224b3360658f499224e63c" +dependencies = [ + "cc", + "tree-sitter", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-java" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa6cbcdc8c679b214e616fd3300da67da0e492e066df01bcf5a5921a71e90d6" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-javascript" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf40bf599e0416c16c125c3cec10ee5ddc7d1bb8b0c60fa5c4de249ad34dc1b1" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-kotlin-ng" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e800ebbda938acfbf224f4d2c34947a31994b1295ee6e819b65226c7b51b4450" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-language" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" + +[[package]] +name = "tree-sitter-nix" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4952a9733f3a98f6683a0ccd1035d84ab7a52f7e84eeed58548d86765ad92de3" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-ocaml" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d19db582b3855f56b5f9ec484170fbfb9ee60b938ec7720d76d2ee788e8b640" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-php" +version = "0.23.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f066e94e9272cfe4f1dcb07a1c50c66097eca648f2d7233d299c8ae9ed8c130c" +dependencies = [ + "cc", + "tree-sitter-language", +] + 
+[[package]] +name = "tree-sitter-python" +version = "0.23.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-ruby" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be0484ea4ef6bb9c575b4fdabde7e31340a8d2dbc7d52b321ac83da703249f95" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-rust" +version = "0.23.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca8ccb3e3a3495c8a943f6c3fd24c3804c471fd7f4f16087623c7fa4c0068e8a" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-scala" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de5a4a7ff23a55474ce6a741d52aaeca7a82fe9421bb982b86e98c6ac8629397" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-sequel" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d198ad3c319c02e43c21efa1ec796b837afcb96ffaef1a40c1978fbdcec7d17" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-swift" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe36052155b9dd69ca82b3b8f1b4ccfb2d867125ac1a4db1dd7331829242668c" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-typescript" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5f76ed8d947a75cc446d5fccd8b602ebf0cde64ccf2ffa434d873d7a575eff" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-xml" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e670041f591d994f54d597ddcd8f4ebc930e282c4c76a42268743b71f0c8b6b3" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-zig" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab11fc124851b0db4dd5e55983bbd9631192e93238389dcd44521715e5d53e28" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree_magic_mini" version = "3.2.2" @@ -3171,6 +3643,16 @@ dependencies = [ "petgraph", ] +[[package]] +name = "ts-parser-perl" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "720dcad2c9e8445465c98a7117574224bede0ee4168d081a37bfbad9699cd459" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tui-tree-widget" version = "0.24.0" @@ -3235,6 +3717,12 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "url" version = "2.5.8" @@ -3745,6 +4233,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + [[package]] name = "winnow" version = "1.0.3" @@ -3889,6 +4386,12 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea6fc2961e4ef194dcbfe56bb845534d0dc8098940c7e5c012a258bfec6701bd" +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + [[package]] name = "yoke" version = "0.8.3" diff --git a/crates/code2prompt-core/Cargo.toml b/crates/code2prompt-core/Cargo.toml index 3343f487..8b15a959 100644 --- a/crates/code2prompt-core/Cargo.toml +++ b/crates/code2prompt-core/Cargo.toml @@ -18,8 +18,16 @@ readme = "../../README.md" [features] default = [] +# Entity-level code map via sem-core (functions/classes with line ranges and +# signatures). Optional because sem-core pulls in tree-sitter grammars for many +# languages; users who don't need the code map pay no build cost. +entity-map = ["dep:sem-core"] [dependencies] +# Optional: only compiled with the `entity-map` feature. sem-core is offline and +# carries no telemetry (that lives in the sem CLI, not the library), so this keeps +# code2prompt fully air-gapped. +sem-core = { version = "0.13", optional = true } anyhow = { workspace = true } bracoxide = { workspace = true } colored = { workspace = true } diff --git a/crates/code2prompt-core/src/configuration.rs b/crates/code2prompt-core/src/configuration.rs index 3df8ca95..16c2c046 100644 --- a/crates/code2prompt-core/src/configuration.rs +++ b/crates/code2prompt-core/src/configuration.rs @@ -55,6 +55,14 @@ pub struct Code2PromptConfig { /// If true, symbolic links will be followed during traversal. pub follow_symlinks: bool, + /// If true, extract an entity-level code map (functions, classes, ...) for + /// each file via sem-core, exposed to templates as `FileEntry.entities` and a + /// top-level `code_map`. Requires the `entity-map` build feature; without it + /// this flag has no effect. + /// + /// Default: `false` + pub entity_map: bool, + /// Include hidden files and directories in processing. 
/// /// Default: `false` diff --git a/crates/code2prompt-core/src/default_template_md.hbs b/crates/code2prompt-core/src/default_template_md.hbs index 22411ea1..3ced6a68 100644 --- a/crates/code2prompt-core/src/default_template_md.hbs +++ b/crates/code2prompt-core/src/default_template_md.hbs @@ -6,6 +6,17 @@ Source Tree: {{ source_tree }} ``` +{{#if code_map}} +Code Map: + +{{#each code_map}} +`{{path}}`: +{{#each entities}} + - {{kind}} {{name}}{{#if signature}} `{{signature}}`{{/if}} (lines {{start_line}}-{{end_line}}) +{{/each}} + +{{/each}} +{{/if}} {{#each files}} {{#if code}} `{{path}}`: diff --git a/crates/code2prompt-core/src/entity_map.rs b/crates/code2prompt-core/src/entity_map.rs new file mode 100644 index 00000000..75a0e0e2 --- /dev/null +++ b/crates/code2prompt-core/src/entity_map.rs @@ -0,0 +1,128 @@ +//! Entity-level code map via [sem-core](https://github.com/Ataraxy-Labs/sem). +//! +//! When the `entity-map` feature is enabled, code2prompt extracts the structural +//! entities (functions, classes, methods, ...) from each source file using +//! sem-core's tree-sitter parsers. The result is exposed to templates both +//! per-file (`FileEntry.entities`) and as a top-level `code_map` aggregate, so a +//! prompt can include a compact outline of the codebase instead of, or alongside, +//! full file contents. +//! +//! sem-core is offline and emits no telemetry, so enabling this does not change +//! code2prompt's privacy posture. + +use serde::{Deserialize, Serialize}; + +/// A single structural entity (function, class, method, ...) within a file. +/// +/// This is a deliberately small projection of sem-core's internal entity type: +/// it carries only what a prompt template needs (name, kind, line range, +/// signature, parent), not source bodies or content hashes. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct EntitySummary { + /// Entity name, e.g. `process_single_file`. 
+ pub name: String, + /// Entity kind as reported by sem-core, e.g. `function`, `class`, `method`. + pub kind: String, + /// 1-based first line of the entity. + pub start_line: usize, + /// 1-based last line of the entity. + pub end_line: usize, + /// First line of the entity's source (its signature/declaration), trimmed. + #[serde(skip_serializing_if = "Option::is_none")] + pub signature: Option, + /// Name of the enclosing entity (e.g. the class a method belongs to), if any. + #[serde(skip_serializing_if = "Option::is_none")] + pub parent: Option, +} + +/// A file paired with its entity outline, used for the top-level `code_map` +/// template variable (an aggregate view alongside the per-file `entities`). +#[derive(Debug, Clone, Serialize)] +pub struct FileCodeMap { + pub path: String, + pub entities: Vec, +} + +/// Extract the entity outline for one file's contents. +/// +/// `file_path` is used by sem-core to pick the right language parser (by +/// extension). Returns an empty vector for files in languages sem-core does not +/// parse, so it is safe to call on every file. +#[cfg(feature = "entity-map")] +pub fn extract_entities(file_path: &str, content: &str) -> Vec { + use sem_core::parser::plugins::create_default_registry; + use sem_core::parser::registry::ParserRegistry; + use std::cell::RefCell; + use std::collections::HashMap; + + // One registry per worker thread: building it registers every language + // plugin, so we amortize that across files rather than paying it per file, + // while staying thread-safe inside code2prompt's rayon file pipeline. + // NOTE: `ParserRegistry::new()` is empty; `create_default_registry()` is the + // populated one the sem CLI uses. + thread_local! { + static REGISTRY: RefCell = RefCell::new(create_default_registry()); + } + + REGISTRY.with(|cell| { + let registry = cell.borrow(); + let entities = registry.extract_entities(file_path, content); + + // Resolve parent_id -> parent name so methods can show their class. 
+ let name_by_id: HashMap<&str, &str> = entities + .iter() + .map(|e| (e.id.as_str(), e.name.as_str())) + .collect(); + + entities + .iter() + .map(|e| { + let signature = e + .content + .lines() + .next() + .map(|l| l.trim().to_string()) + .filter(|s| !s.is_empty()); + let parent = e + .parent_id + .as_deref() + .and_then(|pid| name_by_id.get(pid).map(|n| n.to_string())); + EntitySummary { + name: e.name.clone(), + kind: e.entity_type.clone(), + start_line: e.start_line, + end_line: e.end_line, + signature, + parent, + } + }) + .collect() + }) +} + +/// No-op when the `entity-map` feature is disabled, so the rest of the codebase +/// compiles and runs identically without the sem-core dependency. +#[cfg(not(feature = "entity-map"))] +pub fn extract_entities(_file_path: &str, _content: &str) -> Vec { + Vec::new() +} + +#[cfg(all(test, feature = "entity-map"))] +mod tests { + use super::*; + + #[test] + fn extracts_rust_entities() { + let src = "pub struct Cache { size: usize }\n\nimpl Cache {\n pub fn new(size: usize) -> Self { Cache { size } }\n}\n\nfn helper(x: i32) -> i32 { x * 2 }\n"; + let got = extract_entities("util.rs", src); + assert!(!got.is_empty(), "expected entities, got none: {got:?}"); + assert!(got.iter().any(|e| e.name == "helper")); + } + + #[test] + fn extracts_python_entities() { + let src = "class Calculator:\n def add(self, a, b):\n return a + b\n\ndef main():\n pass\n"; + let got = extract_entities("math.py", src); + assert!(!got.is_empty(), "expected entities, got none: {got:?}"); + } +} diff --git a/crates/code2prompt-core/src/lib.rs b/crates/code2prompt-core/src/lib.rs index 462aad66..2c5569a5 100644 --- a/crates/code2prompt-core/src/lib.rs +++ b/crates/code2prompt-core/src/lib.rs @@ -46,6 +46,7 @@ pub mod analysis; pub mod builtin_templates; pub mod configuration; +pub mod entity_map; pub mod file_processor; pub mod filter; pub mod git; diff --git a/crates/code2prompt-core/src/path.rs b/crates/code2prompt-core/src/path.rs index 
887e7678..06e765d1 100644 --- a/crates/code2prompt-core/src/path.rs +++ b/crates/code2prompt-core/src/path.rs @@ -1,5 +1,6 @@ //! This module contains the functions for traversing the directory and processing the files. use crate::configuration::Code2PromptConfig; +use crate::entity_map::{EntitySummary, extract_entities}; use crate::file_processor; use crate::filter::{build_globset, should_include_file}; use crate::sort::{FileSortMethod, sort_files, sort_tree}; @@ -41,6 +42,11 @@ pub struct FileEntry { pub metadata: EntryMetadata, #[serde(skip_serializing_if = "Option::is_none")] pub mod_time: Option, + /// Structural entities (functions, classes, ...) extracted from this file. + /// Empty unless the `entity-map` feature is enabled and `config.entity_map` + /// is set. Skipped from serialized output when empty. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub entities: Vec, } /// Represents a file that needs to be processed @@ -283,6 +289,14 @@ fn process_single_file(file_info: &FileToProcess, config: &Code2PromptConfig) -> debug!(target: "included_files", "Included file: {}", file_path); + // Extract the entity outline from the unwrapped source. Cheap no-op unless + // the `entity-map` feature is on and the user enabled it. 
+ let entities = if config.entity_map { + extract_entities(&file_path, &code) + } else { + Vec::new() + }; + Some(FileEntry { path: file_path, extension: extension.to_string(), @@ -290,6 +304,7 @@ fn process_single_file(file_info: &FileToProcess, config: &Code2PromptConfig) -> token_count, metadata: EntryMetadata::from(metadata), mod_time, + entities, }) } diff --git a/crates/code2prompt-core/src/session.rs b/crates/code2prompt-core/src/session.rs index 4571fd7f..32b7d113 100644 --- a/crates/code2prompt-core/src/session.rs +++ b/crates/code2prompt-core/src/session.rs @@ -9,6 +9,7 @@ use std::sync::Arc; use crate::analysis::CodebaseAnalysis; use crate::configuration::Code2PromptConfig; +use crate::entity_map::FileCodeMap; use crate::git::{get_git_diff, get_git_diff_between_branches, get_git_log}; use crate::path::{FileEntry, display_name, traverse_directory, wrap_code_block}; use crate::selection::SelectionEngine; @@ -91,6 +92,12 @@ pub struct TemplateContext<'a> { #[serde(skip_serializing_if = "Option::is_none")] pub git_log_branch: &'a Option, + /// Top-level entity-level code map: one entry per file that has extracted + /// entities. Present only when the `entity-map` feature is enabled and the + /// user opted in. Per-file entities are also available via `files[].entities`. + #[serde(skip_serializing_if = "Option::is_none")] + pub code_map: Option>, + #[serde(flatten)] pub user_variables: &'a HashMap, @@ -286,6 +293,23 @@ impl Code2PromptSession { Ok(()) } + /// Builds the top-level `code_map` aggregate from the loaded files, including + /// only files that have extracted entities. Returns `None` when no file has + /// any (e.g. entity extraction was disabled), so the template variable is + /// simply absent rather than empty. 
+ fn build_code_map(&self) -> Option> { + let files = self.data.files.as_deref()?; + let map: Vec = files + .iter() + .filter(|f| !f.entities.is_empty()) + .map(|f| FileCodeMap { + path: f.path.clone(), + entities: f.entities.clone(), + }) + .collect(); + (!map.is_empty()).then_some(map) + } + /// Constructs a zero-copy template context for rendering. pub fn build_template_data(&self) -> TemplateContext<'_> { TemplateContext { @@ -295,6 +319,7 @@ impl Code2PromptSession { git_diff: &self.data.git_diff, git_diff_branch: &self.data.git_diff_branch, git_log_branch: &self.data.git_log_branch, + code_map: self.build_code_map(), user_variables: &self.config.user_variables, no_codeblock: self.config.no_codeblock, } @@ -386,6 +411,7 @@ impl Code2PromptSession { "token_count": token_count, "model_info": model_info, "files": files.clone(), + "code_map": self.build_code_map(), }); serde_json::to_string_pretty(&json_data)? } @@ -461,6 +487,9 @@ impl Code2PromptSession { token_count: 0, // Not used in skeleton metadata: file.metadata, mod_time: file.mod_time, + // Keep entities so the code map is counted in the + // structural token total. 
+ entities: file.entities.clone(), } }) .collect() @@ -474,6 +503,7 @@ impl Code2PromptSession { git_diff: &self.data.git_diff, git_diff_branch: &self.data.git_diff_branch, git_log_branch: &self.data.git_log_branch, + code_map: self.build_code_map(), user_variables: &self.config.user_variables, no_codeblock: self.config.no_codeblock, }; diff --git a/crates/code2prompt-core/tests/analysis.rs b/crates/code2prompt-core/tests/analysis.rs index 2122c16f..581dd5a9 100644 --- a/crates/code2prompt-core/tests/analysis.rs +++ b/crates/code2prompt-core/tests/analysis.rs @@ -26,6 +26,7 @@ mod tests { is_symlink: false, }, mod_time: None, + entities: vec![], } } diff --git a/crates/code2prompt-core/tests/sort_test.rs b/crates/code2prompt-core/tests/sort_test.rs index 965f103b..31d040d6 100644 --- a/crates/code2prompt-core/tests/sort_test.rs +++ b/crates/code2prompt-core/tests/sort_test.rs @@ -20,6 +20,7 @@ mod tests { is_symlink: false, }, mod_time: Some(100), + entities: vec![], }, FileEntry { path: "alpha.txt".to_string(), @@ -31,6 +32,7 @@ mod tests { is_symlink: false, }, mod_time: Some(200), + entities: vec![], }, FileEntry { path: "beta.txt".to_string(), @@ -42,6 +44,7 @@ mod tests { is_symlink: false, }, mod_time: Some(150), + entities: vec![], }, ]; @@ -68,6 +71,7 @@ mod tests { is_symlink: false, }, mod_time: Some(100), + entities: vec![], }, FileEntry { path: "zeta.txt".to_string(), @@ -79,6 +83,7 @@ mod tests { is_symlink: false, }, mod_time: Some(200), + entities: vec![], }, FileEntry { path: "beta.txt".to_string(), @@ -90,6 +95,7 @@ mod tests { is_symlink: false, }, mod_time: Some(150), + entities: vec![], }, ]; @@ -116,6 +122,7 @@ mod tests { is_symlink: false, }, mod_time: Some(300), + entities: vec![], }, FileEntry { path: "file2.txt".to_string(), @@ -127,6 +134,7 @@ mod tests { is_symlink: false, }, mod_time: Some(100), + entities: vec![], }, FileEntry { path: "file3.txt".to_string(), @@ -138,6 +146,7 @@ mod tests { is_symlink: false, }, mod_time: Some(200), + 
entities: vec![], }, ]; @@ -164,6 +173,7 @@ mod tests { is_symlink: false, }, mod_time: Some(300), + entities: vec![], }, FileEntry { path: "file2.txt".to_string(), @@ -175,6 +185,7 @@ mod tests { is_symlink: false, }, mod_time: Some(100), + entities: vec![], }, FileEntry { path: "file3.txt".to_string(), @@ -186,6 +197,7 @@ mod tests { is_symlink: false, }, mod_time: Some(200), + entities: vec![], }, ]; @@ -215,6 +227,7 @@ mod tests { is_symlink: false, }, mod_time: Some((i as u64 + 1) * 100), + entities: vec![], }) .collect(); diff --git a/crates/code2prompt/Cargo.toml b/crates/code2prompt/Cargo.toml index 00f5588c..f18ef27f 100644 --- a/crates/code2prompt/Cargo.toml +++ b/crates/code2prompt/Cargo.toml @@ -13,6 +13,9 @@ authors = [ [features] wayland = ["arboard/wayland-data-control"] +# Enable the entity-level code map (delegates to code2prompt_core's feature, +# which pulls in sem-core). Off by default to keep the standard build lean. +entity-map = ["code2prompt_core/entity-map"] [dependencies] code2prompt_core = { path = "../code2prompt-core", version = "4.3.0" } diff --git a/crates/code2prompt/src/args.rs b/crates/code2prompt/src/args.rs index c53b2469..c55bdaf8 100644 --- a/crates/code2prompt/src/args.rs +++ b/crates/code2prompt/src/args.rs @@ -106,6 +106,12 @@ pub struct Cli { #[clap(long)] pub no_codeblock: bool, + /// Include an entity-level code map (functions, classes, methods with line + /// ranges and signatures) for each file, via sem-core. Requires building + /// code2prompt with the `entity-map` feature. 
+ #[clap(long)] + pub entity_map: bool, + /// Copy output to clipboard #[clap(short = 'c', long)] pub clipboard: bool, diff --git a/crates/code2prompt/src/config.rs b/crates/code2prompt/src/config.rs index c1345df2..18c0e347 100644 --- a/crates/code2prompt/src/config.rs +++ b/crates/code2prompt/src/config.rs @@ -171,6 +171,7 @@ pub fn build_session( .no_ignore(args.no_ignore) .hidden(args.hidden) .no_codeblock(args.no_codeblock) + .entity_map(args.entity_map) .follow_symlinks(args.follow_symlinks) .token_map_enabled(args.token_map || cfg_token_map_enabled || tui_mode) .deselected(args.deselected || cfg_deselected);